From: Stig Venaas Date: Tue, 26 Feb 2002 18:38:40 +0000 (+0000) Subject: Added code for approximate matching in UTF8bvnormalize() and changed to use X-Git-Tag: OPENLDAP_REL_ENG_2_MP~390 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=94983da942ed75a4e872952419d54f0a2a826405;p=openldap Added code for approximate matching in UTF8bvnormalize() and changed to use this in approxMatch etc in schema_init.c --- diff --git a/include/ldap_pvt_uc.h b/include/ldap_pvt_uc.h index b6840fd57f..ba20d28fab 100644 --- a/include/ldap_pvt_uc.h +++ b/include/ldap_pvt_uc.h @@ -141,6 +141,7 @@ LDAP_LUNICODE_F(void) ucstr2upper( #define LDAP_UTF8_CASEFOLD 0x1U #define LDAP_UTF8_ARG1NFC 0x2U #define LDAP_UTF8_ARG2NFC 0x4U +#define LDAP_UTF8_APPROX 0x8U LDAP_LUNICODE_F(char *) UTF8normalize( struct berval *, diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c index 988417b804..fa45868ebd 100644 --- a/libraries/liblunicode/ucstr.c +++ b/libraries/liblunicode/ucstr.c @@ -245,12 +245,14 @@ char * UTF8normalize( struct berval * UTF8bvnormalize( struct berval *bv, struct berval *newbv, - unsigned casefold ) + unsigned flags ) { int i, j, len, clen, outpos, ucsoutlen, outsize, last; char *out, *s; unsigned long *ucs, *p, *ucsout; - + + unsigned casefold = flags & LDAP_UTF8_CASEFOLD; + unsigned approx = flags & LDAP_UTF8_APPROX; static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; @@ -361,20 +363,28 @@ struct berval * UTF8bvnormalize( } /* normalize ucs of length p - ucs */ uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); - ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); - /* convert ucs to utf-8 and store in out */ - for ( j = 0; j < ucsoutlen; j++ ) { - /* allocate more space if not enough room for - 6 bytes and terminator */ - if ( outsize - outpos < 7 ) { - outsize = ucsoutlen - j + outpos + 6; - out = (char *) realloc( out, outsize ); - if ( out == NULL ) { - free( ucs ); - return NULL; + if ( approx ) { + for ( j = 0; j < ucsoutlen; j++ ) { 
+ if ( ucsout[j] < 0x80 ) { + out[outpos++] = ucsout[j]; } } - outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); + } else { + ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); + /* convert ucs to utf-8 and store in out */ + for ( j = 0; j < ucsoutlen; j++ ) { + /* allocate more space if not enough room for + 6 bytes and terminator */ + if ( outsize - outpos < 7 ) { + outsize = ucsoutlen - j + outpos + 6; + out = (char *) realloc( out, outsize ); + if ( out == NULL ) { + free( ucs ); + return NULL; + } + } + outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); + } } if ( i == len ) { diff --git a/servers/slapd/schema_init.c b/servers/slapd/schema_init.c index d8d82496b7..40df017554 100644 --- a/servers/slapd/schema_init.c +++ b/servers/slapd/schema_init.c @@ -646,30 +646,6 @@ err: return NULL; } -/* Strip characters with the 8th bit set */ -static char * -strip8bitChars( - char *in ) -{ - char *p = in, *q; - - if( in == NULL ) { - return NULL; - } - while( *p ) { - if( *p & 0x80 ) { - q = p; - while( *++q & 0x80 ) { - /* empty */ - } - p = AC_MEMCPY(p, q, strlen(q) + 1); - } else { - p++; - } - } - return in; -} - #ifndef SLAPD_APPROX_OLDSINGLESTRING #if defined(SLAPD_APPROX_INITIALS) @@ -689,31 +665,27 @@ approxMatch( struct berval *value, void *assertedValue ) { - char *val, *nval, *assertv, **values, **words, *c; + struct berval *nval, *assertv; + char *val, **values, **words, *c; int i, count, len, nextchunk=0, nextavail=0; - size_t avlen; /* Yes, this is necessary */ - nval = UTF8normalize( value, LDAP_UTF8_NOCASEFOLD ); + nval = UTF8bvnormalize( value, NULL, LDAP_UTF8_APPROX ); if( nval == NULL ) { *matchp = 1; return LDAP_SUCCESS; } - strip8bitChars( nval ); /* Yes, this is necessary */ - assertv = UTF8normalize( ((struct berval *)assertedValue), - LDAP_UTF8_NOCASEFOLD ); + assertv = UTF8bvnormalize( ((struct berval *)assertedValue), NULL, LDAP_UTF8_APPROX ); if( assertv == NULL ) { - ch_free( nval ); + ber_bvfree( nval ); *matchp = 1; return 
LDAP_SUCCESS; } - strip8bitChars( assertv ); - avlen = strlen( assertv ); /* Isolate how many words there are */ - for( c=nval,count=1; *c; c++ ) { + for ( c = nval->bv_val, count = 1; *c; c++ ) { c = strpbrk( c, SLAPD_APPROX_DELIMITER ); if ( c == NULL ) break; *c = '\0'; @@ -723,7 +695,7 @@ approxMatch( /* Get a phonetic copy of each word */ words = (char **)ch_malloc( count * sizeof(char *) ); values = (char **)ch_malloc( count * sizeof(char *) ); - for( c=nval,i=0; ibv_val, i = 0; i < count; i++, c += strlen(c) + 1 ) { words[i] = c; values[i] = phonetic(c); } @@ -731,8 +703,8 @@ approxMatch( /* Work through the asserted value's words, to see if at least some of the words are there, in the same order. */ len = 0; - while ( (size_t) nextchunk < avlen ) { - len = strcspn( assertv + nextchunk, SLAPD_APPROX_DELIMITER); + while ( (ber_len_t) nextchunk < assertv->bv_len ) { + len = strcspn( assertv->bv_val + nextchunk, SLAPD_APPROX_DELIMITER); if( len == 0 ) { nextchunk++; continue; @@ -741,7 +713,7 @@ approxMatch( else if( len == 1 ) { /* Single letter words need to at least match one word's initial */ for( i=nextavail; ibv_val + nextchunk, words[i], 1 )) { nextavail=i+1; break; } @@ -749,8 +721,8 @@ approxMatch( #endif else { /* Isolate the next word in the asserted value and phonetic it */ - assertv[nextchunk+len] = '\0'; - val = phonetic( assertv + nextchunk ); + assertv->bv_val[nextchunk+len] = '\0'; + val = phonetic( assertv->bv_val + nextchunk ); /* See if this phonetic chunk is in the remaining words of *value */ for( i=nextavail; ibv_val != NULL ); /* Isolate how many words there are. 
There will be a key for each */ - for( wordcount=0,c=val; *c; c++) { + for( wordcount = 0, c = val->bv_val; *c; c++) { len = strcspn(c, SLAPD_APPROX_DELIMITER); if( len >= SLAPD_APPROX_WORDLEN ) wordcount++; c+= len; @@ -829,7 +801,7 @@ approxIndexer( keys = newkeys; /* Get a phonetic copy of each word */ - for( c=val,i=0; i<wordcount; c+=len+1 ) { + for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) { len = strlen( c ); if( len < SLAPD_APPROX_WORDLEN ) continue; ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] ); @@ -837,7 +809,7 @@ approxIndexer( i++; } - free( val ); + ber_bvfree( val ); } keys[keycount].bv_val = NULL; *keysp = keys; @@ -855,23 +827,23 @@ approxFilter( void * assertValue, BerVarray *keysp ) { - char *val, *c; + char *c; int i, count, len; + struct berval *val; BerVarray keys; /* Yes, this is necessary */ - val = UTF8normalize( ((struct berval *)assertValue), - LDAP_UTF8_NOCASEFOLD ); - if( val == NULL ) { + val = UTF8bvnormalize( ((struct berval *)assertValue), NULL, LDAP_UTF8_APPROX ); + if( val == NULL || val->bv_val == NULL ) { keys = (struct berval *)ch_malloc( sizeof(struct berval) ); keys[0].bv_val = NULL; *keysp = keys; + ber_bvfree( val ); return LDAP_SUCCESS; } - strip8bitChars( val ); /* Isolate how many words there are. There will be a key for each */ - for( count=0,c=val; *c; c++) { + for( count = 0,c = val->bv_val; *c; c++) { len = strcspn(c, SLAPD_APPROX_DELIMITER); if( len >= SLAPD_APPROX_WORDLEN ) count++; c+= len; @@ -883,14 +855,14 @@ approxFilter( keys = (struct berval *)ch_malloc( (count + 1) * sizeof(struct berval) ); /* Get a phonetic copy of each word */ - for( c=val,i=0; i<count; c+=len+1 ) { + for( c = val->bv_val, i = 0; i < count; c += len + 1 ) { len = strlen(c); if( len < SLAPD_APPROX_WORDLEN ) continue; ber_str2bv( phonetic( c ), 0, 0, &keys[i] ); i++; } - free( val ); + ber_bvfree( val ); keys[count].bv_val = NULL; *keysp = keys;