Added code for approximate matching in UTF8bvnormalize() (selected by the new LDAP_UTF8_APPROX flag) and changed the approximate-matching routines in schema_init.c to use it

author Stig Venaas <venaas@openldap.org>

Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)

committer Stig Venaas <venaas@openldap.org>

Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
author Stig Venaas <venaas@openldap.org>
Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
committer Stig Venaas <venaas@openldap.org>
Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
diff --git a/include/ldap_pvt_uc.h b/include/ldap_pvt_uc.h

index b6840fd57f20009916c88c168eca1b66d9d6da90..ba20d28fab07a8f113429c137a4d16e277ebe3a1 100644 (file)
--- a/include/ldap_pvt_uc.h
+++ b/include/ldap_pvt_uc.h
@@ -141,6 +141,7 @@ LDAP_LUNICODE_F(void) ucstr2upper(
  #define LDAP_UTF8_CASEFOLD     0x1U
  #define LDAP_UTF8_ARG1NFC      0x2U
  #define LDAP_UTF8_ARG2NFC      0x4U
+#define LDAP_UTF8_APPROX       0x8U
  
  LDAP_LUNICODE_F(char *) UTF8normalize(
         struct berval *,
diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c

index 988417b8043fe4174b3ba228a68404f1cd03638b..fa45868ebda6fed3000f890ce6fa6d709257b773 100644 (file)
--- a/libraries/liblunicode/ucstr.c
+++ b/libraries/liblunicode/ucstr.c
@@ -245,12 +245,14 @@ char * UTF8normalize(
  struct berval * UTF8bvnormalize(
         struct berval *bv,
         struct berval *newbv,
-       unsigned casefold )
+       unsigned flags )
  {
         int i, j, len, clen, outpos, ucsoutlen, outsize, last;
         char *out, *s;
         unsigned long *ucs, *p, *ucsout;
-       
+
+       unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
+       unsigned approx = flags & LDAP_UTF8_APPROX;
         static unsigned char mask[] = {
                  0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  
@@ -361,20 +363,28 @@ struct berval * UTF8bvnormalize(
                  }
                 /* normalize ucs of length p - ucs */
                 uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
-               ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
-               /* convert ucs to utf-8 and store in out */
-               for ( j = 0; j < ucsoutlen; j++ ) {
-                       /* allocate more space if not enough room for
-                          6 bytes and terminator */
-                       if ( outsize - outpos < 7 ) {
-                               outsize = ucsoutlen - j + outpos + 6;
-                               out = (char *) realloc( out, outsize );
-                               if ( out == NULL ) {
-                                       free( ucs );
-                                       return NULL;
+               if ( approx ) {
+                       for ( j = 0; j < ucsoutlen; j++ ) {
+                               if ( ucsout[j] < 0x80 ) {
+                                       out[outpos++] = ucsout[j];
                                 }
                         }
-                       outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+               } else {
+                       ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
+                       /* convert ucs to utf-8 and store in out */
+                       for ( j = 0; j < ucsoutlen; j++ ) {
+                               /* allocate more space if not enough room for
+                                  6 bytes and terminator */
+                               if ( outsize - outpos < 7 ) {
+                                       outsize = ucsoutlen - j + outpos + 6;
+                                       out = (char *) realloc( out, outsize );
+                                       if ( out == NULL ) {
+                                               free( ucs );
+                                               return NULL;
+                                       }
+                               }
+                               outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+                       }
                 }
                 
                 if ( i == len ) {
diff --git a/servers/slapd/schema_init.c b/servers/slapd/schema_init.c

index d8d82496b7e3d6651f36025ffe138842a18f1ec4..40df0175540e4c40b881e0283388f41fc2aa64d2 100644 (file)
--- a/servers/slapd/schema_init.c
+++ b/servers/slapd/schema_init.c
@@ -646,30 +646,6 @@ err:
         return NULL;
  }
  
-/* Strip characters with the 8th bit set */
-static char *
-strip8bitChars(
-       char *in )      
-{
-       char *p = in, *q;
-  
-       if( in == NULL ) {
-               return NULL;
-       }
-       while( *p ) {
-               if( *p & 0x80 ) {
-                       q = p;
-                       while( *++q & 0x80 ) {
-                               /* empty */
-                       }
-                       p = AC_MEMCPY(p, q, strlen(q) + 1);
-               } else {
-                       p++;
-               }
-       }
-       return in;
-}
-
  #ifndef SLAPD_APPROX_OLDSINGLESTRING
  
  #if defined(SLAPD_APPROX_INITIALS)
@@ -689,31 +665,27 @@ approxMatch(
         struct berval *value,
         void *assertedValue )
  {
-       char *val, *nval, *assertv, **values, **words, *c;
+       struct berval *nval, *assertv;
+       char *val, **values, **words, *c;
         int i, count, len, nextchunk=0, nextavail=0;
-       size_t avlen;
  
         /* Yes, this is necessary */
-       nval = UTF8normalize( value, LDAP_UTF8_NOCASEFOLD );
+       nval = UTF8bvnormalize( value, NULL, LDAP_UTF8_APPROX );
         if( nval == NULL ) {
                 *matchp = 1;
                 return LDAP_SUCCESS;
         }
-       strip8bitChars( nval );
  
         /* Yes, this is necessary */
-       assertv = UTF8normalize( ((struct berval *)assertedValue),
-               LDAP_UTF8_NOCASEFOLD );
+       assertv = UTF8bvnormalize( ((struct berval *)assertedValue), NULL, LDAP_UTF8_APPROX );
         if( assertv == NULL ) {
-               ch_free( nval );
+               ber_bvfree( nval );
                 *matchp = 1;
                 return LDAP_SUCCESS;
         }
-       strip8bitChars( assertv );
-       avlen = strlen( assertv );
  
         /* Isolate how many words there are */
-       for( c=nval,count=1; *c; c++ ) {
+       for ( c = nval->bv_val, count = 1; *c; c++ ) {
                 c = strpbrk( c, SLAPD_APPROX_DELIMITER );
                 if ( c == NULL ) break;
                 *c = '\0';
@@ -723,7 +695,7 @@ approxMatch(
         /* Get a phonetic copy of each word */
         words = (char **)ch_malloc( count * sizeof(char *) );
         values = (char **)ch_malloc( count * sizeof(char *) );
-       for( c=nval,i=0;  i<count;  i++,c+=strlen(c)+1 ) {
+       for ( c = nval->bv_val, i = 0;  i < count; i++, c += strlen(c) + 1 ) {
                 words[i] = c;
                 values[i] = phonetic(c);
         }
@@ -731,8 +703,8 @@ approxMatch(
         /* Work through the asserted value's words, to see if at least some
            of the words are there, in the same order. */
         len = 0;
-       while ( (size_t) nextchunk < avlen ) {
-               len = strcspn( assertv + nextchunk, SLAPD_APPROX_DELIMITER);
+       while ( (ber_len_t) nextchunk < assertv->bv_len ) {
+               len = strcspn( assertv->bv_val + nextchunk, SLAPD_APPROX_DELIMITER);
                 if( len == 0 ) {
                         nextchunk++;
                         continue;
@@ -741,7 +713,7 @@ approxMatch(
                 else if( len == 1 ) {
                         /* Single letter words need to at least match one word's initial */
                         for( i=nextavail; i<count; i++ )
-                               if( !strncasecmp( assertv+nextchunk, words[i], 1 )) {
+                               if( !strncasecmp( assertv->bv_val + nextchunk, words[i], 1 )) {
                                         nextavail=i+1;
                                         break;
                                 }
@@ -749,8 +721,8 @@ approxMatch(
  #endif
                 else {
                         /* Isolate the next word in the asserted value and phonetic it */
-                       assertv[nextchunk+len] = '\0';
-                       val = phonetic( assertv + nextchunk );
+                       assertv->bv_val[nextchunk+len] = '\0';
+                       val = phonetic( assertv->bv_val + nextchunk );
  
                         /* See if this phonetic chunk is in the remaining words of *value */
                         for( i=nextavail; i<count; i++ ){
@@ -781,13 +753,13 @@ approxMatch(
         }
  
         /* Cleanup allocs */
-       free( assertv );
+       ber_bvfree( assertv );
         for( i=0; i<count; i++ ) {
                 ch_free( values[i] );
         }
         ch_free( values );
         ch_free( words );
-       ch_free( nval );
+       ber_bvfree( nval );
  
         return LDAP_SUCCESS;
  }
@@ -802,18 +774,18 @@ approxIndexer(
         BerVarray values,
         BerVarray *keysp )
  {
-       char *val, *c;
+       char *c;
         int i,j, len, wordcount, keycount=0;
-       struct berval *newkeys;
+       struct berval *val, *newkeys;
         BerVarray keys=NULL;
  
         for( j=0; values[j].bv_val != NULL; j++ ) {
                 /* Yes, this is necessary */
-               val = UTF8normalize( &values[j], LDAP_UTF8_NOCASEFOLD );
-               strip8bitChars( val );
+               val = UTF8bvnormalize( &values[j], NULL, LDAP_UTF8_APPROX );
+               assert( val != NULL && val->bv_val != NULL );
  
                 /* Isolate how many words there are. There will be a key for each */
-               for( wordcount=0,c=val;  *c;  c++) {
+               for( wordcount = 0, c = val->bv_val; *c; c++) {
                         len = strcspn(c, SLAPD_APPROX_DELIMITER);
                         if( len >= SLAPD_APPROX_WORDLEN ) wordcount++;
                         c+= len;
@@ -829,7 +801,7 @@ approxIndexer(
                 keys = newkeys;
  
                 /* Get a phonetic copy of each word */
-               for( c=val,i=0;  i<wordcount;  c+=len+1  ) {
+               for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) {
                         len = strlen( c );
                         if( len < SLAPD_APPROX_WORDLEN ) continue;
                         ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] );
@@ -837,7 +809,7 @@ approxIndexer(
                         i++;
                 }
  
-               free( val );
+               ber_bvfree( val );
         }
         keys[keycount].bv_val = NULL;
         *keysp = keys;
@@ -855,23 +827,23 @@ approxFilter(
         void * assertValue,
         BerVarray *keysp )
  {
-       char *val, *c;
+       char *c;
         int i, count, len;
+       struct berval *val;
         BerVarray keys;
  
         /* Yes, this is necessary */
-       val = UTF8normalize( ((struct berval *)assertValue),
-               LDAP_UTF8_NOCASEFOLD );
-       if( val == NULL ) {
+       val = UTF8bvnormalize( ((struct berval *)assertValue), NULL, LDAP_UTF8_APPROX );
+       if( val == NULL || val->bv_val == NULL ) {
                 keys = (struct berval *)ch_malloc( sizeof(struct berval) );
                 keys[0].bv_val = NULL;
                 *keysp = keys;
+               ber_bvfree( val );
                 return LDAP_SUCCESS;
         }
-       strip8bitChars( val );
  
         /* Isolate how many words there are. There will be a key for each */
-       for( count=0,c=val;  *c;  c++) {
+       for( count = 0,c = val->bv_val; *c; c++) {
                 len = strcspn(c, SLAPD_APPROX_DELIMITER);
                 if( len >= SLAPD_APPROX_WORDLEN ) count++;
                 c+= len;
@@ -883,14 +855,14 @@ approxFilter(
         keys = (struct berval *)ch_malloc( (count + 1) * sizeof(struct berval) );
  
         /* Get a phonetic copy of each word */
-       for( c=val,i=0;  i<count; c+=len+1 ) {
+       for( c = val->bv_val, i = 0; i < count; c += len + 1 ) {
                 len = strlen(c);
                 if( len < SLAPD_APPROX_WORDLEN ) continue;
                 ber_str2bv( phonetic( c ), 0, 0, &keys[i] );
                 i++;
         }
  
-       free( val );
+       ber_bvfree( val );
  
         keys[count].bv_val = NULL;
         *keysp = keys;
author	Stig Venaas <venaas@openldap.org>
	Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
committer	Stig Venaas <venaas@openldap.org>
	Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
include/ldap_pvt_uc.h		patch \| blob \| history
libraries/liblunicode/ucstr.c		patch \| blob \| history
servers/slapd/schema_init.c		patch \| blob \| history