]> git.sur5r.net Git - openldap/commitdiff
Added code for approximate matching in UTF8bvnormalize() and changed to use
authorStig Venaas <venaas@openldap.org>
Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
committerStig Venaas <venaas@openldap.org>
Tue, 26 Feb 2002 18:38:40 +0000 (18:38 +0000)
this in approxMatch etc in schema_init.c

include/ldap_pvt_uc.h
libraries/liblunicode/ucstr.c
servers/slapd/schema_init.c

index b6840fd57f20009916c88c168eca1b66d9d6da90..ba20d28fab07a8f113429c137a4d16e277ebe3a1 100644 (file)
@@ -141,6 +141,7 @@ LDAP_LUNICODE_F(void) ucstr2upper(
 #define LDAP_UTF8_CASEFOLD     0x1U
 #define LDAP_UTF8_ARG1NFC      0x2U
 #define LDAP_UTF8_ARG2NFC      0x4U
+#define LDAP_UTF8_APPROX       0x8U
 
 LDAP_LUNICODE_F(char *) UTF8normalize(
        struct berval *,
index 988417b8043fe4174b3ba228a68404f1cd03638b..fa45868ebda6fed3000f890ce6fa6d709257b773 100644 (file)
@@ -245,12 +245,14 @@ char * UTF8normalize(
 struct berval * UTF8bvnormalize(
        struct berval *bv,
        struct berval *newbv,
-       unsigned casefold )
+       unsigned flags )
 {
        int i, j, len, clen, outpos, ucsoutlen, outsize, last;
        char *out, *s;
        unsigned long *ucs, *p, *ucsout;
-       
+
+       unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
+       unsigned approx = flags & LDAP_UTF8_APPROX;
        static unsigned char mask[] = {
                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
@@ -361,20 +363,28 @@ struct berval * UTF8bvnormalize(
                 }
                /* normalize ucs of length p - ucs */
                uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
-               ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
-               /* convert ucs to utf-8 and store in out */
-               for ( j = 0; j < ucsoutlen; j++ ) {
-                       /* allocate more space if not enough room for
-                          6 bytes and terminator */
-                       if ( outsize - outpos < 7 ) {
-                               outsize = ucsoutlen - j + outpos + 6;
-                               out = (char *) realloc( out, outsize );
-                               if ( out == NULL ) {
-                                       free( ucs );
-                                       return NULL;
+               if ( approx ) {
+                       for ( j = 0; j < ucsoutlen; j++ ) {
+                               if ( ucsout[j] < 0x80 ) {
+                                       out[outpos++] = ucsout[j];
                                }
                        }
-                       outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+               } else {
+                       ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
+                       /* convert ucs to utf-8 and store in out */
+                       for ( j = 0; j < ucsoutlen; j++ ) {
+                               /* allocate more space if not enough room for
+                                  6 bytes and terminator */
+                               if ( outsize - outpos < 7 ) {
+                                       outsize = ucsoutlen - j + outpos + 6;
+                                       out = (char *) realloc( out, outsize );
+                                       if ( out == NULL ) {
+                                               free( ucs );
+                                               return NULL;
+                                       }
+                               }
+                               outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+                       }
                }
                
                if ( i == len ) {
index d8d82496b7e3d6651f36025ffe138842a18f1ec4..40df0175540e4c40b881e0283388f41fc2aa64d2 100644 (file)
@@ -646,30 +646,6 @@ err:
        return NULL;
 }
 
-/* Strip characters with the 8th bit set */
-static char *
-strip8bitChars(
-       char *in )      
-{
-       char *p = in, *q;
-  
-       if( in == NULL ) {
-               return NULL;
-       }
-       while( *p ) {
-               if( *p & 0x80 ) {
-                       q = p;
-                       while( *++q & 0x80 ) {
-                               /* empty */
-                       }
-                       p = AC_MEMCPY(p, q, strlen(q) + 1);
-               } else {
-                       p++;
-               }
-       }
-       return in;
-}
-
 #ifndef SLAPD_APPROX_OLDSINGLESTRING
 
 #if defined(SLAPD_APPROX_INITIALS)
@@ -689,31 +665,27 @@ approxMatch(
        struct berval *value,
        void *assertedValue )
 {
-       char *val, *nval, *assertv, **values, **words, *c;
+       struct berval *nval, *assertv;
+       char *val, **values, **words, *c;
        int i, count, len, nextchunk=0, nextavail=0;
-       size_t avlen;
 
        /* Yes, this is necessary */
-       nval = UTF8normalize( value, LDAP_UTF8_NOCASEFOLD );
+       nval = UTF8bvnormalize( value, NULL, LDAP_UTF8_APPROX );
        if( nval == NULL ) {
                *matchp = 1;
                return LDAP_SUCCESS;
        }
-       strip8bitChars( nval );
 
        /* Yes, this is necessary */
-       assertv = UTF8normalize( ((struct berval *)assertedValue),
-               LDAP_UTF8_NOCASEFOLD );
+       assertv = UTF8bvnormalize( ((struct berval *)assertedValue), NULL, LDAP_UTF8_APPROX );
        if( assertv == NULL ) {
-               ch_free( nval );
+               ber_bvfree( nval );
                *matchp = 1;
                return LDAP_SUCCESS;
        }
-       strip8bitChars( assertv );
-       avlen = strlen( assertv );
 
        /* Isolate how many words there are */
-       for( c=nval,count=1; *c; c++ ) {
+       for ( c = nval->bv_val, count = 1; *c; c++ ) {
                c = strpbrk( c, SLAPD_APPROX_DELIMITER );
                if ( c == NULL ) break;
                *c = '\0';
@@ -723,7 +695,7 @@ approxMatch(
        /* Get a phonetic copy of each word */
        words = (char **)ch_malloc( count * sizeof(char *) );
        values = (char **)ch_malloc( count * sizeof(char *) );
-       for( c=nval,i=0;  i<count;  i++,c+=strlen(c)+1 ) {
+       for ( c = nval->bv_val, i = 0;  i < count; i++, c += strlen(c) + 1 ) {
                words[i] = c;
                values[i] = phonetic(c);
        }
@@ -731,8 +703,8 @@ approxMatch(
        /* Work through the asserted value's words, to see if at least some
           of the words are there, in the same order. */
        len = 0;
-       while ( (size_t) nextchunk < avlen ) {
-               len = strcspn( assertv + nextchunk, SLAPD_APPROX_DELIMITER);
+       while ( (ber_len_t) nextchunk < assertv->bv_len ) {
+               len = strcspn( assertv->bv_val + nextchunk, SLAPD_APPROX_DELIMITER);
                if( len == 0 ) {
                        nextchunk++;
                        continue;
@@ -741,7 +713,7 @@ approxMatch(
                else if( len == 1 ) {
                        /* Single letter words need to at least match one word's initial */
                        for( i=nextavail; i<count; i++ )
-                               if( !strncasecmp( assertv+nextchunk, words[i], 1 )) {
+                               if( !strncasecmp( assertv->bv_val + nextchunk, words[i], 1 )) {
                                        nextavail=i+1;
                                        break;
                                }
@@ -749,8 +721,8 @@ approxMatch(
 #endif
                else {
                        /* Isolate the next word in the asserted value and phonetic it */
-                       assertv[nextchunk+len] = '\0';
-                       val = phonetic( assertv + nextchunk );
+                       assertv->bv_val[nextchunk+len] = '\0';
+                       val = phonetic( assertv->bv_val + nextchunk );
 
                        /* See if this phonetic chunk is in the remaining words of *value */
                        for( i=nextavail; i<count; i++ ){
@@ -781,13 +753,13 @@ approxMatch(
        }
 
        /* Cleanup allocs */
-       free( assertv );
+       ber_bvfree( assertv );
        for( i=0; i<count; i++ ) {
                ch_free( values[i] );
        }
        ch_free( values );
        ch_free( words );
-       ch_free( nval );
+       ber_bvfree( nval );
 
        return LDAP_SUCCESS;
 }
@@ -802,18 +774,18 @@ approxIndexer(
        BerVarray values,
        BerVarray *keysp )
 {
-       char *val, *c;
+       char *c;
        int i,j, len, wordcount, keycount=0;
-       struct berval *newkeys;
+       struct berval *val, *newkeys;
        BerVarray keys=NULL;
 
        for( j=0; values[j].bv_val != NULL; j++ ) {
                /* Yes, this is necessary */
-               val = UTF8normalize( &values[j], LDAP_UTF8_NOCASEFOLD );
-               strip8bitChars( val );
+               val = UTF8bvnormalize( &values[j], NULL, LDAP_UTF8_APPROX );
+               assert( val != NULL && val->bv_val != NULL );
 
                /* Isolate how many words there are. There will be a key for each */
-               for( wordcount=0,c=val;  *c;  c++) {
+               for( wordcount = 0, c = val->bv_val; *c; c++) {
                        len = strcspn(c, SLAPD_APPROX_DELIMITER);
                        if( len >= SLAPD_APPROX_WORDLEN ) wordcount++;
                        c+= len;
@@ -829,7 +801,7 @@ approxIndexer(
                keys = newkeys;
 
                /* Get a phonetic copy of each word */
-               for( c=val,i=0;  i<wordcount;  c+=len+1  ) {
+               for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) {
                        len = strlen( c );
                        if( len < SLAPD_APPROX_WORDLEN ) continue;
                        ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] );
@@ -837,7 +809,7 @@ approxIndexer(
                        i++;
                }
 
-               free( val );
+               ber_bvfree( val );
        }
        keys[keycount].bv_val = NULL;
        *keysp = keys;
@@ -855,23 +827,23 @@ approxFilter(
        void * assertValue,
        BerVarray *keysp )
 {
-       char *val, *c;
+       char *c;
        int i, count, len;
+       struct berval *val;
        BerVarray keys;
 
        /* Yes, this is necessary */
-       val = UTF8normalize( ((struct berval *)assertValue),
-               LDAP_UTF8_NOCASEFOLD );
-       if( val == NULL ) {
+       val = UTF8bvnormalize( ((struct berval *)assertValue), NULL, LDAP_UTF8_APPROX );
+       if( val == NULL || val->bv_val == NULL ) {
                keys = (struct berval *)ch_malloc( sizeof(struct berval) );
                keys[0].bv_val = NULL;
                *keysp = keys;
+               ber_bvfree( val );
                return LDAP_SUCCESS;
        }
-       strip8bitChars( val );
 
        /* Isolate how many words there are. There will be a key for each */
-       for( count=0,c=val;  *c;  c++) {
+       for( count = 0,c = val->bv_val; *c; c++) {
                len = strcspn(c, SLAPD_APPROX_DELIMITER);
                if( len >= SLAPD_APPROX_WORDLEN ) count++;
                c+= len;
@@ -883,14 +855,14 @@ approxFilter(
        keys = (struct berval *)ch_malloc( (count + 1) * sizeof(struct berval) );
 
        /* Get a phonetic copy of each word */
-       for( c=val,i=0;  i<count; c+=len+1 ) {
+       for( c = val->bv_val, i = 0; i < count; c += len + 1 ) {
                len = strlen(c);
                if( len < SLAPD_APPROX_WORDLEN ) continue;
                ber_str2bv( phonetic( c ), 0, 0, &keys[i] );
                i++;
        }
 
-       free( val );
+       ber_bvfree( val );
 
        keys[count].bv_val = NULL;
        *keysp = keys;