From 94983da942ed75a4e872952419d54f0a2a826405 Mon Sep 17 00:00:00 2001
From: Stig Venaas <venaas@openldap.org>
Date: Tue, 26 Feb 2002 18:38:40 +0000
Subject: [PATCH] Added code for approximate matching in UTF8bvnormalize() and
 used it in approxMatch, approxIndexer and approxFilter in schema_init.c

---
 include/ldap_pvt_uc.h         |  1 +
 libraries/liblunicode/ucstr.c | 38 ++++++++++------
 servers/slapd/schema_init.c   | 86 ++++++++++++-----------------------
 3 files changed, 54 insertions(+), 71 deletions(-)

diff --git a/include/ldap_pvt_uc.h b/include/ldap_pvt_uc.h
index b6840fd57f..ba20d28fab 100644
--- a/include/ldap_pvt_uc.h
+++ b/include/ldap_pvt_uc.h
@@ -141,6 +141,7 @@ LDAP_LUNICODE_F(void) ucstr2upper(
 #define LDAP_UTF8_CASEFOLD	0x1U
 #define LDAP_UTF8_ARG1NFC	0x2U
 #define LDAP_UTF8_ARG2NFC	0x4U
+#define LDAP_UTF8_APPROX	0x8U
 
 LDAP_LUNICODE_F(char *) UTF8normalize(
 	struct berval *,
diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c
index 988417b804..fa45868ebd 100644
--- a/libraries/liblunicode/ucstr.c
+++ b/libraries/liblunicode/ucstr.c
@@ -245,12 +245,14 @@ char * UTF8normalize(
 struct berval * UTF8bvnormalize(
 	struct berval *bv,
 	struct berval *newbv,
-	unsigned casefold )
+	unsigned flags )
 {
 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 	char *out, *s;
 	unsigned long *ucs, *p, *ucsout;
-	
+
+	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
+	unsigned approx = flags & LDAP_UTF8_APPROX;
 	static unsigned char mask[] = {
                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
@@ -361,20 +363,28 @@ struct berval * UTF8bvnormalize(
                 }
 		/* normalize ucs of length p - ucs */
 		uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
-		ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
-		/* convert ucs to utf-8 and store in out */
-		for ( j = 0; j < ucsoutlen; j++ ) {
-			/* allocate more space if not enough room for
-			   6 bytes and terminator */
-			if ( outsize - outpos < 7 ) {
-				outsize = ucsoutlen - j + outpos + 6;
-				out = (char *) realloc( out, outsize );
-				if ( out == NULL ) {
-					free( ucs );
-					return NULL;
+		if ( approx ) {
+			for ( j = 0; j < ucsoutlen; j++ ) {
+				if ( ucsout[j] < 0x80 ) {
+					out[outpos++] = ucsout[j];
 				}
 			}
-			outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+		} else {
+			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
+			/* convert ucs to utf-8 and store in out */
+			for ( j = 0; j < ucsoutlen; j++ ) {
+				/* allocate more space if not enough room for
+				   6 bytes and terminator */
+				if ( outsize - outpos < 7 ) {
+					outsize = ucsoutlen - j + outpos + 6;
+					out = (char *) realloc( out, outsize );
+					if ( out == NULL ) {
+						free( ucs );
+						return NULL;
+					}
+				}
+				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
+			}
 		}
 		
 		if ( i == len ) {
diff --git a/servers/slapd/schema_init.c b/servers/slapd/schema_init.c
index d8d82496b7..40df017554 100644
--- a/servers/slapd/schema_init.c
+++ b/servers/slapd/schema_init.c
@@ -646,30 +646,6 @@ err:
 	return NULL;
 }
 
-/* Strip characters with the 8th bit set */
-static char *
-strip8bitChars(
-	char *in )      
-{
-	char *p = in, *q;
-  
-	if( in == NULL ) {
-		return NULL;
-	}
-	while( *p ) {
-		if( *p & 0x80 ) {
-			q = p;
-			while( *++q & 0x80 ) {
-				/* empty */
-			}
-			p = AC_MEMCPY(p, q, strlen(q) + 1);
-		} else {
-			p++;
-		}
-	}
-	return in;
-}
-
 #ifndef SLAPD_APPROX_OLDSINGLESTRING
 
 #if defined(SLAPD_APPROX_INITIALS)
@@ -689,31 +665,27 @@ approxMatch(
 	struct berval *value,
 	void *assertedValue )
 {
-	char *val, *nval, *assertv, **values, **words, *c;
+	struct berval *nval, *assertv;
+	char *val, **values, **words, *c;
 	int i, count, len, nextchunk=0, nextavail=0;
-	size_t avlen;
 
 	/* Yes, this is necessary */
-	nval = UTF8normalize( value, LDAP_UTF8_NOCASEFOLD );
+	nval = UTF8bvnormalize( value, NULL, LDAP_UTF8_APPROX );
 	if( nval == NULL ) {
 		*matchp = 1;
 		return LDAP_SUCCESS;
 	}
-	strip8bitChars( nval );
 
 	/* Yes, this is necessary */
-	assertv = UTF8normalize( ((struct berval *)assertedValue),
-		LDAP_UTF8_NOCASEFOLD );
+	assertv = UTF8bvnormalize( ((struct berval *)assertedValue), NULL, LDAP_UTF8_APPROX );
 	if( assertv == NULL ) {
-		ch_free( nval );
+		ber_bvfree( nval );
 		*matchp = 1;
 		return LDAP_SUCCESS;
 	}
-	strip8bitChars( assertv );
-	avlen = strlen( assertv );
 
 	/* Isolate how many words there are */
-	for( c=nval,count=1; *c; c++ ) {
+	for ( c = nval->bv_val, count = 1; *c; c++ ) {
 		c = strpbrk( c, SLAPD_APPROX_DELIMITER );
 		if ( c == NULL ) break;
 		*c = '\0';
@@ -723,7 +695,7 @@ approxMatch(
 	/* Get a phonetic copy of each word */
 	words = (char **)ch_malloc( count * sizeof(char *) );
 	values = (char **)ch_malloc( count * sizeof(char *) );
-	for( c=nval,i=0;  i<count;  i++,c+=strlen(c)+1 ) {
+	for ( c = nval->bv_val, i = 0;  i < count; i++, c += strlen(c) + 1 ) {
 		words[i] = c;
 		values[i] = phonetic(c);
 	}
@@ -731,8 +703,8 @@ approxMatch(
 	/* Work through the asserted value's words, to see if at least some
 	   of the words are there, in the same order. */
 	len = 0;
-	while ( (size_t) nextchunk < avlen ) {
-		len = strcspn( assertv + nextchunk, SLAPD_APPROX_DELIMITER);
+	while ( (ber_len_t) nextchunk < assertv->bv_len ) {
+		len = strcspn( assertv->bv_val + nextchunk, SLAPD_APPROX_DELIMITER);
 		if( len == 0 ) {
 			nextchunk++;
 			continue;
@@ -741,7 +713,7 @@ approxMatch(
 		else if( len == 1 ) {
 			/* Single letter words need to at least match one word's initial */
 			for( i=nextavail; i<count; i++ )
-				if( !strncasecmp( assertv+nextchunk, words[i], 1 )) {
+				if( !strncasecmp( assertv->bv_val + nextchunk, words[i], 1 )) {
 					nextavail=i+1;
 					break;
 				}
@@ -749,8 +721,8 @@ approxMatch(
 #endif
 		else {
 			/* Isolate the next word in the asserted value and phonetic it */
-			assertv[nextchunk+len] = '\0';
-			val = phonetic( assertv + nextchunk );
+			assertv->bv_val[nextchunk+len] = '\0';
+			val = phonetic( assertv->bv_val + nextchunk );
 
 			/* See if this phonetic chunk is in the remaining words of *value */
 			for( i=nextavail; i<count; i++ ){
@@ -781,13 +753,13 @@ approxMatch(
 	}
 
 	/* Cleanup allocs */
-	free( assertv );
+	ber_bvfree( assertv );
 	for( i=0; i<count; i++ ) {
 		ch_free( values[i] );
 	}
 	ch_free( values );
 	ch_free( words );
-	ch_free( nval );
+	ber_bvfree( nval );
 
 	return LDAP_SUCCESS;
 }
@@ -802,18 +774,18 @@ approxIndexer(
 	BerVarray values,
 	BerVarray *keysp )
 {
-	char *val, *c;
+	char *c;
 	int i,j, len, wordcount, keycount=0;
-	struct berval *newkeys;
+	struct berval *val, *newkeys;
 	BerVarray keys=NULL;
 
 	for( j=0; values[j].bv_val != NULL; j++ ) {
 		/* Yes, this is necessary */
-		val = UTF8normalize( &values[j], LDAP_UTF8_NOCASEFOLD );
-		strip8bitChars( val );
+		val = UTF8bvnormalize( &values[j], NULL, LDAP_UTF8_APPROX );
+		assert( val != NULL && val->bv_val != NULL );
 
 		/* Isolate how many words there are. There will be a key for each */
-		for( wordcount=0,c=val;	 *c;  c++) {
+		for( wordcount = 0, c = val->bv_val; *c; c++) {
 			len = strcspn(c, SLAPD_APPROX_DELIMITER);
 			if( len >= SLAPD_APPROX_WORDLEN ) wordcount++;
 			c+= len;
@@ -829,7 +801,7 @@ approxIndexer(
 		keys = newkeys;
 
 		/* Get a phonetic copy of each word */
-		for( c=val,i=0;	 i<wordcount;  c+=len+1	 ) {
+		for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) {
 			len = strlen( c );
 			if( len < SLAPD_APPROX_WORDLEN ) continue;
 			ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] );
@@ -837,7 +809,7 @@ approxIndexer(
 			i++;
 		}
 
-		free( val );
+		ber_bvfree( val );
 	}
 	keys[keycount].bv_val = NULL;
 	*keysp = keys;
@@ -855,23 +827,23 @@ approxFilter(
 	void * assertValue,
 	BerVarray *keysp )
 {
-	char *val, *c;
+	char *c;
 	int i, count, len;
+	struct berval *val;
 	BerVarray keys;
 
 	/* Yes, this is necessary */
-	val = UTF8normalize( ((struct berval *)assertValue),
-		LDAP_UTF8_NOCASEFOLD );
-	if( val == NULL ) {
+	val = UTF8bvnormalize( ((struct berval *)assertValue), NULL, LDAP_UTF8_APPROX );
+	if( val == NULL || val->bv_val == NULL ) {
 		keys = (struct berval *)ch_malloc( sizeof(struct berval) );
 		keys[0].bv_val = NULL;
 		*keysp = keys;
+		ber_bvfree( val );
 		return LDAP_SUCCESS;
 	}
-	strip8bitChars( val );
 
 	/* Isolate how many words there are. There will be a key for each */
-	for( count=0,c=val;  *c;  c++) {
+	for( count = 0,c = val->bv_val; *c; c++) {
 		len = strcspn(c, SLAPD_APPROX_DELIMITER);
 		if( len >= SLAPD_APPROX_WORDLEN ) count++;
 		c+= len;
@@ -883,14 +855,14 @@ approxFilter(
 	keys = (struct berval *)ch_malloc( (count + 1) * sizeof(struct berval) );
 
 	/* Get a phonetic copy of each word */
-	for( c=val,i=0;	 i<count; c+=len+1 ) {
+	for( c = val->bv_val, i = 0; i < count; c += len + 1 ) {
 		len = strlen(c);
 		if( len < SLAPD_APPROX_WORDLEN ) continue;
 		ber_str2bv( phonetic( c ), 0, 0, &keys[i] );
 		i++;
 	}
 
-	free( val );
+	ber_bvfree( val );
 
 	keys[count].bv_val = NULL;
 	*keysp = keys;
-- 
2.39.5