X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblunicode%2Fucstr.c;h=9246dc8edbfbe2273c745e9591a37b9beb2a7005;hb=0355abeb1a0ea471970c5313a69ce5b20916b408;hp=bfd33d379cf4dec601301d3d7e1f73b08081cfba;hpb=5ac196e3f9ef99fe2c25c48c80cedbe7cdca7e94;p=openldap

diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c
index bfd33d379c..9246dc8edb 100644
--- a/libraries/liblunicode/ucstr.c
+++ b/libraries/liblunicode/ucstr.c
@@ -1,11 +1,34 @@
-#include "portable.h"
+/* $OpenLDAP$ */
+/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
+ *
+ * Copyright 1998-2007 The OpenLDAP Foundation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
 
-#include <ldap_pvt_uc.h>
+#include "portable.h"
 
+#include <ac/bytes.h>
 #include <ac/ctype.h>
 #include <ac/string.h>
 #include <ac/stdlib.h>
 
+#include <lber_pvt.h>
+
+#include <ldap_utf8.h>
+#include <ldap_pvt_uc.h>
+
+#define	malloc(x)	ber_memalloc_x(x,ctx)
+#define	realloc(x,y)	ber_memrealloc_x(x,y,ctx)
+#define	free(x)		ber_memfree_x(x,ctx)
+
 int ucstrncmp(
 	const ldap_unicode_t *u1,
 	const ldap_unicode_t *u2,
@@ -28,8 +51,8 @@ int ucstrncasecmp(
 	ber_len_t n )
 {
 	for(; 0 < n; ++u1, ++u2, --n ) {
-		ldap_unicode_t uu1 = uctoupper( *u1 );
-		ldap_unicode_t uu2 = uctoupper( *u2 );
+		ldap_unicode_t uu1 = uctolower( *u1 );
+		ldap_unicode_t uu2 = uctolower( *u2 );
 
 		if( uu1 != uu2 ) {
 			return uu1 < uu2 ? -1 : +1;
@@ -60,9 +83,9 @@ ldap_unicode_t * ucstrncasechr(
 	ber_len_t n,
 	ldap_unicode_t c )
 {
-	c = uctoupper( c );
+	c = uctolower( c );
 	for(; 0 < n; ++u, --n ) {
-		if( uctoupper( *u ) == c ) {
+		if( uctolower( *u ) == c ) {
 			return (ldap_unicode_t *) u;
 		}
 	}
@@ -79,52 +102,84 @@ void ucstr2upper(
 	}
 }
 
-char * UTF8normalize(
-	const char *s,
-	char casefold )
+struct berval * UTF8bvnormalize(
+	struct berval *bv,
+	struct berval *newbv,
+	unsigned flags,
+	void *ctx )
 {
 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
-	char *out;
-	unsigned long *ucs, *p, *ucsout;
+	char *out, *outtmp, *s;
+	ac_uint4 *ucs, *p, *ucsout;
 
 	static unsigned char mask[] = {
-                0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
-	if ( s == NULL ) {
+	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
+	unsigned approx = flags & LDAP_UTF8_APPROX;
+
+	if ( bv == NULL ) {
 		return NULL;
 	}
-	
-	len = strlen( s );
+
+	s = bv->bv_val;
+	len = bv->bv_len;
 
 	if ( len == 0 ) {
-		out = (char *) malloc( 1 );
-		*out = '\0';
-		return out;
+		return ber_dupbv_x( newbv, bv, ctx );
 	}
 	
-	outsize = len + 7;
-	out = (char *) malloc( outsize );
-	if ( out == NULL ) {
-		return NULL;
-	}
-
-	outpos = 0;
+	/* TODO: first check whether the string is already in
+	 * normalized form. Note that such a check is almost as
+	 * time consuming as the normalization itself.
+	 */
 
 	/* finish off everything up to character before first non-ascii */
 	if ( LDAP_UTF8_ISASCII( s ) ) {
-		for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
-			out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
-		}
-		if ( i == len ) {
-			out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
-			out[outpos] = '\0';
-			return out;
+		if ( casefold ) {
+			outsize = len + 7;
+			out = (char *) malloc( outsize );
+			if ( out == NULL ) {
+				return NULL;
+			}
+			outpos = 0;
+
+			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
+				out[outpos++] = TOLOWER( s[i-1] );
+			}
+			if ( i == len ) {
+				out[outpos++] = TOLOWER( s[len-1] );
+				out[outpos] = '\0';
+				return ber_str2bv( out, outpos, 0, newbv);
+			}
+		} else {
+			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
+				/* empty */
+			}
+
+			if ( i == len ) {
+				return ber_str2bv_x( s, len, 1, newbv, ctx );
+			}
+				
+			outsize = len + 7;
+			out = (char *) malloc( outsize );
+			if ( out == NULL ) {
+				return NULL;
+			}
+			outpos = i - 1;
+			memcpy(out, s, outpos);
 		}
 	} else {
+		outsize = len + 7;
+		out = (char *) malloc( outsize );
+		if ( out == NULL ) {
+			return NULL;
+		}
+		outpos = 0;
 		i = 0;
 	}
 
-	p = ucs = (long *) malloc( len * sizeof(*ucs) );
+	p = ucs = malloc( len * sizeof(*ucs) );
 	if ( ucs == NULL ) {
 		free(out);
 		return NULL;
@@ -132,7 +187,7 @@ char * UTF8normalize(
 
 	/* convert character before first non-ascii to ucs-4 */
 	if ( i > 0 ) {
-		*p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
+		*p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
 		p++;
 	}
 
@@ -141,7 +196,7 @@ char * UTF8normalize(
 		/* s[i] is non-ascii */
 		/* convert everything up to next ascii to ucs-4 */
 		while ( i < len ) {
-			clen = LDAP_UTF8_CHARLEN( s + i );
+			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
 			if ( clen == 0 ) {
 				free( ucs );
 				free( out );
@@ -164,27 +219,41 @@ char * UTF8normalize(
 				i++;
 			}
 			if ( casefold ) {
-				*p = uctoupper( *p );
+				*p = uctolower( *p );
 			}
 			p++;
-                }
+		}
 		/* normalize ucs of length p - ucs */
-		uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );    
-		ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
-		/* convert ucs to utf-8 and store in out */
-		for ( j = 0; j < ucsoutlen; j++ ) {
-			/* allocate more space if not enough room for
-			   6 bytes and terminator */
-			if ( outsize - outpos < 7 ) {
-				outsize = ucsoutlen - j + outpos + 6;
-				out = (char *) realloc( out, outsize );
-				if ( out == NULL ) {
-					free( ucs );
-					return NULL;
+		uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
+		if ( approx ) {
+			for ( j = 0; j < ucsoutlen; j++ ) {
+				if ( ucsout[j] < 0x80 ) {
+					out[outpos++] = ucsout[j];
+				}
+			}
+		} else {
+			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
+			/* convert ucs to utf-8 and store in out */
+			for ( j = 0; j < ucsoutlen; j++ ) {
+				/* allocate more space if not enough room for
+				   6 bytes and terminator */
+				if ( outsize - outpos < 7 ) {
+					outsize = ucsoutlen - j + outpos + 6;
+					outtmp = (char *) realloc( out, outsize );
+					if ( outtmp == NULL ) {
+						free( out );
+						free( ucs );
+						free( ucsout );
+						return NULL;
+					}
+					out = outtmp;
 				}
+				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
 			}
-			outpos += ldap_ucs4_to_utf8( ucsout[j], &out[outpos] );
 		}
+
+		free( ucsout );
+		ucsout = NULL;
 		
 		if ( i == len ) {
 			break;
@@ -192,82 +261,174 @@ char * UTF8normalize(
 
 		last = i;
 
+		/* Allocate more space in out if necessary */
+		if (len - i >= outsize - outpos) {
+			outsize += 1 + ((len - i) - (outsize - outpos));
+			outtmp = (char *) realloc(out, outsize);
+			if (outtmp == NULL) {
+				free(out);
+				free(ucs);
+				return NULL;
+			}
+			out = outtmp;
+		}
+
 		/* s[i] is ascii */
 		/* finish off everything up to char before next non-ascii */
 		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
-			out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
+			out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
 		}
 		if ( i == len ) {
-			out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
+			out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
 			break;
 		}
 
 		/* convert character before next non-ascii to ucs-4 */
-		*ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
+		*ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
 		p = ucs + 1;
-	}		
+	}
+
 	free( ucs );
 	out[outpos] = '\0';
-	return out;
+	return ber_str2bv( out, outpos, 0, newbv );
 }
 
-/* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */
+/* Compare UTF-8 strings, optionally ignoring case. */
 /* slow, should be optimized */
-int UTF8normcmp(
-	const char *s1,
-	const char *s2,
-	char casefold )
+int UTF8bvnormcmp(
+	struct berval *bv1,
+	struct berval *bv2,
+	unsigned flags,
+	void *ctx )
 {
-	int i, l1, l2, len, ulen, res;
-	unsigned long *ucs, *ucsout1, *ucsout2;
+	int i, l1, l2, len, ulen, res = 0;
+	char *s1, *s2, *done;
+	ac_uint4 *ucs, *ucsout1, *ucsout2;
 
-	l1 = strlen( s1 );
-	l2 = strlen( s2 );
+	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
+	unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
+	unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
 
-	if ( ( l1 == 0 ) || ( l2 == 0 ) ) {
-		if ( l1 == l2 ) {
-			return 0;
+	if (bv1 == NULL) {
+		return bv2 == NULL ? 0 : -1;
+
+	} else if (bv2 == NULL) {
+		return 1;
+	}
+
+	l1 = bv1->bv_len;
+	l2 = bv2->bv_len;
+
+	len = (l1 < l2) ? l1 : l2;
+	if (len == 0) {
+		return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
+	}
+
+	s1 = bv1->bv_val;
+	s2 = bv2->bv_val;
+	done = s1 + len;
+
+	while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
+		if (casefold) {
+			char c1 = TOLOWER(*s1);
+			char c2 = TOLOWER(*s2);
+			res = c1 - c2;
+		} else {
+			res = *s1 - *s2;
+		}			
+		s1++;
+		s2++;
+		if (res) {
+			/* done unless next character in s1 or s2 is non-ascii */
+			if (s1 < done) {
+				if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
+					break;
+				}
+			} else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
+				((len < l2) && !LDAP_UTF8_ISASCII(s2)))
+			{
+				break;
+			}
+			return res;
 		}
-		return *s1 - *s2 > 0 ? 1 : -1;
 	}
-	
-	ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
+
+	/* We have encountered non-ascii or strings equal up to len */
+
+	/* set i to number of iterations */
+	i = s1 - done + len;
+	/* passed through loop at least once? */
+	if (i > 0) {
+		if (!res && (s1 == done) &&
+		    ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
+		    ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
+			/* all ascii and equal up to len */
+			return l1 - l2;
+		}
+
+		/* rewind one char, and do normalized compare from there */
+		s1--;
+		s2--;
+		l1 -= i - 1;
+		l2 -= i - 1;
+	}
+			
+	/* Should first check to see if strings are already in
+	 * proper normalized form.
+	 */
+	ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
 	if ( ucs == NULL ) {
 		return l1 > l2 ? 1 : -1; /* what to do??? */
 	}
 	
 	/*
 	 * XXYYZ: we convert to ucs4 even though -llunicode
-	 * expects ucs2 in an unsigned long
+	 * expects ucs2 in an ac_uint4
 	 */
 	
 	/* convert and normalize 1st string */
 	for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
-                ucs[ulen] = ldap_utf8_to_ucs4( s1 + i );
-                if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
+		ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
+		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 			free( ucs );
-                        return -1; /* what to do??? */
-                }
+			return -1; /* what to do??? */
+		}
 		len = LDAP_UTF8_CHARLEN( s1 + i );
 	}
-	uccanondecomp( ucs, ulen, &ucsout1, &l1 );
-	l1 = uccanoncomp( ucsout1, l1 );
+
+	if ( norm1 ) {
+		ucsout1 = ucs;
+		l1 = ulen;
+		ucs = malloc( l2 * sizeof(*ucs) );
+		if ( ucs == NULL ) {
+			free( ucsout1 );
+			return l1 > l2 ? 1 : -1; /* what to do??? */
+		}
+	} else {
+		uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
+		l1 = uccanoncomp( ucsout1, l1 );
+	}
 
 	/* convert and normalize 2nd string */
 	for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
-                ucs[ulen] = ldap_utf8_to_ucs4( s2 + i );
-                if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
+		ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
+		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 			free( ucsout1 );
 			free( ucs );
-                        return 1; /* what to do??? */
-                }
+			return 1; /* what to do??? */
+		}
 		len = LDAP_UTF8_CHARLEN( s2 + i );
 	}
-	uccanondecomp( ucs, ulen, &ucsout2, &l2 );
-	l2 = uccanoncomp( ucsout2, l2 );
-
-	free( ucs );
 
+	if ( norm2 ) {
+		ucsout2 = ucs;
+		l2 = ulen;
+	} else {
+		uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
+		l2 = uccanoncomp( ucsout2, l2 );
+		free( ucs );
+	}
+	
 	res = casefold
 		? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
 		: ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );