X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblunicode%2Fucstr.c;h=9246dc8edbfbe2273c745e9591a37b9beb2a7005;hb=0355abeb1a0ea471970c5313a69ce5b20916b408;hp=d0d76214396d62f9a500776ac8e557d4470dc21d;hpb=217103b1381666a28efef58ab87b06934c0b66bc;p=openldap diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c index d0d7621439..9246dc8edb 100644 --- a/libraries/liblunicode/ucstr.c +++ b/libraries/liblunicode/ucstr.c @@ -1,24 +1,33 @@ -/* - * Copyright 2000-2002 The OpenLDAP Foundation - * COPYING RESTRICTIONS APPLY. See COPYRIGHT File in top level directory - * of this package for details. +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2007 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . */ #include "portable.h" +#include #include #include #include -#include - -#define malloc(x) ber_memalloc(x) -#define realloc(x,y) ber_memrealloc(x,y) -#define free(x) ber_memfree(x) +#include #include #include +#define malloc(x) ber_memalloc_x(x,ctx) +#define realloc(x,y) ber_memrealloc_x(x,y,ctx) +#define free(x) ber_memfree_x(x,ctx) int ucstrncmp( const ldap_unicode_t *u1, @@ -42,8 +51,8 @@ int ucstrncasecmp( ber_len_t n ) { for(; 0 < n; ++u1, ++u2, --n ) { - ldap_unicode_t uu1 = uctoupper( *u1 ); - ldap_unicode_t uu2 = uctoupper( *u2 ); + ldap_unicode_t uu1 = uctolower( *u1 ); + ldap_unicode_t uu2 = uctolower( *u2 ); if( uu1 != uu2 ) { return uu1 < uu2 ? -1 : +1; @@ -74,9 +83,9 @@ ldap_unicode_t * ucstrncasechr( ber_len_t n, ldap_unicode_t c ) { - c = uctoupper( c ); + c = uctolower( c ); for(; 0 < n; ++u, --n ) { - if( uctoupper( *u ) == c ) { + if( uctolower( *u ) == c ) { return (ldap_unicode_t *) u; } } @@ -93,167 +102,21 @@ void ucstr2upper( } } -char * UTF8normalize( - struct berval *bv, - unsigned casefold ) -{ - int i, j, len, clen, outpos, ucsoutlen, outsize, last; - char *out, *s; - unsigned long *ucs, *p, *ucsout; - - static unsigned char mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; - - if ( bv == NULL ) { - return NULL; - } - - s = bv->bv_val; - len = bv->bv_len; - - /* See if the string is pure ASCII so we can shortcut */ - for ( i=0; ibv_val, len ); - } else { - for ( j=0; j 0 ) { - *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; - p++; - } - - /* s[i] is now first non-ascii character */ - for (;;) { - /* s[i] is non-ascii */ - /* convert everything up to next ascii to ucs-4 */ - while ( i < len ) { - clen = LDAP_UTF8_CHARLEN2( s + i, clen ); - if ( clen == 0 ) { - free( ucs ); - free( out ); - return NULL; - } - if ( clen == 1 ) { - /* ascii */ - break; - } - *p = s[i] & mask[clen]; - i++; - for( j = 1; j < clen; j++ ) { - if ( (s[i] & 0xc0) != 0x80 ) { - free( ucs ); - free( out ); - return NULL; - } - *p <<= 6; - *p |= s[i] & 0x3f; - i++; - } - if ( casefold ) { - *p = uctoupper( *p ); - } - p++; - } - /* normalize ucs of length p - ucs */ - uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); - ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); - /* convert ucs to utf-8 and store in out */ - for ( j = 0; j < ucsoutlen; j++ ) { - /* allocate more space if not enough room for - 6 bytes and terminator */ - if ( outsize - outpos < 7 ) { - outsize = ucsoutlen - j + outpos + 6; - out = (char *) realloc( out, outsize ); - if ( out == NULL ) { - free( ucs ); - return NULL; - } - } - outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); - } - - if ( i == len ) { - break; - } - - last = i; - - /* s[i] is ascii */ - /* finish off everything up to char before next non-ascii */ - for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { - out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1]; - } - if ( i == len ) { - out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1]; - break; - } - - /* convert character before next non-ascii to ucs-4 */ - *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; - p = ucs + 1; - } - free( ucs ); - out[outpos] = '\0'; - return out; -} - struct berval * UTF8bvnormalize( struct berval *bv, struct berval *newbv, - unsigned casefold ) + unsigned flags, + void *ctx ) { int i, j, len, clen, outpos, ucsoutlen, outsize, last; - char *out, *s; - unsigned long *ucs, *p, *ucsout; - + char *out, *outtmp, *s; + ac_uint4 *ucs, *p, *ucsout; + static unsigned char mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + + unsigned casefold = flags & LDAP_UTF8_CASEFOLD; + unsigned approx = flags & LDAP_UTF8_APPROX; if ( bv == NULL ) { return NULL; @@ -263,12 +126,12 @@ struct berval * UTF8bvnormalize( len = bv->bv_len; if ( len == 0 ) { - return ber_dupbv( newbv, bv ); + return ber_dupbv_x( newbv, bv, ctx ); } - /* FIXME: Should first check to see if string is already in - * proper normalized form. This is almost as time consuming - * as the normalization though. + /* Should first check to see if string is already in proper + * normalized form. This is almost as time consuming as + * the normalization though. */ /* finish off everything up to character before first non-ascii */ @@ -282,10 +145,10 @@ struct berval * UTF8bvnormalize( outpos = 0; for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { - out[outpos++] = TOUPPER( s[i-1] ); + out[outpos++] = TOLOWER( s[i-1] ); } if ( i == len ) { - out[outpos++] = TOUPPER( s[len - 1] ); + out[outpos++] = TOLOWER( s[len-1] ); out[outpos] = '\0'; return ber_str2bv( out, outpos, 0, newbv); } @@ -295,7 +158,7 @@ struct berval * UTF8bvnormalize( } if ( i == len ) { - return ber_str2bv( s, len, 1, newbv ); + return ber_str2bv_x( s, len, 1, newbv, ctx ); } outsize = len + 7; @@ -316,7 +179,7 @@ struct berval * UTF8bvnormalize( i = 0; } - p = ucs = (long *) malloc( len * sizeof(*ucs) ); + p = ucs = malloc( len * sizeof(*ucs) ); if ( ucs == NULL ) { free(out); return NULL; @@ -324,7 +187,7 @@ struct berval * UTF8bvnormalize( /* convert character before first non-ascii to ucs-4 */ if ( i > 0 ) { - *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; + *p = casefold ? TOLOWER( s[i-1] ) : s[i-1]; p++; } @@ -356,27 +219,41 @@ struct berval * UTF8bvnormalize( i++; } if ( casefold ) { - *p = uctoupper( *p ); + *p = uctolower( *p ); } p++; - } + } /* normalize ucs of length p - ucs */ - uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); - ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); - /* convert ucs to utf-8 and store in out */ - for ( j = 0; j < ucsoutlen; j++ ) { - /* allocate more space if not enough room for - 6 bytes and terminator */ - if ( outsize - outpos < 7 ) { - outsize = ucsoutlen - j + outpos + 6; - out = (char *) realloc( out, outsize ); - if ( out == NULL ) { - free( ucs ); - return NULL; + uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx ); + if ( approx ) { + for ( j = 0; j < ucsoutlen; j++ ) { + if ( ucsout[j] < 0x80 ) { + out[outpos++] = ucsout[j]; } } - outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); + } else { + ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); + /* convert ucs to utf-8 and store in out */ + for ( j = 0; j < ucsoutlen; j++ ) { + /* allocate more space if not enough room for + 6 bytes and terminator */ + if ( outsize - outpos < 7 ) { + outsize = ucsoutlen - j + outpos + 6; + outtmp = (char *) realloc( out, outsize ); + if ( outtmp == NULL ) { + free( out ); + free( ucs ); + free( ucsout ); + return NULL; + } + out = outtmp; + } + outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); + } } + + free( ucsout ); + ucsout = NULL; if ( i == len ) { break; @@ -384,106 +261,174 @@ struct berval * UTF8bvnormalize( last = i; + /* Allocate more space in out if necessary */ + if (len - i >= outsize - outpos) { + outsize += 1 + ((len - i) - (outsize - outpos)); + outtmp = (char *) realloc(out, outsize); + if (outtmp == NULL) { + free(out); + free(ucs); + return NULL; + } + out = outtmp; + } + /* s[i] is ascii */ /* finish off everything up to char before next non-ascii */ for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { - out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1]; + out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1]; } if ( i == len ) { - out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1]; + out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1]; break; } /* convert character before next non-ascii to ucs-4 */ - *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; + *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1]; p = ucs + 1; - } + } + free( ucs ); out[outpos] = '\0'; return ber_str2bv( out, outpos, 0, newbv ); } -/* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */ +/* compare UTF8-strings, optionally ignore casing */ /* slow, should be optimized */ -int UTF8normcmp( - const char *s1, - const char *s2, - unsigned casefold ) +int UTF8bvnormcmp( + struct berval *bv1, + struct berval *bv2, + unsigned flags, + void *ctx ) { - int i, l1, l2, len, ulen, res; - unsigned long *ucs, *ucsout1, *ucsout2; + int i, l1, l2, len, ulen, res = 0; + char *s1, *s2, *done; + ac_uint4 *ucs, *ucsout1, *ucsout2; - l1 = strlen( s1 ); - l2 = strlen( s2 ); + unsigned casefold = flags & LDAP_UTF8_CASEFOLD; + unsigned norm1 = flags & LDAP_UTF8_ARG1NFC; + unsigned norm2 = flags & LDAP_UTF8_ARG2NFC; - if ( ( l1 == 0 ) || ( l2 == 0 ) ) { - if ( l1 == l2 ) { - return 0; - } - return *s1 - *s2 > 0 ? 1 : -1; + if (bv1 == NULL) { + return bv2 == NULL ? 0 : -1; + + } else if (bv2 == NULL) { + return 1; } - - /* See if we can get away with a straight ASCII compare */ + + l1 = bv1->bv_len; + l2 = bv2->bv_len; + len = (l1 < l2) ? l1 : l2; - for ( i = 0; ibv_val; + s2 = bv2->bv_val; + done = s1 + len; + + while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) { if (casefold) { - char c1 = TOUPPER(s1[i]); - char c2 = TOUPPER(s2[i]); - res = c1 - c2; + char c1 = TOLOWER(*s1); + char c2 = TOLOWER(*s2); + res = c1 - c2; } else { - res = s1[i] - s2[i]; - } - if (res) + res = *s1 - *s2; + } + s1++; + s2++; + if (res) { + /* done unless next character in s1 or s2 is non-ascii */ + if (s1 < done) { + if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) { + break; + } + } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) || + ((len < l2) && !LDAP_UTF8_ISASCII(s2))) + { + break; + } return res; + } } - /* Strings were ASCII, equal up to minlen */ - if (i == len) - return l1 - l2; - - /* FIXME: Should first check to see if strings are already in + + /* We have encountered non-ascii or strings equal up to len */ + + /* set i to number of iterations */ + i = s1 - done + len; + /* passed through loop at least once? */ + if (i > 0) { + if (!res && (s1 == done) && + ((len == l1) || LDAP_UTF8_ISASCII(s1)) && + ((len == l2) || LDAP_UTF8_ISASCII(s2))) { + /* all ascii and equal up to len */ + return l1 - l2; + } + + /* rewind one char, and do normalized compare from there */ + s1--; + s2--; + l1 -= i - 1; + l2 -= i - 1; + } + + /* Should first check to see if strings are already in * proper normalized form. */ - - ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) ); + ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) ); if ( ucs == NULL ) { return l1 > l2 ? 1 : -1; /* what to do??? */ } /* * XXYYZ: we convert to ucs4 even though -llunicode - * expects ucs2 in an unsigned long + * expects ucs2 in an ac_uint4 */ /* convert and normalize 1st string */ for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) { - ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i ); - if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { free( ucs ); - return -1; /* what to do??? */ - } + return -1; /* what to do??? */ + } len = LDAP_UTF8_CHARLEN( s1 + i ); } - uccanondecomp( ucs, ulen, &ucsout1, &l1 ); - l1 = uccanoncomp( ucsout1, l1 ); + + if ( norm1 ) { + ucsout1 = ucs; + l1 = ulen; + ucs = malloc( l2 * sizeof(*ucs) ); + if ( ucs == NULL ) { + free( ucsout1 ); + return l1 > l2 ? 1 : -1; /* what to do??? */ + } + } else { + uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx ); + l1 = uccanoncomp( ucsout1, l1 ); + } /* convert and normalize 2nd string */ for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) { - ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i ); - if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { free( ucsout1 ); free( ucs ); - return 1; /* what to do??? */ - } + return 1; /* what to do??? */ + } len = LDAP_UTF8_CHARLEN( s2 + i ); } - uccanondecomp( ucs, ulen, &ucsout2, &l2 ); - l2 = uccanoncomp( ucsout2, l2 ); - - free( ucs ); + if ( norm2 ) { + ucsout2 = ucs; + l2 = ulen; + } else { + uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx ); + l2 = uccanoncomp( ucsout2, l2 ); + free( ucs ); + } + res = casefold ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ) : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );