X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblunicode%2Fucstr.c;h=9246dc8edbfbe2273c745e9591a37b9beb2a7005;hb=0355abeb1a0ea471970c5313a69ce5b20916b408;hp=bfd33d379cf4dec601301d3d7e1f73b08081cfba;hpb=5ac196e3f9ef99fe2c25c48c80cedbe7cdca7e94;p=openldap diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c index bfd33d379c..9246dc8edb 100644 --- a/libraries/liblunicode/ucstr.c +++ b/libraries/liblunicode/ucstr.c @@ -1,11 +1,34 @@ -#include "portable.h" +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2007 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ -#include +#include "portable.h" +#include #include #include #include +#include + +#include +#include + +#define malloc(x) ber_memalloc_x(x,ctx) +#define realloc(x,y) ber_memrealloc_x(x,y,ctx) +#define free(x) ber_memfree_x(x,ctx) + int ucstrncmp( const ldap_unicode_t *u1, const ldap_unicode_t *u2, @@ -28,8 +51,8 @@ int ucstrncasecmp( ber_len_t n ) { for(; 0 < n; ++u1, ++u2, --n ) { - ldap_unicode_t uu1 = uctoupper( *u1 ); - ldap_unicode_t uu2 = uctoupper( *u2 ); + ldap_unicode_t uu1 = uctolower( *u1 ); + ldap_unicode_t uu2 = uctolower( *u2 ); if( uu1 != uu2 ) { return uu1 < uu2 ? -1 : +1; @@ -60,9 +83,9 @@ ldap_unicode_t * ucstrncasechr( ber_len_t n, ldap_unicode_t c ) { - c = uctoupper( c ); + c = uctolower( c ); for(; 0 < n; ++u, --n ) { - if( uctoupper( *u ) == c ) { + if( uctolower( *u ) == c ) { return (ldap_unicode_t *) u; } } @@ -79,52 +102,84 @@ void ucstr2upper( } } -char * UTF8normalize( - const char *s, - char casefold ) +struct berval * UTF8bvnormalize( + struct berval *bv, + struct berval *newbv, + unsigned flags, + void *ctx ) { int i, j, len, clen, outpos, ucsoutlen, outsize, last; - char *out; - unsigned long *ucs, *p, *ucsout; + char *out, *outtmp, *s; + ac_uint4 *ucs, *p, *ucsout; static unsigned char mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; - if ( s == NULL ) { + unsigned casefold = flags & LDAP_UTF8_CASEFOLD; + unsigned approx = flags & LDAP_UTF8_APPROX; + + if ( bv == NULL ) { return NULL; } - - len = strlen( s ); + + s = bv->bv_val; + len = bv->bv_len; if ( len == 0 ) { - out = (char *) malloc( 1 ); - *out = '\0'; - return out; + return ber_dupbv_x( newbv, bv, ctx ); } - outsize = len + 7; - out = (char *) malloc( outsize ); - if ( out == NULL ) { - return NULL; - } - - outpos = 0; + /* Should first check to see if string is already in proper + * normalized form. This is almost as time consuming as + * the normalization though. + */ /* finish off everything up to character before first non-ascii */ if ( LDAP_UTF8_ISASCII( s ) ) { - for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { - out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1]; - } - if ( i == len ) { - out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1]; - out[outpos] = '\0'; - return out; + if ( casefold ) { + outsize = len + 7; + out = (char *) malloc( outsize ); + if ( out == NULL ) { + return NULL; + } + outpos = 0; + + for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { + out[outpos++] = TOLOWER( s[i-1] ); + } + if ( i == len ) { + out[outpos++] = TOLOWER( s[len-1] ); + out[outpos] = '\0'; + return ber_str2bv( out, outpos, 0, newbv); + } + } else { + for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { + /* empty */ + } + + if ( i == len ) { + return ber_str2bv_x( s, len, 1, newbv, ctx ); + } + + outsize = len + 7; + out = (char *) malloc( outsize ); + if ( out == NULL ) { + return NULL; + } + outpos = i - 1; + memcpy(out, s, outpos); } } else { + outsize = len + 7; + out = (char *) malloc( outsize ); + if ( out == NULL ) { + return NULL; + } + outpos = 0; i = 0; } - p = ucs = (long *) malloc( len * sizeof(*ucs) ); + p = ucs = malloc( len * sizeof(*ucs) ); if ( ucs == NULL ) { free(out); return NULL; @@ -132,7 +187,7 @@ char * UTF8normalize( /* convert character before first non-ascii to ucs-4 */ if ( i > 0 ) { - *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; + *p = casefold ? TOLOWER( s[i-1] ) : s[i-1]; p++; } @@ -141,7 +196,7 @@ char * UTF8normalize( /* s[i] is non-ascii */ /* convert everything up to next ascii to ucs-4 */ while ( i < len ) { - clen = LDAP_UTF8_CHARLEN( s + i ); + clen = LDAP_UTF8_CHARLEN2( s + i, clen ); if ( clen == 0 ) { free( ucs ); free( out ); @@ -164,27 +219,41 @@ char * UTF8normalize( i++; } if ( casefold ) { - *p = uctoupper( *p ); + *p = uctolower( *p ); } p++; - } + } /* normalize ucs of length p - ucs */ - uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); - ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); - /* convert ucs to utf-8 and store in out */ - for ( j = 0; j < ucsoutlen; j++ ) { - /* allocate more space if not enough room for - 6 bytes and terminator */ - if ( outsize - outpos < 7 ) { - outsize = ucsoutlen - j + outpos + 6; - out = (char *) realloc( out, outsize ); - if ( out == NULL ) { - free( ucs ); - return NULL; + uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx ); + if ( approx ) { + for ( j = 0; j < ucsoutlen; j++ ) { + if ( ucsout[j] < 0x80 ) { + out[outpos++] = ucsout[j]; + } + } + } else { + ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); + /* convert ucs to utf-8 and store in out */ + for ( j = 0; j < ucsoutlen; j++ ) { + /* allocate more space if not enough room for + 6 bytes and terminator */ + if ( outsize - outpos < 7 ) { + outsize = ucsoutlen - j + outpos + 6; + outtmp = (char *) realloc( out, outsize ); + if ( outtmp == NULL ) { + free( out ); + free( ucs ); + free( ucsout ); + return NULL; + } + out = outtmp; } + outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); } - outpos += ldap_ucs4_to_utf8( ucsout[j], &out[outpos] ); } + + free( ucsout ); + ucsout = NULL; if ( i == len ) { break; @@ -192,82 +261,174 @@ char * UTF8normalize( last = i; + /* Allocate more space in out if necessary */ + if (len - i >= outsize - outpos) { + outsize += 1 + ((len - i) - (outsize - outpos)); + outtmp = (char *) realloc(out, outsize); + if (outtmp == NULL) { + free(out); + free(ucs); + return NULL; + } + out = outtmp; + } + /* s[i] is ascii */ /* finish off everything up to char before next non-ascii */ for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { - out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1]; + out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1]; } if ( i == len ) { - out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1]; + out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1]; break; } /* convert character before next non-ascii to ucs-4 */ - *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1]; + *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1]; p = ucs + 1; - } + } + free( ucs ); out[outpos] = '\0'; - return out; + return ber_str2bv( out, outpos, 0, newbv ); } -/* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */ +/* compare UTF8-strings, optionally ignore casing */ /* slow, should be optimized */ -int UTF8normcmp( - const char *s1, - const char *s2, - char casefold ) +int UTF8bvnormcmp( + struct berval *bv1, + struct berval *bv2, + unsigned flags, + void *ctx ) { - int i, l1, l2, len, ulen, res; - unsigned long *ucs, *ucsout1, *ucsout2; + int i, l1, l2, len, ulen, res = 0; + char *s1, *s2, *done; + ac_uint4 *ucs, *ucsout1, *ucsout2; - l1 = strlen( s1 ); - l2 = strlen( s2 ); + unsigned casefold = flags & LDAP_UTF8_CASEFOLD; + unsigned norm1 = flags & LDAP_UTF8_ARG1NFC; + unsigned norm2 = flags & LDAP_UTF8_ARG2NFC; - if ( ( l1 == 0 ) || ( l2 == 0 ) ) { - if ( l1 == l2 ) { - return 0; + if (bv1 == NULL) { + return bv2 == NULL ? 0 : -1; + + } else if (bv2 == NULL) { + return 1; + } + + l1 = bv1->bv_len; + l2 = bv2->bv_len; + + len = (l1 < l2) ? l1 : l2; + if (len == 0) { + return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1; + } + + s1 = bv1->bv_val; + s2 = bv2->bv_val; + done = s1 + len; + + while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) { + if (casefold) { + char c1 = TOLOWER(*s1); + char c2 = TOLOWER(*s2); + res = c1 - c2; + } else { + res = *s1 - *s2; + } + s1++; + s2++; + if (res) { + /* done unless next character in s1 or s2 is non-ascii */ + if (s1 < done) { + if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) { + break; + } + } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) || + ((len < l2) && !LDAP_UTF8_ISASCII(s2))) + { + break; + } + return res; } - return *s1 - *s2 > 0 ? 1 : -1; } - - ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) ); + + /* We have encountered non-ascii or strings equal up to len */ + + /* set i to number of iterations */ + i = s1 - done + len; + /* passed through loop at least once? */ + if (i > 0) { + if (!res && (s1 == done) && + ((len == l1) || LDAP_UTF8_ISASCII(s1)) && + ((len == l2) || LDAP_UTF8_ISASCII(s2))) { + /* all ascii and equal up to len */ + return l1 - l2; + } + + /* rewind one char, and do normalized compare from there */ + s1--; + s2--; + l1 -= i - 1; + l2 -= i - 1; + } + + /* Should first check to see if strings are already in + * proper normalized form. + */ + ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) ); if ( ucs == NULL ) { return l1 > l2 ? 1 : -1; /* what to do??? */ } /* * XXYYZ: we convert to ucs4 even though -llunicode - * expects ucs2 in an unsigned long + * expects ucs2 in an ac_uint4 */ /* convert and normalize 1st string */ for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) { - ucs[ulen] = ldap_utf8_to_ucs4( s1 + i ); - if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { free( ucs ); - return -1; /* what to do??? */ - } + return -1; /* what to do??? */ + } len = LDAP_UTF8_CHARLEN( s1 + i ); } - uccanondecomp( ucs, ulen, &ucsout1, &l1 ); - l1 = uccanoncomp( ucsout1, l1 ); + + if ( norm1 ) { + ucsout1 = ucs; + l1 = ulen; + ucs = malloc( l2 * sizeof(*ucs) ); + if ( ucs == NULL ) { + free( ucsout1 ); + return l1 > l2 ? 1 : -1; /* what to do??? */ + } + } else { + uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx ); + l1 = uccanoncomp( ucsout1, l1 ); + } /* convert and normalize 2nd string */ for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) { - ucs[ulen] = ldap_utf8_to_ucs4( s2 + i ); - if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { free( ucsout1 ); free( ucs ); - return 1; /* what to do??? */ - } + return 1; /* what to do??? */ + } len = LDAP_UTF8_CHARLEN( s2 + i ); } - uccanondecomp( ucs, ulen, &ucsout2, &l2 ); - l2 = uccanoncomp( ucsout2, l2 ); - - free( ucs ); + if ( norm2 ) { + ucsout2 = ucs; + l2 = ulen; + } else { + uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx ); + l2 = uccanoncomp( ucsout2, l2 ); + free( ucs ); + } + res = casefold ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ) : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );