From: Stig Venaas Date: Wed, 31 Jan 2001 15:45:30 +0000 (+0000) Subject: Adding UTF8normcmp() for normalizing and comparing two UTF8 strings X-Git-Tag: LDBM_PRE_GIANT_RWLOCK~1522 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=7a3c92f2a7999b94b9c0dfefd5c0e6b2df8088ed;p=openldap Adding UTF8normcmp() for normalizing and comparing two UTF8 strings --- diff --git a/include/ldap_pvt_uc.h b/include/ldap_pvt_uc.h index c735271133..b58e7371a2 100644 --- a/include/ldap_pvt_uc.h +++ b/include/ldap_pvt_uc.h @@ -131,7 +131,12 @@ LDAP_LUNICODE_F(void) ucstr2upper( LDAP_LUNICODE_F(char *) UTF8normalize( const char *, - char casefold ); + char ); + +LDAP_LUNICODE_F(int) UTF8normcmp( + const char *, + const char *, + char ); LDAP_END_DECL diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c index b19a0fcde4..6518620e32 100644 --- a/libraries/liblunicode/ucstr.c +++ b/libraries/liblunicode/ucstr.c @@ -207,3 +207,75 @@ char * UTF8normalize( out[outpos] = '\0'; return out; } + +/* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */ +/* slow, should be optimized */ +int UTF8normcmp( + const char *s1, + const char *s2, + char casefold ) +{ + int i, l1, l2, len, ulen, res; + unsigned long *ucs, *ucsout1, *ucsout2; + + l1 = strlen( s1 ); + l2 = strlen( s2 ); + + if ( ( l1 == 0 ) || ( l2 == 0 ) ) { + if ( l1 == l2 ) { + return 0; + } + return *s1 - *s2 > 0 ? 1 : -1; + } + + ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) ); + if ( ucs == NULL ) { + return l1 > l2 ? 1 : -1; /* what to do??? */ + } + + /* + * XXYYZ: we convert to ucs4 even though -llunicode + * expects ucs2 in an unsigned long + */ + + /* convert and normalize 1st string */ + for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) { + ucs[ulen] = ldap_utf8_to_ucs4( s1 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + free( ucs ); + return -1; /* what to do??? */ + } + len = LDAP_UTF8_CHARLEN( s1 + i ); + } + uccanondecomp( ucs, ulen, &ucsout1, &l1 ); + l1 = uccanoncomp( ucsout1, l1 ); + + /* convert and normalize 2nd string */ + for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) { + ucs[ulen] = ldap_utf8_to_ucs4( s2 + i ); + if ( ucs[ulen] == LDAP_UCS4_INVALID ) { + free( ucsout1 ); + free( ucs ); + return 1; /* what to do??? */ + } + len = LDAP_UTF8_CHARLEN( s2 + i ); + } + uccanondecomp( ucs, ulen, &ucsout2, &l2 ); + l2 = uccanoncomp( ucsout2, l2 ); + + free( ucs ); + + res = casefold + ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ) + : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ); + free( ucsout1 ); + free( ucsout2 ); + + if ( res != 0 ) { + return res; + } + if ( l1 == l2 ) { + return 0; + } + return l1 > l2 ? 1 : -1; +}