git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucstr.c

   1 /*
   2  * Copyright 2000-2002 The OpenLDAP Foundation
   3  * COPYING RESTRICTIONS APPLY.  See COPYRIGHT File in top level directory
   4  * of this package for details.
   5  */
   6
   7 #include "portable.h"
   8
   9 #include <ac/ctype.h>
  10 #include <ac/string.h>
  11 #include <ac/stdlib.h>
  12
  13 #include <lber.h>
  14
  15 #define malloc(x)       ber_memalloc(x)
  16 #define realloc(x,y)    ber_memrealloc(x,y)
  17 #define free(x)         ber_memfree(x)
  18
  19 #include <ldap_utf8.h>
  20 #include <ldap_pvt_uc.h>
  21
  22
  23 int ucstrncmp(
  24         const ldap_unicode_t *u1,
  25         const ldap_unicode_t *u2,
  26         ber_len_t n )
  27 {
  28         for(; 0 < n; ++u1, ++u2, --n ) {
  29                 if( *u1 != *u2 ) {
  30                         return *u1 < *u2 ? -1 : +1;
  31                 }
  32                 if ( *u1 == 0 ) {
  33                         return 0;
  34                 }
  35         }
  36         return 0;
  37 }
  38
  39 int ucstrncasecmp(
  40         const ldap_unicode_t *u1,
  41         const ldap_unicode_t *u2,
  42         ber_len_t n )
  43 {
  44         for(; 0 < n; ++u1, ++u2, --n ) {
  45                 ldap_unicode_t uu1 = uctoupper( *u1 );
  46                 ldap_unicode_t uu2 = uctoupper( *u2 );
  47
  48                 if( uu1 != uu2 ) {
  49                         return uu1 < uu2 ? -1 : +1;
  50                 }
  51                 if ( uu1 == 0 ) {
  52                         return 0;
  53                 }
  54         }
  55         return 0;
  56 }
  57
  58 ldap_unicode_t * ucstrnchr(
  59         const ldap_unicode_t *u,
  60         ber_len_t n,
  61         ldap_unicode_t c )
  62 {
  63         for(; 0 < n; ++u, --n ) {
  64                 if( *u == c ) {
  65                         return (ldap_unicode_t *) u;
  66                 }
  67         }
  68
  69         return NULL;
  70 }
  71
  72 ldap_unicode_t * ucstrncasechr(
  73         const ldap_unicode_t *u,
  74         ber_len_t n,
  75         ldap_unicode_t c )
  76 {
  77         c = uctoupper( c );
  78         for(; 0 < n; ++u, --n ) {
  79                 if( uctoupper( *u ) == c ) {
  80                         return (ldap_unicode_t *) u;
  81                 }
  82         }
  83
  84         return NULL;
  85 }
  86
  87 void ucstr2upper(
  88         ldap_unicode_t *u,
  89         ber_len_t n )
  90 {
  91         for(; 0 < n; ++u, --n ) {
  92                 *u = uctoupper( *u );
  93         }
  94 }
  95
  96 char * UTF8normalize(
  97         struct berval *bv,
  98         unsigned casefold )
  99 {
 100         int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 101         char *out, *s;
 102         unsigned long *ucs, *p, *ucsout;
 103
 104         static unsigned char mask[] = {
 105                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 106
 107         if ( bv == NULL ) {
 108                 return NULL;
 109         }
 110
 111         s = bv->bv_val;
 112         len = bv->bv_len;
 113
 114         /* See if the string is pure ASCII so we can shortcut */
 115         for ( i=0; i<len; i++ ) {
 116                 if ( s[i] & 0x80 )      /* non-ASCII */
 117                         break;
 118         }
 119
 120         /* It's pure ASCII or zero-len */
 121         if ( i == len ) {
 122                 out = malloc( len + 1 );
 123                 if ( i && !casefold ) {
 124                         strncpy( out, bv->bv_val, len );
 125                 } else {
 126                         for ( j=0; j<i; j++ )
 127                                 out[j] = TOUPPER( s[j] );
 128                 }
 129                 out[len] = '\0';
 130                 return out;
 131         }
 132
 133         outsize = len + 7;
 134         out = (char *) malloc( outsize );
 135         if ( out == NULL ) {
 136                 return NULL;
 137         }
 138
 139         /* FIXME: Should first check to see if string is already in
 140          * proper normalized form.
 141          */
 142
 143         outpos = 0;
 144
 145         /* finish off everything up to character before first non-ascii */
 146         if ( LDAP_UTF8_ISASCII( s ) ) {
 147                 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 148                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 149                 }
 150                 if ( i == len ) {
 151                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 152                         out[outpos] = '\0';
 153                         return out;
 154                 }
 155         } else {
 156                 i = 0;
 157         }
 158
 159         p = ucs = (long *) malloc( len * sizeof(*ucs) );
 160         if ( ucs == NULL ) {
 161                 free(out);
 162                 return NULL;
 163         }
 164
 165         /* convert character before first non-ascii to ucs-4 */
 166         if ( i > 0 ) {
 167                 *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 168                 p++;
 169         }
 170
 171         /* s[i] is now first non-ascii character */
 172         for (;;) {
 173                 /* s[i] is non-ascii */
 174                 /* convert everything up to next ascii to ucs-4 */
 175                 while ( i < len ) {
 176                         clen = LDAP_UTF8_CHARLEN2( s + i, clen );
 177                         if ( clen == 0 ) {
 178                                 free( ucs );
 179                                 free( out );
 180                                 return NULL;
 181                         }
 182                         if ( clen == 1 ) {
 183                                 /* ascii */
 184                                 break;
 185                         }
 186                         *p = s[i] & mask[clen];
 187                         i++;
 188                         for( j = 1; j < clen; j++ ) {
 189                                 if ( (s[i] & 0xc0) != 0x80 ) {
 190                                         free( ucs );
 191                                         free( out );
 192                                         return NULL;
 193                                 }
 194                                 *p <<= 6;
 195                                 *p |= s[i] & 0x3f;
 196                                 i++;
 197                         }
 198                         if ( casefold ) {
 199                                 *p = uctoupper( *p );
 200                         }
 201                         p++;
 202                 }
 203                 /* normalize ucs of length p - ucs */
 204                 uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
 205                 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 206                 /* convert ucs to utf-8 and store in out */
 207                 for ( j = 0; j < ucsoutlen; j++ ) {
 208                         /* allocate more space if not enough room for
 209                            6 bytes and terminator */
 210                         if ( outsize - outpos < 7 ) {
 211                                 outsize = ucsoutlen - j + outpos + 6;
 212                                 out = (char *) realloc( out, outsize );
 213                                 if ( out == NULL ) {
 214                                         free( ucs );
 215                                         return NULL;
 216                                 }
 217                         }
 218                         outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
 219                 }
 220
 221                 if ( i == len ) {
 222                         break;
 223                 }
 224
 225                 last = i;
 226
 227                 /* s[i] is ascii */
 228                 /* finish off everything up to char before next non-ascii */
 229                 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 230                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 231                 }
 232                 if ( i == len ) {
 233                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 234                         break;
 235                 }
 236
 237                 /* convert character before next non-ascii to ucs-4 */
 238                 *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 239                 p = ucs + 1;
 240         }
 241         free( ucs );
 242         out[outpos] = '\0';
 243         return out;
 244 }
 245
 246 /* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */
 247 /* slow, should be optimized */
 248 int UTF8normcmp(
 249         const char *s1,
 250         const char *s2,
 251         unsigned casefold )
 252 {
 253         int i, l1, l2, len, ulen, res;
 254         unsigned long *ucs, *ucsout1, *ucsout2;
 255
 256         l1 = strlen( s1 );
 257         l2 = strlen( s2 );
 258
 259         if ( ( l1 == 0 ) || ( l2 == 0 ) ) {
 260                 if ( l1 == l2 ) {
 261                         return 0;
 262                 }
 263                 return *s1 - *s2 > 0 ? 1 : -1;
 264         }
 265
 266         /* See if we can get away with a straight ASCII compare */
 267         len = (l1 < l2) ? l1 : l2;
 268         for ( i = 0; i<len; i++ ) {
 269                 /* Is either char non-ASCII? */
 270                 if ((s1[i] & 0x80) || (s2[i] & 0x80))
 271                         break;
 272                 if (casefold) {
 273                         char c1 = TOUPPER(s1[i]);
 274                         char c2 = TOUPPER(s2[i]);
 275                         res = c1 - c2;
 276                 } else {
 277                         res = s1[i] - s2[i];
 278                 }
 279                 if (res)
 280                         return res;
 281         }
 282         /* Strings were ASCII, equal up to minlen */
 283         if (i == len)
 284                 return l1 - l2;
 285
 286         /* FIXME: Should first check to see if strings are already in
 287          * proper normalized form.
 288          */
 289
 290         ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
 291         if ( ucs == NULL ) {
 292                 return l1 > l2 ? 1 : -1; /* what to do??? */
 293         }
 294
 295         /*
 296          * XXYYZ: we convert to ucs4 even though -llunicode
 297          * expects ucs2 in an unsigned long
 298          */
 299
 300         /* convert and normalize 1st string */
 301         for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
 302                 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
 303                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 304                         free( ucs );
 305                         return -1; /* what to do??? */
 306                 }
 307                 len = LDAP_UTF8_CHARLEN( s1 + i );
 308         }
 309         uccanondecomp( ucs, ulen, &ucsout1, &l1 );
 310         l1 = uccanoncomp( ucsout1, l1 );
 311
 312         /* convert and normalize 2nd string */
 313         for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
 314                 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
 315                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 316                         free( ucsout1 );
 317                         free( ucs );
 318                         return 1; /* what to do??? */
 319                 }
 320                 len = LDAP_UTF8_CHARLEN( s2 + i );
 321         }
 322         uccanondecomp( ucs, ulen, &ucsout2, &l2 );
 323         l2 = uccanoncomp( ucsout2, l2 );
 324
 325         free( ucs );
 326
 327         res = casefold
 328                 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
 329                 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
 330         free( ucsout1 );
 331         free( ucsout2 );
 332
 333         if ( res != 0 ) {
 334                 return res;
 335         }
 336         if ( l1 == l2 ) {
 337                 return 0;
 338         }
 339         return l1 > l2 ? 1 : -1;
 340 }