git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucstr.c

   1 /*
   2  * Copyright 2000-2002 The OpenLDAP Foundation
   3  * COPYING RESTRICTIONS APPLY.  See COPYRIGHT File in top level directory
   4  * of this package for details.
   5  */
   6
   7 #include "portable.h"
   8
   9 #include <ac/ctype.h>
  10 #include <ac/string.h>
  11 #include <ac/stdlib.h>
  12
  13 #include <lber.h>
  14
  15 #define malloc(x)       ber_memalloc(x)
  16 #define realloc(x,y)    ber_memrealloc(x,y)
  17 #define free(x)         ber_memfree(x)
  18
  19 #include <ldap_utf8.h>
  20 #include <ldap_pvt_uc.h>
  21
  22
  23 int ucstrncmp(
  24         const ldap_unicode_t *u1,
  25         const ldap_unicode_t *u2,
  26         ber_len_t n )
  27 {
  28         for(; 0 < n; ++u1, ++u2, --n ) {
  29                 if( *u1 != *u2 ) {
  30                         return *u1 < *u2 ? -1 : +1;
  31                 }
  32                 if ( *u1 == 0 ) {
  33                         return 0;
  34                 }
  35         }
  36         return 0;
  37 }
  38
  39 int ucstrncasecmp(
  40         const ldap_unicode_t *u1,
  41         const ldap_unicode_t *u2,
  42         ber_len_t n )
  43 {
  44         for(; 0 < n; ++u1, ++u2, --n ) {
  45                 ldap_unicode_t uu1 = uctoupper( *u1 );
  46                 ldap_unicode_t uu2 = uctoupper( *u2 );
  47
  48                 if( uu1 != uu2 ) {
  49                         return uu1 < uu2 ? -1 : +1;
  50                 }
  51                 if ( uu1 == 0 ) {
  52                         return 0;
  53                 }
  54         }
  55         return 0;
  56 }
  57
  58 ldap_unicode_t * ucstrnchr(
  59         const ldap_unicode_t *u,
  60         ber_len_t n,
  61         ldap_unicode_t c )
  62 {
  63         for(; 0 < n; ++u, --n ) {
  64                 if( *u == c ) {
  65                         return (ldap_unicode_t *) u;
  66                 }
  67         }
  68
  69         return NULL;
  70 }
  71
  72 ldap_unicode_t * ucstrncasechr(
  73         const ldap_unicode_t *u,
  74         ber_len_t n,
  75         ldap_unicode_t c )
  76 {
  77         c = uctoupper( c );
  78         for(; 0 < n; ++u, --n ) {
  79                 if( uctoupper( *u ) == c ) {
  80                         return (ldap_unicode_t *) u;
  81                 }
  82         }
  83
  84         return NULL;
  85 }
  86
  87 void ucstr2upper(
  88         ldap_unicode_t *u,
  89         ber_len_t n )
  90 {
  91         for(; 0 < n; ++u, --n ) {
  92                 *u = uctoupper( *u );
  93         }
  94 }
  95
  96 char * UTF8normalize(
  97         struct berval *bv,
  98         unsigned casefold )
  99 {
 100         int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 101         char *out, *s;
 102         unsigned long *ucs, *p, *ucsout;
 103
 104         static unsigned char mask[] = {
 105                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 106
 107         if ( bv == NULL ) {
 108                 return NULL;
 109         }
 110
 111         s = bv->bv_val;
 112         len = bv->bv_len;
 113
 114         /* See if the string is pure ASCII so we can shortcut */
 115         for ( i=0; i<len; i++ ) {
 116                 if ( s[i] & 0x80 )      /* non-ASCII */
 117                         break;
 118         }
 119
 120         /* It's pure ASCII or zero-len */
 121         if ( i == len ) {
 122                 out = malloc( len + 1 );
 123                 if ( i && !casefold ) {
 124                         strncpy( out, bv->bv_val, len );
 125                 } else {
 126                         for ( j=0; j<i; j++ )
 127                                 out[j] = TOUPPER( s[j] );
 128                 }
 129                 out[len] = '\0';
 130                 return out;
 131         }
 132
 133         outsize = len + 7;
 134         out = (char *) malloc( outsize );
 135         if ( out == NULL ) {
 136                 return NULL;
 137         }
 138
 139         /* FIXME: Should first check to see if string is already in
 140          * proper normalized form.
 141          */
 142
 143         outpos = 0;
 144
 145         /* finish off everything up to character before first non-ascii */
 146         if ( LDAP_UTF8_ISASCII( s ) ) {
 147                 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 148                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 149                 }
 150                 if ( i == len ) {
 151                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 152                         out[outpos] = '\0';
 153                         return out;
 154                 }
 155         } else {
 156                 i = 0;
 157         }
 158
 159         p = ucs = (long *) malloc( len * sizeof(*ucs) );
 160         if ( ucs == NULL ) {
 161                 free(out);
 162                 return NULL;
 163         }
 164
 165         /* convert character before first non-ascii to ucs-4 */
 166         if ( i > 0 ) {
 167                 *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 168                 p++;
 169         }
 170
 171         /* s[i] is now first non-ascii character */
 172         for (;;) {
 173                 /* s[i] is non-ascii */
 174                 /* convert everything up to next ascii to ucs-4 */
 175                 while ( i < len ) {
 176                         clen = LDAP_UTF8_CHARLEN2( s + i, clen );
 177                         if ( clen == 0 ) {
 178                                 free( ucs );
 179                                 free( out );
 180                                 return NULL;
 181                         }
 182                         if ( clen == 1 ) {
 183                                 /* ascii */
 184                                 break;
 185                         }
 186                         *p = s[i] & mask[clen];
 187                         i++;
 188                         for( j = 1; j < clen; j++ ) {
 189                                 if ( (s[i] & 0xc0) != 0x80 ) {
 190                                         free( ucs );
 191                                         free( out );
 192                                         return NULL;
 193                                 }
 194                                 *p <<= 6;
 195                                 *p |= s[i] & 0x3f;
 196                                 i++;
 197                         }
 198                         if ( casefold ) {
 199                                 *p = uctoupper( *p );
 200                         }
 201                         p++;
 202                 }
 203                 /* normalize ucs of length p - ucs */
 204                 uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
 205                 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 206                 /* convert ucs to utf-8 and store in out */
 207                 for ( j = 0; j < ucsoutlen; j++ ) {
 208                         /* allocate more space if not enough room for
 209                            6 bytes and terminator */
 210                         if ( outsize - outpos < 7 ) {
 211                                 outsize = ucsoutlen - j + outpos + 6;
 212                                 out = (char *) realloc( out, outsize );
 213                                 if ( out == NULL ) {
 214                                         free( ucs );
 215                                         return NULL;
 216                                 }
 217                         }
 218                         outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
 219                 }
 220
 221                 if ( i == len ) {
 222                         break;
 223                 }
 224
 225                 last = i;
 226
 227                 /* s[i] is ascii */
 228                 /* finish off everything up to char before next non-ascii */
 229                 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 230                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 231                 }
 232                 if ( i == len ) {
 233                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 234                         break;
 235                 }
 236
 237                 /* convert character before next non-ascii to ucs-4 */
 238                 *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 239                 p = ucs + 1;
 240         }
 241         free( ucs );
 242         out[outpos] = '\0';
 243         return out;
 244 }
 245
 246 struct berval * UTF8bvnormalize(
 247         struct berval *bv,
 248         struct berval *newbv,
 249         unsigned casefold )
 250 {
 251         int i, j, len, clen, outpos, ucsoutlen, outsize, last;
 252         char *out, *s;
 253         unsigned long *ucs, *p, *ucsout;
 254
 255         static unsigned char mask[] = {
 256                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 257
 258         if ( bv == NULL ) {
 259                 return NULL;
 260         }
 261
 262         s = bv->bv_val;
 263         len = bv->bv_len;
 264
 265         if ( len == 0 ) {
 266                 return ber_dupbv( newbv, bv );
 267         }
 268
 269         /* FIXME: Should first check to see if string is already in
 270          * proper normalized form. This is almost as time consuming
 271          * as the normalization though.
 272          */
 273
 274         /* finish off everything up to character before first non-ascii */
 275         if ( LDAP_UTF8_ISASCII( s ) ) {
 276                 if ( casefold ) {
 277                         outsize = len + 7;
 278                         out = (char *) malloc( outsize );
 279                         if ( out == NULL ) {
 280                                 return NULL;
 281                         }
 282                         outpos = 0;
 283
 284                         for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 285                                 out[outpos++] = TOUPPER( s[i-1] );
 286                         }
 287                         if ( i == len ) {
 288                                 out[outpos++] = TOUPPER( s[len - 1] );
 289                                 out[outpos] = '\0';
 290                                 return ber_str2bv( out, outpos, 0, newbv);
 291                         }
 292                 } else {
 293                         for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 294                                 /* empty */
 295                         }
 296
 297                         if ( i == len ) {
 298                                 return ber_str2bv( s, len, 1, newbv );
 299                         }
 300
 301                         outsize = len + 7;
 302                         out = (char *) malloc( outsize );
 303                         if ( out == NULL ) {
 304                                 return NULL;
 305                         }
 306                         outpos = i - 1;
 307                         memcpy(out, s, outpos);
 308                 }
 309         } else {
 310                 outsize = len + 7;
 311                 out = (char *) malloc( outsize );
 312                 if ( out == NULL ) {
 313                         return NULL;
 314                 }
 315                 outpos = 0;
 316                 i = 0;
 317         }
 318
 319         p = ucs = (long *) malloc( len * sizeof(*ucs) );
 320         if ( ucs == NULL ) {
 321                 free(out);
 322                 return NULL;
 323         }
 324
 325         /* convert character before first non-ascii to ucs-4 */
 326         if ( i > 0 ) {
 327                 *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 328                 p++;
 329         }
 330
 331         /* s[i] is now first non-ascii character */
 332         for (;;) {
 333                 /* s[i] is non-ascii */
 334                 /* convert everything up to next ascii to ucs-4 */
 335                 while ( i < len ) {
 336                         clen = LDAP_UTF8_CHARLEN2( s + i, clen );
 337                         if ( clen == 0 ) {
 338                                 free( ucs );
 339                                 free( out );
 340                                 return NULL;
 341                         }
 342                         if ( clen == 1 ) {
 343                                 /* ascii */
 344                                 break;
 345                         }
 346                         *p = s[i] & mask[clen];
 347                         i++;
 348                         for( j = 1; j < clen; j++ ) {
 349                                 if ( (s[i] & 0xc0) != 0x80 ) {
 350                                         free( ucs );
 351                                         free( out );
 352                                         return NULL;
 353                                 }
 354                                 *p <<= 6;
 355                                 *p |= s[i] & 0x3f;
 356                                 i++;
 357                         }
 358                         if ( casefold ) {
 359                                 *p = uctoupper( *p );
 360                         }
 361                         p++;
 362                 }
 363                 /* normalize ucs of length p - ucs */
 364                 uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
 365                 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 366                 /* convert ucs to utf-8 and store in out */
 367                 for ( j = 0; j < ucsoutlen; j++ ) {
 368                         /* allocate more space if not enough room for
 369                            6 bytes and terminator */
 370                         if ( outsize - outpos < 7 ) {
 371                                 outsize = ucsoutlen - j + outpos + 6;
 372                                 out = (char *) realloc( out, outsize );
 373                                 if ( out == NULL ) {
 374                                         free( ucs );
 375                                         return NULL;
 376                                 }
 377                         }
 378                         outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
 379                 }
 380
 381                 if ( i == len ) {
 382                         break;
 383                 }
 384
 385                 last = i;
 386
 387                 /* s[i] is ascii */
 388                 /* finish off everything up to char before next non-ascii */
 389                 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 390                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 391                 }
 392                 if ( i == len ) {
 393                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 394                         break;
 395                 }
 396
 397                 /* convert character before next non-ascii to ucs-4 */
 398                 *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 399                 p = ucs + 1;
 400         }
 401         free( ucs );
 402         out[outpos] = '\0';
 403         return ber_str2bv( out, outpos, 0, newbv );
 404 }
 405
 406 /* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */
 407 /* slow, should be optimized */
 408 int UTF8normcmp(
 409         const char *s1,
 410         const char *s2,
 411         unsigned casefold )
 412 {
 413         int i, l1, l2, len, ulen, res;
 414         unsigned long *ucs, *ucsout1, *ucsout2;
 415
 416         l1 = strlen( s1 );
 417         l2 = strlen( s2 );
 418
 419         if ( ( l1 == 0 ) || ( l2 == 0 ) ) {
 420                 if ( l1 == l2 ) {
 421                         return 0;
 422                 }
 423                 return *s1 - *s2 > 0 ? 1 : -1;
 424         }
 425
 426         /* See if we can get away with a straight ASCII compare */
 427         len = (l1 < l2) ? l1 : l2;
 428         for ( i = 0; i<len; i++ ) {
 429                 /* Is either char non-ASCII? */
 430                 if ((s1[i] & 0x80) || (s2[i] & 0x80))
 431                         break;
 432                 if (casefold) {
 433                         char c1 = TOUPPER(s1[i]);
 434                         char c2 = TOUPPER(s2[i]);
 435                         res = c1 - c2;
 436                 } else {
 437                         res = s1[i] - s2[i];
 438                 }
 439                 if (res)
 440                         return res;
 441         }
 442         /* Strings were ASCII, equal up to minlen */
 443         if (i == len)
 444                 return l1 - l2;
 445
 446         /* FIXME: Should first check to see if strings are already in
 447          * proper normalized form.
 448          */
 449
 450         ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
 451         if ( ucs == NULL ) {
 452                 return l1 > l2 ? 1 : -1; /* what to do??? */
 453         }
 454
 455         /*
 456          * XXYYZ: we convert to ucs4 even though -llunicode
 457          * expects ucs2 in an unsigned long
 458          */
 459
 460         /* convert and normalize 1st string */
 461         for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
 462                 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
 463                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 464                         free( ucs );
 465                         return -1; /* what to do??? */
 466                 }
 467                 len = LDAP_UTF8_CHARLEN( s1 + i );
 468         }
 469         uccanondecomp( ucs, ulen, &ucsout1, &l1 );
 470         l1 = uccanoncomp( ucsout1, l1 );
 471
 472         /* convert and normalize 2nd string */
 473         for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
 474                 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
 475                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 476                         free( ucsout1 );
 477                         free( ucs );
 478                         return 1; /* what to do??? */
 479                 }
 480                 len = LDAP_UTF8_CHARLEN( s2 + i );
 481         }
 482         uccanondecomp( ucs, ulen, &ucsout2, &l2 );
 483         l2 = uccanoncomp( ucsout2, l2 );
 484
 485         free( ucs );
 486
 487         res = casefold
 488                 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
 489                 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
 490         free( ucsout1 );
 491         free( ucsout2 );
 492
 493         if ( res != 0 ) {
 494                 return res;
 495         }
 496         if ( l1 == l2 ) {
 497                 return 0;
 498         }
 499         return l1 > l2 ? 1 : -1;
 500 }
 501
 502 /* compare UTF8-strings, optionally ignore casing */
 503 /* slow, should be optimized */
 504 int UTF8bvnormcmp(
 505         struct berval *bv1,
 506         struct berval *bv2,
 507         unsigned casefold )
 508 {
 509         int i, l1, l2, len, ulen, res;
 510         char *s1, *s2, *done;
 511         unsigned long *ucs, *ucsout1, *ucsout2;
 512
 513         if (bv1 == NULL) {
 514                 return bv2 == NULL ? 0 : -1;
 515         } else if (bv2 == NULL) {
 516                 return 1;
 517         }
 518
 519         l1 = bv1->bv_len;
 520         l2 = bv2->bv_len;
 521
 522         len = (l1 < l2) ? l1 : l2;
 523         if (len == 0) {
 524                 return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
 525         }
 526
 527         s1 = bv1->bv_val;
 528         s2 = bv2->bv_val;
 529         done = s1 + len;
 530
 531         while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
 532                 if (casefold) {
 533                         char c1 = TOUPPER(*s1);
 534                         char c2 = TOUPPER(*s2);
 535                         res = c1 - c2;
 536                 } else {
 537                         res = *s1 - *s2;
 538                 }
 539                 s1++;
 540                 s2++;
 541                 if (res) {
 542                         /* done unless next character in s1 or s2 is non-ascii */
 543                         if (s1 < done) {
 544                                 if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
 545                                         break;
 546                                 }
 547                         } else if ((len < l1) && !LDAP_UTF8_ISASCII(s1) ||
 548                                    (len < l2) && !LDAP_UTF8_ISASCII(s2)) {
 549                                 break;
 550                         }
 551                         return res;
 552                 }
 553         }
 554
 555         /* We have encountered non-ascii or strings equal up to len */
 556
 557         /* set i to number of iterations */
 558         i = s1 - done + len;
 559         /* passed through loop at least once? */
 560         if (i > 0) {
 561                 if (!res && (s1 == done) &&
 562                     ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
 563                     ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
 564                         /* all ascii and equal up to len */
 565                         return l1 - l2;
 566                 }
 567
 568                 /* rewind one char, and do normalized compare from there */
 569                 s1--;
 570                 s2--;
 571                 l1 -= i - 1;
 572                 l2 -= i - 1;
 573         }
 574
 575         /* FIXME: Should first check to see if strings are already in
 576          * proper normalized form.
 577          */
 578
 579         ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
 580         if ( ucs == NULL ) {
 581                 return l1 > l2 ? 1 : -1; /* what to do??? */
 582         }
 583
 584         /*
 585          * XXYYZ: we convert to ucs4 even though -llunicode
 586          * expects ucs2 in an unsigned long
 587          */
 588
 589         /* convert and normalize 1st string */
 590         for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
 591                 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
 592                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 593                         free( ucs );
 594                         return -1; /* what to do??? */
 595                 }
 596                 len = LDAP_UTF8_CHARLEN( s1 + i );
 597         }
 598         uccanondecomp( ucs, ulen, &ucsout1, &l1 );
 599         l1 = uccanoncomp( ucsout1, l1 );
 600
 601         /* convert and normalize 2nd string */
 602         for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
 603                 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
 604                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 605                         free( ucsout1 );
 606                         free( ucs );
 607                         return 1; /* what to do??? */
 608                 }
 609                 len = LDAP_UTF8_CHARLEN( s2 + i );
 610         }
 611         uccanondecomp( ucs, ulen, &ucsout2, &l2 );
 612         l2 = uccanoncomp( ucsout2, l2 );
 613
 614         free( ucs );
 615
 616         res = casefold
 617                 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
 618                 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
 619         free( ucsout1 );
 620         free( ucsout2 );
 621
 622         if ( res != 0 ) {
 623                 return res;
 624         }
 625         if ( l1 == l2 ) {
 626                 return 0;
 627         }
 628         return l1 > l2 ? 1 : -1;
 629 }