git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap_utf8.h"
  29
  30 #include "ldap-int.h"
  31 #include "ldap_defaults.h"
  32
  33 /*
  34  * Basic UTF-8 routines
  35  */
  36
  37 /*
  38  * return the number of bytes required to hold the
  39  * NULL-terminated UTF-8 string NOT INCLUDING the
  40  * termination.
  41  */
  42 ber_len_t ldap_utf8_bytes( const char * p )
  43 {
  44         ber_len_t bytes;
  45
  46         for( bytes=0; p[bytes]; bytes++ ) {
  47                 /* EMPTY */ ;
  48         }
  49
  50         return bytes;
  51 }
  52
  53 ber_len_t ldap_utf8_chars( const char * p )
  54 {
  55         /* could be optimized and could check for invalid sequences */
  56         ber_len_t chars=0;
  57
  58         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  59                 chars++;
  60         }
  61
  62         return chars;
  63 }
  64
  65 /* return offset to next character */
  66 int ldap_utf8_offset( const char * p )
  67 {
  68         return LDAP_UTF8_NEXT(p) - p;
  69 }
  70
  71 /*
  72  * Returns length indicated by first byte.
  73  */
  74 const char ldap_utf8_lentab[] = {
  75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  81         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  82         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  83
  84 int ldap_utf8_charlen( const char * p )
  85 {
  86         if (!(*p & 0x80))
  87                 return 1;
  88
  89         return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
  90 }
  91
  92 /*
  93  * Make sure the UTF-8 char used the shortest possible encoding
  94  * returns charlen if valid, 0 if not.
  95  *
  96  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
  97  * The table is slightly modified from that of the RFC.
  98  *
  99  * UCS-4 range (hex)      UTF-8 sequence (binary)
 100  * 0000 0000-0000 007F   0.......
 101  * 0000 0080-0000 07FF   110++++. 10......
 102  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 103  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 104  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 105  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 106  *
 107  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 108  * at least one of the '+' bits must be set, otherwise the character
 109  * should have been encoded in fewer octets. Note that in the two-octet
 110  * case, only the first octet needs to be validated, and this is done
 111  * in the ldap_utf8_lentab[] above.
 112  */
 113
 114 /* mask of required bits in second octet */
 115 #undef c
 116 #define c const char
 117 c ldap_utf8_mintab[] = {
 118         (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 119         (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 120         (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 121         (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
 122 #undef c
 123
 124 int ldap_utf8_charlen2( const char * p )
 125 {
 126         int i = LDAP_UTF8_CHARLEN( p );
 127
 128         if ( i > 2 ) {
 129                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 130                         i = 0;
 131         }
 132         return i;
 133 }
 134
 135 /* conv UTF-8 to UCS-4, useful for comparisons */
 136 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 137 {
 138     const unsigned char *c = p;
 139     ldap_ucs4_t ch;
 140         int len, i;
 141         static unsigned char mask[] = {
 142                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 143
 144         len = LDAP_UTF8_CHARLEN2(p, len);
 145
 146         if( len == 0 ) return LDAP_UCS4_INVALID;
 147
 148         ch = c[0] & mask[len];
 149
 150         for(i=1; i < len; i++) {
 151                 if ((c[i] & 0xc0) != 0x80) {
 152                         return LDAP_UCS4_INVALID;
 153                 }
 154
 155                 ch <<= 6;
 156                 ch |= c[i] & 0x3f;
 157         }
 158
 159         return ch;
 160 }
 161
 162 /* conv UCS-4 to UTF-8, not used */
 163 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 164 {
 165         int len=0;
 166         unsigned char* p = buf;
 167
 168         /* not a valid Unicode character */
 169         if ( c < 0 ) return 0;
 170
 171         /* Just return length, don't convert */
 172         if(buf == NULL) {
 173                 if( c < 0x80 ) return 1;
 174                 else if( c < 0x800 ) return 2;
 175                 else if( c < 0x10000 ) return 3;
 176                 else if( c < 0x200000 ) return 4;
 177                 else if( c < 0x4000000 ) return 5;
 178                 else return 6;
 179         }
 180
 181         if( c < 0x80 ) {
 182                 p[len++] = c;
 183
 184         } else if( c < 0x800 ) {
 185                 p[len++] = 0xc0 | ( c >> 6 );
 186                 p[len++] = 0x80 | ( c & 0x3f );
 187
 188         } else if( c < 0x10000 ) {
 189                 p[len++] = 0xe0 | ( c >> 12 );
 190                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 191                 p[len++] = 0x80 | ( c & 0x3f );
 192
 193         } else if( c < 0x200000 ) {
 194                 p[len++] = 0xf0 | ( c >> 18 );
 195                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 196                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 197                 p[len++] = 0x80 | ( c & 0x3f );
 198
 199         } else if( c < 0x4000000 ) {
 200                 p[len++] = 0xf8 | ( c >> 24 );
 201                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 202                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 203                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 204                 p[len++] = 0x80 | ( c & 0x3f );
 205
 206         } else /* if( c < 0x80000000 ) */ {
 207                 p[len++] = 0xfc | ( c >> 30 );
 208                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 209                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 210                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 211                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 212                 p[len++] = 0x80 | ( c & 0x3f );
 213         }
 214
 215         return len;
 216 }
 217
 218 #define LDAP_UCS_UTF8LEN(c)     \
 219         c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
 220         (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
 221
 222 /* Convert a string to UTF-8 format. The input string is expected to
 223  * have characters of 1, 2, or 4 octets (in network byte order)
 224  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
 225  * types respectively. (Here T61STRING just means that there is one
 226  * octet per character and characters may use the high bit of the octet.
 227  * The characters are assumed to use ISO mappings, no provision is made
 228  * for converting from T.61 coding rules to Unicode.)
 229  */
 230
 231 int
 232 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
 233 {
 234         unsigned char *in, *end;
 235         char *ptr;
 236         ldap_ucs4_t u;
 237         int i, l = 0;
 238
 239         utf8s->bv_val = NULL;
 240         utf8s->bv_len = 0;
 241
 242         in = (unsigned char *)ucs->bv_val;
 243
 244         /* Make sure we stop at an even multiple of csize */
 245         end = in + ( ucs->bv_len & ~(csize-1) );
 246
 247         for (; in < end; ) {
 248                 u = *in++;
 249                 if (csize > 1) {
 250                         u <<= 8;
 251                         u |= *in++;
 252                 }
 253                 if (csize > 2) {
 254                         u <<= 8;
 255                         u |= *in++;
 256                         u <<= 8;
 257                         u |= *in++;
 258                 }
 259                 i = LDAP_UCS_UTF8LEN(u);
 260                 if (i == 0)
 261                         return LDAP_INVALID_SYNTAX;
 262                 l += i;
 263         }
 264
 265         utf8s->bv_val = LDAP_MALLOC( l+1 );
 266         if (utf8s->bv_val == NULL)
 267                 return LDAP_NO_MEMORY;
 268         utf8s->bv_len = l;
 269
 270         ptr = utf8s->bv_val;
 271         for (in = (unsigned char *)ucs->bv_val; in < end; ) {
 272                 u = *in++;
 273                 if (csize > 1) {
 274                         u <<= 8;
 275                         u |= *in++;
 276                 }
 277                 if (csize > 2) {
 278                         u <<= 8;
 279                         u |= *in++;
 280                         u <<= 8;
 281                         u |= *in++;
 282                 }
 283                 ptr += ldap_x_ucs4_to_utf8(u, ptr);
 284         }
 285         *ptr = '\0';
 286         return LDAP_SUCCESS;
 287 }
 288
 289 /*
 290  * Advance to the next UTF-8 character
 291  *
 292  * Ignores length of multibyte character, instead rely on
 293  * continuation markers to find start of next character.
 294  * This allows for "resyncing" of when invalid characters
 295  * are provided provided the start of the next character
 296  * is appears within the 6 bytes examined.
 297  */
 298 char* ldap_utf8_next( const char * p )
 299 {
 300         int i;
 301         const unsigned char *u = p;
 302
 303         if( LDAP_UTF8_ISASCII(u) ) {
 304                 return (char *) &p[1];
 305         }
 306
 307         for( i=1; i<6; i++ ) {
 308                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 309                         return (char *) &p[i];
 310                 }
 311         }
 312
 313         return (char *) &p[i];
 314 }
 315
 316 /*
 317  * Advance to the previous UTF-8 character
 318  *
 319  * Ignores length of multibyte character, instead rely on
 320  * continuation markers to find start of next character.
 321  * This allows for "resyncing" of when invalid characters
 322  * are provided provided the start of the next character
 323  * is appears within the 6 bytes examined.
 324  */
 325 char* ldap_utf8_prev( const char * p )
 326 {
 327         int i;
 328         const unsigned char *u = p;
 329
 330         for( i=-1; i>-6 ; i-- ) {
 331                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 332                         return (char *) &p[i];
 333                 }
 334         }
 335
 336         return (char *) &p[i];
 337 }
 338
 339 /*
 340  * Copy one UTF-8 character from src to dst returning
 341  * number of bytes copied.
 342  *
 343  * Ignores length of multibyte character, instead rely on
 344  * continuation markers to find start of next character.
 345  * This allows for "resyncing" of when invalid characters
 346  * are provided provided the start of the next character
 347  * is appears within the 6 bytes examined.
 348  */
 349 int ldap_utf8_copy( char* dst, const char *src )
 350 {
 351         int i;
 352         const unsigned char *u = src;
 353
 354         dst[0] = src[0];
 355
 356         if( LDAP_UTF8_ISASCII(u) ) {
 357                 return 1;
 358         }
 359
 360         for( i=1; i<6; i++ ) {
 361                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 362                         return i;
 363                 }
 364                 dst[i] = src[i];
 365         }
 366
 367         return i;
 368 }
 369
 370 #ifndef UTF8_ALPHA_CTYPE
 371 /*
 372  * UTF-8 ctype routines
 373  * Only deals with characters < 0x80 (ie: US-ASCII)
 374  */
 375
 376 int ldap_utf8_isascii( const char * p )
 377 {
 378         unsigned c = * (const unsigned char *) p;
 379         return LDAP_ASCII(c);
 380 }
 381
 382 int ldap_utf8_isdigit( const char * p )
 383 {
 384         unsigned c = * (const unsigned char *) p;
 385
 386         if(!LDAP_ASCII(c)) return 0;
 387
 388         return LDAP_DIGIT( c );
 389 }
 390
 391 int ldap_utf8_isxdigit( const char * p )
 392 {
 393         unsigned c = * (const unsigned char *) p;
 394
 395         if(!LDAP_ASCII(c)) return 0;
 396
 397         return LDAP_HEX(c);
 398 }
 399
 400 int ldap_utf8_isspace( const char * p )
 401 {
 402         unsigned c = * (const unsigned char *) p;
 403
 404         if(!LDAP_ASCII(c)) return 0;
 405
 406         switch(c) {
 407         case ' ':
 408         case '\t':
 409         case '\n':
 410         case '\r':
 411         case '\v':
 412         case '\f':
 413                 return 1;
 414         }
 415
 416         return 0;
 417 }
 418
 419 /*
 420  * These are not needed by the C SDK and are
 421  * not "good enough" for general use.
 422  */
 423 int ldap_utf8_isalpha( const char * p )
 424 {
 425         unsigned c = * (const unsigned char *) p;
 426
 427         if(!LDAP_ASCII(c)) return 0;
 428
 429         return LDAP_ALPHA(c);
 430 }
 431
 432 int ldap_utf8_isalnum( const char * p )
 433 {
 434         unsigned c = * (const unsigned char *) p;
 435
 436         if(!LDAP_ASCII(c)) return 0;
 437
 438         return LDAP_ALNUM(c);
 439 }
 440
 441 int ldap_utf8_islower( const char * p )
 442 {
 443         unsigned c = * (const unsigned char *) p;
 444
 445         if(!LDAP_ASCII(c)) return 0;
 446
 447         return LDAP_LOWER(c);
 448 }
 449
 450 int ldap_utf8_isupper( const char * p )
 451 {
 452         unsigned c = * (const unsigned char *) p;
 453
 454         if(!LDAP_ASCII(c)) return 0;
 455
 456         return LDAP_UPPER(c);
 457 }
 458 #endif
 459
 460
 461 /*
 462  * UTF-8 string routines
 463  */
 464
 465 /* like strchr() */
 466 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 467 {
 468         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 469                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 470                         return (char *) str;
 471                 }
 472         }
 473
 474         return NULL;
 475 }
 476
 477 /* like strcspn() but returns number of bytes, not characters */
 478 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 479 {
 480         const char *cstr;
 481         const char *cset;
 482
 483         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 484                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 485                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 486                                 return cstr - str;
 487                         }
 488                 }
 489         }
 490
 491         return cstr - str;
 492 }
 493
 494 /* like strspn() but returns number of bytes, not characters */
 495 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 496 {
 497         const char *cstr;
 498         const char *cset;
 499
 500         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 501                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 502                         if( *cset == '\0' ) {
 503                                 return cstr - str;
 504                         }
 505
 506                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 507                                 break;
 508                         }
 509                 }
 510         }
 511
 512         return cstr - str;
 513 }
 514
 515 /* like strpbrk(), replaces strchr() as well */
 516 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 517 {
 518         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 519                 const char *cset;
 520
 521                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 522                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 523                                 return (char *) str;
 524                         }
 525                 }
 526         }
 527
 528         return NULL;
 529 }
 530
 531 /* like strtok_r(), not strtok() */
 532 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 533 {
 534         char *begin;
 535         char *end;
 536
 537         if( last == NULL ) return NULL;
 538
 539         begin = str ? str : *last;
 540
 541         begin += ldap_utf8_strspn( begin, sep );
 542
 543         if( *begin == '\0' ) {
 544                 *last = NULL;
 545                 return NULL;
 546         }
 547
 548         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 549
 550         if( *end != '\0' ) {
 551                 char *next = LDAP_UTF8_NEXT( end );
 552                 *end = '\0';
 553                 end = next;
 554         }
 555
 556         *last = end;
 557         return begin;
 558 }