git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2003 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6 /*
   7  * Portions Copyright (C) The Internet Society (1998)
   8  * UTF-8 encodings are derived from those in RFC 2279;
   9  * see RFC for full legal notices.
  10  */
  11
  12 /*
  13  * Basic UTF-8 routines
  14  *
  15  * These routines are "dumb".  Though they understand UTF-8,
  16  * they don't grok Unicode.  That is, they can push bits,
  17  * but don't have a clue what the bits represent.  That's
  18  * good enough for use with the LDAP Client SDK.
  19  *
  20  * These routines are not optimized.
  21  */
  22
  23 #include "portable.h"
  24
  25 #include <stdio.h>
  26
  27 #include <ac/stdlib.h>
  28
  29 #include <ac/socket.h>
  30 #include <ac/string.h>
  31 #include <ac/time.h>
  32
  33 #include "ldap_utf8.h"
  34
  35 #include "ldap-int.h"
  36 #include "ldap_defaults.h"
  37
  38 /*
  39  * Basic UTF-8 routines
  40  */
  41
  42 /*
  43  * return the number of bytes required to hold the
  44  * NULL-terminated UTF-8 string NOT INCLUDING the
  45  * termination.
  46  */
  47 ber_len_t ldap_utf8_bytes( const char * p )
  48 {
  49         ber_len_t bytes;
  50
  51         for( bytes=0; p[bytes]; bytes++ ) {
  52                 /* EMPTY */ ;
  53         }
  54
  55         return bytes;
  56 }
  57
  58 ber_len_t ldap_utf8_chars( const char * p )
  59 {
  60         /* could be optimized and could check for invalid sequences */
  61         ber_len_t chars=0;
  62
  63         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  64                 chars++;
  65         }
  66
  67         return chars;
  68 }
  69
  70 /* return offset to next character */
  71 int ldap_utf8_offset( const char * p )
  72 {
  73         return LDAP_UTF8_NEXT(p) - p;
  74 }
  75
  76 /*
  77  * Returns length indicated by first byte.
  78  */
  79 const char ldap_utf8_lentab[] = {
  80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  81         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  82         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  84         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  85         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  86         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  87         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  88
  89 int ldap_utf8_charlen( const char * p )
  90 {
  91         if (!(*p & 0x80))
  92                 return 1;
  93
  94         return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
  95 }
  96
  97 /*
  98  * Make sure the UTF-8 char used the shortest possible encoding
  99  * returns charlen if valid, 0 if not.
 100  *
 101  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 102  * The table is slightly modified from that of the RFC.
 103  *
 104  * UCS-4 range (hex)      UTF-8 sequence (binary)
 105  * 0000 0000-0000 007F   0.......
 106  * 0000 0080-0000 07FF   110++++. 10......
 107  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 108  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 109  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 110  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 111  *
 112  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 113  * at least one of the '+' bits must be set, otherwise the character
 114  * should have been encoded in fewer octets. Note that in the two-octet
 115  * case, only the first octet needs to be validated, and this is done
 116  * in the ldap_utf8_lentab[] above.
 117  */
 118
/*
 * mask of required bits in second octet
 *
 * ldap_utf8_charlen2() indexes this table with the low five bits of
 * the first octet (*p & 0x1f) and ANDs the entry with the second
 * octet; a zero result makes it report the character as invalid.
 * The table has 32 entries, one per possible index value.
 */
#undef c
/* temporary shorthand so every initializer can carry a (const char) cast */
#define c const char
c ldap_utf8_mintab[] = {
        (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
/* drop the shorthand so it cannot leak into the rest of the file */
#undef c
 128
 129 int ldap_utf8_charlen2( const char * p )
 130 {
 131         int i = LDAP_UTF8_CHARLEN( p );
 132
 133         if ( i > 2 ) {
 134                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 135                         i = 0;
 136         }
 137         return i;
 138 }
 139
 140 /* conv UTF-8 to UCS-4, useful for comparisons */
 141 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 142 {
 143     const unsigned char *c = (const unsigned char *) p;
 144     ldap_ucs4_t ch;
 145         int len, i;
 146         static unsigned char mask[] = {
 147                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 148
 149         len = LDAP_UTF8_CHARLEN2(p, len);
 150
 151         if( len == 0 ) return LDAP_UCS4_INVALID;
 152
 153         ch = c[0] & mask[len];
 154
 155         for(i=1; i < len; i++) {
 156                 if ((c[i] & 0xc0) != 0x80) {
 157                         return LDAP_UCS4_INVALID;
 158                 }
 159
 160                 ch <<= 6;
 161                 ch |= c[i] & 0x3f;
 162         }
 163
 164         return ch;
 165 }
 166
 167 /* conv UCS-4 to UTF-8, not used */
 168 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 169 {
 170         int len=0;
 171         unsigned char* p = (unsigned char *) buf;
 172
 173         /* not a valid Unicode character */
 174         if ( c < 0 ) return 0;
 175
 176         /* Just return length, don't convert */
 177         if(buf == NULL) {
 178                 if( c < 0x80 ) return 1;
 179                 else if( c < 0x800 ) return 2;
 180                 else if( c < 0x10000 ) return 3;
 181                 else if( c < 0x200000 ) return 4;
 182                 else if( c < 0x4000000 ) return 5;
 183                 else return 6;
 184         }
 185
 186         if( c < 0x80 ) {
 187                 p[len++] = c;
 188
 189         } else if( c < 0x800 ) {
 190                 p[len++] = 0xc0 | ( c >> 6 );
 191                 p[len++] = 0x80 | ( c & 0x3f );
 192
 193         } else if( c < 0x10000 ) {
 194                 p[len++] = 0xe0 | ( c >> 12 );
 195                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 196                 p[len++] = 0x80 | ( c & 0x3f );
 197
 198         } else if( c < 0x200000 ) {
 199                 p[len++] = 0xf0 | ( c >> 18 );
 200                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 201                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 202                 p[len++] = 0x80 | ( c & 0x3f );
 203
 204         } else if( c < 0x4000000 ) {
 205                 p[len++] = 0xf8 | ( c >> 24 );
 206                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 207                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 208                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 209                 p[len++] = 0x80 | ( c & 0x3f );
 210
 211         } else /* if( c < 0x80000000 ) */ {
 212                 p[len++] = 0xfc | ( c >> 30 );
 213                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 214                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 215                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 216                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 217                 p[len++] = 0x80 | ( c & 0x3f );
 218         }
 219
 220         return len;
 221 }
 222
 223 #define LDAP_UCS_UTF8LEN(c)     \
 224         c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
 225         (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
 226
 227 /* Convert a string to UTF-8 format. The input string is expected to
 228  * have characters of 1, 2, or 4 octets (in network byte order)
 229  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
 230  * types respectively. (Here T61STRING just means that there is one
 231  * octet per character and characters may use the high bit of the octet.
 232  * The characters are assumed to use ISO mappings, no provision is made
 233  * for converting from T.61 coding rules to Unicode.)
 234  */
 235
 236 int
 237 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
 238 {
 239         unsigned char *in, *end;
 240         char *ptr;
 241         ldap_ucs4_t u;
 242         int i, l = 0;
 243
 244         utf8s->bv_val = NULL;
 245         utf8s->bv_len = 0;
 246
 247         in = (unsigned char *)ucs->bv_val;
 248
 249         /* Make sure we stop at an even multiple of csize */
 250         end = in + ( ucs->bv_len & ~(csize-1) );
 251
 252         for (; in < end; ) {
 253                 u = *in++;
 254                 if (csize > 1) {
 255                         u <<= 8;
 256                         u |= *in++;
 257                 }
 258                 if (csize > 2) {
 259                         u <<= 8;
 260                         u |= *in++;
 261                         u <<= 8;
 262                         u |= *in++;
 263                 }
 264                 i = LDAP_UCS_UTF8LEN(u);
 265                 if (i == 0)
 266                         return LDAP_INVALID_SYNTAX;
 267                 l += i;
 268         }
 269
 270         utf8s->bv_val = LDAP_MALLOC( l+1 );
 271         if (utf8s->bv_val == NULL)
 272                 return LDAP_NO_MEMORY;
 273         utf8s->bv_len = l;
 274
 275         ptr = utf8s->bv_val;
 276         for (in = (unsigned char *)ucs->bv_val; in < end; ) {
 277                 u = *in++;
 278                 if (csize > 1) {
 279                         u <<= 8;
 280                         u |= *in++;
 281                 }
 282                 if (csize > 2) {
 283                         u <<= 8;
 284                         u |= *in++;
 285                         u <<= 8;
 286                         u |= *in++;
 287                 }
 288                 ptr += ldap_x_ucs4_to_utf8(u, ptr);
 289         }
 290         *ptr = '\0';
 291         return LDAP_SUCCESS;
 292 }
 293
 294 /*
 295  * Advance to the next UTF-8 character
 296  *
 297  * Ignores length of multibyte character, instead rely on
 298  * continuation markers to find start of next character.
 299  * This allows for "resyncing" of when invalid characters
 300  * are provided provided the start of the next character
 301  * is appears within the 6 bytes examined.
 302  */
 303 char* ldap_utf8_next( const char * p )
 304 {
 305         int i;
 306         const unsigned char *u = (const unsigned char *) p;
 307
 308         if( LDAP_UTF8_ISASCII(u) ) {
 309                 return (char *) &p[1];
 310         }
 311
 312         for( i=1; i<6; i++ ) {
 313                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 314                         return (char *) &p[i];
 315                 }
 316         }
 317
 318         return (char *) &p[i];
 319 }
 320
 321 /*
 322  * Advance to the previous UTF-8 character
 323  *
 324  * Ignores length of multibyte character, instead rely on
 325  * continuation markers to find start of next character.
 326  * This allows for "resyncing" of when invalid characters
 327  * are provided provided the start of the next character
 328  * is appears within the 6 bytes examined.
 329  */
 330 char* ldap_utf8_prev( const char * p )
 331 {
 332         int i;
 333         const unsigned char *u = (const unsigned char *) p;
 334
 335         for( i=-1; i>-6 ; i-- ) {
 336                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 337                         return (char *) &p[i];
 338                 }
 339         }
 340
 341         return (char *) &p[i];
 342 }
 343
 344 /*
 345  * Copy one UTF-8 character from src to dst returning
 346  * number of bytes copied.
 347  *
 348  * Ignores length of multibyte character, instead rely on
 349  * continuation markers to find start of next character.
 350  * This allows for "resyncing" of when invalid characters
 351  * are provided provided the start of the next character
 352  * is appears within the 6 bytes examined.
 353  */
 354 int ldap_utf8_copy( char* dst, const char *src )
 355 {
 356         int i;
 357         const unsigned char *u = (const unsigned char *) src;
 358
 359         dst[0] = src[0];
 360
 361         if( LDAP_UTF8_ISASCII(u) ) {
 362                 return 1;
 363         }
 364
 365         for( i=1; i<6; i++ ) {
 366                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 367                         return i;
 368                 }
 369                 dst[i] = src[i];
 370         }
 371
 372         return i;
 373 }
 374
 375 #ifndef UTF8_ALPHA_CTYPE
 376 /*
 377  * UTF-8 ctype routines
 378  * Only deals with characters < 0x80 (ie: US-ASCII)
 379  */
 380
 381 int ldap_utf8_isascii( const char * p )
 382 {
 383         unsigned c = * (const unsigned char *) p;
 384         return LDAP_ASCII(c);
 385 }
 386
 387 int ldap_utf8_isdigit( const char * p )
 388 {
 389         unsigned c = * (const unsigned char *) p;
 390
 391         if(!LDAP_ASCII(c)) return 0;
 392
 393         return LDAP_DIGIT( c );
 394 }
 395
 396 int ldap_utf8_isxdigit( const char * p )
 397 {
 398         unsigned c = * (const unsigned char *) p;
 399
 400         if(!LDAP_ASCII(c)) return 0;
 401
 402         return LDAP_HEX(c);
 403 }
 404
 405 int ldap_utf8_isspace( const char * p )
 406 {
 407         unsigned c = * (const unsigned char *) p;
 408
 409         if(!LDAP_ASCII(c)) return 0;
 410
 411         switch(c) {
 412         case ' ':
 413         case '\t':
 414         case '\n':
 415         case '\r':
 416         case '\v':
 417         case '\f':
 418                 return 1;
 419         }
 420
 421         return 0;
 422 }
 423
 424 /*
 425  * These are not needed by the C SDK and are
 426  * not "good enough" for general use.
 427  */
 428 int ldap_utf8_isalpha( const char * p )
 429 {
 430         unsigned c = * (const unsigned char *) p;
 431
 432         if(!LDAP_ASCII(c)) return 0;
 433
 434         return LDAP_ALPHA(c);
 435 }
 436
 437 int ldap_utf8_isalnum( const char * p )
 438 {
 439         unsigned c = * (const unsigned char *) p;
 440
 441         if(!LDAP_ASCII(c)) return 0;
 442
 443         return LDAP_ALNUM(c);
 444 }
 445
 446 int ldap_utf8_islower( const char * p )
 447 {
 448         unsigned c = * (const unsigned char *) p;
 449
 450         if(!LDAP_ASCII(c)) return 0;
 451
 452         return LDAP_LOWER(c);
 453 }
 454
 455 int ldap_utf8_isupper( const char * p )
 456 {
 457         unsigned c = * (const unsigned char *) p;
 458
 459         if(!LDAP_ASCII(c)) return 0;
 460
 461         return LDAP_UPPER(c);
 462 }
 463 #endif
 464
 465
 466 /*
 467  * UTF-8 string routines
 468  */
 469
 470 /* like strchr() */
 471 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 472 {
 473         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 474                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 475                         return (char *) str;
 476                 }
 477         }
 478
 479         return NULL;
 480 }
 481
 482 /* like strcspn() but returns number of bytes, not characters */
 483 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 484 {
 485         const char *cstr;
 486         const char *cset;
 487
 488         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 489                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 490                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 491                                 return cstr - str;
 492                         }
 493                 }
 494         }
 495
 496         return cstr - str;
 497 }
 498
 499 /* like strspn() but returns number of bytes, not characters */
 500 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 501 {
 502         const char *cstr;
 503         const char *cset;
 504
 505         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 506                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 507                         if( *cset == '\0' ) {
 508                                 return cstr - str;
 509                         }
 510
 511                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 512                                 break;
 513                         }
 514                 }
 515         }
 516
 517         return cstr - str;
 518 }
 519
 520 /* like strpbrk(), replaces strchr() as well */
 521 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 522 {
 523         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 524                 const char *cset;
 525
 526                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 527                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 528                                 return (char *) str;
 529                         }
 530                 }
 531         }
 532
 533         return NULL;
 534 }
 535
 536 /* like strtok_r(), not strtok() */
 537 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 538 {
 539         char *begin;
 540         char *end;
 541
 542         if( last == NULL ) return NULL;
 543
 544         begin = str ? str : *last;
 545
 546         begin += ldap_utf8_strspn( begin, sep );
 547
 548         if( *begin == '\0' ) {
 549                 *last = NULL;
 550                 return NULL;
 551         }
 552
 553         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 554
 555         if( *end != '\0' ) {
 556                 char *next = LDAP_UTF8_NEXT( end );
 557                 *end = '\0';
 558                 end = next;
 559         }
 560
 561         *last = end;
 562         return begin;
 563 }