git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap_utf8.h"
  29
  30 #include "ldap-int.h"
  31 #include "ldap_defaults.h"
  32
  33 /*
  34  * Basic UTF-8 routines
  35  */
  36
  37 /*
  38  * return the number of bytes required to hold the
  39  * NULL-terminated UTF-8 string NOT INCLUDING the
  40  * termination.
  41  */
  42 ber_len_t ldap_utf8_bytes( const char * p )
  43 {
  44         ber_len_t bytes;
  45
  46         for( bytes=0; p[bytes]; bytes++ ) {
  47                 /* EMPTY */ ;
  48         }
  49
  50         return bytes;
  51 }
  52
  53 ber_len_t ldap_utf8_chars( const char * p )
  54 {
  55         /* could be optimized and could check for invalid sequences */
  56         ber_len_t chars=0;
  57
  58         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  59                 chars++;
  60         }
  61
  62         return chars;
  63 }
  64
  65 /* return offset to next character */
  66 int ldap_utf8_offset( const char * p )
  67 {
  68         return LDAP_UTF8_NEXT(p) - p;
  69 }
  70
  71 /*
  72  * Returns length indicated by first byte.
  73  */
  74 const char ldap_utf8_lentab[] = {
  75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  81         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  82         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  83
  84 int ldap_utf8_charlen( const char * p )
  85 {
  86         if (!(*p & 0x80))
  87                 return 1;
  88
  89         return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
  90 }
  91
  92 /*
  93  * Make sure the UTF-8 char used the shortest possible encoding
  94  * returns charlen if valid, 0 if not.
  95  *
  96  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
  97  * The table is slightly modified from that of the RFC.
  98  *
  99  * UCS-4 range (hex)      UTF-8 sequence (binary)
 100  * 0000 0000-0000 007F   0.......
 101  * 0000 0080-0000 07FF   110++++. 10......
 102  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 103  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 104  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 105  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 106  *
 107  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 108  * at least one of the '+' bits must be set, otherwise the character
 109  * should have been encoded in fewer octets. Note that in the two-octet
 110  * case, only the first octet needs to be validated, and this is done
 111  * in the ldap_utf8_lentab[] above.
 112  */
 113
/*
 * Mask of required bits in the second octet.
 *
 * ldap_utf8_charlen2() indexes this table with the low five bits of the
 * first octet and rejects the character when none of the masked bits is
 * set in the second octet.  Going by the '+' columns of the encoding
 * table above:
 *   index  0 (first octet 0xe0) -> 0x20
 *   index 16 (first octet 0xf0) -> 0x30
 *   index 24 (first octet 0xf8) -> 0x38
 *   index 28 (first octet 0xfc) -> 0x3c
 * Every other usable first octet already has a '+' bit set itself, so
 * only the continuation marker bit 0x80 is asked for.  The last two
 * entries are 0x00, so they never match.
 *
 * 'c' is a temporary shorthand for the element type; it is undefined
 * again right after the table so that later uses of the identifier 'c'
 * are not affected.
 */
#undef c
#define c const char
c ldap_utf8_mintab[] = {
        (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
        (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
#undef c
 123
 124 int ldap_utf8_charlen2( const char * p )
 125 {
 126         int i = LDAP_UTF8_CHARLEN( p );
 127
 128         if ( i > 2 ) {
 129                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 130                         i = 0;
 131         }
 132         return i;
 133 }
 134
 135 /* conv UTF-8 to UCS-4, useful for comparisons */
 136 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 137 {
 138     const unsigned char *c = p;
 139     ldap_ucs4_t ch;
 140         int len, i;
 141         static unsigned char mask[] = {
 142                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 143
 144         len = LDAP_UTF8_CHARLEN2(p, len);
 145
 146         if( len == 0 ) return LDAP_UCS4_INVALID;
 147
 148         ch = c[0] & mask[len];
 149
 150         for(i=1; i < len; i++) {
 151                 if ((c[i] & 0xc0) != 0x80) {
 152                         return LDAP_UCS4_INVALID;
 153                 }
 154
 155                 ch <<= 6;
 156                 ch |= c[i] & 0x3f;
 157         }
 158
 159         return ch;
 160 }
 161
 162 /* conv UCS-4 to UTF-8, not used */
 163 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 164 {
 165         int len=0;
 166         unsigned char* p = buf;
 167         if(buf == NULL) return 0;
 168
 169         if ( c < 0 ) {
 170                 /* not a valid Unicode character */
 171
 172         } else if( c < 0x80 ) {
 173                 p[len++] = c;
 174
 175         } else if( c < 0x800 ) {
 176                 p[len++] = 0xc0 | ( c >> 6 );
 177                 p[len++] = 0x80 | ( c & 0x3f );
 178
 179         } else if( c < 0x10000 ) {
 180                 p[len++] = 0xe0 | ( c >> 12 );
 181                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 182                 p[len++] = 0x80 | ( c & 0x3f );
 183
 184         } else if( c < 0x200000 ) {
 185                 p[len++] = 0xf0 | ( c >> 18 );
 186                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 187                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 188                 p[len++] = 0x80 | ( c & 0x3f );
 189
 190         } else if( c < 0x4000000 ) {
 191                 p[len++] = 0xf8 | ( c >> 24 );
 192                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 193                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 194                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 195                 p[len++] = 0x80 | ( c & 0x3f );
 196
 197         } else /* if( c < 0x80000000 ) */ {
 198                 p[len++] = 0xfc | ( c >> 30 );
 199                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 200                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 201                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 202                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 203                 p[len++] = 0x80 | ( c & 0x3f );
 204         }
 205
 206         buf[len] = '\0';
 207         return len;
 208 }
 209
 210 /*
 211  * Advance to the next UTF-8 character
 212  *
 213  * Ignores length of multibyte character, instead rely on
 214  * continuation markers to find start of next character.
 215  * This allows for "resyncing" of when invalid characters
 216  * are provided provided the start of the next character
 217  * is appears within the 6 bytes examined.
 218  */
 219 char* ldap_utf8_next( const char * p )
 220 {
 221         int i;
 222         const unsigned char *u = p;
 223
 224         if( LDAP_UTF8_ISASCII(u) ) {
 225                 return (char *) &p[1];
 226         }
 227
 228         for( i=1; i<6; i++ ) {
 229                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 230                         return (char *) &p[i];
 231                 }
 232         }
 233
 234         return (char *) &p[i];
 235 }
 236
 237 /*
 238  * Advance to the previous UTF-8 character
 239  *
 240  * Ignores length of multibyte character, instead rely on
 241  * continuation markers to find start of next character.
 242  * This allows for "resyncing" of when invalid characters
 243  * are provided provided the start of the next character
 244  * is appears within the 6 bytes examined.
 245  */
 246 char* ldap_utf8_prev( const char * p )
 247 {
 248         int i;
 249         const unsigned char *u = p;
 250
 251         for( i=-1; i>-6 ; i-- ) {
 252                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 253                         return (char *) &p[i];
 254                 }
 255         }
 256
 257         return (char *) &p[i];
 258 }
 259
 260 /*
 261  * Copy one UTF-8 character from src to dst returning
 262  * number of bytes copied.
 263  *
 264  * Ignores length of multibyte character, instead rely on
 265  * continuation markers to find start of next character.
 266  * This allows for "resyncing" of when invalid characters
 267  * are provided provided the start of the next character
 268  * is appears within the 6 bytes examined.
 269  */
 270 int ldap_utf8_copy( char* dst, const char *src )
 271 {
 272         int i;
 273         const unsigned char *u = src;
 274
 275         dst[0] = src[0];
 276
 277         if( LDAP_UTF8_ISASCII(u) ) {
 278                 return 1;
 279         }
 280
 281         for( i=1; i<6; i++ ) {
 282                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 283                         return i;
 284                 }
 285                 dst[i] = src[i];
 286         }
 287
 288         return i;
 289 }
 290
 291 #ifndef UTF8_ALPHA_CTYPE
 292 /*
 293  * UTF-8 ctype routines
 294  * Only deals with characters < 0x80 (ie: US-ASCII)
 295  */
 296
 297 int ldap_utf8_isascii( const char * p )
 298 {
 299         unsigned c = * (const unsigned char *) p;
 300         return LDAP_ASCII(c);
 301 }
 302
 303 int ldap_utf8_isdigit( const char * p )
 304 {
 305         unsigned c = * (const unsigned char *) p;
 306
 307         if(!LDAP_ASCII(c)) return 0;
 308
 309         return LDAP_DIGIT( c );
 310 }
 311
 312 int ldap_utf8_isxdigit( const char * p )
 313 {
 314         unsigned c = * (const unsigned char *) p;
 315
 316         if(!LDAP_ASCII(c)) return 0;
 317
 318         return LDAP_HEX(c);
 319 }
 320
 321 int ldap_utf8_isspace( const char * p )
 322 {
 323         unsigned c = * (const unsigned char *) p;
 324
 325         if(!LDAP_ASCII(c)) return 0;
 326
 327         switch(c) {
 328         case ' ':
 329         case '\t':
 330         case '\n':
 331         case '\r':
 332         case '\v':
 333         case '\f':
 334                 return 1;
 335         }
 336
 337         return 0;
 338 }
 339
 340 /*
 341  * These are not needed by the C SDK and are
 342  * not "good enough" for general use.
 343  */
 344 int ldap_utf8_isalpha( const char * p )
 345 {
 346         unsigned c = * (const unsigned char *) p;
 347
 348         if(!LDAP_ASCII(c)) return 0;
 349
 350         return LDAP_ALPHA(c);
 351 }
 352
 353 int ldap_utf8_isalnum( const char * p )
 354 {
 355         unsigned c = * (const unsigned char *) p;
 356
 357         if(!LDAP_ASCII(c)) return 0;
 358
 359         return LDAP_ALNUM(c);
 360 }
 361
 362 int ldap_utf8_islower( const char * p )
 363 {
 364         unsigned c = * (const unsigned char *) p;
 365
 366         if(!LDAP_ASCII(c)) return 0;
 367
 368         return LDAP_LOWER(c);
 369 }
 370
 371 int ldap_utf8_isupper( const char * p )
 372 {
 373         unsigned c = * (const unsigned char *) p;
 374
 375         if(!LDAP_ASCII(c)) return 0;
 376
 377         return LDAP_UPPER(c);
 378 }
 379 #endif
 380
 381
 382 /*
 383  * UTF-8 string routines
 384  */
 385
 386 /* like strchr() */
 387 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 388 {
 389         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 390                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 391                         return (char *) str;
 392                 }
 393         }
 394
 395         return NULL;
 396 }
 397
 398 /* like strcspn() but returns number of bytes, not characters */
 399 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 400 {
 401         const char *cstr;
 402         const char *cset;
 403
 404         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 405                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 406                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 407                                 return cstr - str;
 408                         }
 409                 }
 410         }
 411
 412         return cstr - str;
 413 }
 414
 415 /* like strspn() but returns number of bytes, not characters */
 416 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 417 {
 418         const char *cstr;
 419         const char *cset;
 420
 421         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 422                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 423                         if( *cset == '\0' ) {
 424                                 return cstr - str;
 425                         }
 426
 427                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 428                                 break;
 429                         }
 430                 }
 431         }
 432
 433         return cstr - str;
 434 }
 435
 436 /* like strpbrk(), replaces strchr() as well */
 437 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 438 {
 439         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 440                 const char *cset;
 441
 442                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 443                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 444                                 return (char *) str;
 445                         }
 446                 }
 447         }
 448
 449         return NULL;
 450 }
 451
 452 /* like strtok_r(), not strtok() */
 453 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 454 {
 455         char *begin;
 456         char *end;
 457
 458         if( last == NULL ) return NULL;
 459
 460         begin = str ? str : *last;
 461
 462         begin += ldap_utf8_strspn( begin, sep );
 463
 464         if( *begin == '\0' ) {
 465                 *last = NULL;
 466                 return NULL;
 467         }
 468
 469         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 470
 471         if( *end != '\0' ) {
 472                 char *next = LDAP_UTF8_NEXT( end );
 473                 *end = '\0';
 474                 end = next;
 475         }
 476
 477         *last = end;
 478         return begin;
 479 }