git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap_utf8.h"
  29
  30 #include "ldap-int.h"
  31 #include "ldap_defaults.h"
  32
  33 /*
  34  * Basic UTF-8 routines
  35  */
  36
  37 /*
  38  * return the number of bytes required to hold the
  39  * NULL-terminated UTF-8 string NOT INCLUDING the
  40  * termination.
  41  */
  42 ber_len_t ldap_utf8_bytes( const char * p )
  43 {
  44         ber_len_t bytes;
  45
  46         for( bytes=0; p[bytes]; bytes++ ) {
  47                 /* EMPTY */ ;
  48         }
  49
  50         return bytes;
  51 }
  52
  53 ber_len_t ldap_utf8_chars( const char * p )
  54 {
  55         /* could be optimized and could check for invalid sequences */
  56         ber_len_t chars=0;
  57
  58         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  59                 chars++;
  60         }
  61
  62         return chars;
  63 }
  64
  65 /* return offset to next character */
  66 int ldap_utf8_offset( const char * p )
  67 {
  68         return LDAP_UTF8_NEXT(p) - p;
  69 }
  70
  71 /*
  72  * Returns length indicated by first byte.
  73  */
  74 const char ldap_utf8_lentab[] = {
  75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  81         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  82         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  83
  84 int ldap_utf8_charlen( const char * p )
  85 {
  86         if (!(*p & 0x80))
  87                 return 1;
  88
  89         return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
  90 }
  91
  92 /*
  93  * Make sure the UTF-8 char used the shortest possible encoding
  94  * returns charlen if valid, 0 if not.
  95  *
  96  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
  97  * The table is slightly modified from that of the RFC.
  98  *
  99  * UCS-4 range (hex)      UTF-8 sequence (binary)
 100  * 0000 0000-0000 007F   0.......
 101  * 0000 0080-0000 07FF   110++++. 10......
 102  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 103  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 104  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 105  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 106  *
 107  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 108  * at least one of the '+' bits must be set, otherwise the character
 109  * should have been encoded in fewer octets. Note that in the two-octet
 110  * case, only the first octet needs to be validated, and this is done
 111  * in the ldap_utf8_lentab[] above.
 112  */
 113
 114 /* mask of required bits in second octet */
 115 const char ldap_utf8_mintab[] = {
 116         0x20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 117         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 118         0x30, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 119         0x38, 0x80, 0x80, 0x80, 0x3c, 0x80, 0x00, 0x00 };
 120
 121 int ldap_utf8_charlen2( const char * p )
 122 {
 123         int i = LDAP_UTF8_CHARLEN( p );
 124
 125         if ( i > 2 ) {
 126                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 127                         i = 0;
 128         }
 129         return i;
 130 }
 131
 132 /* conv UTF-8 to UCS-4, useful for comparisons */
 133 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 134 {
 135     const unsigned char *c = p;
 136     ldap_ucs4_t ch;
 137         int len, i;
 138         static unsigned char mask[] = {
 139                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 140
 141         len = LDAP_UTF8_CHARLEN2(p, len);
 142
 143         if( len == 0 ) return LDAP_UCS4_INVALID;
 144
 145         ch = c[0] & mask[len];
 146
 147         for(i=1; i < len; i++) {
 148                 if ((c[i] & 0xc0) != 0x80) {
 149                         return LDAP_UCS4_INVALID;
 150                 }
 151
 152                 ch <<= 6;
 153                 ch |= c[i] & 0x3f;
 154         }
 155
 156         return ch;
 157 }
 158
 159 /* conv UCS-4 to UTF-8, not used */
 160 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 161 {
 162         int len=0;
 163         unsigned char* p = buf;
 164         if(buf == NULL) return 0;
 165
 166         if ( c < 0 ) {
 167                 /* not a valid Unicode character */
 168
 169         } else if( c < 0x80 ) {
 170                 p[len++] = c;
 171
 172         } else if( c < 0x800 ) {
 173                 p[len++] = 0xc0 | ( c >> 6 );
 174                 p[len++] = 0x80 | ( c & 0x3f );
 175
 176         } else if( c < 0x10000 ) {
 177                 p[len++] = 0xe0 | ( c >> 12 );
 178                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 179                 p[len++] = 0x80 | ( c & 0x3f );
 180
 181         } else if( c < 0x200000 ) {
 182                 p[len++] = 0xf0 | ( c >> 18 );
 183                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 184                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 185                 p[len++] = 0x80 | ( c & 0x3f );
 186
 187         } else if( c < 0x4000000 ) {
 188                 p[len++] = 0xf8 | ( c >> 24 );
 189                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 190                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 191                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 192                 p[len++] = 0x80 | ( c & 0x3f );
 193
 194         } else /* if( c < 0x80000000 ) */ {
 195                 p[len++] = 0xfc | ( c >> 30 );
 196                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 197                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 198                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 199                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 200                 p[len++] = 0x80 | ( c & 0x3f );
 201         }
 202
 203         buf[len] = '\0';
 204         return len;
 205 }
 206
 207 /*
 208  * Advance to the next UTF-8 character
 209  *
 210  * Ignores length of multibyte character, instead rely on
 211  * continuation markers to find start of next character.
 212  * This allows for "resyncing" of when invalid characters
 213  * are provided provided the start of the next character
 214  * is appears within the 6 bytes examined.
 215  */
 216 char* ldap_utf8_next( const char * p )
 217 {
 218         int i;
 219         const unsigned char *u = p;
 220
 221         if( LDAP_UTF8_ISASCII(u) ) {
 222                 return (char *) &p[1];
 223         }
 224
 225         for( i=1; i<6; i++ ) {
 226                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 227                         return (char *) &p[i];
 228                 }
 229         }
 230
 231         return (char *) &p[i];
 232 }
 233
 234 /*
 235  * Advance to the previous UTF-8 character
 236  *
 237  * Ignores length of multibyte character, instead rely on
 238  * continuation markers to find start of next character.
 239  * This allows for "resyncing" of when invalid characters
 240  * are provided provided the start of the next character
 241  * is appears within the 6 bytes examined.
 242  */
 243 char* ldap_utf8_prev( const char * p )
 244 {
 245         int i;
 246         const unsigned char *u = p;
 247
 248         for( i=-1; i>-6 ; i-- ) {
 249                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 250                         return (char *) &p[i];
 251                 }
 252         }
 253
 254         return (char *) &p[i];
 255 }
 256
 257 /*
 258  * Copy one UTF-8 character from src to dst returning
 259  * number of bytes copied.
 260  *
 261  * Ignores length of multibyte character, instead rely on
 262  * continuation markers to find start of next character.
 263  * This allows for "resyncing" of when invalid characters
 264  * are provided provided the start of the next character
 265  * is appears within the 6 bytes examined.
 266  */
 267 int ldap_utf8_copy( char* dst, const char *src )
 268 {
 269         int i;
 270         const unsigned char *u = src;
 271
 272         dst[0] = src[0];
 273
 274         if( LDAP_UTF8_ISASCII(u) ) {
 275                 return 1;
 276         }
 277
 278         for( i=1; i<6; i++ ) {
 279                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 280                         return i;
 281                 }
 282                 dst[i] = src[i];
 283         }
 284
 285         return i;
 286 }
 287
 288 #ifndef UTF8_ALPHA_CTYPE
 289 /*
 290  * UTF-8 ctype routines
 291  * Only deals with characters < 0x80 (ie: US-ASCII)
 292  */
 293
 294 int ldap_utf8_isascii( const char * p )
 295 {
 296         unsigned c = * (const unsigned char *) p;
 297         return LDAP_ASCII(c);
 298 }
 299
 300 int ldap_utf8_isdigit( const char * p )
 301 {
 302         unsigned c = * (const unsigned char *) p;
 303
 304         if(!LDAP_ASCII(c)) return 0;
 305
 306         return LDAP_DIGIT( c );
 307 }
 308
 309 int ldap_utf8_isxdigit( const char * p )
 310 {
 311         unsigned c = * (const unsigned char *) p;
 312
 313         if(!LDAP_ASCII(c)) return 0;
 314
 315         return LDAP_HEX(c);
 316 }
 317
 318 int ldap_utf8_isspace( const char * p )
 319 {
 320         unsigned c = * (const unsigned char *) p;
 321
 322         if(!LDAP_ASCII(c)) return 0;
 323
 324         switch(c) {
 325         case ' ':
 326         case '\t':
 327         case '\n':
 328         case '\r':
 329         case '\v':
 330         case '\f':
 331                 return 1;
 332         }
 333
 334         return 0;
 335 }
 336
 337 /*
 338  * These are not needed by the C SDK and are
 339  * not "good enough" for general use.
 340  */
 341 int ldap_utf8_isalpha( const char * p )
 342 {
 343         unsigned c = * (const unsigned char *) p;
 344
 345         if(!LDAP_ASCII(c)) return 0;
 346
 347         return LDAP_ALPHA(c);
 348 }
 349
 350 int ldap_utf8_isalnum( const char * p )
 351 {
 352         unsigned c = * (const unsigned char *) p;
 353
 354         if(!LDAP_ASCII(c)) return 0;
 355
 356         return LDAP_ALNUM(c);
 357 }
 358
 359 int ldap_utf8_islower( const char * p )
 360 {
 361         unsigned c = * (const unsigned char *) p;
 362
 363         if(!LDAP_ASCII(c)) return 0;
 364
 365         return LDAP_LOWER(c);
 366 }
 367
 368 int ldap_utf8_isupper( const char * p )
 369 {
 370         unsigned c = * (const unsigned char *) p;
 371
 372         if(!LDAP_ASCII(c)) return 0;
 373
 374         return LDAP_UPPER(c);
 375 }
 376 #endif
 377
 378
 379 /*
 380  * UTF-8 string routines
 381  */
 382
 383 /* like strchr() */
 384 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 385 {
 386         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 387                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 388                         return (char *) str;
 389                 }
 390         }
 391
 392         return NULL;
 393 }
 394
 395 /* like strcspn() but returns number of bytes, not characters */
 396 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 397 {
 398         const char *cstr;
 399         const char *cset;
 400
 401         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 402                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 403                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 404                                 return cstr - str;
 405                         }
 406                 }
 407         }
 408
 409         return cstr - str;
 410 }
 411
 412 /* like strspn() but returns number of bytes, not characters */
 413 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 414 {
 415         const char *cstr;
 416         const char *cset;
 417
 418         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 419                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 420                         if( *cset == '\0' ) {
 421                                 return cstr - str;
 422                         }
 423
 424                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 425                                 break;
 426                         }
 427                 }
 428         }
 429
 430         return cstr - str;
 431 }
 432
 433 /* like strpbrk(), replaces strchr() as well */
 434 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 435 {
 436         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 437                 const char *cset;
 438
 439                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 440                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 441                                 return (char *) str;
 442                         }
 443                 }
 444         }
 445
 446         return NULL;
 447 }
 448
 449 /* like strtok_r(), not strtok() */
 450 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 451 {
 452         char *begin;
 453         char *end;
 454
 455         if( last == NULL ) return NULL;
 456
 457         begin = str ? str : *last;
 458
 459         begin += ldap_utf8_strspn( begin, sep );
 460
 461         if( *begin == '\0' ) {
 462                 *last = NULL;
 463                 return NULL;
 464         }
 465
 466         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 467
 468         if( *end != '\0' ) {
 469                 char *next = LDAP_UTF8_NEXT( end );
 470                 *end = '\0';
 471                 end = next;
 472         }
 473
 474         *last = end;
 475         return begin;
 476 }