git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap_utf8.h"
  29
  30 #include "ldap-int.h"
  31 #include "ldap_defaults.h"
  32
  33 /*
  34  * Basic UTF-8 routines
  35  */
  36
  37 /*
  38  * return the number of bytes required to hold the
  39  * NULL-terminated UTF-8 string NOT INCLUDING the
  40  * termination.
  41  */
  42 ber_len_t ldap_utf8_bytes( const char * p )
  43 {
  44         ber_len_t bytes;
  45
  46         for( bytes=0; p[bytes]; bytes++ ) {
  47                 /* EMPTY */ ;
  48         }
  49
  50         return bytes;
  51 }
  52
  53 ber_len_t ldap_utf8_chars( const char * p )
  54 {
  55         /* could be optimized and could check for invalid sequences */
  56         ber_len_t chars=0;
  57
  58         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  59                 chars++;
  60         }
  61
  62         return chars;
  63 }
  64
  65 /* return offset to next character */
  66 int ldap_utf8_offset( const char * p )
  67 {
  68         return LDAP_UTF8_NEXT(p) - p;
  69 }
  70
  71 /*
  72  * Returns length indicated by first byte.
  73  *
  74  * This function should use a table lookup.
  75  */
  76 const char ldap_utf8_lentab[] = {
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  81         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  82         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  83         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  84         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  85
  86 int ldap_utf8_charlen( const char * p )
  87 {
  88         if (!(*p & 0x80))
  89                 return 1;
  90
  91         return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
  92 }
  93
  94 /*
  95  * Make sure the UTF-8 char used the shortest possible encoding
  96  * returns charlen if valid, 0 if not.
  97  */
  98
  99 /* mask of required bits in second octet */
 100 const char ldap_utf8_mintab[] = {
 101         0x20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 102         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 103         0x30, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 104         0x38, 0x80, 0x80, 0x80, 0x3c, 0x80, 0x00, 0x00 };
 105
 106 int ldap_utf8_charlen2( const char * p )
 107 {
 108         int i = LDAP_UTF8_CHARLEN( p );
 109
 110         if ( i > 2 ) {
 111                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 112                         i = 0;
 113         }
 114         return i;
 115 }
 116
 117 /* conv UTF-8 to UCS-4, useful for comparisons */
 118 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 119 {
 120     const unsigned char *c = p;
 121     ldap_ucs4_t ch;
 122         int len, i;
 123         static unsigned char mask[] = {
 124                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 125
 126         len = LDAP_UTF8_CHARLEN2(p, len);
 127
 128         if( len == 0 ) return LDAP_UCS4_INVALID;
 129
 130         ch = c[0] & mask[len];
 131
 132         for(i=1; i < len; i++) {
 133                 if ((c[i] & 0xc0) != 0x80) {
 134                         return LDAP_UCS4_INVALID;
 135                 }
 136
 137                 ch <<= 6;
 138                 ch |= c[i] & 0x3f;
 139         }
 140
 141         return ch;
 142 }
 143
 144 /* conv UCS-4 to UTF-8, not used */
 145 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 146 {
 147         int len=0;
 148         unsigned char* p = buf;
 149         if(buf == NULL) return 0;
 150
 151         if ( c < 0 ) {
 152                 /* not a valid Unicode character */
 153
 154         } else if( c < 0x80 ) {
 155                 p[len++] = c;
 156
 157         } else if( c < 0x800 ) {
 158                 p[len++] = 0xc0 | ( c >> 6 );
 159                 p[len++] = 0x80 | ( c & 0x3f );
 160
 161         } else if( c < 0x10000 ) {
 162                 p[len++] = 0xe0 | ( c >> 12 );
 163                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 164                 p[len++] = 0x80 | ( c & 0x3f );
 165
 166         } else if( c < 0x200000 ) {
 167                 p[len++] = 0xf0 | ( c >> 18 );
 168                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 169                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 170                 p[len++] = 0x80 | ( c & 0x3f );
 171
 172         } else if( c < 0x4000000 ) {
 173                 p[len++] = 0xf8 | ( c >> 24 );
 174                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 175                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 176                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 177                 p[len++] = 0x80 | ( c & 0x3f );
 178
 179         } else /* if( c < 0x80000000 ) */ {
 180                 p[len++] = 0xfc | ( c >> 30 );
 181                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 182                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 183                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 184                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 185                 p[len++] = 0x80 | ( c & 0x3f );
 186         }
 187
 188         buf[len] = '\0';
 189         return len;
 190 }
 191
 192 /*
 193  * Advance to the next UTF-8 character
 194  *
 195  * Ignores length of multibyte character, instead rely on
 196  * continuation markers to find start of next character.
 197  * This allows for "resyncing" of when invalid characters
 198  * are provided provided the start of the next character
 199  * is appears within the 6 bytes examined.
 200  */
 201 char* ldap_utf8_next( const char * p )
 202 {
 203         int i;
 204         const unsigned char *u = p;
 205
 206         if( LDAP_UTF8_ISASCII(u) ) {
 207                 return (char *) &p[1];
 208         }
 209
 210         for( i=1; i<6; i++ ) {
 211                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 212                         return (char *) &p[i];
 213                 }
 214         }
 215
 216         return (char *) &p[i];
 217 }
 218
 219 /*
 220  * Advance to the previous UTF-8 character
 221  *
 222  * Ignores length of multibyte character, instead rely on
 223  * continuation markers to find start of next character.
 224  * This allows for "resyncing" of when invalid characters
 225  * are provided provided the start of the next character
 226  * is appears within the 6 bytes examined.
 227  */
 228 char* ldap_utf8_prev( const char * p )
 229 {
 230         int i;
 231         const unsigned char *u = p;
 232
 233         for( i=-1; i>-6 ; i-- ) {
 234                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 235                         return (char *) &p[i];
 236                 }
 237         }
 238
 239         return (char *) &p[i];
 240 }
 241
 242 /*
 243  * Copy one UTF-8 character from src to dst returning
 244  * number of bytes copied.
 245  *
 246  * Ignores length of multibyte character, instead rely on
 247  * continuation markers to find start of next character.
 248  * This allows for "resyncing" of when invalid characters
 249  * are provided provided the start of the next character
 250  * is appears within the 6 bytes examined.
 251  */
 252 int ldap_utf8_copy( char* dst, const char *src )
 253 {
 254         int i;
 255         const unsigned char *u = src;
 256
 257         dst[0] = src[0];
 258
 259         if( LDAP_UTF8_ISASCII(u) ) {
 260                 return 1;
 261         }
 262
 263         for( i=1; i<6; i++ ) {
 264                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 265                         return i;
 266                 }
 267                 dst[i] = src[i];
 268         }
 269
 270         return i;
 271 }
 272
 273 #ifndef UTF8_ALPHA_CTYPE
 274 /*
 275  * UTF-8 ctype routines
 276  * Only deals with characters < 0x80 (ie: US-ASCII)
 277  */
 278
 279 int ldap_utf8_isascii( const char * p )
 280 {
 281         unsigned c = * (const unsigned char *) p;
 282         return LDAP_ASCII(c);
 283 }
 284
 285 int ldap_utf8_isdigit( const char * p )
 286 {
 287         unsigned c = * (const unsigned char *) p;
 288
 289         if(!LDAP_ASCII(c)) return 0;
 290
 291         return LDAP_DIGIT( c );
 292 }
 293
 294 int ldap_utf8_isxdigit( const char * p )
 295 {
 296         unsigned c = * (const unsigned char *) p;
 297
 298         if(!LDAP_ASCII(c)) return 0;
 299
 300         return LDAP_HEX(c);
 301 }
 302
 303 int ldap_utf8_isspace( const char * p )
 304 {
 305         unsigned c = * (const unsigned char *) p;
 306
 307         if(!LDAP_ASCII(c)) return 0;
 308
 309         switch(c) {
 310         case ' ':
 311         case '\t':
 312         case '\n':
 313         case '\r':
 314         case '\v':
 315         case '\f':
 316                 return 1;
 317         }
 318
 319         return 0;
 320 }
 321
 322 /*
 323  * These are not needed by the C SDK and are
 324  * not "good enough" for general use.
 325  */
 326 int ldap_utf8_isalpha( const char * p )
 327 {
 328         unsigned c = * (const unsigned char *) p;
 329
 330         if(!LDAP_ASCII(c)) return 0;
 331
 332         return LDAP_ALPHA(c);
 333 }
 334
 335 int ldap_utf8_isalnum( const char * p )
 336 {
 337         unsigned c = * (const unsigned char *) p;
 338
 339         if(!LDAP_ASCII(c)) return 0;
 340
 341         return LDAP_ALNUM(c);
 342 }
 343
 344 int ldap_utf8_islower( const char * p )
 345 {
 346         unsigned c = * (const unsigned char *) p;
 347
 348         if(!LDAP_ASCII(c)) return 0;
 349
 350         return LDAP_LOWER(c);
 351 }
 352
 353 int ldap_utf8_isupper( const char * p )
 354 {
 355         unsigned c = * (const unsigned char *) p;
 356
 357         if(!LDAP_ASCII(c)) return 0;
 358
 359         return LDAP_UPPER(c);
 360 }
 361 #endif
 362
 363
 364 /*
 365  * UTF-8 string routines
 366  */
 367
 368 /* like strchr() */
 369 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 370 {
 371         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 372                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 373                         return (char *) str;
 374                 }
 375         }
 376
 377         return NULL;
 378 }
 379
 380 /* like strcspn() but returns number of bytes, not characters */
 381 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 382 {
 383         const char *cstr;
 384         const char *cset;
 385
 386         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 387                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 388                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 389                                 return cstr - str;
 390                         }
 391                 }
 392         }
 393
 394         return cstr - str;
 395 }
 396
 397 /* like strspn() but returns number of bytes, not characters */
 398 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 399 {
 400         const char *cstr;
 401         const char *cset;
 402
 403         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 404                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 405                         if( *cset == '\0' ) {
 406                                 return cstr - str;
 407                         }
 408
 409                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 410                                 break;
 411                         }
 412                 }
 413         }
 414
 415         return cstr - str;
 416 }
 417
 418 /* like strpbrk(), replaces strchr() as well */
 419 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 420 {
 421         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 422                 const char *cset;
 423
 424                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 425                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 426                                 return (char *) str;
 427                         }
 428                 }
 429         }
 430
 431         return NULL;
 432 }
 433
 434 /* like strtok_r(), not strtok() */
 435 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 436 {
 437         char *begin;
 438         char *end;
 439
 440         if( last == NULL ) return NULL;
 441
 442         begin = str ? str : *last;
 443
 444         begin += ldap_utf8_strspn( begin, sep );
 445
 446         if( *begin == '\0' ) {
 447                 *last = NULL;
 448                 return NULL;
 449         }
 450
 451         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 452
 453         if( *end != '\0' ) {
 454                 char *next = LDAP_UTF8_NEXT( end );
 455                 *end = '\0';
 456                 end = next;
 457         }
 458
 459         *last = end;
 460         return begin;
 461 }