git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap-int.h"
  29 #include "ldap_defaults.h"
  30
  31 #undef ISASCII
  32 #define ISASCII(uc)     ((uc) < 0x100)
  33 #undef UCS4_INVALID
  34 #define UCS4_INVALID    0x80000000U
  35
  36 /*
  37  * Basic UTF-8 routines
  38  */
  39
  40 /*
  41  * return the number of bytes required to hold the
  42  * NULL-terminated UTF-8 string INCLUDING the
  43  * termination.
  44  */
  45 ber_len_t ldap_utf8_bytes( const char * p )
  46 {
  47         ber_len_t bytes = 0;
  48
  49         if( p == NULL ) return bytes;
  50
  51         while( p[bytes++] ) {
  52                 /* EMPTY */ ;
  53         }
  54
  55         return bytes;
  56 }
  57
  58 ber_len_t ldap_utf8_chars( const char * p )
  59 {
  60         /* could be optimized and could check for invalid sequences */
  61         ber_len_t chars=0;
  62
  63         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  64                 chars++;
  65         };
  66
  67         return chars;
  68 }
  69
  70 /* return offset to next character */
  71 int ldap_utf8_offset( const char * p )
  72 {
  73         return LDAP_UTF8_NEXT(p) - p;
  74 }
  75
  76 /*
  77  * Returns length indicated by first byte.
  78  *
  79  * This function should use a table lookup.
  80  */
  81 int ldap_utf8_charlen( const char * p )
  82 {
  83         unsigned c = * (const unsigned char *) p;
  84
  85         if ((c & 0xfe ) == 0xfc) {
  86                 return 6;
  87         }
  88
  89         if ((c & 0xfc ) == 0xf8) {
  90                 return 5;
  91         }
  92
  93         if ((c & 0xf8 ) == 0xf0) {
  94                 return 4;
  95         }
  96
  97         if ((c & 0xf0 ) == 0xe0) {
  98                 return 3;
  99         }
 100
 101         if ((c & 0xe0 ) == 0xc0) {
 102                 return 2;
 103         }
 104
 105         if ((c & 0x80 ) == 0x80) {
 106                 /* INVALID */
 107                 return 0;
 108         }
 109
 110         return 1;
 111 }
 112
 113 /* conv UTF-8 to UCS-4, useful for comparisons */
 114 ber_int_t ldap_utf8_to_ucs4( const char * p )
 115 {
 116     const unsigned char *c = p;
 117     ber_int_t ch;
 118         int len, i;
 119         static unsigned char mask[] = {
 120                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 121
 122         len = LDAP_UTF8_CHARLEN(p);
 123
 124         if( len == 0 ) return UCS4_INVALID;
 125
 126         ch = c[0] & mask[len];
 127
 128         for(i=1; i < len; i++) {
 129                 if ((c[i] & 0xc0) != 0x80) {
 130                         return UCS4_INVALID;
 131                 }
 132
 133                 ch <<= 6;
 134                 ch |= c[i] & 0x3f;
 135         }
 136
 137         return ch;
 138 }
 139
 140 /* conv UCS-4 to UTF-8, not used */
 141 int ldap_ucs4_to_utf8( ber_int_t c, char *buf )
 142 {
 143         int len=0;
 144         unsigned char* p = buf;
 145         if(buf == NULL) return 0;
 146
 147         if ( c < 0 ) {
 148                 /* not a valid Unicode character */
 149
 150         } else if( c < 0x80 ) {
 151                 p[len++] = c;
 152
 153         } else if( c < 0x800 ) {
 154                 p[len++] = 0xc0 | ( c >> 6 );
 155                 p[len++] = 0x80 | ( c & 0x3f );
 156
 157         } else if( c < 0x10000 ) {
 158                 p[len++] = 0xe0 | ( c >> 12 );
 159                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 160                 p[len++] = 0x80 | ( c & 0x3f );
 161
 162         } else if( c < 0x200000 ) {
 163                 p[len++] = 0xf0 | ( c >> 18 );
 164                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 165                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 166                 p[len++] = 0x80 | ( c & 0x3f );
 167
 168         } else if( c < 0x400000 ) {
 169                 p[len++] = 0xf8 | ( c >> 24 );
 170                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 171                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 172                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 173                 p[len++] = 0x80 | ( c & 0x3f );
 174
 175         } else /* if( c < 0x80000000 ) */ {
 176                 p[len++] = 0xfc | ( c >> 30 );
 177                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 178                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 179                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 180                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 181                 p[len++] = 0x80 | ( c & 0x3f );
 182         }
 183
 184         buf[len] = '\0';
 185         return len;
 186 }
 187
 188 /*
 189  * Advance to the next UTF-8 character
 190  *
 191  * Ignores length of multibyte character, instead rely on
 192  * continuation markers to find start of next character.
 193  * This allows for "resyncing" of when invalid characters
 194  * are provided provided the start of the next character
 195  * is appears within the 6 bytes examined.
 196  */
 197 char* ldap_utf8_next( const char * p )
 198 {
 199         int i;
 200         const unsigned char *u = p;
 201
 202         if( LDAP_UTF8_ISASCII(u) ) {
 203                 return (char *) &p[1];
 204         }
 205
 206         for( i=1; i<6; i++ ) {
 207                 if ( u[i] & 0xc0 != 0x80 ) {
 208                         return (char *) &p[i];
 209                 }
 210         }
 211
 212         return (char *) &p[i];
 213 }
 214
 215 /*
 216  * Advance to the previous UTF-8 character
 217  *
 218  * Ignores length of multibyte character, instead rely on
 219  * continuation markers to find start of next character.
 220  * This allows for "resyncing" of when invalid characters
 221  * are provided provided the start of the next character
 222  * is appears within the 6 bytes examined.
 223  */
 224 char* ldap_utf8_prev( const char * p )
 225 {
 226         int i;
 227         const unsigned char *u = p;
 228
 229         for( i=-1; i>-6 ; i-- ) {
 230                 if ( u[i] & 0xc0 != 0x80 ) {
 231                         return (char *) &p[i];
 232                 }
 233         }
 234
 235         return (char *) &p[i];
 236 }
 237
 238 /*
 239  * Copy one UTF-8 character from src to dst returning
 240  * number of bytes copied.
 241  *
 242  * Ignores length of multibyte character, instead rely on
 243  * continuation markers to find start of next character.
 244  * This allows for "resyncing" of when invalid characters
 245  * are provided provided the start of the next character
 246  * is appears within the 6 bytes examined.
 247  */
 248 int ldap_utf8_copy( char* dst, const char *src )
 249 {
 250         int i;
 251         const unsigned char *u = src;
 252
 253         dst[0] = src[0];
 254
 255         if( LDAP_UTF8_ISASCII(u) ) {
 256                 return 1;
 257         }
 258
 259         for( i=1; i<6; i++ ) {
 260                 if ( u[i] & 0xc0 != 0x80 ) {
 261                         return i;
 262                 }
 263                 dst[i] = src[i];
 264         }
 265
 266         return i;
 267 }
 268
 269 /*
 270  * UTF-8 ctype routines
 271  * Only deals with characters < 0x100 (ie: US-ASCII)
 272  */
 273
 274 int ldap_utf8_isascii( const char * p )
 275 {
 276         unsigned c = * (const unsigned char *) p;
 277         return ISASCII(c);
 278 }
 279
 280 int ldap_utf8_isdigit( const char * p )
 281 {
 282         unsigned c = * (const unsigned char *) p;
 283
 284         if(!ISASCII(c)) return 0;
 285
 286         return c >= '0' && c <= '9';
 287 }
 288
 289 int ldap_utf8_isxdigit( const char * p )
 290 {
 291         unsigned c = * (const unsigned char *) p;
 292
 293         if(!ISASCII(c)) return 0;
 294
 295         return ( c >= '0' && c <= '9' )
 296                 || ( c >= 'A' && c <= 'F' )
 297                 || ( c >= 'a' && c <= 'f' );
 298 }
 299
 300 int ldap_utf8_isspace( const char * p )
 301 {
 302         unsigned c = * (const unsigned char *) p;
 303
 304         if(!ISASCII(c)) return 0;
 305
 306         switch(c) {
 307         case ' ':
 308         case '\t':
 309         case '\n':
 310         case '\r':
 311         case '\v':
 312         case '\f':
 313                 return 1;
 314         }
 315
 316         return 0;
 317 }
 318
 319 #ifndef UTF8_ALPHA_CTYPE
 320 /*
 321  * These are not needed by the C SDK and are
 322  * not "good enough" for general use.
 323  */
 324 int ldap_utf8_isalpha( const char * p )
 325 {
 326         unsigned c = * (const unsigned char *) p;
 327
 328         if(!ISASCII(c)) return 0;
 329
 330         return ( c >= 'A' && c <= 'Z' )
 331                 || ( c >= 'a' && c <= 'z' );
 332 }
 333
 334 int ldap_utf8_isalnum( const char * p )
 335 {
 336         unsigned c = * (const unsigned char *) p;
 337
 338         if(!ISASCII(c)) return 0;
 339
 340         return ( c >= '0' && c <= '9' )
 341                 || ( c >= 'A' && c <= 'Z' )
 342                 || ( c >= 'a' && c <= 'z' );
 343 }
 344
 345 int ldap_utf8_islower( const char * p )
 346 {
 347         unsigned c = * (const unsigned char *) p;
 348
 349         if(!ISASCII(c)) return 0;
 350
 351         return ( c >= 'a' && c <= 'z' );
 352 }
 353
 354 int ldap_utf8_isupper( const char * p )
 355 {
 356         unsigned c = * (const unsigned char *) p;
 357
 358         if(!ISASCII(c)) return 0;
 359
 360         return ( c >= 'A' && c <= 'Z' );
 361 }
 362 #endif
 363
 364
 365 /*
 366  * UTF-8 string routines
 367  */
 368
 369 /* like strchr() */
 370 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 371 {
 372         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 373                 if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( chr ) ) {
 374                         return (char *) str;
 375                 }
 376         }
 377
 378         return NULL;
 379 }
 380
 381 /* like strcspn() but returns number of bytes, not characters */
 382 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 383 {
 384         const char *cstr;
 385         const char *cset;
 386
 387         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 388                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 389                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 390                                 return cstr - str;
 391                         }
 392                 }
 393         }
 394
 395         return cstr - str;
 396 }
 397
 398 /* like strspn() but returns number of bytes, not characters */
 399 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 400 {
 401         const char *cstr;
 402         const char *cset;
 403
 404         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 405
 406                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 407                         if( *cset == '\0' ) {
 408                                 return cstr - str;
 409                         }
 410
 411                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 412                                 break;
 413                         }
 414                 }
 415         }
 416
 417         return cstr - str;
 418 }
 419
 420 /* like strpbrk(), replaces strchr() as well */
 421 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 422 {
 423         int len;
 424
 425         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 426                 const char *cset;
 427
 428                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 429                         if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( cset ) ) {
 430                                 return (char *) str;
 431                         }
 432                 }
 433         }
 434
 435         return NULL;
 436 }
 437
 438 /* like strtok_r(), not strtok() */
 439 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 440 {
 441         char *begin;
 442         char *end;
 443
 444         if( last == NULL ) return NULL;
 445
 446         begin = str ? str : *last;
 447
 448         begin += ldap_utf8_strspn( begin, sep );
 449
 450         if( *begin == '\0' ) {
 451                 *last = NULL;
 452                 return NULL;
 453         }
 454
 455         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 456
 457         if( *end != '\0' ) {
 458                 char *next = LDAP_UTF8_NEXT( end );
 459                 *end = '\0';
 460                 end = next;
 461         }
 462
 463         *last = end;
 464         return begin;
 465 }