git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap-int.h"
  29 #include "ldap_defaults.h"
  30
/*
 * File-local ASCII test. Takes a character VALUE (not a pointer) and
 * is true when that value is below 0x80. Any earlier definition of
 * ISASCII is discarded first.
 */
#undef ISASCII
#define ISASCII(uc)     ((uc) < 0x80)
  33
  34 /*
  35  * Basic UTF-8 routines
  36  */
  37
  38 /*
  39  * return the number of bytes required to hold the
  40  * NULL-terminated UTF-8 string NOT INCLUDING the
  41  * termination.
  42  */
  43 ber_len_t ldap_utf8_bytes( const char * p )
  44 {
  45         ber_len_t bytes;
  46
  47         for( bytes=0; p[bytes]; bytes++ ) {
  48                 /* EMPTY */ ;
  49         }
  50
  51         return bytes;
  52 }
  53
  54 ber_len_t ldap_utf8_chars( const char * p )
  55 {
  56         /* could be optimized and could check for invalid sequences */
  57         ber_len_t chars=0;
  58
  59         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  60                 chars++;
  61         };
  62
  63         return chars;
  64 }
  65
  66 /* return offset to next character */
  67 int ldap_utf8_offset( const char * p )
  68 {
  69         return LDAP_UTF8_NEXT(p) - p;
  70 }
  71
  72 /*
  73  * Returns length indicated by first byte.
  74  *
  75  * This function should use a table lookup.
  76  */
  77 int ldap_utf8_charlen( const char * p )
  78 {
  79         unsigned c = * (const unsigned char *) p;
  80
  81         if ((c & 0xfe ) == 0xfc) {
  82                 return 6;
  83         }
  84
  85         if ((c & 0xfc ) == 0xf8) {
  86                 return 5;
  87         }
  88
  89         if ((c & 0xf8 ) == 0xf0) {
  90                 return 4;
  91         }
  92
  93         if ((c & 0xf0 ) == 0xe0) {
  94                 return 3;
  95         }
  96
  97         if ((c & 0xe0 ) == 0xc0) {
  98                 return 2;
  99         }
 100
 101         if ((c & 0x80 ) == 0x80) {
 102                 /* INVALID */
 103                 return 0;
 104         }
 105
 106         return 1;
 107 }
 108
 109 /* conv UTF-8 to UCS-4, useful for comparisons */
 110 ldap_ucs4_t ldap_utf8_to_ucs4( const char * p )
 111 {
 112     const unsigned char *c = p;
 113     ldap_ucs4_t ch;
 114         int len, i;
 115         static unsigned char mask[] = {
 116                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 117
 118         len = LDAP_UTF8_CHARLEN(p);
 119
 120         if( len == 0 ) return LDAP_UCS4_INVALID;
 121
 122         ch = c[0] & mask[len];
 123
 124         for(i=1; i < len; i++) {
 125                 if ((c[i] & 0xc0) != 0x80) {
 126                         return LDAP_UCS4_INVALID;
 127                 }
 128
 129                 ch <<= 6;
 130                 ch |= c[i] & 0x3f;
 131         }
 132
 133         return ch;
 134 }
 135
 136 /* conv UCS-4 to UTF-8, not used */
 137 int ldap_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 138 {
 139         int len=0;
 140         unsigned char* p = buf;
 141         if(buf == NULL) return 0;
 142
 143         if ( c < 0 ) {
 144                 /* not a valid Unicode character */
 145
 146         } else if( c < 0x80 ) {
 147                 p[len++] = c;
 148
 149         } else if( c < 0x800 ) {
 150                 p[len++] = 0xc0 | ( c >> 6 );
 151                 p[len++] = 0x80 | ( c & 0x3f );
 152
 153         } else if( c < 0x10000 ) {
 154                 p[len++] = 0xe0 | ( c >> 12 );
 155                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 156                 p[len++] = 0x80 | ( c & 0x3f );
 157
 158         } else if( c < 0x200000 ) {
 159                 p[len++] = 0xf0 | ( c >> 18 );
 160                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 161                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 162                 p[len++] = 0x80 | ( c & 0x3f );
 163
 164         } else if( c < 0x4000000 ) {
 165                 p[len++] = 0xf8 | ( c >> 24 );
 166                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 167                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 168                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 169                 p[len++] = 0x80 | ( c & 0x3f );
 170
 171         } else /* if( c < 0x80000000 ) */ {
 172                 p[len++] = 0xfc | ( c >> 30 );
 173                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 174                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 175                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 176                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 177                 p[len++] = 0x80 | ( c & 0x3f );
 178         }
 179
 180         buf[len] = '\0';
 181         return len;
 182 }
 183
 184 /*
 185  * Advance to the next UTF-8 character
 186  *
 187  * Ignores length of multibyte character, instead rely on
 188  * continuation markers to find start of next character.
 189  * This allows for "resyncing" of when invalid characters
 190  * are provided provided the start of the next character
 191  * is appears within the 6 bytes examined.
 192  */
 193 char* ldap_utf8_next( const char * p )
 194 {
 195         int i;
 196         const unsigned char *u = p;
 197
 198         if( LDAP_UTF8_ISASCII(u) ) {
 199                 return (char *) &p[1];
 200         }
 201
 202         for( i=1; i<6; i++ ) {
 203                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 204                         return (char *) &p[i];
 205                 }
 206         }
 207
 208         return (char *) &p[i];
 209 }
 210
 211 /*
 212  * Advance to the previous UTF-8 character
 213  *
 214  * Ignores length of multibyte character, instead rely on
 215  * continuation markers to find start of next character.
 216  * This allows for "resyncing" of when invalid characters
 217  * are provided provided the start of the next character
 218  * is appears within the 6 bytes examined.
 219  */
 220 char* ldap_utf8_prev( const char * p )
 221 {
 222         int i;
 223         const unsigned char *u = p;
 224
 225         for( i=-1; i>-6 ; i-- ) {
 226                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 227                         return (char *) &p[i];
 228                 }
 229         }
 230
 231         return (char *) &p[i];
 232 }
 233
 234 /*
 235  * Copy one UTF-8 character from src to dst returning
 236  * number of bytes copied.
 237  *
 238  * Ignores length of multibyte character, instead rely on
 239  * continuation markers to find start of next character.
 240  * This allows for "resyncing" of when invalid characters
 241  * are provided provided the start of the next character
 242  * is appears within the 6 bytes examined.
 243  */
 244 int ldap_utf8_copy( char* dst, const char *src )
 245 {
 246         int i;
 247         const unsigned char *u = src;
 248
 249         dst[0] = src[0];
 250
 251         if( LDAP_UTF8_ISASCII(u) ) {
 252                 return 1;
 253         }
 254
 255         for( i=1; i<6; i++ ) {
 256                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 257                         return i;
 258                 }
 259                 dst[i] = src[i];
 260         }
 261
 262         return i;
 263 }
 264
 265 /*
 266  * UTF-8 ctype routines
 267  * Only deals with characters < 0x80 (ie: US-ASCII)
 268  */
 269
 270 int ldap_utf8_isascii( const char * p )
 271 {
 272         unsigned c = * (const unsigned char *) p;
 273         return ISASCII(c);
 274 }
 275
 276 int ldap_utf8_isdigit( const char * p )
 277 {
 278         unsigned c = * (const unsigned char *) p;
 279
 280         if(!ISASCII(c)) return 0;
 281
 282         return c >= '0' && c <= '9';
 283 }
 284
 285 int ldap_utf8_isxdigit( const char * p )
 286 {
 287         unsigned c = * (const unsigned char *) p;
 288
 289         if(!ISASCII(c)) return 0;
 290
 291         return ( c >= '0' && c <= '9' )
 292                 || ( c >= 'A' && c <= 'F' )
 293                 || ( c >= 'a' && c <= 'f' );
 294 }
 295
 296 int ldap_utf8_isspace( const char * p )
 297 {
 298         unsigned c = * (const unsigned char *) p;
 299
 300         if(!ISASCII(c)) return 0;
 301
 302         switch(c) {
 303         case ' ':
 304         case '\t':
 305         case '\n':
 306         case '\r':
 307         case '\v':
 308         case '\f':
 309                 return 1;
 310         }
 311
 312         return 0;
 313 }
 314
 315 #ifndef UTF8_ALPHA_CTYPE
 316 /*
 317  * These are not needed by the C SDK and are
 318  * not "good enough" for general use.
 319  */
 320 int ldap_utf8_isalpha( const char * p )
 321 {
 322         unsigned c = * (const unsigned char *) p;
 323
 324         if(!ISASCII(c)) return 0;
 325
 326         return ( c >= 'A' && c <= 'Z' )
 327                 || ( c >= 'a' && c <= 'z' );
 328 }
 329
 330 int ldap_utf8_isalnum( const char * p )
 331 {
 332         unsigned c = * (const unsigned char *) p;
 333
 334         if(!ISASCII(c)) return 0;
 335
 336         return ( c >= '0' && c <= '9' )
 337                 || ( c >= 'A' && c <= 'Z' )
 338                 || ( c >= 'a' && c <= 'z' );
 339 }
 340
 341 int ldap_utf8_islower( const char * p )
 342 {
 343         unsigned c = * (const unsigned char *) p;
 344
 345         if(!ISASCII(c)) return 0;
 346
 347         return ( c >= 'a' && c <= 'z' );
 348 }
 349
 350 int ldap_utf8_isupper( const char * p )
 351 {
 352         unsigned c = * (const unsigned char *) p;
 353
 354         if(!ISASCII(c)) return 0;
 355
 356         return ( c >= 'A' && c <= 'Z' );
 357 }
 358 #endif
 359
 360
 361 /*
 362  * UTF-8 string routines
 363  */
 364
 365 /* like strchr() */
 366 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 367 {
 368         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 369                 if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( chr ) ) {
 370                         return (char *) str;
 371                 }
 372         }
 373
 374         return NULL;
 375 }
 376
 377 /* like strcspn() but returns number of bytes, not characters */
 378 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 379 {
 380         const char *cstr;
 381         const char *cset;
 382
 383         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 384                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 385                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 386                                 return cstr - str;
 387                         }
 388                 }
 389         }
 390
 391         return cstr - str;
 392 }
 393
 394 /* like strspn() but returns number of bytes, not characters */
 395 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 396 {
 397         const char *cstr;
 398         const char *cset;
 399
 400         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 401                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 402                         if( *cset == '\0' ) {
 403                                 return cstr - str;
 404                         }
 405
 406                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 407                                 break;
 408                         }
 409                 }
 410         }
 411
 412         return cstr - str;
 413 }
 414
 415 /* like strpbrk(), replaces strchr() as well */
 416 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 417 {
 418         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 419                 const char *cset;
 420
 421                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 422                         if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( cset ) ) {
 423                                 return (char *) str;
 424                         }
 425                 }
 426         }
 427
 428         return NULL;
 429 }
 430
 431 /* like strtok_r(), not strtok() */
 432 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 433 {
 434         char *begin;
 435         char *end;
 436
 437         if( last == NULL ) return NULL;
 438
 439         begin = str ? str : *last;
 440
 441         begin += ldap_utf8_strspn( begin, sep );
 442
 443         if( *begin == '\0' ) {
 444                 *last = NULL;
 445                 return NULL;
 446         }
 447
 448         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 449
 450         if( *end != '\0' ) {
 451                 char *next = LDAP_UTF8_NEXT( end );
 452                 *end = '\0';
 453                 end = next;
 454         }
 455
 456         *last = end;
 457         return begin;
 458 }