git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap-int.h"
  29 #include "ldap_defaults.h"
  30
  31 #undef ISASCII
  32 #define ISASCII(uc)     ((uc) < 0x100)
  33 #undef UCS4_INVALID
  34 #define UCS4_INVALID    0x80000000U
  35
  36 /*
  37  * Basic UTF-8 routines
  38  */
  39
  40 /*
  41  * return the number of bytes required to hold the
  42  * NULL-terminated UTF-8 string NOT INCLUDING the
  43  * termination.
  44  */
  45 ber_len_t ldap_utf8_bytes( const char * p )
  46 {
  47         ber_len_t bytes;
  48
  49         for( bytes=0; p[bytes]; bytes++ ) {
  50                 /* EMPTY */ ;
  51         }
  52
  53         return bytes;
  54 }
  55
  56 ber_len_t ldap_utf8_chars( const char * p )
  57 {
  58         /* could be optimized and could check for invalid sequences */
  59         ber_len_t chars=0;
  60
  61         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  62                 chars++;
  63         };
  64
  65         return chars;
  66 }
  67
  68 /* return offset to next character */
  69 int ldap_utf8_offset( const char * p )
  70 {
  71         return LDAP_UTF8_NEXT(p) - p;
  72 }
  73
  74 /*
  75  * Returns length indicated by first byte.
  76  *
  77  * This function should use a table lookup.
  78  */
  79 int ldap_utf8_charlen( const char * p )
  80 {
  81         unsigned c = * (const unsigned char *) p;
  82
  83         if ((c & 0xfe ) == 0xfc) {
  84                 return 6;
  85         }
  86
  87         if ((c & 0xfc ) == 0xf8) {
  88                 return 5;
  89         }
  90
  91         if ((c & 0xf8 ) == 0xf0) {
  92                 return 4;
  93         }
  94
  95         if ((c & 0xf0 ) == 0xe0) {
  96                 return 3;
  97         }
  98
  99         if ((c & 0xe0 ) == 0xc0) {
 100                 return 2;
 101         }
 102
 103         if ((c & 0x80 ) == 0x80) {
 104                 /* INVALID */
 105                 return 0;
 106         }
 107
 108         return 1;
 109 }
 110
 111 /* conv UTF-8 to UCS-4, useful for comparisons */
 112 ber_int_t ldap_utf8_to_ucs4( const char * p )
 113 {
 114     const unsigned char *c = p;
 115     ber_int_t ch;
 116         int len, i;
 117         static unsigned char mask[] = {
 118                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 119
 120         len = LDAP_UTF8_CHARLEN(p);
 121
 122         if( len == 0 ) return UCS4_INVALID;
 123
 124         ch = c[0] & mask[len];
 125
 126         for(i=1; i < len; i++) {
 127                 if ((c[i] & 0xc0) != 0x80) {
 128                         return UCS4_INVALID;
 129                 }
 130
 131                 ch <<= 6;
 132                 ch |= c[i] & 0x3f;
 133         }
 134
 135         return ch;
 136 }
 137
 138 /* conv UCS-4 to UTF-8, not used */
 139 int ldap_ucs4_to_utf8( ber_int_t c, char *buf )
 140 {
 141         int len=0;
 142         unsigned char* p = buf;
 143         if(buf == NULL) return 0;
 144
 145         if ( c < 0 ) {
 146                 /* not a valid Unicode character */
 147
 148         } else if( c < 0x80 ) {
 149                 p[len++] = c;
 150
 151         } else if( c < 0x800 ) {
 152                 p[len++] = 0xc0 | ( c >> 6 );
 153                 p[len++] = 0x80 | ( c & 0x3f );
 154
 155         } else if( c < 0x10000 ) {
 156                 p[len++] = 0xe0 | ( c >> 12 );
 157                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 158                 p[len++] = 0x80 | ( c & 0x3f );
 159
 160         } else if( c < 0x200000 ) {
 161                 p[len++] = 0xf0 | ( c >> 18 );
 162                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 163                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 164                 p[len++] = 0x80 | ( c & 0x3f );
 165
 166         } else if( c < 0x400000 ) {
 167                 p[len++] = 0xf8 | ( c >> 24 );
 168                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 169                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 170                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 171                 p[len++] = 0x80 | ( c & 0x3f );
 172
 173         } else /* if( c < 0x80000000 ) */ {
 174                 p[len++] = 0xfc | ( c >> 30 );
 175                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 176                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 177                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 178                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 179                 p[len++] = 0x80 | ( c & 0x3f );
 180         }
 181
 182         buf[len] = '\0';
 183         return len;
 184 }
 185
 186 /*
 187  * Advance to the next UTF-8 character
 188  *
 189  * Ignores length of multibyte character, instead rely on
 190  * continuation markers to find start of next character.
 191  * This allows for "resyncing" of when invalid characters
 192  * are provided provided the start of the next character
 193  * is appears within the 6 bytes examined.
 194  */
 195 char* ldap_utf8_next( const char * p )
 196 {
 197         int i;
 198         const unsigned char *u = p;
 199
 200         if( LDAP_UTF8_ISASCII(u) ) {
 201                 return (char *) &p[1];
 202         }
 203
 204         for( i=1; i<6; i++ ) {
 205                 if ( u[i] & 0xc0 != 0x80 ) {
 206                         return (char *) &p[i];
 207                 }
 208         }
 209
 210         return (char *) &p[i];
 211 }
 212
 213 /*
 214  * Advance to the previous UTF-8 character
 215  *
 216  * Ignores length of multibyte character, instead rely on
 217  * continuation markers to find start of next character.
 218  * This allows for "resyncing" of when invalid characters
 219  * are provided provided the start of the next character
 220  * is appears within the 6 bytes examined.
 221  */
 222 char* ldap_utf8_prev( const char * p )
 223 {
 224         int i;
 225         const unsigned char *u = p;
 226
 227         for( i=-1; i>-6 ; i-- ) {
 228                 if ( u[i] & 0xc0 != 0x80 ) {
 229                         return (char *) &p[i];
 230                 }
 231         }
 232
 233         return (char *) &p[i];
 234 }
 235
 236 /*
 237  * Copy one UTF-8 character from src to dst returning
 238  * number of bytes copied.
 239  *
 240  * Ignores length of multibyte character, instead rely on
 241  * continuation markers to find start of next character.
 242  * This allows for "resyncing" of when invalid characters
 243  * are provided provided the start of the next character
 244  * is appears within the 6 bytes examined.
 245  */
 246 int ldap_utf8_copy( char* dst, const char *src )
 247 {
 248         int i;
 249         const unsigned char *u = src;
 250
 251         dst[0] = src[0];
 252
 253         if( LDAP_UTF8_ISASCII(u) ) {
 254                 return 1;
 255         }
 256
 257         for( i=1; i<6; i++ ) {
 258                 if ( u[i] & 0xc0 != 0x80 ) {
 259                         return i;
 260                 }
 261                 dst[i] = src[i];
 262         }
 263
 264         return i;
 265 }
 266
/*
 * UTF-8 ctype routines
 * Only deals with US-ASCII characters (ie: bytes < 0x80)
 */
 271
 272 int ldap_utf8_isascii( const char * p )
 273 {
 274         unsigned c = * (const unsigned char *) p;
 275         return ISASCII(c);
 276 }
 277
 278 int ldap_utf8_isdigit( const char * p )
 279 {
 280         unsigned c = * (const unsigned char *) p;
 281
 282         if(!ISASCII(c)) return 0;
 283
 284         return c >= '0' && c <= '9';
 285 }
 286
 287 int ldap_utf8_isxdigit( const char * p )
 288 {
 289         unsigned c = * (const unsigned char *) p;
 290
 291         if(!ISASCII(c)) return 0;
 292
 293         return ( c >= '0' && c <= '9' )
 294                 || ( c >= 'A' && c <= 'F' )
 295                 || ( c >= 'a' && c <= 'f' );
 296 }
 297
 298 int ldap_utf8_isspace( const char * p )
 299 {
 300         unsigned c = * (const unsigned char *) p;
 301
 302         if(!ISASCII(c)) return 0;
 303
 304         switch(c) {
 305         case ' ':
 306         case '\t':
 307         case '\n':
 308         case '\r':
 309         case '\v':
 310         case '\f':
 311                 return 1;
 312         }
 313
 314         return 0;
 315 }
 316
 317 #ifndef UTF8_ALPHA_CTYPE
 318 /*
 319  * These are not needed by the C SDK and are
 320  * not "good enough" for general use.
 321  */
 322 int ldap_utf8_isalpha( const char * p )
 323 {
 324         unsigned c = * (const unsigned char *) p;
 325
 326         if(!ISASCII(c)) return 0;
 327
 328         return ( c >= 'A' && c <= 'Z' )
 329                 || ( c >= 'a' && c <= 'z' );
 330 }
 331
 332 int ldap_utf8_isalnum( const char * p )
 333 {
 334         unsigned c = * (const unsigned char *) p;
 335
 336         if(!ISASCII(c)) return 0;
 337
 338         return ( c >= '0' && c <= '9' )
 339                 || ( c >= 'A' && c <= 'Z' )
 340                 || ( c >= 'a' && c <= 'z' );
 341 }
 342
 343 int ldap_utf8_islower( const char * p )
 344 {
 345         unsigned c = * (const unsigned char *) p;
 346
 347         if(!ISASCII(c)) return 0;
 348
 349         return ( c >= 'a' && c <= 'z' );
 350 }
 351
 352 int ldap_utf8_isupper( const char * p )
 353 {
 354         unsigned c = * (const unsigned char *) p;
 355
 356         if(!ISASCII(c)) return 0;
 357
 358         return ( c >= 'A' && c <= 'Z' );
 359 }
 360 #endif
 361
 362
 363 /*
 364  * UTF-8 string routines
 365  */
 366
 367 /* like strchr() */
 368 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 369 {
 370         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 371                 if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( chr ) ) {
 372                         return (char *) str;
 373                 }
 374         }
 375
 376         return NULL;
 377 }
 378
 379 /* like strcspn() but returns number of bytes, not characters */
 380 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 381 {
 382         const char *cstr;
 383         const char *cset;
 384
 385         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 386                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 387                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 388                                 return cstr - str;
 389                         }
 390                 }
 391         }
 392
 393         return cstr - str;
 394 }
 395
 396 /* like strspn() but returns number of bytes, not characters */
 397 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 398 {
 399         const char *cstr;
 400         const char *cset;
 401
 402         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 403
 404                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 405                         if( *cset == '\0' ) {
 406                                 return cstr - str;
 407                         }
 408
 409                         if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 410                                 break;
 411                         }
 412                 }
 413         }
 414
 415         return cstr - str;
 416 }
 417
 418 /* like strpbrk(), replaces strchr() as well */
 419 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 420 {
 421         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 422                 const char *cset;
 423
 424                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 425                         if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( cset ) ) {
 426                                 return (char *) str;
 427                         }
 428                 }
 429         }
 430
 431         return NULL;
 432 }
 433
 434 /* like strtok_r(), not strtok() */
 435 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 436 {
 437         char *begin;
 438         char *end;
 439
 440         if( last == NULL ) return NULL;
 441
 442         begin = str ? str : *last;
 443
 444         begin += ldap_utf8_strspn( begin, sep );
 445
 446         if( *begin == '\0' ) {
 447                 *last = NULL;
 448                 return NULL;
 449         }
 450
 451         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 452
 453         if( *end != '\0' ) {
 454                 char *next = LDAP_UTF8_NEXT( end );
 455                 *end = '\0';
 456                 end = next;
 457         }
 458
 459         *last = end;
 460         return begin;
 461 }