git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /*
   8  * Basic UTF-8 routines
   9  *
  10  * These routines are "dumb".  Though they understand UTF-8,
  11  * they don't grok Unicode.  That is, they can push bits,
  12  * but don't have a clue what the bits represent.  That's
  13  * good enough for use with the LDAP Client SDK.
  14  *
  15  * These routines are not optimized.
  16  */
  17
  18 #include "portable.h"
  19
  20 #include <stdio.h>
  21
  22 #include <ac/stdlib.h>
  23
  24 #include <ac/socket.h>
  25 #include <ac/string.h>
  26 #include <ac/time.h>
  27
  28 #include "ldap_utf8.h"
  29
  30 #include "ldap-int.h"
  31 #include "ldap_defaults.h"
  32
  33 /*
  34  * Basic UTF-8 routines
  35  */
  36
  37 /*
  38  * return the number of bytes required to hold the
  39  * NULL-terminated UTF-8 string NOT INCLUDING the
  40  * termination.
  41  */
  42 ber_len_t ldap_utf8_bytes( const char * p )
  43 {
  44         ber_len_t bytes;
  45
  46         for( bytes=0; p[bytes]; bytes++ ) {
  47                 /* EMPTY */ ;
  48         }
  49
  50         return bytes;
  51 }
  52
  53 ber_len_t ldap_utf8_chars( const char * p )
  54 {
  55         /* could be optimized and could check for invalid sequences */
  56         ber_len_t chars=0;
  57
  58         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  59                 chars++;
  60         }
  61
  62         return chars;
  63 }
  64
  65 /* return offset to next character */
  66 int ldap_utf8_offset( const char * p )
  67 {
  68         return LDAP_UTF8_NEXT(p) - p;
  69 }
  70
  71 /*
  72  * Returns length indicated by first byte.
  73  *
  74  * This function should use a table lookup.
  75  */
  76 const char ldap_utf8_lentab[] = {
  77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  82         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  83         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  84         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  85
  86 int ldap_utf8_charlen( const char * p )
  87 {
  88         if (!(*p & 0x80))
  89                 return 1;
  90
  91         return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
  92 }
  93
  94 /* conv UTF-8 to UCS-4, useful for comparisons */
  95 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
  96 {
  97     const unsigned char *c = p;
  98     ldap_ucs4_t ch;
  99         int len, i;
 100         static unsigned char mask[] = {
 101                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 102
 103         len = LDAP_UTF8_CHARLEN(p);
 104
 105         if( len == 0 ) return LDAP_UCS4_INVALID;
 106
 107         ch = c[0] & mask[len];
 108
 109         for(i=1; i < len; i++) {
 110                 if ((c[i] & 0xc0) != 0x80) {
 111                         return LDAP_UCS4_INVALID;
 112                 }
 113
 114                 ch <<= 6;
 115                 ch |= c[i] & 0x3f;
 116         }
 117
 118         return ch;
 119 }
 120
 121 /* conv UCS-4 to UTF-8, not used */
 122 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 123 {
 124         int len=0;
 125         unsigned char* p = buf;
 126         if(buf == NULL) return 0;
 127
 128         if ( c < 0 ) {
 129                 /* not a valid Unicode character */
 130
 131         } else if( c < 0x80 ) {
 132                 p[len++] = c;
 133
 134         } else if( c < 0x800 ) {
 135                 p[len++] = 0xc0 | ( c >> 6 );
 136                 p[len++] = 0x80 | ( c & 0x3f );
 137
 138         } else if( c < 0x10000 ) {
 139                 p[len++] = 0xe0 | ( c >> 12 );
 140                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 141                 p[len++] = 0x80 | ( c & 0x3f );
 142
 143         } else if( c < 0x200000 ) {
 144                 p[len++] = 0xf0 | ( c >> 18 );
 145                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 146                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 147                 p[len++] = 0x80 | ( c & 0x3f );
 148
 149         } else if( c < 0x4000000 ) {
 150                 p[len++] = 0xf8 | ( c >> 24 );
 151                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 152                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 153                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 154                 p[len++] = 0x80 | ( c & 0x3f );
 155
 156         } else /* if( c < 0x80000000 ) */ {
 157                 p[len++] = 0xfc | ( c >> 30 );
 158                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 159                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 160                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 161                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 162                 p[len++] = 0x80 | ( c & 0x3f );
 163         }
 164
 165         buf[len] = '\0';
 166         return len;
 167 }
 168
 169 /*
 170  * Advance to the next UTF-8 character
 171  *
 172  * Ignores length of multibyte character, instead rely on
 173  * continuation markers to find start of next character.
 174  * This allows for "resyncing" of when invalid characters
 175  * are provided provided the start of the next character
 176  * is appears within the 6 bytes examined.
 177  */
 178 char* ldap_utf8_next( const char * p )
 179 {
 180         int i;
 181         const unsigned char *u = p;
 182
 183         if( LDAP_UTF8_ISASCII(u) ) {
 184                 return (char *) &p[1];
 185         }
 186
 187         for( i=1; i<6; i++ ) {
 188                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 189                         return (char *) &p[i];
 190                 }
 191         }
 192
 193         return (char *) &p[i];
 194 }
 195
 196 /*
 197  * Advance to the previous UTF-8 character
 198  *
 199  * Ignores length of multibyte character, instead rely on
 200  * continuation markers to find start of next character.
 201  * This allows for "resyncing" of when invalid characters
 202  * are provided provided the start of the next character
 203  * is appears within the 6 bytes examined.
 204  */
 205 char* ldap_utf8_prev( const char * p )
 206 {
 207         int i;
 208         const unsigned char *u = p;
 209
 210         for( i=-1; i>-6 ; i-- ) {
 211                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 212                         return (char *) &p[i];
 213                 }
 214         }
 215
 216         return (char *) &p[i];
 217 }
 218
 219 /*
 220  * Copy one UTF-8 character from src to dst returning
 221  * number of bytes copied.
 222  *
 223  * Ignores length of multibyte character, instead rely on
 224  * continuation markers to find start of next character.
 225  * This allows for "resyncing" of when invalid characters
 226  * are provided provided the start of the next character
 227  * is appears within the 6 bytes examined.
 228  */
 229 int ldap_utf8_copy( char* dst, const char *src )
 230 {
 231         int i;
 232         const unsigned char *u = src;
 233
 234         dst[0] = src[0];
 235
 236         if( LDAP_UTF8_ISASCII(u) ) {
 237                 return 1;
 238         }
 239
 240         for( i=1; i<6; i++ ) {
 241                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 242                         return i;
 243                 }
 244                 dst[i] = src[i];
 245         }
 246
 247         return i;
 248 }
 249
 250 #ifndef UTF8_ALPHA_CTYPE
 251 /*
 252  * UTF-8 ctype routines
 253  * Only deals with characters < 0x80 (ie: US-ASCII)
 254  */
 255
 256 int ldap_utf8_isascii( const char * p )
 257 {
 258         unsigned c = * (const unsigned char *) p;
 259         return LDAP_ASCII(c);
 260 }
 261
 262 int ldap_utf8_isdigit( const char * p )
 263 {
 264         unsigned c = * (const unsigned char *) p;
 265
 266         if(!LDAP_ASCII(c)) return 0;
 267
 268         return LDAP_DIGIT( c );
 269 }
 270
 271 int ldap_utf8_isxdigit( const char * p )
 272 {
 273         unsigned c = * (const unsigned char *) p;
 274
 275         if(!LDAP_ASCII(c)) return 0;
 276
 277         return LDAP_HEX(c);
 278 }
 279
 280 int ldap_utf8_isspace( const char * p )
 281 {
 282         unsigned c = * (const unsigned char *) p;
 283
 284         if(!LDAP_ASCII(c)) return 0;
 285
 286         switch(c) {
 287         case ' ':
 288         case '\t':
 289         case '\n':
 290         case '\r':
 291         case '\v':
 292         case '\f':
 293                 return 1;
 294         }
 295
 296         return 0;
 297 }
 298
 299 /*
 300  * These are not needed by the C SDK and are
 301  * not "good enough" for general use.
 302  */
 303 int ldap_utf8_isalpha( const char * p )
 304 {
 305         unsigned c = * (const unsigned char *) p;
 306
 307         if(!LDAP_ASCII(c)) return 0;
 308
 309         return LDAP_ALPHA(c);
 310 }
 311
 312 int ldap_utf8_isalnum( const char * p )
 313 {
 314         unsigned c = * (const unsigned char *) p;
 315
 316         if(!LDAP_ASCII(c)) return 0;
 317
 318         return LDAP_ALNUM(c);
 319 }
 320
 321 int ldap_utf8_islower( const char * p )
 322 {
 323         unsigned c = * (const unsigned char *) p;
 324
 325         if(!LDAP_ASCII(c)) return 0;
 326
 327         return LDAP_LOWER(c);
 328 }
 329
 330 int ldap_utf8_isupper( const char * p )
 331 {
 332         unsigned c = * (const unsigned char *) p;
 333
 334         if(!LDAP_ASCII(c)) return 0;
 335
 336         return LDAP_UPPER(c);
 337 }
 338 #endif
 339
 340
 341 /*
 342  * UTF-8 string routines
 343  */
 344
 345 /* like strchr() */
 346 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 347 {
 348         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 349                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 350                         return (char *) str;
 351                 }
 352         }
 353
 354         return NULL;
 355 }
 356
 357 /* like strcspn() but returns number of bytes, not characters */
 358 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 359 {
 360         const char *cstr;
 361         const char *cset;
 362
 363         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 364                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 365                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 366                                 return cstr - str;
 367                         }
 368                 }
 369         }
 370
 371         return cstr - str;
 372 }
 373
 374 /* like strspn() but returns number of bytes, not characters */
 375 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 376 {
 377         const char *cstr;
 378         const char *cset;
 379
 380         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 381                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 382                         if( *cset == '\0' ) {
 383                                 return cstr - str;
 384                         }
 385
 386                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 387                                 break;
 388                         }
 389                 }
 390         }
 391
 392         return cstr - str;
 393 }
 394
 395 /* like strpbrk(), replaces strchr() as well */
 396 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 397 {
 398         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 399                 const char *cset;
 400
 401                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 402                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 403                                 return (char *) str;
 404                         }
 405                 }
 406         }
 407
 408         return NULL;
 409 }
 410
 411 /* like strtok_r(), not strtok() */
 412 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 413 {
 414         char *begin;
 415         char *end;
 416
 417         if( last == NULL ) return NULL;
 418
 419         begin = str ? str : *last;
 420
 421         begin += ldap_utf8_strspn( begin, sep );
 422
 423         if( *begin == '\0' ) {
 424                 *last = NULL;
 425                 return NULL;
 426         }
 427
 428         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 429
 430         if( *end != '\0' ) {
 431                 char *next = LDAP_UTF8_NEXT( end );
 432                 *end = '\0';
 433                 end = next;
 434         }
 435
 436         *last = end;
 437         return begin;
 438 }