git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8.c

   1 /* utf-8.c -- Basic UTF-8 routines */
   2 /* $OpenLDAP$ */
   3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   4  *
   5  * Copyright 1998-2004 The OpenLDAP Foundation.
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted only as authorized by the OpenLDAP
  10  * Public License.
  11  *
  12  * A copy of this license is available in the file LICENSE in the
  13  * top-level directory of the distribution or, alternatively, at
  14  * <http://www.OpenLDAP.org/license.html>.
  15  */
  16 /* Portions Copyright (C) The Internet Society (1998)
  17  * UTF-8 encodings are derived from those in RFC 2279;
  18  * see RFC for full legal notices.
  19  */
  20
  21 /* Basic UTF-8 routines
  22  *
  23  * These routines are "dumb".  Though they understand UTF-8,
  24  * they don't grok Unicode.  That is, they can push bits,
  25  * but don't have a clue what the bits represent.  That's
  26  * good enough for use with the LDAP Client SDK.
  27  *
  28  * These routines are not optimized.
  29  */
  30
  31 #include "portable.h"
  32
  33 #include <stdio.h>
  34
  35 #include <ac/stdlib.h>
  36
  37 #include <ac/socket.h>
  38 #include <ac/string.h>
  39 #include <ac/time.h>
  40
  41 #include "ldap_utf8.h"
  42
  43 #include "ldap-int.h"
  44 #include "ldap_defaults.h"
  45
  46 /*
  47  * return the number of bytes required to hold the
  48  * NULL-terminated UTF-8 string NOT INCLUDING the
  49  * termination.
  50  */
  51 ber_len_t ldap_utf8_bytes( const char * p )
  52 {
  53         ber_len_t bytes;
  54
  55         for( bytes=0; p[bytes]; bytes++ ) {
  56                 /* EMPTY */ ;
  57         }
  58
  59         return bytes;
  60 }
  61
  62 ber_len_t ldap_utf8_chars( const char * p )
  63 {
  64         /* could be optimized and could check for invalid sequences */
  65         ber_len_t chars=0;
  66
  67         for( ; *p ; LDAP_UTF8_INCR(p) ) {
  68                 chars++;
  69         }
  70
  71         return chars;
  72 }
  73
  74 /* return offset to next character */
  75 int ldap_utf8_offset( const char * p )
  76 {
  77         return LDAP_UTF8_NEXT(p) - p;
  78 }
  79
  80 /*
  81  * Returns length indicated by first byte.
  82  */
  83 const char ldap_utf8_lentab[] = {
  84         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  85         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  86         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  87         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  88         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  91         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
  92
  93 int ldap_utf8_charlen( const char * p )
  94 {
  95         if (!(*p & 0x80))
  96                 return 1;
  97
  98         return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
  99 }
 100
 101 /*
 102  * Make sure the UTF-8 char used the shortest possible encoding
 103  * returns charlen if valid, 0 if not.
 104  *
 105  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 106  * The table is slightly modified from that of the RFC.
 107  *
 108  * UCS-4 range (hex)      UTF-8 sequence (binary)
 109  * 0000 0000-0000 007F   0.......
 110  * 0000 0080-0000 07FF   110++++. 10......
 111  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 112  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 113  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 114  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 115  *
 116  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 117  * at least one of the '+' bits must be set, otherwise the character
 118  * should have been encoded in fewer octets. Note that in the two-octet
 119  * case, only the first octet needs to be validated, and this is done
 120  * in the ldap_utf8_lentab[] above.
 121  */
 122
 123 /* mask of required bits in second octet */
 124 #undef c
 125 #define c const char
 126 c ldap_utf8_mintab[] = {
 127         (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 128         (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 129         (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 130         (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
 131 #undef c
 132
 133 int ldap_utf8_charlen2( const char * p )
 134 {
 135         int i = LDAP_UTF8_CHARLEN( p );
 136
 137         if ( i > 2 ) {
 138                 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
 139                         i = 0;
 140         }
 141         return i;
 142 }
 143
 144 /* conv UTF-8 to UCS-4, useful for comparisons */
 145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
 146 {
 147     const unsigned char *c = (const unsigned char *) p;
 148     ldap_ucs4_t ch;
 149         int len, i;
 150         static unsigned char mask[] = {
 151                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 152
 153         len = LDAP_UTF8_CHARLEN2(p, len);
 154
 155         if( len == 0 ) return LDAP_UCS4_INVALID;
 156
 157         ch = c[0] & mask[len];
 158
 159         for(i=1; i < len; i++) {
 160                 if ((c[i] & 0xc0) != 0x80) {
 161                         return LDAP_UCS4_INVALID;
 162                 }
 163
 164                 ch <<= 6;
 165                 ch |= c[i] & 0x3f;
 166         }
 167
 168         return ch;
 169 }
 170
 171 /* conv UCS-4 to UTF-8, not used */
 172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
 173 {
 174         int len=0;
 175         unsigned char* p = (unsigned char *) buf;
 176
 177         /* not a valid Unicode character */
 178         if ( c < 0 ) return 0;
 179
 180         /* Just return length, don't convert */
 181         if(buf == NULL) {
 182                 if( c < 0x80 ) return 1;
 183                 else if( c < 0x800 ) return 2;
 184                 else if( c < 0x10000 ) return 3;
 185                 else if( c < 0x200000 ) return 4;
 186                 else if( c < 0x4000000 ) return 5;
 187                 else return 6;
 188         }
 189
 190         if( c < 0x80 ) {
 191                 p[len++] = c;
 192
 193         } else if( c < 0x800 ) {
 194                 p[len++] = 0xc0 | ( c >> 6 );
 195                 p[len++] = 0x80 | ( c & 0x3f );
 196
 197         } else if( c < 0x10000 ) {
 198                 p[len++] = 0xe0 | ( c >> 12 );
 199                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 200                 p[len++] = 0x80 | ( c & 0x3f );
 201
 202         } else if( c < 0x200000 ) {
 203                 p[len++] = 0xf0 | ( c >> 18 );
 204                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 205                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 206                 p[len++] = 0x80 | ( c & 0x3f );
 207
 208         } else if( c < 0x4000000 ) {
 209                 p[len++] = 0xf8 | ( c >> 24 );
 210                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 211                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 212                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 213                 p[len++] = 0x80 | ( c & 0x3f );
 214
 215         } else /* if( c < 0x80000000 ) */ {
 216                 p[len++] = 0xfc | ( c >> 30 );
 217                 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
 218                 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
 219                 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 220                 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 221                 p[len++] = 0x80 | ( c & 0x3f );
 222         }
 223
 224         return len;
 225 }
 226
 227 #define LDAP_UCS_UTF8LEN(c)     \
 228         c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
 229         (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
 230
 231 /* Convert a string to UTF-8 format. The input string is expected to
 232  * have characters of 1, 2, or 4 octets (in network byte order)
 233  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
 234  * types respectively. (Here T61STRING just means that there is one
 235  * octet per character and characters may use the high bit of the octet.
 236  * The characters are assumed to use ISO mappings, no provision is made
 237  * for converting from T.61 coding rules to Unicode.)
 238  */
 239
 240 int
 241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
 242 {
 243         unsigned char *in, *end;
 244         char *ptr;
 245         ldap_ucs4_t u;
 246         int i, l = 0;
 247
 248         utf8s->bv_val = NULL;
 249         utf8s->bv_len = 0;
 250
 251         in = (unsigned char *)ucs->bv_val;
 252
 253         /* Make sure we stop at an even multiple of csize */
 254         end = in + ( ucs->bv_len & ~(csize-1) );
 255
 256         for (; in < end; ) {
 257                 u = *in++;
 258                 if (csize > 1) {
 259                         u <<= 8;
 260                         u |= *in++;
 261                 }
 262                 if (csize > 2) {
 263                         u <<= 8;
 264                         u |= *in++;
 265                         u <<= 8;
 266                         u |= *in++;
 267                 }
 268                 i = LDAP_UCS_UTF8LEN(u);
 269                 if (i == 0)
 270                         return LDAP_INVALID_SYNTAX;
 271                 l += i;
 272         }
 273
 274         utf8s->bv_val = LDAP_MALLOC( l+1 );
 275         if (utf8s->bv_val == NULL)
 276                 return LDAP_NO_MEMORY;
 277         utf8s->bv_len = l;
 278
 279         ptr = utf8s->bv_val;
 280         for (in = (unsigned char *)ucs->bv_val; in < end; ) {
 281                 u = *in++;
 282                 if (csize > 1) {
 283                         u <<= 8;
 284                         u |= *in++;
 285                 }
 286                 if (csize > 2) {
 287                         u <<= 8;
 288                         u |= *in++;
 289                         u <<= 8;
 290                         u |= *in++;
 291                 }
 292                 ptr += ldap_x_ucs4_to_utf8(u, ptr);
 293         }
 294         *ptr = '\0';
 295         return LDAP_SUCCESS;
 296 }
 297
 298 /*
 299  * Advance to the next UTF-8 character
 300  *
 301  * Ignores length of multibyte character, instead rely on
 302  * continuation markers to find start of next character.
 303  * This allows for "resyncing" of when invalid characters
 304  * are provided provided the start of the next character
 305  * is appears within the 6 bytes examined.
 306  */
 307 char* ldap_utf8_next( const char * p )
 308 {
 309         int i;
 310         const unsigned char *u = (const unsigned char *) p;
 311
 312         if( LDAP_UTF8_ISASCII(u) ) {
 313                 return (char *) &p[1];
 314         }
 315
 316         for( i=1; i<6; i++ ) {
 317                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 318                         return (char *) &p[i];
 319                 }
 320         }
 321
 322         return (char *) &p[i];
 323 }
 324
 325 /*
 326  * Advance to the previous UTF-8 character
 327  *
 328  * Ignores length of multibyte character, instead rely on
 329  * continuation markers to find start of next character.
 330  * This allows for "resyncing" of when invalid characters
 331  * are provided provided the start of the next character
 332  * is appears within the 6 bytes examined.
 333  */
 334 char* ldap_utf8_prev( const char * p )
 335 {
 336         int i;
 337         const unsigned char *u = (const unsigned char *) p;
 338
 339         for( i=-1; i>-6 ; i-- ) {
 340                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 341                         return (char *) &p[i];
 342                 }
 343         }
 344
 345         return (char *) &p[i];
 346 }
 347
 348 /*
 349  * Copy one UTF-8 character from src to dst returning
 350  * number of bytes copied.
 351  *
 352  * Ignores length of multibyte character, instead rely on
 353  * continuation markers to find start of next character.
 354  * This allows for "resyncing" of when invalid characters
 355  * are provided provided the start of the next character
 356  * is appears within the 6 bytes examined.
 357  */
 358 int ldap_utf8_copy( char* dst, const char *src )
 359 {
 360         int i;
 361         const unsigned char *u = (const unsigned char *) src;
 362
 363         dst[0] = src[0];
 364
 365         if( LDAP_UTF8_ISASCII(u) ) {
 366                 return 1;
 367         }
 368
 369         for( i=1; i<6; i++ ) {
 370                 if ( ( u[i] & 0xc0 ) != 0x80 ) {
 371                         return i;
 372                 }
 373                 dst[i] = src[i];
 374         }
 375
 376         return i;
 377 }
 378
 379 #ifndef UTF8_ALPHA_CTYPE
 380 /*
 381  * UTF-8 ctype routines
 382  * Only deals with characters < 0x80 (ie: US-ASCII)
 383  */
 384
 385 int ldap_utf8_isascii( const char * p )
 386 {
 387         unsigned c = * (const unsigned char *) p;
 388         return LDAP_ASCII(c);
 389 }
 390
 391 int ldap_utf8_isdigit( const char * p )
 392 {
 393         unsigned c = * (const unsigned char *) p;
 394
 395         if(!LDAP_ASCII(c)) return 0;
 396
 397         return LDAP_DIGIT( c );
 398 }
 399
 400 int ldap_utf8_isxdigit( const char * p )
 401 {
 402         unsigned c = * (const unsigned char *) p;
 403
 404         if(!LDAP_ASCII(c)) return 0;
 405
 406         return LDAP_HEX(c);
 407 }
 408
 409 int ldap_utf8_isspace( const char * p )
 410 {
 411         unsigned c = * (const unsigned char *) p;
 412
 413         if(!LDAP_ASCII(c)) return 0;
 414
 415         switch(c) {
 416         case ' ':
 417         case '\t':
 418         case '\n':
 419         case '\r':
 420         case '\v':
 421         case '\f':
 422                 return 1;
 423         }
 424
 425         return 0;
 426 }
 427
 428 /*
 429  * These are not needed by the C SDK and are
 430  * not "good enough" for general use.
 431  */
 432 int ldap_utf8_isalpha( const char * p )
 433 {
 434         unsigned c = * (const unsigned char *) p;
 435
 436         if(!LDAP_ASCII(c)) return 0;
 437
 438         return LDAP_ALPHA(c);
 439 }
 440
 441 int ldap_utf8_isalnum( const char * p )
 442 {
 443         unsigned c = * (const unsigned char *) p;
 444
 445         if(!LDAP_ASCII(c)) return 0;
 446
 447         return LDAP_ALNUM(c);
 448 }
 449
 450 int ldap_utf8_islower( const char * p )
 451 {
 452         unsigned c = * (const unsigned char *) p;
 453
 454         if(!LDAP_ASCII(c)) return 0;
 455
 456         return LDAP_LOWER(c);
 457 }
 458
 459 int ldap_utf8_isupper( const char * p )
 460 {
 461         unsigned c = * (const unsigned char *) p;
 462
 463         if(!LDAP_ASCII(c)) return 0;
 464
 465         return LDAP_UPPER(c);
 466 }
 467 #endif
 468
 469
 470 /*
 471  * UTF-8 string routines
 472  */
 473
 474 /* like strchr() */
 475 char * (ldap_utf8_strchr)( const char *str, const char *chr )
 476 {
 477         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 478                 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
 479                         return (char *) str;
 480                 }
 481         }
 482
 483         return NULL;
 484 }
 485
 486 /* like strcspn() but returns number of bytes, not characters */
 487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 488 {
 489         const char *cstr;
 490         const char *cset;
 491
 492         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 493                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 494                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 495                                 return cstr - str;
 496                         }
 497                 }
 498         }
 499
 500         return cstr - str;
 501 }
 502
 503 /* like strspn() but returns number of bytes, not characters */
 504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 505 {
 506         const char *cstr;
 507         const char *cset;
 508
 509         for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
 510                 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 511                         if( *cset == '\0' ) {
 512                                 return cstr - str;
 513                         }
 514
 515                         if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
 516                                 break;
 517                         }
 518                 }
 519         }
 520
 521         return cstr - str;
 522 }
 523
 524 /* like strpbrk(), replaces strchr() as well */
 525 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 526 {
 527         for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 528                 const char *cset;
 529
 530                 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 531                         if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
 532                                 return (char *) str;
 533                         }
 534                 }
 535         }
 536
 537         return NULL;
 538 }
 539
 540 /* like strtok_r(), not strtok() */
 541 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 542 {
 543         char *begin;
 544         char *end;
 545
 546         if( last == NULL ) return NULL;
 547
 548         begin = str ? str : *last;
 549
 550         begin += ldap_utf8_strspn( begin, sep );
 551
 552         if( *begin == '\0' ) {
 553                 *last = NULL;
 554                 return NULL;
 555         }
 556
 557         end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 558
 559         if( *end != '\0' ) {
 560                 char *next = LDAP_UTF8_NEXT( end );
 561                 *end = '\0';
 562                 end = next;
 563         }
 564
 565         *last = end;
 566         return begin;
 567 }