3 * Copyright 1998-2003 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
7 * Portions Copyright (C) The Internet Society (1998)
8 * UTF-8 encodings are derived from those in RFC 2279;
9 * see RFC for full legal notices.
13 * Basic UTF-8 routines
15 * These routines are "dumb". Though they understand UTF-8,
16 * they don't grok Unicode. That is, they can push bits,
17 * but don't have a clue what the bits represent. That's
18 * good enough for use with the LDAP Client SDK.
20 * These routines are not optimized.
27 #include <ac/stdlib.h>
29 #include <ac/socket.h>
30 #include <ac/string.h>
33 #include "ldap_utf8.h"
36 #include "ldap_defaults.h"
39 * Basic UTF-8 routines
43 * return the number of bytes required to hold the
44 * NULL-terminated UTF-8 string NOT INCLUDING the
/*
 * ldap_utf8_bytes: scan p one byte at a time, starting at index 0,
 * until the first zero byte; `bytes` is the index of that zero byte.
 * NOTE(review): the declaration of `bytes`, the loop body and the
 * return statement are not visible here -- presumably the final value
 * of `bytes` is returned; confirm.
 */
47 ber_len_t ldap_utf8_bytes( const char * p )
51 for( bytes=0; p[bytes]; bytes++ ) {
/*
 * ldap_utf8_chars: walk the string at p until a zero byte is reached,
 * moving p forward with LDAP_UTF8_INCR after every iteration.
 * No validation of the byte sequences is done in the visible code.
 * NOTE(review): the loop body and the return statement are not visible
 * here -- presumably one character is counted per iteration; confirm.
 */
58 ber_len_t ldap_utf8_chars( const char * p )
60 /* could be optimized and could check for invalid sequences */
63 for( ; *p ; LDAP_UTF8_INCR(p) ) {
70 /* return offset to next character */
/*
 * ldap_utf8_offset: distance from p to the pointer produced by
 * LDAP_UTF8_NEXT(p).  The pointer difference is converted to int by
 * the return type.
 */
71 int ldap_utf8_offset( const char * p )
73 return LDAP_UTF8_NEXT(p) - p;
77 * Returns length indicated by first byte.
/*
 * ldap_utf8_lentab: 128-entry table of sequence lengths.
 * ldap_utf8_charlen() indexes it with (first byte ^ 0x80), so entry k
 * describes first byte 0x80+k:
 *   entries 0x00-0x3f (bytes 0x80-0xbf) -> 0
 *   entries 0x40-0x41 (bytes 0xc0-0xc1) -> 0
 *   entries 0x42-0x5f (bytes 0xc2-0xdf) -> 2
 *   entries 0x60-0x6f (bytes 0xe0-0xef) -> 3
 *   entries 0x70-0x77 (bytes 0xf0-0xf7) -> 4
 *   entries 0x78-0x7b (bytes 0xf8-0xfb) -> 5
 *   entries 0x7c-0x7d (bytes 0xfc-0xfd) -> 6
 *   entries 0x7e-0x7f (bytes 0xfe-0xff) -> 0
 */
79 const char ldap_utf8_lentab[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
83 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
84 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
85 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
86 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
87 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
/*
 * ldap_utf8_charlen: look up the length for the byte at p in
 * ldap_utf8_lentab[], using (byte ^ 0x80) as the index.
 * NOTE(review): the table has 128 entries, so this lookup is only in
 * range for bytes >= 0x80.  The lines between the signature and this
 * return are not visible here -- presumably bytes < 0x80 are handled
 * before the lookup; confirm.
 */
89 int ldap_utf8_charlen( const char * p )
94 return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
98 * Make sure the UTF-8 char used the shortest possible encoding
99 * returns charlen if valid, 0 if not.
101 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
102 * The table is slightly modified from that of the RFC.
104 * UCS-4 range (hex) UTF-8 sequence (binary)
105 * 0000 0000-0000 007F 0.......
106 * 0000 0080-0000 07FF 110++++. 10......
107 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
108 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
109 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
110 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
112 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
113 * at least one of the '+' bits must be set, otherwise the character
114 * should have been encoded in fewer octets. Note that in the two-octet
115 * case, only the first octet needs to be validated, and this is done
116 * in the ldap_utf8_lentab[] above.
119 /* mask of required bits in second octet */
/*
 * ldap_utf8_mintab: 32-entry table read by ldap_utf8_charlen2() as
 * ldap_utf8_mintab[first byte & 0x1f] and ANDed with the second byte.
 * Entry 0x00 is 0x20, entry 0x10 is 0x30, entry 0x18 is 0x38 and
 * entry 0x1c is 0x3c; the last two entries are 0x00 and every other
 * entry is 0x80.
 * `c` serves as both the element type and the cast on each entry.
 * NOTE(review): the definition of `c` is not visible here.
 */
122 c ldap_utf8_mintab[] = {
123 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
124 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
125 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
126 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
/*
 * ldap_utf8_charlen2: start from the length reported by
 * LDAP_UTF8_CHARLEN(p), then test the second byte against the
 * ldap_utf8_mintab[] mask selected by the low five bits of the first
 * byte.  The condition is true when none of the mask bits is set in
 * the second byte.
 * NOTE(review): any guard around this test, the statement it controls
 * and the return statement are not visible here.
 */
129 int ldap_utf8_charlen2( const char * p )
131 int i = LDAP_UTF8_CHARLEN( p );
134 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
140 /* conv UTF-8 to UCS-4, useful for comparisons */
/*
 * ldap_x_utf8_to_ucs4: decode the UTF-8 sequence that starts at p.
 * Returns LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2 yields a length of
 * 0, or when any byte after the first is not of the form 10xxxxxx.
 * NOTE(review): the declarations of len, ch and i, the accumulation
 * of the following bytes into ch and the final return are not
 * visible here.
 */
141 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
143 const unsigned char *c = (const unsigned char *) p;
/* mask[len]: bits of the first byte kept for a len-byte sequence */
146 static unsigned char mask[] = {
147 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
149 len = LDAP_UTF8_CHARLEN2(p, len);
151 if( len == 0 ) return LDAP_UCS4_INVALID;
153 ch = c[0] & mask[len];
/* each byte after the first must have its top two bits equal to 10 */
155 for(i=1; i < len; i++) {
156 if ((c[i] & 0xc0) != 0x80) {
157 return LDAP_UCS4_INVALID;
167 /* conv UCS-4 to UTF-8, not used */
/*
 * ldap_x_ucs4_to_utf8: encode the code c as a UTF-8 sequence in buf.
 * A negative c yields 0.  The ladder under "Just return length"
 * returns the sequence length without storing anything; the branches
 * that follow store the bytes through p (an unsigned view of buf).
 * In those branches the first byte is 0xc0/0xe0/0xf0/0xf8/0xfc ORed
 * with the top bits of c, and every later byte is 0x80 ORed with the
 * next six bits of c.
 * NOTE(review): the condition that selects the length-only path
 * (presumably a NULL buf -- confirm), the declaration of len, the
 * one-byte branch and the final return are not visible here.
 */
168 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
171 unsigned char* p = (unsigned char *) buf;
173 /* not a valid Unicode character */
174 if ( c < 0 ) return 0;
176 /* Just return length, don't convert */
178 if( c < 0x80 ) return 1;
179 else if( c < 0x800 ) return 2;
180 else if( c < 0x10000 ) return 3;
181 else if( c < 0x200000 ) return 4;
182 else if( c < 0x4000000 ) return 5;
/* two bytes: c < 0x800 */
189 } else if( c < 0x800 ) {
190 p[len++] = 0xc0 | ( c >> 6 );
191 p[len++] = 0x80 | ( c & 0x3f );
/* three bytes: c < 0x10000 */
193 } else if( c < 0x10000 ) {
194 p[len++] = 0xe0 | ( c >> 12 );
195 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
196 p[len++] = 0x80 | ( c & 0x3f );
/* four bytes: c < 0x200000 */
198 } else if( c < 0x200000 ) {
199 p[len++] = 0xf0 | ( c >> 18 );
200 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
201 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
202 p[len++] = 0x80 | ( c & 0x3f );
/* five bytes: c < 0x4000000 */
204 } else if( c < 0x4000000 ) {
205 p[len++] = 0xf8 | ( c >> 24 );
206 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
207 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
208 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
209 p[len++] = 0x80 | ( c & 0x3f );
/* six bytes: everything else */
211 } else /* if( c < 0x80000000 ) */ {
212 p[len++] = 0xfc | ( c >> 30 );
213 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
214 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
215 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
216 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
217 p[len++] = 0x80 | ( c & 0x3f );
/*
 * LDAP_UCS_UTF8LEN(c): number of octets needed to encode the code c
 * as UTF-8; 0 for a negative (invalid) code, otherwise 1..6.
 *
 * The parameter and the whole replacement list are parenthesized so
 * that the macro can be given any expression as argument and can be
 * used inside a larger expression.  The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
227 /* Convert a string to UTF-8 format. The input string is expected to
228 * have characters of 1, 2, or 4 octets (in network byte order)
229 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
230 * types respectively. (Here T61STRING just means that there is one
231 * octet per character and characters may use the high bit of the octet.
232 * The characters are assumed to use ISO mappings, no provision is made
233 * for converting from T.61 coding rules to Unicode.)
/*
 * ldap_ucs_to_utf8s: convert the csize-octet characters in ucs into a
 * newly allocated UTF-8 string stored in utf8s->bv_val.
 * utf8s->bv_val is set to NULL on entry.  The visible code sizes each
 * character with LDAP_UCS_UTF8LEN, allocates l+1 bytes with
 * LDAP_MALLOC, then walks the input again from the start and appends
 * each character with ldap_x_ucs4_to_utf8().
 * Visible failure returns: LDAP_INVALID_SYNTAX and, when the
 * allocation fails, LDAP_NO_MEMORY.
 * NOTE(review): the return type, the declarations of u, i, l and ptr,
 * the code that assembles u from the input octets, the condition
 * guarding LDAP_INVALID_SYNTAX and the final return are not visible.
 */
237 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
239 unsigned char *in, *end;
244 utf8s->bv_val = NULL;
247 in = (unsigned char *)ucs->bv_val;
249 /* Make sure we stop at an even multiple of csize */
/* ~(csize-1) rounds down to a multiple of csize only for a power of two */
250 end = in + ( ucs->bv_len & ~(csize-1) );
264 i = LDAP_UCS_UTF8LEN(u);
266 return LDAP_INVALID_SYNTAX;
/* l+1: one byte more than the accumulated length l */
270 utf8s->bv_val = LDAP_MALLOC( l+1 );
271 if (utf8s->bv_val == NULL)
272 return LDAP_NO_MEMORY;
/* second walk over the same input, this time writing the output */
276 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
288 ptr += ldap_x_ucs4_to_utf8(u, ptr);
295 * Advance to the next UTF-8 character
297 * Ignores length of multibyte character; instead relies on
298 * continuation markers to find the start of the next character.
299 * This allows for "resyncing" when invalid characters are
300 * provided, as long as the start of the next character
301 * appears within the 6 bytes examined.
/*
 * ldap_utf8_next: return a pointer past the character at p.
 * If LDAP_UTF8_ISASCII(u) holds, that is p+1.  Otherwise the bytes at
 * offsets 1..5 are examined and the address of the first one that is
 * not of the form 10xxxxxx is returned; if all five are of that form
 * the result is p+6.  The const qualifier of p is cast away in the
 * returned pointer.
 */
303 char* ldap_utf8_next( const char * p )
306 const unsigned char *u = (const unsigned char *) p;
308 if( LDAP_UTF8_ISASCII(u) ) {
309 return (char *) &p[1];
/* skip bytes whose top two bits are 10 */
312 for( i=1; i<6; i++ ) {
313 if ( ( u[i] & 0xc0 ) != 0x80 ) {
314 return (char *) &p[i];
/* reached with i == 6: no terminating byte found in offsets 1..5 */
318 return (char *) &p[i];
322 * Advance to the previous UTF-8 character
324 * Ignores length of multibyte character; instead relies on
325 * continuation markers to find the start of the previous character.
326 * This allows for "resyncing" when invalid characters are
327 * provided, as long as the start of the previous character
328 * appears within the 6 bytes examined.
/*
 * ldap_utf8_prev: return a pointer to the nearest byte before p that
 * is not of the form 10xxxxxx, examining offsets -1 down to -5.  If
 * all five bytes are of that form the result is p-6.
 * The function reads memory located before p, so those bytes must be
 * readable.  The const qualifier of p is cast away in the returned
 * pointer.
 */
330 char* ldap_utf8_prev( const char * p )
333 const unsigned char *u = (const unsigned char *) p;
/* negative index: walks backwards from p */
335 for( i=-1; i>-6 ; i-- ) {
336 if ( ( u[i] & 0xc0 ) != 0x80 ) {
337 return (char *) &p[i];
/* reached with i == -6 */
341 return (char *) &p[i];
345 * Copy one UTF-8 character from src to dst returning
346 * number of bytes copied.
348 * Ignores length of multibyte character; instead relies on
349 * continuation markers to find the start of the next character.
350 * This allows for "resyncing" when invalid characters are
351 * provided, as long as the start of the next character
352 * appears within the 6 bytes examined.
/*
 * ldap_utf8_copy: the visible tests mirror ldap_utf8_next(): an
 * LDAP_UTF8_ISASCII check on the first byte of src, then a scan of
 * offsets 1..5 for the first byte that is not of the form 10xxxxxx.
 * NOTE(review): the stores into dst and the return statements are not
 * visible here.
 */
354 int ldap_utf8_copy( char* dst, const char *src )
357 const unsigned char *u = (const unsigned char *) src;
361 if( LDAP_UTF8_ISASCII(u) ) {
365 for( i=1; i<6; i++ ) {
366 if ( ( u[i] & 0xc0 ) != 0x80 ) {
375 #ifndef UTF8_ALPHA_CTYPE
377 * UTF-8 ctype routines
378 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * ldap_utf8_isascii: read the first byte at p as an unsigned value and
 * return the result of LDAP_ASCII() on it.
 */
381 int ldap_utf8_isascii( const char * p )
383 unsigned c = * (const unsigned char *) p;
384 return LDAP_ASCII(c);
/*
 * ldap_utf8_isdigit: read the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_DIGIT() on it.
 */
387 int ldap_utf8_isdigit( const char * p )
389 unsigned c = * (const unsigned char *) p;
391 if(!LDAP_ASCII(c)) return 0;
393 return LDAP_DIGIT( c );
/*
 * ldap_utf8_isxdigit: read the first byte at p as an unsigned value
 * and return 0 when LDAP_ASCII() rejects it.
 * NOTE(review): the classification of bytes that pass the LDAP_ASCII
 * check and the final return are not visible here.
 */
396 int ldap_utf8_isxdigit( const char * p )
398 unsigned c = * (const unsigned char *) p;
400 if(!LDAP_ASCII(c)) return 0;
/*
 * ldap_utf8_isspace: read the first byte at p as an unsigned value
 * and return 0 when LDAP_ASCII() rejects it.
 * NOTE(review): the classification of bytes that pass the LDAP_ASCII
 * check and the final return are not visible here.
 */
405 int ldap_utf8_isspace( const char * p )
407 unsigned c = * (const unsigned char *) p;
409 if(!LDAP_ASCII(c)) return 0;
425 * These are not needed by the C SDK and are
426 * not "good enough" for general use.
/*
 * ldap_utf8_isalpha: read the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_ALPHA() on it.
 */
428 int ldap_utf8_isalpha( const char * p )
430 unsigned c = * (const unsigned char *) p;
432 if(!LDAP_ASCII(c)) return 0;
434 return LDAP_ALPHA(c);
/*
 * ldap_utf8_isalnum: read the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_ALNUM() on it.
 */
437 int ldap_utf8_isalnum( const char * p )
439 unsigned c = * (const unsigned char *) p;
441 if(!LDAP_ASCII(c)) return 0;
443 return LDAP_ALNUM(c);
/*
 * ldap_utf8_islower: read the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_LOWER() on it.
 */
446 int ldap_utf8_islower( const char * p )
448 unsigned c = * (const unsigned char *) p;
450 if(!LDAP_ASCII(c)) return 0;
452 return LDAP_LOWER(c);
/*
 * ldap_utf8_isupper: read the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_UPPER() on it.
 */
455 int ldap_utf8_isupper( const char * p )
457 unsigned c = * (const unsigned char *) p;
459 if(!LDAP_ASCII(c)) return 0;
461 return LDAP_UPPER(c);
467 * UTF-8 string routines
/*
 * ldap_utf8_strchr: step through str with LDAP_UTF8_INCR until a zero
 * byte, comparing the ldap_x_utf8_to_ucs4() value at each position
 * with the ldap_x_utf8_to_ucs4() value of chr.
 * The function name is parenthesized in the definition, so a
 * function-like macro of the same name, if one exists, is not
 * expanded here.
 * NOTE(review): the statements run on a match and the return after
 * the loop are not visible here.
 */
471 char * (ldap_utf8_strchr)( const char *str, const char *chr )
473 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
/* chr is decoded again on every iteration */
474 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
482 /* like strcspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strcspn: for each position cstr in str (advanced with
 * LDAP_UTF8_INCR until a zero byte), walk every position cset in set
 * and compare the two ldap_x_utf8_to_ucs4() values.
 * NOTE(review): the declarations of cstr and cset, the statements run
 * on a match and the final return are not visible here.
 */
483 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
488 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
489 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
490 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
499 /* like strspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strspn: for each position cstr in str (advanced with
 * LDAP_UTF8_INCR until a zero byte), walk set with cset.  The inner
 * loop has no loop condition; it tests for the end of set and for a
 * match of the two ldap_x_utf8_to_ucs4() values inside its body.
 * NOTE(review): the declarations of cstr and cset, the statements run
 * in either of those two cases and the final return are not visible.
 */
500 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
505 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
506 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
/* end of set reached */
507 if( *cset == '\0' ) {
511 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
520 /* like strpbrk(), replaces strchr() as well */
/*
 * ldap_utf8_strpbrk: for each position in str (advanced with
 * LDAP_UTF8_INCR until a zero byte), walk every position cset in set
 * and compare the two ldap_x_utf8_to_ucs4() values.
 * NOTE(review): the declaration of cset, the statements run on a
 * match and the return after the loops are not visible here.
 */
521 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
523 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
526 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
527 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
536 /* like strtok_r(), not strtok() */
537 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
542 if( last == NULL ) return NULL;
544 begin = str ? str : *last;
546 begin += ldap_utf8_strspn( begin, sep );
548 if( *begin == '\0' ) {
553 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
556 char *next = LDAP_UTF8_NEXT( end );