3 * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
10 * These routines are "dumb". Though they understand UTF-8,
11 * they don't grok Unicode. That is, they can push bits,
12 * but don't have a clue what the bits represent. That's
13 * good enough for use with the LDAP Client SDK.
15 * These routines are not optimized.
22 #include <ac/stdlib.h>
24 #include <ac/socket.h>
25 #include <ac/string.h>
28 #include "ldap_utf8.h"
31 #include "ldap_defaults.h"
34 * Basic UTF-8 routines
38 * return the number of bytes required to hold the
39 * NULL-terminated UTF-8 string NOT INCLUDING the
/*
 * ldap_utf8_bytes: scan p one byte at a time, starting at index 0, and
 * stop at the first zero byte, so `bytes` ends up as the index of the
 * terminator.
 * NOTE(review): the declaration of `bytes` and the return statement are
 * not part of this listing; presumably `bytes` is returned -- confirm.
 */
42 ber_len_t ldap_utf8_bytes( const char * p )
46 for( bytes=0; p[bytes]; bytes++ ) {
/*
 * ldap_utf8_chars: step through p with LDAP_UTF8_INCR until the zero
 * byte is reached.
 * NOTE(review): the loop body and the return are not shown in this
 * listing; presumably a counter is bumped once per step and returned
 * as the character count -- confirm.
 */
53 ber_len_t ldap_utf8_chars( const char * p )
55 /* could be optimized and could check for invalid sequences */
/* one iteration per LDAP_UTF8_INCR step; ends when *p is the zero byte */
58 for( ; *p ; LDAP_UTF8_INCR(p) ) {
65 /* return offset to next character */
/*
 * ldap_utf8_offset: distance, in bytes, from p to the position that
 * LDAP_UTF8_NEXT(p) yields.  The pointer difference is converted to the
 * int return type.
 */
66 int ldap_utf8_offset( const char * p )
68 return LDAP_UTF8_NEXT(p) - p;
72 * Returns length indicated by first byte.
/*
 * Sequence length keyed by lead byte, for bytes with the high bit set.
 * The table has 128 entries; ldap_utf8_charlen() indexes it with
 * (byte ^ 0x80), so entry 0 corresponds to byte 0x80 and each row below
 * covers 16 consecutive byte values:
 *   0x80-0xBF -> 0  (continuation bytes cannot start a character)
 *   0xC0-0xC1 -> 0  (two-octet form with no '+' bit set, i.e. not shortest)
 *   0xC2-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFB -> 5
 *   0xFC-0xFD -> 6
 *   0xFE-0xFF -> 0
 */
74 const char ldap_utf8_lentab[] = {
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
82 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
/*
 * ldap_utf8_charlen: look up the sequence length announced by the first
 * byte of p in ldap_utf8_lentab[].
 * The byte is read as unsigned and XORed with 0x80, which maps
 * 0x80..0xFF onto table indices 0..127.
 * NOTE(review): for a byte below 0x80 the same expression yields an
 * index of 128 or more, past the end of the 128-entry table.  Presumably
 * a check for the ASCII case precedes this return in lines that are not
 * part of this listing -- confirm.
 */
84 int ldap_utf8_charlen( const char * p )
89 return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
93 * Make sure the UTF-8 char used the shortest possible encoding
94 * returns charlen if valid, 0 if not.
96 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
97 * The table is slightly modified from that of the RFC.
99 * UCS-4 range (hex) UTF-8 sequence (binary)
100 * 0000 0000-0000 007F 0.......
101 * 0000 0080-0000 07FF 110++++. 10......
102 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
103 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
104 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
105 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
107 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
108 * at least one of the '+' bits must be set, otherwise the character
109 * should have been encoded in fewer octets. Note that in the two-octet
110 * case, only the first octet needs to be validated, and this is done
111 * in the ldap_utf8_lentab[] above.
114 /* mask of required bits in second octet */
/*
 * 32-entry table indexed by (lead byte & 0x1f) in ldap_utf8_charlen2().
 * Each entry is ANDed with the second octet; a zero result means none of
 * the required bits are set.  The distinguished entries match the '+'
 * bits of the second octet in the encoding table of this file:
 *   index 0x00 (lead 0xE0) -> 0x20   three-octet form, 10+.....
 *   index 0x10 (lead 0xF0) -> 0x30   four-octet form,  10++....
 *   index 0x18 (lead 0xF8) -> 0x38   five-octet form,  10+++...
 *   index 0x1c (lead 0xFC) -> 0x3c   six-octet form,   10++++..
 *   index 0x1e, 0x1f       -> 0x00   the AND can never be non-zero
 * Every other entry is 0x80, which any octet with its top bit set passes.
 * NOTE(review): `c` is not defined in the visible lines; presumably a
 * macro for the element type, also used for the casts below -- confirm.
 */
117 c ldap_utf8_mintab[] = {
118 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
119 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
120 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
121 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
/*
 * ldap_utf8_charlen2: start from the length reported by
 * LDAP_UTF8_CHARLEN(p) and additionally test the second octet against
 * ldap_utf8_mintab[].
 * NOTE(review): the guard that decides when the second-octet test runs,
 * the statement executed when it fails, and the return are not part of
 * this listing.  The comment preceding this function says the result is
 * the character length if valid and 0 if not -- confirm against the
 * full body.
 */
124 int ldap_utf8_charlen2( const char * p )
126 int i = LDAP_UTF8_CHARLEN( p );
/* true when p[1] has none of the bits required for this lead byte */
129 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
135 /* conv UTF-8 to UCS-4, useful for comparisons */
/*
 * ldap_x_utf8_to_ucs4: decode the UTF-8 character at p.
 * Returns LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2 reports length 0 or
 * when any trailing octet is not a continuation byte.
 * NOTE(review): the declarations of len, i and ch, the statement that
 * folds each trailing octet into ch, and the final return are not part
 * of this listing; presumably ch is returned -- confirm.
 */
136 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
/*
 * Unsigned view of the input so the masks below work on 0..255 values.
 * NOTE(review): const char * is assigned to const unsigned char *
 * without a cast; compilers commonly warn about the differing signedness.
 */
138 const unsigned char *c = p;
/* payload bits carried by the lead byte, indexed by sequence length */
141 static unsigned char mask[] = {
142 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
144 len = LDAP_UTF8_CHARLEN2(p, len);
146 if( len == 0 ) return LDAP_UCS4_INVALID;
/* keep only the payload bits of the lead byte */
148 ch = c[0] & mask[len];
/* every octet after the first must have the form 10xxxxxx */
150 for(i=1; i < len; i++) {
151 if ((c[i] & 0xc0) != 0x80) {
152 return LDAP_UCS4_INVALID;
162 /* conv UCS-4 to UTF-8, not used */
/*
 * ldap_x_ucs4_to_utf8: write the UTF-8 encoding of c into buf.
 * Returns 0 when buf is NULL.  Each branch below appends a lead byte
 * followed by continuation bytes of the form 0x80 | (six bits of c),
 * advancing len once per byte written.
 * NOTE(review): the declaration/initialisation of len, the first
 * condition of the if-chain, the body of the c < 0x80 branch and the
 * final return are not part of this listing; presumably len (the number
 * of bytes written) is returned -- confirm.  The visible code does not
 * check how much room buf has.
 */
163 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
/*
 * Unsigned view of the output buffer.
 * NOTE(review): char * is assigned to unsigned char * without a cast;
 * compilers commonly warn about the differing signedness.
 */
166 unsigned char* p = buf;
167 if(buf == NULL) return 0;
170 /* not a valid Unicode character */
172 } else if( c < 0x80 ) {
/* two bytes: 110xxxxx 10xxxxxx */
175 } else if( c < 0x800 ) {
176 p[len++] = 0xc0 | ( c >> 6 );
177 p[len++] = 0x80 | ( c & 0x3f );
/* three bytes: 1110xxxx followed by two continuation bytes */
179 } else if( c < 0x10000 ) {
180 p[len++] = 0xe0 | ( c >> 12 );
181 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
182 p[len++] = 0x80 | ( c & 0x3f );
/* four bytes: 11110xxx followed by three continuation bytes */
184 } else if( c < 0x200000 ) {
185 p[len++] = 0xf0 | ( c >> 18 );
186 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
187 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
188 p[len++] = 0x80 | ( c & 0x3f );
/* five bytes: 111110xx followed by four continuation bytes */
190 } else if( c < 0x4000000 ) {
191 p[len++] = 0xf8 | ( c >> 24 );
192 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
193 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
194 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
195 p[len++] = 0x80 | ( c & 0x3f );
/* six bytes: 1111110x followed by five continuation bytes */
197 } else /* if( c < 0x80000000 ) */ {
198 p[len++] = 0xfc | ( c >> 30 );
199 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
200 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
201 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
202 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
203 p[len++] = 0x80 | ( c & 0x3f );
211 * Advance to the next UTF-8 character
213 * Ignores the length of the multibyte character and instead relies on
214 * continuation markers to find the start of the next character.
215 * This allows for "resyncing" when invalid characters
216 * are provided, as long as the start of the next character
217 * appears within the 6 bytes examined.
/*
 * ldap_utf8_next: return a pointer to the byte where the character
 * following the one at p begins.  The const qualifier of p is cast away
 * in the returned pointer.
 * NOTE(review): the declaration of i is not part of this listing.
 */
219 char* ldap_utf8_next( const char * p )
/* unsigned view of p for the bit tests below */
222 const unsigned char *u = p;
/* a byte that LDAP_UTF8_ISASCII accepts is a whole character: step one byte */
224 if( LDAP_UTF8_ISASCII(u) ) {
225 return (char *) &p[1];
/* look at bytes 1..5 and stop at the first that is not 10xxxxxx */
228 for( i=1; i<6; i++ ) {
229 if ( ( u[i] & 0xc0 ) != 0x80 ) {
230 return (char *) &p[i];
/* reached when bytes 1..5 were all continuation bytes */
234 return (char *) &p[i];
238 * Advance to the previous UTF-8 character
240 * Ignores the length of the multibyte character and instead relies on
241 * continuation markers to find the start of the previous character.
242 * This allows for "resyncing" when invalid characters
243 * are provided, as long as the start of the previous character
244 * appears within the 5 preceding bytes examined.
/*
 * ldap_utf8_prev: scan backwards from p and return a pointer to the
 * nearest preceding byte that is not a continuation byte.  The const
 * qualifier of p is cast away in the returned pointer.
 * The function reads u[-1] and possibly down to u[-5], so those bytes
 * must be readable by the caller's arrangement; nothing in the visible
 * code checks for the start of the buffer.
 * NOTE(review): the declaration of i is not part of this listing.
 */
246 char* ldap_utf8_prev( const char * p )
/* unsigned view of p for the bit tests below */
249 const unsigned char *u = p;
/* look at bytes -1..-5 and stop at the first that is not 10xxxxxx */
251 for( i=-1; i>-6 ; i-- ) {
252 if ( ( u[i] & 0xc0 ) != 0x80 ) {
253 return (char *) &p[i];
/* reached when bytes -1..-5 were all continuation bytes */
257 return (char *) &p[i];
261 * Copy one UTF-8 character from src to dst returning
262 * number of bytes copied.
264 * Ignores the length of the multibyte character and instead relies on
265 * continuation markers to find the start of the next character.
266 * This allows for "resyncing" when invalid characters
267 * are provided, as long as the start of the next character
268 * appears within the 6 bytes examined.
/*
 * ldap_utf8_copy: copy the character at src into dst, using the same
 * continuation-byte scan as ldap_utf8_next().
 * NOTE(review): only the tests are visible in this listing.  The
 * statements that store into dst and the return values are not shown;
 * per the comment preceding this function the result is the number of
 * bytes copied -- confirm against the full body.
 */
270 int ldap_utf8_copy( char* dst, const char *src )
/* unsigned view of src for the bit tests below */
273 const unsigned char *u = src;
/* single-byte case, as decided by LDAP_UTF8_ISASCII */
277 if( LDAP_UTF8_ISASCII(u) ) {
/* look at bytes 1..5 and stop at the first that is not 10xxxxxx */
281 for( i=1; i<6; i++ ) {
282 if ( ( u[i] & 0xc0 ) != 0x80 ) {
291 #ifndef UTF8_ALPHA_CTYPE
293 * UTF-8 ctype routines
294 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * ldap_utf8_isascii: apply LDAP_ASCII to the first byte of p, read as
 * an unsigned value, and return the result.
 */
297 int ldap_utf8_isascii( const char * p )
299 unsigned c = * (const unsigned char *) p;
300 return LDAP_ASCII(c);
/*
 * ldap_utf8_isdigit: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte, otherwise the result of LDAP_DIGIT.
 */
303 int ldap_utf8_isdigit( const char * p )
305 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
307 if(!LDAP_ASCII(c)) return 0;
309 return LDAP_DIGIT( c );
/*
 * ldap_utf8_isxdigit: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte.
 * NOTE(review): the test applied to ASCII bytes and the final return
 * are not part of this listing; presumably a hexadecimal-digit check --
 * confirm.
 */
312 int ldap_utf8_isxdigit( const char * p )
314 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
316 if(!LDAP_ASCII(c)) return 0;
/*
 * ldap_utf8_isspace: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte.
 * NOTE(review): the test applied to ASCII bytes and the final return
 * are not part of this listing; presumably a whitespace check --
 * confirm.
 */
321 int ldap_utf8_isspace( const char * p )
323 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
325 if(!LDAP_ASCII(c)) return 0;
341 * These are not needed by the C SDK and are
342 * not "good enough" for general use.
/*
 * ldap_utf8_isalpha: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte, otherwise the result of LDAP_ALPHA.
 */
344 int ldap_utf8_isalpha( const char * p )
346 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
348 if(!LDAP_ASCII(c)) return 0;
350 return LDAP_ALPHA(c);
/*
 * ldap_utf8_isalnum: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte, otherwise the result of LDAP_ALNUM.
 */
353 int ldap_utf8_isalnum( const char * p )
355 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
357 if(!LDAP_ASCII(c)) return 0;
359 return LDAP_ALNUM(c);
/*
 * ldap_utf8_islower: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte, otherwise the result of LDAP_LOWER.
 */
362 int ldap_utf8_islower( const char * p )
364 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
366 if(!LDAP_ASCII(c)) return 0;
368 return LDAP_LOWER(c);
/*
 * ldap_utf8_isupper: classify the first byte of p.  Returns 0 when
 * LDAP_ASCII rejects the byte, otherwise the result of LDAP_UPPER.
 */
371 int ldap_utf8_isupper( const char * p )
373 unsigned c = * (const unsigned char *) p;
/* bytes outside the ASCII range never match */
375 if(!LDAP_ASCII(c)) return 0;
377 return LDAP_UPPER(c);
383 * UTF-8 string routines
/*
 * ldap_utf8_strchr: walk str one UTF-8 character at a time and compare
 * each character with the one at chr.  Both sides are decoded with
 * ldap_x_utf8_to_ucs4() and compared as decoded values.
 * The function name is parenthesised in the definition, which keeps a
 * function-like macro of the same name from being expanded here.
 * NOTE(review): the statement executed on a match and the return taken
 * when the end of str is reached are not part of this listing;
 * presumably a pointer to the match, or NULL -- confirm.
 */
387 char * (ldap_utf8_strchr)( const char *str, const char *chr )
/* advance by whole characters until the zero byte */
389 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
/* chr is decoded again on every iteration */
390 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
398 /* like strcspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strcspn: for each UTF-8 character of str, compare it with
 * every character of set, using the values decoded by
 * ldap_x_utf8_to_ucs4().
 * NOTE(review): the declarations of cstr and cset, the statement
 * executed on a match and the final return are not part of this
 * listing.  Per the comment preceding this function the result is a
 * byte count, as with strcspn() -- confirm against the full body.
 */
399 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
/* outer loop: one iteration per character of str */
404 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
/* inner loop: one iteration per character of set */
405 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
406 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
415 /* like strspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strspn: for each UTF-8 character of str, search set for an
 * equal character, using the values decoded by ldap_x_utf8_to_ucs4().
 * NOTE(review): the declarations of cstr and cset, the statements
 * executed when the end of set is reached or when a match is found,
 * and the final return are not part of this listing.  Per the comment
 * preceding this function the result is a byte count, as with strspn()
 * -- confirm against the full body.
 */
416 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
/* outer loop: one iteration per character of str */
421 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
/* inner loop has no condition of its own; it is left from inside the body */
422 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
/* end of set reached without finding the current character of str */
423 if( *cset == '\0' ) {
427 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
436 /* like strpbrk(), replaces strchr() as well */
/*
 * ldap_utf8_strpbrk: walk str one UTF-8 character at a time and compare
 * each character with every character of set, using the values decoded
 * by ldap_x_utf8_to_ucs4().
 * NOTE(review): the declaration of cset, the statement executed on a
 * match and the return taken when the end of str is reached are not
 * part of this listing; presumably a pointer to the first match, or
 * NULL, as with strpbrk() -- confirm.
 */
437 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
/* outer loop: one iteration per character of str */
439 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
/* inner loop: one iteration per character of set */
442 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
443 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
452 /* like strtok_r(), not strtok() */
453 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
458 if( last == NULL ) return NULL;
460 begin = str ? str : *last;
462 begin += ldap_utf8_strspn( begin, sep );
464 if( *begin == '\0' ) {
469 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
472 char *next = LDAP_UTF8_NEXT( end );