3 * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
10 * These routines are "dumb". Though they understand UTF-8,
11 * they don't grok Unicode. That is, they can push bits,
12 * but don't have a clue what the bits represent. That's
13 * good enough for use with the LDAP Client SDK.
15 * These routines are not optimized.
22 #include <ac/stdlib.h>
24 #include <ac/socket.h>
25 #include <ac/string.h>
28 #include "ldap_utf8.h"
31 #include "ldap_defaults.h"
34 * Basic UTF-8 routines
38 * return the number of bytes required to hold the
39 * NUL-terminated UTF-8 string, NOT INCLUDING the terminating NUL.
/*
 * ldap_utf8_bytes: byte length of the string at p.
 * The loop below starts `bytes` at 0 and increments it for as long as
 * p[bytes] is non-zero, so it stops at the first zero byte; multibyte
 * sequences get no special treatment (every byte counts as one).
 * NOTE(review): the declaration of `bytes`, the loop body and the return
 * are not visible in this listing; presumably `bytes` is what gets
 * returned -- confirm.
 */
42 ber_len_t ldap_utf8_bytes( const char * p )
46 for( bytes=0; p[bytes]; bytes++ ) {
/*
 * ldap_utf8_chars: walks the string at p until it reads a zero byte,
 * advancing p with LDAP_UTF8_INCR once per iteration.
 * NOTE(review): the counter and the return statement are not visible in
 * this listing; presumably one character is counted per iteration and the
 * total is returned -- confirm.
 */
53 ber_len_t ldap_utf8_chars( const char * p )
55 /* could be optimized and could check for invalid sequences */
58 for( ; *p ; LDAP_UTF8_INCR(p) ) {
65 /* return offset to next character */
/*
 * The offset is the distance from p to whatever LDAP_UTF8_NEXT(p) yields;
 * the result of that subtraction is returned as an int.
 */
66 int ldap_utf8_offset( const char * p )
68 return LDAP_UTF8_NEXT(p) - p;
72 * Returns length indicated by first byte.
/*
 * Length table with 128 entries (8 rows of 16).  ldap_utf8_charlen()
 * indexes it with the first byte XOR 0x80, so entry k describes a first
 * byte of 0x80 + k:
 *   0x80-0xBF -> 0      0xC0-0xC1 -> 0      0xC2-0xDF -> 2
 *   0xE0-0xEF -> 3      0xF0-0xF7 -> 4      0xF8-0xFB -> 5
 *   0xFC-0xFD -> 6      0xFE-0xFF -> 0
 * A first byte below 0x80 would map to index 128 or higher, which is past
 * the entries listed here.
 */
74 const char ldap_utf8_lentab[] = {
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
82 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
/*
 * ldap_utf8_charlen: reads the byte at p as an unsigned char, XORs it with
 * 0x80 and returns the ldap_utf8_lentab entry at that index.
 * NOTE(review): for a byte below 0x80 the XOR produces an index of 128 or
 * more, beyond the 128 entries listed for the table; presumably such bytes
 * are handled in lines not visible in this listing -- confirm.
 */
84 int ldap_utf8_charlen( const char * p )
89 return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
93 * Make sure the UTF-8 char used the shortest possible encoding
94 * returns charlen if valid, 0 if not.
96 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
97 * The table is slightly modified from that of the RFC.
99 * UCS-4 range (hex) UTF-8 sequence (binary)
100 * 0000 0000-0000 007F 0.......
101 * 0000 0080-0000 07FF 110++++. 10......
102 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
103 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
104 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
105 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
107 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
108 * at least one of the '+' bits must be set, otherwise the character
109 * should have been encoded in fewer octets. Note that in the two-octet
110 * case, only the first octet needs to be validated, and this is done
111 * in the ldap_utf8_lentab[] above.
114 /* mask of required bits in second octet */
/*
 * 32 entries, indexed in ldap_utf8_charlen2() by the low five bits of the
 * first octet; the selected entry is ANDed with the second octet.
 * The four entries that differ from 0x80 sit at indexes 0, 16, 24 and 28
 * (the low five bits of first octets 0xE0, 0xF0, 0xF8 and 0xFC) and hold
 * 0x20, 0x30, 0x38 and 0x3c, matching the '+' bits of the second octet in
 * the encoding table above.  The last two entries are 0x00.
 */
115 const char ldap_utf8_mintab[] = {
116 0x20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
117 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
118 0x30, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
119 0x38, 0x80, 0x80, 0x80, 0x3c, 0x80, 0x00, 0x00 };
/*
 * ldap_utf8_charlen2: obtains a length i from LDAP_UTF8_CHARLEN(p), then
 * checks whether the ldap_utf8_mintab entry selected by the low five bits
 * of the first byte shares at least one set bit with the second byte p[1].
 * NOTE(review): the condition guarding this check and the statements that
 * follow it are not visible in this listing; going by the comment ahead of
 * the tables, a failed check presumably yields 0 and success yields i --
 * confirm.
 */
121 int ldap_utf8_charlen2( const char * p )
123 int i = LDAP_UTF8_CHARLEN( p );
126 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
132 /* conv UTF-8 to UCS-4, useful for comparisons */
/*
 * Decodes the character at p.  Returns LDAP_UCS4_INVALID when
 * LDAP_UTF8_CHARLEN2 reports a length of 0, or when any byte after the
 * first lacks the 10xxxxxx continuation pattern.
 * NOTE(review): the declarations of `len`, `i` and `ch`, the statements
 * that fold each continuation byte into `ch`, and the final return are not
 * visible in this listing.
 */
133 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
/* view the input as unsigned bytes for the mask operations below */
135 const unsigned char *c = p;
/*
 * mask[n] keeps the payload bits of the first byte of an n-byte sequence;
 * slot 0 is a placeholder, since len == 0 returns before the lookup.
 */
138 static unsigned char mask[] = {
139 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
141 len = LDAP_UTF8_CHARLEN2(p, len);
143 if( len == 0 ) return LDAP_UCS4_INVALID;
145 ch = c[0] & mask[len];
/* every remaining byte must carry 10 in its top two bits */
147 for(i=1; i < len; i++) {
148 if ((c[i] & 0xc0) != 0x80) {
149 return LDAP_UCS4_INVALID;
159 /* conv UCS-4 to UTF-8, not used */
/*
 * Encodes c into buf as a UTF-8 byte sequence.  Returns 0 immediately when
 * buf is NULL.  The visible branches choose the sequence length from the
 * magnitude of c: below 0x800 two bytes, below 0x10000 three, below
 * 0x200000 four, below 0x4000000 five, otherwise six.  In each branch the
 * first byte is a length prefix (0xc0, 0xe0, 0xf0, 0xf8 or 0xfc) ORed with
 * the top bits of c, and every following byte is 0x80 ORed with the next
 * six bits of c.  `len` advances once per byte stored.
 * No buffer size is passed in; the longest visible branch stores six bytes.
 * NOTE(review): the declaration of `len`, the first condition of the if
 * chain, the body of the c < 0x80 branch and the final return are not
 * visible in this listing; presumably `len` is returned -- confirm.
 */
160 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
163 unsigned char* p = buf;
164 if(buf == NULL) return 0;
167 /* not a valid Unicode character */
169 } else if( c < 0x80 ) {
172 } else if( c < 0x800 ) {
173 p[len++] = 0xc0 | ( c >> 6 );
174 p[len++] = 0x80 | ( c & 0x3f );
176 } else if( c < 0x10000 ) {
177 p[len++] = 0xe0 | ( c >> 12 );
178 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
179 p[len++] = 0x80 | ( c & 0x3f );
181 } else if( c < 0x200000 ) {
182 p[len++] = 0xf0 | ( c >> 18 );
183 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
184 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
185 p[len++] = 0x80 | ( c & 0x3f );
187 } else if( c < 0x4000000 ) {
188 p[len++] = 0xf8 | ( c >> 24 );
189 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
190 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
191 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
192 p[len++] = 0x80 | ( c & 0x3f );
194 } else /* if( c < 0x80000000 ) */ {
195 p[len++] = 0xfc | ( c >> 30 );
196 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
197 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
198 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
199 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200 p[len++] = 0x80 | ( c & 0x3f );
208 * Advance to the next UTF-8 character
210 * Ignores the length encoded in the multibyte character and instead
211 * relies on continuation markers to find the start of the next character.
212 * This allows for "resyncing" when invalid characters
213 * are provided, provided the start of the next character
214 * appears within the 6 bytes examined.
/*
 * ldap_utf8_next: returns a pointer to the byte where the next character
 * starts.  When LDAP_UTF8_ISASCII(u) holds for the first byte, that is
 * p + 1.  Otherwise bytes p[1]..p[5] are examined in order and the address
 * of the first one whose top two bits are not 10 (i.e. not a continuation
 * byte) is returned.  If all five are continuation bytes the loop ends
 * with i == 6 and p + 6 is returned.
 * NOTE(review): closing braces are not visible in this listing, so the
 * position of the last return after the loop is inferred from the loop
 * bounds -- confirm.
 */
216 char* ldap_utf8_next( const char * p )
/* u views the same bytes as unsigned values for the bit tests */
219 const unsigned char *u = p;
221 if( LDAP_UTF8_ISASCII(u) ) {
222 return (char *) &p[1];
225 for( i=1; i<6; i++ ) {
226 if ( ( u[i] & 0xc0 ) != 0x80 ) {
227 return (char *) &p[i];
231 return (char *) &p[i];
/*
 * Step back to the previous UTF-8 character.
 *
 * The length encoded in a lead byte is ignored; instead the bytes before p
 * are scanned backwards for the nearest one that is not a continuation
 * byte (10xxxxxx).  This allows "resyncing" when invalid characters are
 * present, provided the start of the previous character lies within the
 * 5 bytes examined (p[-1] down to p[-5]).
 *
 * p must point at least one byte past the start of its buffer: the bytes
 * before p are read without any bounds information.
 *
 * Returns the address of the first non-continuation byte found, or p - 6
 * when all five examined bytes are continuation bytes.
 */
char* ldap_utf8_prev( const char * p )
{
	int i;
	/* explicit cast: char and unsigned char pointers are not assignment compatible */
	const unsigned char *u = (const unsigned char *) p;

	for( i=-1; i>-6 ; i-- ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return (char *) &p[i];
		}
	}

	/* no character start found in the window; i is -6 here */
	return (char *) &p[i];
}
258 * Copy one UTF-8 character from src to dst, returning
259 * the number of bytes copied.
261 * Ignores the length encoded in the multibyte character and instead
262 * relies on continuation markers to find the start of the next character.
263 * This allows for "resyncing" when invalid characters
264 * are provided, provided the start of the next character
265 * appears within the 6 bytes examined.
/*
 * ldap_utf8_copy: the visible lines view src as unsigned bytes, test the
 * first byte with LDAP_UTF8_ISASCII, and otherwise examine u[1]..u[5]
 * looking for the first byte whose top two bits are not 10 (i.e. the first
 * byte that is not a continuation byte).
 * NOTE(review): the statements that store into dst and the return values
 * are not visible in this listing; going by the comment above, the bytes
 * of one character are presumably copied and their count returned --
 * confirm.
 */
267 int ldap_utf8_copy( char* dst, const char *src )
270 const unsigned char *u = src;
274 if( LDAP_UTF8_ISASCII(u) ) {
278 for( i=1; i<6; i++ ) {
279 if ( ( u[i] & 0xc0 ) != 0x80 ) {
288 #ifndef UTF8_ALPHA_CTYPE
290 * UTF-8 ctype routines
291 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * ldap_utf8_isascii: reads the first byte at p as an unsigned value and
 * returns the result of LDAP_ASCII for it.  Only that one byte is looked at.
 */
294 int ldap_utf8_isascii( const char * p )
296 unsigned c = * (const unsigned char *) p;
297 return LDAP_ASCII(c);
/*
 * ldap_utf8_isdigit: reads the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII rejects it, otherwise the result of
 * LDAP_DIGIT for that byte.
 */
300 int ldap_utf8_isdigit( const char * p )
302 unsigned c = * (const unsigned char *) p;
304 if(!LDAP_ASCII(c)) return 0;
306 return LDAP_DIGIT( c );
/*
 * ldap_utf8_isxdigit: reads the first byte at p as an unsigned value and
 * returns 0 when LDAP_ASCII rejects it.
 * NOTE(review): the classification applied to bytes that pass the ASCII
 * test is not visible in this listing.
 */
309 int ldap_utf8_isxdigit( const char * p )
311 unsigned c = * (const unsigned char *) p;
313 if(!LDAP_ASCII(c)) return 0;
/*
 * ldap_utf8_isspace: reads the first byte at p as an unsigned value and
 * returns 0 when LDAP_ASCII rejects it.
 * NOTE(review): the set of bytes treated as whitespace is decided in lines
 * not visible in this listing.
 */
318 int ldap_utf8_isspace( const char * p )
320 unsigned c = * (const unsigned char *) p;
322 if(!LDAP_ASCII(c)) return 0;
338 * These are not needed by the C SDK and are
339 * not "good enough" for general use.
/*
 * ldap_utf8_isalpha: reads the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII rejects it, otherwise the result of
 * LDAP_ALPHA for that byte.
 */
341 int ldap_utf8_isalpha( const char * p )
343 unsigned c = * (const unsigned char *) p;
345 if(!LDAP_ASCII(c)) return 0;
347 return LDAP_ALPHA(c);
/*
 * ldap_utf8_isalnum: reads the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII rejects it, otherwise the result of
 * LDAP_ALNUM for that byte.
 */
350 int ldap_utf8_isalnum( const char * p )
352 unsigned c = * (const unsigned char *) p;
354 if(!LDAP_ASCII(c)) return 0;
356 return LDAP_ALNUM(c);
/*
 * ldap_utf8_islower: reads the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII rejects it, otherwise the result of
 * LDAP_LOWER for that byte.
 */
359 int ldap_utf8_islower( const char * p )
361 unsigned c = * (const unsigned char *) p;
363 if(!LDAP_ASCII(c)) return 0;
365 return LDAP_LOWER(c);
/*
 * ldap_utf8_isupper: reads the first byte at p as an unsigned value.
 * Returns 0 when LDAP_ASCII rejects it, otherwise the result of
 * LDAP_UPPER for that byte.
 */
368 int ldap_utf8_isupper( const char * p )
370 unsigned c = * (const unsigned char *) p;
372 if(!LDAP_ASCII(c)) return 0;
374 return LDAP_UPPER(c);
380 * UTF-8 string routines
/*
 * ldap_utf8_strchr: steps through str one character at a time (via
 * LDAP_UTF8_INCR) until the terminating zero byte, comparing the UCS-4
 * value decoded at the current position with the one decoded from chr.
 * The function name is parenthesized in the definition, which keeps a
 * function-like macro of the same name from expanding here.
 * NOTE(review): what happens on a match and the value returned when no
 * match is found are not visible in this listing.
 */
384 char * (ldap_utf8_strchr)( const char *str, const char *chr )
386 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
387 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
395 /* like strcspn() but returns number of bytes, not characters */
/*
 * The outer loop walks str character by character and the inner loop walks
 * set the same way; each pair is compared by decoded UCS-4 value.
 * NOTE(review): the declarations of cstr and cset, the action taken on a
 * match and the return statements are not visible in this listing.
 */
396 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
401 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
402 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
403 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
412 /* like strspn() but returns number of bytes, not characters */
/*
 * The outer loop walks str character by character.  The inner loop over
 * set has no condition in its header; reaching the end of set is detected
 * by the explicit zero-byte test at the top of its body, and otherwise the
 * current characters of str and set are compared by decoded UCS-4 value.
 * NOTE(review): the declarations of cstr and cset, the statements executed
 * when set is exhausted or when a match is found, and the return
 * statements are not visible in this listing.
 */
413 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
418 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
419 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
420 if( *cset == '\0' ) {
424 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
433 /* like strpbrk(), replaces strchr() as well */
/*
 * For each character of str, every character of set is tried in turn; the
 * two are compared by decoded UCS-4 value.
 * NOTE(review): the declaration of cset, the action taken on a match and
 * the value returned when nothing matches are not visible in this listing.
 */
434 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
436 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
439 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
440 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
449 /* like strtok_r(), not strtok() */
450 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
455 if( last == NULL ) return NULL;
457 begin = str ? str : *last;
459 begin += ldap_utf8_strspn( begin, sep );
461 if( *begin == '\0' ) {
466 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
469 char *next = LDAP_UTF8_NEXT( end );