3 * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
10 * These routines are "dumb". Though they understand UTF-8,
11 * they don't grok Unicode. That is, they can push bits,
12 * but don't have a clue what the bits represent. That's
13 * good enough for use with the LDAP Client SDK.
15 * These routines are not optimized.
22 #include <ac/stdlib.h>
24 #include <ac/socket.h>
25 #include <ac/string.h>
28 #include "ldap_utf8.h"
31 #include "ldap_defaults.h"
34 * Basic UTF-8 routines
38 * return the number of bytes required to hold the
39 * NUL-terminated UTF-8 string NOT INCLUDING the
/*
 * ldap_utf8_bytes: count the bytes of the string at p.
 * The visible loop starts bytes at 0 and increments it for as long as
 * p[bytes] is non-zero, so it stops at the first zero byte.
 * NOTE(review): the declaration of bytes and the return statement are
 * elided from this excerpt; presumably bytes is returned - confirm.
 */
42 ber_len_t ldap_utf8_bytes( const char * p )
46 for( bytes=0; p[bytes]; bytes++ ) {
/*
 * ldap_utf8_chars: walk the string at p one UTF-8 character at a time.
 * The loop advances p with LDAP_UTF8_INCR (defined outside this excerpt)
 * until *p is zero; no validation of the byte sequences is visible.
 * NOTE(review): the loop body and the return statement are elided;
 * presumably a per-character counter is returned - confirm.
 */
53 ber_len_t ldap_utf8_chars( const char * p )
55 /* could be optimized and could check for invalid sequences */
58 for( ; *p ; LDAP_UTF8_INCR(p) ) {
65 /* return offset to next character */
/*
 * ldap_utf8_offset: distance from p to the pointer that
 * LDAP_UTF8_NEXT(p) yields (macro defined outside this excerpt).
 * The pointer difference is converted to int by the return type.
 */
66 int ldap_utf8_offset( const char * p )
68 return LDAP_UTF8_NEXT(p) - p;
72 * Returns length indicated by first byte.
74 * The length is looked up in the ldap_utf8_lentab table below.
/*
 * Sequence-length table consulted by ldap_utf8_charlen, which indexes it
 * with (first byte XOR 0x80); slot k therefore describes byte 0x80 + k.
 *   slots   0..63   (bytes 0x80-0xBF)  0
 *   slots  64..65   (bytes 0xC0-0xC1)  0
 *   slots  66..95   (bytes 0xC2-0xDF)  2
 *   slots  96..111  (bytes 0xE0-0xEF)  3
 *   slots 112..119  (bytes 0xF0-0xF7)  4
 *   slots 120..123  (bytes 0xF8-0xFB)  5
 *   slots 124..125  (bytes 0xFC-0xFD)  6
 *   slots 126..127  (bytes 0xFE-0xFF)  0
 * NOTE(review): 0 presumably marks a byte that cannot start a sequence
 * - confirm against the callers.
 */
76 const char ldap_utf8_lentab[] = {
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
81 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
83 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
84 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
/*
 * ldap_utf8_charlen: length of the UTF-8 sequence starting at p, taken
 * from its first byte.
 * The visible return reads the first byte as unsigned char, flips its
 * top bit (XOR 0x80) and uses the result as an index into
 * ldap_utf8_lentab, so bytes 0x80..0xFF map to slots 0..127.
 * NOTE(review): the lines between the signature and this return are
 * elided; a byte below 0x80 would index past the 128-entry table, so
 * presumably that case is handled first - confirm.
 */
86 int ldap_utf8_charlen( const char * p )
91 return ldap_utf8_lentab[*(unsigned char *)p ^ 0x80];
95 * Make sure the UTF-8 char used the shortest possible encoding
96 * returns charlen if valid, 0 if not.
99 /* mask of required bits in second octet */
/*
 * 32-entry mask table. ldap_utf8_charlen2 selects an entry with the low
 * five bits of the first byte and ANDs it with the second byte.
 *   slot  0 -> 0x20, slot 16 -> 0x30, slot 24 -> 0x38, slot 28 -> 0x3c,
 *   slots 30 and 31 -> 0x00, every other slot -> 0x80.
 * NOTE(review): the element type is plain char, so 0x80 is out of range
 * where char is signed and the stored value is then
 * implementation-defined; confirm this is intended.
 */
100 const char ldap_utf8_mintab[] = {
101 0x20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
102 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
103 0x30, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
104 0x38, 0x80, 0x80, 0x80, 0x3c, 0x80, 0x00, 0x00 };
/*
 * ldap_utf8_charlen2: length of the sequence at p with an extra check
 * on the second byte.
 * i is initialised from LDAP_UTF8_CHARLEN(p). The visible test picks a
 * mask from ldap_utf8_mintab using the low five bits of the first byte
 * and is true when none of the mask bits are set in p[1].
 * NOTE(review): any guard around this test, the statement it controls
 * and the return are elided; presumably i is zeroed on failure and then
 * returned - confirm.
 */
106 int ldap_utf8_charlen2( const char * p )
108 int i = LDAP_UTF8_CHARLEN( p );
111 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
117 /* conv UTF-8 to UCS-4, useful for comparisons */
/*
 * ldap_x_utf8_to_ucs4: decode the UTF-8 sequence at p.
 * Returns LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2 reports length 0 or
 * when a following byte does not have the bit pattern 10xxxxxx.
 * NOTE(review): the declarations of len, i and ch, the statements that
 * fold the following bytes into ch, and the final return are elided
 * from this excerpt - confirm against the full file.
 */
118 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
120 const unsigned char *c = p;
/* mask[n] keeps the payload bits of the first byte of an n-byte sequence */
123 static unsigned char mask[] = {
124 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
126 len = LDAP_UTF8_CHARLEN2(p, len);
128 if( len == 0 ) return LDAP_UCS4_INVALID;
/* start with the payload bits carried by the first byte */
130 ch = c[0] & mask[len];
/* every remaining byte must have its top two bits equal to 10 */
132 for(i=1; i < len; i++) {
133 if ((c[i] & 0xc0) != 0x80) {
134 return LDAP_UCS4_INVALID;
144 /* conv UCS-4 to UTF-8, not used */
/*
 * ldap_x_ucs4_to_utf8: write the UTF-8 encoding of c into buf.
 * Returns 0 immediately when buf is NULL.
 * The visible branches choose the sequence length from the value of c:
 *   c < 0x800      2 bytes, first byte 0xc0 | high bits
 *   c < 0x10000    3 bytes, first byte 0xe0 | high bits
 *   c < 0x200000   4 bytes, first byte 0xf0 | high bits
 *   c < 0x4000000  5 bytes, first byte 0xf8 | high bits
 *   otherwise      6 bytes, first byte 0xfc | high bits
 * Each following byte is 0x80 | the next six bits of c, most
 * significant group first. Up to six bytes are stored through p, so buf
 * must have room for at least that many.
 * NOTE(review): the declaration and initialisation of len, the first
 * condition of the if chain, the store for c < 0x80 and the return are
 * elided; presumably len is returned - confirm. Whether buf gets a
 * terminator cannot be told from this excerpt.
 */
145 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
148 unsigned char* p = buf;
149 if(buf == NULL) return 0;
152 /* not a valid Unicode character */
154 } else if( c < 0x80 ) {
157 } else if( c < 0x800 ) {
158 p[len++] = 0xc0 | ( c >> 6 );
159 p[len++] = 0x80 | ( c & 0x3f );
161 } else if( c < 0x10000 ) {
162 p[len++] = 0xe0 | ( c >> 12 );
163 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
164 p[len++] = 0x80 | ( c & 0x3f );
166 } else if( c < 0x200000 ) {
167 p[len++] = 0xf0 | ( c >> 18 );
168 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
169 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
170 p[len++] = 0x80 | ( c & 0x3f );
172 } else if( c < 0x4000000 ) {
173 p[len++] = 0xf8 | ( c >> 24 );
174 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
175 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
176 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
177 p[len++] = 0x80 | ( c & 0x3f );
179 } else /* if( c < 0x80000000 ) */ {
180 p[len++] = 0xfc | ( c >> 30 );
181 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
182 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
183 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
184 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
185 p[len++] = 0x80 | ( c & 0x3f );
193 * Advance to the next UTF-8 character
195 * Ignores length of multibyte character, instead rely on
196 * continuation markers to find start of next character.
197 * This allows for "resyncing" when invalid characters
198 * are provided, as long as the start of the next character
199 * appears within the 6 bytes examined.
/*
 * ldap_utf8_next: pointer to the byte where the next character starts.
 * If LDAP_UTF8_ISASCII accepts the first byte, the result is p + 1.
 * Otherwise bytes 1 through 5 are inspected and the address of the
 * first one whose top two bits are not 10 is returned. When all five
 * are continuation bytes the loop ends with i equal to 6 and p + 6 is
 * returned. The const qualifier of p is cast away in the result.
 * The declaration of i is elided from this excerpt.
 */
201 char* ldap_utf8_next( const char * p )
204 const unsigned char *u = p;
206 if( LDAP_UTF8_ISASCII(u) ) {
207 return (char *) &p[1];
210 for( i=1; i<6; i++ ) {
211 if ( ( u[i] & 0xc0 ) != 0x80 ) {
212 return (char *) &p[i];
216 return (char *) &p[i];
220 * Step back to the previous UTF-8 character
222 * Ignores length of multibyte character, instead relies on
223 * continuation markers to find the start of the previous character.
224 * This allows for "resyncing" when invalid characters
225 * are provided, as long as the start of the previous character
226 * appears within the 6 bytes examined.
/*
 * ldap_utf8_prev: pointer to the byte where the preceding character
 * starts.
 * Bytes at offsets -1 down to -5 from p are inspected and the address
 * of the first one whose top two bits are not 10 is returned. When all
 * five are continuation bytes the loop ends with i equal to -6 and
 * p - 6 is returned without inspecting that byte.
 * No lower bound is checked here, so the caller must guarantee that the
 * bytes before p are readable. The const qualifier of p is cast away in
 * the result. The declaration of i is elided from this excerpt.
 */
228 char* ldap_utf8_prev( const char * p )
231 const unsigned char *u = p;
233 for( i=-1; i>-6 ; i-- ) {
234 if ( ( u[i] & 0xc0 ) != 0x80 ) {
235 return (char *) &p[i];
239 return (char *) &p[i];
243 * Copy one UTF-8 character from src to dst returning
244 * number of bytes copied.
246 * Ignores length of multibyte character, instead rely on
247 * continuation markers to find start of next character.
248 * This allows for "resyncing" when invalid characters
249 * are provided, as long as the start of the next character
250 * appears within the 6 bytes examined.
/*
 * ldap_utf8_copy: copy one character from src to dst.
 * The visible control flow mirrors ldap_utf8_next: a first branch for a
 * byte accepted by LDAP_UTF8_ISASCII, then a scan of bytes 1 through 5
 * that stops at the first byte whose top two bits are not 10.
 * NOTE(review): every statement that writes to dst and every return is
 * elided from this excerpt; the byte count described in the comment
 * above the function cannot be confirmed from here.
 */
252 int ldap_utf8_copy( char* dst, const char *src )
255 const unsigned char *u = src;
259 if( LDAP_UTF8_ISASCII(u) ) {
263 for( i=1; i<6; i++ ) {
264 if ( ( u[i] & 0xc0 ) != 0x80 ) {
273 #ifndef UTF8_ALPHA_CTYPE
275 * UTF-8 ctype routines
276 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * ldap_utf8_isascii: read the first byte at p as an unsigned value and
 * return the result of LDAP_ASCII on it (macro defined outside this
 * excerpt).
 */
279 int ldap_utf8_isascii( const char * p )
281 unsigned c = * (const unsigned char *) p;
282 return LDAP_ASCII(c);
/*
 * ldap_utf8_isdigit: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte, otherwise the result of
 * LDAP_DIGIT on it (both macros are defined outside this excerpt).
 */
285 int ldap_utf8_isdigit( const char * p )
287 unsigned c = * (const unsigned char *) p;
289 if(!LDAP_ASCII(c)) return 0;
291 return LDAP_DIGIT( c );
/*
 * ldap_utf8_isxdigit: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte.
 * NOTE(review): the test applied to an accepted byte and its return are
 * elided from this excerpt - confirm against the full file.
 */
294 int ldap_utf8_isxdigit( const char * p )
296 unsigned c = * (const unsigned char *) p;
298 if(!LDAP_ASCII(c)) return 0;
/*
 * ldap_utf8_isspace: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte.
 * NOTE(review): the test applied to an accepted byte and its return are
 * elided from this excerpt - confirm against the full file.
 */
303 int ldap_utf8_isspace( const char * p )
305 unsigned c = * (const unsigned char *) p;
307 if(!LDAP_ASCII(c)) return 0;
323 * These are not needed by the C SDK and are
324 * not "good enough" for general use.
/*
 * ldap_utf8_isalpha: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte, otherwise the result of
 * LDAP_ALPHA on it (both macros are defined outside this excerpt).
 */
326 int ldap_utf8_isalpha( const char * p )
328 unsigned c = * (const unsigned char *) p;
330 if(!LDAP_ASCII(c)) return 0;
332 return LDAP_ALPHA(c);
/*
 * ldap_utf8_isalnum: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte, otherwise the result of
 * LDAP_ALNUM on it (both macros are defined outside this excerpt).
 */
335 int ldap_utf8_isalnum( const char * p )
337 unsigned c = * (const unsigned char *) p;
339 if(!LDAP_ASCII(c)) return 0;
341 return LDAP_ALNUM(c);
/*
 * ldap_utf8_islower: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte, otherwise the result of
 * LDAP_LOWER on it (both macros are defined outside this excerpt).
 */
344 int ldap_utf8_islower( const char * p )
346 unsigned c = * (const unsigned char *) p;
348 if(!LDAP_ASCII(c)) return 0;
350 return LDAP_LOWER(c);
/*
 * ldap_utf8_isupper: classify the first byte at p.
 * Returns 0 when LDAP_ASCII rejects the byte, otherwise the result of
 * LDAP_UPPER on it (both macros are defined outside this excerpt).
 */
353 int ldap_utf8_isupper( const char * p )
355 unsigned c = * (const unsigned char *) p;
357 if(!LDAP_ASCII(c)) return 0;
359 return LDAP_UPPER(c);
365 * UTF-8 string routines
/*
 * ldap_utf8_strchr: search str for the character that chr points at.
 * str is advanced one character at a time with LDAP_UTF8_INCR until a
 * zero byte; at each position the character at str and the character
 * at chr are both decoded with ldap_x_utf8_to_ucs4 and compared.
 * The function name is parenthesised, which keeps a function-like macro
 * of the same name, if one exists, from expanding here.
 * NOTE(review): the statement run on a match and the final return are
 * elided; presumably the match position or NULL is returned - confirm.
 */
369 char * (ldap_utf8_strchr)( const char *str, const char *chr )
371 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
372 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
380 /* like strcspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strcspn: scan str for the first character that also occurs
 * in set.
 * The outer loop steps cstr through str and the inner loop steps cset
 * through set, both with LDAP_UTF8_INCR and both stopping at a zero
 * byte; characters are compared after decoding with
 * ldap_x_utf8_to_ucs4.
 * NOTE(review): the declarations of cstr and cset, the statement run on
 * a match and the final return are elided; per the comment above the
 * function the result is a byte count - confirm.
 */
381 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
386 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
387 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
388 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
397 /* like strspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strspn: scan str while its characters occur in set.
 * The outer loop steps cstr through str with LDAP_UTF8_INCR. The inner
 * loop has no loop condition of its own: it is left either through the
 * branch taken when cset reaches the zero byte that ends set, or
 * through the branch taken when the decoded characters at cstr and cset
 * are equal.
 * NOTE(review): the declarations of cstr and cset, the bodies of both
 * branches and the final return are elided; per the comment above the
 * function the result is a byte count - confirm.
 */
398 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
403 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
404 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
405 if( *cset == '\0' ) {
409 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
418 /* like strpbrk(), replaces strchr() as well */
/*
 * ldap_utf8_strpbrk: search str for the first character that also
 * occurs in set.
 * str is advanced one character at a time with LDAP_UTF8_INCR; for each
 * position cset walks set the same way and the two characters are
 * compared after decoding with ldap_x_utf8_to_ucs4.
 * NOTE(review): the declaration of cset, the statement run on a match
 * and the final return are elided; presumably the match position or
 * NULL is returned - confirm.
 */
419 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
421 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
424 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
425 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
434 /* like strtok_r(), not strtok() */
435 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
440 if( last == NULL ) return NULL;
442 begin = str ? str : *last;
444 begin += ldap_utf8_strspn( begin, sep );
446 if( *begin == '\0' ) {
451 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
454 char *next = LDAP_UTF8_NEXT( end );