3 * Copyright 1998-2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
10 * These routines are "dumb". Though they understand UTF-8,
11 * they don't grok Unicode. That is, they can push bits,
12 * but don't have a clue what the bits represent. That's
13 * good enough for use with the LDAP Client SDK.
15 * These routines are not optimized.
22 #include <ac/stdlib.h>
24 #include <ac/socket.h>
25 #include <ac/string.h>
28 #include "ldap_utf8.h"
31 #include "ldap_defaults.h"
34 * Basic UTF-8 routines
38 * return the number of bytes required to hold the
39 * NULL-terminated UTF-8 string NOT INCLUDING the
42 ber_len_t ldap_utf8_bytes( const char * p )
46 for( bytes=0; p[bytes]; bytes++ ) {
53 ber_len_t ldap_utf8_chars( const char * p )
55 /* could be optimized and could check for invalid sequences */
58 for( ; *p ; LDAP_UTF8_INCR(p) ) {
65 /* return offset to next character */
66 int ldap_utf8_offset( const char * p )
68 return LDAP_UTF8_NEXT(p) - p;
/*
 * Length of a UTF-8 character as indicated by its first byte.
 *
 * The table covers first bytes 0x80..0xFF and is indexed by
 * (first byte ^ 0x80).  An entry of 0 marks a byte that cannot
 * start a character: continuation bytes (0x80-0xBF), the two-octet
 * lead bytes 0xC0 and 0xC1, and 0xFE/0xFF.
 */
const char ldap_utf8_lentab[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Returns length indicated by first byte of p, or 0 if that byte
 * cannot start a UTF-8 character.
 */
int ldap_utf8_charlen( const char * p )
{
	/*
	 * ASCII must be handled before the lookup: the table only has
	 * entries for bytes with the high bit set, so indexing it with
	 * an ASCII byte would read past its end.
	 */
	if( !( *p & 0x80 ) ) {
		return 1;
	}

	return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
}
93 * Make sure the UTF-8 char used the shortest possible encoding
94 * returns charlen if valid, 0 if not.
96 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
97 * The table is slightly modified from that of the RFC.
99 * UCS-4 range (hex) UTF-8 sequence (binary)
100 * 0000 0000-0000 007F 0.......
101 * 0000 0080-0000 07FF 110++++. 10......
102 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
103 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
104 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
105 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
107 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
108 * at least one of the '+' bits must be set, otherwise the character
109 * should have been encoded in fewer octets. Note that in the two-octet
110 * case, only the first octet needs to be validated, and this is done
111 * in the ldap_utf8_lentab[] above.
/*
 * Mask of required bits in the second octet.
 *
 * Indexed by the low five bits of the first octet (see the lookup in
 * ldap_utf8_charlen2).  A sequence is in shortest form only if the
 * second octet has at least one bit of the mask set.  The element
 * type is spelled out rather than hidden behind a one-letter macro;
 * the casts keep 0x80 representable where plain char is signed.
 */
const char ldap_utf8_mintab[] = {
	(char)0x20, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x30, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x38, (char)0x80, (char)0x80, (char)0x80, (char)0x3c, (char)0x80, (char)0x00, (char)0x00 };
124 int ldap_utf8_charlen2( const char * p )
126 int i = LDAP_UTF8_CHARLEN( p );
129 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
135 /* conv UTF-8 to UCS-4, useful for comparisons */
136 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
138 const unsigned char *c = p;
141 static unsigned char mask[] = {
142 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
144 len = LDAP_UTF8_CHARLEN2(p, len);
146 if( len == 0 ) return LDAP_UCS4_INVALID;
148 ch = c[0] & mask[len];
150 for(i=1; i < len; i++) {
151 if ((c[i] & 0xc0) != 0x80) {
152 return LDAP_UCS4_INVALID;
162 /* conv UCS-4 to UTF-8, not used */
163 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
166 unsigned char* p = buf;
168 /* not a valid Unicode character */
169 if ( c < 0 ) return 0;
171 /* Just return length, don't convert */
173 if( c < 0x80 ) return 1;
174 else if( c < 0x800 ) return 2;
175 else if( c < 0x10000 ) return 3;
176 else if( c < 0x200000 ) return 4;
177 else if( c < 0x4000000 ) return 5;
184 } else if( c < 0x800 ) {
185 p[len++] = 0xc0 | ( c >> 6 );
186 p[len++] = 0x80 | ( c & 0x3f );
188 } else if( c < 0x10000 ) {
189 p[len++] = 0xe0 | ( c >> 12 );
190 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
191 p[len++] = 0x80 | ( c & 0x3f );
193 } else if( c < 0x200000 ) {
194 p[len++] = 0xf0 | ( c >> 18 );
195 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
196 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
197 p[len++] = 0x80 | ( c & 0x3f );
199 } else if( c < 0x4000000 ) {
200 p[len++] = 0xf8 | ( c >> 24 );
201 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
202 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
203 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
204 p[len++] = 0x80 | ( c & 0x3f );
206 } else /* if( c < 0x80000000 ) */ {
207 p[len++] = 0xfc | ( c >> 30 );
208 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
209 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
210 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
211 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
212 p[len++] = 0x80 | ( c & 0x3f );
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8:
 * 0 for a negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * behaves like a function call inside larger expressions.  c is
 * evaluated several times; do not pass an expression with side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
222 /* Convert a string to UTF-8 format. The input string is expected to
223 * have characters of 1, 2, or 4 octets (in network byte order)
224 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
225 * types respectively. (Here T61STRING just means that there is one
226 * octet per character and characters may use the high bit of the octet.
227 * The characters are assumed to use ISO mappings, no provision is made
228 * for converting from T.61 coding rules to Unicode.)
232 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
234 unsigned char *in, *end;
239 utf8s->bv_val = NULL;
242 in = (unsigned char *)ucs->bv_val;
244 /* Make sure we stop at an even multiple of csize */
245 end = in + ( ucs->bv_len & ~(csize-1) );
259 i = LDAP_UCS_UTF8LEN(u);
261 return LDAP_INVALID_SYNTAX;
265 utf8s->bv_val = LDAP_MALLOC( l+1 );
266 if (utf8s->bv_val == NULL)
267 return LDAP_NO_MEMORY;
271 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
283 ptr += ldap_x_ucs4_to_utf8(u, ptr);
/*
 * Advance to the next UTF-8 character
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of next character.
 * This allows for "resyncing" when invalid characters are
 * present, provided the start of the next character
 * appears within the 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	if( LDAP_UTF8_ISASCII(u) ) {
		return (char *) &p[1];
	}

	/* skip over continuation octets (10xxxxxx), at most five of them */
	for( i=1; i<6; i++ ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return (char *) &p[i];
		}
	}

	return (char *) &p[i];
}
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of previous character.
 * This allows for "resyncing" when invalid characters are
 * present, provided the start of the previous character
 * appears within the 6 bytes examined.
 *
 * Reads bytes before p; the caller must ensure p is not at the
 * start of its buffer.
 */
char* ldap_utf8_prev( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	/* walk backwards until a byte that is not a continuation octet */
	for( i=-1; i>-6 ; i-- ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return (char *) &p[i];
		}
	}

	return (char *) &p[i];
}
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of next character.
 * This allows for "resyncing" when invalid characters are
 * present, provided the start of the next character
 * appears within the 6 bytes examined.
 *
 * dst is not NUL-terminated by this routine.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	int i;
	const unsigned char *u = (const unsigned char *) src;

	/* the first octet is always part of the character */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(u) ) {
		return 1;
	}

	/* copy continuation octets (10xxxxxx), at most five of them */
	for( i=1; i<6; i++ ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return i;
		}
		dst[i] = src[i];
	}

	return i;
}
370 #ifndef UTF8_ALPHA_CTYPE
372 * UTF-8 ctype routines
373 * Only deals with characters < 0x80 (ie: US-ASCII)
/* Return the result of LDAP_ASCII() applied to the first byte of p. */
int ldap_utf8_isascii( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	return LDAP_ASCII(c);
}
/*
 * Return 0 if the first byte of p is not ASCII, otherwise the
 * result of LDAP_DIGIT() for that byte.
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_DIGIT( c );
}
/*
 * Return 0 if the first byte of p is not ASCII, otherwise non-zero
 * when that byte is a hexadecimal digit (0-9, a-f, A-F).
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_DIGIT( c )
		|| ( c >= 'a' && c <= 'f' )
		|| ( c >= 'A' && c <= 'F' );
}
400 int ldap_utf8_isspace( const char * p )
402 unsigned c = * (const unsigned char *) p;
404 if(!LDAP_ASCII(c)) return 0;
420 * These are not needed by the C SDK and are
421 * not "good enough" for general use.
/*
 * Return 0 if the first byte of p is not ASCII, otherwise the
 * result of LDAP_ALPHA() for that byte.
 */
int ldap_utf8_isalpha( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_ALPHA(c);
}
/*
 * Return 0 if the first byte of p is not ASCII, otherwise the
 * result of LDAP_ALNUM() for that byte.
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_ALNUM(c);
}
/*
 * Return 0 if the first byte of p is not ASCII, otherwise the
 * result of LDAP_LOWER() for that byte.
 */
int ldap_utf8_islower( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_LOWER(c);
}
/*
 * Return 0 if the first byte of p is not ASCII, otherwise the
 * result of LDAP_UPPER() for that byte.
 */
int ldap_utf8_isupper( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!LDAP_ASCII(c)) return 0;

	return LDAP_UPPER(c);
}
462 * UTF-8 string routines
466 char * (ldap_utf8_strchr)( const char *str, const char *chr )
468 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
469 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
477 /* like strcspn() but returns number of bytes, not characters */
478 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
483 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
484 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
485 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
494 /* like strspn() but returns number of bytes, not characters */
495 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
500 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
501 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
502 if( *cset == '\0' ) {
506 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
515 /* like strpbrk(), replaces strchr() as well */
516 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
518 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
521 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
522 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
531 /* like strtok_r(), not strtok() */
532 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
537 if( last == NULL ) return NULL;
539 begin = str ? str : *last;
541 begin += ldap_utf8_strspn( begin, sep );
543 if( *begin == '\0' ) {
548 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
551 char *next = LDAP_UTF8_NEXT( end );