1 /* utf-8.c -- Basic UTF-8 routines */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 * Copyright 1998-2004 The OpenLDAP Foundation.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
16 /* Portions Copyright (C) The Internet Society (1998)
17 * UTF-8 encodings are derived from those in RFC 2279;
18 * see RFC for full legal notices.
21 /* Basic UTF-8 routines
23 * These routines are "dumb". Though they understand UTF-8,
24 * they don't grok Unicode. That is, they can push bits,
25 * but don't have a clue what the bits represent. That's
26 * good enough for use with the LDAP Client SDK.
28 * These routines are not optimized.
35 #include <ac/stdlib.h>
37 #include <ac/socket.h>
38 #include <ac/string.h>
41 #include "ldap_utf8.h"
44 #include "ldap_defaults.h"
47 * return the number of bytes required to hold the
48 * NUL-terminated UTF-8 string NOT INCLUDING the
/*
 * ldap_utf8_bytes: scan the string p one byte at a time, advancing the
 * index `bytes` from 0 until p[bytes] is the zero byte.
 *
 * NOTE(review): the declaration of `bytes`, the loop body and the return
 * statement are not visible in this excerpt; presumably the final index is
 * returned as the byte count (terminator excluded) -- confirm.
 */
51 ber_len_t ldap_utf8_bytes( const char * p )
55 for( bytes=0; p[bytes]; bytes++ ) {
/*
 * ldap_utf8_chars: step through p with LDAP_UTF8_INCR() (defined outside
 * this file excerpt) until the current byte is zero.
 *
 * NOTE(review): the counter and the return statement are not visible here;
 * presumably one character is counted per iteration and the total is
 * returned -- confirm.  No validation of the byte sequence is visible.
 */
62 ber_len_t ldap_utf8_chars( const char * p )
64 /* could be optimized and could check for invalid sequences */
67 for( ; *p ; LDAP_UTF8_INCR(p) ) {
74 /* return offset to next character */
/*
 * ldap_utf8_offset: return the pointer difference between the position
 * produced by LDAP_UTF8_NEXT(p) and p itself, i.e. how many bytes separate
 * p from that position.  The difference is converted to int on return.
 */
75 int ldap_utf8_offset( const char * p )
77 return LDAP_UTF8_NEXT(p) - p;
81 * Returns length indicated by first byte.
/*
 * ldap_utf8_lentab: 128-entry table of sequence lengths.
 *
 * ldap_utf8_charlen() indexes it with (first byte ^ 0x80), so entry k
 * describes first byte 0x80 + k:
 *   0x80-0xC1  -> 0
 *   0xC2-0xDF  -> 2
 *   0xE0-0xEF  -> 3
 *   0xF0-0xF7  -> 4
 *   0xF8-0xFB  -> 5
 *   0xFC-0xFD  -> 6
 *   0xFE-0xFF  -> 0
 * The zero entries for 0xC0 and 0xC1 are how the two-octet case rejects
 * over-long encodings, as the file's comment on shortest encodings notes.
 */
83 const char ldap_utf8_lentab[] = {
84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
91 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
/*
 * ldap_utf8_charlen: look up the length associated with the first byte of
 * p in ldap_utf8_lentab[].  The byte is read as unsigned char and its top
 * bit is flipped (byte ^ 0x80) to form the table index.
 *
 * NOTE(review): the lines between the signature and this return are not
 * visible in this excerpt.  Bytes below 0x80 would index past the
 * 128-entry table, so they are presumably handled before this point --
 * confirm.
 */
93 int ldap_utf8_charlen( const char * p )
98 return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
102 * Make sure the UTF-8 char used the shortest possible encoding.
103 * Returns charlen if valid, 0 if not.
105 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
106 * The table is slightly modified from that of the RFC.
108 * UCS-4 range (hex) UTF-8 sequence (binary)
109 * 0000 0000-0000 007F 0.......
110 * 0000 0080-0000 07FF 110++++. 10......
111 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
112 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
113 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
114 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
116 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
117 * at least one of the '+' bits must be set, otherwise the character
118 * should have been encoded in fewer octets. Note that in the two-octet
119 * case, only the first octet needs to be validated, and this is done
120 * in the ldap_utf8_lentab[] above.
123 /* mask of required bits in second octet */
/*
 * ldap_utf8_mintab: 32-entry table of bit masks.
 *
 * ldap_utf8_charlen2() indexes it with the low five bits of the first
 * octet and ANDs the entry with the second octet.  The distinctive
 * entries are index 0x00 -> 0x20, 0x10 -> 0x30, 0x18 -> 0x38 and
 * 0x1c -> 0x3c, which match the '+' bits of the second octet in the
 * 3-, 4-, 5- and 6-octet rows of the encoding table in this file.
 * The last two entries are 0x00; all remaining entries are 0x80.
 *
 * NOTE(review): `c` is used here both as the element type and as a cast;
 * it is presumably a macro for a char type defined on lines not visible
 * in this excerpt -- confirm.
 */
126 c ldap_utf8_mintab[] = {
127 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
128 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
129 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
130 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
/*
 * ldap_utf8_charlen2: obtain the length i from LDAP_UTF8_CHARLEN(p), then
 * test the second byte p[1] against the ldap_utf8_mintab[] entry selected
 * by the low five bits of the first byte.  The `if` condition is true
 * when none of the masked bits are set in p[1].
 *
 * NOTE(review): any guard before the test, the statement under the `if`
 * and the final return are not visible in this excerpt.  The file's
 * comment on shortest encodings says the routine returns charlen if
 * valid and 0 if not -- confirm against the full body.
 */
133 int ldap_utf8_charlen2( const char * p )
135 int i = LDAP_UTF8_CHARLEN( p );
138 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
144 /* conv UTF-8 to UCS-4, useful for comparisons */
/*
 * ldap_x_utf8_to_ucs4: decode the UTF-8 sequence starting at p.
 *
 * Returns LDAP_UCS4_INVALID when LDAP_UTF8_CHARLEN2() reports length 0 or
 * when any trailing byte is not of the form 10xxxxxx.
 *
 * NOTE(review): the declarations of len, ch and i, the statement that
 * folds each trailing byte into ch, and the final return are not visible
 * in this excerpt; presumably ch is returned -- confirm.
 */
145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
147 const unsigned char *c = (const unsigned char *) p;
/* mask[len] selects the payload bits of the first byte for a sequence
 * of len bytes; index 0 is unused by the visible code because len == 0
 * returns early. */
150 static unsigned char mask[] = {
151 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
153 len = LDAP_UTF8_CHARLEN2(p, len);
155 if( len == 0 ) return LDAP_UCS4_INVALID;
157 ch = c[0] & mask[len];
/* every byte after the first must have its top two bits equal to 10 */
159 for(i=1; i < len; i++) {
160 if ((c[i] & 0xc0) != 0x80) {
161 return LDAP_UCS4_INVALID;
171 /* conv UCS-4 to UTF-8, not used */
/*
 * ldap_x_ucs4_to_utf8: encode the UCS-4 value c as UTF-8 into buf.
 *
 * Returns 0 when c is negative.  A length-only path returns the number of
 * octets needed without writing anything.  Otherwise the visible branches
 * write a lead byte carrying the high bits of c followed by continuation
 * bytes of the form 0x80 | (six bits), advancing len for each byte.
 *
 * NOTE(review): the declaration of len, the condition guarding the
 * length-only path (presumably buf == NULL), the one-octet branch and the
 * final return (presumably len) are not visible in this excerpt --
 * confirm.  No bound on buf is checked in the visible code, so the caller
 * must supply room for up to 6 octets.
 */
172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
175 unsigned char* p = (unsigned char *) buf;
177 /* not a valid Unicode character */
178 if ( c < 0 ) return 0;
180 /* Just return length, don't convert */
182 if( c < 0x80 ) return 1;
183 else if( c < 0x800 ) return 2;
184 else if( c < 0x10000 ) return 3;
185 else if( c < 0x200000 ) return 4;
186 else if( c < 0x4000000 ) return 5;
/* two octets: 110xxxxx 10xxxxxx */
193 } else if( c < 0x800 ) {
194 p[len++] = 0xc0 | ( c >> 6 );
195 p[len++] = 0x80 | ( c & 0x3f );
/* three octets: 1110xxxx 10xxxxxx 10xxxxxx */
197 } else if( c < 0x10000 ) {
198 p[len++] = 0xe0 | ( c >> 12 );
199 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200 p[len++] = 0x80 | ( c & 0x3f );
/* four octets: lead byte 0xf0 plus three continuation bytes */
202 } else if( c < 0x200000 ) {
203 p[len++] = 0xf0 | ( c >> 18 );
204 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
205 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
206 p[len++] = 0x80 | ( c & 0x3f );
/* five octets: lead byte 0xf8 plus four continuation bytes */
208 } else if( c < 0x4000000 ) {
209 p[len++] = 0xf8 | ( c >> 24 );
210 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
211 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
212 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
213 p[len++] = 0x80 | ( c & 0x3f );
/* six octets: everything else that passed the c < 0 check */
215 } else /* if( c < 0x80000000 ) */ {
216 p[len++] = 0xfc | ( c >> 30 );
217 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
218 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
219 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
220 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
221 p[len++] = 0x80 | ( c & 0x3f );
/*
 * LDAP_UCS_UTF8LEN(c): number of octets needed to encode the UCS value c
 * in UTF-8: 0 for a negative value, otherwise 1 to 6 by magnitude
 * (thresholds 0x80, 0x800, 0x10000, 0x200000, 0x4000000).
 *
 * The argument and the whole expansion are parenthesized so the macro
 * behaves as a single value inside a larger expression and accepts
 * arguments that themselves contain operators.  The argument is still
 * evaluated more than once, so it must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
231 /* Convert a string to UTF-8 format. The input string is expected to
232 * have characters of 1, 2, or 4 octets (in network byte order)
233 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
234 * types respectively. (Here T61STRING just means that there is one
235 * octet per character and characters may use the high bit of the octet.
236 * The characters are assumed to use ISO mappings, no provision is made
237 * for converting from T.61 coding rules to Unicode.)
/*
 * ldap_ucs_to_utf8s: convert the fixed-width characters in ucs (csize
 * octets each) to a newly allocated UTF-8 string stored in utf8s->bv_val.
 *
 * The visible code works in two passes: LDAP_UCS_UTF8LEN() is applied per
 * character and l+1 bytes are allocated with LDAP_MALLOC(), then the
 * input is walked again from the start and each value u is encoded with
 * ldap_x_ucs4_to_utf8().  utf8s->bv_val is set to NULL up front.
 *
 * Returns LDAP_INVALID_SYNTAX or LDAP_NO_MEMORY on the visible error
 * paths.
 *
 * NOTE(review): the return type, the declarations of u, i, l and ptr, the
 * code that assembles u from the input octets, the condition guarding
 * LDAP_INVALID_SYNTAX, and the success return are not visible in this
 * excerpt.  Since the result comes from LDAP_MALLOC(), the caller
 * presumably owns and frees it -- confirm.
 */
241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
243 unsigned char *in, *end;
248 utf8s->bv_val = NULL;
251 in = (unsigned char *)ucs->bv_val;
253 /* Make sure we stop at an even multiple of csize */
/* the mask ~(csize-1) rounds down correctly only when csize is a power
 * of two, which holds for the 1, 2 and 4 octet sizes the file's header
 * comment lists */
254 end = in + ( ucs->bv_len & ~(csize-1) );
268 i = LDAP_UCS_UTF8LEN(u);
270 return LDAP_INVALID_SYNTAX;
274 utf8s->bv_val = LDAP_MALLOC( l+1 );
275 if (utf8s->bv_val == NULL)
276 return LDAP_NO_MEMORY;
280 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
/* ldap_x_ucs4_to_utf8() returns a count that is used to advance ptr */
292 ptr += ldap_x_ucs4_to_utf8(u, ptr);
299 * Advance to the next UTF-8 character
301 * Ignores length of multibyte character, instead rely on
302 * continuation markers to find start of next character.
303 * This allows for "resyncing" after invalid characters
304 * are encountered, provided the start of the next character
305 * appears within the 6 bytes examined.
/*
 * ldap_utf8_next: return a pointer to the byte after the character at p.
 *
 * When LDAP_UTF8_ISASCII(u) holds, that is p + 1.  Otherwise offsets 1..5
 * are examined and the first byte that is not of the form 10xxxxxx is
 * returned; if all five are continuation bytes the loop ends with i == 6
 * and p + 6 is returned.  The const qualifier of p is cast away in the
 * returned pointer.
 *
 * NOTE(review): the declaration of i and the closing braces are not
 * visible in this excerpt.
 */
307 char* ldap_utf8_next( const char * p )
310 const unsigned char *u = (const unsigned char *) p;
312 if( LDAP_UTF8_ISASCII(u) ) {
313 return (char *) &p[1];
316 for( i=1; i<6; i++ ) {
317 if ( ( u[i] & 0xc0 ) != 0x80 ) {
318 return (char *) &p[i];
322 return (char *) &p[i];
326 * Step back to the previous UTF-8 character
328 * Ignores length of multibyte character, instead rely on
329 * continuation markers to find start of the previous character.
330 * This allows for "resyncing" after invalid characters
331 * are encountered, provided the start of the previous character
332 * appears within the 6 bytes examined.
/*
 * ldap_utf8_prev: return a pointer to the start of the character that
 * precedes p.
 *
 * Offsets -1 down to -5 are examined and the first byte that is not of
 * the form 10xxxxxx is returned; if all five are continuation bytes the
 * loop ends with i == -6 and p - 6 is returned.  The const qualifier of
 * p is cast away in the returned pointer.
 *
 * The visible code reads bytes before p without any lower bound check, so
 * the caller must ensure p is not at the start of its buffer.
 *
 * NOTE(review): the declaration of i and the closing braces are not
 * visible in this excerpt.
 */
334 char* ldap_utf8_prev( const char * p )
337 const unsigned char *u = (const unsigned char *) p;
339 for( i=-1; i>-6 ; i-- ) {
340 if ( ( u[i] & 0xc0 ) != 0x80 ) {
341 return (char *) &p[i];
345 return (char *) &p[i];
349 * Copy one UTF-8 character from src to dst returning
350 * number of bytes copied.
352 * Ignores length of multibyte character, instead rely on
353 * continuation markers to find start of next character.
354 * This allows for "resyncing" after invalid characters
355 * are encountered, provided the start of the next character
356 * appears within the 6 bytes examined.
/*
 * ldap_utf8_copy: copy one character from src to dst.
 *
 * The visible code treats the LDAP_UTF8_ISASCII(u) case separately, then
 * scans offsets 1..5 of src for the first byte that is not of the form
 * 10xxxxxx.
 *
 * NOTE(review): every statement that writes to dst, the declaration of i
 * and all return statements are not visible in this excerpt.  The file's
 * comment on this routine says it returns the number of bytes copied --
 * confirm against the full body.
 */
358 int ldap_utf8_copy( char* dst, const char *src )
361 const unsigned char *u = (const unsigned char *) src;
365 if( LDAP_UTF8_ISASCII(u) ) {
369 for( i=1; i<6; i++ ) {
370 if ( ( u[i] & 0xc0 ) != 0x80 ) {
379 #ifndef UTF8_ALPHA_CTYPE
381 * UTF-8 ctype routines
382 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * ldap_utf8_isascii: read the first byte of p as an unsigned value and
 * return the result of LDAP_ASCII() on it.
 */
385 int ldap_utf8_isascii( const char * p )
387 unsigned c = * (const unsigned char *) p;
388 return LDAP_ASCII(c);
/*
 * ldap_utf8_isdigit: read the first byte of p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_DIGIT().
 */
391 int ldap_utf8_isdigit( const char * p )
393 unsigned c = * (const unsigned char *) p;
395 if(!LDAP_ASCII(c)) return 0;
397 return LDAP_DIGIT( c );
/*
 * ldap_utf8_isxdigit: read the first byte of p as an unsigned value and
 * return 0 when LDAP_ASCII() rejects it.
 *
 * NOTE(review): the final classification and return are not visible in
 * this excerpt; presumably a hexadecimal-digit test follows -- confirm.
 */
400 int ldap_utf8_isxdigit( const char * p )
402 unsigned c = * (const unsigned char *) p;
404 if(!LDAP_ASCII(c)) return 0;
/*
 * ldap_utf8_isspace: read the first byte of p as an unsigned value and
 * return 0 when LDAP_ASCII() rejects it.
 *
 * NOTE(review): the code that decides which ASCII bytes count as space
 * and the final return are not visible in this excerpt -- confirm.
 */
409 int ldap_utf8_isspace( const char * p )
411 unsigned c = * (const unsigned char *) p;
413 if(!LDAP_ASCII(c)) return 0;
429 * These are not needed by the C SDK and are
430 * not "good enough" for general use.
/*
 * ldap_utf8_isalpha: read the first byte of p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_ALPHA().
 */
432 int ldap_utf8_isalpha( const char * p )
434 unsigned c = * (const unsigned char *) p;
436 if(!LDAP_ASCII(c)) return 0;
438 return LDAP_ALPHA(c);
/*
 * ldap_utf8_isalnum: read the first byte of p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_ALNUM().
 */
441 int ldap_utf8_isalnum( const char * p )
443 unsigned c = * (const unsigned char *) p;
445 if(!LDAP_ASCII(c)) return 0;
447 return LDAP_ALNUM(c);
/*
 * ldap_utf8_islower: read the first byte of p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_LOWER().
 */
450 int ldap_utf8_islower( const char * p )
452 unsigned c = * (const unsigned char *) p;
454 if(!LDAP_ASCII(c)) return 0;
456 return LDAP_LOWER(c);
/*
 * ldap_utf8_isupper: read the first byte of p as an unsigned value.
 * Returns 0 when LDAP_ASCII() rejects it, otherwise the result of
 * LDAP_UPPER().
 */
459 int ldap_utf8_isupper( const char * p )
461 unsigned c = * (const unsigned char *) p;
463 if(!LDAP_ASCII(c)) return 0;
465 return LDAP_UPPER(c);
471 * UTF-8 string routines
/*
 * ldap_utf8_strchr: walk str with LDAP_UTF8_INCR() until its zero byte,
 * comparing the value decoded at each position by ldap_x_utf8_to_ucs4()
 * with the value decoded from chr.  chr is decoded again on every
 * iteration.
 *
 * The function name is parenthesized in the definition, which prevents a
 * function-like macro of the same name from being expanded here.
 *
 * NOTE(review): the statements executed on a match and after the loop are
 * not visible in this excerpt; presumably the matching position is
 * returned, or NULL when nothing matches -- confirm.
 */
475 char * (ldap_utf8_strchr)( const char *str, const char *chr )
477 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
478 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
486 /* like strcspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strcspn: for each position cstr in str, walk every position
 * cset in set and compare the two decoded values from
 * ldap_x_utf8_to_ucs4().  Both walks advance with LDAP_UTF8_INCR() and
 * stop at a zero byte.
 *
 * NOTE(review): the declarations of cstr and cset, the statement executed
 * on a match and the final return are not visible in this excerpt.  The
 * file's comment on this routine says the result is a byte count, not a
 * character count -- confirm against the full body.
 */
487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
492 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
493 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
494 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
503 /* like strspn() but returns number of bytes, not characters */
/*
 * ldap_utf8_strspn: for each position cstr in str, walk set looking for a
 * position cset whose decoded value (ldap_x_utf8_to_ucs4()) equals the
 * one at cstr.
 *
 * The inner loop has no condition in its header; it is left from inside
 * its body, either when the end of set is reached or when a match is
 * found.
 *
 * NOTE(review): the declarations of cstr and cset, the statements under
 * both `if`s and the final return are not visible in this excerpt.  The
 * file's comment on this routine says the result is a byte count, not a
 * character count -- confirm against the full body.
 */
504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
509 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
510 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
511 if( *cset == '\0' ) {
515 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
524 /* like strpbrk(), replaces strchr() as well */
/*
 * ldap_utf8_strpbrk: for each position in str, walk every position cset
 * in set and compare the two decoded values from ldap_x_utf8_to_ucs4().
 * Both walks advance with LDAP_UTF8_INCR() and stop at a zero byte.
 *
 * NOTE(review): the declaration of cset, the statement executed on a
 * match and the final return are not visible in this excerpt; presumably
 * the first matching position in str is returned, or NULL -- confirm.
 */
525 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
527 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
530 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
531 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
540 /* like strtok_r(), not strtok() */
541 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
546 if( last == NULL ) return NULL;
548 begin = str ? str : *last;
550 begin += ldap_utf8_strspn( begin, sep );
552 if( *begin == '\0' ) {
557 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
560 char *next = LDAP_UTF8_NEXT( end );