/*
 * Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.
 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
 */
/*
 * These routines are "dumb".  Though they understand UTF-8,
 * they don't grok Unicode.  That is, they can push bits,
 * but don't have a clue what the bits represent.  That's
 * good enough for use with the LDAP Client SDK.
 *
 * These routines are not optimized.
 */
22 #include <ac/stdlib.h>
24 #include <ac/socket.h>
25 #include <ac/string.h>
29 #include "ldap_defaults.h"
/*
 * True when the byte value uc is a 7-bit US-ASCII code (below 0x80).
 * Any earlier definition is dropped first so this one always applies.
 */
#undef ISASCII
#define ISASCII(uc)	((uc) < 0x80)
/*
 * Basic UTF-8 routines
 */
39 * return the number of bytes required to hold the
40 * NULL-terminated UTF-8 string NOT INCLUDING the
43 ber_len_t ldap_utf8_bytes( const char * p )
47 for( bytes=0; p[bytes]; bytes++ ) {
54 ber_len_t ldap_utf8_chars( const char * p )
56 /* could be optimized and could check for invalid sequences */
59 for( ; *p ; LDAP_UTF8_INCR(p) ) {
66 /* return offset to next character */
67 int ldap_utf8_offset( const char * p )
69 return LDAP_UTF8_NEXT(p) - p;
/*
 * Returns length indicated by first byte.
 *
 *	0xxxxxxx -> 1	110xxxxx -> 2	1110xxxx -> 3
 *	11110xxx -> 4	111110xx -> 5	1111110x -> 6
 *
 * Returns 0 when the byte cannot start a character, i.e. for a
 * continuation byte (10xxxxxx) and for 0xfe / 0xff.  Only the
 * first byte at p is examined.
 *
 * This function should use a table lookup.
 */
int ldap_utf8_charlen( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	/* plain ASCII is by far the most common case: test it first */
	if ((c & 0x80 ) == 0x00) {
		return 1;
	}

	if ((c & 0xc0 ) == 0x80) {
		/* INVALID: continuation byte */
		return 0;
	}

	if ((c & 0xe0 ) == 0xc0) {
		return 2;
	}

	if ((c & 0xf0 ) == 0xe0) {
		return 3;
	}

	if ((c & 0xf8 ) == 0xf0) {
		return 4;
	}

	if ((c & 0xfc ) == 0xf8) {
		return 5;
	}

	if ((c & 0xfe ) == 0xfc) {
		return 6;
	}

	/* INVALID: 0xfe and 0xff never appear in UTF-8 */
	return 0;
}
109 /* conv UTF-8 to UCS-4, useful for comparisons */
110 ldap_ucs4_t ldap_utf8_to_ucs4( const char * p )
112 const unsigned char *c = p;
115 static unsigned char mask[] = {
116 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
118 len = LDAP_UTF8_CHARLEN(p);
120 if( len == 0 ) return LDAP_UCS4_INVALID;
122 ch = c[0] & mask[len];
124 for(i=1; i < len; i++) {
125 if ((c[i] & 0xc0) != 0x80) {
126 return LDAP_UCS4_INVALID;
136 /* conv UCS-4 to UTF-8, not used */
/*
 * Encodes the UCS-4 value c as a UTF-8 byte sequence stored through
 * buf (accessed as unsigned bytes via p).  Returns 0 without writing
 * anything when buf is NULL.  The widest form below stores six bytes,
 * so buf needs room for at least that many; none of the visible
 * statements writes a terminating NUL.
 *
 * NOTE(review): the declaration and initial value of len, the test
 * that guards the "not a valid Unicode character" branch, the body
 * of the c < 0x80 branch and the final return are not visible in
 * this excerpt.  Presumably len starts at 0 and is returned as the
 * number of bytes written -- confirm against the complete file.
 */
137 int ldap_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
/* NOTE(review): char * to unsigned char * without a cast; compilers
 * may warn about pointer signedness here. */
140 unsigned char* p = buf;
141 if(buf == NULL) return 0;
144 /* not a valid Unicode character */
/* one byte: values below 0x80 */
146 } else if( c < 0x80 ) {
/* two bytes: 110xxxxx 10xxxxxx */
149 } else if( c < 0x800 ) {
150 p[len++] = 0xc0 | ( c >> 6 );
151 p[len++] = 0x80 | ( c & 0x3f );
/* three bytes: 1110xxxx followed by two continuation bytes */
153 } else if( c < 0x10000 ) {
154 p[len++] = 0xe0 | ( c >> 12 );
155 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
156 p[len++] = 0x80 | ( c & 0x3f );
/* four bytes: 11110xxx followed by three continuation bytes */
158 } else if( c < 0x200000 ) {
159 p[len++] = 0xf0 | ( c >> 18 );
160 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
161 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
162 p[len++] = 0x80 | ( c & 0x3f );
/* five bytes: 111110xx followed by four continuation bytes */
164 } else if( c < 0x4000000 ) {
165 p[len++] = 0xf8 | ( c >> 24 );
166 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
167 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
168 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
169 p[len++] = 0x80 | ( c & 0x3f );
/* six bytes: 1111110x followed by five continuation bytes */
171 } else /* if( c < 0x80000000 ) */ {
172 p[len++] = 0xfc | ( c >> 30 );
173 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
174 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
175 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
176 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
177 p[len++] = 0x80 | ( c & 0x3f );
/*
 * Advance to the next UTF-8 character
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of next character.
 * This allows for "resyncing" when invalid characters are
 * provided, provided the start of the next character
 * appears within the 6 bytes examined.
 *
 * Returns a pointer to the byte following the character at p;
 * after six bytes without a non-continuation byte it gives up and
 * returns &p[6].
 */
char* ldap_utf8_next( const char * p )
{
	int i;
	/* unsigned view so the 0xc0 mask works on high-bit bytes */
	const unsigned char *u = (const unsigned char *) p;

	if( LDAP_UTF8_ISASCII(u) ) {
		return (char *) &p[1];
	}

	/* skip continuation bytes (10xxxxxx); a NUL also stops the scan */
	for( i=1; i<6; i++ ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return (char *) &p[i];
		}
	}

	return (char *) &p[i];
}
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of previous character.
 * This allows for "resyncing" when invalid characters are
 * provided, provided the start of the previous character
 * appears within the 6 bytes examined.
 *
 * Reads p[-1] down to at most p[-5]; the caller must ensure p does
 * not point at the first byte of its buffer.  When all five bytes
 * are continuation bytes the scan gives up and returns &p[-6].
 */
char* ldap_utf8_prev( const char * p )
{
	int i;
	/* unsigned view so the 0xc0 mask works on high-bit bytes */
	const unsigned char *u = (const unsigned char *) p;

	/* walk backwards over continuation bytes (10xxxxxx) */
	for( i=-1; i>-6 ; i-- ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return (char *) &p[i];
		}
	}

	return (char *) &p[i];
}
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores length of multibyte character, instead relies on
 * continuation markers to find start of next character.
 * This allows for "resyncing" when invalid characters are
 * provided, provided the start of the next character
 * appears within the 6 bytes examined.
 *
 * At most 6 bytes are stored in dst and no terminating NUL is
 * appended.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	int i;
	/* unsigned view so the 0xc0 mask works on high-bit bytes */
	const unsigned char *u = (const unsigned char *) src;

	/* the first byte always belongs to the character */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(u) ) {
		return 1;
	}

	/* copy the continuation bytes (10xxxxxx) that follow */
	for( i=1; i<6; i++ ) {
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			return i;
		}
		dst[i] = src[i];
	}

	return i;
}
/*
 * UTF-8 ctype routines
 * Only deals with characters < 0x80 (i.e. US-ASCII)
 */
/* non-zero when the first byte at p is a US-ASCII code (below 0x80) */
int ldap_utf8_isascii( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	return ISASCII(c);
}
/*
 * non-zero when the first byte at p is an ASCII decimal digit;
 * bytes outside US-ASCII never match
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return c >= '0' && c <= '9';
}
/*
 * non-zero when the first byte at p is an ASCII hexadecimal digit
 * (0-9, A-F, a-f); bytes outside US-ASCII never match
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return ( c >= '0' && c <= '9' )
		|| ( c >= 'A' && c <= 'F' )
		|| ( c >= 'a' && c <= 'f' );
}
/*
 * Classifies the first byte at p; presumably reports white space,
 * going by the name.  Bytes outside US-ASCII are rejected with 0.
 *
 * NOTE(review): the test applied to ASCII bytes (which characters
 * count as space) and the final return are not visible in this
 * excerpt -- confirm against the complete file.
 */
296 int ldap_utf8_isspace( const char * p )
298 unsigned c = * (const unsigned char *) p;
/* bytes >= 0x80 belong to multi-byte characters and never match */
300 if(!ISASCII(c)) return 0;
#ifndef UTF8_ALPHA_CTYPE
/*
 * These are not needed by the C SDK and are
 * not "good enough" for general use.
 *
 * Each routine classifies only the first byte at p and returns 0
 * for every byte outside US-ASCII.
 */

/* non-zero when the first byte at p is an ASCII letter */
int ldap_utf8_isalpha( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return ( c >= 'A' && c <= 'Z' )
		|| ( c >= 'a' && c <= 'z' );
}

/* non-zero when the first byte at p is an ASCII letter or digit */
int ldap_utf8_isalnum( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return ( c >= '0' && c <= '9' )
		|| ( c >= 'A' && c <= 'Z' )
		|| ( c >= 'a' && c <= 'z' );
}

/* non-zero when the first byte at p is an ASCII lower-case letter */
int ldap_utf8_islower( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return ( c >= 'a' && c <= 'z' );
}

/* non-zero when the first byte at p is an ASCII upper-case letter */
int ldap_utf8_isupper( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if(!ISASCII(c)) return 0;

	return ( c >= 'A' && c <= 'Z' );
}
#endif
/*
 * UTF-8 string routines
 */
366 char * (ldap_utf8_strchr)( const char *str, const char *chr )
368 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
369 if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( chr ) ) {
377 /* like strcspn() but returns number of bytes, not characters */
378 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
383 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
384 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
385 if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
394 /* like strspn() but returns number of bytes, not characters */
395 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
400 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
401 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
402 if( *cset == '\0' ) {
406 if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
415 /* like strpbrk(), replaces strchr() as well */
416 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
418 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
421 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
422 if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( cset ) ) {
431 /* like strtok_r(), not strtok() */
432 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
437 if( last == NULL ) return NULL;
439 begin = str ? str : *last;
441 begin += ldap_utf8_strspn( begin, sep );
443 if( *begin == '\0' ) {
448 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
451 char *next = LDAP_UTF8_NEXT( end );