2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 1998-2009 The OpenLDAP Foundation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
11 * A copy of this license is available in the file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
17 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
18 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
19 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
20 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
21 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
22 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
23 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
24 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
26 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
27 * can be found in the file "build/LICENSE-2.0.1" in this distribution
28 * of OpenLDAP Software.
/*
 * UTF-8 Conversion Routines
 *
 * These routines convert between Wide Character and UTF-8,
 * or between MultiByte and UTF-8 encodings.
 *
 * Both single character and string versions of the functions are provided.
 * All functions return -1 if the character or string cannot be converted.
 */
43 #if SIZEOF_WCHAR_T >= 4
44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
47 #include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */
48 #include <ac/string.h>
49 #include <ac/time.h> /* for time_t */
53 #include <ldap_utf8.h>
/* Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Index 0 is an unused placeholder so that
 * the length can be used directly as the index. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
/*-----------------------------------------------------------------------------
					UTF-8 Format Summary

ASCII chars                        7 bits
    0xxxxxxx

2-character UTF-8 sequence:        11 bits
    110xxxxx  10xxxxxx

3-character UTF-8                  16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-char UTF-8                       21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-char UTF-8                       26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-char UTF-8                       31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)         21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)       31 bits

Note: This code does not prevent UTF-8 sequences which are longer than
      necessary from being decoded.
*/
86 /*-----------------------------------------------------------------------------
87 Convert a UTF-8 character to a wide char.
88 Return the length of the UTF-8 input character in bytes.
91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
96 if (utf8char == NULL) return -1;
98 /* Get UTF-8 sequence length from 1st byte */
99 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
101 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
103 /* First byte minus length tag */
104 ch = (wchar_t)(utf8char[0] & mask[utflen]);
106 for(i=1; i < utflen; i++) {
107 /* Subsequent bytes must start with 10 */
108 if ((utf8char[i] & 0xc0) != 0x80) return -1;
110 ch <<= 6; /* 6 bits of data in each subsequent byte */
111 ch |= (wchar_t)(utf8char[i] & 0x3f);
114 if (wchar) *wchar = ch;
119 /*-----------------------------------------------------------------------------
120 Convert a UTF-8 string to a wide char string.
121 No more than 'count' wide chars will be written to the output buffer.
122 Return the size of the converted string in wide chars, excl null terminator.
125 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
132 /* If input ptr is NULL or empty... */
133 if (utf8str == NULL || !*utf8str) {
139 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
140 while ( *utf8str && (wcstr==NULL || wclen<count) ) {
141 /* Get UTF-8 sequence length from 1st byte */
142 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
144 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
146 /* First byte minus length tag */
147 ch = (wchar_t)(utf8str[0] & mask[utflen]);
149 for(i=1; i < utflen; i++) {
150 /* Subsequent bytes must start with 10 */
151 if ((utf8str[i] & 0xc0) != 0x80) return -1;
153 ch <<= 6; /* 6 bits of data in each subsequent byte */
154 ch |= (wchar_t)(utf8str[i] & 0x3f);
157 if (wcstr) wcstr[wclen] = ch;
159 utf8str += utflen; /* Move to next UTF-8 character */
160 wclen++; /* Count number of wide chars stored/required */
163 /* Add null terminator if there's room in the buffer. */
164 if (wcstr && wclen < count) wcstr[wclen] = 0;
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.
   Return the length of the converted UTF-8 character in bytes.
   No more than 'count' bytes will be written to the output buffer.

   utf8char  Output buffer, or NULL to just determine the required UTF-8
             length ('count' is ignored in that case).
   wchar     Character to encode; must lie in 0 .. 0x7FFFFFFF.
   count     Capacity of utf8char in bytes.

   Returns -1 for an invalid wide character, and 0 (writing nothing) when
   the encoded character does not fit in 'count' bytes.  No NUL terminator
   is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag of the first byte, indexed by sequence length (1..6) */
	static const unsigned char lead[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int len, i;

	/* Invalid wide character: negative, or above the 31-bit ISO-10646 range.
	 * The shift test gives the same answer whether wchar_t is signed or
	 * unsigned and whatever its width (at least 32 bits in this file), so an
	 * unsigned value >= 0x80000000 can no longer slip into the 6-byte case.
	 */
	if ( wchar < 0 || (wchar >> 31) != 0 )
		return -1;

	if( wchar < 0x80 )
		len = 1;
	else if( wchar < 0x800 )
		len = 2;
	else if( wchar < 0x10000 )
		len = 3;
	else if( wchar < 0x200000 )
		len = 4;
	else if( wchar < 0x4000000 )
		len = 5;
	else
		len = 6;

	if (utf8char == NULL)	/* Just determine the required UTF-8 char length. */
		return len;

	if (count < (size_t)len)	/* Won't fit: write nothing */
		return 0;

	/* Fill the continuation bytes from last to first, 6 bits at a time... */
	for (i = len - 1; i > 0; i--) {
		utf8char[i] = (char)( 0x80 | ( wchar & 0x3f ) );
		wchar >>= 6;
	}

	/* ...then the remaining high bits go into the first byte with its tag */
	utf8char[0] = (char)( lead[len] | wchar );

	return len;
}
267 /*-----------------------------------------------------------------------------
268 Convert a wide char string to a UTF-8 string.
269 No more than 'count' bytes will be written to the output buffer.
270 Return the # of bytes written to the output buffer, excl null terminator.
273 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
278 wchar_t empty = 0; /* To avoid use of L"" construct */
280 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
283 if (utf8str == NULL) /* Just compute size of output, excl null */
287 /* Get UTF-8 size of next wide char */
288 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
298 /* Do the actual conversion. */
300 n = 1; /* In case of empty wcstr */
303 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
305 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
309 count -= n; /* Space left in output buffer */
312 /* If not enough room for last character, pad remainder with null
313 so that return value = original count, indicating buffer full. */
320 /* Add a null terminator if there's room. */
324 if (n == -1) /* Conversion encountered invalid wide char. */
327 /* Return the number of bytes written to output buffer, excl null. */
328 return (p - utf8str);
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.
   Return the size of the converted character in bytes.

   mbchar    Output buffer, or NULL to only obtain the size; the converted
             bytes then go to an internal scratch buffer and are discarded.
   utf8char  Points at the first byte of the UTF-8 sequence.
   f_wctomb  Wide char to multibyte converter; NULL selects wctomb().

   Returns -1 for an invalid UTF-8 character; otherwise whatever the
   converter returns.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch space for the size-only case.  wctomb() may store up to
	 * MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX and can exceed the
	 * 6 bytes previously reserved here.  The fallback applies only when
	 * <limits.h> has not been pulled in by the file's existing headers.
	 */
#ifdef MB_LEN_MAX
	char tmp[MB_LEN_MAX];
#else
	char tmp[16];
#endif

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;				/* Invalid UTF-8 character */

	/* Then convert the wide char, into the caller's buffer if there is one */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.

   mbstr       Output buffer; passed unchanged to the converter.
   utf8str     NUL-terminated UTF-8 input; NULL or empty yields an empty
               output string and a return value of 0.
   count       Capacity of mbstr in bytes.
   f_wcstombs  Wide string to multibyte converter; NULL selects wcstombs().

   Returns -1 on memory allocation failure, on malformed UTF-8 input, or
   when the converter reports failure by returning (size_t)-1.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	int n;

	if (f_wcstombs == NULL)		/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/*    use the local ANSI C function */

	if (utf8str == NULL || *utf8str == 0)	/* NULL or empty input string */
	{
		/* Terminate the output, but never touch a zero-size buffer */
		if (mbstr && count > 0)
			*mbstr = 0;
		return 0;
	}

	/* Allocate memory for the maximum size wchar string that we could get. */
	wcsize = strlen(utf8str) + 1;
	if (wcsize > (size_t)-1 / sizeof(wchar_t))
		return -1;				/* Byte count would overflow size_t */
	wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);

	/* Then convert wide char string to multi-byte string */
	if (n != -1)
	{
		size_t mblen = f_wcstombs(mbstr, wcs, count);

		n = (mblen == (size_t)-1) ? -1 : (int)mblen;
	}

	LDAP_FREE(wcs);		/* Release the temporary wide string on every path */

	return n;
}
404 /*-----------------------------------------------------------------------------
405 Convert a MultiByte character to a UTF-8 character.
406 'mbsize' indicates the number of bytes of 'mbchar' to check.
407 Returns the number of bytes written to the output character.
410 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
411 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
416 if (f_mbtowc == NULL) /* If no conversion function was given... */
417 f_mbtowc = mbtowc; /* use the local ANSI C function */
419 if (mbsize == 0) /* 0 is not valid. */
422 if (mbchar == NULL || *mbchar == 0)
429 /* First convert the MB char to a Wide Char */
430 n = f_mbtowc( &wchar, mbchar, mbsize);
435 /* Convert the Wide Char to a UTF-8 character. */
436 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.

   utf8str     Output buffer; passed unchanged to ldap_x_wcs_to_utf8s().
   mbstr       NUL-terminated multibyte input; NULL is treated as an empty
               string.
   count       Capacity of utf8str in bytes.
   f_mbstowcs  Multibyte to wide string converter; NULL selects mbstowcs().

   Returns -1 on memory allocation failure or if either conversion step
   fails.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	size_t wclen;
	int n = -1;

	if (mbstr == NULL)			/* Treat NULL input string as an empty string */
		mbstr = "";

	if (f_mbstowcs == NULL)		/* If no conversion function was given... */
		f_mbstowcs = mbstowcs;	/*    use the local ANSI C function */

	/* Allocate memory for the maximum size wchar string that we could get. */
	wcsize = strlen(mbstr) + 1;
	if (wcsize > (size_t)-1 / sizeof(wchar_t))
		return -1;				/* Byte count would overflow size_t */
	wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert multi-byte string to a wide char string */
	wclen = f_mbstowcs(wcs, mbstr, wcsize);

	/* Convert wide char string to UTF-8 string */
	if (wclen != (size_t)-1)
	{
		n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
	}

	LDAP_FREE(wcs);		/* Release the temporary wide string on every path */

	return n;
}
481 #endif /* SIZEOF_WCHAR_T >= 4 */