3 * Copyright 2000-2003 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
7 /* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
8 /******************************************************************************
9 * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
11 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
12 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
13 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
14 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
15 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
16 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
17 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
18 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
19 ******************************************************************************/
20 /* Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
21 * can be found in the file "build/LICENSE-2.0.1" in this distribution
22 * of OpenLDAP Software.
26 * UTF-8 Conversion Routines
28 * These routines convert between Wide Character and UTF-8,
29 * or between MultiByte and UTF-8 encodings.
31 * Both single character and string versions of the functions are provided.
32 * All functions return -1 if the character or string cannot be converted.
38 #include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */
39 #include <ac/string.h>
40 #include <ac/time.h> /* for time_t */
44 #include <ldap_utf8.h>
/* Lead-byte payload masks, indexed by UTF-8 sequence length in bytes.
 * mask[n] keeps the data bits of the first byte of an n-byte sequence once
 * the length tag has been removed: 7 bits for n=1, 5 for n=2, 4 for n=3,
 * 3 for n=4, 2 for n=5 and 1 for n=6.  Entry 0 is only a placeholder; the
 * decoders in this file reject a sequence length of 0 before indexing. */
46 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
49 /*-----------------------------------------------------------------------------
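55 2-character UTF-8 sequence: 11 bits
110xxxxx 10xxxxxx
58 3-character UTF-8 sequence: 16 bits
59 1110xxxx 10xxxxxx 10xxxxxx
4-character UTF-8 sequence: 21 bits
62 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5-character UTF-8 sequence: 26 bits
65 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6-character UTF-8 sequence: 31 bits
68 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx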
70 Unicode address space (0 - 0x10FFFF) 21 bits
71 ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
73 Note: This code does not prevent UTF-8 sequences which are longer than
74 necessary from being decoded.
77 /*-----------------------------------------------------------------------------
78 Convert a UTF-8 character to a wide char.
79 Return the length of the UTF-8 input character in bytes.
/*
 * ldap_x_utf8_to_wc: decode the single UTF-8 character found at 'utf8char'
 * into a wide character value.
 *
 *   wchar     presumably receives the decoded value; the store itself is
 *             not among the statements visible here -- confirm.
 *   utf8char  UTF-8 input; per the comment below, a NULL pointer is handled
 *             as an empty string.
 *
 * Returns -1 when the lead byte does not yield a sequence length between 1
 * and LDAP_MAX_UTF8_LEN.
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the action taken on a bad continuation byte, the final
 * store and return).  The comments below describe only the visible lines.
 */
82 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
87 /* If input ptr is NULL, treat it as empty string. */
91 /* Get UTF-8 sequence length from 1st byte */
/* LDAP_UTF8_CHARLEN2 is defined outside this file (presumably in
 * ldap_utf8.h); its result is used below as the byte count of the
 * sequence. */
92 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
/* Besides rejecting bad lead bytes, this range check is what bounds the
 * mask[] lookup that follows -- assumes LDAP_MAX_UTF8_LEN is not larger
 * than the last index of mask[]; TODO confirm. */
94 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
95 return -1; /* Invalid input */
97 /* First byte minus length tag */
98 ch = (wchar_t)(utf8char[0] & mask[utflen]);
/* Fold in the remaining utflen-1 bytes, most significant first. */
100 for(i=1; i < utflen; i++)
102 /* Subsequent bytes must start with 10 */
/* utf8char is plain char, which may be signed; masking with 0xc0 after
 * integer promotion still isolates the top two bits of the byte. */
103 if ((utf8char[i] & 0xc0) != 0x80)
106 ch <<= 6; /* 6 bits of data in each subsequent byte */
107 ch |= (wchar_t)(utf8char[i] & 0x3f);
116 /*-----------------------------------------------------------------------------
117 Convert a UTF-8 string to a wide char string.
118 No more than 'count' wide chars will be written to the output buffer.
119 Return the size of the converted string in wide chars, excl null terminator.
/*
 * ldap_x_utf8s_to_wcs: decode a NUL-terminated UTF-8 string into wide
 * characters, one UTF-8 sequence per wide character.
 *
 *   wcstr    output buffer, or NULL.  With NULL the loop ignores 'count'
 *            and only counts the wide characters that would be needed.
 *   utf8str  UTF-8 input; per the comment below, NULL is handled as an
 *            empty string.
 *   count    capacity of 'wcstr' in wide characters.
 *
 * Returns -1 when a lead byte does not yield a sequence length between 1
 * and LDAP_MAX_UTF8_LEN.  The terminator is written only when
 * wclen < count, so an output that fills the buffer exactly is left
 * unterminated.
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the action on a bad continuation byte, the store into
 * wcstr, the final return).  Comments describe only the visible lines.
 */
122 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
129 /* If input ptr is NULL, treat it as empty string. */
133 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
134 while ( *utf8str && (wcstr==NULL || wclen<count) )
136 /* Get UTF-8 sequence length from 1st byte */
137 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
/* The range check also bounds the mask[] lookup below -- assumes
 * LDAP_MAX_UTF8_LEN is not larger than the last index of mask[];
 * TODO confirm. */
139 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
140 return -1; /* Invalid input */
142 /* First byte minus length tag */
143 ch = (wchar_t)(utf8str[0] & mask[utflen]);
/* Fold in the remaining utflen-1 bytes, most significant first. */
145 for(i=1; i < utflen; i++)
147 /* Subsequent bytes must start with 10 */
148 if ((utf8str[i] & 0xc0) != 0x80)
151 ch <<= 6; /* 6 bits of data in each subsequent byte */
152 ch |= (wchar_t)(utf8str[i] & 0x3f);
/* Each iteration consumes utflen >= 1 input bytes and accounts for
 * exactly one wide character. */
158 utf8str += utflen; /* Move to next UTF-8 character */
159 wclen++; /* Count number of wide chars stored/required */
162 /* Add null terminator if there's room in the buffer. */
163 if (wcstr && wclen < count)
170 /*-----------------------------------------------------------------------------
171 Convert one wide char to a UTF-8 character.
172 Return the length of the converted UTF-8 character in bytes.
173 No more than 'count' bytes will be written to the output buffer.
/*
 * ldap_x_wc_to_utf8: encode one wide character as a UTF-8 sequence.
 *
 *   utf8char  output buffer, or NULL to only determine the required
 *             length.
 *   wchar     value to encode.  The visible encoder treats negative values
 *             as invalid and handles values below 0x80000000.
 *   count     per the banner above the definition, the maximum number of
 *             bytes that may be written to 'utf8char'.
 *
 * Encoded length by value range, as seen in the branches below:
 *   < 0x80 -> 1 byte, < 0x800 -> 2, < 0x10000 -> 3, < 0x200000 -> 4,
 *   < 0x4000000 -> 5, < 0x80000000 -> 6.
 *
 * NOTE(review): this copy of the function is missing statements (the
 * returns of the length-only branch, the checks against 'count', the final
 * return).  Comments describe only the visible lines.
 */
176 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
180 if (utf8char == NULL) /* Just determine the required UTF-8 char length. */
/* Length-only path: the same thresholds as the encoder further down. */
188 if( wchar < 0x10000 )
190 if( wchar < 0x200000 )
192 if( wchar < 0x4000000 )
194 if( wchar < 0x80000000 )
/* NOTE(review): where wchar_t is an unsigned type this test can never be
 * true; confirm that is acceptable on the supported platforms. */
200 if ( wchar < 0 ) { /* Invalid wide character */
/* 1 byte: 0xxxxxxx */
203 } else if( wchar < 0x80 ) {
205 utf8char[len++] = (char)wchar;
/* 2 bytes: 110xxxxx 10xxxxxx */
208 } else if( wchar < 0x800 ) {
210 utf8char[len++] = 0xc0 | ( wchar >> 6 );
211 utf8char[len++] = 0x80 | ( wchar & 0x3f );
/* 3 bytes: 1110xxxx followed by two continuation bytes */
214 } else if( wchar < 0x10000 ) {
216 utf8char[len++] = 0xe0 | ( wchar >> 12 );
217 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
218 utf8char[len++] = 0x80 | ( wchar & 0x3f );
/* 4 bytes: 11110xxx followed by three continuation bytes */
221 } else if( wchar < 0x200000 ) {
223 utf8char[len++] = 0xf0 | ( wchar >> 18 );
224 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
225 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
226 utf8char[len++] = 0x80 | ( wchar & 0x3f );
/* 5 bytes: 111110xx followed by four continuation bytes */
229 } else if( wchar < 0x4000000 ) {
231 utf8char[len++] = 0xf8 | ( wchar >> 24 );
232 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
233 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
234 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
235 utf8char[len++] = 0x80 | ( wchar & 0x3f );
/* 6 bytes: 1111110x followed by five continuation bytes */
238 } else if( wchar < 0x80000000 ) {
240 utf8char[len++] = 0xfc | ( wchar >> 30 );
241 utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
242 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
243 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
244 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
245 utf8char[len++] = 0x80 | ( wchar & 0x3f );
256 /*-----------------------------------------------------------------------------
257 Convert a wide char string to a UTF-8 string.
258 No more than 'count' bytes will be written to the output buffer.
259 Return the # of bytes written to the output buffer, excl null terminator.
/*
 * ldap_x_wcs_to_utf8s: encode a wide character string as UTF-8 by calling
 * ldap_x_wc_to_utf8() once per wide character.
 *
 *   utf8str  output buffer, or NULL to only compute the size of the
 *            output (excluding the terminator).
 *   wcstr    wide character input; NULL is treated as an empty string.
 *   count    capacity of 'utf8str' in bytes.
 *
 * On the conversion path the result is p - utf8str, the number of bytes
 * written excluding the terminator.  When the last character does not
 * fit, the remainder of the buffer is padded with nulls so that the
 * result equals the original 'count'.
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, loop headers, the padding and terminator stores, the
 * error return).  Comments describe only the visible lines.
 */
262 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
267 wchar_t empty = 0; /* To avoid use of L"" construct */
269 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
272 if (utf8str == NULL) /* Just compute size of output, excl null */
276 /* Get UTF-8 size of next wide char */
/* A NULL output pointer makes ldap_x_wc_to_utf8() report the encoded
 * length only. */
277 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
287 /* Do the actual conversion. */
289 n = 1; /* In case of empty wcstr */
/* 'count' is the space still free, so the encoder is asked not to write
 * past the end of the buffer. */
292 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
294 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
298 count -= n; /* Space left in output buffer */
301 /* If not enough room for last character, pad remainder with null
302 so that return value = original count, indicating buffer full. */
309 /* Add a null terminator if there's room. */
313 if (n == -1) /* Conversion encountered invalid wide char. */
316 /* Return the number of bytes written to output buffer, excl null. */
/* presumably p started at utf8str and advanced past each byte written;
 * its declaration and updates are not visible here -- confirm. */
317 return (p - utf8str);
321 /*-----------------------------------------------------------------------------
322 Convert a UTF-8 character to a MultiByte character.
323 Return the size of the converted character in bytes.
/*
 * ldap_x_utf8_to_mb: convert one UTF-8 character to a multibyte character
 * by first decoding it to a wide character and then handing that to a
 * wctomb-style function.
 *
 *   mbchar    output buffer for the multibyte character.
 *   utf8char  UTF-8 input character.
 *   f_wctomb  conversion callback; NULL selects the C library wctomb().
 *
 * Returns -1 for an invalid UTF-8 character; otherwise 'n' holds the
 * result of the f_wctomb call.
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the tests that select between the two f_wctomb calls, the
 * final return).  Comments describe only the visible lines.
 */
326 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
327 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
/* NOTE(review): wctomb() may store up to MB_CUR_MAX bytes; confirm that 6
 * covers every locale and callback in use (MB_LEN_MAX is the portable
 * upper bound). */
331 char tmp[6]; /* Large enough for biggest multibyte char */
333 if (f_wctomb == NULL) /* If no conversion function was given... */
334 f_wctomb = wctomb; /* use the local ANSI C function */
336 /* First convert UTF-8 char to a wide char */
337 n = ldap_x_utf8_to_wc( &wchar, utf8char);
340 return -1; /* Invalid UTF-8 character */
/* One call converts into the scratch buffer and the other into the
 * caller's buffer; presumably the scratch buffer is used when mbchar is
 * NULL so that only the size is obtained -- the selecting condition is
 * not visible here, confirm. */
343 n = f_wctomb( tmp, wchar );
345 n = f_wctomb( mbchar, wchar);
350 /*-----------------------------------------------------------------------------
351 Convert a UTF-8 string to a MultiByte string.
352 No more than 'count' bytes will be written to the output buffer.
353 Return the size of the converted string in bytes, excl null terminator.
/*
 * ldap_x_utf8s_to_mbs: convert a UTF-8 string to a multibyte string by
 * decoding it into a temporary wide character buffer and passing that to
 * a wcstombs-style function.
 *
 *   mbstr       output buffer for the multibyte string.
 *   utf8str     UTF-8 input; NULL and the empty string take a separate
 *               early path.
 *   count       passed to f_wcstombs as its output limit.
 *   f_wcstombs  conversion callback; NULL selects the C library
 *               wcstombs().
 *
 * Returns -1 when the temporary buffer cannot be allocated.
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the body of the empty-input path, the test on the
 * intermediate result, the release of the buffer, the final return).
 * Comments describe only the visible lines.
 */
356 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
357 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
363 if (f_wcstombs == NULL) /* If no conversion function was given... */
364 f_wcstombs = wcstombs; /* use the local ANSI C function */
366 if (utf8str == NULL || *utf8str == 0) /* NULL or empty input string */
373 /* Allocate memory for the maximum size wchar string that we could get. */
/* Every wide character consumes at least one input byte, so the byte
 * length plus one for the terminator is an upper bound on the number of
 * wide characters. */
374 wcsize = strlen(utf8str) + 1;
375 wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
377 return -1; /* Memory allocation failure. */
379 /* First convert the UTF-8 string to a wide char string */
380 n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
382 /* Then convert wide char string to multi-byte string */
/* NOTE(review): the release of 'wcs' is not visible here; confirm that
 * every exit path after the allocation frees it. */
385 n = f_wcstombs(mbstr, wcs, count);
393 /*-----------------------------------------------------------------------------
394 Convert a MultiByte character to a UTF-8 character.
395 'mbsize' indicates the number of bytes of 'mbchar' to check.
396 Returns the number of bytes written to the output character.
/*
 * ldap_x_mb_to_utf8: convert one multibyte character to UTF-8 by first
 * turning it into a wide character with an mbtowc-style function and then
 * encoding that with ldap_x_wc_to_utf8().
 *
 *   utf8char  output buffer.  The encoder is called with a limit of
 *             LDAP_MAX_UTF8_LEN, so a non-NULL buffer must have room for
 *             that many bytes.
 *   mbchar    multibyte input; NULL and the empty string take a separate
 *             early path.
 *   mbsize    number of bytes of 'mbchar' to examine; 0 is rejected.
 *   f_mbtowc  conversion callback; NULL selects the C library mbtowc().
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the bodies of the early paths, the test on the f_mbtowc
 * result, the final return).  Comments describe only the visible lines.
 */
399 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
400 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
405 if (f_mbtowc == NULL) /* If no conversion function was given... */
406 f_mbtowc = mbtowc; /* use the local ANSI C function */
408 if (mbsize == 0) /* 0 is not valid. */
411 if (mbchar == NULL || *mbchar == 0)
418 /* First convert the MB char to a Wide Char */
419 n = f_mbtowc( &wchar, mbchar, mbsize);
424 /* Convert the Wide Char to a UTF-8 character. */
425 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
431 /*-----------------------------------------------------------------------------
432 Convert a MultiByte string to a UTF-8 string.
433 No more than 'count' bytes will be written to the output buffer.
434 Return the size of the converted string in bytes, excl null terminator.
/*
 * ldap_x_mbs_to_utf8s: convert a multibyte string to UTF-8 by expanding
 * it into a temporary wide character buffer with an mbstowcs-style
 * function and then encoding that with ldap_x_wcs_to_utf8s().
 *
 *   utf8str     output buffer, passed through to ldap_x_wcs_to_utf8s().
 *   mbstr       multibyte input; NULL is treated as an empty string.
 *   count       capacity of 'utf8str' in bytes, passed through to
 *               ldap_x_wcs_to_utf8s().
 *   f_mbstowcs  conversion callback; NULL selects the C library
 *               mbstowcs().
 *
 * NOTE(review): this copy of the function is missing statements (local
 * declarations, the check of the allocation, the test on the f_mbstowcs
 * result, the release of the buffer, the final return).  Comments
 * describe only the visible lines.
 */
437 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
438 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
444 if (mbstr == NULL) /* Treat NULL input string as an empty string */
447 if (f_mbstowcs == NULL) /* If no conversion function was given... */
448 f_mbstowcs = mbstowcs; /* use the local ANSI C function */
450 /* Allocate memory for the maximum size wchar string that we could get. */
/* A multibyte character occupies at least one byte, so the byte length
 * plus one for the terminator bounds the number of wide characters. */
451 wcsize = strlen(mbstr) + 1;
/* NOTE(review): neither the check of this allocation nor the release of
 * 'wcs' is visible here; confirm both exist. */
452 wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
456 /* First convert multi-byte string to a wide char string */
457 n = f_mbstowcs(wcs, mbstr, wcsize);
459 /* Convert wide char string to UTF-8 string */
462 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);