3 * Copyright 2000-2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
7 /* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
8 /******************************************************************************
9 * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
11 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
12 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
13 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
14 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
15 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
16 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
17 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
18 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
19 ******************************************************************************/
22 * UTF-8 Conversion Routines
24 * These routines convert between Wide Character and UTF-8,
25 * or between MultiByte and UTF-8 encodings.
27 * Both single character and string versions of the functions are provided.
28 * All functions return -1 if the character or string cannot be converted.
#include <limits.h>				/* For MB_LEN_MAX */

#include <ac/stdlib.h>				/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>				/* for time_t */

#include <ldap_utf8.h>
/* Mask selecting the data bits of the first byte of a UTF-8 sequence,
 * indexed by the sequence length in bytes (1-6).  Entry 0 is unused. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
/*-----------------------------------------------------------------------------
   UTF-8 Format Summary

ASCII chars                         7 bits
    0xxxxxxx

2-character UTF-8 sequence:        11 bits
    110xxxxx  10xxxxxx

3-character UTF-8                  16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-char UTF-8                       21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-char UTF-8                       26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-char UTF-8                       31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)         21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)       31 bits

Note: This code does not prevent UTF-8 sequences which are longer than
      necessary from being decoded.
*/
73 /*-----------------------------------------------------------------------------
74 Convert a UTF-8 character to a wide char.
75 Return the length of the UTF-8 input character in bytes.
78 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
83 /* If input ptr is NULL, treat it as empty string. */
87 /* Get UTF-8 sequence length from 1st byte */
88 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
90 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
91 return -1; /* Invalid input */
93 /* First byte minus length tag */
94 ch = (wchar_t)(utf8char[0] & mask[utflen]);
96 for(i=1; i < utflen; i++)
98 /* Subsequent bytes must start with 10 */
99 if ((utf8char[i] & 0xc0) != 0x80)
102 ch <<= 6; /* 6 bits of data in each subsequent byte */
103 ch |= (wchar_t)(utf8char[i] & 0x3f);
112 /*-----------------------------------------------------------------------------
113 Convert a UTF-8 string to a wide char string.
114 No more than 'count' wide chars will be written to the output buffer.
115 Return the size of the converted string in wide chars, excl null terminator.
118 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
125 /* If input ptr is NULL, treat it as empty string. */
129 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
130 while ( *utf8str && (wcstr==NULL || wclen<count) )
132 /* Get UTF-8 sequence length from 1st byte */
133 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
135 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
136 return -1; /* Invalid input */
138 /* First byte minus length tag */
139 ch = (wchar_t)(utf8str[0] & mask[utflen]);
141 for(i=1; i < utflen; i++)
143 /* Subsequent bytes must start with 10 */
144 if ((utf8str[i] & 0xc0) != 0x80)
147 ch <<= 6; /* 6 bits of data in each subsequent byte */
148 ch |= (wchar_t)(utf8str[i] & 0x3f);
154 utf8str += utflen; /* Move to next UTF-8 character */
155 wclen++; /* Count number of wide chars stored/required */
158 /* Add null terminator if there's room in the buffer. */
159 if (wcstr && wclen < count)
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.
   Return the length of the converted UTF-8 character in bytes.
   No more than 'count' bytes will be written to the output buffer.

   utf8char  output buffer; if NULL, only the required length is computed
             and 'count' is ignored.
   wchar     wide character to encode (0 - 0x7FFFFFFF).
   count     capacity of utf8char in bytes.

   Returns -1 if 'wchar' is negative or does not fit in 31 bits, and 0 if
   the encoded character does not fit in 'count' bytes (nothing is written).
   No null terminator is appended.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag placed in the first byte, indexed by sequence length. */
	static const unsigned char lead[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	unsigned char *out = (unsigned char *)utf8char;
	unsigned long ch;
	int len, i;

	if( wchar < 0 )					/* Invalid wide character */
		return -1;

	ch = (unsigned long)wchar;

	/* Determine the required UTF-8 char length. */
	if( ch < 0x80UL )
		len = 1;
	else if( ch < 0x800UL )
		len = 2;
	else if( ch < 0x10000UL )
		len = 3;
	else if( ch < 0x200000UL )
		len = 4;
	else if( ch < 0x4000000UL )
		len = 5;
	else if( ch < 0x80000000UL )
		len = 6;
	else
		return -1;

	if (out == NULL)				/* Caller only wants the length. */
		return len;

	if (count < (size_t)len)		/* Won't fit: write nothing. */
		return 0;

	/* Fill the continuation bytes from the last one backwards, taking
	 * 6 bits of data for each; what remains goes into the first byte. */
	for(i = len - 1; i > 0; i--)
	{
		out[i] = (unsigned char)(0x80 | (ch & 0x3f));
		ch >>= 6;
	}
	out[0] = (unsigned char)(lead[len] | ch);

	return len;
}
252 /*-----------------------------------------------------------------------------
253 Convert a wide char string to a UTF-8 string.
254 No more than 'count' bytes will be written to the output buffer.
255 Return the # of bytes written to the output buffer, excl null terminator.
258 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
263 wchar_t empty = 0; /* To avoid use of L"" construct */
265 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
268 if (utf8str == NULL) /* Just compute size of output, excl null */
272 /* Get UTF-8 size of next wide char */
273 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
283 /* Do the actual conversion. */
285 n = 1; /* In case of empty wcstr */
288 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
290 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
294 count -= n; /* Space left in output buffer */
297 /* If not enough room for last character, pad remainder with null
298 so that return value = original count, indicating buffer full. */
305 /* Add a null terminator if there's room. */
309 if (n == -1) /* Conversion encountered invalid wide char. */
312 /* Return the number of bytes written to output buffer, excl null. */
313 return (p - utf8str);
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.
   Return the size of the converted character in bytes, or -1 if the UTF-8
   input is invalid.  Otherwise the return value is whatever the conversion
   function returns.

   mbchar    output buffer; if NULL, the character is converted into a
             scratch buffer so that only its size is reported.
   utf8char  first byte of the UTF-8 sequence to convert.
   f_wctomb  wide char to multibyte conversion function with the same
             contract as wctomb(); NULL selects wctomb().
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch space for one multibyte character.  MB_LEN_MAX is the
	 * platform's upper bound; never use less than the historical 6. */
	char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;				/* Invalid UTF-8 character */

	/* Then convert the wide char to a multibyte char */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.
   Return -1 if the input is not valid UTF-8, if the conversion function
   reports an error, or if memory cannot be allocated.

   mbstr       output buffer, passed through to the conversion function.
   utf8str     null-terminated UTF-8 input; NULL or "" yields an empty
               result.
   count       capacity of mbstr in bytes.
   f_wcstombs  wide string to multibyte conversion function with the same
               contract as wcstombs(); NULL selects wcstombs().
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	size_t mblen;
	int n;

	if (f_wcstombs == NULL)		/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/*    use the local ANSI C function */

	if (utf8str == NULL || *utf8str == 0)	/* NULL or empty input string */
	{
		/* Only terminate the output if it has room for the terminator. */
		if (mbstr && count)
			*mbstr = 0;
		return 0;
	}

	/* Allocate memory for the maximum size wchar string that we could get.
	 * Each wide char consumes at least one input byte, so strlen()+1
	 * always leaves room for the terminator. */
	wcsize = strlen(utf8str) + 1;
	wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);

	/* Then convert wide char string to multi-byte string */
	if (n != -1)
	{
		mblen = f_wcstombs(mbstr, wcs, count);
		n = (mblen == (size_t)-1) ? -1 : (int)mblen;
	}

	LDAP_FREE(wcs);

	return n;
}
389 /*-----------------------------------------------------------------------------
390 Convert a MultiByte character to a UTF-8 character.
391 'mbsize' indicates the number of bytes of 'mbchar' to check.
392 Returns the number of bytes written to the output character.
395 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
396 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
401 if (f_mbtowc == NULL) /* If no conversion function was given... */
402 f_mbtowc = mbtowc; /* use the local ANSI C function */
404 if (mbsize == 0) /* 0 is not valid. */
407 if (mbchar == NULL || *mbchar == 0)
414 /* First convert the MB char to a Wide Char */
415 n = f_mbtowc( &wchar, mbchar, mbsize);
420 /* Convert the Wide Char to a UTF-8 character. */
421 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.
   Return -1 if the conversion function reports an error, if a character
   cannot be encoded as UTF-8, or if memory cannot be allocated.

   utf8str     output buffer, passed through to ldap_x_wcs_to_utf8s().
   mbstr       null-terminated multibyte input; NULL is treated as "".
   count       capacity of utf8str in bytes.
   f_mbstowcs  multibyte to wide string conversion function with the same
               contract as mbstowcs(); NULL selects mbstowcs().
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	size_t wclen;
	int n;

	if (mbstr == NULL)			/* Treat NULL input string as an empty string */
		mbstr = "";

	if (f_mbstowcs == NULL)		/* If no conversion function was given... */
		f_mbstowcs = mbstowcs;	/*    use the local ANSI C function */

	/* Allocate memory for the maximum size wchar string that we could get. */
	wcsize = strlen(mbstr) + 1;
	wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert multi-byte string to a wide char string */
	wclen = f_mbstowcs(wcs, mbstr, wcsize);
	n = (wclen == (size_t)-1) ? -1 : (int)wclen;

	/* Convert wide char string to UTF-8 string */
	if (n != -1)
	{
		n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
	}

	LDAP_FREE(wcs);

	return n;
}