--- /dev/null
+ Dec 5, 2000
+ Dave Steck
+ Novell, Inc.
+
+ UTF-8 Conversion Functions
+
+
+1. Strings in the LDAP C SDK should be encoded in UTF-8 format.
+ However, most platforms do not provide APIs for converting to
+ this format. If they do, they are platform-specific.
+
+ As a result, most applications (knowingly or not) use local strings
+ with LDAP functions. This works fine for 7-bit ASCII characters,
+ but will fail with 8-bit European characters, Asian characters, etc.
+
+ We propose adding the following platform-independent conversion functions
+ to the OpenLDAP SDK. There are 4 functions for converting between UTF-8
+ and wide characters, and 4 functions for converting between UTF-8 and
+ multibyte characters.
+
+ For multibyte to UTF-8 conversions, charset translation is necessary.
+ While a full charset translator is not practical or appropriate for the
+ LDAP SDK, we can pass the translator function in as an argument.
+ A NULL for this argument will use the ANSI C functions mbtowc, mbstowcs,
+ wctomb, and wcstombs.
+
+2. UTF-8 <--> Wide Character conversions
+
+The following new conversion routines will be added, following the pattern of
+the ANSI C conversion routines (mbtowc, mbstowcs, etc). These routines use
+the wchar_t type. wchar_t is 2 bytes on some systems and 4 bytes on others.
+However the advantage of using wchar_t is that all the standard wide character
+string functions may be used on these strings: wcslen, wcscpy, etc.
+
+ int ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
+ int ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
+ int ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
+ int ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
+
+
+2.1 ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
+
+int ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
+
+ wchar (OUT) Points to a wide character code to receive the
+ converted character.
+
+ utf8char (IN) Address of the UTF8 sequence of bytes.
+
+Return Value:
+ If successful, the function returns the length in
+ bytes of the UTF-8 input character.
+
+ If utf8char is NULL or points to an empty string, the
+ function returns 1 and a NULL is written to wchar.
+
+ If utf8char contains an invalid UTF-8 sequence -1 is returned.
+
+
+2.2 ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
+
+int ldap_x_utf8s_to_wcs (wchar_t *wcstr, const char *utf8str, size_t count)
+
+ wcstr (OUT) Points to a wide char buffer to receive the
+ converted wide char string. The output string will be
+ null terminated if there is space for it in the
+ buffer.
+
+ utf8str (IN) Address of the null-terminated UTF-8 string to convert.
+
+ count (IN) The number of UTF-8 characters to convert, or
+ equivalently, the size of the output buffer in wide
+ characters.
+
+Return Value:
+ If successful, the function returns the number of wide
+ characters written to wcstr, excluding the null termination
+ character, if any.
+
+ If wcstr is NULL, the function returns the number of wide
+ characters required to contain the converted string,
+ excluding the null termination character.
+
+ If an invalid UTF-8 sequence is encountered, the
+ function returns -1.
+
+ If the return value equals count, there was not enough space to fit the
+ string and the null terminator in the buffer.
+
+
+2.3 ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
+
+int ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
+
+ utf8char (OUT) Points to a byte array to receive the converted UTF-8
+ string.
+
+ wchar (IN) The wide character to convert.
+
+ count (IN) The maximum number of bytes to write to the output
+ buffer. Normally set this to LDAP_MAX_UTF8_LEN, which
+ is defined as 3 or 6 depending on the size of wchar_t.
+ A partial character will not be written.
+
+Return Value:
+ If successful, the function returns the length in bytes of
+ the converted UTF-8 output character.
+
+ If wchar is the null wide character (0), the function returns 1
+ and a null byte is written to utf8char.
+
+ If wchar cannot be converted to a UTF-8 character, the
+ function returns -1.
+
+
+2.4 ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
+
+int ldap_x_wcs_to_utf8s (char *utf8str, const wchar_t *wcstr, size_t count)
+
+ utf8str (OUT) Points to a byte array to receive the converted
+ UTF-8 string. The output string will be null
+ terminated if there is space for it in the
+ buffer.
+
+
+ wcstr (IN) Address of the null-terminated wide char string to convert.
+
+ count (IN) The size of the output buffer in bytes.
+
+Return Value:
+ If successful, the function returns the number of bytes
+ written to utf8str, excluding the null termination
+ character, if any.
+
+ If utf8str is NULL, the function returns the number of
+ bytes required to contain the converted string, excluding
+ the null termination character. The 'count' parameter is ignored.
+
+ If the function encounters a wide character that cannot
+ be mapped to a UTF-8 sequence, the function returns -1.
+
+ If the return value equals count, there was not enough space to fit
+ the string and the null terminator in the buffer.
+
+
+
+3. Multi-byte <--> UTF-8 Conversions
+
+These functions convert the string in a two-step process, from multibyte
+to Wide, then from Wide to UTF8, or vice versa. This conversion requires a
+charset translation routine, which is passed in as an argument.
+
+ ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
+ ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
+ ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
+ ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
+
+3.1 ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
+
+int ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
+
+ utf8char (OUT) Points to a byte buffer to receive the converted
+ UTF-8 character. May be NULL. The output is not
+ null-terminated.
+
+ mbchar (IN) Address of a sequence of bytes forming a multibyte character.
+
+ mbsize (IN) The maximum number of bytes of the mbchar argument to
+ check. This should normally be MB_CUR_MAX.
+
+ f_mbtowc (IN) The function to use for converting a multibyte
+ character to a wide character. If NULL, the local
+ ANSI C routine mbtowc is used.
+
+Return Value:
+ If successful, the function returns the length in bytes of
+ the UTF-8 output character.
+
+ If utf8char is NULL, count is ignored and the function
+ returns the number of bytes that would be written to the
+ output char.
+
+ If count is zero, 0 is returned and nothing is written to
+ utf8char.
+
+ If mbchar is NULL or points to an empty string, the
+ function returns 1 and a null byte is written to utf8char.
+
+ If mbchar contains an invalid multi-byte character, -1 is returned.
+
+
+3.2 ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
+
+int ldap_x_mbs_to_utf8s (char *utf8str, const char *mbstr, size_t count,
+ size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count))
+
+utf8str (OUT) Points to a buffer to receive the converted UTF-8 string.
+ May be NULL.
+
+ mbstr (IN) Address of the null-terminated multi-byte input string.
+
+ count (IN) The size of the output buffer in bytes.
+
+ f_mbstowcs (IN) The function to use for converting a multibyte string
+ to a wide character string. If NULL, the local ANSI
+ C routine mbstowcs is used.
+
+Return Value:
+ If successful, the function returns the length in
+ bytes of the UTF-8 output string, excluding the null
+ terminator, if present.
+
+ If utf8str is NULL, count is ignored and the function
+ returns the number of bytes required for the output string,
+ excluding the NULL.
+
+ If count is zero, 0 is returned and nothing is written to utf8str.
+
+ If mbstr is NULL or points to an empty string, the
+ function returns 0 and a null byte is written to utf8str.
+
+ If mbstr contains an invalid multi-byte character, -1 is returned.
+
+ If the returned value is equal to count, the entire null-terminated
+ string would not fit in the output buffer.
+
+
+3.3 ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
+
+int ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+ int (*f_wctomb)(char *mbchar, wchar_t wchar) )
+
+mbchar (OUT) Points to a byte buffer to receive the converted multi-byte
+ character. May be NULL.
+
+ utf8char (IN) Address of the UTF-8 character sequence.
+
+ f_wctomb (IN) The function to use for converting a wide character
+ to a multibyte character. If NULL, the local
+ ANSI C routine wctomb is used.
+
+
+Return Value:
+ If successful, the function returns the length in
+ bytes of the multi-byte output character.
+
+ If utf8char is NULL or points to an empty string, the
+ function returns 1 and a null byte is written to mbchar.
+
+ If utf8char contains an invalid UTF-8 sequence, -1 is returned.
+
+
+3.4 ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
+
+
+int ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
+ size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
+
+ mbstr (OUT) Points to a byte buffer to receive the converted
+ multi-byte string. May be NULL.
+
+ utf8str (IN) Address of the null-terminated UTF-8 string to convert.
+
+ count (IN) The size of the output buffer in bytes.
+
+ f_wcstombs (IN) The function to use for converting a wide character
+ string to a multibyte string. If NULL, the local
+ ANSI C routine wcstombs is used.
+
+Return Value:
+ If successful, the function returns the number of bytes
+ written to mbstr, excluding the null termination
+ character, if any.
+
+ If mbstr is NULL, count is ignored and the function
+ returns the number of bytes required for the output string,
+ excluding the NULL.
+
+ If count is zero, 0 is returned and nothing is written to
+ mbstr.
+
+ If utf8str is NULL or points to an empty string, the
+ function returns 0 and a null byte is written to mbstr.
+
+ If an invalid UTF-8 character is encountered, the
+ function returns -1.
+
+The output string will be null terminated if there is space for it in
+the output buffer.
+
+
--- /dev/null
+/* $OpenLDAP$ */
+/* $Novell: /ldap/src/cldap/include/ldap_utf8.h,v 1.3 2000/12/04 20:23:20 dsteck Exp $ */
+/*
+ * Copyright 2000 The OpenLDAP Foundation, Redwood City, California, USA
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are permitted only
+ * as authorized by the OpenLDAP Public License. A copy of this
+ * license is available at http://www.OpenLDAP.org/license.html or
+ * in file LICENSE in the top-level directory of the distribution.
+ */
+/******************************************************************************
+ * This notice applies to changes, created by or for Novell, Inc.,
+ * to preexisting works for which notices appear elsewhere in this file.
+ *
+ * Copyright (C) 2000 Novell, Inc. All Rights Reserved.
+ *
+ * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES.
+ * USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION
+ * 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT
+ * HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE
+ * TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS
+ * WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC
+ * LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE
+ * PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
+ ******************************************************************************/
+
+#ifndef _LDAP_UTF8_H
+#define _LDAP_UTF8_H
+
+LDAP_BEGIN_DECL
+
+/*
+ * UTF-8 Utility Routines (in utf-8.c)
+ */
+
+#define LDAP_UCS4_INVALID (0x80000000U)
+
+/*
+ * LDAP_MAX_UTF8_LEN is the longest UTF-8 sequence, in bytes, that is needed
+ * to encode one wchar_t: 3 when wchar_t is 2 bytes wide, 6 when it is 4.
+ *
+ * The expansion is fully parenthesized so the macro behaves as a single
+ * operand wherever it is used (e.g. "n % LDAP_MAX_UTF8_LEN").
+ */
+#define LDAP_MAX_UTF8_LEN ( sizeof(wchar_t) * 3 / 2 )
+
+
+/*
+ * UTF-8 Conversion Routines. (in utf-8-conv.c)
+ */
+
+/*
+ * UTF-8 character to Wide Char.
+ * Returns the length in bytes of the UTF-8 input character, or -1 if the
+ * input is not a valid UTF-8 sequence.  wchar may be NULL, in which case
+ * nothing is stored.  A NULL utf8char is treated as an empty string.
+ */
+LDAP_F(int)
+ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char );
+
+/*
+ * UTF-8 string to Wide Char string.
+ * At most count wide chars are written to wcstr; a terminating 0 is added
+ * only if there is room for it.  Returns the number of wide chars produced
+ * (excluding the terminator), or -1 on an invalid UTF-8 sequence.
+ * If wcstr is NULL, count is ignored and only the length is computed.
+ */
+LDAP_F(int)
+ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count );
+
+/*
+ * Wide Char to UTF-8 character.
+ * At most count bytes are written to utf8char; a partial character is never
+ * written.  Returns the length in bytes of the UTF-8 character, 0 if it does
+ * not fit in count bytes, or -1 if wchar cannot be encoded.
+ * If utf8char is NULL, count is ignored and only the length is returned.
+ */
+LDAP_F(int)
+ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count );
+
+/*
+ * Wide Char string to UTF-8 string.
+ * At most count bytes are written to utf8str; a terminating null byte is
+ * added only if there is room for it.  Returns the number of bytes written
+ * (excluding the terminator), or -1 if a wide char cannot be encoded.
+ * If utf8str is NULL, only the required byte count is computed.
+ */
+LDAP_F(int)
+ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count );
+
+
+/*
+ * UTF-8 character to MultiByte character.
+ * f_wctomb performs the wide-char to multibyte step; NULL selects the
+ * ANSI C wctomb.  Returns the result of that step, or -1 if utf8char is
+ * not a valid UTF-8 sequence.  mbchar may be NULL.
+ */
+LDAP_F(int)
+ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+ int (*f_wctomb)(char *mbchar, wchar_t wchar) );
+
+/*
+ * UTF-8 string to MultiByte string.
+ * f_wcstombs performs the wide-string to multibyte step; NULL selects the
+ * ANSI C wcstombs.  count is the size of mbstr in bytes.  Returns the
+ * result of that step, or -1 on invalid UTF-8 or allocation failure.
+ */
+LDAP_F(int)
+ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
+ size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) );
+
+/*
+ * MultiByte character to UTF-8 character.
+ * f_mbtowc performs the multibyte to wide-char step; NULL selects the
+ * ANSI C mbtowc.  mbsize is the maximum number of bytes of mbchar to
+ * examine and must not be 0.  Returns the length in bytes of the UTF-8
+ * character, or -1 on failure.  utf8char may be NULL.
+ */
+LDAP_F(int)
+ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
+ int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) );
+
+/*
+ * MultiByte string to UTF-8 string.
+ * f_mbstowcs performs the multibyte to wide-string step; NULL selects the
+ * ANSI C mbstowcs.  count is the size of utf8str in bytes.  Returns the
+ * number of UTF-8 bytes produced (excluding any terminator), or -1 on an
+ * invalid multibyte character or allocation failure.
+ */
+LDAP_F(int)
+ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
+ size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) );
+
+LDAP_END_DECL
+
+#endif /* _LDAP_UTF8_H */
request.c os-ip.c url.c sortctrl.c vlvctrl.c \
init.c options.c print.c string.c util-int.c schema.c \
charray.c tls.c dn.c os-local.c dnssrv.c \
- utf-8.c
+ utf-8.c utf-8-conv.c
OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
controls.lo messages.lo references.lo extended.lo cyrus.lo \
modify.lo add.lo modrdn.lo delete.lo abandon.lo ufn.lo cache.lo \
request.lo os-ip.lo url.lo sortctrl.lo vlvctrl.lo \
init.lo options.lo print.lo string.lo util-int.lo schema.lo \
charray.lo tls.lo dn.lo os-local.lo dnssrv.lo \
- utf-8.lo
+ utf-8.lo utf-8-conv.lo
LDAP_INCDIR= ../../include
LDAP_LIBDIR= ../../libraries
--- /dev/null
+/* $OpenLDAP$ */
+/*
+ * Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
+ * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
+ */
+
+/* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
+/******************************************************************************
+ * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
+ *
+ * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
+ * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
+ * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
+ * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
+ * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
+ * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
+ * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
+ * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
+ ******************************************************************************/
+
+/*
+ * UTF-8 Conversion Routines
+ *
+ * These routines convert between Wide Character and UTF-8,
+ * or between MultiByte and UTF-8 encodings.
+ *
+ * Both single character and string versions of the functions are provided.
+ * All functions return -1 if the character or string cannot be converted.
+ */
+
+#include "portable.h"
+
+#include <stdio.h>
+#include <limits.h> /* For MB_LEN_MAX */
+#include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */
+#include <ac/string.h>
+#include <ac/time.h> /* for time_t */
+
+#include "ldap-int.h"
+
+#include <ldap_utf8.h>
+
+/* Data-bit mask for the first byte of a UTF-8 sequence, indexed by the
+   sequence length in bytes (1-6).  Index 0 is a placeholder: callers reject
+   a length of 0 before indexing.  The table is never modified, so it is
+   declared const. */
+static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+
+
+/*-----------------------------------------------------------------------------
+ UTF-8 Format Summary
+
+ASCII chars 7 bits
+ 0xxxxxxx
+
+2-character UTF-8 sequence: 11 bits
+ 110xxxxx 10xxxxxx
+
+3-character UTF-8 16 bits
+ 1110xxxx 10xxxxxx 10xxxxxx
+
+4-char UTF-8 21 bits
+ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+5-char UTF-8 26 bits
+ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+6-char UTF-8 31 bits
+ 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+Unicode address space (0 - 0x10FFFF) 21 bits
+ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
+
+Note: This code does not prevent UTF-8 sequences which are longer than
+ necessary from being decoded.
+*/
+
+/*------------------------------------------------------------------------------
+   Decode the single UTF-8 character at 'utf8char' into a wide character.
+
+   A NULL 'utf8char' is handled as if it were the empty string.
+   'wchar' may be NULL; the sequence is then checked but nothing is stored.
+
+   Returns the number of input bytes forming the character, or -1 when the
+   sequence length is 0 or exceeds LDAP_MAX_UTF8_LEN, or when a continuation
+   byte is not of the form 10xxxxxx.
+*/
+int
+ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
+{
+    const char *src = (utf8char != NULL) ? utf8char : "";
+    int nbytes;
+    int pos;
+    wchar_t value;
+
+    /* The first byte encodes the total sequence length. */
+    nbytes = UTF8_CHARLEN(src);
+    if ( nbytes == 0 || nbytes > LDAP_MAX_UTF8_LEN ) {
+        return -1;
+    }
+
+    /* Strip the length tag from the first byte to get its data bits. */
+    value = (wchar_t)(src[0] & mask[nbytes]);
+
+    /* Each continuation byte contributes six more data bits. */
+    pos = 1;
+    while ( pos < nbytes ) {
+        if ( (src[pos] & 0xc0) != 0x80 ) {
+            return -1;
+        }
+        value = (value << 6) | (wchar_t)(src[pos] & 0x3f);
+        pos++;
+    }
+
+    if ( wchar != NULL ) {
+        *wchar = value;
+    }
+
+    return nbytes;
+}
+
+/*-----------------------------------------------------------------------------
+   Decode a null-terminated UTF-8 string into wide characters.
+
+   When 'wcstr' is non-NULL, at most 'count' wide chars are stored and a
+   terminating 0 is appended only if fewer than 'count' were stored.
+   When 'wcstr' is NULL, 'count' is ignored and the string is only measured.
+   A NULL 'utf8str' is handled as an empty string.
+
+   Returns the number of wide chars stored/required, not counting the
+   terminator, or -1 on an invalid UTF-8 sequence.
+*/
+int
+ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
+{
+    const char *src = (utf8str != NULL) ? utf8str : "";
+    size_t nwide;
+
+    for ( nwide = 0; *src != '\0'; nwide++ ) {
+        int seqlen;
+        int k;
+        wchar_t value;
+
+        /* 'count' limits the loop only when there is an output buffer. */
+        if ( wcstr != NULL && nwide >= count ) {
+            break;
+        }
+
+        /* The first byte encodes the total sequence length. */
+        seqlen = UTF8_CHARLEN(src);
+        if ( seqlen == 0 || seqlen > LDAP_MAX_UTF8_LEN ) {
+            return -1;
+        }
+
+        /* Data bits of the first byte, then six bits per continuation. */
+        value = (wchar_t)(src[0] & mask[seqlen]);
+        for ( k = 1; k < seqlen; k++ ) {
+            if ( (src[k] & 0xc0) != 0x80 ) {
+                return -1;
+            }
+            value <<= 6;
+            value |= (wchar_t)(src[k] & 0x3f);
+        }
+
+        if ( wcstr != NULL ) {
+            wcstr[nwide] = value;
+        }
+
+        src += seqlen;
+    }
+
+    /* Terminate the output only if the buffer still has a free slot. */
+    if ( wcstr != NULL && nwide < count ) {
+        wcstr[nwide] = 0;
+    }
+
+    return nwide;
+}
+
+
+/*------------------------------------------------------------------------------
+   Encode one wide character as a UTF-8 sequence.
+
+   If 'utf8char' is NULL, 'count' is ignored and only the encoded length is
+   reported.  Otherwise the sequence is written only when it fits entirely
+   within 'count' bytes; a partial character is never written.
+
+   Returns the encoded length in bytes (1-6), 0 if the character does not
+   fit in 'count' bytes, or -1 if 'wchar' is negative or too large to encode.
+*/
+int
+ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
+{
+    /* Length tag for the first byte, indexed by sequence length. */
+    static const unsigned char lead[] = { 0, 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
+    int need;       /* bytes required to encode wchar, or -1 */
+    int shift;
+    int out = 0;
+
+    if ( wchar < 0 )
+        need = -1;
+    else if ( wchar < 0x80 )
+        need = 1;
+    else if ( wchar < 0x800 )
+        need = 2;
+    else if ( wchar < 0x10000 )
+        need = 3;
+    else if ( wchar < 0x200000 )
+        need = 4;
+    else if ( wchar < 0x4000000 )
+        need = 5;
+    else if ( wchar < 0x80000000 )
+        need = 6;
+    else
+        need = -1;
+
+    /* Invalid character, or caller only wants the length. */
+    if ( need == -1 || utf8char == NULL ) {
+        return need;
+    }
+
+    /* Not enough room: write nothing and report 0. */
+    if ( count < (size_t)need ) {
+        return 0;
+    }
+
+    if ( need == 1 ) {
+        utf8char[0] = (char)wchar;
+        return 1;
+    }
+
+    /* First byte: length tag plus the most significant data bits. */
+    shift = 6 * (need - 1);
+    utf8char[out++] = lead[need] | ( wchar >> shift );
+
+    /* Continuation bytes: 10xxxxxx, six data bits each, high to low. */
+    while ( shift > 0 ) {
+        shift -= 6;
+        utf8char[out++] = 0x80 | ( (wchar >> shift) & 0x3f );
+    }
+
+    return out;
+}
+
+
+/*-----------------------------------------------------------------------------
+   Convert a null-terminated wide char string to a UTF-8 string.
+
+   utf8str  Output buffer, or NULL to compute only the required size
+            (in which case 'count' is ignored).
+   wcstr    Input wide string; NULL is treated as an empty string.
+   count    Size of the output buffer in bytes.
+
+   No more than 'count' bytes are written.  A null terminator is added only
+   if there is room for it.  If a character does not fit, the remainder of
+   the buffer is zero-filled so that the return value equals 'count',
+   signalling "buffer full".
+
+   Returns the number of bytes written (or required), excluding the null
+   terminator, or -1 if a wide char cannot be encoded as UTF-8.
+*/
+int
+ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
+{
+    int len = 0;
+    int n;
+    char *p = utf8str;
+    wchar_t empty = 0;      /* To avoid use of L"" construct */
+
+    if (wcstr == NULL)      /* Treat input ptr NULL as an empty string */
+        wcstr = &empty;
+
+    if (utf8str == NULL)    /* Just compute size of output, excl null */
+    {
+        while (*wcstr)
+        {
+            /* Get UTF-8 size of next wide char */
+            n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
+            if (n == -1)
+                return -1;
+            len += n;
+        }
+
+        return len;
+    }
+
+
+    /* Do the actual conversion. */
+
+    n = 1;                  /* In case of empty wcstr */
+    while (*wcstr)
+    {
+        n = ldap_x_wc_to_utf8( p, *wcstr++, count);
+
+        if (n <= 0)         /* If encoding error (-1) or won't fit (0), quit */
+            break;
+
+        p += n;
+        count -= n;         /* Space left in output buffer */
+    }
+
+    /* If not enough room for last character, pad remainder with null
+       so that return value = original count, indicating buffer full. */
+    if (n == 0)
+    {
+        while (count--)
+            *p++ = 0;
+    }
+
+    /* Add a null terminator if there's room. */
+    else if (count)
+        *p = 0;
+
+    if (n == -1)            /* Conversion encountered invalid wide char. */
+        return -1;
+
+    /* Return the number of bytes written to output buffer, excl null. */
+    return (p - utf8str);
+}
+
+
+/*-----------------------------------------------------------------------------
+   Convert a UTF-8 character to a MultiByte character.
+
+   mbchar    Output buffer for the multibyte character, or NULL to have the
+             result written to an internal scratch buffer and discarded.
+   utf8char  Input UTF-8 sequence (NULL is handled by ldap_x_utf8_to_wc).
+   f_wctomb  Wide-char to multibyte converter; NULL selects ANSI C wctomb.
+
+   Returns the value returned by the converter (the size of the multibyte
+   character in bytes), or -1 if 'utf8char' is not valid UTF-8.
+*/
+int
+ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+        int (*f_wctomb)(char *mbchar, wchar_t wchar) )
+{
+    wchar_t wchar;
+    int n;
+    /* Scratch output used when mbchar is NULL.  wctomb may store up to
+       MB_LEN_MAX bytes, which can exceed 6 (it is 16 on some platforms);
+       never go below the historical size of 6 so that custom converters
+       relying on it keep working. */
+    char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];
+
+    if (f_wctomb == NULL)       /* If no conversion function was given... */
+        f_wctomb = wctomb;      /* use the local ANSI C function */
+
+    /* First convert UTF-8 char to a wide char */
+    n = ldap_x_utf8_to_wc( &wchar, utf8char);
+
+    if (n == -1)
+        return -1;              /* Invalid UTF-8 character */
+
+    /* Never pass NULL to the converter: for wctomb that would query
+       state-dependence instead of converting. */
+    if (mbchar == NULL)
+        n = f_wctomb( tmp, wchar );
+    else
+        n = f_wctomb( mbchar, wchar);
+
+    return n;
+}
+
+/*-----------------------------------------------------------------------------
+   Convert a UTF-8 string to a MultiByte string.
+
+   mbstr       Output buffer; passed through unchanged to the converter.
+   utf8str     Null-terminated UTF-8 input; NULL or "" yields an empty result.
+   count       Size of the output buffer in bytes.  No more than 'count'
+               bytes are written; with count == 0 nothing is written.
+   f_wcstombs  Wide-string to multibyte converter; NULL selects ANSI C
+               wcstombs.
+
+   Returns the value returned by the converter (size of the converted string
+   in bytes, excl null terminator), 0 for empty input, or -1 on invalid
+   UTF-8 or memory allocation failure.
+*/
+int
+ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
+        size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
+{
+    wchar_t *wcs;
+    size_t wcsize;
+    int n;
+
+    if (f_wcstombs == NULL)     /* If no conversion function was given... */
+        f_wcstombs = wcstombs;  /* use the local ANSI C function */
+
+    if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
+    {
+        /* Only terminate the output when the buffer can hold the byte;
+           a zero-length buffer must not be written to. */
+        if (mbstr != NULL && count > 0)
+            *mbstr = 0;
+        return 0;
+    }
+
+    /* Allocate memory for the maximum size wchar string that we could get:
+       one wide char per input byte, plus the terminator. */
+    wcsize = strlen(utf8str) + 1;
+    wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
+    if (wcs == NULL)
+        return -1;              /* Memory allocation failure. */
+
+    /* First convert the UTF-8 string to a wide char string */
+    n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
+
+    /* Then convert wide char string to multi-byte string */
+    if (n != -1)
+    {
+        n = f_wcstombs(mbstr, wcs, count);
+    }
+
+    LDAP_FREE(wcs);
+
+    return n;
+}
+
+/*-----------------------------------------------------------------------------
+   Convert one MultiByte character to a UTF-8 character.
+
+   'mbsize' is the maximum number of bytes of 'mbchar' to examine; 0 is
+   rejected.  'f_mbtowc' performs the multibyte to wide-char step; NULL
+   selects the ANSI C mbtowc.  'utf8char' may be NULL.
+
+   A NULL or empty 'mbchar' yields a single null byte and a return of 1.
+   Otherwise returns the result of encoding the wide char as UTF-8 (up to
+   LDAP_MAX_UTF8_LEN bytes), or -1 if 'mbsize' is 0 or the multibyte
+   conversion fails.
+*/
+int
+ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
+        int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
+{
+    int (*to_wide)(wchar_t *, const char *, size_t);
+    wchar_t wide;
+
+    /* Fall back to the C library converter when none is supplied. */
+    to_wide = (f_mbtowc != NULL) ? f_mbtowc : mbtowc;
+
+    if ( mbsize == 0 ) {
+        return -1;              /* 0 is not valid. */
+    }
+
+    if ( mbchar == NULL || *mbchar == 0 ) {
+        if ( utf8char != NULL ) {
+            *utf8char = 0;
+        }
+        return 1;
+    }
+
+    /* Step 1: multibyte char -> wide char. */
+    if ( to_wide( &wide, mbchar, mbsize ) == -1 ) {
+        return -1;
+    }
+
+    /* Step 2: wide char -> UTF-8. */
+    return ldap_x_wc_to_utf8( utf8char, wide, LDAP_MAX_UTF8_LEN );
+}
+
+
+/*-----------------------------------------------------------------------------
+   Convert a null-terminated MultiByte string to a UTF-8 string.
+
+   'count' is the size of 'utf8str' in bytes; no more than that is written.
+   'f_mbstowcs' performs the multibyte to wide-string step; NULL selects the
+   ANSI C mbstowcs.  A NULL 'mbstr' is treated as an empty string.
+
+   Returns the result of ldap_x_wcs_to_utf8s on the intermediate wide
+   string, or -1 if memory cannot be allocated or the multibyte conversion
+   fails.
+*/
+int
+ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
+        size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
+{
+    size_t (*to_wide)(wchar_t *, const char *, size_t);
+    const char *src;
+    wchar_t *wbuf;
+    size_t wcap;
+    int rc;
+
+    src = (mbstr != NULL) ? mbstr : "";
+    to_wide = (f_mbstowcs != NULL) ? f_mbstowcs : mbstowcs;
+
+    /* Worst case is one wide char per input byte, plus the terminator. */
+    wcap = strlen(src) + 1;
+    wbuf = (wchar_t *)LDAP_MALLOC( wcap * sizeof(wchar_t) );
+    if ( wbuf == NULL ) {
+        return -1;
+    }
+
+    /* Step 1: multibyte string -> wide string. */
+    rc = to_wide( wbuf, src, wcap );
+
+    /* Step 2: wide string -> UTF-8, unless step 1 failed. */
+    if ( rc != -1 ) {
+        rc = ldap_x_wcs_to_utf8s( utf8str, wbuf, count );
+    }
+
+    LDAP_FREE(wbuf);
+
+    return rc;
+}