Add UTF-8 wc/mb conversion routines contributed by Novell.

author Kurt Zeilenga <kurt@openldap.org>

Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)

committer Kurt Zeilenga <kurt@openldap.org>

Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)
author Kurt Zeilenga <kurt@openldap.org>
Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)
committer Kurt Zeilenga <kurt@openldap.org>
Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)
diff --git a/doc/devel/utfconv.txt b/doc/devel/utfconv.txt

new file mode 100644 (file)

index 0000000..88dfb1d
--- /dev/null
+++ b/doc/devel/utfconv.txt
@@ -0,0 +1,291 @@
+                                                                Dec 5, 2000
+                                                                Dave Steck
+                                                                Novell, Inc.
+                    
+                    UTF-8 Conversion Functions
+
+
+1.  Strings in the LDAP C SDK should be encoded in UTF-8 format.
+    However, most platforms do not provide APIs for converting to
+    this format.  If they do, they are platform-specific.
+    
+    As a result, most applications (knowingly or not) use local strings
+    with LDAP functions.  This works fine for 7-bit ASCII characters,
+    but will fail with 8-bit European characters, Asian characters, etc.
+    
+    We propose adding the following platform-independent conversion functions 
+    to the OpenLDAP SDK.  There are 4 functions for converting between UTF-8 
+    and wide characters, and 4 functions for converting between UTF-8 and 
+    multibyte characters.
+
+    For multibyte to UTF-8 conversions, charset translation is necessary.
+    While a full charset translator is not practical or appropriate for the
+    LDAP SDK, we can pass the translator function in as an argument.
+    A NULL for this argument will use the ANSI C functions mbtowc, mbstowcs,
+    wctomb, and wcstombs.
+
+2.  UTF-8 <--> Wide Character conversions
+
+The following new conversion routines will be added, following the pattern of 
+the ANSI C conversion routines (mbtowc, mbstowcs, etc).  These routines use
+the wchar_t type.  wchar_t is 2 bytes on some systems and 4 bytes on others.  
+However the advantage of using wchar_t is that all the standard wide character 
+string functions may be used on these strings:   wcslen, wcscpy, etc.
+
+   int ldap_x_utf8_to_wc  -  Convert a single UTF-8 encoded character to a wide character.
+   int ldap_x_utf8s_to_wcs  -  Convert a UTF-8 string to a wide character string.
+   int ldap_x_wc_to_utf8  -  Convert a single wide character to a UTF-8 sequence.
+   int ldap_x_wcs_to_utf8s  -  Convert a wide character string to a UTF-8 string.
+
+
+2.1  ldap_x_utf8_to_wc  -  Convert a single UTF-8  encoded character to a wide character.
+
+int ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
+
+  wchar                (OUT)   Points to a wide character code to receive the 
+                    converted character.
+
+  utf8char     (IN)    Address of the UTF8 sequence of bytes.
+
+Return Value:
+               If successful, the function returns the length in 
+        bytes of the UTF-8 input character.
+
+        If utf8char is NULL or points to an empty string, the
+        function returns 1 and a null wide character (0) is written to wchar.
+        
+        If utf8char contains an invalid UTF-8 sequence -1 is returned.
+
+
+2.2  ldap_x_utf8s_to_wcs   -  Convert a UTF-8 string to a wide character string.
+
+int ldap_x_utf8s_to_wcs (wchar_t *wcstr, const char *utf8str, size_t count)
+
+  wcstr                (OUT)   Points to a wide char buffer to receive the 
+                    converted wide char string. The output string will be 
+                    null terminated if there is space for it in the 
+                    buffer.
+
+  utf8str   (IN)       Address of the null-terminated UTF-8 string to convert.  
+
+  count                (IN)    The number of UTF-8 characters to convert, or
+                               equivalently, the size of the output buffer in wide
+                               characters.
+
+Return Value:
+    If successful, the function returns the number of wide
+    characters written to wcstr, excluding the null termination
+    character, if any.
+
+       If wcstr is NULL, the function returns the number of wide
+    characters required to contain the converted string,
+    excluding the null termination character.
+
+    If an invalid UTF-8 sequence is encountered, the 
+    function returns -1. 
+
+    If the return value equals count, there was not enough space to fit the 
+    string and the null terminator in the buffer.  
+
+
+2.3  ldap_x_wc_to_utf8  -  Convert a single wide character to a UTF-8 sequence.
+
+int ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
+
+  utf8char     (OUT)   Points to a byte array to receive the converted UTF-8
+                               string.
+
+  wchar                (IN)    The wide character to convert.
+
+  count                (IN)    The maximum number of bytes to write to the output
+                    buffer.  Normally set this to LDAP_MAX_UTF8_LEN, which 
+                    is defined as 3 or 6 depending on the size of wchar_t.  
+                    A partial character will not be written.
+                    
+Return Value:
+               If successful, the function returns the length in bytes of
+               the converted UTF-8 output character.
+
+        If wchar is 0 (the null wide character), the function returns 1
+        and a null byte is written to utf8char.
+        
+        If wchar cannot be converted to a UTF-8 character, the 
+        function returns -1.
+
+
+2.4  ldap_x_wcs_to_utf8s  -  Convert a wide character string to a UTF-8 string.
+
+int ldap_x_wcs_to_utf8s (char *utf8str, const wchar_t *wcstr, size_t count)
+
+  utf8str      (OUT)   Points to a byte array to receive the converted 
+                    UTF-8 string. The output string will be null 
+                    terminated if there is space for it in the 
+                    buffer.
+
+
+  wcstr                (IN)    Address of the null-terminated wide char string to convert.
+
+  count                (IN)    The size of the output buffer in bytes.
+
+Return Value:
+               If successful, the function returns the number of bytes
+               written to utf8str, excluding the null termination
+        character, if any.
+
+               If utf8str is NULL, the function returns the number of
+        bytes required to contain the converted string, excluding 
+        the null termination character.  The 'count' parameter is ignored.
+        
+        If the function encounters a wide character that cannot 
+        be mapped to a UTF-8 sequence, the function returns -1.
+        
+        If the return value equals count, there was not enough space to fit 
+        the string and the null terminator in the buffer.
+
+
+
+3. Multi-byte <--> UTF-8 Conversions
+
+These functions convert the string in a two-step process, from multibyte 
+to Wide, then from Wide to UTF8, or vice versa.  This conversion requires a 
+charset translation routine, which is passed in as an argument.
+ 
+   ldap_x_mb_to_utf8  -  Convert a multi-byte character  to a UTF-8 character.
+   ldap_x_mbs_to_utf8s  -  Convert a multi-byte string to a UTF-8 string.
+   ldap_x_utf8_to_mb  -  Convert a UTF-8 character to a multi-byte character.
+   ldap_x_utf8s_to_mbs  -  Convert a UTF-8 string to a multi-byte string.
+
+3.1  ldap_x_mb_to_utf8  - Convert a multi-byte character  to a UTF-8 character.
+
+int ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count)  )
+
+  utf8char     (OUT)   Points to a byte buffer to receive the converted 
+                    UTF-8 character.  May be NULL.  The output is not
+                    null-terminated.
+
+  mbchar    (IN)       Address of a sequence of bytes forming a multibyte character.
+
+  mbsize       (IN)    The maximum number of bytes of the mbchar argument to 
+                    check.  This should normally be MB_CUR_MAX.
+
+  f_mbtowc     (IN)    The function to use for converting a multibyte 
+                    character to a wide character.  If NULL, the local 
+                    ANSI C routine mbtowc is used.
+
+Return Value:
+               If successful, the function returns the length in bytes of
+        the UTF-8 output character.  
+        
+        If utf8char is NULL, the function returns the number of
+        bytes that would be written to the output char.
+
+        If mbsize is zero, -1 is returned and nothing is written to
+        utf8char.
+         
+        If mbchar is NULL or points to an empty string, the 
+        function returns 1 and a null byte is written to utf8char.
+        
+        If mbchar contains an invalid multi-byte character, -1 is returned.
+
+
+3.2  ldap_x_mbs_to_utf8s  - Convert a multi-byte string  to a UTF-8 string.
+
+int ldap_x_mbs_to_utf8s (char *utf8str, const char *mbstr, size_t count, 
+        size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count))
+
+utf8str            (OUT)       Points to a buffer to receive the converted UTF-8 string.  
+                    May be NULL.
+
+  mbstr        (IN)    Address of the null-terminated multi-byte input string.
+
+  count            (IN)        The size of the output buffer in bytes.
+
+  f_mbstowcs (IN)      The function to use for converting a multibyte string
+                       to a wide character string.  If NULL, the local ANSI
+                       C routine mbstowcs is used.
+
+Return Value:
+               If successful, the function returns the length in 
+        bytes of the UTF-8 output string, excluding the null
+        terminator, if present.
+        
+        If utf8str is NULL, count is ignored and the function 
+        returns the number of bytes required for the output string, 
+        excluding the NULL.
+        
+        If count is zero, 0 is returned and nothing is written to utf8str.
+         
+        If mbstr is NULL or points to an empty string, the 
+        function returns 0 and a null byte is written to utf8str.
+        
+        If mbstr contains an invalid multi-byte character, -1 is returned.
+        
+        If the returned value is equal to count, the entire null-terminated 
+        string would not fit in the output buffer.
+
+
+3.3  ldap_x_utf8_to_mb  -  Convert a UTF-8 character to a multi-byte character.
+
+int ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+                        int (*f_wctomb)(char *mbchar, wchar_t wchar) )
+
+mbchar (OUT)   Points to a byte buffer to receive the converted multi-byte 
+                character.  May be NULL.
+
+  utf8char     (IN)    Address of the UTF-8 character sequence.
+
+  f_wctomb     (IN)    The function to use for converting a wide character 
+                    to a multibyte character.  If NULL, the local 
+                    ANSI C routine wctomb is used.
+
+
+Return Value:
+               If successful, the function returns the length in 
+        bytes of the multi-byte output character.  
+        
+        If utf8char is NULL or points to an empty string, the 
+        function returns 1 and a null byte is written to mbchar.
+        
+        If utf8char contains an invalid UTF-8 sequence, -1 is returned.
+
+
+3.4  ldap_x_utf8s_to_mbs  - Convert a UTF-8 string to a multi-byte string.
+
+
+int ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count, 
+        size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
+
+  mbstr                (OUT)   Points to a byte buffer to receive the converted 
+                    multi-byte string.  May be NULL.
+
+  utf8str   (IN)       Address of the null-terminated UTF-8 string to convert.
+
+  count                (IN)    The size of the output buffer in bytes.
+
+  f_wcstombs (IN)      The function to use for converting a wide character 
+                    string to a multibyte string.  If NULL, the local 
+                    ANSI C routine wcstombs is used.
+
+Return Value:
+        If successful, the function returns the number of bytes
+               written to mbstr, excluding the null termination
+        character, if any.
+
+        If mbstr is NULL, count is ignored and the function 
+        returns the number of bytes required for the output string,
+        excluding the null terminator.
+        
+        If count is zero, 0 is returned and nothing is written to
+        mbstr.
+        
+        If utf8str is NULL or points to an empty string, the 
+        function returns 0 and a null byte is written to mbstr.
+        
+        If an invalid UTF-8 character is encountered, the 
+        function returns -1.
+
+The output string will be null terminated if there is space for it in 
+the output buffer.
+
+
diff --git a/include/ldap_utf8.h b/include/ldap_utf8.h

new file mode 100644 (file)

index 0000000..4646d2c
--- /dev/null
+++ b/include/ldap_utf8.h
@@ -0,0 +1,86 @@
+/* $OpenLDAP$ */
+/* $Novell: /ldap/src/cldap/include/ldap_utf8.h,v 1.3 2000/12/04 20:23:20 dsteck Exp $ */
+/*
+ * Copyright 2000 The OpenLDAP Foundation, Redwood City, California, USA
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms are permitted only
+ * as authorized by the OpenLDAP Public License.  A copy of this
+ * license is available at http://www.OpenLDAP.org/license.html or
+ * in file LICENSE in the top-level directory of the distribution.
+ */
+/******************************************************************************
+ * This notice applies to changes, created by or for Novell, Inc.,
+ * to preexisting works for which notices appear elsewhere in this file.
+ *
+ * Copyright (C) 2000 Novell, Inc. All Rights Reserved.
+ *
+ * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES.
+ * USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION
+ * 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT
+ * HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE
+ * TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS
+ * WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC
+ * LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE
+ * PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 
+ ******************************************************************************/
+
+#ifndef _LDAP_UTF8_H
+#define _LDAP_UTF8_H
+
+LDAP_BEGIN_DECL
+
+/*  
+ * UTF-8 Utility Routines (in utf-8.c)
+ */
+
+#define LDAP_UCS4_INVALID (0x80000000U)
+
+/* LDAP_MAX_UTF8_LEN is 3 or 6 depending on size of wchar_t
+ * (2-byte wchar_t -> 3, 4-byte wchar_t -> 6).
+ * The expansion is fully parenthesized so that the macro evaluates as a
+ * single value when used inside a larger expression such as
+ * "n / LDAP_MAX_UTF8_LEN" or "n % LDAP_MAX_UTF8_LEN". */
+#define LDAP_MAX_UTF8_LEN  ( sizeof(wchar_t) * 3/2 )
+
+
+/*
+ * UTF-8 Conversion Routines.   (in utfconv.c)
+ */
+
+/* UTF-8 character to Wide Char.
+ * Returns the length in bytes of the UTF-8 input character, or -1 if the
+ * sequence cannot be decoded.  'wchar' may be NULL; a NULL 'utf8char' is
+ * treated as an empty string. */
+LDAP_F(int)
+ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char );
+
+/* UTF-8 string to Wide Char string.
+ * At most 'count' wide chars are written to 'wcstr'; a terminator is added
+ * only if there is room.  Returns the number of wide chars produced
+ * (excluding the terminator), or -1 on an invalid sequence.  If 'wcstr'
+ * is NULL, 'count' is ignored and only the length is computed. */
+LDAP_F(int)
+ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count );
+
+/* Wide Char to UTF-8 character.
+ * Returns the number of bytes written, 0 if 'count' bytes are not enough
+ * for the whole character (nothing is written), or -1 if 'wchar' cannot be
+ * encoded.  If 'utf8char' is NULL, 'count' is ignored and the required
+ * length is returned. */
+LDAP_F(int)
+ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count );
+
+/* Wide Char string to UTF-8 string.
+ * 'count' is the size of 'utf8str' in bytes.  Returns the number of bytes
+ * written (excluding the terminator), or -1 if a wide char cannot be
+ * encoded.  If 'utf8str' is NULL, the required size is returned. */
+LDAP_F(int)
+ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count );
+
+
+/* UTF-8 character to MultiByte character.
+ * 'f_wctomb' converts the intermediate wide char; if NULL, the ANSI C
+ * wctomb is used.  Returns the result of that conversion, or -1 if the
+ * UTF-8 input is invalid.  'mbchar' may be NULL. */
+LDAP_F(int)
+ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+               int (*f_wctomb)(char *mbchar, wchar_t wchar) );
+
+/* UTF-8 string to MultiByte string.
+ * 'count' is the size of 'mbstr' in bytes.  'f_wcstombs' converts the
+ * intermediate wide string; if NULL, the ANSI C wcstombs is used.
+ * Returns -1 on invalid UTF-8 or allocation failure. */
+LDAP_F(int)
+ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
+               size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) );
+
+/* MultiByte character to UTF-8 character.
+ * 'mbsize' is the maximum number of bytes of 'mbchar' to examine; 0 is
+ * rejected with -1.  'f_mbtowc' converts to the intermediate wide char;
+ * if NULL, the ANSI C mbtowc is used. */
+LDAP_F(int)
+ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
+               int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) );
+
+/* MultiByte string to UTF-8 string.
+ * 'count' is the size of 'utf8str' in bytes.  'f_mbstowcs' converts to the
+ * intermediate wide string; if NULL, the ANSI C mbstowcs is used.
+ * Returns -1 on conversion or allocation failure. */
+LDAP_F(int)
+ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
+               size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) );
+
+LDAP_END_DECL
+
+#endif /* _LDAP_UTF8_H */
diff --git a/libraries/libldap/Makefile.in b/libraries/libldap/Makefile.in

index 2be2d6f9256bb448c56081d46641f1e26bdb66fb..12a01a71d8bcca73363e052d07bc728d6c744046 100644 (file)
--- a/libraries/libldap/Makefile.in
+++ b/libraries/libldap/Makefile.in
@@ -18,7 +18,7 @@ SRCS  = bind.c open.c result.c error.c compare.c search.c \
         request.c os-ip.c url.c sortctrl.c vlvctrl.c \
         init.c options.c print.c string.c util-int.c schema.c \
         charray.c tls.c dn.c os-local.c dnssrv.c \
-       utf-8.c
+       utf-8.c utf-8-conv.c
  OBJS   = bind.lo open.lo result.lo error.lo compare.lo search.lo \
         controls.lo messages.lo references.lo extended.lo cyrus.lo \
         modify.lo add.lo modrdn.lo delete.lo abandon.lo ufn.lo cache.lo \
@@ -28,7 +28,7 @@ OBJS  = bind.lo open.lo result.lo error.lo compare.lo search.lo \
         request.lo os-ip.lo url.lo sortctrl.lo vlvctrl.lo \
         init.lo options.lo print.lo string.lo util-int.lo schema.lo \
         charray.lo tls.lo dn.lo os-local.lo dnssrv.lo \
-       utf-8.lo
+       utf-8.lo utf-8-conv.lo
  
  LDAP_INCDIR= ../../include       
  LDAP_LIBDIR= ../../libraries
diff --git a/libraries/libldap/utf-8-conv.c b/libraries/libldap/utf-8-conv.c

new file mode 100644 (file)

index 0000000..04a464a
--- /dev/null
+++ b/libraries/libldap/utf-8-conv.c
@@ -0,0 +1,464 @@
+/* $OpenLDAP$ */
+/*
+ * Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
+ * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
+ */
+
+/* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
+/******************************************************************************
+ * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
+ * 
+ * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
+ * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
+ * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
+ * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
+ * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
+ * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
+ * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
+ * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 
+ ******************************************************************************/
+
+/*
+ * UTF-8 Conversion Routines
+ *
+ * These routines convert between Wide Character and UTF-8,
+ * or between MultiByte and UTF-8 encodings.
+ *
+ * Both single character and string versions of the functions are provided.
+ * All functions return -1 if the character or string cannot be converted.
+ */
+
+#include "portable.h"
+
+#include <stdio.h>
+#include <limits.h>            /* For MB_LEN_MAX */
+#include <ac/stdlib.h>         /* For wctomb, wcstombs, mbtowc, mbstowcs */
+#include <ac/string.h>
+#include <ac/time.h>           /* for time_t */
+
+#include "ldap-int.h"
+
+#include <ldap_utf8.h>
+
+/* Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
+   sequence length in bytes (index 0 is unused).  The table is only ever
+   read, so it is declared const. */
+static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+
+
+/*-----------------------------------------------------------------------------
+                                       UTF-8 Format Summary
+
+ASCII chars                                            7 bits
+    0xxxxxxx
+    
+2-character UTF-8 sequence:        11 bits
+    110xxxxx  10xxxxxx
+
+3-character UTF-8                  16 bits
+    1110xxxx  10xxxxxx  10xxxxxx   
+    
+4-char UTF-8                       21 bits 
+    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
+    
+5-char UTF-8                       26 bits
+    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
+    
+6-char UTF-8                       31 bits
+    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
+    
+Unicode address space   (0 - 0x10FFFF)    21 bits
+ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
+
+Note:  This code does not prevent UTF-8 sequences which are longer than
+          necessary from being decoded.
+*/
+
+/*------------------------------------------------------------------------------ 
+   Decode one UTF-8 encoded character into a wide char.
+
+   wchar     (OUT)  Receives the decoded character.  May be NULL, in which
+                    case the input is only validated and measured.
+   utf8char  (IN)   Start of the UTF-8 sequence.  NULL is handled like "".
+
+   Returns the number of input bytes that make up the character, or -1 if
+   UTF8_CHARLEN reports a length of 0 or more than LDAP_MAX_UTF8_LEN, or
+   if a continuation byte does not have the form 10xxxxxx.
+*/
+int
+ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
+{
+       const char *src = utf8char;
+       wchar_t acc;
+       int nbytes, k;
+
+       if (src == NULL)                /* NULL input behaves like an empty string */
+               src = "";
+
+       /* The first byte tells how long the whole sequence is. */
+       nbytes = UTF8_CHARLEN(src);
+
+       if( nbytes == 0 || nbytes > LDAP_MAX_UTF8_LEN )
+               return -1;
+
+       /* Drop the length tag from the first byte, keep its payload bits. */
+       acc = (wchar_t)(src[0] & mask[nbytes]);
+
+       /* Every following byte contributes another 6 payload bits. */
+       for(k = 1; k < nbytes; k++)
+       {
+               if ((src[k] & 0xc0) != 0x80)
+                       return -1;
+
+               acc = (wchar_t)((acc << 6) | (wchar_t)(src[k] & 0x3f));
+       }
+
+       if (wchar != NULL)
+               *wchar = acc;
+
+       return nbytes;
+}
+
+/*-----------------------------------------------------------------------------
+   Convert a UTF-8 string to a wide char string.
+
+   wcstr    (OUT)  Buffer for the wide chars.  May be NULL, in which case
+                   'count' is ignored and only the length is computed.
+   utf8str  (IN)   Null-terminated UTF-8 input.  NULL is handled like "".
+   count    (IN)   Capacity of 'wcstr' in wide chars.
+
+   No more than 'count' wide chars will be written to the output buffer;
+   a null terminator is appended only if there is still room for it.
+   Return the size of the converted string in wide chars, excl null
+   terminator, or -1 if an invalid UTF-8 sequence is encountered.
+
+   Each character is decoded by ldap_x_utf8_to_wc so that the string and
+   single-character routines always accept exactly the same sequences.
+*/
+int
+ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
+{
+       size_t wclen = 0;
+       int utflen;
+
+       /* If input ptr is NULL, treat it as empty string. */
+       if (utf8str == NULL)
+               utf8str = "";
+
+       /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
+       while ( *utf8str && (wcstr==NULL || wclen<count) )
+       {
+               /* Decode straight into the output slot (or just validate
+                  and measure when there is no output buffer). */
+               utflen = ldap_x_utf8_to_wc( wcstr ? &wcstr[wclen] : NULL, utf8str );
+
+               if( utflen == -1 )
+                       return -1;              /* Invalid input */
+
+               utf8str += utflen;              /* Move to next UTF-8 character */
+               wclen++;                        /* Count number of wide chars stored/required */
+       }
+
+       /* Add null terminator if there's room in the buffer. */
+       if (wcstr && wclen < count)
+               wcstr[wclen] = 0;
+
+       return (int)wclen;
+}
+
+
+/*------------------------------------------------------------------------------ 
+   Encode one wide char as a UTF-8 character.
+
+   utf8char  (OUT)  Receives the UTF-8 bytes (not null-terminated).  May be
+                    NULL, in which case 'count' is ignored and only the
+                    required length is returned.
+   wchar     (IN)   The wide character to encode.
+   count     (IN)   Capacity of 'utf8char' in bytes.
+
+   Returns the length of the UTF-8 character in bytes, 0 if 'count' is too
+   small to hold the whole character (nothing is written), or -1 if the
+   wide char is negative or needs more than 31 bits.
+*/
+int
+ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
+{
+       int need;               /* Number of bytes in the encoded character */
+       int lead;               /* Length tag carried by the first byte */
+       int pos;
+
+       if( wchar < 0 )         /* Invalid wide character */
+               return -1;
+
+       /* Pick the shortest form whose payload can hold the value. */
+       if( wchar < 0x80 ) {
+               need = 1;  lead = 0x00;
+       } else if( wchar < 0x800 ) {
+               need = 2;  lead = 0xc0;
+       } else if( wchar < 0x10000 ) {
+               need = 3;  lead = 0xe0;
+       } else if( wchar < 0x200000 ) {
+               need = 4;  lead = 0xf0;
+       } else if( wchar < 0x4000000 ) {
+               need = 5;  lead = 0xf8;
+       } else if( wchar < 0x80000000 ) {
+               need = 6;  lead = 0xfc;
+       } else {
+               return -1;
+       }
+
+       if (utf8char == NULL)   /* Caller only wants the length */
+               return need;
+
+       if (count < (size_t)need)       /* Never write a partial character */
+               return 0;
+
+       /* First byte: length tag plus the most significant payload bits. */
+       utf8char[0] = (char)( lead | ( wchar >> (6 * (need - 1)) ) );
+
+       /* Remaining bytes: 10xxxxxx, six payload bits each, high to low. */
+       for (pos = 1; pos < need; pos++)
+               utf8char[pos] = (char)( 0x80 | ( (wchar >> (6 * (need - 1 - pos))) & 0x3f ) );
+
+       return need;
+}
+
+
+/*-----------------------------------------------------------------------------
+   Encode a wide char string as a UTF-8 string.
+
+   utf8str  (OUT)  Receives the UTF-8 bytes.  May be NULL, in which case
+                   'count' is ignored and only the size is computed.
+   wcstr    (IN)   Null-terminated wide char input.  NULL is handled like
+                   an empty string.
+   count    (IN)   Capacity of 'utf8str' in bytes.
+
+   No more than 'count' bytes will be written to the output buffer.
+   Return the # of bytes written to the output buffer, excl null terminator,
+   or -1 if a wide char cannot be encoded.
+*/
+int
+ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
+{
+       wchar_t nul = 0;                /* Stand-in for L"" */
+       const wchar_t *in = wcstr;
+       char *out = utf8str;
+       int total = 0;
+       int rc = 1;                     /* Stays 1 if the input is empty */
+
+       if (in == NULL)
+               in = &nul;
+
+       /* Sizing pass only: add up the encoded length of every character. */
+       if (utf8str == NULL)
+       {
+               for ( ; *in; in++)
+               {
+                       rc = ldap_x_wc_to_utf8( NULL, *in, LDAP_MAX_UTF8_LEN);
+                       if (rc == -1)
+                               return -1;
+                       total += rc;
+               }
+
+               return total;
+       }
+
+       /* Encode until the input ends, a character is invalid (-1),
+          or the next character no longer fits (0). */
+       for ( ; *in; in++)
+       {
+               rc = ldap_x_wc_to_utf8( out, *in, count);
+
+               if (rc <= 0)
+                       break;
+
+               out += rc;
+               count -= rc;            /* Space left in output buffer */
+       }
+
+       if (rc == 0)
+       {
+               /* Last character did not fit: zero-fill the tail so the
+                  return value equals the original count (buffer full). */
+               for ( ; count > 0; count--)
+                       *out++ = 0;
+       }
+       else if (count != 0)
+       {
+               /* Room left: terminate the string. */
+               *out = 0;
+       }
+
+       if (rc == -1)                   /* Invalid wide char was encountered */
+               return -1;
+
+       return (out - utf8str);
+}
+
+
+/*-----------------------------------------------------------------------------
+   Convert a UTF-8 character to a MultiByte character.
+
+   mbchar    (OUT)  Receives the multibyte character.  May be NULL, in
+                    which case the conversion is done into a scratch
+                    buffer and only the size is reported.
+   utf8char  (IN)   Start of the UTF-8 sequence.
+   f_wctomb  (IN)   Wide char to multibyte converter; if NULL, the local
+                    ANSI C wctomb is used.
+
+   Return the size of the converted character in bytes (the value returned
+   by the converter), or -1 if the UTF-8 input is invalid.
+*/
+int
+ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
+               int (*f_wctomb)(char *mbchar, wchar_t wchar) )
+{
+       wchar_t wchar;
+       int n;
+       /* Scratch output for the mbchar == NULL case.  wctomb may store up
+          to MB_LEN_MAX bytes, which can exceed 6 on some platforms, so size
+          the buffer from MB_LEN_MAX (but never smaller than before). */
+       char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];
+
+       if (f_wctomb == NULL)           /* If no conversion function was given... */
+               f_wctomb = wctomb;              /*    use the local ANSI C function */
+ 
+       /* First convert UTF-8 char to a wide char */
+       n = ldap_x_utf8_to_wc( &wchar, utf8char);
+
+       if (n == -1)
+               return -1;              /* Invalid UTF-8 character */
+
+       /* Then convert the wide char to a multibyte char */
+       if (mbchar == NULL)
+               n = f_wctomb( tmp, wchar );
+       else
+               n = f_wctomb( mbchar, wchar);
+
+       return n;
+}
+
+/*-----------------------------------------------------------------------------
+   Convert a UTF-8 string to a MultiByte string.
+
+   mbstr       (OUT)  Receives the multibyte string.  May be NULL.
+   utf8str     (IN)   Null-terminated UTF-8 input.
+   count       (IN)   Capacity of 'mbstr' in bytes.
+   f_wcstombs  (IN)   Wide string to multibyte string converter; if NULL,
+                      the local ANSI C wcstombs is used.
+
+   No more than 'count' bytes will be written to the output buffer; in
+   particular nothing is written when 'count' is zero.
+   Return the size of the converted string in bytes, excl null terminator
+   (the value returned by the converter), 0 for NULL or empty input, or -1
+   on invalid UTF-8 or memory allocation failure.
+*/
+int
+ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
+               size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
+{
+       wchar_t *wcs;
+       size_t wcsize;
+       int n;
+
+       if (f_wcstombs == NULL)         /* If no conversion function was given... */
+               f_wcstombs = wcstombs;  /*    use the local ANSI C function */
+ 
+       if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
+       {
+               /* Only terminate the output if the buffer has room for it;
+                  a zero-sized buffer must not be touched. */
+               if (mbstr && count)
+                       *mbstr = 0;
+               return 0;
+       }
+
+       /* Allocate memory for the maximum size wchar string that we could get. */
+       wcsize = strlen(utf8str) + 1;
+       wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
+       if (wcs == NULL)
+               return -1;                              /* Memory allocation failure. */
+
+       /* First convert the UTF-8 string to a wide char string */
+       n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
+
+       /* Then convert wide char string to multi-byte string */
+       if (n != -1)
+       {
+               n = f_wcstombs(mbstr, wcs, count);
+       }
+
+       LDAP_FREE(wcs);
+
+       return n;
+}
+
+/*-----------------------------------------------------------------------------
+   Convert a MultiByte character to a UTF-8 character.
+
+   utf8char  (OUT)  Receives the UTF-8 bytes.  May be NULL.
+   mbchar    (IN)   Start of the multibyte character.
+   mbsize    (IN)   Number of bytes of 'mbchar' to check; 0 is rejected.
+   f_mbtowc  (IN)   Multibyte to wide char converter; if NULL, the local
+                    ANSI C mbtowc is used.
+
+   Returns the result of encoding the intermediate wide char with
+   ldap_x_wc_to_utf8, 1 for NULL or empty input (a null byte is stored if
+   'utf8char' is given), or -1 if 'mbsize' is 0 or the converter fails.
+*/
+int
+ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
+               int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
+{
+       wchar_t wide;
+       int rc;
+
+       if (mbsize == 0)                /* 0 is not valid. */
+               return -1;
+
+       /* NULL or empty input converts to a single null byte. */
+       if (mbchar == NULL || *mbchar == 0)
+       {
+               if (utf8char != NULL)
+                       *utf8char = 0;
+               return 1;
+       }
+
+       /* Fall back to the local ANSI C function if none was supplied. */
+       if (f_mbtowc == NULL)
+               f_mbtowc = mbtowc;
+
+       /* Step 1: multibyte char -> wide char */
+       rc = f_mbtowc( &wide, mbchar, mbsize);
+       if (rc == -1)
+               return -1;
+
+       /* Step 2: wide char -> UTF-8 character */
+       return ldap_x_wc_to_utf8( utf8char, wide, LDAP_MAX_UTF8_LEN);
+}
+
+
+/*-----------------------------------------------------------------------------
+   Convert a MultiByte string to a UTF-8 string.
+
+   utf8str     (OUT)  Receives the UTF-8 string.  May be NULL.
+   mbstr       (IN)   Null-terminated multibyte input.  NULL is handled
+                      like an empty string.
+   count       (IN)   Capacity of 'utf8str' in bytes.
+   f_mbstowcs  (IN)   Multibyte string to wide string converter; if NULL,
+                      the local ANSI C mbstowcs is used.
+
+   No more than 'count' bytes will be written to the output buffer.
+   Return the size of the converted string in bytes, excl null terminator,
+   or -1 if memory cannot be allocated or either conversion step fails.
+*/   
+int
+ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
+               size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
+{
+       const char *src = mbstr;
+       wchar_t *wide;
+       size_t wcap;
+       int rc;
+
+       if (src == NULL)                /* NULL input behaves like an empty string */
+               src = "";
+
+       if (f_mbstowcs == NULL)         /* Default to the local ANSI C function */
+               f_mbstowcs = mbstowcs;
+ 
+       /* Reserve one wide char per input byte plus the terminator, the
+          maximum size wchar string that we could get. */
+       wcap = strlen(src) + 1;
+       wide = (wchar_t *)LDAP_MALLOC( wcap * sizeof(wchar_t) );
+       if (wide == NULL)
+               return -1;
+
+       /* Step 1: multibyte string -> wide char string */
+       rc = f_mbstowcs(wide, src, wcap);
+
+       /* Step 2: wide char string -> UTF-8 string (skipped if step 1 failed) */
+       if (rc != -1)
+               rc = ldap_x_wcs_to_utf8s( utf8str, wide, count);
+
+       LDAP_FREE(wide);
+
+       return rc;
+}
author	Kurt Zeilenga <kurt@openldap.org>
	Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)
committer	Kurt Zeilenga <kurt@openldap.org>
	Thu, 28 Dec 2000 02:20:37 +0000 (02:20 +0000)
doc/devel/utfconv.txt	[new file with mode: 0644]	patch \| blob
include/ldap_utf8.h	[new file with mode: 0644]	patch \| blob
libraries/libldap/Makefile.in		patch \| blob \| history
libraries/libldap/utf-8-conv.c	[new file with mode: 0644]	patch \| blob