git.sur5r.net Git - openldap/blob - doc/devel/utfconv.txt

   1                                                                 Dec 5, 2000
   2                                                                 Dave Steck
   3                                                                 Novell, Inc.
   4
   5                     UTF-8 Conversion Functions
   6
   7
   8 1.  Strings in the LDAP C SDK should be encoded in UTF-8 format.
   9     However, most platforms do not provide APIs for converting to
  10     this format.  If they do, they are platform-specific.
  11
  12     As a result, most applications (knowingly or not) use local strings
  13     with LDAP functions.  This works fine for 7-bit ASCII characters,
  14     but will fail with 8-bit European characters, Asian characters, etc.
  15
  16     We propose adding the following platform-independent conversion functions
  17     to the OpenLDAP SDK.  There are 4 functions for converting between UTF-8
  18     and wide characters, and 4 functions for converting between UTF-8 and
  19     multibyte characters.
  20
  21     For multibyte to UTF-8 conversions, charset translation is necessary.
  22     While a full charset translator is not practical or appropriate for the
  23     LDAP SDK, we can pass the translator function in as an argument.
  24     A NULL for this argument will use the ANSI C functions mbtowc, mbstowcs,
  25     wctomb, and wcstombs.
  26
  27 2.  UTF-8 <--> Wide Character conversions
  28
  29 The following new conversion routines will be added, following the pattern of
  30 the ANSI C conversion routines (mbtowc, mbstowcs, etc).  These routines use
  31 the wchar_t type.  wchar_t is 2 bytes on some systems and 4 bytes on others.
  32 However, the advantage of using wchar_t is that all the standard wide character
  33 string functions may be used on these strings:   wcslen, wcscpy, etc.
  34
  35    int ldap_x_utf8_to_wc  -  Convert a single UTF-8 encoded character to a wide character.
  36    int ldap_x_utf8s_to_wcs  -  Convert a UTF-8 string to a wide character string.
  37    int ldap_x_wc_to_utf8  -  Convert a single wide character to a UTF-8 sequence.
  38    int ldap_x_wcs_to_utf8s  -  Convert a wide character string to a UTF-8 string.
  39
  40
  41 2.1  ldap_x_utf8_to_wc  -  Convert a single UTF-8 encoded character to a wide character.
  42
  43 int ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  44
  45   wchar         (OUT)   Points to a wide character code to receive the
  46                     converted character.
  47
  48   utf8char      (IN)    Address of the UTF-8 sequence of bytes.
  49
  50 Return Value:
  51                 If successful, the function returns the length in
  52         bytes of the UTF-8 input character.
  53
  54         If utf8char is NULL or points to an empty string, the
  55         function returns 1 and a null wide character is written to wchar.
  56
  57         If utf8char contains an invalid UTF-8 sequence, -1 is returned.
  58
  59
  60 2.2  ldap_x_utf8s_to_wcs   -  Convert a UTF-8 string to a wide character string.
  61
  62 int ldap_x_utf8s_to_wcs (wchar_t *wcstr, const char *utf8str, size_t count)
  63
  64   wcstr         (OUT)   Points to a wide char buffer to receive the
  65                     converted wide char string. The output string will be
  66                     null terminated if there is space for it in the
  67                     buffer.
  68
  69   utf8str   (IN)        Address of the null-terminated UTF-8 string to convert.
  70
  71   count         (IN)    The number of UTF-8 characters to convert, or
  72                                 equivalently, the size of the output buffer in wide
  73                                 characters.
  74
  75 Return Value:
  76     If successful, the function returns the number of wide
  77     characters written to wcstr, excluding the null termination
  78     character, if any.
  79
  80         If wcstr is NULL, the function returns the number of wide
  81     characters required to contain the converted string,
  82     excluding the null termination character.
  83
  84     If an invalid UTF-8 sequence is encountered, the
  85     function returns -1.
  86
  87     If the return value equals count, there was not enough space to fit the
  88     string and the null terminator in the buffer.
  89
  90
  91 2.3  ldap_x_wc_to_utf8  -  Convert a single wide character to a UTF-8 sequence.
  92
  93 int ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
  94
  95   utf8char      (OUT)   Points to a byte array to receive the converted UTF-8
  96                                 string.
  97
  98   wchar         (IN)    The wide character to convert.
  99
 100   count         (IN)    The maximum number of bytes to write to the output
 101                     buffer.  Normally set this to LDAP_MAX_UTF8_LEN, which
 102                     is defined as 3 or 6 depending on the size of wchar_t.
 103                     A partial character will not be written.
 104
 105 Return Value:
 106                 If successful, the function returns the length in bytes of
 107                 the converted UTF-8 output character.
 108
 109         If wchar is the null wide character, the function returns 1
 110         and a null byte is written to utf8char.
 111
 112         If wchar cannot be converted to a UTF-8 character, the
 113         function returns -1.
 114
 115
 116 2.4  ldap_x_wcs_to_utf8s  -  Convert a wide character string to a UTF-8 string.
 117
 118 int ldap_x_wcs_to_utf8s (char *utf8str, const wchar_t *wcstr, size_t count)
 119
 120   utf8str       (OUT)   Points to a byte array to receive the converted
 121                     UTF-8 string. The output string will be null
 122                     terminated if there is space for it in the
 123                     buffer.
 124
 125
 126   wcstr         (IN)    Address of the null-terminated wide char string to convert.
 127
 128   count         (IN)    The size of the output buffer in bytes.
 129
 130 Return Value:
 131                 If successful, the function returns the number of bytes
 132                 written to utf8str, excluding the null termination
 133         character, if any.
 134
 135                 If utf8str is NULL, the function returns the number of
 136         bytes required to contain the converted string, excluding
 137         the null termination character.  The 'count' parameter is ignored.
 138
 139         If the function encounters a wide character that cannot
 140         be mapped to a UTF-8 sequence, the function returns -1.
 141
 142         If the return value equals count, there was not enough space to fit
 143         the string and the null terminator in the buffer.
 144
 145
 146
 147 3. Multi-byte <--> UTF-8 Conversions
 148
 149 These functions convert the string in a two-step process, from multibyte
 150 to Wide, then from Wide to UTF-8, or vice versa.  This conversion requires a
 151 charset translation routine, which is passed in as an argument.
 152
 153    ldap_x_mb_to_utf8  -  Convert a multi-byte character to a UTF-8 character.
 154    ldap_x_mbs_to_utf8s  -  Convert a multi-byte string to a UTF-8 string.
 155    ldap_x_utf8_to_mb  -  Convert a UTF-8 character to a multi-byte character.
 156    ldap_x_utf8s_to_mbs  -  Convert a UTF-8 string to a multi-byte string.
 157
 158 3.1  ldap_x_mb_to_utf8  -  Convert a multi-byte character to a UTF-8 character.
 159
 160 int ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count)  )
 161
 162   utf8char      (OUT)   Points to a byte buffer to receive the converted
 163                     UTF-8 character.  May be NULL.  The output is not
 164                     null-terminated.
 165
 166   mbchar    (IN)        Address of a sequence of bytes forming a multibyte character.
 167
 168   mbsize        (IN)    The maximum number of bytes of the mbchar argument to
 169                     check.  This should normally be MB_CUR_MAX.
 170
 171   f_mbtowc      (IN)    The function to use for converting a multibyte
 172                     character to a wide character.  If NULL, the local
 173                     ANSI C routine mbtowc is used.
 174
 175 Return Value:
 176                 If successful, the function returns the length in bytes of
 177         the UTF-8 output character.
 178
 179         If utf8char is NULL, count is ignored and the function
 180         returns the number of bytes that would be written to the
 181         output char.
 182
 183         If count is zero, 0 is returned and nothing is written to
 184         utf8char.
 185
 186         If mbchar is NULL or points to an empty string, the
 187         function returns 1 and a null byte is written to utf8char.
 188
 189         If mbchar contains an invalid multi-byte character, -1 is returned.
 190
 191
 192 3.2  ldap_x_mbs_to_utf8s  -  Convert a multi-byte string to a UTF-8 string.
 193
 194 int ldap_x_mbs_to_utf8s (char *utf8str, const char *mbstr, size_t count,
 195         size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count))
 196
 197 utf8str     (OUT)       Points to a buffer to receive the converted UTF-8 string.
 198                     May be NULL.
 199
 200   mbstr         (IN)    Address of the null-terminated multi-byte input string.
 201
 202   count     (IN)        The size of the output buffer in bytes.
 203
 204   f_mbstowcs (IN)       The function to use for converting a multibyte string
 205                         to a wide character string.  If NULL, the local ANSI
 206                         C routine mbstowcs is used.
 207
 208 Return Value:
 209                 If successful, the function returns the length in
 210         bytes of the UTF-8 output string, excluding the null
 211         terminator, if present.
 212
 213         If utf8str is NULL, count is ignored and the function
 214         returns the number of bytes required for the output string,
 215         excluding the null terminator.
 216
 217         If count is zero, 0 is returned and nothing is written to utf8str.
 218
 219         If mbstr is NULL or points to an empty string, the
 220         function returns 1 and a null byte is written to utf8str.
 221
 222         If mbstr contains an invalid multi-byte character, -1 is returned.
 223
 224         If the returned value is equal to count, the entire null-terminated
 225         string would not fit in the output buffer.
 226
 227
 228 3.3  ldap_x_utf8_to_mb  -  Convert a UTF-8 character to a multi-byte character.
 229
 230 int ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 231                         int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 232
 233 mbchar  (OUT)   Points to a byte buffer to receive the converted multi-byte
 234                 character.  May be NULL.
 235
 236   utf8char      (IN)    Address of the UTF-8 character sequence.
 237
 238   f_wctomb      (IN)    The function to use for converting a wide character
 239                     to a multibyte character.  If NULL, the local
 240                     ANSI C routine wctomb is used.
 241
 242
 243 Return Value:
 244                 If successful, the function returns the length in
 245         bytes of the multi-byte output character.
 246
 247         If utf8char is NULL or points to an empty string, the
 248         function returns 1 and a null byte is written to mbchar.
 249
 250         If utf8char contains an invalid UTF-8 sequence, -1 is returned.
 251
 252
 253 3.4  ldap_x_utf8s_to_mbs  -  Convert a UTF-8 string to a multi-byte string.
 254
 255
 256 int ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 257         size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 258
 259   mbstr         (OUT)   Points to a byte buffer to receive the converted
 260                     multi-byte string.  May be NULL.
 261
 262   utf8str   (IN)        Address of the null-terminated UTF-8 string to convert.
 263
 264   count         (IN)    The size of the output buffer in bytes.
 265
 266   f_wcstombs (IN)       The function to use for converting a wide character
 267                     string to a multibyte string.  If NULL, the local
 268                     ANSI C routine wcstombs is used.
 269
 270 Return Value:
 271         If successful, the function returns the number of bytes
 272                 written to mbstr, excluding the null termination
 273         character, if any.
 274
 275         If mbstr is NULL, count is ignored and the function
 276         returns the number of bytes required for the output string,
 277         excluding the null terminator.
 278
 279         If count is zero, 0 is returned and nothing is written to
 280         mbstr.
 281
 282         If utf8str is NULL or points to an empty string, the
 283         function returns 1 and a null byte is written to mbstr.
 284
 285         If an invalid UTF-8 character is encountered, the
 286         function returns -1.
 287
 288 The output string will be null terminated if there is space for it in
 289 the output buffer.
 290
 291