git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
   4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
   5  */
   6
   7 /* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
   8 /******************************************************************************
   9  * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  10  *
  11  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  12  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  13  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  14  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  15  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  16  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  17  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  18  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  19  ******************************************************************************/
  20
  21 /*
  22  * UTF-8 Conversion Routines
  23  *
  24  * These routines convert between Wide Character and UTF-8,
  25  * or between MultiByte and UTF-8 encodings.
  26  *
  27  * Both single character and string versions of the functions are provided.
  28  * All functions return -1 if the character or string cannot be converted.
  29  */
  30
#include "portable.h"

#include <limits.h>             /* For MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>          /* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>            /* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
  41
  42 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  43
  44
  45 /*-----------------------------------------------------------------------------
  46                                         UTF-8 Format Summary
  47
  48 ASCII chars                                             7 bits
  49     0xxxxxxx
  50
  51 2-character UTF-8 sequence:        11 bits
  52     110xxxxx  10xxxxxx
  53
  54 3-character UTF-8                  16 bits
  55     1110xxxx  10xxxxxx  10xxxxxx
  56
  57 4-char UTF-8                       21 bits
  58     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
  59
  60 5-char UTF-8                       26 bits
  61     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  62
  63 6-char UTF-8                       31 bits
  64     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  65
  66 Unicode address space   (0 - 0x10FFFF)    21 bits
  67 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
  68
  69 Note:  This code does not prevent UTF-8 sequences which are longer than
  70            necessary from being decoded.
  71 */
  72
  73 /*------------------------------------------------------------------------------
  74    Convert a UTF-8 character to a wide char.
  75    Return the length of the UTF-8 input character in bytes.
  76 */
  77 int
  78 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  79 {
  80         int utflen, i;
  81         wchar_t ch;
  82
  83         /* If input ptr is NULL, treat it as empty string. */
  84         if (utf8char == NULL)
  85                 utf8char = "";
  86
  87         /* Get UTF-8 sequence length from 1st byte */
  88         utflen = LDAP_UTF8_CHARLEN(utf8char);
  89
  90         if( utflen==0 || utflen > LDAP_MAX_UTF8_LEN )
  91                 return -1;                                                                      /* Invalid input */
  92
  93         /* First byte minus length tag */
  94         ch = (wchar_t)(utf8char[0] & mask[utflen]);
  95
  96         for(i=1; i < utflen; i++)
  97         {
  98                 /* Subsequent bytes must start with 10 */
  99                 if ((utf8char[i] & 0xc0) != 0x80)
 100                         return -1;
 101
 102                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 103                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 104         }
 105
 106         if (wchar)
 107                 *wchar = ch;
 108
 109         return utflen;
 110 }
 111
 112 /*-----------------------------------------------------------------------------
 113    Convert a UTF-8 string to a wide char string.
 114    No more than 'count' wide chars will be written to the output buffer.
 115    Return the size of the converted string in wide chars, excl null terminator.
 116 */
 117 int
 118 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 119 {
 120         size_t wclen = 0;
 121         int utflen, i;
 122         wchar_t ch;
 123
 124
 125         /* If input ptr is NULL, treat it as empty string. */
 126         if (utf8str == NULL)
 127                 utf8str = "";
 128
 129         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 130         while ( *utf8str && (wcstr==NULL || wclen<count) )
 131         {
 132                 /* Get UTF-8 sequence length from 1st byte */
 133                 utflen = LDAP_UTF8_CHARLEN(utf8str);
 134
 135                 if( utflen==0 || utflen > LDAP_MAX_UTF8_LEN )
 136                         return -1;                                                                      /* Invalid input */
 137
 138                 /* First byte minus length tag */
 139                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 140
 141                 for(i=1; i < utflen; i++)
 142                 {
 143                         /* Subsequent bytes must start with 10 */
 144                         if ((utf8str[i] & 0xc0) != 0x80)
 145                                 return -1;
 146
 147                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 148                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 149                 }
 150
 151                 if (wcstr)
 152                         wcstr[wclen] = ch;
 153
 154                 utf8str += utflen;              /* Move to next UTF-8 character */
 155                 wclen++;                                /* Count number of wide chars stored/required */
 156         }
 157
 158         /* Add null terminator if there's room in the buffer. */
 159         if (wcstr && wclen < count)
 160                 wcstr[wclen] = 0;
 161
 162         return wclen;
 163 }
 164
 165
 166 /*------------------------------------------------------------------------------
 167    Convert one wide char to a UTF-8 character.
 168    Return the length of the converted UTF-8 character in bytes.
 169    No more than 'count' bytes will be written to the output buffer.
 170 */
 171 int
 172 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 173 {
 174         int len=0;
 175
 176         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 177         {                                               /* Ignore count */
 178                 if( wchar < 0 )
 179                         return -1;
 180                 if( wchar < 0x80 )
 181                         return 1;
 182                 if( wchar < 0x800 )
 183                         return 2;
 184                 if( wchar < 0x10000 )
 185                         return 3;
 186                 if( wchar < 0x200000 )
 187                         return 4;
 188                 if( wchar < 0x4000000 )
 189                         return 5;
 190                 if( wchar < 0x80000000 )
 191                         return 6;
 192                 return -1;
 193         }
 194
 195
 196         if ( wchar < 0 ) {                              /* Invalid wide character */
 197                 len = -1;
 198
 199         } else if( wchar < 0x80 ) {
 200                 if (count >= 1) {
 201                         utf8char[len++] = (char)wchar;
 202                 }
 203
 204         } else if( wchar < 0x800 ) {
 205                 if (count >=2) {
 206                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 207                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 208                 }
 209
 210         } else if( wchar < 0x10000 ) {
 211                 if (count >= 3) {
 212                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 213                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 214                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 215                 }
 216
 217         } else if( wchar < 0x200000 ) {
 218                 if (count >= 4) {
 219                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 220                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 221                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 222                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 223                 }
 224
 225         } else if( wchar < 0x4000000 ) {
 226                 if (count >= 5) {
 227                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 228                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 229                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 230                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 231                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 232                 }
 233
 234         } else if( wchar < 0x80000000 ) {
 235                 if (count >= 6) {
 236                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 237                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 238                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 239                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 240                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 241                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 242                 }
 243
 244         } else
 245                 len = -1;
 246
 247         return len;
 248
 249 }
 250
 251
 252 /*-----------------------------------------------------------------------------
 253    Convert a wide char string to a UTF-8 string.
 254    No more than 'count' bytes will be written to the output buffer.
 255    Return the # of bytes written to the output buffer, excl null terminator.
 256 */
 257 int
 258 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 259 {
 260         int len = 0;
 261         int n;
 262         char *p = utf8str;
 263         wchar_t empty = 0;              /* To avoid use of L"" construct */
 264
 265         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 266                 wcstr = &empty;
 267
 268         if (utf8str == NULL)    /* Just compute size of output, excl null */
 269         {
 270                 while (*wcstr)
 271                 {
 272                         /* Get UTF-8 size of next wide char */
 273                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 274                         if (n == -1)
 275                                 return -1;
 276                         len += n;
 277                 }
 278
 279                 return len;
 280         }
 281
 282
 283         /* Do the actual conversion. */
 284
 285         n = 1;                                  /* In case of empty wcstr */
 286         while (*wcstr)
 287         {
 288                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 289
 290                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 291                         break;
 292
 293                 p += n;
 294                 count -= n;                     /* Space left in output buffer */
 295         }
 296
 297         /* If not enough room for last character, pad remainder with null
 298            so that return value = original count, indicating buffer full. */
 299         if (n == 0)
 300         {
 301                 while (count--)
 302                         *p++ = 0;
 303         }
 304
 305         /* Add a null terminator if there's room. */
 306         else if (count)
 307                 *p = 0;
 308
 309         if (n == -1)                    /* Conversion encountered invalid wide char. */
 310                 return -1;
 311
 312         /* Return the number of bytes written to output buffer, excl null. */
 313         return (p - utf8str);
 314 }
 315
 316
 317 /*-----------------------------------------------------------------------------
 318    Convert a UTF-8 character to a MultiByte character.
 319    Return the size of the converted character in bytes.
 320 */
 321 int
 322 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 323                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 324 {
 325         wchar_t wchar;
 326         int n;
 327         char tmp[6];                            /* Large enough for biggest multibyte char */
 328
 329         if (f_wctomb == NULL)           /* If no conversion function was given... */
 330                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 331
 332         /* First convert UTF-8 char to a wide char */
 333         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 334
 335         if (n == -1)
 336                 return -1;              /* Invalid UTF-8 character */
 337
 338         if (mbchar == NULL)
 339                 n = f_wctomb( tmp, wchar );
 340         else
 341                 n = f_wctomb( mbchar, wchar);
 342
 343         return n;
 344 }
 345
 346 /*-----------------------------------------------------------------------------
 347    Convert a UTF-8 string to a MultiByte string.
 348    No more than 'count' bytes will be written to the output buffer.
 349    Return the size of the converted string in bytes, excl null terminator.
 350 */
 351 int
 352 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 353                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 354 {
 355         wchar_t *wcs;
 356         size_t wcsize;
 357     int n;
 358
 359         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 360                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 361
 362         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 363         {
 364                 if (mbstr)
 365                         *mbstr = 0;
 366                 return 0;
 367         }
 368
 369 /* Allocate memory for the maximum size wchar string that we could get. */
 370         wcsize = strlen(utf8str) + 1;
 371         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 372         if (wcs == NULL)
 373                 return -1;                              /* Memory allocation failure. */
 374
 375         /* First convert the UTF-8 string to a wide char string */
 376         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 377
 378         /* Then convert wide char string to multi-byte string */
 379         if (n != -1)
 380         {
 381                 n = f_wcstombs(mbstr, wcs, count);
 382         }
 383
 384         LDAP_FREE(wcs);
 385
 386         return n;
 387 }
 388
 389 /*-----------------------------------------------------------------------------
 390    Convert a MultiByte character to a UTF-8 character.
 391    'mbsize' indicates the number of bytes of 'mbchar' to check.
 392    Returns the number of bytes written to the output character.
 393 */
 394 int
 395 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 396                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 397 {
 398     wchar_t wchar;
 399     int n;
 400
 401         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 402                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 403
 404     if (mbsize == 0)                            /* 0 is not valid. */
 405         return -1;
 406
 407     if (mbchar == NULL || *mbchar == 0)
 408     {
 409         if (utf8char)
 410             *utf8char = 0;
 411         return 1;
 412     }
 413
 414         /* First convert the MB char to a Wide Char */
 415         n = f_mbtowc( &wchar, mbchar, mbsize);
 416
 417         if (n == -1)
 418                 return -1;
 419
 420         /* Convert the Wide Char to a UTF-8 character. */
 421         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 422
 423         return n;
 424 }
 425
 426
 427 /*-----------------------------------------------------------------------------
 428    Convert a MultiByte string to a UTF-8 string.
 429    No more than 'count' bytes will be written to the output buffer.
 430    Return the size of the converted string in bytes, excl null terminator.
 431 */
 432 int
 433 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 434                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 435 {
 436         wchar_t *wcs;
 437         int n;
 438         size_t wcsize;
 439
 440         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 441                 mbstr = "";
 442
 443         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 444                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 445
 446         /* Allocate memory for the maximum size wchar string that we could get. */
 447         wcsize = strlen(mbstr) + 1;
 448         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 449         if (wcs == NULL)
 450                 return -1;
 451
 452         /* First convert multi-byte string to a wide char string */
 453         n = f_mbstowcs(wcs, mbstr, wcsize);
 454
 455         /* Convert wide char string to UTF-8 string */
 456         if (n != -1)
 457         {
 458                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 459         }
 460
 461         LDAP_FREE(wcs);
 462
 463         return n;
 464 }