git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP$ */
   2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   3  *
   4  * Copyright 1998-2004 The OpenLDAP Foundation.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted only as authorized by the OpenLDAP
   9  * Public License.
  10  *
  11  * A copy of this license is available in the file LICENSE in the
  12  * top-level directory of the distribution or, alternatively, at
  13  * <http://www.OpenLDAP.org/license.html>.
  14  */
  15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16  *
  17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25  *---
  26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28  * of OpenLDAP Software.
  29  */
  30
  31 /*
  32  * UTF-8 Conversion Routines
  33  *
  34  * These routines convert between Wide Character and UTF-8,
  35  * or between MultiByte and UTF-8 encodings.
  36  *
  37  * Both single character and string versions of the functions are provided.
  38  * All functions return -1 if the character or string cannot be converted.
  39  */
  40
#include "portable.h"

#include <stdio.h>
#include <limits.h>		/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
  51
  52 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  53
  54
/*-----------------------------------------------------------------------------
					UTF-8 Format Summary

ASCII chars (1 byte)                7 bits
    0xxxxxxx

2-byte UTF-8 sequence              11 bits
    110xxxxx  10xxxxxx

3-byte UTF-8 sequence              16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-byte UTF-8 sequence              21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-byte UTF-8 sequence              26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-byte UTF-8 sequence              31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)    21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits

Note:  This code does not prevent UTF-8 sequences which are longer than
	   necessary (i.e. use more bytes than the value requires) from
	   being decoded.
*/
  82
  83 /*-----------------------------------------------------------------------------
  84    Convert a UTF-8 character to a wide char.
  85    Return the length of the UTF-8 input character in bytes.
  86 */
  87 int
  88 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  89 {
  90         int utflen, i;
  91         wchar_t ch;
  92
  93         /* If input ptr is NULL, treat it as empty string. */
  94         if (utf8char == NULL)
  95                 utf8char = "";
  96
  97         /* Get UTF-8 sequence length from 1st byte */
  98         utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
  99
 100         if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
 101                 return -1;                                                                      /* Invalid input */
 102
 103         /* First byte minus length tag */
 104         ch = (wchar_t)(utf8char[0] & mask[utflen]);
 105
 106         for(i=1; i < utflen; i++)
 107         {
 108                 /* Subsequent bytes must start with 10 */
 109                 if ((utf8char[i] & 0xc0) != 0x80)
 110                         return -1;
 111
 112                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 113                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 114         }
 115
 116         if (wchar)
 117                 *wchar = ch;
 118
 119         return utflen;
 120 }
 121
 122 /*-----------------------------------------------------------------------------
 123    Convert a UTF-8 string to a wide char string.
 124    No more than 'count' wide chars will be written to the output buffer.
 125    Return the size of the converted string in wide chars, excl null terminator.
 126 */
 127 int
 128 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 129 {
 130         size_t wclen = 0;
 131         int utflen, i;
 132         wchar_t ch;
 133
 134
 135         /* If input ptr is NULL, treat it as empty string. */
 136         if (utf8str == NULL)
 137                 utf8str = "";
 138
 139         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 140         while ( *utf8str && (wcstr==NULL || wclen<count) )
 141         {
 142                 /* Get UTF-8 sequence length from 1st byte */
 143                 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
 144
 145                 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
 146                         return -1;                                                                      /* Invalid input */
 147
 148                 /* First byte minus length tag */
 149                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 150
 151                 for(i=1; i < utflen; i++)
 152                 {
 153                         /* Subsequent bytes must start with 10 */
 154                         if ((utf8str[i] & 0xc0) != 0x80)
 155                                 return -1;
 156
 157                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 158                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 159                 }
 160
 161                 if (wcstr)
 162                         wcstr[wclen] = ch;
 163
 164                 utf8str += utflen;              /* Move to next UTF-8 character */
 165                 wclen++;                                /* Count number of wide chars stored/required */
 166         }
 167
 168         /* Add null terminator if there's room in the buffer. */
 169         if (wcstr && wclen < count)
 170                 wcstr[wclen] = 0;
 171
 172         return wclen;
 173 }
 174
 175
 176 /*-----------------------------------------------------------------------------
 177    Convert one wide char to a UTF-8 character.
 178    Return the length of the converted UTF-8 character in bytes.
 179    No more than 'count' bytes will be written to the output buffer.
 180 */
 181 int
 182 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 183 {
 184         int len=0;
 185
 186         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 187         {                                               /* Ignore count */
 188                 if( wchar < 0 )
 189                         return -1;
 190                 if( wchar < 0x80 )
 191                         return 1;
 192                 if( wchar < 0x800 )
 193                         return 2;
 194                 if( wchar < 0x10000 )
 195                         return 3;
 196                 if( wchar < 0x200000 )
 197                         return 4;
 198                 if( wchar < 0x4000000 )
 199                         return 5;
 200                 if( wchar < 0x80000000 )
 201                         return 6;
 202                 return -1;
 203         }
 204
 205
 206         if ( wchar < 0 ) {                              /* Invalid wide character */
 207                 len = -1;
 208
 209         } else if( wchar < 0x80 ) {
 210                 if (count >= 1) {
 211                         utf8char[len++] = (char)wchar;
 212                 }
 213
 214         } else if( wchar < 0x800 ) {
 215                 if (count >=2) {
 216                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 217                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 218                 }
 219
 220         } else if( wchar < 0x10000 ) {
 221                 if (count >= 3) {
 222                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 223                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 224                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 225                 }
 226
 227         } else if( wchar < 0x200000 ) {
 228                 if (count >= 4) {
 229                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 230                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 231                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 232                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 233                 }
 234
 235         } else if( wchar < 0x4000000 ) {
 236                 if (count >= 5) {
 237                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 238                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 239                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 240                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 241                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 242                 }
 243
 244         } else if( wchar < 0x80000000 ) {
 245                 if (count >= 6) {
 246                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 247                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 248                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 249                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 250                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 251                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 252                 }
 253
 254         } else
 255                 len = -1;
 256
 257         return len;
 258
 259 }
 260
 261
 262 /*-----------------------------------------------------------------------------
 263    Convert a wide char string to a UTF-8 string.
 264    No more than 'count' bytes will be written to the output buffer.
 265    Return the # of bytes written to the output buffer, excl null terminator.
 266 */
 267 int
 268 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 269 {
 270         int len = 0;
 271         int n;
 272         char *p = utf8str;
 273         wchar_t empty = 0;              /* To avoid use of L"" construct */
 274
 275         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 276                 wcstr = &empty;
 277
 278         if (utf8str == NULL)    /* Just compute size of output, excl null */
 279         {
 280                 while (*wcstr)
 281                 {
 282                         /* Get UTF-8 size of next wide char */
 283                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 284                         if (n == -1)
 285                                 return -1;
 286                         len += n;
 287                 }
 288
 289                 return len;
 290         }
 291
 292
 293         /* Do the actual conversion. */
 294
 295         n = 1;                                  /* In case of empty wcstr */
 296         while (*wcstr)
 297         {
 298                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 299
 300                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 301                         break;
 302
 303                 p += n;
 304                 count -= n;                     /* Space left in output buffer */
 305         }
 306
 307         /* If not enough room for last character, pad remainder with null
 308            so that return value = original count, indicating buffer full. */
 309         if (n == 0)
 310         {
 311                 while (count--)
 312                         *p++ = 0;
 313         }
 314
 315         /* Add a null terminator if there's room. */
 316         else if (count)
 317                 *p = 0;
 318
 319         if (n == -1)                    /* Conversion encountered invalid wide char. */
 320                 return -1;
 321
 322         /* Return the number of bytes written to output buffer, excl null. */
 323         return (p - utf8str);
 324 }
 325
 326
 327 /*-----------------------------------------------------------------------------
 328    Convert a UTF-8 character to a MultiByte character.
 329    Return the size of the converted character in bytes.
 330 */
 331 int
 332 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 333                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 334 {
 335         wchar_t wchar;
 336         int n;
 337         char tmp[6];                            /* Large enough for biggest multibyte char */
 338
 339         if (f_wctomb == NULL)           /* If no conversion function was given... */
 340                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 341
 342         /* First convert UTF-8 char to a wide char */
 343         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 344
 345         if (n == -1)
 346                 return -1;              /* Invalid UTF-8 character */
 347
 348         if (mbchar == NULL)
 349                 n = f_wctomb( tmp, wchar );
 350         else
 351                 n = f_wctomb( mbchar, wchar);
 352
 353         return n;
 354 }
 355
 356 /*-----------------------------------------------------------------------------
 357    Convert a UTF-8 string to a MultiByte string.
 358    No more than 'count' bytes will be written to the output buffer.
 359    Return the size of the converted string in bytes, excl null terminator.
 360 */
 361 int
 362 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 363                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 364 {
 365         wchar_t *wcs;
 366         size_t wcsize;
 367     int n;
 368
 369         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 370                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 371
 372         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 373         {
 374                 if (mbstr)
 375                         *mbstr = 0;
 376                 return 0;
 377         }
 378
 379 /* Allocate memory for the maximum size wchar string that we could get. */
 380         wcsize = strlen(utf8str) + 1;
 381         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 382         if (wcs == NULL)
 383                 return -1;                              /* Memory allocation failure. */
 384
 385         /* First convert the UTF-8 string to a wide char string */
 386         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 387
 388         /* Then convert wide char string to multi-byte string */
 389         if (n != -1)
 390         {
 391                 n = f_wcstombs(mbstr, wcs, count);
 392         }
 393
 394         LDAP_FREE(wcs);
 395
 396         return n;
 397 }
 398
 399 /*-----------------------------------------------------------------------------
 400    Convert a MultiByte character to a UTF-8 character.
 401    'mbsize' indicates the number of bytes of 'mbchar' to check.
 402    Returns the number of bytes written to the output character.
 403 */
 404 int
 405 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 406                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 407 {
 408     wchar_t wchar;
 409     int n;
 410
 411         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 412                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 413
 414     if (mbsize == 0)                            /* 0 is not valid. */
 415         return -1;
 416
 417     if (mbchar == NULL || *mbchar == 0)
 418     {
 419         if (utf8char)
 420             *utf8char = 0;
 421         return 1;
 422     }
 423
 424         /* First convert the MB char to a Wide Char */
 425         n = f_mbtowc( &wchar, mbchar, mbsize);
 426
 427         if (n == -1)
 428                 return -1;
 429
 430         /* Convert the Wide Char to a UTF-8 character. */
 431         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 432
 433         return n;
 434 }
 435
 436
 437 /*-----------------------------------------------------------------------------
 438    Convert a MultiByte string to a UTF-8 string.
 439    No more than 'count' bytes will be written to the output buffer.
 440    Return the size of the converted string in bytes, excl null terminator.
 441 */
 442 int
 443 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 444                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 445 {
 446         wchar_t *wcs;
 447         int n;
 448         size_t wcsize;
 449
 450         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 451                 mbstr = "";
 452
 453         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 454                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 455
 456         /* Allocate memory for the maximum size wchar string that we could get. */
 457         wcsize = strlen(mbstr) + 1;
 458         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 459         if (wcs == NULL)
 460                 return -1;
 461
 462         /* First convert multi-byte string to a wide char string */
 463         n = f_mbstowcs(wcs, mbstr, wcsize);
 464
 465         /* Convert wide char string to UTF-8 string */
 466         if (n != -1)
 467         {
 468                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 469         }
 470
 471         LDAP_FREE(wcs);
 472
 473         return n;
 474 }