git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP$ */
   2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   3  *
   4  * Copyright 1998-2004 The OpenLDAP Foundation.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted only as authorized by the OpenLDAP
   9  * Public License.
  10  *
  11  * A copy of this license is available in the file LICENSE in the
  12  * top-level directory of the distribution or, alternatively, at
  13  * <http://www.OpenLDAP.org/license.html>.
  14  */
  15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16  *
  17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25  *---
  26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28  * of OpenLDAP Software.
  29  */
  30
  31 /*
  32  * UTF-8 Conversion Routines
  33  *
  34  * These routines convert between Wide Character and UTF-8,
  35  * or between MultiByte and UTF-8 encodings.
  36  *
  37  * Both single character and string versions of the functions are provided.
  38  * All functions return -1 if the character or string cannot be converted.
  39  */
  40
  41 #include "portable.h"
  42
  43 #if SIZEOF_WCHAR_T >= 4
  44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
  45
#include <stdio.h>
#include <limits.h>		/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
  54
  55 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  56
  57
  58 /*-----------------------------------------------------------------------------
  59                                         UTF-8 Format Summary
  60
  61 ASCII chars                                             7 bits
  62     0xxxxxxx
  63
  64 2-character UTF-8 sequence:        11 bits
  65     110xxxxx  10xxxxxx
  66
  67 3-character UTF-8                  16 bits
  68     1110xxxx  10xxxxxx  10xxxxxx
  69
  70 4-char UTF-8                       21 bits
  71     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
  72
  73 5-char UTF-8                       26 bits
  74     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  75
  76 6-char UTF-8                       31 bits
  77     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  78
  79 Unicode address space   (0 - 0x10FFFF)    21 bits
  80 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
  81
  82 Note:  This code does not prevent UTF-8 sequences which are longer than
  83            necessary from being decoded.
  84 */
  85
  86 /*-----------------------------------------------------------------------------
  87    Convert a UTF-8 character to a wide char.
  88    Return the length of the UTF-8 input character in bytes.
  89 */
  90 int
  91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  92 {
  93         int utflen, i;
  94         wchar_t ch;
  95
  96         /* If input ptr is NULL, treat it as empty string. */
  97         if (utf8char == NULL)
  98                 utf8char = "";
  99
 100         /* Get UTF-8 sequence length from 1st byte */
 101         utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
 102
 103         if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
 104                 return -1;                                                                      /* Invalid input */
 105
 106         /* First byte minus length tag */
 107         ch = (wchar_t)(utf8char[0] & mask[utflen]);
 108
 109         for(i=1; i < utflen; i++)
 110         {
 111                 /* Subsequent bytes must start with 10 */
 112                 if ((utf8char[i] & 0xc0) != 0x80)
 113                         return -1;
 114
 115                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 116                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 117         }
 118
 119         if (wchar)
 120                 *wchar = ch;
 121
 122         return utflen;
 123 }
 124
 125 /*-----------------------------------------------------------------------------
 126    Convert a UTF-8 string to a wide char string.
 127    No more than 'count' wide chars will be written to the output buffer.
 128    Return the size of the converted string in wide chars, excl null terminator.
 129 */
 130 int
 131 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 132 {
 133         size_t wclen = 0;
 134         int utflen, i;
 135         wchar_t ch;
 136
 137
 138         /* If input ptr is NULL, treat it as empty string. */
 139         if (utf8str == NULL)
 140                 utf8str = "";
 141
 142         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 143         while ( *utf8str && (wcstr==NULL || wclen<count) )
 144         {
 145                 /* Get UTF-8 sequence length from 1st byte */
 146                 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
 147
 148                 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN )
 149                         return -1;                                                                      /* Invalid input */
 150
 151                 /* First byte minus length tag */
 152                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 153
 154                 for(i=1; i < utflen; i++)
 155                 {
 156                         /* Subsequent bytes must start with 10 */
 157                         if ((utf8str[i] & 0xc0) != 0x80)
 158                                 return -1;
 159
 160                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 161                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 162                 }
 163
 164                 if (wcstr)
 165                         wcstr[wclen] = ch;
 166
 167                 utf8str += utflen;              /* Move to next UTF-8 character */
 168                 wclen++;                                /* Count number of wide chars stored/required */
 169         }
 170
 171         /* Add null terminator if there's room in the buffer. */
 172         if (wcstr && wclen < count)
 173                 wcstr[wclen] = 0;
 174
 175         return wclen;
 176 }
 177
 178
 179 /*-----------------------------------------------------------------------------
 180    Convert one wide char to a UTF-8 character.
 181    Return the length of the converted UTF-8 character in bytes.
 182    No more than 'count' bytes will be written to the output buffer.
 183 */
 184 int
 185 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 186 {
 187         int len=0;
 188
 189         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 190         {                                               /* Ignore count */
 191                 if( wchar < 0 )
 192                         return -1;
 193                 if( wchar < 0x80 )
 194                         return 1;
 195                 if( wchar < 0x800 )
 196                         return 2;
 197                 if( wchar < 0x10000 )
 198                         return 3;
 199                 if( wchar < 0x200000 )
 200                         return 4;
 201                 if( wchar < 0x4000000 )
 202                         return 5;
 203                 if( wchar < 0x80000000 )
 204                         return 6;
 205                 return -1;
 206         }
 207
 208
 209         if ( wchar < 0 ) {                              /* Invalid wide character */
 210                 len = -1;
 211
 212         } else if( wchar < 0x80 ) {
 213                 if (count >= 1) {
 214                         utf8char[len++] = (char)wchar;
 215                 }
 216
 217         } else if( wchar < 0x800 ) {
 218                 if (count >=2) {
 219                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 220                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 221                 }
 222
 223         } else if( wchar < 0x10000 ) {
 224                 if (count >= 3) {
 225                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 226                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 227                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 228                 }
 229
 230         } else if( wchar < 0x200000 ) {
 231                 if (count >= 4) {
 232                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 233                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 234                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 235                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 236                 }
 237
 238         } else if( wchar < 0x4000000 ) {
 239                 if (count >= 5) {
 240                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 241                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 242                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 243                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 244                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 245                 }
 246
 247         } else if( wchar < 0x80000000 ) {
 248                 if (count >= 6) {
 249                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 250                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 251                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 252                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 253                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 254                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 255                 }
 256
 257         } else
 258                 len = -1;
 259
 260         return len;
 261
 262 }
 263
 264
 265 /*-----------------------------------------------------------------------------
 266    Convert a wide char string to a UTF-8 string.
 267    No more than 'count' bytes will be written to the output buffer.
 268    Return the # of bytes written to the output buffer, excl null terminator.
 269 */
 270 int
 271 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 272 {
 273         int len = 0;
 274         int n;
 275         char *p = utf8str;
 276         wchar_t empty = 0;              /* To avoid use of L"" construct */
 277
 278         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 279                 wcstr = &empty;
 280
 281         if (utf8str == NULL)    /* Just compute size of output, excl null */
 282         {
 283                 while (*wcstr)
 284                 {
 285                         /* Get UTF-8 size of next wide char */
 286                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 287                         if (n == -1)
 288                                 return -1;
 289                         len += n;
 290                 }
 291
 292                 return len;
 293         }
 294
 295
 296         /* Do the actual conversion. */
 297
 298         n = 1;                                  /* In case of empty wcstr */
 299         while (*wcstr)
 300         {
 301                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 302
 303                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 304                         break;
 305
 306                 p += n;
 307                 count -= n;                     /* Space left in output buffer */
 308         }
 309
 310         /* If not enough room for last character, pad remainder with null
 311            so that return value = original count, indicating buffer full. */
 312         if (n == 0)
 313         {
 314                 while (count--)
 315                         *p++ = 0;
 316         }
 317
 318         /* Add a null terminator if there's room. */
 319         else if (count)
 320                 *p = 0;
 321
 322         if (n == -1)                    /* Conversion encountered invalid wide char. */
 323                 return -1;
 324
 325         /* Return the number of bytes written to output buffer, excl null. */
 326         return (p - utf8str);
 327 }
 328
 329
 330 /*-----------------------------------------------------------------------------
 331    Convert a UTF-8 character to a MultiByte character.
 332    Return the size of the converted character in bytes.
 333 */
 334 int
 335 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 336                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 337 {
 338         wchar_t wchar;
 339         int n;
 340         char tmp[6];                            /* Large enough for biggest multibyte char */
 341
 342         if (f_wctomb == NULL)           /* If no conversion function was given... */
 343                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 344
 345         /* First convert UTF-8 char to a wide char */
 346         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 347
 348         if (n == -1)
 349                 return -1;              /* Invalid UTF-8 character */
 350
 351         if (mbchar == NULL)
 352                 n = f_wctomb( tmp, wchar );
 353         else
 354                 n = f_wctomb( mbchar, wchar);
 355
 356         return n;
 357 }
 358
 359 /*-----------------------------------------------------------------------------
 360    Convert a UTF-8 string to a MultiByte string.
 361    No more than 'count' bytes will be written to the output buffer.
 362    Return the size of the converted string in bytes, excl null terminator.
 363 */
 364 int
 365 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 366                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 367 {
 368         wchar_t *wcs;
 369         size_t wcsize;
 370     int n;
 371
 372         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 373                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 374
 375         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 376         {
 377                 if (mbstr)
 378                         *mbstr = 0;
 379                 return 0;
 380         }
 381
 382 /* Allocate memory for the maximum size wchar string that we could get. */
 383         wcsize = strlen(utf8str) + 1;
 384         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 385         if (wcs == NULL)
 386                 return -1;                              /* Memory allocation failure. */
 387
 388         /* First convert the UTF-8 string to a wide char string */
 389         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 390
 391         /* Then convert wide char string to multi-byte string */
 392         if (n != -1)
 393         {
 394                 n = f_wcstombs(mbstr, wcs, count);
 395         }
 396
 397         LDAP_FREE(wcs);
 398
 399         return n;
 400 }
 401
 402 /*-----------------------------------------------------------------------------
 403    Convert a MultiByte character to a UTF-8 character.
 404    'mbsize' indicates the number of bytes of 'mbchar' to check.
 405    Returns the number of bytes written to the output character.
 406 */
 407 int
 408 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 409                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 410 {
 411     wchar_t wchar;
 412     int n;
 413
 414         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 415                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 416
 417     if (mbsize == 0)                            /* 0 is not valid. */
 418         return -1;
 419
 420     if (mbchar == NULL || *mbchar == 0)
 421     {
 422         if (utf8char)
 423             *utf8char = 0;
 424         return 1;
 425     }
 426
 427         /* First convert the MB char to a Wide Char */
 428         n = f_mbtowc( &wchar, mbchar, mbsize);
 429
 430         if (n == -1)
 431                 return -1;
 432
 433         /* Convert the Wide Char to a UTF-8 character. */
 434         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 435
 436         return n;
 437 }
 438
 439
 440 /*-----------------------------------------------------------------------------
 441    Convert a MultiByte string to a UTF-8 string.
 442    No more than 'count' bytes will be written to the output buffer.
 443    Return the size of the converted string in bytes, excl null terminator.
 444 */
 445 int
 446 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 447                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 448 {
 449         wchar_t *wcs;
 450         int n;
 451         size_t wcsize;
 452
 453         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 454                 mbstr = "";
 455
 456         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 457                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 458
 459         /* Allocate memory for the maximum size wchar string that we could get. */
 460         wcsize = strlen(mbstr) + 1;
 461         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 462         if (wcs == NULL)
 463                 return -1;
 464
 465         /* First convert multi-byte string to a wide char string */
 466         n = f_mbstowcs(wcs, mbstr, wcsize);
 467
 468         /* Convert wide char string to UTF-8 string */
 469         if (n != -1)
 470         {
 471                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 472         }
 473
 474         LDAP_FREE(wcs);
 475
 476         return n;
 477 }
 478
 479 #endif