git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP$ */
   2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   3  *
   4  * Copyright 1998-2013 The OpenLDAP Foundation.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted only as authorized by the OpenLDAP
   9  * Public License.
  10  *
  11  * A copy of this license is available in the file LICENSE in the
  12  * top-level directory of the distribution or, alternatively, at
  13  * <http://www.OpenLDAP.org/license.html>.
  14  */
  15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16  *
  17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25  *---
  26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28  * of OpenLDAP Software.
  29  */
  30
  31 /*
  32  * UTF-8 Conversion Routines
  33  *
  34  * These routines convert between Wide Character and UTF-8,
  35  * or between MultiByte and UTF-8 encodings.
  36  *
  37  * Both single character and string versions of the functions are provided.
  38  * All functions return -1 if the character or string cannot be converted.
  39  */
  40
  41 #include "portable.h"
  42
  43 #if SIZEOF_WCHAR_T >= 4
  44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
  45
#include <stdio.h>
#include <limits.h>		/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
  54
  55 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  56
  57
/*-----------------------------------------------------------------------------
					UTF-8 Format Summary

ASCII chars (1-byte UTF-8 sequence)	 7 bits
    0xxxxxxx

2-byte UTF-8 sequence			11 bits
    110xxxxx  10xxxxxx

3-byte UTF-8 sequence			16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-byte UTF-8 sequence			21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-byte UTF-8 sequence			26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-byte UTF-8 sequence			31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)    21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits

Note: This code does not prevent UTF-8 sequences which are longer than
      necessary from being decoded.
*/
  85
  86 /*-----------------------------------------------------------------------------
  87    Convert a UTF-8 character to a wide char.
  88    Return the length of the UTF-8 input character in bytes.
  89 */
  90 int
  91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  92 {
  93         int utflen, i;
  94         wchar_t ch;
  95
  96         if (utf8char == NULL) return -1;
  97
  98         /* Get UTF-8 sequence length from 1st byte */
  99         utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
 100
 101         if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 102
 103         /* First byte minus length tag */
 104         ch = (wchar_t)(utf8char[0] & mask[utflen]);
 105
 106         for(i=1; i < utflen; i++) {
 107                 /* Subsequent bytes must start with 10 */
 108                 if ((utf8char[i] & 0xc0) != 0x80) return -1;
 109
 110                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 111                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 112         }
 113
 114         if (wchar) *wchar = ch;
 115
 116         return utflen;
 117 }
 118
 119 /*-----------------------------------------------------------------------------
 120    Convert a UTF-8 string to a wide char string.
 121    No more than 'count' wide chars will be written to the output buffer.
 122    Return the size of the converted string in wide chars, excl null terminator.
 123 */
 124 int
 125 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 126 {
 127         size_t wclen = 0;
 128         int utflen, i;
 129         wchar_t ch;
 130
 131
 132         /* If input ptr is NULL or empty... */
 133         if (utf8str == NULL || !*utf8str) {
 134                 if ( wcstr )
 135                         *wcstr = 0;
 136                 return 0;
 137         }
 138
 139         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 140         while ( *utf8str && (wcstr==NULL || wclen<count) ) {
 141                 /* Get UTF-8 sequence length from 1st byte */
 142                 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
 143
 144                 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 145
 146                 /* First byte minus length tag */
 147                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 148
 149                 for(i=1; i < utflen; i++) {
 150                         /* Subsequent bytes must start with 10 */
 151                         if ((utf8str[i] & 0xc0) != 0x80) return -1;
 152
 153                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 154                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 155                 }
 156
 157                 if (wcstr) wcstr[wclen] = ch;
 158
 159                 utf8str += utflen;      /* Move to next UTF-8 character */
 160                 wclen++;                        /* Count number of wide chars stored/required */
 161         }
 162
 163         /* Add null terminator if there's room in the buffer. */
 164         if (wcstr && wclen < count) wcstr[wclen] = 0;
 165
 166         return wclen;
 167 }
 168
 169
 170 /*-----------------------------------------------------------------------------
 171    Convert one wide char to a UTF-8 character.
 172    Return the length of the converted UTF-8 character in bytes.
 173    No more than 'count' bytes will be written to the output buffer.
 174 */
 175 int
 176 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 177 {
 178         int len=0;
 179
 180         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 181         {                                               /* Ignore count */
 182                 if( wchar < 0 )
 183                         return -1;
 184                 if( wchar < 0x80 )
 185                         return 1;
 186                 if( wchar < 0x800 )
 187                         return 2;
 188                 if( wchar < 0x10000 )
 189                         return 3;
 190                 if( wchar < 0x200000 )
 191                         return 4;
 192                 if( wchar < 0x4000000 )
 193                         return 5;
 194 #if SIZEOF_WCHAR_T > 4
 195                 /* UL is not strictly needed by ANSI C */
 196                 if( wchar < (wchar_t)0x80000000UL )
 197 #endif /* SIZEOF_WCHAR_T > 4 */
 198                         return 6;
 199                 return -1;
 200         }
 201
 202
 203         if ( wchar < 0 ) {                              /* Invalid wide character */
 204                 len = -1;
 205
 206         } else if( wchar < 0x80 ) {
 207                 if (count >= 1) {
 208                         utf8char[len++] = (char)wchar;
 209                 }
 210
 211         } else if( wchar < 0x800 ) {
 212                 if (count >=2) {
 213                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 214                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 215                 }
 216
 217         } else if( wchar < 0x10000 ) {
 218                 if (count >= 3) {
 219                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 220                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 221                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 222                 }
 223
 224         } else if( wchar < 0x200000 ) {
 225                 if (count >= 4) {
 226                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 227                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 228                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 229                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 230                 }
 231
 232         } else if( wchar < 0x4000000 ) {
 233                 if (count >= 5) {
 234                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 235                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 236                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 237                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 238                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 239                 }
 240
 241         } else
 242 #if SIZEOF_WCHAR_T > 4
 243                 /* UL is not strictly needed by ANSI C */
 244                 if( wchar < (wchar_t)0x80000000UL )
 245 #endif /* SIZEOF_WCHAR_T > 4 */
 246         {
 247                 if (count >= 6) {
 248                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 249                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 250                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 251                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 252                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 253                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 254                 }
 255
 256 #if SIZEOF_WCHAR_T > 4
 257         } else {
 258                 len = -1;
 259 #endif /* SIZEOF_WCHAR_T > 4 */
 260         }
 261
 262         return len;
 263
 264 }
 265
 266
 267 /*-----------------------------------------------------------------------------
 268    Convert a wide char string to a UTF-8 string.
 269    No more than 'count' bytes will be written to the output buffer.
 270    Return the # of bytes written to the output buffer, excl null terminator.
 271 */
 272 int
 273 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 274 {
 275         int len = 0;
 276         int n;
 277         char *p = utf8str;
 278         wchar_t empty = 0;              /* To avoid use of L"" construct */
 279
 280         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 281                 wcstr = &empty;
 282
 283         if (utf8str == NULL)    /* Just compute size of output, excl null */
 284         {
 285                 while (*wcstr)
 286                 {
 287                         /* Get UTF-8 size of next wide char */
 288                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 289                         if (n == -1)
 290                                 return -1;
 291                         len += n;
 292                 }
 293
 294                 return len;
 295         }
 296
 297
 298         /* Do the actual conversion. */
 299
 300         n = 1;                                  /* In case of empty wcstr */
 301         while (*wcstr)
 302         {
 303                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 304
 305                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 306                         break;
 307
 308                 p += n;
 309                 count -= n;                     /* Space left in output buffer */
 310         }
 311
 312         /* If not enough room for last character, pad remainder with null
 313            so that return value = original count, indicating buffer full. */
 314         if (n == 0)
 315         {
 316                 while (count--)
 317                         *p++ = 0;
 318         }
 319
 320         /* Add a null terminator if there's room. */
 321         else if (count)
 322                 *p = 0;
 323
 324         if (n == -1)                    /* Conversion encountered invalid wide char. */
 325                 return -1;
 326
 327         /* Return the number of bytes written to output buffer, excl null. */
 328         return (p - utf8str);
 329 }
 330
 331 #ifdef ANDROID
 332 int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); }
 333 int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); }
 334 #endif
 335
 336 /*-----------------------------------------------------------------------------
 337    Convert a UTF-8 character to a MultiByte character.
 338    Return the size of the converted character in bytes.
 339 */
 340 int
 341 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 342                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 343 {
 344         wchar_t wchar;
 345         int n;
 346         char tmp[6];                            /* Large enough for biggest multibyte char */
 347
 348         if (f_wctomb == NULL)           /* If no conversion function was given... */
 349                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 350
 351         /* First convert UTF-8 char to a wide char */
 352         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 353
 354         if (n == -1)
 355                 return -1;              /* Invalid UTF-8 character */
 356
 357         if (mbchar == NULL)
 358                 n = f_wctomb( tmp, wchar );
 359         else
 360                 n = f_wctomb( mbchar, wchar);
 361
 362         return n;
 363 }
 364
 365 /*-----------------------------------------------------------------------------
 366    Convert a UTF-8 string to a MultiByte string.
 367    No more than 'count' bytes will be written to the output buffer.
 368    Return the size of the converted string in bytes, excl null terminator.
 369 */
 370 int
 371 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 372                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 373 {
 374         wchar_t *wcs;
 375         size_t wcsize;
 376     int n;
 377
 378         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 379                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 380
 381         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 382         {
 383                 if (mbstr)
 384                         *mbstr = 0;
 385                 return 0;
 386         }
 387
 388 /* Allocate memory for the maximum size wchar string that we could get. */
 389         wcsize = strlen(utf8str) + 1;
 390         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 391         if (wcs == NULL)
 392                 return -1;                              /* Memory allocation failure. */
 393
 394         /* First convert the UTF-8 string to a wide char string */
 395         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 396
 397         /* Then convert wide char string to multi-byte string */
 398         if (n != -1)
 399         {
 400                 n = f_wcstombs(mbstr, wcs, count);
 401         }
 402
 403         LDAP_FREE(wcs);
 404
 405         return n;
 406 }
 407
 408 /*-----------------------------------------------------------------------------
 409    Convert a MultiByte character to a UTF-8 character.
 410    'mbsize' indicates the number of bytes of 'mbchar' to check.
 411    Returns the number of bytes written to the output character.
 412 */
 413 int
 414 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 415                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 416 {
 417     wchar_t wchar;
 418     int n;
 419
 420         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 421                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 422
 423     if (mbsize == 0)                            /* 0 is not valid. */
 424         return -1;
 425
 426     if (mbchar == NULL || *mbchar == 0)
 427     {
 428         if (utf8char)
 429             *utf8char = 0;
 430         return 1;
 431     }
 432
 433         /* First convert the MB char to a Wide Char */
 434         n = f_mbtowc( &wchar, mbchar, mbsize);
 435
 436         if (n == -1)
 437                 return -1;
 438
 439         /* Convert the Wide Char to a UTF-8 character. */
 440         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 441
 442         return n;
 443 }
 444
 445
 446 /*-----------------------------------------------------------------------------
 447    Convert a MultiByte string to a UTF-8 string.
 448    No more than 'count' bytes will be written to the output buffer.
 449    Return the size of the converted string in bytes, excl null terminator.
 450 */
 451 int
 452 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 453                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 454 {
 455         wchar_t *wcs;
 456         int n;
 457         size_t wcsize;
 458
 459         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 460                 mbstr = "";
 461
 462         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 463                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 464
 465         /* Allocate memory for the maximum size wchar string that we could get. */
 466         wcsize = strlen(mbstr) + 1;
 467         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 468         if (wcs == NULL)
 469                 return -1;
 470
 471         /* First convert multi-byte string to a wide char string */
 472         n = f_mbstowcs(wcs, mbstr, wcsize);
 473
 474         /* Convert wide char string to UTF-8 string */
 475         if (n != -1)
 476         {
 477                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 478         }
 479
 480         LDAP_FREE(wcs);
 481
 482         return n;
 483 }
 484
 485 #endif /* SIZEOF_WCHAR_T >= 4 */