git.sur5r.net Git - openldap/blob - libraries/libldap/utf-8-conv.c

   1 /* $OpenLDAP$ */
   2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   3  *
   4  * Copyright 1998-2006 The OpenLDAP Foundation.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted only as authorized by the OpenLDAP
   9  * Public License.
  10  *
  11  * A copy of this license is available in the file LICENSE in the
  12  * top-level directory of the distribution or, alternatively, at
  13  * <http://www.OpenLDAP.org/license.html>.
  14  */
  15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16  *
  17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25  *---
  26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28  * of OpenLDAP Software.
  29  */
  30
  31 /*
  32  * UTF-8 Conversion Routines
  33  *
  34  * These routines convert between Wide Character and UTF-8,
  35  * or between MultiByte and UTF-8 encodings.
  36  *
  37  * Both single character and string versions of the functions are provided.
  38  * All functions return -1 if the character or string cannot be converted.
  39  */
  40
  41 #include "portable.h"
  42
  43 #if SIZEOF_WCHAR_T >= 4
  44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
  45
#include <stdio.h>
#include <limits.h>             /* For MB_LEN_MAX */
#include <ac/stdlib.h>          /* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>            /* for time_t */
  50
  51 #include "ldap-int.h"
  52
  53 #include <ldap_utf8.h>
  54
  55 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  56
  57
  58 /*-----------------------------------------------------------------------------
  59                                         UTF-8 Format Summary
  60
  61 ASCII chars                                             7 bits
  62     0xxxxxxx
  63
  64 2-character UTF-8 sequence:        11 bits
  65     110xxxxx  10xxxxxx
  66
  67 3-character UTF-8                  16 bits
  68     1110xxxx  10xxxxxx  10xxxxxx
  69
  70 4-char UTF-8                       21 bits
  71     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
  72
  73 5-char UTF-8                       26 bits
  74     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  75
  76 6-char UTF-8                       31 bits
  77     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
  78
  79 Unicode address space   (0 - 0x10FFFF)    21 bits
  80 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
  81
  82 Note: This code does not prevent UTF-8 sequences which are longer than
  83       necessary from being decoded.
  84 */
  85
  86 /*-----------------------------------------------------------------------------
  87    Convert a UTF-8 character to a wide char.
  88    Return the length of the UTF-8 input character in bytes.
  89 */
  90 int
  91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  92 {
  93         int utflen, i;
  94         wchar_t ch;
  95
  96         if (utf8char == NULL) return -1;
  97
  98         /* Get UTF-8 sequence length from 1st byte */
  99         utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
 100
 101         if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 102
 103         /* First byte minus length tag */
 104         ch = (wchar_t)(utf8char[0] & mask[utflen]);
 105
 106         for(i=1; i < utflen; i++) {
 107                 /* Subsequent bytes must start with 10 */
 108                 if ((utf8char[i] & 0xc0) != 0x80) return -1;
 109
 110                 ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 111                 ch |= (wchar_t)(utf8char[i] & 0x3f);
 112         }
 113
 114         if (wchar) *wchar = ch;
 115
 116         return utflen;
 117 }
 118
 119 /*-----------------------------------------------------------------------------
 120    Convert a UTF-8 string to a wide char string.
 121    No more than 'count' wide chars will be written to the output buffer.
 122    Return the size of the converted string in wide chars, excl null terminator.
 123 */
 124 int
 125 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
 126 {
 127         size_t wclen = 0;
 128         int utflen, i;
 129         wchar_t ch;
 130
 131
 132         /* If input ptr is NULL, treat it as empty string. */
 133         if (utf8str == NULL) utf8str = "";
 134
 135         /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
 136         while ( *utf8str && (wcstr==NULL || wclen<count) ) {
 137                 /* Get UTF-8 sequence length from 1st byte */
 138                 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
 139
 140                 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
 141
 142                 /* First byte minus length tag */
 143                 ch = (wchar_t)(utf8str[0] & mask[utflen]);
 144
 145                 for(i=1; i < utflen; i++) {
 146                         /* Subsequent bytes must start with 10 */
 147                         if ((utf8str[i] & 0xc0) != 0x80) return -1;
 148
 149                         ch <<= 6;                       /* 6 bits of data in each subsequent byte */
 150                         ch |= (wchar_t)(utf8str[i] & 0x3f);
 151                 }
 152
 153                 if (wcstr) wcstr[wclen] = ch;
 154
 155                 utf8str += utflen;      /* Move to next UTF-8 character */
 156                 wclen++;                        /* Count number of wide chars stored/required */
 157         }
 158
 159         /* Add null terminator if there's room in the buffer. */
 160         if (wcstr && wclen < count) wcstr[wclen] = 0;
 161
 162         return wclen;
 163 }
 164
 165
 166 /*-----------------------------------------------------------------------------
 167    Convert one wide char to a UTF-8 character.
 168    Return the length of the converted UTF-8 character in bytes.
 169    No more than 'count' bytes will be written to the output buffer.
 170 */
 171 int
 172 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
 173 {
 174         int len=0;
 175
 176         if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
 177         {                                               /* Ignore count */
 178                 if( wchar < 0 )
 179                         return -1;
 180                 if( wchar < 0x80 )
 181                         return 1;
 182                 if( wchar < 0x800 )
 183                         return 2;
 184                 if( wchar < 0x10000 )
 185                         return 3;
 186                 if( wchar < 0x200000 )
 187                         return 4;
 188                 if( wchar < 0x4000000 )
 189                         return 5;
 190                 if( wchar < 0x80000000 )
 191                         return 6;
 192                 return -1;
 193         }
 194
 195
 196         if ( wchar < 0 ) {                              /* Invalid wide character */
 197                 len = -1;
 198
 199         } else if( wchar < 0x80 ) {
 200                 if (count >= 1) {
 201                         utf8char[len++] = (char)wchar;
 202                 }
 203
 204         } else if( wchar < 0x800 ) {
 205                 if (count >=2) {
 206                         utf8char[len++] = 0xc0 | ( wchar >> 6 );
 207                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 208                 }
 209
 210         } else if( wchar < 0x10000 ) {
 211                 if (count >= 3) {
 212                         utf8char[len++] = 0xe0 | ( wchar >> 12 );
 213                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 214                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 215                 }
 216
 217         } else if( wchar < 0x200000 ) {
 218                 if (count >= 4) {
 219                         utf8char[len++] = 0xf0 | ( wchar >> 18 );
 220                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 221                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 222                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 223                 }
 224
 225         } else if( wchar < 0x4000000 ) {
 226                 if (count >= 5) {
 227                         utf8char[len++] = 0xf8 | ( wchar >> 24 );
 228                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 229                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 230                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 231                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 232                 }
 233
 234         } else if( wchar < 0x80000000 ) {
 235                 if (count >= 6) {
 236                         utf8char[len++] = 0xfc | ( wchar >> 30 );
 237                         utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
 238                         utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
 239                         utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
 240                         utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
 241                         utf8char[len++] = 0x80 | ( wchar & 0x3f );
 242                 }
 243
 244         } else
 245                 len = -1;
 246
 247         return len;
 248
 249 }
 250
 251
 252 /*-----------------------------------------------------------------------------
 253    Convert a wide char string to a UTF-8 string.
 254    No more than 'count' bytes will be written to the output buffer.
 255    Return the # of bytes written to the output buffer, excl null terminator.
 256 */
 257 int
 258 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
 259 {
 260         int len = 0;
 261         int n;
 262         char *p = utf8str;
 263         wchar_t empty = 0;              /* To avoid use of L"" construct */
 264
 265         if (wcstr == NULL)              /* Treat input ptr NULL as an empty string */
 266                 wcstr = &empty;
 267
 268         if (utf8str == NULL)    /* Just compute size of output, excl null */
 269         {
 270                 while (*wcstr)
 271                 {
 272                         /* Get UTF-8 size of next wide char */
 273                         n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
 274                         if (n == -1)
 275                                 return -1;
 276                         len += n;
 277                 }
 278
 279                 return len;
 280         }
 281
 282
 283         /* Do the actual conversion. */
 284
 285         n = 1;                                  /* In case of empty wcstr */
 286         while (*wcstr)
 287         {
 288                 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
 289
 290                 if (n <= 0)             /* If encoding error (-1) or won't fit (0), quit */
 291                         break;
 292
 293                 p += n;
 294                 count -= n;                     /* Space left in output buffer */
 295         }
 296
 297         /* If not enough room for last character, pad remainder with null
 298            so that return value = original count, indicating buffer full. */
 299         if (n == 0)
 300         {
 301                 while (count--)
 302                         *p++ = 0;
 303         }
 304
 305         /* Add a null terminator if there's room. */
 306         else if (count)
 307                 *p = 0;
 308
 309         if (n == -1)                    /* Conversion encountered invalid wide char. */
 310                 return -1;
 311
 312         /* Return the number of bytes written to output buffer, excl null. */
 313         return (p - utf8str);
 314 }
 315
 316
 317 /*-----------------------------------------------------------------------------
 318    Convert a UTF-8 character to a MultiByte character.
 319    Return the size of the converted character in bytes.
 320 */
 321 int
 322 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
 323                 int (*f_wctomb)(char *mbchar, wchar_t wchar) )
 324 {
 325         wchar_t wchar;
 326         int n;
 327         char tmp[6];                            /* Large enough for biggest multibyte char */
 328
 329         if (f_wctomb == NULL)           /* If no conversion function was given... */
 330                 f_wctomb = wctomb;              /*    use the local ANSI C function */
 331
 332         /* First convert UTF-8 char to a wide char */
 333         n = ldap_x_utf8_to_wc( &wchar, utf8char);
 334
 335         if (n == -1)
 336                 return -1;              /* Invalid UTF-8 character */
 337
 338         if (mbchar == NULL)
 339                 n = f_wctomb( tmp, wchar );
 340         else
 341                 n = f_wctomb( mbchar, wchar);
 342
 343         return n;
 344 }
 345
 346 /*-----------------------------------------------------------------------------
 347    Convert a UTF-8 string to a MultiByte string.
 348    No more than 'count' bytes will be written to the output buffer.
 349    Return the size of the converted string in bytes, excl null terminator.
 350 */
 351 int
 352 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
 353                 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
 354 {
 355         wchar_t *wcs;
 356         size_t wcsize;
 357     int n;
 358
 359         if (f_wcstombs == NULL)         /* If no conversion function was given... */
 360                 f_wcstombs = wcstombs;  /*    use the local ANSI C function */
 361
 362         if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
 363         {
 364                 if (mbstr)
 365                         *mbstr = 0;
 366                 return 0;
 367         }
 368
 369 /* Allocate memory for the maximum size wchar string that we could get. */
 370         wcsize = strlen(utf8str) + 1;
 371         wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
 372         if (wcs == NULL)
 373                 return -1;                              /* Memory allocation failure. */
 374
 375         /* First convert the UTF-8 string to a wide char string */
 376         n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);
 377
 378         /* Then convert wide char string to multi-byte string */
 379         if (n != -1)
 380         {
 381                 n = f_wcstombs(mbstr, wcs, count);
 382         }
 383
 384         LDAP_FREE(wcs);
 385
 386         return n;
 387 }
 388
 389 /*-----------------------------------------------------------------------------
 390    Convert a MultiByte character to a UTF-8 character.
 391    'mbsize' indicates the number of bytes of 'mbchar' to check.
 392    Returns the number of bytes written to the output character.
 393 */
 394 int
 395 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
 396                 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
 397 {
 398     wchar_t wchar;
 399     int n;
 400
 401         if (f_mbtowc == NULL)           /* If no conversion function was given... */
 402                 f_mbtowc = mbtowc;              /*    use the local ANSI C function */
 403
 404     if (mbsize == 0)                            /* 0 is not valid. */
 405         return -1;
 406
 407     if (mbchar == NULL || *mbchar == 0)
 408     {
 409         if (utf8char)
 410             *utf8char = 0;
 411         return 1;
 412     }
 413
 414         /* First convert the MB char to a Wide Char */
 415         n = f_mbtowc( &wchar, mbchar, mbsize);
 416
 417         if (n == -1)
 418                 return -1;
 419
 420         /* Convert the Wide Char to a UTF-8 character. */
 421         n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
 422
 423         return n;
 424 }
 425
 426
 427 /*-----------------------------------------------------------------------------
 428    Convert a MultiByte string to a UTF-8 string.
 429    No more than 'count' bytes will be written to the output buffer.
 430    Return the size of the converted string in bytes, excl null terminator.
 431 */
 432 int
 433 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
 434                 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
 435 {
 436         wchar_t *wcs;
 437         int n;
 438         size_t wcsize;
 439
 440         if (mbstr == NULL)                 /* Treat NULL input string as an empty string */
 441                 mbstr = "";
 442
 443         if (f_mbstowcs == NULL)         /* If no conversion function was given... */
 444                 f_mbstowcs = mbstowcs;  /*    use the local ANSI C function */
 445
 446         /* Allocate memory for the maximum size wchar string that we could get. */
 447         wcsize = strlen(mbstr) + 1;
 448         wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
 449         if (wcs == NULL)
 450                 return -1;
 451
 452         /* First convert multi-byte string to a wide char string */
 453         n = f_mbstowcs(wcs, mbstr, wcsize);
 454
 455         /* Convert wide char string to UTF-8 string */
 456         if (n != -1)
 457         {
 458                 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
 459         }
 460
 461         LDAP_FREE(wcs);
 462
 463         return n;
 464 }
 465
 466 #endif