git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucdata/ucgendat.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 2001 Computing Research Labs, New Mexico State University
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23 /* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
  24
  25 #include "portable.h"
  26 #include "ldap_config.h"
  27
  28 #include <stdio.h>
  29 #include <ac/stdlib.h>
  30 #include <ac/string.h>
  31 #include <ac/unistd.h>
  32
  33 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
  34                       ((cc) >= 'A' && (cc) <= 'F') ||\
  35                       ((cc) >= 'a' && (cc) <= 'f'))
  36
  37 /*
  38  * A header written to the output file with the byte-order-mark and the number
  39  * of property nodes.
  40  */
  41 static unsigned short hdr[2] = {0xfeff, 0};
  42
  43 #define NUMPROPS 50
  44 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
  45
  46 typedef struct {
  47     char *name;
  48     int len;
  49 } _prop_t;
  50
  51 /*
  52  * List of properties expected to be found in the Unicode Character Database
  53  * including some implementation specific properties.
  54  *
  55  * The implementation specific properties are:
  56  * Cm = Composed (can be decomposed)
  57  * Nb = Non-breaking
  58  * Sy = Symmetric (has left and right forms)
  59  * Hd = Hex digit
  60  * Qm = Quote marks
  61  * Mr = Mirroring
  62  * Ss = Space, other
  63  * Cp = Defined character
  64  */
  65 static _prop_t props[NUMPROPS] = {
  66     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
  67     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
  68     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
  69     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
  70     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
  71     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
  72     {"S",  1}, {"WS", 2}, {"ON", 2},
  73     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
  74     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
  75 };
  76
  77 typedef struct {
  78     unsigned long *ranges;
  79     unsigned short used;
  80     unsigned short size;
  81 } _ranges_t;
  82
  83 static _ranges_t proptbl[NUMPROPS];
  84
  85 /*
  86  * Make sure this array is sized to be on a 4-byte boundary at compile time.
  87  */
  88 static unsigned short propcnt[NEEDPROPS];
  89
  90 /*
  91  * Array used to collect a decomposition before adding it to the decomposition
  92  * table.
  93  */
  94 static unsigned long dectmp[64];
  95 static unsigned long dectmp_size;
  96
  97 typedef struct {
  98     unsigned long code;
  99     unsigned short size;
 100     unsigned short used;
 101     unsigned long *decomp;
 102 } _decomp_t;
 103
 104 /*
 105  * List of decomposition.  Created and expanded in order as the characters are
 106  * encountered.
 107  */
 108 static _decomp_t *decomps;
 109 static unsigned long decomps_used;
 110 static unsigned long decomps_size;
 111
 112 /*
 113  * Composition exclusion table stuff.
 114  */
 115 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
 116 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
 117 static unsigned long compexs[2048];
 118
 119 /*
 120  * Struct for holding a composition pair, and array of composition pairs
 121  */
 122 typedef struct {
 123     unsigned long comp;
 124     unsigned long count;
 125     unsigned long code1;
 126     unsigned long code2;
 127 } _comp_t;
 128
 129 static _comp_t *comps;
 130 static unsigned long comps_used;
 131
 132 /*
 133  * Types and lists for handling lists of case mappings.
 134  */
 135 typedef struct {
 136     unsigned long key;
 137     unsigned long other1;
 138     unsigned long other2;
 139 } _case_t;
 140
 141 static _case_t *upper;
 142 static _case_t *lower;
 143 static _case_t *title;
 144 static unsigned long upper_used;
 145 static unsigned long upper_size;
 146 static unsigned long lower_used;
 147 static unsigned long lower_size;
 148 static unsigned long title_used;
 149 static unsigned long title_size;
 150
 151 /*
 152  * Array used to collect case mappings before adding them to a list.
 153  */
 154 static unsigned long cases[3];
 155
 156 /*
 157  * An array to hold ranges for combining classes.
 158  */
 159 static unsigned long *ccl;
 160 static unsigned long ccl_used;
 161 static unsigned long ccl_size;
 162
 163 /*
 164  * Structures for handling numbers.
 165  */
 166 typedef struct {
 167     unsigned long code;
 168     unsigned long idx;
 169 } _codeidx_t;
 170
 171 typedef struct {
 172     short numerator;
 173     short denominator;
 174 } _num_t;
 175
 176 /*
 177  * Arrays to hold the mapping of codes to numbers.
 178  */
 179 static _codeidx_t *ncodes;
 180 static unsigned long ncodes_used;
 181 static unsigned long ncodes_size;
 182
 183 static _num_t *nums;
 184 static unsigned long nums_used;
 185 static unsigned long nums_size;
 186
 187 /*
 188  * Array for holding numbers.
 189  */
 190 static _num_t *nums;
 191 static unsigned long nums_used;
 192 static unsigned long nums_size;
 193
 194 static void
 195 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
 196 {
 197     int i, j, k, len;
 198     _ranges_t *rlp;
 199     char *name;
 200
 201     for (k = 0; k < 2; k++) {
 202         if (k == 0) {
 203             name = p1;
 204             len = 2;
 205         } else {
 206             if (p2 == 0)
 207               break;
 208
 209             name = p2;
 210             len = 1;
 211         }
 212
 213         for (i = 0; i < NUMPROPS; i++) {
 214             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 215               break;
 216         }
 217
 218         if (i == NUMPROPS)
 219           continue;
 220
 221         rlp = &proptbl[i];
 222
 223         /*
 224          * Resize the range list if necessary.
 225          */
 226         if (rlp->used == rlp->size) {
 227             if (rlp->size == 0)
 228               rlp->ranges = (unsigned long *)
 229                   malloc(sizeof(unsigned long) << 3);
 230             else
 231               rlp->ranges = (unsigned long *)
 232                   realloc((char *) rlp->ranges,
 233                           sizeof(unsigned long) * (rlp->size + 8));
 234             rlp->size += 8;
 235         }
 236
 237         /*
 238          * If this is the first code for this property list, just add it
 239          * and return.
 240          */
 241         if (rlp->used == 0) {
 242             rlp->ranges[0] = start;
 243             rlp->ranges[1] = end;
 244             rlp->used += 2;
 245             continue;
 246         }
 247
 248         /*
 249          * Optimize the case of adding the range to the end.
 250          */
 251         j = rlp->used - 1;
 252         if (start > rlp->ranges[j]) {
 253             j = rlp->used;
 254             rlp->ranges[j++] = start;
 255             rlp->ranges[j++] = end;
 256             rlp->used = j;
 257             continue;
 258         }
 259
 260         /*
 261          * Need to locate the insertion point.
 262          */
 263         for (i = 0;
 264              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
 265
 266         /*
 267          * If the start value lies in the current range, then simply set the
 268          * new end point of the range to the end value passed as a parameter.
 269          */
 270         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
 271             rlp->ranges[i + 1] = end;
 272             return;
 273         }
 274
 275         /*
 276          * Shift following values up by two.
 277          */
 278         for (j = rlp->used; j > i; j -= 2) {
 279             rlp->ranges[j] = rlp->ranges[j - 2];
 280             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 281         }
 282
 283         /*
 284          * Add the new range at the insertion point.
 285          */
 286         rlp->ranges[i] = start;
 287         rlp->ranges[i + 1] = end;
 288         rlp->used += 2;
 289     }
 290 }
 291
 292 static void
 293 ordered_range_insert(unsigned long c, char *name, int len)
 294 {
 295     int i, j;
 296     unsigned long s, e;
 297     _ranges_t *rlp;
 298
 299     if (len == 0)
 300       return;
 301
 302     /*
 303      * Deal with directionality codes introduced in Unicode 3.0.
 304      */
 305     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
 306         (len == 3 &&
 307          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
 308           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
 309           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
 310         /*
 311          * Mark all of these as Other Neutral to preserve compatibility with
 312          * older versions.
 313          */
 314         len = 2;
 315         name = "ON";
 316     }
 317
 318     for (i = 0; i < NUMPROPS; i++) {
 319         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 320           break;
 321     }
 322
 323     if (i == NUMPROPS)
 324       return;
 325
 326     /*
 327      * Have a match, so insert the code in order.
 328      */
 329     rlp = &proptbl[i];
 330
 331     /*
 332      * Resize the range list if necessary.
 333      */
 334     if (rlp->used == rlp->size) {
 335         if (rlp->size == 0)
 336           rlp->ranges = (unsigned long *)
 337               malloc(sizeof(unsigned long) << 3);
 338         else
 339           rlp->ranges = (unsigned long *)
 340               realloc((char *) rlp->ranges,
 341                       sizeof(unsigned long) * (rlp->size + 8));
 342         rlp->size += 8;
 343     }
 344
 345     /*
 346      * If this is the first code for this property list, just add it
 347      * and return.
 348      */
 349     if (rlp->used == 0) {
 350         rlp->ranges[0] = rlp->ranges[1] = c;
 351         rlp->used += 2;
 352         return;
 353     }
 354
 355     /*
 356      * Optimize the cases of extending the last range and adding new ranges to
 357      * the end.
 358      */
 359     j = rlp->used - 1;
 360     e = rlp->ranges[j];
 361     s = rlp->ranges[j - 1];
 362
 363     if (c == e + 1) {
 364         /*
 365          * Extend the last range.
 366          */
 367         rlp->ranges[j] = c;
 368         return;
 369     }
 370
 371     if (c > e + 1) {
 372         /*
 373          * Start another range on the end.
 374          */
 375         j = rlp->used;
 376         rlp->ranges[j] = rlp->ranges[j + 1] = c;
 377         rlp->used += 2;
 378         return;
 379     }
 380
 381     if (c >= s)
 382       /*
 383        * The code is a duplicate of a code in the last range, so just return.
 384        */
 385       return;
 386
 387     /*
 388      * The code should be inserted somewhere before the last range in the
 389      * list.  Locate the insertion point.
 390      */
 391     for (i = 0;
 392          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
 393
 394     s = rlp->ranges[i];
 395     e = rlp->ranges[i + 1];
 396
 397     if (c == e + 1)
 398       /*
 399        * Simply extend the current range.
 400        */
 401       rlp->ranges[i + 1] = c;
 402     else if (c < s) {
 403         /*
 404          * Add a new entry before the current location.  Shift all entries
 405          * before the current one up by one to make room.
 406          */
 407         for (j = rlp->used; j > i; j -= 2) {
 408             rlp->ranges[j] = rlp->ranges[j - 2];
 409             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 410         }
 411         rlp->ranges[i] = rlp->ranges[i + 1] = c;
 412
 413         rlp->used += 2;
 414     }
 415 }
 416
 417 static void
 418 add_decomp(unsigned long code)
 419 {
 420     unsigned long i, j, size;
 421
 422     /*
 423      * Add the code to the composite property.
 424      */
 425     ordered_range_insert(code, "Cm", 2);
 426
 427     /*
 428      * Locate the insertion point for the code.
 429      */
 430     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
 431
 432     /*
 433      * Allocate space for a new decomposition.
 434      */
 435     if (decomps_used == decomps_size) {
 436         if (decomps_size == 0)
 437           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
 438         else
 439           decomps = (_decomp_t *)
 440               realloc((char *) decomps,
 441                       sizeof(_decomp_t) * (decomps_size + 8));
 442         (void) memset((char *) (decomps + decomps_size), '\0',
 443                       sizeof(_decomp_t) << 3);
 444         decomps_size += 8;
 445     }
 446
 447     if (i < decomps_used && code != decomps[i].code) {
 448         /*
 449          * Shift the decomps up by one if the codes don't match.
 450          */
 451         for (j = decomps_used; j > i; j--)
 452           (void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1],
 453                         sizeof(_decomp_t));
 454     }
 455
 456     /*
 457      * Insert or replace a decomposition.
 458      */
 459     size = dectmp_size + (4 - (dectmp_size & 3));
 460     if (decomps[i].size < size) {
 461         if (decomps[i].size == 0)
 462           decomps[i].decomp = (unsigned long *)
 463               malloc(sizeof(unsigned long) * size);
 464         else
 465           decomps[i].decomp = (unsigned long *)
 466               realloc((char *) decomps[i].decomp,
 467                       sizeof(unsigned long) * size);
 468         decomps[i].size = size;
 469     }
 470
 471     if (decomps[i].code != code)
 472       decomps_used++;
 473
 474     decomps[i].code = code;
 475     decomps[i].used = dectmp_size;
 476     (void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp,
 477                   sizeof(unsigned long) * dectmp_size);
 478
 479     /*
 480      * NOTICE: This needs changing later so it is more general than simply
 481      * pairs.  This calculation is done here to simplify allocation elsewhere.
 482      */
 483     if (dectmp_size == 2)
 484       comps_used++;
 485 }
 486
 487 static void
 488 add_title(unsigned long code)
 489 {
 490     unsigned long i, j;
 491
 492     /*
 493      * Always map the code to itself.
 494      */
 495     cases[2] = code;
 496
 497     if (title_used == title_size) {
 498         if (title_size == 0)
 499           title = (_case_t *) malloc(sizeof(_case_t) << 3);
 500         else
 501           title = (_case_t *) realloc((char *) title,
 502                                       sizeof(_case_t) * (title_size + 8));
 503         title_size += 8;
 504     }
 505
 506     /*
 507      * Locate the insertion point.
 508      */
 509     for (i = 0; i < title_used && code > title[i].key; i++) ;
 510
 511     if (i < title_used) {
 512         /*
 513          * Shift the array up by one.
 514          */
 515         for (j = title_used; j > i; j--)
 516           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
 517                         sizeof(_case_t));
 518     }
 519
 520     title[i].key = cases[2];    /* Title */
 521     title[i].other1 = cases[0]; /* Upper */
 522     title[i].other2 = cases[1]; /* Lower */
 523
 524     title_used++;
 525 }
 526
 527 static void
 528 add_upper(unsigned long code)
 529 {
 530     unsigned long i, j;
 531
 532     /*
 533      * Always map the code to itself.
 534      */
 535     cases[0] = code;
 536
 537     /*
 538      * If the title case character is not present, then make it the same as
 539      * the upper case.
 540      */
 541     if (cases[2] == 0)
 542       cases[2] = code;
 543
 544     if (upper_used == upper_size) {
 545         if (upper_size == 0)
 546           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
 547         else
 548           upper = (_case_t *) realloc((char *) upper,
 549                                       sizeof(_case_t) * (upper_size + 8));
 550         upper_size += 8;
 551     }
 552
 553     /*
 554      * Locate the insertion point.
 555      */
 556     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
 557
 558     if (i < upper_used) {
 559         /*
 560          * Shift the array up by one.
 561          */
 562         for (j = upper_used; j > i; j--)
 563           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
 564                         sizeof(_case_t));
 565     }
 566
 567     upper[i].key = cases[0];    /* Upper */
 568     upper[i].other1 = cases[1]; /* Lower */
 569     upper[i].other2 = cases[2]; /* Title */
 570
 571     upper_used++;
 572 }
 573
 574 static void
 575 add_lower(unsigned long code)
 576 {
 577     unsigned long i, j;
 578
 579     /*
 580      * Always map the code to itself.
 581      */
 582     cases[1] = code;
 583
 584     /*
 585      * If the title case character is empty, then make it the same as the
 586      * upper case.
 587      */
 588     if (cases[2] == 0)
 589       cases[2] = cases[0];
 590
 591     if (lower_used == lower_size) {
 592         if (lower_size == 0)
 593           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
 594         else
 595           lower = (_case_t *) realloc((char *) lower,
 596                                       sizeof(_case_t) * (lower_size + 8));
 597         lower_size += 8;
 598     }
 599
 600     /*
 601      * Locate the insertion point.
 602      */
 603     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
 604
 605     if (i < lower_used) {
 606         /*
 607          * Shift the array up by one.
 608          */
 609         for (j = lower_used; j > i; j--)
 610           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
 611                         sizeof(_case_t));
 612     }
 613
 614     lower[i].key = cases[1];    /* Lower */
 615     lower[i].other1 = cases[0]; /* Upper */
 616     lower[i].other2 = cases[2]; /* Title */
 617
 618     lower_used++;
 619 }
 620
 621 static void
 622 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
 623 {
 624     unsigned long i, j;
 625
 626     if (ccl_used == ccl_size) {
 627         if (ccl_size == 0)
 628           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
 629         else
 630           ccl = (unsigned long *)
 631               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
 632         ccl_size += 24;
 633     }
 634
 635     /*
 636      * Optimize adding the first item.
 637      */
 638     if (ccl_used == 0) {
 639         ccl[0] = ccl[1] = c;
 640         ccl[2] = ccl_code;
 641         ccl_used += 3;
 642         return;
 643     }
 644
 645     /*
 646      * Handle the special case of extending the range on the end.  This
 647      * requires that the combining class codes are the same.
 648      */
 649     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
 650         ccl[ccl_used - 2] = c;
 651         return;
 652     }
 653
 654     /*
 655      * Handle the special case of adding another range on the end.
 656      */
 657     if (c > ccl[ccl_used - 2] + 1 ||
 658         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
 659         ccl[ccl_used++] = c;
 660         ccl[ccl_used++] = c;
 661         ccl[ccl_used++] = ccl_code;
 662         return;
 663     }
 664
 665     /*
 666      * Locate either the insertion point or range for the code.
 667      */
 668     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
 669
 670     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
 671         /*
 672          * Extend an existing range.
 673          */
 674         ccl[i + 1] = c;
 675         return;
 676     } else if (c < ccl[i]) {
 677         /*
 678          * Start a new range before the current location.
 679          */
 680         for (j = ccl_used; j > i; j -= 3) {
 681             ccl[j] = ccl[j - 3];
 682             ccl[j - 1] = ccl[j - 4];
 683             ccl[j - 2] = ccl[j - 5];
 684         }
 685         ccl[i] = ccl[i + 1] = c;
 686         ccl[i + 2] = ccl_code;
 687     }
 688 }
 689
 690 /*
 691  * Adds a number if it does not already exist and returns an index value
 692  * multiplied by 2.
 693  */
 694 static unsigned long
 695 make_number(short num, short denom)
 696 {
 697     unsigned long n;
 698
 699     /*
 700      * Determine if the number already exists.
 701      */
 702     for (n = 0; n < nums_used; n++) {
 703         if (nums[n].numerator == num && nums[n].denominator == denom)
 704           return n << 1;
 705     }
 706
 707     if (nums_used == nums_size) {
 708         if (nums_size == 0)
 709           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
 710         else
 711           nums = (_num_t *) realloc((char *) nums,
 712                                     sizeof(_num_t) * (nums_size + 8));
 713         nums_size += 8;
 714     }
 715
 716     n = nums_used++;
 717     nums[n].numerator = num;
 718     nums[n].denominator = denom;
 719
 720     return n << 1;
 721 }
 722
 723 static void
 724 add_number(unsigned long code, short num, short denom)
 725 {
 726     unsigned long i, j;
 727
 728     /*
 729      * Insert the code in order.
 730      */
 731     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
 732
 733     /*
 734      * Handle the case of the codes matching and simply replace the number
 735      * that was there before.
 736      */
 737     if (ncodes_used > 0 && code == ncodes[i].code) {
 738         ncodes[i].idx = make_number(num, denom);
 739         return;
 740     }
 741
 742     /*
 743      * Resize the array if necessary.
 744      */
 745     if (ncodes_used == ncodes_size) {
 746         if (ncodes_size == 0)
 747           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
 748         else
 749           ncodes = (_codeidx_t *)
 750               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
 751
 752         ncodes_size += 8;
 753     }
 754
 755     /*
 756      * Shift things around to insert the code if necessary.
 757      */
 758     if (i < ncodes_used) {
 759         for (j = ncodes_used; j > i; j--) {
 760             ncodes[j].code = ncodes[j - 1].code;
 761             ncodes[j].idx = ncodes[j - 1].idx;
 762         }
 763     }
 764     ncodes[i].code = code;
 765     ncodes[i].idx = make_number(num, denom);
 766
 767     ncodes_used++;
 768 }
 769
 770 /*
 771  * This routine assumes that the line is a valid Unicode Character Database
 772  * entry.
 773  */
 774 static void
 775 read_cdata(FILE *in)
 776 {
 777     unsigned long i, lineno, skip, code, ccl_code;
 778     short wnum, neg, number[2];
 779     char line[512], *s, *e;
 780
 781     lineno = skip = 0;
 782     while (fscanf(in, "%[^\n]\n", line) != EOF) {
 783         lineno++;
 784
 785         /*
 786          * Skip blank lines and lines that start with a '#'.
 787          */
 788         if (line[0] == 0 || line[0] == '#')
 789           continue;
 790
 791         /*
 792          * If lines need to be skipped, do it here.
 793          */
 794         if (skip) {
 795             skip--;
 796             continue;
 797         }
 798
 799         /*
 800          * Collect the code.  The code can be up to 6 hex digits in length to
 801          * allow surrogates to be specified.
 802          */
 803         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
 804             code <<= 4;
 805             if (*s >= '0' && *s <= '9')
 806               code += *s - '0';
 807             else if (*s >= 'A' && *s <= 'F')
 808               code += (*s - 'A') + 10;
 809             else if (*s >= 'a' && *s <= 'f')
 810               code += (*s - 'a') + 10;
 811         }
 812
 813         /*
 814          * Handle the following special cases:
 815          * 1. 4E00-9FA5 CJK Ideographs.
 816          * 2. AC00-D7A3 Hangul Syllables.
 817          * 3. D800-DFFF Surrogates.
 818          * 4. E000-F8FF Private Use Area.
 819          * 5. F900-FA2D Han compatibility.
 820          */
 821         switch (code) {
 822           case 0x4e00:
 823             /*
 824              * The Han ideographs.
 825              */
 826             add_range(0x4e00, 0x9fff, "Lo", "L");
 827
 828             /*
 829              * Add the characters to the defined category.
 830              */
 831             add_range(0x4e00, 0x9fa5, "Cp", 0);
 832
 833             skip = 1;
 834             break;
 835           case 0xac00:
 836             /*
 837              * The Hangul syllables.
 838              */
 839             add_range(0xac00, 0xd7a3, "Lo", "L");
 840
 841             /*
 842              * Add the characters to the defined category.
 843              */
 844             add_range(0xac00, 0xd7a3, "Cp", 0);
 845
 846             skip = 1;
 847             break;
 848           case 0xd800:
 849             /*
 850              * Make a range of all surrogates and assume some default
 851              * properties.
 852              */
 853             add_range(0x010000, 0x10ffff, "Cs", "L");
 854             skip = 5;
 855             break;
 856           case 0xe000:
 857             /*
 858              * The Private Use area.  Add with a default set of properties.
 859              */
 860             add_range(0xe000, 0xf8ff, "Co", "L");
 861             skip = 1;
 862             break;
 863           case 0xf900:
 864             /*
 865              * The CJK compatibility area.
 866              */
 867             add_range(0xf900, 0xfaff, "Lo", "L");
 868
 869             /*
 870              * Add the characters to the defined category.
 871              */
 872             add_range(0xf900, 0xfaff, "Cp", 0);
 873
 874             skip = 1;
 875         }
 876
 877         if (skip)
 878           continue;
 879
 880         /*
 881          * Add the code to the defined category.
 882          */
 883         ordered_range_insert(code, "Cp", 2);
 884
 885         /*
 886          * Locate the first character property field.
 887          */
 888         for (i = 0; *s != 0 && i < 2; s++) {
 889             if (*s == ';')
 890               i++;
 891         }
 892         for (e = s; *e && *e != ';'; e++) ;
 893
 894         ordered_range_insert(code, s, e - s);
 895
 896         /*
 897          * Locate the combining class code.
 898          */
 899         for (s = e; *s != 0 && i < 3; s++) {
 900             if (*s == ';')
 901               i++;
 902         }
 903
 904         /*
 905          * Convert the combining class code from decimal.
 906          */
 907         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
 908           ccl_code = (ccl_code * 10) + (*e - '0');
 909
 910         /*
 911          * Add the code if it not 0.
 912          */
 913         if (ccl_code != 0)
 914           ordered_ccl_insert(code, ccl_code);
 915
 916         /*
 917          * Locate the second character property field.
 918          */
 919         for (s = e; *s != 0 && i < 4; s++) {
 920             if (*s == ';')
 921               i++;
 922         }
 923         for (e = s; *e && *e != ';'; e++) ;
 924
 925         ordered_range_insert(code, s, e - s);
 926
 927         /*
 928          * Check for a decomposition.
 929          */
 930         s = ++e;
 931         if (*s != ';' && *s != '<') {
 932             /*
 933              * Collect the codes of the decomposition.
 934              */
 935             for (dectmp_size = 0; *s != ';'; ) {
 936                 /*
 937                  * Skip all leading non-hex digits.
 938                  */
 939                 while (!ishdigit(*s))
 940                   s++;
 941
 942                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
 943                     dectmp[dectmp_size] <<= 4;
 944                     if (*s >= '0' && *s <= '9')
 945                       dectmp[dectmp_size] += *s - '0';
 946                     else if (*s >= 'A' && *s <= 'F')
 947                       dectmp[dectmp_size] += (*s - 'A') + 10;
 948                     else if (*s >= 'a' && *s <= 'f')
 949                       dectmp[dectmp_size] += (*s - 'a') + 10;
 950                 }
 951                 dectmp_size++;
 952             }
 953
 954             /*
 955              * If there are any codes in the temporary decomposition array,
 956              * then add the character with its decomposition.
 957              */
 958             if (dectmp_size > 0)
 959               add_decomp(code);
 960         }
 961
 962         /*
 963          * Skip to the number field.
 964          */
 965         for (i = 0; i < 3 && *s; s++) {
 966             if (*s == ';')
 967               i++;
 968         }
 969
 970         /*
 971          * Scan the number in.
 972          */
 973         number[0] = number[1] = 0;
 974         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
 975             if (*e == '-') {
 976                 neg = 1;
 977                 continue;
 978             }
 979
 980             if (*e == '/') {
 981                 /*
 982                  * Move the the denominator of the fraction.
 983                  */
 984                 if (neg)
 985                   number[wnum] *= -1;
 986                 neg = 0;
 987                 e++;
 988                 wnum++;
 989             }
 990             number[wnum] = (number[wnum] * 10) + (*e - '0');
 991         }
 992
 993         if (e > s) {
 994             /*
 995              * Adjust the denominator in case of integers and add the number.
 996              */
 997             if (wnum == 0)
 998               number[1] = number[0];
 999
1000             add_number(code, number[0], number[1]);
1001         }
1002
1003         /*
1004          * Skip to the start of the possible case mappings.
1005          */
1006         for (s = e, i = 0; i < 4 && *s; s++) {
1007             if (*s == ';')
1008               i++;
1009         }
1010
1011         /*
1012          * Collect the case mappings.
1013          */
1014         cases[0] = cases[1] = cases[2] = 0;
1015         for (i = 0; i < 3; i++) {
1016             while (ishdigit(*s)) {
1017                 cases[i] <<= 4;
1018                 if (*s >= '0' && *s <= '9')
1019                   cases[i] += *s - '0';
1020                 else if (*s >= 'A' && *s <= 'F')
1021                   cases[i] += (*s - 'A') + 10;
1022                 else if (*s >= 'a' && *s <= 'f')
1023                   cases[i] += (*s - 'a') + 10;
1024                 s++;
1025             }
1026             if (*s == ';')
1027               s++;
1028         }
1029         if (cases[0] && cases[1])
1030           /*
1031            * Add the upper and lower mappings for a title case character.
1032            */
1033           add_title(code);
1034         else if (cases[1])
1035           /*
1036            * Add the lower and title case mappings for the upper case
1037            * character.
1038            */
1039           add_upper(code);
1040         else if (cases[0])
1041           /*
1042            * Add the upper and title case mappings for the lower case
1043            * character.
1044            */
1045           add_lower(code);
1046     }
1047 }
1048
1049 static _decomp_t *
1050 find_decomp(unsigned long code)
1051 {
1052     long l, r, m;
1053
1054     l = 0;
1055     r = decomps_used - 1;
1056     while (l <= r) {
1057         m = (l + r) >> 1;
1058         if (code > decomps[m].code)
1059           l = m + 1;
1060         else if (code < decomps[m].code)
1061           r = m - 1;
1062         else
1063           return &decomps[m];
1064     }
1065     return 0;
1066 }
1067
1068 static void
1069 decomp_it(_decomp_t *d)
1070 {
1071     unsigned long i;
1072     _decomp_t *dp;
1073
1074     for (i = 0; i < d->used; i++) {
1075         if ((dp = find_decomp(d->decomp[i])) != 0)
1076           decomp_it(dp);
1077         else
1078           dectmp[dectmp_size++] = d->decomp[i];
1079     }
1080 }
1081
1082 /*
1083  * Expand all decompositions by recursively decomposing each character
1084  * in the decomposition.
1085  */
1086 static void
1087 expand_decomp(void)
1088 {
1089     unsigned long i;
1090
1091     for (i = 0; i < decomps_used; i++) {
1092         dectmp_size = 0;
1093         decomp_it(&decomps[i]);
1094         if (dectmp_size > 0)
1095           add_decomp(decomps[i].code);
1096     }
1097 }
1098
1099 static int
1100 cmpcomps(_comp_t *comp1, _comp_t *comp2)
1101 {
1102     long diff = comp1->code1 - comp2->code1;
1103
1104     if (!diff)
1105         diff = comp1->code2 - comp2->code2;
1106     return (int) diff;
1107 }
1108
1109 /*
1110  * Load composition exclusion data
1111  */
1112 static void
1113 read_compexdata(FILE *in)
1114 {
1115     unsigned short i, code;
1116     char line[512], *s;
1117
1118     (void) memset((char *) compexs, 0, sizeof(unsigned long) << 11);
1119
1120     while (fscanf(in, "%[^\n]\n", line) != EOF) {
1121         /*
1122          * Skip blank lines and lines that start with a '#'.
1123          */
1124         if (line[0] == 0 || line[0] == '#')
1125             continue;
1126
1127         /*
1128          * Collect the code.  Assume max 4 digits
1129          */
1130
1131         for (s = line, i = code = 0; *s != '#' && i < 4; i++, s++) {
1132             code <<= 4;
1133             if (*s >= '0' && *s <= '9')
1134                 code += *s - '0';
1135             else if (*s >= 'A' && *s <= 'F')
1136                 code += (*s - 'A') + 10;
1137             else if (*s >= 'a' && *s <= 'f')
1138                 code += (*s - 'a') + 10;
1139         }
1140         COMPEX_SET(code);
1141     }
1142 }
1143
1144 /*
1145  * Creates array of compositions from decomposition array
1146  */
1147 static void
1148 create_comps(void)
1149 {
1150     unsigned long i, cu;
1151
1152     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1153
1154     for (i = cu = 0; i < decomps_used; i++) {
1155         if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1156             continue;
1157         comps[cu].comp = decomps[i].code;
1158         comps[cu].count = 2;
1159         comps[cu].code1 = decomps[i].decomp[0];
1160         comps[cu].code2 = decomps[i].decomp[1];
1161         cu++;
1162     }
1163     comps_used = cu;
1164     qsort(comps, comps_used, sizeof(_comp_t),
1165           (int (*)(const void *, const void *)) cmpcomps);
1166 }
1167
1168 static void
1169 write_cdata(char *opath)
1170 {
1171     FILE *out;
1172     unsigned long i, idx, bytes, nprops;
1173     unsigned short casecnt[2];
1174     char path[BUFSIZ];
1175
1176     /*****************************************************************
1177      *
1178      * Generate the ctype data.
1179      *
1180      *****************************************************************/
1181
1182     /*
1183      * Open the ctype.dat file.
1184      */
1185     sprintf(path, "%s%sctype.dat", opath, LDAP_DIRSEP);
1186     if ((out = fopen(path, "wb")) == 0)
1187       return;
1188
1189     /*
1190      * Collect the offsets for the properties.  The offsets array is
1191      * on a 4-byte boundary to keep things efficient for architectures
1192      * that need such a thing.
1193      */
1194     for (i = idx = 0; i < NUMPROPS; i++) {
1195         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1196         idx += proptbl[i].used;
1197     }
1198
1199     /*
1200      * Add the sentinel index which is used by the binary search as the upper
1201      * bound for a search.
1202      */
1203     propcnt[i] = idx;
1204
1205     /*
1206      * Record the actual number of property lists.  This may be different than
1207      * the number of offsets actually written because of aligning on a 4-byte
1208      * boundary.
1209      */
1210     hdr[1] = NUMPROPS;
1211
1212     /*
1213      * Calculate the byte count needed and pad the property counts array to a
1214      * 4-byte boundary.
1215      */
1216     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
1217       bytes += 4 - (bytes & 3);
1218     nprops = bytes / sizeof(unsigned short);
1219     bytes += sizeof(unsigned long) * idx;
1220
1221     /*
1222      * Write the header.
1223      */
1224     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1225
1226     /*
1227      * Write the byte count.
1228      */
1229     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1230
1231     /*
1232      * Write the property list counts.
1233      */
1234     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
1235
1236     /*
1237      * Write the property lists.
1238      */
1239     for (i = 0; i < NUMPROPS; i++) {
1240         if (proptbl[i].used > 0)
1241           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
1242                  proptbl[i].used, out);
1243     }
1244
1245     fclose(out);
1246
1247     /*****************************************************************
1248      *
1249      * Generate the case mapping data.
1250      *
1251      *****************************************************************/
1252
1253     /*
1254      * Open the case.dat file.
1255      */
1256     sprintf(path, "%s%scase.dat", opath, LDAP_DIRSEP);
1257     if ((out = fopen(path, "wb")) == 0)
1258       return;
1259
1260     /*
1261      * Write the case mapping tables.
1262      */
1263     hdr[1] = upper_used + lower_used + title_used;
1264     casecnt[0] = upper_used;
1265     casecnt[1] = lower_used;
1266
1267     /*
1268      * Write the header.
1269      */
1270     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1271
1272     /*
1273      * Write the upper and lower case table sizes.
1274      */
1275     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
1276
1277     if (upper_used > 0)
1278       /*
1279        * Write the upper case table.
1280        */
1281       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1282
1283     if (lower_used > 0)
1284       /*
1285        * Write the lower case table.
1286        */
1287       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1288
1289     if (title_used > 0)
1290       /*
1291        * Write the title case table.
1292        */
1293       fwrite((char *) title, sizeof(_case_t), title_used, out);
1294
1295     fclose(out);
1296
1297     /*****************************************************************
1298      *
1299      * Generate the composition data.
1300      *
1301      *****************************************************************/
1302
1303     /*
1304      * Create compositions from decomposition data
1305      */
1306     create_comps();
1307
1308     /*
1309      * Open the comp.dat file.
1310      */
1311     sprintf(path, "%s%scomp.dat", opath, LDAP_DIRSEP);
1312     if ((out = fopen(path, "wb")) == 0)
1313         return;
1314
1315     /*
1316      * Write the header.
1317      */
1318     hdr[1] = (unsigned short) comps_used * 4;
1319     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1320
1321     /*
1322      * Write out the byte count to maintain header size.
1323      */
1324     bytes = comps_used * sizeof(_comp_t);
1325     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1326
1327     /*
1328      * Now, if comps exist, write them out.
1329      */
1330     if (comps_used > 0)
1331         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1332
1333     fclose(out);
1334
1335     /*****************************************************************
1336      *
1337      * Generate the decomposition data.
1338      *
1339      *****************************************************************/
1340
1341     /*
1342      * Fully expand all decompositions before generating the output file.
1343      */
1344     expand_decomp();
1345
1346     /*
1347      * Open the decomp.dat file.
1348      */
1349     sprintf(path, "%s%sdecomp.dat", opath, LDAP_DIRSEP);
1350     if ((out = fopen(path, "wb")) == 0)
1351       return;
1352
1353     hdr[1] = decomps_used;
1354
1355     /*
1356      * Write the header.
1357      */
1358     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1359
1360     /*
1361      * Write a temporary byte count which will be calculated as the
1362      * decompositions are written out.
1363      */
1364     bytes = 0;
1365     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1366
1367     if (decomps_used) {
1368         /*
1369          * Write the list of decomp nodes.
1370          */
1371         for (i = idx = 0; i < decomps_used; i++) {
1372             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
1373             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1374             idx += decomps[i].used;
1375         }
1376
1377         /*
1378          * Write the sentinel index as the last decomp node.
1379          */
1380         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1381
1382         /*
1383          * Write the decompositions themselves.
1384          */
1385         for (i = 0; i < decomps_used; i++)
1386           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
1387                  decomps[i].used, out);
1388
1389         /*
1390          * Seek back to the beginning and write the byte count.
1391          */
1392         bytes = (sizeof(unsigned long) * idx) +
1393             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
1394         fseek(out, sizeof(unsigned short) << 1, 0L);
1395         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1396
1397         fclose(out);
1398     }
1399
1400     /*****************************************************************
1401      *
1402      * Generate the combining class data.
1403      *
1404      *****************************************************************/
1405
1406     /*
1407      * Open the cmbcl.dat file.
1408      */
1409     sprintf(path, "%s%scmbcl.dat", opath, LDAP_DIRSEP);
1410     if ((out = fopen(path, "wb")) == 0)
1411       return;
1412
1413     /*
1414      * Set the number of ranges used.  Each range has a combining class which
1415      * means each entry is a 3-tuple.
1416      */
1417     hdr[1] = ccl_used / 3;
1418
1419     /*
1420      * Write the header.
1421      */
1422     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1423
1424     /*
1425      * Write out the byte count to maintain header size.
1426      */
1427     bytes = ccl_used * sizeof(unsigned long);
1428     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1429
1430     if (ccl_used > 0)
1431       /*
1432        * Write the combining class ranges out.
1433        */
1434       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
1435
1436     fclose(out);
1437
1438     /*****************************************************************
1439      *
1440      * Generate the number data.
1441      *
1442      *****************************************************************/
1443
1444     /*
1445      * Open the num.dat file.
1446      */
1447     sprintf(path, "%s%snum.dat", opath, LDAP_DIRSEP);
1448     if ((out = fopen(path, "wb")) == 0)
1449       return;
1450
1451     /*
1452      * The count part of the header will be the total number of codes that
1453      * have numbers.
1454      */
1455     hdr[1] = (unsigned short) (ncodes_used << 1);
1456     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1457
1458     /*
1459      * Write the header.
1460      */
1461     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1462
1463     /*
1464      * Write out the byte count to maintain header size.
1465      */
1466     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1467
1468     /*
1469      * Now, if number mappings exist, write them out.
1470      */
1471     if (ncodes_used > 0) {
1472         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1473         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1474     }
1475
1476     fclose(out);
1477 }
1478
1479 static void
1480 usage(char *prog)
1481 {
1482     fprintf(stderr,
1483             "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
1484     fprintf(stderr, " datafile1 datafile2 ...\n\n");
1485     fprintf(stderr,
1486             "-o output-directory\n\t\tWrite the output files to a different");
1487     fprintf(stderr, " directory (default: .).\n");
1488     fprintf(stderr,
1489             "-x composition-exclusion\n\t\tFile of composition codes");
1490     fprintf(stderr, " that should be excluded.\n");
1491     exit(1);
1492 }
1493
1494 int
1495 main(int argc, char *argv[])
1496 {
1497     FILE *in;
1498     char *prog, *opath;
1499
1500     if ((prog = strrchr(argv[0], *LDAP_DIRSEP)) != 0)
1501       prog++;
1502     else
1503       prog = argv[0];
1504
1505     opath = 0;
1506     in = stdin;
1507
1508     argc--;
1509     argv++;
1510
1511     while (argc > 0) {
1512         if (argv[0][0] == '-') {
1513             switch (argv[0][1]) {
1514               case 'o':
1515                 argc--;
1516                 argv++;
1517                 opath = argv[0];
1518                 break;
1519               case 'x':
1520                 argc--;
1521                 argv++;
1522                 if ((in = fopen(argv[0], "rb")) == 0)
1523                   fprintf(stderr,
1524                           "%s: unable to open composition exclusion file %s\n",
1525                           prog, argv[0]);
1526                 else {
1527                     read_compexdata(in);
1528                     fclose(in);
1529                     in = 0;
1530                 }
1531                 break;
1532               default:
1533                 usage(prog);
1534             }
1535         } else {
1536             if (in != stdin && in != NULL)
1537               fclose(in);
1538             if ((in = fopen(argv[0], "rb")) == 0)
1539               fprintf(stderr, "%s: unable to open ctype file %s\n",
1540                       prog, argv[0]);
1541             else {
1542                 read_cdata(in);
1543                 fclose(in);
1544                 in = 0;
1545             }
1546         }
1547         argc--;
1548         argv++;
1549     }
1550
1551     if (opath == 0)
1552       opath = ".";
1553     write_cdata(opath);
1554
1555     return 0;
1556 }