git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucdata/ucgendat.c

   1 /* $OpenLDAP$ */
   2 /*
   3  * Copyright 2001 Computing Research Labs, New Mexico State University
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23 /* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
  24
  25 #include "portable.h"
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <ac/string.h>
  30 #include <ac/unistd.h>
  31
  32 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
  33                       ((cc) >= 'A' && (cc) <= 'F') ||\
  34                       ((cc) >= 'a' && (cc) <= 'f'))
  35
  36 /*
  37  * A header written to the output file with the byte-order-mark and the number
  38  * of property nodes.
  39  */
  40 static unsigned short hdr[2] = {0xfeff, 0};
  41
  42 #define NUMPROPS 50
  43 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
  44
  45 typedef struct {
  46     char *name;
  47     int len;
  48 } _prop_t;
  49
  50 /*
  51  * List of properties expected to be found in the Unicode Character Database
  52  * including some implementation specific properties.
  53  *
  54  * The implementation specific properties are:
  55  * Cm = Composed (can be decomposed)
  56  * Nb = Non-breaking
  57  * Sy = Symmetric (has left and right forms)
  58  * Hd = Hex digit
  59  * Qm = Quote marks
  60  * Mr = Mirroring
  61  * Ss = Space, other
  62  * Cp = Defined character
  63  */
  64 static _prop_t props[NUMPROPS] = {
  65     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
  66     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
  67     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
  68     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
  69     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
  70     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
  71     {"S",  1}, {"WS", 2}, {"ON", 2},
  72     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
  73     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
  74 };
  75
  76 typedef struct {
  77     unsigned long *ranges;
  78     unsigned short used;
  79     unsigned short size;
  80 } _ranges_t;
  81
  82 static _ranges_t proptbl[NUMPROPS];
  83
  84 /*
  85  * Make sure this array is sized to be on a 4-byte boundary at compile time.
  86  */
  87 static unsigned short propcnt[NEEDPROPS];
  88
  89 /*
  90  * Array used to collect a decomposition before adding it to the decomposition
  91  * table.
  92  */
  93 static unsigned long dectmp[64];
  94 static unsigned long dectmp_size;
  95
  96 typedef struct {
  97     unsigned long code;
  98     unsigned short size;
  99     unsigned short used;
 100     unsigned long *decomp;
 101 } _decomp_t;
 102
 103 /*
 104  * List of decomposition.  Created and expanded in order as the characters are
 105  * encountered.
 106  */
 107 static _decomp_t *decomps;
 108 static unsigned long decomps_used;
 109 static unsigned long decomps_size;
 110
 111 /*
 112  * Composition exclusion table stuff.
 113  */
 114 #define COMPEX_SET(c) (compexs[(c) >> 15] |= (1 << ((c) & 31)))
 115 #define COMPEX_TEST(c) (compexs[(c) >> 15] & (1 << ((c) & 31)))
 116 static unsigned long compexs[2048];
 117
 118 /*
 119  * Struct for holding a composition pair, and array of composition pairs
 120  */
 121 typedef struct {
 122     unsigned long comp;
 123     unsigned long count;
 124     unsigned long code1;
 125     unsigned long code2;
 126 } _comp_t;
 127
 128 static _comp_t *comps;
 129 static unsigned long comps_used;
 130
 131 /*
 132  * Types and lists for handling lists of case mappings.
 133  */
 134 typedef struct {
 135     unsigned long key;
 136     unsigned long other1;
 137     unsigned long other2;
 138 } _case_t;
 139
 140 static _case_t *upper;
 141 static _case_t *lower;
 142 static _case_t *title;
 143 static unsigned long upper_used;
 144 static unsigned long upper_size;
 145 static unsigned long lower_used;
 146 static unsigned long lower_size;
 147 static unsigned long title_used;
 148 static unsigned long title_size;
 149
 150 /*
 151  * Array used to collect case mappings before adding them to a list.
 152  */
 153 static unsigned long cases[3];
 154
 155 /*
 156  * An array to hold ranges for combining classes.
 157  */
 158 static unsigned long *ccl;
 159 static unsigned long ccl_used;
 160 static unsigned long ccl_size;
 161
 162 /*
 163  * Structures for handling numbers.
 164  */
 165 typedef struct {
 166     unsigned long code;
 167     unsigned long idx;
 168 } _codeidx_t;
 169
 170 typedef struct {
 171     short numerator;
 172     short denominator;
 173 } _num_t;
 174
 175 /*
 176  * Arrays to hold the mapping of codes to numbers.
 177  */
 178 static _codeidx_t *ncodes;
 179 static unsigned long ncodes_used;
 180 static unsigned long ncodes_size;
 181
 182 static _num_t *nums;
 183 static unsigned long nums_used;
 184 static unsigned long nums_size;
 185
 186 /*
 187  * Array for holding numbers.
 188  */
 189 static _num_t *nums;
 190 static unsigned long nums_used;
 191 static unsigned long nums_size;
 192
 193 static void
 194 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
 195 {
 196     int i, j, k, len;
 197     _ranges_t *rlp;
 198     char *name;
 199
 200     for (k = 0; k < 2; k++) {
 201         if (k == 0) {
 202             name = p1;
 203             len = 2;
 204         } else {
 205             if (p2 == 0)
 206               break;
 207
 208             name = p2;
 209             len = 1;
 210         }
 211
 212         for (i = 0; i < NUMPROPS; i++) {
 213             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 214               break;
 215         }
 216
 217         if (i == NUMPROPS)
 218           continue;
 219
 220         rlp = &proptbl[i];
 221
 222         /*
 223          * Resize the range list if necessary.
 224          */
 225         if (rlp->used == rlp->size) {
 226             if (rlp->size == 0)
 227               rlp->ranges = (unsigned long *)
 228                   malloc(sizeof(unsigned long) << 3);
 229             else
 230               rlp->ranges = (unsigned long *)
 231                   realloc((char *) rlp->ranges,
 232                           sizeof(unsigned long) * (rlp->size + 8));
 233             rlp->size += 8;
 234         }
 235
 236         /*
 237          * If this is the first code for this property list, just add it
 238          * and return.
 239          */
 240         if (rlp->used == 0) {
 241             rlp->ranges[0] = start;
 242             rlp->ranges[1] = end;
 243             rlp->used += 2;
 244             continue;
 245         }
 246
 247         /*
 248          * Optimize the case of adding the range to the end.
 249          */
 250         j = rlp->used - 1;
 251         if (start > rlp->ranges[j]) {
 252             j = rlp->used;
 253             rlp->ranges[j++] = start;
 254             rlp->ranges[j++] = end;
 255             rlp->used = j;
 256             continue;
 257         }
 258
 259         /*
 260          * Need to locate the insertion point.
 261          */
 262         for (i = 0;
 263              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
 264
 265         /*
 266          * If the start value lies in the current range, then simply set the
 267          * new end point of the range to the end value passed as a parameter.
 268          */
 269         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
 270             rlp->ranges[i + 1] = end;
 271             return;
 272         }
 273
 274         /*
 275          * Shift following values up by two.
 276          */
 277         for (j = rlp->used; j > i; j -= 2) {
 278             rlp->ranges[j] = rlp->ranges[j - 2];
 279             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 280         }
 281
 282         /*
 283          * Add the new range at the insertion point.
 284          */
 285         rlp->ranges[i] = start;
 286         rlp->ranges[i + 1] = end;
 287         rlp->used += 2;
 288     }
 289 }
 290
 291 static void
 292 ordered_range_insert(unsigned long c, char *name, int len)
 293 {
 294     int i, j;
 295     unsigned long s, e;
 296     _ranges_t *rlp;
 297
 298     if (len == 0)
 299       return;
 300
 301     /*
 302      * Deal with directionality codes introduced in Unicode 3.0.
 303      */
 304     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
 305         (len == 3 &&
 306          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
 307           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
 308           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
 309         /*
 310          * Mark all of these as Other Neutral to preserve compatibility with
 311          * older versions.
 312          */
 313         len = 2;
 314         name = "ON";
 315     }
 316
 317     for (i = 0; i < NUMPROPS; i++) {
 318         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 319           break;
 320     }
 321
 322     if (i == NUMPROPS)
 323       return;
 324
 325     /*
 326      * Have a match, so insert the code in order.
 327      */
 328     rlp = &proptbl[i];
 329
 330     /*
 331      * Resize the range list if necessary.
 332      */
 333     if (rlp->used == rlp->size) {
 334         if (rlp->size == 0)
 335           rlp->ranges = (unsigned long *)
 336               malloc(sizeof(unsigned long) << 3);
 337         else
 338           rlp->ranges = (unsigned long *)
 339               realloc((char *) rlp->ranges,
 340                       sizeof(unsigned long) * (rlp->size + 8));
 341         rlp->size += 8;
 342     }
 343
 344     /*
 345      * If this is the first code for this property list, just add it
 346      * and return.
 347      */
 348     if (rlp->used == 0) {
 349         rlp->ranges[0] = rlp->ranges[1] = c;
 350         rlp->used += 2;
 351         return;
 352     }
 353
 354     /*
 355      * Optimize the cases of extending the last range and adding new ranges to
 356      * the end.
 357      */
 358     j = rlp->used - 1;
 359     e = rlp->ranges[j];
 360     s = rlp->ranges[j - 1];
 361
 362     if (c == e + 1) {
 363         /*
 364          * Extend the last range.
 365          */
 366         rlp->ranges[j] = c;
 367         return;
 368     }
 369
 370     if (c > e + 1) {
 371         /*
 372          * Start another range on the end.
 373          */
 374         j = rlp->used;
 375         rlp->ranges[j] = rlp->ranges[j + 1] = c;
 376         rlp->used += 2;
 377         return;
 378     }
 379
 380     if (c >= s)
 381       /*
 382        * The code is a duplicate of a code in the last range, so just return.
 383        */
 384       return;
 385
 386     /*
 387      * The code should be inserted somewhere before the last range in the
 388      * list.  Locate the insertion point.
 389      */
 390     for (i = 0;
 391          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
 392
 393     s = rlp->ranges[i];
 394     e = rlp->ranges[i + 1];
 395
 396     if (c == e + 1)
 397       /*
 398        * Simply extend the current range.
 399        */
 400       rlp->ranges[i + 1] = c;
 401     else if (c < s) {
 402         /*
 403          * Add a new entry before the current location.  Shift all entries
 404          * before the current one up by one to make room.
 405          */
 406         for (j = rlp->used; j > i; j -= 2) {
 407             rlp->ranges[j] = rlp->ranges[j - 2];
 408             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 409         }
 410         rlp->ranges[i] = rlp->ranges[i + 1] = c;
 411
 412         rlp->used += 2;
 413     }
 414 }
 415
 416 static void
 417 add_decomp(unsigned long code)
 418 {
 419     unsigned long i, j, size;
 420
 421     /*
 422      * Add the code to the composite property.
 423      */
 424     ordered_range_insert(code, "Cm", 2);
 425
 426     /*
 427      * Locate the insertion point for the code.
 428      */
 429     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
 430
 431     /*
 432      * Allocate space for a new decomposition.
 433      */
 434     if (decomps_used == decomps_size) {
 435         if (decomps_size == 0)
 436           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
 437         else
 438           decomps = (_decomp_t *)
 439               realloc((char *) decomps,
 440                       sizeof(_decomp_t) * (decomps_size + 8));
 441         (void) memset((char *) (decomps + decomps_size), '\0',
 442                       sizeof(_decomp_t) << 3);
 443         decomps_size += 8;
 444     }
 445
 446     if (i < decomps_used && code != decomps[i].code) {
 447         /*
 448          * Shift the decomps up by one if the codes don't match.
 449          */
 450         for (j = decomps_used; j > i; j--)
 451           (void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1],
 452                         sizeof(_decomp_t));
 453     }
 454
 455     /*
 456      * Insert or replace a decomposition.
 457      */
 458     size = dectmp_size + (4 - (dectmp_size & 3));
 459     if (decomps[i].size < size) {
 460         if (decomps[i].size == 0)
 461           decomps[i].decomp = (unsigned long *)
 462               malloc(sizeof(unsigned long) * size);
 463         else
 464           decomps[i].decomp = (unsigned long *)
 465               realloc((char *) decomps[i].decomp,
 466                       sizeof(unsigned long) * size);
 467         decomps[i].size = size;
 468     }
 469
 470     if (decomps[i].code != code)
 471       decomps_used++;
 472
 473     decomps[i].code = code;
 474     decomps[i].used = dectmp_size;
 475     (void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp,
 476                   sizeof(unsigned long) * dectmp_size);
 477
 478     /*
 479      * NOTICE: This needs changing later so it is more general than simply
 480      * pairs.  This calculation is done here to simplify allocation elsewhere.
 481      */
 482     if (dectmp_size == 2)
 483       comps_used++;
 484 }
 485
 486 static void
 487 add_title(unsigned long code)
 488 {
 489     unsigned long i, j;
 490
 491     /*
 492      * Always map the code to itself.
 493      */
 494     cases[2] = code;
 495
 496     if (title_used == title_size) {
 497         if (title_size == 0)
 498           title = (_case_t *) malloc(sizeof(_case_t) << 3);
 499         else
 500           title = (_case_t *) realloc((char *) title,
 501                                       sizeof(_case_t) * (title_size + 8));
 502         title_size += 8;
 503     }
 504
 505     /*
 506      * Locate the insertion point.
 507      */
 508     for (i = 0; i < title_used && code > title[i].key; i++) ;
 509
 510     if (i < title_used) {
 511         /*
 512          * Shift the array up by one.
 513          */
 514         for (j = title_used; j > i; j--)
 515           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
 516                         sizeof(_case_t));
 517     }
 518
 519     title[i].key = cases[2];    /* Title */
 520     title[i].other1 = cases[0]; /* Upper */
 521     title[i].other2 = cases[1]; /* Lower */
 522
 523     title_used++;
 524 }
 525
 526 static void
 527 add_upper(unsigned long code)
 528 {
 529     unsigned long i, j;
 530
 531     /*
 532      * Always map the code to itself.
 533      */
 534     cases[0] = code;
 535
 536     /*
 537      * If the title case character is not present, then make it the same as
 538      * the upper case.
 539      */
 540     if (cases[2] == 0)
 541       cases[2] = code;
 542
 543     if (upper_used == upper_size) {
 544         if (upper_size == 0)
 545           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
 546         else
 547           upper = (_case_t *) realloc((char *) upper,
 548                                       sizeof(_case_t) * (upper_size + 8));
 549         upper_size += 8;
 550     }
 551
 552     /*
 553      * Locate the insertion point.
 554      */
 555     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
 556
 557     if (i < upper_used) {
 558         /*
 559          * Shift the array up by one.
 560          */
 561         for (j = upper_used; j > i; j--)
 562           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
 563                         sizeof(_case_t));
 564     }
 565
 566     upper[i].key = cases[0];    /* Upper */
 567     upper[i].other1 = cases[1]; /* Lower */
 568     upper[i].other2 = cases[2]; /* Title */
 569
 570     upper_used++;
 571 }
 572
 573 static void
 574 add_lower(unsigned long code)
 575 {
 576     unsigned long i, j;
 577
 578     /*
 579      * Always map the code to itself.
 580      */
 581     cases[1] = code;
 582
 583     /*
 584      * If the title case character is empty, then make it the same as the
 585      * upper case.
 586      */
 587     if (cases[2] == 0)
 588       cases[2] = cases[0];
 589
 590     if (lower_used == lower_size) {
 591         if (lower_size == 0)
 592           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
 593         else
 594           lower = (_case_t *) realloc((char *) lower,
 595                                       sizeof(_case_t) * (lower_size + 8));
 596         lower_size += 8;
 597     }
 598
 599     /*
 600      * Locate the insertion point.
 601      */
 602     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
 603
 604     if (i < lower_used) {
 605         /*
 606          * Shift the array up by one.
 607          */
 608         for (j = lower_used; j > i; j--)
 609           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
 610                         sizeof(_case_t));
 611     }
 612
 613     lower[i].key = cases[1];    /* Lower */
 614     lower[i].other1 = cases[0]; /* Upper */
 615     lower[i].other2 = cases[2]; /* Title */
 616
 617     lower_used++;
 618 }
 619
 620 static void
 621 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
 622 {
 623     unsigned long i, j;
 624
 625     if (ccl_used == ccl_size) {
 626         if (ccl_size == 0)
 627           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
 628         else
 629           ccl = (unsigned long *)
 630               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
 631         ccl_size += 24;
 632     }
 633
 634     /*
 635      * Optimize adding the first item.
 636      */
 637     if (ccl_used == 0) {
 638         ccl[0] = ccl[1] = c;
 639         ccl[2] = ccl_code;
 640         ccl_used += 3;
 641         return;
 642     }
 643
 644     /*
 645      * Handle the special case of extending the range on the end.  This
 646      * requires that the combining class codes are the same.
 647      */
 648     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
 649         ccl[ccl_used - 2] = c;
 650         return;
 651     }
 652
 653     /*
 654      * Handle the special case of adding another range on the end.
 655      */
 656     if (c > ccl[ccl_used - 2] + 1 ||
 657         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
 658         ccl[ccl_used++] = c;
 659         ccl[ccl_used++] = c;
 660         ccl[ccl_used++] = ccl_code;
 661         return;
 662     }
 663
 664     /*
 665      * Locate either the insertion point or range for the code.
 666      */
 667     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
 668
 669     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
 670         /*
 671          * Extend an existing range.
 672          */
 673         ccl[i + 1] = c;
 674         return;
 675     } else if (c < ccl[i]) {
 676         /*
 677          * Start a new range before the current location.
 678          */
 679         for (j = ccl_used; j > i; j -= 3) {
 680             ccl[j] = ccl[j - 3];
 681             ccl[j - 1] = ccl[j - 4];
 682             ccl[j - 2] = ccl[j - 5];
 683         }
 684         ccl[i] = ccl[i + 1] = c;
 685         ccl[i + 2] = ccl_code;
 686     }
 687 }
 688
 689 /*
 690  * Adds a number if it does not already exist and returns an index value
 691  * multiplied by 2.
 692  */
 693 static unsigned long
 694 make_number(short num, short denom)
 695 {
 696     unsigned long n;
 697
 698     /*
 699      * Determine if the number already exists.
 700      */
 701     for (n = 0; n < nums_used; n++) {
 702         if (nums[n].numerator == num && nums[n].denominator == denom)
 703           return n << 1;
 704     }
 705
 706     if (nums_used == nums_size) {
 707         if (nums_size == 0)
 708           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
 709         else
 710           nums = (_num_t *) realloc((char *) nums,
 711                                     sizeof(_num_t) * (nums_size + 8));
 712         nums_size += 8;
 713     }
 714
 715     n = nums_used++;
 716     nums[n].numerator = num;
 717     nums[n].denominator = denom;
 718
 719     return n << 1;
 720 }
 721
 722 static void
 723 add_number(unsigned long code, short num, short denom)
 724 {
 725     unsigned long i, j;
 726
 727     /*
 728      * Insert the code in order.
 729      */
 730     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
 731
 732     /*
 733      * Handle the case of the codes matching and simply replace the number
 734      * that was there before.
 735      */
 736     if (ncodes_used > 0 && code == ncodes[i].code) {
 737         ncodes[i].idx = make_number(num, denom);
 738         return;
 739     }
 740
 741     /*
 742      * Resize the array if necessary.
 743      */
 744     if (ncodes_used == ncodes_size) {
 745         if (ncodes_size == 0)
 746           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
 747         else
 748           ncodes = (_codeidx_t *)
 749               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
 750
 751         ncodes_size += 8;
 752     }
 753
 754     /*
 755      * Shift things around to insert the code if necessary.
 756      */
 757     if (i < ncodes_used) {
 758         for (j = ncodes_used; j > i; j--) {
 759             ncodes[j].code = ncodes[j - 1].code;
 760             ncodes[j].idx = ncodes[j - 1].idx;
 761         }
 762     }
 763     ncodes[i].code = code;
 764     ncodes[i].idx = make_number(num, denom);
 765
 766     ncodes_used++;
 767 }
 768
 769 /*
 770  * This routine assumes that the line is a valid Unicode Character Database
 771  * entry.
 772  */
 773 static void
 774 read_cdata(FILE *in)
 775 {
 776     unsigned long i, lineno, skip, code, ccl_code;
 777     short wnum, neg, number[2];
 778     char line[512], *s, *e;
 779
 780     lineno = skip = 0;
 781     while (fscanf(in, "%[^\n]\n", line) != EOF) {
 782         lineno++;
 783
 784         /*
 785          * Skip blank lines and lines that start with a '#'.
 786          */
 787         if (line[0] == 0 || line[0] == '#')
 788           continue;
 789
 790         /*
 791          * If lines need to be skipped, do it here.
 792          */
 793         if (skip) {
 794             skip--;
 795             continue;
 796         }
 797
 798         /*
 799          * Collect the code.  The code can be up to 6 hex digits in length to
 800          * allow surrogates to be specified.
 801          */
 802         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
 803             code <<= 4;
 804             if (*s >= '0' && *s <= '9')
 805               code += *s - '0';
 806             else if (*s >= 'A' && *s <= 'F')
 807               code += (*s - 'A') + 10;
 808             else if (*s >= 'a' && *s <= 'f')
 809               code += (*s - 'a') + 10;
 810         }
 811
 812         /*
 813          * Handle the following special cases:
 814          * 1. 4E00-9FA5 CJK Ideographs.
 815          * 2. AC00-D7A3 Hangul Syllables.
 816          * 3. D800-DFFF Surrogates.
 817          * 4. E000-F8FF Private Use Area.
 818          * 5. F900-FA2D Han compatibility.
 819          */
 820         switch (code) {
 821           case 0x4e00:
 822             /*
 823              * The Han ideographs.
 824              */
 825             add_range(0x4e00, 0x9fff, "Lo", "L");
 826
 827             /*
 828              * Add the characters to the defined category.
 829              */
 830             add_range(0x4e00, 0x9fa5, "Cp", 0);
 831
 832             skip = 1;
 833             break;
 834           case 0xac00:
 835             /*
 836              * The Hangul syllables.
 837              */
 838             add_range(0xac00, 0xd7a3, "Lo", "L");
 839
 840             /*
 841              * Add the characters to the defined category.
 842              */
 843             add_range(0xac00, 0xd7a3, "Cp", 0);
 844
 845             skip = 1;
 846             break;
 847           case 0xd800:
 848             /*
 849              * Make a range of all surrogates and assume some default
 850              * properties.
 851              */
 852             add_range(0x010000, 0x10ffff, "Cs", "L");
 853             skip = 5;
 854             break;
 855           case 0xe000:
 856             /*
 857              * The Private Use area.  Add with a default set of properties.
 858              */
 859             add_range(0xe000, 0xf8ff, "Co", "L");
 860             skip = 1;
 861             break;
 862           case 0xf900:
 863             /*
 864              * The CJK compatibility area.
 865              */
 866             add_range(0xf900, 0xfaff, "Lo", "L");
 867
 868             /*
 869              * Add the characters to the defined category.
 870              */
 871             add_range(0xf900, 0xfaff, "Cp", 0);
 872
 873             skip = 1;
 874         }
 875
 876         if (skip)
 877           continue;
 878
 879         /*
 880          * Add the code to the defined category.
 881          */
 882         ordered_range_insert(code, "Cp", 2);
 883
 884         /*
 885          * Locate the first character property field.
 886          */
 887         for (i = 0; *s != 0 && i < 2; s++) {
 888             if (*s == ';')
 889               i++;
 890         }
 891         for (e = s; *e && *e != ';'; e++) ;
 892
 893         ordered_range_insert(code, s, e - s);
 894
 895         /*
 896          * Locate the combining class code.
 897          */
 898         for (s = e; *s != 0 && i < 3; s++) {
 899             if (*s == ';')
 900               i++;
 901         }
 902
 903         /*
 904          * Convert the combining class code from decimal.
 905          */
 906         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
 907           ccl_code = (ccl_code * 10) + (*e - '0');
 908
 909         /*
 910          * Add the code if it not 0.
 911          */
 912         if (ccl_code != 0)
 913           ordered_ccl_insert(code, ccl_code);
 914
 915         /*
 916          * Locate the second character property field.
 917          */
 918         for (s = e; *s != 0 && i < 4; s++) {
 919             if (*s == ';')
 920               i++;
 921         }
 922         for (e = s; *e && *e != ';'; e++) ;
 923
 924         ordered_range_insert(code, s, e - s);
 925
 926         /*
 927          * Check for a decomposition.
 928          */
 929         s = ++e;
 930         if (*s != ';' && *s != '<') {
 931             /*
 932              * Collect the codes of the decomposition.
 933              */
 934             for (dectmp_size = 0; *s != ';'; ) {
 935                 /*
 936                  * Skip all leading non-hex digits.
 937                  */
 938                 while (!ishdigit(*s))
 939                   s++;
 940
 941                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
 942                     dectmp[dectmp_size] <<= 4;
 943                     if (*s >= '0' && *s <= '9')
 944                       dectmp[dectmp_size] += *s - '0';
 945                     else if (*s >= 'A' && *s <= 'F')
 946                       dectmp[dectmp_size] += (*s - 'A') + 10;
 947                     else if (*s >= 'a' && *s <= 'f')
 948                       dectmp[dectmp_size] += (*s - 'a') + 10;
 949                 }
 950                 dectmp_size++;
 951             }
 952
 953             /*
 954              * If there are any codes in the temporary decomposition array,
 955              * then add the character with its decomposition.
 956              */
 957             if (dectmp_size > 0)
 958               add_decomp(code);
 959         }
 960
 961         /*
 962          * Skip to the number field.
 963          */
 964         for (i = 0; i < 3 && *s; s++) {
 965             if (*s == ';')
 966               i++;
 967         }
 968
 969         /*
 970          * Scan the number in.
 971          */
 972         number[0] = number[1] = 0;
 973         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
 974             if (*e == '-') {
 975                 neg = 1;
 976                 continue;
 977             }
 978
 979             if (*e == '/') {
 980                 /*
 981                  * Move the the denominator of the fraction.
 982                  */
 983                 if (neg)
 984                   number[wnum] *= -1;
 985                 neg = 0;
 986                 e++;
 987                 wnum++;
 988             }
 989             number[wnum] = (number[wnum] * 10) + (*e - '0');
 990         }
 991
 992         if (e > s) {
 993             /*
 994              * Adjust the denominator in case of integers and add the number.
 995              */
 996             if (wnum == 0)
 997               number[1] = number[0];
 998
 999             add_number(code, number[0], number[1]);
1000         }
1001
1002         /*
1003          * Skip to the start of the possible case mappings.
1004          */
1005         for (s = e, i = 0; i < 4 && *s; s++) {
1006             if (*s == ';')
1007               i++;
1008         }
1009
1010         /*
1011          * Collect the case mappings.
1012          */
1013         cases[0] = cases[1] = cases[2] = 0;
1014         for (i = 0; i < 3; i++) {
1015             while (ishdigit(*s)) {
1016                 cases[i] <<= 4;
1017                 if (*s >= '0' && *s <= '9')
1018                   cases[i] += *s - '0';
1019                 else if (*s >= 'A' && *s <= 'F')
1020                   cases[i] += (*s - 'A') + 10;
1021                 else if (*s >= 'a' && *s <= 'f')
1022                   cases[i] += (*s - 'a') + 10;
1023                 s++;
1024             }
1025             if (*s == ';')
1026               s++;
1027         }
1028         if (cases[0] && cases[1])
1029           /*
1030            * Add the upper and lower mappings for a title case character.
1031            */
1032           add_title(code);
1033         else if (cases[1])
1034           /*
1035            * Add the lower and title case mappings for the upper case
1036            * character.
1037            */
1038           add_upper(code);
1039         else if (cases[0])
1040           /*
1041            * Add the upper and title case mappings for the lower case
1042            * character.
1043            */
1044           add_lower(code);
1045     }
1046 }
1047
1048 static _decomp_t *
1049 find_decomp(unsigned long code)
1050 {
1051     long l, r, m;
1052
1053     l = 0;
1054     r = decomps_used - 1;
1055     while (l <= r) {
1056         m = (l + r) >> 1;
1057         if (code > decomps[m].code)
1058           l = m + 1;
1059         else if (code < decomps[m].code)
1060           r = m - 1;
1061         else
1062           return &decomps[m];
1063     }
1064     return 0;
1065 }
1066
1067 static void
1068 decomp_it(_decomp_t *d)
1069 {
1070     unsigned long i;
1071     _decomp_t *dp;
1072
1073     for (i = 0; i < d->used; i++) {
1074         if ((dp = find_decomp(d->decomp[i])) != 0)
1075           decomp_it(dp);
1076         else
1077           dectmp[dectmp_size++] = d->decomp[i];
1078     }
1079 }
1080
1081 /*
1082  * Expand all decompositions by recursively decomposing each character
1083  * in the decomposition.
1084  */
1085 static void
1086 expand_decomp(void)
1087 {
1088     unsigned long i;
1089
1090     for (i = 0; i < decomps_used; i++) {
1091         dectmp_size = 0;
1092         decomp_it(&decomps[i]);
1093         if (dectmp_size > 0)
1094           add_decomp(decomps[i].code);
1095     }
1096 }
1097
1098 static int
1099 cmpcomps(_comp_t *comp1, _comp_t *comp2)
1100 {
1101     long diff = comp1->code1 - comp2->code1;
1102
1103     if (!diff)
1104         diff = comp1->code2 - comp2->code2;
1105     return (int) diff;
1106 }
1107
1108 /*
1109  * Load composition exclusion data
1110  */
1111 static void
1112 read_compexdata(FILE *in)
1113 {
1114     unsigned short i, code;
1115     char line[512], *s;
1116
1117     (void) memset((char *) compexs, 0, sizeof(unsigned long) << 11);
1118
1119     while (fscanf(in, "%[^\n]\n", line) != EOF) {
1120         /*
1121          * Skip blank lines and lines that start with a '#'.
1122          */
1123         if (line[0] == 0 || line[0] == '#')
1124             continue;
1125
1126         /*
1127          * Collect the code.  Assume max 4 digits
1128          */
1129
1130         for (s = line, i = code = 0; *s != '#' && i < 4; i++, s++) {
1131             code <<= 4;
1132             if (*s >= '0' && *s <= '9')
1133                 code += *s - '0';
1134             else if (*s >= 'A' && *s <= 'F')
1135                 code += (*s - 'A') + 10;
1136             else if (*s >= 'a' && *s <= 'f')
1137                 code += (*s - 'a') + 10;
1138         }
1139         COMPEX_SET(code);
1140     }
1141 }
1142
1143 /*
1144  * Creates array of compositions from decomposition array
1145  */
1146 static void
1147 create_comps(void)
1148 {
1149     unsigned long i, cu;
1150
1151     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1152
1153     for (i = cu = 0; i < decomps_used; i++) {
1154         if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1155             continue;
1156         comps[cu].comp = decomps[i].code;
1157         comps[cu].count = 2;
1158         comps[cu].code1 = decomps[i].decomp[0];
1159         comps[cu].code2 = decomps[i].decomp[1];
1160         cu++;
1161     }
1162     qsort(comps, comps_used, sizeof(_comp_t),
1163           (int (*)(const void *, const void *)) cmpcomps);
1164 }
1165
1166 static void
1167 write_cdata(char *opath)
1168 {
1169     FILE *out;
1170     unsigned long i, idx, bytes, nprops;
1171     unsigned short casecnt[2];
1172     char path[BUFSIZ];
1173
1174     /*****************************************************************
1175      *
1176      * Generate the ctype data.
1177      *
1178      *****************************************************************/
1179
1180     /*
1181      * Open the ctype.dat file.
1182      */
1183     sprintf(path, "%s/ctype.dat", opath);
1184     if ((out = fopen(path, "wb")) == 0)
1185       return;
1186
1187     /*
1188      * Collect the offsets for the properties.  The offsets array is
1189      * on a 4-byte boundary to keep things efficient for architectures
1190      * that need such a thing.
1191      */
1192     for (i = idx = 0; i < NUMPROPS; i++) {
1193         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1194         idx += proptbl[i].used;
1195     }
1196
1197     /*
1198      * Add the sentinel index which is used by the binary search as the upper
1199      * bound for a search.
1200      */
1201     propcnt[i] = idx;
1202
1203     /*
1204      * Record the actual number of property lists.  This may be different than
1205      * the number of offsets actually written because of aligning on a 4-byte
1206      * boundary.
1207      */
1208     hdr[1] = NUMPROPS;
1209
1210     /*
1211      * Calculate the byte count needed and pad the property counts array to a
1212      * 4-byte boundary.
1213      */
1214     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
1215       bytes += 4 - (bytes & 3);
1216     nprops = bytes / sizeof(unsigned short);
1217     bytes += sizeof(unsigned long) * idx;
1218
1219     /*
1220      * Write the header.
1221      */
1222     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1223
1224     /*
1225      * Write the byte count.
1226      */
1227     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1228
1229     /*
1230      * Write the property list counts.
1231      */
1232     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
1233
1234     /*
1235      * Write the property lists.
1236      */
1237     for (i = 0; i < NUMPROPS; i++) {
1238         if (proptbl[i].used > 0)
1239           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
1240                  proptbl[i].used, out);
1241     }
1242
1243     fclose(out);
1244
1245     /*****************************************************************
1246      *
1247      * Generate the case mapping data.
1248      *
1249      *****************************************************************/
1250
1251     /*
1252      * Open the case.dat file.
1253      */
1254     sprintf(path, "%s/case.dat", opath);
1255     if ((out = fopen(path, "wb")) == 0)
1256       return;
1257
1258     /*
1259      * Write the case mapping tables.
1260      */
1261     hdr[1] = upper_used + lower_used + title_used;
1262     casecnt[0] = upper_used;
1263     casecnt[1] = lower_used;
1264
1265     /*
1266      * Write the header.
1267      */
1268     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1269
1270     /*
1271      * Write the upper and lower case table sizes.
1272      */
1273     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
1274
1275     if (upper_used > 0)
1276       /*
1277        * Write the upper case table.
1278        */
1279       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1280
1281     if (lower_used > 0)
1282       /*
1283        * Write the lower case table.
1284        */
1285       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1286
1287     if (title_used > 0)
1288       /*
1289        * Write the title case table.
1290        */
1291       fwrite((char *) title, sizeof(_case_t), title_used, out);
1292
1293     fclose(out);
1294
1295     /*****************************************************************
1296      *
1297      * Generate the composition data.
1298      *
1299      *****************************************************************/
1300
1301     /*
1302      * Create compositions from decomposition data
1303      */
1304     create_comps();
1305
1306     /*
1307      * Open the comp.dat file.
1308      */
1309     sprintf(path, "%s/comp.dat", opath);
1310     if ((out = fopen(path, "wb")) == 0)
1311         return;
1312
1313     /*
1314      * Write the header.
1315      */
1316     hdr[1] = (unsigned short) comps_used * 4;
1317     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1318
1319     /*
1320      * Write out the byte count to maintain header size.
1321      */
1322     bytes = comps_used * sizeof(_comp_t);
1323     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1324
1325     /*
1326      * Now, if comps exist, write them out.
1327      */
1328     if (comps_used > 0)
1329         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1330
1331     fclose(out);
1332
1333     /*****************************************************************
1334      *
1335      * Generate the decomposition data.
1336      *
1337      *****************************************************************/
1338
1339     /*
1340      * Fully expand all decompositions before generating the output file.
1341      */
1342     expand_decomp();
1343
1344     /*
1345      * Open the decomp.dat file.
1346      */
1347     sprintf(path, "%s/decomp.dat", opath);
1348     if ((out = fopen(path, "wb")) == 0)
1349       return;
1350
1351     hdr[1] = decomps_used;
1352
1353     /*
1354      * Write the header.
1355      */
1356     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1357
1358     /*
1359      * Write a temporary byte count which will be calculated as the
1360      * decompositions are written out.
1361      */
1362     bytes = 0;
1363     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1364
1365     if (decomps_used) {
1366         /*
1367          * Write the list of decomp nodes.
1368          */
1369         for (i = idx = 0; i < decomps_used; i++) {
1370             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
1371             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1372             idx += decomps[i].used;
1373         }
1374
1375         /*
1376          * Write the sentinel index as the last decomp node.
1377          */
1378         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1379
1380         /*
1381          * Write the decompositions themselves.
1382          */
1383         for (i = 0; i < decomps_used; i++)
1384           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
1385                  decomps[i].used, out);
1386
1387         /*
1388          * Seek back to the beginning and write the byte count.
1389          */
1390         bytes = (sizeof(unsigned long) * idx) +
1391             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
1392         fseek(out, sizeof(unsigned short) << 1, 0L);
1393         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1394
1395         fclose(out);
1396     }
1397
1398     /*****************************************************************
1399      *
1400      * Generate the combining class data.
1401      *
1402      *****************************************************************/
1403
1404     /*
1405      * Open the cmbcl.dat file.
1406      */
1407     sprintf(path, "%s/cmbcl.dat", opath);
1408     if ((out = fopen(path, "wb")) == 0)
1409       return;
1410
1411     /*
1412      * Set the number of ranges used.  Each range has a combining class which
1413      * means each entry is a 3-tuple.
1414      */
1415     hdr[1] = ccl_used / 3;
1416
1417     /*
1418      * Write the header.
1419      */
1420     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1421
1422     /*
1423      * Write out the byte count to maintain header size.
1424      */
1425     bytes = ccl_used * sizeof(unsigned long);
1426     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1427
1428     if (ccl_used > 0)
1429       /*
1430        * Write the combining class ranges out.
1431        */
1432       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
1433
1434     fclose(out);
1435
1436     /*****************************************************************
1437      *
1438      * Generate the number data.
1439      *
1440      *****************************************************************/
1441
1442     /*
1443      * Open the num.dat file.
1444      */
1445     sprintf(path, "%s/num.dat", opath);
1446     if ((out = fopen(path, "wb")) == 0)
1447       return;
1448
1449     /*
1450      * The count part of the header will be the total number of codes that
1451      * have numbers.
1452      */
1453     hdr[1] = (unsigned short) (ncodes_used << 1);
1454     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1455
1456     /*
1457      * Write the header.
1458      */
1459     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1460
1461     /*
1462      * Write out the byte count to maintain header size.
1463      */
1464     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1465
1466     /*
1467      * Now, if number mappings exist, write them out.
1468      */
1469     if (ncodes_used > 0) {
1470         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1471         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1472     }
1473
1474     fclose(out);
1475 }
1476
1477 static void
1478 usage(char *prog)
1479 {
1480     fprintf(stderr,
1481             "Usage: %s [-o output-directory|-x composition-exclusions]", prog);
1482     fprintf(stderr, " datafile1 datafile2 ...\n\n");
1483     fprintf(stderr,
1484             "-o output-directory\n\t\tWrite the output files to a different");
1485     fprintf(stderr, " directory (default: .).\n");
1486     fprintf(stderr,
1487             "-x composition-exclusion\n\t\tFile of composition codes");
1488     fprintf(stderr, " that should be excluded.\n");
1489     exit(1);
1490 }
1491
1492 int
1493 main(int argc, char *argv[])
1494 {
1495     FILE *in;
1496     char *prog, *opath;
1497
1498     if ((prog = strrchr(argv[0], '/')) != 0)
1499       prog++;
1500     else
1501       prog = argv[0];
1502
1503     opath = 0;
1504     in = stdin;
1505
1506     argc--;
1507     argv++;
1508
1509     while (argc > 0) {
1510         if (argv[0][0] == '-') {
1511             switch (argv[0][1]) {
1512               case 'o':
1513                 argc--;
1514                 argv++;
1515                 opath = argv[0];
1516                 break;
1517               case 'x':
1518                 argc--;
1519                 argv++;
1520                 if ((in = fopen(argv[0], "rb")) == 0)
1521                   fprintf(stderr,
1522                           "%s: unable to open composition exclusion file %s\n",
1523                           prog, argv[0]);
1524                 else {
1525                     read_compexdata(in);
1526                     fclose(in);
1527                     in = 0;
1528                 }
1529                 break;
1530               default:
1531                 usage(prog);
1532             }
1533         } else {
1534             if (in != stdin && in != NULL)
1535               fclose(in);
1536             if ((in = fopen(argv[0], "rb")) == 0)
1537               fprintf(stderr, "%s: unable to open ctype file %s\n",
1538                       prog, argv[0]);
1539             else {
1540                 read_cdata(in);
1541                 fclose(in);
1542                 in = 0;
1543             }
1544         }
1545         argc--;
1546         argv++;
1547     }
1548
1549     if (opath == 0)
1550       opath = ".";
1551     write_cdata(opath);
1552
1553     return 0;
1554 }