git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucdata/ucgendat.c

   1 /*
   2  * Copyright 1999 Computing Research Labs, New Mexico State University
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  19  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  20  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22 #ifndef lint
  23 #ifdef __GNUC__
  24 static char rcsid[] __attribute__ ((unused)) = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $";
  25 #else
  26 static char rcsid[] = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $";
  27 #endif
  28 #endif
  29
  30 #include "portable.h"
  31
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <ac/string.h>
  35 #include <ac/unistd.h>
  36
  37 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
  38                       ((cc) >= 'A' && (cc) <= 'F') ||\
  39                       ((cc) >= 'a' && (cc) <= 'f'))
  40
  41 /*
  42  * A header written to the output file with the byte-order-mark and the number
  43  * of property nodes.
  44  */
  45 static unsigned short hdr[2] = {0xfeff, 0};
  46
  47 #define NUMPROPS 49
  48 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
  49
  50 typedef struct {
  51     char *name;
  52     int len;
  53 } _prop_t;
  54
  55 /*
  56  * List of properties expected to be found in the Unicode Character Database
  57  * including some implementation specific properties.
  58  *
  59  * The implementation specific properties are:
  60  * Cm = Composed (can be decomposed)
  61  * Nb = Non-breaking
  62  * Sy = Symmetric (has left and right forms)
  63  * Hd = Hex digit
  64  * Qm = Quote marks
  65  * Mr = Mirroring
  66  * Ss = Space, other
  67  * Cp = Defined character
  68  */
  69 static _prop_t props[NUMPROPS] = {
  70     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
  71     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
  72     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
  73     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
  74     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
  75     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
  76     {"S",  1}, {"WS", 2}, {"ON", 2},
  77     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
  78     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}
  79 };
  80
  81 typedef struct {
  82     unsigned long *ranges;
  83     unsigned short used;
  84     unsigned short size;
  85 } _ranges_t;
  86
  87 static _ranges_t proptbl[NUMPROPS];
  88
  89 /*
  90  * Make sure this array is sized to be on a 4-byte boundary at compile time.
  91  */
  92 static unsigned short propcnt[NEEDPROPS];
  93
  94 /*
  95  * Array used to collect a decomposition before adding it to the decomposition
  96  * table.
  97  */
  98 static unsigned long dectmp[64];
  99 static unsigned long dectmp_size;
 100
 101 typedef struct {
 102     unsigned long code;
 103     unsigned short size;
 104     unsigned short used;
 105     unsigned long *decomp;
 106 } _decomp_t;
 107
 108 /*
 109  * List of decomposition.  Created and expanded in order as the characters are
 110  * encountered.
 111  */
 112 static _decomp_t *decomps;
 113 static unsigned long decomps_used;
 114 static unsigned long decomps_size;
 115
 116 /*
 117  * Types and lists for handling lists of case mappings.
 118  */
 119 typedef struct {
 120     unsigned long key;
 121     unsigned long other1;
 122     unsigned long other2;
 123 } _case_t;
 124
 125 static _case_t *upper;
 126 static _case_t *lower;
 127 static _case_t *title;
 128 static unsigned long upper_used;
 129 static unsigned long upper_size;
 130 static unsigned long lower_used;
 131 static unsigned long lower_size;
 132 static unsigned long title_used;
 133 static unsigned long title_size;
 134
 135 /*
 136  * Array used to collect case mappings before adding them to a list.
 137  */
 138 static unsigned long cases[3];
 139
 140 /*
 141  * An array to hold ranges for combining classes.
 142  */
 143 static unsigned long *ccl;
 144 static unsigned long ccl_used;
 145 static unsigned long ccl_size;
 146
 147 /*
 148  * Structures for handling numbers.
 149  */
 150 typedef struct {
 151     unsigned long code;
 152     unsigned long idx;
 153 } _codeidx_t;
 154
 155 typedef struct {
 156     short numerator;
 157     short denominator;
 158 } _num_t;
 159
 160 /*
 161  * Arrays to hold the mapping of codes to numbers.
 162  */
 163 static _codeidx_t *ncodes;
 164 static unsigned long ncodes_used;
 165 static unsigned long ncodes_size;
 166
 167 static _num_t *nums;
 168 static unsigned long nums_used;
 169 static unsigned long nums_size;
 170
 171 /*
 172  * Array for holding numbers.
 173  */
 174 static _num_t *nums;
 175 static unsigned long nums_used;
 176 static unsigned long nums_size;
 177
 178 static void
 179 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
 180 {
 181     int i, j, k, len;
 182     _ranges_t *rlp;
 183     char *name;
 184
 185     for (k = 0; k < 2; k++) {
 186         if (k == 0) {
 187             name = p1;
 188             len = 2;
 189         } else {
 190             if (p2 == 0)
 191               break;
 192
 193             name = p2;
 194             len = 1;
 195         }
 196
 197         for (i = 0; i < NUMPROPS; i++) {
 198             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 199               break;
 200         }
 201
 202         if (i == NUMPROPS)
 203           continue;
 204
 205         rlp = &proptbl[i];
 206
 207         /*
 208          * Resize the range list if necessary.
 209          */
 210         if (rlp->used == rlp->size) {
 211             if (rlp->size == 0)
 212               rlp->ranges = (unsigned long *)
 213                   malloc(sizeof(unsigned long) << 3);
 214             else
 215               rlp->ranges = (unsigned long *)
 216                   realloc((char *) rlp->ranges,
 217                           sizeof(unsigned long) * (rlp->size + 8));
 218             rlp->size += 8;
 219         }
 220
 221         /*
 222          * If this is the first code for this property list, just add it
 223          * and return.
 224          */
 225         if (rlp->used == 0) {
 226             rlp->ranges[0] = start;
 227             rlp->ranges[1] = end;
 228             rlp->used += 2;
 229             continue;
 230         }
 231
 232         /*
 233          * Optimize the case of adding the range to the end.
 234          */
 235         j = rlp->used - 1;
 236         if (start > rlp->ranges[j]) {
 237             j = rlp->used;
 238             rlp->ranges[j++] = start;
 239             rlp->ranges[j++] = end;
 240             rlp->used = j;
 241             continue;
 242         }
 243
 244         /*
 245          * Need to locate the insertion point.
 246          */
 247         for (i = 0;
 248              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
 249
 250         /*
 251          * If the start value lies in the current range, then simply set the
 252          * new end point of the range to the end value passed as a parameter.
 253          */
 254         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
 255             rlp->ranges[i + 1] = end;
 256             return;
 257         }
 258
 259         /*
 260          * Shift following values up by two.
 261          */
 262         for (j = rlp->used; j > i; j -= 2) {
 263             rlp->ranges[j] = rlp->ranges[j - 2];
 264             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 265         }
 266
 267         /*
 268          * Add the new range at the insertion point.
 269          */
 270         rlp->ranges[i] = start;
 271         rlp->ranges[i + 1] = end;
 272         rlp->used += 2;
 273     }
 274 }
 275
 276 static void
 277 ordered_range_insert(unsigned long c, char *name, int len)
 278 {
 279     int i, j;
 280     unsigned long s, e;
 281     _ranges_t *rlp;
 282
 283     if (len == 0)
 284       return;
 285
 286     /*
 287      * Deal with directionality codes introduced in Unicode 3.0.
 288      */
 289     if (len == 2) {
 290         if (memcmp(name, "AL", 2) == 0) {
 291             /*
 292              * Mark the Arabic letters as having RTL directionality.
 293              */
 294             len = 1;
 295             name = "R";
 296         } else if (memcmp(name, "BN", 2) == 0) {
 297             /*
 298              * Mark the control characters as being Other Neutrals.
 299              */
 300             len = 2;
 301             name = "ON";
 302         }
 303     } else if (len == 3 &&
 304                (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
 305                 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
 306                 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0)) {
 307         /*
 308          * Mark all of these as Other Neutral to preserve compatibility with
 309          * older versions.
 310          */
 311         len = 2;
 312         name = "ON";
 313     }
 314
 315     for (i = 0; i < NUMPROPS; i++) {
 316         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 317           break;
 318     }
 319
 320     if (i == NUMPROPS)
 321       return;
 322
 323     /*
 324      * Have a match, so insert the code in order.
 325      */
 326     rlp = &proptbl[i];
 327
 328     /*
 329      * Resize the range list if necessary.
 330      */
 331     if (rlp->used == rlp->size) {
 332         if (rlp->size == 0)
 333           rlp->ranges = (unsigned long *)
 334               malloc(sizeof(unsigned long) << 3);
 335         else
 336           rlp->ranges = (unsigned long *)
 337               realloc((char *) rlp->ranges,
 338                       sizeof(unsigned long) * (rlp->size + 8));
 339         rlp->size += 8;
 340     }
 341
 342     /*
 343      * If this is the first code for this property list, just add it
 344      * and return.
 345      */
 346     if (rlp->used == 0) {
 347         rlp->ranges[0] = rlp->ranges[1] = c;
 348         rlp->used += 2;
 349         return;
 350     }
 351
 352     /*
 353      * Optimize the cases of extending the last range and adding new ranges to
 354      * the end.
 355      */
 356     j = rlp->used - 1;
 357     e = rlp->ranges[j];
 358     s = rlp->ranges[j - 1];
 359
 360     if (c == e + 1) {
 361         /*
 362          * Extend the last range.
 363          */
 364         rlp->ranges[j] = c;
 365         return;
 366     }
 367
 368     if (c > e + 1) {
 369         /*
 370          * Start another range on the end.
 371          */
 372         j = rlp->used;
 373         rlp->ranges[j] = rlp->ranges[j + 1] = c;
 374         rlp->used += 2;
 375         return;
 376     }
 377
 378     if (c >= s)
 379       /*
 380        * The code is a duplicate of a code in the last range, so just return.
 381        */
 382       return;
 383
 384     /*
 385      * The code should be inserted somewhere before the last range in the
 386      * list.  Locate the insertion point.
 387      */
 388     for (i = 0;
 389          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
 390
 391     s = rlp->ranges[i];
 392     e = rlp->ranges[i + 1];
 393
 394     if (c == e + 1)
 395       /*
 396        * Simply extend the current range.
 397        */
 398       rlp->ranges[i + 1] = c;
 399     else if (c < s) {
 400         /*
 401          * Add a new entry before the current location.  Shift all entries
 402          * before the current one up by one to make room.
 403          */
 404         for (j = rlp->used; j > i; j -= 2) {
 405             rlp->ranges[j] = rlp->ranges[j - 2];
 406             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 407         }
 408         rlp->ranges[i] = rlp->ranges[i + 1] = c;
 409
 410         rlp->used += 2;
 411     }
 412 }
 413
 414 static void
 415 add_decomp(unsigned long code)
 416 {
 417     unsigned long i, j, size;
 418
 419     /*
 420      * Add the code to the composite property.
 421      */
 422     ordered_range_insert(code, "Cm", 2);
 423
 424     /*
 425      * Locate the insertion point for the code.
 426      */
 427     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
 428
 429     /*
 430      * Allocate space for a new decomposition.
 431      */
 432     if (decomps_used == decomps_size) {
 433         if (decomps_size == 0)
 434           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
 435         else
 436           decomps = (_decomp_t *)
 437               realloc((char *) decomps,
 438                       sizeof(_decomp_t) * (decomps_size + 8));
 439         (void) memset((char *) (decomps + decomps_size), 0,
 440                       sizeof(_decomp_t) << 3);
 441         decomps_size += 8;
 442     }
 443
 444     if (i < decomps_used && code != decomps[i].code) {
 445         /*
 446          * Shift the decomps up by one if the codes don't match.
 447          */
 448         for (j = decomps_used; j > i; j--)
 449           (void) memcpy((char *) &decomps[j], (char *) &decomps[j - 1],
 450                         sizeof(_decomp_t));
 451     }
 452
 453     /*
 454      * Insert or replace a decomposition.
 455      */
 456     size = dectmp_size + (4 - (dectmp_size & 3));
 457     if (decomps[i].size < size) {
 458         if (decomps[i].size == 0)
 459           decomps[i].decomp = (unsigned long *)
 460               malloc(sizeof(unsigned long) * size);
 461         else
 462           decomps[i].decomp = (unsigned long *)
 463               realloc((char *) decomps[i].decomp,
 464                       sizeof(unsigned long) * size);
 465         decomps[i].size = size;
 466     }
 467
 468     if (decomps[i].code != code)
 469       decomps_used++;
 470
 471     decomps[i].code = code;
 472     decomps[i].used = dectmp_size;
 473     (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
 474                   sizeof(unsigned long) * dectmp_size);
 475
 476 }
 477
 478 static void
 479 add_title(unsigned long code)
 480 {
 481     unsigned long i, j;
 482
 483     /*
 484      * Always map the code to itself.
 485      */
 486     cases[2] = code;
 487
 488     if (title_used == title_size) {
 489         if (title_size == 0)
 490           title = (_case_t *) malloc(sizeof(_case_t) << 3);
 491         else
 492           title = (_case_t *) realloc((char *) title,
 493                                       sizeof(_case_t) * (title_size + 8));
 494         title_size += 8;
 495     }
 496
 497     /*
 498      * Locate the insertion point.
 499      */
 500     for (i = 0; i < title_used && code > title[i].key; i++) ;
 501
 502     if (i < title_used) {
 503         /*
 504          * Shift the array up by one.
 505          */
 506         for (j = title_used; j > i; j--)
 507           (void) memcpy((char *) &title[j], (char *) &title[j - 1],
 508                         sizeof(_case_t));
 509     }
 510
 511     title[i].key = cases[2];    /* Title */
 512     title[i].other1 = cases[0]; /* Upper */
 513     title[i].other2 = cases[1]; /* Lower */
 514
 515     title_used++;
 516 }
 517
 518 static void
 519 add_upper(unsigned long code)
 520 {
 521     unsigned long i, j;
 522
 523     /*
 524      * Always map the code to itself.
 525      */
 526     cases[0] = code;
 527
 528     /*
 529      * If the title case character is not present, then make it the same as
 530      * the upper case.
 531      */
 532     if (cases[2] == 0)
 533       cases[2] = code;
 534
 535     if (upper_used == upper_size) {
 536         if (upper_size == 0)
 537           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
 538         else
 539           upper = (_case_t *) realloc((char *) upper,
 540                                       sizeof(_case_t) * (upper_size + 8));
 541         upper_size += 8;
 542     }
 543
 544     /*
 545      * Locate the insertion point.
 546      */
 547     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
 548
 549     if (i < upper_used) {
 550         /*
 551          * Shift the array up by one.
 552          */
 553         for (j = upper_used; j > i; j--)
 554           (void) memcpy((char *) &upper[j], (char *) &upper[j - 1],
 555                         sizeof(_case_t));
 556     }
 557
 558     upper[i].key = cases[0];    /* Upper */
 559     upper[i].other1 = cases[1]; /* Lower */
 560     upper[i].other2 = cases[2]; /* Title */
 561
 562     upper_used++;
 563 }
 564
 565 static void
 566 add_lower(unsigned long code)
 567 {
 568     unsigned long i, j;
 569
 570     /*
 571      * Always map the code to itself.
 572      */
 573     cases[1] = code;
 574
 575     /*
 576      * If the title case character is empty, then make it the same as the
 577      * upper case.
 578      */
 579     if (cases[2] == 0)
 580       cases[2] = cases[0];
 581
 582     if (lower_used == lower_size) {
 583         if (lower_size == 0)
 584           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
 585         else
 586           lower = (_case_t *) realloc((char *) lower,
 587                                       sizeof(_case_t) * (lower_size + 8));
 588         lower_size += 8;
 589     }
 590
 591     /*
 592      * Locate the insertion point.
 593      */
 594     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
 595
 596     if (i < lower_used) {
 597         /*
 598          * Shift the array up by one.
 599          */
 600         for (j = lower_used; j > i; j--)
 601           (void) memcpy((char *) &lower[j], (char *) &lower[j - 1],
 602                         sizeof(_case_t));
 603     }
 604
 605     lower[i].key = cases[1];    /* Lower */
 606     lower[i].other1 = cases[0]; /* Upper */
 607     lower[i].other2 = cases[2]; /* Title */
 608
 609     lower_used++;
 610 }
 611
 612 static void
 613 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
 614 {
 615     unsigned long i, j;
 616
 617     if (ccl_used == ccl_size) {
 618         if (ccl_size == 0)
 619           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
 620         else
 621           ccl = (unsigned long *)
 622               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
 623         ccl_size += 24;
 624     }
 625
 626     /*
 627      * Optimize adding the first item.
 628      */
 629     if (ccl_used == 0) {
 630         ccl[0] = ccl[1] = c;
 631         ccl[2] = ccl_code;
 632         ccl_used += 3;
 633         return;
 634     }
 635
 636     /*
 637      * Handle the special case of extending the range on the end.  This
 638      * requires that the combining class codes are the same.
 639      */
 640     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
 641         ccl[ccl_used - 2] = c;
 642         return;
 643     }
 644
 645     /*
 646      * Handle the special case of adding another range on the end.
 647      */
 648     if (c > ccl[ccl_used - 2] + 1 ||
 649         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
 650         ccl[ccl_used++] = c;
 651         ccl[ccl_used++] = c;
 652         ccl[ccl_used++] = ccl_code;
 653         return;
 654     }
 655
 656     /*
 657      * Locate either the insertion point or range for the code.
 658      */
 659     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
 660
 661     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
 662         /*
 663          * Extend an existing range.
 664          */
 665         ccl[i + 1] = c;
 666         return;
 667     } else if (c < ccl[i]) {
 668         /*
 669          * Start a new range before the current location.
 670          */
 671         for (j = ccl_used; j > i; j -= 3) {
 672             ccl[j] = ccl[j - 3];
 673             ccl[j - 1] = ccl[j - 4];
 674             ccl[j - 2] = ccl[j - 5];
 675         }
 676         ccl[i] = ccl[i + 1] = c;
 677         ccl[i + 2] = ccl_code;
 678     }
 679 }
 680
 681 /*
 682  * Adds a number if it does not already exist and returns an index value
 683  * multiplied by 2.
 684  */
 685 static unsigned long
 686 make_number(short num, short denom)
 687 {
 688     unsigned long n;
 689
 690     /*
 691      * Determine if the number already exists.
 692      */
 693     for (n = 0; n < nums_used; n++) {
 694         if (nums[n].numerator == num && nums[n].denominator == denom)
 695           return n << 1;
 696     }
 697
 698     if (nums_used == nums_size) {
 699         if (nums_size == 0)
 700           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
 701         else
 702           nums = (_num_t *) realloc((char *) nums,
 703                                     sizeof(_num_t) * (nums_size + 8));
 704         nums_size += 8;
 705     }
 706
 707     n = nums_used++;
 708     nums[n].numerator = num;
 709     nums[n].denominator = denom;
 710
 711     return n << 1;
 712 }
 713
 714 static void
 715 add_number(unsigned long code, short num, short denom)
 716 {
 717     unsigned long i, j;
 718
 719     /*
 720      * Insert the code in order.
 721      */
 722     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
 723
 724     /*
 725      * Handle the case of the codes matching and simply replace the number
 726      * that was there before.
 727      */
 728     if (ncodes_used > 0 && code == ncodes[i].code) {
 729         ncodes[i].idx = make_number(num, denom);
 730         return;
 731     }
 732
 733     /*
 734      * Resize the array if necessary.
 735      */
 736     if (ncodes_used == ncodes_size) {
 737         if (ncodes_size == 0)
 738           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
 739         else
 740           ncodes = (_codeidx_t *)
 741               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
 742
 743         ncodes_size += 8;
 744     }
 745
 746     /*
 747      * Shift things around to insert the code if necessary.
 748      */
 749     if (i < ncodes_used) {
 750         for (j = ncodes_used; j > i; j--) {
 751             ncodes[j].code = ncodes[j - 1].code;
 752             ncodes[j].idx = ncodes[j - 1].idx;
 753         }
 754     }
 755     ncodes[i].code = code;
 756     ncodes[i].idx = make_number(num, denom);
 757
 758     ncodes_used++;
 759 }
 760
 761 /*
 762  * This routine assumes that the line is a valid Unicode Character Database
 763  * entry.
 764  */
 765 static void
 766 read_cdata(FILE *in)
 767 {
 768     unsigned long i, lineno, skip, code, ccl_code;
 769     short wnum, neg, number[2];
 770     char line[512], *s, *e;
 771
 772     lineno = skip = 0;
 773     while (fscanf(in, "%[^\n]\n", line) != EOF) {
 774         lineno++;
 775
 776         /*
 777          * Skip blank lines and lines that start with a '#'.
 778          */
 779         if (line[0] == 0 || line[0] == '#')
 780           continue;
 781
 782         /*
 783          * If lines need to be skipped, do it here.
 784          */
 785         if (skip) {
 786             skip--;
 787             continue;
 788         }
 789
 790         /*
 791          * Collect the code.  The code can be up to 6 hex digits in length to
 792          * allow surrogates to be specified.
 793          */
 794         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
 795             code <<= 4;
 796             if (*s >= '0' && *s <= '9')
 797               code += *s - '0';
 798             else if (*s >= 'A' && *s <= 'F')
 799               code += (*s - 'A') + 10;
 800             else if (*s >= 'a' && *s <= 'f')
 801               code += (*s - 'a') + 10;
 802         }
 803
 804         /*
 805          * Handle the following special cases:
 806          * 1. 4E00-9FA5 CJK Ideographs.
 807          * 2. AC00-D7A3 Hangul Syllables.
 808          * 3. D800-DFFF Surrogates.
 809          * 4. E000-F8FF Private Use Area.
 810          * 5. F900-FA2D Han compatibility.
 811          */
 812         switch (code) {
 813           case 0x4e00:
 814             /*
 815              * The Han ideographs.
 816              */
 817             add_range(0x4e00, 0x9fff, "Lo", "L");
 818
 819             /*
 820              * Add the characters to the defined category.
 821              */
 822             add_range(0x4e00, 0x9fa5, "Cp", 0);
 823
 824             skip = 1;
 825             break;
 826           case 0xac00:
 827             /*
 828              * The Hangul syllables.
 829              */
 830             add_range(0xac00, 0xd7a3, "Lo", "L");
 831
 832             /*
 833              * Add the characters to the defined category.
 834              */
 835             add_range(0xac00, 0xd7a3, "Cp", 0);
 836
 837             skip = 1;
 838             break;
 839           case 0xd800:
 840             /*
 841              * Make a range of all surrogates and assume some default
 842              * properties.
 843              */
 844             add_range(0x010000, 0x10ffff, "Cs", "L");
 845             skip = 5;
 846             break;
 847           case 0xe000:
 848             /*
 849              * The Private Use area.  Add with a default set of properties.
 850              */
 851             add_range(0xe000, 0xf8ff, "Co", "L");
 852             skip = 1;
 853             break;
 854           case 0xf900:
 855             /*
 856              * The CJK compatibility area.
 857              */
 858             add_range(0xf900, 0xfaff, "Lo", "L");
 859
 860             /*
 861              * Add the characters to the defined category.
 862              */
 863             add_range(0xf900, 0xfaff, "Cp", 0);
 864
 865             skip = 1;
 866         }
 867
 868         if (skip)
 869           continue;
 870
 871         /*
 872          * Add the code to the defined category.
 873          */
 874         ordered_range_insert(code, "Cp", 2);
 875
 876         /*
 877          * Locate the first character property field.
 878          */
 879         for (i = 0; *s != 0 && i < 2; s++) {
 880             if (*s == ';')
 881               i++;
 882         }
 883         for (e = s; *e && *e != ';'; e++) ;
 884
 885         ordered_range_insert(code, s, e - s);
 886
 887         /*
 888          * Locate the combining class code.
 889          */
 890         for (s = e; *s != 0 && i < 3; s++) {
 891             if (*s == ';')
 892               i++;
 893         }
 894
 895         /*
 896          * Convert the combining class code from decimal.
 897          */
 898         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
 899           ccl_code = (ccl_code * 10) + (*e - '0');
 900
 901         /*
 902          * Add the code if it not 0.
 903          */
 904         if (ccl_code != 0)
 905           ordered_ccl_insert(code, ccl_code);
 906
 907         /*
 908          * Locate the second character property field.
 909          */
 910         for (s = e; *s != 0 && i < 4; s++) {
 911             if (*s == ';')
 912               i++;
 913         }
 914         for (e = s; *e && *e != ';'; e++) ;
 915
 916         ordered_range_insert(code, s, e - s);
 917
 918         /*
 919          * Check for a decomposition.
 920          */
 921         s = ++e;
 922         if (*s != ';' && *s != '<') {
 923             /*
 924              * Collect the codes of the decomposition.
 925              */
 926             for (dectmp_size = 0; *s != ';'; ) {
 927                 /*
 928                  * Skip all leading non-hex digits.
 929                  */
 930                 while (!ishdigit(*s))
 931                   s++;
 932
 933                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
 934                     dectmp[dectmp_size] <<= 4;
 935                     if (*s >= '0' && *s <= '9')
 936                       dectmp[dectmp_size] += *s - '0';
 937                     else if (*s >= 'A' && *s <= 'F')
 938                       dectmp[dectmp_size] += (*s - 'A') + 10;
 939                     else if (*s >= 'a' && *s <= 'f')
 940                       dectmp[dectmp_size] += (*s - 'a') + 10;
 941                 }
 942                 dectmp_size++;
 943             }
 944
 945             /*
 946              * If there is more than one code in the temporary decomposition
 947              * array, then add the character with its decomposition.
 948              */
 949             if (dectmp_size > 1)
 950               add_decomp(code);
 951         }
 952
 953         /*
 954          * Skip to the number field.
 955          */
 956         for (i = 0; i < 3 && *s; s++) {
 957             if (*s == ';')
 958               i++;
 959         }
 960
 961         /*
 962          * Scan the number in.
 963          */
 964         number[0] = number[1] = 0;
 965         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
 966             if (*e == '-') {
 967                 neg = 1;
 968                 continue;
 969             }
 970
 971             if (*e == '/') {
 972                 /*
 973                  * Move the the denominator of the fraction.
 974                  */
 975                 if (neg)
 976                   number[wnum] *= -1;
 977                 neg = 0;
 978                 e++;
 979                 wnum++;
 980             }
 981             number[wnum] = (number[wnum] * 10) + (*e - '0');
 982         }
 983
 984         if (e > s) {
 985             /*
 986              * Adjust the denominator in case of integers and add the number.
 987              */
 988             if (wnum == 0)
 989               number[1] = number[0];
 990
 991             add_number(code, number[0], number[1]);
 992         }
 993
 994         /*
 995          * Skip to the start of the possible case mappings.
 996          */
 997         for (s = e, i = 0; i < 4 && *s; s++) {
 998             if (*s == ';')
 999               i++;
1000         }
1001
1002         /*
1003          * Collect the case mappings.
1004          */
1005         cases[0] = cases[1] = cases[2] = 0;
1006         for (i = 0; i < 3; i++) {
1007             while (ishdigit(*s)) {
1008                 cases[i] <<= 4;
1009                 if (*s >= '0' && *s <= '9')
1010                   cases[i] += *s - '0';
1011                 else if (*s >= 'A' && *s <= 'F')
1012                   cases[i] += (*s - 'A') + 10;
1013                 else if (*s >= 'a' && *s <= 'f')
1014                   cases[i] += (*s - 'a') + 10;
1015                 s++;
1016             }
1017             if (*s == ';')
1018               s++;
1019         }
1020         if (cases[0] && cases[1])
1021           /*
1022            * Add the upper and lower mappings for a title case character.
1023            */
1024           add_title(code);
1025         else if (cases[1])
1026           /*
1027            * Add the lower and title case mappings for the upper case
1028            * character.
1029            */
1030           add_upper(code);
1031         else if (cases[0])
1032           /*
1033            * Add the upper and title case mappings for the lower case
1034            * character.
1035            */
1036           add_lower(code);
1037     }
1038 }
1039
1040 static _decomp_t *
1041 find_decomp(unsigned long code)
1042 {
1043     long l, r, m;
1044
1045     l = 0;
1046     r = decomps_used - 1;
1047     while (l <= r) {
1048         m = (l + r) >> 1;
1049         if (code > decomps[m].code)
1050           l = m + 1;
1051         else if (code < decomps[m].code)
1052           r = m - 1;
1053         else
1054           return &decomps[m];
1055     }
1056     return 0;
1057 }
1058
1059 static void
1060 decomp_it(_decomp_t *d)
1061 {
1062     unsigned long i;
1063     _decomp_t *dp;
1064
1065     for (i = 0; i < d->used; i++) {
1066         if ((dp = find_decomp(d->decomp[i])) != 0)
1067           decomp_it(dp);
1068         else
1069           dectmp[dectmp_size++] = d->decomp[i];
1070     }
1071 }
1072
1073 /*
1074  * Expand all decompositions by recursively decomposing each character
1075  * in the decomposition.
1076  */
1077 static void
1078 expand_decomp(void)
1079 {
1080     unsigned long i;
1081
1082     for (i = 0; i < decomps_used; i++) {
1083         dectmp_size = 0;
1084         decomp_it(&decomps[i]);
1085         if (dectmp_size > 0)
1086           add_decomp(decomps[i].code);
1087     }
1088 }
1089
1090 static void
1091 write_cdata(char *opath)
1092 {
1093     FILE *out;
1094     unsigned long i, idx, bytes, nprops;
1095     unsigned short casecnt[2];
1096     char path[BUFSIZ];
1097
1098     /*****************************************************************
1099      *
1100      * Generate the ctype data.
1101      *
1102      *****************************************************************/
1103
1104     /*
1105      * Open the ctype.dat file.
1106      */
1107     sprintf(path, "%s/ctype.dat", opath);
1108     if ((out = fopen(path, "wb")) == 0)
1109       return;
1110
1111     /*
1112      * Collect the offsets for the properties.  The offsets array is
1113      * on a 4-byte boundary to keep things efficient for architectures
1114      * that need such a thing.
1115      */
1116     for (i = idx = 0; i < NUMPROPS; i++) {
1117         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1118         idx += proptbl[i].used;
1119     }
1120
1121     /*
1122      * Add the sentinel index which is used by the binary search as the upper
1123      * bound for a search.
1124      */
1125     propcnt[i] = idx;
1126
1127     /*
1128      * Record the actual number of property lists.  This may be different than
1129      * the number of offsets actually written because of aligning on a 4-byte
1130      * boundary.
1131      */
1132     hdr[1] = NUMPROPS;
1133
1134     /*
1135      * Calculate the byte count needed and pad the property counts array to a
1136      * 4-byte boundary.
1137      */
1138     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
1139       bytes += 4 - (bytes & 3);
1140     nprops = bytes / sizeof(unsigned short);
1141     bytes += sizeof(unsigned long) * idx;
1142
1143     /*
1144      * Write the header.
1145      */
1146     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1147
1148     /*
1149      * Write the byte count.
1150      */
1151     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1152
1153     /*
1154      * Write the property list counts.
1155      */
1156     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
1157
1158     /*
1159      * Write the property lists.
1160      */
1161     for (i = 0; i < NUMPROPS; i++) {
1162         if (proptbl[i].used > 0)
1163           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
1164                  proptbl[i].used, out);
1165     }
1166
1167     fclose(out);
1168
1169     /*****************************************************************
1170      *
1171      * Generate the case mapping data.
1172      *
1173      *****************************************************************/
1174
1175     /*
1176      * Open the case.dat file.
1177      */
1178     sprintf(path, "%s/case.dat", opath);
1179     if ((out = fopen(path, "wb")) == 0)
1180       return;
1181
1182     /*
1183      * Write the case mapping tables.
1184      */
1185     hdr[1] = upper_used + lower_used + title_used;
1186     casecnt[0] = upper_used;
1187     casecnt[1] = lower_used;
1188
1189     /*
1190      * Write the header.
1191      */
1192     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1193
1194     /*
1195      * Write the upper and lower case table sizes.
1196      */
1197     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
1198
1199     if (upper_used > 0)
1200       /*
1201        * Write the upper case table.
1202        */
1203       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1204
1205     if (lower_used > 0)
1206       /*
1207        * Write the lower case table.
1208        */
1209       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1210
1211     if (title_used > 0)
1212       /*
1213        * Write the title case table.
1214        */
1215       fwrite((char *) title, sizeof(_case_t), title_used, out);
1216
1217     fclose(out);
1218
1219     /*****************************************************************
1220      *
1221      * Generate the decomposition data.
1222      *
1223      *****************************************************************/
1224
1225     /*
1226      * Fully expand all decompositions before generating the output file.
1227      */
1228     expand_decomp();
1229
1230     /*
1231      * Open the decomp.dat file.
1232      */
1233     sprintf(path, "%s/decomp.dat", opath);
1234     if ((out = fopen(path, "wb")) == 0)
1235       return;
1236
1237     hdr[1] = decomps_used;
1238
1239     /*
1240      * Write the header.
1241      */
1242     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1243
1244     /*
1245      * Write a temporary byte count which will be calculated as the
1246      * decompositions are written out.
1247      */
1248     bytes = 0;
1249     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1250
1251     if (decomps_used) {
1252         /*
1253          * Write the list of decomp nodes.
1254          */
1255         for (i = idx = 0; i < decomps_used; i++) {
1256             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
1257             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1258             idx += decomps[i].used;
1259         }
1260
1261         /*
1262          * Write the sentinel index as the last decomp node.
1263          */
1264         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1265
1266         /*
1267          * Write the decompositions themselves.
1268          */
1269         for (i = 0; i < decomps_used; i++)
1270           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
1271                  decomps[i].used, out);
1272
1273         /*
1274          * Seek back to the beginning and write the byte count.
1275          */
1276         bytes = (sizeof(unsigned long) * idx) +
1277             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
1278         fseek(out, sizeof(unsigned short) << 1, 0L);
1279         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1280
1281         fclose(out);
1282     }
1283
1284     /*****************************************************************
1285      *
1286      * Generate the combining class data.
1287      *
1288      *****************************************************************/
1289
1290     /*
1291      * Open the cmbcl.dat file.
1292      */
1293     sprintf(path, "%s/cmbcl.dat", opath);
1294     if ((out = fopen(path, "wb")) == 0)
1295       return;
1296
1297     /*
1298      * Set the number of ranges used.  Each range has a combining class which
1299      * means each entry is a 3-tuple.
1300      */
1301     hdr[1] = ccl_used / 3;
1302
1303     /*
1304      * Write the header.
1305      */
1306     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1307
1308     /*
1309      * Write out the byte count to maintain header size.
1310      */
1311     bytes = ccl_used * sizeof(unsigned long);
1312     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1313
1314     if (ccl_used > 0)
1315       /*
1316        * Write the combining class ranges out.
1317        */
1318       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
1319
1320     fclose(out);
1321
1322     /*****************************************************************
1323      *
1324      * Generate the number data.
1325      *
1326      *****************************************************************/
1327
1328     /*
1329      * Open the num.dat file.
1330      */
1331     sprintf(path, "%s/num.dat", opath);
1332     if ((out = fopen(path, "wb")) == 0)
1333       return;
1334
1335     /*
1336      * The count part of the header will be the total number of codes that
1337      * have numbers.
1338      */
1339     hdr[1] = (unsigned short) (ncodes_used << 1);
1340     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1341
1342     /*
1343      * Write the header.
1344      */
1345     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1346
1347     /*
1348      * Write out the byte count to maintain header size.
1349      */
1350     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1351
1352     /*
1353      * Now, if number mappings exist, write them out.
1354      */
1355     if (ncodes_used > 0) {
1356         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1357         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1358     }
1359
1360     fclose(out);
1361 }
1362
1363 int
1364 main(int argc, char *argv[])
1365 {
1366     FILE *in;
1367     char *prog, *opath;
1368
1369     if ((prog = strrchr(argv[0], '/')) != 0)
1370       prog++;
1371     else
1372       prog = argv[0];
1373
1374     opath = 0;
1375     in = stdin;
1376
1377     argc--;
1378     argv++;
1379
1380     while (argc > 0) {
1381         if (argv[0][0] == '-' && argv[0][1] == 'o') {
1382             argc--;
1383             argv++;
1384             opath = argv[0];
1385         } else {
1386             if (in != stdin && in != NULL)
1387               fclose(in);
1388             if ((in = fopen(argv[0], "rb")) == 0)
1389               fprintf(stderr, "%s: unable to open ctype file %s\n",
1390                       prog, argv[0]);
1391             else {
1392                 read_cdata(in);
1393                 fclose(in);
1394                 in = 0;
1395             }
1396         }
1397         argc--;
1398         argv++;
1399     }
1400
1401     if (opath == 0)
1402       opath = ".";
1403     write_cdata(opath);
1404
1405     return 0;
1406 }