git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucdata/ucgendat.c

/* $OpenLDAP$ */
   2 /*
   3  * Copyright 1999 Computing Research Labs, New Mexico State University
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
/* $Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $ */
  24
  25 #include "portable.h"
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <ac/string.h>
  30 #include <ac/unistd.h>
  31
  32 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
  33                       ((cc) >= 'A' && (cc) <= 'F') ||\
  34                       ((cc) >= 'a' && (cc) <= 'f'))
  35
  36 /*
  37  * A header written to the output file with the byte-order-mark and the number
  38  * of property nodes.
  39  */
  40 static unsigned short hdr[2] = {0xfeff, 0};
  41
  42 #define NUMPROPS 49
  43 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
  44
  45 typedef struct {
  46     char *name;
  47     int len;
  48 } _prop_t;
  49
  50 /*
  51  * List of properties expected to be found in the Unicode Character Database
  52  * including some implementation specific properties.
  53  *
  54  * The implementation specific properties are:
  55  * Cm = Composed (can be decomposed)
  56  * Nb = Non-breaking
  57  * Sy = Symmetric (has left and right forms)
  58  * Hd = Hex digit
  59  * Qm = Quote marks
  60  * Mr = Mirroring
  61  * Ss = Space, other
  62  * Cp = Defined character
  63  */
  64 static _prop_t props[NUMPROPS] = {
  65     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
  66     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
  67     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
  68     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
  69     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
  70     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
  71     {"S",  1}, {"WS", 2}, {"ON", 2},
  72     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
  73     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}
  74 };
  75
  76 typedef struct {
  77     unsigned long *ranges;
  78     unsigned short used;
  79     unsigned short size;
  80 } _ranges_t;
  81
  82 static _ranges_t proptbl[NUMPROPS];
  83
  84 /*
  85  * Make sure this array is sized to be on a 4-byte boundary at compile time.
  86  */
  87 static unsigned short propcnt[NEEDPROPS];
  88
  89 /*
  90  * Array used to collect a decomposition before adding it to the decomposition
  91  * table.
  92  */
  93 static unsigned long dectmp[64];
  94 static unsigned long dectmp_size;
  95
  96 typedef struct {
  97     unsigned long code;
  98     unsigned short size;
  99     unsigned short used;
 100     unsigned long *decomp;
 101 } _decomp_t;
 102
 103 /*
 104  * List of decomposition.  Created and expanded in order as the characters are
 105  * encountered.
 106  */
 107 static _decomp_t *decomps;
 108 static unsigned long decomps_used;
 109 static unsigned long decomps_size;
 110
 111 /*
 112  * Types and lists for handling lists of case mappings.
 113  */
 114 typedef struct {
 115     unsigned long key;
 116     unsigned long other1;
 117     unsigned long other2;
 118 } _case_t;
 119
 120 static _case_t *upper;
 121 static _case_t *lower;
 122 static _case_t *title;
 123 static unsigned long upper_used;
 124 static unsigned long upper_size;
 125 static unsigned long lower_used;
 126 static unsigned long lower_size;
 127 static unsigned long title_used;
 128 static unsigned long title_size;
 129
 130 /*
 131  * Array used to collect case mappings before adding them to a list.
 132  */
 133 static unsigned long cases[3];
 134
 135 /*
 136  * An array to hold ranges for combining classes.
 137  */
 138 static unsigned long *ccl;
 139 static unsigned long ccl_used;
 140 static unsigned long ccl_size;
 141
 142 /*
 143  * Structures for handling numbers.
 144  */
 145 typedef struct {
 146     unsigned long code;
 147     unsigned long idx;
 148 } _codeidx_t;
 149
 150 typedef struct {
 151     short numerator;
 152     short denominator;
 153 } _num_t;
 154
 155 /*
 156  * Arrays to hold the mapping of codes to numbers.
 157  */
 158 static _codeidx_t *ncodes;
 159 static unsigned long ncodes_used;
 160 static unsigned long ncodes_size;
 161
 162 static _num_t *nums;
 163 static unsigned long nums_used;
 164 static unsigned long nums_size;
 165
 166 /*
 167  * Array for holding numbers.
 168  */
 169 static _num_t *nums;
 170 static unsigned long nums_used;
 171 static unsigned long nums_size;
 172
 173 static void
 174 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
 175 {
 176     int i, j, k, len;
 177     _ranges_t *rlp;
 178     char *name;
 179
 180     for (k = 0; k < 2; k++) {
 181         if (k == 0) {
 182             name = p1;
 183             len = 2;
 184         } else {
 185             if (p2 == 0)
 186               break;
 187
 188             name = p2;
 189             len = 1;
 190         }
 191
 192         for (i = 0; i < NUMPROPS; i++) {
 193             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 194               break;
 195         }
 196
 197         if (i == NUMPROPS)
 198           continue;
 199
 200         rlp = &proptbl[i];
 201
 202         /*
 203          * Resize the range list if necessary.
 204          */
 205         if (rlp->used == rlp->size) {
 206             if (rlp->size == 0)
 207               rlp->ranges = (unsigned long *)
 208                   malloc(sizeof(unsigned long) << 3);
 209             else
 210               rlp->ranges = (unsigned long *)
 211                   realloc((char *) rlp->ranges,
 212                           sizeof(unsigned long) * (rlp->size + 8));
 213             rlp->size += 8;
 214         }
 215
 216         /*
 217          * If this is the first code for this property list, just add it
 218          * and return.
 219          */
 220         if (rlp->used == 0) {
 221             rlp->ranges[0] = start;
 222             rlp->ranges[1] = end;
 223             rlp->used += 2;
 224             continue;
 225         }
 226
 227         /*
 228          * Optimize the case of adding the range to the end.
 229          */
 230         j = rlp->used - 1;
 231         if (start > rlp->ranges[j]) {
 232             j = rlp->used;
 233             rlp->ranges[j++] = start;
 234             rlp->ranges[j++] = end;
 235             rlp->used = j;
 236             continue;
 237         }
 238
 239         /*
 240          * Need to locate the insertion point.
 241          */
 242         for (i = 0;
 243              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
 244
 245         /*
 246          * If the start value lies in the current range, then simply set the
 247          * new end point of the range to the end value passed as a parameter.
 248          */
 249         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
 250             rlp->ranges[i + 1] = end;
 251             return;
 252         }
 253
 254         /*
 255          * Shift following values up by two.
 256          */
 257         for (j = rlp->used; j > i; j -= 2) {
 258             rlp->ranges[j] = rlp->ranges[j - 2];
 259             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 260         }
 261
 262         /*
 263          * Add the new range at the insertion point.
 264          */
 265         rlp->ranges[i] = start;
 266         rlp->ranges[i + 1] = end;
 267         rlp->used += 2;
 268     }
 269 }
 270
 271 static void
 272 ordered_range_insert(unsigned long c, char *name, int len)
 273 {
 274     int i, j;
 275     unsigned long s, e;
 276     _ranges_t *rlp;
 277
 278     if (len == 0)
 279       return;
 280
 281     /*
 282      * Deal with directionality codes introduced in Unicode 3.0.
 283      */
 284     if (len == 2) {
 285         if (memcmp(name, "AL", 2) == 0) {
 286             /*
 287              * Mark the Arabic letters as having RTL directionality.
 288              */
 289             len = 1;
 290             name = "R";
 291         } else if (memcmp(name, "BN", 2) == 0) {
 292             /*
 293              * Mark the control characters as being Other Neutrals.
 294              */
 295             len = 2;
 296             name = "ON";
 297         }
 298     } else if (len == 3 &&
 299                (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
 300                 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
 301                 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0)) {
 302         /*
 303          * Mark all of these as Other Neutral to preserve compatibility with
 304          * older versions.
 305          */
 306         len = 2;
 307         name = "ON";
 308     }
 309
 310     for (i = 0; i < NUMPROPS; i++) {
 311         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 312           break;
 313     }
 314
 315     if (i == NUMPROPS)
 316       return;
 317
 318     /*
 319      * Have a match, so insert the code in order.
 320      */
 321     rlp = &proptbl[i];
 322
 323     /*
 324      * Resize the range list if necessary.
 325      */
 326     if (rlp->used == rlp->size) {
 327         if (rlp->size == 0)
 328           rlp->ranges = (unsigned long *)
 329               malloc(sizeof(unsigned long) << 3);
 330         else
 331           rlp->ranges = (unsigned long *)
 332               realloc((char *) rlp->ranges,
 333                       sizeof(unsigned long) * (rlp->size + 8));
 334         rlp->size += 8;
 335     }
 336
 337     /*
 338      * If this is the first code for this property list, just add it
 339      * and return.
 340      */
 341     if (rlp->used == 0) {
 342         rlp->ranges[0] = rlp->ranges[1] = c;
 343         rlp->used += 2;
 344         return;
 345     }
 346
 347     /*
 348      * Optimize the cases of extending the last range and adding new ranges to
 349      * the end.
 350      */
 351     j = rlp->used - 1;
 352     e = rlp->ranges[j];
 353     s = rlp->ranges[j - 1];
 354
 355     if (c == e + 1) {
 356         /*
 357          * Extend the last range.
 358          */
 359         rlp->ranges[j] = c;
 360         return;
 361     }
 362
 363     if (c > e + 1) {
 364         /*
 365          * Start another range on the end.
 366          */
 367         j = rlp->used;
 368         rlp->ranges[j] = rlp->ranges[j + 1] = c;
 369         rlp->used += 2;
 370         return;
 371     }
 372
 373     if (c >= s)
 374       /*
 375        * The code is a duplicate of a code in the last range, so just return.
 376        */
 377       return;
 378
 379     /*
 380      * The code should be inserted somewhere before the last range in the
 381      * list.  Locate the insertion point.
 382      */
 383     for (i = 0;
 384          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
 385
 386     s = rlp->ranges[i];
 387     e = rlp->ranges[i + 1];
 388
 389     if (c == e + 1)
 390       /*
 391        * Simply extend the current range.
 392        */
 393       rlp->ranges[i + 1] = c;
 394     else if (c < s) {
 395         /*
 396          * Add a new entry before the current location.  Shift all entries
 397          * before the current one up by one to make room.
 398          */
 399         for (j = rlp->used; j > i; j -= 2) {
 400             rlp->ranges[j] = rlp->ranges[j - 2];
 401             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 402         }
 403         rlp->ranges[i] = rlp->ranges[i + 1] = c;
 404
 405         rlp->used += 2;
 406     }
 407 }
 408
 409 static void
 410 add_decomp(unsigned long code)
 411 {
 412     unsigned long i, j, size;
 413
 414     /*
 415      * Add the code to the composite property.
 416      */
 417     ordered_range_insert(code, "Cm", 2);
 418
 419     /*
 420      * Locate the insertion point for the code.
 421      */
 422     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
 423
 424     /*
 425      * Allocate space for a new decomposition.
 426      */
 427     if (decomps_used == decomps_size) {
 428         if (decomps_size == 0)
 429           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
 430         else
 431           decomps = (_decomp_t *)
 432               realloc((char *) decomps,
 433                       sizeof(_decomp_t) * (decomps_size + 8));
 434         (void) memset((char *) (decomps + decomps_size), '\0',
 435                       sizeof(_decomp_t) << 3);
 436         decomps_size += 8;
 437     }
 438
 439     if (i < decomps_used && code != decomps[i].code) {
 440         /*
 441          * Shift the decomps up by one if the codes don't match.
 442          */
 443         for (j = decomps_used; j > i; j--)
 444           (void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1],
 445                         sizeof(_decomp_t));
 446     }
 447
 448     /*
 449      * Insert or replace a decomposition.
 450      */
 451     size = dectmp_size + (4 - (dectmp_size & 3));
 452     if (decomps[i].size < size) {
 453         if (decomps[i].size == 0)
 454           decomps[i].decomp = (unsigned long *)
 455               malloc(sizeof(unsigned long) * size);
 456         else
 457           decomps[i].decomp = (unsigned long *)
 458               realloc((char *) decomps[i].decomp,
 459                       sizeof(unsigned long) * size);
 460         decomps[i].size = size;
 461     }
 462
 463     if (decomps[i].code != code)
 464       decomps_used++;
 465
 466     decomps[i].code = code;
 467     decomps[i].used = dectmp_size;
 468     (void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp,
 469                   sizeof(unsigned long) * dectmp_size);
 470
 471 }
 472
 473 static void
 474 add_title(unsigned long code)
 475 {
 476     unsigned long i, j;
 477
 478     /*
 479      * Always map the code to itself.
 480      */
 481     cases[2] = code;
 482
 483     if (title_used == title_size) {
 484         if (title_size == 0)
 485           title = (_case_t *) malloc(sizeof(_case_t) << 3);
 486         else
 487           title = (_case_t *) realloc((char *) title,
 488                                       sizeof(_case_t) * (title_size + 8));
 489         title_size += 8;
 490     }
 491
 492     /*
 493      * Locate the insertion point.
 494      */
 495     for (i = 0; i < title_used && code > title[i].key; i++) ;
 496
 497     if (i < title_used) {
 498         /*
 499          * Shift the array up by one.
 500          */
 501         for (j = title_used; j > i; j--)
 502           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
 503                         sizeof(_case_t));
 504     }
 505
 506     title[i].key = cases[2];    /* Title */
 507     title[i].other1 = cases[0]; /* Upper */
 508     title[i].other2 = cases[1]; /* Lower */
 509
 510     title_used++;
 511 }
 512
 513 static void
 514 add_upper(unsigned long code)
 515 {
 516     unsigned long i, j;
 517
 518     /*
 519      * Always map the code to itself.
 520      */
 521     cases[0] = code;
 522
 523     /*
 524      * If the title case character is not present, then make it the same as
 525      * the upper case.
 526      */
 527     if (cases[2] == 0)
 528       cases[2] = code;
 529
 530     if (upper_used == upper_size) {
 531         if (upper_size == 0)
 532           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
 533         else
 534           upper = (_case_t *) realloc((char *) upper,
 535                                       sizeof(_case_t) * (upper_size + 8));
 536         upper_size += 8;
 537     }
 538
 539     /*
 540      * Locate the insertion point.
 541      */
 542     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
 543
 544     if (i < upper_used) {
 545         /*
 546          * Shift the array up by one.
 547          */
 548         for (j = upper_used; j > i; j--)
 549           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
 550                         sizeof(_case_t));
 551     }
 552
 553     upper[i].key = cases[0];    /* Upper */
 554     upper[i].other1 = cases[1]; /* Lower */
 555     upper[i].other2 = cases[2]; /* Title */
 556
 557     upper_used++;
 558 }
 559
 560 static void
 561 add_lower(unsigned long code)
 562 {
 563     unsigned long i, j;
 564
 565     /*
 566      * Always map the code to itself.
 567      */
 568     cases[1] = code;
 569
 570     /*
 571      * If the title case character is empty, then make it the same as the
 572      * upper case.
 573      */
 574     if (cases[2] == 0)
 575       cases[2] = cases[0];
 576
 577     if (lower_used == lower_size) {
 578         if (lower_size == 0)
 579           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
 580         else
 581           lower = (_case_t *) realloc((char *) lower,
 582                                       sizeof(_case_t) * (lower_size + 8));
 583         lower_size += 8;
 584     }
 585
 586     /*
 587      * Locate the insertion point.
 588      */
 589     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
 590
 591     if (i < lower_used) {
 592         /*
 593          * Shift the array up by one.
 594          */
 595         for (j = lower_used; j > i; j--)
 596           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
 597                         sizeof(_case_t));
 598     }
 599
 600     lower[i].key = cases[1];    /* Lower */
 601     lower[i].other1 = cases[0]; /* Upper */
 602     lower[i].other2 = cases[2]; /* Title */
 603
 604     lower_used++;
 605 }
 606
 607 static void
 608 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
 609 {
 610     unsigned long i, j;
 611
 612     if (ccl_used == ccl_size) {
 613         if (ccl_size == 0)
 614           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
 615         else
 616           ccl = (unsigned long *)
 617               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
 618         ccl_size += 24;
 619     }
 620
 621     /*
 622      * Optimize adding the first item.
 623      */
 624     if (ccl_used == 0) {
 625         ccl[0] = ccl[1] = c;
 626         ccl[2] = ccl_code;
 627         ccl_used += 3;
 628         return;
 629     }
 630
 631     /*
 632      * Handle the special case of extending the range on the end.  This
 633      * requires that the combining class codes are the same.
 634      */
 635     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
 636         ccl[ccl_used - 2] = c;
 637         return;
 638     }
 639
 640     /*
 641      * Handle the special case of adding another range on the end.
 642      */
 643     if (c > ccl[ccl_used - 2] + 1 ||
 644         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
 645         ccl[ccl_used++] = c;
 646         ccl[ccl_used++] = c;
 647         ccl[ccl_used++] = ccl_code;
 648         return;
 649     }
 650
 651     /*
 652      * Locate either the insertion point or range for the code.
 653      */
 654     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
 655
 656     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
 657         /*
 658          * Extend an existing range.
 659          */
 660         ccl[i + 1] = c;
 661         return;
 662     } else if (c < ccl[i]) {
 663         /*
 664          * Start a new range before the current location.
 665          */
 666         for (j = ccl_used; j > i; j -= 3) {
 667             ccl[j] = ccl[j - 3];
 668             ccl[j - 1] = ccl[j - 4];
 669             ccl[j - 2] = ccl[j - 5];
 670         }
 671         ccl[i] = ccl[i + 1] = c;
 672         ccl[i + 2] = ccl_code;
 673     }
 674 }
 675
 676 /*
 677  * Adds a number if it does not already exist and returns an index value
 678  * multiplied by 2.
 679  */
 680 static unsigned long
 681 make_number(short num, short denom)
 682 {
 683     unsigned long n;
 684
 685     /*
 686      * Determine if the number already exists.
 687      */
 688     for (n = 0; n < nums_used; n++) {
 689         if (nums[n].numerator == num && nums[n].denominator == denom)
 690           return n << 1;
 691     }
 692
 693     if (nums_used == nums_size) {
 694         if (nums_size == 0)
 695           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
 696         else
 697           nums = (_num_t *) realloc((char *) nums,
 698                                     sizeof(_num_t) * (nums_size + 8));
 699         nums_size += 8;
 700     }
 701
 702     n = nums_used++;
 703     nums[n].numerator = num;
 704     nums[n].denominator = denom;
 705
 706     return n << 1;
 707 }
 708
 709 static void
 710 add_number(unsigned long code, short num, short denom)
 711 {
 712     unsigned long i, j;
 713
 714     /*
 715      * Insert the code in order.
 716      */
 717     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
 718
 719     /*
 720      * Handle the case of the codes matching and simply replace the number
 721      * that was there before.
 722      */
 723     if (ncodes_used > 0 && code == ncodes[i].code) {
 724         ncodes[i].idx = make_number(num, denom);
 725         return;
 726     }
 727
 728     /*
 729      * Resize the array if necessary.
 730      */
 731     if (ncodes_used == ncodes_size) {
 732         if (ncodes_size == 0)
 733           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
 734         else
 735           ncodes = (_codeidx_t *)
 736               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
 737
 738         ncodes_size += 8;
 739     }
 740
 741     /*
 742      * Shift things around to insert the code if necessary.
 743      */
 744     if (i < ncodes_used) {
 745         for (j = ncodes_used; j > i; j--) {
 746             ncodes[j].code = ncodes[j - 1].code;
 747             ncodes[j].idx = ncodes[j - 1].idx;
 748         }
 749     }
 750     ncodes[i].code = code;
 751     ncodes[i].idx = make_number(num, denom);
 752
 753     ncodes_used++;
 754 }
 755
 756 /*
 757  * This routine assumes that the line is a valid Unicode Character Database
 758  * entry.
 759  */
 760 static void
 761 read_cdata(FILE *in)
 762 {
 763     unsigned long i, lineno, skip, code, ccl_code;
 764     short wnum, neg, number[2];
 765     char line[512], *s, *e;
 766
 767     lineno = skip = 0;
 768     while (fscanf(in, "%[^\n]\n", line) != EOF) {
 769         lineno++;
 770
 771         /*
 772          * Skip blank lines and lines that start with a '#'.
 773          */
 774         if (line[0] == 0 || line[0] == '#')
 775           continue;
 776
 777         /*
 778          * If lines need to be skipped, do it here.
 779          */
 780         if (skip) {
 781             skip--;
 782             continue;
 783         }
 784
 785         /*
 786          * Collect the code.  The code can be up to 6 hex digits in length to
 787          * allow surrogates to be specified.
 788          */
 789         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
 790             code <<= 4;
 791             if (*s >= '0' && *s <= '9')
 792               code += *s - '0';
 793             else if (*s >= 'A' && *s <= 'F')
 794               code += (*s - 'A') + 10;
 795             else if (*s >= 'a' && *s <= 'f')
 796               code += (*s - 'a') + 10;
 797         }
 798
 799         /*
 800          * Handle the following special cases:
 801          * 1. 4E00-9FA5 CJK Ideographs.
 802          * 2. AC00-D7A3 Hangul Syllables.
 803          * 3. D800-DFFF Surrogates.
 804          * 4. E000-F8FF Private Use Area.
 805          * 5. F900-FA2D Han compatibility.
 806          */
 807         switch (code) {
 808           case 0x4e00:
 809             /*
 810              * The Han ideographs.
 811              */
 812             add_range(0x4e00, 0x9fff, "Lo", "L");
 813
 814             /*
 815              * Add the characters to the defined category.
 816              */
 817             add_range(0x4e00, 0x9fa5, "Cp", 0);
 818
 819             skip = 1;
 820             break;
 821           case 0xac00:
 822             /*
 823              * The Hangul syllables.
 824              */
 825             add_range(0xac00, 0xd7a3, "Lo", "L");
 826
 827             /*
 828              * Add the characters to the defined category.
 829              */
 830             add_range(0xac00, 0xd7a3, "Cp", 0);
 831
 832             skip = 1;
 833             break;
 834           case 0xd800:
 835             /*
 836              * Make a range of all surrogates and assume some default
 837              * properties.
 838              */
 839             add_range(0x010000, 0x10ffff, "Cs", "L");
 840             skip = 5;
 841             break;
 842           case 0xe000:
 843             /*
 844              * The Private Use area.  Add with a default set of properties.
 845              */
 846             add_range(0xe000, 0xf8ff, "Co", "L");
 847             skip = 1;
 848             break;
 849           case 0xf900:
 850             /*
 851              * The CJK compatibility area.
 852              */
 853             add_range(0xf900, 0xfaff, "Lo", "L");
 854
 855             /*
 856              * Add the characters to the defined category.
 857              */
 858             add_range(0xf900, 0xfaff, "Cp", 0);
 859
 860             skip = 1;
 861         }
 862
 863         if (skip)
 864           continue;
 865
 866         /*
 867          * Add the code to the defined category.
 868          */
 869         ordered_range_insert(code, "Cp", 2);
 870
 871         /*
 872          * Locate the first character property field.
 873          */
 874         for (i = 0; *s != 0 && i < 2; s++) {
 875             if (*s == ';')
 876               i++;
 877         }
 878         for (e = s; *e && *e != ';'; e++) ;
 879
 880         ordered_range_insert(code, s, e - s);
 881
 882         /*
 883          * Locate the combining class code.
 884          */
 885         for (s = e; *s != 0 && i < 3; s++) {
 886             if (*s == ';')
 887               i++;
 888         }
 889
 890         /*
 891          * Convert the combining class code from decimal.
 892          */
 893         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
 894           ccl_code = (ccl_code * 10) + (*e - '0');
 895
 896         /*
 897          * Add the code if it not 0.
 898          */
 899         if (ccl_code != 0)
 900           ordered_ccl_insert(code, ccl_code);
 901
 902         /*
 903          * Locate the second character property field.
 904          */
 905         for (s = e; *s != 0 && i < 4; s++) {
 906             if (*s == ';')
 907               i++;
 908         }
 909         for (e = s; *e && *e != ';'; e++) ;
 910
 911         ordered_range_insert(code, s, e - s);
 912
 913         /*
 914          * Check for a decomposition.
 915          */
 916         s = ++e;
 917         if (*s != ';' && *s != '<') {
 918             /*
 919              * Collect the codes of the decomposition.
 920              */
 921             for (dectmp_size = 0; *s != ';'; ) {
 922                 /*
 923                  * Skip all leading non-hex digits.
 924                  */
 925                 while (!ishdigit(*s))
 926                   s++;
 927
 928                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
 929                     dectmp[dectmp_size] <<= 4;
 930                     if (*s >= '0' && *s <= '9')
 931                       dectmp[dectmp_size] += *s - '0';
 932                     else if (*s >= 'A' && *s <= 'F')
 933                       dectmp[dectmp_size] += (*s - 'A') + 10;
 934                     else if (*s >= 'a' && *s <= 'f')
 935                       dectmp[dectmp_size] += (*s - 'a') + 10;
 936                 }
 937                 dectmp_size++;
 938             }
 939
 940             /*
 941              * If there is more than one code in the temporary decomposition
 942              * array, then add the character with its decomposition.
 943              */
 944             if (dectmp_size > 1)
 945               add_decomp(code);
 946         }
 947
 948         /*
 949          * Skip to the number field.
 950          */
 951         for (i = 0; i < 3 && *s; s++) {
 952             if (*s == ';')
 953               i++;
 954         }
 955
 956         /*
 957          * Scan the number in.
 958          */
 959         number[0] = number[1] = 0;
 960         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
 961             if (*e == '-') {
 962                 neg = 1;
 963                 continue;
 964             }
 965
 966             if (*e == '/') {
 967                 /*
 968                  * Move the the denominator of the fraction.
 969                  */
 970                 if (neg)
 971                   number[wnum] *= -1;
 972                 neg = 0;
 973                 e++;
 974                 wnum++;
 975             }
 976             number[wnum] = (number[wnum] * 10) + (*e - '0');
 977         }
 978
 979         if (e > s) {
 980             /*
 981              * Adjust the denominator in case of integers and add the number.
 982              */
 983             if (wnum == 0)
 984               number[1] = number[0];
 985
 986             add_number(code, number[0], number[1]);
 987         }
 988
 989         /*
 990          * Skip to the start of the possible case mappings.
 991          */
 992         for (s = e, i = 0; i < 4 && *s; s++) {
 993             if (*s == ';')
 994               i++;
 995         }
 996
 997         /*
 998          * Collect the case mappings.
 999          */
1000         cases[0] = cases[1] = cases[2] = 0;
1001         for (i = 0; i < 3; i++) {
1002             while (ishdigit(*s)) {
1003                 cases[i] <<= 4;
1004                 if (*s >= '0' && *s <= '9')
1005                   cases[i] += *s - '0';
1006                 else if (*s >= 'A' && *s <= 'F')
1007                   cases[i] += (*s - 'A') + 10;
1008                 else if (*s >= 'a' && *s <= 'f')
1009                   cases[i] += (*s - 'a') + 10;
1010                 s++;
1011             }
1012             if (*s == ';')
1013               s++;
1014         }
1015         if (cases[0] && cases[1])
1016           /*
1017            * Add the upper and lower mappings for a title case character.
1018            */
1019           add_title(code);
1020         else if (cases[1])
1021           /*
1022            * Add the lower and title case mappings for the upper case
1023            * character.
1024            */
1025           add_upper(code);
1026         else if (cases[0])
1027           /*
1028            * Add the upper and title case mappings for the lower case
1029            * character.
1030            */
1031           add_lower(code);
1032     }
1033 }
1034
1035 static _decomp_t *
1036 find_decomp(unsigned long code)
1037 {
1038     long l, r, m;
1039
1040     l = 0;
1041     r = decomps_used - 1;
1042     while (l <= r) {
1043         m = (l + r) >> 1;
1044         if (code > decomps[m].code)
1045           l = m + 1;
1046         else if (code < decomps[m].code)
1047           r = m - 1;
1048         else
1049           return &decomps[m];
1050     }
1051     return 0;
1052 }
1053
1054 static void
1055 decomp_it(_decomp_t *d)
1056 {
1057     unsigned long i;
1058     _decomp_t *dp;
1059
1060     for (i = 0; i < d->used; i++) {
1061         if ((dp = find_decomp(d->decomp[i])) != 0)
1062           decomp_it(dp);
1063         else
1064           dectmp[dectmp_size++] = d->decomp[i];
1065     }
1066 }
1067
1068 /*
1069  * Expand all decompositions by recursively decomposing each character
1070  * in the decomposition.
1071  */
1072 static void
1073 expand_decomp(void)
1074 {
1075     unsigned long i;
1076
1077     for (i = 0; i < decomps_used; i++) {
1078         dectmp_size = 0;
1079         decomp_it(&decomps[i]);
1080         if (dectmp_size > 0)
1081           add_decomp(decomps[i].code);
1082     }
1083 }
1084
1085 static void
1086 write_cdata(char *opath)
1087 {
1088     FILE *out;
1089     unsigned long i, idx, bytes, nprops;
1090     unsigned short casecnt[2];
1091     char path[BUFSIZ];
1092
1093     /*****************************************************************
1094      *
1095      * Generate the ctype data.
1096      *
1097      *****************************************************************/
1098
1099     /*
1100      * Open the ctype.dat file.
1101      */
1102     sprintf(path, "%s/ctype.dat", opath);
1103     if ((out = fopen(path, "wb")) == 0)
1104       return;
1105
1106     /*
1107      * Collect the offsets for the properties.  The offsets array is
1108      * on a 4-byte boundary to keep things efficient for architectures
1109      * that need such a thing.
1110      */
1111     for (i = idx = 0; i < NUMPROPS; i++) {
1112         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1113         idx += proptbl[i].used;
1114     }
1115
1116     /*
1117      * Add the sentinel index which is used by the binary search as the upper
1118      * bound for a search.
1119      */
1120     propcnt[i] = idx;
1121
1122     /*
1123      * Record the actual number of property lists.  This may be different than
1124      * the number of offsets actually written because of aligning on a 4-byte
1125      * boundary.
1126      */
1127     hdr[1] = NUMPROPS;
1128
1129     /*
1130      * Calculate the byte count needed and pad the property counts array to a
1131      * 4-byte boundary.
1132      */
1133     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
1134       bytes += 4 - (bytes & 3);
1135     nprops = bytes / sizeof(unsigned short);
1136     bytes += sizeof(unsigned long) * idx;
1137
1138     /*
1139      * Write the header.
1140      */
1141     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1142
1143     /*
1144      * Write the byte count.
1145      */
1146     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1147
1148     /*
1149      * Write the property list counts.
1150      */
1151     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
1152
1153     /*
1154      * Write the property lists.
1155      */
1156     for (i = 0; i < NUMPROPS; i++) {
1157         if (proptbl[i].used > 0)
1158           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
1159                  proptbl[i].used, out);
1160     }
1161
1162     fclose(out);
1163
1164     /*****************************************************************
1165      *
1166      * Generate the case mapping data.
1167      *
1168      *****************************************************************/
1169
1170     /*
1171      * Open the case.dat file.
1172      */
1173     sprintf(path, "%s/case.dat", opath);
1174     if ((out = fopen(path, "wb")) == 0)
1175       return;
1176
1177     /*
1178      * Write the case mapping tables.
1179      */
1180     hdr[1] = upper_used + lower_used + title_used;
1181     casecnt[0] = upper_used;
1182     casecnt[1] = lower_used;
1183
1184     /*
1185      * Write the header.
1186      */
1187     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1188
1189     /*
1190      * Write the upper and lower case table sizes.
1191      */
1192     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
1193
1194     if (upper_used > 0)
1195       /*
1196        * Write the upper case table.
1197        */
1198       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1199
1200     if (lower_used > 0)
1201       /*
1202        * Write the lower case table.
1203        */
1204       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1205
1206     if (title_used > 0)
1207       /*
1208        * Write the title case table.
1209        */
1210       fwrite((char *) title, sizeof(_case_t), title_used, out);
1211
1212     fclose(out);
1213
1214     /*****************************************************************
1215      *
1216      * Generate the decomposition data.
1217      *
1218      *****************************************************************/
1219
1220     /*
1221      * Fully expand all decompositions before generating the output file.
1222      */
1223     expand_decomp();
1224
1225     /*
1226      * Open the decomp.dat file.
1227      */
1228     sprintf(path, "%s/decomp.dat", opath);
1229     if ((out = fopen(path, "wb")) == 0)
1230       return;
1231
1232     hdr[1] = decomps_used;
1233
1234     /*
1235      * Write the header.
1236      */
1237     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1238
1239     /*
1240      * Write a temporary byte count which will be calculated as the
1241      * decompositions are written out.
1242      */
1243     bytes = 0;
1244     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1245
1246     if (decomps_used) {
1247         /*
1248          * Write the list of decomp nodes.
1249          */
1250         for (i = idx = 0; i < decomps_used; i++) {
1251             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
1252             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1253             idx += decomps[i].used;
1254         }
1255
1256         /*
1257          * Write the sentinel index as the last decomp node.
1258          */
1259         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1260
1261         /*
1262          * Write the decompositions themselves.
1263          */
1264         for (i = 0; i < decomps_used; i++)
1265           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
1266                  decomps[i].used, out);
1267
1268         /*
1269          * Seek back to the beginning and write the byte count.
1270          */
1271         bytes = (sizeof(unsigned long) * idx) +
1272             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
1273         fseek(out, sizeof(unsigned short) << 1, 0L);
1274         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1275
1276         fclose(out);
1277     }
1278
1279     /*****************************************************************
1280      *
1281      * Generate the combining class data.
1282      *
1283      *****************************************************************/
1284
1285     /*
1286      * Open the cmbcl.dat file.
1287      */
1288     sprintf(path, "%s/cmbcl.dat", opath);
1289     if ((out = fopen(path, "wb")) == 0)
1290       return;
1291
1292     /*
1293      * Set the number of ranges used.  Each range has a combining class which
1294      * means each entry is a 3-tuple.
1295      */
1296     hdr[1] = ccl_used / 3;
1297
1298     /*
1299      * Write the header.
1300      */
1301     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1302
1303     /*
1304      * Write out the byte count to maintain header size.
1305      */
1306     bytes = ccl_used * sizeof(unsigned long);
1307     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1308
1309     if (ccl_used > 0)
1310       /*
1311        * Write the combining class ranges out.
1312        */
1313       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
1314
1315     fclose(out);
1316
1317     /*****************************************************************
1318      *
1319      * Generate the number data.
1320      *
1321      *****************************************************************/
1322
1323     /*
1324      * Open the num.dat file.
1325      */
1326     sprintf(path, "%s/num.dat", opath);
1327     if ((out = fopen(path, "wb")) == 0)
1328       return;
1329
1330     /*
1331      * The count part of the header will be the total number of codes that
1332      * have numbers.
1333      */
1334     hdr[1] = (unsigned short) (ncodes_used << 1);
1335     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1336
1337     /*
1338      * Write the header.
1339      */
1340     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1341
1342     /*
1343      * Write out the byte count to maintain header size.
1344      */
1345     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1346
1347     /*
1348      * Now, if number mappings exist, write them out.
1349      */
1350     if (ncodes_used > 0) {
1351         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1352         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1353     }
1354
1355     fclose(out);
1356 }
1357
/*
 * Program entry point.
 *
 * Arguments are processed left to right.  An argument beginning with "-o"
 * makes the FOLLOWING argument the output directory.  Every other argument
 * is opened as an input file and passed to read_cdata(); a file that cannot
 * be opened is reported on stderr and skipped.
 *
 * After all arguments are handled the data files are written to the output
 * directory, which defaults to "." when none was given.  Always returns 0.
 */
int
main(int argc, char *argv[])
{
    char *prog, *opath, *slash;
    int n;

    /*
     * Use the last path component of argv[0] as the program name in
     * messages.
     */
    slash = strrchr(argv[0], '/');
    prog = (slash != 0) ? slash + 1 : argv[0];

    opath = 0;

    for (n = 1; n < argc; n++) {
        char *arg = argv[n];
        FILE *in;

        if (arg[0] == '-' && arg[1] == 'o') {
            /*
             * The next argument is the output directory.  If "-o" is the
             * last argument this picks up the terminating null pointer of
             * argv and the default directory is used.
             */
            opath = argv[++n];
            continue;
        }

        in = fopen(arg, "rb");
        if (in == 0) {
            fprintf(stderr, "%s: unable to open ctype file %s\n",
                    prog, arg);
            continue;
        }

        read_cdata(in);
        fclose(in);
    }

    write_cdata((opath != 0) ? opath : ".");

    return 0;
}