X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=libraries%2Fliblunicode%2Fucdata%2Fucgendat.c;h=3be66a670eb34df44ed6984e32a4ab437131cac4;hb=a7c595088c1561c4f08932a97f9a716ae9dabd9c;hp=507503f3b53bfd172726660fbeb379a624a98470;hpb=6c2ea5ec24cba752386d3b185b7b760b4eade0e3;p=openldap diff --git a/libraries/liblunicode/ucdata/ucgendat.c b/libraries/liblunicode/ucdata/ucgendat.c index 507503f3b5..3be66a670e 100644 --- a/libraries/liblunicode/ucdata/ucgendat.c +++ b/libraries/liblunicode/ucdata/ucgendat.c @@ -1,6 +1,18 @@ /* $OpenLDAP$ */ -/* - * Copyright 2001 Computing Research Labs, New Mexico State University +/* This work is part of OpenLDAP Software . + * + * Copyright 1998-2012 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ +/* Copyright 2001 Computing Research Labs, New Mexico State University * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,12 +35,23 @@ /* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */ #include "portable.h" +#include "ldap_config.h" #include -#include +#include +#include #include #include +#include + +#include + +#ifndef HARDCODE_DATA +#define HARDCODE_DATA 1 +#endif + +#undef ishdigit #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ ((cc) >= 'A' && (cc) <= 'F') ||\ ((cc) >= 'a' && (cc) <= 'f')) @@ -37,7 +60,7 @@ * A header written to the output file with the byte-order-mark and the number * of property nodes. */ -static unsigned short hdr[2] = {0xfeff, 0}; +static ac_uint2 hdr[2] = {0xfeff, 0}; #define NUMPROPS 50 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) @@ -74,9 +97,9 @@ static _prop_t props[NUMPROPS] = { }; typedef struct { - unsigned long *ranges; - unsigned short used; - unsigned short size; + ac_uint4 *ranges; + ac_uint2 used; + ac_uint2 size; } _ranges_t; static _ranges_t proptbl[NUMPROPS]; @@ -84,87 +107,92 @@ static _ranges_t proptbl[NUMPROPS]; /* * Make sure this array is sized to be on a 4-byte boundary at compile time. */ -static unsigned short propcnt[NEEDPROPS]; +static ac_uint2 propcnt[NEEDPROPS]; /* * Array used to collect a decomposition before adding it to the decomposition * table. */ -static unsigned long dectmp[64]; -static unsigned long dectmp_size; +static ac_uint4 dectmp[64]; +static ac_uint4 dectmp_size; typedef struct { - unsigned long code; - unsigned short size; - unsigned short used; - unsigned long *decomp; + ac_uint4 code; + ac_uint2 size; + ac_uint2 used; + ac_uint4 *decomp; } _decomp_t; /* * List of decomposition. Created and expanded in order as the characters are - * encountered. + * encountered. First list contains canonical mappings, second also includes + * compatibility mappings. */ static _decomp_t *decomps; -static unsigned long decomps_used; -static unsigned long decomps_size; +static ac_uint4 decomps_used; +static ac_uint4 decomps_size; + +static _decomp_t *kdecomps; +static ac_uint4 kdecomps_used; +static ac_uint4 kdecomps_size; /* * Composition exclusion table stuff. */ -#define COMPEX_SET(c) (compexs[(c) >> 15] |= (1 << ((c) & 31))) -#define COMPEX_TEST(c) (compexs[(c) >> 15] & (1 << ((c) & 31))) -static unsigned long compexs[2048]; +#define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31))) +#define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31))) +static ac_uint4 compexs[8192]; /* * Struct for holding a composition pair, and array of composition pairs */ typedef struct { - unsigned long comp; - unsigned long count; - unsigned long code1; - unsigned long code2; + ac_uint4 comp; + ac_uint4 count; + ac_uint4 code1; + ac_uint4 code2; } _comp_t; static _comp_t *comps; -static unsigned long comps_used; +static ac_uint4 comps_used; /* * Types and lists for handling lists of case mappings. */ typedef struct { - unsigned long key; - unsigned long other1; - unsigned long other2; + ac_uint4 key; + ac_uint4 other1; + ac_uint4 other2; } _case_t; static _case_t *upper; static _case_t *lower; static _case_t *title; -static unsigned long upper_used; -static unsigned long upper_size; -static unsigned long lower_used; -static unsigned long lower_size; -static unsigned long title_used; -static unsigned long title_size; +static ac_uint4 upper_used; +static ac_uint4 upper_size; +static ac_uint4 lower_used; +static ac_uint4 lower_size; +static ac_uint4 title_used; +static ac_uint4 title_size; /* * Array used to collect case mappings before adding them to a list. */ -static unsigned long cases[3]; +static ac_uint4 cases[3]; /* * An array to hold ranges for combining classes. */ -static unsigned long *ccl; -static unsigned long ccl_used; -static unsigned long ccl_size; +static ac_uint4 *ccl; +static ac_uint4 ccl_used; +static ac_uint4 ccl_size; /* * Structures for handling numbers. */ typedef struct { - unsigned long code; - unsigned long idx; + ac_uint4 code; + ac_uint4 idx; } _codeidx_t; typedef struct { @@ -176,22 +204,22 @@ typedef struct { * Arrays to hold the mapping of codes to numbers. */ static _codeidx_t *ncodes; -static unsigned long ncodes_used; -static unsigned long ncodes_size; +static ac_uint4 ncodes_used; +static ac_uint4 ncodes_size; static _num_t *nums; -static unsigned long nums_used; -static unsigned long nums_size; +static ac_uint4 nums_used; +static ac_uint4 nums_size; /* * Array for holding numbers. */ static _num_t *nums; -static unsigned long nums_used; -static unsigned long nums_size; +static ac_uint4 nums_used; +static ac_uint4 nums_size; static void -add_range(unsigned long start, unsigned long end, char *p1, char *p2) +add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2) { int i, j, k, len; _ranges_t *rlp; @@ -224,12 +252,12 @@ add_range(unsigned long start, unsigned long end, char *p1, char *p2) */ if (rlp->used == rlp->size) { if (rlp->size == 0) - rlp->ranges = (unsigned long *) - malloc(sizeof(unsigned long) << 3); + rlp->ranges = (ac_uint4 *) + malloc(sizeof(ac_uint4) << 3); else - rlp->ranges = (unsigned long *) + rlp->ranges = (ac_uint4 *) realloc((char *) rlp->ranges, - sizeof(unsigned long) * (rlp->size + 8)); + sizeof(ac_uint4) * (rlp->size + 8)); rlp->size += 8; } @@ -289,10 +317,10 @@ add_range(unsigned long start, unsigned long end, char *p1, char *p2) } static void -ordered_range_insert(unsigned long c, char *name, int len) +ordered_range_insert(ac_uint4 c, char *name, int len) { int i, j; - unsigned long s, e; + ac_uint4 s, e; _ranges_t *rlp; if (len == 0) @@ -332,12 +360,12 @@ ordered_range_insert(unsigned long c, char *name, int len) */ if (rlp->used == rlp->size) { if (rlp->size == 0) - rlp->ranges = (unsigned long *) - malloc(sizeof(unsigned long) << 3); + rlp->ranges = (ac_uint4 *) + malloc(sizeof(ac_uint4) << 3); else - rlp->ranges = (unsigned long *) + rlp->ranges = (ac_uint4 *) realloc((char *) rlp->ranges, - sizeof(unsigned long) * (rlp->size + 8)); + sizeof(ac_uint4) * (rlp->size + 8)); rlp->size += 8; } @@ -414,41 +442,56 @@ ordered_range_insert(unsigned long c, char *name, int len) } static void -add_decomp(unsigned long code) +add_decomp(ac_uint4 code, short compat) { - unsigned long i, j, size; - + ac_uint4 i, j, size; + _decomp_t **pdecomps; + ac_uint4 *pdecomps_used; + ac_uint4 *pdecomps_size; + + if (compat) { + pdecomps = &kdecomps; + pdecomps_used = &kdecomps_used; + pdecomps_size = &kdecomps_size; + } else { + pdecomps = &decomps; + pdecomps_used = &decomps_used; + pdecomps_size = &decomps_size; + } + /* * Add the code to the composite property. */ - ordered_range_insert(code, "Cm", 2); + if (!compat) { + ordered_range_insert(code, "Cm", 2); + } /* * Locate the insertion point for the code. */ - for (i = 0; i < decomps_used && code > decomps[i].code; i++) ; + for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; /* * Allocate space for a new decomposition. */ - if (decomps_used == decomps_size) { - if (decomps_size == 0) - decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); + if (*pdecomps_used == *pdecomps_size) { + if (*pdecomps_size == 0) + *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); else - decomps = (_decomp_t *) - realloc((char *) decomps, - sizeof(_decomp_t) * (decomps_size + 8)); - (void) memset((char *) (decomps + decomps_size), '\0', + *pdecomps = (_decomp_t *) + realloc((char *) *pdecomps, + sizeof(_decomp_t) * (*pdecomps_size + 8)); + (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', sizeof(_decomp_t) << 3); - decomps_size += 8; + *pdecomps_size += 8; } - if (i < decomps_used && code != decomps[i].code) { + if (i < *pdecomps_used && code != (*pdecomps)[i].code) { /* * Shift the decomps up by one if the codes don't match. */ - for (j = decomps_used; j > i; j--) - (void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1], + for (j = *pdecomps_used; j > i; j--) + (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], sizeof(_decomp_t)); } @@ -456,37 +499,37 @@ add_decomp(unsigned long code) * Insert or replace a decomposition. */ size = dectmp_size + (4 - (dectmp_size & 3)); - if (decomps[i].size < size) { - if (decomps[i].size == 0) - decomps[i].decomp = (unsigned long *) - malloc(sizeof(unsigned long) * size); + if ((*pdecomps)[i].size < size) { + if ((*pdecomps)[i].size == 0) + (*pdecomps)[i].decomp = (ac_uint4 *) + malloc(sizeof(ac_uint4) * size); else - decomps[i].decomp = (unsigned long *) - realloc((char *) decomps[i].decomp, - sizeof(unsigned long) * size); - decomps[i].size = size; + (*pdecomps)[i].decomp = (ac_uint4 *) + realloc((char *) (*pdecomps)[i].decomp, + sizeof(ac_uint4) * size); + (*pdecomps)[i].size = size; } - if (decomps[i].code != code) - decomps_used++; + if ((*pdecomps)[i].code != code) + (*pdecomps_used)++; - decomps[i].code = code; - decomps[i].used = dectmp_size; - (void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp, - sizeof(unsigned long) * dectmp_size); + (*pdecomps)[i].code = code; + (*pdecomps)[i].used = dectmp_size; + (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, + sizeof(ac_uint4) * dectmp_size); /* * NOTICE: This needs changing later so it is more general than simply * pairs. This calculation is done here to simplify allocation elsewhere. */ - if (dectmp_size == 2) + if (!compat && dectmp_size == 2) comps_used++; } static void -add_title(unsigned long code) +add_title(ac_uint4 code) { - unsigned long i, j; + ac_uint4 i, j; /* * Always map the code to itself. @@ -524,9 +567,9 @@ add_title(unsigned long code) } static void -add_upper(unsigned long code) +add_upper(ac_uint4 code) { - unsigned long i, j; + ac_uint4 i, j; /* * Always map the code to itself. @@ -571,9 +614,9 @@ add_upper(unsigned long code) } static void -add_lower(unsigned long code) +add_lower(ac_uint4 code) { - unsigned long i, j; + ac_uint4 i, j; /* * Always map the code to itself. @@ -618,16 +661,16 @@ add_lower(unsigned long code) } static void -ordered_ccl_insert(unsigned long c, unsigned long ccl_code) +ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code) { - unsigned long i, j; + ac_uint4 i, j; if (ccl_used == ccl_size) { if (ccl_size == 0) - ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24); + ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24); else - ccl = (unsigned long *) - realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24)); + ccl = (ac_uint4 *) + realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24)); ccl_size += 24; } @@ -690,10 +733,10 @@ ordered_ccl_insert(unsigned long c, unsigned long ccl_code) * Adds a number if it does not already exist and returns an index value * multiplied by 2. */ -static unsigned long +static ac_uint4 make_number(short num, short denom) { - unsigned long n; + ac_uint4 n; /* * Determine if the number already exists. @@ -720,9 +763,9 @@ make_number(short num, short denom) } static void -add_number(unsigned long code, short num, short denom) +add_number(ac_uint4 code, short num, short denom) { - unsigned long i, j; + ac_uint4 i, j; /* * Insert the code in order. @@ -733,7 +776,7 @@ add_number(unsigned long code, short num, short denom) * Handle the case of the codes matching and simply replace the number * that was there before. */ - if (ncodes_used > 0 && code == ncodes[i].code) { + if (i < ncodes_used && code == ncodes[i].code) { ncodes[i].idx = make_number(num, denom); return; } @@ -773,12 +816,13 @@ add_number(unsigned long code, short num, short denom) static void read_cdata(FILE *in) { - unsigned long i, lineno, skip, code, ccl_code; - short wnum, neg, number[2]; + ac_uint4 i, lineno, skip, code, ccl_code; + short wnum, neg, number[2], compat; char line[512], *s, *e; lineno = skip = 0; - while (fscanf(in, "%[^\n]\n", line) != EOF) { + while (fgets(line, sizeof(line), in)) { + if( (s=strchr(line, '\n')) ) *s = '\0'; lineno++; /* @@ -816,8 +860,17 @@ read_cdata(FILE *in) * 3. D800-DFFF Surrogates. * 4. E000-F8FF Private Use Area. * 5. F900-FA2D Han compatibility. + * ...Plus additional ranges in newer Unicode versions... */ switch (code) { + case 0x3400: + /* CJK Ideograph Extension A */ + add_range(0x3400, 0x4db5, "Lo", "L"); + + add_range(0x3400, 0x4db5, "Cp", 0); + + skip = 1; + break; case 0x4e00: /* * The Han ideographs. @@ -871,6 +924,26 @@ read_cdata(FILE *in) add_range(0xf900, 0xfaff, "Cp", 0); skip = 1; + break; + case 0x20000: + /* CJK Ideograph Extension B */ + add_range(0x20000, 0x2a6d6, "Lo", "L"); + + add_range(0x20000, 0x2a6d6, "Cp", 0); + + skip = 1; + break; + case 0xf0000: + /* Plane 15 private use */ + add_range(0xf0000, 0xffffd, "Co", "L"); + skip = 1; + break; + + case 0x100000: + /* Plane 16 private use */ + add_range(0x100000, 0x10fffd, "Co", "L"); + skip = 1; + break; } if (skip) @@ -927,7 +1000,14 @@ read_cdata(FILE *in) * Check for a decomposition. */ s = ++e; - if (*s != ';' && *s != '<') { + if (*s != ';') { + compat = *s == '<'; + if (compat) { + /* + * Skip compatibility formatting tag. + */ + while (*s++ != '>'); + } /* * Collect the codes of the decomposition. */ @@ -936,7 +1016,7 @@ read_cdata(FILE *in) * Skip all leading non-hex digits. */ while (!ishdigit(*s)) - s++; + s++; for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { dectmp[dectmp_size] <<= 4; @@ -954,8 +1034,12 @@ read_cdata(FILE *in) * If there are any codes in the temporary decomposition array, * then add the character with its decomposition. */ - if (dectmp_size > 0) - add_decomp(code); + if (dectmp_size > 0) { + if (!compat) { + add_decomp(code, 0); + } + add_decomp(code, 1); + } } /* @@ -994,7 +1078,7 @@ read_cdata(FILE *in) * Adjust the denominator in case of integers and add the number. */ if (wnum == 0) - number[1] = number[0]; + number[1] = 1; add_number(code, number[0], number[1]); } @@ -1046,33 +1130,35 @@ read_cdata(FILE *in) } static _decomp_t * -find_decomp(unsigned long code) +find_decomp(ac_uint4 code, short compat) { long l, r, m; - + _decomp_t *decs; + l = 0; - r = decomps_used - 1; + r = (compat ? kdecomps_used : decomps_used) - 1; + decs = compat ? kdecomps : decomps; while (l <= r) { m = (l + r) >> 1; - if (code > decomps[m].code) + if (code > decs[m].code) l = m + 1; - else if (code < decomps[m].code) + else if (code < decs[m].code) r = m - 1; else - return &decomps[m]; + return &decs[m]; } return 0; } static void -decomp_it(_decomp_t *d) +decomp_it(_decomp_t *d, short compat) { - unsigned long i; + ac_uint4 i; _decomp_t *dp; for (i = 0; i < d->used; i++) { - if ((dp = find_decomp(d->decomp[i])) != 0) - decomp_it(dp); + if ((dp = find_decomp(d->decomp[i], compat)) != 0) + decomp_it(dp, compat); else dectmp[dectmp_size++] = d->decomp[i]; } @@ -1085,19 +1171,27 @@ decomp_it(_decomp_t *d) static void expand_decomp(void) { - unsigned long i; + ac_uint4 i; for (i = 0; i < decomps_used; i++) { dectmp_size = 0; - decomp_it(&decomps[i]); + decomp_it(&decomps[i], 0); + if (dectmp_size > 0) + add_decomp(decomps[i].code, 0); + } + + for (i = 0; i < kdecomps_used; i++) { + dectmp_size = 0; + decomp_it(&kdecomps[i], 1); if (dectmp_size > 0) - add_decomp(decomps[i].code); + add_decomp(kdecomps[i].code, 1); } } static int -cmpcomps(_comp_t *comp1, _comp_t *comp2) +cmpcomps(const void *v_comp1, const void *v_comp2) { + const _comp_t *comp1 = v_comp1, *comp2 = v_comp2; long diff = comp1->code1 - comp2->code1; if (!diff) @@ -1111,12 +1205,14 @@ cmpcomps(_comp_t *comp1, _comp_t *comp2) static void read_compexdata(FILE *in) { - unsigned short i, code; + ac_uint2 i; + ac_uint4 code; char line[512], *s; - (void) memset((char *) compexs, 0, sizeof(unsigned long) << 11); + (void) memset((char *) compexs, 0, sizeof(compexs)); - while (fscanf(in, "%[^\n]\n", line) != EOF) { + while (fgets(line, sizeof(line), in)) { + if( (s=strchr(line, '\n')) ) *s = '\0'; /* * Skip blank lines and lines that start with a '#'. */ @@ -1124,10 +1220,11 @@ read_compexdata(FILE *in) continue; /* - * Collect the code. Assume max 4 digits + * Collect the code. Assume max 6 digits */ - for (s = line, i = code = 0; *s != '#' && i < 4; i++, s++) { + for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) { + if (isspace((unsigned char)*s)) break; code <<= 4; if (*s >= '0' && *s <= '9') code += *s - '0'; @@ -1146,7 +1243,7 @@ read_compexdata(FILE *in) static void create_comps(void) { - unsigned long i, cu; + ac_uint4 i, cu; comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t)); @@ -1159,17 +1256,41 @@ create_comps(void) comps[cu].code2 = decomps[i].decomp[1]; cu++; } - qsort(comps, comps_used, sizeof(_comp_t), - (int (*)(const void *, const void *)) cmpcomps); + comps_used = cu; + qsort(comps, comps_used, sizeof(_comp_t), cmpcomps); } +#if HARDCODE_DATA +static void +write_case(FILE *out, _case_t *tab, int num, int first) +{ + int i; + + for (i=0; i 0) { + for (j=0; j 0) - fwrite((char *) proptbl[i].ranges, sizeof(unsigned long), + fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4), proptbl[i].used, out); } fclose(out); +#endif /***************************************************************** * @@ -1248,10 +1407,41 @@ write_cdata(char *opath) * *****************************************************************/ +#if HARDCODE_DATA + fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n", + (long) (upper_used + lower_used + title_used)); + + fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n", + (long) upper_used, (long) lower_used); + fprintf(out, PREF "ac_uint4 _uccase_map[] = {"); + + if (upper_used > 0) + /* + * Write the upper case table. + */ + write_case(out, upper, upper_used, 1); + + if (lower_used > 0) + /* + * Write the lower case table. + */ + write_case(out, lower, lower_used, !upper_used); + + if (title_used > 0) + /* + * Write the title case table. + */ + write_case(out, title, title_used, !(upper_used||lower_used)); + + if (!(upper_used || lower_used || title_used)) + fprintf(out, "\t0"); + + fprintf(out, "\n};\n\n"); +#else /* * Open the case.dat file. */ - sprintf(path, "%s/case.dat", opath); + snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath); if ((out = fopen(path, "wb")) == 0) return; @@ -1265,12 +1455,12 @@ write_cdata(char *opath) /* * Write the header. */ - fwrite((char *) hdr, sizeof(unsigned short), 2, out); + fwrite((char *) hdr, sizeof(ac_uint2), 2, out); /* * Write the upper and lower case table sizes. */ - fwrite((char *) casecnt, sizeof(unsigned short), 2, out); + fwrite((char *) casecnt, sizeof(ac_uint2), 2, out); if (upper_used > 0) /* @@ -1291,6 +1481,7 @@ write_cdata(char *opath) fwrite((char *) title, sizeof(_case_t), title_used, out); fclose(out); +#endif /***************************************************************** * @@ -1303,24 +1494,45 @@ write_cdata(char *opath) */ create_comps(); +#if HARDCODE_DATA + fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n", + comps_used * 4L); + + fprintf(out, PREF "ac_uint4 _uccomp_data[] = {"); + + /* + * Now, if comps exist, write them out. + */ + if (comps_used > 0) { + for (i=0; i 0) { + /* + * Write the combining class ranges out. + */ + for (i = 0; i 0) /* * Write the combining class ranges out. */ - fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out); + fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out); fclose(out); +#endif /***************************************************************** * @@ -1439,10 +1803,45 @@ write_cdata(char *opath) * *****************************************************************/ +#if HARDCODE_DATA + fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n", + (unsigned long)ncodes_used<<1); + + fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {"); + + /* + * Now, if number mappings exist, write them out. + */ + if (ncodes_used > 0) { + for (i = 0; i