From e08bc054c7f308f19ebc19d19f29f70ca9147348 Mon Sep 17 00:00:00 2001 From: Stig Venaas Date: Fri, 19 Apr 2002 12:59:57 +0000 Subject: [PATCH] Adding support for compatibility decomposition to ucdata lib, and switch from NFC to NFKC in UTF8bvnormalize() and UTF8bvnormcmp() --- libraries/liblunicode/Makefile.in | 2 +- libraries/liblunicode/ucdata/ucdata.c | 138 +++++++++++++++++- libraries/liblunicode/ucdata/ucdata.h | 19 ++- libraries/liblunicode/ucdata/ucgendat.c | 182 ++++++++++++++++++------ libraries/liblunicode/ucstr.c | 6 +- 5 files changed, 294 insertions(+), 53 deletions(-) diff --git a/libraries/liblunicode/Makefile.in b/libraries/liblunicode/Makefile.in index a243aac306..6bf119d2ff 100644 --- a/libraries/liblunicode/Makefile.in +++ b/libraries/liblunicode/Makefile.in @@ -23,7 +23,7 @@ ucgendat: $(XLIBS) ucgendat.o $(LTLINK) -o $@ ucgendat.o $(LIBS) ./ucgendat $(srcdir)/UnicodeData.txt -x $(srcdir)/CompositionExclusions.txt -DATFILES = case.dat cmbcl.dat comp.dat ctype.dat decomp.dat num.dat +DATFILES = case.dat cmbcl.dat comp.dat ctype.dat decomp.dat num.dat kdecomp.dat install-local: $(PROGRAMS) FORCE -$(MKDIR) $(DESTDIR)$(datadir)/ucdata diff --git a/libraries/liblunicode/ucdata/ucdata.c b/libraries/liblunicode/ucdata/ucdata.c index e4ded225fa..beb4184f03 100644 --- a/libraries/liblunicode/ucdata/ucdata.c +++ b/libraries/liblunicode/ucdata/ucdata.c @@ -674,6 +674,10 @@ static unsigned long _ucdcmp_size; static unsigned long *_ucdcmp_nodes; static unsigned long *_ucdcmp_decomp; +static unsigned long _uckdcmp_size; +static unsigned long *_uckdcmp_nodes; +static unsigned long *_uckdcmp_decomp; + /* * Return -1 on error, 0 if okay */ @@ -729,6 +733,61 @@ _ucdcmp_load(char *paths, int reload) return 0; } +/* + * Return -1 on error, 0 if okay + */ +static int +_uckdcmp_load(char *paths, int reload) +{ + FILE *in; + unsigned long size, i; + _ucheader_t hdr; + + if (_uckdcmp_size > 0) { + if (!reload) + /* + * The decompositions have already been loaded. + */ + return 0; + + free((char *) _uckdcmp_nodes); + _uckdcmp_size = 0; + } + + if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0) + return -1; + + /* + * Load the header. + */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.bytes = endian_long(hdr.size.bytes); + } + + _uckdcmp_size = hdr.cnt << 1; + _uckdcmp_nodes = (unsigned long *) malloc(hdr.size.bytes); + _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1); + + /* + * Read the decomposition data in. + */ + size = hdr.size.bytes / sizeof(unsigned long); + fread((char *) _uckdcmp_nodes, sizeof(unsigned long), size, in); + + /* + * Do an endian swap if necessary. + */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < size; i++) + _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]); + } + fclose(in); + return 0; +} + static void _ucdcmp_unload(void) { @@ -743,11 +802,29 @@ _ucdcmp_unload(void) _ucdcmp_size = 0; } +static void +_uckdcmp_unload(void) +{ + if (_uckdcmp_size == 0) + return; + + /* + * Only need to free the offsets because the memory is allocated as a + * single block. + */ + free((char *) _uckdcmp_nodes); + _uckdcmp_size = 0; +} + int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) { long l, r, m; + if (code < _ucdcmp_nodes[0]) { + return 0; + } + l = 0; r = _ucdcmp_nodes[_ucdcmp_size] - 1; @@ -771,6 +848,38 @@ ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) return 0; } +int +uckdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) +{ + long l, r, m; + + if (code < _uckdcmp_nodes[0]) { + return 0; + } + + l = 0; + r = _uckdcmp_nodes[_uckdcmp_size] - 1; + + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a code+offset pair. + */ + m = (l + r) >> 1; + m -= (m & 1); + if (code > _uckdcmp_nodes[m]) + l = m + 2; + else if (code < _uckdcmp_nodes[m]) + r = m - 2; + else if (code == _uckdcmp_nodes[m]) { + *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1]; + *decomp = &_uckdcmp_decomp[_uckdcmp_nodes[m + 1]]; + return 1; + } + } + return 0; +} + int ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[]) { @@ -786,9 +895,10 @@ ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[]) return 1; } -int -uccanondecomp(const unsigned long *in, int inlen, - unsigned long **out, int *outlen) +/* mode == 0 for canonical, mode == 1 for compatibility */ +static int +uccanoncompatdecomp(const unsigned long *in, int inlen, + unsigned long **out, int *outlen, short mode) { int l, size; unsigned i, j, k; @@ -801,7 +911,7 @@ uccanondecomp(const unsigned long *in, int inlen, i = 0; for (j = 0; j < (unsigned) inlen; j++) { - if (ucdecomp(in[j], &num, &decomp)) { + if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) { if ( size - i < num) { size = inlen + i - j + num - 1; *out = (unsigned long *) realloc(*out, size * sizeof(**out)); @@ -855,6 +965,20 @@ uccanondecomp(const unsigned long *in, int inlen, return *outlen = i; } +int +uccanondecomp(const unsigned long *in, int inlen, + unsigned long **out, int *outlen) +{ + return uccanoncompatdecomp(in, inlen, out, outlen, 0); +} + +int +uccompatdecomp(const unsigned long *in, int inlen, + unsigned long **out, int *outlen) +{ + return uccanoncompatdecomp(in, inlen, out, outlen, 1); +} + /************************************************************************** * * Support for combining classes. @@ -1152,6 +1276,8 @@ ucdata_load(char *paths, int masks) error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0; if (masks & UCDATA_COMP) error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0; + if (masks & UCDATA_KDECOMP) + error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0; return -error; } @@ -1171,6 +1297,8 @@ ucdata_unload(int masks) _ucnumb_unload(); if (masks & UCDATA_COMP) _uccomp_unload(); + if (masks & UCDATA_KDECOMP) + _uckdcmp_unload(); } /* @@ -1193,6 +1321,8 @@ ucdata_reload(char *paths, int masks) error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0; if (masks & UCDATA_COMP) error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0; + if (masks & UCDATA_KDECOMP) + error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0; return -error; } diff --git a/libraries/liblunicode/ucdata/ucdata.h b/libraries/liblunicode/ucdata/ucdata.h index eb065b5ca8..d0fbe92aa6 100644 --- a/libraries/liblunicode/ucdata/ucdata.h +++ b/libraries/liblunicode/ucdata/ucdata.h @@ -249,6 +249,14 @@ LDAP_LUNICODE_F (int) ucdecomp LDAP_P((unsigned long code, unsigned long *num, unsigned long **decomp)); +/* + * Equivalent to ucdecomp() except that it includes compatibility + * decompositions. + */ +LDAP_LUNICODE_F (int) +uckdecomp LDAP_P((unsigned long code, unsigned long *num, + unsigned long **decomp)); + /* * If the code is a Hangul syllable, this routine decomposes it into the array * passed. The array size should be at least 3. @@ -267,6 +275,14 @@ LDAP_LUNICODE_F (int) uccanondecomp LDAP_P((const unsigned long *in, int inlen, unsigned long **out, int *outlen)); +/* + * Equivalent to uccanondecomp() except that it includes compatibility + * decompositions. + */ +LDAP_LUNICODE_F (int) +uccompatdecomp LDAP_P((const unsigned long *in, int inlen, + unsigned long **out, int *outlen)); + /************************************************************************** * * Functions for getting combining classes. @@ -318,9 +334,10 @@ LDAP_LUNICODE_F (int) ucgetdigit LDAP_P((unsigned long code)); #define UCDATA_CMBCL 0x08 #define UCDATA_NUM 0x10 #define UCDATA_COMP 0x20 +#define UCDATA_KDECOMP 0x40 #define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\ - UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP) + UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP|UCDATA_KDECOMP) /* * Functions to load, unload, and reload specific data files. diff --git a/libraries/liblunicode/ucdata/ucgendat.c b/libraries/liblunicode/ucdata/ucgendat.c index 2b67122316..591fcc4f10 100644 --- a/libraries/liblunicode/ucdata/ucgendat.c +++ b/libraries/liblunicode/ucdata/ucgendat.c @@ -108,12 +108,17 @@ typedef struct { /* * List of decomposition. Created and expanded in order as the characters are - * encountered. + * encountered. First list contains canonical mappings, second also includes + * compatibility mappings. */ static _decomp_t *decomps; static unsigned long decomps_used; static unsigned long decomps_size; +static _decomp_t *kdecomps; +static unsigned long kdecomps_used; +static unsigned long kdecomps_size; + /* * Composition exclusion table stuff. */ @@ -420,41 +425,56 @@ ordered_range_insert(unsigned long c, char *name, int len) } static void -add_decomp(unsigned long code) +add_decomp(unsigned long code, short compat) { unsigned long i, j, size; - + _decomp_t **pdecomps; + unsigned long *pdecomps_used; + unsigned long *pdecomps_size; + + if (compat) { + pdecomps = &kdecomps; + pdecomps_used = &kdecomps_used; + pdecomps_size = &kdecomps_size; + } else { + pdecomps = &decomps; + pdecomps_used = &decomps_used; + pdecomps_size = &decomps_size; + } + /* * Add the code to the composite property. */ - ordered_range_insert(code, "Cm", 2); + if (!compat) { + ordered_range_insert(code, "Cm", 2); + } /* * Locate the insertion point for the code. */ - for (i = 0; i < decomps_used && code > decomps[i].code; i++) ; + for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; /* * Allocate space for a new decomposition. */ - if (decomps_used == decomps_size) { - if (decomps_size == 0) - decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); + if (*pdecomps_used == *pdecomps_size) { + if (*pdecomps_size == 0) + *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); else - decomps = (_decomp_t *) - realloc((char *) decomps, - sizeof(_decomp_t) * (decomps_size + 8)); - (void) memset((char *) (decomps + decomps_size), '\0', + *pdecomps = (_decomp_t *) + realloc((char *) *pdecomps, + sizeof(_decomp_t) * (*pdecomps_size + 8)); + (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', sizeof(_decomp_t) << 3); - decomps_size += 8; + *pdecomps_size += 8; } - if (i < decomps_used && code != decomps[i].code) { + if (i < *pdecomps_used && code != (*pdecomps)[i].code) { /* * Shift the decomps up by one if the codes don't match. */ - for (j = decomps_used; j > i; j--) - (void) AC_MEMCPY((char *) &decomps[j], (char *) &decomps[j - 1], + for (j = *pdecomps_used; j > i; j--) + (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], sizeof(_decomp_t)); } @@ -462,30 +482,30 @@ add_decomp(unsigned long code) * Insert or replace a decomposition. */ size = dectmp_size + (4 - (dectmp_size & 3)); - if (decomps[i].size < size) { - if (decomps[i].size == 0) - decomps[i].decomp = (unsigned long *) + if ((*pdecomps)[i].size < size) { + if ((*pdecomps)[i].size == 0) + (*pdecomps)[i].decomp = (unsigned long *) malloc(sizeof(unsigned long) * size); else - decomps[i].decomp = (unsigned long *) - realloc((char *) decomps[i].decomp, + (*pdecomps)[i].decomp = (unsigned long *) + realloc((char *) (*pdecomps)[i].decomp, sizeof(unsigned long) * size); - decomps[i].size = size; + (*pdecomps)[i].size = size; } - if (decomps[i].code != code) - decomps_used++; + if ((*pdecomps)[i].code != code) + (*pdecomps_used)++; - decomps[i].code = code; - decomps[i].used = dectmp_size; - (void) AC_MEMCPY((char *) decomps[i].decomp, (char *) dectmp, + (*pdecomps)[i].code = code; + (*pdecomps)[i].used = dectmp_size; + (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, sizeof(unsigned long) * dectmp_size); /* * NOTICE: This needs changing later so it is more general than simply * pairs. This calculation is done here to simplify allocation elsewhere. */ - if (dectmp_size == 2) + if (!compat && dectmp_size == 2) comps_used++; } @@ -780,7 +800,7 @@ static void read_cdata(FILE *in) { unsigned long i, lineno, skip, code, ccl_code; - short wnum, neg, number[2]; + short wnum, neg, number[2], compat; char line[512], *s, *e; lineno = skip = 0; @@ -933,7 +953,14 @@ read_cdata(FILE *in) * Check for a decomposition. */ s = ++e; - if (*s != ';' && *s != '<') { + if (*s != ';') { + compat = *s == '<'; + if (compat) { + /* + * Skip compatibility formatting tag. + */ + while (*s++ != '>'); + } /* * Collect the codes of the decomposition. */ @@ -942,7 +969,7 @@ read_cdata(FILE *in) * Skip all leading non-hex digits. */ while (!ishdigit(*s)) - s++; + s++; for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { dectmp[dectmp_size] <<= 4; @@ -960,8 +987,12 @@ read_cdata(FILE *in) * If there are any codes in the temporary decomposition array, * then add the character with its decomposition. */ - if (dectmp_size > 0) - add_decomp(code); + if (dectmp_size > 0) { + if (!compat) { + add_decomp(code, 0); + } + add_decomp(code, 1); + } } /* @@ -1052,33 +1083,35 @@ read_cdata(FILE *in) } static _decomp_t * -find_decomp(unsigned long code) +find_decomp(unsigned long code, short compat) { long l, r, m; - + _decomp_t *decs; + l = 0; - r = decomps_used - 1; + r = (compat ? kdecomps_used : decomps_used) - 1; + decs = compat ? kdecomps : decomps; while (l <= r) { m = (l + r) >> 1; - if (code > decomps[m].code) + if (code > decs[m].code) l = m + 1; - else if (code < decomps[m].code) + else if (code < decs[m].code) r = m - 1; else - return &decomps[m]; + return &decs[m]; } return 0; } static void -decomp_it(_decomp_t *d) +decomp_it(_decomp_t *d, short compat) { unsigned long i; _decomp_t *dp; for (i = 0; i < d->used; i++) { - if ((dp = find_decomp(d->decomp[i])) != 0) - decomp_it(dp); + if ((dp = find_decomp(d->decomp[i], compat)) != 0) + decomp_it(dp, compat); else dectmp[dectmp_size++] = d->decomp[i]; } @@ -1095,9 +1128,16 @@ expand_decomp(void) for (i = 0; i < decomps_used; i++) { dectmp_size = 0; - decomp_it(&decomps[i]); + decomp_it(&decomps[i], 0); if (dectmp_size > 0) - add_decomp(decomps[i].code); + add_decomp(decomps[i].code, 0); + } + + for (i = 0; i < kdecomps_used; i++) { + dectmp_size = 0; + decomp_it(&kdecomps[i], 1); + if (dectmp_size > 0) + add_decomp(kdecomps[i].code, 1); } } @@ -1402,6 +1442,60 @@ write_cdata(char *opath) fclose(out); } + /* + * Open the kdecomp.dat file. + */ + sprintf(path, "%s%skdecomp.dat", opath, LDAP_DIRSEP); + if ((out = fopen(path, "wb")) == 0) + return; + + hdr[1] = kdecomps_used; + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write a temporary byte count which will be calculated as the + * decompositions are written out. + */ + bytes = 0; + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + if (kdecomps_used) { + /* + * Write the list of kdecomp nodes. + */ + for (i = idx = 0; i < kdecomps_used; i++) { + fwrite((char *) &kdecomps[i].code, sizeof(unsigned long), 1, out); + fwrite((char *) &idx, sizeof(unsigned long), 1, out); + idx += kdecomps[i].used; + } + + /* + * Write the sentinel index as the last decomp node. + */ + fwrite((char *) &idx, sizeof(unsigned long), 1, out); + + /* + * Write the decompositions themselves. + */ + for (i = 0; i < kdecomps_used; i++) + fwrite((char *) kdecomps[i].decomp, sizeof(unsigned long), + kdecomps[i].used, out); + + /* + * Seek back to the beginning and write the byte count. + */ + bytes = (sizeof(unsigned long) * idx) + + (sizeof(unsigned long) * ((hdr[1] << 1) + 1)); + fseek(out, sizeof(unsigned short) << 1, 0L); + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + fclose(out); + } + /***************************************************************** * * Generate the combining class data. diff --git a/libraries/liblunicode/ucstr.c b/libraries/liblunicode/ucstr.c index 18a882111d..bc46a95169 100644 --- a/libraries/liblunicode/ucstr.c +++ b/libraries/liblunicode/ucstr.c @@ -212,7 +212,7 @@ struct berval * UTF8bvnormalize( p++; } /* normalize ucs of length p - ucs */ - uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); + uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen ); if ( approx ) { for ( j = 0; j < ucsoutlen; j++ ) { if ( ucsout[j] < 0x80 ) { @@ -370,7 +370,7 @@ int UTF8bvnormcmp( return l1 > l2 ? 1 : -1; /* what to do??? */ } } else { - uccanondecomp( ucs, ulen, &ucsout1, &l1 ); + uccompatdecomp( ucs, ulen, &ucsout1, &l1 ); l1 = uccanoncomp( ucsout1, l1 ); } @@ -389,7 +389,7 @@ int UTF8bvnormcmp( ucsout2 = ucs; l2 = ulen; } else { - uccanondecomp( ucs, ulen, &ucsout2, &l2 ); + uccompatdecomp( ucs, ulen, &ucsout2, &l2 ); l2 = uccanoncomp( ucsout2, l2 ); free( ucs ); } -- 2.39.5