2 * Copyright 1999 Computing Research Labs, New Mexico State University
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
18 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
19 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
20 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
 * RCS identification string kept in the object file.
 * NOTE(review): two definitions of rcsid follow each other; the first adds
 * the GCC "unused" attribute. Presumably they are alternatives chosen by a
 * preprocessor conditional that is not visible here -- confirm.
 */
24 static char rcsid[] __attribute__ ((unused)) = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $";
26 static char rcsid[] = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $";
34 #include <ac/string.h>
35 #include <ac/unistd.h>
40 /**************************************************************************
42 * Miscellaneous types, data, and support functions.
44 **************************************************************************/
/*
 * NOTE(review): presumably the member the case-mapping loader reads as
 * hdr.size.len[0] and hdr.size.len[1]; the enclosing declaration is not
 * visible here -- confirm.
 */
51 unsigned short len[2];
56 * A simple array of 32-bit masks for lookup.
/*
 * masks32[i] is the value with only bit i set, for i = 0..31.
 * ucisprop() indexes this table to test single bits of its mask arguments.
 */
58 static unsigned long masks32[32] = {
59 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
60 0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
61 0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
62 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
63 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
64 0x40000000, 0x80000000
/*
 * Byte-swapping helpers, applied when a data file's byte-order mark reads
 * back as 0xfffe (the file was written with the opposite byte order).
 *
 * endian_short(cc) exchanges the two low-order bytes of a 16-bit quantity.
 * endian_long(cc)  reverses the four low-order bytes of a 32-bit quantity.
 *
 * Every byte is masked with 0xff after shifting.  Without the mask, a
 * signed argument (the numeric value table is an array of short) would
 * drag its sign bits into the result through the right shift, and an
 * argument held in a type wider than the quantity being swapped would leak
 * its upper bits.  The argument is evaluated more than once, so it must not
 * have side effects.
 */
#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
                         ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
/*
 * Open `filename` with fopen() mode `mode`, trying locations taken from
 * `paths`.
 *
 * paths    - list of locations; entries are delimited by ':'.
 * filename - name of the file to open; a null pointer or an empty string
 *            is rejected up front.
 * mode     - passed unchanged to fopen().
 *
 * NOTE(review): the candidate name is assembled in the fixed-size buffer
 * `path` (BUFSIZ bytes). No length check is visible here -- confirm that
 * an over-long entry in `paths` cannot overflow it.
 */
72 _ucopenfile(char *paths, char *filename, char *mode)
75 char *fp, *dp, *pp, path[BUFSIZ];
77 if (filename == 0 || *filename == 0)
/* Advance dp to the end of the current entry of the path list. */
83 while (*dp && *dp != ':')
/* A successful fopen() of the assembled name ends the search. */
92 if ((f = fopen(path, mode)) != 0)
102 /**************************************************************************
104 * Support for the character properties.
106 **************************************************************************/
/*
 * Character property tables, filled in by _ucprop_load():
 *   _ucprop_size    - header count field; 0 means nothing is loaded.
 *   _ucprop_offsets - start of the single malloc'ed block; per-property
 *                     start indexes into the ranges, 0xffff for "none".
 *   _ucprop_ranges  - points into the same block, after the offsets.
 */
108 static unsigned long _ucprop_size;
109 static unsigned short *_ucprop_offsets;
110 static unsigned long *_ucprop_ranges;
/*
 * Load the character property tables from "ctype.dat", which is located
 * through _ucopenfile() using `paths`.
 *
 * paths  - search path string handed to _ucopenfile().
 * reload - consulted when tables are already loaded (_ucprop_size > 0).
 *          NOTE(review): the test on it is not visible here; presumably
 *          zero keeps the current data and non-zero frees and reloads --
 *          confirm.
 *
 * The header count becomes _ucprop_size and the header byte size is the
 * size of the one allocation that holds both offsets and ranges. A header
 * whose byte-order mark reads 0xfffe means the file has the opposite byte
 * order, so every value read is byte-swapped.
 *
 * NOTE(review): the fread() results are not used, and no check of the
 * malloc() result is visible; a short or missing file would leave the
 * tables partly filled -- confirm how callers detect that.
 */
113 _ucprop_load(char *paths, int reload)
116 unsigned long size, i;
119 if (_ucprop_size > 0) {
122 * The character properties have already been loaded.
127 * Unload the current character property data in preparation for
128 * loading a new copy. Only the first array has to be deallocated
129 * because all the memory for the arrays is allocated as a single
132 free((char *) _ucprop_offsets);
136 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
142 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
144 if (hdr.bom == 0xfffe) {
145 hdr.cnt = endian_short(hdr.cnt);
146 hdr.size.bytes = endian_long(hdr.size.bytes);
149 if ((_ucprop_size = hdr.cnt) == 0) {
155 * Allocate all the storage needed for the lookup table.
157 _ucprop_offsets = (unsigned short *) malloc(hdr.size.bytes);
160 * Calculate the offset into the storage for the ranges. The offsets
161 * array is on a 4-byte boundary and one larger than the value provided in
162 * the header count field. This means the offset to the ranges must be
163 * calculated after aligning the count to a 4-byte boundary.
/* size is a byte count here: (cnt + 1) shorts, rounded up to a multiple of 4. */
165 if ((size = ((hdr.cnt + 1) * sizeof(unsigned short))) & 3)
166 size += 4 - (size & 3);
/*
 * NOTE(review): the addition below is pointer arithmetic on
 * unsigned short *, and the fread() after it passes size as a number of
 * shorts. Confirm that size has been converted from bytes to a count of
 * shorts before this point.
 */
168 _ucprop_ranges = (unsigned long *) (_ucprop_offsets + size);
171 * Load the offset array.
173 fread((char *) _ucprop_offsets, sizeof(unsigned short), size, in);
176 * Do an endian swap if necessary. Don't forget there is an extra node on
177 * the end with the final index.
179 if (hdr.bom == 0xfffe) {
180 for (i = 0; i <= _ucprop_size; i++)
181 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]);
185 * Load the ranges. The number of elements is in the last array position
/* NOTE(review): assumes each on-disk range value is sizeof(unsigned long) bytes -- TODO confirm where long is 64 bits. */
188 fread((char *) _ucprop_ranges, sizeof(unsigned long),
189 _ucprop_offsets[_ucprop_size], in);
194 * Do an endian swap if necessary.
196 if (hdr.bom == 0xfffe) {
197 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++)
198 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]);
/*
 * Release the character property tables. A size of 0 means nothing is
 * loaded. Freeing _ucprop_offsets releases the ranges as well, because
 * both live in one allocation.
 * NOTE(review): resetting _ucprop_size afterwards is not visible here --
 * confirm, since the loader treats a non-zero size as "loaded".
 */
205 if (_ucprop_size == 0)
209 * Only need to free the offsets because the memory is allocated as a
212 free((char *) _ucprop_offsets);
/*
 * Test whether `code` lies inside one of the ranges recorded for property
 * number `n`.
 *
 * code - character code to look for.
 * n    - property index into _ucprop_offsets.
 *
 * The ranges are (low, high) pairs. The search runs between the start
 * offset of property n and one element before the start offset of the next
 * property that has ranges.
 * NOTE(review): `n` is used as an index without a visible bounds check,
 * and the tables are assumed to be loaded -- confirm against callers.
 */
217 _ucprop_lookup(unsigned long code, unsigned long n)
222 * There is an extra node on the end of the offsets to allow this routine
223 * to work right. If the index is 0xffff, then there are no nodes for the
226 if ((l = _ucprop_offsets[n]) == 0xffff)
230 * Locate the next offset that is not 0xffff. The sentinel at the end of
231 * the array is the max index value.
234 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ;
/* Right end of the search: the last element before the next property's ranges. */
236 r = _ucprop_offsets[n + m] - 1;
240 * Determine a "mid" point and adjust to make sure the mid point is at
241 * the beginning of a range pair.
/* m indexes the low end of a pair; m + 1 is its high end. */
245 if (code > _ucprop_ranges[m + 1])
247 else if (code < _ucprop_ranges[m])
249 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
/*
 * Check `code` against the properties selected by two bit masks.
 *
 * code  - character code to test.
 * mask1 - bit i selects property index i, for i = 0..31.
 * mask2 - bit (i & 31) selects property index i, for i = 32 and up to
 *         _ucprop_size - 1.
 *
 * Two empty masks are handled before any lookup is made. Each selected
 * property is tested with _ucprop_lookup().
 * NOTE(review): the first loop is bounded by 32 rather than by
 * _ucprop_size; confirm the tables are loaded and always hold at least 32
 * properties before this is called.
 */
256 ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2)
260 if (mask1 == 0 && mask2 == 0)
263 for (i = 0; mask1 && i < 32; i++) {
264 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
268 for (i = 32; mask2 && i < _ucprop_size; i++) {
269 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
276 /**************************************************************************
278 * Support for case mapping.
280 **************************************************************************/
/*
 * Case mapping table, filled in by _uccase_load():
 *   _uccase_size - number of unsigned long elements in the map (header
 *                  count times 3); 0 means nothing is loaded.
 *   _uccase_len  - the two header lengths, each times 3; the case
 *                  conversion routines use them to bound their searches.
 *   _uccase_map  - the malloc'ed map, searched in steps of three elements.
 */
282 static unsigned long _uccase_size;
283 static unsigned short _uccase_len[2];
284 static unsigned long *_uccase_map;
/*
 * Load the case mapping table from "case.dat", which is located through
 * _ucopenfile() using `paths`.
 *
 * paths  - search path string handed to _ucopenfile().
 * reload - consulted when a table is already loaded (_uccase_size > 0).
 *          NOTE(review): the test on it is not visible here; presumably
 *          non-zero frees the current map and loads again -- confirm.
 *
 * The header count and the two header lengths are multiplied by 3 so that
 * they count unsigned long elements rather than triples.
 * NOTE(review): the fread() results are not used and no check of the
 * malloc() result is visible -- confirm.
 */
287 _uccase_load(char *paths, int reload)
293 if (_uccase_size > 0) {
296 * The case mappings have already been loaded.
300 free((char *) _uccase_map);
304 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
310 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
/* A byte-order mark that reads 0xfffe means the file needs byte swapping. */
312 if (hdr.bom == 0xfffe) {
313 hdr.cnt = endian_short(hdr.cnt);
314 hdr.size.len[0] = endian_short(hdr.size.len[0]);
315 hdr.size.len[1] = endian_short(hdr.size.len[1]);
319 * Set the node count and lengths of the upper and lower case mapping
322 _uccase_size = hdr.cnt * 3;
/* NOTE(review): the products are stored in unsigned short -- confirm they cannot exceed 65535. */
323 _uccase_len[0] = hdr.size.len[0] * 3;
324 _uccase_len[1] = hdr.size.len[1] * 3;
326 _uccase_map = (unsigned long *)
327 malloc(_uccase_size * sizeof(unsigned long));
330 * Load the case mapping table.
332 fread((char *) _uccase_map, sizeof(unsigned long), _uccase_size, in);
335 * Do an endian swap if necessary.
337 if (hdr.bom == 0xfffe) {
338 for (i = 0; i < _uccase_size; i++)
339 _uccase_map[i] = endian_long(_uccase_map[i]);
/*
 * Release the case mapping table. A size of 0 means nothing is loaded.
 * NOTE(review): resetting _uccase_size afterwards is not visible here --
 * confirm.
 */
346 if (_uccase_size == 0)
349 free((char *) _uccase_map);
/*
 * Binary search of the case map for `code`.
 *
 * code  - character code to find; compared with the first element of each
 *         triple.
 * l, r  - left and right element indexes that bound the search.
 * field - offset added to the index of the matching triple to pick the
 *         element that is returned.
 *
 * Returns _uccase_map[m + field] for the triple whose first element equals
 * `code`.
 * NOTE(review): the result for a code that is not found is not visible
 * here -- presumably `code` itself; confirm.
 */
354 _uccase_lookup(unsigned long code, long l, long r, int field)
359 * Do the binary search.
363 * Determine a "mid" point and adjust to make sure the mid point is at
364 * the beginning of a case mapping triple.
368 if (code > _uccase_map[m])
370 else if (code < _uccase_map[m])
372 else if (code == _uccase_map[m])
373 return _uccase_map[m + field];
/*
 * Map `code` to upper case through the case map.
 *
 * A lower case character is searched for in the region that ends
 * _uccase_len[1] elements after l. Any other character that reaches the
 * second branch is treated as title case and searched for from element
 * _uccase_len[0] + _uccase_len[1] to the end of the map. Bounds are
 * "- 3" so that r names the first element of the last triple.
 * NOTE(review): the assignments to `field` and to `l` in the first branch,
 * and any early exit for characters that need no mapping, are not visible
 * here.
 */
380 uctoupper(unsigned long code)
388 if (ucislower(code)) {
390 * The character is lower case.
394 r = (l + _uccase_len[1]) - 3;
397 * The character is title case.
400 l = _uccase_len[0] + _uccase_len[1];
401 r = _uccase_size - 3;
403 return _uccase_lookup(code, l, r, field);
/*
 * Map `code` to lower case through the case map.
 *
 * An upper case character is searched for in the region whose last triple
 * starts at element _uccase_len[0] - 3. Any other character that reaches
 * the second branch is treated as title case and searched for from element
 * _uccase_len[0] + _uccase_len[1] to the end of the map.
 * NOTE(review): the assignments to `field` and to `l` in the first branch,
 * and any early exit for characters that need no mapping, are not visible
 * here.
 */
407 uctolower(unsigned long code)
415 if (ucisupper(code)) {
417 * The character is upper case.
421 r = _uccase_len[0] - 3;
424 * The character is title case.
427 l = _uccase_len[0] + _uccase_len[1];
428 r = _uccase_size - 3;
430 return _uccase_lookup(code, l, r, field);
/*
 * Map `code` to title case through the case map.
 *
 * An upper case character is searched for in the region whose last triple
 * starts at element _uccase_len[0] - 3. Otherwise the character is treated
 * as lower case and searched for in the region that ends _uccase_len[1]
 * elements after l.
 * NOTE(review): the assignments to `field` and `l`, and any early exit for
 * characters that need no mapping, are not visible here.
 */
434 uctotitle(unsigned long code)
443 * The offset will always be the same for converting to title case.
447 if (ucisupper(code)) {
449 * The character is upper case.
452 r = _uccase_len[0] - 3;
455 * The character is lower case.
458 r = (l + _uccase_len[1]) - 3;
460 return _uccase_lookup(code, l, r, field);
463 /**************************************************************************
465 * Support for decompositions.
467 **************************************************************************/
/*
 * Decomposition tables, filled in by _ucdcmp_load():
 *   _ucdcmp_size   - header count times 2; 0 means nothing is loaded.
 *   _ucdcmp_nodes  - start of the single malloc'ed block; searched as
 *                    (code, offset) pairs.
 *   _ucdcmp_decomp - points into the same block, _ucdcmp_size + 1 elements
 *                    after the start.
 */
469 static unsigned long _ucdcmp_size;
470 static unsigned long *_ucdcmp_nodes;
471 static unsigned long *_ucdcmp_decomp;
/*
 * Load the decomposition tables from "decomp.dat", which is located
 * through _ucopenfile() using `paths`.
 *
 * paths  - search path string handed to _ucopenfile().
 * reload - consulted when tables are already loaded (_ucdcmp_size > 0).
 *          NOTE(review): the test on it is not visible here; presumably
 *          non-zero frees the current data and loads again -- confirm.
 *
 * The whole data area is read in one fread() into one allocation, and is
 * byte-swapped as a whole when the byte-order mark reads 0xfffe.
 * NOTE(review): the fread() results are not used and no check of the
 * malloc() result is visible -- confirm.
 */
474 _ucdcmp_load(char *paths, int reload)
477 unsigned long size, i;
480 if (_ucdcmp_size > 0) {
483 * The decompositions have already been loaded.
487 free((char *) _ucdcmp_nodes);
491 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
497 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
499 if (hdr.bom == 0xfffe) {
500 hdr.cnt = endian_short(hdr.cnt);
501 hdr.size.bytes = endian_long(hdr.size.bytes);
/* Two elements per header entry; the decompositions start one element past them. */
504 _ucdcmp_size = hdr.cnt << 1;
505 _ucdcmp_nodes = (unsigned long *) malloc(hdr.size.bytes);
506 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1);
509 * Read the decomposition data in.
/* NOTE(review): assumes each on-disk value is sizeof(unsigned long) bytes -- TODO confirm where long is 64 bits. */
511 size = hdr.size.bytes / sizeof(unsigned long);
512 fread((char *) _ucdcmp_nodes, sizeof(unsigned long), size, in);
515 * Do an endian swap if necessary.
517 if (hdr.bom == 0xfffe) {
518 for (i = 0; i < size; i++)
519 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]);
/*
 * Release the decomposition tables. A size of 0 means nothing is loaded.
 * Freeing _ucdcmp_nodes releases the decompositions as well, because both
 * live in one allocation.
 * NOTE(review): resetting _ucdcmp_size afterwards is not visible here --
 * confirm.
 */
526 if (_ucdcmp_size == 0)
530 * Only need to free the offsets because the memory is allocated as a
533 free((char *) _ucdcmp_nodes);
/*
 * Look up the decomposition of `code`.
 *
 * code   - character code to find.
 * num    - on a match, receives the number of elements in the
 *          decomposition.
 * decomp - on a match, receives a pointer into the loaded table; the
 *          caller must not free it.
 *
 * The length is the difference between the offset stored with the next
 * pair (m + 3) and the offset stored with the matching pair (m + 1).
 * NOTE(review): the right search bound is read from the table itself (the
 * element at index _ucdcmp_size) rather than derived from the node count --
 * confirm that this is the intended bound.
 */
538 ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
543 r = _ucdcmp_nodes[_ucdcmp_size] - 1;
547 * Determine a "mid" point and adjust to make sure the mid point is at
548 * the beginning of a code+offset pair.
552 if (code > _ucdcmp_nodes[m])
554 else if (code < _ucdcmp_nodes[m])
556 else if (code == _ucdcmp_nodes[m]) {
557 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1];
558 *decomp = &_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
/*
 * Decompose a Hangul syllable arithmetically.
 *
 * code   - character code; anything ucishangul() rejects is not decomposed.
 * num    - receives 3, or 2 when the third element equals 0x11a7.
 * decomp - caller-supplied array with room for at least three elements;
 *          all three slots are written even when *num is 2.
 *
 * NOTE(review): the divisions by 588 and 28 only make sense on an index
 * relative to the first syllable; presumably `code` is rebased on a line
 * that is not visible here -- confirm.
 */
566 ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
568 if (!ucishangul(code))
572 decomp[0] = 0x1100 + (unsigned long) (code / 588);
573 decomp[1] = 0x1161 + (unsigned long) ((code % 588) / 28);
574 decomp[2] = 0x11a7 + (unsigned long) (code % 28);
575 *num = (decomp[2] != 0x11a7) ? 3 : 2;
580 /**************************************************************************
582 * Support for combining classes.
584 **************************************************************************/
/*
 * Combining class table, filled in by _uccmcl_load():
 *   _uccmcl_size  - number of unsigned long elements (header count times
 *                   3); 0 means nothing is loaded.
 *   _uccmcl_nodes - the malloc'ed table, searched as triples of
 *                   (first code, last code, class).
 */
586 static unsigned long _uccmcl_size;
587 static unsigned long *_uccmcl_nodes;
/*
 * Load the combining class table from "cmbcl.dat", which is located
 * through _ucopenfile() using `paths`.
 *
 * paths  - search path string handed to _ucopenfile().
 * reload - consulted when a table is already loaded (_uccmcl_size > 0).
 *          NOTE(review): the test on it is not visible here; presumably
 *          non-zero frees the current table and loads again -- confirm.
 *
 * NOTE(review): the allocation is sized from the header byte size while
 * the read is sized from the header count times 3 elements; confirm the
 * two always agree. The fread() results are not used and no check of the
 * malloc() result is visible.
 */
590 _uccmcl_load(char *paths, int reload)
596 if (_uccmcl_size > 0) {
599 * The combining classes have already been loaded.
603 free((char *) _uccmcl_nodes);
607 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
613 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
/* A byte-order mark that reads 0xfffe means the file needs byte swapping. */
615 if (hdr.bom == 0xfffe) {
616 hdr.cnt = endian_short(hdr.cnt);
617 hdr.size.bytes = endian_long(hdr.size.bytes);
620 _uccmcl_size = hdr.cnt * 3;
621 _uccmcl_nodes = (unsigned long *) malloc(hdr.size.bytes);
624 * Read the combining classes in.
626 fread((char *) _uccmcl_nodes, sizeof(unsigned long), _uccmcl_size, in);
629 * Do an endian swap if necessary.
631 if (hdr.bom == 0xfffe) {
632 for (i = 0; i < _uccmcl_size; i++)
633 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]);
/*
 * Release the combining class table. A size of 0 means nothing is loaded.
 * NOTE(review): resetting _uccmcl_size afterwards is not visible here --
 * confirm.
 */
640 if (_uccmcl_size == 0)
643 free((char *) _uccmcl_nodes);
/*
 * Return the combining class recorded for `code`.
 *
 * The table is searched as triples: element m is the first code of a
 * range, m + 1 the last code, and m + 2 the class returned for any code
 * inside the range.
 * NOTE(review): the result for a code outside every range is not visible
 * here -- presumably 0; confirm.
 */
648 uccombining_class(unsigned long code)
653 r = _uccmcl_size - 1;
658 if (code > _uccmcl_nodes[m + 1])
660 else if (code < _uccmcl_nodes[m])
662 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1])
663 return _uccmcl_nodes[m + 2];
668 /**************************************************************************
670 * Support for numeric values.
672 **************************************************************************/
/*
 * Numeric value tables, filled in by _ucnumb_load():
 *   _ucnum_nodes - start of the single malloc'ed block; searched as
 *                  (code, value index) pairs.
 *   _ucnum_size  - header count field; 0 means nothing is loaded.
 *   _ucnum_vals  - points into the same block, _ucnum_size elements after
 *                  the start; holds signed 16-bit values read two at a
 *                  time as numerator and denominator.
 */
674 static unsigned long *_ucnum_nodes;
675 static unsigned long _ucnum_size;
676 static short *_ucnum_vals;
/*
 * Load the numeric value tables from "num.dat", which is located through
 * _ucopenfile() using `paths`.
 *
 * paths  - search path string handed to _ucopenfile().
 * reload - consulted when tables are already loaded (_ucnum_size > 0).
 *          NOTE(review): the test on it is not visible here; presumably
 *          non-zero frees the current data and loads again -- confirm.
 *
 * The data area is read byte-wise in one fread() into one allocation. When
 * the byte-order mark reads 0xfffe the nodes are swapped as 32-bit values
 * and the values as 16-bit values.
 * NOTE(review): the fread() results are not used and no check of the
 * malloc() result is visible -- confirm.
 */
679 _ucnumb_load(char *paths, int reload)
682 unsigned long size, i;
685 if (_ucnum_size > 0) {
688 * The numbers have already been loaded.
692 free((char *) _ucnum_nodes);
696 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
702 fread((char *) &hdr, sizeof(_ucheader_t), 1, in);
704 if (hdr.bom == 0xfffe) {
705 hdr.cnt = endian_short(hdr.cnt);
706 hdr.size.bytes = endian_long(hdr.size.bytes);
709 _ucnum_size = hdr.cnt;
710 _ucnum_nodes = (unsigned long *) malloc(hdr.size.bytes);
/* The values follow the nodes inside the same allocation. */
711 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size);
/* NOTE(review): the next comment is stale; this reads the number data, not combining classes. */
714 * Read the combining classes in.
716 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in);
719 * Do an endian swap if necessary.
721 if (hdr.bom == 0xfffe) {
722 for (i = 0; i < _ucnum_size; i++)
723 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]);
726 * Determine the number of values that have to be adjusted.
/*
 * NOTE(review): the values start _ucnum_size unsigned longs into the
 * block, yet this subtracts twice that many bytes from the total. Confirm
 * which of the two is right; if the offset above is, some trailing values
 * are left unswapped.
 */
728 size = (hdr.size.bytes -
729 (_ucnum_size * (sizeof(unsigned long) << 1))) /
732 for (i = 0; i < size; i++)
733 _ucnum_vals[i] = endian_short(_ucnum_vals[i]);
/*
 * Release the numeric value tables. A size of 0 means nothing is loaded.
 * Freeing _ucnum_nodes releases the values as well, because both live in
 * one allocation.
 * NOTE(review): resetting _ucnum_size afterwards is not visible here --
 * confirm.
 */
740 if (_ucnum_size == 0)
743 free((char *) _ucnum_nodes);
/*
 * Look up the numeric value of `code`.
 *
 * code - character code to find among the (code, value index) pairs.
 * num  - on a match, receives the numerator and the denominator, read from
 *        two consecutive entries of _ucnum_vals.
 *
 * The test driver in this file uses the result as a truth value: non-zero
 * when `code` has a numeric value.
 */
748 ucnumber_lookup(unsigned long code, struct ucnumber *num)
757 * Determine a "mid" point and adjust to make sure the mid point is at
758 * the beginning of a code+offset pair.
762 if (code > _ucnum_nodes[m])
764 else if (code < _ucnum_nodes[m])
/* Match: the element after the code is the index of the value pair. */
767 vp = _ucnum_vals + _ucnum_nodes[m + 1];
768 num->numerator = (int) *vp++;
769 num->denominator = (int) *vp;
/*
 * Look up the digit value of `code`.
 *
 * code  - character code to find among the (code, value index) pairs.
 * digit - output parameter for the digit value.
 *
 * A match only counts when the two stored values (numerator and
 * denominator) are equal.
 * NOTE(review): the store through `digit` and the return values are not
 * visible here; the test driver uses the result as a truth value.
 */
777 ucdigit_lookup(unsigned long code, int *digit)
786 * Determine a "mid" point and adjust to make sure the mid point is at
787 * the beginning of a code+offset pair.
791 if (code > _ucnum_nodes[m])
793 else if (code < _ucnum_nodes[m])
796 vp = _ucnum_vals + _ucnum_nodes[m + 1];
797 if (*vp == *(vp + 1)) {
/*
 * Convenience wrapper around ucnumber_lookup() that hands back the number
 * by value (the test driver assigns the result to a struct ucnumber).
 *
 * Both fields are preset to -111 and the lookup result is discarded, so a
 * caller cannot tell "not a number" from a real value of -111/-111.
 */
808 ucgetnumber(unsigned long code)
813 * Initialize with some arbitrary value, because the caller simply cannot
814 * tell for sure if the code is a number without calling the ucisnumber()
815 * macro before calling this function.
817 num.numerator = num.denominator = -111;
819 (void) ucnumber_lookup(code, &num);
/*
 * Convenience wrapper around ucdigit_lookup() that hands back the digit
 * by value (the test driver assigns the result to an int).
 *
 * The lookup result is discarded.
 * NOTE(review): the preset given to `dig` before the lookup is not visible
 * here -- presumably an arbitrary marker, as in ucgetnumber(); confirm.
 */
825 ucgetdigit(unsigned long code)
830 * Initialize with some arbitrary value, because the caller simply cannot
831 * tell for sure if the code is a number without calling the ucisdigit()
832 * macro before calling this function.
836 (void) ucdigit_lookup(code, &dig);
841 /**************************************************************************
843 * Setup and cleanup routines.
845 **************************************************************************/
/*
 * Load the data tables selected by `masks`.
 *
 * paths - search path string passed on to each loader.
 * masks - bitwise OR of the UCDATA_* flags; each flag that is set runs the
 *         matching loader.
 *
 * Every loader is called with reload = 0.
 * NOTE(review): no loader result is used here, so a failed load is not
 * reported to the caller through this routine as far as visible.
 */
848 ucdata_load(char *paths, int masks)
850 if (masks & UCDATA_CTYPE)
851 _ucprop_load(paths, 0);
852 if (masks & UCDATA_CASE)
853 _uccase_load(paths, 0);
854 if (masks & UCDATA_DECOMP)
855 _ucdcmp_load(paths, 0);
856 if (masks & UCDATA_CMBCL)
857 _uccmcl_load(paths, 0);
858 if (masks & UCDATA_NUM)
859 _ucnumb_load(paths, 0);
/*
 * Release the data tables selected by `masks`.
 *
 * masks - bitwise OR of the UCDATA_* flags.
 *
 * NOTE(review): the statement run for each flag is not visible here --
 * presumably the matching unload routine; confirm.
 */
863 ucdata_unload(int masks)
865 if (masks & UCDATA_CTYPE)
867 if (masks & UCDATA_CASE)
869 if (masks & UCDATA_DECOMP)
871 if (masks & UCDATA_CMBCL)
873 if (masks & UCDATA_NUM)
/*
 * Reload the data tables selected by `masks`.
 *
 * paths - search path string passed on to each loader.
 * masks - bitwise OR of the UCDATA_* flags; each flag that is set runs the
 *         matching loader.
 *
 * Every loader is called with reload = 1, the only difference from
 * ucdata_load().
 */
878 ucdata_reload(char *paths, int masks)
880 if (masks & UCDATA_CTYPE)
881 _ucprop_load(paths, 1);
882 if (masks & UCDATA_CASE)
883 _uccase_load(paths, 1);
884 if (masks & UCDATA_DECOMP)
885 _ucdcmp_load(paths, 1);
886 if (masks & UCDATA_CMBCL)
887 _uccmcl_load(paths, 1);
888 if (masks & UCDATA_NUM)
889 _ucnumb_load(paths, 1);
898 unsigned long i, lo, *dec;
906 printf("NOT WEAK\n");
908 printf("LOWER 0x%04lX\n", uctolower(0xff3a));
909 printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
911 if (ucisalpha(0x1d5))
914 printf("NOT ALPHA\n");
916 if (ucisupper(0x1d5)) {
918 lo = uctolower(0x1d5);
919 printf("0x%04lx\n", lo);
920 lo = uctotitle(0x1d5);
921 printf("0x%04lx\n", lo);
923 printf("NOT UPPER\n");
925 if (ucistitle(0x1d5))
928 printf("NOT TITLE\n");
930 if (uciscomposite(0x1d5))
931 printf("COMPOSITE\n");
933 printf("NOT COMPOSITE\n");
935 if (ucdecomp(0x1d5, &lo, &dec)) {
936 for (i = 0; i < lo; i++)
937 printf("0x%04lx ", dec[i]);
941 if ((lo = uccombining_class(0x41)) != 0)
942 printf("0x41 CCL %ld\n", lo);
944 if (ucisxdigit(0xfeff))
945 printf("0xFEFF HEX DIGIT\n");
947 printf("0xFEFF NOT HEX DIGIT\n");
949 if (ucisdefined(0x10000))
950 printf("0x10000 DEFINED\n");
952 printf("0x10000 NOT DEFINED\n");
954 if (ucnumber_lookup(0x30, &num)) {
955 if (num.numerator != num.denominator)
956 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
958 printf("UCNUMBER: 0x30 = %d\n", num.numerator);
960 printf("UCNUMBER: 0x30 NOT A NUMBER\n");
962 if (ucnumber_lookup(0xbc, &num)) {
963 if (num.numerator != num.denominator)
964 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
966 printf("UCNUMBER: 0xbc = %d\n", num.numerator);
968 printf("UCNUMBER: 0xbc NOT A NUMBER\n");
971 if (ucnumber_lookup(0xff19, &num)) {
972 if (num.numerator != num.denominator)
973 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
975 printf("UCNUMBER: 0xff19 = %d\n", num.numerator);
977 printf("UCNUMBER: 0xff19 NOT A NUMBER\n");
979 if (ucnumber_lookup(0x4e00, &num)) {
980 if (num.numerator != num.denominator)
981 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator);
983 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator);
985 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n");
987 if (ucdigit_lookup(0x06f9, &dig))
988 printf("UCDIGIT: 0x6f9 = %d\n", dig);
990 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
992 dig = ucgetdigit(0x0969);
993 printf("UCGETDIGIT: 0x969 = %d\n", dig);
995 num = ucgetnumber(0x30);
996 if (num.numerator != num.denominator)
997 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator);
999 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator);
1001 num = ucgetnumber(0xbc);
1002 if (num.numerator != num.denominator)
1003 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator);
1005 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator);
1007 num = ucgetnumber(0xff19);
1008 if (num.numerator != num.denominator)
1009 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator);
1011 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator);