From fe98d9fa7b313ffe51f09ea175e5126200793bcd Mon Sep 17 00:00:00 2001 From: Kurt Zeilenga Date: Tue, 25 Jan 2000 22:38:34 +0000 Subject: [PATCH] Initial revision --- libraries/liblunicode/ucdata/MUTTUCData.txt | 303 +++ libraries/liblunicode/ucdata/README | 300 +++ libraries/liblunicode/ucdata/UCData.java | 935 +++++++ libraries/liblunicode/ucdata/UCDataTest.java | 94 + libraries/liblunicode/ucdata/api.txt | 343 +++ libraries/liblunicode/ucdata/bidiapi.txt | 84 + libraries/liblunicode/ucdata/format.txt | 243 ++ libraries/liblunicode/ucdata/ucdata.c | 1161 +++++++++ libraries/liblunicode/ucdata/ucdata.h | 306 +++ libraries/liblunicode/ucdata/ucdata.man | 464 ++++ libraries/liblunicode/ucdata/ucgendat.c | 1485 +++++++++++ libraries/liblunicode/ucdata/ucpgba.c | 813 ++++++ libraries/liblunicode/ucdata/ucpgba.h | 162 ++ libraries/liblunicode/ucdata/ucpgba.man | 97 + libraries/liblunicode/ure/README | 212 ++ libraries/liblunicode/ure/ure.c | 2304 ++++++++++++++++++ libraries/liblunicode/ure/ure.h | 150 ++ libraries/liblunicode/ure/urestubs.c | 64 + libraries/liblunicode/utbm/README | 121 + libraries/liblunicode/utbm/utbm.c | 497 ++++ libraries/liblunicode/utbm/utbm.h | 109 + libraries/liblunicode/utbm/utbmstub.c | 125 + 22 files changed, 10372 insertions(+) create mode 100644 libraries/liblunicode/ucdata/MUTTUCData.txt create mode 100644 libraries/liblunicode/ucdata/README create mode 100644 libraries/liblunicode/ucdata/UCData.java create mode 100644 libraries/liblunicode/ucdata/UCDataTest.java create mode 100644 libraries/liblunicode/ucdata/api.txt create mode 100644 libraries/liblunicode/ucdata/bidiapi.txt create mode 100644 libraries/liblunicode/ucdata/format.txt create mode 100644 libraries/liblunicode/ucdata/ucdata.c create mode 100644 libraries/liblunicode/ucdata/ucdata.h create mode 100644 libraries/liblunicode/ucdata/ucdata.man create mode 100644 libraries/liblunicode/ucdata/ucgendat.c create mode 100644 libraries/liblunicode/ucdata/ucpgba.c create mode 100644 
libraries/liblunicode/ucdata/ucpgba.h create mode 100644 libraries/liblunicode/ucdata/ucpgba.man create mode 100644 libraries/liblunicode/ure/README create mode 100644 libraries/liblunicode/ure/ure.c create mode 100644 libraries/liblunicode/ure/ure.h create mode 100644 libraries/liblunicode/ure/urestubs.c create mode 100644 libraries/liblunicode/utbm/README create mode 100644 libraries/liblunicode/utbm/utbm.c create mode 100644 libraries/liblunicode/utbm/utbm.h create mode 100644 libraries/liblunicode/utbm/utbmstub.c diff --git a/libraries/liblunicode/ucdata/MUTTUCData.txt b/libraries/liblunicode/ucdata/MUTTUCData.txt new file mode 100644 index 0000000000..82c4659411 --- /dev/null +++ b/libraries/liblunicode/ucdata/MUTTUCData.txt @@ -0,0 +1,303 @@ +# +# $Id: MUTTUCData.txt,v 1.3 1999/10/29 00:04:35 mleisher Exp $ +# +# Copyright 1999 Computing Research Labs, New Mexico State University +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# +# +# Implementation specific character properties. +# +# +# Space, other. +# +0009;;Ss;;;;;;;;;;;; +000A;;Ss;;;;;;;;;;;; +000B;;Ss;;;;;;;;;;;; +000C;;Ss;;;;;;;;;;;; +000D;;Ss;;;;;;;;;;;; +# +# Non-breaking. +# +00A0;;Nb;;;;;;;;;;;; +2007;;Nb;;;;;;;;;;;; +2011;;Nb;;;;;;;;;;;; +FEFF;;Nb;;;;;;;;;;;; +# +# Symmetric. +# +0028;;Sy;;;;;;;;;;;; +0029;;Sy;;;;;;;;;;;; +005B;;Sy;;;;;;;;;;;; +005D;;Sy;;;;;;;;;;;; +007B;;Sy;;;;;;;;;;;; +007D;;Sy;;;;;;;;;;;; +00AB;;Sy;;;;;;;;;;;; +00BB;;Sy;;;;;;;;;;;; +0F3A;;Sy;;;;;;;;;;;; +0F3B;;Sy;;;;;;;;;;;; +0F3C;;Sy;;;;;;;;;;;; +0F3D;;Sy;;;;;;;;;;;; +0F3E;;Sy;;;;;;;;;;;; +0F3F;;Sy;;;;;;;;;;;; +2018;;Sy;;;;;;;;;;;; +2019;;Sy;;;;;;;;;;;; +201A;;Sy;;;;;;;;;;;; +201B;;Sy;;;;;;;;;;;; +201C;;Sy;;;;;;;;;;;; +201D;;Sy;;;;;;;;;;;; +201E;;Sy;;;;;;;;;;;; +201F;;Sy;;;;;;;;;;;; +2039;;Sy;;;;;;;;;;;; +203A;;Sy;;;;;;;;;;;; +2045;;Sy;;;;;;;;;;;; +2046;;Sy;;;;;;;;;;;; +207D;;Sy;;;;;;;;;;;; +207E;;Sy;;;;;;;;;;;; +208D;;Sy;;;;;;;;;;;; +208E;;Sy;;;;;;;;;;;; +2329;;Sy;;;;;;;;;;;; +232A;;Sy;;;;;;;;;;;; +3008;;Sy;;;;;;;;;;;; +3009;;Sy;;;;;;;;;;;; +300A;;Sy;;;;;;;;;;;; +300B;;Sy;;;;;;;;;;;; +300C;;Sy;;;;;;;;;;;; +300D;;Sy;;;;;;;;;;;; +300E;;Sy;;;;;;;;;;;; +300F;;Sy;;;;;;;;;;;; +3010;;Sy;;;;;;;;;;;; +3011;;Sy;;;;;;;;;;;; +3014;;Sy;;;;;;;;;;;; +3015;;Sy;;;;;;;;;;;; +3016;;Sy;;;;;;;;;;;; +3017;;Sy;;;;;;;;;;;; +3018;;Sy;;;;;;;;;;;; +3019;;Sy;;;;;;;;;;;; +301A;;Sy;;;;;;;;;;;; +301B;;Sy;;;;;;;;;;;; +301D;;Sy;;;;;;;;;;;; +301E;;Sy;;;;;;;;;;;; +301F;;Sy;;;;;;;;;;;; +FD3E;;Sy;;;;;;;;;;;; +FD3F;;Sy;;;;;;;;;;;; +FE35;;Sy;;;;;;;;;;;; +FE36;;Sy;;;;;;;;;;;; +FE37;;Sy;;;;;;;;;;;; +FE38;;Sy;;;;;;;;;;;; +FE39;;Sy;;;;;;;;;;;; +FE3A;;Sy;;;;;;;;;;;; +FE3B;;Sy;;;;;;;;;;;; +FE3C;;Sy;;;;;;;;;;;; +FE3D;;Sy;;;;;;;;;;;; +FE3E;;Sy;;;;;;;;;;;; +FE3F;;Sy;;;;;;;;;;;; +FE40;;Sy;;;;;;;;;;;; +FE41;;Sy;;;;;;;;;;;; +FE42;;Sy;;;;;;;;;;;; +FE43;;Sy;;;;;;;;;;;; +FE44;;Sy;;;;;;;;;;;; +FE59;;Sy;;;;;;;;;;;; +FE5A;;Sy;;;;;;;;;;;; +FE5B;;Sy;;;;;;;;;;;; +FE5C;;Sy;;;;;;;;;;;; +FE5D;;Sy;;;;;;;;;;;; 
+FE5E;;Sy;;;;;;;;;;;; +FF08;;Sy;;;;;;;;;;;; +FF09;;Sy;;;;;;;;;;;; +FF3B;;Sy;;;;;;;;;;;; +FF3D;;Sy;;;;;;;;;;;; +FF5B;;Sy;;;;;;;;;;;; +FF5D;;Sy;;;;;;;;;;;; +FF62;;Sy;;;;;;;;;;;; +FF63;;Sy;;;;;;;;;;;; +# +# Hex digit. +# +0030;;Hd;;;;;;;;;;;; +0031;;Hd;;;;;;;;;;;; +0032;;Hd;;;;;;;;;;;; +0033;;Hd;;;;;;;;;;;; +0034;;Hd;;;;;;;;;;;; +0035;;Hd;;;;;;;;;;;; +0036;;Hd;;;;;;;;;;;; +0037;;Hd;;;;;;;;;;;; +0038;;Hd;;;;;;;;;;;; +0039;;Hd;;;;;;;;;;;; +0041;;Hd;;;;;;;;;;;; +0042;;Hd;;;;;;;;;;;; +0043;;Hd;;;;;;;;;;;; +0044;;Hd;;;;;;;;;;;; +0045;;Hd;;;;;;;;;;;; +0046;;Hd;;;;;;;;;;;; +0061;;Hd;;;;;;;;;;;; +0062;;Hd;;;;;;;;;;;; +0063;;Hd;;;;;;;;;;;; +0064;;Hd;;;;;;;;;;;; +0065;;Hd;;;;;;;;;;;; +0066;;Hd;;;;;;;;;;;; +FF10;;Hd;;;;;;;;;;;; +FF11;;Hd;;;;;;;;;;;; +FF12;;Hd;;;;;;;;;;;; +FF13;;Hd;;;;;;;;;;;; +FF14;;Hd;;;;;;;;;;;; +FF15;;Hd;;;;;;;;;;;; +FF16;;Hd;;;;;;;;;;;; +FF17;;Hd;;;;;;;;;;;; +FF18;;Hd;;;;;;;;;;;; +FF19;;Hd;;;;;;;;;;;; +FF21;;Hd;;;;;;;;;;;; +FF22;;Hd;;;;;;;;;;;; +FF23;;Hd;;;;;;;;;;;; +FF24;;Hd;;;;;;;;;;;; +FF25;;Hd;;;;;;;;;;;; +FF26;;Hd;;;;;;;;;;;; +FF41;;Hd;;;;;;;;;;;; +FF42;;Hd;;;;;;;;;;;; +FF43;;Hd;;;;;;;;;;;; +FF44;;Hd;;;;;;;;;;;; +FF45;;Hd;;;;;;;;;;;; +FF46;;Hd;;;;;;;;;;;; +# +# Quote marks. 
+# +0022;;Qm;;;;;;;;;;;; +0027;;Qm;;;;;;;;;;;; +00AB;;Qm;;;;;;;;;;;; +00BB;;Qm;;;;;;;;;;;; +2018;;Qm;;;;;;;;;;;; +2019;;Qm;;;;;;;;;;;; +201A;;Qm;;;;;;;;;;;; +201B;;Qm;;;;;;;;;;;; +201C;;Qm;;;;;;;;;;;; +201D;;Qm;;;;;;;;;;;; +201E;;Qm;;;;;;;;;;;; +201F;;Qm;;;;;;;;;;;; +2039;;Qm;;;;;;;;;;;; +203A;;Qm;;;;;;;;;;;; +300C;;Qm;;;;;;;;;;;; +300D;;Qm;;;;;;;;;;;; +300E;;Qm;;;;;;;;;;;; +300F;;Qm;;;;;;;;;;;; +301D;;Qm;;;;;;;;;;;; +301E;;Qm;;;;;;;;;;;; +301F;;Qm;;;;;;;;;;;; +FE41;;Qm;;;;;;;;;;;; +FE42;;Qm;;;;;;;;;;;; +FE43;;Qm;;;;;;;;;;;; +FE44;;Qm;;;;;;;;;;;; +FF02;;Qm;;;;;;;;;;;; +FF07;;Qm;;;;;;;;;;;; +FF62;;Qm;;;;;;;;;;;; +FF63;;Qm;;;;;;;;;;;; +# +# Special Devanagari forms +# +E900;DEVANAGARI KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;; +E901;DEVANAGARI GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;; +E902;DEVANAGARI TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;; +E903;DEVANAGARI TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;; +E904;DEVANAGARI SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;; +E905;DEVANAGARI SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;; +E906;DEVANAGARI SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;; +E907;DEVANAGARI KRA LIGATURE;Lo;0;L;;;;;N;;;;; +E908;DEVANAGARI JRA LIGATURE;Lo;0;L;;;;;N;;;;; +E909;DEVANAGARI ZRA LIGATURE;Lo;0;L;;;;;N;;;;; +E90A;DEVANAGARI PHRA LIGATURE;Lo;0;L;;;;;N;;;;; +E90B;DEVANAGARI FRA LIGATURE;Lo;0;L;;;;;N;;;;; +E90C;DEVANAGARI PRA LIGATURE;Lo;0;L;;;;;N;;;;; +E90D;DEVANAGARI SRA LIGATURE;Lo;0;L;;;;;N;;;;; +E90E;DEVANAGARI RU LIGATURE;Lo;0;L;;;;;N;;;;; +E90F;DEVANAGARI RUU LIGATURE;Lo;0;L;;;;;N;;;;; +E915;DEVANAGARI HALF LETTER KA;Lo;0;L;;;;;N;;;;; +E916;DEVANAGARI HALF LETTER KHA;Lo;0;L;;;;;N;;;;; +E917;DEVANAGARI HALF LETTER GA;Lo;0;L;;;;;N;;;;; +E918;DEVANAGARI HALF LETTER GHA;Lo;0;L;;;;;N;;;;; +E919;DEVANAGARI HALF LETTER NGA;Lo;0;L;;;;;N;;;;; +E91A;DEVANAGARI HALF LETTER CA;Lo;0;L;;;;;N;;;;; +E91B;DEVANAGARI HALF LETTER CHA;Lo;0;L;;;;;N;;;;; +E91C;DEVANAGARI HALF LETTER JA;Lo;0;L;;;;;N;;;;; +E91D;DEVANAGARI HALF LETTER 
JHA;Lo;0;L;;;;;N;;;;; +E91E;DEVANAGARI HALF LETTER NYA;Lo;0;L;;;;;N;;;;; +E91F;DEVANAGARI HALF LETTER TTA;Lo;0;L;;;;;N;;;;; +E920;DEVANAGARI HALF LETTER TTHA;Lo;0;L;;;;;N;;;;; +E921;DEVANAGARI HALF LETTER DDA;Lo;0;L;;;;;N;;;;; +E922;DEVANAGARI HALF LETTER DDHA;Lo;0;L;;;;;N;;;;; +E923;DEVANAGARI HALF LETTER NNA;Lo;0;L;;;;;N;;;;; +E924;DEVANAGARI HALF LETTER TA;Lo;0;L;;;;;N;;;;; +E925;DEVANAGARI HALF LETTER THA;Lo;0;L;;;;;N;;;;; +E926;DEVANAGARI HALF LETTER DA;Lo;0;L;;;;;N;;;;; +E927;DEVANAGARI HALF LETTER DHA;Lo;0;L;;;;;N;;;;; +E928;DEVANAGARI HALF LETTER NA;Lo;0;L;;;;;N;;;;; +E929;DEVANAGARI HALF LETTER NNNA;Lo;0;L;0928 093C;;;;N;;;;; +E92A;DEVANAGARI HALF LETTER PA;Lo;0;L;;;;;N;;;;; +E92B;DEVANAGARI HALF LETTER PHA;Lo;0;L;;;;;N;;;;; +E92C;DEVANAGARI HALF LETTER BA;Lo;0;L;;;;;N;;;;; +E92D;DEVANAGARI HALF LETTER BHA;Lo;0;L;;;;;N;;;;; +E92E;DEVANAGARI HALF LETTER MA;Lo;0;L;;;;;N;;;;; +E92F;DEVANAGARI HALF LETTER YA;Lo;0;L;;;;;N;;;;; +E930;DEVANAGARI HALF LETTER RA;Lo;0;L;;;;;N;;;;; +E931;DEVANAGARI HALF LETTER RRA;Lo;0;L;0930 093C;;;;N;;;;; +E932;DEVANAGARI HALF LETTER LA;Lo;0;L;;;;;N;;;;; +E933;DEVANAGARI HALF LETTER LLA;Lo;0;L;;;;;N;;;;; +E934;DEVANAGARI HALF LETTER LLLA;Lo;0;L;0933 093C;;;;N;;;;; +E935;DEVANAGARI HALF LETTER VA;Lo;0;L;;;;;N;;;;; +E936;DEVANAGARI HALF LETTER SHA;Lo;0;L;;;;;N;;;;; +E937;DEVANAGARI HALF LETTER SSA;Lo;0;L;;;;;N;;;;; +E938;DEVANAGARI HALF LETTER SA;Lo;0;L;;;;;N;;;;; +E939;DEVANAGARI HALF LETTER HA;Lo;0;L;;;;;N;;;;; +E940;DEVANAGARI KKA LIGATURE;Lo;0;L;0915 094D 0915;;;;N;;;;; +E941;DEVANAGARI KTA LIGATURE;Lo;0;L;0915 094D 0924;;;;N;;;;; +E942;DEVANAGARI NGKA LIGATURE;Lo;0;L;0919 094D 0915;;;;N;;;;; +E943;DEVANAGARI NGKHA LIGATURE;Lo;0;L;0919 094D 0916;;;;N;;;;; +E944;DEVANAGARI NGGA LIGATURE;Lo;0;L;0919 094D 0917;;;;N;;;;; +E945;DEVANAGARI NGGHA LIGATURE;Lo;0;L;0919 094D 0918;;;;N;;;;; +E946;DEVANAGARI NYJA LIGATURE;Lo;0;L;091E 094D 091C;;;;N;;;;; +E947;DEVANAGARI DGHA LIGATURE;Lo;0;L;0926 094D 0918;;;;N;;;;; +E948;DEVANAGARI DDA 
LIGATURE;Lo;0;L;0926 094D 0926;;;;N;;;;; +E949;DEVANAGARI DDHA LIGATURE;Lo;0;L;0926 094D 0927;;;;N;;;;; +E94A;DEVANAGARI DBA LIGATURE;Lo;0;L;0926 094D 092C;;;;N;;;;; +E94B;DEVANAGARI DBHA LIGATURE;Lo;0;L;0926 094D 092D;;;;N;;;;; +E94C;DEVANAGARI DMA LIGATURE;Lo;0;L;0926 094D 092E;;;;N;;;;; +E94D;DEVANAGARI DYA LIGATURE;Lo;0;L;0926 094D 092F;;;;N;;;;; +E94E;DEVANAGARI DVA LIGATURE;Lo;0;L;0926 094D 0935;;;;N;;;;; +E94F;DEVANAGARI TT-TTA LIGATURE;Lo;0;L;091F 094D 091F;;;;N;;;;; +E950;DEVANAGARI TT-TTHA LIGATURE;Lo;0;L;091F 094D 0920;;;;N;;;;; +E951;DEVANAGARI TTH-TTHA LIGATURE;Lo;0;L;0920 094D 0920;;;;N;;;;; +E952;DEVANAGARI DD-GA LIGATURE;Lo;0;L;0921 094D 0917;;;;N;;;;; +E953;DEVANAGARI DD-DDA LIGATURE;Lo;0;L;0921 094D 0921;;;;N;;;;; +E954;DEVANAGARI DD-DDHA LIGATURE;Lo;0;L;0921 094D 0922;;;;N;;;;; +E955;DEVANAGARI NNA LIGATURE;Lo;0;L;0928 094D 0928;;;;N;;;;; +E956;DEVANAGARI HMA LIGATURE;Lo;0;L;0939 094D 092E;;;;N;;;;; +E957;DEVANAGARI HYA LIGATURE;Lo;0;L;0939 094D 092F;;;;N;;;;; +E958;DEVANAGARI HLA LIGATURE;Lo;0;L;0939 094D 0932;;;;N;;;;; +E959;DEVANAGARI HVA LIGATURE;Lo;0;L;0939 094D 0935;;;;N;;;;; +E95A;DEVANAGARI STRA LIGATURE;Lo;0;L;0938 094D 0924 094D 0930;;;;N;;;;; +E970;DEVANAGARI HALF KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;; +E971;DEVANAGARI HALF GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;; +E972;DEVANAGARI HALF TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;; +E973;DEVANAGARI HALF TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;; +E974;DEVANAGARI HALF SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;; +E975;DEVANAGARI HALF SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;; +E976;DEVANAGARI HALF SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;; +E97B;DEVANAGARI SIGN RRA-REPHA;Mn;36;L;;;;;N;;;;; +E97C;DEVANAGARI HAR LIGATURE;Lo;0;L;0939 0943;;;;N;;;;; +E97D;DEVANAGARI SIGN EYELASH RA;Lo;0;L;;;;;N;;;;; +E97E;DEVANAGARI SIGN REPHA;Mn;36;L;;;;;N;;;;; +E97F;DEVANAGARI SIGN SUBJOINED RA;Mn;36;L;;;;;N;;;;; diff --git a/libraries/liblunicode/ucdata/README 
b/libraries/liblunicode/ucdata/README new file mode 100644 index 0000000000..88e12c9590 --- /dev/null +++ b/libraries/liblunicode/ucdata/README @@ -0,0 +1,300 @@ +# +# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $ +# + + MUTT UCData Package 2.4 + ----------------------- + +This is a package that supports ctype-like operations for Unicode UCS-2 text +(and surrogates), case mapping, decomposition lookup, and provides a +bidirectional reordering algorithm. To use it, you will need to get the +latest "UnicodeData-*.txt" (or later) file from the Unicode Web or FTP site. + +The character information portion of the package consists of three parts: + + 1. A program called "ucgendat" which generates five data files from the + UnicodeData-*.txt file. The files are: + + A. case.dat - the case mappings. + B. ctype.dat - the character property tables. + C. decomp.dat - the character decompositions. + D. cmbcl.dat - the non-zero combining classes. + E. num.dat - the codes representing numbers. + + 2. The "ucdata.[ch]" files which implement the functions needed to + check to see if a character matches groups of properties, to map between + upper, lower, and title case, to look up the decomposition of a + character, look up the combining class of a character, and get the number + value of a character. + + 3. The UCData.java class which provides the same API (with minor changes for + the numbers) and loads the same binary data files as the C code. + +A short reference to the functions available is in the "api.txt" file. + +Techie Details +============== + +The "ucgendat" program parses files from the command line which are all in the +Unicode Character Database (UCDB) format. An additional properties file, +"MUTTUCData.txt", provides some extra properties for some characters. 
+ +The program looks for the two character properties fields (2 and 4), the +combining class field (3), the decomposition field (5), the numeric value +field (8), and the case mapping fields (12, 13, and 14). The decompositions +are recursively expanded before being written out. + +The decomposition table contains all the canonical decompositions. This means +all decompositions that do not have tags such as "<compat>" or "<font>". + +The data is almost all stored as unsigned longs (32-bits assumed) and the +routines that load the data take care of endian swaps when necessary. This +also means that surrogates (>= 0x10000) can be placed in the data files the +"ucgendat" program parses. + +The data is written as external files and broken into five parts so it can be +selectively updated at runtime if necessary. + +The data files currently generated from the "ucgendat" program total about 56K +in size all together. + +The format of the binary data files is documented in the "format.txt" file. + +========================================================================== + + The "Pretty Good Bidi Algorithm" + -------------------------------- + +This routine provides an alternative to the Unicode Bidi algorithm. The +difference is that this version of the PGBA does not handle the explicit +directional codes (LRE, RLE, LRO, RLO, PDF). It should now produce the same +results as the Unicode BiDi algorithm for implicit reordering. Included are +functions for doing cursor motion in both logical and visual order. + +This implementation is provided to demonstrate an effective alternate method +for implicit reordering. To make this useful for an application, it probably +needs some changes to the memory allocation and deallocation, as well as data +structure additions for rendering. + +Mark Leisher +19 November 1999 + +----------------------------------------------------------------------------- + +CHANGES +======= + +Version 2.4 +----------- +1.
Improved some bidi algorithm documentation in the code. + +2. Fixed a code mixup that produced a non-working version. + +Version 2.3 +----------- +1. Fixed a misspelling in the ucpgba.h header file. + +2. Fixed a bug which caused trailing weak non-digit sequences to be left out of + the reordered string in the bidi algorithm. + +3. Fixed a problem with weak sequences containing non-spacing marks in the + bidi algorithm. + +4. Fixed a problem with text runs of the opposite direction of the string + surrounding a weak + neutral text run appearing in the wrong order in the + bidi algorithm. + +5. Added a default overall direction parameter to the reordering function for + cases of strings with no strong directional characters in the bidi + algorithm. + +6. The bidi API documentation was improved. + +7. Added a man page for the bidi API. + +Version 2.2 +----------- +1. Fixed a problem with the bidi algorithm locating directional section + boundaries. + +2. Fixed a problem with the bidi algorithm starting the reordering correctly. + +3. Fixed a problem with the bidi algorithm determining end boundaries for LTR + segments. + +4. Fixed a problem with the bidi algorithm reordering weak (digits and number + separators) segments. + +5. Added automatic switching of symmetrically paired characters when + reversing RTL segments. + +6. Added a missing symmetric character to the extra character properties in + MUTTUCData.txt. + +7. Added support for doing logical and visual cursor traversal. + +Version 2.1 +----------- +1. Updated the ucgendat program to handle the Unicode 3.0 character database + properties. The AL and BN bidi properties get marked as strong RTL and + Other Neutral, the NSM, LRE, RLE, PDF, LRO, and RLO controls all get marked + as Other Neutral. + +2. Fixed some problems with testing against signed values in the UCData.java + code and some minor cleanup. + +3. Added the "Pretty Good Bidi Algorithm." + +Version 2.0 +----------- +1.
Removed the old Java stuff for a new class that loads directly from the + same data files as the C code does. + +2. Fixed a problem with choosing the correct field when mapping case. + +3. Adjust some search routines to start their search in the correct position. + +4. Moved the copyright year to 1999. + +Version 1.9 +----------- +1. Fixed a problem with an incorrect amount of storage being allocated for the + combining class nodes. + +2. Fixed an invalid initialization in the number code. + +3. Changed the Java template file formatting a bit. + +4. Added tables and function for getting decompositions in the Java class. + +Version 1.8 +----------- +1. Fixed a problem with adding certain ranges. + +2. Added two more macros for testing for identifiers. + +3. Tested with the UnicodeData-2.1.5.txt file. + +Version 1.7 +----------- +1. Fixed a problem with looking up decompositions in "ucgendat." + +Version 1.6 +----------- +1. Added two new properties introduced with UnicodeData-2.1.4.txt. + +2. Changed the "ucgendat.c" program a little to automatically align the + property data on a 4-byte boundary when new properties are added. + +3. Changed the "ucgendat.c" programs to only generate canonical + decompositions. + +4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for + initial and final punctuation characters. + +5. Minor additions and changes to the documentation. + +Version 1.5 +----------- +1. Changed all file open calls to include binary mode with "b" for DOS/WIN + platforms. + +2. Wrapped the unistd.h include so it won't be included when compiled under + Win32. + +3. Fixed a bad range check for hex digits in ucgendat.c. + +4. Fixed a bad endian swap for combining classes. + +5. Added code to make a number table and associated lookup functions. + Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last + function is to maintain compatibility with John Cowan's "uctype" package. + +Version 1.4 +----------- +1. 
Fixed a bug with adding a range. + +2. Fixed a bug with inserting a range in order. + +3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros. + +4. Added the missing unload for the combining class data. + +5. Fixed a bad macro placement in ucisweak(). + +Version 1.3 +----------- +1. Bug with case mapping calculations fixed. + +2. Bug with empty character property entries fixed. + +3. Bug with incorrect type in the combining class lookup fixed. + +4. Some corrections done to api.txt. + +5. Bug in certain character property lookups fixed. + +6. Added a character property table that records the defined characters. + +7. Replaced ucisunknown() with ucisdefined() and ucisundefined(). + +Version 1.2 +----------- +1. Added code to ucgendat to generate a combining class table. + +2. Fixed an endian problem with the byte count of decompositions. + +3. Fixed some minor problems in the "format.txt" file. + +4. Removed some bogus "Ss" values from MUTTUCData.txt file. + +5. Added API function to get combining class. + +6. Changed the open mode to "rb" so binary data files will be opened correctly + on DOS/WIN as well as other platforms. + +7. Added the "api.txt" file. + +Version 1.1 +----------- +1. Added ucisxdigit() which I overlooked. + +2. Added UC_LT to the ucisalpha() macro which I overlooked. + +3. Change uciscntrl() to include UC_CF. + +4. Added ucisocntrl() and ucfntcntrl() macros. + +5. Added a ucisblank() which I overlooked. + +6. Added missing properties to ucissymbol() and ucisnumber(). + +7. Added ucisgraph() and ucisprint(). + +8. Changed the "Mr" property to "Sy" to mark this subset of mirroring + characters as symmetric to avoid trampling the Unicode/ISO10646 sense of + mirroring. + +9. Added another property called "Ss" which includes control characters + traditionally seen as spaces in the isspace() macro. + +10. Added a bunch of macros to be API compatible with John Cowan's package. 
+ +ACKNOWLEDGEMENTS +================ + +Thanks go to John Cowan for pointing out lots of +missing things and giving me stuff, particularly a bunch of new macros. + +Thanks go to Bob Verbrugge for pointing out +various bugs. + +Thanks go to Christophe Pierret for pointing +out that file modes need to have "b" for DOS/WIN machines, pointing out +unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum(). + +Thanks go to Kent Johnson for finding a bug that caused +incomplete decompositions to be generated by the "ucgendat" program. + +Thanks go to Valeriy E. Ushakov for spotting an allocation +error and an initialization error. diff --git a/libraries/liblunicode/ucdata/UCData.java b/libraries/liblunicode/ucdata/UCData.java new file mode 100644 index 0000000000..08d02035dd --- /dev/null +++ b/libraries/liblunicode/ucdata/UCData.java @@ -0,0 +1,935 @@ +/* + * $Id: UCData.java,v 1.2 1999/10/07 20:49:56 mleisher Exp $ + * + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +import java.io.*; +import java.net.*; + +public class UCData { + private static byte[] buffer; + private static boolean endian; + private static int bytes, buffpos; + + // + // Do the static initialization. + // + static { + buffer = new byte[24576]; + } + + private static boolean load_file(InputStream in) { + buffpos = 0; + try { + bytes = in.read(buffer); + } catch (IOException e) { + return false; + } + endian = (buffer[0] == -2 && buffer[1] == -2); + buffpos = 2; + return (bytes > 0); + } + + private static int getInt() { + int b1, b2, b3, b4; + + if (!endian) { + b1 = buffer[buffpos++]; + b2 = buffer[buffpos++]; + b3 = buffer[buffpos++]; + b4 = buffer[buffpos++]; + } else { + b4 = buffer[buffpos++]; + b3 = buffer[buffpos++]; + b2 = buffer[buffpos++]; + b1 = buffer[buffpos++]; + } + if (b1 < 0) + b1 += 256; + if (b2 < 0) + b2 += 256; + if (b3 < 0) + b3 += 256; + if (b4 < 0) + b4 += 256; + return ((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + + private static int getInt(int from) { + buffpos = from; + return getInt(); + } + + private static short getShort() { + int b1, b2; + + if (!endian) { + b1 = buffer[buffpos++]; + b2 = buffer[buffpos++]; + } else { + b2 = buffer[buffpos++]; + b1 = buffer[buffpos++]; + } + if (b1 < 0) + b1 += 256; + if (b2 < 0) + b2 += 256; + + return (short) ((b1 << 8) | b2); + } + + private static short getShort(int from) { + buffpos = from; + return getShort(); + } + + /********************************************************************** + * + * Character type info section. 
+     *
+     **********************************************************************/
+
+    // Single-bit masks, indexed by bit number, used to test property masks.
+    private static int masks32[] = {
+        0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
+        0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
+        0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
+        0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
+        0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
+        0x40000000, 0x80000000
+    };
+
+    //
+    // The arrays with the character property info.
+    //
+    // _ucprop_offsets[p] is the index into _ucprop_ranges of the first range
+    // of property p, or -1 when the property has no ranges.  The array has one
+    // extra trailing entry holding the total number of range values.
+    // _ucprop_ranges holds (first, last) code pairs.
+    //
+    private static short[] _ucprop_offsets = null;
+    private static int[] _ucprop_ranges = null;
+
+    //
+    // Property bits for the FIRST mask argument of ucisprop() (properties
+    // 0 through 31).
+    //
+    public static final int UC_MN = 0x00000001;
+    public static final int UC_MC = 0x00000002;
+    public static final int UC_ME = 0x00000004;
+    public static final int UC_ND = 0x00000008;
+    public static final int UC_NL = 0x00000010;
+    public static final int UC_NO = 0x00000020;
+    public static final int UC_ZS = 0x00000040;
+    public static final int UC_ZL = 0x00000080;
+    public static final int UC_ZP = 0x00000100;
+    public static final int UC_CC = 0x00000200;
+    public static final int UC_CF = 0x00000400;
+    public static final int UC_OS = 0x00000800;
+    public static final int UC_CO = 0x00001000;
+    public static final int UC_CN = 0x00002000;
+    public static final int UC_LU = 0x00004000;
+    public static final int UC_LL = 0x00008000;
+    public static final int UC_LT = 0x00010000;
+    public static final int UC_LM = 0x00020000;
+    public static final int UC_LO = 0x00040000;
+    public static final int UC_PC = 0x00080000;
+    public static final int UC_PD = 0x00100000;
+    public static final int UC_PS = 0x00200000;
+    public static final int UC_PE = 0x00400000;
+    public static final int UC_PO = 0x00800000;
+    public static final int UC_SM = 0x01000000;
+    public static final int UC_SC = 0x02000000;
+    public static final int UC_SK = 0x04000000;
+    public static final int UC_SO = 0x08000000;
+    public static final int UC_L = 0x10000000;
+    public static final int UC_R = 0x20000000;
+    public static final int UC_EN = 0x40000000;
+    public static final int UC_ES = 0x80000000;
+
+    //
+    // Property bits for the SECOND mask argument of ucisprop() (properties
+    // 32 and up).  These reuse the bit values of the first group, so they
+    // must never be passed as the first mask.
+    //
+    public static final int UC_ET = 0x00000001;
+    public static final int UC_AN = 0x00000002;
+    public static final int UC_CS = 0x00000004;
+    public static final int UC_B = 0x00000008;
+    public static final int UC_S = 0x00000010;
+    public static final int UC_WS = 0x00000020;
+    public static final int UC_ON = 0x00000040;
+    public static final int UC_CM = 0x00000080;
+    public static final int UC_NB = 0x00000100;
+    public static final int UC_SY = 0x00000200;
+    public static final int UC_HD = 0x00000400;
+    public static final int UC_QM = 0x00000800;
+    public static final int UC_MR = 0x00001000;
+    public static final int UC_SS = 0x00002000;
+    public static final int UC_CP = 0x00004000;
+    public static final int UC_PI = 0x00008000;
+    public static final int UC_PF = 0x00010000;
+
+    //
+    // Load the character property tables from the given URL.  Returns true
+    // when the tables are available (including when they were already
+    // loaded) and false when the file could not be opened or read.
+    //
+    private static boolean _ucprop_load(URL where) {
+        int i, hsize, size, nranges;
+        boolean res;
+        InputStream in = null;
+
+        //
+        // If the offsets array is not null, then this file has been loaded.
+        //
+        if (_ucprop_offsets != null)
+            return true;
+
+        try {
+            in = where.openStream();
+        } catch (IOException e1) {
+            return false;
+        }
+
+        res = load_file(in);
+
+        try {
+            in.close();
+        } catch (IOException e) {}
+
+        if (!res)
+            return false;
+
+        hsize = getShort();
+
+        //
+        // Size in bytes of the (hsize + 1) 16-bit offsets, rounded up to a
+        // multiple of 4.
+        //
+        size = (hsize + 1) << 1;
+        if ((size & 3) != 0)
+            size += 4 - (size & 3);
+
+        _ucprop_offsets = new short[hsize + 1];
+
+        //
+        // Skip the byte count which won't be needed.
+        //
+        buffpos += 4;
+
+        //
+        // Adjust the byte count used to position at the beginning of the
+        // ranges to include the 4 bytes at the beginning and the byte count
+        // which is unused.
+        //
+        size += 8;
+
+        for (i = 0; i <= hsize; i++)
+            _ucprop_offsets[i] = getShort();
+
+        //
+        // Now allocate the ranges.  The last offset is the total number of
+        // range values.
+        //
+        nranges = _ucprop_offsets[hsize];
+        _ucprop_ranges = new int[nranges];
+        for (i = 0, buffpos = size; i < nranges; i++)
+            _ucprop_ranges[i] = getInt();
+
+        return true;
+    }
+
+    //
+    // Drop the character property tables so they can be loaded again.
+    //
+    private static void _ucprop_unload() {
+        _ucprop_offsets = null;
+        _ucprop_ranges = null;
+    }
+
+    //
+    // Return true when 'code' falls inside one of the ranges recorded for
+    // property number 'n'.  The ranges of a property are searched with a
+    // binary search over (first, last) pairs.
+    //
+    private static boolean uclookup(int code, int n) {
+        int l, r, m;
+
+        //
+        // The last slot of the offsets array is the end marker, not a
+        // property; looking it up would index one past the array below.
+        //
+        if (n < 0 || n >= _ucprop_offsets.length - 1)
+            return false;
+
+        if ((l = _ucprop_offsets[n]) == -1)
+            return false;
+
+        //
+        // Locate the next offset that is not -1.  It marks the end of this
+        // property's ranges.
+        //
+        for (m = 1; n + m < _ucprop_offsets.length &&
+                 _ucprop_offsets[n + m] == -1; m++) ;
+
+        r = _ucprop_offsets[n + m] - 1;
+        while (l <= r) {
+            //
+            // Pick the midpoint and force it onto an even index so it always
+            // addresses the first value of a (first, last) pair.
+            //
+            m = (l + r) >> 1;
+            m -= (m & 1);
+            if (code > _ucprop_ranges[m + 1])
+                l = m + 2;
+            else if (code < _ucprop_ranges[m])
+                r = m - 2;
+            else
+                return true;
+        }
+        return false;
+    }
+
+    //
+    // Return true when 'code' has at least one of the properties selected by
+    // the two masks.  Bits of mask1 select properties 0-31 (the first group
+    // of UC_* constants); bits of mask2 select properties 32 and up (the
+    // second group).  Returns false when both masks are 0.
+    //
+    public static boolean ucisprop(int code, int mask1, int mask2) {
+        int i;
+
+        if (mask1 == 0 && mask2 == 0)
+            return false;
+
+        if (mask1 != 0) {
+            for (i = 0; i < 32; i++) {
+                if ((mask1 & masks32[i]) != 0 && uclookup(code, i))
+                    return true;
+            }
+        }
+
+        if (mask2 != 0) {
+            //
+            // Stop before the trailing end-marker slot of the offsets array;
+            // it is not a property.
+            //
+            for (i = 32; i < _ucprop_offsets.length - 1; i++) {
+                if ((mask2 & masks32[i & 31]) != 0 && uclookup(code, i))
+                    return true;
+            }
+        }
+        return false;
+    }
+
+    //
+    // Convenience predicates.  Each one tests 'code' against a fixed set of
+    // property bits through ucisprop().
+    //
+
+    public static boolean ucisalpha(int code) {
+        return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0);
+    }
+    public static boolean ucisdigit(int code) {
+        return ucisprop(code, UC_ND, 0);
+    }
+    public static boolean ucisalnum(int code) {
+        return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0);
+    }
+    public static boolean uciscntrl(int code) {
+        return ucisprop(code, UC_CC|UC_CF, 0);
+    }
+    public static boolean ucisspace(int code) {
+        //
+        // UC_SS belongs to the second group of property bits, so it has to
+        // be passed as mask2; in mask1 the same bit value means UC_CN.
+        //
+        return ucisprop(code, UC_ZS, UC_SS);
+    }
+    public static boolean ucisblank(int code) {
+        return ucisprop(code, UC_ZS, 0);
+    }
+    public static boolean ucispunct(int code) {
+        return ucisprop(code, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF);
+    }
+    public static boolean ucisgraph(int code) {
+        return ucisprop(code, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|
+                        UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|
+                        UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|
+                        UC_SO, UC_PI|UC_PF);
+    }
+    public static boolean ucisprint(int code) {
+        return ucisprop(code, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|
+                        UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|
+                        UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|
+                        UC_SO|UC_ZS, UC_PI|UC_PF);
+    }
+    public static boolean ucisupper(int code) {
+        return ucisprop(code, UC_LU, 0);
+    }
+    public static boolean ucislower(int code) {
+        return ucisprop(code, UC_LL, 0);
+    }
+    public static boolean ucistitle(int code) {
+        return ucisprop(code, UC_LT, 0);
+    }
+    public static boolean ucisxdigit(int code) {
+        return ucisprop(code, 0, UC_HD);
+    }
+    public static boolean ucisisocntrl(int code) {
+        return ucisprop(code, UC_CC, 0);
+    }
+    public static boolean ucisfmtcntrl(int code) {
+        return ucisprop(code, UC_CF, 0);
+    }
+    public static boolean ucissymbol(int code) {
+        return ucisprop(code, UC_SM|UC_SC|UC_SO|UC_SK, 0);
+    }
+    public static boolean ucisnumber(int code) {
+        return ucisprop(code, UC_ND|UC_NO|UC_NL, 0);
+    }
+    public static boolean ucisnonspacing(int code) {
+        return ucisprop(code, UC_MN, 0);
+    }
+    public static boolean ucisopenpunct(int code) {
+        return ucisprop(code, UC_PS, 0);
+    }
+    public static boolean ucisclosepunct(int code) {
+        return ucisprop(code, UC_PE, 0);
+    }
+    public static boolean ucisinitialpunct(int code) {
+        return ucisprop(code, 0, UC_PI);
+    }
+    public static boolean ucisfinalpunct(int code) {
+        return ucisprop(code, 0, UC_PF);
+    }
+    public static boolean uciscomposite(int code) {
+        return ucisprop(code, 0, UC_CM);
+    }
+    public static boolean ucishex(int code) {
+        return ucisprop(code, 0, UC_HD);
+    }
+    public static boolean ucisquote(int code) {
+        return ucisprop(code, 0, UC_QM);
+    }
+    public static boolean ucissymmetric(int code) {
+        return ucisprop(code, 0, UC_SY);
+    }
+    public static boolean ucismirroring(int code) {
+        return ucisprop(code, 0, UC_MR);
+    }
+    public static boolean ucisnonbreaking(int code) {
+        return ucisprop(code, 0, UC_NB);
+    }
+    public static boolean ucisrtl(int code) {
+        return ucisprop(code, UC_R, 0);
+    }
+    public static boolean ucisltr(int code) {
+        return ucisprop(code, UC_L, 0);
+    }
+    public static boolean ucisstrong(int code) {
+        return ucisprop(code, UC_L|UC_R, 0);
+    }
+    public static boolean ucisweak(int code) {
+        return ucisprop(code, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS);
+    }
+    public static boolean ucisneutral(int code) {
+        return ucisprop(code, 0, UC_B|UC_S|UC_WS|UC_ON);
+    }
+    public static boolean ucisseparator(int code) {
+        return ucisprop(code, 0, UC_B|UC_S);
+    }
+    public static boolean ucismark(int code) {
+        return ucisprop(code, UC_MN|UC_MC|UC_ME, 0);
+    }
+    public static boolean ucismodif(int code) {
+        return ucisprop(code, UC_LM, 0);
+    }
+    public static boolean ucisletnum(int code) {
+        return ucisprop(code, UC_NL, 0);
+    }
+    public static boolean ucisconnect(int code) {
+        return ucisprop(code, UC_PC, 0);
+    }
+    public static boolean ucisdash(int code) {
+        return ucisprop(code, UC_PD, 0);
+    }
+    public static boolean ucismath(int code) {
+        return ucisprop(code, UC_SM, 0);
+    }
+    public static boolean uciscurrency(int code) {
+        return ucisprop(code, UC_SC, 0);
+    }
+    public static boolean ucismodifsymbol(int code) {
+        return ucisprop(code, UC_SK, 0);
+    }
+    public static boolean ucisnsmark(int code) {
+        return ucisprop(code, UC_MN, 0);
+    }
+    public static boolean ucisspmark(int code) {
+        return ucisprop(code, UC_MC, 0);
+    }
+    public static boolean ucisenclosing(int code) {
+        return ucisprop(code, UC_ME, 0);
+    }
+    public static boolean ucisprivate(int code) {
+        return ucisprop(code, UC_CO, 0);
+    }
+    public static boolean ucissurrogate(int code) {
+        return ucisprop(code, UC_OS, 0);
+    }
+    public static boolean ucislsep(int code) {
+        return ucisprop(code, UC_ZL, 0);
+    }
+    public static boolean ucispsep(int code) {
+        return ucisprop(code, UC_ZP, 0);
+    }
+    public static boolean ucisidentstart(int code) {
+        return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0);
+    }
+    public static boolean ucisidentpart(int code) {
+        return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|
+                        UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0);
+    }
+    public static boolean ucisdefined(int code) {
+        return ucisprop(code, 0, UC_CP);
+    }
+    public static boolean ucisundefined(int code) {
+        return !ucisprop(code, 0, UC_CP);
+    }
+
+    //
+    // The next two predicates are plain code range checks and do not need
+    // the property tables.
+    //
+    public static boolean ucishan(int code) {
+        return (0x4e00 <= code && code <= 0x9fff) ||
+               (0xf900 <= code && code <= 0xfaff);
+    }
+    public static boolean ucishangul(int code) {
+        return 0xac00 <= code && code <= 0xd7ff;
+    }
+
+    /**********************************************************************
+     *
+     * Case mapping section.
+     *
+     **********************************************************************/
+
+    private static int[] _uccase_len = {0, 0};
+    private static int[] _uccase_map = null;
+
+    private static boolean _uccase_load(URL where) {
+        int i, n;
+        boolean res;
+        InputStream in = null;
+
+        //
+        // If this array exists, then the file has already been loaded.
+ // + if (_uccase_map != null) + return true; + + try { + in = where.openStream(); + } catch (IOException e1) { + return false; + } + + res = load_file(in); + + try { + in.close(); + } catch (IOException e) {} + + if (res == false) + return res; + + n = getShort(2) * 3; + _uccase_len[0] = getShort() * 3; + _uccase_len[1] = getShort() * 3; + + _uccase_map = new int[n]; + for (i = 0; i < n; i++) + _uccase_map[i] = getInt(); + + return true; + } + + private static void _uccase_unload() { + _uccase_len[0] = _uccase_len[1] = 0; + _uccase_map = null; + } + + private static int _uccase_lookup(int code, int l, int r, int field) { + int m; + + while (l <= r) { + m = (l + r) >> 1; + m -= (m % 3); + if (code > _uccase_map[m]) + l = m + 3; + else if (code < _uccase_map[m]) + r = m - 3; + else + return _uccase_map[m + field]; + } + return code; // no mapping found: hand the code back unchanged + } + + public static int uctoupper(int code) { + int l, r, field; + + if (ucisupper(code)) + return code; + + if (ucislower(code)) { + // + // Lower case. + // + field = 2; + l = _uccase_len[0]; + r = (l + _uccase_len[1]) - 3; + } else { + // + // Title case. + // + field = 1; + l = _uccase_len[0] + _uccase_len[1]; + r = _uccase_map.length - 3; + } + return _uccase_lookup(code, l, r, field); + } + + public static int uctolower(int code) { + int l, r, field; + + if (ucislower(code)) + return code; + + if (ucisupper(code)) { + // + // Upper case. + // + field = 1; + l = 0; + r = _uccase_len[0] - 3; + } else { + // + // Title case. + // + field = 2; + l = _uccase_len[0] + _uccase_len[1]; + r = _uccase_map.length - 3; // start of last 3-int node, as in uctoupper() + } + return _uccase_lookup(code, l, r, field); + } + + public static int uctotitle(int code) { + int l, r, field; + + if (ucistitle(code)) + return code; + + field = 2; + if (ucisupper(code)) { + // + // Upper case. + // + l = 0; + r = _uccase_len[0] - 3; + } else { + // + // Lower case.
+ // + l = _uccase_len[0]; + r = (l + _uccase_len[1]) - 3; + } + return _uccase_lookup(code, l, r, field); + } + + /********************************************************************** + * + * Character decomposition section. + * + **********************************************************************/ + + static int _ucdcmp_node_count = 0; + static int[] _ucdcmp_data = null; + + private static boolean _ucdcmp_load(URL where) { + int i, bcnt; + boolean res; + InputStream in = null; + + // + // If this array is not null, then the file has already been loaded. + // + if (_ucdcmp_data != null) + return true; + + try { + in = where.openStream(); + } catch (IOException e1) { + return false; + } + + res = load_file(in); + + try { + in.close(); + } catch (IOException e) {} + + if (res == false) + return res; + + // + // This specifies how many of the _ucdcmp_data elements are nodes which + // leaves the remaining number to be decompositions. + // + _ucdcmp_node_count = getShort() << 1; + + bcnt = getInt() >> 2; + + _ucdcmp_data = new int[bcnt]; + + for (i = 0; i < bcnt; i++) + _ucdcmp_data[i] = getInt(); + + return res; + } + + private static void _ucdcmp_unload() { + _ucdcmp_node_count = 0; + _ucdcmp_data = null; + } + + public static int[] ucdecomp(int code) { + int l, r, m, out[]; + + l = 0; + r = _ucdcmp_data[_ucdcmp_node_count] - 1; + + while (l <= r) { + // + // Determine a "mid" point and adjust to make sure the mid point + // is at the beginning of a code+offset pair.
+ // + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucdcmp_data[m]) + l = m + 2; + else if (code < _ucdcmp_data[m]) + r = m - 2; + else { + l = _ucdcmp_data[m + 3] - _ucdcmp_data[m + 1]; + out = new int[l]; + for (r = 0; r < l; r++) + out[r] = _ucdcmp_data[_ucdcmp_node_count + 1 + + _ucdcmp_data[m + 1] + r]; + return out; + } + } + return null; + } + + public static int[] ucdecomp_hangul(int code) { + int out[], decomp[] = {0, 0, 0}; + + if (!ucishangul(code)) + return null; + + code -= 0xac00; + decomp[0] = 0x1100 + (code / 588); + decomp[1] = 0x1161 + ((code % 588) / 28); + decomp[2] = 0x11a7 + (code % 28); + + out = new int[(decomp[2] != 0x11a7) ? 3 : 2]; + out[0] = decomp[0]; + out[1] = decomp[1]; + if (decomp[2] != 0x11a7) // third slot exists only when there is a trailing consonant + out[2] = decomp[2]; + return out; + } + + /********************************************************************** + * + * Combining class section. + * + **********************************************************************/ + + private static int[] _uccmbcl_nodes = null; + + private static boolean _uccmbcl_load(URL where) { + int i, n; + boolean res; + InputStream in = null; + + // + // If this array is not null, the file has already been loaded.
+ // + if (_uccmbcl_nodes != null) + return true; + + try { + in = where.openStream(); + } catch (IOException e1) { + return false; + } + + res = load_file(in); + + try { + in.close(); + } catch (IOException e) {} + + if (res == false) + return res; + + n = getShort() * 3; + + buffpos += 4; + + _uccmbcl_nodes = new int[n]; + for (i = 0; i < n; i++) + _uccmbcl_nodes[i] = getInt(); + + return true; + } + + private static void _uccmbcl_unload() { + _uccmbcl_nodes = null; + } + + public static int uccombining_class(int code) { + int l, r, m; + + l = 0; + r = _uccmbcl_nodes.length - 3; + + while (l <= r) { + m = (l + r) >> 1; + m -= (m % 3); + if (code > _uccmbcl_nodes[m + 1]) + l = m + 3; + else if (code < _uccmbcl_nodes[m]) + r = m - 3; + else if (_uccmbcl_nodes[m] <= code && + code <= _uccmbcl_nodes[m + 1]) + return _uccmbcl_nodes[m + 2]; + } + return 0; + } + + /********************************************************************** + * + * Number section. + * + **********************************************************************/ + + private static short[] _ucnum_vals; + private static int[] _ucnum_nodes; + + private static boolean _ucnumb_load(URL where) { + int i, n, b; + boolean res; + InputStream in = null; + + // + // If this array is not null, then the file has already been loaded. 
+ // + if (_ucnum_nodes != null) + return true; + + try { + in = where.openStream(); + } catch (IOException e1) { + return false; + } + + res = load_file(in); + + try { + in.close(); + } catch (IOException e) {} + + if (res == false) + return res; + + n = getShort(); + b = (getInt() - (n << 2)) >> 1; + + _ucnum_nodes = new int[n]; + for (i = 0; i < n; i++) + _ucnum_nodes[i] = getInt(); + + _ucnum_vals = new short[b]; + for (i = 0; i < b; i++) + _ucnum_vals[i] = getShort(); + + return true; + } + + private static void _ucnumb_unload() { + _ucnum_vals = null; + _ucnum_nodes = null; + } + + public static boolean ucnumber_lookup(int code, int[] result) { + int l, r, m; + + result[0] = result[1] = 0; + + l = 0; + r = _ucnum_nodes.length - 1; + while (l <= r) { + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucnum_nodes[m]) + l = m + 2; + else if (code < _ucnum_nodes[m]) + r = m - 2; + else { + result[0] = _ucnum_vals[_ucnum_nodes[m + 1]]; + result[1] = _ucnum_vals[_ucnum_nodes[m + 1] + 1]; + return true; + } + } + return false; + } + + public static boolean ucdigit_lookup(int code, int[] result) { + int l, r, m; + + result[0] = -1; + + l = 0; + r = _ucnum_nodes.length - 1; + while (l <= r) { + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucnum_nodes[m]) + l = m + 2; + else if (code < _ucnum_nodes[m]) + r = m - 2; + else { + short d1 = _ucnum_vals[_ucnum_nodes[m + 1]]; + short d2 = _ucnum_vals[_ucnum_nodes[m + 1] + 1]; + if (d1 == d2) { + result[0] = d1; + return true; + } + return false; + } + } + return false; + } + + /********************************************************************** + * + * File loading and unloading routines. + * + **********************************************************************/ + + // + // Masks that combine to load and unload files using a base URL. 
+ // + public final static int UCDATA_CASE = 0x01; + public final static int UCDATA_CTYPE = 0x02; + public final static int UCDATA_DECOMP = 0x04; + public final static int UCDATA_CMBCL = 0x08; + public final static int UCDATA_NUM = 0x10; + public final static int UCDATA_ALL = 0x1f; + + public static void ucdata_load(URL base, int masks) { + // + // Make sure the base has the trailing slash. + // + String url = base.toString(); + if (url.lastIndexOf('/') != url.length() - 1) + url += "/"; + + if ((masks & UCDATA_CTYPE) != 0) { + try { + _ucprop_load(new URL(url + "ctype.dat")); + } catch (MalformedURLException mue) {} + } + if ((masks & UCDATA_CASE) != 0) { + try { + _uccase_load(new URL(url + "case.dat")); + } catch (MalformedURLException mue) {} + } + if ((masks & UCDATA_DECOMP) != 0) { + try { + _ucdcmp_load(new URL(url + "decomp.dat")); + } catch (MalformedURLException mue) {} + } + if ((masks & UCDATA_CMBCL) != 0) { + try { + _uccmbcl_load(new URL(url + "cmbcl.dat")); + } catch (MalformedURLException mue) {} + } + if ((masks & UCDATA_NUM) != 0) { + try { + _ucnumb_load(new URL(url + "num.dat")); + } catch (MalformedURLException mue) {} + } + } + + public static void ucdata_unload(int masks) { + if ((masks & UCDATA_CTYPE) != 0) + _ucprop_unload(); + if ((masks & UCDATA_CASE) != 0) + _uccase_unload(); + if ((masks & UCDATA_DECOMP) != 0) + _ucdcmp_unload(); + if ((masks & UCDATA_CMBCL) != 0) + _uccmbcl_unload(); + if ((masks & UCDATA_NUM) != 0) + _ucnumb_unload(); + } +} diff --git a/libraries/liblunicode/ucdata/UCDataTest.java b/libraries/liblunicode/ucdata/UCDataTest.java new file mode 100644 index 0000000000..fa36f04f60 --- /dev/null +++ b/libraries/liblunicode/ucdata/UCDataTest.java @@ -0,0 +1,94 @@ +/* + * $Id: UCDataTest.java,v 1.1 1999/08/23 16:14:08 mleisher Exp $ + * + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and 
associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +import java.io.*; +import java.net.*; +import UCData.*; + +public class UCDataTest { + /********************************************************************** + * + * Main. 
+ * + **********************************************************************/ + + public static void main(String[] args) { + URL url = null; + + try { + url = new URL("file:/home/mleisher/unicode/textutils/ucdata"); + } catch (MalformedURLException mue) {} + + UCData.ucdata_load(url, UCData.UCDATA_ALL); + + if (UCData.ucisalpha(0x1d5)) + System.out.println("0x1d5 is alpha"); + else + System.out.println("0x1d5 is not alpha"); + + int c; + + c = UCData.uctolower(0x1f1); + System.out.println("0x1f1 lower is 0x"+Integer.toHexString(c)); + c = UCData.uctotitle(0x1f1); + System.out.println("0x1f1 title is 0x"+Integer.toHexString(c)); + + c = UCData.uctolower(0xff3a); + System.out.println("0xff3a lower is 0x"+Integer.toHexString(c)); + c = UCData.uctotitle(0xff3a); + System.out.println("0xff3a title is 0x"+Integer.toHexString(c)); + + int[] decomp = UCData.ucdecomp(0x1d5); + if (decomp != null) { + System.out.print("0x1d5 decomposition :"); + for (int i = 0; i < decomp.length; i++) + System.out.print("0x"+Integer.toHexString(decomp[i])+" "); + System.out.println(""); + } + + int ccl = UCData.uccombining_class(0x41); + System.out.println("0x41 combining class " + ccl); + ccl = UCData.uccombining_class(0xfe23); + System.out.println("0xfe23 combining class " + ccl); + + int num[] = {0,0}; + if (UCData.ucnumber_lookup(0x30, num)) { + if (num[0] != num[1]) + System.out.println("0x30 is fraction "+num[0]+"/"+num[1]); + else + System.out.println("0x30 is digit "+num[0]); + } + + if (UCData.ucnumber_lookup(0xbc, num)) { + if (num[0] != num[1]) + System.out.println("0xbc is fraction "+num[0]+"/"+num[1]); + else + System.out.println("0xbc is digit "+num[0]); + } + + if (UCData.ucdigit_lookup(0x6f9, num)) + System.out.println("0x6f9 is digit " + num[0]); + else + System.out.println("0x6f9 is not a digit"); + } +} diff --git a/libraries/liblunicode/ucdata/api.txt b/libraries/liblunicode/ucdata/api.txt new file mode 100644 index 0000000000..e6bc4689b8 --- /dev/null +++ 
b/libraries/liblunicode/ucdata/api.txt @@ -0,0 +1,343 @@ +# +# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $ +# + + The MUTT UCData API + ------------------- + + + + +----------------------------------------------------------------------------- + +Macros that combine to select data tables for ucdata_load(), ucdata_unload(), +and ucdata_reload(). + +#define UCDATA_CASE 0x01 +#define UCDATA_CTYPE 0x02 +#define UCDATA_DECOMP 0x04 +#define UCDATA_CMBCL 0x08 +#define UCDATA_NUM 0x10 +#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\ + UCDATA_CMBCL|UCDATA_NUM) +----------------------------------------------------------------------------- + +void ucdata_load(char *paths, int masks) + + This function initializes the UCData library by locating the data files in + one of the colon-separated directories in the `paths' parameter. The data + files to be loaded are specified in the `masks' parameter as a bitwise + combination of the macros listed above. + + This should be called before using any of the other functions. + + NOTE: the ucdata_setup(char *paths) function is now a macro that expands + into this function at compile time. + +----------------------------------------------------------------------------- + +void ucdata_unload(int masks) + + This function unloads the data tables specified in the `masks' parameter. + + This function should be called when the application is done using the UCData + package. + + NOTE: the ucdata_cleanup() function is now a macro that expands into this + function at compile time. + +----------------------------------------------------------------------------- + +void ucdata_reload(char *paths, int masks) + + This function reloads the data files from one of the colon-separated + directories in the `paths' parameter. The data files to be reloaded are + specified in the `masks' parameter as a bitwise combination of the macros + listed above.
+ + If the data files have already been loaded, they are unloaded before the + data files are loaded again. + +----------------------------------------------------------------------------- + +int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) + + This function determines if a character has a decomposition and returns the + decomposition information if it exists. + + If a zero is returned, there is no decomposition. If a non-zero is + returned, then the `num' and `decomp' variables are filled in with the + appropriate values. + + Example call: + + unsigned long i, num, *decomp; + + if (ucdecomp(0x1d5, &num, &decomp) != 0) { + for (i = 0; i < num; i++) + printf("0x%08lX,", decomp[i]); + putchar('\n'); + } + +----------------------------------------------------------------------------- + +int ucdecomp_hangul(unsigned long code, unsigned long *num, + unsigned long decomp[]) + + This function determines if a Hangul syllable has a decomposition and + returns the decomposition information. + + An array of at least size 3 should be passed to the function for the + decomposition of the syllable. + + If a zero is returned, the character is not a Hangul syllable. If a + non-zero is returned, the `num' field will be 2 or 3 and the syllable will + be decomposed into the `decomp' array arithmetically. + + Example call: + + unsigned long i, num, decomp[3]; + + if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) { + for (i = 0; i < num; i++) + printf("0x%08lX,", decomp[i]); + putchar('\n'); + } + +----------------------------------------------------------------------------- + +struct ucnumber { + int numerator; + int denominator; +}; + +int ucnumber_lookup(unsigned long code, struct ucnumber *num) + + This function determines if the code is a number and fills in the `num' + field with the numerator and denominator. If the code happens to be a + single digit, the numerator and denominator fields will be the same.
+ + If the function returns 0, the code is not a number. Any other return + value means the code is a number. + +int ucdigit_lookup(unsigned long code, int *digit) + + This function determines if the code is a digit and fills in the `digit' + field with the digit value. + + If the function returns 0, the code is not a digit. Any other return + value means the code is a digit. + +struct ucnumber ucgetnumber(unsigned long code) + + This is a compatibility function with John Cowan's "uctype" package. It + uses ucnumber_lookup(). + +int ucgetdigit(unsigned long code) + + This is a compatibility function with John Cowan's "uctype" package. It + uses ucdigit_lookup(). + +----------------------------------------------------------------------------- + +unsigned long uctoupper(unsigned long code) + + This function returns the code unchanged if it is already upper case or has + no upper case equivalent. Otherwise the upper case equivalent is returned. + +----------------------------------------------------------------------------- + +unsigned long uctolower(unsigned long code) + + This function returns the code unchanged if it is already lower case or has + no lower case equivalent. Otherwise the lower case equivalent is returned. + +----------------------------------------------------------------------------- + +unsigned long uctotitle(unsigned long code) + + This function returns the code unchanged if it is already title case or has + no title case equivalent. Otherwise the title case equivalent is returned.
+ +----------------------------------------------------------------------------- + +int ucisalpha(unsigned long code) +int ucisalnum(unsigned long code) +int ucisdigit(unsigned long code) +int uciscntrl(unsigned long code) +int ucisspace(unsigned long code) +int ucisblank(unsigned long code) +int ucispunct(unsigned long code) +int ucisgraph(unsigned long code) +int ucisprint(unsigned long code) +int ucisxdigit(unsigned long code) + +int ucisupper(unsigned long code) +int ucislower(unsigned long code) +int ucistitle(unsigned long code) + + These functions (actually macros) determine if a character has these + properties. These behave in a fashion very similar to the venerable ctype + package. + +----------------------------------------------------------------------------- + +int ucisisocntrl(unsigned long code) + + Is the character a C0 control character (< 32) ? + +int ucisfmtcntrl(unsigned long code) + + Is the character a format control character? + +int ucissymbol(unsigned long code) + + Is the character a symbol? + +int ucisnumber(unsigned long code) + + Is the character a number or digit? + +int ucisnonspacing(unsigned long code) + + Is the character non-spacing? + +int ucisopenpunct(unsigned long code) + + Is the character an open/left punctuation (i.e. '[') + +int ucisclosepunct(unsigned long code) + + Is the character a close/right punctuation (i.e. ']') + +int ucisinitialpunct(unsigned long code) + + Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION + MARK) + +int ucisfinalpunct(unsigned long code) + + Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION + MARK) + +int uciscomposite(unsigned long code) + + Can the character be decomposed into a set of other characters? + +int ucisquote(unsigned long code) + + Is the character one of the many quotation marks? + +int ucissymmetric(unsigned long code) + + Is the character one that has an opposite form (i.e.
<>) + +int ucismirroring(unsigned long code) + + Is the character mirroring (superset of symmetric)? + +int ucisnonbreaking(unsigned long code) + + Is the character non-breaking (i.e. non-breaking space)? + +int ucisrtl(unsigned long code) + + Does the character have strong right-to-left directionality (i.e. Arabic + letters)? + +int ucisltr(unsigned long code) + + Does the character have strong left-to-right directionality (i.e. Latin + letters)? + +int ucisstrong(unsigned long code) + + Does the character have strong directionality? + +int ucisweak(unsigned long code) + + Does the character have weak directionality (i.e. numbers)? + +int ucisneutral(unsigned long code) + + Does the character have neutral directionality (i.e. whitespace)? + +int ucisseparator(unsigned long code) + + Is the character a block or segment separator? + +int ucislsep(unsigned long code) + + Is the character a line separator? + +int ucispsep(unsigned long code) + + Is the character a paragraph separator? + +int ucismark(unsigned long code) + + Is the character a mark of some kind? + +int ucisnsmark(unsigned long code) + + Is the character a non-spacing mark? + +int ucisspmark(unsigned long code) + + Is the character a spacing mark? + +int ucismodif(unsigned long code) + + Is the character a modifier letter? + +int ucismodifsymbol(unsigned long code) + + Is the character a modifier symbol? + +int ucisletnum(unsigned long code) + + Is the character a number represented by a letter? + +int ucisconnect(unsigned long code) + + Is the character connecting punctuation? + +int ucisdash(unsigned long code) + + Is the character dash punctuation? + +int ucismath(unsigned long code) + + Is the character a math character? + +int uciscurrency(unsigned long code) + + Is the character a currency character? + +int ucisenclosing(unsigned long code) + + Is the character enclosing (i.e. enclosing box)? + +int ucisprivate(unsigned long code) + + Is the character from the Private Use Area? 
+ +int ucissurrogate(unsigned long code) + + Is the character one of the surrogate codes? + +int ucisdefined(unsigned long code) + + Is the character defined (appeared in one of the data files)? + +int ucisundefined(unsigned long code) + + Is the character not defined (non-Unicode)? + +int ucishan(unsigned long code) + + Is the character a Han ideograph? + +int ucishangul(unsigned long code) + + Is the character a pre-composed Hangul syllable? diff --git a/libraries/liblunicode/ucdata/bidiapi.txt b/libraries/liblunicode/ucdata/bidiapi.txt new file mode 100644 index 0000000000..dffd12e5fe --- /dev/null +++ b/libraries/liblunicode/ucdata/bidiapi.txt @@ -0,0 +1,84 @@ +# +# $Id: bidiapi.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $ +# + + "Pretty Good Bidi Algorithm" API + +The PGBA (Pretty Good Bidi Algorithm) is an effective alternative to the +Unicode BiDi algorithm. It currently provides only implicit reordering and +does not yet support explicit reordering codes that the Unicode BiDi algorithm +supports. In addition to reordering, the PGBA includes cursor movement +support for both visual and logical navigation. + +----------------------------------------------------------------------------- + +#define UCPGBA_LTR 0 +#define UCPGBA_RTL 1 + + These macros appear in the `direction' field of the data structures. + +#define UCPGBA_CURSOR_VISUAL 0 +#define UCPGBA_CURSOR_LOGICAL 1 + + These macros are used to set the cursor movement for each reordered string. + +----------------------------------------------------------------------------- + +ucstring_t *ucstring_create(unsigned long *source, unsigned long start, + unsigned long end, int default_direction, + int cursor_motion) + + This function will create a reordered string by using the implicit + directionality of the characters in the specified substring. 
+ + The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL + and is used only in cases where a string contains no characters with strong + directionality. + + The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or + UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion + behavior. This behavior can be switched at any time using + ucstring_set_cursor_motion(). + +----------------------------------------------------------------------------- + +void ucstring_free(ucstring_t *string) + + This function will deallocate the memory used by the string, including the + string itself. + +----------------------------------------------------------------------------- + +void ucstring_cursor_info(ucstring_t *string, int *direction, + unsigned long *position) + + This function will return the text position of the internal cursor and the + directionality of the text at that position. The position returned is the + original text position of the character. + +----------------------------------------------------------------------------- + +int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion) + + This function will change the cursor motion type and return the previous + cursor motion type. + +----------------------------------------------------------------------------- + +int ucstring_cursor_right(ucstring_t *string, int count) + + This function will move the internal cursor to the right according to the + type of cursor motion set for the string. + + If no cursor motion is performed, it returns 0. Otherwise it will return a + 1. + +----------------------------------------------------------------------------- + +int ucstring_cursor_left(ucstring_t *string, int count) + + This function will move the internal cursor to the left according to the + type of cursor motion set for the string. + + If no cursor motion is performed, it returns 0. Otherwise it will return a + 1.
diff --git a/libraries/liblunicode/ucdata/format.txt b/libraries/liblunicode/ucdata/format.txt new file mode 100644 index 0000000000..0c0d2cf211 --- /dev/null +++ b/libraries/liblunicode/ucdata/format.txt @@ -0,0 +1,243 @@ +# +# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $ +# + +CHARACTER DATA +============== + +This package generates some data files that contain character properties useful +for text processing. + +CHARACTER PROPERTIES +==================== + +The first data file is called "ctype.dat" and contains a compressed form of +the character properties found in the Unicode Character Database (UCDB). +Additional properties can be specified in limited UCDB format in another file +to avoid modifying the original UCDB. + +The following is a property name and code table to be used with the character +data: + +NAME CODE DESCRIPTION +--------------------- +Mn 0 Mark, Non-Spacing +Mc 1 Mark, Spacing Combining +Me 2 Mark, Enclosing +Nd 3 Number, Decimal Digit +Nl 4 Number, Letter +No 5 Number, Other +Zs 6 Separator, Space +Zl 7 Separator, Line +Zp 8 Separator, Paragraph +Cc 9 Other, Control +Cf 10 Other, Format +Cs 11 Other, Surrogate +Co 12 Other, Private Use +Cn 13 Other, Not Assigned +Lu 14 Letter, Uppercase +Ll 15 Letter, Lowercase +Lt 16 Letter, Titlecase +Lm 17 Letter, Modifier +Lo 18 Letter, Other +Pc 19 Punctuation, Connector +Pd 20 Punctuation, Dash +Ps 21 Punctuation, Open +Pe 22 Punctuation, Close +Po 23 Punctuation, Other +Sm 24 Symbol, Math +Sc 25 Symbol, Currency +Sk 26 Symbol, Modifier +So 27 Symbol, Other +L 28 Left-To-Right +R 29 Right-To-Left +EN 30 European Number +ES 31 European Number Separator +ET 32 European Number Terminator +AN 33 Arabic Number +CS 34 Common Number Separator +B 35 Block Separator +S 36 Segment Separator +WS 37 Whitespace +ON 38 Other Neutrals +Pi 47 Punctuation, Initial +Pf 48 Punctuation, Final +# +# Implementation specific properties. 
+# +Cm 39 Composite +Nb 40 Non-Breaking +Sy 41 Symmetric (characters which are part of open/close pairs) +Hd 42 Hex Digit +Qm 43 Quote Mark +Mr 44 Mirroring +Ss 45 Space, Other (controls viewed as spaces in ctype isspace()) +Cp 46 Defined character + +The actual binary data is formatted as follows: + + Assumptions: unsigned short is at least 16-bits in size and unsigned long + is at least 32-bits in size. + + unsigned short ByteOrderMark + unsigned short OffsetArraySize + unsigned long Bytes + unsigned short Offsets[OffsetArraySize + 1] + unsigned long Ranges[N], N = value of Offsets[OffsetArraySize] + + The Bytes field provides the total byte count used for the Offsets[] and + Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and + there is always one extra node on the end to hold the final index of the + Ranges[] array. The Ranges[] array contains pairs of 4-byte values + representing a range of Unicode characters. The pairs are arranged in + increasing order by the first character code in the range. + + Determining if a particular character is in the property list requires a + simple binary search to determine if a character is in any of the ranges + for the property. + + If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a + machine with a different endian order and the values must be byte-swapped. + + To swap a 16-bit value: + c = (c >> 8) | ((c & 0xff) << 8) + + To swap a 32-bit value: + c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) | + (((c >> 16) & 0xff) << 8) | (c >> 24) + +CASE MAPPINGS +============= + +The next data file is called "case.dat" and contains three case mapping tables +in the following order: upper, lower, and title case. Each table is in +increasing order by character code and each mapping contains 3 unsigned longs +which represent the possible mappings. 
+ +The format for the binary form of these tables is: + + unsigned short ByteOrderMark + unsigned short NumMappingNodes, count of all mapping nodes + unsigned short CaseTableSizes[2], upper and lower mapping node counts + unsigned long CaseTables[NumMappingNodes * 3] + + The starting indexes of the case tables are calculated as follows: + + UpperIndex = 0; + LowerIndex = CaseTableSizes[0] * 3; + TitleIndex = LowerIndex + CaseTableSizes[1] * 3; + + The order of the fields for the three tables is: + + Upper case + ---------- + unsigned long upper; + unsigned long lower; + unsigned long title; + + Lower case + ---------- + unsigned long lower; + unsigned long upper; + unsigned long title; + + Title case + ---------- + unsigned long title; + unsigned long upper; + unsigned long lower; + + If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the + same way as described in the CHARACTER PROPERTIES section. + + Because the tables are in increasing order by character code, locating a + mapping requires a simple binary search on one of the 3 codes that make up + each node. + + It is important to note that there can only be 65536 mapping nodes which + divided into 3 portions allows 21845 nodes for each case mapping table. The + distribution of mappings may be more or less than 21845 per table, but only + 65536 are allowed. + +DECOMPOSITIONS +============== + +The next data file is called "decomp.dat" and contains the decomposition data +for all characters with decompositions containing more than one character and +are *not* compatibility decompositions. Compatibility decompositions are +signaled in the UCDB format by the use of the <compat> tag in the +decomposition field. Each list of character codes represents a full +decomposition of a composite character. The nodes are arranged in increasing +order by character code.
+ +The format for the binary form of this table is: + + unsigned short ByteOrderMark + unsigned short NumDecompNodes, count of all decomposition nodes + unsigned long Bytes + unsigned long DecompNodes[(NumDecompNodes * 2) + 1] + unsigned long Decomp[N], N = sum of all counts in DecompNodes[] + + If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the + same way as described in the CHARACTER PROPERTIES section. + + The DecompNodes[] array consists of pairs of unsigned longs, the first of + which is the character code and the second is the initial index of the list + of character codes representing the decomposition. + + Locating the decomposition of a composite character requires a binary search + for a character code in the DecompNodes[] array and using its index to + locate the start of the decomposition. The length of the decomposition list + is the index in the following element in DecompNodes[] minus the current + index. + +COMBINING CLASSES +================= + +The fourth data file is called "cmbcl.dat" and contains the characters with +non-zero combining classes. + +The format for the binary form of this table is: + + unsigned short ByteOrderMark + unsigned short NumCCLNodes + unsigned long Bytes + unsigned long CCLNodes[NumCCLNodes * 3] + + If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the + same way as described in the CHARACTER PROPERTIES section. + + The CCLNodes[] array consists of groups of three unsigned longs. The first + and second are the beginning and ending of a range and the third is the + combining class of that range. + + If a character is not found in this table, then the combining class is + assumed to be 0. + + It is important to note that only 65536 distinct ranges plus combining class + can be specified because the NumCCLNodes is usually a 16-bit number.
+ +NUMBER TABLE +============ + +The final data file is called "num.dat" and contains the characters that have +a numeric value associated with them. + +The format for the binary form of the table is: + + unsigned short ByteOrderMark + unsigned short NumNumberNodes + unsigned long Bytes + unsigned long NumberNodes[NumNumberNodes] + unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long))) + / sizeof(short)] + + If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the + same way as described in the CHARACTER PROPERTIES section. + + The NumberNodes array contains pairs of values, the first of which is the + character code and the second an index into the ValueNodes array. The + ValueNodes array contains pairs of integers which represent the numerator + and denominator of the numeric value of the character. If the character + happens to map to an integer, both the values in ValueNodes will be the + same. diff --git a/libraries/liblunicode/ucdata/ucdata.c b/libraries/liblunicode/ucdata/ucdata.c new file mode 100644 index 0000000000..29f97bd708 --- /dev/null +++ b/libraries/liblunicode/ucdata/ucdata.c @@ -0,0 +1,1161 @@ +/* + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef lint +#ifdef __GNUC__ +static char rcsid[] __attribute__ ((unused)) = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $"; +#else +static char rcsid[] = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $"; +#endif +#endif + +#include +#include +#include +#ifndef WIN32 +#include +#endif + +#include "ucdata.h" + +/************************************************************************** + * + * Miscellaneous types, data, and support functions. + * + **************************************************************************/ + +typedef struct { + unsigned short bom; + unsigned short cnt; + union { + unsigned long bytes; + unsigned short len[2]; + } size; +} _ucheader_t; + +/* + * A simple array of 32-bit masks for lookup. 
/*
 * Search a colon-separated list of directories for `filename' and open the
 * first candidate that fopen() accepts.
 *
 * Each directory in `paths' is combined with `filename' as
 * "<directory>/<filename>" and passed to fopen() together with `mode'.
 * A candidate that does not fit in the BUFSIZ-byte scratch buffer is
 * skipped rather than being written past the end of the buffer.
 *
 * Returns the opened stream, or 0 when `filename' is null or empty, when
 * `paths' is null or empty, or when no candidate could be opened.
 */
static FILE *
#ifdef __STDC__
_ucopenfile(char *paths, char *filename, char *mode)
#else
_ucopenfile(paths, filename, mode)
char *paths, *filename, *mode;
#endif
{
    FILE *f;
    char *fp, *dp, *pp, *end, path[BUFSIZ];
    int fits;

    if (filename == 0 || *filename == 0)
      return 0;

    /*
     * The last byte of the buffer is reserved for the terminating NUL.
     */
    end = path + sizeof(path) - 1;

    dp = paths;
    while (dp && *dp) {
        fits = 1;
        pp = path;

        /*
         * Copy the directory.  The whole element is always consumed, even
         * when it is too long, so the scan resumes at the next element.
         */
        while (*dp && *dp != ':') {
            if (pp < end)
              *pp++ = *dp;
            else
              fits = 0;
            dp++;
        }

        if (pp < end)
          *pp++ = '/';
        else
          fits = 0;

        for (fp = filename; *fp; fp++) {
            if (pp < end)
              *pp++ = *fp;
            else
              fits = 0;
        }
        *pp = 0;

        /*
         * Never try to open a truncated name: it could match an unrelated
         * file.
         */
        if (fits && (f = fopen(path, mode)) != 0)
          return f;

        if (*dp == ':')
          dp++;
    }

    return 0;
}
Only the first array has to be deallocated + * because all the memory for the arrays is allocated as a single + * block. + */ + free((char *) _ucprop_offsets); + _ucprop_size = 0; + } + + if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0) + return; + + /* + * Load the header. + */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.bytes = endian_long(hdr.size.bytes); + } + + if ((_ucprop_size = hdr.cnt) == 0) { + fclose(in); + return; + } + + /* + * Allocate all the storage needed for the lookup table. + */ + _ucprop_offsets = (unsigned short *) malloc(hdr.size.bytes); + + /* + * Calculate the offset into the storage for the ranges. The offsets + * array is on a 4-byte boundary and one larger than the value provided in + * the header count field. This means the offset to the ranges must be + * calculated after aligning the count to a 4-byte boundary. + */ + if ((size = ((hdr.cnt + 1) * sizeof(unsigned short))) & 3) + size += 4 - (size & 3); + size >>= 1; + _ucprop_ranges = (unsigned long *) (_ucprop_offsets + size); + + /* + * Load the offset array. + */ + fread((char *) _ucprop_offsets, sizeof(unsigned short), size, in); + + /* + * Do an endian swap if necessary. Don't forget there is an extra node on + * the end with the final index. + */ + if (hdr.bom == 0xfffe) { + for (i = 0; i <= _ucprop_size; i++) + _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]); + } + + /* + * Load the ranges. The number of elements is in the last array position + * of the offsets. + */ + fread((char *) _ucprop_ranges, sizeof(unsigned long), + _ucprop_offsets[_ucprop_size], in); + + fclose(in); + + /* + * Do an endian swap if necessary. 
+ */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++) + _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]); + } +} + +static void +#ifdef __STDC__ +_ucprop_unload(void) +#else +_ucprop_unload() +#endif +{ + if (_ucprop_size == 0) + return; + + /* + * Only need to free the offsets because the memory is allocated as a + * single block. + */ + free((char *) _ucprop_offsets); + _ucprop_size = 0; +} + +static int +#ifdef __STDC__ +_ucprop_lookup(unsigned long code, unsigned long n) +#else +_ucprop_lookup(code, n) +unsigned long code, n; +#endif +{ + long l, r, m; + + /* + * There is an extra node on the end of the offsets to allow this routine + * to work right. If the index is 0xffff, then there are no nodes for the + * property. + */ + if ((l = _ucprop_offsets[n]) == 0xffff) + return 0; + + /* + * Locate the next offset that is not 0xffff. The sentinel at the end of + * the array is the max index value. + */ + for (m = 1; + n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ; + + r = _ucprop_offsets[n + m] - 1; + + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a range pair. 
+ */ + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucprop_ranges[m + 1]) + l = m + 2; + else if (code < _ucprop_ranges[m]) + r = m - 2; + else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1]) + return 1; + } + return 0; +} + +int +#ifdef __STDC__ +ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2) +#else +ucisprop(code, mask1, mask2) +unsigned long code, mask1, mask2; +#endif +{ + unsigned long i; + + if (mask1 == 0 && mask2 == 0) + return 0; + + for (i = 0; mask1 && i < 32; i++) { + if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) + return 1; + } + + for (i = 32; mask2 && i < _ucprop_size; i++) { + if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i)) + return 1; + } + + return 0; +} + +/************************************************************************** + * + * Support for case mapping. + * + **************************************************************************/ + +static unsigned long _uccase_size; +static unsigned short _uccase_len[2]; +static unsigned long *_uccase_map; + +static void +#ifdef __STDC__ +_uccase_load(char *paths, int reload) +#else +_uccase_load(paths, reload) +char *paths; +int reload; +#endif +{ + FILE *in; + unsigned long i; + _ucheader_t hdr; + + if (_uccase_size > 0) { + if (!reload) + /* + * The case mappings have already been loaded. + */ + return; + + free((char *) _uccase_map); + _uccase_size = 0; + } + + if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0) + return; + + /* + * Load the header. + */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.len[0] = endian_short(hdr.size.len[0]); + hdr.size.len[1] = endian_short(hdr.size.len[1]); + } + + /* + * Set the node count and lengths of the upper and lower case mapping + * tables. 
+ */ + _uccase_size = hdr.cnt * 3; + _uccase_len[0] = hdr.size.len[0] * 3; + _uccase_len[1] = hdr.size.len[1] * 3; + + _uccase_map = (unsigned long *) + malloc(_uccase_size * sizeof(unsigned long)); + + /* + * Load the case mapping table. + */ + fread((char *) _uccase_map, sizeof(unsigned long), _uccase_size, in); + + /* + * Do an endian swap if necessary. + */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < _uccase_size; i++) + _uccase_map[i] = endian_long(_uccase_map[i]); + } +} + +static void +#ifdef __STDC__ +_uccase_unload(void) +#else +_uccase_unload() +#endif +{ + if (_uccase_size == 0) + return; + + free((char *) _uccase_map); + _uccase_size = 0; +} + +static unsigned long +#ifdef __STDC__ +_uccase_lookup(unsigned long code, long l, long r, int field) +#else +_uccase_lookup(code, l, r, field) +unsigned long code; +long l, r; +int field; +#endif +{ + long m; + + /* + * Do the binary search. + */ + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a case mapping triple. + */ + m = (l + r) >> 1; + m -= (m % 3); + if (code > _uccase_map[m]) + l = m + 3; + else if (code < _uccase_map[m]) + r = m - 3; + else if (code == _uccase_map[m]) + return _uccase_map[m + field]; + } + + return code; +} + +unsigned long +#ifdef __STDC__ +uctoupper(unsigned long code) +#else +uctoupper(code) +unsigned long code; +#endif +{ + int field; + long l, r; + + if (ucisupper(code)) + return code; + + if (ucislower(code)) { + /* + * The character is lower case. + */ + field = 2; + l = _uccase_len[0]; + r = (l + _uccase_len[1]) - 3; + } else { + /* + * The character is title case. 
+ */ + field = 1; + l = _uccase_len[0] + _uccase_len[1]; + r = _uccase_size - 3; + } + return _uccase_lookup(code, l, r, field); +} + +unsigned long +#ifdef __STDC__ +uctolower(unsigned long code) +#else +uctolower(code) +unsigned long code; +#endif +{ + int field; + long l, r; + + if (ucislower(code)) + return code; + + if (ucisupper(code)) { + /* + * The character is upper case. + */ + field = 1; + l = 0; + r = _uccase_len[0] - 3; + } else { + /* + * The character is title case. + */ + field = 2; + l = _uccase_len[0] + _uccase_len[1]; + r = _uccase_size - 3; + } + return _uccase_lookup(code, l, r, field); +} + +unsigned long +#ifdef __STDC__ +uctotitle(unsigned long code) +#else +uctotitle(code) +unsigned long code; +#endif +{ + int field; + long l, r; + + if (ucistitle(code)) + return code; + + /* + * The offset will always be the same for converting to title case. + */ + field = 2; + + if (ucisupper(code)) { + /* + * The character is upper case. + */ + l = 0; + r = _uccase_len[0] - 3; + } else { + /* + * The character is lower case. + */ + l = _uccase_len[0]; + r = (l + _uccase_len[1]) - 3; + } + return _uccase_lookup(code, l, r, field); +} + +/************************************************************************** + * + * Support for decompositions. + * + **************************************************************************/ + +static unsigned long _ucdcmp_size; +static unsigned long *_ucdcmp_nodes; +static unsigned long *_ucdcmp_decomp; + +static void +#ifdef __STDC__ +_ucdcmp_load(char *paths, int reload) +#else +_ucdcmp_load(paths, reload) +char *paths; +int reload; +#endif +{ + FILE *in; + unsigned long size, i; + _ucheader_t hdr; + + if (_ucdcmp_size > 0) { + if (!reload) + /* + * The decompositions have already been loaded. + */ + return; + + free((char *) _ucdcmp_nodes); + _ucdcmp_size = 0; + } + + if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0) + return; + + /* + * Load the header. 
+ */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.bytes = endian_long(hdr.size.bytes); + } + + _ucdcmp_size = hdr.cnt << 1; + _ucdcmp_nodes = (unsigned long *) malloc(hdr.size.bytes); + _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1); + + /* + * Read the decomposition data in. + */ + size = hdr.size.bytes / sizeof(unsigned long); + fread((char *) _ucdcmp_nodes, sizeof(unsigned long), size, in); + + /* + * Do an endian swap if necessary. + */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < size; i++) + _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]); + } +} + +static void +#ifdef __STDC__ +_ucdcmp_unload(void) +#else +_ucdcmp_unload() +#endif +{ + if (_ucdcmp_size == 0) + return; + + /* + * Only need to free the offsets because the memory is allocated as a + * single block. + */ + free((char *) _ucdcmp_nodes); + _ucdcmp_size = 0; +} + +int +#ifdef __STDC__ +ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) +#else +ucdecomp(code, num, decomp) +unsigned long code, *num, **decomp; +#endif +{ + long l, r, m; + + l = 0; + r = _ucdcmp_nodes[_ucdcmp_size] - 1; + + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a code+offset pair. 
+ */ + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucdcmp_nodes[m]) + l = m + 2; + else if (code < _ucdcmp_nodes[m]) + r = m - 2; + else if (code == _ucdcmp_nodes[m]) { + *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1]; + *decomp = &_ucdcmp_decomp[_ucdcmp_nodes[m + 1]]; + return 1; + } + } + return 0; +} + +int +#ifdef __STDC__ +ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[]) +#else +ucdecomp_hangul(code, num, decomp) +unsigned long code, *num, decomp[]; +#endif +{ + if (!ucishangul(code)) + return 0; + + code -= 0xac00; + decomp[0] = 0x1100 + (unsigned long) (code / 588); + decomp[1] = 0x1161 + (unsigned long) ((code % 588) / 28); + decomp[2] = 0x11a7 + (unsigned long) (code % 28); + *num = (decomp[2] != 0x11a7) ? 3 : 2; + + return 1; +} + +/************************************************************************** + * + * Support for combining classes. + * + **************************************************************************/ + +static unsigned long _uccmcl_size; +static unsigned long *_uccmcl_nodes; + +static void +#ifdef __STDC__ +_uccmcl_load(char *paths, int reload) +#else +_uccmcl_load(paths, reload) +char *paths; +int reload; +#endif +{ + FILE *in; + unsigned long i; + _ucheader_t hdr; + + if (_uccmcl_size > 0) { + if (!reload) + /* + * The combining classes have already been loaded. + */ + return; + + free((char *) _uccmcl_nodes); + _uccmcl_size = 0; + } + + if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0) + return; + + /* + * Load the header. + */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.bytes = endian_long(hdr.size.bytes); + } + + _uccmcl_size = hdr.cnt * 3; + _uccmcl_nodes = (unsigned long *) malloc(hdr.size.bytes); + + /* + * Read the combining classes in. + */ + fread((char *) _uccmcl_nodes, sizeof(unsigned long), _uccmcl_size, in); + + /* + * Do an endian swap if necessary. 
+ */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < _uccmcl_size; i++) + _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]); + } +} + +static void +#ifdef __STDC__ +_uccmcl_unload(void) +#else +_uccmcl_unload() +#endif +{ + if (_uccmcl_size == 0) + return; + + free((char *) _uccmcl_nodes); + _uccmcl_size = 0; +} + +unsigned long +#ifdef __STDC__ +uccombining_class(unsigned long code) +#else +uccombining_class(code) +unsigned long code; +#endif +{ + long l, r, m; + + l = 0; + r = _uccmcl_size - 1; + + while (l <= r) { + m = (l + r) >> 1; + m -= (m % 3); + if (code > _uccmcl_nodes[m + 1]) + l = m + 3; + else if (code < _uccmcl_nodes[m]) + r = m - 3; + else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1]) + return _uccmcl_nodes[m + 2]; + } + return 0; +} + +/************************************************************************** + * + * Support for numeric values. + * + **************************************************************************/ + +static unsigned long *_ucnum_nodes; +static unsigned long _ucnum_size; +static short *_ucnum_vals; + +static void +#ifdef __STDC__ +_ucnumb_load(char *paths, int reload) +#else +_ucnumb_load(paths, reload) +char *paths; +int reload; +#endif +{ + FILE *in; + unsigned long size, i; + _ucheader_t hdr; + + if (_ucnum_size > 0) { + if (!reload) + /* + * The numbers have already been loaded. + */ + return; + + free((char *) _ucnum_nodes); + _ucnum_size = 0; + } + + if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0) + return; + + /* + * Load the header. + */ + fread((char *) &hdr, sizeof(_ucheader_t), 1, in); + + if (hdr.bom == 0xfffe) { + hdr.cnt = endian_short(hdr.cnt); + hdr.size.bytes = endian_long(hdr.size.bytes); + } + + _ucnum_size = hdr.cnt; + _ucnum_nodes = (unsigned long *) malloc(hdr.size.bytes); + _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size); + + /* + * Read the combining classes in. 
+ */ + fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in); + + /* + * Do an endian swap if necessary. + */ + if (hdr.bom == 0xfffe) { + for (i = 0; i < _ucnum_size; i++) + _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]); + + /* + * Determine the number of values that have to be adjusted. + */ + size = (hdr.size.bytes - + (_ucnum_size * (sizeof(unsigned long) << 1))) / + sizeof(short); + + for (i = 0; i < size; i++) + _ucnum_vals[i] = endian_short(_ucnum_vals[i]); + } +} + +static void +#ifdef __STDC__ +_ucnumb_unload(void) +#else +_ucnumb_unload() +#endif +{ + if (_ucnum_size == 0) + return; + + free((char *) _ucnum_nodes); + _ucnum_size = 0; +} + +int +#ifdef __STDC__ +ucnumber_lookup(unsigned long code, struct ucnumber *num) +#else +ucnumber_lookup(code, num) +unsigned long code; +struct ucnumber *num; +#endif +{ + long l, r, m; + short *vp; + + l = 0; + r = _ucnum_size - 1; + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a code+offset pair. + */ + m = (l + r) >> 1; + m -= (m & 1); + if (code > _ucnum_nodes[m]) + l = m + 2; + else if (code < _ucnum_nodes[m]) + r = m - 2; + else { + vp = _ucnum_vals + _ucnum_nodes[m + 1]; + num->numerator = (int) *vp++; + num->denominator = (int) *vp; + return 1; + } + } + return 0; +} + +int +#ifdef __STDC__ +ucdigit_lookup(unsigned long code, int *digit) +#else +ucdigit_lookup(code, digit) +unsigned long code; +int *digit; +#endif +{ + long l, r, m; + short *vp; + + l = 0; + r = _ucnum_size - 1; + while (l <= r) { + /* + * Determine a "mid" point and adjust to make sure the mid point is at + * the beginning of a code+offset pair. 
/*
 * Convenience wrapper around ucdigit_lookup() that returns the digit value
 * directly.
 *
 * When the lookup fails the arbitrary marker -111 is returned.  A caller
 * that needs to be sure the code is a digit should test it with the
 * ucisdigit() macro before calling this function.
 */
int
#ifdef __STDC__
ucgetdigit(unsigned long code)
#else
ucgetdigit(code)
unsigned long code;
#endif
{
    int value;

    /*
     * ucdigit_lookup() stores a value only when it returns non-zero, so the
     * marker is supplied here for the failure case.
     */
    return ucdigit_lookup(code, &value) ? value : -111;
}
+ * + **************************************************************************/ + +void +#ifdef __STDC__ +ucdata_load(char *paths, int masks) +#else +ucdata_load(paths, masks) +char *paths; +int masks; +#endif +{ + if (masks & UCDATA_CTYPE) + _ucprop_load(paths, 0); + if (masks & UCDATA_CASE) + _uccase_load(paths, 0); + if (masks & UCDATA_DECOMP) + _ucdcmp_load(paths, 0); + if (masks & UCDATA_CMBCL) + _uccmcl_load(paths, 0); + if (masks & UCDATA_NUM) + _ucnumb_load(paths, 0); +} + +void +#ifdef __STDC__ +ucdata_unload(int masks) +#else +ucdata_unload(masks) +int masks; +#endif +{ + if (masks & UCDATA_CTYPE) + _ucprop_unload(); + if (masks & UCDATA_CASE) + _uccase_unload(); + if (masks & UCDATA_DECOMP) + _ucdcmp_unload(); + if (masks & UCDATA_CMBCL) + _uccmcl_unload(); + if (masks & UCDATA_NUM) + _ucnumb_unload(); +} + +void +#ifdef __STDC__ +ucdata_reload(char *paths, int masks) +#else +ucdata_reload(paths, masks) +char *paths; +int masks; +#endif +{ + if (masks & UCDATA_CTYPE) + _ucprop_load(paths, 1); + if (masks & UCDATA_CASE) + _uccase_load(paths, 1); + if (masks & UCDATA_DECOMP) + _ucdcmp_load(paths, 1); + if (masks & UCDATA_CMBCL) + _uccmcl_load(paths, 1); + if (masks & UCDATA_NUM) + _ucnumb_load(paths, 1); +} + +#ifdef TEST + +void +#ifdef __STDC__ +main(void) +#else +main() +#endif +{ + int dig; + unsigned long i, lo, *dec; + struct ucnumber num; + + ucdata_setup("."); + + if (ucisweak(0x30)) + printf("WEAK\n"); + else + printf("NOT WEAK\n"); + + printf("LOWER 0x%04lX\n", uctolower(0xff3a)); + printf("UPPER 0x%04lX\n", uctoupper(0xff5a)); + + if (ucisalpha(0x1d5)) + printf("ALPHA\n"); + else + printf("NOT ALPHA\n"); + + if (ucisupper(0x1d5)) { + printf("UPPER\n"); + lo = uctolower(0x1d5); + printf("0x%04lx\n", lo); + lo = uctotitle(0x1d5); + printf("0x%04lx\n", lo); + } else + printf("NOT UPPER\n"); + + if (ucistitle(0x1d5)) + printf("TITLE\n"); + else + printf("NOT TITLE\n"); + + if (uciscomposite(0x1d5)) + printf("COMPOSITE\n"); + else + printf("NOT 
COMPOSITE\n"); + + if (ucdecomp(0x1d5, &lo, &dec)) { + for (i = 0; i < lo; i++) + printf("0x%04lx ", dec[i]); + putchar('\n'); + } + + if ((lo = uccombining_class(0x41)) != 0) + printf("0x41 CCL %ld\n", lo); + + if (ucisxdigit(0xfeff)) + printf("0xFEFF HEX DIGIT\n"); + else + printf("0xFEFF NOT HEX DIGIT\n"); + + if (ucisdefined(0x10000)) + printf("0x10000 DEFINED\n"); + else + printf("0x10000 NOT DEFINED\n"); + + if (ucnumber_lookup(0x30, &num)) { + if (num.numerator != num.denominator) + printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); + else + printf("UCNUMBER: 0x30 = %d\n", num.numerator); + } else + printf("UCNUMBER: 0x30 NOT A NUMBER\n"); + + if (ucnumber_lookup(0xbc, &num)) { + if (num.numerator != num.denominator) + printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); + else + printf("UCNUMBER: 0xbc = %d\n", num.numerator); + } else + printf("UCNUMBER: 0xbc NOT A NUMBER\n"); + + + if (ucnumber_lookup(0xff19, &num)) { + if (num.numerator != num.denominator) + printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); + else + printf("UCNUMBER: 0xff19 = %d\n", num.numerator); + } else + printf("UCNUMBER: 0xff19 NOT A NUMBER\n"); + + if (ucnumber_lookup(0x4e00, &num)) { + if (num.numerator != num.denominator) + printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator); + else + printf("UCNUMBER: 0x4e00 = %d\n", num.numerator); + } else + printf("UCNUMBER: 0x4e00 NOT A NUMBER\n"); + + if (ucdigit_lookup(0x06f9, &dig)) + printf("UCDIGIT: 0x6f9 = %d\n", dig); + else + printf("UCDIGIT: 0x6f9 NOT A NUMBER\n"); + + dig = ucgetdigit(0x0969); + printf("UCGETDIGIT: 0x969 = %d\n", dig); + + num = ucgetnumber(0x30); + if (num.numerator != num.denominator) + printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); + else + printf("UCGETNUMBER: 0x30 = %d\n", num.numerator); + + num = ucgetnumber(0xbc); + if (num.numerator != num.denominator) + printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, 
num.denominator); + else + printf("UCGETNUMBER: 0xbc = %d\n", num.numerator); + + num = ucgetnumber(0xff19); + if (num.numerator != num.denominator) + printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); + else + printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator); + + ucdata_cleanup(); + exit(0); +} + +#endif /* TEST */ diff --git a/libraries/liblunicode/ucdata/ucdata.h b/libraries/liblunicode/ucdata/ucdata.h new file mode 100644 index 0000000000..57b6999bcc --- /dev/null +++ b/libraries/liblunicode/ucdata/ucdata.h @@ -0,0 +1,306 @@ +/* + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef _h_ucdata +#define _h_ucdata + +/* + * $Id: ucdata.h,v 1.5 1999/11/19 15:24:29 mleisher Exp $ + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#undef __ +#ifdef __STDC__ +#define __(x) x +#else +#define __(x) () +#endif + +#define UCDATA_VERSION "2.3" + +/************************************************************************** + * + * Masks and macros for character properties. + * + **************************************************************************/ + +/* + * Values that can appear in the `mask1' parameter of the ucisprop() + * function. + */ +#define UC_MN 0x00000001 /* Mark, Non-Spacing */ +#define UC_MC 0x00000002 /* Mark, Spacing Combining */ +#define UC_ME 0x00000004 /* Mark, Enclosing */ +#define UC_ND 0x00000008 /* Number, Decimal Digit */ +#define UC_NL 0x00000010 /* Number, Letter */ +#define UC_NO 0x00000020 /* Number, Other */ +#define UC_ZS 0x00000040 /* Separator, Space */ +#define UC_ZL 0x00000080 /* Separator, Line */ +#define UC_ZP 0x00000100 /* Separator, Paragraph */ +#define UC_CC 0x00000200 /* Other, Control */ +#define UC_CF 0x00000400 /* Other, Format */ +#define UC_OS 0x00000800 /* Other, Surrogate */ +#define UC_CO 0x00001000 /* Other, Private Use */ +#define UC_CN 0x00002000 /* Other, Not Assigned */ +#define UC_LU 0x00004000 /* Letter, Uppercase */ +#define UC_LL 0x00008000 /* Letter, Lowercase */ +#define UC_LT 0x00010000 /* Letter, Titlecase */ +#define UC_LM 0x00020000 /* Letter, Modifier */ +#define UC_LO 0x00040000 /* Letter, Other */ +#define UC_PC 0x00080000 /* Punctuation, Connector */ +#define UC_PD 0x00100000 /* Punctuation, Dash */ +#define UC_PS 0x00200000 /* Punctuation, Open */ +#define UC_PE 0x00400000 /* Punctuation, Close */ +#define UC_PO 0x00800000 /* Punctuation, Other */ +#define UC_SM 0x01000000 /* Symbol, Math */ +#define UC_SC 0x02000000 /* Symbol, Currency */ +#define UC_SK 0x04000000 /* Symbol, Modifier */ +#define UC_SO 0x08000000 /* Symbol, Other */ +#define UC_L 0x10000000 /* 
Left-To-Right */ +#define UC_R 0x20000000 /* Right-To-Left */ +#define UC_EN 0x40000000 /* European Number */ +#define UC_ES 0x80000000 /* European Number Separator */ + +/* + * Values that can appear in the `mask2' parameter of the ucisprop() + * function. + */ +#define UC_ET 0x00000001 /* European Number Terminator */ +#define UC_AN 0x00000002 /* Arabic Number */ +#define UC_CS 0x00000004 /* Common Number Separator */ +#define UC_B 0x00000008 /* Block Separator */ +#define UC_S 0x00000010 /* Segment Separator */ +#define UC_WS 0x00000020 /* Whitespace */ +#define UC_ON 0x00000040 /* Other Neutrals */ +/* + * Implementation specific character properties. + */ +#define UC_CM 0x00000080 /* Composite */ +#define UC_NB 0x00000100 /* Non-Breaking */ +#define UC_SY 0x00000200 /* Symmetric */ +#define UC_HD 0x00000400 /* Hex Digit */ +#define UC_QM 0x00000800 /* Quote Mark */ +#define UC_MR 0x00001000 /* Mirroring */ +#define UC_SS 0x00002000 /* Space, other */ + +#define UC_CP 0x00004000 /* Defined */ + +/* + * Added for UnicodeData-2.1.3. + */ +#define UC_PI 0x00008000 /* Punctuation, Initial */ +#define UC_PF 0x00010000 /* Punctuation, Final */ + +/* + * This is the primary function for testing to see if a character has some set + * of properties. The macros that test for various character properties all + * call this function with some set of masks. 
+ */ +extern int ucisprop __((unsigned long code, unsigned long mask1, + unsigned long mask2)); + +#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0) +#define ucisdigit(cc) ucisprop(cc, UC_ND, 0) +#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0) +#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0) +#define ucisspace(cc) ucisprop(cc, UC_ZS|UC_SS, 0) +#define ucisblank(cc) ucisprop(cc, UC_ZS, 0) +#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF) +#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\ + UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\ + UC_PS|UC_PE|UC_PO|UC_SM|UC_SM|UC_SC|UC_SK|\ + UC_SO, UC_PI|UC_PF) +#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\ + UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\ + UC_PS|UC_PE|UC_PO|UC_SM|UC_SM|UC_SC|UC_SK|\ + UC_SO|UC_ZS, UC_PI|UC_PF) +#define ucisupper(cc) ucisprop(cc, UC_LU, 0) +#define ucislower(cc) ucisprop(cc, UC_LL, 0) +#define ucistitle(cc) ucisprop(cc, UC_LT, 0) +#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD) + +#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0) +#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0) + +#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0) +#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0) +#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0) +#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0) +#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0) +#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI) +#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF) + +#define uciscomposite(cc) ucisprop(cc, 0, UC_CM) +#define ucishex(cc) ucisprop(cc, 0, UC_HD) +#define ucisquote(cc) ucisprop(cc, 0, UC_QM) +#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY) +#define ucismirroring(cc) ucisprop(cc, 0, UC_MR) +#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB) + +/* + * Directionality macros. 
+ */
+#define ucisrtl(cc) ucisprop(cc, UC_R, 0)
+#define ucisltr(cc) ucisprop(cc, UC_L, 0)
+#define ucisstrong(cc) ucisprop(cc, UC_L|UC_R, 0)
+#define ucisweak(cc) ucisprop(cc, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS)
+#define ucisneutral(cc) ucisprop(cc, 0, UC_B|UC_S|UC_WS|UC_ON)
+#define ucisseparator(cc) ucisprop(cc, 0, UC_B|UC_S)
+
+/*
+ * Other macros inspired by John Cowan.
+ */
+#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
+#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
+#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
+#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
+#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
+#define ucismath(cc) ucisprop(cc, UC_SM, 0)
+#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
+#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
+#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
+#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
+#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
+#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
+#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
+#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
+#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)
+
+#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
+#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
+                                   UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
+
+/*
+ * The expansion of ucisundefined() is parenthesized so the negation cannot
+ * bind to anything in the expression the macro is used in.
+ */
+#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
+#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
+
+/*
+ * Other miscellaneous character property macros.
+ *
+ * Note that these evaluate their argument more than once, so do not pass an
+ * expression with side effects.
+ *
+ * The precomposed Hangul syllables occupy 0xac00 through 0xd7a3 (11172
+ * codes); the codes above 0xd7a3 are not syllables and cannot be decomposed
+ * arithmetically.
+ */
+#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
+                     ((cc) >= 0xf900 && (cc) <= 0xfaff))
+#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7a3)
+
+/**************************************************************************
+ *
+ * Functions for case conversion.
+ * + **************************************************************************/ + +extern unsigned long uctoupper __((unsigned long code)); +extern unsigned long uctolower __((unsigned long code)); +extern unsigned long uctotitle __((unsigned long code)); + +/************************************************************************** + * + * Functions for getting decompositions. + * + **************************************************************************/ + +/* + * This routine determines if the code has a decomposition. If it returns 0, + * there is no decomposition. Any other value indicates a decomposition was + * returned. + */ +extern int ucdecomp __((unsigned long code, unsigned long *num, + + unsigned long **decomp)); + +/* + * If the code is a Hangul syllable, this routine decomposes it into the array + * passed. The array size should be at least 3. + */ +extern int ucdecomp_hangul __((unsigned long code, unsigned long *num, + unsigned long decomp[])); + +/************************************************************************** + * + * Functions for getting combining classes. + * + **************************************************************************/ + +/* + * This will return the combining class for a character to be used with the + * Canonical Ordering algorithm. + */ +extern unsigned long uccombining_class __((unsigned long code)); + +/************************************************************************** + * + * Functions for getting numbers and digits. + * + **************************************************************************/ + +struct ucnumber { + int numerator; + int denominator; +}; + +extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num)); +extern int ucdigit_lookup __((unsigned long code, int *digit)); + +/* + * For compatibility with John Cowan's "uctype" package. 
+ */ +extern struct ucnumber ucgetnumber __((unsigned long code)); +extern int ucgetdigit __((unsigned long code)); + +/************************************************************************** + * + * Functions library initialization and cleanup. + * + **************************************************************************/ + +/* + * Macros for specifying the data tables to be loaded, unloaded, or reloaded + * by the ucdata_load(), ucdata_unload(), and ucdata_reload() routines. + */ +#define UCDATA_CASE 0x01 +#define UCDATA_CTYPE 0x02 +#define UCDATA_DECOMP 0x04 +#define UCDATA_CMBCL 0x08 +#define UCDATA_NUM 0x10 + +#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\ + UCDATA_CMBCL|UCDATA_NUM) + +/* + * Functions to load, unload, and reload specific data files. + */ +extern void ucdata_load __((char *paths, int mask)); +extern void ucdata_unload __((int mask)); +extern void ucdata_reload __((char *paths, int mask)); + +/* + * Deprecated functions, now just compatibility macros. 
+ */ +#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL) +#define ucdata_cleanup() ucdata_unload(UCDATA_ALL) + +#undef __ + +#ifdef __cplusplus +} +#endif + +#endif /* _h_ucdata */ diff --git a/libraries/liblunicode/ucdata/ucdata.man b/libraries/liblunicode/ucdata/ucdata.man new file mode 100644 index 0000000000..f0c5aaa068 --- /dev/null +++ b/libraries/liblunicode/ucdata/ucdata.man @@ -0,0 +1,464 @@ +.\" +.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $ +.\" +.TH ucdata 3 "19 November 1999" +.SH NAME +ucdata \- package for providing Unicode/ISO10646 character information + +.SH SYNOPSIS +#include +.sp +void ucdata_load(char * paths, int masks) +.sp +void ucdata_unload(int masks) +.sp +void ucdata_reload(char * paths, int masks) +.sp +int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp) +.sp +int ucdecomp_hangul(unsigned long code, unsigned long *num, +unsigned long decomp[]) +.sp +.nf +struct ucnumber { + int numerator; + int denominator; +}; +.sp +int ucnumber_lookup(unsigned long code, struct ucnumber *num) +.sp +int ucdigit_lookup(unsigned long code, int *digit) +.sp +struct ucnumber ucgetnumber(unsigned long code) +.sp +int ucgetdigit(unsigned long code) +.sp +unsigned long uctoupper(unsigned long code) +.sp +unsigned long uctolower(unsigned long code) +.sp +unsigned long uctotitle(unsigned long code) +.sp +int ucisalpha(unsigned long code) +.sp +int ucisalnum(unsigned long code) +.sp +int ucisdigit(unsigned long code) +.sp +int uciscntrl(unsigned long code) +.sp +int ucisspace(unsigned long code) +.sp +int ucisblank(unsigned long code) +.sp +int ucispunct(unsigned long code) +.sp +int ucisgraph(unsigned long code) +.sp +int ucisprint(unsigned long code) +.sp +int ucisxdigit(unsigned long code) +.sp +int ucisupper(unsigned long code) +.sp +int ucislower(unsigned long code) +.sp +int ucistitle(unsigned long code) +.sp +int ucisisocntrl(unsigned long code) +.sp +int ucisfmtcntrl(unsigned long code) +.sp +int 
ucissymbol(unsigned long code) +.sp +int ucisnumber(unsigned long code) +.sp +int ucisnonspacing(unsigned long code) +.sp +int ucisopenpunct(unsigned long code) +.sp +int ucisclosepunct(unsigned long code) +.sp +int ucisinitialpunct(unsigned long code) +.sp +int ucisfinalpunct(unsigned long code) +.sp +int uciscomposite(unsigned long code) +.sp +int ucisquote(unsigned long code) +.sp +int ucissymmetric(unsigned long code) +.sp +int ucismirroring(unsigned long code) +.sp +int ucisnonbreaking(unsigned long code) +.sp +int ucisrtl(unsigned long code) +.sp +int ucisltr(unsigned long code) +.sp +int ucisstrong(unsigned long code) +.sp +int ucisweak(unsigned long code) +.sp +int ucisneutral(unsigned long code) +.sp +int ucisseparator(unsigned long code) +.sp +int ucislsep(unsigned long code) +.sp +int ucispsep(unsigned long code) +.sp +int ucismark(unsigned long code) +.sp +int ucisnsmark(unsigned long code) +.sp +int ucisspmark(unsigned long code) +.sp +int ucismodif(unsigned long code) +.sp +int ucismodifsymbol(unsigned long code) +.sp +int ucisletnum(unsigned long code) +.sp +int ucisconnect(unsigned long code) +.sp +int ucisdash(unsigned long code) +.sp +int ucismath(unsigned long code) +.sp +int uciscurrency(unsigned long code) +.sp +int ucisenclosing(unsigned long code) +.sp +int ucisprivate(unsigned long code) +.sp +int ucissurrogate(unsigned long code) +.sp +int ucisidentstart(unsigned long code) +.sp +int ucisidentpart(unsigned long code) +.sp +int ucisdefined(unsigned long code) +.sp +int ucisundefined(unsigned long code) +.sp +int ucishan(unsigned long code) +.sp +int ucishangul(unsigned long code) + +.SH DESCRIPTION +.TP 4 +.BR Macros +.br +UCDATA_CASE +.br +UCDATA_CTYPE +.br +UCDATA_DECOMP +.br +UCDATA_CMBCL +.br +UCDATA_NUM +.br +UCDATA_ALL +.br +.TP 4 +.BR ucdata_load() +This function initializes the UCData library by locating the data files in one +of the colon-separated directories in the `paths' parameter. 
The data files +to be loaded are specified in the `masks' parameter as a bitwise combination +of the macros listed above. +.sp +This should be called before using any of the other functions. +.TP 4 +.BR ucdata_unload() +This function unloads the data tables specified in the `masks' parameter. +.sp +This function should be called when the application is done using the UCData +package. +.TP 4 +.BR ucdata_reload() +This function reloads the data files from one of the colon-separated +directories in the `paths' parameter. The data files to be reloaded are +specified in the `masks' parameter as a bitwise combination of the macros +listed above. +.TP 4 +.BR ucdecomp() +This function determines if a character has a decomposition and returns the +decomposition information if it exists. +.sp +If a zero is returned, there is no decomposition. If a non-zero is +returned, then the `num' and `decomp' variables are filled in with the +appropriate values. +.sp +Example call: +.sp +.nf + unsigned long i, num, *decomp; + + if (ucdecomp(0x1d5, &num, &decomp) != 0) { + for (i = 0; i < num; i++) + printf("0x%08lX,", decomp[i]); + putchar('\n'); + } +.TP 4 +.BR ucdecomp_hangul() +This function determines if a Hangul syllable has a +decomposition and returns the decomposition information. +.sp +An array of at least size 3 should be passed to the function +for the decomposition of the syllable. +.sp +If a zero is returned, the character is not a Hangul +syllable. If a non-zero is returned, the `num' field +will be 2 or 3 and the syllable will be decomposed into +the `decomp' array arithmetically. +.sp +Example call: +.sp +.nf + unsigned long i, num, decomp[3]; + + if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) { + for (i = 0; i < num; i++) + printf("0x%08lX,", decomp[i]); + putchar('\n'); + } +.TP 4 +.BR ucnumber_lookup() +This function determines if the code is a number and +fills in the `num' field with the numerator and +denominator. 
If the code happens to be a single digit, +the numerator and denominator fields will be the same. +.sp +If the function returns 0, the code is not a number. +Any other return value means the code is a number. +.TP 4 +.BR ucdigit_lookup() +This function determines if the code is a digit and +fills in the `digit' field with the digit value. +.sp +If the function returns 0, the code is not a digit. +Any other return value means the code is a digit. +.TP 4 +.BR ucgetnumber() +This is a compatibility function with John Cowan's +"uctype" package. It uses ucnumber_lookup(). +.TP 4 +.BR ucgetdigit() +This is a compatibility function with John Cowan's +"uctype" package. It uses ucdigit_lookup(). +.TP 4 +.BR uctoupper() +This function returns the code unchanged if it is +already upper case or has no upper case equivalent. +Otherwise the upper case equivalent is returned. +.TP 4 +.BR uctolower() +This function returns the code unchanged if it is +already lower case or has no lower case equivalent. +Otherwise the lower case equivalent is returned. +.TP 4 +.BR uctotitle() +This function returns the code unchanged if it is +already title case or has no title case equivalent. +Otherwise the title case equivalent is returned. +.TP 4 +.BR ucisalpha() +Test if \fIcode\fR is an alpha character. +.TP 4 +.BR ucisalnum() +Test if \fIcode\fR is an alpha or digit character. +.TP 4 +.BR ucisdigit() +Test if \fIcode\fR is a digit character. +.TP 4 +.BR uciscntrl() +Test if \fIcode\fR is a control character. +.TP 4 +.BR ucisspace() +Test if \fIcode\fR is a space character. +.TP 4 +.BR ucisblank() +Test if \fIcode\fR is a blank character. +.TP 4 +.BR ucispunct() +Test if \fIcode\fR is a punctuation character. +.TP 4 +.BR ucisgraph() +Test if \fIcode\fR is a graphical (visible) character. +.TP 4 +.BR ucisprint() +Test if \fIcode\fR is a printable character. +.TP 4 +.BR ucisxdigit() +Test if \fIcode\fR is a hexadecimal digit character. 
+.TP 4 +.BR ucisupper() +Test if \fIcode\fR is an upper case character. +.TP 4 +.BR ucislower() +Test if \fIcode\fR is a lower case character. +.TP 4 +.BR ucistitle() +Test if \fIcode\fR is a title case character. +.TP 4 +.BR ucisisocntrl() +Is the character a C0 control character (< 32)? +.TP 4 +.BR ucisfmtcntrl() +Is the character a format control character? +.TP 4 +.BR ucissymbol() +Is the character a symbol? +.TP 4 +.BR ucisnumber() +Is the character a number or digit? +.TP 4 +.BR ucisnonspacing() +Is the character non-spacing? +.TP 4 +.BR ucisopenpunct() +Is the character an open/left punctuation (e.g. '[')? +.TP 4 +.BR ucisclosepunct() +Is the character a close/right punctuation (e.g. ']')? +.TP 4 +.BR ucisinitialpunct() +Is the character an initial punctuation (e.g. U+2018 LEFT +SINGLE QUOTATION MARK)? +.TP 4 +.BR ucisfinalpunct() +Is the character a final punctuation (e.g. U+2019 RIGHT +SINGLE QUOTATION MARK)? +.TP 4 +.BR uciscomposite() +Can the character be decomposed into a set of other +characters? +.TP 4 +.BR ucisquote() +Is the character one of the many quotation marks? +.TP 4 +.BR ucissymmetric() +Is the character one that has an opposite form +(e.g. <>)? +.TP 4 +.BR ucismirroring() +Is the character mirroring (superset of symmetric)? +.TP 4 +.BR ucisnonbreaking() +Is the character non-breaking (e.g. non-breaking +space)? +.TP 4 +.BR ucisrtl() +Does the character have strong right-to-left +directionality (e.g. Arabic letters)? +.TP 4 +.BR ucisltr() +Does the character have strong left-to-right +directionality (e.g. Latin letters)? +.TP 4 +.BR ucisstrong() +Does the character have strong directionality? +.TP 4 +.BR ucisweak() +Does the character have weak directionality +(e.g. numbers)? +.TP 4 +.BR ucisneutral() +Does the character have neutral directionality +(e.g. whitespace)? +.TP 4 +.BR ucisseparator() +Is the character a block or segment separator? +.TP 4 +.BR ucislsep() +Is the character a line separator? 
+.TP 4 +.BR ucispsep() +Is the character a paragraph separator? +.TP 4 +.BR ucismark() +Is the character a mark of some kind? +.TP 4 +.BR ucisnsmark() +Is the character a non-spacing mark? +.TP 4 +.BR ucisspmark() +Is the character a spacing mark? +.TP 4 +.BR ucismodif() +Is the character a modifier letter? +.TP 4 +.BR ucismodifsymbol() +Is the character a modifier symbol? +.TP 4 +.BR ucisletnum() +Is the character a number represented by a letter? +.TP 4 +.BR ucisconnect() +Is the character connecting punctuation? +.TP 4 +.BR ucisdash() +Is the character dash punctuation? +.TP 4 +.BR ucismath() +Is the character a math character? +.TP 4 +.BR uciscurrency() +Is the character a currency character? +.TP 4 +.BR ucisenclosing() +Is the character enclosing (i.e. enclosing box)? +.TP 4 +.BR ucisprivate() +Is the character from the Private Use Area? +.TP 4 +.BR ucissurrogate() +Is the character one of the surrogate codes? +.TP 4 +.BR ucisidentstart() +Is the character a legal initial character of an identifier? +.TP 4 +.BR ucisidentpart() +Is the character a legal identifier character? +.TP 4 +.BR ucisdefined() +Is the character defined (appeared in one of the data +files)? +.TP 4 +.BR ucisundefined() +Is the character not defined (non-Unicode)? +.TP 4 +.BR ucishan() +Is the character a Han ideograph? +.TP 4 +.BR ucishangul() +Is the character a pre-composed Hangul syllable? + +.SH "SEE ALSO" +ctype(3) + +.SH ACKNOWLEDGMENTS +These are people who have helped with patches or +alerted me about problems. +.sp +John Cowan +.br +Bob Verbrugge +.br +Christophe Pierret +.br +Kent Johnson +.br +Valeriy E. 
Ushakov + +.SH AUTHOR +Mark Leisher +.br +Computing Research Lab +.br +New Mexico State University +.br +Email: mleisher@crl.nmsu.edu diff --git a/libraries/liblunicode/ucdata/ucgendat.c b/libraries/liblunicode/ucdata/ucgendat.c new file mode 100644 index 0000000000..64cb153a8d --- /dev/null +++ b/libraries/liblunicode/ucdata/ucgendat.c @@ -0,0 +1,1485 @@ +/* + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef lint +#ifdef __GNUC__ +static char rcsid[] __attribute__ ((unused)) = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $"; +#else +static char rcsid[] = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $"; +#endif +#endif + +#include +#include +#include +#ifndef WIN32 +#include +#endif + +#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ + ((cc) >= 'A' && (cc) <= 'F') ||\ + ((cc) >= 'a' && (cc) <= 'f')) + +/* + * A header written to the output file with the byte-order-mark and the number + * of property nodes. + */ +static unsigned short hdr[2] = {0xfeff, 0}; + +#define NUMPROPS 49 +#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) + +typedef struct { + char *name; + int len; +} _prop_t; + +/* + * List of properties expected to be found in the Unicode Character Database + * including some implementation specific properties. + * + * The implementation specific properties are: + * Cm = Composed (can be decomposed) + * Nb = Non-breaking + * Sy = Symmetric (has left and right forms) + * Hd = Hex digit + * Qm = Quote marks + * Mr = Mirroring + * Ss = Space, other + * Cp = Defined character + */ +static _prop_t props[NUMPROPS] = { + {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2}, + {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, + {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, + {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, + {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1}, + {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1}, + {"S", 1}, {"WS", 2}, {"ON", 2}, + {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2}, + {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2} +}; + +typedef struct { + unsigned long *ranges; + unsigned short used; + unsigned short size; +} _ranges_t; + +static _ranges_t proptbl[NUMPROPS]; + +/* + * Make sure this array is sized to be on a 4-byte boundary at compile time. 
+ */ +static unsigned short propcnt[NEEDPROPS]; + +/* + * Array used to collect a decomposition before adding it to the decomposition + * table. + */ +static unsigned long dectmp[64]; +static unsigned long dectmp_size; + +typedef struct { + unsigned long code; + unsigned short size; + unsigned short used; + unsigned long *decomp; +} _decomp_t; + +/* + * List of decomposition. Created and expanded in order as the characters are + * encountered. + */ +static _decomp_t *decomps; +static unsigned long decomps_used; +static unsigned long decomps_size; + +/* + * Types and lists for handling lists of case mappings. + */ +typedef struct { + unsigned long key; + unsigned long other1; + unsigned long other2; +} _case_t; + +static _case_t *upper; +static _case_t *lower; +static _case_t *title; +static unsigned long upper_used; +static unsigned long upper_size; +static unsigned long lower_used; +static unsigned long lower_size; +static unsigned long title_used; +static unsigned long title_size; + +/* + * Array used to collect case mappings before adding them to a list. + */ +static unsigned long cases[3]; + +/* + * An array to hold ranges for combining classes. + */ +static unsigned long *ccl; +static unsigned long ccl_used; +static unsigned long ccl_size; + +/* + * Structures for handling numbers. + */ +typedef struct { + unsigned long code; + unsigned long idx; +} _codeidx_t; + +typedef struct { + short numerator; + short denominator; +} _num_t; + +/* + * Arrays to hold the mapping of codes to numbers. + */ +static _codeidx_t *ncodes; +static unsigned long ncodes_used; +static unsigned long ncodes_size; + +static _num_t *nums; +static unsigned long nums_used; +static unsigned long nums_size; + +/* + * Array for holding numbers. 
+ */
+static _num_t *nums;
+static unsigned long nums_used;
+static unsigned long nums_size;
+
+/*
+ * Adds the inclusive code range [start, end] to the range lists of up to two
+ * properties.
+ *
+ *   p1 - name of the first property; always matched as a 2 character name.
+ *   p2 - name of the second property, matched as a 1 character name, or 0
+ *        if there is no second property.
+ *
+ * A name that does not appear in `props' is ignored.  Each range list is
+ * kept as (start, end) pairs sorted by start value.
+ */
+static void
+#ifdef __STDC__
+add_range(unsigned long start, unsigned long end, char *p1, char *p2)
+#else
+add_range(start, end, p1, p2)
+unsigned long start, end;
+char *p1, *p2;
+#endif
+{
+    int i, j, k, len;
+    _ranges_t *rlp;
+    char *name;
+
+    for (k = 0; k < 2; k++) {
+        if (k == 0) {
+            name = p1;
+            len = 2;
+        } else {
+            if (p2 == 0)
+              break;
+
+            name = p2;
+            len = 1;
+        }
+
+        for (i = 0; i < NUMPROPS; i++) {
+            if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
+              break;
+        }
+
+        if (i == NUMPROPS)
+          continue;
+
+        rlp = &proptbl[i];
+
+        /*
+         * Resize the range list if necessary.  The list grows 8 entries at a
+         * time and `used' is always even, so there is room for one more pair
+         * after this.
+         */
+        if (rlp->used == rlp->size) {
+            if (rlp->size == 0)
+              rlp->ranges = (unsigned long *)
+                  malloc(sizeof(unsigned long) << 3);
+            else
+              rlp->ranges = (unsigned long *)
+                  realloc((char *) rlp->ranges,
+                          sizeof(unsigned long) * (rlp->size + 8));
+            rlp->size += 8;
+        }
+
+        /*
+         * If this is the first range for this property list, just add it
+         * and move on to the next property.
+         */
+        if (rlp->used == 0) {
+            rlp->ranges[0] = start;
+            rlp->ranges[1] = end;
+            rlp->used += 2;
+            continue;
+        }
+
+        /*
+         * Optimize the case of adding the range to the end.
+         */
+        j = rlp->used - 1;
+        if (start > rlp->ranges[j]) {
+            j = rlp->used;
+            rlp->ranges[j++] = start;
+            rlp->ranges[j++] = end;
+            rlp->used = j;
+            continue;
+        }
+
+        /*
+         * Need to locate the insertion point.
+         */
+        for (i = 0;
+             i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
+
+        /*
+         * If the start value lies in (or directly after) the current range,
+         * extend that range to the end value passed as a parameter.  The
+         * range is never shrunk, and the loop continues instead of returning
+         * so that the second property still gets the range.
+         */
+        if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
+            if (end > rlp->ranges[i + 1])
+              rlp->ranges[i + 1] = end;
+            continue;
+        }
+
+        /*
+         * Shift following values up by two.
+         */
+        for (j = rlp->used; j > i; j -= 2) {
+            rlp->ranges[j] = rlp->ranges[j - 2];
+            rlp->ranges[j + 1] = rlp->ranges[j - 1];
+        }
+
+        /*
+         * Add the new range at the insertion point.
+         */
+        rlp->ranges[i] = start;
+        rlp->ranges[i + 1] = end;
+        rlp->used += 2;
+    }
+}
+
+static void
+#ifdef __STDC__
+ordered_range_insert(unsigned long c, char *name, int len)
+#else
+ordered_range_insert(c, name, len)
+unsigned long c;
+char *name;
+int len;
+#endif
+{
+    int i, j;
+    unsigned long s, e;
+    _ranges_t *rlp;
+
+    if (len == 0)
+      return;
+
+    /*
+     * Deal with directionality codes introduced in Unicode 3.0.
+     */
+    if (len == 2) {
+        if (memcmp(name, "AL", 2) == 0) {
+            /*
+             * Mark the Arabic letters as having RTL directionality.
+             */
+            len = 1;
+            name = "R";
+        } else if (memcmp(name, "BN", 2) == 0) {
+            /*
+             * Mark the control characters as being Other Neutrals.
+             */
+            len = 2;
+            name = "ON";
+        }
+    } else if (len == 3 &&
+               (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
+                memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
+                memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0)) {
+        /*
+         * Mark all of these as Other Neutral to preserve compatibility with
+         * older versions.
+         */
+        len = 2;
+        name = "ON";
+    }
+
+    for (i = 0; i < NUMPROPS; i++) {
+        if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
+          break;
+    }
+
+    if (i == NUMPROPS)
+      return;
+
+    /*
+     * Have a match, so insert the code in order.
+     */
+    rlp = &proptbl[i];
+
+    /*
+     * Resize the range list if necessary.
+     */
+    if (rlp->used == rlp->size) {
+        if (rlp->size == 0)
+          rlp->ranges = (unsigned long *)
+              malloc(sizeof(unsigned long) << 3);
+        else
+          rlp->ranges = (unsigned long *)
+              realloc((char *) rlp->ranges,
+                      sizeof(unsigned long) * (rlp->size + 8));
+        rlp->size += 8;
+    }
+
+    /*
+     * If this is the first code for this property list, just add it
+     * and return.
+     */
+    if (rlp->used == 0) {
+        rlp->ranges[0] = rlp->ranges[1] = c;
+        rlp->used += 2;
+        return;
+    }
+
+    /*
+     * Optimize the cases of extending the last range and adding new ranges to
+     * the end.
+     */
+    j = rlp->used - 1;
+    e = rlp->ranges[j];
+    s = rlp->ranges[j - 1];
+
+    if (c == e + 1) {
+        /*
+         * Extend the last range.
+         */
+        rlp->ranges[j] = c;
+        return;
+    }
+
+    if (c > e + 1) {
+        /*
+         * Start another range on the end.
+         */
+        j = rlp->used;
+        rlp->ranges[j] = rlp->ranges[j + 1] = c;
+        rlp->used += 2;
+        return;
+    }
+
+    if (c >= s)
+      /*
+       * The code is a duplicate of a code in the last range, so just return.
+       */
+      return;
+
+    /*
+     * The code should be inserted somewhere before the last range in the
+     * list. Locate the insertion point.
+     */
+    for (i = 0;
+         i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
+
+    s = rlp->ranges[i];
+    e = rlp->ranges[i + 1];
+
+    if (c == e + 1)
+      /*
+       * Simply extend the current range.
+       */
+      rlp->ranges[i + 1] = c;
+    else if (c < s) {
+        /*
+         * Add a new entry before the current location. Shift all entries
+         * before the current one up by one to make room.
+         */
+        for (j = rlp->used; j > i; j -= 2) {
+            rlp->ranges[j] = rlp->ranges[j - 2];
+            rlp->ranges[j + 1] = rlp->ranges[j - 1];
+        }
+        rlp->ranges[i] = rlp->ranges[i + 1] = c;
+
+        rlp->used += 2;
+    }
+}
+
+/*
+ * Stores the decomposition collected in `dectmp' (`dectmp_size' values) for
+ * `code' in the `decomps' list, which is kept sorted by code.  If `code' is
+ * already in the list its decomposition is replaced.  The code is also added
+ * to the "Cm" (composite) property.
+ */
+static void
+#ifdef __STDC__
+add_decomp(unsigned long code)
+#else
+add_decomp(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j, size;
+    int is_new;
+
+    /*
+     * Add the code to the composite property.
+     */
+    ordered_range_insert(code, "Cm", 2);
+
+    /*
+     * Locate the insertion point for the code.
+     */
+    for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
+
+    /*
+     * Decide up front whether this is a new entry or a replacement, so the
+     * answer does not depend on what happens to be left in slot `i' later.
+     */
+    is_new = (i == decomps_used || decomps[i].code != code);
+
+    /*
+     * Allocate space for a new decomposition.  New slots are zeroed so that
+     * an unused slot always has a `size' of 0.
+     */
+    if (decomps_used == decomps_size) {
+        if (decomps_size == 0)
+          decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
+        else
+          decomps = (_decomp_t *)
+              realloc((char *) decomps,
+                      sizeof(_decomp_t) * (decomps_size + 8));
+        (void) memset((char *) (decomps + decomps_size), 0,
+                      sizeof(_decomp_t) << 3);
+        decomps_size += 8;
+    }
+
+    if (is_new && i < decomps_used) {
+        /*
+         * Shift the decomps up by one to open a slot for the new code.
+         */
+        for (j = decomps_used; j > i; j--)
+          (void) memcpy((char *) &decomps[j], (char *) &decomps[j - 1],
+                        sizeof(_decomp_t));
+
+        /*
+         * Slot `i' is still a byte copy of the entry that now lives in slot
+         * `i + 1', including its `decomp' pointer.  Clear it so the new
+         * entry gets its own buffer instead of reallocating or overwriting
+         * the one owned by its neighbour.
+         */
+        (void) memset((char *) &decomps[i], 0, sizeof(_decomp_t));
+    }
+
+    /*
+     * Insert or replace a decomposition.  The buffer size is rounded up past
+     * the next multiple of 4.
+     */
+    size = dectmp_size + (4 - (dectmp_size & 3));
+    if (decomps[i].size < size) {
+        if (decomps[i].size == 0)
+          decomps[i].decomp = (unsigned long *)
+              malloc(sizeof(unsigned long) * size);
+        else
+          decomps[i].decomp = (unsigned long *)
+              realloc((char *) decomps[i].decomp,
+                      sizeof(unsigned long) * size);
+        decomps[i].size = size;
+    }
+
+    if (is_new)
+      decomps_used++;
+
+    decomps[i].code = code;
+    decomps[i].used = dectmp_size;
+    (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
+                  sizeof(unsigned long) * dectmp_size);
+
+}
+
+static void
+#ifdef __STDC__
+add_title(unsigned long code)
+#else
+add_title(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j;
+
+    /*
+     * Always map the code to itself.
+     */
+    cases[2] = code;
+
+    if (title_used == title_size) {
+        if (title_size == 0)
+          title = (_case_t *) malloc(sizeof(_case_t) << 3);
+        else
+          title = (_case_t *) realloc((char *) title,
+                                      sizeof(_case_t) * (title_size + 8));
+        title_size += 8;
+    }
+
+    /*
+     * Locate the insertion point.
+     */
+    for (i = 0; i < title_used && code > title[i].key; i++) ;
+
+    if (i < title_used) {
+        /*
+         * Shift the array up by one.
+         */
+        for (j = title_used; j > i; j--)
+          (void) memcpy((char *) &title[j], (char *) &title[j - 1],
+                        sizeof(_case_t));
+    }
+
+    title[i].key = cases[2]; /* Title */
+    title[i].other1 = cases[0]; /* Upper */
+    title[i].other2 = cases[1]; /* Lower */
+
+    title_used++;
+}
+
+/*
+ * Adds an entry for the upper case character `code' to the `upper' list,
+ * which is kept sorted by key.  The lower and title case equivalents are
+ * taken from cases[1] and cases[2]; a title case of 0 is replaced by the
+ * code itself.
+ */
+static void
+#ifdef __STDC__
+add_upper(unsigned long code)
+#else
+add_upper(code)
+unsigned long code;
+#endif
+{
+    unsigned long pos, k;
+
+    /*
+     * An upper case character is its own upper case form, and doubles as
+     * the title case form when none was given.
+     */
+    cases[0] = code;
+    if (cases[2] == 0)
+      cases[2] = code;
+
+    /*
+     * Grow the list by 8 entries when it is full.
+     */
+    if (upper_used == upper_size) {
+        upper = (upper_size == 0) ?
+            (_case_t *) malloc(sizeof(_case_t) << 3) :
+            (_case_t *) realloc((char *) upper,
+                                sizeof(_case_t) * (upper_size + 8));
+        upper_size += 8;
+    }
+
+    /*
+     * Find the first entry whose key is not smaller than the code.
+     */
+    pos = 0;
+    while (pos < upper_used && code > upper[pos].key)
+      pos++;
+
+    /*
+     * Open a slot at that position by moving the tail of the list up.
+     */
+    for (k = upper_used; k > pos; k--)
+      upper[k] = upper[k - 1];
+
+    upper[pos].key = cases[0]; /* Upper */
+    upper[pos].other1 = cases[1]; /* Lower */
+    upper[pos].other2 = cases[2]; /* Title */
+
+    upper_used++;
+}
+
+static void
+#ifdef __STDC__
+add_lower(unsigned long code)
+#else
+add_lower(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j;
+
+    /*
+     * Always map the code to itself.
+     */
+    cases[1] = code;
+
+    /*
+     * If the title case character is empty, then make it the same as the
+     * upper case.
+     */
+    if (cases[2] == 0)
+      cases[2] = cases[0];
+
+    if (lower_used == lower_size) {
+        if (lower_size == 0)
+          lower = (_case_t *) malloc(sizeof(_case_t) << 3);
+        else
+          lower = (_case_t *) realloc((char *) lower,
+                                      sizeof(_case_t) * (lower_size + 8));
+        lower_size += 8;
+    }
+
+    /*
+     * Locate the insertion point.
+ */ + for (i = 0; i < lower_used && code > lower[i].key; i++) ; + + if (i < lower_used) { + /* + * Shift the array up by one. + */ + for (j = lower_used; j > i; j--) + (void) memcpy((char *) &lower[j], (char *) &lower[j - 1], + sizeof(_case_t)); + } + + lower[i].key = cases[1]; /* Lower */ + lower[i].other1 = cases[0]; /* Upper */ + lower[i].other2 = cases[2]; /* Title */ + + lower_used++; +} + +static void +#ifdef __STDC__ +ordered_ccl_insert(unsigned long c, unsigned long ccl_code) +#else +ordered_ccl_insert(c, ccl_code) +unsigned long c, ccl_code; +#endif +{ + unsigned long i, j; + + if (ccl_used == ccl_size) { + if (ccl_size == 0) + ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24); + else + ccl = (unsigned long *) + realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24)); + ccl_size += 24; + } + + /* + * Optimize adding the first item. + */ + if (ccl_used == 0) { + ccl[0] = ccl[1] = c; + ccl[2] = ccl_code; + ccl_used += 3; + return; + } + + /* + * Handle the special case of extending the range on the end. This + * requires that the combining class codes are the same. + */ + if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) { + ccl[ccl_used - 2] = c; + return; + } + + /* + * Handle the special case of adding another range on the end. + */ + if (c > ccl[ccl_used - 2] + 1 || + (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) { + ccl[ccl_used++] = c; + ccl[ccl_used++] = c; + ccl[ccl_used++] = ccl_code; + return; + } + + /* + * Locate either the insertion point or range for the code. + */ + for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ; + + if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) { + /* + * Extend an existing range. + */ + ccl[i + 1] = c; + return; + } else if (c < ccl[i]) { + /* + * Start a new range before the current location. 
+ */ + for (j = ccl_used; j > i; j -= 3) { + ccl[j] = ccl[j - 3]; + ccl[j - 1] = ccl[j - 4]; + ccl[j - 2] = ccl[j - 5]; + } + ccl[i] = ccl[i + 1] = c; + ccl[i + 2] = ccl_code; + } +} + +/* + * Adds a number if it does not already exist and returns an index value + * multiplied by 2. + */ +static unsigned long +#ifdef __STDC__ +make_number(short num, short denom) +#else +make_number(num, denom) +short num, denom; +#endif +{ + unsigned long n; + + /* + * Determine if the number already exists. + */ + for (n = 0; n < nums_used; n++) { + if (nums[n].numerator == num && nums[n].denominator == denom) + return n << 1; + } + + if (nums_used == nums_size) { + if (nums_size == 0) + nums = (_num_t *) malloc(sizeof(_num_t) << 3); + else + nums = (_num_t *) realloc((char *) nums, + sizeof(_num_t) * (nums_size + 8)); + nums_size += 8; + } + + n = nums_used++; + nums[n].numerator = num; + nums[n].denominator = denom; + + return n << 1; +} + +static void +#ifdef __STDC__ +add_number(unsigned long code, short num, short denom) +#else +add_number(code, num, denom) +unsigned long code; +short num, denom; +#endif +{ + unsigned long i, j; + + /* + * Insert the code in order. + */ + for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ; + + /* + * Handle the case of the codes matching and simply replace the number + * that was there before. + */ + if (ncodes_used > 0 && code == ncodes[i].code) { + ncodes[i].idx = make_number(num, denom); + return; + } + + /* + * Resize the array if necessary. + */ + if (ncodes_used == ncodes_size) { + if (ncodes_size == 0) + ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3); + else + ncodes = (_codeidx_t *) + realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8)); + + ncodes_size += 8; + } + + /* + * Shift things around to insert the code if necessary. 
+ */ + if (i < ncodes_used) { + for (j = ncodes_used; j > i; j--) { + ncodes[j].code = ncodes[j - 1].code; + ncodes[j].idx = ncodes[j - 1].idx; + } + } + ncodes[i].code = code; + ncodes[i].idx = make_number(num, denom); + + ncodes_used++; +} + +/* + * This routine assumes that the line is a valid Unicode Character Database + * entry. + */ +static void +#ifdef __STDC__ +read_cdata(FILE *in) +#else +read_cdata(in) +FILE *in; +#endif +{ + unsigned long i, lineno, skip, code, ccl_code; + short wnum, neg, number[2]; + char line[512], *s, *e; + + lineno = skip = 0; + while (fscanf(in, "%[^\n]\n", line) != EOF) { + lineno++; + + /* + * Skip blank lines and lines that start with a '#'. + */ + if (line[0] == 0 || line[0] == '#') + continue; + + /* + * If lines need to be skipped, do it here. + */ + if (skip) { + skip--; + continue; + } + + /* + * Collect the code. The code can be up to 6 hex digits in length to + * allow surrogates to be specified. + */ + for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) { + code <<= 4; + if (*s >= '0' && *s <= '9') + code += *s - '0'; + else if (*s >= 'A' && *s <= 'F') + code += (*s - 'A') + 10; + else if (*s >= 'a' && *s <= 'f') + code += (*s - 'a') + 10; + } + + /* + * Handle the following special cases: + * 1. 4E00-9FA5 CJK Ideographs. + * 2. AC00-D7A3 Hangul Syllables. + * 3. D800-DFFF Surrogates. + * 4. E000-F8FF Private Use Area. + * 5. F900-FA2D Han compatibility. + */ + switch (code) { + case 0x4e00: + /* + * The Han ideographs. + */ + add_range(0x4e00, 0x9fff, "Lo", "L"); + + /* + * Add the characters to the defined category. + */ + add_range(0x4e00, 0x9fa5, "Cp", 0); + + skip = 1; + break; + case 0xac00: + /* + * The Hangul syllables. + */ + add_range(0xac00, 0xd7a3, "Lo", "L"); + + /* + * Add the characters to the defined category. + */ + add_range(0xac00, 0xd7a3, "Cp", 0); + + skip = 1; + break; + case 0xd800: + /* + * Make a range of all surrogates and assume some default + * properties. 
+ */ + add_range(0x010000, 0x10ffff, "Cs", "L"); + skip = 5; + break; + case 0xe000: + /* + * The Private Use area. Add with a default set of properties. + */ + add_range(0xe000, 0xf8ff, "Co", "L"); + skip = 1; + break; + case 0xf900: + /* + * The CJK compatibility area. + */ + add_range(0xf900, 0xfaff, "Lo", "L"); + + /* + * Add the characters to the defined category. + */ + add_range(0xf900, 0xfaff, "Cp", 0); + + skip = 1; + } + + if (skip) + continue; + + /* + * Add the code to the defined category. + */ + ordered_range_insert(code, "Cp", 2); + + /* + * Locate the first character property field. + */ + for (i = 0; *s != 0 && i < 2; s++) { + if (*s == ';') + i++; + } + for (e = s; *e && *e != ';'; e++) ; + + ordered_range_insert(code, s, e - s); + + /* + * Locate the combining class code. + */ + for (s = e; *s != 0 && i < 3; s++) { + if (*s == ';') + i++; + } + + /* + * Convert the combining class code from decimal. + */ + for (ccl_code = 0, e = s; *e && *e != ';'; e++) + ccl_code = (ccl_code * 10) + (*e - '0'); + + /* + * Add the code if it not 0. + */ + if (ccl_code != 0) + ordered_ccl_insert(code, ccl_code); + + /* + * Locate the second character property field. + */ + for (s = e; *s != 0 && i < 4; s++) { + if (*s == ';') + i++; + } + for (e = s; *e && *e != ';'; e++) ; + + ordered_range_insert(code, s, e - s); + + /* + * Check for a decomposition. + */ + s = ++e; + if (*s != ';' && *s != '<') { + /* + * Collect the codes of the decomposition. + */ + for (dectmp_size = 0; *s != ';'; ) { + /* + * Skip all leading non-hex digits. 
+ */ + while (!ishdigit(*s)) + s++; + + for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { + dectmp[dectmp_size] <<= 4; + if (*s >= '0' && *s <= '9') + dectmp[dectmp_size] += *s - '0'; + else if (*s >= 'A' && *s <= 'F') + dectmp[dectmp_size] += (*s - 'A') + 10; + else if (*s >= 'a' && *s <= 'f') + dectmp[dectmp_size] += (*s - 'a') + 10; + } + dectmp_size++; + } + + /* + * If there is more than one code in the temporary decomposition + * array, then add the character with its decomposition. + */ + if (dectmp_size > 1) + add_decomp(code); + } + + /* + * Skip to the number field. + */ + for (i = 0; i < 3 && *s; s++) { + if (*s == ';') + i++; + } + + /* + * Scan the number in. + */ + number[0] = number[1] = 0; + for (e = s, neg = wnum = 0; *e && *e != ';'; e++) { + if (*e == '-') { + neg = 1; + continue; + } + + if (*e == '/') { + /* + * Move the the denominator of the fraction. + */ + if (neg) + number[wnum] *= -1; + neg = 0; + e++; + wnum++; + } + number[wnum] = (number[wnum] * 10) + (*e - '0'); + } + + if (e > s) { + /* + * Adjust the denominator in case of integers and add the number. + */ + if (wnum == 0) + number[1] = number[0]; + + add_number(code, number[0], number[1]); + } + + /* + * Skip to the start of the possible case mappings. + */ + for (s = e, i = 0; i < 4 && *s; s++) { + if (*s == ';') + i++; + } + + /* + * Collect the case mappings. + */ + cases[0] = cases[1] = cases[2] = 0; + for (i = 0; i < 3; i++) { + while (ishdigit(*s)) { + cases[i] <<= 4; + if (*s >= '0' && *s <= '9') + cases[i] += *s - '0'; + else if (*s >= 'A' && *s <= 'F') + cases[i] += (*s - 'A') + 10; + else if (*s >= 'a' && *s <= 'f') + cases[i] += (*s - 'a') + 10; + s++; + } + if (*s == ';') + s++; + } + if (cases[0] && cases[1]) + /* + * Add the upper and lower mappings for a title case character. + */ + add_title(code); + else if (cases[1]) + /* + * Add the lower and title case mappings for the upper case + * character. 
+ */ + add_upper(code); + else if (cases[0]) + /* + * Add the upper and title case mappings for the lower case + * character. + */ + add_lower(code); + } +} + +static _decomp_t * +#ifdef __STDC__ +find_decomp(unsigned long code) +#else +find_decomp(code) +unsigned long code; +#endif +{ + long l, r, m; + + l = 0; + r = decomps_used - 1; + while (l <= r) { + m = (l + r) >> 1; + if (code > decomps[m].code) + l = m + 1; + else if (code < decomps[m].code) + r = m - 1; + else + return &decomps[m]; + } + return 0; +} + +static void +#ifdef __STDC__ +decomp_it(_decomp_t *d) +#else +decomp_it(d) +_decomp_t *d; +#endif +{ + unsigned long i; + _decomp_t *dp; + + for (i = 0; i < d->used; i++) { + if ((dp = find_decomp(d->decomp[i])) != 0) + decomp_it(dp); + else + dectmp[dectmp_size++] = d->decomp[i]; + } +} + +/* + * Expand all decompositions by recursively decomposing each character + * in the decomposition. + */ +static void +#ifdef __STDC__ +expand_decomp(void) +#else +expand_decomp() +#endif +{ + unsigned long i; + + for (i = 0; i < decomps_used; i++) { + dectmp_size = 0; + decomp_it(&decomps[i]); + if (dectmp_size > 0) + add_decomp(decomps[i].code); + } +} + +static void +#ifdef __STDC__ +write_cdata(char *opath) +#else +write_cdata(opath) +char *opath; +#endif +{ + FILE *out; + unsigned long i, idx, bytes, nprops; + unsigned short casecnt[2]; + char path[BUFSIZ]; + + /***************************************************************** + * + * Generate the ctype data. + * + *****************************************************************/ + + /* + * Open the ctype.dat file. + */ + sprintf(path, "%s/ctype.dat", opath); + if ((out = fopen(path, "wb")) == 0) + return; + + /* + * Collect the offsets for the properties. The offsets array is + * on a 4-byte boundary to keep things efficient for architectures + * that need such a thing. + */ + for (i = idx = 0; i < NUMPROPS; i++) { + propcnt[i] = (proptbl[i].used != 0) ? 
idx : 0xffff; + idx += proptbl[i].used; + } + + /* + * Add the sentinel index which is used by the binary search as the upper + * bound for a search. + */ + propcnt[i] = idx; + + /* + * Record the actual number of property lists. This may be different than + * the number of offsets actually written because of aligning on a 4-byte + * boundary. + */ + hdr[1] = NUMPROPS; + + /* + * Calculate the byte count needed and pad the property counts array to a + * 4-byte boundary. + */ + if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3) + bytes += 4 - (bytes & 3); + nprops = bytes / sizeof(unsigned short); + bytes += sizeof(unsigned long) * idx; + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write the byte count. + */ + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + /* + * Write the property list counts. + */ + fwrite((char *) propcnt, sizeof(unsigned short), nprops, out); + + /* + * Write the property lists. + */ + for (i = 0; i < NUMPROPS; i++) { + if (proptbl[i].used > 0) + fwrite((char *) proptbl[i].ranges, sizeof(unsigned long), + proptbl[i].used, out); + } + + fclose(out); + + /***************************************************************** + * + * Generate the case mapping data. + * + *****************************************************************/ + + /* + * Open the case.dat file. + */ + sprintf(path, "%s/case.dat", opath); + if ((out = fopen(path, "wb")) == 0) + return; + + /* + * Write the case mapping tables. + */ + hdr[1] = upper_used + lower_used + title_used; + casecnt[0] = upper_used; + casecnt[1] = lower_used; + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write the upper and lower case table sizes. + */ + fwrite((char *) casecnt, sizeof(unsigned short), 2, out); + + if (upper_used > 0) + /* + * Write the upper case table. 
+ */ + fwrite((char *) upper, sizeof(_case_t), upper_used, out); + + if (lower_used > 0) + /* + * Write the lower case table. + */ + fwrite((char *) lower, sizeof(_case_t), lower_used, out); + + if (title_used > 0) + /* + * Write the title case table. + */ + fwrite((char *) title, sizeof(_case_t), title_used, out); + + fclose(out); + + /***************************************************************** + * + * Generate the decomposition data. + * + *****************************************************************/ + + /* + * Fully expand all decompositions before generating the output file. + */ + expand_decomp(); + + /* + * Open the decomp.dat file. + */ + sprintf(path, "%s/decomp.dat", opath); + if ((out = fopen(path, "wb")) == 0) + return; + + hdr[1] = decomps_used; + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write a temporary byte count which will be calculated as the + * decompositions are written out. + */ + bytes = 0; + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + if (decomps_used) { + /* + * Write the list of decomp nodes. + */ + for (i = idx = 0; i < decomps_used; i++) { + fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out); + fwrite((char *) &idx, sizeof(unsigned long), 1, out); + idx += decomps[i].used; + } + + /* + * Write the sentinel index as the last decomp node. + */ + fwrite((char *) &idx, sizeof(unsigned long), 1, out); + + /* + * Write the decompositions themselves. + */ + for (i = 0; i < decomps_used; i++) + fwrite((char *) decomps[i].decomp, sizeof(unsigned long), + decomps[i].used, out); + + /* + * Seek back to the beginning and write the byte count. 
+ */ + bytes = (sizeof(unsigned long) * idx) + + (sizeof(unsigned long) * ((hdr[1] << 1) + 1)); + fseek(out, sizeof(unsigned short) << 1, 0L); + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + fclose(out); + } + + /***************************************************************** + * + * Generate the combining class data. + * + *****************************************************************/ + + /* + * Open the cmbcl.dat file. + */ + sprintf(path, "%s/cmbcl.dat", opath); + if ((out = fopen(path, "wb")) == 0) + return; + + /* + * Set the number of ranges used. Each range has a combining class which + * means each entry is a 3-tuple. + */ + hdr[1] = ccl_used / 3; + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write out the byte count to maintain header size. + */ + bytes = ccl_used * sizeof(unsigned long); + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + if (ccl_used > 0) + /* + * Write the combining class ranges out. + */ + fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out); + + fclose(out); + + /***************************************************************** + * + * Generate the number data. + * + *****************************************************************/ + + /* + * Open the num.dat file. + */ + sprintf(path, "%s/num.dat", opath); + if ((out = fopen(path, "wb")) == 0) + return; + + /* + * The count part of the header will be the total number of codes that + * have numbers. + */ + hdr[1] = (unsigned short) (ncodes_used << 1); + bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t)); + + /* + * Write the header. + */ + fwrite((char *) hdr, sizeof(unsigned short), 2, out); + + /* + * Write out the byte count to maintain header size. + */ + fwrite((char *) &bytes, sizeof(unsigned long), 1, out); + + /* + * Now, if number mappings exist, write them out. 
+ */ + if (ncodes_used > 0) { + fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out); + fwrite((char *) nums, sizeof(_num_t), nums_used, out); + } + + fclose(out); +} + +/* + * Reads every file named on the command line with read_cdata() and then + * writes the generated tables with write_cdata() into the directory that + * follows a -o argument, or into "." when no -o argument was given. + */ +int +#ifdef __STDC__ +main(int argc, char *argv[]) +#else +main(argc, argv) +int argc; +char *argv[]; +#endif +{ + FILE *in; + char *prog, *opath; + + if ((prog = strrchr(argv[0], '/')) != 0) + prog++; + else + prog = argv[0]; + + opath = 0; + in = stdin; + + argc--; + argv++; + + while (argc > 0) { + if (argv[0][0] == '-' && argv[0][1] == 'o') { + argc--; + argv++; + opath = argv[0]; + } else { + /* + * `in' is reset to 0 after each file (and by a failed fopen), so + * only close it when it still refers to an open, non-stdin stream. + */ + if (in != 0 && in != stdin) + fclose(in); + if ((in = fopen(argv[0], "rb")) == 0) + fprintf(stderr, "%s: unable to open ctype file %s\n", + prog, argv[0]); + else { + read_cdata(in); + fclose(in); + in = 0; + } + } + argc--; + argv++; + } + + if (opath == 0) + opath = "."; + write_cdata(opath); + + return 0; +} diff --git a/libraries/liblunicode/ucdata/ucpgba.c b/libraries/liblunicode/ucdata/ucpgba.c new file mode 100644 index 0000000000..6c4d767cea --- /dev/null +++ b/libraries/liblunicode/ucdata/ucpgba.c @@ -0,0 +1,813 @@ +/* + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef lint +#ifdef __GNUC__ +static char rcsid[] __attribute__ ((unused)) = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $"; +#else +static char rcsid[] = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $"; +#endif +#endif + +#include +#include +#include "ucdata.h" +#include "ucpgba.h" + +/* + * These macros are used while reordering of RTL runs of text for the + * special case of non-spacing characters being in runs of weakly + * directional text. They check for weak and non-spacing, and digits and + * non-spacing. + */ +#define ISWEAKSPECIAL(cc) ucisprop(cc, UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS) +#define ISDIGITSPECIAL(cc) ucisprop(cc, UC_ND|UC_MN, 0) + +/* + * These macros are used while breaking a string into runs of text in + * different directions. Descriptions: + * + * ISLTR_LTR - Test for members of an LTR run in an LTR context. This looks + * for characters with ltr, non-spacing, weak, and neutral + * properties. + * + * ISRTL_RTL - Test for members of an RTL run in an RTL context. This looks + * for characters with rtl, non-spacing, weak, and neutral + * properties. + * + * ISRTL_NEUTRAL - Test for RTL or neutral characters. + * + * ISWEAK_NEUTRAL - Test for weak or neutral characters. 
+ */ +#define ISLTR_LTR(cc) ucisprop(cc, UC_L|UC_MN|UC_EN|UC_ES,\ + UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON) + +#define ISRTL_RTL(cc) ucisprop(cc, UC_R|UC_MN|UC_EN|UC_ES,\ + UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON) + +#define ISRTL_NEUTRAL(cc) ucisprop(cc, UC_R, UC_B|UC_S|UC_WS|UC_ON) +#define ISWEAK_NEUTRAL(cc) ucisprop(cc, UC_EN|UC_ES, \ + UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS) + +/* + * This table is temporarily hard-coded here until it can be constructed + * automatically somehow. + */ +static unsigned long _symmetric_pairs[] = { + 0x0028, 0x0029, 0x0029, 0x0028, 0x003C, 0x003E, 0x003E, 0x003C, + 0x005B, 0x005D, 0x005D, 0x005B, 0x007B, 0x007D, 0x007D, 0x007B, + 0x2045, 0x2046, 0x2046, 0x2045, 0x207D, 0x207E, 0x207E, 0x207D, + 0x208D, 0x208E, 0x208E, 0x208D, 0x3008, 0x3009, 0x3009, 0x3008, + 0x300A, 0x300B, 0x300B, 0x300A, 0x300C, 0x300D, 0x300D, 0x300C, + 0x300E, 0x300F, 0x300F, 0x300E, 0x3010, 0x3011, 0x3011, 0x3010, + 0x3014, 0x3015, 0x3015, 0x3014, 0x3016, 0x3017, 0x3017, 0x3016, + 0x3018, 0x3019, 0x3019, 0x3018, 0x301A, 0x301B, 0x301B, 0x301A, + 0xFD3E, 0xFD3F, 0xFD3F, 0xFD3E, 0xFE59, 0xFE5A, 0xFE5A, 0xFE59, + 0xFE5B, 0xFE5C, 0xFE5C, 0xFE5B, 0xFE5D, 0xFE5E, 0xFE5E, 0xFE5D, + 0xFF08, 0xFF09, 0xFF09, 0xFF08, 0xFF3B, 0xFF3D, 0xFF3D, 0xFF3B, + 0xFF5B, 0xFF5D, 0xFF5D, 0xFF5B, 0xFF62, 0xFF63, 0xFF63, 0xFF62, +}; + +static int _symmetric_pairs_size = +sizeof(_symmetric_pairs)/sizeof(_symmetric_pairs[0]); + +/* + * This routine looks up the other form of a symmetric pair. + */ +static unsigned long +#ifdef __STDC__ +_ucsymmetric_pair(unsigned long c) +#else +_ucsymmetric_pair(c) +unsigned long c; +#endif +{ + int i; + + for (i = 0; i < _symmetric_pairs_size; i += 2) { + if (_symmetric_pairs[i] == c) + return _symmetric_pairs[i+1]; + } + return c; +} + +/* + * This routine creates a new run, copies the text into it, links it into the + * logical text order chain and returns it to the caller to be linked into + * the visual text order chain. 
+ */ +static ucrun_t * +#ifdef __STDC__ +_add_run(ucstring_t *str, unsigned long *src, + unsigned long start, unsigned long end, int direction) +#else +_add_run(str, src, start, end, direction) +ucstring_t *str; +unsigned long *src, start, end; +int direction; +#endif +{ + long i, t; + ucrun_t *run; + + run = (ucrun_t *) malloc(sizeof(ucrun_t)); + run->visual_next = run->visual_prev = 0; + run->direction = direction; + + run->cursor = ~0; + + run->chars = (unsigned long *) + malloc(sizeof(unsigned long) * ((end - start) << 1)); + run->positions = run->chars + (end - start); + + run->source = src; + run->start = start; + run->end = end; + + if (direction == UCPGBA_RTL) { + /* + * Copy the source text into the run in reverse order and select + * replacements for the pairwise punctuation and the <> characters. + */ + for (i = 0, t = end - 1; start < end; start++, t--, i++) { + run->positions[i] = t; + if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>') + run->chars[i] = _ucsymmetric_pair(src[t]); + else + run->chars[i] = src[t]; + } + } else { + /* + * Copy the source text into the run directly. + */ + for (i = start; i < end; i++) { + run->positions[i - start] = i; + run->chars[i - start] = src[i]; + } + } + + /* + * Add the run to the logical list for cursor traversal. + */ + if (str->logical_first == 0) + str->logical_first = str->logical_last = run; + else { + run->logical_prev = str->logical_last; + str->logical_last->logical_next = run; + str->logical_last = run; + } + + return run; +} + +static void +#ifdef __STDC__ +_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start, + unsigned long end) +#else +_ucadd_rtl_segment(str, source, start, end) +ucstring_t *str; +unsigned long *source, start, end; +#endif +{ + unsigned long s, e; + ucrun_t *run, *lrun; + + /* + * This is used to splice runs into strings with overall LTR direction. 
+ * The `lrun' variable will never be NULL because at least one LTR run was + * added before this RTL run. + */ + lrun = str->visual_last; + + for (e = s = start; s < end;) { + for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ; + + if (e > s) { + run = _add_run(str, source, s, e, UCPGBA_RTL); + + /* + * Add the run to the visual list for cursor traversal. + */ + if (str->visual_first != 0) { + if (str->direction == UCPGBA_LTR) { + run->visual_prev = lrun; + run->visual_next = lrun->visual_next; + if (lrun->visual_next != 0) + lrun->visual_next->visual_prev = run; + lrun->visual_next = run; + if (lrun == str->visual_last) + str->visual_last = run; + } else { + run->visual_next = str->visual_first; + str->visual_first->visual_prev = run; + str->visual_first = run; + } + } else + str->visual_first = str->visual_last = run; + } + + /* + * Now handle the weak sequences such that multiple non-digit groups + * are kept together appropriately and added as RTL sequences. + */ + for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) { + if (!ISDIGITSPECIAL(source[e]) && + (e + 1 == end || !ISDIGITSPECIAL(source[e + 1]))) + break; + } + + if (e > s) { + run = _add_run(str, source, s, e, UCPGBA_LTR); + + /* + * Add the run to the visual list for cursor traversal. + */ + if (str->visual_first != 0) { + if (str->direction == UCPGBA_LTR) { + run->visual_prev = lrun; + run->visual_next = lrun->visual_next; + if (lrun->visual_next != 0) + lrun->visual_next->visual_prev = run; + lrun->visual_next = run; + if (lrun == str->visual_last) + str->visual_last = run; + } else { + run->visual_next = str->visual_first; + str->visual_first->visual_prev = run; + str->visual_first = run; + } + } else + str->visual_first = str->visual_last = run; + } + + /* + * Collect all weak non-digit sequences for an RTL segment. These + * will appear as part of the next RTL segment or will be added as + * an RTL segment by themselves. 
+ */ + for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]); + e++) ; + } + + /* + * Capture any weak non-digit sequences that occur at the end of the RTL + * run. + */ + if (e > s) { + run = _add_run(str, source, s, e, UCPGBA_RTL); + + /* + * Add the run to the visual list for cursor traversal. + */ + if (str->visual_first != 0) { + if (str->direction == UCPGBA_LTR) { + run->visual_prev = lrun; + run->visual_next = lrun->visual_next; + if (lrun->visual_next != 0) + lrun->visual_next->visual_prev = run; + lrun->visual_next = run; + if (lrun == str->visual_last) + str->visual_last = run; + } else { + run->visual_next = str->visual_first; + str->visual_first->visual_prev = run; + str->visual_first = run; + } + } else + str->visual_first = str->visual_last = run; + } +} + +static void +#ifdef __STDC__ +_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start, + unsigned long end) +#else +_ucadd_ltr_segment(str, source, start, end) +ucstring_t *str; +unsigned long *source, start, end; +#endif +{ + ucrun_t *run; + + run = _add_run(str, source, start, end, UCPGBA_LTR); + + /* + * Add the run to the visual list for cursor traversal. 
+ */ + if (str->visual_first != 0) { + if (str->direction == UCPGBA_LTR) { + run->visual_prev = str->visual_last; + str->visual_last->visual_next = run; + str->visual_last = run; + } else { + run->visual_next = str->visual_first; + str->visual_first->visual_prev = run; + str->visual_first = run; + } + } else + str->visual_first = str->visual_last = run; +} + +ucstring_t * +#ifdef __STDC__ +ucstring_create(unsigned long *source, unsigned long start, unsigned long end, + int default_direction, int cursor_motion) +#else +ucstring_create(source, start, end, default_direction, cursor_motion) +unsigned long *source, start, end; +int default_direction, cursor_motion; +#endif +{ + int rtl_first; + unsigned long s, e; + ucstring_t *str; + + str = (ucstring_t *) malloc(sizeof(ucstring_t)); + + /* + * Set the initial values. + */ + str->cursor_motion = cursor_motion; + str->logical_first = str->logical_last = 0; + str->visual_first = str->visual_last = str->cursor = 0; + str->source = source; + str->start = start; + str->end = end; + + /* + * If the length of the string is 0, then just return it at this point. + */ + if (start == end) + return str; + + /* + * This flag indicates whether the collection loop for RTL is called + * before the LTR loop the first time. + */ + rtl_first = 0; + + /* + * Look for the first character in the string that has strong + * directionality. + */ + for (s = start; s < end && !ucisstrong(source[s]); s++) ; + + if (s == end) + /* + * If the string contains no characters with strong directionality, use + * the default direction. + */ + str->direction = default_direction; + else + str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR; + + if (str->direction == UCPGBA_RTL) + /* + * Set the flag that causes the RTL collection loop to run first. + */ + rtl_first = 1; + + /* + * This loop now separates the string into runs based on directionality. 
+ */ + for (s = e = start; s < end; s = e) { + if (!rtl_first) { + /* + * Determine the next run of LTR text. + */ + + while (e < end && ISLTR_LTR(source[e])) + e++; + if (str->direction != UCPGBA_LTR) { + while (e > s && ISWEAK_NEUTRAL(source[e - 1])) + e--; + } + + /* + * Add the LTR segment to the string. + */ + if (e > s) + _ucadd_ltr_segment(str, source, s, e); + } + + /* + * Determine the next run of RTL text. + */ + s = e; + while (e < end && ISRTL_RTL(source[e])) + e++; + if (str->direction != UCPGBA_RTL) { + while (e > s && ISWEAK_NEUTRAL(source[e - 1])) + e--; + } + + /* + * Add the RTL segment to the string. + */ + if (e > s) + _ucadd_rtl_segment(str, source, s, e); + + /* + * Clear the flag that allowed the RTL collection loop to run first + * for strings with overall RTL directionality. + */ + rtl_first = 0; + } + + /* + * Set up the initial cursor run. The logical list is empty when no run + * was added, so the run pointer itself is what has to be tested. + */ + str->cursor = str->logical_first; + if (str->cursor != 0) + str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ? + str->cursor->end - str->cursor->start : 0; + + return str; +} + +/* + * Releases a string made by ucstring_create(): the character buffer of + * each run on the visual list, each run, and then the string itself. + * A null pointer is ignored. + */ +void +#ifdef __STDC__ +ucstring_free(ucstring_t *s) +#else +ucstring_free(s) +ucstring_t *s; +#endif +{ + ucrun_t *l, *r; + + if (s == 0) + return; + + /* + * `l' trails one run behind `r' so that a run is only freed after its + * visual_next link has been followed. + */ + for (l = 0, r = s->visual_first; r != 0; r = r->visual_next) { + if (r->end > r->start) + free((char *) r->chars); + if (l) + free((char *) l); + l = r; + } + if (l) + free((char *) l); + + free((char *) s); +} + +/* + * Sets the cursor motion mode of the string and returns the previous + * mode, or -1 if the string pointer is null. + */ +int +#ifdef __STDC__ +ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion) +#else +ucstring_set_cursor_motion(str, cursor_motion) +ucstring_t *str; +int cursor_motion; +#endif +{ + int n; + + if (str == 0) + return -1; + + n = str->cursor_motion; + str->cursor_motion = cursor_motion; + return n; +} + +static int +#ifdef __STDC__ +_ucstring_visual_cursor_right(ucstring_t *str, int count) +#else +_ucstring_visual_cursor_right(str, count) +ucstring_t *str; +int count; +#endif +{ + int cnt = count; + unsigned long size; + ucrun_t *cursor; + + if (str == 0) + 
return 0; + + cursor = str->cursor; + while (cnt > 0) { + size = cursor->end - cursor->start; + if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) || + cursor->cursor + 1 > size) { + /* + * If the next run is NULL, then the cursor is already on the + * far right end already. + */ + if (cursor->visual_next == 0) + /* + * If movement occured, then report it. + */ + return (cnt != count); + + /* + * Move to the next run. + */ + str->cursor = cursor = cursor->visual_next; + cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0; + size = cursor->end - cursor->start; + } else + cursor->cursor++; + cnt--; + } + return 1; +} + +static int +#ifdef __STDC__ +_ucstring_logical_cursor_right(ucstring_t *str, int count) +#else +_ucstring_logical_cursor_right(str, count) +ucstring_t *str; +int count; +#endif +{ + int cnt = count; + unsigned long size; + ucrun_t *cursor; + + if (str == 0) + return 0; + + cursor = str->cursor; + while (cnt > 0) { + size = cursor->end - cursor->start; + if (str->direction == UCPGBA_RTL) { + if (cursor->direction == UCPGBA_RTL) { + if (cursor->cursor + 1 == size) { + if (cursor == str->logical_first) + /* + * Already at the beginning of the string. + */ + return (cnt != count); + + str->cursor = cursor = cursor->logical_prev; + size = cursor->end - cursor->start; + cursor->cursor = (cursor->direction == UCPGBA_LTR) ? + size : 0; + } else + cursor->cursor++; + } else { + if (cursor->cursor == 0) { + if (cursor == str->logical_first) + /* + * At the beginning of the string already. + */ + return (cnt != count); + + str->cursor = cursor = cursor->logical_prev; + size = cursor->end - cursor->start; + cursor->cursor = (cursor->direction == UCPGBA_LTR) ? + size : 0; + } else + cursor->cursor--; + } + } else { + if (cursor->direction == UCPGBA_RTL) { + if (cursor->cursor == 0) { + if (cursor == str->logical_last) + /* + * Already at the end of the string. 
+                       */
+                      return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                  cursor->cursor--;
+            } else {
+                if (cursor->cursor + 1 > size) {
+                    if (cursor == str->logical_last)
+                      /*
+                       * Already at the end of the string.
+                       */
+                      return (cnt != count);
+
+                    /*
+                     * The size must describe the run being entered, not
+                     * the one being left, before it is used to place the
+                     * cursor.
+                     */
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                  cursor->cursor++;
+            }
+        }
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor up to `count' positions to the right using the cursor
+ * motion (visual or logical) currently set on the string.  Returns 0 for
+ * a null string, otherwise the result of the selected motion routine.
+ */
+int
+#ifdef __STDC__
+ucstring_cursor_right(ucstring_t *str, int count)
+#else
+ucstring_cursor_right(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    if (str == 0)
+      return 0;
+    return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
+        _ucstring_visual_cursor_right(str, count) :
+        _ucstring_logical_cursor_right(str, count);
+}
+
+/*
+ * Move the cursor up to `count' positions to the left, following the
+ * visual order of the runs.  Returns 0 for a null string.  If the first
+ * visual run is exhausted first, returns 1 when at least one step was made
+ * and 0 otherwise.  Returns 1 in all other cases.
+ */
+static int
+#ifdef __STDC__
+_ucstring_visual_cursor_left(ucstring_t *str, int count)
+#else
+_ucstring_visual_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    int cnt = count;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0)
+      return 0;
+
+    cursor = str->cursor;
+    while (cnt > 0) {
+        size = cursor->end - cursor->start;
+        if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) ||
+            cursor->cursor - 1 < -1) {
+            /*
+             * If the preceding run is NULL, then the cursor is already at
+             * the far left end.
+             */
+            if (cursor->visual_prev == 0)
+              /*
+               * If movement occurred, then report it.
+               */
+              return (cnt != count);
+
+            /*
+             * Move to the previous run.
+             */
+            str->cursor = cursor = cursor->visual_prev;
+            size = cursor->end - cursor->start;
+            cursor->cursor = (cursor->direction == UCPGBA_RTL) ?
+                size : size - 1;
+        } else
+          cursor->cursor--;
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor up to `count' positions to the left, following the
+ * logical order of the runs.  When the overall direction of the string is
+ * RTL, moving left walks toward `logical_last' through the `logical_next'
+ * links; otherwise it walks toward `logical_first' through the
+ * `logical_prev' links.
+ *
+ * Returns 0 for a null string.  If the end of the run list is reached
+ * first, returns 1 when at least one step was made and 0 otherwise.
+ * Returns 1 in all other cases.
+ */
+static int
+#ifdef __STDC__
+_ucstring_logical_cursor_left(ucstring_t *str, int count)
+#else
+_ucstring_logical_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    int cnt = count;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0)
+      return 0;
+
+    cursor = str->cursor;
+    while (cnt > 0) {
+        size = cursor->end - cursor->start;
+        if (str->direction == UCPGBA_RTL) {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor == -1) {
+                    if (cursor == str->logical_last)
+                      /*
+                       * Already at the end of the string.
+                       */
+                      return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                  cursor->cursor--;
+            } else {
+                if (cursor->cursor + 1 > size) {
+                    if (cursor == str->logical_last)
+                      /*
+                       * At the end of the string already.
+                       */
+                      return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                  cursor->cursor++;
+            }
+        } else {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor + 1 == size) {
+                    if (cursor == str->logical_first)
+                      /*
+                       * Already at the beginning of the string.
+                       */
+                      return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_prev;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                  cursor->cursor++;
+            } else {
+                if (cursor->cursor == 0) {
+                    if (cursor == str->logical_first)
+                      /*
+                       * Already at the beginning of the string.
+                       */
+                      return (cnt != count);
+
+                    /*
+                     * The size must describe the run being entered, not
+                     * the one being left, before it is used to place the
+                     * cursor.
+                     */
+                    str->cursor = cursor = cursor->logical_prev;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                  cursor->cursor--;
+            }
+        }
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor up to `count' positions to the left using the cursor
+ * motion (visual or logical) currently set on the string.  Returns 0 for
+ * a null string, otherwise the result of the selected motion routine.
+ */
+int
+#ifdef __STDC__
+ucstring_cursor_left(ucstring_t *str, int count)
+#else
+ucstring_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    if (str == 0)
+      return 0;
+    return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
+        _ucstring_visual_cursor_left(str, count) :
+        _ucstring_logical_cursor_left(str, count);
+}
+
+/*
+ * Report the direction of the run that holds the cursor and the position
+ * in the source string that the cursor corresponds to.  Nothing is stored
+ * if any of the three pointers is null.
+ *
+ * The cursor index of a run may lie one step outside the run on either
+ * side (-1, or the run size); those two cases are mapped to a position
+ * separately from the in-range case.
+ */
+void
+#ifdef __STDC__
+ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
+#else
+ucstring_cursor_info(str, direction, position)
+ucstring_t *str;
+int *direction;
+unsigned long *position;
+#endif
+{
+    long c;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0 || direction == 0 || position == 0)
+      return;
+
+    cursor = str->cursor;
+
+    *direction = cursor->direction;
+
+    c = cursor->cursor;
+    size = cursor->end - cursor->start;
+
+    /*
+     * `c' is signed and `size' is unsigned; test the sign first so that
+     * the comparison does not depend on -1 converting to a huge value.
+     */
+    if (c >= 0 && (unsigned long) c == size)
+      *position = (cursor->direction == UCPGBA_RTL) ?
+          cursor->start : cursor->positions[c - 1];
+    else if (c == -1)
+      *position = (cursor->direction == UCPGBA_RTL) ?
+ cursor->end : cursor->start; + else + *position = cursor->positions[c]; +} diff --git a/libraries/liblunicode/ucdata/ucpgba.h b/libraries/liblunicode/ucdata/ucpgba.h new file mode 100644 index 0000000000..cd4c8bdd3f --- /dev/null +++ b/libraries/liblunicode/ucdata/ucpgba.h @@ -0,0 +1,162 @@ +/* + * Copyright 1999 Computing Research Labs, New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _h_ucpgba +#define _h_ucpgba + +/* + * $Id: ucpgba.h,v 1.4 1999/11/19 15:24:30 mleisher Exp $ + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#undef __ +#ifdef __STDC__ +#define __(x) x +#else +#define __(x) () +#endif + +/*************************************************************************** + * + * Macros and types. 
+ * + ***************************************************************************/ + +/* + * These are the direction values that can appear in render runs and render + * strings. + */ +#define UCPGBA_LTR 0 +#define UCPGBA_RTL 1 + +/* + * These are the flags for cursor motion. + */ +#define UCPGBA_CURSOR_VISUAL 0 +#define UCPGBA_CURSOR_LOGICAL 1 + +/* + * This structure is used to contain runs of text in a particular direction. + */ +typedef struct _ucrun_t { + struct _ucrun_t *visual_prev; /* Pointer to the previous visual run. */ + struct _ucrun_t *visual_next; /* Pointer to the next visual run. */ + + struct _ucrun_t *logical_prev; /* Pointer to the previous logical run. */ + struct _ucrun_t *logical_next; /* Pointer to the next logical run. */ + + int direction; /* Direction of the run. */ + + long cursor; /* Position of "cursor" in the string. */ + + unsigned long *chars; /* List of characters for the run. */ + unsigned long *positions; /* List of original positions in source. */ + + unsigned long *source; /* The source string. */ + unsigned long start; /* Beginning offset in the source string. */ + unsigned long end; /* Ending offset in the source string. */ +} ucrun_t; + +/* + * This represents a string of runs rendered up to a point that is not + * platform specific. + */ +typedef struct _ucstring_t { + int direction; /* Overall direction of the string. */ + + int cursor_motion; /* Logical or visual cursor motion flag. */ + + ucrun_t *cursor; /* The run containing the "cursor." */ + + ucrun_t *logical_first; /* First run in the logical order. */ + ucrun_t *logical_last; /* Last run in the logical order. */ + + ucrun_t *visual_first; /* First run in the visual order. */ + ucrun_t *visual_last; /* Last run in the visual order. */ + + unsigned long *source; /* The source string. */ + unsigned long start; /* The beginning offset in the source. */ + unsigned long end; /* The ending offset in the source. 
*/ +} ucstring_t; + +/*************************************************************************** + * + * API + * + ***************************************************************************/ + +/* + * This creates and reorders the specified substring using the + * "Pretty Good Bidi Algorithm." A default direction is provided for cases + * of a string containing no strong direction characters and the default + * cursor motion should be provided. + */ +extern ucstring_t *ucstring_create __((unsigned long *source, + unsigned long start, + unsigned long end, + int default_direction, + int cursor_motion)); +/* + * This releases the string. + */ +extern void ucstring_free __((ucstring_t *string)); + +/* + * This changes the cursor motion flag for the string. + */ +extern int ucstring_set_cursor_motion __((ucstring_t *string, + int cursor_motion)); + +/* + * This function will move the cursor to the right depending on the + * type of cursor motion that was specified for the string. + * + * A 0 is returned if no cursor motion is performed, otherwise a + * 1 is returned. + */ +extern int ucstring_cursor_right __((ucstring_t *string, int count)); + +/* + * This function will move the cursor to the left depending on the + * type of cursor motion that was specified for the string. + * + * A 0 is returned if no cursor motion is performed, otherwise a + * 1 is returned. + */ +extern int ucstring_cursor_left __((ucstring_t *string, int count)); + +/* + * This routine retrieves the direction of the run containing the cursor + * and the actual position in the original text string. 
+ */
+extern void ucstring_cursor_info __((ucstring_t *string, int *direction,
+                                     unsigned long *position));
+
+#undef __
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_ucpgba */
diff --git a/libraries/liblunicode/ucdata/ucpgba.man b/libraries/liblunicode/ucdata/ucpgba.man
new file mode 100644
index 0000000000..4486509726
--- /dev/null
+++ b/libraries/liblunicode/ucdata/ucpgba.man
@@ -0,0 +1,97 @@
+.\"
+.\" $Id: ucpgba.man,v 1.1 1999/11/19 16:08:34 mleisher Exp $
+.\"
+.TH ucpgba 3 "19 November 1999"
+.SH NAME
+ucpgba \- functions for doing bidirectional reordering of Unicode text and
+logical and visual cursor motion
+
+.SH SYNOPSIS
+.nf
+#include <ucdata.h>
+#include <ucpgba.h>
+
+ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
+                            unsigned long end, int default_direction,
+                            int cursor_motion)
+.sp
+void ucstring_free(ucstring_t *string)
+.sp
+int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
+.sp
+int ucstring_cursor_right(ucstring_t *string, int count)
+.sp
+int ucstring_cursor_left(ucstring_t *string, int count)
+.sp
+void ucstring_cursor_info(ucstring_t *string, int *direction,
+                          unsigned long *position)
+
+.SH DESCRIPTION
+.TP 4
+.BR Macros
+UCPGBA_LTR
+.br
+UCPGBA_RTL
+.br
+UCPGBA_CURSOR_VISUAL
+.br
+UCPGBA_CURSOR_LOGICAL
+
+.TP 4
+.BR ucstring_create()
+This function will create a reordered string by using the implicit
+directionality of the characters in the specified substring.
+.sp
+The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
+and is used only in cases where a string contains no characters with strong
+directionality.
+.sp
+The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
+UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
+behavior. This behavior can be switched at any time using
+ucstring_set_cursor_motion().
+
+.TP 4
+.BR ucstring_free()
+This function will deallocate the memory used by the string, including the
+string itself.
+ +.TP 4 +.BR ucstring_cursor_info() +This function will return the text position of the internal cursor and the +directionality of the text at that position. The position returned is the +original text position of the character. + +.TP 4 +.BR ucstring_set_cursor_motion() +This function will change the cursor motion type and return the previous +cursor motion type. + +.TP 4 +.BR ucstring_cursor_right() +This function will move the internal cursor to the right according to the +type of cursor motion set for the string. +.sp +If no cursor motion is performed, it returns 0. Otherwise it will return a 1. + +.TP 4 +.BR ucstring_cursor_left() +This function will move the internal cursor to the left according to the +type of cursor motion set for the string. +.sp +If no cursor motion is performed, it returns 0. Otherwise it will return a 1. + +.SH "SEE ALSO" +ucdata(3) + +.SH ACKNOWLEDGMENTS +These are people who have helped with patches or alerted me about problems. + +.SH AUTHOR +Mark Leisher +.br +Computing Research Lab +.br +New Mexico State University +.br +Email: mleisher@crl.nmsu.edu diff --git a/libraries/liblunicode/ure/README b/libraries/liblunicode/ure/README new file mode 100644 index 0000000000..c9918f5fd3 --- /dev/null +++ b/libraries/liblunicode/ure/README @@ -0,0 +1,212 @@ +# +# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $ +# +# Copyright 1997, 1998, 1999 Computing Research Labs, +# New Mexico State University +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies 
or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + + + Unicode and Regular Expressions + Version 0.5 + +This is a simple regular expression package for matching against Unicode text +in UCS2 form. The implementation of this URE package is a variation on the +RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark +Hopkins' algorithm had the virtue of being very simple, so it was used as a +model. + +--------------------------------------------------------------------------- + +Assumptions: + + o Regular expression and text already normalized. + + o Conversion to lower case assumes a 1-1 mapping. + +Definitions: + + Separator - any one of U+2028, U+2029, '\n', '\r'. + +Operators: + . - match any character. + * - match zero or more of the last subexpression. + + - match one or more of the last subexpression. + ? - match zero or one of the last subexpression. + () - subexpression grouping. + + Notes: + + o The "." operator normally does not match separators, but a flag is + available for the ure_exec() function that will allow this operator to + match a separator. + +Literals and Constants: + + c - literal UCS2 character. + \x.... - hexadecimal number of up to 4 digits. + \X.... - hexadecimal number of up to 4 digits. + \u.... - hexadecimal number of up to 4 digits. + \U.... - hexadecimal number of up to 4 digits. + +Character classes: + + [...] - Character class. + [^...] - Negated character class. + \pN1,N2,...,Nn - Character properties class. 
+ \PN1,N2,...,Nn - Negated character properties class. + + POSIX character classes recognized: + + :alnum: + :alpha: + :cntrl: + :digit: + :graph: + :lower: + :print: + :punct: + :space: + :upper: + :xdigit: + + Notes: + + o Character property classes are \p or \P followed by a comma separated + list of integers between 1 and 32. These integers are references to + the following character properties: + + N Character Property + -------------------------- + 1 _URE_NONSPACING + 2 _URE_COMBINING + 3 _URE_NUMDIGIT + 4 _URE_NUMOTHER + 5 _URE_SPACESEP + 6 _URE_LINESEP + 7 _URE_PARASEP + 8 _URE_CNTRL + 9 _URE_PUA + 10 _URE_UPPER + 11 _URE_LOWER + 12 _URE_TITLE + 13 _URE_MODIFIER + 14 _URE_OTHERLETTER + 15 _URE_DASHPUNCT + 16 _URE_OPENPUNCT + 17 _URE_CLOSEPUNCT + 18 _URE_OTHERPUNCT + 19 _URE_MATHSYM + 20 _URE_CURRENCYSYM + 21 _URE_OTHERSYM + 22 _URE_LTR + 23 _URE_RTL + 24 _URE_EURONUM + 25 _URE_EURONUMSEP + 26 _URE_EURONUMTERM + 27 _URE_ARABNUM + 28 _URE_COMMONSEP + 29 _URE_BLOCKSEP + 30 _URE_SEGMENTSEP + 31 _URE_WHITESPACE + 32 _URE_OTHERNEUT + + o Character classes can contain literals, constants, and character + property classes. Example: + + [abc\U10A\p1,3,4] + +--------------------------------------------------------------------------- + +Before using URE +---------------- +Before URE is used, two functions need to be created. One to check if a +character matches a set of URE character properties, and one to convert a +character to lower case. + +Stubs for these function are located in the urestubs.c file. + +Using URE +--------- + +Sample pseudo-code fragment. + + ure_buffer_t rebuf; + ure_dfa_t dfa; + ucs2_t *re, *text; + unsigned long relen, textlen; + unsigned long match_start, match_end; + + /* + * Allocate the dynamic storage needed to compile regular expressions. 
+ */ + rebuf = ure_buffer_create(); + + for each regular expression in a list { + re = next regular expression; + relen = length(re); + + /* + * Compile the regular expression with the case insensitive flag + * turned on. + */ + dfa = ure_compile(re, relen, 1, rebuf); + + /* + * Look for the first match in some text. The matching will be done + * in a case insensitive manner because the expression was compiled + * with the case insensitive flag on. + */ + if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end)) + printf("MATCH: %ld %ld\n", match_start, match_end); + + /* + * Look for the first match in some text, ignoring non-spacing + * characters. + */ + if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen, + &match_start, &match_end)) + printf("MATCH: %ld %ld\n", match_start, match_end); + + /* + * Free the DFA. + */ + ure_free_dfa(dfa); + } + + /* + * Free the dynamic storage used for compiling the expressions. + */ + ure_free_buffer(rebuf); + +--------------------------------------------------------------------------- + +Mark Leisher +29 March 1997 + +=========================================================================== + +CHANGES +------- + +Version: 0.5 +Date : 21 September 1999 +========================== + 1. Added copyright stuff and put in CVS. 
diff --git a/libraries/liblunicode/ure/ure.c b/libraries/liblunicode/ure/ure.c new file mode 100644 index 0000000000..2175034966 --- /dev/null +++ b/libraries/liblunicode/ure/ure.c @@ -0,0 +1,2304 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef lint +static char rcsid[] = "$Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $"; +#endif + +#include +#include +#include +#include "ure.h" + +/* + * Flags used internally in the DFA. 
+ */ +#define _URE_DFA_CASEFOLD 0x01 +#define _URE_DFA_BLANKLINE 0x02 + +static unsigned long cclass_flags[] = { + 0, + _URE_NONSPACING, + _URE_COMBINING, + _URE_NUMDIGIT, + _URE_NUMOTHER, + _URE_SPACESEP, + _URE_LINESEP, + _URE_PARASEP, + _URE_CNTRL, + _URE_PUA, + _URE_UPPER, + _URE_LOWER, + _URE_TITLE, + _URE_MODIFIER, + _URE_OTHERLETTER, + _URE_DASHPUNCT, + _URE_OPENPUNCT, + _URE_CLOSEPUNCT, + _URE_OTHERPUNCT, + _URE_MATHSYM, + _URE_CURRENCYSYM, + _URE_OTHERSYM, + _URE_LTR, + _URE_RTL, + _URE_EURONUM, + _URE_EURONUMSEP, + _URE_EURONUMTERM, + _URE_ARABNUM, + _URE_COMMONSEP, + _URE_BLOCKSEP, + _URE_SEGMENTSEP, + _URE_WHITESPACE, + _URE_OTHERNEUT, +}; + +/* + * Symbol types for the DFA. + */ +#define _URE_ANY_CHAR 1 +#define _URE_CHAR 2 +#define _URE_CCLASS 3 +#define _URE_NCCLASS 4 +#define _URE_BOL_ANCHOR 5 +#define _URE_EOL_ANCHOR 6 + +/* + * Op codes for converting the NFA to a DFA. + */ +#define _URE_SYMBOL 10 +#define _URE_PAREN 11 +#define _URE_QUEST 12 +#define _URE_STAR 13 +#define _URE_PLUS 14 +#define _URE_ONE 15 +#define _URE_AND 16 +#define _URE_OR 17 + +#define _URE_NOOP 0xffff + +#define _URE_REGSTART 0x8000 +#define _URE_REGEND 0x4000 + +/* + * Structure used to handle a compacted range of characters. + */ +typedef struct { + ucs4_t min_code; + ucs4_t max_code; +} _ure_range_t; + +typedef struct { + _ure_range_t *ranges; + ucs2_t ranges_used; + ucs2_t ranges_size; +} _ure_ccl_t; + +typedef union { + ucs4_t chr; + _ure_ccl_t ccl; +} _ure_sym_t; + +/* + * This is a general element structure used for expressions and stack + * elements. + */ +typedef struct { + ucs2_t reg; + ucs2_t onstack; + ucs2_t type; + ucs2_t lhs; + ucs2_t rhs; +} _ure_elt_t; + +/* + * This is a structure used to track a list or a stack of states. + */ +typedef struct { + ucs2_t *slist; + ucs2_t slist_size; + ucs2_t slist_used; +} _ure_stlist_t; + +/* + * Structure to track the list of unique states for a symbol + * during reduction. 
+ */ +typedef struct { + ucs2_t id; + ucs2_t type; + unsigned long mods; + unsigned long props; + _ure_sym_t sym; + _ure_stlist_t states; +} _ure_symtab_t; + +/* + * Structure to hold a single state. + */ +typedef struct { + ucs2_t id; + ucs2_t accepting; + ucs2_t pad; + _ure_stlist_t st; + _ure_elt_t *trans; + ucs2_t trans_size; + ucs2_t trans_used; +} _ure_state_t; + +/* + * Structure used for keeping lists of states. + */ +typedef struct { + _ure_state_t *states; + ucs2_t states_size; + ucs2_t states_used; +} _ure_statetable_t; + +/* + * Structure to track pairs of DFA states when equivalent states are + * merged. + */ +typedef struct { + ucs2_t l; + ucs2_t r; +} _ure_equiv_t; + +/* + * Structure used for constructing the NFA and reducing to a minimal DFA. + */ +typedef struct _ure_buffer_t { + int reducing; + int error; + unsigned long flags; + + _ure_stlist_t stack; + + /* + * Table of unique symbols encountered. + */ + _ure_symtab_t *symtab; + ucs2_t symtab_size; + ucs2_t symtab_used; + + /* + * Tracks the unique expressions generated for the NFA and when the NFA is + * reduced. + */ + _ure_elt_t *expr; + ucs2_t expr_used; + ucs2_t expr_size; + + /* + * The reduced table of unique groups of NFA states. + */ + _ure_statetable_t states; + + /* + * Tracks states when equivalent states are merged. + */ + _ure_equiv_t *equiv; + ucs2_t equiv_used; + ucs2_t equiv_size; +} _ure_buffer_t; + +typedef struct { + ucs2_t symbol; + ucs2_t next_state; +} _ure_trans_t; + +typedef struct { + ucs2_t accepting; + ucs2_t ntrans; + _ure_trans_t *trans; +} _ure_dstate_t; + +typedef struct _ure_dfa_t { + unsigned long flags; + + _ure_symtab_t *syms; + ucs2_t nsyms; + + _ure_dstate_t *states; + ucs2_t nstates; + + _ure_trans_t *trans; + ucs2_t ntrans; +} _ure_dfa_t; + +/************************************************************************* + * + * Functions. 
+ *
+ *************************************************************************/
+
+/*
+ * Copy `bytes' bytes from `src' to `dest'.  The regions may overlap: the
+ * copy runs backward when `src' precedes `dest' and forward when it
+ * follows it.  Nothing is done when the pointers are equal or when
+ * `bytes' is zero.
+ */
+static void
+#ifdef __STDC__
+_ure_memmove(char *dest, char *src, unsigned long bytes)
+#else
+_ure_memmove(dest, src, bytes)
+char *dest, *src;
+unsigned long bytes;
+#endif
+{
+    long i, j;
+
+    /*
+     * A zero count would enter the unrolled loop at `case 0' and copy a
+     * full group of eight bytes, so it has to be rejected up front.
+     */
+    if (bytes == 0)
+      return;
+
+    i = (long) bytes;
+    j = i & 7;
+    i = (i + 7) >> 3;
+
+    /*
+     * Do a memmove using Ye Olde Duff's Device for efficiency.
+     */
+    if (src < dest) {
+        src += bytes;
+        dest += bytes;
+
+        switch (j) {
+          case 0: do {
+              *--dest = *--src;
+            case 7: *--dest = *--src;
+            case 6: *--dest = *--src;
+            case 5: *--dest = *--src;
+            case 4: *--dest = *--src;
+            case 3: *--dest = *--src;
+            case 2: *--dest = *--src;
+            case 1: *--dest = *--src;
+          } while (--i > 0);
+        }
+    } else if (src > dest) {
+        switch (j) {
+          case 0: do {
+              *dest++ = *src++;
+            case 7: *dest++ = *src++;
+            case 6: *dest++ = *src++;
+            case 5: *dest++ = *src++;
+            case 4: *dest++ = *src++;
+            case 3: *dest++ = *src++;
+            case 2: *dest++ = *src++;
+            case 1: *dest++ = *src++;
+          } while (--i > 0);
+        }
+    }
+}
+
+/*
+ * Push the value `v' onto the stack kept in the buffer, growing the stack
+ * eight entries at a time.  Nothing is done for a null buffer.
+ *
+ * NOTE(review): the results of malloc() and realloc() are used without
+ * being checked.
+ */
+static void
+#ifdef __STDC__
+_ure_push(ucs2_t v, _ure_buffer_t *b)
+#else
+_ure_push(v, b)
+ucs2_t v;
+_ure_buffer_t *b;
+#endif
+{
+    _ure_stlist_t *s;
+
+    if (b == 0)
+      return;
+
+    /*
+     * If the `reducing' parameter is non-zero, check to see if the value
+     * passed is already on the stack.
+     */
+    if (b->reducing != 0 && b->expr[v].onstack != 0)
+      return;
+
+    s = &b->stack;
+    if (s->slist_used == s->slist_size) {
+        if (s->slist_size == 0)
+          s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
+        else
+          s->slist = (ucs2_t *) realloc((char *) s->slist,
+                                        sizeof(ucs2_t) * (s->slist_size + 8));
+        s->slist_size += 8;
+    }
+    s->slist[s->slist_used++] = v;
+
+    /*
+     * If the `reducing' parameter is non-zero, flag the element as being on
+     * the stack.
+     */
+    if (b->reducing != 0)
+      b->expr[v].onstack = 1;
+}
+
+/*
+ * Return the value on top of the buffer's stack without removing it.
+ * Returns _URE_NOOP if the buffer is null or the stack is empty.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_peek(_ure_buffer_t *b)
+#else
+_ure_peek(b)
+_ure_buffer_t *b;
+#endif
+{
+    if (b == 0 || b->stack.slist_used == 0)
+      return _URE_NOOP;
+
+    return b->stack.slist[b->stack.slist_used - 1];
+}
+
+/*
+ * Remove and return the value on top of the buffer's stack.  Returns
+ * _URE_NOOP if the buffer is null or the stack is empty.  While the
+ * `reducing' flag is set, the `onstack' mark of the popped expression is
+ * cleared so that it can be pushed again later.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_pop(_ure_buffer_t *b)
+#else
+_ure_pop(b)
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t v;
+
+    if (b == 0 || b->stack.slist_used == 0)
+      return _URE_NOOP;
+
+    v = b->stack.slist[--b->stack.slist_used];
+    if (b->reducing)
+      b->expr[v].onstack = 0;
+
+    return v;
+}
+
+/*************************************************************************
+ *
+ * Start symbol parse functions.
+ *
+ *************************************************************************/
+
+/*
+ * Parse a comma-separated list of integers that represent character
+ * properties. Combine them into a mask that is returned in the `mask'
+ * variable, and return the number of characters consumed.
+ *
+ * Parsing stops at the first character that is neither a digit nor a
+ * comma, at `limit' characters, or as soon as an error is recorded in the
+ * buffer.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
+               _ure_buffer_t *b)
+#else
+_ure_prop_list(pp, limit, mask, b)
+ucs2_t *pp;
+unsigned long limit, *mask;
+_ure_buffer_t *b;
+#endif
+{
+    unsigned long n, m;
+    ucs2_t *sp, *ep;
+
+    sp = pp;
+    ep = sp + limit;
+
+    for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
+        if (*sp == ',') {
+            /*
+             * Encountered a comma, so select the next character property flag
+             * and reset the number.
+             */
+            m |= cclass_flags[n];
+            n = 0;
+        } else if (*sp >= '0' && *sp <= '9')
+          /*
+           * Encountered a digit, so start or continue building the cardinal
+           * that represents the character property flag.
+           */
+          n = (n * 10) + (*sp - '0');
+        else
+          /*
+           * Encountered something that is not part of the property list.
+           * Indicate that we are done.
+           */
+          break;
+
+        /*
+         * If a property number greater than 32 occurs, then there is a
+         * problem. Most likely a missing comma separator.
+         */
+        if (n > 32)
+          b->error = _URE_INVALID_PROPERTY;
+    }
+
+    /*
+     * Fold in the last property number.  When the loop stopped because of
+     * an invalid number, `n' is larger than 32 and must not be used as an
+     * index into the flag table.
+     */
+    if (n != 0 && n <= 32)
+      m |= cclass_flags[n];
+
+    /*
+     * Set the mask that represents the group of character properties.
+     */
+    *mask = m;
+
+    /*
+     * Return the number of characters consumed.
+     */
+    return sp - pp;
+}
+
+/*
+ * Collect a hex number with 1 to 4 digits and return the number
+ * of characters used.  The value collected is stored in `n'; it is 0 when
+ * no hex digit was found.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
+#else
+_ure_hex(np, limit, n)
+ucs2_t *np;
+unsigned long limit;
+ucs4_t *n;
+#endif
+{
+    ucs2_t i;
+    ucs2_t *sp, *ep;
+    ucs4_t nn;
+
+    sp = np;
+    ep = sp + limit;
+
+    for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
+        if (*sp >= '0' && *sp <= '9')
+          nn = (nn << 4) + (*sp - '0');
+        else if (*sp >= 'A' && *sp <= 'F')
+          nn = (nn << 4) + ((*sp - 'A') + 10);
+        else if (*sp >= 'a' && *sp <= 'f')
+          nn = (nn << 4) + ((*sp - 'a') + 10);
+        else
+          /*
+           * Encountered something that is not a hex digit.
+           */
+          break;
+    }
+
+    /*
+     * Assign the character code collected and return the number of
+     * characters used.
+     */
+    *n = nn;
+
+    return sp - np;
+}
+
+/*
+ * Insert a range into a character class, removing duplicates and ordering
+ * them in increasing range-start order.
+ */
+static void
+#ifdef __STDC__
+_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
+#else
+_ure_add_range(ccl, r, b)
+_ure_ccl_t *ccl;
+_ure_range_t *r;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t i;
+    ucs4_t tmp;
+    _ure_range_t *rp;
+
+    /*
+     * If the `casefold' flag is set, then make sure both endpoints of the
+     * range are converted to lower case.
+     */
+    if (b->flags & _URE_DFA_CASEFOLD) {
+        r->min_code = _ure_tolower(r->min_code);
+        r->max_code = _ure_tolower(r->max_code);
+    }
+
+    /*
+     * Swap the range endpoints if they are not in increasing order.
+     */
+    if (r->min_code > r->max_code) {
+        tmp = r->min_code;
+        r->min_code = r->max_code;
+        r->max_code = tmp;
+    }
+
+    /*
+     * Find the slot that keeps the ranges in increasing range-start order
+     * (ranges with the same start are ordered by their end), so that an
+     * identical range, if present, is the one found at that slot.
+     */
+    for (i = 0, rp = ccl->ranges;
+         i < ccl->ranges_used &&
+             (rp->min_code < r->min_code ||
+              (rp->min_code == r->min_code && rp->max_code < r->max_code));
+         i++, rp++) ;
+
+    /*
+     * Check for a duplicate.
+     */
+    if (i < ccl->ranges_used &&
+        r->min_code == rp->min_code && r->max_code == rp->max_code)
+      return;
+
+    if (ccl->ranges_used == ccl->ranges_size) {
+        if (ccl->ranges_size == 0)
+          ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
+        else
+          ccl->ranges = (_ure_range_t *)
+              realloc((char *) ccl->ranges,
+                      sizeof(_ure_range_t) * (ccl->ranges_size + 8));
+        ccl->ranges_size += 8;
+    }
+
+    /*
+     * The table may have moved when it was grown, so locate the insertion
+     * slot again from its index.
+     */
+    rp = ccl->ranges + i;
+
+    /*
+     * Open a gap at the slot by shifting the following ranges up by one.
+     */
+    if (i < ccl->ranges_used)
+      _ure_memmove((char *) (rp + 1), (char *) rp,
+                   sizeof(_ure_range_t) * (ccl->ranges_used - i));
+
+    ccl->ranges_used++;
+    rp->min_code = r->min_code;
+    rp->max_code = r->max_code;
+}
+
+/*
+ * Property masks used for the POSIX character classes.
+ */
+#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
+_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
+#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
+#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
+_URE_OTHERPUNCT)
+#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
+_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
+#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
+#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
+
+/*
+ * Type of the functions that set up a symbol for a named character class.
+ */
+typedef void (*_ure_cclsetup_t)(
+#ifdef __STDC__
+    _ure_symtab_t *sym,
+    unsigned long mask,
+    _ure_buffer_t *b
+#endif
+);
+
+/*
+ * Node of the trie used to recognize the character class names.
+ */
+typedef struct {
+    ucs2_t key;
+    unsigned long len;
+    unsigned long next;
+    _ure_cclsetup_t func;
+    unsigned long mask;
+} _ure_trie_t;
+
+/*
+ * Add the properties in `mask' to the symbol.  The buffer is not used.
+ */
+static void
+#ifdef __STDC__
+_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+#else
+_ure_ccl_setup(sym, mask, b)
+_ure_symtab_t *sym;
+unsigned long mask;
+_ure_buffer_t *b;
+#endif
+{
+    sym->props |= mask;
+}
+
+static void
+#ifdef __STDC__
+_ure_space_setup(_ure_symtab_t 
*sym, unsigned long mask, _ure_buffer_t *b) +#else +_ure_space_setup(sym, mask, *b) +_ure_symtab_t *sym; +unsigned long mask; +_ure_buffer_t b; +#endif +{ + _ure_range_t range; + + sym->props |= mask; + + /* + * Add the additional characters needed for handling isspace(). + */ + range.min_code = range.max_code = '\t'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = range.max_code = '\r'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = range.max_code = '\n'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = range.max_code = '\f'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = range.max_code = 0xfeff; + _ure_add_range(&sym->sym.ccl, &range, b); +} + +static void +#ifdef __STDC__ +_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) +#else +_ure_xdigit_setup(sym, mask, b) +_ure_symtab_t *sym; +unsigned long mask; +_ure_buffer_t *b; +#endif +{ + _ure_range_t range; + + /* + * Add the additional characters needed for handling isxdigit(). 
+ */ + range.min_code = '0'; + range.max_code = '9'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = 'A'; + range.max_code = 'F'; + _ure_add_range(&sym->sym.ccl, &range, b); + range.min_code = 'a'; + range.max_code = 'f'; + _ure_add_range(&sym->sym.ccl, &range, b); +} + +static _ure_trie_t cclass_trie[] = { + {0x003a, 1, 1, 0, 0}, + {0x0061, 9, 10, 0, 0}, + {0x0063, 8, 19, 0, 0}, + {0x0064, 7, 24, 0, 0}, + {0x0067, 6, 29, 0, 0}, + {0x006c, 5, 34, 0, 0}, + {0x0070, 4, 39, 0, 0}, + {0x0073, 3, 49, 0, 0}, + {0x0075, 2, 54, 0, 0}, + {0x0078, 1, 59, 0, 0}, + {0x006c, 1, 11, 0, 0}, + {0x006e, 2, 13, 0, 0}, + {0x0070, 1, 16, 0, 0}, + {0x0075, 1, 14, 0, 0}, + {0x006d, 1, 15, 0, 0}, + {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, + {0x0068, 1, 17, 0, 0}, + {0x0061, 1, 18, 0, 0}, + {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, + {0x006e, 1, 20, 0, 0}, + {0x0074, 1, 21, 0, 0}, + {0x0072, 1, 22, 0, 0}, + {0x006c, 1, 23, 0, 0}, + {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, + {0x0069, 1, 25, 0, 0}, + {0x0067, 1, 26, 0, 0}, + {0x0069, 1, 27, 0, 0}, + {0x0074, 1, 28, 0, 0}, + {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, + {0x0072, 1, 30, 0, 0}, + {0x0061, 1, 31, 0, 0}, + {0x0070, 1, 32, 0, 0}, + {0x0068, 1, 33, 0, 0}, + {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, + {0x006f, 1, 35, 0, 0}, + {0x0077, 1, 36, 0, 0}, + {0x0065, 1, 37, 0, 0}, + {0x0072, 1, 38, 0, 0}, + {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, + {0x0072, 2, 41, 0, 0}, + {0x0075, 1, 45, 0, 0}, + {0x0069, 1, 42, 0, 0}, + {0x006e, 1, 43, 0, 0}, + {0x0074, 1, 44, 0, 0}, + {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, + {0x006e, 1, 46, 0, 0}, + {0x0063, 1, 47, 0, 0}, + {0x0074, 1, 48, 0, 0}, + {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, + {0x0070, 1, 50, 0, 0}, + {0x0061, 1, 51, 0, 0}, + {0x0063, 1, 52, 0, 0}, + {0x0065, 1, 53, 0, 0}, + {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, + {0x0070, 1, 55, 0, 0}, + {0x0070, 1, 56, 0, 0}, + {0x0065, 1, 57, 0, 0}, + {0x0072, 1, 58, 
0, 0}, + {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, + {0x0064, 1, 60, 0, 0}, + {0x0069, 1, 61, 0, 0}, + {0x0067, 1, 62, 0, 0}, + {0x0069, 1, 63, 0, 0}, + {0x0074, 1, 64, 0, 0}, + {0x003a, 1, 65, _ure_xdigit_setup, 0}, +}; + +/* + * Probe for one of the POSIX colon delimited character classes in the static + * trie. + */ +static unsigned long +#ifdef __STDC__ +_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, + _ure_buffer_t *b) +#else +_ure_posix_ccl(cp, limit, sym, b) +ucs2_t *cp; +unsigned long limit; +_ure_symtab_t *sym; +_ure_buffer_t *b; +#endif +{ + int i; + unsigned long n; + _ure_trie_t *tp; + ucs2_t *sp, *ep; + + /* + * If the number of characters left is less than 7, then this cannot be + * interpreted as one of the colon delimited classes. + */ + if (limit < 7) + return 0; + + sp = cp; + ep = sp + limit; + tp = cclass_trie; + for (i = 0; sp < ep && i < 8; i++, sp++) { + n = tp->len; + + for (; n > 0 && tp->key != *sp; tp++, n--) ; + + if (n == 0) + return 0; + + if (*sp == ':' && (i == 6 || i == 7)) { + sp++; + break; + } + if (sp + 1 < ep) + tp = cclass_trie + tp->next; + } + if (tp->func == 0) + return 0; + + (*tp->func)(sym, tp->mask, b); + + return sp - cp; +} + +/* + * Construct a list of ranges and return the number of characters consumed. 
+ */ +static unsigned long +#ifdef __STDC__ +_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, + _ure_buffer_t *b) +#else +_ure_cclass(cp, limit, symp, b) +ucs2_t *cp; +unsigned long limit; +_ure_symtab_t *symp; +_ure_buffer_t *b; +#endif +{ + int range_end; + unsigned long n; + ucs2_t *sp, *ep; + ucs4_t c, last; + _ure_ccl_t *cclp; + _ure_range_t range; + + sp = cp; + ep = sp + limit; + + if (*sp == '^') { + symp->type = _URE_NCCLASS; + sp++; + } else + symp->type = _URE_CCLASS; + + for (last = 0, range_end = 0; + b->error == _URE_OK && sp < ep && *sp != ']'; ) { + c = *sp++; + if (c == '\\') { + if (sp == ep) { + /* + * The EOS was encountered when expecting the reverse solidus + * to be followed by the character it is escaping. Set an + * error code and return the number of characters consumed up + * to this point. + */ + b->error = _URE_UNEXPECTED_EOS; + return sp - cp; + } + + c = *sp++; + switch (c) { + case 'a': + c = 0x07; + break; + case 'b': + c = 0x08; + break; + case 'f': + c = 0x0c; + break; + case 'n': + c = 0x0a; + break; + case 'r': + c = 0x0d; + break; + case 't': + c = 0x09; + break; + case 'v': + c = 0x0b; + break; + case 'p': + case 'P': + sp += _ure_prop_list(sp, ep - sp, &symp->props, b); + /* + * Invert the bit mask of the properties if this is a negated + * character class or if 'P' is used to specify a list of + * character properties that should *not* match in a + * character class. + */ + if (c == 'P') + symp->props = ~symp->props; + continue; + break; + case 'x': + case 'X': + case 'u': + case 'U': + if (sp < ep && + ((*sp >= '0' && *sp <= '9') || + (*sp >= 'A' && *sp <= 'F') || + (*sp >= 'a' && *sp <= 'f'))) + sp += _ure_hex(sp, ep - sp, &c); + } + } else if (c == ':') { + /* + * Probe for a POSIX colon delimited character class. 
+ */ + sp--; + if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) + sp++; + else { + sp += n; + continue; + } + } + + cclp = &symp->sym.ccl; + + /* + * Check to see if the current character is a low surrogate that needs + * to be combined with a preceding high surrogate. + */ + if (last != 0) { + if (c >= 0xdc00 && c <= 0xdfff) + /* + * Construct the UTF16 character code. + */ + c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); + else { + /* + * Add the isolated high surrogate to the range. + */ + if (range_end == 1) + range.max_code = last & 0xffff; + else + range.min_code = range.max_code = last & 0xffff; + + _ure_add_range(cclp, &range, b); + range_end = 0; + } + } + + /* + * Clear the last character code. + */ + last = 0; + + /* + * This slightly awkward code handles the different cases needed to + * construct a range. + */ + if (c >= 0xd800 && c <= 0xdbff) { + /* + * If the high surrogate is followed by a range indicator, simply + * add it as the range start. Otherwise, save it in case the next + * character is a low surrogate. + */ + if (*sp == '-') { + sp++; + range.min_code = c; + range_end = 1; + } else + last = c; + } else if (range_end == 1) { + range.max_code = c; + _ure_add_range(cclp, &range, b); + range_end = 0; + } else { + range.min_code = range.max_code = c; + if (*sp == '-') { + sp++; + range_end = 1; + } else + _ure_add_range(cclp, &range, b); + } + } + + if (sp < ep && *sp == ']') + sp++; + else + /* + * The parse was not terminated by the character class close symbol + * (']'), so set an error code. + */ + b->error = _URE_CCLASS_OPEN; + + return sp - cp; +} + +/* + * Probe for a low surrogate hex code. 
+ */ +static unsigned long +#ifdef __STDC__ +_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) +#else +_ure_probe_ls(ls, limit, c) +ucs2_t *ls; +unsigned long limit; +ucs4_t *c; +#endif +{ + ucs4_t i, code; + ucs2_t *sp, *ep; + + for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { + if (*sp >= '0' && *sp <= '9') + code = (code << 4) + (*sp - '0'); + else if (*sp >= 'A' && *sp <= 'F') + code = (code << 4) + ((*sp - 'A') + 10); + else if (*sp >= 'a' && *sp <= 'f') + code = (code << 4) + ((*sp - 'a') + 10); + else + break; + } + + *c = code; + return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; +} + +static unsigned long +#ifdef __STDC__ +_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, + _ure_buffer_t *b) +#else +_ure_compile_symbol(sym, limit, symp, b) +ucs2_t *sym; +unsigned long limit; +_ure_symtab_t *symp; +_ure_buffer_t *b; +#endif +{ + ucs4_t c; + ucs2_t *sp, *ep; + + sp = sym; + ep = sym + limit; + + if ((c = *sp++) == '\\') { + + if (sp == ep) { + /* + * The EOS was encountered when expecting the reverse solidus to + * be followed by the character it is escaping. Set an error code + * and return the number of characters consumed up to this point. + */ + b->error = _URE_UNEXPECTED_EOS; + return sp - sym; + } + + c = *sp++; + switch (c) { + case 'p': + case 'P': + symp->type = (c == 'p') ? 
_URE_CCLASS : _URE_NCCLASS; + sp += _ure_prop_list(sp, ep - sp, &symp->props, b); + break; + case 'a': + symp->type = _URE_CHAR; + symp->sym.chr = 0x07; + break; + case 'b': + symp->type = _URE_CHAR; + symp->sym.chr = 0x08; + break; + case 'f': + symp->type = _URE_CHAR; + symp->sym.chr = 0x0c; + break; + case 'n': + symp->type = _URE_CHAR; + symp->sym.chr = 0x0a; + break; + case 'r': + symp->type = _URE_CHAR; + symp->sym.chr = 0x0d; + break; + case 't': + symp->type = _URE_CHAR; + symp->sym.chr = 0x09; + break; + case 'v': + symp->type = _URE_CHAR; + symp->sym.chr = 0x0b; + break; + case 'x': + case 'X': + case 'u': + case 'U': + /* + * Collect between 1 and 4 digits representing a UCS2 code. Fall + * through to the next case. + */ + if (sp < ep && + ((*sp >= '0' && *sp <= '9') || + (*sp >= 'A' && *sp <= 'F') || + (*sp >= 'a' && *sp <= 'f'))) + sp += _ure_hex(sp, ep - sp, &c); + /* FALLTHROUGH */ + default: + /* + * Simply add an escaped character here. + */ + symp->type = _URE_CHAR; + symp->sym.chr = c; + } + } else if (c == '^' || c == '$') + /* + * Handle the BOL and EOL anchors. This actually consists simply of + * setting a flag that indicates that the user supplied anchor match + * function should be called. This needs to be done instead of simply + * matching line/paragraph separators because beginning-of-text and + * end-of-text tests are needed as well. + */ + symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; + else if (c == '[') + /* + * Construct a character class. + */ + sp += _ure_cclass(sp, ep - sp, symp, b); + else if (c == '.') + symp->type = _URE_ANY_CHAR; + else { + symp->type = _URE_CHAR; + symp->sym.chr = c; + } + + /* + * If the symbol type happens to be a character and is a high surrogate, + * then probe forward to see if it is followed by a low surrogate that + * needs to be added. 
+ */ + if (sp < ep && symp->type == _URE_CHAR && + 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { + + if (0xdc00 <= *sp && *sp <= 0xdfff) { + symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | + (*sp & 0x03ff)); + sp++; + } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || + *(sp + 1) == 'u' || *(sp + 1) == 'U')) { + sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); + if (0xdc00 <= c && c <= 0xdfff) { + /* + * Take into account the \[xu] in front of the hex code. + */ + sp += 2; + symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | + (c & 0x03ff)); + } + } + } + + /* + * Last, make sure any _URE_CHAR type symbols are changed to lower case if + * the `casefold' flag is set. + */ + if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) + symp->sym.chr = _ure_tolower(symp->sym.chr); + + /* + * If the symbol constructed is anything other than one of the anchors, + * make sure the _URE_DFA_BLANKLINE flag is removed. + */ + if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) + b->flags &= ~_URE_DFA_BLANKLINE; + + /* + * Return the number of characters consumed. + */ + return sp - sym; +} + +static int +#ifdef __STDC__ +_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) +#else +_ure_sym_neq(a, b) +_ure_symtab_t *a, *b; +#endif +{ + if (a->type != b->type || a->mods != b->mods || a->props != b->props) + return 1; + + if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { + if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) + return 1; + if (a->sym.ccl.ranges_used > 0 && + memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, + sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) + return 1; + } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) + return 1; + return 0; +} + +/* + * Construct a symbol, but only keep unique symbols. 
+ */ +static ucs2_t +#ifdef __stdc__ +_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, + _ure_buffer_t *b) +#else +_ure_make_symbol(sym, limit, consumed, b) +ucs2_t *sym; +unsigned long limit, *consumed; +_ure_buffer_t *b; +#endif +{ + ucs2_t i; + _ure_symtab_t *sp, symbol; + + /* + * Build the next symbol so we can test to see if it is already in the + * symbol table. + */ + (void) memset((char *) &symbol, 0, sizeof(_ure_symtab_t)); + *consumed = _ure_compile_symbol(sym, limit, &symbol, b); + + /* + * Check to see if the symbol exists. + */ + for (i = 0, sp = b->symtab; + i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; + + if (i < b->symtab_used) { + /* + * Free up any ranges used for the symbol. + */ + if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && + symbol.sym.ccl.ranges_size > 0) + free((char *) symbol.sym.ccl.ranges); + + return b->symtab[i].id; + } + + /* + * Need to add the new symbol. + */ + if (b->symtab_used == b->symtab_size) { + if (b->symtab_size == 0) + b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); + else + b->symtab = (_ure_symtab_t *) + realloc((char *) b->symtab, + sizeof(_ure_symtab_t) * (b->symtab_size + 8)); + sp = b->symtab + b->symtab_size; + (void) memset((char *) sp, 0, sizeof(_ure_symtab_t) << 3); + b->symtab_size += 8; + } + + symbol.id = b->symtab_used++; + (void) memcpy((char *) &b->symtab[symbol.id], (char *) &symbol, + sizeof(_ure_symtab_t)); + + return symbol.id; +} + +/************************************************************************* + * + * End symbol parse functions. 
+ * + *************************************************************************/ + +static ucs2_t +#ifdef __stdc__ +_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) +#else +_ure_make_expr(type, lhs, rhs, b) +ucs2_t type, lhs, rhs; +_ure_buffer_t *b; +#endif +{ + ucs2_t i; + + if (b == 0) + return _URE_NOOP; + + /* + * Determine if the expression already exists or not. + */ + for (i = 0; i < b->expr_used; i++) { + if (b->expr[i].type == type && b->expr[i].lhs == lhs && + b->expr[i].rhs == rhs) + break; + } + if (i < b->expr_used) + return i; + + /* + * Need to add a new expression. + */ + if (b->expr_used == b->expr_size) { + if (b->expr_size == 0) + b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); + else + b->expr = (_ure_elt_t *) + realloc((char *) b->expr, + sizeof(_ure_elt_t) * (b->expr_size + 8)); + b->expr_size += 8; + } + + b->expr[b->expr_used].onstack = 0; + b->expr[b->expr_used].type = type; + b->expr[b->expr_used].lhs = lhs; + b->expr[b->expr_used].rhs = rhs; + + return b->expr_used++; +} + +static unsigned char spmap[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ + (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) + +/* + * Convert the regular expression into an NFA in a form that will be easy to + * reduce to a DFA. The starting state for the reduction will be returned. 
+ */ +static ucs2_t +#ifdef __STDC__ +_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) +#else +_ure_re2nfa(re, relen, b) +ucs2_t *re; +unsigned long relen; +_ure_buffer_t *b; +#endif +{ + ucs2_t c, state, top, sym, *sp, *ep; + unsigned long used; + + state = _URE_NOOP; + + sp = re; + ep = sp + relen; + while (b->error == _URE_OK && sp < ep) { + c = *sp++; + switch (c) { + case '(': + _ure_push(_URE_PAREN, b); + break; + case ')': + /* + * Check for the case of too many close parentheses. + */ + if (_ure_peek(b) == _URE_NOOP) { + b->error = _URE_UNBALANCED_GROUP; + break; + } + + while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) + /* + * Make an expression with the AND or OR operator and its right + * hand side. + */ + state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); + + /* + * Remove the _URE_PAREN off the stack. + */ + (void) _ure_pop(b); + break; + case '*': + state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); + break; + case '+': + state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); + break; + case '?': + state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); + break; + case '|': + while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) + /* + * Make an expression with the AND or OR operator and its right + * hand side. + */ + state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); + + _ure_push(state, b); + _ure_push(_URE_OR, b); + break; + default: + sp--; + sym = _ure_make_symbol(sp, ep - sp, &used, b); + sp += used; + state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); + break; + } + + if (c != '(' && c != '|' && sp < ep && + (!_ure_isspecial(*sp) || *sp == '(')) { + _ure_push(state, b); + _ure_push(_URE_AND, b); + } + } + while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) + /* + * Make an expression with the AND or OR operator and its right + * hand side. 
+ */ + state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); + + if (b->stack.slist_used > 0) + b->error = _URE_UNBALANCED_GROUP; + + return (b->error == _URE_OK) ? state : _URE_NOOP; +} + +static void +#ifdef __STDC__ +_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) +#else +_ure_add_symstate(sym, state, b) +ucs2_t sym, state; +_ure_buffer_t *b; +#endif +{ + ucs2_t i, *stp; + _ure_symtab_t *sp; + + /* + * Locate the symbol in the symbol table so the state can be added. + * If the symbol doesn't exist, then a real problem exists. + */ + for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; + i++, sp++) ; + + /* + * Now find out if the state exists in the symbol's state list. + */ + for (i = 0, stp = sp->states.slist; + i < sp->states.slist_used && state > *stp; i++, stp++) ; + + if (i == sp->states.slist_used || state < *stp) { + /* + * Need to add the state in order. + */ + if (sp->states.slist_used == sp->states.slist_size) { + if (sp->states.slist_size == 0) + sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); + else + sp->states.slist = (ucs2_t *) + realloc((char *) sp->states.slist, + sizeof(ucs2_t) * (sp->states.slist_size + 8)); + sp->states.slist_size += 8; + } + if (i < sp->states.slist_used) + (void) _ure_memmove((char *) (sp->states.slist + i + 1), + (char *) (sp->states.slist + i), + sizeof(ucs2_t) * (sp->states.slist_used - i)); + sp->states.slist[i] = state; + sp->states.slist_used++; + } +} + +static ucs2_t +#ifdef __STDC__ +_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) +#else +_ure_add_state(nstates, states, b) +ucs2_t nstates, *states; +_ure_buffer_t *b; +#endif +{ + ucs2_t i; + _ure_state_t *sp; + + for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { + if (sp->st.slist_used == nstates && + memcmp((char *) states, (char *) sp->st.slist, + sizeof(ucs2_t) * nstates) == 0) + break; + } + + if (i == b->states.states_used) { + /* + * Need to add a new DFA state (set of NFA 
states). + */ + if (b->states.states_used == b->states.states_size) { + if (b->states.states_size == 0) + b->states.states = (_ure_state_t *) + malloc(sizeof(_ure_state_t) << 3); + else + b->states.states = (_ure_state_t *) + realloc((char *) b->states.states, + sizeof(_ure_state_t) * (b->states.states_size + 8)); + sp = b->states.states + b->states.states_size; + (void) memset((char *) sp, 0, sizeof(_ure_state_t) << 3); + b->states.states_size += 8; + } + + sp = b->states.states + b->states.states_used++; + sp->id = i; + + if (sp->st.slist_used + nstates > sp->st.slist_size) { + if (sp->st.slist_size == 0) + sp->st.slist = (ucs2_t *) + malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); + else + sp->st.slist = (ucs2_t *) + realloc((char *) sp->st.slist, + sizeof(ucs2_t) * (sp->st.slist_used + nstates)); + sp->st.slist_size = sp->st.slist_used + nstates; + } + sp->st.slist_used = nstates; + (void) memcpy((char *) sp->st.slist, (char *) states, + sizeof(ucs2_t) * nstates); + } + + /* + * Return the ID of the DFA state representing a group of NFA states. + */ + return i; +} + +static void +#ifdef __STDC__ +_ure_reduce(ucs2_t start, _ure_buffer_t *b) +#else +_ure_reduce(start, b) +ucs2_t start; +_ure_buffer_t *b; +#endif +{ + ucs2_t i, j, state, eval, syms, rhs; + ucs2_t s1, s2, ns1, ns2; + _ure_state_t *sp; + _ure_symtab_t *smp; + + b->reducing = 1; + + /* + * Add the starting state for the reduction. + */ + _ure_add_state(1, &start, b); + + /* + * Process each set of NFA states that get created. + */ + for (i = 0; i < b->states.states_used; i++) { + sp = b->states.states + i; + + /* + * Push the current states on the stack. + */ + for (j = 0; j < sp->st.slist_used; j++) + _ure_push(sp->st.slist[j], b); + + /* + * Reduce the NFA states. 
+ */ + for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { + state = b->stack.slist[j]; + eval = 1; + + /* + * This inner loop is the iterative equivalent of recursively + * reducing subexpressions generated as a result of a reduction. + */ + while (eval) { + switch (b->expr[state].type) { + case _URE_SYMBOL: + ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); + _ure_add_symstate(b->expr[state].lhs, ns1, b); + syms++; + eval = 0; + break; + case _URE_ONE: + sp->accepting = 1; + eval = 0; + break; + case _URE_QUEST: + s1 = b->expr[state].lhs; + ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); + state = _ure_make_expr(_URE_OR, ns1, s1, b); + break; + case _URE_PLUS: + s1 = b->expr[state].lhs; + ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); + state = _ure_make_expr(_URE_AND, s1, ns1, b); + break; + case _URE_STAR: + s1 = b->expr[state].lhs; + ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); + ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); + state = _ure_make_expr(_URE_OR, ns1, ns2, b); + break; + case _URE_OR: + s1 = b->expr[state].lhs; + s2 = b->expr[state].rhs; + _ure_push(s1, b); + _ure_push(s2, b); + eval = 0; + break; + case _URE_AND: + s1 = b->expr[state].lhs; + s2 = b->expr[state].rhs; + switch (b->expr[s1].type) { + case _URE_SYMBOL: + _ure_add_symstate(b->expr[s1].lhs, s2, b); + syms++; + eval = 0; + break; + case _URE_ONE: + state = s2; + break; + case _URE_QUEST: + ns1 = b->expr[s1].lhs; + ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); + state = _ure_make_expr(_URE_OR, s2, ns2, b); + break; + case _URE_PLUS: + ns1 = b->expr[s1].lhs; + ns2 = _ure_make_expr(_URE_OR, s2, state, b); + state = _ure_make_expr(_URE_AND, ns1, ns2, b); + break; + case _URE_STAR: + ns1 = b->expr[s1].lhs; + ns2 = _ure_make_expr(_URE_AND, ns1, state, b); + state = _ure_make_expr(_URE_OR, s2, ns2, b); + break; + case _URE_OR: + ns1 = b->expr[s1].lhs; + ns2 = b->expr[s1].rhs; + ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); + ns2 = 
_ure_make_expr(_URE_AND, ns2, s2, b); + state = _ure_make_expr(_URE_OR, ns1, ns2, b); + break; + case _URE_AND: + ns1 = b->expr[s1].lhs; + ns2 = b->expr[s1].rhs; + ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); + state = _ure_make_expr(_URE_AND, ns1, ns2, b); + break; + } + } + } + } + + /* + * Clear the state stack. + */ + while (_ure_pop(b) != _URE_NOOP) ; + + /* + * Reset the state pointer because the reduction may have moved it + * during a reallocation. + */ + sp = b->states.states + i; + + /* + * Generate the DFA states for the symbols collected during the + * current reduction. + */ + if (sp->trans_used + syms > sp->trans_size) { + if (sp->trans_size == 0) + sp->trans = (_ure_elt_t *) + malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); + else + sp->trans = (_ure_elt_t *) + realloc((char *) sp->trans, + sizeof(_ure_elt_t) * (sp->trans_used + syms)); + sp->trans_size = sp->trans_used + syms; + } + + /* + * Go through the symbol table and generate the DFA state transitions + * for each symbol that has collected NFA states. + */ + for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { + sp = b->states.states + i; + + if (smp->states.slist_used > 0) { + sp->trans[syms].lhs = smp->id; + rhs = _ure_add_state(smp->states.slist_used, + smp->states.slist, b); + /* + * Reset the state pointer in case the reallocation moves it + * in memory. + */ + sp = b->states.states + i; + sp->trans[syms].rhs = rhs; + + smp->states.slist_used = 0; + syms++; + } + } + + /* + * Set the number of transitions actually used. + */ + sp->trans_used = syms; + } + b->reducing = 0; +} + +static void +#ifdef __STDC__ +_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) +#else +_ure_add_equiv(l, r, b) +ucs2_t l, r; +_ure_buffer_t *b; +#endif +{ + ucs2_t tmp; + + l = b->states.states[l].id; + r = b->states.states[r].id; + + if (l == r) + return; + + if (l > r) { + tmp = l; + l = r; + r = tmp; + } + + /* + * Check to see if the equivalence pair already exists. 
+ */ + for (tmp = 0; tmp < b->equiv_used && + (b->equiv[tmp].l != l || b->equiv[tmp].r != r); + tmp++) ; + + if (tmp < b->equiv_used) + return; + + if (b->equiv_used == b->equiv_size) { + if (b->equiv_size == 0) + b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); + else + b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, + sizeof(_ure_equiv_t) * + (b->equiv_size + 8)); + b->equiv_size += 8; + } + b->equiv[b->equiv_used].l = l; + b->equiv[b->equiv_used].r = r; + b->equiv_used++; +} + +/* + * Merge the DFA states that are equivalent. + */ +static void +#ifdef __STDC__ +_ure_merge_equiv(_ure_buffer_t *b) +#else +_ure_merge_equiv(b) +_ure_buffer_t *b; +#endif +{ + ucs2_t i, j, k, eq, done; + _ure_state_t *sp1, *sp2, *ls, *rs; + + for (i = 0; i < b->states.states_used; i++) { + sp1 = b->states.states + i; + if (sp1->id != i) + continue; + for (j = 0; j < i; j++) { + sp2 = b->states.states + j; + if (sp2->id != j) + continue; + b->equiv_used = 0; + _ure_add_equiv(i, j, b); + for (eq = 0, done = 0; eq < b->equiv_used; eq++) { + ls = b->states.states + b->equiv[eq].l; + rs = b->states.states + b->equiv[eq].r; + if (ls->accepting != rs->accepting || + ls->trans_used != rs->trans_used) { + done = 1; + break; + } + for (k = 0; k < ls->trans_used && + ls->trans[k].lhs == rs->trans[k].lhs; k++) ; + if (k < ls->trans_used) { + done = 1; + break; + } + + for (k = 0; k < ls->trans_used; k++) + _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); + } + if (done == 0) + break; + } + for (eq = 0; j < i && eq < b->equiv_used; eq++) + b->states.states[b->equiv[eq].r].id = + b->states.states[b->equiv[eq].l].id; + } + + /* + * Renumber the states appropriately. + */ + for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; + sp1++, i++) + sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; +} + +/************************************************************************* + * + * API. 
+ * + *************************************************************************/ + +ure_buffer_t +#ifdef __STDC__ +ure_buffer_create(void) +#else +ure_buffer_create() +#endif +{ + ure_buffer_t b; + + b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); + + return b; +} + +void +#ifdef __STDC__ +ure_buffer_free(ure_buffer_t buf) +#else +ure_buffer_free(buf) +ure_buffer_t buf; +#endif +{ + unsigned long i; + + if (buf == 0) + return; + + if (buf->stack.slist_size > 0) + free((char *) buf->stack.slist); + + if (buf->expr_size > 0) + free((char *) buf->expr); + + for (i = 0; i < buf->symtab_size; i++) { + if (buf->symtab[i].states.slist_size > 0) + free((char *) buf->symtab[i].states.slist); + } + + if (buf->symtab_size > 0) + free((char *) buf->symtab); + + for (i = 0; i < buf->states.states_size; i++) { + if (buf->states.states[i].trans_size > 0) + free((char *) buf->states.states[i].trans); + if (buf->states.states[i].st.slist_size > 0) + free((char *) buf->states.states[i].st.slist); + } + + if (buf->states.states_size > 0) + free((char *) buf->states.states); + + if (buf->equiv_size > 0) + free((char *) buf->equiv); + + free((char *) buf); +} + +ure_dfa_t +#ifdef __STDC__ +ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) +#else +ure_compile(re, relen, casefold, buf) +ucs2_t *re; +unsigned long relen; +int casefold; +ure_buffer_t buf; +#endif +{ + ucs2_t i, j, state; + _ure_state_t *sp; + _ure_dstate_t *dsp; + _ure_trans_t *tp; + ure_dfa_t dfa; + + if (re == 0 || *re == 0 || relen == 0 || buf == 0) + return 0; + + /* + * Reset the various fields of the compilation buffer. Default the flags + * to indicate the presense of the "^$" pattern. If any other pattern + * occurs, then this flag will be removed. This is done to catch this + * special pattern and handle it specially when matching. + */ + buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? 
_URE_DFA_CASEFOLD : 0); + buf->reducing = 0; + buf->stack.slist_used = 0; + buf->expr_used = 0; + + for (i = 0; i < buf->symtab_used; i++) + buf->symtab[i].states.slist_used = 0; + buf->symtab_used = 0; + + for (i = 0; i < buf->states.states_used; i++) { + buf->states.states[i].st.slist_used = 0; + buf->states.states[i].trans_used = 0; + } + buf->states.states_used = 0; + + /* + * Construct the NFA. If this stage returns a 0, then an error occured or + * an empty expression was passed. + */ + if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) + return 0; + + /* + * Do the expression reduction to get the initial DFA. + */ + _ure_reduce(state, buf); + + /* + * Merge all the equivalent DFA states. + */ + _ure_merge_equiv(buf); + + /* + * Construct the minimal DFA. + */ + dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); + (void) memset((char *) dfa, 0, sizeof(_ure_dfa_t)); + + dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); + + /* + * Free up the NFA state groups and transfer the symbols from the buffer + * to the DFA. + */ + for (i = 0; i < buf->symtab_size; i++) { + if (buf->symtab[i].states.slist_size > 0) + free((char *) buf->symtab[i].states.slist); + } + dfa->syms = buf->symtab; + dfa->nsyms = buf->symtab_used; + + buf->symtab_used = buf->symtab_size = 0; + + /* + * Collect the total number of states and transitions needed for the DFA. + */ + for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; + i++, sp++) { + if (sp->id == state) { + dfa->nstates++; + dfa->ntrans += sp->trans_used; + state++; + } + } + + /* + * Allocate enough space for the states and transitions. + */ + dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * + dfa->nstates); + dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); + + /* + * Actually transfer the DFA states from the buffer. 
+ */ + dsp = dfa->states; + tp = dfa->trans; + for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; + i++, sp++) { + if (sp->id == state) { + dsp->trans = tp; + dsp->ntrans = sp->trans_used; + dsp->accepting = sp->accepting; + + /* + * Add the transitions for the state. + */ + for (j = 0; j < dsp->ntrans; j++, tp++) { + tp->symbol = sp->trans[j].lhs; + tp->next_state = buf->states.states[sp->trans[j].rhs].id; + } + + dsp++; + state++; + } + } + + return dfa; +} + +void +#ifdef __STDC__ +ure_dfa_free(ure_dfa_t dfa) +#else +ure_dfa_free(dfa) +ure_dfa_t dfa; +#endif +{ + ucs2_t i; + + if (dfa == 0) + return; + + for (i = 0; i < dfa->nsyms; i++) { + if ((dfa->syms[i].type == _URE_CCLASS || + dfa->syms[i].type == _URE_NCCLASS) && + dfa->syms[i].sym.ccl.ranges_size > 0) + free((char *) dfa->syms[i].sym.ccl.ranges); + } + if (dfa->nsyms > 0) + free((char *) dfa->syms); + + if (dfa->nstates > 0) + free((char *) dfa->states); + if (dfa->ntrans > 0) + free((char *) dfa->trans); + free((char *) dfa); +} + +void +#ifdef __STDC__ +ure_write_dfa(ure_dfa_t dfa, FILE *out) +#else +ure_write_dfa(dfa, out) +ure_dfa_t dfa; +FILE *out; +#endif +{ + ucs2_t i, j, k, h, l; + _ure_dstate_t *sp; + _ure_symtab_t *sym; + _ure_range_t *rp; + + if (dfa == 0 || out == 0) + return; + + /* + * Write all the different character classes. + */ + for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { + if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { + fprintf(out, "C%hd = ", sym->id); + if (sym->sym.ccl.ranges_used > 0) { + putc('[', out); + if (sym->type == _URE_NCCLASS) + putc('^', out); + } + if (sym->props != 0) { + if (sym->type == _URE_NCCLASS) + fprintf(out, "\\P"); + else + fprintf(out, "\\p"); + for (k = h = 0; k < 32; k++) { + if (sym->props & (1 << k)) { + if (h != 0) + putc(',', out); + fprintf(out, "%hd", k + 1); + h = 1; + } + } + } + /* + * Dump the ranges. 
+ */ + for (k = 0, rp = sym->sym.ccl.ranges; + k < sym->sym.ccl.ranges_used; k++, rp++) { + /* + * Check for UTF16 characters. + */ + if (0x10000 <= rp->min_code && + rp->min_code <= 0x10ffff) { + h = ((rp->min_code - 0x10000) >> 10) + 0xd800; + l = ((rp->min_code - 0x10000) & 1023) + 0xdc00; + fprintf(out, "\\x%04hX\\x%04hX", h, l); + } else + fprintf(out, "\\x%04lX", rp->min_code & 0xffff); + if (rp->max_code != rp->min_code) { + putc('-', out); + if (rp->max_code >= 0x10000 && + rp->max_code <= 0x10ffff) { + h = ((rp->max_code - 0x10000) >> 10) + 0xd800; + l = ((rp->max_code - 0x10000) & 1023) + 0xdc00; + fprintf(out, "\\x%04hX\\x%04hX", h, l); + } else + fprintf(out, "\\x%04lX", rp->max_code & 0xffff); + } + } + if (sym->sym.ccl.ranges_used > 0) + putc(']', out); + putc('\n', out); + } + } + + for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { + fprintf(out, "S%hd = ", i); + if (sp->accepting) { + fprintf(out, "1 "); + if (sp->ntrans) + fprintf(out, "| "); + } + for (j = 0; j < sp->ntrans; j++) { + if (j > 0) + fprintf(out, "| "); + + sym = dfa->syms + sp->trans[j].symbol; + switch (sym->type) { + case _URE_CHAR: + if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { + /* + * Take care of UTF16 characters. 
+ */ + h = ((sym->sym.chr - 0x10000) >> 10) + 0xd800; + l = ((sym->sym.chr - 0x10000) & 1023) + 0xdc00; + fprintf(out, "\\x%04hX\\x%04hX ", h, l); + } else + fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); + break; + case _URE_ANY_CHAR: + fprintf(out, " "); + break; + case _URE_BOL_ANCHOR: + fprintf(out, " "); + break; + case _URE_EOL_ANCHOR: + fprintf(out, " "); + break; + case _URE_CCLASS: + case _URE_NCCLASS: + fprintf(out, "[C%hd] ", sym->id); + break; + } + fprintf(out, "S%hd", sp->trans[j].next_state); + if (j + 1 < sp->ntrans) + putc(' ', out); + } + putc('\n', out); + } +} + +#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ + (cc) == 0x2029) + +int +#ifdef __STDC__ +ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, + unsigned long *match_start, unsigned long *match_end) +#else +ure_exec(dfa, flags, text, textlen, match_start, match_end) +ure_dfa_t dfa; +int flags; +ucs2_t *text; +unsigned long textlen, *match_start, *match_end; +#endif +{ + int i, j, matched, found, skip; + unsigned long ms, me; + ucs4_t c; + ucs2_t *sp, *ep, *lp; + _ure_dstate_t *stp; + _ure_symtab_t *sym; + _ure_range_t *rp; + + if (dfa == 0 || text == 0) + return 0; + + /* + * Handle the special case of an empty string matching the "^$" pattern. + */ + if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { + *match_start = *match_end = 0; + return 1; + } + + sp = text; + ep = sp + textlen; + + ms = me = ~0; + + stp = dfa->states; + + for (found = skip = 0; found == 0 && sp < ep; ) { + lp = sp; + c = *sp++; + + /* + * Check to see if this is a high surrogate that should be + * combined with a following low surrogate. + */ + if (sp < ep && 0xd800 <= c && c <= 0xdbff && + 0xdc00 <= *sp && *sp <= 0xdfff) + c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); + + /* + * Determine if the character is non-spacing and should be skipped. 
+ */ + if (_ure_matches_properties(_URE_NONSPACING, c) && + (flags & URE_IGNORE_NONSPACING)) { + /* + * `sp' was already advanced past this character (and past its + * low surrogate, if any) when it was read, so just restart the + * loop. Advancing again would drop the following code unit + * without ever examining it. + */ + continue; + } + + if (dfa->flags & _URE_DFA_CASEFOLD) + c = _ure_tolower(c); + + /* + * See if one of the transitions matches. + */ + for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { + sym = dfa->syms + stp->trans[i].symbol; + switch (sym->type) { + case _URE_ANY_CHAR: + if ((flags & URE_DOT_MATCHES_SEPARATORS) || + !_ure_issep(c)) + matched = 1; + break; + case _URE_CHAR: + if (c == sym->sym.chr) + matched = 1; + break; + case _URE_BOL_ANCHOR: + if (lp == text) { + sp = lp; + matched = 1; + } else if (_ure_issep(c)) { + if (c == '\r' && sp < ep && *sp == '\n') + sp++; + lp = sp; + matched = 1; + } + break; + case _URE_EOL_ANCHOR: + if (_ure_issep(c)) { + /* + * Put the pointer back before the separator so the match + * end position will be correct. This case will also + * cause the `sp' pointer to be advanced over the current + * separator once the match end point has been recorded. + */ + sp = lp; + matched = 1; + } + break; + case _URE_CCLASS: + case _URE_NCCLASS: + if (sym->props != 0) + matched = _ure_matches_properties(sym->props, c); + for (j = 0, rp = sym->sym.ccl.ranges; + j < sym->sym.ccl.ranges_used; j++, rp++) { + if (rp->min_code <= c && c <= rp->max_code) + matched = 1; + } + if (sym->type == _URE_NCCLASS) + matched = !matched; + break; + } + + if (matched) { + if (ms == ~0) + ms = lp - text; + else + me = sp - text; + stp = dfa->states + stp->trans[i].next_state; + + /* + * If the match was an EOL anchor, adjust the pointer past the + * separator that caused the match. The correct match + * position has been recorded already. + */ + if (sym->type == _URE_EOL_ANCHOR) { + /* + * Skip the character that caused the match. + */ + sp++; + + /* + * Handle the infamous CRLF situation.
+ */ + if (sp < ep && c == '\r' && *sp == '\n') + sp++; + } + } + } + + if (matched == 0) { + if (stp->accepting == 0) { + /* + * If the last state was not accepting, then reset + * and start over. + */ + stp = dfa->states; + ms = me = ~0; + } else + /* + * The last state was accepting, so terminate the matching + * loop to avoid more work. + */ + found = 1; + } else if (sp == ep) { + if (!stp->accepting) { + /* + * This ugly hack is to make sure the end-of-line anchors + * match when the source text hits the end. This is only done + * if the last subexpression matches. + */ + for (i = 0; found == 0 && i < stp->ntrans; i++) { + sym = dfa->syms + stp->trans[i].symbol; + if (sym->type ==_URE_EOL_ANCHOR) { + stp = dfa->states + stp->trans[i].next_state; + if (stp->accepting) { + me = sp - text; + found = 1; + } else + break; + } + } + } else { + /* + * Make sure any conditions that match all the way to the end + * of the string match. + */ + found = 1; + me = sp - text; + } + } + } + + if (found == 0) + ms = me = ~0; + + *match_start = ms; + *match_end = me; + + return (ms != ~0) ? 1 : 0; +} diff --git a/libraries/liblunicode/ure/ure.h b/libraries/liblunicode/ure/ure.h new file mode 100644 index 0000000000..7d7fbb2e07 --- /dev/null +++ b/libraries/liblunicode/ure/ure.h @@ -0,0 +1,150 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _h_ure +#define _h_ure + +/* + * $Id: ure.h,v 1.2 1999/09/21 15:47:44 mleisher Exp $ + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#undef __ +#ifdef __STDC__ +#define __(x) x +#else +#define __(x) () +#endif + +/* + * Set of character class flags. + */ +#define _URE_NONSPACING 0x00000001 +#define _URE_COMBINING 0x00000002 +#define _URE_NUMDIGIT 0x00000004 +#define _URE_NUMOTHER 0x00000008 +#define _URE_SPACESEP 0x00000010 +#define _URE_LINESEP 0x00000020 +#define _URE_PARASEP 0x00000040 +#define _URE_CNTRL 0x00000080 +#define _URE_PUA 0x00000100 + +#define _URE_UPPER 0x00000200 +#define _URE_LOWER 0x00000400 +#define _URE_TITLE 0x00000800 +#define _URE_MODIFIER 0x00001000 +#define _URE_OTHERLETTER 0x00002000 +#define _URE_DASHPUNCT 0x00004000 +#define _URE_OPENPUNCT 0x00008000 +#define _URE_CLOSEPUNCT 0x00010000 +#define _URE_OTHERPUNCT 0x00020000 +#define _URE_MATHSYM 0x00040000 +#define _URE_CURRENCYSYM 0x00080000 +#define _URE_OTHERSYM 0x00100000 + +#define _URE_LTR 0x00200000 +#define _URE_RTL 0x00400000 + +#define _URE_EURONUM 0x00800000 +#define _URE_EURONUMSEP 0x01000000 +#define _URE_EURONUMTERM 0x02000000 +#define _URE_ARABNUM 0x04000000 +#define _URE_COMMONSEP 0x08000000 + +#define _URE_BLOCKSEP 0x10000000 +#define _URE_SEGMENTSEP 0x20000000 + +#define _URE_WHITESPACE 0x40000000 +#define _URE_OTHERNEUT 0x80000000 + +/* + * Error codes. 
+ */ +#define _URE_OK 0 +#define _URE_UNEXPECTED_EOS -1 +#define _URE_CCLASS_OPEN -2 +#define _URE_UNBALANCED_GROUP -3 +#define _URE_INVALID_PROPERTY -4 + +/* + * Options that can be combined for searching. + */ +#define URE_IGNORE_NONSPACING 0x01 +#define URE_DOT_MATCHES_SEPARATORS 0x02 + +typedef unsigned long ucs4_t; +typedef unsigned short ucs2_t; + +/* + * Opaque type for memory used when compiling expressions. + */ +typedef struct _ure_buffer_t *ure_buffer_t; + +/* + * Opaque type for the minimal DFA used when matching. + */ +typedef struct _ure_dfa_t *ure_dfa_t; + +/************************************************************************* + * + * API. + * + *************************************************************************/ + +extern ure_buffer_t ure_buffer_create __((void)); + +extern void ure_buffer_free __((ure_buffer_t buf)); + +extern ure_dfa_t ure_compile __((ucs2_t *re, unsigned long relen, + int casefold, ure_buffer_t buf)); + +extern void ure_dfa_free __((ure_dfa_t dfa)); + +extern void ure_write_dfa __((ure_dfa_t dfa, FILE *out)); + +extern int ure_exec __((ure_dfa_t dfa, int flags, + ucs2_t *text, unsigned long textlen, + unsigned long *match_start, unsigned long *match_end)); + +/************************************************************************* + * + * Prototypes for stub functions used for URE. These need to be rewritten to + * use the Unicode support available on the system. 
+ * + *************************************************************************/ + +extern ucs4_t _ure_tolower __((ucs4_t c)); + +extern int _ure_matches_properties __((unsigned long props, ucs4_t c)); + +#undef __ + +#ifdef __cplusplus +} +#endif + +#endif /* _h_ure */ diff --git a/libraries/liblunicode/ure/urestubs.c b/libraries/liblunicode/ure/urestubs.c new file mode 100644 index 0000000000..10157238c1 --- /dev/null +++ b/libraries/liblunicode/ure/urestubs.c @@ -0,0 +1,64 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef lint +static char rcsid[] = "$Id: urestubs.c,v 1.2 1999/09/21 15:47:44 mleisher Exp $"; +#endif + +#include "ure.h" + +/* + * This file contains stub routines needed by the URE package to test + * character properties and other Unicode implementation specific details. 
+ */ + +/* + * This routine should return the lower case equivalent for the character or, + * if there is no lower case equivalent, the character itself. + * + * The stub below performs no case mapping: it returns `c' unchanged. + */ +ucs4_t +#ifdef __STDC__ +_ure_tolower(ucs4_t c) +#else +_ure_tolower(c) +ucs4_t c; +#endif +{ + return c; +} + +/* + * This routine takes a set of URE character property flags (see ure.h) along + * with a character and tests to see if the character has one or more of those + * properties. + * + * The stub below inspects neither argument and always returns 1, so every + * character is reported as having every property until it is replaced. + */ +int +#ifdef __STDC__ +_ure_matches_properties(unsigned long props, ucs4_t c) +#else +_ure_matches_properties(props, c) +unsigned long props; +ucs4_t c; +#endif +{ + return 1; +} diff --git a/libraries/liblunicode/utbm/README b/libraries/liblunicode/utbm/README new file mode 100644 index 0000000000..8c0212dcf5 --- /dev/null +++ b/libraries/liblunicode/utbm/README @@ -0,0 +1,121 @@ +# +# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $ +# +# Copyright 1997, 1998, 1999 Computing Research Labs, +# New Mexico State University +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + + Unicode and Boyer-Moore Searching + Version 0.2 + +UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned +Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates). + +--------------------------------------------------------------------------- + +Assumptions: + + o Search pattern and text are already normalized in some fashion. + + o Upper, lower, and title case conversions are one-to-one. + + o For conversions between upper, lower, and title case, UCS2 characters + always convert to other UCS2 characters, and UTF-16 characters always + convert to other UTF-16 characters. + +Flags: + + UTBM provides three processing flags: + + o UTBM_CASEFOLD - search in a case-insensitive manner. + + o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and + the text. + + o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of + U+2028, U+2029, '\n', '\r', '\t', and any + character identified as a space by the Unicode + support on the platform. + + This flag also causes all characters identified + as control by the Unicode support on the + platform to be ignored (except for '\n', '\r', + and '\t'). + +--------------------------------------------------------------------------- + +Before using UTBM +----------------- +Before UTBM is used, some functions need to be created. The "utbmstub.c" file +contains stubs that need to be rewritten so they work with the Unicode support +on the platform on which this package is being used. + +Using UTBM +---------- + +Sample pseudo-code fragment.
+ + utbm_pattern_t pat; + ucs2_t *pattern, *text; + unsigned long patternlen, textlen; + unsigned long flags, match_start, match_end; + + /* + * Allocate the dynamic storage needed for a search pattern. + */ + pat = utbm_create_pattern(); + + /* + * Set the search flags desired. + */ + flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING; + + /* + * Compile the search pattern. + */ + utbm_compile(pattern, patternlen, flags, pat); + + /* + * Find the first occurrence of the search pattern in the text. + */ + if (utbm_exec(pat, text, textlen, &match_start, &match_end)) + printf("MATCH: %lu %lu\n", match_start, match_end); + + /* + * Free the dynamic storage used for the search pattern. + */ + utbm_free_pattern(pat); + +--------------------------------------------------------------------------- + +Mark Leisher +2 May 1997 + +=========================================================================== + +CHANGES +------- + +Version: 0.2 +Date : 21 September 1999 +========================== + 1. Added copyright stuff and put in CVS. + diff --git a/libraries/liblunicode/utbm/utbm.c b/libraries/liblunicode/utbm/utbm.c new file mode 100644 index 0000000000..32904b0aa9 --- /dev/null +++ b/libraries/liblunicode/utbm/utbm.c @@ -0,0 +1,497 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef lint +static char rcsid[] = "$Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $"; +#endif + +/* + * Assumptions: + * 1. Case conversions of UTF-16 characters must also be UTF-16 characters. + * 2. Case conversions are all one-to-one. + * 3. Text and pattern have already been normalized in some fashion. + */ + +#include <stdio.h> +#include <stdlib.h> /* malloc, realloc, free */ +#include <string.h> /* memset */ +#include "utbm.h" + +/* + * Single pattern character. + */ +typedef struct { + ucs4_t lc; + ucs4_t uc; + ucs4_t tc; +} _utbm_char_t; + +typedef struct { + _utbm_char_t *ch; + unsigned long skip; +} _utbm_skip_t; + +typedef struct _utbm_pattern_t { + unsigned long flags; + + _utbm_char_t *pat; + unsigned long pat_used; + unsigned long pat_size; + unsigned long patlen; + + _utbm_skip_t *skip; + unsigned long skip_used; + unsigned long skip_size; + + unsigned long md4; +} _utbm_pattern_t; + +/************************************************************************* + * + * Support functions. + * + *************************************************************************/ + +/* + * Routine to look up the skip value for a character. + */ +static unsigned long +#ifdef __STDC__ +_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end) +#else +_utbm_skip(p, start, end) +utbm_pattern_t p; +ucs2_t *start, *end; +#endif +{ + unsigned long i; + ucs4_t c1, c2; + _utbm_skip_t *sp; + + if (start >= end) + return 0; + + c1 = *start; + c2 = (start + 1 < end) ?
*(start + 1) : ~0; + if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + + for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) { + /* + * Compare against each case form explicitly. An XOR/AND shortcut + * is not an equality test: it would also accept any character + * that, bit for bit, agrees with at least one of the forms. + */ + if (c1 == sp->ch->uc || c1 == sp->ch->lc || c1 == sp->ch->tc) { + return ((unsigned long) (end - start) < sp->skip) ? + end - start : sp->skip; + } + } + return p->patlen; +} + +/* + * Check, comparing backward from `start', whether the pattern ends at + * `start'. Returns 1 and stores the match offsets through `match_start' + * and `match_end' on success, 0 otherwise. + */ +static int +#ifdef __STDC__ +_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end, + unsigned long *match_start, unsigned long *match_end) +#else +_utbm_match(pat, text, start, end, match_start, match_end) +utbm_pattern_t pat; +ucs2_t *text, *start, *end; +unsigned long *match_start, *match_end; +#endif +{ + int check_space; + ucs4_t c1, c2; + unsigned long count; + _utbm_char_t *cp; + + /* + * Set the potential match endpoint first. + */ + *match_end = (start - text) + 1; + + c1 = *start; + c2 = (start + 1 < end) ? *(start + 1) : ~0; + if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) { + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + /* + * Adjust the match end point to occur after the UTF-16 character. + */ + *match_end = *match_end + 1; + } + + if (pat->pat_used == 1) { + *match_start = start - text; + return 1; + } + + /* + * Compare backward. + */ + cp = pat->pat + (pat->pat_used - 1); + + for (count = pat->patlen; start > text && count > 0;) { + /* + * Ignore non-spacing characters if indicated. + */ + if (pat->flags & UTBM_IGNORE_NONSPACING) { + while (start > text && _utbm_nonspacing(c1)) { + c2 = *--start; + c1 = (start - 1 > text) ? *(start - 1) : ~0; + if (0xdc00 <= c2 && c2 <= 0xdfff && + 0xd800 <= c1 && c1 <= 0xdbff) { + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + start--; + } else + c1 = c2; + } + } + + /* + * Handle space compression if indicated.
+ */ + if (pat->flags & UTBM_SPACE_COMPRESS) { + check_space = 0; + while (start > text && + (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) { + check_space = _utbm_isspace(c1, 1); + c2 = *--start; + c1 = (start - 1 > text) ? *(start - 1) : ~0; + if (0xdc00 <= c2 && c2 <= 0xdfff && + 0xd800 <= c1 && c1 <= 0xdbff) { + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + start--; + } else + c1 = c2; + } + /* + * Handle things if space compression was indicated and one or + * more member characters were found. + */ + if (check_space) { + if (cp->uc != ' ') + return 0; + cp--; + count--; + } + } + + /* + * Handle the normal comparison cases. The character has to equal one + * of the case forms stored for this pattern position; an XOR/AND + * shortcut is not an equality test and would accept other characters + * as well. + */ + if (count > 0 && c1 != cp->uc && c1 != cp->lc && c1 != cp->tc) + return 0; + + count -= (c1 >= 0x10000) ? 2 : 1; + if (count > 0) { + cp--; + + /* + * Get the next preceding character. + */ + if (start > text) { + c2 = *--start; + c1 = (start - 1 > text) ? *(start - 1) : ~0; + if (0xdc00 <= c2 && c2 <= 0xdfff && + 0xd800 <= c1 && c1 <= 0xdbff) { + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + start--; + } else + c1 = c2; + } + } + } + + /* + * Set the match start position. + */ + *match_start = start - text; + return 1; +} + +/************************************************************************* + * + * API.
+ * + *************************************************************************/ + +/* + * Allocate a zero-filled, empty search pattern. Returns a null pointer if + * the allocation fails. Release the result with utbm_free_pattern(). + */ +utbm_pattern_t +#ifdef __STDC__ +utbm_create_pattern(void) +#else +utbm_create_pattern() +#endif +{ + utbm_pattern_t p; + + p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t)); + if (p != 0) + (void) memset((char *) p, 0, sizeof(_utbm_pattern_t)); + return p; +} + +/* + * Free a pattern together with its character and skip arrays. A null + * `pattern' is ignored. + */ +void +#ifdef __STDC__ +utbm_free_pattern(utbm_pattern_t pattern) +#else +utbm_free_pattern(pattern) +utbm_pattern_t pattern; +#endif +{ + if (pattern == 0) + return; + + if (pattern->pat_size > 0) + free((char *) pattern->pat); + + if (pattern->skip_size > 0) + free((char *) pattern->skip); + + free((char *) pattern); +} + +/* + * Compile the `patlen' code units at `pat' into `p' according to `flags', + * replacing whatever `p' held before. The call does nothing when `p' or + * `pat' is null, when `patlen' is zero, or when the pattern begins with a + * zero code unit. If storage cannot be obtained, `p' is left empty. + */ +void +#ifdef __STDC__ +utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags, + utbm_pattern_t p) +#else +utbm_compile(pat, patlen, flags, p) +ucs2_t *pat; +unsigned long patlen, flags; +utbm_pattern_t p; +#endif +{ + int have_space; + unsigned long i, j, k, slen; + _utbm_char_t *cp; + _utbm_skip_t *sp; + ucs4_t c1, c2, sentinel; + + /* + * Test `patlen' before looking at the first code unit so that a + * zero-length buffer is never read. + */ + if (p == 0 || pat == 0 || patlen == 0 || *pat == 0) + return; + + /* + * Reset the pattern buffer. + */ + p->patlen = p->pat_used = p->skip_used = 0; + + /* + * Set the flags. + */ + p->flags = flags; + + /* + * Initialize the extra skip flag. + */ + p->md4 = 1; + + /* + * Allocate more storage if necessary. The new blocks are taken through + * the `cp' and `sp' temporaries so that a failed allocation neither + * loses an existing array nor leaves a null pointer behind; on failure + * the pattern is simply left empty. + */ + if (patlen > p->pat_size) { + if (p->pat_size == 0) { + cp = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen); + sp = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen); + if (cp == 0 || sp == 0) { + if (cp != 0) + free((char *) cp); + if (sp != 0) + free((char *) sp); + return; + } + p->pat = cp; + p->skip = sp; + } else { + cp = (_utbm_char_t *) + realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen); + if (cp != 0) + p->pat = cp; + sp = (_utbm_skip_t *) + realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen); + if (sp != 0) + p->skip = sp; + if (cp == 0 || sp == 0) + return; + } + p->pat_size = p->skip_size = patlen; + } + + /* + * Preprocess the pattern to remove controls (if specified) and determine + * case. + */ + for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) { + c1 = pat[i]; + c2 = (i + 1 < patlen) ?
pat[i + 1] : ~0; + if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) + c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff)); + + /* + * Make sure the `have_space' flag is turned off if the character + * is not an appropriate one. + */ + if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS)) + have_space = 0; + + /* + * If non-spacing characters should be ignored, do it here. + */ + if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1)) + continue; + + /* + * Check if spaces and controls need to be compressed. + */ + if (flags & UTBM_SPACE_COMPRESS) { + if (_utbm_isspace(c1, 1)) { + if (!have_space) { + /* + * Add a space and set the flag. + */ + cp->uc = cp->lc = cp->tc = ' '; + cp++; + + /* + * Increase the real pattern length. + */ + p->patlen++; + sentinel = ' '; + have_space = 1; + } + continue; + } + + /* + * Ignore all control characters. + */ + if (_utbm_iscntrl(c1)) + continue; + } + + /* + * Add the character. + */ + if (flags & UTBM_CASEFOLD) { + cp->uc = _utbm_toupper(c1); + cp->lc = _utbm_tolower(c1); + cp->tc = _utbm_totitle(c1); + } else + cp->uc = cp->lc = cp->tc = c1; + + /* + * Set the sentinel character. + */ + sentinel = cp->uc; + + /* + * Move to the next character. + */ + cp++; + + /* + * Increase the real pattern length appropriately. + */ + p->patlen += (c1 >= 0x10000) ? 2 : 1; + + /* + * Increment the loop index for UTF-16 characters. + */ + i += (c1 >= 0x10000) ? 1 : 0; + + } + + /* + * Set the number of characters actually used. + */ + p->pat_used = cp - p->pat; + + /* + * Go through and construct the skip array and determine the actual length + * of the pattern in UCS2 terms. + */ + slen = p->patlen - 1; + cp = p->pat; + for (i = k = 0; i < p->pat_used; i++, cp++) { + /* + * Locate the character in the skip array. + */ + for (sp = p->skip, j = 0; + j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ; + + /* + * If the character is not found, set the new skip element and + * increase the number of skip elements. 
+ */ + if (j == p->skip_used) { + sp->ch = cp; + p->skip_used++; + } + + /* + * Set the updated skip value. If the character is UTF-16 and is + * not the last one in the pattern, add one to its skip value. + */ + sp->skip = slen - k; + if (cp->uc >= 0x10000 && k + 2 < slen) + sp->skip++; + + /* + * Set the new extra skip for the sentinel character. + */ + if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) && + cp->uc == sentinel) + p->md4 = slen - k; + + /* + * Increase the actual index. + */ + k += (cp->uc >= 0x10000) ? 2 : 1; + } +} + +int +#ifdef __STDC__ +utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen, + unsigned long *match_start, unsigned long *match_end) +#else +utbm_exec(pat, text, textlen, match_start, match_end) +utbm_pattern_t pat; +ucs2_t *text; +unsigned long textlen, *match_start, *match_end; +#endif +{ + unsigned long k; + ucs2_t *start, *end; + + if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 || + textlen < pat->patlen) + return 0; + + start = text + pat->patlen; + end = text + textlen; + + /* + * Adjust the start point if it points to a low surrogate. 
+ */ + if (0xdc00 <= *start && *start <= 0xdfff && + 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) + start--; + + while (start < end) { + while ((k = _utbm_skip(pat, start, end))) { + start += k; + if (start < end && 0xdc00 <= *start && *start <= 0xdfff && + 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) + start--; + } + + if (start < end && + _utbm_match(pat, text, start, end, match_start, match_end)) + return 1; + + start += pat->md4; + if (start < end && 0xdc00 <= *start && *start <= 0xdfff && + 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff) + start--; + } + return 0; +} diff --git a/libraries/liblunicode/utbm/utbm.h b/libraries/liblunicode/utbm/utbm.h new file mode 100644 index 0000000000..0050186cc5 --- /dev/null +++ b/libraries/liblunicode/utbm/utbm.h @@ -0,0 +1,109 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _h_utbm +#define _h_utbm + +/* + * $Id: utbm.h,v 1.1 1999/09/21 15:45:18 mleisher Exp $ + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#undef __ +#ifdef __STDC__ +#define __(x) x +#else +#define __(x) () +#endif + +/************************************************************************* + * + * Types. + * + *************************************************************************/ + +/* + * Fundamental character types. + */ +typedef unsigned long ucs4_t; +typedef unsigned short ucs2_t; + +/* + * An opaque type used for the search pattern. + */ +typedef struct _utbm_pattern_t *utbm_pattern_t; + +/************************************************************************* + * + * Flags. + * + *************************************************************************/ + +#define UTBM_CASEFOLD 0x01 +#define UTBM_IGNORE_NONSPACING 0x02 +#define UTBM_SPACE_COMPRESS 0x04 + +/************************************************************************* + * + * API. + * + *************************************************************************/ + +extern utbm_pattern_t utbm_create_pattern __((void)); + +extern void utbm_free_pattern __((utbm_pattern_t pattern)); + +extern void utbm_compile __((ucs2_t *pat, unsigned long patlen, + unsigned long flags, utbm_pattern_t pattern)); + +extern int utbm_exec __((utbm_pattern_t pat, ucs2_t *text, + unsigned long textlen, unsigned long *match_start, + unsigned long *match_end)); + +/************************************************************************* + * + * Prototypes for the stub functions needed. 
+ * + *************************************************************************/ + +extern int _utbm_isspace __((ucs4_t c, int compress)); + +extern int _utbm_iscntrl __((ucs4_t c)); + +extern int _utbm_nonspacing __((ucs4_t c)); + +extern ucs4_t _utbm_tolower __((ucs4_t c)); + +extern ucs4_t _utbm_toupper __((ucs4_t c)); + +extern ucs4_t _utbm_totitle __((ucs4_t c)); + +#undef __ + +#ifdef __cplusplus +} +#endif + +#endif /* _h_utbm */ diff --git a/libraries/liblunicode/utbm/utbmstub.c b/libraries/liblunicode/utbm/utbmstub.c new file mode 100644 index 0000000000..51a2c77cc0 --- /dev/null +++ b/libraries/liblunicode/utbm/utbmstub.c @@ -0,0 +1,125 @@ +/* + * Copyright 1997, 1998, 1999 Computing Research Labs, + * New Mexico State University + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef lint +static char rcsid[] = "$Id: utbmstub.c,v 1.1 1999/09/21 15:45:18 mleisher Exp $"; +#endif + +#include "utbm.h" + +/* + * This should be redefined to use the `isspace' function available in the + * Unicode support on the platform where this is being used. As shipped it + * expands to 0, so no character is reported as a platform space. + */ +#define _platform_isspace(x) 0 + +/* + * Return non-zero for any character that should be considered the equivalent + * of a space character. Return zero otherwise. + * + * When `compress' is non-zero, TAB, LF, CR, U+2028 and U+2029 count as + * spaces in addition to whatever the platform test accepts; when it is + * zero only the platform test is consulted. + */ +int +#ifdef __STDC__ +_utbm_isspace(ucs4_t c, int compress) +#else +_utbm_isspace(c, compress) +ucs4_t c; +int compress; +#endif +{ + if (compress) + return (c == 0x09 || c == 0x0a || c == 0x0d || + c == 0x2028 || c == 0x2029 || _platform_isspace(c)) ? 1 : 0; + + return _platform_isspace(c); + +} + +/* + * Return non-zero if the character is a control character, or zero otherwise. + * This stub always returns zero. + */ +int +#ifdef __STDC__ +_utbm_iscntrl(ucs4_t c) +#else +_utbm_iscntrl(c) +ucs4_t c; +#endif +{ + return 0; +} + +/* + * Return non-zero if the character is a non-spacing character, or zero + * otherwise. This stub always returns zero. + */ +int +#ifdef __STDC__ +_utbm_nonspacing(ucs4_t c) +#else +_utbm_nonspacing(c) +ucs4_t c; +#endif +{ + return 0; +} + +/* + * Convert a character to lower case. This stub returns `c' unchanged. + */ +ucs4_t +#ifdef __STDC__ +_utbm_tolower(ucs4_t c) +#else +_utbm_tolower(c) +ucs4_t c; +#endif +{ + return c; +} + +/* + * Convert a character to upper case. This stub returns `c' unchanged. + */ +ucs4_t +#ifdef __STDC__ +_utbm_toupper(ucs4_t c) +#else +_utbm_toupper(c) +ucs4_t c; +#endif +{ + return c; +} + +/* + * Convert a character to title case. This stub returns `c' unchanged. + */ +ucs4_t +#ifdef __STDC__ +_utbm_totitle(ucs4_t c) +#else +_utbm_totitle(c) +ucs4_t c; +#endif +{ + return c; +} -- 2.39.5