--- /dev/null
+#
+# $Id: MUTTUCData.txt,v 1.3 1999/10/29 00:04:35 mleisher Exp $
+#
+# Copyright 1999 Computing Research Labs, New Mexico State University
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+#
+# Implementation specific character properties.
+#
+#
+# Space, other.
+#
+0009;;Ss;;;;;;;;;;;;
+000A;;Ss;;;;;;;;;;;;
+000B;;Ss;;;;;;;;;;;;
+000C;;Ss;;;;;;;;;;;;
+000D;;Ss;;;;;;;;;;;;
+#
+# Non-breaking.
+#
+00A0;;Nb;;;;;;;;;;;;
+2007;;Nb;;;;;;;;;;;;
+2011;;Nb;;;;;;;;;;;;
+FEFF;;Nb;;;;;;;;;;;;
+#
+# Symmetric.
+#
+0028;;Sy;;;;;;;;;;;;
+0029;;Sy;;;;;;;;;;;;
+005B;;Sy;;;;;;;;;;;;
+005D;;Sy;;;;;;;;;;;;
+007B;;Sy;;;;;;;;;;;;
+007D;;Sy;;;;;;;;;;;;
+00AB;;Sy;;;;;;;;;;;;
+00BB;;Sy;;;;;;;;;;;;
+0F3A;;Sy;;;;;;;;;;;;
+0F3B;;Sy;;;;;;;;;;;;
+0F3C;;Sy;;;;;;;;;;;;
+0F3D;;Sy;;;;;;;;;;;;
+0F3E;;Sy;;;;;;;;;;;;
+0F3F;;Sy;;;;;;;;;;;;
+2018;;Sy;;;;;;;;;;;;
+2019;;Sy;;;;;;;;;;;;
+201A;;Sy;;;;;;;;;;;;
+201B;;Sy;;;;;;;;;;;;
+201C;;Sy;;;;;;;;;;;;
+201D;;Sy;;;;;;;;;;;;
+201E;;Sy;;;;;;;;;;;;
+201F;;Sy;;;;;;;;;;;;
+2039;;Sy;;;;;;;;;;;;
+203A;;Sy;;;;;;;;;;;;
+2045;;Sy;;;;;;;;;;;;
+2046;;Sy;;;;;;;;;;;;
+207D;;Sy;;;;;;;;;;;;
+207E;;Sy;;;;;;;;;;;;
+208D;;Sy;;;;;;;;;;;;
+208E;;Sy;;;;;;;;;;;;
+2329;;Sy;;;;;;;;;;;;
+232A;;Sy;;;;;;;;;;;;
+3008;;Sy;;;;;;;;;;;;
+3009;;Sy;;;;;;;;;;;;
+300A;;Sy;;;;;;;;;;;;
+300B;;Sy;;;;;;;;;;;;
+300C;;Sy;;;;;;;;;;;;
+300D;;Sy;;;;;;;;;;;;
+300E;;Sy;;;;;;;;;;;;
+300F;;Sy;;;;;;;;;;;;
+3010;;Sy;;;;;;;;;;;;
+3011;;Sy;;;;;;;;;;;;
+3014;;Sy;;;;;;;;;;;;
+3015;;Sy;;;;;;;;;;;;
+3016;;Sy;;;;;;;;;;;;
+3017;;Sy;;;;;;;;;;;;
+3018;;Sy;;;;;;;;;;;;
+3019;;Sy;;;;;;;;;;;;
+301A;;Sy;;;;;;;;;;;;
+301B;;Sy;;;;;;;;;;;;
+301D;;Sy;;;;;;;;;;;;
+301E;;Sy;;;;;;;;;;;;
+301F;;Sy;;;;;;;;;;;;
+FD3E;;Sy;;;;;;;;;;;;
+FD3F;;Sy;;;;;;;;;;;;
+FE35;;Sy;;;;;;;;;;;;
+FE36;;Sy;;;;;;;;;;;;
+FE37;;Sy;;;;;;;;;;;;
+FE38;;Sy;;;;;;;;;;;;
+FE39;;Sy;;;;;;;;;;;;
+FE3A;;Sy;;;;;;;;;;;;
+FE3B;;Sy;;;;;;;;;;;;
+FE3C;;Sy;;;;;;;;;;;;
+FE3D;;Sy;;;;;;;;;;;;
+FE3E;;Sy;;;;;;;;;;;;
+FE3F;;Sy;;;;;;;;;;;;
+FE40;;Sy;;;;;;;;;;;;
+FE41;;Sy;;;;;;;;;;;;
+FE42;;Sy;;;;;;;;;;;;
+FE43;;Sy;;;;;;;;;;;;
+FE44;;Sy;;;;;;;;;;;;
+FE59;;Sy;;;;;;;;;;;;
+FE5A;;Sy;;;;;;;;;;;;
+FE5B;;Sy;;;;;;;;;;;;
+FE5C;;Sy;;;;;;;;;;;;
+FE5D;;Sy;;;;;;;;;;;;
+FE5E;;Sy;;;;;;;;;;;;
+FF08;;Sy;;;;;;;;;;;;
+FF09;;Sy;;;;;;;;;;;;
+FF3B;;Sy;;;;;;;;;;;;
+FF3D;;Sy;;;;;;;;;;;;
+FF5B;;Sy;;;;;;;;;;;;
+FF5D;;Sy;;;;;;;;;;;;
+FF62;;Sy;;;;;;;;;;;;
+FF63;;Sy;;;;;;;;;;;;
+#
+# Hex digit.
+#
+0030;;Hd;;;;;;;;;;;;
+0031;;Hd;;;;;;;;;;;;
+0032;;Hd;;;;;;;;;;;;
+0033;;Hd;;;;;;;;;;;;
+0034;;Hd;;;;;;;;;;;;
+0035;;Hd;;;;;;;;;;;;
+0036;;Hd;;;;;;;;;;;;
+0037;;Hd;;;;;;;;;;;;
+0038;;Hd;;;;;;;;;;;;
+0039;;Hd;;;;;;;;;;;;
+0041;;Hd;;;;;;;;;;;;
+0042;;Hd;;;;;;;;;;;;
+0043;;Hd;;;;;;;;;;;;
+0044;;Hd;;;;;;;;;;;;
+0045;;Hd;;;;;;;;;;;;
+0046;;Hd;;;;;;;;;;;;
+0061;;Hd;;;;;;;;;;;;
+0062;;Hd;;;;;;;;;;;;
+0063;;Hd;;;;;;;;;;;;
+0064;;Hd;;;;;;;;;;;;
+0065;;Hd;;;;;;;;;;;;
+0066;;Hd;;;;;;;;;;;;
+FF10;;Hd;;;;;;;;;;;;
+FF11;;Hd;;;;;;;;;;;;
+FF12;;Hd;;;;;;;;;;;;
+FF13;;Hd;;;;;;;;;;;;
+FF14;;Hd;;;;;;;;;;;;
+FF15;;Hd;;;;;;;;;;;;
+FF16;;Hd;;;;;;;;;;;;
+FF17;;Hd;;;;;;;;;;;;
+FF18;;Hd;;;;;;;;;;;;
+FF19;;Hd;;;;;;;;;;;;
+FF21;;Hd;;;;;;;;;;;;
+FF22;;Hd;;;;;;;;;;;;
+FF23;;Hd;;;;;;;;;;;;
+FF24;;Hd;;;;;;;;;;;;
+FF25;;Hd;;;;;;;;;;;;
+FF26;;Hd;;;;;;;;;;;;
+FF41;;Hd;;;;;;;;;;;;
+FF42;;Hd;;;;;;;;;;;;
+FF43;;Hd;;;;;;;;;;;;
+FF44;;Hd;;;;;;;;;;;;
+FF45;;Hd;;;;;;;;;;;;
+FF46;;Hd;;;;;;;;;;;;
+#
+# Quote marks.
+#
+0022;;Qm;;;;;;;;;;;;
+0027;;Qm;;;;;;;;;;;;
+00AB;;Qm;;;;;;;;;;;;
+00BB;;Qm;;;;;;;;;;;;
+2018;;Qm;;;;;;;;;;;;
+2019;;Qm;;;;;;;;;;;;
+201A;;Qm;;;;;;;;;;;;
+201B;;Qm;;;;;;;;;;;;
+201C;;Qm;;;;;;;;;;;;
+201D;;Qm;;;;;;;;;;;;
+201E;;Qm;;;;;;;;;;;;
+201F;;Qm;;;;;;;;;;;;
+2039;;Qm;;;;;;;;;;;;
+203A;;Qm;;;;;;;;;;;;
+300C;;Qm;;;;;;;;;;;;
+300D;;Qm;;;;;;;;;;;;
+300E;;Qm;;;;;;;;;;;;
+300F;;Qm;;;;;;;;;;;;
+301D;;Qm;;;;;;;;;;;;
+301E;;Qm;;;;;;;;;;;;
+301F;;Qm;;;;;;;;;;;;
+FE41;;Qm;;;;;;;;;;;;
+FE42;;Qm;;;;;;;;;;;;
+FE43;;Qm;;;;;;;;;;;;
+FE44;;Qm;;;;;;;;;;;;
+FF02;;Qm;;;;;;;;;;;;
+FF07;;Qm;;;;;;;;;;;;
+FF62;;Qm;;;;;;;;;;;;
+FF63;;Qm;;;;;;;;;;;;
+#
+# Special Devanagari forms
+#
+E900;DEVANAGARI KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
+E901;DEVANAGARI GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
+E902;DEVANAGARI TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
+E903;DEVANAGARI TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
+E904;DEVANAGARI SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
+E905;DEVANAGARI SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
+E906;DEVANAGARI SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
+E907;DEVANAGARI KRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E908;DEVANAGARI JRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E909;DEVANAGARI ZRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E90A;DEVANAGARI PHRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E90B;DEVANAGARI FRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E90C;DEVANAGARI PRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E90D;DEVANAGARI SRA LIGATURE;Lo;0;L;;;;;N;;;;;
+E90E;DEVANAGARI RU LIGATURE;Lo;0;L;;;;;N;;;;;
+E90F;DEVANAGARI RUU LIGATURE;Lo;0;L;;;;;N;;;;;
+E915;DEVANAGARI HALF LETTER KA;Lo;0;L;;;;;N;;;;;
+E916;DEVANAGARI HALF LETTER KHA;Lo;0;L;;;;;N;;;;;
+E917;DEVANAGARI HALF LETTER GA;Lo;0;L;;;;;N;;;;;
+E918;DEVANAGARI HALF LETTER GHA;Lo;0;L;;;;;N;;;;;
+E919;DEVANAGARI HALF LETTER NGA;Lo;0;L;;;;;N;;;;;
+E91A;DEVANAGARI HALF LETTER CA;Lo;0;L;;;;;N;;;;;
+E91B;DEVANAGARI HALF LETTER CHA;Lo;0;L;;;;;N;;;;;
+E91C;DEVANAGARI HALF LETTER JA;Lo;0;L;;;;;N;;;;;
+E91D;DEVANAGARI HALF LETTER JHA;Lo;0;L;;;;;N;;;;;
+E91E;DEVANAGARI HALF LETTER NYA;Lo;0;L;;;;;N;;;;;
+E91F;DEVANAGARI HALF LETTER TTA;Lo;0;L;;;;;N;;;;;
+E920;DEVANAGARI HALF LETTER TTHA;Lo;0;L;;;;;N;;;;;
+E921;DEVANAGARI HALF LETTER DDA;Lo;0;L;;;;;N;;;;;
+E922;DEVANAGARI HALF LETTER DDHA;Lo;0;L;;;;;N;;;;;
+E923;DEVANAGARI HALF LETTER NNA;Lo;0;L;;;;;N;;;;;
+E924;DEVANAGARI HALF LETTER TA;Lo;0;L;;;;;N;;;;;
+E925;DEVANAGARI HALF LETTER THA;Lo;0;L;;;;;N;;;;;
+E926;DEVANAGARI HALF LETTER DA;Lo;0;L;;;;;N;;;;;
+E927;DEVANAGARI HALF LETTER DHA;Lo;0;L;;;;;N;;;;;
+E928;DEVANAGARI HALF LETTER NA;Lo;0;L;;;;;N;;;;;
+E929;DEVANAGARI HALF LETTER NNNA;Lo;0;L;0928 093C;;;;N;;;;;
+E92A;DEVANAGARI HALF LETTER PA;Lo;0;L;;;;;N;;;;;
+E92B;DEVANAGARI HALF LETTER PHA;Lo;0;L;;;;;N;;;;;
+E92C;DEVANAGARI HALF LETTER BA;Lo;0;L;;;;;N;;;;;
+E92D;DEVANAGARI HALF LETTER BHA;Lo;0;L;;;;;N;;;;;
+E92E;DEVANAGARI HALF LETTER MA;Lo;0;L;;;;;N;;;;;
+E92F;DEVANAGARI HALF LETTER YA;Lo;0;L;;;;;N;;;;;
+E930;DEVANAGARI HALF LETTER RA;Lo;0;L;;;;;N;;;;;
+E931;DEVANAGARI HALF LETTER RRA;Lo;0;L;0930 093C;;;;N;;;;;
+E932;DEVANAGARI HALF LETTER LA;Lo;0;L;;;;;N;;;;;
+E933;DEVANAGARI HALF LETTER LLA;Lo;0;L;;;;;N;;;;;
+E934;DEVANAGARI HALF LETTER LLLA;Lo;0;L;0933 093C;;;;N;;;;;
+E935;DEVANAGARI HALF LETTER VA;Lo;0;L;;;;;N;;;;;
+E936;DEVANAGARI HALF LETTER SHA;Lo;0;L;;;;;N;;;;;
+E937;DEVANAGARI HALF LETTER SSA;Lo;0;L;;;;;N;;;;;
+E938;DEVANAGARI HALF LETTER SA;Lo;0;L;;;;;N;;;;;
+E939;DEVANAGARI HALF LETTER HA;Lo;0;L;;;;;N;;;;;
+E940;DEVANAGARI KKA LIGATURE;Lo;0;L;0915 094D 0915;;;;N;;;;;
+E941;DEVANAGARI KTA LIGATURE;Lo;0;L;0915 094D 0924;;;;N;;;;;
+E942;DEVANAGARI NGKA LIGATURE;Lo;0;L;0919 094D 0915;;;;N;;;;;
+E943;DEVANAGARI NGKHA LIGATURE;Lo;0;L;0919 094D 0916;;;;N;;;;;
+E944;DEVANAGARI NGGA LIGATURE;Lo;0;L;0919 094D 0917;;;;N;;;;;
+E945;DEVANAGARI NGGHA LIGATURE;Lo;0;L;0919 094D 0918;;;;N;;;;;
+E946;DEVANAGARI NYJA LIGATURE;Lo;0;L;091E 094D 091C;;;;N;;;;;
+E947;DEVANAGARI DGHA LIGATURE;Lo;0;L;0926 094D 0918;;;;N;;;;;
+E948;DEVANAGARI DDA LIGATURE;Lo;0;L;0926 094D 0926;;;;N;;;;;
+E949;DEVANAGARI DDHA LIGATURE;Lo;0;L;0926 094D 0927;;;;N;;;;;
+E94A;DEVANAGARI DBA LIGATURE;Lo;0;L;0926 094D 092C;;;;N;;;;;
+E94B;DEVANAGARI DBHA LIGATURE;Lo;0;L;0926 094D 092D;;;;N;;;;;
+E94C;DEVANAGARI DMA LIGATURE;Lo;0;L;0926 094D 092E;;;;N;;;;;
+E94D;DEVANAGARI DYA LIGATURE;Lo;0;L;0926 094D 092F;;;;N;;;;;
+E94E;DEVANAGARI DVA LIGATURE;Lo;0;L;0926 094D 0935;;;;N;;;;;
+E94F;DEVANAGARI TT-TTA LIGATURE;Lo;0;L;091F 094D 091F;;;;N;;;;;
+E950;DEVANAGARI TT-TTHA LIGATURE;Lo;0;L;091F 094D 0920;;;;N;;;;;
+E951;DEVANAGARI TTH-TTHA LIGATURE;Lo;0;L;0920 094D 0920;;;;N;;;;;
+E952;DEVANAGARI DD-GA LIGATURE;Lo;0;L;0921 094D 0917;;;;N;;;;;
+E953;DEVANAGARI DD-DDA LIGATURE;Lo;0;L;0921 094D 0921;;;;N;;;;;
+E954;DEVANAGARI DD-DDHA LIGATURE;Lo;0;L;0921 094D 0922;;;;N;;;;;
+E955;DEVANAGARI NNA LIGATURE;Lo;0;L;0928 094D 0928;;;;N;;;;;
+E956;DEVANAGARI HMA LIGATURE;Lo;0;L;0939 094D 092E;;;;N;;;;;
+E957;DEVANAGARI HYA LIGATURE;Lo;0;L;0939 094D 092F;;;;N;;;;;
+E958;DEVANAGARI HLA LIGATURE;Lo;0;L;0939 094D 0932;;;;N;;;;;
+E959;DEVANAGARI HVA LIGATURE;Lo;0;L;0939 094D 0935;;;;N;;;;;
+E95A;DEVANAGARI STRA LIGATURE;Lo;0;L;0938 094D 0924 094D 0930;;;;N;;;;;
+E970;DEVANAGARI HALF KSHA LIGATURE;Lo;0;L;0915 094D 0937;;;;N;;;;;
+E971;DEVANAGARI HALF GNYA LIGATURE;Lo;0;L;091C 094D 091E;;;;N;;;;;
+E972;DEVANAGARI HALF TTA LIGATURE;Lo;0;L;0924 094D 0924;;;;N;;;;;
+E973;DEVANAGARI HALF TRA LIGATURE;Lo;0;L;0924 094D 0930;;;;N;;;;;
+E974;DEVANAGARI HALF SHCHA LIGATURE;Lo;0;L;0936 094D 091B;;;;N;;;;;
+E975;DEVANAGARI HALF SHRA LIGATURE;Lo;0;L;0936 094D 0930;;;;N;;;;;
+E976;DEVANAGARI HALF SHVA LIGATURE;Lo;0;L;0936 094D 0935;;;;N;;;;;
+E97B;DEVANAGARI SIGN RRA-REPHA;Mn;36;L;;;;;N;;;;;
+E97C;DEVANAGARI HAR LIGATURE;Lo;0;L;0939 0943;;;;N;;;;;
+E97D;DEVANAGARI SIGN EYELASH RA;Lo;0;L;;;;;N;;;;;
+E97E;DEVANAGARI SIGN REPHA;Mn;36;L;;;;;N;;;;;
+E97F;DEVANAGARI SIGN SUBJOINED RA;Mn;36;L;;;;;N;;;;;
--- /dev/null
+#
+# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $
+#
+
+ MUTT UCData Package 2.4
+ -----------------------
+
+This is a package that supports ctype-like operations for Unicode UCS-2 text
+(and surrogates), case mapping, decomposition lookup, and provides a
+bidirectional reordering algorithm. To use it, you will need to get the
+latest "UnicodeData-*.txt" (or later) file from the Unicode Web or FTP site.
+
+The character information portion of the package consists of three parts:
+
+ 1. A program called "ucgendat" which generates five data files from the
+ UnicodeData-*.txt file. The files are:
+
+ A. case.dat - the case mappings.
+ B. ctype.dat - the character property tables.
+ C. decomp.dat - the character decompositions.
+ D. cmbcl.dat - the non-zero combining classes.
+ E. num.dat - the codes representing numbers.
+
+ 2. The "ucdata.[ch]" files which implement the functions needed to
+ check to see if a character matches groups of properties, to map between
+ upper, lower, and title case, to look up the decomposition of a
+ character, look up the combining class of a character, and get the number
+ value of a character.
+
+ 3. The UCData.java class which provides the same API (with minor changes for
+ the numbers) and loads the same binary data files as the C code.
+
+A short reference to the functions available is in the "api.txt" file.
+
+Techie Details
+==============
+
+The "ucgendat" program parses files from the command line which are all in the
+Unicode Character Database (UCDB) format. An additional properties file,
+"MUTTUCData.txt", provides some extra properties for some characters.
+
+The program looks for the two character properties fields (2 and 4), the
+combining class field (3), the decomposition field (5), the numeric value
+field (8), and the case mapping fields (12, 13, and 14). The decompositions
+are recursively expanded before being written out.
+
+The decomposition table contains all the canonical decompositions. This means
+all decompositions that do not have tags such as "<compat>" or "<font>".
+
+The data is almost all stored as unsigned longs (32-bits assumed) and the
+routines that load the data take care of endian swaps when necessary. This
+also means that surrogates (>= 0x10000) can be placed in the data files the
+"ucgendat" program parses.
+
+The data is written as external files and broken into five parts so it can be
+selectively updated at runtime if necessary.
+
+The data files currently generated from the "ucgendat" program total about 56K
+in size all together.
+
+The format of the binary data files is documented in the "format.txt" file.
+
+==========================================================================
+
+ The "Pretty Good Bidi Algorithm"
+ --------------------------------
+
+This routine provides an alternative to the Unicode Bidi algorithm. The
+difference is that this version of the PGBA does not handle the explicit
+directional codes (LRE, RLE, LRO, RLO, PDF). It should now produce the same
+results as the Unicode BiDi algorithm for implicit reordering. Included are
+functions for doing cursor motion in both logical and visual order.
+
+This implementation is provided to demonstrate an effective alternate method
+for implicit reordering. To make this useful for an application, it probably
+needs some changes to the memory allocation and deallocation, as well as data
+structure additions for rendering.
+
+Mark Leisher <mleisher@crl.nmsu.edu>
+19 November 1999
+
+-----------------------------------------------------------------------------
+
+CHANGES
+=======
+
+Version 2.4
+-----------
+1. Improved some bidi algorithm documentation in the code.
+
+2. Fixed a code mixup that produced a non-working version.
+
+Version 2.3
+-----------
+1. Fixed a misspelling in the ucpgba.h header file.
+
+2. Fixed a bug which caused trailing weak non-digit sequences to be left out of
+ the reordered string in the bidi algorithm.
+
+3. Fixed a problem with weak sequences containing non-spacing marks in the
+ bidi algorithm.
+
+4. Fixed a problem with text runs of the opposite direction of the string
+ surrounding a weak + neutral text run appearing in the wrong order in the
+ bidi algorithm.
+
+5. Added a default overall direction parameter to the reordering function for
+ cases of strings with no strong directional characters in the bidi
+ algorithm.
+
+6. The bidi API documentation was improved.
+
+7. Added a man page for the bidi API.
+
+Version 2.2
+-----------
+1. Fixed a problem with the bidi algorithm locating directional section
+ boundaries.
+
+2. Fixed a problem with the bidi algorithm starting the reordering correctly.
+
+3. Fixed a problem with the bidi algorithm determining end boundaries for LTR
+ segments.
+
+4. Fixed a problem with the bidi algorithm reordering weak (digits and number
+ separators) segments.
+
+5. Added automatic switching of symmetrically paired characters when
+ reversing RTL segments.
+
+6. Added a missing symmetric character to the extra character properties in
+ MUTTUCData.txt.
+
+7. Added support for doing logical and visual cursor traversal.
+
+Version 2.1
+-----------
+1. Updated the ucgendat program to handle the Unicode 3.0 character database
+ properties. The AL and BM bidi properties get marked as strong RTL and
+ Other Neutral, respectively, and the NSM, LRE, RLE, PDF, LRO, and RLO
+ controls all get marked as Other Neutral.
+
+2. Fixed some problems with testing against signed values in the UCData.java
+ code and some minor cleanup.
+
+3. Added the "Pretty Good Bidi Algorithm."
+
+Version 2.0
+-----------
+1. Removed the old Java stuff for a new class that loads directly from the
+ same data files as the C code does.
+
+2. Fixed a problem with choosing the correct field when mapping case.
+
+3. Adjusted some search routines to start their search in the correct position.
+
+4. Moved the copyright year to 1999.
+
+Version 1.9
+-----------
+1. Fixed a problem with an incorrect amount of storage being allocated for the
+ combining class nodes.
+
+2. Fixed an invalid initialization in the number code.
+
+3. Changed the Java template file formatting a bit.
+
+4. Added tables and function for getting decompositions in the Java class.
+
+Version 1.8
+-----------
+1. Fixed a problem with adding certain ranges.
+
+2. Added two more macros for testing for identifiers.
+
+3. Tested with the UnicodeData-2.1.5.txt file.
+
+Version 1.7
+-----------
+1. Fixed a problem with looking up decompositions in "ucgendat."
+
+Version 1.6
+-----------
+1. Added two new properties introduced with UnicodeData-2.1.4.txt.
+
+2. Changed the "ucgendat.c" program a little to automatically align the
+ property data on a 4-byte boundary when new properties are added.
+
+3. Changed the "ucgendat.c" programs to only generate canonical
+ decompositions.
+
+4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
+ initial and final punctuation characters.
+
+5. Minor additions and changes to the documentation.
+
+Version 1.5
+-----------
+1. Changed all file open calls to include binary mode with "b" for DOS/WIN
+ platforms.
+
+2. Wrapped the unistd.h include so it won't be included when compiled under
+ Win32.
+
+3. Fixed a bad range check for hex digits in ucgendat.c.
+
+4. Fixed a bad endian swap for combining classes.
+
+5. Added code to make a number table and associated lookup functions.
+ Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
+ function is to maintain compatibility with John Cowan's "uctype" package.
+
+Version 1.4
+-----------
+1. Fixed a bug with adding a range.
+
+2. Fixed a bug with inserting a range in order.
+
+3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
+
+4. Added the missing unload for the combining class data.
+
+5. Fixed a bad macro placement in ucisweak().
+
+Version 1.3
+-----------
+1. Bug with case mapping calculations fixed.
+
+2. Bug with empty character property entries fixed.
+
+3. Bug with incorrect type in the combining class lookup fixed.
+
+4. Some corrections done to api.txt.
+
+5. Bug in certain character property lookups fixed.
+
+6. Added a character property table that records the defined characters.
+
+7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
+
+Version 1.2
+-----------
+1. Added code to ucgendat to generate a combining class table.
+
+2. Fixed an endian problem with the byte count of decompositions.
+
+3. Fixed some minor problems in the "format.txt" file.
+
+4. Removed some bogus "Ss" values from MUTTUCData.txt file.
+
+5. Added API function to get combining class.
+
+6. Changed the open mode to "rb" so binary data files will be opened correctly
+ on DOS/WIN as well as other platforms.
+
+7. Added the "api.txt" file.
+
+Version 1.1
+-----------
+1. Added ucisxdigit() which I overlooked.
+
+2. Added UC_LT to the ucisalpha() macro which I overlooked.
+
+3. Change uciscntrl() to include UC_CF.
+
+4. Added ucisisocntrl() and ucisfmtcntrl() macros.
+
+5. Added a ucisblank() which I overlooked.
+
+6. Added missing properties to ucissymbol() and ucisnumber().
+
+7. Added ucisgraph() and ucisprint().
+
+8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
+ characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
+ mirroring.
+
+9. Added another property called "Ss" which includes control characters
+ traditionally seen as spaces in the isspace() macro.
+
+10. Added a bunch of macros to be API compatible with John Cowan's package.
+
+ACKNOWLEDGEMENTS
+================
+
+Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
+missing things and giving me stuff, particularly a bunch of new macros.
+
+Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
+various bugs.
+
+Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
+out that file modes need to have "b" for DOS/WIN machines, pointing out
+unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
+
+Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
+incomplete decompositions to be generated by the "ucgendat" program.
+
+Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
+error and an initialization error.
--- /dev/null
+/*
+ * $Id: UCData.java,v 1.2 1999/10/07 20:49:56 mleisher Exp $
+ *
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+import java.io.*;
+import java.net.*;
+
+public class UCData {
+ // Scratch buffer that load_file() fills from a data stream; getInt() and
+ // getShort() decode values out of it.
+ private static byte[] buffer;
+ // Byte-order flag set by load_file(): when true, getInt() and getShort()
+ // assemble multi-byte values from the bytes in reverse order.
+ private static boolean endian;
+ // bytes: byte count recorded by load_file(); buffpos: index of the next
+ // byte in buffer that getInt()/getShort() will consume.
+ private static int bytes, buffpos;
+
+ //
+ // Do the static initialization.
+ //
+ static {
+ // The buffer is allocated once, at class initialization, with a fixed
+ // size; load_file() never reads more than this many bytes.
+ buffer = new byte[24576];
+ }
+
+    //
+    // Reads the stream into the shared buffer, records the byte-order flag
+    // from the first two bytes and leaves buffpos just past them.
+    //
+    // A single InputStream.read() call is allowed to return fewer bytes than
+    // the stream holds, so reading continues until end of stream or until
+    // the buffer is full.
+    //
+    // Returns true if at least one byte was read, false on an I/O error or
+    // an empty stream.
+    //
+    private static boolean load_file(InputStream in) {
+        int total = 0;
+
+        buffpos = 0;
+        try {
+            while (total < buffer.length) {
+                int got = in.read(buffer, total, buffer.length - total);
+                if (got < 0)
+                    break;
+                total += got;
+            }
+        } catch (IOException e) {
+            return false;
+        }
+        bytes = total;
+
+        //
+        // NOTE(review): both header bytes are compared with -2 (0xFE). A
+        // two-byte order mark would normally consist of two different byte
+        // values, so confirm this test against the data file format before
+        // relying on byte swapping.
+        //
+        endian = (buffer[0] == -2 && buffer[1] == -2);
+        buffpos = 2;
+        return (bytes > 0);
+    }
+
+    //
+    // Decodes the four bytes at buffpos into an int and advances buffpos by
+    // four. The first byte read is the most significant one unless the
+    // endian flag is set, in which case the order is reversed.
+    //
+    private static int getInt() {
+        // Mask each byte to 0..255 so sign extension cannot leak into the
+        // combined value.
+        int first = buffer[buffpos++] & 0xff;
+        int second = buffer[buffpos++] & 0xff;
+        int third = buffer[buffpos++] & 0xff;
+        int fourth = buffer[buffpos++] & 0xff;
+
+        if (endian)
+            return (fourth << 24) | (third << 16) | (second << 8) | first;
+        return (first << 24) | (second << 16) | (third << 8) | fourth;
+    }
+
+ // Moves the read position to offset `from` in the buffer, then decodes a
+ // 32-bit value there via getInt(); buffpos ends up at from + 4.
+ private static int getInt(int from) {
+ buffpos = from;
+ return getInt();
+ }
+
+    //
+    // Decodes the two bytes at buffpos into a short and advances buffpos by
+    // two. The first byte read is the high byte unless the endian flag is
+    // set. The result is a signed short, so 0xFFFF comes back as -1.
+    //
+    private static short getShort() {
+        int first = buffer[buffpos++] & 0xff;
+        int second = buffer[buffpos++] & 0xff;
+        int combined = endian ? ((second << 8) | first)
+                              : ((first << 8) | second);
+
+        return (short) combined;
+    }
+
+ // Moves the read position to offset `from` in the buffer, then decodes a
+ // 16-bit value there via getShort(); buffpos ends up at from + 2.
+ private static short getShort(int from) {
+ buffpos = from;
+ return getShort();
+ }
+
+ /**********************************************************************
+ *
+ * Character type info section.
+ *
+ **********************************************************************/
+
+ // Single-bit masks indexed by bit position: masks32[i] has only bit i set
+ // (i = 0..31). ucisprop() uses it to test one property bit at a time.
+ private static int masks32[] = {
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
+ 0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
+ 0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
+ 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
+ 0x40000000, 0x80000000
+ };
+
+ //
+ // The arrays with the character property info.
+ //
+ // _ucprop_offsets[n] is the index in _ucprop_ranges where the table for
+ // property n starts, or -1 when that property has no entries; the final
+ // element holds the total number of ints in _ucprop_ranges.
+ private static short[] _ucprop_offsets = null;
+ // Pairs of ints, each pair being the inclusive low/high bounds of a range.
+ private static int[] _ucprop_ranges = null;
+
+ // Property bits for the FIRST mask word (the mask1 argument of
+ // ucisprop()). Bit i selects property table i.
+ public static final int UC_MN = 0x00000001;
+ public static final int UC_MC = 0x00000002;
+ public static final int UC_ME = 0x00000004;
+ public static final int UC_ND = 0x00000008;
+ public static final int UC_NL = 0x00000010;
+ public static final int UC_NO = 0x00000020;
+ public static final int UC_ZS = 0x00000040;
+ public static final int UC_ZL = 0x00000080;
+ public static final int UC_ZP = 0x00000100;
+ public static final int UC_CC = 0x00000200;
+ public static final int UC_CF = 0x00000400;
+ public static final int UC_OS = 0x00000800;
+ public static final int UC_CO = 0x00001000;
+ public static final int UC_CN = 0x00002000;
+ public static final int UC_LU = 0x00004000;
+ public static final int UC_LL = 0x00008000;
+ public static final int UC_LT = 0x00010000;
+ public static final int UC_LM = 0x00020000;
+ public static final int UC_LO = 0x00040000;
+ public static final int UC_PC = 0x00080000;
+ public static final int UC_PD = 0x00100000;
+ public static final int UC_PS = 0x00200000;
+ public static final int UC_PE = 0x00400000;
+ public static final int UC_PO = 0x00800000;
+ public static final int UC_SM = 0x01000000;
+ public static final int UC_SC = 0x02000000;
+ public static final int UC_SK = 0x04000000;
+ public static final int UC_SO = 0x08000000;
+ public static final int UC_L = 0x10000000;
+ public static final int UC_R = 0x20000000;
+ public static final int UC_EN = 0x40000000;
+ public static final int UC_ES = 0x80000000;
+ // Property bits for the SECOND mask word (the mask2 argument of
+ // ucisprop()). Bit i selects property table 32 + i. The values restart
+ // at bit 0, so these constants collide numerically with the first-word
+ // ones and must never be OR'ed into mask1.
+ public static final int UC_ET = 0x00000001;
+ public static final int UC_AN = 0x00000002;
+ public static final int UC_CS = 0x00000004;
+ public static final int UC_B = 0x00000008;
+ public static final int UC_S = 0x00000010;
+ public static final int UC_WS = 0x00000020;
+ public static final int UC_ON = 0x00000040;
+ public static final int UC_CM = 0x00000080;
+ public static final int UC_NB = 0x00000100;
+ public static final int UC_SY = 0x00000200;
+ public static final int UC_HD = 0x00000400;
+ public static final int UC_QM = 0x00000800;
+ public static final int UC_MR = 0x00001000;
+ public static final int UC_SS = 0x00002000;
+ public static final int UC_CP = 0x00004000;
+ public static final int UC_PI = 0x00008000;
+ public static final int UC_PF = 0x00010000;
+
+    //
+    // Loads the character property tables from the given URL.
+    //
+    // Layout as consumed here: a 2-byte order mark (handled by load_file()),
+    // a 2-byte table count (hsize), a 4-byte byte count that is skipped,
+    // hsize + 1 two-byte offsets padded to a multiple of four bytes, then
+    // the ranges as 4-byte values.
+    //
+    // Returns true when the tables are available (including when they were
+    // already loaded), false if the stream cannot be opened or read.
+    //
+    // The tables are built in locals and published only once both are
+    // complete. If a runtime exception interrupts the load (for example on
+    // truncated data), the class is not left looking "already loaded".
+    //
+    private static boolean _ucprop_load(URL where) {
+        int i, hsize, size = 0;
+        boolean res;
+        InputStream in = null;
+        short[] offsets;
+        int[] ranges;
+
+        //
+        // If the offsets array is not null, then this file has been loaded.
+        //
+        if (_ucprop_offsets != null)
+            return true;
+
+        try {
+            in = where.openStream();
+        } catch (IOException e1) {
+            return false;
+        }
+
+        res = load_file(in);
+
+        // The data is already in the buffer, so a failure to close the
+        // stream does not affect the result.
+        try {
+            in.close();
+        } catch (IOException e) {}
+
+        if (res == false)
+            return res;
+
+        hsize = getShort();
+
+        //
+        // Size in bytes of the offsets array, rounded up to a multiple of 4.
+        //
+        if (((size = (hsize + 1) << 1) & 3) != 0)
+            size += 4 - (size & 3);
+
+        offsets = new short[hsize + 1];
+
+        //
+        // Skip the byte count which won't be needed.
+        //
+        buffpos += 4;
+
+        //
+        // Adjust the byte count used to position at the beginning of the
+        // ranges to include the 4 bytes at the beginning and the byte count
+        // which is unused.
+        //
+        size += 8;
+
+        for (i = 0; i <= hsize; i++)
+            offsets[i] = getShort();
+
+        //
+        // Now allocate the ranges. The last offset is the total number of
+        // range values.
+        //
+        ranges = new int[offsets[hsize]];
+        for (i = 0, buffpos = size; i < ranges.length; i++)
+            ranges[i] = getInt();
+
+        //
+        // Publish the ranges first so that a non-null offsets array always
+        // implies usable ranges.
+        //
+        _ucprop_ranges = ranges;
+        _ucprop_offsets = offsets;
+
+        return true;
+    }
+
+ // Drops both property tables. Because _ucprop_load() treats a null
+ // offsets array as "not loaded", a later load will read the file again.
+ private static void _ucprop_unload() {
+ _ucprop_offsets = null;
+ _ucprop_ranges = null;
+ }
+
+    //
+    // Binary search for code in the range table of property n.
+    //
+    // The table for property n occupies _ucprop_ranges from its own offset
+    // up to the offset of the next property that has one. Entries come in
+    // (low, high) pairs, so the probe index is always forced to be even.
+    //
+    private static boolean uclookup(int code, int n) {
+        int lo = _ucprop_offsets[n];
+        if (lo == -1)
+            return false;
+
+        // Skip over following properties that have no table (-1) to find
+        // where this property's table ends.
+        int next = n + 1;
+        while (next < _ucprop_offsets.length && _ucprop_offsets[next] == -1)
+            next++;
+
+        int hi = _ucprop_offsets[next] - 1;
+        while (lo <= hi) {
+            // Clear bit 0 so the probe lands on the low bound of a pair.
+            int probe = ((lo + hi) >> 1) & ~1;
+
+            if (code > _ucprop_ranges[probe + 1])
+                lo = probe + 2;
+            else if (code < _ucprop_ranges[probe])
+                hi = probe - 2;
+            else
+                return true;
+        }
+        return false;
+    }
+
+ public static boolean ucisprop(int code, int mask1, int mask2) {
+ int i;
+
+ if (mask1 == 0 && mask2 == 0)
+ return false;
+
+ if (mask1 != 0) {
+ for (i = 0; i < 32; i++) {
+ if ((mask1 & masks32[i]) != 0 && uclookup(code, i))
+ return true;
+ }
+ }
+
+ if (mask2 != 0) {
+ for (i = 32; i < _ucprop_offsets.length; i++) {
+ if ((mask2 & masks32[i & 31]) != 0 && uclookup(code, i))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Alphabetic test: true when code is in any of the UC_LU, UC_LL, UC_LM,
+ // UC_LO or UC_LT tables (all first-word properties).
+ public static boolean ucisalpha(int code) {
+ return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0);
+ }
+ // Digit test: true when code is in the UC_ND table.
+ public static boolean ucisdigit(int code) {
+ return ucisprop(code, UC_ND, 0);
+ }
+ // Alphanumeric test: the letter tables (UC_LU, UC_LL, UC_LM, UC_LO, UC_LT)
+ // plus UC_ND.
+ public static boolean ucisalnum(int code) {
+ return ucisprop(code, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0);
+ }
+ // Control test: true when code is in the UC_CC or UC_CF table.
+ public static boolean uciscntrl(int code) {
+ return ucisprop(code, UC_CC|UC_CF, 0);
+ }
+    //
+    // Space test: true when code is in the UC_ZS table or in the UC_SS table
+    // (the "Ss" property, which covers control characters traditionally
+    // treated as spaces).
+    //
+    // UC_SS is a bit of the second mask word, so it has to be passed as
+    // mask2. OR-ing it into mask1 would instead select the first-word
+    // property that shares the same bit value (UC_CN).
+    //
+    public static boolean ucisspace(int code) {
+        return ucisprop(code, UC_ZS, UC_SS);
+    }
+ // Blank test: true when code is in the UC_ZS table.
+ public static boolean ucisblank(int code) {
+ return ucisprop(code, UC_ZS, 0);
+ }
+ // Punctuation test: UC_PD, UC_PS, UC_PE, UC_PO from the first mask word
+ // plus UC_PI, UC_PF from the second. UC_PC is not part of this test.
+ public static boolean ucispunct(int code) {
+ return ucisprop(code, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF);
+ }
+    //
+    // Graphic-character test: true when code is in any mark, number,
+    // letter, punctuation (including UC_PI/UC_PF) or symbol table. UC_ZS is
+    // not included.
+    //
+    public static boolean ucisgraph(int code) {
+        final int marks = UC_MN | UC_MC | UC_ME;
+        final int numbers = UC_ND | UC_NL | UC_NO;
+        final int letters = UC_LU | UC_LL | UC_LT | UC_LM | UC_LO;
+        final int punct = UC_PC | UC_PD | UC_PS | UC_PE | UC_PO;
+        final int symbols = UC_SM | UC_SC | UC_SK | UC_SO;
+
+        return ucisprop(code, marks | numbers | letters | punct | symbols,
+                        UC_PI | UC_PF);
+    }
+    //
+    // Printable-character test: the same mark, number, letter, punctuation
+    // and symbol tables as the graphic test, with UC_ZS added.
+    //
+    public static boolean ucisprint(int code) {
+        final int marks = UC_MN | UC_MC | UC_ME;
+        final int numbers = UC_ND | UC_NL | UC_NO;
+        final int letters = UC_LU | UC_LL | UC_LT | UC_LM | UC_LO;
+        final int punct = UC_PC | UC_PD | UC_PS | UC_PE | UC_PO;
+        final int symbols = UC_SM | UC_SC | UC_SK | UC_SO;
+
+        return ucisprop(code,
+                        marks | numbers | letters | punct | symbols | UC_ZS,
+                        UC_PI | UC_PF);
+    }
+ // Upper-case test: true when code is in the UC_LU table.
+ public static boolean ucisupper(int code) {
+ return ucisprop(code, UC_LU, 0);
+ }
+ // Lower-case test: true when code is in the UC_LL table.
+ public static boolean ucislower(int code) {
+ return ucisprop(code, UC_LL, 0);
+ }
+ // Title-case test: true when code is in the UC_LT table.
+ public static boolean ucistitle(int code) {
+ return ucisprop(code, UC_LT, 0);
+ }
+ // Hex-digit test: true when code is in the UC_HD table (a second-word
+ // property, hence passed as mask2). Same test as ucishex().
+ public static boolean ucisxdigit(int code) {
+ return ucisprop(code, 0, UC_HD);
+ }
+ // True when code is in the UC_CC table only (UC_CF is not consulted).
+ public static boolean ucisisocntrl(int code) {
+ return ucisprop(code, UC_CC, 0);
+ }
+ // True when code is in the UC_CF table only (UC_CC is not consulted).
+ public static boolean ucisfmtcntrl(int code) {
+ return ucisprop(code, UC_CF, 0);
+ }
+ // Symbol test: true when code is in the UC_SM, UC_SC, UC_SO or UC_SK
+ // table.
+ public static boolean ucissymbol(int code) {
+ return ucisprop(code, UC_SM|UC_SC|UC_SO|UC_SK, 0);
+ }
+ // Number test: true when code is in the UC_ND, UC_NO or UC_NL table.
+ public static boolean ucisnumber(int code) {
+ return ucisprop(code, UC_ND|UC_NO|UC_NL, 0);
+ }
+ // Non-spacing test: true when code is in the UC_MN table. Same test as
+ // ucisnsmark().
+ public static boolean ucisnonspacing(int code) {
+ return ucisprop(code, UC_MN, 0);
+ }
+ // Opening-punctuation test: true when code is in the UC_PS table.
+ public static boolean ucisopenpunct(int code) {
+ return ucisprop(code, UC_PS, 0);
+ }
+ // Closing-punctuation test: true when code is in the UC_PE table.
+ public static boolean ucisclosepunct(int code) {
+ return ucisprop(code, UC_PE, 0);
+ }
+ // Initial-punctuation test: true when code is in the UC_PI table
+ // (second mask word).
+ public static boolean ucisinitialpunct(int code) {
+ return ucisprop(code, 0, UC_PI);
+ }
+ // Final-punctuation test: true when code is in the UC_PF table
+ // (second mask word).
+ public static boolean ucisfinalpunct(int code) {
+ return ucisprop(code, 0, UC_PF);
+ }
+ // Composite test: true when code is in the UC_CM table (second mask
+ // word).
+ public static boolean uciscomposite(int code) {
+ return ucisprop(code, 0, UC_CM);
+ }
+ // Hex-digit test: true when code is in the UC_HD table (second mask
+ // word). Same test as ucisxdigit().
+ public static boolean ucishex(int code) {
+ return ucisprop(code, 0, UC_HD);
+ }
+ // Quote-mark test: true when code is in the UC_QM table (second mask
+ // word).
+ public static boolean ucisquote(int code) {
+ return ucisprop(code, 0, UC_QM);
+ }
+ // Symmetric test: true when code is in the UC_SY table (second mask
+ // word).
+ public static boolean ucissymmetric(int code) {
+ return ucisprop(code, 0, UC_SY);
+ }
+ // Mirroring test: true when code is in the UC_MR table (second mask
+ // word).
+ public static boolean ucismirroring(int code) {
+ return ucisprop(code, 0, UC_MR);
+ }
+ // Non-breaking test: true when code is in the UC_NB table (second mask
+ // word).
+ public static boolean ucisnonbreaking(int code) {
+ return ucisprop(code, 0, UC_NB);
+ }
+ // Right-to-left test: true when code is in the UC_R table.
+ public static boolean ucisrtl(int code) {
+ return ucisprop(code, UC_R, 0);
+ }
+ // Left-to-right test: true when code is in the UC_L table.
+ public static boolean ucisltr(int code) {
+ return ucisprop(code, UC_L, 0);
+ }
+ // Strong-direction test: true when code is in the UC_L or UC_R table.
+ public static boolean ucisstrong(int code) {
+ return ucisprop(code, UC_L|UC_R, 0);
+ }
+ // Weak-direction test: UC_EN and UC_ES from the first mask word plus
+ // UC_ET, UC_AN and UC_CS from the second.
+ public static boolean ucisweak(int code) {
+ return ucisprop(code, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS);
+ }
+ // Neutral test: true when code is in the UC_B, UC_S, UC_WS or UC_ON table
+ // (all second-word properties).
+ public static boolean ucisneutral(int code) {
+ return ucisprop(code, 0, UC_B|UC_S|UC_WS|UC_ON);
+ }
+ // Separator test: true when code is in the UC_B or UC_S table (second
+ // mask word).
+ public static boolean ucisseparator(int code) {
+ return ucisprop(code, 0, UC_B|UC_S);
+ }
+ // Mark test: true when code is in the UC_MN, UC_MC or UC_ME table.
+ public static boolean ucismark(int code) {
+ return ucisprop(code, UC_MN|UC_MC|UC_ME, 0);
+ }
+ //
+ // Is the character a modifier letter?  Tests the UC_LM property.
+ //
+ public static boolean ucismodif(int code) {
+ return ucisprop(code, UC_LM, 0);
+ }
+ //
+ // Is the character a number represented by a letter?  Tests the UC_NL
+ // property.
+ //
+ public static boolean ucisletnum(int code) {
+ return ucisprop(code, UC_NL, 0);
+ }
+ //
+ // Is the character connecting punctuation?  Tests the UC_PC property.
+ //
+ public static boolean ucisconnect(int code) {
+ return ucisprop(code, UC_PC, 0);
+ }
+ //
+ // Is the character dash punctuation?  Tests the UC_PD property.
+ //
+ public static boolean ucisdash(int code) {
+ return ucisprop(code, UC_PD, 0);
+ }
+ //
+ // Is the character a math character?  Tests the UC_SM property.
+ //
+ public static boolean ucismath(int code) {
+ return ucisprop(code, UC_SM, 0);
+ }
+ //
+ // Is the character a currency character?  Tests the UC_SC property.
+ //
+ public static boolean uciscurrency(int code) {
+ return ucisprop(code, UC_SC, 0);
+ }
+ //
+ // Is the character a modifier symbol?  Tests the UC_SK property.
+ //
+ public static boolean ucismodifsymbol(int code) {
+ return ucisprop(code, UC_SK, 0);
+ }
+ //
+ // Is the character a non-spacing mark?  Tests the UC_MN property; this
+ // is the same test as ucisnonspacing().
+ //
+ public static boolean ucisnsmark(int code) {
+ return ucisprop(code, UC_MN, 0);
+ }
+ //
+ // Is the character a spacing mark?  Tests the UC_MC property.
+ //
+ public static boolean ucisspmark(int code) {
+ return ucisprop(code, UC_MC, 0);
+ }
+ //
+ // Is the character enclosing (for example an enclosing box)?  Tests the
+ // UC_ME property.
+ //
+ public static boolean ucisenclosing(int code) {
+ return ucisprop(code, UC_ME, 0);
+ }
+ //
+ // Is the character from the Private Use Area?  Tests the UC_CO
+ // property.
+ //
+ public static boolean ucisprivate(int code) {
+ return ucisprop(code, UC_CO, 0);
+ }
+ //
+ // Is the character one of the surrogate codes?  Tests the UC_OS
+ // property.
+ //
+ public static boolean ucissurrogate(int code) {
+ return ucisprop(code, UC_OS, 0);
+ }
+ //
+ // Is the character a line separator?  Tests the UC_ZL property.
+ //
+ public static boolean ucislsep(int code) {
+ return ucisprop(code, UC_ZL, 0);
+ }
+ //
+ // Is the character a paragraph separator?  Tests the UC_ZP property.
+ //
+ public static boolean ucispsep(int code) {
+ return ucisprop(code, UC_ZP, 0);
+ }
+ //
+ // Judging by the name, reports whether the character may start an
+ // identifier.  Passes the combined mask of the upper, lower, title and
+ // other letter properties plus the letter number property to
+ // ucisprop().
+ //
+ public static boolean ucisidentstart(int code) {
+ return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0);
+ }
+ //
+ // Judging by the name, reports whether the character may appear inside
+ // an identifier.  Uses the same mask as ucisidentstart() extended with
+ // the non-spacing mark, spacing mark, decimal digit, connector
+ // punctuation and format control properties.
+ //
+ public static boolean ucisidentpart(int code) {
+ return ucisprop(code, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|
+ UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0);
+ }
+ //
+ // Is the character defined, i.e. did it appear in one of the data
+ // files?  Tests the implementation specific UC_CP property.
+ //
+ public static boolean ucisdefined(int code) {
+ return ucisprop(code, 0, UC_CP);
+ }
+ //
+ // Is the character not defined?  This is the logical negation of the
+ // UC_CP test performed by ucisdefined().
+ //
+ public static boolean ucisundefined(int code) {
+ return !ucisprop(code, 0, UC_CP);
+ }
+ //
+ // Is the character a Han ideograph?  This is a pure range test that
+ // needs no loaded data: 0x4e00-0x9fff or 0xf900-0xfaff, both inclusive.
+ //
+ public static boolean ucishan(int code) {
+ if (code >= 0x4e00 && code <= 0x9fff)
+ return true;
+ return code >= 0xf900 && code <= 0xfaff;
+ }
+ //
+ // Is the character a pre-composed Hangul syllable?  This is a pure
+ // range test that needs no loaded data.
+ //
+ // The pre-composed syllables occupy 0xac00 through 0xd7a3 (11172
+ // codes).  The upper bound is 0xd7a3 rather than 0xd7ff so that codes
+ // past the last syllable are rejected; the arithmetic decomposition in
+ // ucdecomp_hangul() is only meaningful for real syllables.
+ //
+ public static boolean ucishangul(int code) {
+ return 0xac00 <= code && code <= 0xd7a3;
+ }
+
+ /**********************************************************************
+ *
+ * Case mapping section.
+ *
+ **********************************************************************/
+
+ //
+ // _uccase_len holds the sizes of the upper and lower case tables, in
+ // ints (node count * 3), as read by _uccase_load().
+ //
+ // _uccase_map holds the upper, lower and title case tables back to
+ // back, three ints per node.  It is null until the data is loaded.
+ //
+ private static int[] _uccase_len = {0, 0};
+ private static int[] _uccase_map = null;
+
+ //
+ // Load the case mapping tables from the given URL.  Returns true when
+ // the tables are available (including when they were loaded earlier)
+ // and false when the stream cannot be opened or load_file() fails.
+ //
+ private static boolean _uccase_load(URL where) {
+ //
+ // A non-null mapping array means the data is already in memory.
+ //
+ if (_uccase_map != null)
+ return true;
+
+ InputStream stream = null;
+ try {
+ stream = where.openStream();
+ } catch (IOException openError) {
+ return false;
+ }
+
+ //
+ // load_file() is defined elsewhere; presumably it fills the buffer
+ // that getShort() and getInt() read from.  The stream is closed
+ // whether or not the load worked and a failure to close is ignored.
+ //
+ boolean loaded = load_file(stream);
+ try {
+ stream.close();
+ } catch (IOException closeError) {}
+
+ if (!loaded)
+ return false;
+
+ //
+ // Header: the count of all mapping nodes followed by the upper and
+ // lower table node counts.  Every node is three ints, so each count
+ // is scaled by 3 to become an index into _uccase_map.
+ //
+ int total = getShort(2) * 3;
+ _uccase_len[0] = getShort() * 3;
+ _uccase_len[1] = getShort() * 3;
+
+ _uccase_map = new int[total];
+ for (int k = 0; k < total; k++)
+ _uccase_map[k] = getInt();
+
+ return true;
+ }
+
+ //
+ // Forget the case mapping data so that a later load reads it again.
+ //
+ private static void _uccase_unload() {
+ _uccase_map = null;
+ _uccase_len[1] = 0;
+ _uccase_len[0] = 0;
+ }
+
+ //
+ // Binary search of one case table for `code'.
+ //
+ // l and r are the indexes of the first and last nodes of the table to
+ // search.  Every node is three ints with the search key stored first,
+ // so the midpoint is rounded down to a multiple of 3.  `field' (1 or 2)
+ // selects which of the two mappings stored after the key is returned.
+ //
+ // When the code has no node in the table the code itself is returned,
+ // so that uctoupper(), uctolower() and uctotitle() hand back characters
+ // without a case mapping unchanged instead of yielding -1.
+ //
+ private static int _uccase_lookup(int code, int l, int r, int field) {
+ int m;
+
+ while (l <= r) {
+ m = (l + r) >> 1;
+ m -= (m % 3);
+ if (code > _uccase_map[m])
+ l = m + 3;
+ else if (code < _uccase_map[m])
+ r = m - 3;
+ else
+ return _uccase_map[m + field];
+ }
+
+ //
+ // No mapping exists: the character maps to itself.
+ //
+ return code;
+ }
+
+ //
+ // Map a character to upper case.  A character that is already upper
+ // case is returned as is.  A lower case character is searched for in
+ // the lower case table (upper mapping in field 2); anything else is
+ // searched for in the title case table (upper mapping in field 1).
+ // The result is whatever _uccase_lookup() returns.
+ //
+ public static int uctoupper(int code) {
+ if (ucisupper(code))
+ return code;
+
+ final boolean lower = ucislower(code);
+
+ //
+ // The lower case table starts right after the upper case table and
+ // the title case table runs from the end of the lower case table to
+ // the end of the map.
+ //
+ final int first = lower
+ ? _uccase_len[0]
+ : _uccase_len[0] + _uccase_len[1];
+ final int last = lower
+ ? (first + _uccase_len[1]) - 3
+ : _uccase_map.length - 3;
+
+ return _uccase_lookup(code, first, last, lower ? 2 : 1);
+ }
+
+ //
+ // Map a character to lower case.  A character that is already lower
+ // case is returned as is.  An upper case character is searched for in
+ // the upper case table (lower mapping in field 1); anything else is
+ // searched for in the title case table (lower mapping in field 2).
+ // The result is whatever _uccase_lookup() returns.
+ //
+ public static int uctolower(int code) {
+ int l, r, field;
+
+ if (ucislower(code))
+ return code;
+
+ if (ucisupper(code)) {
+ //
+ // Upper case: the table starts at index 0.
+ //
+ field = 1;
+ l = 0;
+ r = _uccase_len[0] - 3;
+ } else {
+ //
+ // Title case: the table runs to the end of the map.  The right
+ // bound is the index of the last node (length - 3), matching
+ // uctoupper(), so that both bounds stay aligned on a node.
+ //
+ field = 2;
+ l = _uccase_len[0] + _uccase_len[1];
+ r = _uccase_map.length - 3;
+ }
+ return _uccase_lookup(code, l, r, field);
+ }
+
+ //
+ // Map a character to title case.  A character that is already title
+ // case is returned as is.  An upper case character is searched for in
+ // the upper case table and anything else in the lower case table; in
+ // both tables the title mapping is field 2.  The result is whatever
+ // _uccase_lookup() returns.
+ //
+ public static int uctotitle(int code) {
+ if (ucistitle(code))
+ return code;
+
+ final boolean upper = ucisupper(code);
+ final int first = upper ? 0 : _uccase_len[0];
+ final int size = upper ? _uccase_len[0] : _uccase_len[1];
+
+ return _uccase_lookup(code, first, (first + size) - 3, 2);
+ }
+
+ /**********************************************************************
+ *
+ * Character decomposition section.
+ *
+ **********************************************************************/
+
+ //
+ // _ucdcmp_node_count is the number of ints in _ucdcmp_data taken up by
+ // the code+offset node pairs (node count * 2).
+ //
+ // _ucdcmp_data holds the node pairs followed by the decomposition
+ // lists.  It is null until the data is loaded.
+ //
+ static int _ucdcmp_node_count = 0;
+ static int[] _ucdcmp_data = null;
+
+ //
+ // Load the decomposition data from the given URL.  Returns true when
+ // the data is available (including when it was loaded earlier) and
+ // false when the stream cannot be opened or load_file() fails.
+ //
+ private static boolean _ucdcmp_load(URL where) {
+ //
+ // A non-null data array means the file is already in memory.
+ //
+ if (_ucdcmp_data != null)
+ return true;
+
+ InputStream stream = null;
+ try {
+ stream = where.openStream();
+ } catch (IOException openError) {
+ return false;
+ }
+
+ //
+ // The stream is closed whether or not the load worked and a failure
+ // to close is ignored.
+ //
+ boolean loaded = load_file(stream);
+ try {
+ stream.close();
+ } catch (IOException closeError) {}
+
+ if (!loaded)
+ return false;
+
+ //
+ // The node count is doubled because each node is a code+offset pair.
+ // It tells how many of the _ucdcmp_data elements are nodes; the
+ // remaining elements are decompositions.
+ //
+ _ucdcmp_node_count = getShort() << 1;
+
+ //
+ // The next field is a size in bytes; dividing by 4 gives the number
+ // of ints to read.
+ //
+ int total = getInt() >> 2;
+
+ _ucdcmp_data = new int[total];
+ for (int k = 0; k < total; k++)
+ _ucdcmp_data[k] = getInt();
+
+ return true;
+ }
+
+ //
+ // Forget the decomposition data so that a later load reads it again.
+ //
+ private static void _ucdcmp_unload() {
+ _ucdcmp_data = null;
+ _ucdcmp_node_count = 0;
+ }
+
+ public static int[] ucdecomp(int code) {
+ int l, r, m, out[];
+
+ l = 0;
+ r = _ucdcmp_data[_ucdcmp_node_count] - 1;
+
+ while (l <= r) {
+ //
+ // Determine a "mid" point and adjust to make sure the mid point
+ // is at the beginning of a code+offset pair.
+ //
+ m = (l + r) >> 1;
+ m -= (m & 1);
+ if (code > _ucdcmp_data[m])
+ l = m + 2;
+ else if (code < _ucdcmp_data[m])
+ r = m - 2;
+ else {
+ l = _ucdcmp_data[m + 3] - _ucdcmp_data[m + 1];
+ out = new int[l];
+ for (r = 0; r < l; r++)
+ out[r] = _ucdcmp_data[_ucdcmp_node_count + 1 +
+ _ucdcmp_data[m + 1] + r];
+ return out;
+ }
+ }
+ return null;
+ }
+
+ //
+ // Decompose a pre-composed Hangul syllable arithmetically.
+ //
+ // Returns null when ucishangul() rejects the code.  Otherwise returns
+ // a new array holding the leading consonant and the vowel, followed by
+ // the trailing consonant when the syllable has one (2 or 3 elements).
+ //
+ public static int[] ucdecomp_hangul(int code) {
+ if (!ucishangul(code))
+ return null;
+
+ //
+ // 588 = 21 vowels * 28 trailing slots per leading consonant.
+ //
+ code -= 0xac00;
+ int lead = 0x1100 + (code / 588);
+ int vowel = 0x1161 + ((code % 588) / 28);
+ int trail = 0x11a7 + (code % 28);
+
+ //
+ // A trailing value of 0x11a7 means "no trailing consonant".  The
+ // test has to look at the trailing value itself: testing the leading
+ // consonant is always true and overruns the 2-element result.
+ //
+ if (trail == 0x11a7)
+ return new int[] {lead, vowel};
+ return new int[] {lead, vowel, trail};
+ }
+
+ /**********************************************************************
+ *
+ * Combining class section.
+ *
+ **********************************************************************/
+
+ //
+ // Combining class nodes, three ints each: first code of a range, last
+ // code of the range, and the combining class of the range.  Null until
+ // the data is loaded.
+ //
+ private static int[] _uccmbcl_nodes = null;
+
+ //
+ // Load the combining class data from the given URL.  Returns true when
+ // the data is available (including when it was loaded earlier) and
+ // false when the stream cannot be opened or load_file() fails.
+ //
+ private static boolean _uccmbcl_load(URL where) {
+ //
+ // A non-null node array means the file is already in memory.
+ //
+ if (_uccmbcl_nodes != null)
+ return true;
+
+ InputStream stream = null;
+ try {
+ stream = where.openStream();
+ } catch (IOException openError) {
+ return false;
+ }
+
+ //
+ // The stream is closed whether or not the load worked and a failure
+ // to close is ignored.
+ //
+ boolean loaded = load_file(stream);
+ try {
+ stream.close();
+ } catch (IOException closeError) {}
+
+ if (!loaded)
+ return false;
+
+ //
+ // Node count scaled by 3 because each node is three ints.
+ //
+ int total = getShort() * 3;
+
+ //
+ // Step over the next 4 bytes of the buffer; their value is not used
+ // here.
+ //
+ buffpos += 4;
+
+ _uccmbcl_nodes = new int[total];
+ for (int k = 0; k < total; k++)
+ _uccmbcl_nodes[k] = getInt();
+
+ return true;
+ }
+
+ //
+ // Forget the combining class data so that a later load reads it again.
+ //
+ private static void _uccmbcl_unload() {
+ _uccmbcl_nodes = null;
+ }
+
+ public static int uccombining_class(int code) {
+ int l, r, m;
+
+ l = 0;
+ r = _uccmbcl_nodes.length - 3;
+
+ while (l <= r) {
+ m = (l + r) >> 1;
+ m -= (m % 3);
+ if (code > _uccmbcl_nodes[m + 1])
+ l = m + 3;
+ else if (code < _uccmbcl_nodes[m])
+ r = m - 3;
+ else if (_uccmbcl_nodes[m] <= code &&
+ code <= _uccmbcl_nodes[m + 1])
+ return _uccmbcl_nodes[m + 2];
+ }
+ return 0;
+ }
+
+ /**********************************************************************
+ *
+ * Number section.
+ *
+ **********************************************************************/
+
+ //
+ // _ucnum_vals holds the number values as consecutive pairs of shorts
+ // (the lookups read the pair as numerator and denominator).
+ //
+ // _ucnum_nodes holds pairs of ints: a character code followed by the
+ // index of its value pair in _ucnum_vals.  Null until the data is
+ // loaded.
+ //
+ private static short[] _ucnum_vals;
+ private static int[] _ucnum_nodes;
+
+ //
+ // Load the number data from the given URL.  Returns true when the data
+ // is available (including when it was loaded earlier) and false when
+ // the stream cannot be opened or load_file() fails.
+ //
+ private static boolean _ucnumb_load(URL where) {
+ //
+ // A non-null node array means the file is already in memory.
+ //
+ if (_ucnum_nodes != null)
+ return true;
+
+ InputStream stream = null;
+ try {
+ stream = where.openStream();
+ } catch (IOException openError) {
+ return false;
+ }
+
+ //
+ // The stream is closed whether or not the load worked and a failure
+ // to close is ignored.
+ //
+ boolean loaded = load_file(stream);
+ try {
+ stream.close();
+ } catch (IOException closeError) {}
+
+ if (!loaded)
+ return false;
+
+ //
+ // The header gives the number of node ints and a total byte count.
+ // Whatever is left of the byte count after the 4-byte nodes is the
+ // value area, which is read as 2-byte shorts.
+ //
+ int nodeCount = getShort();
+ int valueCount = (getInt() - (nodeCount << 2)) >> 1;
+
+ _ucnum_nodes = new int[nodeCount];
+ for (int k = 0; k < nodeCount; k++)
+ _ucnum_nodes[k] = getInt();
+
+ _ucnum_vals = new short[valueCount];
+ for (int k = 0; k < valueCount; k++)
+ _ucnum_vals[k] = getShort();
+
+ return true;
+ }
+
+ //
+ // Forget the number data so that a later load reads it again.
+ //
+ private static void _ucnumb_unload() {
+ _ucnum_nodes = null;
+ _ucnum_vals = null;
+ }
+
+ public static boolean ucnumber_lookup(int code, int[] result) {
+ int l, r, m;
+
+ result[0] = result[1] = 0;
+
+ l = 0;
+ r = _ucnum_nodes.length - 1;
+ while (l <= r) {
+ m = (l + r) >> 1;
+ m -= (m & 1);
+ if (code > _ucnum_nodes[m])
+ l = m + 2;
+ else if (code < _ucnum_nodes[m])
+ r = m - 2;
+ else {
+ result[0] = _ucnum_vals[_ucnum_nodes[m + 1]];
+ result[1] = _ucnum_vals[_ucnum_nodes[m + 1] + 1];
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public static boolean ucdigit_lookup(int code, int[] result) {
+ int l, r, m;
+
+ result[0] = -1;
+
+ l = 0;
+ r = _ucnum_nodes.length - 1;
+ while (l <= r) {
+ m = (l + r) >> 1;
+ m -= (m & 1);
+ if (code > _ucnum_nodes[m])
+ l = m + 2;
+ else if (code < _ucnum_nodes[m])
+ r = m - 2;
+ else {
+ short d1 = _ucnum_vals[_ucnum_nodes[m + 1]];
+ short d2 = _ucnum_vals[_ucnum_nodes[m + 1] + 1];
+ if (d1 == d2) {
+ result[0] = d1;
+ return true;
+ }
+ return false;
+ }
+ }
+ return false;
+ }
+
+ /**********************************************************************
+ *
+ * File loading and unloading routines.
+ *
+ **********************************************************************/
+
+ //
+ // Masks that combine to load and unload files using a base URL.
+ //
+ public final static int UCDATA_CASE = 1 << 0;
+ public final static int UCDATA_CTYPE = 1 << 1;
+ public final static int UCDATA_DECOMP = 1 << 2;
+ public final static int UCDATA_CMBCL = 1 << 3;
+ public final static int UCDATA_NUM = 1 << 4;
+ //
+ // Every data file at once.
+ //
+ public final static int UCDATA_ALL = UCDATA_CASE | UCDATA_CTYPE |
+ UCDATA_DECOMP | UCDATA_CMBCL | UCDATA_NUM;
+
+ public static void ucdata_load(URL base, int masks) {
+ //
+ // Make sure the base has the trailing slash.
+ //
+ String url = base.toString();
+ if (url.lastIndexOf('/') != url.length() - 1)
+ url += "/";
+
+ if ((masks & UCDATA_CTYPE) != 0) {
+ try {
+ _ucprop_load(new URL(url + "ctype.dat"));
+ } catch (MalformedURLException mue) {}
+ }
+ if ((masks & UCDATA_CASE) != 0) {
+ try {
+ _uccase_load(new URL(url + "case.dat"));
+ } catch (MalformedURLException mue) {}
+ }
+ if ((masks & UCDATA_DECOMP) != 0) {
+ try {
+ _ucdcmp_load(new URL(url + "decomp.dat"));
+ } catch (MalformedURLException mue) {}
+ }
+ if ((masks & UCDATA_CMBCL) != 0) {
+ try {
+ _uccmbcl_load(new URL(url + "cmbcl.dat"));
+ } catch (MalformedURLException mue) {}
+ }
+ if ((masks & UCDATA_NUM) != 0) {
+ try {
+ _ucnumb_load(new URL(url + "num.dat"));
+ } catch (MalformedURLException mue) {}
+ }
+ }
+
+ //
+ // Unload the data tables selected by `masks' (a bitwise combination of
+ // the UCDATA_* constants).
+ //
+ public static void ucdata_unload(int masks) {
+ if ((masks & UCDATA_CTYPE) != 0) {
+ _ucprop_unload();
+ }
+ if ((masks & UCDATA_CASE) != 0) {
+ _uccase_unload();
+ }
+ if ((masks & UCDATA_DECOMP) != 0) {
+ _ucdcmp_unload();
+ }
+ if ((masks & UCDATA_CMBCL) != 0) {
+ _uccmbcl_unload();
+ }
+ if ((masks & UCDATA_NUM) != 0) {
+ _ucnumb_unload();
+ }
+ }
+}
--- /dev/null
+/*
+ * $Id: UCDataTest.java,v 1.1 1999/08/23 16:14:08 mleisher Exp $
+ *
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+import java.io.*;
+import java.net.*;
+import UCData.*;
+
+public class UCDataTest {
+ /**********************************************************************
+ *
+ * Main.
+ *
+ **********************************************************************/
+
+ //
+ // Location of the data files used when no URL is given on the command
+ // line.
+ //
+ private static final String DEFAULT_BASE =
+ "file:/home/mleisher/unicode/textutils/ucdata";
+
+ //
+ // Look up the numeric value of a code and print it as a fraction when
+ // the numerator and denominator differ, or as a digit when they are
+ // the same.  Nothing is printed when the code is not a number.
+ //
+ private static void show_number(int code) {
+ int num[] = {0,0};
+ String label = "0x"+Integer.toHexString(code);
+
+ if (UCData.ucnumber_lookup(code, num)) {
+ if (num[0] != num[1])
+ System.out.println(label+" is fraction "+num[0]+"/"+num[1]);
+ else
+ System.out.println(label+" is digit "+num[0]);
+ }
+ }
+
+ //
+ // Small smoke test of the UCData class.  The optional first argument is
+ // the URL of the directory holding the data files.
+ //
+ public static void main(String[] args) {
+ String base = (args.length > 0) ? args[0] : DEFAULT_BASE;
+ URL url = null;
+
+ //
+ // Stop here when the URL is malformed; passing a null URL on to
+ // ucdata_load() would only fail later with a NullPointerException.
+ //
+ try {
+ url = new URL(base);
+ } catch (MalformedURLException mue) {
+ System.err.println("Malformed data URL: " + base);
+ return;
+ }
+
+ UCData.ucdata_load(url, UCData.UCDATA_ALL);
+
+ if (UCData.ucisalpha(0x1d5))
+ System.out.println("0x1d5 is alpha");
+ else
+ System.out.println("0x1d5 is not alpha");
+
+ int c;
+
+ c = UCData.uctolower(0x1f1);
+ System.out.println("0x1f1 lower is 0x"+Integer.toHexString(c));
+ c = UCData.uctotitle(0x1f1);
+ System.out.println("0x1f1 title is 0x"+Integer.toHexString(c));
+
+ c = UCData.uctolower(0xff3a);
+ System.out.println("0xff3a lower is 0x"+Integer.toHexString(c));
+ c = UCData.uctotitle(0xff3a);
+ System.out.println("0xff3a title is 0x"+Integer.toHexString(c));
+
+ int[] decomp = UCData.ucdecomp(0x1d5);
+ if (decomp != null) {
+ System.out.print("0x1d5 decomposition :");
+ for (int i = 0; i < decomp.length; i++)
+ System.out.print("0x"+Integer.toHexString(decomp[i])+" ");
+ System.out.println("");
+ }
+
+ int ccl = UCData.uccombining_class(0x41);
+ System.out.println("0x41 combining class " + ccl);
+ ccl = UCData.uccombining_class(0xfe23);
+ System.out.println("0xfe23 combining class " + ccl);
+
+ show_number(0x30);
+ show_number(0xbc);
+
+ int num[] = {0,0};
+ if (UCData.ucdigit_lookup(0x6f9, num))
+ System.out.println("0x6f9 is digit " + num[0]);
+ else
+ System.out.println("0x6f9 is not a digit");
+ }
+}
--- /dev/null
+#
+# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
+#
+
+ The MUTT UCData API
+ -------------------
+
+
+
+
+-----------------------------------------------------------------------------
+
+Macros that combine to select data tables for ucdata_load(), ucdata_unload(),
+and ucdata_reload().
+
+#define UCDATA_CASE 0x01
+#define UCDATA_CTYPE 0x02
+#define UCDATA_DECOMP 0x04
+#define UCDATA_CMBCL 0x08
+#define UCDATA_NUM 0x10
+#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
+ UCDATA_CMBCL|UCDATA_NUM)
+-----------------------------------------------------------------------------
+
+void ucdata_load(char *paths, int masks)
+
+ This function initializes the UCData library by locating the data files in
+ one of the colon-separated directories in the `paths' parameter. The data
+ files to be loaded are specified in the `masks' parameter as a bitwise
+ combination of the macros listed above.
+
+ This should be called before using any of the other functions.
+
+ NOTE: the ucdata_setup(char *paths) function is now a macro that expands
+ into this function at compile time.
+
+-----------------------------------------------------------------------------
+
+void ucdata_unload(int masks)
+
+ This function unloads the data tables specified in the `masks' parameter.
+
+ This function should be called when the application is done using the UCData
+ package.
+
+ NOTE: the ucdata_cleanup() function is now a macro that expands into this
+ function at compile time.
+
+-----------------------------------------------------------------------------
+
+void ucdata_reload(char *paths, int masks)
+
+ This function reloads the data files from one of the colon-separated
+ directories in the `paths' parameter. The data files to be reloaded are
+ specified in the `masks' parameter as a bitwise combination of the macros
+ listed above.
+
+ If the data files have already been loaded, they are unloaded before the
+ data files are loaded again.
+
+-----------------------------------------------------------------------------
+
+int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
+
+ This function determines if a character has a decomposition and returns the
+ decomposition information if it exists.
+
+ If a zero is returned, there is no decomposition. If a non-zero is
+ returned, then the `num' and `decomp' variables are filled in with the
+ appropriate values.
+
+ Example call:
+
+ unsigned long i, num, *decomp;
+
+ if (ucdecomp(0x1d5, &num, &decomp) != 0) {
+ for (i = 0; i < num; i++)
+ printf("0x%08lX,", decomp[i]);
+ putchar('\n');
+ }
+
+-----------------------------------------------------------------------------
+
+int ucdecomp_hangul(unsigned long code, unsigned long *num,
+ unsigned long decomp[])
+
+ This function determines if a Hangul syllable has a decomposition and
+ returns the decomposition information.
+
+ An array of at least size 3 should be passed to the function for the
+ decomposition of the syllable.
+
+ If a zero is returned, the character is not a Hangul syllable. If a
+ non-zero is returned, the `num' field will be 2 or 3 and the syllable will
+ be decomposed into the `decomp' array arithmetically.
+
+ Example call:
+
+ unsigned long i, num, decomp[3];
+
+ if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
+ for (i = 0; i < num; i++)
+ printf("0x%08lX,", decomp[i]);
+ putchar('\n');
+ }
+
+-----------------------------------------------------------------------------
+
+struct ucnumber {
+ int numerator;
+ int denominator;
+};
+
+int ucnumber_lookup(unsigned long code, struct ucnumber *num)
+
+ This function determines if the code is a number and fills in the `num'
+ field with the numerator and denominator. If the code happens to be a
+ single digit, the numerator and denominator fields will be the same.
+
+ If the function returns 0, the code is not a number. Any other return
+ value means the code is a number.
+
+int ucdigit_lookup(unsigned long code, int *digit)
+
+ This function determines if the code is a digit and fills in the `digit'
+ field with the digit value.
+
+ If the function returns 0, the code is not a digit. Any other return
+ value means the code is a digit.
+
+struct ucnumber ucgetnumber(unsigned long code)
+
+ This is a compatibility function with John Cowan's "uctype" package. It
+ uses ucnumber_lookup().
+
+int ucgetdigit(unsigned long code)
+
+ This is a compatibility function with John Cowan's "uctype" package. It
+ uses ucdigit_lookup().
+
+-----------------------------------------------------------------------------
+
+unsigned long uctoupper(unsigned long code)
+
+ This function returns the code unchanged if it is already upper case or has
+ no upper case equivalent. Otherwise the upper case equivalent is returned.
+
+-----------------------------------------------------------------------------
+
+unsigned long uctolower(unsigned long code)
+
+ This function returns the code unchanged if it is already lower case or has
+ no lower case equivalent. Otherwise the lower case equivalent is returned.
+
+-----------------------------------------------------------------------------
+
+unsigned long uctotitle(unsigned long code)
+
+ This function returns the code unchanged if it is already title case or has
+ no title case equivalent. Otherwise the title case equivalent is returned.
+
+-----------------------------------------------------------------------------
+
+int ucisalpha(unsigned long code)
+int ucisalnum(unsigned long code)
+int ucisdigit(unsigned long code)
+int uciscntrl(unsigned long code)
+int ucisspace(unsigned long code)
+int ucisblank(unsigned long code)
+int ucispunct(unsigned long code)
+int ucisgraph(unsigned long code)
+int ucisprint(unsigned long code)
+int ucisxdigit(unsigned long code)
+
+int ucisupper(unsigned long code)
+int ucislower(unsigned long code)
+int ucistitle(unsigned long code)
+
+ These functions (actually macros) determine if a character has these
+ properties. These behave in a fashion very similar to the venerable ctype
+ package.
+
+-----------------------------------------------------------------------------
+
+int ucisisocntrl(unsigned long code)
+
+ Is the character a C0 control character (< 32)?
+
+int ucisfmtcntrl(unsigned long code)
+
+ Is the character a format control character?
+
+int ucissymbol(unsigned long code)
+
+ Is the character a symbol?
+
+int ucisnumber(unsigned long code)
+
+ Is the character a number or digit?
+
+int ucisnonspacing(unsigned long code)
+
+ Is the character non-spacing?
+
+int ucisopenpunct(unsigned long code)
+
+ Is the character an open/left punctuation (e.g. '[')?
+
+int ucisclosepunct(unsigned long code)
+
+ Is the character a close/right punctuation (e.g. ']')?
+
+int ucisinitialpunct(unsigned long code)
+
+ Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION
+ MARK)
+
+int ucisfinalpunct(unsigned long code)
+
+ Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION
+ MARK)
+
+int uciscomposite(unsigned long code)
+
+ Can the character be decomposed into a set of other characters?
+
+int ucisquote(unsigned long code)
+
+ Is the character one of the many quotation marks?
+
+int ucissymmetric(unsigned long code)
+
+ Is the character one that has an opposite form (i.e. <>)
+
+int ucismirroring(unsigned long code)
+
+ Is the character mirroring (superset of symmetric)?
+
+int ucisnonbreaking(unsigned long code)
+
+ Is the character non-breaking (i.e. non-breaking space)?
+
+int ucisrtl(unsigned long code)
+
+ Does the character have strong right-to-left directionality (i.e. Arabic
+ letters)?
+
+int ucisltr(unsigned long code)
+
+ Does the character have strong left-to-right directionality (i.e. Latin
+ letters)?
+
+int ucisstrong(unsigned long code)
+
+ Does the character have strong directionality?
+
+int ucisweak(unsigned long code)
+
+ Does the character have weak directionality (i.e. numbers)?
+
+int ucisneutral(unsigned long code)
+
+ Does the character have neutral directionality (i.e. whitespace)?
+
+int ucisseparator(unsigned long code)
+
+ Is the character a block or segment separator?
+
+int ucislsep(unsigned long code)
+
+ Is the character a line separator?
+
+int ucispsep(unsigned long code)
+
+ Is the character a paragraph separator?
+
+int ucismark(unsigned long code)
+
+ Is the character a mark of some kind?
+
+int ucisnsmark(unsigned long code)
+
+ Is the character a non-spacing mark?
+
+int ucisspmark(unsigned long code)
+
+ Is the character a spacing mark?
+
+int ucismodif(unsigned long code)
+
+ Is the character a modifier letter?
+
+int ucismodifsymbol(unsigned long code)
+
+ Is the character a modifier symbol?
+
+int ucisletnum(unsigned long code)
+
+ Is the character a number represented by a letter?
+
+int ucisconnect(unsigned long code)
+
+ Is the character connecting punctuation?
+
+int ucisdash(unsigned long code)
+
+ Is the character dash punctuation?
+
+int ucismath(unsigned long code)
+
+ Is the character a math character?
+
+int uciscurrency(unsigned long code)
+
+ Is the character a currency character?
+
+int ucisenclosing(unsigned long code)
+
+ Is the character enclosing (i.e. enclosing box)?
+
+int ucisprivate(unsigned long code)
+
+ Is the character from the Private Use Area?
+
+int ucissurrogate(unsigned long code)
+
+ Is the character one of the surrogate codes?
+
+int ucisdefined(unsigned long code)
+
+ Is the character defined (appeared in one of the data files)?
+
+int ucisundefined(unsigned long code)
+
+ Is the character not defined (non-Unicode)?
+
+int ucishan(unsigned long code)
+
+ Is the character a Han ideograph?
+
+int ucishangul(unsigned long code)
+
+ Is the character a pre-composed Hangul syllable?
--- /dev/null
+#
+# $Id: bidiapi.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
+#
+
+ "Pretty Good Bidi Algorithm" API
+
+The PGBA (Pretty Good Bidi Algorithm) is an effective alternative to the
+Unicode BiDi algorithm. It currently provides only implicit reordering and
+does not yet support explicit reordering codes that the Unicode BiDi algorithm
+supports. In addition to reordering, the PGBA includes cursor movement
+support for both visual and logical navigation.
+
+-----------------------------------------------------------------------------
+
+#define UCPGBA_LTR 0
+#define UCPGBA_RTL 1
+
+ These macros appear in the `direction' field of the data structures.
+
+#define UCPGBA_CURSOR_VISUAL 0
+#define UCPGBA_CURSOR_LOGICAL 1
+
+ These macros are used to set the cursor movement for each reordered string.
+
+-----------------------------------------------------------------------------
+
+ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
+ unsigned long end, int default_direction,
+ int cursor_motion)
+
+ This function will create a reordered string by using the implicit
+ directionality of the characters in the specified substring.
+
+ The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
+ and is used only in cases where a string contains no characters with strong
+ directionality.
+
+ The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
+ UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
+ behavior. This behavior can be switched at any time using
+ ucstring_set_cursor_motion().
+
+-----------------------------------------------------------------------------
+
+void ucstring_free(ucstring_t *string)
+
+ This function will deallocate the memory used by the string, including the
+ string itself.
+
+-----------------------------------------------------------------------------
+
+void ucstring_cursor_info(ucstring_t *string, int *direction,
+ unsigned long *position)
+
+ This function will return the text position of the internal cursor and the
+ directionality of the text at that position. The position returned is the
+ original text position of the character.
+
+-----------------------------------------------------------------------------
+
+int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
+
+ This function will change the cursor motion type and return the previous
+ cursor motion type.
+
+-----------------------------------------------------------------------------
+
+int ucstring_cursor_right(ucstring_t *string, int count)
+
+ This function will move the internal cursor to the right according to the
+ type of cursor motion set for the string.
+
+ If no cursor motion is performed, it returns 0. Otherwise it will return a
+ 1.
+
+-----------------------------------------------------------------------------
+
+int ucstring_cursor_left(ucstring_t *string, int count)
+
+ This function will move the internal cursor to the left according to the
+ type of cursor motion set for the string.
+
+ If no cursor motion is performed, it returns 0. Otherwise it will return a
+ 1.
--- /dev/null
+#
+# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $
+#
+
+CHARACTER DATA
+==============
+
+This package generates some data files that contain character properties useful
+for text processing.
+
+CHARACTER PROPERTIES
+====================
+
+The first data file is called "ctype.dat" and contains a compressed form of
+the character properties found in the Unicode Character Database (UCDB).
+Additional properties can be specified in limited UCDB format in another file
+to avoid modifying the original UCDB.
+
+The following is a property name and code table to be used with the character
+data:
+
+NAME CODE DESCRIPTION
+---------------------
+Mn 0 Mark, Non-Spacing
+Mc 1 Mark, Spacing Combining
+Me 2 Mark, Enclosing
+Nd 3 Number, Decimal Digit
+Nl 4 Number, Letter
+No 5 Number, Other
+Zs 6 Separator, Space
+Zl 7 Separator, Line
+Zp 8 Separator, Paragraph
+Cc 9 Other, Control
+Cf 10 Other, Format
+Cs 11 Other, Surrogate
+Co 12 Other, Private Use
+Cn 13 Other, Not Assigned
+Lu 14 Letter, Uppercase
+Ll 15 Letter, Lowercase
+Lt 16 Letter, Titlecase
+Lm 17 Letter, Modifier
+Lo 18 Letter, Other
+Pc 19 Punctuation, Connector
+Pd 20 Punctuation, Dash
+Ps 21 Punctuation, Open
+Pe 22 Punctuation, Close
+Po 23 Punctuation, Other
+Sm 24 Symbol, Math
+Sc 25 Symbol, Currency
+Sk 26 Symbol, Modifier
+So 27 Symbol, Other
+L 28 Left-To-Right
+R 29 Right-To-Left
+EN 30 European Number
+ES 31 European Number Separator
+ET 32 European Number Terminator
+AN 33 Arabic Number
+CS 34 Common Number Separator
+B 35 Block Separator
+S 36 Segment Separator
+WS 37 Whitespace
+ON 38 Other Neutrals
+Pi 47 Punctuation, Initial
+Pf 48 Punctuation, Final
+#
+# Implementation specific properties.
+#
+Cm 39 Composite
+Nb 40 Non-Breaking
+Sy 41 Symmetric (characters which are part of open/close pairs)
+Hd 42 Hex Digit
+Qm 43 Quote Mark
+Mr 44 Mirroring
+Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
+Cp 46 Defined character
+
+The actual binary data is formatted as follows:
+
+ Assumptions: unsigned short is at least 16-bits in size and unsigned long
+ is at least 32-bits in size.
+
+ unsigned short ByteOrderMark
+ unsigned short OffsetArraySize
+ unsigned long Bytes
+ unsigned short Offsets[OffsetArraySize + 1]
+ unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
+
+ The Bytes field provides the total byte count used for the Offsets[] and
+ Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
+ there is always one extra node on the end to hold the final index of the
+ Ranges[] array. The Ranges[] array contains pairs of 4-byte values
+ representing a range of Unicode characters. The pairs are arranged in
+ increasing order by the first character code in the range.
+
+ Determining if a particular character is in the property list requires a
+ simple binary search to determine if a character is in any of the ranges
+ for the property.
+
+ If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
+ machine with a different endian order and the values must be byte-swapped.
+
+ To swap a 16-bit value:
+ c = (c >> 8) | ((c & 0xff) << 8)
+
+ To swap a 32-bit value:
+ c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
+ (((c >> 16) & 0xff) << 8) | (c >> 24)
+
+CASE MAPPINGS
+=============
+
+The next data file is called "case.dat" and contains three case mapping tables
+in the following order: upper, lower, and title case. Each table is in
+increasing order by character code and each mapping contains 3 unsigned longs
+which represent the possible mappings.
+
+The format for the binary form of these tables is:
+
+ unsigned short ByteOrderMark
+ unsigned short NumMappingNodes, count of all mapping nodes
+ unsigned short CaseTableSizes[2], upper and lower mapping node counts
+ unsigned long CaseTables[NumMappingNodes]
+
+ The starting indexes of the case tables are calculated as follows:
+
+ UpperIndex = 0;
+ LowerIndex = CaseTableSizes[0] * 3;
+ TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
+
+ The order of the fields for the three tables are:
+
+ Upper case
+ ----------
+ unsigned long upper;
+ unsigned long lower;
+ unsigned long title;
+
+ Lower case
+ ----------
+ unsigned long lower;
+ unsigned long upper;
+ unsigned long title;
+
+ Title case
+ ----------
+ unsigned long title;
+ unsigned long upper;
+ unsigned long lower;
+
+ If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+ same way as described in the CHARACTER PROPERTIES section.
+
+ Because the tables are in increasing order by character code, locating a
+ mapping requires a simple binary search on one of the 3 codes that make up
+ each node.
+
+ It is important to note that there can only be 65535 mapping nodes, because
+ NumMappingNodes is a 16-bit value. Divided into 3 portions, this allows 21845
+ nodes for each case mapping table. The distribution of mappings may be more
+ or less than 21845 per table, but only 65535 are allowed in total.
+
+DECOMPOSITIONS
+==============
+
+The next data file is called "decomp.dat" and contains the decomposition data
+for all characters whose decompositions contain more than one character and
+are *not* compatibility decompositions. Compatibility decompositions are
+signaled in the UCDB format by the use of the <compat> tag in the
+decomposition field. Each list of character codes represents a full
+decomposition of a composite character. The nodes are arranged in increasing
+order by character code.
+
+The format for the binary form of this table is:
+
+ unsigned short ByteOrderMark
+ unsigned short NumDecompNodes, count of all decomposition nodes
+ unsigned long Bytes
+ unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
+ unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
+
+ If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+ same way as described in the CHARACTER PROPERTIES section.
+
+ The DecompNodes[] array consists of pairs of unsigned longs, the first of
+ which is the character code and the second is the initial index of the list
+ of character codes representing the decomposition.
+
+ Locating the decomposition of a composite character requires a binary search
+ for a character code in the DecompNodes[] array and using its index to
+ locate the start of the decomposition. The length of the decomposition list
+ is the index stored in the following node of DecompNodes[] (or, for the last
+ node, the extra element at the end of the array) minus the current index.
+
+COMBINING CLASSES
+=================
+
+The fourth data file is called "cmbcl.dat" and contains the characters with
+non-zero combining classes.
+
+The format for the binary form of this table is:
+
+ unsigned short ByteOrderMark
+ unsigned short NumCCLNodes
+ unsigned long Bytes
+ unsigned long CCLNodes[NumCCLNodes * 3]
+
+ If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+ same way as described in the CHARACTER PROPERTIES section.
+
+ The CCLNodes[] array consists of groups of three unsigned longs. The first
+ and second are the beginning and ending of a range and the third is the
+ combining class of that range.
+
+ If a character is not found in this table, then the combining class is
+ assumed to be 0.
+
+ It is important to note that only 65535 distinct ranges plus combining class
+ can be specified because NumCCLNodes is usually a 16-bit number.
+
+NUMBER TABLE
+============
+
+The final data file is called "num.dat" and contains the characters that have
+a numeric value associated with them.
+
+The format for the binary form of the table is:
+
+ unsigned short ByteOrderMark
+ unsigned short NumNumberNodes
+ unsigned long Bytes
+ unsigned long NumberNodes[NumNumberNodes]
+ unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
+ / sizeof(short)]
+
+ If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+ same way as described in the CHARACTER PROPERTIES section.
+
+ The NumberNodes array contains pairs of values, the first of which is the
+ character code and the second an index into the ValueNodes array. The
+ ValueNodes array contains pairs of integers which represent the numerator
+ and denominator of the numeric value of the character. If the character
+ happens to map to an integer, both the values in ValueNodes will be the
+ same.
--- /dev/null
+/*
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+/*
+ * RCS identification string for this file.  It is left out entirely when
+ * `lint' is defined, and carries GCC's `unused' attribute when compiled with
+ * GCC because nothing in the file refers to it.
+ */
+#ifndef lint
+#ifdef __GNUC__
+static char rcsid[] __attribute__ ((unused)) = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $";
+#else
+static char rcsid[] = "$Id: ucdata.c,v 1.3 1999/08/23 16:14:09 mleisher Exp $";
+#endif
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include "ucdata.h"
+
+/**************************************************************************
+ *
+ * Miscellaneous types, data, and support functions.
+ *
+ **************************************************************************/
+
+/*
+ * Image of the header at the start of every data file.  Each loader reads
+ * the header with a single fread() straight into this structure, so the
+ * order and size of the fields must match the file layout.
+ *
+ * NOTE(review): the loaders assume `unsigned long' is the 4-byte quantity
+ * the file format describes; confirm before building where it is wider.
+ */
+typedef struct {
+ unsigned short bom; /* byte order mark; 0xfffe means the data must be swapped */
+ unsigned short cnt; /* node count; each loader interprets it for its own table */
+ union {
+ unsigned long bytes; /* byte count of the data after the header */
+ unsigned short len[2]; /* case.dat: upper and lower case node counts */
+ } size;
+} _ucheader_t;
+
+/*
+ * Single-bit lookup table: masks32[i] has only bit i set, for i in 0..31.
+ */
+static unsigned long masks32[32] = {
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x00000100, 0x00000200, 0x00000400, 0x00000800,
+    0x00001000, 0x00002000, 0x00004000, 0x00008000,
+    0x00010000, 0x00020000, 0x00040000, 0x00080000,
+    0x00100000, 0x00200000, 0x00400000, 0x00800000,
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000
+};
+
+/*
+ * Byte-swap helpers for data generated on a machine of the opposite endian
+ * order.  Every extracted byte is masked to 8 bits so that a signed argument
+ * (the numeric value table holds signed shorts) cannot smear its sign bits
+ * into the other byte of the result.
+ */
+#define endian_short(cc) ((((cc) >> 8) & 0xff) | (((cc) & 0xff) << 8))
+#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\
+                         ((((cc) >> 16) & 0xff) << 8)|(((cc) >> 24) & 0xff))
+
+/*
+ * Open `filename' from the first directory in `paths' in which fopen()
+ * succeeds.
+ *
+ * paths    - colon separated list of directories; may be a null pointer
+ * filename - name appended to each directory after a '/'
+ * mode     - mode string handed to fopen()
+ *
+ * Returns the open stream, or 0 if the file name is missing or empty or the
+ * file cannot be opened in any of the directories.  A directory whose
+ * combined path would not fit in the local buffer is skipped instead of
+ * being copied past the end of that buffer.
+ */
+static FILE *
+#ifdef __STDC__
+_ucopenfile(char *paths, char *filename, char *mode)
+#else
+_ucopenfile(paths, filename, mode)
+char *paths, *filename, *mode;
+#endif
+{
+    FILE *f;
+    char *dp, path[BUFSIZ];
+    size_t dirlen, namelen;
+
+    if (filename == 0 || *filename == 0)
+        return 0;
+
+    namelen = strlen(filename);
+
+    dp = paths;
+    while (dp && *dp) {
+        /*
+         * Measure the current directory entry.
+         */
+        for (dirlen = 0; dp[dirlen] && dp[dirlen] != ':'; dirlen++) ;
+
+        /*
+         * The buffer has to hold the directory, a '/', the file name and
+         * the terminating null.
+         */
+        if (namelen < sizeof(path) - 1 &&
+            dirlen < sizeof(path) - 1 - namelen) {
+            memcpy(path, dp, dirlen);
+            path[dirlen] = '/';
+            memcpy(path + dirlen + 1, filename, namelen + 1);
+
+            if ((f = fopen(path, mode)) != 0)
+                return f;
+        }
+
+        dp += dirlen;
+        if (*dp == ':')
+            dp++;
+    }
+
+    return 0;
+}
+
+/**************************************************************************
+ *
+ * Support for the character properties.
+ *
+ **************************************************************************/
+
+/* Number of properties in the loaded table; 0 while nothing is loaded. */
+static unsigned long _ucprop_size;
+/* Per-property start index into _ucprop_ranges, plus one trailing entry. */
+static unsigned short *_ucprop_offsets;
+/* Range pairs; points into the same allocation as _ucprop_offsets. */
+static unsigned long *_ucprop_ranges;
+
+/*
+ * Load the character property table from "ctype.dat", searched for in the
+ * colon separated directory list `paths'.
+ *
+ * If a table is already loaded it is kept unless `reload' is non-zero, in
+ * which case it is released first.  When the file is missing, truncated,
+ * inconsistent with its own header, or memory runs out, nothing is left
+ * loaded and _ucprop_size stays 0.
+ */
+static void
+#ifdef __STDC__
+_ucprop_load(char *paths, int reload)
+#else
+_ucprop_load(paths, reload)
+char *paths;
+int reload;
+#endif
+{
+    FILE *in;
+    unsigned long offbytes, noffsets, nranges, i;
+    unsigned short *offsets;
+    unsigned long *ranges;
+    _ucheader_t hdr;
+
+    if (_ucprop_size > 0) {
+        if (!reload)
+            /*
+             * The character properties have already been loaded.
+             */
+            return;
+
+        /*
+         * Only the first array has to be deallocated because the offsets
+         * and the ranges live in a single block.
+         */
+        free((char *) _ucprop_offsets);
+        _ucprop_size = 0;
+    }
+
+    if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0)
+        return;
+
+    /*
+     * Load the header.
+     */
+    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
+        fclose(in);
+        return;
+    }
+
+    if (hdr.bom == 0xfffe) {
+        hdr.cnt = endian_short(hdr.cnt);
+        hdr.size.bytes = endian_long(hdr.size.bytes);
+    }
+
+    if (hdr.cnt == 0) {
+        fclose(in);
+        return;
+    }
+
+    /*
+     * The offsets array holds one more entry than the header count and is
+     * padded to a 4-byte boundary; the ranges start right after it.
+     */
+    offbytes = ((unsigned long) hdr.cnt + 1) * sizeof(unsigned short);
+    if (offbytes & 3)
+        offbytes += 4 - (offbytes & 3);
+    noffsets = offbytes >> 1;
+
+    /*
+     * The byte count in the header must at least cover the offsets.
+     */
+    if (hdr.size.bytes < offbytes) {
+        fclose(in);
+        return;
+    }
+
+    /*
+     * Allocate all the storage needed for the lookup table.
+     */
+    offsets = (unsigned short *) malloc(hdr.size.bytes);
+    if (offsets == 0) {
+        fclose(in);
+        return;
+    }
+    ranges = (unsigned long *) (offsets + noffsets);
+
+    /*
+     * Load the offset array.
+     */
+    if (fread((char *) offsets, sizeof(unsigned short), noffsets, in) !=
+        noffsets) {
+        free((char *) offsets);
+        fclose(in);
+        return;
+    }
+
+    /*
+     * Do an endian swap if necessary.  Don't forget there is an extra node
+     * on the end with the final index.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i <= hdr.cnt; i++)
+            offsets[i] = endian_short(offsets[i]);
+    }
+
+    /*
+     * The number of range elements is in the last array position of the
+     * offsets.  Make sure it fits in what was allocated before reading.
+     */
+    nranges = offsets[hdr.cnt];
+    if (nranges > (hdr.size.bytes - offbytes) / sizeof(unsigned long) ||
+        fread((char *) ranges, sizeof(unsigned long), nranges, in) !=
+        nranges) {
+        free((char *) offsets);
+        fclose(in);
+        return;
+    }
+
+    fclose(in);
+
+    /*
+     * Do an endian swap if necessary.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i < nranges; i++)
+            ranges[i] = endian_long(ranges[i]);
+    }
+
+    /*
+     * Publish the table only once it is complete.
+     */
+    _ucprop_offsets = offsets;
+    _ucprop_ranges = ranges;
+    _ucprop_size = hdr.cnt;
+}
+
+/*
+ * Release the character property table, if one is loaded.
+ */
+static void
+#ifdef __STDC__
+_ucprop_unload(void)
+#else
+_ucprop_unload()
+#endif
+{
+    /*
+     * The offsets and the ranges share one allocation, so freeing the
+     * offsets pointer releases both.
+     */
+    if (_ucprop_size != 0) {
+        free((char *) _ucprop_offsets);
+        _ucprop_size = 0;
+    }
+}
+
+/*
+ * Return 1 if `code' falls inside one of the ranges recorded for property
+ * number `n', and 0 otherwise.
+ */
+static int
+#ifdef __STDC__
+_ucprop_lookup(unsigned long code, unsigned long n)
+#else
+_ucprop_lookup(code, n)
+unsigned long code, n;
+#endif
+{
+    long lo, hi, mid, step;
+
+    /*
+     * An offset of 0xffff marks a property that has no ranges at all.
+     */
+    lo = _ucprop_offsets[n];
+    if (lo == 0xffff)
+        return 0;
+
+    /*
+     * The ranges of this property end where the next property that has
+     * ranges begins.  The extra entry at the end of the offsets holds the
+     * final index, so the scan never needs to go past it.
+     */
+    step = 1;
+    while (n + step < _ucprop_size && _ucprop_offsets[n + step] == 0xffff)
+        step++;
+
+    hi = _ucprop_offsets[n + step] - 1;
+
+    while (lo <= hi) {
+        /*
+         * Take the middle and round it down to the start of a range pair.
+         */
+        mid = (lo + hi) >> 1;
+        if (mid & 1)
+            mid--;
+
+        if (code > _ucprop_ranges[mid + 1])
+            lo = mid + 2;
+        else if (code < _ucprop_ranges[mid])
+            hi = mid - 2;
+        else
+            return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Test whether `code' has at least one of the requested properties.
+ *
+ * mask1 - bits selecting properties 0 through 31
+ * mask2 - bits selecting properties 32 and above
+ *
+ * Returns 1 if any selected property contains the character, 0 otherwise.
+ * Both loops stop at the number of properties actually loaded, so calling
+ * this before the property table is loaded (or after it is unloaded)
+ * returns 0 instead of reading through an invalid table pointer.
+ */
+int
+#ifdef __STDC__
+ucisprop(unsigned long code, unsigned long mask1, unsigned long mask2)
+#else
+ucisprop(code, mask1, mask2)
+unsigned long code, mask1, mask2;
+#endif
+{
+    unsigned long i;
+
+    if (mask1 == 0 && mask2 == 0)
+        return 0;
+
+    for (i = 0; mask1 && i < 32 && i < _ucprop_size; i++) {
+        if ((mask1 & masks32[i]) && _ucprop_lookup(code, i))
+            return 1;
+    }
+
+    /*
+     * mask2 only has 32 bits, so property numbers from 64 up cannot be
+     * selected and must not alias the low bits again.
+     */
+    for (i = 32; mask2 && i < 64 && i < _ucprop_size; i++) {
+        if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i))
+            return 1;
+    }
+
+    return 0;
+}
+
+/**************************************************************************
+ *
+ * Support for case mapping.
+ *
+ **************************************************************************/
+
+/* Total number of elements in _uccase_map (three per node); 0 when unloaded. */
+static unsigned long _uccase_size;
+/* Element counts (node count times 3) of the upper and lower case tables. */
+static unsigned short _uccase_len[2];
+/* The upper, lower and title case tables, stored back to back. */
+static unsigned long *_uccase_map;
+
+/*
+ * Load the case mapping tables from "case.dat", searched for in the colon
+ * separated directory list `paths'.
+ *
+ * If tables are already loaded they are kept unless `reload' is non-zero.
+ * The file is always closed again, and when it is missing, truncated or
+ * memory runs out, nothing is left loaded.
+ */
+static void
+#ifdef __STDC__
+_uccase_load(char *paths, int reload)
+#else
+_uccase_load(paths, reload)
+char *paths;
+int reload;
+#endif
+{
+    FILE *in;
+    unsigned long i, total;
+    unsigned long *map;
+    _ucheader_t hdr;
+
+    if (_uccase_size > 0) {
+        if (!reload)
+            /*
+             * The case mappings have already been loaded.
+             */
+            return;
+
+        free((char *) _uccase_map);
+        _uccase_map = 0;
+        _uccase_len[0] = _uccase_len[1] = 0;
+        _uccase_size = 0;
+    }
+
+    if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0)
+        return;
+
+    /*
+     * Load the header.
+     */
+    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
+        fclose(in);
+        return;
+    }
+
+    if (hdr.bom == 0xfffe) {
+        hdr.cnt = endian_short(hdr.cnt);
+        hdr.size.len[0] = endian_short(hdr.size.len[0]);
+        hdr.size.len[1] = endian_short(hdr.size.len[1]);
+    }
+
+    /*
+     * Every mapping node is made up of three values.
+     */
+    total = (unsigned long) hdr.cnt * 3;
+    if (total == 0) {
+        fclose(in);
+        return;
+    }
+
+    map = (unsigned long *) malloc(total * sizeof(unsigned long));
+    if (map == 0) {
+        fclose(in);
+        return;
+    }
+
+    /*
+     * Load the case mapping table.
+     */
+    if (fread((char *) map, sizeof(unsigned long), total, in) != total) {
+        free((char *) map);
+        fclose(in);
+        return;
+    }
+
+    fclose(in);
+
+    /*
+     * Do an endian swap if necessary.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i < total; i++)
+            map[i] = endian_long(map[i]);
+    }
+
+    /*
+     * Publish the table together with the lengths of the upper and lower
+     * case mapping tables.
+     */
+    _uccase_map = map;
+    _uccase_len[0] = hdr.size.len[0] * 3;
+    _uccase_len[1] = hdr.size.len[1] * 3;
+    _uccase_size = total;
+}
+
+/*
+ * Release the case mapping tables, if they are loaded.
+ *
+ * The table lengths are cleared along with the size: the case conversion
+ * functions derive their search bounds from _uccase_len, and stale lengths
+ * would send a later lookup into the memory freed here.
+ */
+static void
+#ifdef __STDC__
+_uccase_unload(void)
+#else
+_uccase_unload()
+#endif
+{
+    if (_uccase_size == 0)
+        return;
+
+    free((char *) _uccase_map);
+    _uccase_map = 0;
+    _uccase_len[0] = _uccase_len[1] = 0;
+    _uccase_size = 0;
+}
+
+/*
+ * Binary search the case mapping elements between indexes `l' and `r' for
+ * a node whose first value is `code'.
+ *
+ * Returns element number `field' of the matching node, or `code' itself
+ * when no node matches.
+ */
+static unsigned long
+#ifdef __STDC__
+_uccase_lookup(unsigned long code, long l, long r, int field)
+#else
+_uccase_lookup(code, l, r, field)
+unsigned long code;
+long l, r;
+int field;
+#endif
+{
+    long mid;
+    unsigned long key;
+
+    while (l <= r) {
+        /*
+         * Take the middle and round it down to the first element of a
+         * case mapping triple.
+         */
+        mid = (l + r) >> 1;
+        mid -= (mid % 3);
+        key = _uccase_map[mid];
+
+        if (code > key)
+            l = mid + 3;
+        else if (code < key)
+            r = mid - 3;
+        else
+            return _uccase_map[mid + field];
+    }
+
+    /*
+     * No mapping: the character maps to itself.
+     */
+    return code;
+}
+
+/*
+ * Return the upper case mapping of `code', or `code' itself if it is
+ * already upper case or has no mapping.
+ */
+unsigned long
+#ifdef __STDC__
+uctoupper(unsigned long code)
+#else
+uctoupper(code)
+unsigned long code;
+#endif
+{
+    int field;
+    long l, r;
+
+    if (ucisupper(code))
+        return code;
+
+    if (ucislower(code)) {
+        /*
+         * The character is lower case.  Nodes in the lower case table are
+         * ordered lower, upper, title, so the upper case mapping is element
+         * 1; element 2 is the title case mapping, which differs from the
+         * upper case for characters such as the Latin digraphs.
+         */
+        field = 1;
+        l = _uccase_len[0];
+        r = (l + _uccase_len[1]) - 3;
+    } else {
+        /*
+         * Anything else is looked for in the title case table, whose nodes
+         * are ordered title, upper, lower.
+         */
+        field = 1;
+        l = _uccase_len[0] + _uccase_len[1];
+        r = _uccase_size - 3;
+    }
+    return _uccase_lookup(code, l, r, field);
+}
+
+/*
+ * Return the lower case mapping of `code', or `code' itself if it is
+ * already lower case or has no mapping.
+ */
+unsigned long
+#ifdef __STDC__
+uctolower(unsigned long code)
+#else
+uctolower(code)
+unsigned long code;
+#endif
+{
+    long first, last;
+    int field;
+
+    if (ucislower(code))
+        return code;
+
+    if (!ucisupper(code)) {
+        /*
+         * Not upper case: search the title case table, which follows the
+         * upper and lower case tables.
+         */
+        first = _uccase_len[0] + _uccase_len[1];
+        last = _uccase_size - 3;
+        field = 2;
+    } else {
+        /*
+         * Upper case: search the upper case table at the front.
+         */
+        first = 0;
+        last = _uccase_len[0] - 3;
+        field = 1;
+    }
+
+    return _uccase_lookup(code, first, last, field);
+}
+
+/*
+ * Return the title case mapping of `code', or `code' itself if it is
+ * already title case or has no mapping.
+ */
+unsigned long
+#ifdef __STDC__
+uctotitle(unsigned long code)
+#else
+uctotitle(code)
+unsigned long code;
+#endif
+{
+    long first, last;
+
+    if (ucistitle(code))
+        return code;
+
+    if (!ucisupper(code)) {
+        /*
+         * Not upper case: search the lower case table, which comes right
+         * after the upper case table.
+         */
+        first = _uccase_len[0];
+        last = (first + _uccase_len[1]) - 3;
+    } else {
+        /*
+         * Upper case: search the upper case table at the front.
+         */
+        first = 0;
+        last = _uccase_len[0] - 3;
+    }
+
+    /*
+     * The title case mapping is element 2 of the node in both tables.
+     */
+    return _uccase_lookup(code, first, last, 2);
+}
+
+/**************************************************************************
+ *
+ * Support for decompositions.
+ *
+ **************************************************************************/
+
+/* Number of code/index elements in _ucdcmp_nodes (two per node); 0 when unloaded. */
+static unsigned long _ucdcmp_size;
+/* Code and index pairs, followed by one extra trailing element. */
+static unsigned long *_ucdcmp_nodes;
+/* Decomposition code lists; starts right after the trailing element of the nodes. */
+static unsigned long *_ucdcmp_decomp;
+
+/*
+ * Load the decomposition table from "decomp.dat", searched for in the colon
+ * separated directory list `paths'.
+ *
+ * If a table is already loaded it is kept unless `reload' is non-zero.  The
+ * file is always closed again, and when it is missing, truncated,
+ * inconsistent with its header or memory runs out, nothing is left loaded.
+ */
+static void
+#ifdef __STDC__
+_ucdcmp_load(char *paths, int reload)
+#else
+_ucdcmp_load(paths, reload)
+char *paths;
+int reload;
+#endif
+{
+    FILE *in;
+    unsigned long size, npairs, i;
+    unsigned long *nodes;
+    _ucheader_t hdr;
+
+    if (_ucdcmp_size > 0) {
+        if (!reload)
+            /*
+             * The decompositions have already been loaded.
+             */
+            return;
+
+        free((char *) _ucdcmp_nodes);
+        _ucdcmp_size = 0;
+    }
+
+    if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0)
+        return;
+
+    /*
+     * Load the header.
+     */
+    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
+        fclose(in);
+        return;
+    }
+
+    if (hdr.bom == 0xfffe) {
+        hdr.cnt = endian_short(hdr.cnt);
+        hdr.size.bytes = endian_long(hdr.size.bytes);
+    }
+
+    /*
+     * The data must hold at least the code/index pairs and the extra
+     * element that follows them.
+     */
+    npairs = (unsigned long) hdr.cnt << 1;
+    size = hdr.size.bytes / sizeof(unsigned long);
+    if (npairs == 0 || size < npairs + 1) {
+        fclose(in);
+        return;
+    }
+
+    nodes = (unsigned long *) malloc(hdr.size.bytes);
+    if (nodes == 0) {
+        fclose(in);
+        return;
+    }
+
+    /*
+     * Read the decomposition data in.
+     */
+    if (fread((char *) nodes, sizeof(unsigned long), size, in) != size) {
+        free((char *) nodes);
+        fclose(in);
+        return;
+    }
+
+    fclose(in);
+
+    /*
+     * Do an endian swap if necessary.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i < size; i++)
+            nodes[i] = endian_long(nodes[i]);
+    }
+
+    /*
+     * Publish the table only once it is complete.
+     */
+    _ucdcmp_nodes = nodes;
+    _ucdcmp_decomp = nodes + (npairs + 1);
+    _ucdcmp_size = npairs;
+}
+
+/*
+ * Release the decomposition table, if one is loaded.
+ */
+static void
+#ifdef __STDC__
+_ucdcmp_unload(void)
+#else
+_ucdcmp_unload()
+#endif
+{
+    /*
+     * The nodes and the decomposition lists share one allocation, so
+     * freeing the nodes pointer releases both.
+     */
+    if (_ucdcmp_size != 0) {
+        free((char *) _ucdcmp_nodes);
+        _ucdcmp_size = 0;
+    }
+}
+
+/*
+ * Look up the decomposition of `code'.
+ *
+ * code   - character to decompose
+ * num    - receives the number of codes in the decomposition
+ * decomp - receives a pointer to the first code of the decomposition; the
+ *          memory belongs to the loaded table and must not be freed
+ *
+ * Returns 1 and fills in `num' and `decomp' if the character has an entry,
+ * 0 if it has none or no decomposition table is loaded.
+ */
+int
+#ifdef __STDC__
+ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
+#else
+ucdecomp(code, num, decomp)
+unsigned long code, *num, **decomp;
+#endif
+{
+    long l, r, m;
+    unsigned long end;
+
+    /*
+     * Nothing is loaded, so there is nothing to search.
+     */
+    if (_ucdcmp_size == 0)
+        return 0;
+
+    /*
+     * Search the code/index pairs only.  The element stored after the last
+     * pair is an index into the decomposition lists, not a bound on the
+     * node array, so it must not be used as the upper search limit.
+     */
+    l = 0;
+    r = _ucdcmp_size - 2;
+
+    while (l <= r) {
+        /*
+         * Determine a "mid" point and adjust to make sure the mid point is
+         * at the beginning of a code+offset pair.
+         */
+        m = (l + r) >> 1;
+        m -= (m & 1);
+        if (code > _ucdcmp_nodes[m])
+            l = m + 2;
+        else if (code < _ucdcmp_nodes[m])
+            r = m - 2;
+        else {
+            /*
+             * The list ends where the next one starts.  For the last pair
+             * that position is held in the single element following it
+             * rather than in the index slot of another pair.
+             */
+            if ((unsigned long) m + 2 < _ucdcmp_size)
+                end = _ucdcmp_nodes[m + 3];
+            else
+                end = _ucdcmp_nodes[m + 2];
+
+            *num = end - _ucdcmp_nodes[m + 1];
+            *decomp = &_ucdcmp_decomp[_ucdcmp_nodes[m + 1]];
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Decompose a Hangul syllable arithmetically.
+ *
+ * code   - character to decompose
+ * num    - receives the number of entries of `decomp' that are meaningful
+ *          (2 or 3)
+ * decomp - caller supplied array of at least three elements; all three are
+ *          written
+ *
+ * Returns 0 without touching `num' or `decomp' when ucishangul() rejects
+ * the character, 1 otherwise.
+ */
+int
+#ifdef __STDC__
+ucdecomp_hangul(unsigned long code, unsigned long *num, unsigned long decomp[])
+#else
+ucdecomp_hangul(code, num, decomp)
+unsigned long code, *num, decomp[];
+#endif
+{
+    unsigned long syllable;
+
+    if (!ucishangul(code))
+        return 0;
+
+    syllable = code - 0xac00;
+
+    decomp[0] = 0x1100 + syllable / 588;
+    decomp[1] = 0x1161 + (syllable % 588) / 28;
+    decomp[2] = 0x11a7 + syllable % 28;
+
+    /*
+     * A third element equal to 0x11a7 means there is no trailing part, so
+     * only the first two elements count.
+     */
+    if (decomp[2] != 0x11a7)
+        *num = 3;
+    else
+        *num = 2;
+
+    return 1;
+}
+
+/**************************************************************************
+ *
+ * Support for combining classes.
+ *
+ **************************************************************************/
+
+/* Number of elements in _uccmcl_nodes (three per node); 0 when unloaded. */
+static unsigned long _uccmcl_size;
+/* Triples of range start, range end and combining class. */
+static unsigned long *_uccmcl_nodes;
+
+/*
+ * Load the combining class table from "cmbcl.dat", searched for in the
+ * colon separated directory list `paths'.
+ *
+ * If a table is already loaded it is kept unless `reload' is non-zero.  The
+ * file is always closed again, and when it is missing, truncated,
+ * inconsistent with its header or memory runs out, nothing is left loaded.
+ */
+static void
+#ifdef __STDC__
+_uccmcl_load(char *paths, int reload)
+#else
+_uccmcl_load(paths, reload)
+char *paths;
+int reload;
+#endif
+{
+    FILE *in;
+    unsigned long i, total;
+    unsigned long *nodes;
+    _ucheader_t hdr;
+
+    if (_uccmcl_size > 0) {
+        if (!reload)
+            /*
+             * The combining classes have already been loaded.
+             */
+            return;
+
+        free((char *) _uccmcl_nodes);
+        _uccmcl_size = 0;
+    }
+
+    if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0)
+        return;
+
+    /*
+     * Load the header.
+     */
+    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
+        fclose(in);
+        return;
+    }
+
+    if (hdr.bom == 0xfffe) {
+        hdr.cnt = endian_short(hdr.cnt);
+        hdr.size.bytes = endian_long(hdr.size.bytes);
+    }
+
+    /*
+     * Every node is made up of three values, and all of them have to fit
+     * in the number of bytes that is about to be allocated.
+     */
+    total = (unsigned long) hdr.cnt * 3;
+    if (total == 0 || hdr.size.bytes < total * sizeof(unsigned long)) {
+        fclose(in);
+        return;
+    }
+
+    nodes = (unsigned long *) malloc(hdr.size.bytes);
+    if (nodes == 0) {
+        fclose(in);
+        return;
+    }
+
+    /*
+     * Read the combining classes in.
+     */
+    if (fread((char *) nodes, sizeof(unsigned long), total, in) != total) {
+        free((char *) nodes);
+        fclose(in);
+        return;
+    }
+
+    fclose(in);
+
+    /*
+     * Do an endian swap if necessary.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i < total; i++)
+            nodes[i] = endian_long(nodes[i]);
+    }
+
+    /*
+     * Publish the table only once it is complete.
+     */
+    _uccmcl_nodes = nodes;
+    _uccmcl_size = total;
+}
+
+/*
+ * Release the combining class table, if one is loaded.
+ */
+static void
+#ifdef __STDC__
+_uccmcl_unload(void)
+#else
+_uccmcl_unload()
+#endif
+{
+    if (_uccmcl_size != 0) {
+        free((char *) _uccmcl_nodes);
+        _uccmcl_size = 0;
+    }
+}
+
+/*
+ * Return the combining class recorded for the range that contains `code',
+ * or 0 when no loaded range contains it.
+ */
+unsigned long
+#ifdef __STDC__
+uccombining_class(unsigned long code)
+#else
+uccombining_class(code)
+unsigned long code;
+#endif
+{
+    long lo, hi, mid;
+
+    lo = 0;
+    hi = _uccmcl_size - 1;
+
+    while (lo <= hi) {
+        /*
+         * Take the middle and round it down to the first element of a
+         * start/end/class triple.
+         */
+        mid = (lo + hi) >> 1;
+        mid -= (mid % 3);
+
+        if (code > _uccmcl_nodes[mid + 1])
+            lo = mid + 3;
+        else if (code < _uccmcl_nodes[mid])
+            hi = mid - 3;
+        else
+            return _uccmcl_nodes[mid + 2];
+    }
+
+    return 0;
+}
+
+/**************************************************************************
+ *
+ * Support for numeric values.
+ *
+ **************************************************************************/
+
+/* Pairs of character code and index into _ucnum_vals. */
+static unsigned long *_ucnum_nodes;
+/* Number of elements in _ucnum_nodes (two per character); 0 when unloaded. */
+static unsigned long _ucnum_size;
+/* Numerator/denominator pairs; points into the same allocation as the nodes. */
+static short *_ucnum_vals;
+
+/*
+ * Load the numeric value table from "num.dat", searched for in the colon
+ * separated directory list `paths'.
+ *
+ * If a table is already loaded it is kept unless `reload' is non-zero.  The
+ * file is always closed again, and when it is missing, truncated,
+ * inconsistent with its header or memory runs out, nothing is left loaded.
+ */
+static void
+#ifdef __STDC__
+_ucnumb_load(char *paths, int reload)
+#else
+_ucnumb_load(paths, reload)
+char *paths;
+int reload;
+#endif
+{
+    FILE *in;
+    unsigned long nvals, nodebytes, i;
+    unsigned long *nodes;
+    unsigned short swapped;
+    short *vals;
+    _ucheader_t hdr;
+
+    if (_ucnum_size > 0) {
+        if (!reload)
+            /*
+             * The numbers have already been loaded.
+             */
+            return;
+
+        free((char *) _ucnum_nodes);
+        _ucnum_size = 0;
+    }
+
+    if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0)
+        return;
+
+    /*
+     * Load the header.
+     */
+    if (fread((char *) &hdr, sizeof(_ucheader_t), 1, in) != 1) {
+        fclose(in);
+        return;
+    }
+
+    if (hdr.bom == 0xfffe) {
+        hdr.cnt = endian_short(hdr.cnt);
+        hdr.size.bytes = endian_long(hdr.size.bytes);
+    }
+
+    /*
+     * The header count is the number of elements in the node array, all of
+     * which must fit in the byte count given by the header.
+     */
+    nodebytes = (unsigned long) hdr.cnt * sizeof(unsigned long);
+    if (hdr.cnt == 0 || hdr.size.bytes < nodebytes) {
+        fclose(in);
+        return;
+    }
+
+    nodes = (unsigned long *) malloc(hdr.size.bytes);
+    if (nodes == 0) {
+        fclose(in);
+        return;
+    }
+    vals = (short *) (nodes + hdr.cnt);
+
+    /*
+     * Read the nodes and the values in as one block.
+     */
+    if (fread((char *) nodes, sizeof(unsigned char), hdr.size.bytes, in) !=
+        hdr.size.bytes) {
+        free((char *) nodes);
+        fclose(in);
+        return;
+    }
+
+    fclose(in);
+
+    /*
+     * Do an endian swap if necessary.
+     */
+    if (hdr.bom == 0xfffe) {
+        for (i = 0; i < hdr.cnt; i++)
+            nodes[i] = endian_long(nodes[i]);
+
+        /*
+         * The values take up whatever follows the node array.
+         */
+        nvals = (hdr.size.bytes - nodebytes) / sizeof(short);
+
+        /*
+         * Swap through an unsigned copy so that a value with its top bit
+         * set is not sign extended while it is being shifted.
+         */
+        for (i = 0; i < nvals; i++) {
+            swapped = (unsigned short) vals[i];
+            vals[i] = (short) endian_short(swapped);
+        }
+    }
+
+    /*
+     * Publish the table only once it is complete.
+     */
+    _ucnum_nodes = nodes;
+    _ucnum_vals = vals;
+    _ucnum_size = hdr.cnt;
+}
+
+/*
+ * Release the numeric value table, if one is loaded.
+ */
+static void
+#ifdef __STDC__
+_ucnumb_unload(void)
+#else
+_ucnumb_unload()
+#endif
+{
+    if (_ucnum_size != 0) {
+        free((char *) _ucnum_nodes);
+        _ucnum_size = 0;
+    }
+}
+
+/*
+ * Look up the numeric value of `code'.
+ *
+ * Returns 1 and stores the numerator and denominator in `num' if the
+ * character is in the loaded number table; returns 0 and leaves `num'
+ * untouched otherwise.
+ */
+int
+#ifdef __STDC__
+ucnumber_lookup(unsigned long code, struct ucnumber *num)
+#else
+ucnumber_lookup(code, num)
+unsigned long code;
+struct ucnumber *num;
+#endif
+{
+    long lo, hi, mid;
+    short *pair;
+
+    lo = 0;
+    hi = _ucnum_size - 1;
+
+    while (lo <= hi) {
+        /*
+         * Take the middle and round it down to the first element of a
+         * code+offset pair.
+         */
+        mid = (lo + hi) >> 1;
+        mid -= (mid & 1);
+
+        if (code > _ucnum_nodes[mid]) {
+            lo = mid + 2;
+        } else if (code < _ucnum_nodes[mid]) {
+            hi = mid - 2;
+        } else {
+            pair = _ucnum_vals + _ucnum_nodes[mid + 1];
+            num->numerator = (int) pair[0];
+            num->denominator = (int) pair[1];
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Look up the value of `code' when that value is stored with equal
+ * numerator and denominator.
+ *
+ * Returns 1 and stores the value in `digit' in that case.  Returns 0 and
+ * leaves `digit' untouched when the character is not in the loaded number
+ * table or its numerator and denominator differ.
+ */
+int
+#ifdef __STDC__
+ucdigit_lookup(unsigned long code, int *digit)
+#else
+ucdigit_lookup(code, digit)
+unsigned long code;
+int *digit;
+#endif
+{
+    long lo, hi, mid;
+    short *pair;
+
+    lo = 0;
+    hi = _ucnum_size - 1;
+
+    while (lo <= hi) {
+        /*
+         * Take the middle and round it down to the first element of a
+         * code+offset pair.
+         */
+        mid = (lo + hi) >> 1;
+        mid -= (mid & 1);
+
+        if (code > _ucnum_nodes[mid]) {
+            lo = mid + 2;
+        } else if (code < _ucnum_nodes[mid]) {
+            hi = mid - 2;
+        } else {
+            pair = _ucnum_vals + _ucnum_nodes[mid + 1];
+            if (pair[0] != pair[1])
+                return 0;
+            *digit = pair[0];
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Return the numeric value of `code' by value.
+ *
+ * There is no separate success indication: when the character is not in the
+ * number table both fields of the result keep the arbitrary placeholder
+ * value -111.  Callers that need to know for sure should test the character
+ * with ucisnumber() first.
+ */
+struct ucnumber
+#ifdef __STDC__
+ucgetnumber(unsigned long code)
+#else
+ucgetnumber(code)
+unsigned long code;
+#endif
+{
+    struct ucnumber result;
+
+    result.denominator = -111;
+    result.numerator = -111;
+
+    (void) ucnumber_lookup(code, &result);
+
+    return result;
+}
+
+/*
+ * Return the digit value of `code'.
+ *
+ * There is no separate success indication: when ucdigit_lookup() finds
+ * nothing the arbitrary placeholder value -111 is returned.  Callers that
+ * need to know for sure should test the character with ucisdigit() first.
+ */
+int
+#ifdef __STDC__
+ucgetdigit(unsigned long code)
+#else
+ucgetdigit(code)
+unsigned long code;
+#endif
+{
+    int value = -111;
+
+    (void) ucdigit_lookup(code, &value);
+
+    return value;
+}
+
+/**************************************************************************
+ *
+ * Setup and cleanup routines.
+ *
+ **************************************************************************/
+
+/*
+ * Load every data table selected by a UCDATA_* bit in `masks' from the
+ * colon separated directory list `paths'.  Tables that are already loaded
+ * are left as they are.
+ */
+void
+#ifdef __STDC__
+ucdata_load(char *paths, int masks)
+#else
+ucdata_load(paths, masks)
+char *paths;
+int masks;
+#endif
+{
+    int reload = 0;
+
+    if ((masks & UCDATA_CTYPE) != 0) {
+        _ucprop_load(paths, reload);
+    }
+    if ((masks & UCDATA_CASE) != 0) {
+        _uccase_load(paths, reload);
+    }
+    if ((masks & UCDATA_DECOMP) != 0) {
+        _ucdcmp_load(paths, reload);
+    }
+    if ((masks & UCDATA_CMBCL) != 0) {
+        _uccmcl_load(paths, reload);
+    }
+    if ((masks & UCDATA_NUM) != 0) {
+        _ucnumb_load(paths, reload);
+    }
+}
+
+/*
+ * Release every data table selected by a UCDATA_* bit in `masks'.
+ */
+void
+#ifdef __STDC__
+ucdata_unload(int masks)
+#else
+ucdata_unload(masks)
+int masks;
+#endif
+{
+    if ((masks & UCDATA_CTYPE) != 0) {
+        _ucprop_unload();
+    }
+    if ((masks & UCDATA_CASE) != 0) {
+        _uccase_unload();
+    }
+    if ((masks & UCDATA_DECOMP) != 0) {
+        _ucdcmp_unload();
+    }
+    if ((masks & UCDATA_CMBCL) != 0) {
+        _uccmcl_unload();
+    }
+    if ((masks & UCDATA_NUM) != 0) {
+        _ucnumb_unload();
+    }
+}
+
+/*
+ * Load every data table selected by a UCDATA_* bit in `masks' from the
+ * colon separated directory list `paths', asking each loader to replace a
+ * table that is already loaded.
+ */
+void
+#ifdef __STDC__
+ucdata_reload(char *paths, int masks)
+#else
+ucdata_reload(paths, masks)
+char *paths;
+int masks;
+#endif
+{
+    int reload = 1;
+
+    if ((masks & UCDATA_CTYPE) != 0) {
+        _ucprop_load(paths, reload);
+    }
+    if ((masks & UCDATA_CASE) != 0) {
+        _uccase_load(paths, reload);
+    }
+    if ((masks & UCDATA_DECOMP) != 0) {
+        _ucdcmp_load(paths, reload);
+    }
+    if ((masks & UCDATA_CMBCL) != 0) {
+        _uccmcl_load(paths, reload);
+    }
+    if ((masks & UCDATA_NUM) != 0) {
+        _ucnumb_load(paths, reload);
+    }
+}
+
+#ifdef TEST
+
+/*
+ * Print "<label>: <name> = n/d", or "<label>: <name> = n" when the
+ * numerator and the denominator are equal.
+ */
+static void
+#ifdef __STDC__
+_test_print_number(char *label, char *name, struct ucnumber *num)
+#else
+_test_print_number(label, name, num)
+char *label, *name;
+struct ucnumber *num;
+#endif
+{
+    if (num->numerator != num->denominator)
+        printf("%s: %s = %d/%d\n", label, name, num->numerator,
+               num->denominator);
+    else
+        printf("%s: %s = %d\n", label, name, num->numerator);
+}
+
+/*
+ * Exercise ucnumber_lookup() for one character and report the result.
+ */
+static void
+#ifdef __STDC__
+_test_number_lookup(unsigned long code, char *name)
+#else
+_test_number_lookup(code, name)
+unsigned long code;
+char *name;
+#endif
+{
+    struct ucnumber num;
+
+    if (ucnumber_lookup(code, &num))
+        _test_print_number("UCNUMBER", name, &num);
+    else
+        printf("UCNUMBER: %s NOT A NUMBER\n", name);
+}
+
+/*
+ * Exercise ucgetnumber() for one character and report the result.
+ */
+static void
+#ifdef __STDC__
+_test_getnumber(unsigned long code, char *name)
+#else
+_test_getnumber(code, name)
+unsigned long code;
+char *name;
+#endif
+{
+    struct ucnumber num;
+
+    num = ucgetnumber(code);
+    _test_print_number("UCGETNUMBER", name, &num);
+}
+
+/*
+ * Small smoke test: loads the data files from the current directory and
+ * prints the results of a handful of property, case, decomposition,
+ * combining class and number queries.
+ */
+int
+#ifdef __STDC__
+main(void)
+#else
+main()
+#endif
+{
+    int dig;
+    unsigned long i, lo, *dec;
+
+    ucdata_setup(".");
+
+    if (ucisweak(0x30))
+        printf("WEAK\n");
+    else
+        printf("NOT WEAK\n");
+
+    printf("LOWER 0x%04lX\n", uctolower(0xff3a));
+    printf("UPPER 0x%04lX\n", uctoupper(0xff5a));
+
+    if (ucisalpha(0x1d5))
+        printf("ALPHA\n");
+    else
+        printf("NOT ALPHA\n");
+
+    if (ucisupper(0x1d5)) {
+        printf("UPPER\n");
+        lo = uctolower(0x1d5);
+        printf("0x%04lx\n", lo);
+        lo = uctotitle(0x1d5);
+        printf("0x%04lx\n", lo);
+    } else
+        printf("NOT UPPER\n");
+
+    if (ucistitle(0x1d5))
+        printf("TITLE\n");
+    else
+        printf("NOT TITLE\n");
+
+    if (uciscomposite(0x1d5))
+        printf("COMPOSITE\n");
+    else
+        printf("NOT COMPOSITE\n");
+
+    if (ucdecomp(0x1d5, &lo, &dec)) {
+        for (i = 0; i < lo; i++)
+            printf("0x%04lx ", dec[i]);
+        putchar('\n');
+    }
+
+    /*
+     * The combining class is an unsigned long, so print it as one.
+     */
+    if ((lo = uccombining_class(0x41)) != 0)
+        printf("0x41 CCL %lu\n", lo);
+
+    if (ucisxdigit(0xfeff))
+        printf("0xFEFF HEX DIGIT\n");
+    else
+        printf("0xFEFF NOT HEX DIGIT\n");
+
+    if (ucisdefined(0x10000))
+        printf("0x10000 DEFINED\n");
+    else
+        printf("0x10000 NOT DEFINED\n");
+
+    _test_number_lookup(0x30, "0x30");
+    _test_number_lookup(0xbc, "0xbc");
+    _test_number_lookup(0xff19, "0xff19");
+    _test_number_lookup(0x4e00, "0x4e00");
+
+    if (ucdigit_lookup(0x06f9, &dig))
+        printf("UCDIGIT: 0x6f9 = %d\n", dig);
+    else
+        printf("UCDIGIT: 0x6f9 NOT A NUMBER\n");
+
+    dig = ucgetdigit(0x0969);
+    printf("UCGETDIGIT: 0x969 = %d\n", dig);
+
+    _test_getnumber(0x30, "0x30");
+    _test_getnumber(0xbc, "0xbc");
+    _test_getnumber(0xff19, "0xff19");
+
+    ucdata_cleanup();
+    return 0;
+}
+
+#endif /* TEST */
--- /dev/null
+/*
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _h_ucdata
+#define _h_ucdata
+
+/*
+ * $Id: ucdata.h,v 1.5 1999/11/19 15:24:29 mleisher Exp $
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Prototype wrapper: under an ANSI compiler __(x) expands to the parameter
+ * list x, otherwise to an empty pair of parentheses, so one declaration
+ * serves both kinds of compiler.
+ */
+#undef __
+#ifdef __STDC__
+#define __(x) x
+#else
+#define __(x) ()
+#endif
+
+/* Version string of this package. */
+#define UCDATA_VERSION "2.3"
+
+/**************************************************************************
+ *
+ * Masks and macros for character properties.
+ *
+ **************************************************************************/
+
+/*
+ * Values that can appear in the `mask1' parameter of the ucisprop()
+ * function.  Bit number i of `mask1' selects property number i (0 to 31)
+ * of the loaded property table.
+ */
+#define UC_MN 0x00000001 /* Mark, Non-Spacing */
+#define UC_MC 0x00000002 /* Mark, Spacing Combining */
+#define UC_ME 0x00000004 /* Mark, Enclosing */
+#define UC_ND 0x00000008 /* Number, Decimal Digit */
+#define UC_NL 0x00000010 /* Number, Letter */
+#define UC_NO 0x00000020 /* Number, Other */
+#define UC_ZS 0x00000040 /* Separator, Space */
+#define UC_ZL 0x00000080 /* Separator, Line */
+#define UC_ZP 0x00000100 /* Separator, Paragraph */
+#define UC_CC 0x00000200 /* Other, Control */
+#define UC_CF 0x00000400 /* Other, Format */
+#define UC_OS 0x00000800 /* Other, Surrogate */
+#define UC_CO 0x00001000 /* Other, Private Use */
+#define UC_CN 0x00002000 /* Other, Not Assigned */
+#define UC_LU 0x00004000 /* Letter, Uppercase */
+#define UC_LL 0x00008000 /* Letter, Lowercase */
+#define UC_LT 0x00010000 /* Letter, Titlecase */
+#define UC_LM 0x00020000 /* Letter, Modifier */
+#define UC_LO 0x00040000 /* Letter, Other */
+#define UC_PC 0x00080000 /* Punctuation, Connector */
+#define UC_PD 0x00100000 /* Punctuation, Dash */
+#define UC_PS 0x00200000 /* Punctuation, Open */
+#define UC_PE 0x00400000 /* Punctuation, Close */
+#define UC_PO 0x00800000 /* Punctuation, Other */
+#define UC_SM 0x01000000 /* Symbol, Math */
+#define UC_SC 0x02000000 /* Symbol, Currency */
+#define UC_SK 0x04000000 /* Symbol, Modifier */
+#define UC_SO 0x08000000 /* Symbol, Other */
+#define UC_L 0x10000000 /* Left-To-Right */
+#define UC_R 0x20000000 /* Right-To-Left */
+#define UC_EN 0x40000000 /* European Number */
+#define UC_ES 0x80000000 /* European Number Separator */
+
+/*
+ * Values that can appear in the `mask2' parameter of the ucisprop()
+ * function.  Bit number i of `mask2' selects property number 32 + i of the
+ * loaded property table.
+ */
+#define UC_ET 0x00000001 /* European Number Terminator */
+#define UC_AN 0x00000002 /* Arabic Number */
+#define UC_CS 0x00000004 /* Common Number Separator */
+#define UC_B 0x00000008 /* Block Separator */
+#define UC_S 0x00000010 /* Segment Separator */
+#define UC_WS 0x00000020 /* Whitespace */
+#define UC_ON 0x00000040 /* Other Neutrals */
+/*
+ * Implementation specific character properties.
+ */
+#define UC_CM 0x00000080 /* Composite */
+#define UC_NB 0x00000100 /* Non-Breaking */
+#define UC_SY 0x00000200 /* Symmetric */
+#define UC_HD 0x00000400 /* Hex Digit */
+#define UC_QM 0x00000800 /* Quote Mark */
+#define UC_MR 0x00001000 /* Mirroring */
+#define UC_SS 0x00002000 /* Space, other */
+
+#define UC_CP 0x00004000 /* Defined */
+
+/*
+ * Added for UnicodeData-2.1.3.
+ */
+#define UC_PI 0x00008000 /* Punctuation, Initial */
+#define UC_PF 0x00010000 /* Punctuation, Final */
+
+/*
+ * This is the primary function for testing to see if a character has some set
+ * of properties. The macros that test for various character properties all
+ * call this function with some set of masks.
+ */
+extern int ucisprop __((unsigned long code, unsigned long mask1,
+ unsigned long mask2));
+
+#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
+#define ucisdigit(cc) ucisprop(cc, UC_ND, 0)
+#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
+#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0)
+#define ucisspace(cc) ucisprop(cc, UC_ZS|UC_SS, 0)
+#define ucisblank(cc) ucisprop(cc, UC_ZS, 0)
+#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
+/*
+ * Graphic characters: marks, numbers, letters, punctuation and symbols.
+ */
+#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
+                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
+                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
+                               UC_SO, UC_PI|UC_PF)
+/*
+ * Printable characters: the graphic set plus space separators (UC_ZS).
+ */
+#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
+                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
+                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
+                               UC_SO|UC_ZS, UC_PI|UC_PF)
+#define ucisupper(cc) ucisprop(cc, UC_LU, 0)
+#define ucislower(cc) ucisprop(cc, UC_LL, 0)
+#define ucistitle(cc) ucisprop(cc, UC_LT, 0)
+#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD)
+
+#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0)
+#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0)
+
+#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0)
+#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0)
+#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0)
+#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0)
+#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0)
+#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI)
+#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF)
+
+#define uciscomposite(cc) ucisprop(cc, 0, UC_CM)
+#define ucishex(cc) ucisprop(cc, 0, UC_HD)
+#define ucisquote(cc) ucisprop(cc, 0, UC_QM)
+#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY)
+#define ucismirroring(cc) ucisprop(cc, 0, UC_MR)
+#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB)
+
+/*
+ * Directionality macros.
+ */
+#define ucisrtl(cc) ucisprop(cc, UC_R, 0)
+#define ucisltr(cc) ucisprop(cc, UC_L, 0)
+#define ucisstrong(cc) ucisprop(cc, UC_L|UC_R, 0)
+#define ucisweak(cc) ucisprop(cc, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS)
+#define ucisneutral(cc) ucisprop(cc, 0, UC_B|UC_S|UC_WS|UC_ON)
+#define ucisseparator(cc) ucisprop(cc, 0, UC_B|UC_S)
+
+/*
+ * Other macros inspired by John Cowan.
+ */
+#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
+#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
+#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
+#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
+#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
+#define ucismath(cc) ucisprop(cc, UC_SM, 0)
+#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
+#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
+#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
+#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
+#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
+#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
+#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
+#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
+#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)
+
+#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
+#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
+ UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
+
+/*
+ * Test the implementation specific "defined" (UC_CP) property.  The negated
+ * form is fully parenthesized so it expands safely inside any expression.
+ */
+#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
+#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
+
+/*
+ * Other miscellaneous character property macros.
+ *
+ * These are plain range checks that do not call ucisprop().  They evaluate
+ * `cc' more than once, so the argument must not have side effects.
+ */
+#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
+ ((cc) >= 0xf900 && (cc) <= 0xfaff))
+#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7ff)
+
+/**************************************************************************
+ *
+ * Functions for case conversion.
+ *
+ **************************************************************************/
+
+extern unsigned long uctoupper __((unsigned long code));
+extern unsigned long uctolower __((unsigned long code));
+extern unsigned long uctotitle __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for getting decompositions.
+ *
+ **************************************************************************/
+
+/*
+ * This routine determines if the code has a decomposition. If it returns 0,
+ * there is no decomposition. Any other value indicates a decomposition was
+ * returned.
+ */
+extern int ucdecomp __((unsigned long code, unsigned long *num,
+
+ unsigned long **decomp));
+
+/*
+ * If the code is a Hangul syllable, this routine decomposes it into the array
+ * passed. The array size should be at least 3.
+ */
+extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
+ unsigned long decomp[]));
+
+/**************************************************************************
+ *
+ * Functions for getting combining classes.
+ *
+ **************************************************************************/
+
+/*
+ * This will return the combining class for a character to be used with the
+ * Canonical Ordering algorithm.
+ */
+extern unsigned long uccombining_class __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for getting numbers and digits.
+ *
+ **************************************************************************/
+
+/*
+ * Numeric value of a character as filled in by ucnumber_lookup() and
+ * returned by ucgetnumber().
+ */
+struct ucnumber {
+ int numerator;
+ int denominator;
+};
+
+extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
+extern int ucdigit_lookup __((unsigned long code, int *digit));
+
+/*
+ * For compatibility with John Cowan's "uctype" package.
+ */
+extern struct ucnumber ucgetnumber __((unsigned long code));
+extern int ucgetdigit __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for library initialization and cleanup.
+ *
+ **************************************************************************/
+
+/*
+ * Macros for specifying the data tables to be loaded, unloaded, or reloaded
+ * by the ucdata_load(), ucdata_unload(), and ucdata_reload() routines.
+ */
+#define UCDATA_CASE 0x01
+#define UCDATA_CTYPE 0x02
+#define UCDATA_DECOMP 0x04
+#define UCDATA_CMBCL 0x08
+#define UCDATA_NUM 0x10
+
+#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
+ UCDATA_CMBCL|UCDATA_NUM)
+
+/*
+ * Functions to load, unload, and reload specific data files.
+ */
+extern void ucdata_load __((char *paths, int mask));
+extern void ucdata_unload __((int mask));
+extern void ucdata_reload __((char *paths, int mask));
+
+/*
+ * Deprecated functions, now just compatibility macros.
+ */
+#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
+#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
+
+#undef __
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_ucdata */
--- /dev/null
+.\"
+.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $
+.\"
+.TH ucdata 3 "19 November 1999"
+.SH NAME
+ucdata \- package for providing Unicode/ISO10646 character information
+
+.SH SYNOPSIS
+#include <ucdata.h>
+.sp
+void ucdata_load(char * paths, int masks)
+.sp
+void ucdata_unload(int masks)
+.sp
+void ucdata_reload(char * paths, int masks)
+.sp
+int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
+.sp
+int ucdecomp_hangul(unsigned long code, unsigned long *num,
+unsigned long decomp[])
+.sp
+.nf
+struct ucnumber {
+ int numerator;
+ int denominator;
+};
+.sp
+int ucnumber_lookup(unsigned long code, struct ucnumber *num)
+.sp
+int ucdigit_lookup(unsigned long code, int *digit)
+.sp
+struct ucnumber ucgetnumber(unsigned long code)
+.sp
+int ucgetdigit(unsigned long code)
+.sp
+unsigned long uctoupper(unsigned long code)
+.sp
+unsigned long uctolower(unsigned long code)
+.sp
+unsigned long uctotitle(unsigned long code)
+.sp
+int ucisalpha(unsigned long code)
+.sp
+int ucisalnum(unsigned long code)
+.sp
+int ucisdigit(unsigned long code)
+.sp
+int uciscntrl(unsigned long code)
+.sp
+int ucisspace(unsigned long code)
+.sp
+int ucisblank(unsigned long code)
+.sp
+int ucispunct(unsigned long code)
+.sp
+int ucisgraph(unsigned long code)
+.sp
+int ucisprint(unsigned long code)
+.sp
+int ucisxdigit(unsigned long code)
+.sp
+int ucisupper(unsigned long code)
+.sp
+int ucislower(unsigned long code)
+.sp
+int ucistitle(unsigned long code)
+.sp
+int ucisisocntrl(unsigned long code)
+.sp
+int ucisfmtcntrl(unsigned long code)
+.sp
+int ucissymbol(unsigned long code)
+.sp
+int ucisnumber(unsigned long code)
+.sp
+int ucisnonspacing(unsigned long code)
+.sp
+int ucisopenpunct(unsigned long code)
+.sp
+int ucisclosepunct(unsigned long code)
+.sp
+int ucisinitialpunct(unsigned long code)
+.sp
+int ucisfinalpunct(unsigned long code)
+.sp
+int uciscomposite(unsigned long code)
+.sp
+int ucisquote(unsigned long code)
+.sp
+int ucissymmetric(unsigned long code)
+.sp
+int ucismirroring(unsigned long code)
+.sp
+int ucisnonbreaking(unsigned long code)
+.sp
+int ucisrtl(unsigned long code)
+.sp
+int ucisltr(unsigned long code)
+.sp
+int ucisstrong(unsigned long code)
+.sp
+int ucisweak(unsigned long code)
+.sp
+int ucisneutral(unsigned long code)
+.sp
+int ucisseparator(unsigned long code)
+.sp
+int ucislsep(unsigned long code)
+.sp
+int ucispsep(unsigned long code)
+.sp
+int ucismark(unsigned long code)
+.sp
+int ucisnsmark(unsigned long code)
+.sp
+int ucisspmark(unsigned long code)
+.sp
+int ucismodif(unsigned long code)
+.sp
+int ucismodifsymbol(unsigned long code)
+.sp
+int ucisletnum(unsigned long code)
+.sp
+int ucisconnect(unsigned long code)
+.sp
+int ucisdash(unsigned long code)
+.sp
+int ucismath(unsigned long code)
+.sp
+int uciscurrency(unsigned long code)
+.sp
+int ucisenclosing(unsigned long code)
+.sp
+int ucisprivate(unsigned long code)
+.sp
+int ucissurrogate(unsigned long code)
+.sp
+int ucisidentstart(unsigned long code)
+.sp
+int ucisidentpart(unsigned long code)
+.sp
+int ucisdefined(unsigned long code)
+.sp
+int ucisundefined(unsigned long code)
+.sp
+int ucishan(unsigned long code)
+.sp
+int ucishangul(unsigned long code)
+
+.SH DESCRIPTION
+.TP 4
+.BR Macros
+.br
+UCDATA_CASE
+.br
+UCDATA_CTYPE
+.br
+UCDATA_DECOMP
+.br
+UCDATA_CMBCL
+.br
+UCDATA_NUM
+.br
+UCDATA_ALL
+.br
+.TP 4
+.BR ucdata_load()
+This function initializes the UCData library by locating the data files in one
+of the colon-separated directories in the `paths' parameter. The data files
+to be loaded are specified in the `masks' parameter as a bitwise combination
+of the macros listed above.
+.sp
+This should be called before using any of the other functions.
+.TP 4
+.BR ucdata_unload()
+This function unloads the data tables specified in the `masks' parameter.
+.sp
+This function should be called when the application is done using the UCData
+package.
+.TP 4
+.BR ucdata_reload()
+This function reloads the data files from one of the colon-separated
+directories in the `paths' parameter. The data files to be reloaded are
+specified in the `masks' parameter as a bitwise combination of the macros
+listed above.
+.TP 4
+.BR ucdecomp()
+This function determines if a character has a decomposition and returns the
+decomposition information if it exists.
+.sp
+If a zero is returned, there is no decomposition. If a non-zero is
+returned, then the `num' and `decomp' variables are filled in with the
+appropriate values.
+.sp
+Example call:
+.sp
+.nf
+ unsigned long i, num, *decomp;
+
+ if (ucdecomp(0x1d5, &num, &decomp) != 0) {
+ for (i = 0; i < num; i++)
+ printf("0x%08lX,", decomp[i]);
+ putchar('\n');
+ }
+.TP 4
+.BR ucdecomp_hangul()
+This function determines if a Hangul syllable has a
+decomposition and returns the decomposition information.
+.sp
+An array of at least size 3 should be passed to the function
+for the decomposition of the syllable.
+.sp
+If a zero is returned, the character is not a Hangul
+syllable. If a non-zero is returned, the `num' field
+will be 2 or 3 and the syllable will be decomposed into
+the `decomp' array arithmetically.
+.sp
+Example call:
+.sp
+.nf
+ unsigned long i, num, decomp[3];
+
+ if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
+ for (i = 0; i < num; i++)
+ printf("0x%08lX,", decomp[i]);
+ putchar('\n');
+ }
+.TP 4
+.BR ucnumber_lookup()
+This function determines if the code is a number and
+fills in the `num' field with the numerator and
+denominator. If the code happens to be a single digit,
+the numerator and denominator fields will be the same.
+.sp
+If the function returns 0, the code is not a number.
+Any other return value means the code is a number.
+.TP 4
+.BR ucdigit_lookup()
+This function determines if the code is a digit and
+fills in the `digit' field with the digit value.
+.sp
+If the function returns 0, the code is not a digit.
+Any other return value means the code is a digit.
+.TP 4
+.BR ucgetnumber()
+This is a compatibility function with John Cowan's
+"uctype" package. It uses ucnumber_lookup().
+.TP 4
+.BR ucgetdigit()
+This is a compatibility function with John Cowan's
+"uctype" package. It uses ucdigit_lookup().
+.TP 4
+.BR uctoupper()
+This function returns the code unchanged if it is
+already upper case or has no upper case equivalent.
+Otherwise the upper case equivalent is returned.
+.TP 4
+.BR uctolower()
+This function returns the code unchanged if it is
+already lower case or has no lower case equivalent.
+Otherwise the lower case equivalent is returned.
+.TP 4
+.BR uctotitle()
+This function returns the code unchanged if it is
+already title case or has no title case equivalent.
+Otherwise the title case equivalent is returned.
+.TP 4
+.BR ucisalpha()
+Test if \fIcode\fR is an alpha character.
+.TP 4
+.BR ucisalnum()
+Test if \fIcode\fR is an alpha or digit character.
+.TP 4
+.BR ucisdigit()
+Test if \fIcode\fR is a digit character.
+.TP 4
+.BR uciscntrl()
+Test if \fIcode\fR is a control character.
+.TP 4
+.BR ucisspace()
+Test if \fIcode\fR is a space character.
+.TP 4
+.BR ucisblank()
+Test if \fIcode\fR is a blank character.
+.TP 4
+.BR ucispunct()
+Test if \fIcode\fR is a punctuation character.
+.TP 4
+.BR ucisgraph()
+Test if \fIcode\fR is a graphical (visible) character.
+.TP 4
+.BR ucisprint()
+Test if \fIcode\fR is a printable character.
+.TP 4
+.BR ucisxdigit()
+Test if \fIcode\fR is a hexadecimal digit character.
+.TP 4
+.BR ucisupper()
+Test if \fIcode\fR is an upper case character.
+.TP 4
+.BR ucislower()
+Test if \fIcode\fR is a lower case character.
+.TP 4
+.BR ucistitle()
+Test if \fIcode\fR is a title case character.
+.TP 4
+.BR ucisisocntrl()
+Is the character a C0 control character (< 32)?
+.TP 4
+.BR ucisfmtcntrl()
+Is the character a format control character?
+.TP 4
+.BR ucissymbol()
+Is the character a symbol?
+.TP 4
+.BR ucisnumber()
+Is the character a number or digit?
+.TP 4
+.BR ucisnonspacing()
+Is the character non-spacing?
+.TP 4
+.BR ucisopenpunct()
+Is the character an open/left punctuation (e.g. '[')?
+.TP 4
+.BR ucisclosepunct()
+Is the character a close/right punctuation (e.g. ']')?
+.TP 4
+.BR ucisinitialpunct()
+Is the character an initial punctuation (e.g. U+2018 LEFT
+SINGLE QUOTATION MARK)?
+.TP 4
+.BR ucisfinalpunct()
+Is the character a final punctuation (e.g. U+2019 RIGHT
+SINGLE QUOTATION MARK)?
+.TP 4
+.BR uciscomposite()
+Can the character be decomposed into a set of other
+characters?
+.TP 4
+.BR ucisquote()
+Is the character one of the many quotation marks?
+.TP 4
+.BR ucissymmetric()
+Is the character one that has an opposite form
+(e.g. <>)?
+.TP 4
+.BR ucismirroring()
+Is the character mirroring (superset of symmetric)?
+.TP 4
+.BR ucisnonbreaking()
+Is the character non-breaking (i.e. non-breaking
+space)?
+.TP 4
+.BR ucisrtl()
+Does the character have strong right-to-left
+directionality (i.e. Arabic letters)?
+.TP 4
+.BR ucisltr()
+Does the character have strong left-to-right
+directionality (i.e. Latin letters)?
+.TP 4
+.BR ucisstrong()
+Does the character have strong directionality?
+.TP 4
+.BR ucisweak()
+Does the character have weak directionality
+(i.e. numbers)?
+.TP 4
+.BR ucisneutral()
+Does the character have neutral directionality
+(i.e. whitespace)?
+.TP 4
+.BR ucisseparator()
+Is the character a block or segment separator?
+.TP 4
+.BR ucislsep()
+Is the character a line separator?
+.TP 4
+.BR ucispsep()
+Is the character a paragraph separator?
+.TP 4
+.BR ucismark()
+Is the character a mark of some kind?
+.TP 4
+.BR ucisnsmark()
+Is the character a non-spacing mark?
+.TP 4
+.BR ucisspmark()
+Is the character a spacing mark?
+.TP 4
+.BR ucismodif()
+Is the character a modifier letter?
+.TP 4
+.BR ucismodifsymbol()
+Is the character a modifier symbol?
+.TP 4
+.BR ucisletnum()
+Is the character a number represented by a letter?
+.TP 4
+.BR ucisconnect()
+Is the character connecting punctuation?
+.TP 4
+.BR ucisdash()
+Is the character dash punctuation?
+.TP 4
+.BR ucismath()
+Is the character a math character?
+.TP 4
+.BR uciscurrency()
+Is the character a currency character?
+.TP 4
+.BR ucisenclosing()
+Is the character enclosing (i.e. enclosing box)?
+.TP 4
+.BR ucisprivate()
+Is the character from the Private Use Area?
+.TP 4
+.BR ucissurrogate()
+Is the character one of the surrogate codes?
+.TP 4
+.BR ucisidentstart()
+Is the character a legal initial character of an identifier?
+.TP 4
+.BR ucisidentpart()
+Is the character a legal identifier character?
+.TP 4
+.BR ucisdefined()
+Is the character defined (appeared in one of the data
+files)?
+.TP 4
+.BR ucisundefined()
+Is the character not defined (non-Unicode)?
+.TP 4
+.BR ucishan()
+Is the character a Han ideograph?
+.TP 4
+.BR ucishangul()
+Is the character a pre-composed Hangul syllable?
+
+.SH "SEE ALSO"
+ctype(3)
+
+.SH ACKNOWLEDGMENTS
+These are people who have helped with patches or
+alerted me about problems.
+.sp
+John Cowan <cowan@locke.ccil.org>
+.br
+Bob Verbrugge <bob_verbrugge@nl.compuware.com>
+.br
+Christophe Pierret <cpierret@businessobjects.com>
+.br
+Kent Johnson <kent@pondview.mv.com>
+.br
+Valeriy E. Ushakov <uwe@ptc.spbu.ru>
+
+.SH AUTHOR
+Mark Leisher
+.br
+Computing Research Lab
+.br
+New Mexico State University
+.br
+Email: mleisher@crl.nmsu.edu
--- /dev/null
+/*
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint
+#ifdef __GNUC__
+static char rcsid[] __attribute__ ((unused)) = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $";
+#else
+static char rcsid[] = "$Id: ucgendat.c,v 1.3 1999/10/07 20:49:56 mleisher Exp $";
+#endif
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
+ ((cc) >= 'A' && (cc) <= 'F') ||\
+ ((cc) >= 'a' && (cc) <= 'f'))
+
+/*
+ * A header written to the output file with the byte-order-mark and the number
+ * of property nodes.
+ */
+static unsigned short hdr[2] = {0xfeff, 0};
+
+#define NUMPROPS 49
+#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
+
+/*
+ * A property name paired with its length, so that lookups can compare the
+ * length first and then the bytes with memcmp().
+ */
+typedef struct {
+ char *name; /* property abbreviation, e.g. "Mn" or "L" */
+ int len; /* number of characters in `name' */
+} _prop_t;
+
+/*
+ * List of properties expected to be found in the Unicode Character Database
+ * including some implementation specific properties.
+ *
+ * The implementation specific properties are:
+ * Cm = Composed (can be decomposed)
+ * Nb = Non-breaking
+ * Sy = Symmetric (has left and right forms)
+ * Hd = Hex digit
+ * Qm = Quote marks
+ * Mr = Mirroring
+ * Ss = Space, other
+ * Cp = Defined character
+ */
+static _prop_t props[NUMPROPS] = {
+ {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
+ {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
+ {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
+ {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
+ {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1},
+ {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1},
+ {"S", 1}, {"WS", 2}, {"ON", 2},
+ {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
+ {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}
+};
+
+/*
+ * Growable list of code ranges for one property.  `ranges' holds
+ * consecutive start/end pairs; `used' and `size' count individual array
+ * elements (not pairs), so `used' advances by two for each stored range.
+ */
+typedef struct {
+ unsigned long *ranges; /* start/end pairs */
+ unsigned short used; /* elements of `ranges' in use */
+ unsigned short size; /* elements of `ranges' allocated */
+} _ranges_t;
+
+static _ranges_t proptbl[NUMPROPS];
+
+/*
+ * Make sure this array is sized to be on a 4-byte boundary at compile time.
+ */
+static unsigned short propcnt[NEEDPROPS];
+
+/*
+ * Array used to collect a decomposition before adding it to the decomposition
+ * table.
+ */
+static unsigned long dectmp[64];
+static unsigned long dectmp_size;
+
+/*
+ * The decomposition recorded for a single character code.
+ */
+typedef struct {
+ unsigned long code; /* the character that decomposes */
+ unsigned short size; /* elements allocated in `decomp' */
+ unsigned short used; /* elements of `decomp' in use */
+ unsigned long *decomp; /* the codes making up the decomposition */
+} _decomp_t;
+
+/*
+ * List of decompositions. Created and expanded in order as the characters are
+ * encountered.
+ */
+static _decomp_t *decomps;
+static unsigned long decomps_used;
+static unsigned long decomps_size;
+
+/*
+ * Types and lists for handling lists of case mappings.
+ *
+ * Each list is keyed on one case form and carries the other two:
+ *   upper list: key = upper, other1 = lower, other2 = title
+ *   lower list: key = lower, other1 = upper, other2 = title
+ *   title list: key = title, other1 = upper, other2 = lower
+ */
+typedef struct {
+ unsigned long key;
+ unsigned long other1;
+ unsigned long other2;
+} _case_t;
+
+static _case_t *upper;
+static _case_t *lower;
+static _case_t *title;
+static unsigned long upper_used;
+static unsigned long upper_size;
+static unsigned long lower_used;
+static unsigned long lower_size;
+static unsigned long title_used;
+static unsigned long title_size;
+
+/*
+ * Array used to collect case mappings before adding them to a list.
+ */
+static unsigned long cases[3];
+
+/*
+ * An array to hold ranges for combining classes.
+ */
+static unsigned long *ccl;
+static unsigned long ccl_used;
+static unsigned long ccl_size;
+
+/*
+ * Structures for handling numbers.
+ */
+typedef struct {
+ unsigned long code;
+ unsigned long idx;
+} _codeidx_t;
+
+typedef struct {
+ short numerator;
+ short denominator;
+} _num_t;
+
+/*
+ * Arrays to hold the mapping of codes to numbers.
+ */
+static _codeidx_t *ncodes;
+static unsigned long ncodes_used;
+static unsigned long ncodes_size;
+
+/*
+ * Array for holding numbers.  (Declared once; the earlier duplicate
+ * tentative definitions of these three variables were redundant.)
+ */
+static _num_t *nums;
+static unsigned long nums_used;
+static unsigned long nums_size;
+
+/*
+ * Add the inclusive range [start, end] to the range list of the property
+ * named by `p1' (compared as a two-character name) and, when `p2' is not
+ * null, to the list of the property named by `p2' (compared as a
+ * one-character name).  A name that matches nothing in props[] is skipped.
+ *
+ * Each list keeps its ranges as consecutive start/end pairs in ascending
+ * order.  The program exits if memory for a list cannot be obtained.
+ */
+static void
+#ifdef __STDC__
+add_range(unsigned long start, unsigned long end, char *p1, char *p2)
+#else
+add_range(start, end, p1, p2)
+unsigned long start, end;
+char *p1, *p2;
+#endif
+{
+    int i, j, k, len;
+    _ranges_t *rlp;
+    unsigned long *grown;
+    char *name;
+
+    for (k = 0; k < 2; k++) {
+        if (k == 0) {
+            name = p1;
+            len = 2;
+        } else {
+            if (p2 == 0)
+                break;
+
+            name = p2;
+            len = 1;
+        }
+
+        for (i = 0; i < NUMPROPS; i++) {
+            if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
+                break;
+        }
+
+        if (i == NUMPROPS)
+            continue;
+
+        rlp = &proptbl[i];
+
+        /*
+         * Resize the range list if necessary.  The result is checked before
+         * it replaces the old pointer so a failure is reported instead of
+         * being dereferenced.
+         */
+        if (rlp->used == rlp->size) {
+            if (rlp->size == 0)
+                grown = (unsigned long *)
+                    malloc(sizeof(unsigned long) << 3);
+            else
+                grown = (unsigned long *)
+                    realloc((char *) rlp->ranges,
+                            sizeof(unsigned long) * (rlp->size + 8));
+            if (grown == 0) {
+                fprintf(stderr, "ucgendat: out of memory\n");
+                exit(1);
+            }
+            rlp->ranges = grown;
+            rlp->size += 8;
+        }
+
+        /*
+         * If this is the first range for this property list, just add it
+         * and move on to the next property.
+         */
+        if (rlp->used == 0) {
+            rlp->ranges[0] = start;
+            rlp->ranges[1] = end;
+            rlp->used += 2;
+            continue;
+        }
+
+        /*
+         * Optimize the case of adding the range to the end.
+         */
+        j = rlp->used - 1;
+        if (start > rlp->ranges[j]) {
+            j = rlp->used;
+            rlp->ranges[j++] = start;
+            rlp->ranges[j++] = end;
+            rlp->used = j;
+            continue;
+        }
+
+        /*
+         * Need to locate the insertion point.
+         */
+        for (i = 0;
+             i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
+
+        /*
+         * If the start value lies in (or directly after) the current range,
+         * extend that range to the new end point.  The end point is never
+         * moved backwards, and the loop continues so that the second
+         * property is still processed.
+         */
+        if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
+            if (end > rlp->ranges[i + 1])
+                rlp->ranges[i + 1] = end;
+            continue;
+        }
+
+        /*
+         * Shift the pairs from the insertion point onward up by two
+         * elements.
+         */
+        for (j = rlp->used; j > i; j -= 2) {
+            rlp->ranges[j] = rlp->ranges[j - 2];
+            rlp->ranges[j + 1] = rlp->ranges[j - 1];
+        }
+
+        /*
+         * Add the new range at the insertion point.
+         */
+        rlp->ranges[i] = start;
+        rlp->ranges[i + 1] = end;
+        rlp->used += 2;
+    }
+}
+
+/*
+ * Insert the single code `c' into the range list of the property whose
+ * name is the first `len' characters of `name', keeping the list ordered
+ * and extending an existing range when `c' directly follows it.
+ *
+ * Directionality names introduced in Unicode 3.0 are remapped first:
+ * "AL" becomes "R"; "BN", "NSM", "PDF", "LRE", "LRO", "RLE" and "RLO"
+ * become "ON".  An empty name or one that matches nothing in props[] is
+ * ignored.  The program exits if memory for the list cannot be obtained.
+ */
+static void
+#ifdef __STDC__
+ordered_range_insert(unsigned long c, char *name, int len)
+#else
+ordered_range_insert(c, name, len)
+unsigned long c;
+char *name;
+int len;
+#endif
+{
+    int i, j;
+    unsigned long s, e;
+    unsigned long *grown;
+    _ranges_t *rlp;
+
+    if (len == 0)
+        return;
+
+    /*
+     * Deal with directionality codes introduced in Unicode 3.0.
+     */
+    if (len == 2) {
+        if (memcmp(name, "AL", 2) == 0) {
+            /*
+             * Mark the Arabic letters as having RTL directionality.
+             */
+            len = 1;
+            name = "R";
+        } else if (memcmp(name, "BN", 2) == 0) {
+            /*
+             * Mark the control characters as being Other Neutrals.
+             */
+            len = 2;
+            name = "ON";
+        }
+    } else if (len == 3 &&
+               (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
+                memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
+                memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0)) {
+        /*
+         * Mark all of these as Other Neutral to preserve compatibility with
+         * older versions.
+         */
+        len = 2;
+        name = "ON";
+    }
+
+    for (i = 0; i < NUMPROPS; i++) {
+        if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
+            break;
+    }
+
+    if (i == NUMPROPS)
+        return;
+
+    /*
+     * Have a match, so insert the code in order.
+     */
+    rlp = &proptbl[i];
+
+    /*
+     * Resize the range list if necessary.  The result is checked before it
+     * replaces the old pointer so a failure is reported instead of being
+     * dereferenced.
+     */
+    if (rlp->used == rlp->size) {
+        if (rlp->size == 0)
+            grown = (unsigned long *)
+                malloc(sizeof(unsigned long) << 3);
+        else
+            grown = (unsigned long *)
+                realloc((char *) rlp->ranges,
+                        sizeof(unsigned long) * (rlp->size + 8));
+        if (grown == 0) {
+            fprintf(stderr, "ucgendat: out of memory\n");
+            exit(1);
+        }
+        rlp->ranges = grown;
+        rlp->size += 8;
+    }
+
+    /*
+     * If this is the first code for this property list, just add it
+     * and return.
+     */
+    if (rlp->used == 0) {
+        rlp->ranges[0] = rlp->ranges[1] = c;
+        rlp->used += 2;
+        return;
+    }
+
+    /*
+     * Optimize the cases of extending the last range and adding new ranges to
+     * the end.
+     */
+    j = rlp->used - 1;
+    e = rlp->ranges[j];
+    s = rlp->ranges[j - 1];
+
+    if (c == e + 1) {
+        /*
+         * Extend the last range.
+         */
+        rlp->ranges[j] = c;
+        return;
+    }
+
+    if (c > e + 1) {
+        /*
+         * Start another range on the end.
+         */
+        j = rlp->used;
+        rlp->ranges[j] = rlp->ranges[j + 1] = c;
+        rlp->used += 2;
+        return;
+    }
+
+    if (c >= s)
+        /*
+         * The code is a duplicate of a code in the last range, so just return.
+         */
+        return;
+
+    /*
+     * The code should be inserted somewhere before the last range in the
+     * list.  Locate the insertion point.
+     */
+    for (i = 0;
+         i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
+
+    s = rlp->ranges[i];
+    e = rlp->ranges[i + 1];
+
+    if (c == e + 1)
+        /*
+         * Simply extend the current range.
+         */
+        rlp->ranges[i + 1] = c;
+    else if (c < s) {
+        /*
+         * Add a new entry before the current location.  Shift the pairs
+         * from the current one onward up by two elements to make room.
+         * (A code already inside [s, e] falls through and is left alone.)
+         */
+        for (j = rlp->used; j > i; j -= 2) {
+            rlp->ranges[j] = rlp->ranges[j - 2];
+            rlp->ranges[j + 1] = rlp->ranges[j - 1];
+        }
+        rlp->ranges[i] = rlp->ranges[i + 1] = c;
+
+        rlp->used += 2;
+    }
+}
+
+/*
+ * Record the decomposition currently held in dectmp[0 .. dectmp_size-1]
+ * for `code'.  The code is also added to the "Cm" (composite) property.
+ *
+ * The decomps list is kept sorted by code.  If `code' is already present
+ * its decomposition is replaced; otherwise a new entry is inserted at the
+ * ordered position.  The program exits if memory cannot be obtained.
+ */
+static void
+#ifdef __STDC__
+add_decomp(unsigned long code)
+#else
+add_decomp(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j, size;
+    unsigned long *buf;
+    _decomp_t *grown;
+    int replacing;
+
+    /*
+     * Add the code to the composite property.
+     */
+    ordered_range_insert(code, "Cm", 2);
+
+    /*
+     * Locate the insertion point for the code.
+     */
+    for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
+
+    /*
+     * Decide up front whether an existing entry is being replaced, rather
+     * than inferring it later from the contents of the slot.
+     */
+    replacing = (i < decomps_used && decomps[i].code == code);
+
+    /*
+     * Allocate space for a new decomposition.
+     */
+    if (decomps_used == decomps_size) {
+        if (decomps_size == 0)
+            grown = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
+        else
+            grown = (_decomp_t *)
+                realloc((char *) decomps,
+                        sizeof(_decomp_t) * (decomps_size + 8));
+        if (grown == 0) {
+            fprintf(stderr, "ucgendat: out of memory\n");
+            exit(1);
+        }
+        decomps = grown;
+        (void) memset((char *) (decomps + decomps_size), 0,
+                      sizeof(_decomp_t) << 3);
+        decomps_size += 8;
+    }
+
+    if (i < decomps_used && !replacing) {
+        /*
+         * Shift the decomps up by one if the codes don't match.
+         */
+        for (j = decomps_used; j > i; j--)
+            decomps[j] = decomps[j - 1];
+
+        /*
+         * Slot i still holds a copy of the entry that now lives at i + 1,
+         * including its buffer pointer.  Clear it so the new entry gets a
+         * buffer of its own instead of overwriting its neighbour's.
+         */
+        decomps[i].code = 0;
+        decomps[i].size = 0;
+        decomps[i].used = 0;
+        decomps[i].decomp = 0;
+    }
+
+    /*
+     * Insert or replace a decomposition.  The buffer size is rounded up
+     * past the next multiple of four elements.
+     */
+    size = dectmp_size + (4 - (dectmp_size & 3));
+    if (decomps[i].size < size) {
+        if (decomps[i].size == 0)
+            buf = (unsigned long *)
+                malloc(sizeof(unsigned long) * size);
+        else
+            buf = (unsigned long *)
+                realloc((char *) decomps[i].decomp,
+                        sizeof(unsigned long) * size);
+        if (buf == 0) {
+            fprintf(stderr, "ucgendat: out of memory\n");
+            exit(1);
+        }
+        decomps[i].decomp = buf;
+        decomps[i].size = size;
+    }
+
+    if (!replacing)
+        decomps_used++;
+
+    decomps[i].code = code;
+    decomps[i].used = dectmp_size;
+    (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
+                  sizeof(unsigned long) * dectmp_size);
+
+}
+
+/*
+ * Add a title case character to the ordered `title' list.  The entry is
+ * keyed on `code' and carries the upper and lower case forms currently
+ * held in cases[0] and cases[1].  cases[2] is set to `code' as a side
+ * effect.  The program exits if memory cannot be obtained.
+ */
+static void
+#ifdef __STDC__
+add_title(unsigned long code)
+#else
+add_title(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j;
+    _case_t *grown;
+
+    /*
+     * Always map the code to itself.
+     */
+    cases[2] = code;
+
+    if (title_used == title_size) {
+        if (title_size == 0)
+            grown = (_case_t *) malloc(sizeof(_case_t) << 3);
+        else
+            grown = (_case_t *) realloc((char *) title,
+                                        sizeof(_case_t) * (title_size + 8));
+        if (grown == 0) {
+            fprintf(stderr, "ucgendat: out of memory\n");
+            exit(1);
+        }
+        title = grown;
+        title_size += 8;
+    }
+
+    /*
+     * Locate the insertion point.
+     */
+    for (i = 0; i < title_used && code > title[i].key; i++) ;
+
+    /*
+     * Shift the following entries up by one to open a slot at i.
+     */
+    for (j = title_used; j > i; j--)
+        title[j] = title[j - 1];
+
+    title[i].key = cases[2];    /* Title */
+    title[i].other1 = cases[0]; /* Upper */
+    title[i].other2 = cases[1]; /* Lower */
+
+    title_used++;
+}
+
+/*
+ * Add an upper case character to the ordered `upper' list.  The entry is
+ * keyed on `code' and carries the lower and title case forms from
+ * cases[1] and cases[2].  cases[0] is set to `code', and cases[2] is also
+ * set to `code' when no title case form was supplied (it is 0).  The
+ * program exits if memory cannot be obtained.
+ */
+static void
+#ifdef __STDC__
+add_upper(unsigned long code)
+#else
+add_upper(code)
+unsigned long code;
+#endif
+{
+    unsigned long i, j;
+    _case_t *grown;
+
+    /*
+     * Always map the code to itself.
+     */
+    cases[0] = code;
+
+    /*
+     * If the title case character is not present, then make it the same as
+     * the upper case.
+     */
+    if (cases[2] == 0)
+        cases[2] = code;
+
+    if (upper_used == upper_size) {
+        if (upper_size == 0)
+            grown = (_case_t *) malloc(sizeof(_case_t) << 3);
+        else
+            grown = (_case_t *) realloc((char *) upper,
+                                        sizeof(_case_t) * (upper_size + 8));
+        if (grown == 0) {
+            fprintf(stderr, "ucgendat: out of memory\n");
+            exit(1);
+        }
+        upper = grown;
+        upper_size += 8;
+    }
+
+    /*
+     * Locate the insertion point.
+     */
+    for (i = 0; i < upper_used && code > upper[i].key; i++) ;
+
+    /*
+     * Shift the following entries up by one to open a slot at i.
+     */
+    for (j = upper_used; j > i; j--)
+        upper[j] = upper[j - 1];
+
+    upper[i].key = cases[0];    /* Upper */
+    upper[i].other1 = cases[1]; /* Lower */
+    upper[i].other2 = cases[2]; /* Title */
+
+    upper_used++;
+}
+
+/*
+ * Record `code' as a lower case character in the sorted `lower' table.
+ * The upper case form is taken from cases[0]; the title case form comes
+ * from cases[2] and defaults to the upper case form when it is 0.
+ */
+static void
+#ifdef __STDC__
+add_lower(unsigned long code)
+#else
+add_lower(code)
+unsigned long code;
+#endif
+{
+    unsigned long pos, k;
+
+    /*
+     * A lower case character is its own lower case form.
+     */
+    cases[1] = code;
+
+    /*
+     * No explicit title case form: fall back to the upper case one.
+     */
+    if (cases[2] == 0)
+        cases[2] = cases[0];
+
+    /*
+     * Grow the table by eight entries whenever it is full.
+     */
+    if (lower_used == lower_size) {
+        lower = (lower_size == 0) ?
+            (_case_t *) malloc(sizeof(_case_t) << 3) :
+            (_case_t *) realloc((char *) lower,
+                                sizeof(_case_t) * (lower_size + 8));
+        lower_size += 8;
+    }
+
+    /*
+     * Find the first entry whose key is not smaller than the code.
+     */
+    pos = 0;
+    while (pos < lower_used && code > lower[pos].key)
+        pos++;
+
+    /*
+     * Move the tail of the table up one entry, last element first, to make
+     * room at `pos'.
+     */
+    k = lower_used;
+    while (k > pos) {
+        (void) memcpy((char *) &lower[k], (char *) &lower[k - 1],
+                      sizeof(_case_t));
+        k--;
+    }
+
+    lower[pos].key = cases[1];      /* Lower */
+    lower[pos].other1 = cases[0];   /* Upper */
+    lower[pos].other2 = cases[2];   /* Title */
+
+    lower_used++;
+}
+
+/*
+ * Add code point `c' with combining class `ccl_code' to the `ccl' table.
+ *
+ * The table is a flat array of 3-tuples (first, last, class) kept sorted by
+ * code point; `ccl_used' and `ccl_size' count array elements, not tuples,
+ * and are always multiples of 3.  A code point that directly follows a
+ * range with the same class extends that range; otherwise it gets a tuple
+ * of its own.  A code point that already lies inside a range is left alone.
+ */
+static void
+#ifdef __STDC__
+ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
+#else
+ordered_ccl_insert(c, ccl_code)
+unsigned long c, ccl_code;
+#endif
+{
+    unsigned long i, j;
+
+    /*
+     * Make sure there is room for one more tuple.  The table grows in
+     * steps of 24 elements (8 tuples).
+     */
+    if (ccl_used == ccl_size) {
+        if (ccl_size == 0)
+            ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
+        else
+            ccl = (unsigned long *)
+                realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
+        ccl_size += 24;
+    }
+
+    /*
+     * Optimize adding the first item.
+     */
+    if (ccl_used == 0) {
+        ccl[0] = ccl[1] = c;
+        ccl[2] = ccl_code;
+        ccl_used += 3;
+        return;
+    }
+
+    /*
+     * Handle the special case of extending the range on the end.  This
+     * requires that the combining class codes are the same.
+     */
+    if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
+        ccl[ccl_used - 2] = c;
+        return;
+    }
+
+    /*
+     * Handle the special case of adding another range on the end.
+     */
+    if (c > ccl[ccl_used - 2] + 1 ||
+        (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
+        ccl[ccl_used++] = c;
+        ccl[ccl_used++] = c;
+        ccl[ccl_used++] = ccl_code;
+        return;
+    }
+
+    /*
+     * Locate the first range that `c' either belongs to, directly follows,
+     * or precedes.  Because the two "end of table" cases above have been
+     * handled, `c' is not beyond the last range and the loop always stops
+     * on a valid tuple.
+     */
+    for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
+
+    if (c == ccl[i + 1] + 1) {
+        if (ccl_code == ccl[i + 2]) {
+            /*
+             * Extend an existing range.
+             */
+            ccl[i + 1] = c;
+            return;
+        }
+
+        /*
+         * The code directly follows a range with a different class, so it
+         * cannot be merged; it belongs in front of the following range.
+         * A following range always exists here, see above.
+         */
+        i += 3;
+    }
+
+    /*
+     * The code is already covered by the range at the current location.
+     */
+    if (c >= ccl[i])
+        return;
+
+    /*
+     * Start a new range before the current location: move every tuple
+     * from `i' onwards up by one tuple, last tuple first, then fill in the
+     * freed slot.  The resize check at the top guarantees the extra three
+     * elements exist.
+     */
+    for (j = ccl_used; j > i; j -= 3) {
+        ccl[j] = ccl[j - 3];
+        ccl[j + 1] = ccl[j - 2];
+        ccl[j + 2] = ccl[j - 1];
+    }
+    ccl[i] = ccl[i + 1] = c;
+    ccl[i + 2] = ccl_code;
+    ccl_used += 3;
+}
+
+/*
+ * Look up the fraction num/denom in the `nums' table, appending it when it
+ * is not there yet.  The value returned is the table index of the fraction
+ * multiplied by 2.
+ */
+static unsigned long
+#ifdef __STDC__
+make_number(short num, short denom)
+#else
+make_number(num, denom)
+short num, denom;
+#endif
+{
+    unsigned long slot;
+
+    /*
+     * Reuse an existing entry when the same fraction was stored before.
+     */
+    slot = 0;
+    while (slot < nums_used) {
+        if (nums[slot].numerator == num && nums[slot].denominator == denom)
+            return slot << 1;
+        slot++;
+    }
+
+    /*
+     * Grow the table by eight entries whenever it is full.
+     */
+    if (nums_used == nums_size) {
+        nums = (nums_size == 0) ?
+            (_num_t *) malloc(sizeof(_num_t) << 3) :
+            (_num_t *) realloc((char *) nums,
+                               sizeof(_num_t) * (nums_size + 8));
+        nums_size += 8;
+    }
+
+    /*
+     * Append the new fraction; `slot' equals nums_used at this point.
+     */
+    nums[slot].numerator = num;
+    nums[slot].denominator = denom;
+    nums_used++;
+
+    return slot << 1;
+}
+
+/*
+ * Associate `code' with the number num/denom.
+ *
+ * The `ncodes' table is kept sorted by code; each entry stores the code and
+ * the value returned by make_number() for its fraction.  Adding a code that
+ * is already present replaces the number recorded for it.
+ */
+static void
+#ifdef __STDC__
+add_number(unsigned long code, short num, short denom)
+#else
+add_number(code, num, denom)
+unsigned long code;
+short num, denom;
+#endif
+{
+    unsigned long i, j;
+
+    /*
+     * Insert the code in order.
+     */
+    for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
+
+    /*
+     * Handle the case of the codes matching and simply replace the number
+     * that was there before.  The index has to be checked first: when the
+     * code is larger than every stored code, `i' is one past the last used
+     * entry and ncodes[i] must not be looked at.
+     */
+    if (i < ncodes_used && code == ncodes[i].code) {
+        ncodes[i].idx = make_number(num, denom);
+        return;
+    }
+
+    /*
+     * Resize the array if necessary.
+     */
+    if (ncodes_used == ncodes_size) {
+        if (ncodes_size == 0)
+            ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
+        else
+            ncodes = (_codeidx_t *)
+                realloc((char *) ncodes,
+                        sizeof(_codeidx_t) * (ncodes_size + 8));
+
+        ncodes_size += 8;
+    }
+
+    /*
+     * Shift things around to insert the code if necessary.
+     */
+    for (j = ncodes_used; j > i; j--) {
+        ncodes[j].code = ncodes[j - 1].code;
+        ncodes[j].idx = ncodes[j - 1].idx;
+    }
+    ncodes[i].code = code;
+    ncodes[i].idx = make_number(num, denom);
+
+    ncodes_used++;
+}
+
+/*
+ * Read character data from `in' and feed it into the property, combining
+ * class, decomposition, number and case mapping tables.
+ *
+ * Each data line is a list of fields separated by ';'.  Blank lines and
+ * lines starting with '#' are ignored.  Apart from that, this routine
+ * assumes that the line is a valid Unicode Character Database entry; the
+ * fields are not validated.
+ *
+ * Lines are read with fgets() into a fixed buffer.  A line longer than the
+ * buffer is truncated and the remainder is discarded, so an overlong line
+ * can neither overflow the buffer nor be mistaken for a second entry.
+ */
+static void
+#ifdef __STDC__
+read_cdata(FILE *in)
+#else
+read_cdata(in)
+FILE *in;
+#endif
+{
+    unsigned long i, lineno, skip, code, ccl_code;
+    short wnum, neg, number[2];
+    char line[512], *s, *e;
+    int ch;
+
+    lineno = skip = 0;
+    while (fgets(line, sizeof(line), in) != 0) {
+        lineno++;
+
+        /*
+         * Strip the end-of-line marker.  When there is none, the line did
+         * not fit in the buffer (or the file ended without a newline), so
+         * consume whatever is left of it.
+         */
+        for (e = line; *e != 0 && *e != '\n'; e++) ;
+        if (*e == '\n')
+            *e = 0;
+        else {
+            while ((ch = getc(in)) != EOF && ch != '\n') ;
+        }
+
+        /*
+         * Ignore leading white space, including the '\r' of an otherwise
+         * empty line in a file with CR-LF line ends.
+         */
+        for (s = line; *s == ' ' || *s == '\t' || *s == '\r' ||
+                 *s == '\v' || *s == '\f'; s++) ;
+
+        /*
+         * Skip blank lines and lines that start with a '#'.
+         */
+        if (*s == 0 || *s == '#')
+            continue;
+
+        /*
+         * If lines need to be skipped, do it here.
+         */
+        if (skip) {
+            skip--;
+            continue;
+        }
+
+        /*
+         * Collect the code.  The code can be up to 6 hex digits in length to
+         * allow surrogates to be specified.
+         */
+        for (i = code = 0; *s != 0 && *s != ';' && i < 6; i++, s++) {
+            code <<= 4;
+            if (*s >= '0' && *s <= '9')
+                code += *s - '0';
+            else if (*s >= 'A' && *s <= 'F')
+                code += (*s - 'A') + 10;
+            else if (*s >= 'a' && *s <= 'f')
+                code += (*s - 'a') + 10;
+        }
+
+        /*
+         * Handle the following special cases:
+         * 1. 4E00-9FA5 CJK Ideographs.
+         * 2. AC00-D7A3 Hangul Syllables.
+         * 3. D800-DFFF Surrogates.
+         * 4. E000-F8FF Private Use Area.
+         * 5. F900-FA2D Han compatibility.
+         *
+         * Each of these is added as a whole range, and `skip' is set so
+         * that the remaining data lines describing the range are ignored.
+         */
+        switch (code) {
+          case 0x4e00:
+            /*
+             * The Han ideographs.
+             */
+            add_range(0x4e00, 0x9fff, "Lo", "L");
+
+            /*
+             * Add the characters to the defined category.
+             */
+            add_range(0x4e00, 0x9fa5, "Cp", 0);
+
+            skip = 1;
+            break;
+          case 0xac00:
+            /*
+             * The Hangul syllables.
+             */
+            add_range(0xac00, 0xd7a3, "Lo", "L");
+
+            /*
+             * Add the characters to the defined category.
+             */
+            add_range(0xac00, 0xd7a3, "Cp", 0);
+
+            skip = 1;
+            break;
+          case 0xd800:
+            /*
+             * Make a range of all surrogates and assume some default
+             * properties.
+             */
+            add_range(0x010000, 0x10ffff, "Cs", "L");
+            skip = 5;
+            break;
+          case 0xe000:
+            /*
+             * The Private Use area.  Add with a default set of properties.
+             */
+            add_range(0xe000, 0xf8ff, "Co", "L");
+            skip = 1;
+            break;
+          case 0xf900:
+            /*
+             * The CJK compatibility area.
+             */
+            add_range(0xf900, 0xfaff, "Lo", "L");
+
+            /*
+             * Add the characters to the defined category.
+             */
+            add_range(0xf900, 0xfaff, "Cp", 0);
+
+            skip = 1;
+            break;
+          default:
+            break;
+        }
+
+        if (skip)
+            continue;
+
+        /*
+         * Add the code to the defined category.
+         */
+        ordered_range_insert(code, "Cp", 2);
+
+        /*
+         * Locate the first character property field.
+         */
+        for (i = 0; *s != 0 && i < 2; s++) {
+            if (*s == ';')
+                i++;
+        }
+        for (e = s; *e && *e != ';'; e++) ;
+
+        ordered_range_insert(code, s, e - s);
+
+        /*
+         * Locate the combining class code.
+         */
+        for (s = e; *s != 0 && i < 3; s++) {
+            if (*s == ';')
+                i++;
+        }
+
+        /*
+         * Convert the combining class code from decimal.
+         */
+        for (ccl_code = 0, e = s; *e && *e != ';'; e++)
+            ccl_code = (ccl_code * 10) + (*e - '0');
+
+        /*
+         * Add the code if it is not 0.
+         */
+        if (ccl_code != 0)
+            ordered_ccl_insert(code, ccl_code);
+
+        /*
+         * Locate the second character property field.
+         */
+        for (s = e; *s != 0 && i < 4; s++) {
+            if (*s == ';')
+                i++;
+        }
+        for (e = s; *e && *e != ';'; e++) ;
+
+        ordered_range_insert(code, s, e - s);
+
+        /*
+         * Check for a decomposition.  Decompositions that start with a
+         * '<' tag are ignored.
+         */
+        s = ++e;
+        if (*s != ';' && *s != '<') {
+            /*
+             * Collect the codes of the decomposition.
+             */
+            for (dectmp_size = 0; *s != ';'; ) {
+                /*
+                 * Skip all leading non-hex digits.
+                 */
+                while (!ishdigit(*s))
+                    s++;
+
+                for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
+                    dectmp[dectmp_size] <<= 4;
+                    if (*s >= '0' && *s <= '9')
+                        dectmp[dectmp_size] += *s - '0';
+                    else if (*s >= 'A' && *s <= 'F')
+                        dectmp[dectmp_size] += (*s - 'A') + 10;
+                    else if (*s >= 'a' && *s <= 'f')
+                        dectmp[dectmp_size] += (*s - 'a') + 10;
+                }
+                dectmp_size++;
+            }
+
+            /*
+             * If there is more than one code in the temporary decomposition
+             * array, then add the character with its decomposition.
+             */
+            if (dectmp_size > 1)
+                add_decomp(code);
+        }
+
+        /*
+         * Skip to the number field.
+         */
+        for (i = 0; i < 3 && *s; s++) {
+            if (*s == ';')
+                i++;
+        }
+
+        /*
+         * Scan the number in.  number[0] collects the numerator and
+         * number[1] the denominator.
+         *
+         * NOTE(review): the sign is only applied when a '/' follows, so a
+         * negative integer without a denominator would be stored as a
+         * positive value -- confirm whether that can occur in the input.
+         */
+        number[0] = number[1] = 0;
+        for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
+            if (*e == '-') {
+                neg = 1;
+                continue;
+            }
+
+            if (*e == '/') {
+                /*
+                 * Move to the denominator of the fraction.
+                 */
+                if (neg)
+                    number[wnum] *= -1;
+                neg = 0;
+                e++;
+                wnum++;
+            }
+            number[wnum] = (number[wnum] * 10) + (*e - '0');
+        }
+
+        if (e > s) {
+            /*
+             * Adjust the denominator in case of integers and add the number.
+             */
+            if (wnum == 0)
+                number[1] = number[0];
+
+            add_number(code, number[0], number[1]);
+        }
+
+        /*
+         * Skip to the start of the possible case mappings.
+         */
+        for (s = e, i = 0; i < 4 && *s; s++) {
+            if (*s == ';')
+                i++;
+        }
+
+        /*
+         * Collect the case mappings: cases[0] is the upper case mapping,
+         * cases[1] the lower case mapping and cases[2] the title case
+         * mapping.  An empty field leaves its entry at 0.
+         */
+        cases[0] = cases[1] = cases[2] = 0;
+        for (i = 0; i < 3; i++) {
+            while (ishdigit(*s)) {
+                cases[i] <<= 4;
+                if (*s >= '0' && *s <= '9')
+                    cases[i] += *s - '0';
+                else if (*s >= 'A' && *s <= 'F')
+                    cases[i] += (*s - 'A') + 10;
+                else if (*s >= 'a' && *s <= 'f')
+                    cases[i] += (*s - 'a') + 10;
+                s++;
+            }
+            if (*s == ';')
+                s++;
+        }
+        if (cases[0] && cases[1])
+            /*
+             * Add the upper and lower mappings for a title case character.
+             */
+            add_title(code);
+        else if (cases[1])
+            /*
+             * Add the lower and title case mappings for the upper case
+             * character.
+             */
+            add_upper(code);
+        else if (cases[0])
+            /*
+             * Add the upper and title case mappings for the lower case
+             * character.
+             */
+            add_lower(code);
+    }
+}
+
+/*
+ * Binary search of the sorted `decomps' table.  Returns the entry for
+ * `code', or 0 when the code has no decomposition recorded.
+ */
+static _decomp_t *
+#ifdef __STDC__
+find_decomp(unsigned long code)
+#else
+find_decomp(code)
+unsigned long code;
+#endif
+{
+    long lo, hi, mid;
+
+    lo = 0;
+    hi = decomps_used - 1;
+    while (lo <= hi) {
+        mid = (lo + hi) >> 1;
+        if (code == decomps[mid].code)
+            return &decomps[mid];
+
+        /*
+         * Continue in the half that can still contain the code.
+         */
+        if (code > decomps[mid].code)
+            lo = mid + 1;
+        else
+            hi = mid - 1;
+    }
+    return 0;
+}
+
+/*
+ * Append the full expansion of decomposition `d' to the global `dectmp'
+ * buffer.  Every code in `d' that has a decomposition of its own is
+ * expanded recursively; every other code is copied as it is.
+ */
+static void
+#ifdef __STDC__
+decomp_it(_decomp_t *d)
+#else
+decomp_it(d)
+_decomp_t *d;
+#endif
+{
+    unsigned long n;
+    _decomp_t *sub;
+
+    n = 0;
+    while (n < d->used) {
+        sub = find_decomp(d->decomp[n]);
+        if (sub == 0)
+            dectmp[dectmp_size++] = d->decomp[n];
+        else
+            decomp_it(sub);
+        n++;
+    }
+}
+
+/*
+ * Replace every stored decomposition by its full expansion, obtained by
+ * recursively decomposing each character it contains.
+ */
+static void
+#ifdef __STDC__
+expand_decomp(void)
+#else
+expand_decomp()
+#endif
+{
+    unsigned long n;
+
+    for (n = 0; n < decomps_used; n++) {
+        /*
+         * Build the expansion in the temporary buffer.
+         */
+        dectmp_size = 0;
+        decomp_it(&decomps[n]);
+        if (dectmp_size == 0)
+            continue;
+
+        /*
+         * Store the buffer contents as the decomposition of this code.
+         */
+        add_decomp(decomps[n].code);
+    }
+}
+
+/*
+ * Write the collected tables as binary files into the directory `opath':
+ * ctype.dat, case.dat, decomp.dat, cmbcl.dat and num.dat, in that order.
+ *
+ * Every file starts with the two unsigned shorts of `hdr' (hdr[1] holds a
+ * per-file count); all files except case.dat follow it with an unsigned
+ * long byte count.  All values are written in the representation of the
+ * machine running this program.
+ *
+ * If a file cannot be opened, the routine returns at that point and the
+ * remaining files are not written.  Nothing is written at all when `opath'
+ * is too long for the path buffer.
+ */
+static void
+#ifdef __STDC__
+write_cdata(char *opath)
+#else
+write_cdata(opath)
+char *opath;
+#endif
+{
+    FILE *out;
+    unsigned long i, idx, bytes, nprops;
+    unsigned short casecnt[2];
+    char path[BUFSIZ];
+
+    /*
+     * Make sure the longest file name used below, "/decomp.dat", fits
+     * behind the output directory, so that none of the sprintf() calls can
+     * overflow the path buffer.
+     */
+    if (strlen(opath) + sizeof("/decomp.dat") > sizeof(path)) {
+        fprintf(stderr, "output path too long: %s\n", opath);
+        return;
+    }
+
+    /*****************************************************************
+     *
+     * Generate the ctype data.
+     *
+     *****************************************************************/
+
+    /*
+     * Open the ctype.dat file.
+     */
+    sprintf(path, "%s/ctype.dat", opath);
+    if ((out = fopen(path, "wb")) == 0)
+        return;
+
+    /*
+     * Collect the offsets for the properties.  The offsets array is
+     * on a 4-byte boundary to keep things efficient for architectures
+     * that need such a thing.  A property without any ranges gets the
+     * offset 0xffff.
+     */
+    for (i = idx = 0; i < NUMPROPS; i++) {
+        propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
+        idx += proptbl[i].used;
+    }
+
+    /*
+     * Add the sentinel index which is used by the binary search as the upper
+     * bound for a search.
+     */
+    propcnt[i] = idx;
+
+    /*
+     * Record the actual number of property lists.  This may be different than
+     * the number of offsets actually written because of aligning on a 4-byte
+     * boundary.
+     */
+    hdr[1] = NUMPROPS;
+
+    /*
+     * Calculate the byte count needed and pad the property counts array to a
+     * 4-byte boundary.
+     */
+    if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
+        bytes += 4 - (bytes & 3);
+    nprops = bytes / sizeof(unsigned short);
+    bytes += sizeof(unsigned long) * idx;
+
+    /*
+     * Write the header.
+     */
+    fwrite((char *) hdr, sizeof(unsigned short), 2, out);
+
+    /*
+     * Write the byte count.
+     */
+    fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
+
+    /*
+     * Write the property list counts.
+     */
+    fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
+
+    /*
+     * Write the property lists.
+     */
+    for (i = 0; i < NUMPROPS; i++) {
+        if (proptbl[i].used > 0)
+            fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
+                   proptbl[i].used, out);
+    }
+
+    fclose(out);
+
+    /*****************************************************************
+     *
+     * Generate the case mapping data.
+     *
+     *****************************************************************/
+
+    /*
+     * Open the case.dat file.
+     */
+    sprintf(path, "%s/case.dat", opath);
+    if ((out = fopen(path, "wb")) == 0)
+        return;
+
+    /*
+     * Write the case mapping tables.  The header count is the total number
+     * of entries in the three tables.
+     */
+    hdr[1] = upper_used + lower_used + title_used;
+    casecnt[0] = upper_used;
+    casecnt[1] = lower_used;
+
+    /*
+     * Write the header.
+     */
+    fwrite((char *) hdr, sizeof(unsigned short), 2, out);
+
+    /*
+     * Write the upper and lower case table sizes.
+     */
+    fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
+
+    if (upper_used > 0)
+        /*
+         * Write the upper case table.
+         */
+        fwrite((char *) upper, sizeof(_case_t), upper_used, out);
+
+    if (lower_used > 0)
+        /*
+         * Write the lower case table.
+         */
+        fwrite((char *) lower, sizeof(_case_t), lower_used, out);
+
+    if (title_used > 0)
+        /*
+         * Write the title case table.
+         */
+        fwrite((char *) title, sizeof(_case_t), title_used, out);
+
+    fclose(out);
+
+    /*****************************************************************
+     *
+     * Generate the decomposition data.
+     *
+     *****************************************************************/
+
+    /*
+     * Fully expand all decompositions before generating the output file.
+     */
+    expand_decomp();
+
+    /*
+     * Open the decomp.dat file.
+     */
+    sprintf(path, "%s/decomp.dat", opath);
+    if ((out = fopen(path, "wb")) == 0)
+        return;
+
+    hdr[1] = decomps_used;
+
+    /*
+     * Write the header.
+     */
+    fwrite((char *) hdr, sizeof(unsigned short), 2, out);
+
+    /*
+     * Write a temporary byte count which will be calculated as the
+     * decompositions are written out.
+     */
+    bytes = 0;
+    fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
+
+    if (decomps_used) {
+        /*
+         * Write the list of decomp nodes.  Each node is the code followed
+         * by the index of its first decomposition element.
+         */
+        for (i = idx = 0; i < decomps_used; i++) {
+            fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
+            fwrite((char *) &idx, sizeof(unsigned long), 1, out);
+            idx += decomps[i].used;
+        }
+
+        /*
+         * Write the sentinel index as the last decomp node.
+         */
+        fwrite((char *) &idx, sizeof(unsigned long), 1, out);
+
+        /*
+         * Write the decompositions themselves.
+         */
+        for (i = 0; i < decomps_used; i++)
+            fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
+                   decomps[i].used, out);
+
+        /*
+         * Seek back to just behind the header and write the byte count.
+         */
+        bytes = (sizeof(unsigned long) * idx) +
+            (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
+        fseek(out, (long) (sizeof(unsigned short) << 1), SEEK_SET);
+        fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
+    }
+
+    /*
+     * The file has been opened whether or not there are decompositions,
+     * so it is always closed here.
+     */
+    fclose(out);
+
+    /*****************************************************************
+     *
+     * Generate the combining class data.
+     *
+     *****************************************************************/
+
+    /*
+     * Open the cmbcl.dat file.
+     */
+    sprintf(path, "%s/cmbcl.dat", opath);
+    if ((out = fopen(path, "wb")) == 0)
+        return;
+
+    /*
+     * Set the number of ranges used.  Each range has a combining class which
+     * means each entry is a 3-tuple.
+     */
+    hdr[1] = ccl_used / 3;
+
+    /*
+     * Write the header.
+     */
+    fwrite((char *) hdr, sizeof(unsigned short), 2, out);
+
+    /*
+     * Write out the byte count to maintain header size.
+     */
+    bytes = ccl_used * sizeof(unsigned long);
+    fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
+
+    if (ccl_used > 0)
+        /*
+         * Write the combining class ranges out.
+         */
+        fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
+
+    fclose(out);
+
+    /*****************************************************************
+     *
+     * Generate the number data.
+     *
+     *****************************************************************/
+
+    /*
+     * Open the num.dat file.
+     */
+    sprintf(path, "%s/num.dat", opath);
+    if ((out = fopen(path, "wb")) == 0)
+        return;
+
+    /*
+     * The count part of the header is twice the number of codes that have
+     * numbers.
+     */
+    hdr[1] = (unsigned short) (ncodes_used << 1);
+    bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
+
+    /*
+     * Write the header.
+     */
+    fwrite((char *) hdr, sizeof(unsigned short), 2, out);
+
+    /*
+     * Write out the byte count to maintain header size.
+     */
+    fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
+
+    /*
+     * Now, if number mappings exist, write them out.
+     */
+    if (ncodes_used > 0) {
+        fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
+        fwrite((char *) nums, sizeof(_num_t), nums_used, out);
+    }
+
+    fclose(out);
+}
+
+/*
+ * Usage: prog [-o output-directory] file ...
+ *
+ * Every argument that is not part of a -o option is opened and read as a
+ * character data file, in the order given.  A file that cannot be opened
+ * is reported on stderr and skipped.  After all arguments have been
+ * processed the tables are written to the output directory, which defaults
+ * to the current directory.
+ */
+int
+#ifdef __STDC__
+main(int argc, char *argv[])
+#else
+main(argc, argv)
+int argc;
+char *argv[];
+#endif
+{
+    FILE *in;
+    char *prog, *opath;
+
+    /*
+     * Use the last path component of the program name in messages.
+     */
+    if ((prog = strrchr(argv[0], '/')) != 0)
+        prog++;
+    else
+        prog = argv[0];
+
+    opath = 0;
+
+    argc--;
+    argv++;
+
+    while (argc > 0) {
+        if (argv[0][0] == '-' && argv[0][1] == 'o') {
+            /*
+             * The output directory is the next argument.  If -o is the
+             * last argument, argv[0] is the terminating null pointer and
+             * the default directory is used.
+             */
+            argc--;
+            argv++;
+            opath = argv[0];
+        } else {
+            /*
+             * Each input file is opened, read and closed right here, so
+             * there is never a stream left over from an earlier argument
+             * that would have to be closed first.
+             */
+            if ((in = fopen(argv[0], "rb")) == 0)
+                fprintf(stderr, "%s: unable to open ctype file %s\n",
+                        prog, argv[0]);
+            else {
+                read_cdata(in);
+                fclose(in);
+            }
+        }
+        argc--;
+        argv++;
+    }
+
+    if (opath == 0)
+        opath = ".";
+    write_cdata(opath);
+
+    return 0;
+}
--- /dev/null
+/*
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint
+#ifdef __GNUC__
+static char rcsid[] __attribute__ ((unused)) = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
+#else
+static char rcsid[] = "$Id: ucpgba.c,v 1.4 1999/11/29 16:41:06 mleisher Exp $";
+#endif
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ucdata.h"
+#include "ucpgba.h"
+
+/*
+ * These macros are used while reordering RTL runs of text for the
+ * special case of non-spacing characters being in runs of weakly
+ * directional text. They check for weak and non-spacing, and digits and
+ * non-spacing.
+ *
+ * All macros in this file hand two property masks to ucisprop().
+ * NOTE(review): presumably the two masks are the two halves of the
+ * property bit set declared in ucdata.h -- confirm there.
+ */
+#define ISWEAKSPECIAL(cc) ucisprop(cc, UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS)
+#define ISDIGITSPECIAL(cc) ucisprop(cc, UC_ND|UC_MN, 0)
+
+/*
+ * These macros are used while breaking a string into runs of text in
+ * different directions. Descriptions:
+ *
+ * ISLTR_LTR - Test for members of an LTR run in an LTR context. This looks
+ * for characters with ltr, non-spacing, weak, and neutral
+ * properties.
+ *
+ * ISRTL_RTL - Test for members of an RTL run in an RTL context. This looks
+ * for characters with rtl, non-spacing, weak, and neutral
+ * properties.
+ *
+ * ISRTL_NEUTRAL - Test for RTL or neutral characters.
+ *
+ * ISWEAK_NEUTRAL - Test for weak or neutral characters.
+ *
+ * ISLTR_LTR and ISRTL_RTL differ only in the strong property they accept
+ * (UC_L versus UC_R); the weak and neutral properties are the same in both.
+ */
+#define ISLTR_LTR(cc) ucisprop(cc, UC_L|UC_MN|UC_EN|UC_ES,\
+ UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)
+
+#define ISRTL_RTL(cc) ucisprop(cc, UC_R|UC_MN|UC_EN|UC_ES,\
+ UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)
+
+#define ISRTL_NEUTRAL(cc) ucisprop(cc, UC_R, UC_B|UC_S|UC_WS|UC_ON)
+#define ISWEAK_NEUTRAL(cc) ucisprop(cc, UC_EN|UC_ES, \
+ UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS)
+
+/*
+ * This table is temporarily hard-coded here until it can be constructed
+ * automatically somehow.
+ *
+ * The table is a flat list of pairs: the element at an even index is a
+ * character and the element that follows it is the character that
+ * replaces it. Every pair is listed in both directions, so either member
+ * can be looked up.
+ */
+static unsigned long _symmetric_pairs[] = {
+ 0x0028, 0x0029, 0x0029, 0x0028, 0x003C, 0x003E, 0x003E, 0x003C,
+ 0x005B, 0x005D, 0x005D, 0x005B, 0x007B, 0x007D, 0x007D, 0x007B,
+ 0x2045, 0x2046, 0x2046, 0x2045, 0x207D, 0x207E, 0x207E, 0x207D,
+ 0x208D, 0x208E, 0x208E, 0x208D, 0x3008, 0x3009, 0x3009, 0x3008,
+ 0x300A, 0x300B, 0x300B, 0x300A, 0x300C, 0x300D, 0x300D, 0x300C,
+ 0x300E, 0x300F, 0x300F, 0x300E, 0x3010, 0x3011, 0x3011, 0x3010,
+ 0x3014, 0x3015, 0x3015, 0x3014, 0x3016, 0x3017, 0x3017, 0x3016,
+ 0x3018, 0x3019, 0x3019, 0x3018, 0x301A, 0x301B, 0x301B, 0x301A,
+ 0xFD3E, 0xFD3F, 0xFD3F, 0xFD3E, 0xFE59, 0xFE5A, 0xFE5A, 0xFE59,
+ 0xFE5B, 0xFE5C, 0xFE5C, 0xFE5B, 0xFE5D, 0xFE5E, 0xFE5E, 0xFE5D,
+ 0xFF08, 0xFF09, 0xFF09, 0xFF08, 0xFF3B, 0xFF3D, 0xFF3D, 0xFF3B,
+ 0xFF5B, 0xFF5D, 0xFF5D, 0xFF5B, 0xFF62, 0xFF63, 0xFF63, 0xFF62,
+};
+
+/*
+ * Number of elements in the table, i.e. twice the number of pairs.
+ */
+static int _symmetric_pairs_size =
+sizeof(_symmetric_pairs)/sizeof(_symmetric_pairs[0]);
+
+/*
+ * Return the counterpart of `c' from the symmetric pairs table, or `c'
+ * itself when the table has no entry for it.
+ */
+static unsigned long
+#ifdef __STDC__
+_ucsymmetric_pair(unsigned long c)
+#else
+_ucsymmetric_pair(c)
+unsigned long c;
+#endif
+{
+    int k = 0;
+
+    /*
+     * Keys sit at the even indexes, their replacements right behind them.
+     */
+    while (k < _symmetric_pairs_size) {
+        if (c == _symmetric_pairs[k])
+            return _symmetric_pairs[k + 1];
+        k += 2;
+    }
+    return c;
+}
+
+/*
+ * This routine creates a new run, copies the text into it, links it into the
+ * logical text order chain and returns it to the caller to be linked into
+ * the visual text order chain.
+ *
+ * str       - string the run belongs to; the run is appended to its
+ *             logical chain.
+ * src       - source text.
+ * start,end - the run covers src[start] up to, but not including, src[end].
+ *             The callers in this file only pass non-empty ranges.
+ * direction - UCPGBA_RTL or UCPGBA_LTR.
+ *
+ * An RTL run stores its characters in reverse order, with symmetric
+ * characters and '<' and '>' replaced by their counterparts.  In both
+ * directions positions[i] is the index in `src' that chars[i] came from.
+ * The characters and the positions share one allocation, so only
+ * run->chars is to be freed.
+ */
+static ucrun_t *
+#ifdef __STDC__
+_add_run(ucstring_t *str, unsigned long *src,
+         unsigned long start, unsigned long end, int direction)
+#else
+_add_run(str, src, start, end, direction)
+ucstring_t *str;
+unsigned long *src, start, end;
+int direction;
+#endif
+{
+    long i, t;
+    ucrun_t *run;
+
+    run = (ucrun_t *) malloc(sizeof(ucrun_t));
+
+    /*
+     * Start with all four chain links cleared.  The logical links have to
+     * be cleared as well: the code at the end of this routine sets only
+     * `logical_prev', and only when the run is not the first one.
+     */
+    run->visual_next = run->visual_prev = 0;
+    run->logical_next = run->logical_prev = 0;
+    run->direction = direction;
+
+    run->cursor = ~0;
+
+    /*
+     * One block holds the characters followed by their positions.
+     */
+    run->chars = (unsigned long *)
+        malloc(sizeof(unsigned long) * ((end - start) << 1));
+    run->positions = run->chars + (end - start);
+
+    run->source = src;
+    run->start = start;
+    run->end = end;
+
+    if (direction == UCPGBA_RTL) {
+        /*
+         * Copy the source text into the run in reverse order and select
+         * replacements for the pairwise punctuation and the <> characters.
+         */
+        for (i = 0, t = end - 1; start < end; start++, t--, i++) {
+            run->positions[i] = t;
+            if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>')
+                run->chars[i] = _ucsymmetric_pair(src[t]);
+            else
+                run->chars[i] = src[t];
+        }
+    } else {
+        /*
+         * Copy the source text into the run directly.
+         */
+        for (i = start; i < end; i++) {
+            run->positions[i - start] = i;
+            run->chars[i - start] = src[i];
+        }
+    }
+
+    /*
+     * Add the run to the logical list for cursor traversal.
+     */
+    if (str->logical_first == 0)
+        str->logical_first = str->logical_last = run;
+    else {
+        run->logical_prev = str->logical_last;
+        str->logical_last->logical_next = run;
+        str->logical_last = run;
+    }
+
+    return run;
+}
+
+/*
+ * Link a run created for an RTL segment into the visual chain of `str'.
+ *
+ * In a string with overall LTR direction the run is spliced in directly
+ * behind `lrun'; in any other string it becomes the new first run.  A
+ * string without visual runs simply gets `run' as its only one.
+ */
+static void
+#ifdef __STDC__
+_uclink_segment_run(ucstring_t *str, ucrun_t *lrun, ucrun_t *run)
+#else
+_uclink_segment_run(str, lrun, run)
+ucstring_t *str;
+ucrun_t *lrun, *run;
+#endif
+{
+    if (str->visual_first == 0) {
+        str->visual_first = str->visual_last = run;
+        return;
+    }
+
+    if (str->direction == UCPGBA_LTR) {
+        run->visual_prev = lrun;
+        run->visual_next = lrun->visual_next;
+        if (lrun->visual_next != 0)
+            lrun->visual_next->visual_prev = run;
+        lrun->visual_next = run;
+        if (lrun == str->visual_last)
+            str->visual_last = run;
+    } else {
+        run->visual_next = str->visual_first;
+        str->visual_first->visual_prev = run;
+        str->visual_first = run;
+    }
+}
+
+/*
+ * Split the RTL segment source[start] .. source[end - 1] into runs and add
+ * them to `str'.  Stretches of RTL and neutral characters become RTL runs,
+ * stretches of digits and the weak characters attached to them become LTR
+ * runs.
+ */
+static void
+#ifdef __STDC__
+_ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start,
+                   unsigned long end)
+#else
+_ucadd_rtl_segment(str, source, start, end)
+ucstring_t *str;
+unsigned long *source, start, end;
+#endif
+{
+    unsigned long from, upto;
+    ucrun_t *run, *lrun;
+
+    /*
+     * Remember the run that was last in visual order when this segment
+     * started; in strings with overall LTR direction all runs of the
+     * segment are spliced in directly behind it.  It is only looked at
+     * when the string already has visual runs.
+     */
+    lrun = str->visual_last;
+
+    from = upto = start;
+    while (from < end) {
+        /*
+         * Collect RTL and neutral characters into an RTL run.
+         */
+        while (upto < end && ISRTL_NEUTRAL(source[upto]))
+            upto++;
+
+        if (upto > from) {
+            run = _add_run(str, source, from, upto, UCPGBA_RTL);
+            _uclink_segment_run(str, lrun, run);
+        }
+
+        /*
+         * Now handle the weak sequences such that multiple non-digit groups
+         * are kept together appropriately and added as RTL sequences.  The
+         * scan stops at a non-digit that is not followed by a digit.
+         */
+        from = upto;
+        while (upto < end && ISWEAKSPECIAL(source[upto])) {
+            if (!ISDIGITSPECIAL(source[upto]) &&
+                (upto + 1 == end || !ISDIGITSPECIAL(source[upto + 1])))
+                break;
+            upto++;
+        }
+
+        if (upto > from) {
+            run = _add_run(str, source, from, upto, UCPGBA_LTR);
+            _uclink_segment_run(str, lrun, run);
+        }
+
+        /*
+         * Collect all weak non-digit sequences for an RTL segment.  These
+         * will appear as part of the next RTL segment or will be added as
+         * an RTL segment by themselves.
+         */
+        from = upto;
+        while (upto < end && ucisweak(source[upto]) &&
+               !ucisdigit(source[upto]))
+            upto++;
+    }
+
+    /*
+     * Capture any weak non-digit sequences that occur at the end of the RTL
+     * run.
+     */
+    if (upto > from) {
+        run = _add_run(str, source, from, upto, UCPGBA_RTL);
+        _uclink_segment_run(str, lrun, run);
+    }
+}
+
+/*
+ * Add source[start] .. source[end - 1] to `str' as a single LTR run.  In a
+ * string with overall LTR direction the run goes to the end of the visual
+ * chain, otherwise to its front.
+ */
+static void
+#ifdef __STDC__
+_ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start,
+                   unsigned long end)
+#else
+_ucadd_ltr_segment(str, source, start, end)
+ucstring_t *str;
+unsigned long *source, start, end;
+#endif
+{
+    ucrun_t *run;
+
+    run = _add_run(str, source, start, end, UCPGBA_LTR);
+
+    /*
+     * The first visual run is both ends of the chain.
+     */
+    if (str->visual_first == 0) {
+        str->visual_first = str->visual_last = run;
+        return;
+    }
+
+    if (str->direction != UCPGBA_LTR) {
+        /*
+         * Put the run in front of the visual chain.
+         */
+        run->visual_next = str->visual_first;
+        str->visual_first->visual_prev = run;
+        str->visual_first = run;
+    } else {
+        /*
+         * Put the run at the end of the visual chain.
+         */
+        run->visual_prev = str->visual_last;
+        str->visual_last->visual_next = run;
+        str->visual_last = run;
+    }
+}
+
+/*
+ * Create a string object for source[start] .. source[end - 1] and split
+ * the text into runs of uniform direction.
+ *
+ * source            - the text; it is referenced, not copied, by the string.
+ * start, end        - the part of `source' to use; `end' is exclusive.
+ * default_direction - UCPGBA_LTR or UCPGBA_RTL; used when the text has no
+ *                     character with strong directionality.
+ * cursor_motion     - stored in the string as its cursor motion mode.
+ *
+ * Returns the new string, to be released with ucstring_free(), or 0 if no
+ * memory is available for it.  An empty range yields a string without
+ * runs and without a cursor run.
+ */
+ucstring_t *
+#ifdef __STDC__
+ucstring_create(unsigned long *source, unsigned long start, unsigned long end,
+                int default_direction, int cursor_motion)
+#else
+ucstring_create(source, start, end, default_direction, cursor_motion)
+unsigned long *source, start, end;
+int default_direction, cursor_motion;
+#endif
+{
+    int rtl_first, tried_ltr;
+    unsigned long s, e, mark;
+    ucstring_t *str;
+
+    str = (ucstring_t *) malloc(sizeof(ucstring_t));
+    if (str == 0)
+        return 0;
+
+    /*
+     * Set the initial values.  The direction starts out as the default so
+     * that it is defined for empty strings as well.
+     */
+    str->cursor_motion = cursor_motion;
+    str->logical_first = str->logical_last = 0;
+    str->visual_first = str->visual_last = str->cursor = 0;
+    str->source = source;
+    str->start = start;
+    str->end = end;
+    str->direction = default_direction;
+
+    /*
+     * If the length of the string is 0, then just return it at this point.
+     */
+    if (start == end)
+        return str;
+
+    /*
+     * This flag indicates whether the collection loop for RTL is called
+     * before the LTR loop the first time.
+     */
+    rtl_first = 0;
+
+    /*
+     * Look for the first character in the string that has strong
+     * directionality.
+     */
+    for (s = start; s < end && !ucisstrong(source[s]); s++) ;
+
+    /*
+     * If the string contains no characters with strong directionality, the
+     * default direction set above stays in effect.
+     */
+    if (s < end)
+        str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR;
+
+    if (str->direction == UCPGBA_RTL)
+        /*
+         * Set the flag that causes the RTL collection loop to run first.
+         */
+        rtl_first = 1;
+
+    /*
+     * This loop now separates the string into runs based on directionality.
+     * It starts at `start', the same place the search for the first strong
+     * character started from.
+     */
+    for (s = e = start; s < end; s = e) {
+        mark = e;
+        tried_ltr = !rtl_first;
+
+        if (!rtl_first) {
+            /*
+             * Determine the next run of LTR text.  In a string that is not
+             * LTR overall, trailing weak and neutral characters are left
+             * for the RTL run that follows.
+             */
+            while (e < end && ISLTR_LTR(source[e]))
+                e++;
+            if (str->direction != UCPGBA_LTR) {
+                while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
+                    e--;
+            }
+
+            /*
+             * Add the LTR segment to the string.
+             */
+            if (e > s)
+                _ucadd_ltr_segment(str, source, s, e);
+        }
+
+        /*
+         * Determine the next run of RTL text.  In a string that is not RTL
+         * overall, trailing weak and neutral characters are left for the
+         * LTR run that follows.
+         */
+        s = e;
+        while (e < end && ISRTL_RTL(source[e]))
+            e++;
+        if (str->direction != UCPGBA_RTL) {
+            while (e > s && ISWEAK_NEUTRAL(source[e - 1]))
+                e--;
+        }
+
+        /*
+         * Add the RTL segment to the string.
+         */
+        if (e > s)
+            _ucadd_rtl_segment(str, source, s, e);
+
+        /*
+         * Clear the flag that allowed the RTL collection loop to run first
+         * for strings with overall RTL directionality.
+         */
+        rtl_first = 0;
+
+        /*
+         * If both collection loops ran and neither accepted the character
+         * at `mark', it has none of the properties tested above and the
+         * loop would never get past it.  Put it in a run of its own so the
+         * text stays complete and the scan moves on.
+         */
+        if (tried_ltr && e == mark) {
+            e = mark + 1;
+            _ucadd_ltr_segment(str, source, mark, e);
+        }
+    }
+
+    /*
+     * Set up the initial cursor run.  The cursor of an RTL run starts at
+     * the length of the run, the cursor of any other run at 0.
+     */
+    str->cursor = str->logical_first;
+    if (str->cursor != 0)
+        str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ?
+            str->cursor->end - str->cursor->start : 0;
+
+    return str;
+}
+
+/*
+ * Release a string together with all runs on its visual chain.  A null
+ * pointer is accepted and ignored.
+ */
+void
+#ifdef __STDC__
+ucstring_free(ucstring_t *s)
+#else
+ucstring_free(s)
+ucstring_t *s;
+#endif
+{
+    ucrun_t *run, *next;
+
+    if (s == 0)
+        return;
+
+    run = s->visual_first;
+    while (run != 0) {
+        /*
+         * Pick up the link before the run goes away.
+         */
+        next = run->visual_next;
+
+        /*
+         * Only non-empty runs have their character block released.
+         */
+        if (run->end > run->start)
+            free((char *) run->chars);
+        free((char *) run);
+
+        run = next;
+    }
+
+    free((char *) s);
+}
+
+/*
+ * Change the cursor motion mode of a string.  Returns the mode that was in
+ * effect before, or -1 if `str' is a null pointer.
+ */
+int
+#ifdef __STDC__
+ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion)
+#else
+ucstring_set_cursor_motion(str, cursor_motion)
+ucstring_t *str;
+int cursor_motion;
+#endif
+{
+    int n;
+
+    if (str == 0)
+        return -1;
+
+    n = str->cursor_motion;
+    str->cursor_motion = cursor_motion;
+    return n;
+}
+
+/*
+ * Move the cursor of `str' up to `count' steps to the right in visual
+ * order, going on to the next visual run at the end of the current one.
+ *
+ * Returns 0 when there is nothing to move in, i.e. `str' is a null pointer
+ * or the string has no cursor run (which is the case for a string created
+ * from an empty range).  When the right end of the string is reached
+ * before `count' steps were made, returns 1 if the cursor moved at all and
+ * 0 if it did not.  Returns 1 in all other cases.
+ */
+static int
+#ifdef __STDC__
+_ucstring_visual_cursor_right(ucstring_t *str, int count)
+#else
+_ucstring_visual_cursor_right(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    int cnt = count;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0 || str->cursor == 0)
+        return 0;
+
+    cursor = str->cursor;
+    while (cnt > 0) {
+        size = cursor->end - cursor->start;
+        if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) ||
+            cursor->cursor + 1 > size) {
+            /*
+             * If the next run is NULL, then the cursor is already on the
+             * far right end.
+             */
+            if (cursor->visual_next == 0)
+                /*
+                 * If movement occurred, then report it.
+                 */
+                return (cnt != count);
+
+            /*
+             * Move to the next run.  The cursor of an RTL run starts at
+             * -1 so that the next step to the right takes it to 0.
+             */
+            str->cursor = cursor = cursor->visual_next;
+            cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0;
+        } else
+            cursor->cursor++;
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor up to `count' steps to the right using logical motion.
+ *
+ * The direction in which the run list is walked depends on the overall
+ * direction of the string: for an RTL string the cursor travels towards
+ * `logical_first', for an LTR string towards `logical_last'.  Inside a
+ * run the position is incremented or decremented depending on the
+ * direction of that run.
+ *
+ * Returns 0 when `str' is NULL.  When the end of the run list is reached
+ * before all steps are used, returns 1 if at least one step was taken and
+ * 0 otherwise.  Returns 1 in every other case.
+ *
+ * Every time the cursor hops to another run, `size' is recomputed for
+ * that run before the entry position is derived from it.
+ */
+static int
+#ifdef __STDC__
+_ucstring_logical_cursor_right(ucstring_t *str, int count)
+#else
+_ucstring_logical_cursor_right(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    int cnt = count;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0)
+        return 0;
+
+    cursor = str->cursor;
+    while (cnt > 0) {
+        size = cursor->end - cursor->start;
+        if (str->direction == UCPGBA_RTL) {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor + 1 == size) {
+                    if (cursor == str->logical_first)
+                        /*
+                         * Already at the beginning of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_prev;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                    cursor->cursor++;
+            } else {
+                if (cursor->cursor == 0) {
+                    if (cursor == str->logical_first)
+                        /*
+                         * At the beginning of the string already.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_prev;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                    cursor->cursor--;
+            }
+        } else {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor == 0) {
+                    if (cursor == str->logical_last)
+                        /*
+                         * Already at the end of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                    cursor->cursor--;
+            } else {
+                if (cursor->cursor + 1 > size) {
+                    if (cursor == str->logical_last)
+                        /*
+                         * Already at the end of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    /*
+                     * The entry position of an RTL run depends on the
+                     * length of the run being entered, not the one left.
+                     */
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                    cursor->cursor++;
+            }
+        }
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor `count' steps to the right, dispatching on the cursor
+ * motion mode stored in the string: visual motion when the mode equals
+ * UCPGBA_CURSOR_VISUAL, logical motion for any other value.
+ *
+ * Returns 0 for a NULL string, otherwise the result of the selected
+ * motion routine.
+ */
+int
+#ifdef __STDC__
+ucstring_cursor_right(ucstring_t *str, int count)
+#else
+ucstring_cursor_right(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    if (str == 0)
+        return 0;
+
+    if (str->cursor_motion == UCPGBA_CURSOR_VISUAL)
+        return _ucstring_visual_cursor_right(str, count);
+
+    return _ucstring_logical_cursor_right(str, count);
+}
+
+/*
+ * Move the cursor up to `count' steps to the left in visual order.
+ *
+ * Each loop iteration either decrements the position inside the current
+ * run or hops to the previous visual run; both consume one step.
+ *
+ * Returns 0 when `str' is NULL.  When the first visual run is exhausted
+ * before all steps are used, returns 1 if at least one step was taken and
+ * 0 otherwise.  Returns 1 in every other case, including `count' <= 0.
+ *
+ * NOTE(review): assumes str->cursor is non-NULL -- confirm against
+ * ucstring_create() for empty input.
+ */
+static int
+#ifdef __STDC__
+_ucstring_visual_cursor_left(ucstring_t *str, int count)
+#else
+_ucstring_visual_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+ int cnt = count;
+ unsigned long size;
+ ucrun_t *cursor;
+
+ if (str == 0)
+ return 0;
+
+ cursor = str->cursor;
+ while (cnt > 0) {
+ size = cursor->end - cursor->start;
+ /*
+ * The run is exhausted on the left when an LTR run sits at index 0,
+ * or when the position is already below 0 (the second test is true
+ * exactly when cursor->cursor < 0).
+ */
+ if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) ||
+ cursor->cursor - 1 < -1) {
+ /*
+ * If the preceding run is NULL, then the cursor is already on the
+ * far left end.
+ */
+ if (cursor->visual_prev == 0)
+ /*
+ * If movement occurred, then report it.
+ */
+ return (cnt != count);
+
+ /*
+ * Move to the previous run.  An RTL run is entered at its length,
+ * an LTR run at its last index.
+ */
+ str->cursor = cursor = cursor->visual_prev;
+ size = cursor->end - cursor->start;
+ cursor->cursor = (cursor->direction == UCPGBA_RTL) ?
+ size : size - 1;
+ } else
+ cursor->cursor--;
+ cnt--;
+ }
+ return 1;
+}
+
+/*
+ * Move the cursor up to `count' steps to the left using logical motion.
+ *
+ * The direction in which the run list is walked depends on the overall
+ * direction of the string: for an RTL string the cursor travels towards
+ * `logical_last', for an LTR string towards `logical_first'.  Inside a
+ * run the position is incremented or decremented depending on the
+ * direction of that run.
+ *
+ * Returns 0 when `str' is NULL.  When the end of the run list is reached
+ * before all steps are used, returns 1 if at least one step was taken and
+ * 0 otherwise.  Returns 1 in every other case.
+ *
+ * Every time the cursor hops to another run, `size' is recomputed for
+ * that run before the entry position is derived from it.
+ */
+static int
+#ifdef __STDC__
+_ucstring_logical_cursor_left(ucstring_t *str, int count)
+#else
+_ucstring_logical_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    int cnt = count;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0)
+        return 0;
+
+    cursor = str->cursor;
+    while (cnt > 0) {
+        size = cursor->end - cursor->start;
+        if (str->direction == UCPGBA_RTL) {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor == -1) {
+                    if (cursor == str->logical_last)
+                        /*
+                         * Already at the end of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                    cursor->cursor--;
+            } else {
+                if (cursor->cursor + 1 > size) {
+                    if (cursor == str->logical_last)
+                        /*
+                         * At the end of the string already.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_next;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        0 : size - 1;
+                } else
+                    cursor->cursor++;
+            }
+        } else {
+            if (cursor->direction == UCPGBA_RTL) {
+                if (cursor->cursor + 1 == size) {
+                    if (cursor == str->logical_first)
+                        /*
+                         * Already at the beginning of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_prev;
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                    cursor->cursor++;
+            } else {
+                if (cursor->cursor == 0) {
+                    if (cursor == str->logical_first)
+                        /*
+                         * Already at the beginning of the string.
+                         */
+                        return (cnt != count);
+
+                    str->cursor = cursor = cursor->logical_prev;
+                    /*
+                     * The entry position of an LTR run depends on the
+                     * length of the run being entered, not the one left.
+                     */
+                    size = cursor->end - cursor->start;
+                    cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
+                        size : 0;
+                } else
+                    cursor->cursor--;
+            }
+        }
+        cnt--;
+    }
+    return 1;
+}
+
+/*
+ * Move the cursor `count' steps to the left, dispatching on the cursor
+ * motion mode stored in the string: visual motion when the mode equals
+ * UCPGBA_CURSOR_VISUAL, logical motion for any other value.
+ *
+ * Returns 0 for a NULL string, otherwise the result of the selected
+ * motion routine.
+ */
+int
+#ifdef __STDC__
+ucstring_cursor_left(ucstring_t *str, int count)
+#else
+ucstring_cursor_left(str, count)
+ucstring_t *str;
+int count;
+#endif
+{
+    if (str == 0)
+        return 0;
+
+    if (str->cursor_motion == UCPGBA_CURSOR_VISUAL)
+        return _ucstring_visual_cursor_left(str, count);
+
+    return _ucstring_logical_cursor_left(str, count);
+}
+
+/*
+ * Report the direction of the run holding the cursor and the position of
+ * the cursor in the original source text.
+ *
+ * Nothing is written when `str', `direction' or `position' is NULL, or
+ * when the string has no cursor run.
+ *
+ * The cursor index of a run may take two boundary values in addition to
+ * a plain index into `positions': the run length and -1.  Those are
+ * mapped onto the run's `start'/`end' offsets (or the last recorded
+ * position) depending on the run direction.
+ */
+void
+#ifdef __STDC__
+ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
+#else
+ucstring_cursor_info(str, direction, position)
+ucstring_t *str;
+int *direction;
+unsigned long *position;
+#endif
+{
+    long c;
+    unsigned long size;
+    ucrun_t *cursor;
+
+    if (str == 0 || direction == 0 || position == 0)
+        return;
+
+    cursor = str->cursor;
+    if (cursor == 0)
+        return;
+
+    *direction = cursor->direction;
+
+    c = cursor->cursor;
+    size = cursor->end - cursor->start;
+
+    /*
+     * Compare as unsigned only once the index is known to be non-negative,
+     * so that -1 can never be mistaken for a very large run length.
+     */
+    if (c >= 0 && (unsigned long) c == size)
+        *position = (cursor->direction == UCPGBA_RTL) ?
+            cursor->start : cursor->positions[c - 1];
+    else if (c == -1)
+        *position = (cursor->direction == UCPGBA_RTL) ?
+            cursor->end : cursor->start;
+    else
+        *position = cursor->positions[c];
+}
--- /dev/null
+/*
+ * Copyright 1999 Computing Research Labs, New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _h_ucpgba
+#define _h_ucpgba
+
+/*
+ * $Id: ucpgba.h,v 1.4 1999/11/19 15:24:30 mleisher Exp $
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Prototype helper: with an ANSI compiler __(x) expands to the parameter
+ * list `x'; otherwise it expands to an empty list `()'.  Declarations in
+ * this header wrap their parameter lists in double parentheses for it.
+ */
+#undef __
+#ifdef __STDC__
+#define __(x) x
+#else
+#define __(x) ()
+#endif
+
+/***************************************************************************
+ *
+ * Macros and types.
+ *
+ ***************************************************************************/
+
+/*
+ * These are the direction values that can appear in render runs and render
+ * strings.
+ */
+#define UCPGBA_LTR 0
+#define UCPGBA_RTL 1
+
+/*
+ * These are the flags for cursor motion.  ucstring_cursor_right() and
+ * ucstring_cursor_left() select visual motion only when the stored value
+ * equals UCPGBA_CURSOR_VISUAL; any other value selects logical motion.
+ */
+#define UCPGBA_CURSOR_VISUAL 0
+#define UCPGBA_CURSOR_LOGICAL 1
+
+/*
+ * This structure is used to contain runs of text in a particular direction.
+ *
+ * Runs are linked into two doubly linked lists, one in visual order and
+ * one in logical order.
+ *
+ * `cursor' is an index relative to the run.  Besides plain indexes the
+ * cursor motion routines also store the boundary values -1 and the run
+ * length (end - start) in it.
+ *
+ * ucstring_free() releases `chars' (for non-empty runs) and the run
+ * itself; it does not release `positions' separately.
+ */
+typedef struct _ucrun_t {
+ struct _ucrun_t *visual_prev; /* Pointer to the previous visual run. */
+ struct _ucrun_t *visual_next; /* Pointer to the next visual run. */
+
+ struct _ucrun_t *logical_prev; /* Pointer to the previous logical run. */
+ struct _ucrun_t *logical_next; /* Pointer to the next logical run. */
+
+ int direction; /* Direction of the run. */
+
+ long cursor; /* Position of "cursor" in the string. */
+
+ unsigned long *chars; /* List of characters for the run. */
+ unsigned long *positions; /* List of original positions in source. */
+
+ unsigned long *source; /* The source string. */
+ unsigned long start; /* Beginning offset in the source string. */
+ unsigned long end; /* Ending offset in the source string. */
+} ucrun_t;
+
+/*
+ * This represents a string of runs rendered up to a point that is not
+ * platform specific.
+ *
+ * `direction' takes one of the UCPGBA_LTR/UCPGBA_RTL values and
+ * `cursor_motion' one of the UCPGBA_CURSOR_* values.  The same runs are
+ * reachable through both the logical and the visual list heads.
+ */
+typedef struct _ucstring_t {
+ int direction; /* Overall direction of the string. */
+
+ int cursor_motion; /* Logical or visual cursor motion flag. */
+
+ ucrun_t *cursor; /* The run containing the "cursor." */
+
+ ucrun_t *logical_first; /* First run in the logical order. */
+ ucrun_t *logical_last; /* Last run in the logical order. */
+
+ ucrun_t *visual_first; /* First run in the visual order. */
+ ucrun_t *visual_last; /* Last run in the visual order. */
+
+ unsigned long *source; /* The source string. */
+ unsigned long start; /* The beginning offset in the source. */
+ unsigned long end; /* The ending offset in the source. */
+} ucstring_t;
+
+/***************************************************************************
+ *
+ * API
+ *
+ ***************************************************************************/
+
+/*
+ * This creates and reorders the specified substring using the
+ * "Pretty Good Bidi Algorithm." A default direction is provided for cases
+ * of a string containing no strong direction characters and the default
+ * cursor motion should be provided.
+ *
+ * The result is released with ucstring_free().
+ */
+extern ucstring_t *ucstring_create __((unsigned long *source,
+ unsigned long start,
+ unsigned long end,
+ int default_direction,
+ int cursor_motion));
+/*
+ * This releases the string, including all of its runs.  A NULL string is
+ * ignored.
+ */
+extern void ucstring_free __((ucstring_t *string));
+
+/*
+ * This changes the cursor motion flag for the string.
+ *
+ * The previous cursor motion flag is returned, or -1 if the string is
+ * NULL.
+ */
+extern int ucstring_set_cursor_motion __((ucstring_t *string,
+ int cursor_motion));
+
+/*
+ * This function will move the cursor to the right by up to `count' steps,
+ * depending on the type of cursor motion that was specified for the string.
+ *
+ * A 0 is returned if no cursor motion is performed, otherwise a
+ * 1 is returned.
+ */
+extern int ucstring_cursor_right __((ucstring_t *string, int count));
+
+/*
+ * This function will move the cursor to the left by up to `count' steps,
+ * depending on the type of cursor motion that was specified for the string.
+ *
+ * A 0 is returned if no cursor motion is performed, otherwise a
+ * 1 is returned.
+ */
+extern int ucstring_cursor_left __((ucstring_t *string, int count));
+
+/*
+ * This routine retrieves the direction of the run containing the cursor
+ * and the actual position in the original text string.  Nothing is
+ * stored if any of the three arguments is NULL.
+ */
+extern void ucstring_cursor_info __((ucstring_t *string, int *direction,
+ unsigned long *position));
+
+#undef __
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_ucpgba */
--- /dev/null
+.\"
+.\" $Id: ucpgba.man,v 1.1 1999/11/19 16:08:34 mleisher Exp $
+.\"
+.TH ucpgba 3 "19 November 1999"
+.SH NAME
+ucpgba \- functions for doing bidirectional reordering of Unicode text and
+logical and visual cursor motion
+
+.SH SYNOPSIS
+.nf
+#include <ucdata.h>
+#include <ucpgba.h>
+
+ucstring_t *ucstring_create(unsigned long *source, unsigned long start,
+ unsigned long end, int default_direction,
+ int cursor_motion)
+.sp
+void ucstring_free(ucstring_t *string)
+.sp
+int ucstring_set_cursor_motion(ucstring_t *string, int cursor_motion)
+.sp
+int ucstring_cursor_right(ucstring_t *string, int count)
+.sp
+int ucstring_cursor_left(ucstring_t *string, int count)
+.sp
+void ucstring_cursor_info(ucstring_t *string, int *direction,
+ unsigned long *position)
+
+.SH DESCRIPTION
+.TP 4
+.BR Macros
+UCPGBA_LTR
+.br
+UCPGBA_RTL
+.br
+UCPGBA_CURSOR_VISUAL
+.br
+UCPGBA_CURSOR_LOGICAL
+
+.TP 4
+.BR ucstring_create()
+This function will create a reordered string by using the implicit
+directionality of the characters in the specified substring.
+.sp
+The `default_direction' parameter should be one of UCPGBA_LTR or UCPGBA_RTL
+and is used only in cases where a string contains no characters with strong
+directionality.
+.sp
+The `cursor_motion' parameter should be one of UCPGBA_CURSOR_VISUAL or
+UCPGBA_CURSOR_LOGICAL, and is used to specify the initial cursor motion
+behavior. This behavior can be switched at any time using
+ucstring_set_cursor_motion().
+
+.TP 4
+.BR ucstring_free()
+This function will deallocate the memory used by the string, including the
+string itself.
+
+.TP 4
+.BR ucstring_cursor_info()
+This function will return the text position of the internal cursor and the
+directionality of the text at that position. The position returned is the
+original text position of the character.
+
+.TP 4
+.BR ucstring_set_cursor_motion()
+This function will change the cursor motion type and return the previous
+cursor motion type.
+
+.TP 4
+.BR ucstring_cursor_right()
+This function will move the internal cursor to the right according to the
+type of cursor motion set for the string.
+.sp
+If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
+
+.TP 4
+.BR ucstring_cursor_left()
+This function will move the internal cursor to the left according to the
+type of cursor motion set for the string.
+.sp
+If no cursor motion is performed, it returns 0. Otherwise it will return a 1.
+
+.SH "SEE ALSO"
+ucdata(3)
+
+.SH ACKNOWLEDGMENTS
+These are people who have helped with patches or alerted me about problems.
+
+.SH AUTHOR
+Mark Leisher
+.br
+Computing Research Lab
+.br
+New Mexico State University
+.br
+Email: mleisher@crl.nmsu.edu
--- /dev/null
+#
+# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
+#
+# Copyright 1997, 1998, 1999 Computing Research Labs,
+# New Mexico State University
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+
+ Unicode and Regular Expressions
+ Version 0.5
+
+This is a simple regular expression package for matching against Unicode text
+in UCS2 form. The implementation of this URE package is a variation on the
+RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
+Hopkins' algorithm had the virtue of being very simple, so it was used as a
+model.
+
+---------------------------------------------------------------------------
+
+Assumptions:
+
+ o Regular expression and text already normalized.
+
+ o Conversion to lower case assumes a 1-1 mapping.
+
+Definitions:
+
+ Separator - any one of U+2028, U+2029, '\n', '\r'.
+
+Operators:
+ . - match any character.
+ * - match zero or more of the last subexpression.
+ + - match one or more of the last subexpression.
+ ? - match zero or one of the last subexpression.
+ () - subexpression grouping.
+
+ Notes:
+
+ o The "." operator normally does not match separators, but a flag is
+ available for the ure_exec() function that will allow this operator to
+ match a separator.
+
+Literals and Constants:
+
+ c - literal UCS2 character.
+ \x.... - hexadecimal number of up to 4 digits.
+ \X.... - hexadecimal number of up to 4 digits.
+ \u.... - hexadecimal number of up to 4 digits.
+ \U.... - hexadecimal number of up to 4 digits.
+
+Character classes:
+
+ [...] - Character class.
+ [^...] - Negated character class.
+ \pN1,N2,...,Nn - Character properties class.
+ \PN1,N2,...,Nn - Negated character properties class.
+
+ POSIX character classes recognized:
+
+ :alnum:
+ :alpha:
+ :cntrl:
+ :digit:
+ :graph:
+ :lower:
+ :print:
+ :punct:
+ :space:
+ :upper:
+ :xdigit:
+
+ Notes:
+
+ o Character property classes are \p or \P followed by a comma separated
+ list of integers between 1 and 32. These integers are references to
+ the following character properties:
+
+ N Character Property
+ --------------------------
+ 1 _URE_NONSPACING
+ 2 _URE_COMBINING
+ 3 _URE_NUMDIGIT
+ 4 _URE_NUMOTHER
+ 5 _URE_SPACESEP
+ 6 _URE_LINESEP
+ 7 _URE_PARASEP
+ 8 _URE_CNTRL
+ 9 _URE_PUA
+ 10 _URE_UPPER
+ 11 _URE_LOWER
+ 12 _URE_TITLE
+ 13 _URE_MODIFIER
+ 14 _URE_OTHERLETTER
+ 15 _URE_DASHPUNCT
+ 16 _URE_OPENPUNCT
+ 17 _URE_CLOSEPUNCT
+ 18 _URE_OTHERPUNCT
+ 19 _URE_MATHSYM
+ 20 _URE_CURRENCYSYM
+ 21 _URE_OTHERSYM
+ 22 _URE_LTR
+ 23 _URE_RTL
+ 24 _URE_EURONUM
+ 25 _URE_EURONUMSEP
+ 26 _URE_EURONUMTERM
+ 27 _URE_ARABNUM
+ 28 _URE_COMMONSEP
+ 29 _URE_BLOCKSEP
+ 30 _URE_SEGMENTSEP
+ 31 _URE_WHITESPACE
+ 32 _URE_OTHERNEUT
+
+ o Character classes can contain literals, constants, and character
+ property classes. Example:
+
+ [abc\U10A\p1,3,4]
+
+---------------------------------------------------------------------------
+
+Before using URE
+----------------
+Before URE is used, two functions need to be created. One to check if a
+character matches a set of URE character properties, and one to convert a
+character to lower case.
+
+Stubs for these functions are located in the urestubs.c file.
+
+Using URE
+---------
+
+Sample pseudo-code fragment.
+
+ ure_buffer_t rebuf;
+ ure_dfa_t dfa;
+ ucs2_t *re, *text;
+ unsigned long relen, textlen;
+ unsigned long match_start, match_end;
+
+ /*
+ * Allocate the dynamic storage needed to compile regular expressions.
+ */
+ rebuf = ure_buffer_create();
+
+ for each regular expression in a list {
+ re = next regular expression;
+ relen = length(re);
+
+ /*
+ * Compile the regular expression with the case insensitive flag
+ * turned on.
+ */
+ dfa = ure_compile(re, relen, 1, rebuf);
+
+ /*
+ * Look for the first match in some text. The matching will be done
+ * in a case insensitive manner because the expression was compiled
+ * with the case insensitive flag on.
+ */
+ if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
+ printf("MATCH: %ld %ld\n", match_start, match_end);
+
+ /*
+ * Look for the first match in some text, ignoring non-spacing
+ * characters.
+ */
+ if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
+ &match_start, &match_end))
+ printf("MATCH: %ld %ld\n", match_start, match_end);
+
+ /*
+ * Free the DFA.
+ */
+ ure_free_dfa(dfa);
+ }
+
+ /*
+ * Free the dynamic storage used for compiling the expressions.
+ */
+ ure_free_buffer(rebuf);
+
+---------------------------------------------------------------------------
+
+Mark Leisher <mleisher@crl.nmsu.edu>
+29 March 1997
+
+===========================================================================
+
+CHANGES
+-------
+
+Version: 0.5
+Date : 21 September 1999
+==========================
+ 1. Added copyright stuff and put in CVS.
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint
+static char rcsid[] = "$Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $";
+#endif
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include "ure.h"
+
+/*
+ * Flags used internally in the DFA.
+ *
+ * _URE_DFA_CASEFOLD is also tested against the `flags' field of the
+ * compile buffer by _ure_add_range() to decide whether range endpoints
+ * are converted to lower case.
+ */
+#define _URE_DFA_CASEFOLD 0x01
+#define _URE_DFA_BLANKLINE 0x02
+
+/*
+ * Table mapping a character property number to its property flag.
+ *
+ * _ure_prop_list() indexes this table directly with the numbers parsed
+ * from a property list, so entry N holds the flag for property number N.
+ * Entry 0 is a zero placeholder; valid property numbers are 1 through 32,
+ * which makes 33 entries in total.
+ */
+static unsigned long cclass_flags[] = {
+ 0,
+ _URE_NONSPACING,
+ _URE_COMBINING,
+ _URE_NUMDIGIT,
+ _URE_NUMOTHER,
+ _URE_SPACESEP,
+ _URE_LINESEP,
+ _URE_PARASEP,
+ _URE_CNTRL,
+ _URE_PUA,
+ _URE_UPPER,
+ _URE_LOWER,
+ _URE_TITLE,
+ _URE_MODIFIER,
+ _URE_OTHERLETTER,
+ _URE_DASHPUNCT,
+ _URE_OPENPUNCT,
+ _URE_CLOSEPUNCT,
+ _URE_OTHERPUNCT,
+ _URE_MATHSYM,
+ _URE_CURRENCYSYM,
+ _URE_OTHERSYM,
+ _URE_LTR,
+ _URE_RTL,
+ _URE_EURONUM,
+ _URE_EURONUMSEP,
+ _URE_EURONUMTERM,
+ _URE_ARABNUM,
+ _URE_COMMONSEP,
+ _URE_BLOCKSEP,
+ _URE_SEGMENTSEP,
+ _URE_WHITESPACE,
+ _URE_OTHERNEUT,
+};
+
+/*
+ * Symbol types for the DFA.
+ */
+#define _URE_ANY_CHAR 1
+#define _URE_CHAR 2
+#define _URE_CCLASS 3
+#define _URE_NCCLASS 4
+#define _URE_BOL_ANCHOR 5
+#define _URE_EOL_ANCHOR 6
+
+/*
+ * Op codes for converting the NFA to a DFA.
+ */
+#define _URE_SYMBOL 10
+#define _URE_PAREN 11
+#define _URE_QUEST 12
+#define _URE_STAR 13
+#define _URE_PLUS 14
+#define _URE_ONE 15
+#define _URE_AND 16
+#define _URE_OR 17
+
+/*
+ * Sentinel returned by _ure_peek() and _ure_pop() when there is no buffer
+ * or the stack is empty.
+ */
+#define _URE_NOOP 0xffff
+
+/*
+ * NOTE(review): presumably flags marking the start and end of a region;
+ * their use is not visible near these definitions -- confirm.
+ */
+#define _URE_REGSTART 0x8000
+#define _URE_REGEND 0x4000
+
+/*
+ * Structure used to handle a compacted range of characters.
+ *
+ * _ure_add_range() swaps the endpoints when necessary, so stored ranges
+ * satisfy min_code <= max_code.  A single character is stored as a range
+ * whose two endpoints are equal.
+ */
+typedef struct {
+ ucs4_t min_code;
+ ucs4_t max_code;
+} _ure_range_t;
+
+/*
+ * List of ranges making up a character class.  `ranges_used' counts the
+ * entries filled in and `ranges_size' the entries allocated;
+ * _ure_add_range() grows the allocation eight entries at a time.
+ */
+typedef struct {
+ _ure_range_t *ranges;
+ ucs2_t ranges_used;
+ ucs2_t ranges_size;
+} _ure_ccl_t;
+
+/*
+ * Payload of a symbol: either a single character code or a character
+ * class.  NOTE(review): which member is live is presumably decided by the
+ * `type' field of the enclosing _ure_symtab_t -- confirm.
+ */
+typedef union {
+ ucs4_t chr;
+ _ure_ccl_t ccl;
+} _ure_sym_t;
+
+/*
+ * This is a general element structure used for expressions and stack
+ * elements.
+ *
+ * `onstack' is set by _ure_push() and cleared by _ure_pop() while the
+ * buffer is in reducing mode, so that an expression is pushed only once.
+ * NOTE(review): `lhs' and `rhs' presumably index other expressions --
+ * confirm.
+ */
+typedef struct {
+ ucs2_t reg;
+ ucs2_t onstack;
+ ucs2_t type;
+ ucs2_t lhs;
+ ucs2_t rhs;
+} _ure_elt_t;
+
+/*
+ * This is a structure used to track a list or a stack of states.
+ *
+ * `slist_used' counts the entries in use and `slist_size' the entries
+ * allocated; _ure_push() grows the allocation eight entries at a time.
+ */
+typedef struct {
+ ucs2_t *slist;
+ ucs2_t slist_size;
+ ucs2_t slist_used;
+} _ure_stlist_t;
+
+/*
+ * Structure to track the list of unique states for a symbol
+ * during reduction.
+ *
+ * `props' accumulates character property masks (the class setup
+ * functions OR their mask into it) and `sym' holds the character or the
+ * character class ranges of the symbol.
+ */
+typedef struct {
+ ucs2_t id;
+ ucs2_t type;
+ unsigned long mods;
+ unsigned long props;
+ _ure_sym_t sym;
+ _ure_stlist_t states;
+} _ure_symtab_t;
+
+/*
+ * Structure to hold a single state.
+ *
+ * NOTE(review): `trans_size' and `trans_used' presumably track the
+ * allocated and used lengths of `trans', like the other lists in this
+ * file -- confirm.
+ */
+typedef struct {
+ ucs2_t id;
+ ucs2_t accepting;
+ ucs2_t pad;
+ _ure_stlist_t st;
+ _ure_elt_t *trans;
+ ucs2_t trans_size;
+ ucs2_t trans_used;
+} _ure_state_t;
+
+/*
+ * Structure used for keeping lists of states.
+ *
+ * NOTE(review): `states_size' and `states_used' presumably track the
+ * allocated and used lengths of `states' -- confirm.
+ */
+typedef struct {
+ _ure_state_t *states;
+ ucs2_t states_size;
+ ucs2_t states_used;
+} _ure_statetable_t;
+
+/*
+ * Structure to track pairs of DFA states when equivalent states are
+ * merged.
+ *
+ * NOTE(review): `l' and `r' presumably hold the two state numbers of a
+ * pair -- confirm.
+ */
+typedef struct {
+ ucs2_t l;
+ ucs2_t r;
+} _ure_equiv_t;
+
+/*
+ * Structure used for constructing the NFA and reducing to a minimal DFA.
+ *
+ * `reducing': when non-zero, _ure_push() refuses to push an expression
+ * that is already on the stack and push/pop maintain the `onstack' flag
+ * of the expression.
+ * `error': compared against _URE_OK while parsing; _ure_prop_list() sets
+ * it to _URE_INVALID_PROPERTY.
+ * `flags': tested against _URE_DFA_CASEFOLD by _ure_add_range().
+ */
+typedef struct _ure_buffer_t {
+ int reducing;
+ int error;
+ unsigned long flags;
+
+ /*
+ * Work stack operated on by _ure_push(), _ure_peek() and _ure_pop().
+ */
+ _ure_stlist_t stack;
+
+ /*
+ * Table of unique symbols encountered.
+ */
+ _ure_symtab_t *symtab;
+ ucs2_t symtab_size;
+ ucs2_t symtab_used;
+
+ /*
+ * Tracks the unique expressions generated for the NFA and when the NFA is
+ * reduced.
+ */
+ _ure_elt_t *expr;
+ ucs2_t expr_used;
+ ucs2_t expr_size;
+
+ /*
+ * The reduced table of unique groups of NFA states.
+ */
+ _ure_statetable_t states;
+
+ /*
+ * Tracks states when equivalent states are merged.
+ */
+ _ure_equiv_t *equiv;
+ ucs2_t equiv_used;
+ ucs2_t equiv_size;
+} _ure_buffer_t;
+
+/*
+ * A single transition: a symbol paired with the state to move to.
+ * NOTE(review): both members are presumably indexes into the symbol and
+ * state arrays of _ure_dfa_t -- confirm.
+ */
+typedef struct {
+ ucs2_t symbol;
+ ucs2_t next_state;
+} _ure_trans_t;
+
+/*
+ * A DFA state: an accepting flag plus `ntrans' transitions.
+ */
+typedef struct {
+ ucs2_t accepting;
+ ucs2_t ntrans;
+ _ure_trans_t *trans;
+} _ure_dstate_t;
+
+/*
+ * The DFA itself: flags, a symbol array, a state array and a transition
+ * array, each with its element count.
+ */
+typedef struct _ure_dfa_t {
+ unsigned long flags;
+
+ _ure_symtab_t *syms;
+ ucs2_t nsyms;
+
+ _ure_dstate_t *states;
+ ucs2_t nstates;
+
+ _ure_trans_t *trans;
+ ucs2_t ntrans;
+} _ure_dfa_t;
+
+/*************************************************************************
+ *
+ * Functions.
+ *
+ *************************************************************************/
+
+/*
+ * Copy `bytes' bytes from `src' to `dest'.  The two regions may overlap.
+ *
+ * Nothing is copied when `bytes' is zero or when both pointers are equal.
+ * The copy itself is delegated to the standard memmove(), which is
+ * declared in <string.h>.
+ */
+static void
+#ifdef __STDC__
+_ure_memmove(char *dest, char *src, unsigned long bytes)
+#else
+_ure_memmove(dest, src, bytes)
+char *dest, *src;
+unsigned long bytes;
+#endif
+{
+    if (bytes == 0 || dest == src)
+        return;
+
+    (void) memmove(dest, src, bytes);
+}
+
+/*
+ * Push the value `v' onto the stack of the buffer `b'.
+ *
+ * While the buffer is in reducing mode, `v' is treated as an expression
+ * index: an expression already on the stack is not pushed again, and a
+ * pushed expression is flagged as being on the stack.
+ *
+ * The stack storage grows eight entries at a time.
+ * NOTE(review): the results of malloc()/realloc() are used unchecked.
+ */
+static void
+#ifdef __STDC__
+_ure_push(ucs2_t v, _ure_buffer_t *b)
+#else
+_ure_push(v, b)
+ucs2_t v;
+_ure_buffer_t *b;
+#endif
+{
+    _ure_stlist_t *stk;
+    int tracking;
+
+    if (b == 0)
+        return;
+
+    tracking = (b->reducing != 0);
+
+    /*
+     * In reducing mode an expression may be on the stack only once.
+     */
+    if (tracking && b->expr[v].onstack != 0)
+        return;
+
+    stk = &b->stack;
+    if (stk->slist_used == stk->slist_size) {
+        if (stk->slist_size != 0)
+            stk->slist = (ucs2_t *) realloc((char *) stk->slist,
+                                            sizeof(ucs2_t) *
+                                            (stk->slist_size + 8));
+        else
+            stk->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
+        stk->slist_size += 8;
+    }
+
+    stk->slist[stk->slist_used] = v;
+    stk->slist_used++;
+
+    /*
+     * Remember that the expression is now on the stack.
+     */
+    if (tracking)
+        b->expr[v].onstack = 1;
+}
+
+/*
+ * Return the value on top of the stack without removing it, or _URE_NOOP
+ * when there is no buffer or the stack is empty.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_peek(_ure_buffer_t *b)
+#else
+_ure_peek(b)
+_ure_buffer_t *b;
+#endif
+{
+    if (b != 0 && b->stack.slist_used != 0)
+        return b->stack.slist[b->stack.slist_used - 1];
+
+    return _URE_NOOP;
+}
+
+/*
+ * Remove and return the value on top of the stack, or return _URE_NOOP
+ * when there is no buffer or the stack is empty.
+ *
+ * In reducing mode the popped value is treated as an expression index and
+ * that expression is flagged as no longer being on the stack.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_pop(_ure_buffer_t *b)
+#else
+_ure_pop(b)
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t top;
+
+    if (b == 0 || b->stack.slist_used == 0)
+        return _URE_NOOP;
+
+    b->stack.slist_used--;
+    top = b->stack.slist[b->stack.slist_used];
+
+    if (b->reducing)
+        b->expr[top].onstack = 0;
+
+    return top;
+}
+
+/*************************************************************************
+ *
+ * Start symbol parse functions.
+ *
+ *************************************************************************/
+
+/*
+ * Parse a comma-separated list of integers that represent character
+ * properties. Combine them into a mask that is returned in the `mask'
+ * variable, and return the number of characters consumed.
+ *
+ * Parsing stops at the first character that is neither a digit nor a
+ * comma, at `limit' characters, or as soon as a property number exceeds
+ * the largest index of the cclass_flags table (32).  In the last case
+ * the buffer error is set to _URE_INVALID_PROPERTY and the offending
+ * number contributes nothing to the mask.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
+               _ure_buffer_t *b)
+#else
+_ure_prop_list(pp, limit, mask, b)
+ucs2_t *pp;
+unsigned long limit, *mask;
+_ure_buffer_t *b;
+#endif
+{
+    unsigned long n, m, max_prop;
+    ucs2_t *sp, *ep;
+
+    /*
+     * The largest valid property number is the last index of the table.
+     */
+    max_prop = (sizeof(cclass_flags) / sizeof(cclass_flags[0])) - 1;
+
+    sp = pp;
+    ep = sp + limit;
+
+    for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
+        if (*sp == ',') {
+            /*
+             * Encountered a comma, so select the next character property
+             * flag and reset the number.
+             */
+            m |= cclass_flags[n];
+            n = 0;
+        } else if (*sp >= '0' && *sp <= '9')
+            /*
+             * Encountered a digit, so start or continue building the
+             * cardinal that represents the character property flag.
+             */
+            n = (n * 10) + (*sp - '0');
+        else
+            /*
+             * Encountered something that is not part of the property
+             * list.  Indicate that we are done.
+             */
+            break;
+
+        /*
+         * If a property number greater than 32 occurs, then there is a
+         * problem.  Most likely a missing comma separator.
+         */
+        if (n > max_prop)
+            b->error = _URE_INVALID_PROPERTY;
+    }
+
+    /*
+     * Fold in the last number, but never index past the end of the table
+     * with a number that was just rejected.
+     */
+    if (n != 0 && n <= max_prop)
+        m |= cclass_flags[n];
+
+    /*
+     * Set the mask that represents the group of character properties.
+     */
+    *mask = m;
+
+    /*
+     * Return the number of characters consumed.
+     */
+    return sp - pp;
+}
+
+/*
+ * Read up to four hexadecimal digits (either case) starting at `np',
+ * never looking at more than `limit' characters.  The accumulated value
+ * is stored through `n' and the number of characters used is returned.
+ * Reading stops early at the first character that is not a hex digit.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
+#else
+_ure_hex(np, limit, n)
+ucs2_t *np;
+unsigned long limit;
+ucs4_t *n;
+#endif
+{
+    ucs2_t ndigits;
+    ucs2_t *cur, *stop;
+    ucs4_t value, digit;
+
+    value = 0;
+    cur = np;
+    stop = np + limit;
+
+    for (ndigits = 0; ndigits < 4 && cur < stop; ndigits++, cur++) {
+        if (*cur >= '0' && *cur <= '9')
+            digit = *cur - '0';
+        else if (*cur >= 'A' && *cur <= 'F')
+            digit = (*cur - 'A') + 10;
+        else if (*cur >= 'a' && *cur <= 'f')
+            digit = (*cur - 'a') + 10;
+        else
+            /*
+             * Not a hex digit, so the number ends here.
+             */
+            break;
+
+        value = (value << 4) + digit;
+    }
+
+    /*
+     * Hand back the collected character code and the count of characters
+     * consumed.
+     */
+    *n = value;
+
+    return cur - np;
+}
+
+/*
+ * Insert a range into a character class, removing duplicates and ordering
+ * them in increasing range-start order.
+ *
+ * The range `r' is normalised in place first: when the buffer has the
+ * case folding flag set both endpoints are converted to lower case, and
+ * the endpoints are swapped if they are not in increasing order.
+ *
+ * The new range is inserted in front of the first stored range whose
+ * start is not smaller than the new start; the ranges from that slot
+ * onwards are shifted up by one entry.  A range identical to a stored
+ * one is not added again.
+ *
+ * NOTE(review): the results of malloc()/realloc() are used unchecked.
+ */
+static void
+#ifdef __STDC__
+_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
+#else
+_ure_add_range(ccl, r, b)
+_ure_ccl_t *ccl;
+_ure_range_t *r;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t i, j;
+    ucs4_t tmp;
+    _ure_range_t *rp;
+
+    /*
+     * If the `casefold' flag is set, then make sure both endpoints of the
+     * range are converted to lower case.
+     */
+    if (b->flags & _URE_DFA_CASEFOLD) {
+        r->min_code = _ure_tolower(r->min_code);
+        r->max_code = _ure_tolower(r->max_code);
+    }
+
+    /*
+     * Swap the range endpoints if they are not in increasing order.
+     */
+    if (r->min_code > r->max_code) {
+        tmp = r->min_code;
+        r->min_code = r->max_code;
+        r->max_code = tmp;
+    }
+
+    /*
+     * Locate the insertion slot: skip every stored range that starts
+     * below the new range.
+     */
+    for (i = 0, rp = ccl->ranges;
+         i < ccl->ranges_used && rp->min_code < r->min_code; i++, rp++) ;
+
+    /*
+     * Check for a duplicate among the stored ranges that share the same
+     * start.
+     */
+    for (j = i; j < ccl->ranges_used &&
+             ccl->ranges[j].min_code == r->min_code; j++) {
+        if (ccl->ranges[j].max_code == r->max_code)
+            return;
+    }
+
+    if (ccl->ranges_used == ccl->ranges_size) {
+        if (ccl->ranges_size == 0)
+            ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
+        else
+            ccl->ranges = (_ure_range_t *)
+                realloc((char *) ccl->ranges,
+                        sizeof(_ure_range_t) * (ccl->ranges_size + 8));
+        ccl->ranges_size += 8;
+    }
+
+    /*
+     * The array may have moved, so recompute the slot address from the
+     * index, then open a gap at the slot by shifting the tail up by one
+     * entry.  The shift stays inside the allocation because at least one
+     * free entry exists at this point.
+     */
+    rp = ccl->ranges + i;
+
+    if (i < ccl->ranges_used)
+        _ure_memmove((char *) (rp + 1), (char *) rp,
+                     sizeof(_ure_range_t) * (ccl->ranges_used - i));
+
+    ccl->ranges_used++;
+    rp->min_code = r->min_code;
+    rp->max_code = r->max_code;
+}
+
+/*
+ * Composite property masks used by the POSIX colon delimited character
+ * classes.  Each one is the bitwise OR of the individual _URE_* property
+ * bits that make up the class.
+ */
+#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
+_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
+#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
+#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
+_URE_OTHERPUNCT)
+#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
+_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
+#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
+#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
+
+/*
+ * Signature of the callbacks stored in the POSIX class trie.  A callback
+ * receives the symbol being built, the mask stored alongside it in the trie
+ * entry, and the compilation buffer.  The parameter list is only spelled
+ * out when prototypes are available (__STDC__).
+ */
+typedef void (*_ure_cclsetup_t)(
+#ifdef __STDC__
+ _ure_symtab_t *sym,
+ unsigned long mask,
+ _ure_buffer_t *b
+#endif
+);
+
+/*
+ * One node of the static trie used to recognize POSIX class names.
+ *
+ * key  - character this node matches.
+ * len  - number of consecutive nodes, starting with this one, that are
+ *        scanned as alternatives when matching a character.
+ * next - index in the trie array of the node group to continue with once
+ *        this node has matched.
+ * func - setup callback to run when a class name ends on this node; 0 on
+ *        nodes that do not terminate a class name.
+ * mask - value handed to `func' as its mask argument.
+ */
+typedef struct {
+ ucs2_t key;
+ unsigned long len;
+ unsigned long next;
+ _ure_cclsetup_t func;
+ unsigned long mask;
+} _ure_trie_t;
+
+/*
+ * Generic class setup callback: merge the property bits in `mask' into the
+ * symbol's property set.  The buffer argument is not used; it is present
+ * only so the function matches the _ure_cclsetup_t callback signature.
+ */
+static void
+#ifdef __STDC__
+_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+#else
+_ure_ccl_setup(sym, mask, b)
+_ure_symtab_t *sym;
+unsigned long mask;
+_ure_buffer_t *b;
+#endif
+{
+    sym->props = sym->props | mask;
+}
+
+/*
+ * Class setup callback for the space class: merge `mask' into the symbol's
+ * property set and then add the extra code points needed for isspace()
+ * behaviour (tab, carriage return, newline, form feed and U+FEFF) as
+ * single-character ranges.
+ *
+ * sym  - symbol being built; its character class receives the ranges.
+ * mask - property bits to OR into sym->props.
+ * b    - compilation buffer, passed through to _ure_add_range().
+ */
+static void
+#ifdef __STDC__
+_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+#else
+_ure_space_setup(sym, mask, b)
+_ure_symtab_t *sym;
+unsigned long mask;
+_ure_buffer_t *b;
+#endif
+{
+    _ure_range_t range;
+
+    sym->props |= mask;
+
+    /*
+     * Add the additional characters needed for handling isspace().
+     */
+    range.min_code = range.max_code = '\t';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\r';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\n';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\f';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = 0xfeff;
+    _ure_add_range(&sym->sym.ccl, &range, b);
+}
+
+/*
+ * Class setup callback for the hexadecimal digit class.  Adds the three
+ * ranges '0'-'9', 'A'-'F' and 'a'-'f' to the symbol's character class, in
+ * that order.  The `mask' argument is ignored and the symbol's property
+ * set is left untouched.
+ */
+static void
+#ifdef __STDC__
+_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+#else
+_ure_xdigit_setup(sym, mask, b)
+_ure_symtab_t *sym;
+unsigned long mask;
+_ure_buffer_t *b;
+#endif
+{
+    /*
+     * The additional characters needed for handling isxdigit(), as
+     * (first, last) pairs.
+     */
+    static const char bounds[3][2] = {
+        {'0', '9'},
+        {'A', 'F'},
+        {'a', 'f'},
+    };
+    _ure_range_t range;
+    int k;
+
+    for (k = 0; k < 3; k++) {
+        /*
+         * The range is refilled on every pass because _ure_add_range() may
+         * modify the endpoints it is given.
+         */
+        range.min_code = bounds[k][0];
+        range.max_code = bounds[k][1];
+        _ure_add_range(&sym->sym.ccl, &range, b);
+    }
+}
+
+/*
+ * Static trie holding the POSIX class names ":alnum:", ":alpha:",
+ * ":cntrl:", ":digit:", ":graph:", ":lower:", ":print:", ":punct:",
+ * ":space:", ":upper:" and ":xdigit:".
+ *
+ * Each initializer is {key, len, next, func, mask}.  Entry 0 matches the
+ * leading ':'.  Entries 1-9 are the possible first letters; their `len'
+ * counts down from 9 to 1 because it gives the number of alternatives left
+ * to scan starting at that entry.  Entries whose key is 0x003a (':') with a
+ * non-zero `func' terminate a class name and carry the callback and mask to
+ * apply.
+ */
+static _ure_trie_t cclass_trie[] = {
+ {0x003a, 1, 1, 0, 0},
+ {0x0061, 9, 10, 0, 0},
+ {0x0063, 8, 19, 0, 0},
+ {0x0064, 7, 24, 0, 0},
+ {0x0067, 6, 29, 0, 0},
+ {0x006c, 5, 34, 0, 0},
+ {0x0070, 4, 39, 0, 0},
+ {0x0073, 3, 49, 0, 0},
+ {0x0075, 2, 54, 0, 0},
+ {0x0078, 1, 59, 0, 0},
+ {0x006c, 1, 11, 0, 0},
+ {0x006e, 2, 13, 0, 0},
+ {0x0070, 1, 16, 0, 0},
+ {0x0075, 1, 14, 0, 0},
+ {0x006d, 1, 15, 0, 0},
+ {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
+ {0x0068, 1, 17, 0, 0},
+ {0x0061, 1, 18, 0, 0},
+ {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
+ {0x006e, 1, 20, 0, 0},
+ {0x0074, 1, 21, 0, 0},
+ {0x0072, 1, 22, 0, 0},
+ {0x006c, 1, 23, 0, 0},
+ {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
+ {0x0069, 1, 25, 0, 0},
+ {0x0067, 1, 26, 0, 0},
+ {0x0069, 1, 27, 0, 0},
+ {0x0074, 1, 28, 0, 0},
+ {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
+ {0x0072, 1, 30, 0, 0},
+ {0x0061, 1, 31, 0, 0},
+ {0x0070, 1, 32, 0, 0},
+ {0x0068, 1, 33, 0, 0},
+ {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
+ {0x006f, 1, 35, 0, 0},
+ {0x0077, 1, 36, 0, 0},
+ {0x0065, 1, 37, 0, 0},
+ {0x0072, 1, 38, 0, 0},
+ {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
+ {0x0072, 2, 41, 0, 0},
+ {0x0075, 1, 45, 0, 0},
+ {0x0069, 1, 42, 0, 0},
+ {0x006e, 1, 43, 0, 0},
+ {0x0074, 1, 44, 0, 0},
+ {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
+ {0x006e, 1, 46, 0, 0},
+ {0x0063, 1, 47, 0, 0},
+ {0x0074, 1, 48, 0, 0},
+ {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
+ {0x0070, 1, 50, 0, 0},
+ {0x0061, 1, 51, 0, 0},
+ {0x0063, 1, 52, 0, 0},
+ {0x0065, 1, 53, 0, 0},
+ {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
+ {0x0070, 1, 55, 0, 0},
+ {0x0070, 1, 56, 0, 0},
+ {0x0065, 1, 57, 0, 0},
+ {0x0072, 1, 58, 0, 0},
+ {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
+ {0x0064, 1, 60, 0, 0},
+ {0x0069, 1, 61, 0, 0},
+ {0x0067, 1, 62, 0, 0},
+ {0x0069, 1, 63, 0, 0},
+ {0x0074, 1, 64, 0, 0},
+ {0x003a, 1, 65, _ure_xdigit_setup, 0},
+};
+
+/*
+ * Probe for one of the POSIX colon delimited character classes in the static
+ * trie.
+ *
+ * cp    - text to probe, starting at the opening ':'.
+ * limit - number of characters available at `cp'.
+ * sym   - symbol handed to the class setup callback on a match.
+ * b     - compilation buffer handed to the class setup callback.
+ *
+ * Returns the number of characters consumed (including both colons) when a
+ * class name was recognized and its callback was run, or 0 when the text is
+ * not one of the known classes.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
+ _ure_buffer_t *b)
+#else
+_ure_posix_ccl(cp, limit, sym, b)
+ucs2_t *cp;
+unsigned long limit;
+_ure_symtab_t *sym;
+_ure_buffer_t *b;
+#endif
+{
+ int i;
+ unsigned long n;
+ _ure_trie_t *tp;
+ ucs2_t *sp, *ep;
+
+ /*
+ * If the number of characters left is less than 7, then this cannot be
+ * interpreted as one of the colon delimited classes.
+ */
+ if (limit < 7)
+ return 0;
+
+ sp = cp;
+ ep = sp + limit;
+ tp = cclass_trie;
+ /*
+ * Walk at most 8 characters (the longest name, ":xdigit:", has 8).
+ */
+ for (i = 0; sp < ep && i < 8; i++, sp++) {
+ n = tp->len;
+
+ /*
+ * Scan the `len' alternatives of the current node group for one whose
+ * key equals the current character.
+ */
+ for (; n > 0 && tp->key != *sp; tp++, n--) ;
+
+ if (n == 0)
+ return 0;
+
+ /*
+ * A ':' at offset 6 or 7 is the closing colon; step past it and stop
+ * with `tp' left on the terminating node.
+ */
+ if (*sp == ':' && (i == 6 || i == 7)) {
+ sp++;
+ break;
+ }
+ /*
+ * Only descend when another character is available, so that running
+ * out of input leaves `tp' on a node without a callback.
+ */
+ if (sp + 1 < ep)
+ tp = cclass_trie + tp->next;
+ }
+ if (tp->func == 0)
+ return 0;
+
+ (*tp->func)(sym, tp->mask, b);
+
+ return sp - cp;
+}
+
+/*
+ * Construct a list of ranges and return the number of characters consumed.
+ *
+ * cp    - text immediately following the opening '['.
+ * limit - number of characters available at `cp'.
+ * symp  - symbol to fill in.  Its type is set to _URE_NCCLASS when the class
+ *         starts with '^' and to _URE_CCLASS otherwise; ranges are added to
+ *         symp->sym.ccl and property bits to symp->props.
+ * b     - compilation buffer.  b->error is set to _URE_UNEXPECTED_EOS when
+ *         the text ends right after a reverse solidus, and to
+ *         _URE_CCLASS_OPEN when no closing ']' is found.
+ *
+ * Escapes (\a \b \f \n \r \t \v, \p and \P property lists, and \x \X \u \U
+ * hex codes), POSIX colon delimited classes, `x-y' ranges and UTF-16
+ * surrogate pairs are recognized.  The input is never read at or beyond
+ * cp + limit.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
+            _ure_buffer_t *b)
+#else
+_ure_cclass(cp, limit, symp, b)
+ucs2_t *cp;
+unsigned long limit;
+_ure_symtab_t *symp;
+_ure_buffer_t *b;
+#endif
+{
+    int range_end;
+    unsigned long n;
+    ucs2_t *sp, *ep;
+    ucs4_t c, last;
+    _ure_ccl_t *cclp;
+    _ure_range_t range;
+
+    sp = cp;
+    ep = sp + limit;
+
+    if (sp < ep && *sp == '^') {
+        symp->type = _URE_NCCLASS;
+        sp++;
+    } else
+        symp->type = _URE_CCLASS;
+
+    cclp = &symp->sym.ccl;
+
+    /*
+     * Start from a fully defined range so that no indeterminate value can
+     * ever be handed to _ure_add_range().
+     */
+    range.min_code = range.max_code = 0;
+
+    for (last = 0, range_end = 0;
+         b->error == _URE_OK && sp < ep && *sp != ']'; ) {
+        c = *sp++;
+        if (c == '\\') {
+            if (sp == ep) {
+                /*
+                 * The EOS was encountered when expecting the reverse solidus
+                 * to be followed by the character it is escaping.  Set an
+                 * error code and return the number of characters consumed up
+                 * to this point.
+                 */
+                b->error = _URE_UNEXPECTED_EOS;
+                return sp - cp;
+            }
+
+            c = *sp++;
+            switch (c) {
+              case 'a':
+                c = 0x07;
+                break;
+              case 'b':
+                c = 0x08;
+                break;
+              case 'f':
+                c = 0x0c;
+                break;
+              case 'n':
+                c = 0x0a;
+                break;
+              case 'r':
+                c = 0x0d;
+                break;
+              case 't':
+                c = 0x09;
+                break;
+              case 'v':
+                c = 0x0b;
+                break;
+              case 'p':
+              case 'P':
+                sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
+                /*
+                 * Invert the bit mask of the properties if 'P' is used to
+                 * specify a list of character properties that should *not*
+                 * match in a character class.
+                 */
+                if (c == 'P')
+                    symp->props = ~symp->props;
+                continue;
+              case 'x':
+              case 'X':
+              case 'u':
+              case 'U':
+                /*
+                 * Only treat this as a hex code when at least one hex digit
+                 * follows; otherwise the letter itself is the character.
+                 */
+                if (sp < ep &&
+                    ((*sp >= '0' && *sp <= '9') ||
+                     (*sp >= 'A' && *sp <= 'F') ||
+                     (*sp >= 'a' && *sp <= 'f')))
+                    sp += _ure_hex(sp, ep - sp, &c);
+            }
+        } else if (c == ':') {
+            /*
+             * Probe for a POSIX colon delimited character class.
+             */
+            sp--;
+            if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
+                sp++;
+            else {
+                sp += n;
+                continue;
+            }
+        }
+
+        /*
+         * Check to see if the current character is a low surrogate that needs
+         * to be combined with a preceding high surrogate.
+         */
+        if (last != 0) {
+            if (c >= 0xdc00 && c <= 0xdfff)
+                /*
+                 * Construct the UTF16 character code.
+                 */
+                c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
+            else {
+                /*
+                 * Add the isolated high surrogate to the range.
+                 */
+                if (range_end == 1)
+                    range.max_code = last & 0xffff;
+                else
+                    range.min_code = range.max_code = last & 0xffff;
+
+                _ure_add_range(cclp, &range, b);
+                range_end = 0;
+            }
+        }
+
+        /*
+         * Clear the last character code.
+         */
+        last = 0;
+
+        /*
+         * This slightly awkward code handles the different cases needed to
+         * construct a range.
+         */
+        if (c >= 0xd800 && c <= 0xdbff) {
+            /*
+             * If the high surrogate is followed by a range indicator, simply
+             * add it as the range start.  Otherwise, save it in case the next
+             * character is a low surrogate.
+             */
+            if (sp < ep && *sp == '-') {
+                sp++;
+                range.min_code = c;
+                range_end = 1;
+            } else
+                last = c;
+        } else if (range_end == 1) {
+            range.max_code = c;
+            _ure_add_range(cclp, &range, b);
+            range_end = 0;
+        } else {
+            range.min_code = range.max_code = c;
+            if (sp < ep && *sp == '-') {
+                sp++;
+                range_end = 1;
+            } else
+                _ure_add_range(cclp, &range, b);
+        }
+    }
+
+    /*
+     * Flush anything still pending when the loop stopped, so that the last
+     * item of the class is not silently lost.
+     */
+    if (b->error == _URE_OK) {
+        if (last != 0) {
+            /*
+             * A high surrogate was saved but no low surrogate followed; add
+             * it on its own, or as the end of an open range.
+             */
+            if (range_end == 1)
+                range.max_code = last & 0xffff;
+            else
+                range.min_code = range.max_code = last & 0xffff;
+            _ure_add_range(cclp, &range, b);
+        } else if (range_end == 1) {
+            /*
+             * A range start and its '-' were seen but no end followed (for
+             * example "[a-]").  Keep the start character and treat the '-'
+             * as a literal.
+             */
+            range.max_code = range.min_code;
+            _ure_add_range(cclp, &range, b);
+            range.min_code = range.max_code = '-';
+            _ure_add_range(cclp, &range, b);
+        }
+    }
+
+    if (sp < ep && *sp == ']')
+        sp++;
+    else
+        /*
+         * The parse was not terminated by the character class close symbol
+         * (']'), so set an error code.
+         */
+        b->error = _URE_CCLASS_OPEN;
+
+    return sp - cp;
+}
+
+/*
+ * Probe for a low surrogate hex code.
+ *
+ * ls    - text positioned at the first expected hex digit.
+ * limit - number of characters available at `ls'.
+ * c     - receives the value of the hex digits that were read, whether or
+ *         not it turned out to be a low surrogate.
+ *
+ * At most four hex digits are read.  Returns the number of characters
+ * consumed when the value lies in 0xDC00-0xDFFF, and 0 otherwise.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
+#else
+_ure_probe_ls(ls, limit, c)
+ucs2_t *ls;
+unsigned long limit;
+ucs4_t *c;
+#endif
+{
+    ucs4_t i, code;
+    ucs2_t *sp, *ep;
+
+    /*
+     * `i' counts the digits taken so the scan stops after four of them even
+     * when more hex digits follow.
+     */
+    for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep;
+         i++, sp++) {
+        if (*sp >= '0' && *sp <= '9')
+            code = (code << 4) + (*sp - '0');
+        else if (*sp >= 'A' && *sp <= 'F')
+            code = (code << 4) + ((*sp - 'A') + 10);
+        else if (*sp >= 'a' && *sp <= 'f')
+            code = (code << 4) + ((*sp - 'a') + 10);
+        else
+            break;
+    }
+
+    *c = code;
+    return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
+}
+
+/*
+ * Compile the next symbol of a regular expression.
+ *
+ * sym   - text positioned at the first character of the symbol.
+ * limit - number of characters available at `sym'; the caller in this file
+ *         always passes at least one.
+ * symp  - symbol to fill in (type plus character, character class or
+ *         property set).
+ * b     - compilation buffer.  b->error is set to _URE_UNEXPECTED_EOS when
+ *         the text ends right after a reverse solidus; the
+ *         _URE_DFA_BLANKLINE flag is cleared for anything other than the
+ *         '^' and '$' anchors.
+ *
+ * Returns the number of characters consumed.
+ */
+static unsigned long
+#ifdef __STDC__
+_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
+                    _ure_buffer_t *b)
+#else
+_ure_compile_symbol(sym, limit, symp, b)
+ucs2_t *sym;
+unsigned long limit;
+_ure_symtab_t *symp;
+_ure_buffer_t *b;
+#endif
+{
+    ucs4_t c;
+    ucs2_t *sp, *ep;
+
+    sp = sym;
+    ep = sym + limit;
+
+    if ((c = *sp++) == '\\') {
+
+        if (sp == ep) {
+            /*
+             * The EOS was encountered when expecting the reverse solidus to
+             * be followed by the character it is escaping.  Set an error code
+             * and return the number of characters consumed up to this point.
+             */
+            b->error = _URE_UNEXPECTED_EOS;
+            return sp - sym;
+        }
+
+        c = *sp++;
+        switch (c) {
+          case 'p':
+          case 'P':
+            symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
+            sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
+            break;
+          case 'a':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x07;
+            break;
+          case 'b':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x08;
+            break;
+          case 'f':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0c;
+            break;
+          case 'n':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0a;
+            break;
+          case 'r':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0d;
+            break;
+          case 't':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x09;
+            break;
+          case 'v':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0b;
+            break;
+          case 'x':
+          case 'X':
+          case 'u':
+          case 'U':
+            /*
+             * Collect between 1 and 4 digits representing a UCS2 code.  Fall
+             * through to the next case.
+             */
+            if (sp < ep &&
+                ((*sp >= '0' && *sp <= '9') ||
+                 (*sp >= 'A' && *sp <= 'F') ||
+                 (*sp >= 'a' && *sp <= 'f')))
+                sp += _ure_hex(sp, ep - sp, &c);
+            /* FALLTHROUGH */
+          default:
+            /*
+             * Simply add an escaped character here.
+             */
+            symp->type = _URE_CHAR;
+            symp->sym.chr = c;
+        }
+    } else if (c == '^' || c == '$')
+        /*
+         * Handle the BOL and EOL anchors.  This actually consists simply of
+         * setting a flag that indicates that the user supplied anchor match
+         * function should be called.  This needs to be done instead of simply
+         * matching line/paragraph separators because beginning-of-text and
+         * end-of-text tests are needed as well.
+         */
+        symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
+    else if (c == '[')
+        /*
+         * Construct a character class.
+         */
+        sp += _ure_cclass(sp, ep - sp, symp, b);
+    else if (c == '.')
+        symp->type = _URE_ANY_CHAR;
+    else {
+        symp->type = _URE_CHAR;
+        symp->sym.chr = c;
+    }
+
+    /*
+     * If the symbol type happens to be a character and is a high surrogate,
+     * then probe forward to see if it is followed by a low surrogate that
+     * needs to be added.
+     */
+    if (sp < ep && symp->type == _URE_CHAR &&
+        0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
+
+        if (0xdc00 <= *sp && *sp <= 0xdfff) {
+            symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
+                                       (*sp & 0x03ff));
+            sp++;
+        } else if (sp + 1 < ep && *sp == '\\' &&
+                   (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
+                    *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
+            /*
+             * Both the reverse solidus and the escape letter are known to be
+             * inside the input here, so sp + 2 is at most `ep' and the limit
+             * passed to the probe cannot wrap around.
+             */
+            sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
+            if (0xdc00 <= c && c <= 0xdfff) {
+                /*
+                 * Take into account the \[xu] in front of the hex code.
+                 */
+                sp += 2;
+                symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
+                                           (c & 0x03ff));
+            }
+        }
+    }
+
+    /*
+     * Last, make sure any _URE_CHAR type symbols are changed to lower case if
+     * the `casefold' flag is set.
+     */
+    if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
+        symp->sym.chr = _ure_tolower(symp->sym.chr);
+
+    /*
+     * If the symbol constructed is anything other than one of the anchors,
+     * make sure the _URE_DFA_BLANKLINE flag is removed.
+     */
+    if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
+        b->flags &= ~_URE_DFA_BLANKLINE;
+
+    /*
+     * Return the number of characters consumed.
+     */
+    return sp - sym;
+}
+
+/*
+ * Compare two symbols.  Returns 1 when they differ and 0 when they are
+ * equal.  Type, modifiers and properties must always agree; character
+ * classes additionally compare their range lists byte for byte and plain
+ * characters compare their code.
+ */
+static int
+#ifdef __STDC__
+_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
+#else
+_ure_sym_neq(a, b)
+_ure_symtab_t *a, *b;
+#endif
+{
+    if (a->type != b->type)
+        return 1;
+    if (a->mods != b->mods)
+        return 1;
+    if (a->props != b->props)
+        return 1;
+
+    if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
+        /*
+         * Differing range counts settle it; the contents are only compared
+         * when there is at least one range.
+         */
+        if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
+            return 1;
+        if (a->sym.ccl.ranges_used == 0)
+            return 0;
+        return (memcmp((char *) a->sym.ccl.ranges,
+                       (char *) b->sym.ccl.ranges,
+                       sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
+            ? 1 : 0;
+    }
+
+    if (a->type == _URE_CHAR)
+        return (a->sym.chr != b->sym.chr) ? 1 : 0;
+
+    return 0;
+}
+
+/*
+ * Construct a symbol, but only keep unique symbols.
+ *
+ * sym      - text positioned at the first character of the symbol.
+ * limit    - number of characters available at `sym'.
+ * consumed - receives the number of characters the symbol used.
+ * b        - compilation buffer holding the symbol table.
+ *
+ * Returns the id of the matching symbol table entry: an existing one when
+ * an equal symbol is already present, otherwise a newly appended one.  The
+ * symbol table grows in steps of 8 entries.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
+                 _ure_buffer_t *b)
+#else
+_ure_make_symbol(sym, limit, consumed, b)
+ucs2_t *sym;
+unsigned long limit, *consumed;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t i;
+    _ure_symtab_t *sp, symbol;
+
+    /*
+     * Build the next symbol so we can test to see if it is already in the
+     * symbol table.
+     */
+    (void) memset((char *) &symbol, 0, sizeof(_ure_symtab_t));
+    *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
+
+    /*
+     * Check to see if the symbol exists.
+     */
+    for (i = 0, sp = b->symtab;
+         i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
+
+    if (i < b->symtab_used) {
+        /*
+         * Free up any ranges used for the symbol.
+         */
+        if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
+            symbol.sym.ccl.ranges_size > 0)
+            free((char *) symbol.sym.ccl.ranges);
+
+        return b->symtab[i].id;
+    }
+
+    /*
+     * Need to add the new symbol.
+     */
+    if (b->symtab_used == b->symtab_size) {
+        if (b->symtab_size == 0)
+            b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
+        else
+            b->symtab = (_ure_symtab_t *)
+                realloc((char *) b->symtab,
+                        sizeof(_ure_symtab_t) * (b->symtab_size + 8));
+        /*
+         * Zero the 8 entries that were just added.
+         */
+        sp = b->symtab + b->symtab_size;
+        (void) memset((char *) sp, 0, sizeof(_ure_symtab_t) << 3);
+        b->symtab_size += 8;
+    }
+
+    symbol.id = b->symtab_used++;
+    (void) memcpy((char *) &b->symtab[symbol.id], (char *) &symbol,
+                  sizeof(_ure_symtab_t));
+
+    return symbol.id;
+}
+
+/*************************************************************************
+ *
+ * End symbol parse functions.
+ *
+ *************************************************************************/
+
+/*
+ * Return the index of the expression node (type, lhs, rhs), creating it if
+ * no identical node exists yet.
+ *
+ * type - node type.
+ * lhs  - left operand.
+ * rhs  - right operand.
+ * b    - compilation buffer holding the expression array; when it is null
+ *        _URE_NOOP is returned.
+ *
+ * The expression array grows in steps of 8 entries.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
+#else
+_ure_make_expr(type, lhs, rhs, b)
+ucs2_t type, lhs, rhs;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t i;
+
+    if (b == 0)
+        return _URE_NOOP;
+
+    /*
+     * Determine if the expression already exists or not.
+     */
+    for (i = 0; i < b->expr_used; i++) {
+        if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
+            b->expr[i].rhs == rhs)
+            break;
+    }
+    if (i < b->expr_used)
+        return i;
+
+    /*
+     * Need to add a new expression.
+     */
+    if (b->expr_used == b->expr_size) {
+        if (b->expr_size == 0)
+            b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
+        else
+            b->expr = (_ure_elt_t *)
+                realloc((char *) b->expr,
+                        sizeof(_ure_elt_t) * (b->expr_size + 8));
+        b->expr_size += 8;
+    }
+
+    b->expr[b->expr_used].onstack = 0;
+    b->expr[b->expr_used].type = type;
+    b->expr[b->expr_used].lhs = lhs;
+    b->expr[b->expr_used].rhs = rhs;
+
+    return b->expr_used++;
+}
+
+/*
+ * Bitmap of the characters that _ure_isspecial() reports: bit (cc & 7) of
+ * byte (cc >> 3) is set for character cc.  Byte 5 (0x0f) covers '(' ')'
+ * '*' '+', byte 7 (0x80) covers '?', and byte 15 (0x10) covers '|'.
+ */
+static unsigned char spmap[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/*
+ * True when `cc' lies strictly between 0x20 and 0x7f and has its bit set in
+ * spmap.  The argument is evaluated more than once.
+ */
+#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
+ (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
+
+/*
+ * Convert the regular expression into an NFA in a form that will be easy to
+ * reduce to a DFA.  The starting state for the reduction will be returned.
+ *
+ * re    - regular expression text.
+ * relen - number of characters in `re'.
+ * b     - compilation buffer; its stack is used for pending operators and
+ *         operands, and b->error is set to _URE_UNBALANCED_GROUP when the
+ *         parentheses do not balance.
+ *
+ * Returns the index of the top level expression, or _URE_NOOP when an error
+ * was recorded in the buffer.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
+#else
+_ure_re2nfa(re, relen, b)
+ucs2_t *re;
+unsigned long relen;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t c, state, top, sym, op, lhs, *sp, *ep;
+    unsigned long used;
+
+    state = _URE_NOOP;
+
+    sp = re;
+    ep = sp + relen;
+    while (b->error == _URE_OK && sp < ep) {
+        c = *sp++;
+        switch (c) {
+          case '(':
+            _ure_push(_URE_PAREN, b);
+            break;
+          case ')':
+            /*
+             * Check for the case of too many close parentheses.
+             */
+            if (_ure_peek(b) == _URE_NOOP) {
+                b->error = _URE_UNBALANCED_GROUP;
+                break;
+            }
+
+            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) {
+                /*
+                 * Make an expression with the AND or OR operator and its
+                 * right hand side.  The operator is on top of the stack with
+                 * its left hand side below it, so the two pops must happen
+                 * in this order; they are separate statements because C
+                 * does not specify the order in which function arguments
+                 * are evaluated.
+                 */
+                op = _ure_pop(b);
+                lhs = _ure_pop(b);
+                state = _ure_make_expr(op, lhs, state, b);
+            }
+
+            /*
+             * Remove the _URE_PAREN off the stack.
+             */
+            (void) _ure_pop(b);
+            break;
+          case '*':
+            state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
+            break;
+          case '+':
+            state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
+            break;
+          case '?':
+            state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
+            break;
+          case '|':
+            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) {
+                /*
+                 * Same reduction as for ')': operator first, then its left
+                 * hand side.
+                 */
+                op = _ure_pop(b);
+                lhs = _ure_pop(b);
+                state = _ure_make_expr(op, lhs, state, b);
+            }
+
+            _ure_push(state, b);
+            _ure_push(_URE_OR, b);
+            break;
+          default:
+            sp--;
+            sym = _ure_make_symbol(sp, ep - sp, &used, b);
+            sp += used;
+            state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
+            break;
+        }
+
+        /*
+         * When the next character starts another operand (it is not one of
+         * the special characters, or it opens a group), the current
+         * expression is saved as the left hand side of an implicit AND.
+         */
+        if (c != '(' && c != '|' && sp < ep &&
+            (!_ure_isspecial(*sp) || *sp == '(')) {
+            _ure_push(state, b);
+            _ure_push(_URE_AND, b);
+        }
+    }
+    while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) {
+        /*
+         * Make an expression with the AND or OR operator and its right
+         * hand side.
+         */
+        op = _ure_pop(b);
+        lhs = _ure_pop(b);
+        state = _ure_make_expr(op, lhs, state, b);
+    }
+
+    if (b->stack.slist_used > 0)
+        b->error = _URE_UNBALANCED_GROUP;
+
+    return (b->error == _URE_OK) ? state : _URE_NOOP;
+}
+
+/*
+ * Record that NFA state `state' is reachable through symbol `sym'.  The
+ * symbol's state list is kept sorted in increasing order without
+ * duplicates and grows in steps of 8 entries.
+ */
+static void
+#ifdef __STDC__
+_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
+#else
+_ure_add_symstate(sym, state, b)
+ucs2_t sym, state;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t pos, *slot;
+    _ure_symtab_t *entry;
+
+    /*
+     * Find the symbol table entry carrying the requested id.  The symbol is
+     * expected to exist; nothing below checks for a failed search.
+     */
+    entry = b->symtab;
+    for (pos = 0; pos < b->symtab_used && entry->id != sym; pos++)
+        entry++;
+
+    /*
+     * Find the first stored state that is not smaller than the new one.
+     */
+    slot = entry->states.slist;
+    for (pos = 0; pos < entry->states.slist_used && *slot < state; pos++)
+        slot++;
+
+    /*
+     * Nothing to do when the state is already in the list.
+     */
+    if (pos < entry->states.slist_used && *slot == state)
+        return;
+
+    /*
+     * Make room for one more state if the list is full.
+     */
+    if (entry->states.slist_used == entry->states.slist_size) {
+        if (entry->states.slist_size == 0)
+            entry->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
+        else
+            entry->states.slist = (ucs2_t *)
+                realloc((char *) entry->states.slist,
+                        sizeof(ucs2_t) * (entry->states.slist_size + 8));
+        entry->states.slist_size += 8;
+    }
+
+    /*
+     * Shift the tail up by one and drop the state into the gap.
+     */
+    if (pos < entry->states.slist_used)
+        (void) _ure_memmove((char *) (entry->states.slist + pos + 1),
+                            (char *) (entry->states.slist + pos),
+                            sizeof(ucs2_t) *
+                            (entry->states.slist_used - pos));
+    entry->states.slist[pos] = state;
+    entry->states.slist_used++;
+}
+
+/*
+ * Look up, or create, the DFA state that stands for the given group of NFA
+ * states.
+ *
+ * nstates - number of entries in `states'.
+ * states  - the NFA states making up the group.
+ * b       - compilation buffer holding the DFA state array, which grows in
+ *           steps of 8 entries.
+ *
+ * Returns the index of the DFA state representing the group.
+ */
+static ucs2_t
+#ifdef __STDC__
+_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
+#else
+_ure_add_state(nstates, states, b)
+ucs2_t nstates, *states;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t id;
+    _ure_state_t *dst;
+
+    /*
+     * An existing DFA state with exactly the same NFA state list is reused.
+     */
+    dst = b->states.states;
+    for (id = 0; id < b->states.states_used; id++, dst++) {
+        if (dst->st.slist_used != nstates)
+            continue;
+        if (memcmp((char *) states, (char *) dst->st.slist,
+                   sizeof(ucs2_t) * nstates) == 0)
+            return id;
+    }
+
+    /*
+     * Need to add a new DFA state (set of NFA states).  At this point `id'
+     * equals the number of states in use, which becomes the new index.
+     */
+    if (b->states.states_used == b->states.states_size) {
+        if (b->states.states_size == 0)
+            b->states.states = (_ure_state_t *)
+                malloc(sizeof(_ure_state_t) << 3);
+        else
+            b->states.states = (_ure_state_t *)
+                realloc((char *) b->states.states,
+                        sizeof(_ure_state_t) * (b->states.states_size + 8));
+        /*
+         * Zero the 8 entries that were just added.
+         */
+        dst = b->states.states + b->states.states_size;
+        (void) memset((char *) dst, 0, sizeof(_ure_state_t) << 3);
+        b->states.states_size += 8;
+    }
+
+    dst = b->states.states + b->states.states_used++;
+    dst->id = id;
+
+    /*
+     * Make sure the entry's own list can hold the group, then copy it.
+     */
+    if (dst->st.slist_used + nstates > dst->st.slist_size) {
+        if (dst->st.slist_size == 0)
+            dst->st.slist = (ucs2_t *)
+                malloc(sizeof(ucs2_t) * (dst->st.slist_used + nstates));
+        else
+            dst->st.slist = (ucs2_t *)
+                realloc((char *) dst->st.slist,
+                        sizeof(ucs2_t) * (dst->st.slist_used + nstates));
+        dst->st.slist_size = dst->st.slist_used + nstates;
+    }
+    dst->st.slist_used = nstates;
+    (void) memcpy((char *) dst->st.slist, (char *) states,
+                  sizeof(ucs2_t) * nstates);
+
+    return id;
+}
+
+/*
+ * Reduce the expression tree rooted at `start' into the initial set of DFA
+ * states stored in the buffer.
+ *
+ * The first DFA state holds only `start'.  Each DFA state is then processed
+ * in turn: its expressions are pushed on the buffer stack and rewritten
+ * until each one either records a symbol transition with
+ * _ure_add_symstate(), marks the state accepting (_URE_ONE), or splits into
+ * the two branches of an OR, which are pushed for later processing.  After
+ * that, every symbol that collected states yields one transition whose
+ * target is looked up or created with _ure_add_state(), so new DFA states
+ * may be appended while the outer loop is running.
+ *
+ * b->reducing is set to 1 for the duration of the call.
+ */
+static void
+#ifdef __STDC__
+_ure_reduce(ucs2_t start, _ure_buffer_t *b)
+#else
+_ure_reduce(start, b)
+ucs2_t start;
+_ure_buffer_t *b;
+#endif
+{
+ ucs2_t i, j, state, eval, syms, rhs;
+ ucs2_t s1, s2, ns1, ns2;
+ _ure_state_t *sp;
+ _ure_symtab_t *smp;
+
+ b->reducing = 1;
+
+ /*
+ * Add the starting state for the reduction.
+ */
+ _ure_add_state(1, &start, b);
+
+ /*
+ * Process each set of NFA states that get created.
+ */
+ for (i = 0; i < b->states.states_used; i++) {
+ sp = b->states.states + i;
+
+ /*
+ * Push the current states on the stack.
+ */
+ for (j = 0; j < sp->st.slist_used; j++)
+ _ure_push(sp->st.slist[j], b);
+
+ /*
+ * Reduce the NFA states.
+ */
+ for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
+ state = b->stack.slist[j];
+ eval = 1;
+
+ /*
+ * This inner loop is the iterative equivalent of recursively
+ * reducing subexpressions generated as a result of a reduction.
+ */
+ while (eval) {
+ switch (b->expr[state].type) {
+ case _URE_SYMBOL:
+ ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+ _ure_add_symstate(b->expr[state].lhs, ns1, b);
+ syms++;
+ eval = 0;
+ break;
+ case _URE_ONE:
+ sp->accepting = 1;
+ eval = 0;
+ break;
+ case _URE_QUEST:
+ s1 = b->expr[state].lhs;
+ ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+ state = _ure_make_expr(_URE_OR, ns1, s1, b);
+ break;
+ case _URE_PLUS:
+ s1 = b->expr[state].lhs;
+ ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
+ state = _ure_make_expr(_URE_AND, s1, ns1, b);
+ break;
+ case _URE_STAR:
+ s1 = b->expr[state].lhs;
+ ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+ ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
+ state = _ure_make_expr(_URE_OR, ns1, ns2, b);
+ break;
+ case _URE_OR:
+ s1 = b->expr[state].lhs;
+ s2 = b->expr[state].rhs;
+ _ure_push(s1, b);
+ _ure_push(s2, b);
+ eval = 0;
+ break;
+ case _URE_AND:
+ /*
+ * The rewrite of an AND depends on the type of its left operand.
+ */
+ s1 = b->expr[state].lhs;
+ s2 = b->expr[state].rhs;
+ switch (b->expr[s1].type) {
+ case _URE_SYMBOL:
+ _ure_add_symstate(b->expr[s1].lhs, s2, b);
+ syms++;
+ eval = 0;
+ break;
+ case _URE_ONE:
+ state = s2;
+ break;
+ case _URE_QUEST:
+ ns1 = b->expr[s1].lhs;
+ ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
+ state = _ure_make_expr(_URE_OR, s2, ns2, b);
+ break;
+ case _URE_PLUS:
+ ns1 = b->expr[s1].lhs;
+ ns2 = _ure_make_expr(_URE_OR, s2, state, b);
+ state = _ure_make_expr(_URE_AND, ns1, ns2, b);
+ break;
+ case _URE_STAR:
+ ns1 = b->expr[s1].lhs;
+ ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
+ state = _ure_make_expr(_URE_OR, s2, ns2, b);
+ break;
+ case _URE_OR:
+ ns1 = b->expr[s1].lhs;
+ ns2 = b->expr[s1].rhs;
+ ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
+ ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
+ state = _ure_make_expr(_URE_OR, ns1, ns2, b);
+ break;
+ case _URE_AND:
+ ns1 = b->expr[s1].lhs;
+ ns2 = b->expr[s1].rhs;
+ ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
+ state = _ure_make_expr(_URE_AND, ns1, ns2, b);
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Clear the state stack.
+ */
+ while (_ure_pop(b) != _URE_NOOP) ;
+
+ /*
+ * Reset the state pointer because the reduction may have moved it
+ * during a reallocation.
+ */
+ sp = b->states.states + i;
+
+ /*
+ * Generate the DFA states for the symbols collected during the
+ * current reduction.
+ */
+ if (sp->trans_used + syms > sp->trans_size) {
+ if (sp->trans_size == 0)
+ sp->trans = (_ure_elt_t *)
+ malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
+ else
+ sp->trans = (_ure_elt_t *)
+ realloc((char *) sp->trans,
+ sizeof(_ure_elt_t) * (sp->trans_used + syms));
+ sp->trans_size = sp->trans_used + syms;
+ }
+
+ /*
+ * Go through the symbol table and generate the DFA state transitions
+ * for each symbol that has collected NFA states.
+ */
+ for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
+ sp = b->states.states + i;
+
+ if (smp->states.slist_used > 0) {
+ sp->trans[syms].lhs = smp->id;
+ rhs = _ure_add_state(smp->states.slist_used,
+ smp->states.slist, b);
+ /*
+ * Reset the state pointer in case the reallocation moves it
+ * in memory.
+ */
+ sp = b->states.states + i;
+ sp->trans[syms].rhs = rhs;
+
+ /*
+ * Empty the symbol's state list so it starts clean for the next
+ * DFA state.
+ */
+ smp->states.slist_used = 0;
+ syms++;
+ }
+ }
+
+ /*
+ * Set the number of transitions actually used.
+ */
+ sp->trans_used = syms;
+ }
+ b->reducing = 0;
+}
+
+/*
+ * Add the pair of DFA states (l, r) to the buffer's list of candidate
+ * equivalences.  Both arguments are first replaced by the id currently
+ * stored in their state; a pair whose ids are equal, or which is already
+ * listed, is ignored.  Pairs are stored with the smaller id first and the
+ * list grows in steps of 8 entries.
+ */
+static void
+#ifdef __STDC__
+_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
+#else
+_ure_add_equiv(l, r, b)
+ucs2_t l, r;
+_ure_buffer_t *b;
+#endif
+{
+    ucs2_t lo, hi, n;
+
+    lo = b->states.states[l].id;
+    hi = b->states.states[r].id;
+
+    if (lo == hi)
+        return;
+
+    /*
+     * Normalize the pair so the smaller id comes first.
+     */
+    if (lo > hi) {
+        n = lo;
+        lo = hi;
+        hi = n;
+    }
+
+    /*
+     * Check to see if the equivalence pair already exists.
+     */
+    for (n = 0; n < b->equiv_used; n++) {
+        if (b->equiv[n].l == lo && b->equiv[n].r == hi)
+            return;
+    }
+
+    if (b->equiv_used == b->equiv_size) {
+        if (b->equiv_size == 0)
+            b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
+        else
+            b->equiv = (_ure_equiv_t *)
+                realloc((char *) b->equiv,
+                        sizeof(_ure_equiv_t) * (b->equiv_size + 8));
+        b->equiv_size += 8;
+    }
+
+    b->equiv[b->equiv_used].l = lo;
+    b->equiv[b->equiv_used].r = hi;
+    b->equiv_used++;
+}
+
+/*
+ * Merge the DFA states that are equivalent.
+ *
+ * Every state `i' that still carries its own id is compared with each
+ * earlier state `j' that also carries its own id.  For a candidate pair the
+ * equivalence list is seeded with (i, j) and extended with the pairs of
+ * transition targets; the candidate is rejected as soon as two paired
+ * states differ in their accepting flag, their number of transitions, or
+ * the symbol of a transition.  When a candidate survives, the id of the
+ * right member of every collected pair is replaced by the id of its left
+ * member.  A final pass renumbers the ids.
+ */
+static void
+#ifdef __STDC__
+_ure_merge_equiv(_ure_buffer_t *b)
+#else
+_ure_merge_equiv(b)
+_ure_buffer_t *b;
+#endif
+{
+ ucs2_t i, j, k, eq, done;
+ _ure_state_t *sp1, *sp2, *ls, *rs;
+
+ for (i = 0; i < b->states.states_used; i++) {
+ sp1 = b->states.states + i;
+ /*
+ * Skip states that have already been merged into another one.
+ */
+ if (sp1->id != i)
+ continue;
+ for (j = 0; j < i; j++) {
+ sp2 = b->states.states + j;
+ if (sp2->id != j)
+ continue;
+ b->equiv_used = 0;
+ _ure_add_equiv(i, j, b);
+ /*
+ * The list may grow while it is being walked; `done' is set to 1
+ * when the candidate pair turns out not to be equivalent.
+ */
+ for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
+ ls = b->states.states + b->equiv[eq].l;
+ rs = b->states.states + b->equiv[eq].r;
+ if (ls->accepting != rs->accepting ||
+ ls->trans_used != rs->trans_used) {
+ done = 1;
+ break;
+ }
+ for (k = 0; k < ls->trans_used &&
+ ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
+ if (k < ls->trans_used) {
+ done = 1;
+ break;
+ }
+
+ for (k = 0; k < ls->trans_used; k++)
+ _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
+ }
+ /*
+ * No mismatch was found, so stop searching with j < i.
+ */
+ if (done == 0)
+ break;
+ }
+ /*
+ * j < i only when the search above stopped on an equivalent pair.
+ */
+ for (eq = 0; j < i && eq < b->equiv_used; eq++)
+ b->states.states[b->equiv[eq].r].id =
+ b->states.states[b->equiv[eq].l].id;
+ }
+
+ /*
+ * Renumber the states appropriately.
+ */
+ for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
+ sp1++, i++)
+ sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
+}
+
+/*************************************************************************
+ *
+ * API.
+ *
+ *************************************************************************/
+
+/*
+ * Allocate a new, zero-filled compilation buffer.  Returns whatever
+ * calloc() returns, so the result is null when the allocation fails.
+ */
+ure_buffer_t
+#ifdef __STDC__
+ure_buffer_create(void)
+#else
+ure_buffer_create()
+#endif
+{
+    return (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
+}
+
+/*
+ * Release a compilation buffer and the arrays it owns.  A null buffer is
+ * ignored.  Every array is released only when its recorded size is greater
+ * than zero; the pointer alone is never used to decide.
+ */
+void
+#ifdef __STDC__
+ure_buffer_free(ure_buffer_t buf)
+#else
+ure_buffer_free(buf)
+ure_buffer_t buf;
+#endif
+{
+    unsigned long left;
+    _ure_symtab_t *sym;
+    _ure_state_t *st;
+
+    if (buf == 0)
+        return;
+
+    if (buf->stack.slist_size > 0)
+        free((char *) buf->stack.slist);
+
+    if (buf->expr_size > 0)
+        free((char *) buf->expr);
+
+    /*
+     * Per-symbol NFA state lists first, then the symbol table itself.
+     */
+    for (left = buf->symtab_size, sym = buf->symtab; left > 0;
+         left--, sym++) {
+        if (sym->states.slist_size > 0)
+            free((char *) sym->states.slist);
+    }
+
+    if (buf->symtab_size > 0)
+        free((char *) buf->symtab);
+
+    /*
+     * Per-state transition and NFA state lists, then the state array.
+     */
+    for (left = buf->states.states_size, st = buf->states.states; left > 0;
+         left--, st++) {
+        if (st->trans_size > 0)
+            free((char *) st->trans);
+        if (st->st.slist_size > 0)
+            free((char *) st->st.slist);
+    }
+
+    if (buf->states.states_size > 0)
+        free((char *) buf->states.states);
+
+    if (buf->equiv_size > 0)
+        free((char *) buf->equiv);
+
+    free((char *) buf);
+}
+
+/*
+ * Compile a regular expression into a DFA.
+ *
+ * re       - regular expression text.
+ * relen    - number of characters in `re'.
+ * casefold - non-zero to set _URE_DFA_CASEFOLD for the compilation.
+ * buf      - compilation buffer; it is reset before use and can be reused
+ *            for further compilations, including after a failed one.
+ *
+ * Returns the new DFA, or 0 when an argument is missing or empty, when the
+ * expression could not be converted, or when the DFA itself could not be
+ * allocated.  On success the buffer's symbol table is handed over to the
+ * DFA and the buffer's symbol table size is reset to zero.
+ */
+ure_dfa_t
+#ifdef __STDC__
+ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
+#else
+ure_compile(re, relen, casefold, buf)
+ucs2_t *re;
+unsigned long relen;
+int casefold;
+ure_buffer_t buf;
+#endif
+{
+    ucs2_t i, j, state;
+    _ure_state_t *sp;
+    _ure_dstate_t *dsp;
+    _ure_trans_t *tp;
+    ure_dfa_t dfa;
+
+    /*
+     * The length is tested before the first character is looked at so that
+     * an empty expression is never read.
+     */
+    if (re == 0 || relen == 0 || *re == 0 || buf == 0)
+        return 0;
+
+    /*
+     * Reset the various fields of the compilation buffer.  Default the flags
+     * to indicate the presence of the "^$" pattern.  If any other pattern
+     * occurs, then this flag will be removed.  This is done to catch this
+     * special pattern and handle it specially when matching.
+     */
+    buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
+    buf->reducing = 0;
+    buf->stack.slist_used = 0;
+    buf->expr_used = 0;
+
+    /*
+     * Clear any error left behind by an earlier compilation; the parser
+     * stops immediately when the error code is not _URE_OK.
+     */
+    buf->error = _URE_OK;
+
+    for (i = 0; i < buf->symtab_used; i++)
+        buf->symtab[i].states.slist_used = 0;
+    buf->symtab_used = 0;
+
+    for (i = 0; i < buf->states.states_used; i++) {
+        buf->states.states[i].st.slist_used = 0;
+        buf->states.states[i].trans_used = 0;
+    }
+    buf->states.states_used = 0;
+
+    /*
+     * Construct the NFA.  If this stage returns _URE_NOOP, then an error
+     * occurred or an empty expression was passed.
+     */
+    if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
+        return 0;
+
+    /*
+     * Do the expression reduction to get the initial DFA.
+     */
+    _ure_reduce(state, buf);
+
+    /*
+     * Merge all the equivalent DFA states.
+     */
+    _ure_merge_equiv(buf);
+
+    /*
+     * Construct the minimal DFA.  Nothing has been handed over from the
+     * buffer yet, so a failed allocation can simply be reported.
+     */
+    dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
+    if (dfa == 0)
+        return 0;
+    (void) memset((char *) dfa, 0, sizeof(_ure_dfa_t));
+
+    dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
+
+    /*
+     * Free up the NFA state groups and transfer the symbols from the buffer
+     * to the DFA.
+     */
+    for (i = 0; i < buf->symtab_size; i++) {
+        if (buf->symtab[i].states.slist_size > 0)
+            free((char *) buf->symtab[i].states.slist);
+    }
+    dfa->syms = buf->symtab;
+    dfa->nsyms = buf->symtab_used;
+
+    buf->symtab_used = buf->symtab_size = 0;
+
+    /*
+     * Collect the total number of states and transitions needed for the DFA.
+     * Only the first state carrying each id is counted.
+     */
+    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
+         i++, sp++) {
+        if (sp->id == state) {
+            dfa->nstates++;
+            dfa->ntrans += sp->trans_used;
+            state++;
+        }
+    }
+
+    /*
+     * Allocate enough space for the states and transitions.
+     */
+    dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
+                                           dfa->nstates);
+    dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
+
+    /*
+     * Actually transfer the DFA states from the buffer.
+     */
+    dsp = dfa->states;
+    tp = dfa->trans;
+    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
+         i++, sp++) {
+        if (sp->id == state) {
+            dsp->trans = tp;
+            dsp->ntrans = sp->trans_used;
+            dsp->accepting = sp->accepting;
+
+            /*
+             * Add the transitions for the state.  The target is translated
+             * to the id its state carries after merging.
+             */
+            for (j = 0; j < dsp->ntrans; j++, tp++) {
+                tp->symbol = sp->trans[j].lhs;
+                tp->next_state = buf->states.states[sp->trans[j].rhs].id;
+            }
+
+            dsp++;
+            state++;
+        }
+    }
+
+    return dfa;
+}
+
+/*
+ * Release a DFA and the storage it owns: the range lists of its character
+ * class symbols, the symbol table, the state and transition arrays, and
+ * finally the DFA structure itself.  A null `dfa' is ignored.
+ */
+void
+#ifdef __STDC__
+ure_dfa_free(ure_dfa_t dfa)
+#else
+ure_dfa_free(dfa)
+ure_dfa_t dfa;
+#endif
+{
+    ucs2_t n;
+    _ure_symtab_t *sym;
+
+    if (dfa == 0)
+      return;
+
+    /*
+     * Range lists are released only for (negated) character class symbols
+     * that actually had storage allocated.
+     */
+    for (n = 0, sym = dfa->syms; n < dfa->nsyms; n++, sym++) {
+        if (sym->type != _URE_CCLASS && sym->type != _URE_NCCLASS)
+          continue;
+        if (sym->sym.ccl.ranges_size > 0)
+          free((char *) sym->sym.ccl.ranges);
+    }
+
+    /*
+     * Each array is released only when its element count is non-zero.
+     */
+    if (dfa->nsyms > 0)
+      free((char *) dfa->syms);
+    if (dfa->nstates > 0)
+      free((char *) dfa->states);
+    if (dfa->ntrans > 0)
+      free((char *) dfa->trans);
+
+    free((char *) dfa);
+}
+
+/*
+ * Write a readable dump of a compiled DFA to `out'.
+ *
+ * The character class symbols are written first, one per line, in the form
+ * "C<id> = " followed by the property numbers (after "\p", or "\P" for a
+ * negated class) and the bracketed list of code ranges.  Then every state is
+ * written as "S<n> = ", a "1" if the state is accepting, and its transitions
+ * "<symbol> S<next>" separated by "| ".  Code points in the range
+ * 0x10000-0x10FFFF are written as a UTF-16 surrogate pair.
+ *
+ * Nothing is written when `dfa' or `out' is null.
+ */
+void
+#ifdef __STDC__
+ure_write_dfa(ure_dfa_t dfa, FILE *out)
+#else
+ure_write_dfa(dfa, out)
+ure_dfa_t dfa;
+FILE *out;
+#endif
+{
+    ucs2_t i, j, k, h, l;
+    _ure_dstate_t *sp;
+    _ure_symtab_t *sym;
+    _ure_range_t *rp;
+
+    if (dfa == 0 || out == 0)
+      return;
+
+    /*
+     * Write all the different character classes.
+     */
+    for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
+        if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
+            fprintf(out, "C%hd = ", sym->id);
+            if (sym->sym.ccl.ranges_used > 0) {
+                putc('[', out);
+                if (sym->type == _URE_NCCLASS)
+                  putc('^', out);
+            }
+            if (sym->props != 0) {
+                if (sym->type == _URE_NCCLASS)
+                  fprintf(out, "\\P");
+                else
+                  fprintf(out, "\\p");
+                /*
+                 * List the 1-based numbers of the property bits that are
+                 * set; `h' records whether a separator is needed.  The mask
+                 * is built from an unsigned long because shifting a plain
+                 * int into bit 31 is undefined.
+                 */
+                for (k = h = 0; k < 32; k++) {
+                    if (sym->props & (1UL << k)) {
+                        if (h != 0)
+                          putc(',', out);
+                        fprintf(out, "%hd", k + 1);
+                        h = 1;
+                    }
+                }
+            }
+            /*
+             * Dump the ranges.
+             */
+            for (k = 0, rp = sym->sym.ccl.ranges;
+                 k < sym->sym.ccl.ranges_used; k++, rp++) {
+                /*
+                 * Check for UTF16 characters.
+                 */
+                if (0x10000 <= rp->min_code &&
+                    rp->min_code <= 0x10ffff) {
+                    h = ((rp->min_code - 0x10000) >> 10) + 0xd800;
+                    l = ((rp->min_code - 0x10000) & 1023) + 0xdc00;
+                    fprintf(out, "\\x%04hX\\x%04hX", h, l);
+                } else
+                  fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
+                if (rp->max_code != rp->min_code) {
+                    putc('-', out);
+                    if (rp->max_code >= 0x10000 &&
+                        rp->max_code <= 0x10ffff) {
+                        h = ((rp->max_code - 0x10000) >> 10) + 0xd800;
+                        l = ((rp->max_code - 0x10000) & 1023) + 0xdc00;
+                        fprintf(out, "\\x%04hX\\x%04hX", h, l);
+                    } else
+                      fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
+                }
+            }
+            if (sym->sym.ccl.ranges_used > 0)
+              putc(']', out);
+            putc('\n', out);
+        }
+    }
+
+    /*
+     * Write the states and their transitions.
+     */
+    for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
+        fprintf(out, "S%hd = ", i);
+        if (sp->accepting) {
+            fprintf(out, "1 ");
+            if (sp->ntrans)
+              fprintf(out, "| ");
+        }
+        for (j = 0; j < sp->ntrans; j++) {
+            if (j > 0)
+              fprintf(out, "| ");
+
+            sym = dfa->syms + sp->trans[j].symbol;
+            switch (sym->type) {
+              case _URE_CHAR:
+                if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
+                    /*
+                     * Take care of UTF16 characters.
+                     */
+                    h = ((sym->sym.chr - 0x10000) >> 10) + 0xd800;
+                    l = ((sym->sym.chr - 0x10000) & 1023) + 0xdc00;
+                    fprintf(out, "\\x%04hX\\x%04hX ", h, l);
+                } else
+                  fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
+                break;
+              case _URE_ANY_CHAR:
+                fprintf(out, "<any> ");
+                break;
+              case _URE_BOL_ANCHOR:
+                fprintf(out, "<bol-anchor> ");
+                break;
+              case _URE_EOL_ANCHOR:
+                fprintf(out, "<eol-anchor> ");
+                break;
+              case _URE_CCLASS:
+              case _URE_NCCLASS:
+                fprintf(out, "[C%hd] ", sym->id);
+                break;
+            }
+            fprintf(out, "S%hd", sp->trans[j].next_state);
+            if (j + 1 < sp->ntrans)
+              putc(' ', out);
+        }
+        putc('\n', out);
+    }
+}
+
+/*
+ * True when `cc' is one of the separators tested for by ure_exec(): '\n',
+ * '\r', U+2028 or U+2029.  The argument is evaluated more than once, so it
+ * must not have side effects.
+ */
+#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
+ (cc) == 0x2029)
+
+/*
+ * Run a compiled DFA over `textlen' UCS2 code units of `text'.
+ *
+ * `flags' may combine URE_IGNORE_NONSPACING (characters reported as
+ * non-spacing by _ure_matches_properties() are skipped) and
+ * URE_DOT_MATCHES_SEPARATORS (the "any character" symbol also matches the
+ * separators tested by _ure_issep()).  A high surrogate followed by a low
+ * surrogate is combined into a single character before matching.
+ *
+ * Returns 1 and stores the code unit offsets of the match in `*match_start'
+ * and `*match_end' when a match is found.  Returns 0 otherwise; in that case
+ * both offsets are set to ~0, except when an argument is null, in which case
+ * nothing is stored.
+ */
+int
+#ifdef __STDC__
+ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
+         unsigned long *match_start, unsigned long *match_end)
+#else
+ure_exec(dfa, flags, text, textlen, match_start, match_end)
+ure_dfa_t dfa;
+int flags;
+ucs2_t *text;
+unsigned long textlen, *match_start, *match_end;
+#endif
+{
+    int i, j, matched, found;
+    unsigned long ms, me;
+    ucs4_t c;
+    ucs2_t *sp, *ep, *lp;
+    _ure_dstate_t *stp;
+    _ure_symtab_t *sym;
+    _ure_range_t *rp;
+
+    /*
+     * The result pointers are written unconditionally below, so they are
+     * validated along with the other arguments.
+     */
+    if (dfa == 0 || text == 0 || match_start == 0 || match_end == 0)
+      return 0;
+
+    /*
+     * Handle the special case of an empty string matching the "^$" pattern.
+     */
+    if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
+        *match_start = *match_end = 0;
+        return 1;
+    }
+
+    sp = text;
+    ep = sp + textlen;
+
+    ms = me = ~0UL;
+
+    stp = dfa->states;
+
+    for (found = 0; found == 0 && sp < ep; ) {
+        lp = sp;
+        c = *sp++;
+
+        /*
+         * Check to see if this is a high surrogate that should be
+         * combined with a following low surrogate.
+         */
+        if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
+            0xdc00 <= *sp && *sp <= 0xdfff)
+          c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
+
+        /*
+         * Determine if the character is non-spacing and should be skipped.
+         * `sp' already points past the character, so it must not be advanced
+         * again here or the following code unit would be dropped as well.
+         */
+        if ((flags & URE_IGNORE_NONSPACING) &&
+            _ure_matches_properties(_URE_NONSPACING, c))
+          continue;
+
+        if (dfa->flags & _URE_DFA_CASEFOLD)
+          c = _ure_tolower(c);
+
+        /*
+         * See if one of the transitions matches.
+         */
+        for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
+            sym = dfa->syms + stp->trans[i].symbol;
+            switch (sym->type) {
+              case _URE_ANY_CHAR:
+                if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
+                    !_ure_issep(c))
+                  matched = 1;
+                break;
+              case _URE_CHAR:
+                if (c == sym->sym.chr)
+                  matched = 1;
+                break;
+              case _URE_BOL_ANCHOR:
+                if (lp == text) {
+                    sp = lp;
+                    matched = 1;
+                } else if (_ure_issep(c)) {
+                    if (c == '\r' && sp < ep && *sp == '\n')
+                      sp++;
+                    lp = sp;
+                    matched = 1;
+                }
+                break;
+              case _URE_EOL_ANCHOR:
+                if (_ure_issep(c)) {
+                    /*
+                     * Put the pointer back before the separator so the match
+                     * end position will be correct.  This case will also
+                     * cause the `sp' pointer to be advanced over the current
+                     * separator once the match end point has been recorded.
+                     */
+                    sp = lp;
+                    matched = 1;
+                }
+                break;
+              case _URE_CCLASS:
+              case _URE_NCCLASS:
+                if (sym->props != 0)
+                  matched = _ure_matches_properties(sym->props, c);
+                for (j = 0, rp = sym->sym.ccl.ranges;
+                     j < sym->sym.ccl.ranges_used; j++, rp++) {
+                    if (rp->min_code <= c && c <= rp->max_code)
+                      matched = 1;
+                }
+                if (sym->type == _URE_NCCLASS)
+                  matched = !matched;
+                break;
+            }
+
+            if (matched) {
+                /*
+                 * Record the start on the first matched symbol and the end
+                 * on every matched symbol, so that a match consisting of a
+                 * single symbol also has a valid end position.
+                 */
+                if (ms == ~0UL)
+                  ms = lp - text;
+                me = sp - text;
+                stp = dfa->states + stp->trans[i].next_state;
+
+                /*
+                 * If the match was an EOL anchor, adjust the pointer past the
+                 * separator that caused the match.  The correct match
+                 * position has been recorded already.
+                 */
+                if (sym->type == _URE_EOL_ANCHOR) {
+                    /*
+                     * Skip the character that caused the match.
+                     */
+                    sp++;
+
+                    /*
+                     * Handle the infamous CRLF situation.
+                     */
+                    if (sp < ep && c == '\r' && *sp == '\n')
+                      sp++;
+                }
+            }
+        }
+
+        if (matched == 0) {
+            if (stp->accepting == 0) {
+                /*
+                 * If the last state was not accepting, then reset
+                 * and start over.
+                 */
+                stp = dfa->states;
+                ms = me = ~0UL;
+            } else
+              /*
+               * The last state was accepting, so terminate the matching
+               * loop to avoid more work.
+               */
+              found = 1;
+        } else if (sp == ep) {
+            if (!stp->accepting) {
+                /*
+                 * This ugly hack is to make sure the end-of-line anchors
+                 * match when the source text hits the end.  This is only done
+                 * if the last subexpression matches.
+                 */
+                for (i = 0; found == 0 && i < stp->ntrans; i++) {
+                    sym = dfa->syms + stp->trans[i].symbol;
+                    if (sym->type ==_URE_EOL_ANCHOR) {
+                        stp = dfa->states + stp->trans[i].next_state;
+                        if (stp->accepting) {
+                            me = sp - text;
+                            found = 1;
+                        } else
+                          break;
+                    }
+                }
+            } else {
+                /*
+                 * Make sure any conditions that match all the way to the end
+                 * of the string match.
+                 */
+                found = 1;
+                me = sp - text;
+            }
+        }
+    }
+
+    /*
+     * The loop can run out of text while skipping trailing non-spacing
+     * characters.  If a match was in progress and the current state is
+     * accepting, the match recorded so far is complete.
+     */
+    if (found == 0 && ms != ~0UL && stp->accepting)
+      found = 1;
+
+    if (found == 0)
+      ms = me = ~0UL;
+
+    *match_start = ms;
+    *match_end = me;
+
+    return (ms != ~0UL) ? 1 : 0;
+}
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _h_ure
+#define _h_ure
+
+/*
+ * $Id: ure.h,v 1.2 1999/09/21 15:47:44 mleisher Exp $
+ */
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Prototype wrapper: expands to the full parameter list under an ANSI
+ * compiler and to an empty list otherwise.
+ */
+#undef __
+#ifdef __STDC__
+#define __(x) x
+#else
+#define __(x) ()
+#endif
+
+/*
+ * Set of character class flags.
+ */
+#define _URE_NONSPACING 0x00000001
+#define _URE_COMBINING 0x00000002
+#define _URE_NUMDIGIT 0x00000004
+#define _URE_NUMOTHER 0x00000008
+#define _URE_SPACESEP 0x00000010
+#define _URE_LINESEP 0x00000020
+#define _URE_PARASEP 0x00000040
+#define _URE_CNTRL 0x00000080
+#define _URE_PUA 0x00000100
+
+#define _URE_UPPER 0x00000200
+#define _URE_LOWER 0x00000400
+#define _URE_TITLE 0x00000800
+#define _URE_MODIFIER 0x00001000
+#define _URE_OTHERLETTER 0x00002000
+#define _URE_DASHPUNCT 0x00004000
+#define _URE_OPENPUNCT 0x00008000
+#define _URE_CLOSEPUNCT 0x00010000
+#define _URE_OTHERPUNCT 0x00020000
+#define _URE_MATHSYM 0x00040000
+#define _URE_CURRENCYSYM 0x00080000
+#define _URE_OTHERSYM 0x00100000
+
+#define _URE_LTR 0x00200000
+#define _URE_RTL 0x00400000
+
+#define _URE_EURONUM 0x00800000
+#define _URE_EURONUMSEP 0x01000000
+#define _URE_EURONUMTERM 0x02000000
+#define _URE_ARABNUM 0x04000000
+#define _URE_COMMONSEP 0x08000000
+
+#define _URE_BLOCKSEP 0x10000000
+#define _URE_SEGMENTSEP 0x20000000
+
+#define _URE_WHITESPACE 0x40000000
+#define _URE_OTHERNEUT 0x80000000
+
+/*
+ * Error codes.
+ */
+#define _URE_OK 0
+#define _URE_UNEXPECTED_EOS -1
+#define _URE_CCLASS_OPEN -2
+#define _URE_UNBALANCED_GROUP -3
+#define _URE_INVALID_PROPERTY -4
+
+/*
+ * Options that can be combined for searching.
+ */
+#define URE_IGNORE_NONSPACING 0x01
+#define URE_DOT_MATCHES_SEPARATORS 0x02
+
+/*
+ * Character types: a full code point and a single UCS2/UTF-16 code unit.
+ */
+typedef unsigned long ucs4_t;
+typedef unsigned short ucs2_t;
+
+/*
+ * Opaque type for memory used when compiling expressions.
+ */
+typedef struct _ure_buffer_t *ure_buffer_t;
+
+/*
+ * Opaque type for the minimal DFA used when matching.
+ */
+typedef struct _ure_dfa_t *ure_dfa_t;
+
+/*************************************************************************
+ *
+ * API.
+ *
+ *************************************************************************/
+
+/*
+ * Create and release the buffer that ure_compile() works in.
+ */
+extern ure_buffer_t ure_buffer_create __((void));
+
+extern void ure_buffer_free __((ure_buffer_t buf));
+
+/*
+ * Compile the `relen' code units of the expression `re' into a DFA using
+ * the work buffer `buf'.  A non-zero `casefold' requests case-insensitive
+ * matching.  Returns 0 if `re' is null or empty, if `buf' is null, or if
+ * the expression could not be compiled.
+ */
+extern ure_dfa_t ure_compile __((ucs2_t *re, unsigned long relen,
+ int casefold, ure_buffer_t buf));
+
+/*
+ * Release a DFA returned by ure_compile().  A null `dfa' is ignored.
+ */
+extern void ure_dfa_free __((ure_dfa_t dfa));
+
+/*
+ * Write a readable listing of the character classes and states of `dfa'
+ * to `out'.  Does nothing if either argument is null.
+ */
+extern void ure_write_dfa __((ure_dfa_t dfa, FILE *out));
+
+/*
+ * Search the `textlen' code units of `text' for a match of `dfa'.  `flags'
+ * is a combination of the URE_* search options above.  Returns 1 and stores
+ * the offsets of the match in `*match_start' and `*match_end' when a match
+ * is found; returns 0 when nothing matches or when `dfa' or `text' is null.
+ */
+extern int ure_exec __((ure_dfa_t dfa, int flags,
+ ucs2_t *text, unsigned long textlen,
+ unsigned long *match_start, unsigned long *match_end));
+
+/*************************************************************************
+ *
+ * Prototypes for stub functions used for URE. These need to be rewritten to
+ * use the Unicode support available on the system.
+ *
+ *************************************************************************/
+
+extern ucs4_t _ure_tolower __((ucs4_t c));
+
+extern int _ure_matches_properties __((unsigned long props, ucs4_t c));
+
+#undef __
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_ure */
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint
+static char rcsid[] = "$Id: urestubs.c,v 1.2 1999/09/21 15:47:44 mleisher Exp $";
+#endif
+
+#include "ure.h"
+
+/*
+ * This file contains stub routines needed by the URE package to test
+ * character properties and other Unicode implementation specific details.
+ */
+
+/*
+ * This routine should return the lower case equivalent for the character or,
+ * if there is no lower case equivalent, the character itself.
+ *
+ * The stub below performs no conversion: it returns `c' unchanged.
+ */
+ucs4_t
+#ifdef __STDC__
+_ure_tolower(ucs4_t c)
+#else
+_ure_tolower(c)
+ucs4_t c;
+#endif
+{
+ return c;
+}
+
+/*
+ * This routine takes a set of URE character property flags (see ure.h) along
+ * with a character and tests to see if the character has one or more of those
+ * properties.
+ *
+ * The stub below ignores both arguments and reports a match (1) for every
+ * character, so it must be replaced before property tests are meaningful.
+ */
+int
+#ifdef __STDC__
+_ure_matches_properties(unsigned long props, ucs4_t c)
+#else
+_ure_matches_properties(props, c)
+unsigned long props;
+ucs4_t c;
+#endif
+{
+ return 1;
+}
--- /dev/null
+#
+# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $
+#
+# Copyright 1997, 1998, 1999 Computing Research Labs,
+# New Mexico State University
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+ Unicode and Boyer-Moore Searching
+ Version 0.2
+
+UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned
+Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates).
+
+---------------------------------------------------------------------------
+
+Assumptions:
+
+ o Search pattern and text already normalized in some fashion.
+
+ o Upper, lower, and title case conversions are one-to-one.
+
+ o For conversions between upper, lower, and title case, UCS2 characters
+ always convert to other UCS2 characters, and UTF-16 characters always
+ convert to other UTF-16 characters.
+
+Flags:
+
+ UTBM provides three processing flags:
+
+ o UTBM_CASEFOLD - search in a case-insensitive manner.
+
+ o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and
+ the text.
+
+ o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of
+ U+2028, U+2029, '\n', '\r', '\t', and any
+ character identified as a space by the Unicode
+ support on the platform.
+
+ This flag also causes all characters identified
+ as control by the Unicode support on the
+ platform to be ignored (except for '\n', '\r',
+ and '\t').
+
+---------------------------------------------------------------------------
+
+Before using UTBM
+-----------------
+Before UTBM is used, some functions need to be created. The "utbmstub.c" file
+contains stubs that need to be rewritten so they work with the Unicode support
+on the platform on which this package is being used.
+
+Using UTBM
+----------
+
+Sample pseudo-code fragment.
+
+ utbm_pattern_t pat;
+ ucs2_t *pattern, *text;
+ unsigned long patternlen, textlen;
+ unsigned long flags, match_start, match_end;
+
+ /*
+ * Allocate the dynamic storage needed for a search pattern.
+ */
+ pat = utbm_create_pattern();
+
+ /*
+ * Set the search flags desired.
+ */
+ flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING;
+
+ /*
+ * Compile the search pattern.
+ */
+ utbm_compile(pattern, patternlen, flags, pat);
+
+ /*
+ * Find the first occurrence of the search pattern in the text.
+ */
+ if (utbm_exec(pat, text, textlen, &match_start, &match_end))
+ printf("MATCH: %ld %ld\n", match_start, match_end);
+
+ /*
+ * Free the dynamic storage used for the search pattern.
+ */
+ utbm_free_pattern(pat);
+
+---------------------------------------------------------------------------
+
+Mark Leisher <mleisher@crl.nmsu.edu>
+2 May 1997
+
+===========================================================================
+
+CHANGES
+-------
+
+Version: 0.2
+Date : 21 September 1999
+==========================
+ 1. Added copyright stuff and put in CVS.
+
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint
+static char rcsid[] = "$Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $";
+#endif
+
+/*
+ * Assumptions:
+ * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
+ * 2. Case conversions are all one-to-one.
+ * 3. Text and pattern have already been normalized in some fashion.
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include "utbm.h"
+
+/*
+ * Single pattern character.  utbm_compile() stores the lower, upper and
+ * title case forms of the character when UTBM_CASEFOLD is set, and the
+ * character itself in all three fields otherwise.
+ */
+typedef struct {
+ ucs4_t lc;
+ ucs4_t uc;
+ ucs4_t tc;
+} _utbm_char_t;
+
+/*
+ * Skip table entry: a pattern character and the distance the search
+ * position advances when that character is seen in the text.
+ */
+typedef struct {
+ _utbm_char_t *ch;
+ unsigned long skip;
+} _utbm_skip_t;
+
+/*
+ * Compiled search pattern.
+ *
+ * flags     - UTBM_* flags given to utbm_compile().
+ * pat       - preprocessed pattern characters; pat_used entries are valid
+ *             and pat_size entries are allocated.
+ * patlen    - length of the preprocessed pattern in UCS2 code units.
+ * skip      - skip table; skip_used entries are valid and skip_size entries
+ *             are allocated.
+ * md4       - distance utbm_exec() advances after a failed match attempt.
+ */
+typedef struct _utbm_pattern_t {
+ unsigned long flags;
+
+ _utbm_char_t *pat;
+ unsigned long pat_used;
+ unsigned long pat_size;
+ unsigned long patlen;
+
+ _utbm_skip_t *skip;
+ unsigned long skip_used;
+ unsigned long skip_size;
+
+ unsigned long md4;
+} _utbm_pattern_t;
+
+/*************************************************************************
+ *
+ * Support functions.
+ *
+ *************************************************************************/
+
+/*
+ * Routine to look up the skip value for a character.
+ *
+ * The character at `start' is decoded (a surrogate pair is combined when the
+ * low surrogate lies before `end') and looked up in the skip table of `p'.
+ * Returns the skip recorded for the character, limited to the number of
+ * code units left before `end'; the pattern length if the character does
+ * not occur in the pattern; and 0 if `start' is not before `end'.
+ */
+static unsigned long
+#ifdef __STDC__
+_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
+#else
+_utbm_skip(p, start, end)
+utbm_pattern_t p;
+ucs2_t *start, *end;
+#endif
+{
+    unsigned long i, remaining;
+    ucs4_t c1, c2;
+    _utbm_skip_t *sp;
+
+    if (start >= end)
+      return 0;
+
+    c1 = *start;
+    c2 = (start + 1 < end) ? *(start + 1) : ~0UL;
+    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
+      c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
+
+    for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
+        /*
+         * The character must equal one of the three case forms.  ANDing the
+         * XOR differences is not an equality test: it is also zero for
+         * unrelated characters whenever the case forms differ in more than
+         * one bit.
+         */
+        if (c1 == sp->ch->uc || c1 == sp->ch->lc || c1 == sp->ch->tc) {
+            remaining = (unsigned long) (end - start);
+            return (remaining < sp->skip) ? remaining : sp->skip;
+        }
+    }
+    return p->patlen;
+}
+
+/*
+ * Step `*pos' back to the character that precedes it in `text' and return
+ * that character.  A low surrogate preceded by a high surrogate is combined
+ * and `*pos' is left on the high surrogate.  The caller guarantees that
+ * `*pos' is greater than `text'.
+ */
+static ucs4_t
+#ifdef __STDC__
+_utbm_prev_char(ucs2_t *text, ucs2_t **pos)
+#else
+_utbm_prev_char(text, pos)
+ucs2_t *text, **pos;
+#endif
+{
+    ucs2_t *p;
+    ucs4_t lo, hi;
+
+    p = *pos - 1;
+    lo = *p;
+
+    /*
+     * Only look for a high surrogate when a code unit really exists before
+     * this one; that includes the very first code unit of the text.
+     */
+    if (p > text && 0xdc00 <= lo && lo <= 0xdfff) {
+        hi = *(p - 1);
+        if (0xd800 <= hi && hi <= 0xdbff) {
+            lo = 0x10000 + (((hi & 0x03ff) << 10) | (lo & 0x03ff));
+            p--;
+        }
+    }
+
+    *pos = p;
+    return lo;
+}
+
+/*
+ * Verify a candidate match whose last character is at `start' by comparing
+ * the pattern backward against the text.
+ *
+ * `*match_end' is always set to the offset just past the character at
+ * `start'.  Returns 1 and sets `*match_start' when the whole pattern
+ * matches, 0 otherwise.  Non-spacing characters are skipped when
+ * UTBM_IGNORE_NONSPACING is set; with UTBM_SPACE_COMPRESS a run of space
+ * and control characters in the text is matched against a single pattern
+ * space.
+ */
+static int
+#ifdef __STDC__
+_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
+            unsigned long *match_start, unsigned long *match_end)
+#else
+_utbm_match(pat, text, start, end, match_start, match_end)
+utbm_pattern_t pat;
+ucs2_t *text, *start, *end;
+unsigned long *match_start, *match_end;
+#endif
+{
+    int check_space;
+    ucs4_t c1, c2;
+    unsigned long count;
+    ucs2_t *run_start;
+    _utbm_char_t *cp;
+
+    /*
+     * Set the potential match endpoint first.
+     */
+    *match_end = (start - text) + 1;
+
+    c1 = *start;
+    c2 = (start + 1 < end) ? *(start + 1) : ~0UL;
+    if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
+        c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
+        /*
+         * Adjust the match end point to occur after the UTF-16 character.
+         */
+        *match_end = *match_end + 1;
+    }
+
+    /*
+     * A one character pattern is accepted without a comparison here.
+     */
+    if (pat->pat_used == 1) {
+        *match_start = start - text;
+        return 1;
+    }
+
+    /*
+     * Compare backward.  `c1' is always the character that begins at
+     * `start', `cp' the pattern character it is compared with, and `count'
+     * the number of pattern code units still to be matched.
+     */
+    cp = pat->pat + (pat->pat_used - 1);
+
+    for (count = pat->patlen; start > text && count > 0;) {
+        /*
+         * Ignore non-spacing characters if indicated.
+         */
+        if (pat->flags & UTBM_IGNORE_NONSPACING) {
+            while (start > text && _utbm_nonspacing(c1))
+              c1 = _utbm_prev_char(text, &start);
+        }
+
+        /*
+         * Handle space compression if indicated.
+         */
+        if (pat->flags & UTBM_SPACE_COMPRESS) {
+            check_space = 0;
+            run_start = start;
+            while (start > text &&
+                   (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
+                /*
+                 * NOTE(review): only the last character stepped over
+                 * decides `check_space', so a run whose earliest character
+                 * is a control is not treated as a space -- confirm that
+                 * this is intended.
+                 */
+                check_space = _utbm_isspace(c1, 1);
+                run_start = start;
+                c1 = _utbm_prev_char(text, &start);
+            }
+            /*
+             * Handle things if space compression was indicated and one or
+             * more member characters were found.
+             */
+            if (check_space) {
+                if (cp->uc != ' ')
+                  return 0;
+                /*
+                 * If the space was the first pattern character the match is
+                 * complete and starts at the earliest character of the run.
+                 * Stopping here keeps `count' from wrapping around and `cp'
+                 * from moving in front of the pattern.
+                 */
+                if (--count == 0) {
+                    *match_start = run_start - text;
+                    return 1;
+                }
+                cp--;
+            }
+        }
+
+        /*
+         * Handle the normal comparison cases.  The text character has to
+         * equal one of the case forms of the pattern character.
+         */
+        if (count > 0 &&
+            c1 != cp->uc && c1 != cp->lc && c1 != cp->tc)
+          return 0;
+
+        count -= (c1 >= 0x10000) ? 2 : 1;
+        if (count > 0) {
+            cp--;
+
+            /*
+             * Get the next preceding character.
+             */
+            if (start > text)
+              c1 = _utbm_prev_char(text, &start);
+        }
+    }
+
+    /*
+     * If the beginning of the text was reached before the whole pattern was
+     * compared, this is not a match.
+     */
+    if (count > 0)
+      return 0;
+
+    /*
+     * Set the match start position.
+     */
+    *match_start = start - text;
+    return 1;
+}
+
+/*************************************************************************
+ *
+ * API.
+ *
+ *************************************************************************/
+
+/*
+ * Allocate an empty, zero-filled search pattern.  The result is null when
+ * the allocation fails; release it with utbm_free_pattern().
+ */
+utbm_pattern_t
+#ifdef __STDC__
+utbm_create_pattern(void)
+#else
+utbm_create_pattern()
+#endif
+{
+    /*
+     * calloc() zero-fills the structure and reports failure with a null
+     * result, so nothing is written through a failed allocation.
+     */
+    return (utbm_pattern_t) calloc(1, sizeof(_utbm_pattern_t));
+}
+
+/*
+ * Release a search pattern together with its character and skip arrays.
+ * A null `pattern' is ignored.
+ */
+void
+#ifdef __STDC__
+utbm_free_pattern(utbm_pattern_t pattern)
+#else
+utbm_free_pattern(pattern)
+utbm_pattern_t pattern;
+#endif
+{
+    if (pattern != 0) {
+        /*
+         * The arrays are released only when their recorded size says
+         * that storage was allocated for them.
+         */
+        if (pattern->pat_size > 0)
+          free((char *) pattern->pat);
+        if (pattern->skip_size > 0)
+          free((char *) pattern->skip);
+
+        free((char *) pattern);
+    }
+}
+
+/*
+ * Compile the `patlen' code units of `pat' into the pattern buffer `p'.
+ *
+ * `flags' is a combination of UTBM_CASEFOLD (store the upper, lower and
+ * title case form of every character), UTBM_IGNORE_NONSPACING (drop
+ * non-spacing characters) and UTBM_SPACE_COMPRESS (collapse runs of spaces
+ * to a single ' ' and drop control characters).
+ *
+ * Nothing is done when `p' or `pat' is null or the pattern is empty.  If
+ * storage for the pattern cannot be obtained, `p' is left holding an empty
+ * pattern, which utbm_exec() never matches.
+ */
+void
+#ifdef __STDC__
+utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
+             utbm_pattern_t p)
+#else
+utbm_compile(pat, patlen, flags, p)
+ucs2_t *pat;
+unsigned long patlen, flags;
+utbm_pattern_t p;
+#endif
+{
+    int have_space;
+    unsigned long i, j, k, slen;
+    _utbm_char_t *cp, *newpat;
+    _utbm_skip_t *sp, *newskip;
+    ucs4_t c1, c2, sentinel;
+
+    if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
+      return;
+
+    /*
+     * Reset the pattern buffer.
+     */
+    p->patlen = p->pat_used = p->skip_used = 0;
+
+    /*
+     * Set the flags.
+     */
+    p->flags = flags;
+
+    /*
+     * Initialize the extra skip flag.
+     */
+    p->md4 = 1;
+
+    /*
+     * Allocate more storage if necessary.  The new blocks are kept in
+     * temporaries until they are known to be valid so a failure neither
+     * loses the old storage nor leaves a null array behind.
+     */
+    if (patlen > p->pat_size) {
+        if (p->pat_size == 0) {
+            newpat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
+            newskip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
+            if (newpat == 0 || newskip == 0) {
+                free((char *) newpat);
+                free((char *) newskip);
+                return;
+            }
+        } else {
+            newpat = (_utbm_char_t *)
+                realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
+            if (newpat == 0)
+              return;
+            /*
+             * The old block is gone once realloc() succeeds, so the new
+             * address has to be recorded even if the next call fails.
+             */
+            p->pat = newpat;
+            newskip = (_utbm_skip_t *)
+                realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
+            if (newskip == 0)
+              return;
+        }
+        p->pat = newpat;
+        p->skip = newskip;
+        p->pat_size = p->skip_size = patlen;
+    }
+
+    /*
+     * Preprocess the pattern to remove controls (if specified) and determine
+     * case.
+     */
+    sentinel = 0;
+    for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
+        c1 = pat[i];
+        c2 = (i + 1 < patlen) ? pat[i + 1] : ~0UL;
+        if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
+            c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
+            /*
+             * Step over the low surrogate right away so that it is also
+             * consumed when the character is dropped by one of the
+             * `continue' statements below.
+             */
+            i++;
+        }
+
+        /*
+         * Make sure the `have_space' flag is turned off if the character
+         * is not an appropriate one.
+         */
+        if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
+          have_space = 0;
+
+        /*
+         * If non-spacing characters should be ignored, do it here.
+         */
+        if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
+          continue;
+
+        /*
+         * Check if spaces and controls need to be compressed.
+         */
+        if (flags & UTBM_SPACE_COMPRESS) {
+            if (_utbm_isspace(c1, 1)) {
+                if (!have_space) {
+                    /*
+                     * Add a space and set the flag.
+                     */
+                    cp->uc = cp->lc = cp->tc = ' ';
+                    cp++;
+
+                    /*
+                     * Increase the real pattern length.
+                     */
+                    p->patlen++;
+                    sentinel = ' ';
+                    have_space = 1;
+                }
+                continue;
+            }
+
+            /*
+             * Ignore all control characters.
+             */
+            if (_utbm_iscntrl(c1))
+              continue;
+        }
+
+        /*
+         * Add the character.
+         */
+        if (flags & UTBM_CASEFOLD) {
+            cp->uc = _utbm_toupper(c1);
+            cp->lc = _utbm_tolower(c1);
+            cp->tc = _utbm_totitle(c1);
+        } else
+          cp->uc = cp->lc = cp->tc = c1;
+
+        /*
+         * Set the sentinel character.
+         */
+        sentinel = cp->uc;
+
+        /*
+         * Move to the next character.
+         */
+        cp++;
+
+        /*
+         * Increase the real pattern length appropriately.
+         */
+        p->patlen += (c1 >= 0x10000) ? 2 : 1;
+    }
+
+    /*
+     * Set the number of characters actually used.
+     */
+    p->pat_used = cp - p->pat;
+
+    /*
+     * Every character may have been dropped by the filters above; there is
+     * no skip table to build in that case.
+     */
+    if (p->pat_used == 0)
+      return;
+
+    /*
+     * Go through and construct the skip array and determine the actual length
+     * of the pattern in UCS2 terms.
+     */
+    slen = p->patlen - 1;
+    cp = p->pat;
+    for (i = k = 0; i < p->pat_used; i++, cp++) {
+        /*
+         * Locate the character in the skip array.
+         */
+        for (sp = p->skip, j = 0;
+             j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
+
+        /*
+         * If the character is not found, set the new skip element and
+         * increase the number of skip elements.
+         */
+        if (j == p->skip_used) {
+            sp->ch = cp;
+            p->skip_used++;
+        }
+
+        /*
+         * Set the updated skip value.  If the character is UTF-16 and is
+         * not the last one in the pattern, add one to its skip value.
+         */
+        sp->skip = slen - k;
+        if (cp->uc >= 0x10000 && k + 2 < slen)
+          sp->skip++;
+
+        /*
+         * Set the new extra skip for the sentinel character.
+         */
+        if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
+            cp->uc == sentinel)
+          p->md4 = slen - k;
+
+        /*
+         * Increase the actual index.
+         */
+        k += (cp->uc >= 0x10000) ? 2 : 1;
+    }
+}
+
+/*
+ * Search the `textlen' code units of `text' for the compiled pattern `pat'.
+ *
+ * Returns 1 and stores the code unit offsets of the match in `*match_start'
+ * and `*match_end' when the pattern is found.  Returns 0 when it is not
+ * found, when an argument is null or empty, or when the text is shorter
+ * than the pattern.
+ *
+ * NOTE(review): the first position examined is `text + patlen', one past
+ * the end of an occurrence that begins at offset 0, so such an occurrence
+ * appears never to be tested -- confirm against _utbm_match(), whose
+ * backward comparison stops before the first code unit of the text.
+ */
+int
+#ifdef __STDC__
+utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
+          unsigned long *match_start, unsigned long *match_end)
+#else
+utbm_exec(pat, text, textlen, match_start, match_end)
+utbm_pattern_t pat;
+ucs2_t *text;
+unsigned long textlen, *match_start, *match_end;
+#endif
+{
+    unsigned long k;
+    ucs2_t *start, *end;
+
+    if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
+        textlen < pat->patlen)
+      return 0;
+
+    start = text + pat->patlen;
+    end = text + textlen;
+
+    /*
+     * Adjust the start point if it points to a low surrogate.  When the
+     * text is exactly as long as the pattern, `start' equals `end' and must
+     * not be dereferenced.
+     */
+    if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
+        0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
+      start--;
+
+    while (start < end) {
+        /*
+         * Advance until the character at `start' has a skip of zero or the
+         * end of the text is reached, keeping `start' off the second half
+         * of a surrogate pair.
+         */
+        while ((k = _utbm_skip(pat, start, end))) {
+            start += k;
+            if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
+                0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
+              start--;
+        }
+
+        if (start < end &&
+            _utbm_match(pat, text, start, end, match_start, match_end))
+          return 1;
+
+        /*
+         * The candidate failed; move on by the extra skip computed by
+         * utbm_compile().
+         */
+        start += pat->md4;
+        if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
+            0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
+          start--;
+    }
+    return 0;
+}
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _h_utbm
+#define _h_utbm
+
+/*
+ * $Id: utbm.h,v 1.1 1999/09/21 15:45:18 mleisher Exp $
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef __
+#ifdef __STDC__
+#define __(x) x
+#else
+#define __(x) ()
+#endif
+
+/*************************************************************************
+ *
+ * Types.
+ *
+ *************************************************************************/
+
+/*
+ * Fundamental character types.
+ */
+typedef unsigned long ucs4_t;
+typedef unsigned short ucs2_t;
+
+/*
+ * An opaque type used for the search pattern.
+ *
+ * This is a pointer to struct _utbm_pattern_t, which this header names but
+ * does not define, so callers can only pass the handle around.  Values of
+ * this type are returned by utbm_create_pattern() and accepted by
+ * utbm_free_pattern(), utbm_compile() and utbm_exec().
+ */
+typedef struct _utbm_pattern_t *utbm_pattern_t;
+
+/*************************************************************************
+ *
+ * Flags.
+ *
+ * Each value is a distinct single bit, so the flags can be combined with
+ * bitwise OR.
+ *
+ * NOTE(review): presumably these are the values accepted by the `flags'
+ * argument of utbm_compile(); the descriptions below are inferred from the
+ * names only -- confirm against the implementation:
+ *   UTBM_CASEFOLD           - match without regard to case
+ *   UTBM_IGNORE_NONSPACING  - skip non-spacing characters when matching
+ *   UTBM_SPACE_COMPRESS     - treat a run of space characters as one
+ *
+ *************************************************************************/
+
+#define UTBM_CASEFOLD 0x01
+#define UTBM_IGNORE_NONSPACING 0x02
+#define UTBM_SPACE_COMPRESS 0x04
+
+/*************************************************************************
+ *
+ * API.
+ *
+ * utbm_create_pattern() takes no arguments and returns a pattern handle.
+ * utbm_free_pattern()   takes a pattern handle and returns nothing.
+ * utbm_compile()        takes a ucs2_t pattern buffer, its length, a flags
+ *                       word and the pattern handle to operate on; it
+ *                       returns nothing.
+ * utbm_exec()           takes a pattern handle, a ucs2_t text buffer and
+ *                       its length, plus two pointers to unsigned long
+ *                       (match_start, match_end), and returns an int.
+ *
+ * NOTE(review): this header does not state the units of `patlen' and
+ * `textlen' (presumably counts of ucs2_t elements), what the int returned
+ * by utbm_exec() means (presumably non-zero when a match is found, with
+ * its bounds stored through match_start/match_end), or whether
+ * utbm_free_pattern() accepts a null handle -- confirm against the
+ * implementation before relying on any of these.
+ *
+ *************************************************************************/
+
+extern utbm_pattern_t utbm_create_pattern __((void));
+
+extern void utbm_free_pattern __((utbm_pattern_t pattern));
+
+extern void utbm_compile __((ucs2_t *pat, unsigned long patlen,
+ unsigned long flags, utbm_pattern_t pattern));
+
+extern int utbm_exec __((utbm_pattern_t pat, ucs2_t *text,
+ unsigned long textlen, unsigned long *match_start,
+ unsigned long *match_end));
+
+/*************************************************************************
+ *
+ * Prototypes for the stub functions needed.
+ *
+ * Character-classification and case-mapping hooks.  Each takes a single
+ * ucs4_t character (_utbm_isspace() additionally takes an int `compress'
+ * argument).  The three predicates return int; the three case mappers
+ * return ucs4_t.
+ *
+ * NOTE(review): presumably the search code calls these and expects the
+ * embedding application to supply real implementations (utbmstub.c holds
+ * placeholder definitions) -- confirm against the implementation.
+ *
+ *************************************************************************/
+
+extern int _utbm_isspace __((ucs4_t c, int compress));
+
+extern int _utbm_iscntrl __((ucs4_t c));
+
+extern int _utbm_nonspacing __((ucs4_t c));
+
+extern ucs4_t _utbm_tolower __((ucs4_t c));
+
+extern ucs4_t _utbm_toupper __((ucs4_t c));
+
+extern ucs4_t _utbm_totitle __((ucs4_t c));
+
+#undef __
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_utbm */
--- /dev/null
+/*
+ * Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef lint /* keep the RCS id string out of builds that define `lint' */
+static char rcsid[] = "$Id: utbmstub.c,v 1.1 1999/09/21 15:45:18 mleisher Exp $";
+#endif
+
+#include "utbm.h"
+
+/*
+ * This should be redefined to use the `isspace' function available in the
+ * Unicode support on the platform where this is being used.
+ *
+ * As shipped the macro ignores its argument and expands to the constant 0,
+ * so until it is redefined the platform lookup reports no character at all
+ * as a space.
+ */
+#define _platform_isspace(x) 0
+
+/*
+ * Return non-zero for any character that should be considered the equivalent
+ * of a space character. Return zero otherwise.
+ *
+ * Parameters:
+ *   c        - the character to test.
+ *   compress - selects which test is applied (see below).
+ *
+ * When `compress' is non-zero the result is exactly 1 if `c' is U+0009
+ * (tab), U+000A (line feed), U+000D (carriage return), U+2028 (line
+ * separator), U+2029 (paragraph separator), or if _platform_isspace(c) is
+ * non-zero; otherwise it is 0.
+ *
+ * When `compress' is zero the value of _platform_isspace(c) is returned
+ * as is, so the five code points listed above are not special-cased.
+ *
+ * NOTE(review): with the default _platform_isspace() (constant 0) U+0020
+ * is never reported as a space in either mode; this looks intentional for
+ * a stub but should be confirmed when the macro is redefined.
+ */
+int
+#ifdef __STDC__
+_utbm_isspace(ucs4_t c, int compress)
+#else
+_utbm_isspace(c, compress)
+ucs4_t c;
+int compress;
+#endif
+{
+ if (compress)
+ return (c == 0x09 || c == 0x0a || c == 0x0d ||
+ c == 0x2028 || c == 0x2029 || _platform_isspace(c)) ? 1 : 0;
+
+ return _platform_isspace(c);
+
+}
+
+/*
+ * Return non-zero if the character is a control character, or zero otherwise.
+ *
+ * Stub implementation: `c' is not examined and 0 is returned for every
+ * input, i.e. no character is ever reported as a control character.
+ * Replace the body with a real lookup to get the behaviour described above.
+ */
+int
+#ifdef __STDC__
+_utbm_iscntrl(ucs4_t c)
+#else
+_utbm_iscntrl(c)
+ucs4_t c;
+#endif
+{
+ return 0;
+}
+
+/*
+ * Return non-zero if the character is a non-spacing character, or zero
+ * otherwise.
+ *
+ * Stub implementation: `c' is not examined and 0 is returned for every
+ * input, i.e. no character is ever reported as non-spacing.  Replace the
+ * body with a real lookup to get the behaviour described above.
+ */
+int
+#ifdef __STDC__
+_utbm_nonspacing(ucs4_t c)
+#else
+_utbm_nonspacing(c)
+ucs4_t c;
+#endif
+{
+ return 0;
+}
+
+/*
+ * Convert a character to lower case.
+ *
+ * Stub implementation: no mapping is performed and `c' is returned
+ * unchanged for every input.  Replace the body with a real case mapping
+ * to get the behaviour described above.
+ */
+ucs4_t
+#ifdef __STDC__
+_utbm_tolower(ucs4_t c)
+#else
+_utbm_tolower(c)
+ucs4_t c;
+#endif
+{
+ return c;
+}
+
+/*
+ * Convert a character to upper case.
+ *
+ * Stub implementation: no mapping is performed and `c' is returned
+ * unchanged for every input.  Replace the body with a real case mapping
+ * to get the behaviour described above.
+ */
+ucs4_t
+#ifdef __STDC__
+_utbm_toupper(ucs4_t c)
+#else
+_utbm_toupper(c)
+ucs4_t c;
+#endif
+{
+ return c;
+}
+
+/*
+ * Convert a character to title case.
+ *
+ * Stub implementation: no mapping is performed and `c' is returned
+ * unchanged for every input.  Replace the body with a real case mapping
+ * to get the behaviour described above.
+ */
+ucs4_t
+#ifdef __STDC__
+_utbm_totitle(ucs4_t c)
+#else
+_utbm_totitle(c)
+ucs4_t c;
+#endif
+{
+ return c;
+}