git.sur5r.net Git - openldap/blob - libraries/liblunicode/ure/README

   1 #
   2 # $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
   3 #
   4 # Copyright 1997, 1998, 1999 Computing Research Labs,
   5 # New Mexico State University
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a
   8 # copy of this software and associated documentation files (the "Software"),
   9 # to deal in the Software without restriction, including without limitation
  10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11 # and/or sell copies of the Software, and to permit persons to whom the
  12 # Software is furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20 # THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  21 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  22 # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  23 # THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24 #
  25
  26
  27                        Unicode and Regular Expressions
  28                                  Version 0.5
  29
  30 This is a simple regular expression package for matching against Unicode text
  31 in UCS2 form.  The implementation of this URE package is a variation on the
  32 RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu).  Mark
  33 Hopkins' algorithm had the virtue of being very simple, so it was used as a
  34 model.
  35
  36 ---------------------------------------------------------------------------
  37
  38 Assumptions:
  39
  40   o  Regular expression and text already normalized.
  41
  42   o  Conversion to lower case assumes a 1-1 mapping.
  43
  44 Definitions:
  45
  46   Separator - any one of U+2028, U+2029, '\n', '\r'.
  47
  48 Operators:
  49   .   - match any character.
  50   *   - match zero or more of the last subexpression.
  51   +   - match one or more of the last subexpression.
  52   ?   - match zero or one of the last subexpression.
  53   ()  - subexpression grouping.
  54
  55   Notes:
  56
  57     o  The "." operator normally does not match separators, but a flag is
  58        available for the ure_exec() function that will allow this operator to
  59        match a separator.
  60
  61 Literals and Constants:
  62
  63   c       - literal UCS2 character.
  64   \x....  - hexadecimal number of up to 4 digits.
  65   \X....  - hexadecimal number of up to 4 digits.
  66   \u....  - hexadecimal number of up to 4 digits.
  67   \U....  - hexadecimal number of up to 4 digits.
  68
  69 Character classes:
  70
  71   [...]           - Character class.
  72   [^...]          - Negated character class.
  73   \pN1,N2,...,Nn  - Character properties class.
  74   \PN1,N2,...,Nn  - Negated character properties class.
  75
  76   POSIX character classes recognized:
  77
  78     :alnum:
  79     :alpha:
  80     :cntrl:
  81     :digit:
  82     :graph:
  83     :lower:
  84     :print:
  85     :punct:
  86     :space:
  87     :upper:
  88     :xdigit:
  89
  90   Notes:
  91
  92     o  Character property classes are \p or \P followed by a comma-separated
  93        list of integers between 1 and 32.  These integers are references to
  94        the following character properties:
  95
  96         N       Character Property
  97         --------------------------
  98         1       _URE_NONSPACING
  99         2       _URE_COMBINING
 100         3       _URE_NUMDIGIT
 101         4       _URE_NUMOTHER
 102         5       _URE_SPACESEP
 103         6       _URE_LINESEP
 104         7       _URE_PARASEP
 105         8       _URE_CNTRL
 106         9       _URE_PUA
 107         10      _URE_UPPER
 108         11      _URE_LOWER
 109         12      _URE_TITLE
 110         13      _URE_MODIFIER
 111         14      _URE_OTHERLETTER
 112         15      _URE_DASHPUNCT
 113         16      _URE_OPENPUNCT
 114         17      _URE_CLOSEPUNCT
 115         18      _URE_OTHERPUNCT
 116         19      _URE_MATHSYM
 117         20      _URE_CURRENCYSYM
 118         21      _URE_OTHERSYM
 119         22      _URE_LTR
 120         23      _URE_RTL
 121         24      _URE_EURONUM
 122         25      _URE_EURONUMSEP
 123         26      _URE_EURONUMTERM
 124         27      _URE_ARABNUM
 125         28      _URE_COMMONSEP
 126         29      _URE_BLOCKSEP
 127         30      _URE_SEGMENTSEP
 128         31      _URE_WHITESPACE
 129         32      _URE_OTHERNEUT
 130
 131     o  Character classes can contain literals, constants, and character
 132        property classes. Example:
 133
 134        [abc\U10A\p1,3,4]
 135
 136 ---------------------------------------------------------------------------
 137
 138 Before using URE
 139 ----------------
 140 Before URE is used, two functions need to be created: one to check if a
 141 character matches a set of URE character properties, and one to convert a
 142 character to lower case.
 143
 144 Stubs for these functions are located in the urestubs.c file.
 145
 146 Using URE
 147 ---------
 148
 149 Sample pseudo-code fragment.
 150
 151   ure_buffer_t rebuf;
 152   ure_dfa_t dfa;
 153   ucs2_t *re, *text;
 154   unsigned long relen, textlen;
 155   unsigned long match_start, match_end;
 156
 157   /*
 158    * Allocate the dynamic storage needed to compile regular expressions.
 159    */
 160   rebuf = ure_buffer_create();
 161
 162   for each regular expression in a list {
 163       re = next regular expression;
 164       relen = length(re);
 165
 166       /*
 167        * Compile the regular expression with the case insensitive flag
 168        * turned on.
 169        */
 170       dfa = ure_compile(re, relen, 1, rebuf);
 171
 172       /*
 173        * Look for the first match in some text.  The matching will be done
 174        * in a case insensitive manner because the expression was compiled
 175        * with the case insensitive flag on.
 176        */
 177       if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
 178         printf("MATCH: %ld %ld\n", match_start, match_end);
 179
 180       /*
 181        * Look for the first match in some text, ignoring non-spacing
 182        * characters.
 183        */
 184       if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
 185                    &match_start, &match_end))
 186         printf("MATCH: %ld %ld\n", match_start, match_end);
 187
 188       /*
 189        * Free the DFA.
 190        */
 191       ure_free_dfa(dfa);
 192   }
 193
 194   /*
 195    * Free the dynamic storage used for compiling the expressions.
 196    */
 197   ure_free_buffer(rebuf);
 198
 199 ---------------------------------------------------------------------------
 200
 201 Mark Leisher <mleisher@crl.nmsu.edu>
 202 29 March 1997
 203
 204 ===========================================================================
 205
 206 CHANGES
 207 -------
 208
 209 Version: 0.5
 210 Date   : 21 September 1999
 211 ==========================
 212   1. Added copyright stuff and put in CVS.