3 * Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
7 * Copyright 1997, 1998, 1999 Computing Research Labs,
8 * New Mexico State University
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
24 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
25 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
26 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 static char rcsid[] = "$Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $";
35 #include <ac/unistd.h>
36 #include <ac/string.h>
41 * Flags used internally in the DFA.
43 #define _URE_DFA_CASEFOLD 0x01
44 #define _URE_DFA_BLANKLINE 0x02
46 static unsigned long cclass_flags[] = {
83 * Symbol types for the DFA.
85 #define _URE_ANY_CHAR 1
88 #define _URE_NCCLASS 4
89 #define _URE_BOL_ANCHOR 5
90 #define _URE_EOL_ANCHOR 6
93 * Op codes for converting the NFA to a DFA.
95 #define _URE_SYMBOL 10
104 #define _URE_NOOP 0xffff
106 #define _URE_REGSTART 0x8000
107 #define _URE_REGEND 0x4000
110 * Structure used to handle a compacted range of characters.
118 _ure_range_t *ranges;
129 * This is a general element structure used for expressions and stack
141 * This is a structure used to track a list or a stack of states.
150 * Structure to track the list of unique states for a symbol
159 _ure_stlist_t states;
163 * Structure to hold a single state.
176 * Structure used for keeping lists of states.
179 _ure_state_t *states;
185 * Structure to track pairs of DFA states when equivalent states are
194 * Structure used for constructing the NFA and reducing to a minimal DFA.
196 typedef struct _ure_buffer_t {
204 * Table of unique symbols encountered.
206 _ure_symtab_t *symtab;
211 * Tracks the unique expressions generated for the NFA and when the NFA is
219 * The reduced table of unique groups of NFA states.
221 _ure_statetable_t states;
224 * Tracks states when equivalent states are merged.
242 typedef struct _ure_dfa_t {
248 _ure_dstate_t *states;
255 /*************************************************************************
259 *************************************************************************/
262 _ure_memmove(char *dest, char *src, unsigned long bytes)
271 * Do a memmove using Ye Olde Duff's Device for efficiency.
280 case 7: *--dest = *--src;
281 case 6: *--dest = *--src;
282 case 5: *--dest = *--src;
283 case 4: *--dest = *--src;
284 case 3: *--dest = *--src;
285 case 2: *--dest = *--src;
286 case 1: *--dest = *--src;
289 } else if (src > dest) {
293 case 7: *dest++ = *src++;
294 case 6: *dest++ = *src++;
295 case 5: *dest++ = *src++;
296 case 4: *dest++ = *src++;
297 case 3: *dest++ = *src++;
298 case 2: *dest++ = *src++;
299 case 1: *dest++ = *src++;
306 _ure_push(ucs2_t v, _ure_buffer_t *b)
314 * If the `reducing' parameter is non-zero, check to see if the value
315 * passed is already on the stack.
317 if (b->reducing != 0 && b->expr[v].onstack != 0)
321 if (s->slist_used == s->slist_size) {
322 if (s->slist_size == 0)
323 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
325 s->slist = (ucs2_t *) realloc((char *) s->slist,
326 sizeof(ucs2_t) * (s->slist_size + 8));
329 s->slist[s->slist_used++] = v;
332 * If the `reducing' parameter is non-zero, flag the element as being on
335 if (b->reducing != 0)
336 b->expr[v].onstack = 1;
340 _ure_peek(_ure_buffer_t *b)
342 if (b == 0 || b->stack.slist_used == 0)
345 return b->stack.slist[b->stack.slist_used - 1];
349 _ure_pop(_ure_buffer_t *b)
353 if (b == 0 || b->stack.slist_used == 0)
356 v = b->stack.slist[--b->stack.slist_used];
358 b->expr[v].onstack = 0;
363 /*************************************************************************
365 * Start symbol parse functions.
367 *************************************************************************/
370 * Parse a comma-separated list of integers that represent character
371 * properties. Combine them into a mask that is returned in the `mask'
372 * variable, and return the number of characters consumed.
375 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
384 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
387 * Encountered a comma, so select the next character property flag
388 * and reset the number.
390 m |= cclass_flags[n];
392 } else if (*sp >= '0' && *sp <= '9')
394 * Encountered a digit, so start or continue building the cardinal
395 * that represents the character property flag.
397 n = (n * 10) + (*sp - '0');
400 * Encountered something that is not part of the property list.
401 * Indicate that we are done.
406 * If a property number greater than 32 occurs, then there is a
407 * problem. Most likely a missing comma separator.
410 b->error = _URE_INVALID_PROPERTY;
414 m |= cclass_flags[n];
417 * Set the mask that represents the group of character properties.
422 * Return the number of characters consumed.
428 * Collect a hex number with 1 to 4 digits and return the number
429 * of characters used.
432 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
441 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
442 if (*sp >= '0' && *sp <= '9')
443 nn = (nn << 4) + (*sp - '0');
444 else if (*sp >= 'A' && *sp <= 'F')
445 nn = (nn << 4) + ((*sp - 'A') + 10);
446 else if (*sp >= 'a' && *sp <= 'f')
447 nn = (nn << 4) + ((*sp - 'a') + 10);
450 * Encountered something that is not a hex digit.
456 * Assign the character code collected and return the number of
465 * Insert a range into a character class, removing duplicates and ordering
466 * them in increasing range-start order.
469 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
476 * If the `casefold' flag is set, then make sure both endpoints of the
477 * range are converted to lower case.
479 if (b->flags & _URE_DFA_CASEFOLD) {
480 r->min_code = _ure_tolower(r->min_code);
481 r->max_code = _ure_tolower(r->max_code);
485 * Swap the range endpoints if they are not in increasing order.
487 if (r->min_code > r->max_code) {
489 r->min_code = r->max_code;
493 for (i = 0, rp = ccl->ranges;
494 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
497 * Check for a duplicate.
499 if (i < ccl->ranges_used &&
500 r->min_code == rp->min_code && r->max_code == rp->max_code)
503 if (ccl->ranges_used == ccl->ranges_size) {
504 if (ccl->ranges_size == 0)
505 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
507 ccl->ranges = (_ure_range_t *)
508 realloc((char *) ccl->ranges,
509 sizeof(_ure_range_t) * (ccl->ranges_size + 8));
510 ccl->ranges_size += 8;
513 rp = ccl->ranges + ccl->ranges_used;
515 if (i < ccl->ranges_used)
516 _ure_memmove((char *) (rp + 1), (char *) rp,
517 sizeof(_ure_range_t) * (ccl->ranges_used - i));
520 rp->min_code = r->min_code;
521 rp->max_code = r->max_code;
524 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
525 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
526 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
527 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
529 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
530 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
531 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
532 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
534 typedef void (*_ure_cclsetup_t)(
544 _ure_cclsetup_t func;
549 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
555 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
562 * Add the additional characters needed for handling isspace().
564 range.min_code = range.max_code = '\t';
565 _ure_add_range(&sym->sym.ccl, &range, b);
566 range.min_code = range.max_code = '\r';
567 _ure_add_range(&sym->sym.ccl, &range, b);
568 range.min_code = range.max_code = '\n';
569 _ure_add_range(&sym->sym.ccl, &range, b);
570 range.min_code = range.max_code = '\f';
571 _ure_add_range(&sym->sym.ccl, &range, b);
572 range.min_code = range.max_code = 0xfeff;
573 _ure_add_range(&sym->sym.ccl, &range, b);
577 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
582 * Add the additional characters needed for handling isxdigit().
584 range.min_code = '0';
585 range.max_code = '9';
586 _ure_add_range(&sym->sym.ccl, &range, b);
587 range.min_code = 'A';
588 range.max_code = 'F';
589 _ure_add_range(&sym->sym.ccl, &range, b);
590 range.min_code = 'a';
591 range.max_code = 'f';
592 _ure_add_range(&sym->sym.ccl, &range, b);
595 static _ure_trie_t cclass_trie[] = {
596 {0x003a, 1, 1, 0, 0},
597 {0x0061, 9, 10, 0, 0},
598 {0x0063, 8, 19, 0, 0},
599 {0x0064, 7, 24, 0, 0},
600 {0x0067, 6, 29, 0, 0},
601 {0x006c, 5, 34, 0, 0},
602 {0x0070, 4, 39, 0, 0},
603 {0x0073, 3, 49, 0, 0},
604 {0x0075, 2, 54, 0, 0},
605 {0x0078, 1, 59, 0, 0},
606 {0x006c, 1, 11, 0, 0},
607 {0x006e, 2, 13, 0, 0},
608 {0x0070, 1, 16, 0, 0},
609 {0x0075, 1, 14, 0, 0},
610 {0x006d, 1, 15, 0, 0},
611 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
612 {0x0068, 1, 17, 0, 0},
613 {0x0061, 1, 18, 0, 0},
614 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
615 {0x006e, 1, 20, 0, 0},
616 {0x0074, 1, 21, 0, 0},
617 {0x0072, 1, 22, 0, 0},
618 {0x006c, 1, 23, 0, 0},
619 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
620 {0x0069, 1, 25, 0, 0},
621 {0x0067, 1, 26, 0, 0},
622 {0x0069, 1, 27, 0, 0},
623 {0x0074, 1, 28, 0, 0},
624 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
625 {0x0072, 1, 30, 0, 0},
626 {0x0061, 1, 31, 0, 0},
627 {0x0070, 1, 32, 0, 0},
628 {0x0068, 1, 33, 0, 0},
629 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
630 {0x006f, 1, 35, 0, 0},
631 {0x0077, 1, 36, 0, 0},
632 {0x0065, 1, 37, 0, 0},
633 {0x0072, 1, 38, 0, 0},
634 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
635 {0x0072, 2, 41, 0, 0},
636 {0x0075, 1, 45, 0, 0},
637 {0x0069, 1, 42, 0, 0},
638 {0x006e, 1, 43, 0, 0},
639 {0x0074, 1, 44, 0, 0},
640 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
641 {0x006e, 1, 46, 0, 0},
642 {0x0063, 1, 47, 0, 0},
643 {0x0074, 1, 48, 0, 0},
644 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
645 {0x0070, 1, 50, 0, 0},
646 {0x0061, 1, 51, 0, 0},
647 {0x0063, 1, 52, 0, 0},
648 {0x0065, 1, 53, 0, 0},
649 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
650 {0x0070, 1, 55, 0, 0},
651 {0x0070, 1, 56, 0, 0},
652 {0x0065, 1, 57, 0, 0},
653 {0x0072, 1, 58, 0, 0},
654 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
655 {0x0064, 1, 60, 0, 0},
656 {0x0069, 1, 61, 0, 0},
657 {0x0067, 1, 62, 0, 0},
658 {0x0069, 1, 63, 0, 0},
659 {0x0074, 1, 64, 0, 0},
660 {0x003a, 1, 65, _ure_xdigit_setup, 0},
664 * Probe for one of the POSIX colon delimited character classes in the static
668 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
677 * If the number of characters left is less than 7, then this cannot be
678 * interpreted as one of the colon delimited classes.
686 for (i = 0; sp < ep && i < 8; i++, sp++) {
689 for (; n > 0 && tp->key != *sp; tp++, n--) ;
694 if (*sp == ':' && (i == 6 || i == 7)) {
699 tp = cclass_trie + tp->next;
704 (*tp->func)(sym, tp->mask, b);
710 * Construct a list of ranges and return the number of characters consumed.
713 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
727 symp->type = _URE_NCCLASS;
730 symp->type = _URE_CCLASS;
732 for (last = 0, range_end = 0;
733 b->error == _URE_OK && sp < ep && *sp != ']'; ) {
738 * The EOS was encountered when expecting the reverse solidus
739 * to be followed by the character it is escaping. Set an
740 * error code and return the number of characters consumed up
743 b->error = _URE_UNEXPECTED_EOS;
772 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
774 * Invert the bit mask of the properties if this is a negated
775 * character class or if 'P' is used to specify a list of
776 * character properties that should *not* match in a
780 symp->props = ~symp->props;
788 ((*sp >= '0' && *sp <= '9') ||
789 (*sp >= 'A' && *sp <= 'F') ||
790 (*sp >= 'a' && *sp <= 'f')))
791 sp += _ure_hex(sp, ep - sp, &c);
793 } else if (c == ':') {
795 * Probe for a POSIX colon delimited character class.
798 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
806 cclp = &symp->sym.ccl;
809 * Check to see if the current character is a low surrogate that needs
810 * to be combined with a preceding high surrogate.
813 if (c >= 0xdc00 && c <= 0xdfff)
815 * Construct the UTF16 character code.
817 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
820 * Add the isolated high surrogate to the range.
823 range.max_code = last & 0xffff;
825 range.min_code = range.max_code = last & 0xffff;
827 _ure_add_range(cclp, &range, b);
833 * Clear the last character code.
838 * This slightly awkward code handles the different cases needed to
841 if (c >= 0xd800 && c <= 0xdbff) {
843 * If the high surrogate is followed by a range indicator, simply
844 * add it as the range start. Otherwise, save it in case the next
845 * character is a low surrogate.
853 } else if (range_end == 1) {
855 _ure_add_range(cclp, &range, b);
858 range.min_code = range.max_code = c;
863 _ure_add_range(cclp, &range, b);
867 if (sp < ep && *sp == ']')
871 * The parse was not terminated by the character class close symbol
872 * (']'), so set an error code.
874 b->error = _URE_CCLASS_OPEN;
880 * Probe for a low surrogate hex code.
883 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
888 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
889 if (*sp >= '0' && *sp <= '9')
890 code = (code << 4) + (*sp - '0');
891 else if (*sp >= 'A' && *sp <= 'F')
892 code = (code << 4) + ((*sp - 'A') + 10);
893 else if (*sp >= 'a' && *sp <= 'f')
894 code = (code << 4) + ((*sp - 'a') + 10);
900 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
904 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
913 if ((c = *sp++) == '\\') {
917 * The EOS was encountered when expecting the reverse solidus to
918 * be followed by the character it is escaping. Set an error code
919 * and return the number of characters consumed up to this point.
921 b->error = _URE_UNEXPECTED_EOS;
929 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
930 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
933 symp->type = _URE_CHAR;
934 symp->sym.chr = 0x07;
937 symp->type = _URE_CHAR;
938 symp->sym.chr = 0x08;
941 symp->type = _URE_CHAR;
942 symp->sym.chr = 0x0c;
945 symp->type = _URE_CHAR;
946 symp->sym.chr = 0x0a;
949 symp->type = _URE_CHAR;
950 symp->sym.chr = 0x0d;
953 symp->type = _URE_CHAR;
954 symp->sym.chr = 0x09;
957 symp->type = _URE_CHAR;
958 symp->sym.chr = 0x0b;
965 * Collect between 1 and 4 digits representing a UCS2 code. Fall
966 * through to the next case.
969 ((*sp >= '0' && *sp <= '9') ||
970 (*sp >= 'A' && *sp <= 'F') ||
971 (*sp >= 'a' && *sp <= 'f')))
972 sp += _ure_hex(sp, ep - sp, &c);
976 * Simply add an escaped character here.
978 symp->type = _URE_CHAR;
981 } else if (c == '^' || c == '$')
983 * Handle the BOL and EOL anchors. This actually consists simply of
984 * setting a flag that indicates that the user supplied anchor match
985 * function should be called. This needs to be done instead of simply
986 * matching line/paragraph separators because beginning-of-text and
987 * end-of-text tests are needed as well.
989 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
992 * Construct a character class.
994 sp += _ure_cclass(sp, ep - sp, symp, b);
996 symp->type = _URE_ANY_CHAR;
998 symp->type = _URE_CHAR;
1003 * If the symbol type happens to be a character and is a high surrogate,
1004 * then probe forward to see if it is followed by a low surrogate that
1005 * needs to be added.
1007 if (sp < ep && symp->type == _URE_CHAR &&
1008 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1010 if (0xdc00 <= *sp && *sp <= 0xdfff) {
1011 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1014 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1015 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1016 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1017 if (0xdc00 <= c && c <= 0xdfff) {
1019 * Take into account the \[xu] in front of the hex code.
1022 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1029 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1030 * the `casefold' flag is set.
1032 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1033 symp->sym.chr = _ure_tolower(symp->sym.chr);
1036 * If the symbol constructed is anything other than one of the anchors,
1037 * make sure the _URE_DFA_BLANKLINE flag is removed.
1039 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1040 b->flags &= ~_URE_DFA_BLANKLINE;
1043 * Return the number of characters consumed.
1049 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1051 if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1054 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1055 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1057 if (a->sym.ccl.ranges_used > 0 &&
1058 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1059 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1061 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1067 * Construct a symbol, but only keep unique symbols.
1070 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1074 _ure_symtab_t *sp, symbol;
1077 * Build the next symbol so we can test to see if it is already in the
1080 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1081 *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1084 * Check to see if the symbol exists.
1086 for (i = 0, sp = b->symtab;
1087 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1089 if (i < b->symtab_used) {
1091 * Free up any ranges used for the symbol.
1093 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1094 symbol.sym.ccl.ranges_size > 0)
1095 free((char *) symbol.sym.ccl.ranges);
1097 return b->symtab[i].id;
1101 * Need to add the new symbol.
1103 if (b->symtab_used == b->symtab_size) {
1104 if (b->symtab_size == 0)
1105 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1107 b->symtab = (_ure_symtab_t *)
1108 realloc((char *) b->symtab,
1109 sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1110 sp = b->symtab + b->symtab_size;
1111 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1112 b->symtab_size += 8;
1115 symbol.id = b->symtab_used++;
1116 (void) memcpy((char *) &b->symtab[symbol.id], (char *) &symbol,
1117 sizeof(_ure_symtab_t));
1122 /*************************************************************************
1124 * End symbol parse functions.
1126 *************************************************************************/
1129 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1137 * Determine if the expression already exists or not.
1139 for (i = 0; i < b->expr_used; i++) {
1140 if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1141 b->expr[i].rhs == rhs)
1144 if (i < b->expr_used)
1148 * Need to add a new expression.
1150 if (b->expr_used == b->expr_size) {
1151 if (b->expr_size == 0)
1152 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1154 b->expr = (_ure_elt_t *)
1155 realloc((char *) b->expr,
1156 sizeof(_ure_elt_t) * (b->expr_size + 8));
1160 b->expr[b->expr_used].onstack = 0;
1161 b->expr[b->expr_used].type = type;
1162 b->expr[b->expr_used].lhs = lhs;
1163 b->expr[b->expr_used].rhs = rhs;
1165 return b->expr_used++;
1168 static unsigned char spmap[] = {
1169 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1170 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1171 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1174 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1175 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1178 * Convert the regular expression into an NFA in a form that will be easy to
1179 * reduce to a DFA. The starting state for the reduction will be returned.
1182 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1184 ucs2_t c, state, top, sym, *sp, *ep;
1191 while (b->error == _URE_OK && sp < ep) {
1195 _ure_push(_URE_PAREN, b);
1199 * Check for the case of too many close parentheses.
1201 if (_ure_peek(b) == _URE_NOOP) {
1202 b->error = _URE_UNBALANCED_GROUP;
1206 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1208 * Make an expression with the AND or OR operator and its right
1211 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1214 * Remove the _URE_PAREN off the stack.
1219 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1222 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1225 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1228 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1230 * Make an expression with the AND or OR operator and its right
1233 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1235 _ure_push(state, b);
1236 _ure_push(_URE_OR, b);
1240 sym = _ure_make_symbol(sp, ep - sp, &used, b);
1242 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1246 if (c != '(' && c != '|' && sp < ep &&
1247 (!_ure_isspecial(*sp) || *sp == '(')) {
1248 _ure_push(state, b);
1249 _ure_push(_URE_AND, b);
1252 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1254 * Make an expression with the AND or OR operator and its right
1257 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1259 if (b->stack.slist_used > 0)
1260 b->error = _URE_UNBALANCED_GROUP;
1262 return (b->error == _URE_OK) ? state : _URE_NOOP;
1266 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1272 * Locate the symbol in the symbol table so the state can be added.
1273 * If the symbol doesn't exist, then a real problem exists.
1275 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1279 * Now find out if the state exists in the symbol's state list.
1281 for (i = 0, stp = sp->states.slist;
1282 i < sp->states.slist_used && state > *stp; i++, stp++) ;
1284 if (i == sp->states.slist_used || state < *stp) {
1286 * Need to add the state in order.
1288 if (sp->states.slist_used == sp->states.slist_size) {
1289 if (sp->states.slist_size == 0)
1290 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1292 sp->states.slist = (ucs2_t *)
1293 realloc((char *) sp->states.slist,
1294 sizeof(ucs2_t) * (sp->states.slist_size + 8));
1295 sp->states.slist_size += 8;
1297 if (i < sp->states.slist_used)
1298 (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1299 (char *) (sp->states.slist + i),
1300 sizeof(ucs2_t) * (sp->states.slist_used - i));
1301 sp->states.slist[i] = state;
1302 sp->states.slist_used++;
1307 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1312 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1313 if (sp->st.slist_used == nstates &&
1314 memcmp((char *) states, (char *) sp->st.slist,
1315 sizeof(ucs2_t) * nstates) == 0)
1319 if (i == b->states.states_used) {
1321 * Need to add a new DFA state (set of NFA states).
1323 if (b->states.states_used == b->states.states_size) {
1324 if (b->states.states_size == 0)
1325 b->states.states = (_ure_state_t *)
1326 malloc(sizeof(_ure_state_t) << 3);
1328 b->states.states = (_ure_state_t *)
1329 realloc((char *) b->states.states,
1330 sizeof(_ure_state_t) * (b->states.states_size + 8));
1331 sp = b->states.states + b->states.states_size;
1332 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1333 b->states.states_size += 8;
1336 sp = b->states.states + b->states.states_used++;
1339 if (sp->st.slist_used + nstates > sp->st.slist_size) {
1340 if (sp->st.slist_size == 0)
1341 sp->st.slist = (ucs2_t *)
1342 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1344 sp->st.slist = (ucs2_t *)
1345 realloc((char *) sp->st.slist,
1346 sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1347 sp->st.slist_size = sp->st.slist_used + nstates;
1349 sp->st.slist_used = nstates;
1350 (void) memcpy((char *) sp->st.slist, (char *) states,
1351 sizeof(ucs2_t) * nstates);
1355 * Return the ID of the DFA state representing a group of NFA states.
1361 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1363 ucs2_t i, j, state, eval, syms, rhs;
1364 ucs2_t s1, s2, ns1, ns2;
1371 * Add the starting state for the reduction.
1373 _ure_add_state(1, &start, b);
1376 * Process each set of NFA states that get created.
1378 for (i = 0; i < b->states.states_used; i++) {
1379 sp = b->states.states + i;
1382 * Push the current states on the stack.
1384 for (j = 0; j < sp->st.slist_used; j++)
1385 _ure_push(sp->st.slist[j], b);
1388 * Reduce the NFA states.
1390 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1391 state = b->stack.slist[j];
1395 * This inner loop is the iterative equivalent of recursively
1396 * reducing subexpressions generated as a result of a reduction.
1399 switch (b->expr[state].type) {
1401 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1402 _ure_add_symstate(b->expr[state].lhs, ns1, b);
1411 s1 = b->expr[state].lhs;
1412 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1413 state = _ure_make_expr(_URE_OR, ns1, s1, b);
1416 s1 = b->expr[state].lhs;
1417 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1418 state = _ure_make_expr(_URE_AND, s1, ns1, b);
1421 s1 = b->expr[state].lhs;
1422 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1423 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1424 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1427 s1 = b->expr[state].lhs;
1428 s2 = b->expr[state].rhs;
1434 s1 = b->expr[state].lhs;
1435 s2 = b->expr[state].rhs;
1436 switch (b->expr[s1].type) {
1438 _ure_add_symstate(b->expr[s1].lhs, s2, b);
1446 ns1 = b->expr[s1].lhs;
1447 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1448 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1451 ns1 = b->expr[s1].lhs;
1452 ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1453 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1456 ns1 = b->expr[s1].lhs;
1457 ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1458 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1461 ns1 = b->expr[s1].lhs;
1462 ns2 = b->expr[s1].rhs;
1463 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1464 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1465 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1468 ns1 = b->expr[s1].lhs;
1469 ns2 = b->expr[s1].rhs;
1470 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1471 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1479 * Clear the state stack.
1481 while (_ure_pop(b) != _URE_NOOP) ;
1484 * Reset the state pointer because the reduction may have moved it
1485 * during a reallocation.
1487 sp = b->states.states + i;
1490 * Generate the DFA states for the symbols collected during the
1491 * current reduction.
1493 if (sp->trans_used + syms > sp->trans_size) {
1494 if (sp->trans_size == 0)
1495 sp->trans = (_ure_elt_t *)
1496 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1498 sp->trans = (_ure_elt_t *)
1499 realloc((char *) sp->trans,
1500 sizeof(_ure_elt_t) * (sp->trans_used + syms));
1501 sp->trans_size = sp->trans_used + syms;
1505 * Go through the symbol table and generate the DFA state transitions
1506 * for each symbol that has collected NFA states.
1508 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1509 sp = b->states.states + i;
1511 if (smp->states.slist_used > 0) {
1512 sp->trans[syms].lhs = smp->id;
1513 rhs = _ure_add_state(smp->states.slist_used,
1514 smp->states.slist, b);
1516 * Reset the state pointer in case the reallocation moves it
1519 sp = b->states.states + i;
1520 sp->trans[syms].rhs = rhs;
1522 smp->states.slist_used = 0;
1528 * Set the number of transitions actually used.
1530 sp->trans_used = syms;
1536 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1540 l = b->states.states[l].id;
1541 r = b->states.states[r].id;
1553 * Check to see if the equivalence pair already exists.
1555 for (tmp = 0; tmp < b->equiv_used &&
1556 (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1559 if (tmp < b->equiv_used)
1562 if (b->equiv_used == b->equiv_size) {
1563 if (b->equiv_size == 0)
1564 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1566 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1567 sizeof(_ure_equiv_t) *
1568 (b->equiv_size + 8));
1571 b->equiv[b->equiv_used].l = l;
1572 b->equiv[b->equiv_used].r = r;
1577 * Merge the DFA states that are equivalent.
1580 _ure_merge_equiv(_ure_buffer_t *b)
1582 ucs2_t i, j, k, eq, done;
1583 _ure_state_t *sp1, *sp2, *ls, *rs;
1585 for (i = 0; i < b->states.states_used; i++) {
1586 sp1 = b->states.states + i;
1589 for (j = 0; j < i; j++) {
1590 sp2 = b->states.states + j;
1594 _ure_add_equiv(i, j, b);
1595 for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1596 ls = b->states.states + b->equiv[eq].l;
1597 rs = b->states.states + b->equiv[eq].r;
1598 if (ls->accepting != rs->accepting ||
1599 ls->trans_used != rs->trans_used) {
1603 for (k = 0; k < ls->trans_used &&
1604 ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1605 if (k < ls->trans_used) {
1610 for (k = 0; k < ls->trans_used; k++)
1611 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1616 for (eq = 0; j < i && eq < b->equiv_used; eq++)
1617 b->states.states[b->equiv[eq].r].id =
1618 b->states.states[b->equiv[eq].l].id;
1622 * Renumber the states appropriately.
1624 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1626 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1629 /*************************************************************************
1633 *************************************************************************/
1636 ure_buffer_create(void)
1640 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1646 ure_buffer_free(ure_buffer_t buf)
1653 if (buf->stack.slist_size > 0)
1654 free((char *) buf->stack.slist);
1656 if (buf->expr_size > 0)
1657 free((char *) buf->expr);
1659 for (i = 0; i < buf->symtab_size; i++) {
1660 if (buf->symtab[i].states.slist_size > 0)
1661 free((char *) buf->symtab[i].states.slist);
1664 if (buf->symtab_size > 0)
1665 free((char *) buf->symtab);
1667 for (i = 0; i < buf->states.states_size; i++) {
1668 if (buf->states.states[i].trans_size > 0)
1669 free((char *) buf->states.states[i].trans);
1670 if (buf->states.states[i].st.slist_size > 0)
1671 free((char *) buf->states.states[i].st.slist);
1674 if (buf->states.states_size > 0)
1675 free((char *) buf->states.states);
1677 if (buf->equiv_size > 0)
1678 free((char *) buf->equiv);
1684 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1692 if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1696 * Reset the various fields of the compilation buffer. Default the flags
1697 * to indicate the presense of the "^$" pattern. If any other pattern
1698 * occurs, then this flag will be removed. This is done to catch this
1699 * special pattern and handle it specially when matching.
1701 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1703 buf->stack.slist_used = 0;
1706 for (i = 0; i < buf->symtab_used; i++)
1707 buf->symtab[i].states.slist_used = 0;
1708 buf->symtab_used = 0;
1710 for (i = 0; i < buf->states.states_used; i++) {
1711 buf->states.states[i].st.slist_used = 0;
1712 buf->states.states[i].trans_used = 0;
1714 buf->states.states_used = 0;
1717 * Construct the NFA. If this stage returns a 0, then an error occured or
1718 * an empty expression was passed.
1720 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1724 * Do the expression reduction to get the initial DFA.
1726 _ure_reduce(state, buf);
1729 * Merge all the equivalent DFA states.
1731 _ure_merge_equiv(buf);
1734 * Construct the minimal DFA.
1736 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1737 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1739 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1742 * Free up the NFA state groups and transfer the symbols from the buffer
1745 for (i = 0; i < buf->symtab_size; i++) {
1746 if (buf->symtab[i].states.slist_size > 0)
1747 free((char *) buf->symtab[i].states.slist);
1749 dfa->syms = buf->symtab;
1750 dfa->nsyms = buf->symtab_used;
1752 buf->symtab_used = buf->symtab_size = 0;
1755 * Collect the total number of states and transitions needed for the DFA.
1757 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1759 if (sp->id == state) {
1761 dfa->ntrans += sp->trans_used;
1767 * Allocate enough space for the states and transitions.
1769 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1771 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1774 * Actually transfer the DFA states from the buffer.
1778 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1780 if (sp->id == state) {
1782 dsp->ntrans = sp->trans_used;
1783 dsp->accepting = sp->accepting;
1786 * Add the transitions for the state.
1788 for (j = 0; j < dsp->ntrans; j++, tp++) {
1789 tp->symbol = sp->trans[j].lhs;
1790 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1802 ure_dfa_free(ure_dfa_t dfa)
1809 for (i = 0; i < dfa->nsyms; i++) {
1810 if ((dfa->syms[i].type == _URE_CCLASS ||
1811 dfa->syms[i].type == _URE_NCCLASS) &&
1812 dfa->syms[i].sym.ccl.ranges_size > 0)
1813 free((char *) dfa->syms[i].sym.ccl.ranges);
1816 free((char *) dfa->syms);
1818 if (dfa->nstates > 0)
1819 free((char *) dfa->states);
1820 if (dfa->ntrans > 0)
1821 free((char *) dfa->trans);
1826 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1828 ucs2_t i, j, k, h, l;
1833 if (dfa == 0 || out == 0)
1837 * Write all the different character classes.
1839 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1840 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1841 fprintf(out, "C%hd = ", sym->id);
1842 if (sym->sym.ccl.ranges_used > 0) {
1844 if (sym->type == _URE_NCCLASS)
1847 if (sym->props != 0) {
1848 if (sym->type == _URE_NCCLASS)
1849 fprintf(out, "\\P");
1851 fprintf(out, "\\p");
1852 for (k = h = 0; k < 32; k++) {
1853 if (sym->props & (1 << k)) {
1856 fprintf(out, "%hd", k + 1);
1864 for (k = 0, rp = sym->sym.ccl.ranges;
1865 k < sym->sym.ccl.ranges_used; k++, rp++) {
1867 * Check for UTF16 characters.
1869 if (0x10000 <= rp->min_code &&
1870 rp->min_code <= 0x10ffff) {
1871 h = ((rp->min_code - 0x10000) >> 10) + 0xd800;
1872 l = ((rp->min_code - 0x10000) & 1023) + 0xdc00;
1873 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1875 fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1876 if (rp->max_code != rp->min_code) {
1878 if (rp->max_code >= 0x10000 &&
1879 rp->max_code <= 0x10ffff) {
1880 h = ((rp->max_code - 0x10000) >> 10) + 0xd800;
1881 l = ((rp->max_code - 0x10000) & 1023) + 0xdc00;
1882 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1884 fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1887 if (sym->sym.ccl.ranges_used > 0)
1893 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1894 fprintf(out, "S%hd = ", i);
1895 if (sp->accepting) {
1900 for (j = 0; j < sp->ntrans; j++) {
1904 sym = dfa->syms + sp->trans[j].symbol;
1905 switch (sym->type) {
1907 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1909 * Take care of UTF16 characters.
1911 h = ((sym->sym.chr - 0x10000) >> 10) + 0xd800;
1912 l = ((sym->sym.chr - 0x10000) & 1023) + 0xdc00;
1913 fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1915 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1918 fprintf(out, "<any> ");
1920 case _URE_BOL_ANCHOR:
1921 fprintf(out, "<bol-anchor> ");
1923 case _URE_EOL_ANCHOR:
1924 fprintf(out, "<eol-anchor> ");
1928 fprintf(out, "[C%hd] ", sym->id);
1931 fprintf(out, "S%hd", sp->trans[j].next_state);
1932 if (j + 1 < sp->ntrans)
1939 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1943 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1944 unsigned long *match_start, unsigned long *match_end)
1946 int i, j, matched, found, skip;
1947 unsigned long ms, me;
1949 ucs2_t *sp, *ep, *lp;
1954 if (dfa == 0 || text == 0)
1958 * Handle the special case of an empty string matching the "^$" pattern.
1960 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1961 *match_start = *match_end = 0;
1972 for (found = skip = 0; found == 0 && sp < ep; ) {
1977 * Check to see if this is a high surrogate that should be
1978 * combined with a following low surrogate.
1980 if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1981 0xdc00 <= *sp && *sp <= 0xdfff)
1982 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1985 * Determine if the character is non-spacing and should be skipped.
1987 if (_ure_matches_properties(_URE_NONSPACING, c) &&
1988 (flags & URE_IGNORE_NONSPACING)) {
1993 if (dfa->flags & _URE_DFA_CASEFOLD)
1994 c = _ure_tolower(c);
1997 * See if one of the transitions matches.
1999 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2000 sym = dfa->syms + stp->trans[i].symbol;
2001 switch (sym->type) {
2003 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2008 if (c == sym->sym.chr)
2011 case _URE_BOL_ANCHOR:
2015 } else if (_ure_issep(c)) {
2016 if (c == '\r' && sp < ep && *sp == '\n')
2022 case _URE_EOL_ANCHOR:
2023 if (_ure_issep(c)) {
2025 * Put the pointer back before the separator so the match
2026 * end position will be correct. This case will also
2027 * cause the `sp' pointer to be advanced over the current
2028 * separator once the match end point has been recorded.
2036 if (sym->props != 0)
2037 matched = _ure_matches_properties(sym->props, c);
2038 for (j = 0, rp = sym->sym.ccl.ranges;
2039 j < sym->sym.ccl.ranges_used; j++, rp++) {
2040 if (rp->min_code <= c && c <= rp->max_code)
2043 if (sym->type == _URE_NCCLASS)
2053 stp = dfa->states + stp->trans[i].next_state;
2056 * If the match was an EOL anchor, adjust the pointer past the
2057 * separator that caused the match. The correct match
2058 * position has been recorded already.
2060 if (sym->type == _URE_EOL_ANCHOR) {
2062 * Skip the character that caused the match.
2067 * Handle the infamous CRLF situation.
2069 if (sp < ep && c == '\r' && *sp == '\n')
2076 if (stp->accepting == 0) {
2078 * If the last state was not accepting, then reset
2085 * The last state was accepting, so terminate the matching
2086 * loop to avoid more work.
2089 } else if (sp == ep) {
2090 if (!stp->accepting) {
2092 * This ugly hack is to make sure the end-of-line anchors
2093 * match when the source text hits the end. This is only done
2094 * if the last subexpression matches.
2096 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2097 sym = dfa->syms + stp->trans[i].symbol;
2098 if (sym->type ==_URE_EOL_ANCHOR) {
2099 stp = dfa->states + stp->trans[i].next_state;
2100 if (stp->accepting) {
2109 * Make sure any conditions that match all the way to the end
2110 * of the string match.
2124 return (ms != ~0) ? 1 : 0;