2 * Copyright 1997, 1998, 1999 Computing Research Labs,
3 * New Mexico State University
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
20 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
21 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 static char rcsid[] = "$Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $";
30 #include <ac/unistd.h>
31 #include <ac/string.h>
36 * Flags used internally in the DFA.
38 #define _URE_DFA_CASEFOLD 0x01
39 #define _URE_DFA_BLANKLINE 0x02
41 static unsigned long cclass_flags[] = {
78 * Symbol types for the DFA.
80 #define _URE_ANY_CHAR 1
83 #define _URE_NCCLASS 4
84 #define _URE_BOL_ANCHOR 5
85 #define _URE_EOL_ANCHOR 6
88 * Op codes for converting the NFA to a DFA.
90 #define _URE_SYMBOL 10
99 #define _URE_NOOP 0xffff
101 #define _URE_REGSTART 0x8000
102 #define _URE_REGEND 0x4000
105 * Structure used to handle a compacted range of characters.
113 _ure_range_t *ranges;
124 * This is a general element structure used for expressions and stack
136 * This is a structure used to track a list or a stack of states.
145 * Structure to track the list of unique states for a symbol
154 _ure_stlist_t states;
158 * Structure to hold a single state.
171 * Structure used for keeping lists of states.
174 _ure_state_t *states;
180 * Structure to track pairs of DFA states when equivalent states are
189 * Structure used for constructing the NFA and reducing to a minimal DFA.
191 typedef struct _ure_buffer_t {
199 * Table of unique symbols encountered.
201 _ure_symtab_t *symtab;
206 * Tracks the unique expressions generated for the NFA and when the NFA is
214 * The reduced table of unique groups of NFA states.
216 _ure_statetable_t states;
219 * Tracks states when equivalent states are merged.
237 typedef struct _ure_dfa_t {
243 _ure_dstate_t *states;
250 /*************************************************************************
254 *************************************************************************/
257 _ure_memmove(char *dest, char *src, unsigned long bytes)
266 * Do a memmove using Ye Olde Duff's Device for efficiency.
275 case 7: *--dest = *--src;
276 case 6: *--dest = *--src;
277 case 5: *--dest = *--src;
278 case 4: *--dest = *--src;
279 case 3: *--dest = *--src;
280 case 2: *--dest = *--src;
281 case 1: *--dest = *--src;
284 } else if (src > dest) {
288 case 7: *dest++ = *src++;
289 case 6: *dest++ = *src++;
290 case 5: *dest++ = *src++;
291 case 4: *dest++ = *src++;
292 case 3: *dest++ = *src++;
293 case 2: *dest++ = *src++;
294 case 1: *dest++ = *src++;
301 _ure_push(ucs2_t v, _ure_buffer_t *b)
309 * If the `reducing' parameter is non-zero, check to see if the value
310 * passed is already on the stack.
312 if (b->reducing != 0 && b->expr[v].onstack != 0)
316 if (s->slist_used == s->slist_size) {
317 if (s->slist_size == 0)
318 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
320 s->slist = (ucs2_t *) realloc((char *) s->slist,
321 sizeof(ucs2_t) * (s->slist_size + 8));
324 s->slist[s->slist_used++] = v;
327 * If the `reducing' parameter is non-zero, flag the element as being on
330 if (b->reducing != 0)
331 b->expr[v].onstack = 1;
335 _ure_peek(_ure_buffer_t *b)
337 if (b == 0 || b->stack.slist_used == 0)
340 return b->stack.slist[b->stack.slist_used - 1];
344 _ure_pop(_ure_buffer_t *b)
348 if (b == 0 || b->stack.slist_used == 0)
351 v = b->stack.slist[--b->stack.slist_used];
353 b->expr[v].onstack = 0;
358 /*************************************************************************
360 * Start symbol parse functions.
362 *************************************************************************/
365 * Parse a comma-separated list of integers that represent character
366 * properties. Combine them into a mask that is returned in the `mask'
367 * variable, and return the number of characters consumed.
370 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
379 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
382 * Encountered a comma, so select the next character property flag
383 * and reset the number.
385 m |= cclass_flags[n];
387 } else if (*sp >= '0' && *sp <= '9')
389 * Encountered a digit, so start or continue building the cardinal
390 * that represents the character property flag.
392 n = (n * 10) + (*sp - '0');
395 * Encountered something that is not part of the property list.
396 * Indicate that we are done.
401 * If a property number greater than 32 occurs, then there is a
402 * problem. Most likely a missing comma separator.
405 b->error = _URE_INVALID_PROPERTY;
409 m |= cclass_flags[n];
412 * Set the mask that represents the group of character properties.
417 * Return the number of characters consumed.
423 * Collect a hex number with 1 to 4 digits and return the number
424 * of characters used.
427 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
436 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
437 if (*sp >= '0' && *sp <= '9')
438 nn = (nn << 4) + (*sp - '0');
439 else if (*sp >= 'A' && *sp <= 'F')
440 nn = (nn << 4) + ((*sp - 'A') + 10);
441 else if (*sp >= 'a' && *sp <= 'f')
442 nn = (nn << 4) + ((*sp - 'a') + 10);
445 * Encountered something that is not a hex digit.
451 * Assign the character code collected and return the number of
460 * Insert a range into a character class, removing duplicates and ordering
461 * them in increasing range-start order.
464 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
471 * If the `casefold' flag is set, then make sure both endpoints of the
472 * range are converted to lower case.
474 if (b->flags & _URE_DFA_CASEFOLD) {
475 r->min_code = _ure_tolower(r->min_code);
476 r->max_code = _ure_tolower(r->max_code);
480 * Swap the range endpoints if they are not in increasing order.
482 if (r->min_code > r->max_code) {
484 r->min_code = r->max_code;
488 for (i = 0, rp = ccl->ranges;
489 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
492 * Check for a duplicate.
494 if (i < ccl->ranges_used &&
495 r->min_code == rp->min_code && r->max_code == rp->max_code)
498 if (ccl->ranges_used == ccl->ranges_size) {
499 if (ccl->ranges_size == 0)
500 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
502 ccl->ranges = (_ure_range_t *)
503 realloc((char *) ccl->ranges,
504 sizeof(_ure_range_t) * (ccl->ranges_size + 8));
505 ccl->ranges_size += 8;
508 rp = ccl->ranges + ccl->ranges_used;
510 if (i < ccl->ranges_used)
511 _ure_memmove((char *) (rp + 1), (char *) rp,
512 sizeof(_ure_range_t) * (ccl->ranges_used - i));
515 rp->min_code = r->min_code;
516 rp->max_code = r->max_code;
519 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
520 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
521 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
522 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
524 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
525 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
526 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
527 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
529 typedef void (*_ure_cclsetup_t)(
539 _ure_cclsetup_t func;
544 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
550 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
557 * Add the additional characters needed for handling isspace().
559 range.min_code = range.max_code = '\t';
560 _ure_add_range(&sym->sym.ccl, &range, b);
561 range.min_code = range.max_code = '\r';
562 _ure_add_range(&sym->sym.ccl, &range, b);
563 range.min_code = range.max_code = '\n';
564 _ure_add_range(&sym->sym.ccl, &range, b);
565 range.min_code = range.max_code = '\f';
566 _ure_add_range(&sym->sym.ccl, &range, b);
567 range.min_code = range.max_code = 0xfeff;
568 _ure_add_range(&sym->sym.ccl, &range, b);
572 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
577 * Add the additional characters needed for handling isxdigit().
579 range.min_code = '0';
580 range.max_code = '9';
581 _ure_add_range(&sym->sym.ccl, &range, b);
582 range.min_code = 'A';
583 range.max_code = 'F';
584 _ure_add_range(&sym->sym.ccl, &range, b);
585 range.min_code = 'a';
586 range.max_code = 'f';
587 _ure_add_range(&sym->sym.ccl, &range, b);
590 static _ure_trie_t cclass_trie[] = {
591 {0x003a, 1, 1, 0, 0},
592 {0x0061, 9, 10, 0, 0},
593 {0x0063, 8, 19, 0, 0},
594 {0x0064, 7, 24, 0, 0},
595 {0x0067, 6, 29, 0, 0},
596 {0x006c, 5, 34, 0, 0},
597 {0x0070, 4, 39, 0, 0},
598 {0x0073, 3, 49, 0, 0},
599 {0x0075, 2, 54, 0, 0},
600 {0x0078, 1, 59, 0, 0},
601 {0x006c, 1, 11, 0, 0},
602 {0x006e, 2, 13, 0, 0},
603 {0x0070, 1, 16, 0, 0},
604 {0x0075, 1, 14, 0, 0},
605 {0x006d, 1, 15, 0, 0},
606 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
607 {0x0068, 1, 17, 0, 0},
608 {0x0061, 1, 18, 0, 0},
609 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
610 {0x006e, 1, 20, 0, 0},
611 {0x0074, 1, 21, 0, 0},
612 {0x0072, 1, 22, 0, 0},
613 {0x006c, 1, 23, 0, 0},
614 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
615 {0x0069, 1, 25, 0, 0},
616 {0x0067, 1, 26, 0, 0},
617 {0x0069, 1, 27, 0, 0},
618 {0x0074, 1, 28, 0, 0},
619 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
620 {0x0072, 1, 30, 0, 0},
621 {0x0061, 1, 31, 0, 0},
622 {0x0070, 1, 32, 0, 0},
623 {0x0068, 1, 33, 0, 0},
624 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
625 {0x006f, 1, 35, 0, 0},
626 {0x0077, 1, 36, 0, 0},
627 {0x0065, 1, 37, 0, 0},
628 {0x0072, 1, 38, 0, 0},
629 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
630 {0x0072, 2, 41, 0, 0},
631 {0x0075, 1, 45, 0, 0},
632 {0x0069, 1, 42, 0, 0},
633 {0x006e, 1, 43, 0, 0},
634 {0x0074, 1, 44, 0, 0},
635 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
636 {0x006e, 1, 46, 0, 0},
637 {0x0063, 1, 47, 0, 0},
638 {0x0074, 1, 48, 0, 0},
639 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
640 {0x0070, 1, 50, 0, 0},
641 {0x0061, 1, 51, 0, 0},
642 {0x0063, 1, 52, 0, 0},
643 {0x0065, 1, 53, 0, 0},
644 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
645 {0x0070, 1, 55, 0, 0},
646 {0x0070, 1, 56, 0, 0},
647 {0x0065, 1, 57, 0, 0},
648 {0x0072, 1, 58, 0, 0},
649 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
650 {0x0064, 1, 60, 0, 0},
651 {0x0069, 1, 61, 0, 0},
652 {0x0067, 1, 62, 0, 0},
653 {0x0069, 1, 63, 0, 0},
654 {0x0074, 1, 64, 0, 0},
655 {0x003a, 1, 65, _ure_xdigit_setup, 0},
659 * Probe for one of the POSIX colon delimited character classes in the static
663 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
672 * If the number of characters left is less than 7, then this cannot be
673 * interpreted as one of the colon delimited classes.
681 for (i = 0; sp < ep && i < 8; i++, sp++) {
684 for (; n > 0 && tp->key != *sp; tp++, n--) ;
689 if (*sp == ':' && (i == 6 || i == 7)) {
694 tp = cclass_trie + tp->next;
699 (*tp->func)(sym, tp->mask, b);
705 * Construct a list of ranges and return the number of characters consumed.
708 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
722 symp->type = _URE_NCCLASS;
725 symp->type = _URE_CCLASS;
727 for (last = 0, range_end = 0;
728 b->error == _URE_OK && sp < ep && *sp != ']'; ) {
733 * The EOS was encountered when expecting the reverse solidus
734 * to be followed by the character it is escaping. Set an
735 * error code and return the number of characters consumed up
738 b->error = _URE_UNEXPECTED_EOS;
767 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
769 * Invert the bit mask of the properties if this is a negated
770 * character class or if 'P' is used to specify a list of
771 * character properties that should *not* match in a
775 symp->props = ~symp->props;
783 ((*sp >= '0' && *sp <= '9') ||
784 (*sp >= 'A' && *sp <= 'F') ||
785 (*sp >= 'a' && *sp <= 'f')))
786 sp += _ure_hex(sp, ep - sp, &c);
788 } else if (c == ':') {
790 * Probe for a POSIX colon delimited character class.
793 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
801 cclp = &symp->sym.ccl;
804 * Check to see if the current character is a low surrogate that needs
805 * to be combined with a preceding high surrogate.
808 if (c >= 0xdc00 && c <= 0xdfff)
810 * Construct the UTF16 character code.
812 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
815 * Add the isolated high surrogate to the range.
818 range.max_code = last & 0xffff;
820 range.min_code = range.max_code = last & 0xffff;
822 _ure_add_range(cclp, &range, b);
828 * Clear the last character code.
833 * This slightly awkward code handles the different cases needed to
836 if (c >= 0xd800 && c <= 0xdbff) {
838 * If the high surrogate is followed by a range indicator, simply
839 * add it as the range start. Otherwise, save it in case the next
840 * character is a low surrogate.
848 } else if (range_end == 1) {
850 _ure_add_range(cclp, &range, b);
853 range.min_code = range.max_code = c;
858 _ure_add_range(cclp, &range, b);
862 if (sp < ep && *sp == ']')
866 * The parse was not terminated by the character class close symbol
867 * (']'), so set an error code.
869 b->error = _URE_CCLASS_OPEN;
875 * Probe for a low surrogate hex code.
878 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
883 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
884 if (*sp >= '0' && *sp <= '9')
885 code = (code << 4) + (*sp - '0');
886 else if (*sp >= 'A' && *sp <= 'F')
887 code = (code << 4) + ((*sp - 'A') + 10);
888 else if (*sp >= 'a' && *sp <= 'f')
889 code = (code << 4) + ((*sp - 'a') + 10);
895 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
899 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
908 if ((c = *sp++) == '\\') {
912 * The EOS was encountered when expecting the reverse solidus to
913 * be followed by the character it is escaping. Set an error code
914 * and return the number of characters consumed up to this point.
916 b->error = _URE_UNEXPECTED_EOS;
924 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
925 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
928 symp->type = _URE_CHAR;
929 symp->sym.chr = 0x07;
932 symp->type = _URE_CHAR;
933 symp->sym.chr = 0x08;
936 symp->type = _URE_CHAR;
937 symp->sym.chr = 0x0c;
940 symp->type = _URE_CHAR;
941 symp->sym.chr = 0x0a;
944 symp->type = _URE_CHAR;
945 symp->sym.chr = 0x0d;
948 symp->type = _URE_CHAR;
949 symp->sym.chr = 0x09;
952 symp->type = _URE_CHAR;
953 symp->sym.chr = 0x0b;
960 * Collect between 1 and 4 digits representing a UCS2 code. Fall
961 * through to the next case.
964 ((*sp >= '0' && *sp <= '9') ||
965 (*sp >= 'A' && *sp <= 'F') ||
966 (*sp >= 'a' && *sp <= 'f')))
967 sp += _ure_hex(sp, ep - sp, &c);
971 * Simply add an escaped character here.
973 symp->type = _URE_CHAR;
976 } else if (c == '^' || c == '$')
978 * Handle the BOL and EOL anchors. This actually consists simply of
979 * setting a flag that indicates that the user supplied anchor match
980 * function should be called. This needs to be done instead of simply
981 * matching line/paragraph separators because beginning-of-text and
982 * end-of-text tests are needed as well.
984 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
987 * Construct a character class.
989 sp += _ure_cclass(sp, ep - sp, symp, b);
991 symp->type = _URE_ANY_CHAR;
993 symp->type = _URE_CHAR;
998 * If the symbol type happens to be a character and is a high surrogate,
999 * then probe forward to see if it is followed by a low surrogate that
1000 * needs to be added.
1002 if (sp < ep && symp->type == _URE_CHAR &&
1003 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1005 if (0xdc00 <= *sp && *sp <= 0xdfff) {
1006 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1009 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1010 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1011 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1012 if (0xdc00 <= c && c <= 0xdfff) {
1014 * Take into account the \[xu] in front of the hex code.
1017 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1024 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1025 * the `casefold' flag is set.
1027 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1028 symp->sym.chr = _ure_tolower(symp->sym.chr);
1031 * If the symbol constructed is anything other than one of the anchors,
1032 * make sure the _URE_DFA_BLANKLINE flag is removed.
1034 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1035 b->flags &= ~_URE_DFA_BLANKLINE;
1038 * Return the number of characters consumed.
1044 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1046 if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1049 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1050 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1052 if (a->sym.ccl.ranges_used > 0 &&
1053 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1054 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1056 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1062 * Construct a symbol, but only keep unique symbols.
1065 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1069 _ure_symtab_t *sp, symbol;
1072 * Build the next symbol so we can test to see if it is already in the
1075 (void) memset((char *) &symbol, 0, sizeof(_ure_symtab_t));
1076 *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1079 * Check to see if the symbol exists.
1081 for (i = 0, sp = b->symtab;
1082 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1084 if (i < b->symtab_used) {
1086 * Free up any ranges used for the symbol.
1088 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1089 symbol.sym.ccl.ranges_size > 0)
1090 free((char *) symbol.sym.ccl.ranges);
1092 return b->symtab[i].id;
1096 * Need to add the new symbol.
1098 if (b->symtab_used == b->symtab_size) {
1099 if (b->symtab_size == 0)
1100 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1102 b->symtab = (_ure_symtab_t *)
1103 realloc((char *) b->symtab,
1104 sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1105 sp = b->symtab + b->symtab_size;
1106 (void) memset((char *) sp, 0, sizeof(_ure_symtab_t) << 3);
1107 b->symtab_size += 8;
1110 symbol.id = b->symtab_used++;
1111 (void) memcpy((char *) &b->symtab[symbol.id], (char *) &symbol,
1112 sizeof(_ure_symtab_t));
1117 /*************************************************************************
1119 * End symbol parse functions.
1121 *************************************************************************/
1124 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1132 * Determine if the expression already exists or not.
1134 for (i = 0; i < b->expr_used; i++) {
1135 if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1136 b->expr[i].rhs == rhs)
1139 if (i < b->expr_used)
1143 * Need to add a new expression.
1145 if (b->expr_used == b->expr_size) {
1146 if (b->expr_size == 0)
1147 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1149 b->expr = (_ure_elt_t *)
1150 realloc((char *) b->expr,
1151 sizeof(_ure_elt_t) * (b->expr_size + 8));
1155 b->expr[b->expr_used].onstack = 0;
1156 b->expr[b->expr_used].type = type;
1157 b->expr[b->expr_used].lhs = lhs;
1158 b->expr[b->expr_used].rhs = rhs;
1160 return b->expr_used++;
1163 static unsigned char spmap[] = {
1164 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1165 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1166 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1169 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1170 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1173 * Convert the regular expression into an NFA in a form that will be easy to
1174 * reduce to a DFA. The starting state for the reduction will be returned.
1177 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1179 ucs2_t c, state, top, sym, *sp, *ep;
1186 while (b->error == _URE_OK && sp < ep) {
1190 _ure_push(_URE_PAREN, b);
1194 * Check for the case of too many close parentheses.
1196 if (_ure_peek(b) == _URE_NOOP) {
1197 b->error = _URE_UNBALANCED_GROUP;
1201 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1203 * Make an expression with the AND or OR operator and its right
1206 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1209 * Remove the _URE_PAREN off the stack.
1214 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1217 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1220 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1223 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1225 * Make an expression with the AND or OR operator and its right
1228 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1230 _ure_push(state, b);
1231 _ure_push(_URE_OR, b);
1235 sym = _ure_make_symbol(sp, ep - sp, &used, b);
1237 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1241 if (c != '(' && c != '|' && sp < ep &&
1242 (!_ure_isspecial(*sp) || *sp == '(')) {
1243 _ure_push(state, b);
1244 _ure_push(_URE_AND, b);
1247 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1249 * Make an expression with the AND or OR operator and its right
1252 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1254 if (b->stack.slist_used > 0)
1255 b->error = _URE_UNBALANCED_GROUP;
1257 return (b->error == _URE_OK) ? state : _URE_NOOP;
1261 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1267 * Locate the symbol in the symbol table so the state can be added.
1268 * If the symbol doesn't exist, then a real problem exists.
1270 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1274 * Now find out if the state exists in the symbol's state list.
1276 for (i = 0, stp = sp->states.slist;
1277 i < sp->states.slist_used && state > *stp; i++, stp++) ;
1279 if (i == sp->states.slist_used || state < *stp) {
1281 * Need to add the state in order.
1283 if (sp->states.slist_used == sp->states.slist_size) {
1284 if (sp->states.slist_size == 0)
1285 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1287 sp->states.slist = (ucs2_t *)
1288 realloc((char *) sp->states.slist,
1289 sizeof(ucs2_t) * (sp->states.slist_size + 8));
1290 sp->states.slist_size += 8;
1292 if (i < sp->states.slist_used)
1293 (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1294 (char *) (sp->states.slist + i),
1295 sizeof(ucs2_t) * (sp->states.slist_used - i));
1296 sp->states.slist[i] = state;
1297 sp->states.slist_used++;
1302 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1307 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1308 if (sp->st.slist_used == nstates &&
1309 memcmp((char *) states, (char *) sp->st.slist,
1310 sizeof(ucs2_t) * nstates) == 0)
1314 if (i == b->states.states_used) {
1316 * Need to add a new DFA state (set of NFA states).
1318 if (b->states.states_used == b->states.states_size) {
1319 if (b->states.states_size == 0)
1320 b->states.states = (_ure_state_t *)
1321 malloc(sizeof(_ure_state_t) << 3);
1323 b->states.states = (_ure_state_t *)
1324 realloc((char *) b->states.states,
1325 sizeof(_ure_state_t) * (b->states.states_size + 8));
1326 sp = b->states.states + b->states.states_size;
1327 (void) memset((char *) sp, 0, sizeof(_ure_state_t) << 3);
1328 b->states.states_size += 8;
1331 sp = b->states.states + b->states.states_used++;
1334 if (sp->st.slist_used + nstates > sp->st.slist_size) {
1335 if (sp->st.slist_size == 0)
1336 sp->st.slist = (ucs2_t *)
1337 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1339 sp->st.slist = (ucs2_t *)
1340 realloc((char *) sp->st.slist,
1341 sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1342 sp->st.slist_size = sp->st.slist_used + nstates;
1344 sp->st.slist_used = nstates;
1345 (void) memcpy((char *) sp->st.slist, (char *) states,
1346 sizeof(ucs2_t) * nstates);
1350 * Return the ID of the DFA state representing a group of NFA states.
1356 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1358 ucs2_t i, j, state, eval, syms, rhs;
1359 ucs2_t s1, s2, ns1, ns2;
1366 * Add the starting state for the reduction.
1368 _ure_add_state(1, &start, b);
1371 * Process each set of NFA states that get created.
1373 for (i = 0; i < b->states.states_used; i++) {
1374 sp = b->states.states + i;
1377 * Push the current states on the stack.
1379 for (j = 0; j < sp->st.slist_used; j++)
1380 _ure_push(sp->st.slist[j], b);
1383 * Reduce the NFA states.
1385 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1386 state = b->stack.slist[j];
1390 * This inner loop is the iterative equivalent of recursively
1391 * reducing subexpressions generated as a result of a reduction.
1394 switch (b->expr[state].type) {
1396 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1397 _ure_add_symstate(b->expr[state].lhs, ns1, b);
1406 s1 = b->expr[state].lhs;
1407 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1408 state = _ure_make_expr(_URE_OR, ns1, s1, b);
1411 s1 = b->expr[state].lhs;
1412 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1413 state = _ure_make_expr(_URE_AND, s1, ns1, b);
1416 s1 = b->expr[state].lhs;
1417 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1418 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1419 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1422 s1 = b->expr[state].lhs;
1423 s2 = b->expr[state].rhs;
1429 s1 = b->expr[state].lhs;
1430 s2 = b->expr[state].rhs;
1431 switch (b->expr[s1].type) {
1433 _ure_add_symstate(b->expr[s1].lhs, s2, b);
1441 ns1 = b->expr[s1].lhs;
1442 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1443 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1446 ns1 = b->expr[s1].lhs;
1447 ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1448 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1451 ns1 = b->expr[s1].lhs;
1452 ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1453 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1456 ns1 = b->expr[s1].lhs;
1457 ns2 = b->expr[s1].rhs;
1458 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1459 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1460 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1463 ns1 = b->expr[s1].lhs;
1464 ns2 = b->expr[s1].rhs;
1465 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1466 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1474 * Clear the state stack.
1476 while (_ure_pop(b) != _URE_NOOP) ;
1479 * Reset the state pointer because the reduction may have moved it
1480 * during a reallocation.
1482 sp = b->states.states + i;
1485 * Generate the DFA states for the symbols collected during the
1486 * current reduction.
1488 if (sp->trans_used + syms > sp->trans_size) {
1489 if (sp->trans_size == 0)
1490 sp->trans = (_ure_elt_t *)
1491 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1493 sp->trans = (_ure_elt_t *)
1494 realloc((char *) sp->trans,
1495 sizeof(_ure_elt_t) * (sp->trans_used + syms));
1496 sp->trans_size = sp->trans_used + syms;
1500 * Go through the symbol table and generate the DFA state transitions
1501 * for each symbol that has collected NFA states.
1503 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1504 sp = b->states.states + i;
1506 if (smp->states.slist_used > 0) {
1507 sp->trans[syms].lhs = smp->id;
1508 rhs = _ure_add_state(smp->states.slist_used,
1509 smp->states.slist, b);
1511 * Reset the state pointer in case the reallocation moves it
1514 sp = b->states.states + i;
1515 sp->trans[syms].rhs = rhs;
1517 smp->states.slist_used = 0;
1523 * Set the number of transitions actually used.
1525 sp->trans_used = syms;
1531 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1535 l = b->states.states[l].id;
1536 r = b->states.states[r].id;
1548 * Check to see if the equivalence pair already exists.
1550 for (tmp = 0; tmp < b->equiv_used &&
1551 (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1554 if (tmp < b->equiv_used)
1557 if (b->equiv_used == b->equiv_size) {
1558 if (b->equiv_size == 0)
1559 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1561 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1562 sizeof(_ure_equiv_t) *
1563 (b->equiv_size + 8));
1566 b->equiv[b->equiv_used].l = l;
1567 b->equiv[b->equiv_used].r = r;
1572 * Merge the DFA states that are equivalent.
1575 _ure_merge_equiv(_ure_buffer_t *b)
1577 ucs2_t i, j, k, eq, done;
1578 _ure_state_t *sp1, *sp2, *ls, *rs;
1580 for (i = 0; i < b->states.states_used; i++) {
1581 sp1 = b->states.states + i;
1584 for (j = 0; j < i; j++) {
1585 sp2 = b->states.states + j;
1589 _ure_add_equiv(i, j, b);
1590 for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1591 ls = b->states.states + b->equiv[eq].l;
1592 rs = b->states.states + b->equiv[eq].r;
1593 if (ls->accepting != rs->accepting ||
1594 ls->trans_used != rs->trans_used) {
1598 for (k = 0; k < ls->trans_used &&
1599 ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1600 if (k < ls->trans_used) {
1605 for (k = 0; k < ls->trans_used; k++)
1606 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1611 for (eq = 0; j < i && eq < b->equiv_used; eq++)
1612 b->states.states[b->equiv[eq].r].id =
1613 b->states.states[b->equiv[eq].l].id;
1617 * Renumber the states appropriately.
1619 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1621 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1624 /*************************************************************************
1628 *************************************************************************/
1631 ure_buffer_create(void)
1635 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1641 ure_buffer_free(ure_buffer_t buf)
1648 if (buf->stack.slist_size > 0)
1649 free((char *) buf->stack.slist);
1651 if (buf->expr_size > 0)
1652 free((char *) buf->expr);
1654 for (i = 0; i < buf->symtab_size; i++) {
1655 if (buf->symtab[i].states.slist_size > 0)
1656 free((char *) buf->symtab[i].states.slist);
1659 if (buf->symtab_size > 0)
1660 free((char *) buf->symtab);
1662 for (i = 0; i < buf->states.states_size; i++) {
1663 if (buf->states.states[i].trans_size > 0)
1664 free((char *) buf->states.states[i].trans);
1665 if (buf->states.states[i].st.slist_size > 0)
1666 free((char *) buf->states.states[i].st.slist);
1669 if (buf->states.states_size > 0)
1670 free((char *) buf->states.states);
1672 if (buf->equiv_size > 0)
1673 free((char *) buf->equiv);
1679 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1687 if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1691 * Reset the various fields of the compilation buffer. Default the flags
1692 * to indicate the presense of the "^$" pattern. If any other pattern
1693 * occurs, then this flag will be removed. This is done to catch this
1694 * special pattern and handle it specially when matching.
1696 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1698 buf->stack.slist_used = 0;
1701 for (i = 0; i < buf->symtab_used; i++)
1702 buf->symtab[i].states.slist_used = 0;
1703 buf->symtab_used = 0;
1705 for (i = 0; i < buf->states.states_used; i++) {
1706 buf->states.states[i].st.slist_used = 0;
1707 buf->states.states[i].trans_used = 0;
1709 buf->states.states_used = 0;
1712 * Construct the NFA. If this stage returns a 0, then an error occured or
1713 * an empty expression was passed.
1715 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1719 * Do the expression reduction to get the initial DFA.
1721 _ure_reduce(state, buf);
1724 * Merge all the equivalent DFA states.
1726 _ure_merge_equiv(buf);
1729 * Construct the minimal DFA.
1731 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1732 (void) memset((char *) dfa, 0, sizeof(_ure_dfa_t));
1734 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1737 * Free up the NFA state groups and transfer the symbols from the buffer
1740 for (i = 0; i < buf->symtab_size; i++) {
1741 if (buf->symtab[i].states.slist_size > 0)
1742 free((char *) buf->symtab[i].states.slist);
1744 dfa->syms = buf->symtab;
1745 dfa->nsyms = buf->symtab_used;
1747 buf->symtab_used = buf->symtab_size = 0;
1750 * Collect the total number of states and transitions needed for the DFA.
1752 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1754 if (sp->id == state) {
1756 dfa->ntrans += sp->trans_used;
1762 * Allocate enough space for the states and transitions.
1764 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1766 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1769 * Actually transfer the DFA states from the buffer.
1773 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1775 if (sp->id == state) {
1777 dsp->ntrans = sp->trans_used;
1778 dsp->accepting = sp->accepting;
1781 * Add the transitions for the state.
1783 for (j = 0; j < dsp->ntrans; j++, tp++) {
1784 tp->symbol = sp->trans[j].lhs;
1785 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1797 ure_dfa_free(ure_dfa_t dfa)
1804 for (i = 0; i < dfa->nsyms; i++) {
1805 if ((dfa->syms[i].type == _URE_CCLASS ||
1806 dfa->syms[i].type == _URE_NCCLASS) &&
1807 dfa->syms[i].sym.ccl.ranges_size > 0)
1808 free((char *) dfa->syms[i].sym.ccl.ranges);
1811 free((char *) dfa->syms);
1813 if (dfa->nstates > 0)
1814 free((char *) dfa->states);
1815 if (dfa->ntrans > 0)
1816 free((char *) dfa->trans);
1821 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1823 ucs2_t i, j, k, h, l;
1828 if (dfa == 0 || out == 0)
1832 * Write all the different character classes.
1834 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1835 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1836 fprintf(out, "C%hd = ", sym->id);
1837 if (sym->sym.ccl.ranges_used > 0) {
1839 if (sym->type == _URE_NCCLASS)
1842 if (sym->props != 0) {
1843 if (sym->type == _URE_NCCLASS)
1844 fprintf(out, "\\P");
1846 fprintf(out, "\\p");
1847 for (k = h = 0; k < 32; k++) {
1848 if (sym->props & (1 << k)) {
1851 fprintf(out, "%hd", k + 1);
1859 for (k = 0, rp = sym->sym.ccl.ranges;
1860 k < sym->sym.ccl.ranges_used; k++, rp++) {
1862 * Check for UTF16 characters.
1864 if (0x10000 <= rp->min_code &&
1865 rp->min_code <= 0x10ffff) {
1866 h = ((rp->min_code - 0x10000) >> 10) + 0xd800;
1867 l = ((rp->min_code - 0x10000) & 1023) + 0xdc00;
1868 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1870 fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1871 if (rp->max_code != rp->min_code) {
1873 if (rp->max_code >= 0x10000 &&
1874 rp->max_code <= 0x10ffff) {
1875 h = ((rp->max_code - 0x10000) >> 10) + 0xd800;
1876 l = ((rp->max_code - 0x10000) & 1023) + 0xdc00;
1877 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1879 fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1882 if (sym->sym.ccl.ranges_used > 0)
1888 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1889 fprintf(out, "S%hd = ", i);
1890 if (sp->accepting) {
1895 for (j = 0; j < sp->ntrans; j++) {
1899 sym = dfa->syms + sp->trans[j].symbol;
1900 switch (sym->type) {
1902 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1904 * Take care of UTF16 characters.
1906 h = ((sym->sym.chr - 0x10000) >> 10) + 0xd800;
1907 l = ((sym->sym.chr - 0x10000) & 1023) + 0xdc00;
1908 fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1910 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1913 fprintf(out, "<any> ");
1915 case _URE_BOL_ANCHOR:
1916 fprintf(out, "<bol-anchor> ");
1918 case _URE_EOL_ANCHOR:
1919 fprintf(out, "<eol-anchor> ");
1923 fprintf(out, "[C%hd] ", sym->id);
1926 fprintf(out, "S%hd", sp->trans[j].next_state);
1927 if (j + 1 < sp->ntrans)
1934 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1938 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1939 unsigned long *match_start, unsigned long *match_end)
1941 int i, j, matched, found, skip;
1942 unsigned long ms, me;
1944 ucs2_t *sp, *ep, *lp;
1949 if (dfa == 0 || text == 0)
1953 * Handle the special case of an empty string matching the "^$" pattern.
1955 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1956 *match_start = *match_end = 0;
1967 for (found = skip = 0; found == 0 && sp < ep; ) {
1972 * Check to see if this is a high surrogate that should be
1973 * combined with a following low surrogate.
1975 if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1976 0xdc00 <= *sp && *sp <= 0xdfff)
1977 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1980 * Determine if the character is non-spacing and should be skipped.
1982 if (_ure_matches_properties(_URE_NONSPACING, c) &&
1983 (flags & URE_IGNORE_NONSPACING)) {
1988 if (dfa->flags & _URE_DFA_CASEFOLD)
1989 c = _ure_tolower(c);
1992 * See if one of the transitions matches.
1994 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
1995 sym = dfa->syms + stp->trans[i].symbol;
1996 switch (sym->type) {
1998 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2003 if (c == sym->sym.chr)
2006 case _URE_BOL_ANCHOR:
2010 } else if (_ure_issep(c)) {
2011 if (c == '\r' && sp < ep && *sp == '\n')
2017 case _URE_EOL_ANCHOR:
2018 if (_ure_issep(c)) {
2020 * Put the pointer back before the separator so the match
2021 * end position will be correct. This case will also
2022 * cause the `sp' pointer to be advanced over the current
2023 * separator once the match end point has been recorded.
2031 if (sym->props != 0)
2032 matched = _ure_matches_properties(sym->props, c);
2033 for (j = 0, rp = sym->sym.ccl.ranges;
2034 j < sym->sym.ccl.ranges_used; j++, rp++) {
2035 if (rp->min_code <= c && c <= rp->max_code)
2038 if (sym->type == _URE_NCCLASS)
2048 stp = dfa->states + stp->trans[i].next_state;
2051 * If the match was an EOL anchor, adjust the pointer past the
2052 * separator that caused the match. The correct match
2053 * position has been recorded already.
2055 if (sym->type == _URE_EOL_ANCHOR) {
2057 * Skip the character that caused the match.
2062 * Handle the infamous CRLF situation.
2064 if (sp < ep && c == '\r' && *sp == '\n')
2071 if (stp->accepting == 0) {
2073 * If the last state was not accepting, then reset
2080 * The last state was accepting, so terminate the matching
2081 * loop to avoid more work.
2084 } else if (sp == ep) {
2085 if (!stp->accepting) {
2087 * This ugly hack is to make sure the end-of-line anchors
2088 * match when the source text hits the end. This is only done
2089 * if the last subexpression matches.
2091 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2092 sym = dfa->syms + stp->trans[i].symbol;
2093 if (sym->type ==_URE_EOL_ANCHOR) {
2094 stp = dfa->states + stp->trans[i].next_state;
2095 if (stp->accepting) {
2104 * Make sure any conditions that match all the way to the end
2105 * of the string match.
2119 return (ms != ~0) ? 1 : 0;