3 * Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
7 * Copyright 1997, 1998, 1999 Computing Research Labs,
8 * New Mexico State University
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
24 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
25 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
26 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 /* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */
32 #include <ac/stdlib.h>
33 #include <ac/string.h>
34 #include <ac/unistd.h>
39 * Flags used internally in the DFA.
41 #define _URE_DFA_CASEFOLD 0x01
42 #define _URE_DFA_BLANKLINE 0x02
44 static unsigned long cclass_flags[] = {
81 * Symbol types for the DFA.
83 #define _URE_ANY_CHAR 1
86 #define _URE_NCCLASS 4
87 #define _URE_BOL_ANCHOR 5
88 #define _URE_EOL_ANCHOR 6
91 * Op codes for converting the NFA to a DFA.
93 #define _URE_SYMBOL 10
102 #define _URE_NOOP 0xffff
104 #define _URE_REGSTART 0x8000
105 #define _URE_REGEND 0x4000
108 * Structure used to handle a compacted range of characters.
116 _ure_range_t *ranges;
127 * This is a general element structure used for expressions and stack
139 * This is a structure used to track a list or a stack of states.
148 * Structure to track the list of unique states for a symbol
157 _ure_stlist_t states;
161 * Structure to hold a single state.
174 * Structure used for keeping lists of states.
177 _ure_state_t *states;
183 * Structure to track pairs of DFA states when equivalent states are
192 * Structure used for constructing the NFA and reducing to a minimal DFA.
194 typedef struct _ure_buffer_t {
202 * Table of unique symbols encountered.
204 _ure_symtab_t *symtab;
209 * Tracks the unique expressions generated for the NFA and when the NFA is
217 * The reduced table of unique groups of NFA states.
219 _ure_statetable_t states;
222 * Tracks states when equivalent states are merged.
240 typedef struct _ure_dfa_t {
246 _ure_dstate_t *states;
253 /*************************************************************************
257 *************************************************************************/
260 _ure_memmove(char *dest, char *src, unsigned long bytes)
269 * Do a memmove using Ye Olde Duff's Device for efficiency.
278 case 7: *--dest = *--src;
279 case 6: *--dest = *--src;
280 case 5: *--dest = *--src;
281 case 4: *--dest = *--src;
282 case 3: *--dest = *--src;
283 case 2: *--dest = *--src;
284 case 1: *--dest = *--src;
287 } else if (src > dest) {
291 case 7: *dest++ = *src++;
292 case 6: *dest++ = *src++;
293 case 5: *dest++ = *src++;
294 case 4: *dest++ = *src++;
295 case 3: *dest++ = *src++;
296 case 2: *dest++ = *src++;
297 case 1: *dest++ = *src++;
304 _ure_push(ucs2_t v, _ure_buffer_t *b)
312 * If the `reducing' parameter is non-zero, check to see if the value
313 * passed is already on the stack.
315 if (b->reducing != 0 && b->expr[v].onstack != 0)
319 if (s->slist_used == s->slist_size) {
320 if (s->slist_size == 0)
321 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
323 s->slist = (ucs2_t *) realloc((char *) s->slist,
324 sizeof(ucs2_t) * (s->slist_size + 8));
327 s->slist[s->slist_used++] = v;
330 * If the `reducing' parameter is non-zero, flag the element as being on
333 if (b->reducing != 0)
334 b->expr[v].onstack = 1;
338 _ure_peek(_ure_buffer_t *b)
340 if (b == 0 || b->stack.slist_used == 0)
343 return b->stack.slist[b->stack.slist_used - 1];
347 _ure_pop(_ure_buffer_t *b)
351 if (b == 0 || b->stack.slist_used == 0)
354 v = b->stack.slist[--b->stack.slist_used];
356 b->expr[v].onstack = 0;
361 /*************************************************************************
363 * Start symbol parse functions.
365 *************************************************************************/
368 * Parse a comma-separated list of integers that represent character
369 * properties. Combine them into a mask that is returned in the `mask'
370 * variable, and return the number of characters consumed.
373 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
382 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
385 * Encountered a comma, so select the next character property flag
386 * and reset the number.
388 m |= cclass_flags[n];
390 } else if (*sp >= '0' && *sp <= '9')
392 * Encountered a digit, so start or continue building the cardinal
393 * that represents the character property flag.
395 n = (n * 10) + (*sp - '0');
398 * Encountered something that is not part of the property list.
399 * Indicate that we are done.
404 * If a property number greater than 32 occurs, then there is a
405 * problem. Most likely a missing comma separator.
408 b->error = _URE_INVALID_PROPERTY;
412 m |= cclass_flags[n];
415 * Set the mask that represents the group of character properties.
420 * Return the number of characters consumed.
426 * Collect a hex number with 1 to 4 digits and return the number
427 * of characters used.
430 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
439 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
440 if (*sp >= '0' && *sp <= '9')
441 nn = (nn << 4) + (*sp - '0');
442 else if (*sp >= 'A' && *sp <= 'F')
443 nn = (nn << 4) + ((*sp - 'A') + 10);
444 else if (*sp >= 'a' && *sp <= 'f')
445 nn = (nn << 4) + ((*sp - 'a') + 10);
448 * Encountered something that is not a hex digit.
454 * Assign the character code collected and return the number of
463 * Insert a range into a character class, removing duplicates and ordering
464 * them in increasing range-start order.
467 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
474 * If the `casefold' flag is set, then make sure both endpoints of the
475 * range are converted to lower case.
477 if (b->flags & _URE_DFA_CASEFOLD) {
478 r->min_code = _ure_tolower(r->min_code);
479 r->max_code = _ure_tolower(r->max_code);
483 * Swap the range endpoints if they are not in increasing order.
485 if (r->min_code > r->max_code) {
487 r->min_code = r->max_code;
491 for (i = 0, rp = ccl->ranges;
492 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
495 * Check for a duplicate.
497 if (i < ccl->ranges_used &&
498 r->min_code == rp->min_code && r->max_code == rp->max_code)
501 if (ccl->ranges_used == ccl->ranges_size) {
502 if (ccl->ranges_size == 0)
503 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
505 ccl->ranges = (_ure_range_t *)
506 realloc((char *) ccl->ranges,
507 sizeof(_ure_range_t) * (ccl->ranges_size + 8));
508 ccl->ranges_size += 8;
511 rp = ccl->ranges + ccl->ranges_used;
513 if (i < ccl->ranges_used)
514 _ure_memmove((char *) (rp + 1), (char *) rp,
515 sizeof(_ure_range_t) * (ccl->ranges_used - i));
518 rp->min_code = r->min_code;
519 rp->max_code = r->max_code;
522 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
523 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
524 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
525 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
527 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
528 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
529 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
530 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
532 typedef void (*_ure_cclsetup_t)(
542 _ure_cclsetup_t func;
547 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
553 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
560 * Add the additional characters needed for handling isspace().
562 range.min_code = range.max_code = '\t';
563 _ure_add_range(&sym->sym.ccl, &range, b);
564 range.min_code = range.max_code = '\r';
565 _ure_add_range(&sym->sym.ccl, &range, b);
566 range.min_code = range.max_code = '\n';
567 _ure_add_range(&sym->sym.ccl, &range, b);
568 range.min_code = range.max_code = '\f';
569 _ure_add_range(&sym->sym.ccl, &range, b);
570 range.min_code = range.max_code = 0xfeff;
571 _ure_add_range(&sym->sym.ccl, &range, b);
575 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
580 * Add the additional characters needed for handling isxdigit().
582 range.min_code = '0';
583 range.max_code = '9';
584 _ure_add_range(&sym->sym.ccl, &range, b);
585 range.min_code = 'A';
586 range.max_code = 'F';
587 _ure_add_range(&sym->sym.ccl, &range, b);
588 range.min_code = 'a';
589 range.max_code = 'f';
590 _ure_add_range(&sym->sym.ccl, &range, b);
593 static _ure_trie_t cclass_trie[] = {
594 {0x003a, 1, 1, 0, 0},
595 {0x0061, 9, 10, 0, 0},
596 {0x0063, 8, 19, 0, 0},
597 {0x0064, 7, 24, 0, 0},
598 {0x0067, 6, 29, 0, 0},
599 {0x006c, 5, 34, 0, 0},
600 {0x0070, 4, 39, 0, 0},
601 {0x0073, 3, 49, 0, 0},
602 {0x0075, 2, 54, 0, 0},
603 {0x0078, 1, 59, 0, 0},
604 {0x006c, 1, 11, 0, 0},
605 {0x006e, 2, 13, 0, 0},
606 {0x0070, 1, 16, 0, 0},
607 {0x0075, 1, 14, 0, 0},
608 {0x006d, 1, 15, 0, 0},
609 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
610 {0x0068, 1, 17, 0, 0},
611 {0x0061, 1, 18, 0, 0},
612 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
613 {0x006e, 1, 20, 0, 0},
614 {0x0074, 1, 21, 0, 0},
615 {0x0072, 1, 22, 0, 0},
616 {0x006c, 1, 23, 0, 0},
617 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
618 {0x0069, 1, 25, 0, 0},
619 {0x0067, 1, 26, 0, 0},
620 {0x0069, 1, 27, 0, 0},
621 {0x0074, 1, 28, 0, 0},
622 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
623 {0x0072, 1, 30, 0, 0},
624 {0x0061, 1, 31, 0, 0},
625 {0x0070, 1, 32, 0, 0},
626 {0x0068, 1, 33, 0, 0},
627 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
628 {0x006f, 1, 35, 0, 0},
629 {0x0077, 1, 36, 0, 0},
630 {0x0065, 1, 37, 0, 0},
631 {0x0072, 1, 38, 0, 0},
632 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
633 {0x0072, 2, 41, 0, 0},
634 {0x0075, 1, 45, 0, 0},
635 {0x0069, 1, 42, 0, 0},
636 {0x006e, 1, 43, 0, 0},
637 {0x0074, 1, 44, 0, 0},
638 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
639 {0x006e, 1, 46, 0, 0},
640 {0x0063, 1, 47, 0, 0},
641 {0x0074, 1, 48, 0, 0},
642 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
643 {0x0070, 1, 50, 0, 0},
644 {0x0061, 1, 51, 0, 0},
645 {0x0063, 1, 52, 0, 0},
646 {0x0065, 1, 53, 0, 0},
647 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
648 {0x0070, 1, 55, 0, 0},
649 {0x0070, 1, 56, 0, 0},
650 {0x0065, 1, 57, 0, 0},
651 {0x0072, 1, 58, 0, 0},
652 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
653 {0x0064, 1, 60, 0, 0},
654 {0x0069, 1, 61, 0, 0},
655 {0x0067, 1, 62, 0, 0},
656 {0x0069, 1, 63, 0, 0},
657 {0x0074, 1, 64, 0, 0},
658 {0x003a, 1, 65, _ure_xdigit_setup, 0},
662 * Probe for one of the POSIX colon delimited character classes in the static
666 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
675 * If the number of characters left is less than 7, then this cannot be
676 * interpreted as one of the colon delimited classes.
684 for (i = 0; sp < ep && i < 8; i++, sp++) {
687 for (; n > 0 && tp->key != *sp; tp++, n--) ;
692 if (*sp == ':' && (i == 6 || i == 7)) {
697 tp = cclass_trie + tp->next;
702 (*tp->func)(sym, tp->mask, b);
708 * Construct a list of ranges and return the number of characters consumed.
711 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
725 symp->type = _URE_NCCLASS;
728 symp->type = _URE_CCLASS;
730 for (last = 0, range_end = 0;
731 b->error == _URE_OK && sp < ep && *sp != ']'; ) {
736 * The EOS was encountered when expecting the reverse solidus
737 * to be followed by the character it is escaping. Set an
738 * error code and return the number of characters consumed up
741 b->error = _URE_UNEXPECTED_EOS;
770 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
772 * Invert the bit mask of the properties if this is a negated
773 * character class or if 'P' is used to specify a list of
774 * character properties that should *not* match in a
778 symp->props = ~symp->props;
786 ((*sp >= '0' && *sp <= '9') ||
787 (*sp >= 'A' && *sp <= 'F') ||
788 (*sp >= 'a' && *sp <= 'f')))
789 sp += _ure_hex(sp, ep - sp, &c);
791 } else if (c == ':') {
793 * Probe for a POSIX colon delimited character class.
796 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
804 cclp = &symp->sym.ccl;
807 * Check to see if the current character is a low surrogate that needs
808 * to be combined with a preceding high surrogate.
811 if (c >= 0xdc00 && c <= 0xdfff)
813 * Construct the UTF16 character code.
815 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
818 * Add the isolated high surrogate to the range.
821 range.max_code = last & 0xffff;
823 range.min_code = range.max_code = last & 0xffff;
825 _ure_add_range(cclp, &range, b);
831 * Clear the last character code.
836 * This slightly awkward code handles the different cases needed to
839 if (c >= 0xd800 && c <= 0xdbff) {
841 * If the high surrogate is followed by a range indicator, simply
842 * add it as the range start. Otherwise, save it in case the next
843 * character is a low surrogate.
851 } else if (range_end == 1) {
853 _ure_add_range(cclp, &range, b);
856 range.min_code = range.max_code = c;
861 _ure_add_range(cclp, &range, b);
865 if (sp < ep && *sp == ']')
869 * The parse was not terminated by the character class close symbol
870 * (']'), so set an error code.
872 b->error = _URE_CCLASS_OPEN;
878 * Probe for a low surrogate hex code.
881 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
886 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
887 if (*sp >= '0' && *sp <= '9')
888 code = (code << 4) + (*sp - '0');
889 else if (*sp >= 'A' && *sp <= 'F')
890 code = (code << 4) + ((*sp - 'A') + 10);
891 else if (*sp >= 'a' && *sp <= 'f')
892 code = (code << 4) + ((*sp - 'a') + 10);
898 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
902 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
911 if ((c = *sp++) == '\\') {
915 * The EOS was encountered when expecting the reverse solidus to
916 * be followed by the character it is escaping. Set an error code
917 * and return the number of characters consumed up to this point.
919 b->error = _URE_UNEXPECTED_EOS;
927 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
928 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
931 symp->type = _URE_CHAR;
932 symp->sym.chr = 0x07;
935 symp->type = _URE_CHAR;
936 symp->sym.chr = 0x08;
939 symp->type = _URE_CHAR;
940 symp->sym.chr = 0x0c;
943 symp->type = _URE_CHAR;
944 symp->sym.chr = 0x0a;
947 symp->type = _URE_CHAR;
948 symp->sym.chr = 0x0d;
951 symp->type = _URE_CHAR;
952 symp->sym.chr = 0x09;
955 symp->type = _URE_CHAR;
956 symp->sym.chr = 0x0b;
963 * Collect between 1 and 4 digits representing a UCS2 code. Fall
964 * through to the next case.
967 ((*sp >= '0' && *sp <= '9') ||
968 (*sp >= 'A' && *sp <= 'F') ||
969 (*sp >= 'a' && *sp <= 'f')))
970 sp += _ure_hex(sp, ep - sp, &c);
974 * Simply add an escaped character here.
976 symp->type = _URE_CHAR;
979 } else if (c == '^' || c == '$')
981 * Handle the BOL and EOL anchors. This actually consists simply of
982 * setting a flag that indicates that the user supplied anchor match
983 * function should be called. This needs to be done instead of simply
984 * matching line/paragraph separators because beginning-of-text and
985 * end-of-text tests are needed as well.
987 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
990 * Construct a character class.
992 sp += _ure_cclass(sp, ep - sp, symp, b);
994 symp->type = _URE_ANY_CHAR;
996 symp->type = _URE_CHAR;
1001 * If the symbol type happens to be a character and is a high surrogate,
1002 * then probe forward to see if it is followed by a low surrogate that
1003 * needs to be added.
1005 if (sp < ep && symp->type == _URE_CHAR &&
1006 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1008 if (0xdc00 <= *sp && *sp <= 0xdfff) {
1009 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1012 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1013 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1014 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1015 if (0xdc00 <= c && c <= 0xdfff) {
1017 * Take into account the \[xu] in front of the hex code.
1020 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1027 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1028 * the `casefold' flag is set.
1030 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1031 symp->sym.chr = _ure_tolower(symp->sym.chr);
1034 * If the symbol constructed is anything other than one of the anchors,
1035 * make sure the _URE_DFA_BLANKLINE flag is removed.
1037 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1038 b->flags &= ~_URE_DFA_BLANKLINE;
1041 * Return the number of characters consumed.
1047 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1049 if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1052 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1053 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1055 if (a->sym.ccl.ranges_used > 0 &&
1056 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1057 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1059 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1065 * Construct a symbol, but only keep unique symbols.
1068 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1072 _ure_symtab_t *sp, symbol;
1075 * Build the next symbol so we can test to see if it is already in the
1078 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1079 *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1082 * Check to see if the symbol exists.
1084 for (i = 0, sp = b->symtab;
1085 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1087 if (i < b->symtab_used) {
1089 * Free up any ranges used for the symbol.
1091 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1092 symbol.sym.ccl.ranges_size > 0)
1093 free((char *) symbol.sym.ccl.ranges);
1095 return b->symtab[i].id;
1099 * Need to add the new symbol.
1101 if (b->symtab_used == b->symtab_size) {
1102 if (b->symtab_size == 0)
1103 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1105 b->symtab = (_ure_symtab_t *)
1106 realloc((char *) b->symtab,
1107 sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1108 sp = b->symtab + b->symtab_size;
1109 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1110 b->symtab_size += 8;
1113 symbol.id = b->symtab_used++;
1114 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1115 sizeof(_ure_symtab_t));
1120 /*************************************************************************
1122 * End symbol parse functions.
1124 *************************************************************************/
1127 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1135 * Determine if the expression already exists or not.
1137 for (i = 0; i < b->expr_used; i++) {
1138 if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1139 b->expr[i].rhs == rhs)
1142 if (i < b->expr_used)
1146 * Need to add a new expression.
1148 if (b->expr_used == b->expr_size) {
1149 if (b->expr_size == 0)
1150 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1152 b->expr = (_ure_elt_t *)
1153 realloc((char *) b->expr,
1154 sizeof(_ure_elt_t) * (b->expr_size + 8));
1158 b->expr[b->expr_used].onstack = 0;
1159 b->expr[b->expr_used].type = type;
1160 b->expr[b->expr_used].lhs = lhs;
1161 b->expr[b->expr_used].rhs = rhs;
1163 return b->expr_used++;
1166 static unsigned char spmap[] = {
1167 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1168 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1169 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1172 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1173 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1176 * Convert the regular expression into an NFA in a form that will be easy to
1177 * reduce to a DFA. The starting state for the reduction will be returned.
1180 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1182 ucs2_t c, state, top, sym, *sp, *ep;
1189 while (b->error == _URE_OK && sp < ep) {
1193 _ure_push(_URE_PAREN, b);
1197 * Check for the case of too many close parentheses.
1199 if (_ure_peek(b) == _URE_NOOP) {
1200 b->error = _URE_UNBALANCED_GROUP;
1204 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1206 * Make an expression with the AND or OR operator and its right
1209 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1212 * Remove the _URE_PAREN off the stack.
1217 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1220 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1223 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1226 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1228 * Make an expression with the AND or OR operator and its right
1231 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1233 _ure_push(state, b);
1234 _ure_push(_URE_OR, b);
1238 sym = _ure_make_symbol(sp, ep - sp, &used, b);
1240 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1244 if (c != '(' && c != '|' && sp < ep &&
1245 (!_ure_isspecial(*sp) || *sp == '(')) {
1246 _ure_push(state, b);
1247 _ure_push(_URE_AND, b);
1250 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1252 * Make an expression with the AND or OR operator and its right
1255 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1257 if (b->stack.slist_used > 0)
1258 b->error = _URE_UNBALANCED_GROUP;
1260 return (b->error == _URE_OK) ? state : _URE_NOOP;
1264 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1270 * Locate the symbol in the symbol table so the state can be added.
1271 * If the symbol doesn't exist, then a real problem exists.
1273 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1277 * Now find out if the state exists in the symbol's state list.
1279 for (i = 0, stp = sp->states.slist;
1280 i < sp->states.slist_used && state > *stp; i++, stp++) ;
1282 if (i == sp->states.slist_used || state < *stp) {
1284 * Need to add the state in order.
1286 if (sp->states.slist_used == sp->states.slist_size) {
1287 if (sp->states.slist_size == 0)
1288 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1290 sp->states.slist = (ucs2_t *)
1291 realloc((char *) sp->states.slist,
1292 sizeof(ucs2_t) * (sp->states.slist_size + 8));
1293 sp->states.slist_size += 8;
1295 if (i < sp->states.slist_used)
1296 (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1297 (char *) (sp->states.slist + i),
1298 sizeof(ucs2_t) * (sp->states.slist_used - i));
1299 sp->states.slist[i] = state;
1300 sp->states.slist_used++;
1305 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1310 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1311 if (sp->st.slist_used == nstates &&
1312 memcmp((char *) states, (char *) sp->st.slist,
1313 sizeof(ucs2_t) * nstates) == 0)
1317 if (i == b->states.states_used) {
1319 * Need to add a new DFA state (set of NFA states).
1321 if (b->states.states_used == b->states.states_size) {
1322 if (b->states.states_size == 0)
1323 b->states.states = (_ure_state_t *)
1324 malloc(sizeof(_ure_state_t) << 3);
1326 b->states.states = (_ure_state_t *)
1327 realloc((char *) b->states.states,
1328 sizeof(_ure_state_t) * (b->states.states_size + 8));
1329 sp = b->states.states + b->states.states_size;
1330 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1331 b->states.states_size += 8;
1334 sp = b->states.states + b->states.states_used++;
1337 if (sp->st.slist_used + nstates > sp->st.slist_size) {
1338 if (sp->st.slist_size == 0)
1339 sp->st.slist = (ucs2_t *)
1340 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1342 sp->st.slist = (ucs2_t *)
1343 realloc((char *) sp->st.slist,
1344 sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1345 sp->st.slist_size = sp->st.slist_used + nstates;
1347 sp->st.slist_used = nstates;
1348 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1349 sizeof(ucs2_t) * nstates);
1353 * Return the ID of the DFA state representing a group of NFA states.
1359 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1361 ucs2_t i, j, state, eval, syms, rhs;
1362 ucs2_t s1, s2, ns1, ns2;
1369 * Add the starting state for the reduction.
1371 _ure_add_state(1, &start, b);
1374 * Process each set of NFA states that get created.
1376 for (i = 0; i < b->states.states_used; i++) {
1377 sp = b->states.states + i;
1380 * Push the current states on the stack.
1382 for (j = 0; j < sp->st.slist_used; j++)
1383 _ure_push(sp->st.slist[j], b);
1386 * Reduce the NFA states.
1388 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1389 state = b->stack.slist[j];
1393 * This inner loop is the iterative equivalent of recursively
1394 * reducing subexpressions generated as a result of a reduction.
1397 switch (b->expr[state].type) {
1399 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1400 _ure_add_symstate(b->expr[state].lhs, ns1, b);
1409 s1 = b->expr[state].lhs;
1410 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1411 state = _ure_make_expr(_URE_OR, ns1, s1, b);
1414 s1 = b->expr[state].lhs;
1415 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1416 state = _ure_make_expr(_URE_AND, s1, ns1, b);
1419 s1 = b->expr[state].lhs;
1420 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1421 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1422 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1425 s1 = b->expr[state].lhs;
1426 s2 = b->expr[state].rhs;
1432 s1 = b->expr[state].lhs;
1433 s2 = b->expr[state].rhs;
1434 switch (b->expr[s1].type) {
1436 _ure_add_symstate(b->expr[s1].lhs, s2, b);
1444 ns1 = b->expr[s1].lhs;
1445 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1446 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1449 ns1 = b->expr[s1].lhs;
1450 ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1451 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1454 ns1 = b->expr[s1].lhs;
1455 ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1456 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1459 ns1 = b->expr[s1].lhs;
1460 ns2 = b->expr[s1].rhs;
1461 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1462 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1463 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1466 ns1 = b->expr[s1].lhs;
1467 ns2 = b->expr[s1].rhs;
1468 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1469 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1477 * Clear the state stack.
1479 while (_ure_pop(b) != _URE_NOOP) ;
1482 * Reset the state pointer because the reduction may have moved it
1483 * during a reallocation.
1485 sp = b->states.states + i;
1488 * Generate the DFA states for the symbols collected during the
1489 * current reduction.
1491 if (sp->trans_used + syms > sp->trans_size) {
1492 if (sp->trans_size == 0)
1493 sp->trans = (_ure_elt_t *)
1494 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1496 sp->trans = (_ure_elt_t *)
1497 realloc((char *) sp->trans,
1498 sizeof(_ure_elt_t) * (sp->trans_used + syms));
1499 sp->trans_size = sp->trans_used + syms;
1503 * Go through the symbol table and generate the DFA state transitions
1504 * for each symbol that has collected NFA states.
1506 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1507 sp = b->states.states + i;
1509 if (smp->states.slist_used > 0) {
1510 sp->trans[syms].lhs = smp->id;
1511 rhs = _ure_add_state(smp->states.slist_used,
1512 smp->states.slist, b);
1514 * Reset the state pointer in case the reallocation moves it
1517 sp = b->states.states + i;
1518 sp->trans[syms].rhs = rhs;
1520 smp->states.slist_used = 0;
1526 * Set the number of transitions actually used.
1528 sp->trans_used = syms;
1534 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1538 l = b->states.states[l].id;
1539 r = b->states.states[r].id;
1551 * Check to see if the equivalence pair already exists.
1553 for (tmp = 0; tmp < b->equiv_used &&
1554 (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1557 if (tmp < b->equiv_used)
1560 if (b->equiv_used == b->equiv_size) {
1561 if (b->equiv_size == 0)
1562 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1564 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1565 sizeof(_ure_equiv_t) *
1566 (b->equiv_size + 8));
1569 b->equiv[b->equiv_used].l = l;
1570 b->equiv[b->equiv_used].r = r;
1575 * Merge the DFA states that are equivalent.
1578 _ure_merge_equiv(_ure_buffer_t *b)
1580 ucs2_t i, j, k, eq, done;
1581 _ure_state_t *sp1, *sp2, *ls, *rs;
1583 for (i = 0; i < b->states.states_used; i++) {
1584 sp1 = b->states.states + i;
1587 for (j = 0; j < i; j++) {
1588 sp2 = b->states.states + j;
1592 _ure_add_equiv(i, j, b);
1593 for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1594 ls = b->states.states + b->equiv[eq].l;
1595 rs = b->states.states + b->equiv[eq].r;
1596 if (ls->accepting != rs->accepting ||
1597 ls->trans_used != rs->trans_used) {
1601 for (k = 0; k < ls->trans_used &&
1602 ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1603 if (k < ls->trans_used) {
1608 for (k = 0; k < ls->trans_used; k++)
1609 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1614 for (eq = 0; j < i && eq < b->equiv_used; eq++)
1615 b->states.states[b->equiv[eq].r].id =
1616 b->states.states[b->equiv[eq].l].id;
1620 * Renumber the states appropriately.
1622 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1624 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1627 /*************************************************************************
1631 *************************************************************************/
1634 ure_buffer_create(void)
1638 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1644 ure_buffer_free(ure_buffer_t buf)
1651 if (buf->stack.slist_size > 0)
1652 free((char *) buf->stack.slist);
1654 if (buf->expr_size > 0)
1655 free((char *) buf->expr);
1657 for (i = 0; i < buf->symtab_size; i++) {
1658 if (buf->symtab[i].states.slist_size > 0)
1659 free((char *) buf->symtab[i].states.slist);
1662 if (buf->symtab_size > 0)
1663 free((char *) buf->symtab);
1665 for (i = 0; i < buf->states.states_size; i++) {
1666 if (buf->states.states[i].trans_size > 0)
1667 free((char *) buf->states.states[i].trans);
1668 if (buf->states.states[i].st.slist_size > 0)
1669 free((char *) buf->states.states[i].st.slist);
1672 if (buf->states.states_size > 0)
1673 free((char *) buf->states.states);
1675 if (buf->equiv_size > 0)
1676 free((char *) buf->equiv);
1682 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1690 if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1694 * Reset the various fields of the compilation buffer. Default the flags
1695 * to indicate the presense of the "^$" pattern. If any other pattern
1696 * occurs, then this flag will be removed. This is done to catch this
1697 * special pattern and handle it specially when matching.
1699 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1701 buf->stack.slist_used = 0;
1704 for (i = 0; i < buf->symtab_used; i++)
1705 buf->symtab[i].states.slist_used = 0;
1706 buf->symtab_used = 0;
1708 for (i = 0; i < buf->states.states_used; i++) {
1709 buf->states.states[i].st.slist_used = 0;
1710 buf->states.states[i].trans_used = 0;
1712 buf->states.states_used = 0;
1715 * Construct the NFA. If this stage returns a 0, then an error occured or
1716 * an empty expression was passed.
1718 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1722 * Do the expression reduction to get the initial DFA.
1724 _ure_reduce(state, buf);
1727 * Merge all the equivalent DFA states.
1729 _ure_merge_equiv(buf);
1732 * Construct the minimal DFA.
1734 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1735 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1737 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1740 * Free up the NFA state groups and transfer the symbols from the buffer
1743 for (i = 0; i < buf->symtab_size; i++) {
1744 if (buf->symtab[i].states.slist_size > 0)
1745 free((char *) buf->symtab[i].states.slist);
1747 dfa->syms = buf->symtab;
1748 dfa->nsyms = buf->symtab_used;
1750 buf->symtab_used = buf->symtab_size = 0;
1753 * Collect the total number of states and transitions needed for the DFA.
1755 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1757 if (sp->id == state) {
1759 dfa->ntrans += sp->trans_used;
1765 * Allocate enough space for the states and transitions.
1767 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1769 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1772 * Actually transfer the DFA states from the buffer.
1776 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1778 if (sp->id == state) {
1780 dsp->ntrans = sp->trans_used;
1781 dsp->accepting = sp->accepting;
1784 * Add the transitions for the state.
1786 for (j = 0; j < dsp->ntrans; j++, tp++) {
1787 tp->symbol = sp->trans[j].lhs;
1788 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1800 ure_dfa_free(ure_dfa_t dfa)
1807 for (i = 0; i < dfa->nsyms; i++) {
1808 if ((dfa->syms[i].type == _URE_CCLASS ||
1809 dfa->syms[i].type == _URE_NCCLASS) &&
1810 dfa->syms[i].sym.ccl.ranges_size > 0)
1811 free((char *) dfa->syms[i].sym.ccl.ranges);
1814 free((char *) dfa->syms);
1816 if (dfa->nstates > 0)
1817 free((char *) dfa->states);
1818 if (dfa->ntrans > 0)
1819 free((char *) dfa->trans);
1824 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1826 ucs2_t i, j, k, h, l;
1831 if (dfa == 0 || out == 0)
1835 * Write all the different character classes.
1837 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1838 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1839 fprintf(out, "C%hd = ", sym->id);
1840 if (sym->sym.ccl.ranges_used > 0) {
1842 if (sym->type == _URE_NCCLASS)
1845 if (sym->props != 0) {
1846 if (sym->type == _URE_NCCLASS)
1847 fprintf(out, "\\P");
1849 fprintf(out, "\\p");
1850 for (k = h = 0; k < 32; k++) {
1851 if (sym->props & (1 << k)) {
1854 fprintf(out, "%hd", k + 1);
1862 for (k = 0, rp = sym->sym.ccl.ranges;
1863 k < sym->sym.ccl.ranges_used; k++, rp++) {
1865 * Check for UTF16 characters.
1867 if (0x10000 <= rp->min_code &&
1868 rp->min_code <= 0x10ffff) {
1869 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1870 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1871 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1873 fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1874 if (rp->max_code != rp->min_code) {
1876 if (rp->max_code >= 0x10000 &&
1877 rp->max_code <= 0x10ffff) {
1878 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1879 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1880 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1882 fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1885 if (sym->sym.ccl.ranges_used > 0)
1891 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1892 fprintf(out, "S%hd = ", i);
1893 if (sp->accepting) {
1898 for (j = 0; j < sp->ntrans; j++) {
1902 sym = dfa->syms + sp->trans[j].symbol;
1903 switch (sym->type) {
1905 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1907 * Take care of UTF16 characters.
1909 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1910 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1911 fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1913 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1916 fprintf(out, "<any> ");
1918 case _URE_BOL_ANCHOR:
1919 fprintf(out, "<bol-anchor> ");
1921 case _URE_EOL_ANCHOR:
1922 fprintf(out, "<eol-anchor> ");
1926 fprintf(out, "[C%hd] ", sym->id);
1929 fprintf(out, "S%hd", sp->trans[j].next_state);
1930 if (j + 1 < sp->ntrans)
1937 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1941 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1942 unsigned long *match_start, unsigned long *match_end)
1944 int i, j, matched, found, skip;
1945 unsigned long ms, me;
1947 ucs2_t *sp, *ep, *lp;
1952 if (dfa == 0 || text == 0)
1956 * Handle the special case of an empty string matching the "^$" pattern.
1958 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1959 *match_start = *match_end = 0;
1970 for (found = skip = 0; found == 0 && sp < ep; ) {
1975 * Check to see if this is a high surrogate that should be
1976 * combined with a following low surrogate.
1978 if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1979 0xdc00 <= *sp && *sp <= 0xdfff)
1980 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1983 * Determine if the character is non-spacing and should be skipped.
1985 if (_ure_matches_properties(_URE_NONSPACING, c) &&
1986 (flags & URE_IGNORE_NONSPACING)) {
1991 if (dfa->flags & _URE_DFA_CASEFOLD)
1992 c = _ure_tolower(c);
1995 * See if one of the transitions matches.
1997 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
1998 sym = dfa->syms + stp->trans[i].symbol;
1999 switch (sym->type) {
2001 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2006 if (c == sym->sym.chr)
2009 case _URE_BOL_ANCHOR:
2013 } else if (_ure_issep(c)) {
2014 if (c == '\r' && sp < ep && *sp == '\n')
2020 case _URE_EOL_ANCHOR:
2021 if (_ure_issep(c)) {
2023 * Put the pointer back before the separator so the match
2024 * end position will be correct. This case will also
2025 * cause the `sp' pointer to be advanced over the current
2026 * separator once the match end point has been recorded.
2034 if (sym->props != 0)
2035 matched = _ure_matches_properties(sym->props, c);
2036 for (j = 0, rp = sym->sym.ccl.ranges;
2037 j < sym->sym.ccl.ranges_used; j++, rp++) {
2038 if (rp->min_code <= c && c <= rp->max_code)
2041 if (sym->type == _URE_NCCLASS)
2051 stp = dfa->states + stp->trans[i].next_state;
2054 * If the match was an EOL anchor, adjust the pointer past the
2055 * separator that caused the match. The correct match
2056 * position has been recorded already.
2058 if (sym->type == _URE_EOL_ANCHOR) {
2060 * Skip the character that caused the match.
2065 * Handle the infamous CRLF situation.
2067 if (sp < ep && c == '\r' && *sp == '\n')
2074 if (stp->accepting == 0) {
2076 * If the last state was not accepting, then reset
2083 * The last state was accepting, so terminate the matching
2084 * loop to avoid more work.
2087 } else if (sp == ep) {
2088 if (!stp->accepting) {
2090 * This ugly hack is to make sure the end-of-line anchors
2091 * match when the source text hits the end. This is only done
2092 * if the last subexpression matches.
2094 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2095 sym = dfa->syms + stp->trans[i].symbol;
2096 if (sym->type ==_URE_EOL_ANCHOR) {
2097 stp = dfa->states + stp->trans[i].next_state;
2098 if (stp->accepting) {
2107 * Make sure any conditions that match all the way to the end
2108 * of the string match.
2122 return (ms != ~0) ? 1 : 0;