3 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * These routines are the PUBLIC DOMAIN equivalents of regex
14 * routines as found in 4.nBSD UN*X, with minor extensions.
16 * These routines are derived from various implementations found
17 * in software tools books, and Conroy's grep. They are NOT derived
18 * from licensed/restricted software.
19 * For more interesting/academic/complicated implementations,
20 * see Henry Spencer's regexp routines, or GNU Emacs pattern
23 * Vendor Modification history:
25 * Revision 1.12 1996/04/25 16:20:59 mcs
26 * make re_exec() match "" with ".*" and similar patterns
27 * hopefully this change doesn't break anything else!
29 * Revision 1.11 1994/12/14 21:33:45 mcs
30 * use new NEED_BSDREGEX
31 * fix pmatch() prototype
33 * Revision 1.10 1994/12/12 18:16:39 mcs
36 * Revision 1.9 1994/11/15 19:16:35 mcs
37 * add (CHAR) cast to make VisualC++ happy
39 * Revision 1.8 1994/11/08 21:14:32 mcs
42 * Revision 1.7 1994/07/23 19:51:24 mcs
43 * use ANSI-style inline function parameters
45 * Revision 1.6 1993/10/18 01:52:32 tim
48 * Revision 1.5 1993/09/28 21:37:54 mcs
49 * HP/UX needs the regex we include (not in its libc)
51 * Revision 1.4 1993/08/27 15:59:52 mcs
54 * Revision 1.3 1993/08/27 15:49:47 mcs
55 * added missing 0 to octal constants
56 * use unsigned char for CHAR under DOS
58 * Revision 1.2 1993/08/27 14:57:48 mcs
59 * add proto. for pmatch
61 * Revision 1.1 1993/08/18 21:20:02 mcs
64 * Revision 1.4 1991/10/17 03:56:42 oz
65 * miscellaneous changes, small cleanups etc.
67 * Revision 1.3 1989/04/01 14:18:09 oz
68 * Change all references to a dfa: this is actually an nfa.
70 * Revision 1.2 88/08/28 15:36:04 oz
71 * Use a complement bitmap to represent NCL.
72 * This removes the need to have seperate
73 * code in the pmatch case block - it is
76 * Use the actual CCL code in the CLO
77 * section of pmatch. No need for a recursive
80 * Use a bitmap table to set char bits in an
84 * re_comp: compile a regular expression into a NFA.
89 * re_exec: execute the NFA to match a pattern.
94 * re_modw change re_exec's understanding of what a "word"
95 * looks like (for \< and \>) by adding into the
96 * hidden word-syntax table.
101 * re_subs: substitute the matched portions in a new string.
103 * int re_subs(src, dst)
107 * re_fail: failure routine for re_exec.
109 * void re_fail(msg, op)
113 * Regular Expressions:
115 * [1] char matches itself, unless it is a special
116 * character (metachar): . \ [ ] * + ^ $
118 * [2] . matches any character.
120 * [3] \ matches the character following it, except
121 * when followed by a left or right round bracket,
122 * a digit 1 to 9 or a left or right angle bracket.
123 * (see [7], [8] and [9])
124 * It is used as an escape character for all
125 * other meta-characters, and itself. When used
126 * in a set ([4]), it is treated as an ordinary
129 * [4] [set] matches one of the characters in the set.
130 * If the first character in the set is "^",
131 * it matches a character NOT in the set, i.e.
132 * complements the set. A shorthand S-E is
133 * used to specify a set of characters S upto
134 * E, inclusive. The special characters "]" and
135 * "-" have no special meaning if they appear
136 * as the first chars in the set.
139 * [a-z] any lowercase alpha
141 * [^]-] any char except ] and -
143 * [^A-Z] any char except uppercase
148 * [5] * any regular expression form [1] to [4], followed by
149 * closure char (*) matches zero or more matches of
152 * [6] + same as [5], except it matches one or more.
154 * [7] a regular expression in the form [1] to [10], enclosed
155 * as \(form\) matches what form matches. The enclosure
156 * creates a set of tags, used for [8] and for
157 * pattern substution. The tagged forms are numbered
160 * [8] a \ followed by a digit 1 to 9 matches whatever a
161 * previously tagged regular expression ([7]) matched.
163 * [9] \< a regular expression starting with a \< construct
164 * \> and/or ending with a \> construct, restricts the
165 * pattern matching to the beginning of a word, and/or
166 * the end of a word. A word is defined to be a character
167 * string beginning and/or ending with the characters
168 * A-Z a-z 0-9 and _. It must also be preceded and/or
169 * followed by any character outside those mentioned.
171 * [10] a composite regular expression xy where x and y
172 * are in the form [1] to [10] matches the longest
173 * match of x followed by a match for y.
175 * [11] ^ a regular expression starting with a ^ character
176 * $ and/or ending with a $ character, restricts the
177 * pattern matching to the beginning of the line,
178 * or the end of line. [anchors] Elsewhere in the
179 * pattern, ^ and $ are treated as ordinary characters.
184 * HCR's Hugh Redelmeier has been most helpful in various
185 * stages of development. He convinced me to include BOW
186 * and EOW constructs, originally invented by Rob Pike at
187 * the University of Toronto.
190 * Software tools Kernighan & Plauger
191 * Software tools in Pascal Kernighan & Plauger
192 * Grep [rsx-11 C dist] David Conroy
193 * ed - text editor Un*x Programmer's Manual
194 * Advanced editing on Un*x B. W. Kernighan
195 * RegExp routines Henry Spencer
199 * This implementation uses a bit-set representation for character
200 * classes for speed and compactness. Each character is represented
201 * by one bit in a 128-bit block. Thus, CCL always takes a
202 * constant 16 bytes in the internal nfa, and re_exec does a single
203 * bit comparison to locate the character in the set.
208 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
209 * matches: fo foo fooo foobar fobar foxx ...
211 * pattern: fo[ob]a[rz]
212 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
213 * matches: fobar fooar fobaz fooaz
216 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
217 * matches: foo\ foo\\ foo\\\ ...
219 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
220 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
221 * matches: foo1foo foo2foo foo3foo
223 * pattern: \(fo.*\)-\1
224 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
225 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
249 * The following defines are not meant to be changeable.
250 * They are for readability only.
254 #define BITBLK MAXCHR/CHRBIT
260 #if defined( DOS ) || defined( _WIN32 )
261 typedef unsigned char CHAR;
263 typedef /*unsigned*/ char CHAR;
266 static int tagstk[MAXTAG]; /* subpat tag stack..*/
267 static CHAR nfa[MAXNFA]; /* automaton.. */
268 static int sta = NOP; /* status of lastpat */
270 static CHAR bittab[BITBLK]; /* bit table for CCL */
271 /* pre-set bits... */
272 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
277 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
280 #define badpat(x) (*nfa = END, x)
281 #define store(x) *mp++ = x
286 register char *p; /* pattern pointer */
287 register CHAR *mp=nfa; /* nfa pointer */
288 register CHAR *lp; /* saved pointer.. */
289 register CHAR *sp=nfa; /* another one.. */
291 register int tagi = 0; /* tag stack index */
292 register int tagc = 1; /* actual tag count */
295 register CHAR mask; /* xor mask -CCL/NCL */
302 return badpat("No previous regular expression");
305 for (p = pat; *p; p++) {
309 case '.': /* match any char.. */
313 case '^': /* match beginning.. */
322 case '$': /* match endofline.. */
331 case '[': /* match char class..*/
341 if (*p == '-') /* real dash */
343 if (*p == ']') /* real brac */
345 while (*p && *p != ']') {
346 if (*p == '-' && *(p+1) && *(p+1) != ']') {
354 else if (*p == '\\' && *(p+1)) {
363 return badpat("Missing ]");
365 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
366 store(mask ^ bittab[n]);
370 case '*': /* match 0 or more.. */
371 case '+': /* match 1 or more.. */
373 return badpat("Empty closure");
374 lp = sp; /* previous opcode */
375 if (*lp == CLO) /* equivalence.. */
385 return badpat("Illegal closure");
391 for (sp = mp; lp < sp; lp++)
403 case '\\': /* tags, backrefs .. */
408 tagstk[++tagi] = tagc;
413 return badpat("Too many \\(\\) pairs");
417 return badpat("Null pattern inside \\(\\)");
420 store(tagstk[tagi--]);
423 return badpat("Unmatched \\)");
430 return badpat("Null pattern inside \\<\\>");
443 if (tagi > 0 && tagstk[tagi] == n)
444 return badpat("Cyclical reference");
450 return badpat("Undetermined reference");
480 default : /* an ordinary char */
488 return badpat("Unmatched \\(");
499 static char *pmatch( char *lp, CHAR *ap );
500 #else /* NEEDPROTOS */
501 static char *pmatch();
502 #endif /* NEEDPROTOS */
506 * execute nfa to find a match.
508 * special cases: (nfa[0])
510 * Match only once, starting from the
513 * First locate the character without
514 * calling pmatch, and if found, call
515 * pmatch for the remaining string.
517 * re_comp failed, poor luser did not
518 * check for it. Fail fast.
520 * If a match is found, bopat[0] and eopat[0] are set
521 * to the beginning and the end of the matched fragment,
530 register char *ep = 0;
531 register CHAR *ap = nfa;
548 case BOL: /* anchored: match from BOL only */
551 case CHR: /* ordinary char: locate it fast */
553 while (*lp && *lp != c)
555 if (!*lp) /* if EOS, fail, else fall thru. */
557 default: /* regular matching all the way. */
559 if ((ep = pmatch(lp,ap)))
565 case END: /* munged automaton. fail always */
577 * pmatch: internal routine for the hard part
579 * This code is partly snarfed from an early grep written by
580 * David Conroy. The backref and tag stuff, and various other
581 * innovations are by oz.
583 * special case optimizations: (nfa[n], nfa[n+1])
585 * We KNOW .* will match everything upto the
586 * end of line. Thus, directly go to the end of
587 * line, without recursive pmatch calls. As in
588 * the other closure cases, the remaining pattern
589 * must be matched by moving backwards on the
590 * string recursively, to find a match for xy
591 * (x is ".*" and y is the remaining pattern)
592 * where the match satisfies the LONGEST match for
593 * x followed by a match for y.
595 * We can again scan the string forward for the
596 * single char and at the point of failure, we
597 * execute the remaining nfa recursively, same as
600 * At the end of a successful match, bopat[n] and eopat[n]
601 * are set to the beginning and end of subpatterns matched
602 * by tagged expressions (n = 1 to 9).
607 extern void re_fail();
611 * character classification table for word boundary operators BOW
612 * and EOW. the reason for not using ctype macros is that we can
613 * let the user add into our own table. see re_modw. This table
614 * is not in the bitset form, since we may wish to extend it in the
615 * future for other character classifications.
617 * TRUE for 0-9 A-Z a-z _
619 static char chrtyp[MAXCHR] = {
620 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
625 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
626 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
627 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
628 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
629 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
630 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
631 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
632 1, 1, 1, 0, 0, 0, 0, 0
635 #define inascii(x) (0177&(x))
636 #define iswordc(x) chrtyp[inascii(x)]
637 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
640 * skip values for CLO XXX to skip past the closure
643 #define ANYSKIP 2 /* [CLO] ANY END ... */
644 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
645 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
648 pmatch( char *lp, CHAR *ap)
650 register int op, c, n;
651 register char *e; /* extra pointer for CLO */
652 register char *bp; /* beginning of subpat.. */
653 register char *ep; /* ending of subpat.. */
654 char *are; /* to save the line ptr. */
656 while ((op = *ap++) != END)
688 if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
692 if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
714 while (*lp && c == *lp)
719 while ((c = *lp) && isinset(ap+1,c))
724 re_fail("closure: bad nfa.", *ap);
731 if (e = pmatch(lp, ap))
737 re_fail("re_exec: bad nfa.", op);
745 * add new characters into the word table to change re_exec's
746 * understanding of what a word should look like. Note that we
747 * only accept additions into the word definition.
749 * If the string parameter is 0 or null string, the table is
750 * reset back to the default containing A-Z a-z 0-9 _. [We use
751 * the compact bitset representation for the default table]
754 static CHAR deftab[16] = {
755 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
756 0376, 0377, 0377, 007
765 for (i = 0; i < MAXCHR; i++)
766 if (!isinset(deftab,i))
776 * substitute the matched portions of the src in dst.
778 * & substitute the entire matched pattern.
780 * \digit substitute a subpattern, with the given tag number.
781 * Tags are numbered from 1 to 9. If the particular
782 * tagged subpattern does not exist, null is substituted.
785 re_subs( char *src, char *dst)
792 if (!*src || !bopat[0])
804 if (c >= '0' && c <= '9') {
814 if ((bp = bopat[pin]) && (ep = eopat[pin])) {
815 while (*bp && bp < ep)
827 * symbolic - produce a symbolic dump of the nfa
831 printf("pattern: %s\n", s);
832 printf("nfacode:\n");
860 printf("\tCHR %c\n",*ap++);
872 printf("BOT: %d\n",*ap++);
875 printf("EOT: %d\n",*ap++);
884 printf("REF: %d\n",*ap++);
888 for (n = 0; n < MAXCHR; n++)
889 if (isinset(ap,(CHAR)n)) {
891 printf("^%c", n ^ 0x040);
899 printf("bad nfa. opcode %o\n", ap[-1]);
905 #endif /* MACOS or DOS or NEED_BSDREGEX */