3 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * These routines are the PUBLIC DOMAIN equivalents of regex
14 * routines as found in 4.nBSD UN*X, with minor extensions.
16 * These routines are derived from various implementations found
17 * in software tools books, and Conroy's grep. They are NOT derived
18 * from licensed/restricted software.
19 * For more interesting/academic/complicated implementations,
20 * see Henry Spencer's regexp routines, or GNU Emacs pattern
23 * Modification history:
26 * Revision 1.2 1996/04/25 16:24:11 mcs
27 * make re_exec() match "" with ".*" and similar patterns
28 * hopefully this change doesn't break anything else!
30 * Revision 1.1 1995/02/03 15:56:52 tim
33 * Revision 1.11 1994/12/14 21:33:45 mcs
34 * use new NEED_BSDREGEX
35 * fix pmatch() prototype
37 * Revision 1.10 1994/12/12 18:16:39 mcs
40 * Revision 1.9 1994/11/15 19:16:35 mcs
41 * add (CHAR) cast to make VisualC++ happy
43 * Revision 1.8 1994/11/08 21:14:32 mcs
46 * Revision 1.7 1994/07/23 19:51:24 mcs
47 * use ANSI-style inline function parameters
49 * Revision 1.6 1993/10/18 01:52:32 tim
52 * Revision 1.5 1993/09/28 21:37:54 mcs
53 * HP/UX needs the regex we include (not in its libc)
55 * Revision 1.4 1993/08/27 15:59:52 mcs
58 * Revision 1.3 1993/08/27 15:49:47 mcs
59 * added missing 0 to octal constants
60 * use unsigned char for CHAR under DOS
62 * Revision 1.2 1993/08/27 14:57:48 mcs
63 * add proto. for pmatch
65 * Revision 1.1 1993/08/18 21:20:02 mcs
68 * Revision 1.4 1991/10/17 03:56:42 oz
69 * miscellaneous changes, small cleanups etc.
71 * Revision 1.3 1989/04/01 14:18:09 oz
72 * Change all references to a dfa: this is actually an nfa.
74 * Revision 1.2 88/08/28 15:36:04 oz
75 * Use a complement bitmap to represent NCL.
76 * This removes the need to have seperate
77 * code in the pmatch case block - it is
80 * Use the actual CCL code in the CLO
81 * section of pmatch. No need for a recursive
84 * Use a bitmap table to set char bits in an
88 * re_comp: compile a regular expression into a NFA.
93 * re_exec: execute the NFA to match a pattern.
98 * re_modw change re_exec's understanding of what a "word"
99 * looks like (for \< and \>) by adding into the
100 * hidden word-syntax table.
105 * re_subs: substitute the matched portions in a new string.
107 * int re_subs(src, dst)
111 * re_fail: failure routine for re_exec.
113 * void re_fail(msg, op)
117 * Regular Expressions:
119 * [1] char matches itself, unless it is a special
120 * character (metachar): . \ [ ] * + ^ $
122 * [2] . matches any character.
124 * [3] \ matches the character following it, except
125 * when followed by a left or right round bracket,
126 * a digit 1 to 9 or a left or right angle bracket.
127 * (see [7], [8] and [9])
128 * It is used as an escape character for all
129 * other meta-characters, and itself. When used
130 * in a set ([4]), it is treated as an ordinary
133 * [4] [set] matches one of the characters in the set.
134 * If the first character in the set is "^",
135 * it matches a character NOT in the set, i.e.
136 * complements the set. A shorthand S-E is
137 * used to specify a set of characters S upto
138 * E, inclusive. The special characters "]" and
139 * "-" have no special meaning if they appear
140 * as the first chars in the set.
143 * [a-z] any lowercase alpha
145 * [^]-] any char except ] and -
147 * [^A-Z] any char except uppercase
152 * [5] * any regular expression form [1] to [4], followed by
153 * closure char (*) matches zero or more matches of
156 * [6] + same as [5], except it matches one or more.
158 * [7] a regular expression in the form [1] to [10], enclosed
159 * as \(form\) matches what form matches. The enclosure
160 * creates a set of tags, used for [8] and for
161 * pattern substution. The tagged forms are numbered
164 * [8] a \ followed by a digit 1 to 9 matches whatever a
165 * previously tagged regular expression ([7]) matched.
167 * [9] \< a regular expression starting with a \< construct
168 * \> and/or ending with a \> construct, restricts the
169 * pattern matching to the beginning of a word, and/or
170 * the end of a word. A word is defined to be a character
171 * string beginning and/or ending with the characters
172 * A-Z a-z 0-9 and _. It must also be preceded and/or
173 * followed by any character outside those mentioned.
175 * [10] a composite regular expression xy where x and y
176 * are in the form [1] to [10] matches the longest
177 * match of x followed by a match for y.
179 * [11] ^ a regular expression starting with a ^ character
180 * $ and/or ending with a $ character, restricts the
181 * pattern matching to the beginning of the line,
182 * or the end of line. [anchors] Elsewhere in the
183 * pattern, ^ and $ are treated as ordinary characters.
188 * HCR's Hugh Redelmeier has been most helpful in various
189 * stages of development. He convinced me to include BOW
190 * and EOW constructs, originally invented by Rob Pike at
191 * the University of Toronto.
194 * Software tools Kernighan & Plauger
195 * Software tools in Pascal Kernighan & Plauger
196 * Grep [rsx-11 C dist] David Conroy
197 * ed - text editor Un*x Programmer's Manual
198 * Advanced editing on Un*x B. W. Kernighan
199 * RegExp routines Henry Spencer
203 * This implementation uses a bit-set representation for character
204 * classes for speed and compactness. Each character is represented
205 * by one bit in a 128-bit block. Thus, CCL always takes a
206 * constant 16 bytes in the internal nfa, and re_exec does a single
207 * bit comparison to locate the character in the set.
212 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
213 * matches: fo foo fooo foobar fobar foxx ...
215 * pattern: fo[ob]a[rz]
216 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
217 * matches: fobar fooar fobaz fooaz
220 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
221 * matches: foo\ foo\\ foo\\\ ...
223 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
224 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
225 * matches: foo1foo foo2foo foo3foo
227 * pattern: \(fo.*\)-\1
228 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
229 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
253 * The following defines are not meant to be changeable.
254 * They are for readability only.
258 #define BITBLK MAXCHR/CHRBIT
264 #if defined( DOS ) || defined( _WIN32 )
265 typedef unsigned char CHAR;
267 typedef /*unsigned*/ char CHAR;
270 static int tagstk[MAXTAG]; /* subpat tag stack..*/
271 static CHAR nfa[MAXNFA]; /* automaton.. */
272 static int sta = NOP; /* status of lastpat */
274 static CHAR bittab[BITBLK]; /* bit table for CCL */
275 /* pre-set bits... */
276 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
281 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
284 #define badpat(x) (*nfa = END, x)
285 #define store(x) *mp++ = x
290 register char *p; /* pattern pointer */
291 register CHAR *mp=nfa; /* nfa pointer */
292 register CHAR *lp; /* saved pointer.. */
293 register CHAR *sp=nfa; /* another one.. */
295 register int tagi = 0; /* tag stack index */
296 register int tagc = 1; /* actual tag count */
299 register CHAR mask; /* xor mask -CCL/NCL */
306 return badpat("No previous regular expression");
309 for (p = pat; *p; p++) {
313 case '.': /* match any char.. */
317 case '^': /* match beginning.. */
326 case '$': /* match endofline.. */
335 case '[': /* match char class..*/
345 if (*p == '-') /* real dash */
347 if (*p == ']') /* real brac */
349 while (*p && *p != ']') {
350 if (*p == '-' && *(p+1) && *(p+1) != ']') {
358 else if (*p == '\\' && *(p+1)) {
367 return badpat("Missing ]");
369 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
370 store(mask ^ bittab[n]);
374 case '*': /* match 0 or more.. */
375 case '+': /* match 1 or more.. */
377 return badpat("Empty closure");
378 lp = sp; /* previous opcode */
379 if (*lp == CLO) /* equivalence.. */
389 return badpat("Illegal closure");
395 for (sp = mp; lp < sp; lp++)
407 case '\\': /* tags, backrefs .. */
412 tagstk[++tagi] = tagc;
417 return badpat("Too many \\(\\) pairs");
421 return badpat("Null pattern inside \\(\\)");
424 store(tagstk[tagi--]);
427 return badpat("Unmatched \\)");
434 return badpat("Null pattern inside \\<\\>");
447 if (tagi > 0 && tagstk[tagi] == n)
448 return badpat("Cyclical reference");
454 return badpat("Undetermined reference");
484 default : /* an ordinary char */
492 return badpat("Unmatched \\(");
503 static char *pmatch( char *lp, CHAR *ap );
504 #else /* NEEDPROTOS */
505 static char *pmatch();
506 #endif /* NEEDPROTOS */
510 * execute nfa to find a match.
512 * special cases: (nfa[0])
514 * Match only once, starting from the
517 * First locate the character without
518 * calling pmatch, and if found, call
519 * pmatch for the remaining string.
521 * re_comp failed, poor luser did not
522 * check for it. Fail fast.
524 * If a match is found, bopat[0] and eopat[0] are set
525 * to the beginning and the end of the matched fragment,
534 register char *ep = 0;
535 register CHAR *ap = nfa;
552 case BOL: /* anchored: match from BOL only */
555 case CHR: /* ordinary char: locate it fast */
557 while (*lp && *lp != c)
559 if (!*lp) /* if EOS, fail, else fall thru. */
561 default: /* regular matching all the way. */
563 if ((ep = pmatch(lp,ap)))
569 case END: /* munged automaton. fail always */
581 * pmatch: internal routine for the hard part
583 * This code is partly snarfed from an early grep written by
584 * David Conroy. The backref and tag stuff, and various other
585 * innovations are by oz.
587 * special case optimizations: (nfa[n], nfa[n+1])
589 * We KNOW .* will match everything upto the
590 * end of line. Thus, directly go to the end of
591 * line, without recursive pmatch calls. As in
592 * the other closure cases, the remaining pattern
593 * must be matched by moving backwards on the
594 * string recursively, to find a match for xy
595 * (x is ".*" and y is the remaining pattern)
596 * where the match satisfies the LONGEST match for
597 * x followed by a match for y.
599 * We can again scan the string forward for the
600 * single char and at the point of failure, we
601 * execute the remaining nfa recursively, same as
604 * At the end of a successful match, bopat[n] and eopat[n]
605 * are set to the beginning and end of subpatterns matched
606 * by tagged expressions (n = 1 to 9).
611 extern void re_fail();
615 * character classification table for word boundary operators BOW
616 * and EOW. the reason for not using ctype macros is that we can
617 * let the user add into our own table. see re_modw. This table
618 * is not in the bitset form, since we may wish to extend it in the
619 * future for other character classifications.
621 * TRUE for 0-9 A-Z a-z _
623 static char chrtyp[MAXCHR] = {
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
629 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
630 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
631 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
632 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
633 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
634 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
635 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
636 1, 1, 1, 0, 0, 0, 0, 0
639 #define inascii(x) (0177&(x))
640 #define iswordc(x) chrtyp[inascii(x)]
641 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
644 * skip values for CLO XXX to skip past the closure
647 #define ANYSKIP 2 /* [CLO] ANY END ... */
648 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
649 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
652 pmatch( char *lp, CHAR *ap)
654 register int op, c, n;
655 register char *e; /* extra pointer for CLO */
656 register char *bp; /* beginning of subpat.. */
657 register char *ep; /* ending of subpat.. */
658 char *are; /* to save the line ptr. */
660 while ((op = *ap++) != END)
692 if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
696 if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
718 while (*lp && c == *lp)
723 while ((c = *lp) && isinset(ap+1,c))
728 re_fail("closure: bad nfa.", *ap);
735 if (e = pmatch(lp, ap))
741 re_fail("re_exec: bad nfa.", op);
749 * add new characters into the word table to change re_exec's
750 * understanding of what a word should look like. Note that we
751 * only accept additions into the word definition.
753 * If the string parameter is 0 or null string, the table is
754 * reset back to the default containing A-Z a-z 0-9 _. [We use
755 * the compact bitset representation for the default table]
758 static CHAR deftab[16] = {
759 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
760 0376, 0377, 0377, 007
769 for (i = 0; i < MAXCHR; i++)
770 if (!isinset(deftab,i))
780 * substitute the matched portions of the src in dst.
782 * & substitute the entire matched pattern.
784 * \digit substitute a subpattern, with the given tag number.
785 * Tags are numbered from 1 to 9. If the particular
786 * tagged subpattern does not exist, null is substituted.
789 re_subs( char *src, char *dst)
796 if (!*src || !bopat[0])
808 if (c >= '0' && c <= '9') {
818 if ((bp = bopat[pin]) && (ep = eopat[pin])) {
819 while (*bp && bp < ep)
831 * symbolic - produce a symbolic dump of the nfa
835 printf("pattern: %s\n", s);
836 printf("nfacode:\n");
864 printf("\tCHR %c\n",*ap++);
876 printf("BOT: %d\n",*ap++);
879 printf("EOT: %d\n",*ap++);
888 printf("REF: %d\n",*ap++);
892 for (n = 0; n < MAXCHR; n++)
893 if (isinset(ap,(CHAR)n)) {
895 printf("^%c", n ^ 0x040);
903 printf("bad nfa. opcode %o\n", ap[-1]);
909 #endif /* MACOS or DOS or NEED_BSDREGEX */