3 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * These routines are the PUBLIC DOMAIN equivalents of regex
14 * routines as found in 4.nBSD UN*X, with minor extensions.
16 * These routines are derived from various implementations found
17 * in software tools books, and Conroy's grep. They are NOT derived
18 * from licensed/restricted software.
19 * For more interesting/academic/complicated implementations,
20 * see Henry Spencer's regexp routines, or GNU Emacs pattern
23 * Modification history:
26 * Revision 1.12 1996/04/25 16:20:59 mcs
27 * make re_exec() match "" with ".*" and similar patterns
28 * hopefully this change doesn't break anything else!
30 * Revision 1.11 1994/12/14 21:33:45 mcs
31 * use new NEED_BSDREGEX
32 * fix pmatch() prototype
34 * Revision 1.10 1994/12/12 18:16:39 mcs
37 * Revision 1.9 1994/11/15 19:16:35 mcs
38 * add (CHAR) cast to make VisualC++ happy
40 * Revision 1.8 1994/11/08 21:14:32 mcs
43 * Revision 1.7 1994/07/23 19:51:24 mcs
44 * use ANSI-style inline function parameters
46 * Revision 1.6 1993/10/18 01:52:32 tim
49 * Revision 1.5 1993/09/28 21:37:54 mcs
50 * HP/UX needs the regex we include (not in its libc)
52 * Revision 1.4 1993/08/27 15:59:52 mcs
55 * Revision 1.3 1993/08/27 15:49:47 mcs
56 * added missing 0 to octal constants
57 * use unsigned char for CHAR under DOS
59 * Revision 1.2 1993/08/27 14:57:48 mcs
60 * add proto. for pmatch
62 * Revision 1.1 1993/08/18 21:20:02 mcs
65 * Revision 1.4 1991/10/17 03:56:42 oz
66 * miscellaneous changes, small cleanups etc.
68 * Revision 1.3 1989/04/01 14:18:09 oz
69 * Change all references to a dfa: this is actually an nfa.
71 * Revision 1.2 88/08/28 15:36:04 oz
72 * Use a complement bitmap to represent NCL.
73 * This removes the need to have seperate
74 * code in the pmatch case block - it is
77 * Use the actual CCL code in the CLO
78 * section of pmatch. No need for a recursive
81 * Use a bitmap table to set char bits in an
85 * re_comp: compile a regular expression into a NFA.
90 * re_exec: execute the NFA to match a pattern.
95 * re_modw change re_exec's understanding of what a "word"
96 * looks like (for \< and \>) by adding into the
97 * hidden word-syntax table.
102 * re_subs: substitute the matched portions in a new string.
104 * int re_subs(src, dst)
108 * re_fail: failure routine for re_exec.
110 * void re_fail(msg, op)
114 * Regular Expressions:
116 * [1] char matches itself, unless it is a special
117 * character (metachar): . \ [ ] * + ^ $
119 * [2] . matches any character.
121 * [3] \ matches the character following it, except
122 * when followed by a left or right round bracket,
123 * a digit 1 to 9 or a left or right angle bracket.
124 * (see [7], [8] and [9])
125 * It is used as an escape character for all
126 * other meta-characters, and itself. When used
127 * in a set ([4]), it is treated as an ordinary
130 * [4] [set] matches one of the characters in the set.
131 * If the first character in the set is "^",
132 * it matches a character NOT in the set, i.e.
133 * complements the set. A shorthand S-E is
134 * used to specify a set of characters S upto
135 * E, inclusive. The special characters "]" and
136 * "-" have no special meaning if they appear
137 * as the first chars in the set.
140 * [a-z] any lowercase alpha
142 * [^]-] any char except ] and -
144 * [^A-Z] any char except uppercase
149 * [5] * any regular expression form [1] to [4], followed by
150 * closure char (*) matches zero or more matches of
153 * [6] + same as [5], except it matches one or more.
155 * [7] a regular expression in the form [1] to [10], enclosed
156 * as \(form\) matches what form matches. The enclosure
157 * creates a set of tags, used for [8] and for
158 * pattern substution. The tagged forms are numbered
161 * [8] a \ followed by a digit 1 to 9 matches whatever a
162 * previously tagged regular expression ([7]) matched.
164 * [9] \< a regular expression starting with a \< construct
165 * \> and/or ending with a \> construct, restricts the
166 * pattern matching to the beginning of a word, and/or
167 * the end of a word. A word is defined to be a character
168 * string beginning and/or ending with the characters
169 * A-Z a-z 0-9 and _. It must also be preceded and/or
170 * followed by any character outside those mentioned.
172 * [10] a composite regular expression xy where x and y
173 * are in the form [1] to [10] matches the longest
174 * match of x followed by a match for y.
176 * [11] ^ a regular expression starting with a ^ character
177 * $ and/or ending with a $ character, restricts the
178 * pattern matching to the beginning of the line,
179 * or the end of line. [anchors] Elsewhere in the
180 * pattern, ^ and $ are treated as ordinary characters.
185 * HCR's Hugh Redelmeier has been most helpful in various
186 * stages of development. He convinced me to include BOW
187 * and EOW constructs, originally invented by Rob Pike at
188 * the University of Toronto.
191 * Software tools Kernighan & Plauger
192 * Software tools in Pascal Kernighan & Plauger
193 * Grep [rsx-11 C dist] David Conroy
194 * ed - text editor Un*x Programmer's Manual
195 * Advanced editing on Un*x B. W. Kernighan
196 * RegExp routines Henry Spencer
200 * This implementation uses a bit-set representation for character
201 * classes for speed and compactness. Each character is represented
202 * by one bit in a 128-bit block. Thus, CCL always takes a
203 * constant 16 bytes in the internal nfa, and re_exec does a single
204 * bit comparison to locate the character in the set.
209 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
210 * matches: fo foo fooo foobar fobar foxx ...
212 * pattern: fo[ob]a[rz]
213 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
214 * matches: fobar fooar fobaz fooaz
217 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
218 * matches: foo\ foo\\ foo\\\ ...
220 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
221 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
222 * matches: foo1foo foo2foo foo3foo
224 * pattern: \(fo.*\)-\1
225 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
226 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
250 * The following defines are not meant to be changeable.
251 * They are for readability only.
255 #define BITBLK MAXCHR/CHRBIT
261 #if defined( DOS ) || defined( _WIN32 )
262 typedef unsigned char CHAR;
264 typedef /*unsigned*/ char CHAR;
267 static int tagstk[MAXTAG]; /* subpat tag stack..*/
268 static CHAR nfa[MAXNFA]; /* automaton.. */
269 static int sta = NOP; /* status of lastpat */
271 static CHAR bittab[BITBLK]; /* bit table for CCL */
272 /* pre-set bits... */
273 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
278 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
281 #define badpat(x) (*nfa = END, x)
282 #define store(x) *mp++ = x
287 register char *p; /* pattern pointer */
288 register CHAR *mp=nfa; /* nfa pointer */
289 register CHAR *lp; /* saved pointer.. */
290 register CHAR *sp=nfa; /* another one.. */
292 register int tagi = 0; /* tag stack index */
293 register int tagc = 1; /* actual tag count */
296 register CHAR mask; /* xor mask -CCL/NCL */
303 return badpat("No previous regular expression");
306 for (p = pat; *p; p++) {
310 case '.': /* match any char.. */
314 case '^': /* match beginning.. */
323 case '$': /* match endofline.. */
332 case '[': /* match char class..*/
342 if (*p == '-') /* real dash */
344 if (*p == ']') /* real brac */
346 while (*p && *p != ']') {
347 if (*p == '-' && *(p+1) && *(p+1) != ']') {
355 else if (*p == '\\' && *(p+1)) {
364 return badpat("Missing ]");
366 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
367 store(mask ^ bittab[n]);
371 case '*': /* match 0 or more.. */
372 case '+': /* match 1 or more.. */
374 return badpat("Empty closure");
375 lp = sp; /* previous opcode */
376 if (*lp == CLO) /* equivalence.. */
386 return badpat("Illegal closure");
392 for (sp = mp; lp < sp; lp++)
404 case '\\': /* tags, backrefs .. */
409 tagstk[++tagi] = tagc;
414 return badpat("Too many \\(\\) pairs");
418 return badpat("Null pattern inside \\(\\)");
421 store(tagstk[tagi--]);
424 return badpat("Unmatched \\)");
431 return badpat("Null pattern inside \\<\\>");
444 if (tagi > 0 && tagstk[tagi] == n)
445 return badpat("Cyclical reference");
451 return badpat("Undetermined reference");
481 default : /* an ordinary char */
489 return badpat("Unmatched \\(");
500 static char *pmatch( char *lp, CHAR *ap );
501 #else /* NEEDPROTOS */
502 static char *pmatch();
503 #endif /* NEEDPROTOS */
507 * execute nfa to find a match.
509 * special cases: (nfa[0])
511 * Match only once, starting from the
514 * First locate the character without
515 * calling pmatch, and if found, call
516 * pmatch for the remaining string.
518 * re_comp failed, poor luser did not
519 * check for it. Fail fast.
521 * If a match is found, bopat[0] and eopat[0] are set
522 * to the beginning and the end of the matched fragment,
531 register char *ep = 0;
532 register CHAR *ap = nfa;
549 case BOL: /* anchored: match from BOL only */
552 case CHR: /* ordinary char: locate it fast */
554 while (*lp && *lp != c)
556 if (!*lp) /* if EOS, fail, else fall thru. */
558 default: /* regular matching all the way. */
560 if ((ep = pmatch(lp,ap)))
566 case END: /* munged automaton. fail always */
578 * pmatch: internal routine for the hard part
580 * This code is partly snarfed from an early grep written by
581 * David Conroy. The backref and tag stuff, and various other
582 * innovations are by oz.
584 * special case optimizations: (nfa[n], nfa[n+1])
586 * We KNOW .* will match everything upto the
587 * end of line. Thus, directly go to the end of
588 * line, without recursive pmatch calls. As in
589 * the other closure cases, the remaining pattern
590 * must be matched by moving backwards on the
591 * string recursively, to find a match for xy
592 * (x is ".*" and y is the remaining pattern)
593 * where the match satisfies the LONGEST match for
594 * x followed by a match for y.
596 * We can again scan the string forward for the
597 * single char and at the point of failure, we
598 * execute the remaining nfa recursively, same as
601 * At the end of a successful match, bopat[n] and eopat[n]
602 * are set to the beginning and end of subpatterns matched
603 * by tagged expressions (n = 1 to 9).
608 extern void re_fail();
612 * character classification table for word boundary operators BOW
613 * and EOW. the reason for not using ctype macros is that we can
614 * let the user add into our own table. see re_modw. This table
615 * is not in the bitset form, since we may wish to extend it in the
616 * future for other character classifications.
618 * TRUE for 0-9 A-Z a-z _
620 static char chrtyp[MAXCHR] = {
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
626 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
627 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
628 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
629 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
630 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
631 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
632 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
633 1, 1, 1, 0, 0, 0, 0, 0
636 #define inascii(x) (0177&(x))
637 #define iswordc(x) chrtyp[inascii(x)]
638 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
641 * skip values for CLO XXX to skip past the closure
644 #define ANYSKIP 2 /* [CLO] ANY END ... */
645 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
646 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
649 pmatch( char *lp, CHAR *ap)
651 register int op, c, n;
652 register char *e; /* extra pointer for CLO */
653 register char *bp; /* beginning of subpat.. */
654 register char *ep; /* ending of subpat.. */
655 char *are; /* to save the line ptr. */
657 while ((op = *ap++) != END)
689 if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
693 if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
715 while (*lp && c == *lp)
720 while ((c = *lp) && isinset(ap+1,c))
725 re_fail("closure: bad nfa.", *ap);
732 if (e = pmatch(lp, ap))
738 re_fail("re_exec: bad nfa.", op);
746 * add new characters into the word table to change re_exec's
747 * understanding of what a word should look like. Note that we
748 * only accept additions into the word definition.
750 * If the string parameter is 0 or null string, the table is
751 * reset back to the default containing A-Z a-z 0-9 _. [We use
752 * the compact bitset representation for the default table]
755 static CHAR deftab[16] = {
756 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
757 0376, 0377, 0377, 007
766 for (i = 0; i < MAXCHR; i++)
767 if (!isinset(deftab,i))
777 * substitute the matched portions of the src in dst.
779 * & substitute the entire matched pattern.
781 * \digit substitute a subpattern, with the given tag number.
782 * Tags are numbered from 1 to 9. If the particular
783 * tagged subpattern does not exist, null is substituted.
786 re_subs( char *src, char *dst)
793 if (!*src || !bopat[0])
805 if (c >= '0' && c <= '9') {
815 if ((bp = bopat[pin]) && (ep = eopat[pin])) {
816 while (*bp && bp < ep)
828 * symbolic - produce a symbolic dump of the nfa
832 printf("pattern: %s\n", s);
833 printf("nfacode:\n");
861 printf("\tCHR %c\n",*ap++);
873 printf("BOT: %d\n",*ap++);
876 printf("EOT: %d\n",*ap++);
885 printf("REF: %d\n",*ap++);
889 for (n = 0; n < MAXCHR; n++)
890 if (isinset(ap,(CHAR)n)) {
892 printf("^%c", n ^ 0x040);
900 printf("bad nfa. opcode %o\n", ap[-1]);
906 #endif /* MACOS or DOS or NEED_BSDREGEX */