3 #if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || defined( NEED_BSDREGEX )
7 * regex - Regular expression pattern matching and replacement
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
13 * These routines are the PUBLIC DOMAIN equivalents of regex
14 * routines as found in 4.nBSD UN*X, with minor extensions.
16 * These routines are derived from various implementations found
17 * in software tools books, and Conroy's grep. They are NOT derived
18 * from licensed/restricted software.
19 * For more interesting/academic/complicated implementations,
20 * see Henry Spencer's regexp routines, or GNU Emacs pattern
23 * Vendor Modification history:
25 * Revision 1.2 1996/04/25 16:24:11 mcs
26 * make re_exec() match "" with ".*" and similar patterns
27 * hopefully this change doesn't break anything else!
29 * Revision 1.1 1995/02/03 15:56:52 tim
32 * Revision 1.11 1994/12/14 21:33:45 mcs
33 * use new NEED_BSDREGEX
34 * fix pmatch() prototype
36 * Revision 1.10 1994/12/12 18:16:39 mcs
39 * Revision 1.9 1994/11/15 19:16:35 mcs
40 * add (CHAR) cast to make VisualC++ happy
42 * Revision 1.8 1994/11/08 21:14:32 mcs
45 * Revision 1.7 1994/07/23 19:51:24 mcs
46 * use ANSI-style inline function parameters
48 * Revision 1.6 1993/10/18 01:52:32 tim
51 * Revision 1.5 1993/09/28 21:37:54 mcs
52 * HP/UX needs the regex we include (not in its libc)
54 * Revision 1.4 1993/08/27 15:59:52 mcs
57 * Revision 1.3 1993/08/27 15:49:47 mcs
58 * added missing 0 to octal constants
59 * use unsigned char for CHAR under DOS
61 * Revision 1.2 1993/08/27 14:57:48 mcs
62 * add proto. for pmatch
64 * Revision 1.1 1993/08/18 21:20:02 mcs
67 * Revision 1.4 1991/10/17 03:56:42 oz
68 * miscellaneous changes, small cleanups etc.
70 * Revision 1.3 1989/04/01 14:18:09 oz
71 * Change all references to a dfa: this is actually an nfa.
73 * Revision 1.2 88/08/28 15:36:04 oz
74 * Use a complement bitmap to represent NCL.
75 * This removes the need to have seperate
76 * code in the pmatch case block - it is
79 * Use the actual CCL code in the CLO
80 * section of pmatch. No need for a recursive
83 * Use a bitmap table to set char bits in an
87 * re_comp: compile a regular expression into a NFA.
92 * re_exec: execute the NFA to match a pattern.
97 * re_modw change re_exec's understanding of what a "word"
98 * looks like (for \< and \>) by adding into the
99 * hidden word-syntax table.
104 * re_subs: substitute the matched portions in a new string.
106 * int re_subs(src, dst)
110 * re_fail: failure routine for re_exec.
112 * void re_fail(msg, op)
116 * Regular Expressions:
118 * [1] char matches itself, unless it is a special
119 * character (metachar): . \ [ ] * + ^ $
121 * [2] . matches any character.
123 * [3] \ matches the character following it, except
124 * when followed by a left or right round bracket,
125 * a digit 1 to 9 or a left or right angle bracket.
126 * (see [7], [8] and [9])
127 * It is used as an escape character for all
128 * other meta-characters, and itself. When used
129 * in a set ([4]), it is treated as an ordinary
132 * [4] [set] matches one of the characters in the set.
133 * If the first character in the set is "^",
134 * it matches a character NOT in the set, i.e.
135 * complements the set. A shorthand S-E is
136 * used to specify a set of characters S upto
137 * E, inclusive. The special characters "]" and
138 * "-" have no special meaning if they appear
139 * as the first chars in the set.
142 * [a-z] any lowercase alpha
144 * [^]-] any char except ] and -
146 * [^A-Z] any char except uppercase
151 * [5] * any regular expression form [1] to [4], followed by
152 * closure char (*) matches zero or more matches of
155 * [6] + same as [5], except it matches one or more.
157 * [7] a regular expression in the form [1] to [10], enclosed
158 * as \(form\) matches what form matches. The enclosure
159 * creates a set of tags, used for [8] and for
160 * pattern substution. The tagged forms are numbered
163 * [8] a \ followed by a digit 1 to 9 matches whatever a
164 * previously tagged regular expression ([7]) matched.
166 * [9] \< a regular expression starting with a \< construct
167 * \> and/or ending with a \> construct, restricts the
168 * pattern matching to the beginning of a word, and/or
169 * the end of a word. A word is defined to be a character
170 * string beginning and/or ending with the characters
171 * A-Z a-z 0-9 and _. It must also be preceded and/or
172 * followed by any character outside those mentioned.
174 * [10] a composite regular expression xy where x and y
175 * are in the form [1] to [10] matches the longest
176 * match of x followed by a match for y.
178 * [11] ^ a regular expression starting with a ^ character
179 * $ and/or ending with a $ character, restricts the
180 * pattern matching to the beginning of the line,
181 * or the end of line. [anchors] Elsewhere in the
182 * pattern, ^ and $ are treated as ordinary characters.
187 * HCR's Hugh Redelmeier has been most helpful in various
188 * stages of development. He convinced me to include BOW
189 * and EOW constructs, originally invented by Rob Pike at
190 * the University of Toronto.
193 * Software tools Kernighan & Plauger
194 * Software tools in Pascal Kernighan & Plauger
195 * Grep [rsx-11 C dist] David Conroy
196 * ed - text editor Un*x Programmer's Manual
197 * Advanced editing on Un*x B. W. Kernighan
198 * RegExp routines Henry Spencer
202 * This implementation uses a bit-set representation for character
203 * classes for speed and compactness. Each character is represented
204 * by one bit in a 128-bit block. Thus, CCL always takes a
205 * constant 16 bytes in the internal nfa, and re_exec does a single
206 * bit comparison to locate the character in the set.
211 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
212 * matches: fo foo fooo foobar fobar foxx ...
214 * pattern: fo[ob]a[rz]
215 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
216 * matches: fobar fooar fobaz fooaz
219 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
220 * matches: foo\ foo\\ foo\\\ ...
222 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
223 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
224 * matches: foo1foo foo2foo foo3foo
226 * pattern: \(fo.*\)-\1
227 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
228 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
252 * The following defines are not meant to be changeable.
253 * They are for readability only.
257 #define BITBLK MAXCHR/CHRBIT
263 #if defined( DOS ) || defined( _WIN32 )
264 typedef unsigned char CHAR;
266 typedef /*unsigned*/ char CHAR;
269 static int tagstk[MAXTAG]; /* subpat tag stack..*/
270 static CHAR nfa[MAXNFA]; /* automaton.. */
271 static int sta = NOP; /* status of lastpat */
273 static CHAR bittab[BITBLK]; /* bit table for CCL */
274 /* pre-set bits... */
275 static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
280 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
283 #define badpat(x) (*nfa = END, x)
284 #define store(x) *mp++ = x
289 register char *p; /* pattern pointer */
290 register CHAR *mp=nfa; /* nfa pointer */
291 register CHAR *lp; /* saved pointer.. */
292 register CHAR *sp=nfa; /* another one.. */
294 register int tagi = 0; /* tag stack index */
295 register int tagc = 1; /* actual tag count */
298 register CHAR mask; /* xor mask -CCL/NCL */
305 return badpat("No previous regular expression");
308 for (p = pat; *p; p++) {
312 case '.': /* match any char.. */
316 case '^': /* match beginning.. */
325 case '$': /* match endofline.. */
334 case '[': /* match char class..*/
344 if (*p == '-') /* real dash */
346 if (*p == ']') /* real brac */
348 while (*p && *p != ']') {
349 if (*p == '-' && *(p+1) && *(p+1) != ']') {
357 else if (*p == '\\' && *(p+1)) {
366 return badpat("Missing ]");
368 for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
369 store(mask ^ bittab[n]);
373 case '*': /* match 0 or more.. */
374 case '+': /* match 1 or more.. */
376 return badpat("Empty closure");
377 lp = sp; /* previous opcode */
378 if (*lp == CLO) /* equivalence.. */
388 return badpat("Illegal closure");
394 for (sp = mp; lp < sp; lp++)
406 case '\\': /* tags, backrefs .. */
411 tagstk[++tagi] = tagc;
416 return badpat("Too many \\(\\) pairs");
420 return badpat("Null pattern inside \\(\\)");
423 store(tagstk[tagi--]);
426 return badpat("Unmatched \\)");
433 return badpat("Null pattern inside \\<\\>");
446 if (tagi > 0 && tagstk[tagi] == n)
447 return badpat("Cyclical reference");
453 return badpat("Undetermined reference");
483 default : /* an ordinary char */
491 return badpat("Unmatched \\(");
502 static char *pmatch( char *lp, CHAR *ap );
503 #else /* NEEDPROTOS */
504 static char *pmatch();
505 #endif /* NEEDPROTOS */
509 * execute nfa to find a match.
511 * special cases: (nfa[0])
513 * Match only once, starting from the
516 * First locate the character without
517 * calling pmatch, and if found, call
518 * pmatch for the remaining string.
520 * re_comp failed, poor luser did not
521 * check for it. Fail fast.
523 * If a match is found, bopat[0] and eopat[0] are set
524 * to the beginning and the end of the matched fragment,
533 register char *ep = 0;
534 register CHAR *ap = nfa;
551 case BOL: /* anchored: match from BOL only */
554 case CHR: /* ordinary char: locate it fast */
556 while (*lp && *lp != c)
558 if (!*lp) /* if EOS, fail, else fall thru. */
560 default: /* regular matching all the way. */
562 if ((ep = pmatch(lp,ap)))
568 case END: /* munged automaton. fail always */
580 * pmatch: internal routine for the hard part
582 * This code is partly snarfed from an early grep written by
583 * David Conroy. The backref and tag stuff, and various other
584 * innovations are by oz.
586 * special case optimizations: (nfa[n], nfa[n+1])
588 * We KNOW .* will match everything upto the
589 * end of line. Thus, directly go to the end of
590 * line, without recursive pmatch calls. As in
591 * the other closure cases, the remaining pattern
592 * must be matched by moving backwards on the
593 * string recursively, to find a match for xy
594 * (x is ".*" and y is the remaining pattern)
595 * where the match satisfies the LONGEST match for
596 * x followed by a match for y.
598 * We can again scan the string forward for the
599 * single char and at the point of failure, we
600 * execute the remaining nfa recursively, same as
603 * At the end of a successful match, bopat[n] and eopat[n]
604 * are set to the beginning and end of subpatterns matched
605 * by tagged expressions (n = 1 to 9).
610 extern void re_fail();
614 * character classification table for word boundary operators BOW
615 * and EOW. the reason for not using ctype macros is that we can
616 * let the user add into our own table. see re_modw. This table
617 * is not in the bitset form, since we may wish to extend it in the
618 * future for other character classifications.
620 * TRUE for 0-9 A-Z a-z _
622 static char chrtyp[MAXCHR] = {
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
628 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
629 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
630 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
631 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
632 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
633 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
634 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
635 1, 1, 1, 0, 0, 0, 0, 0
638 #define inascii(x) (0177&(x))
639 #define iswordc(x) chrtyp[inascii(x)]
640 #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
643 * skip values for CLO XXX to skip past the closure
646 #define ANYSKIP 2 /* [CLO] ANY END ... */
647 #define CHRSKIP 3 /* [CLO] CHR chr END ... */
648 #define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
651 pmatch( char *lp, CHAR *ap)
653 register int op, c, n;
654 register char *e; /* extra pointer for CLO */
655 register char *bp; /* beginning of subpat.. */
656 register char *ep; /* ending of subpat.. */
657 char *are; /* to save the line ptr. */
659 while ((op = *ap++) != END)
691 if (lp!=bol && iswordc(lp[-1]) || !iswordc(*lp))
695 if (lp==bol || !iswordc(lp[-1]) || iswordc(*lp))
717 while (*lp && c == *lp)
722 while ((c = *lp) && isinset(ap+1,c))
727 re_fail("closure: bad nfa.", *ap);
734 if (e = pmatch(lp, ap))
740 re_fail("re_exec: bad nfa.", op);
748 * add new characters into the word table to change re_exec's
749 * understanding of what a word should look like. Note that we
750 * only accept additions into the word definition.
752 * If the string parameter is 0 or null string, the table is
753 * reset back to the default containing A-Z a-z 0-9 _. [We use
754 * the compact bitset representation for the default table]
757 static CHAR deftab[16] = {
758 0, 0, 0, 0, 0, 0, 0377, 003, 0376, 0377, 0377, 0207,
759 0376, 0377, 0377, 007
768 for (i = 0; i < MAXCHR; i++)
769 if (!isinset(deftab,i))
779 * substitute the matched portions of the src in dst.
781 * & substitute the entire matched pattern.
783 * \digit substitute a subpattern, with the given tag number.
784 * Tags are numbered from 1 to 9. If the particular
785 * tagged subpattern does not exist, null is substituted.
788 re_subs( char *src, char *dst)
795 if (!*src || !bopat[0])
807 if (c >= '0' && c <= '9') {
817 if ((bp = bopat[pin]) && (ep = eopat[pin])) {
818 while (*bp && bp < ep)
830 * symbolic - produce a symbolic dump of the nfa
834 printf("pattern: %s\n", s);
835 printf("nfacode:\n");
863 printf("\tCHR %c\n",*ap++);
875 printf("BOT: %d\n",*ap++);
878 printf("EOT: %d\n",*ap++);
887 printf("REF: %d\n",*ap++);
891 for (n = 0; n < MAXCHR; n++)
892 if (isinset(ap,(CHAR)n)) {
894 printf("^%c", n ^ 0x040);
902 printf("bad nfa. opcode %o\n", ap[-1]);
908 #endif /* MACOS or DOS or NEED_BSDREGEX */