3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
34 #define set_error(x) bufp->errmsg=((char *)(x))
35 #define got_error bufp->errmsg!=NULL
37 /* The original code blithely assumed that sizeof(short) == 2. Not
38 * always true. Original instances of "(short)x" were replaced by
39 * SHORT(x), where SHORT is #defined below. */
41 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
43 /* The stack implementation is taken from an idea by Andrew Kuchling.
44 * It's a doubly linked list of arrays. The advantages of this over a
45 * simple linked list are that the number of mallocs required are
46 * reduced. It also makes it possible to statically allocate enough
47 * space so that small patterns don't ever need to call malloc.
49 * The advantages over a single array is that is periodically
50 * realloced when more space is needed is that we avoid ever copying
53 /* item_t is the basic stack element. Defined as a union of
54 * structures so that both registers, failure points, and counters can
55 * be pushed/popped from the stack. There's nothing built into the
56 * item to keep track of whether a certain stack item is a register, a
57 * failure point, or a counter. */
59 typedef union item_t {
80 #define STACK_PAGE_SIZE 256
81 #define NUM_REGISTERS 256
83 /* A 'page' of stack items. */
85 typedef struct item_page_t {
86 item_t items[STACK_PAGE_SIZE];
87 struct item_page_t *prev;
88 struct item_page_t *next;
92 typedef struct match_state {
93 /* The number of registers that have been pushed onto the stack
94 * since the last failure point. */
98 /* Used to control when registers need to be pushed onto the
103 /* The number of failure points on the stack. */
107 /* Storage for the registers. Each register consists of two
108 * pointers to characters. So register N is represented as
109 * start[N] and end[N]. The pointers must be converted to
110 * offsets from the beginning of the string before returning the
111 * registers to the calling program. */
113 unsigned char *start[NUM_REGISTERS];
114 unsigned char *end[NUM_REGISTERS];
116 /* Keeps track of whether a register has changed recently. */
118 int changed[NUM_REGISTERS];
120 /* Structure to encapsulate the stack. */
122 /* index into the current page. If index == 0 and you need
123 * to pop an item, move to the previous page and set index
124 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
125 * push a page. If index == STACK_PAGE_SIZE and you need
126 * to push a page move to the next page and set index =
127 * 0. If there is no new next page, allocate a new page
128 * and link it in. Otherwise, increment index to push a
132 item_page_t *current; /* Pointer to the current page. */
133 item_page_t first; /* First page is statically allocated. */
137 /* Initialize a state object */
139 /* #define NEW_STATE(state) \ */
140 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
141 /* state.stack.current = &state.stack.first; \ */
142 /* state.stack.first.prev = NULL; \ */
143 /* state.stack.first.next = NULL; \ */
144 /* state.stack.index = 0; \ */
145 /* state.level = 1 */
147 #define NEW_STATE(state, nregs) \
150 for (i = 0; i < nregs; i++) \
152 state.start[i] = NULL; \
153 state.end[i] = NULL; \
154 state.changed[i] = 0; \
156 state.stack.current = &state.stack.first; \
157 state.stack.first.prev = NULL; \
158 state.stack.first.next = NULL; \
159 state.stack.index = 0; \
166 /* Free any memory that might have been malloc'd */
168 #define FREE_STATE(state) \
169 while(state.stack.first.next != NULL) \
171 state.stack.current = state.stack.first.next; \
172 state.stack.first.next = state.stack.current->next; \
173 free(state.stack.current); \
176 /* Discard the top 'count' stack items. */
178 #define STACK_DISCARD(stack, count, on_error) \
179 stack.index -= count; \
180 while (stack.index < 0) \
182 if (stack.current->prev == NULL) \
184 stack.current = stack.current->prev; \
185 stack.index += STACK_PAGE_SIZE; \
188 /* Store a pointer to the previous item on the stack. Used to pop an
189 * item off of the stack. */
191 #define STACK_PREV(stack, top, on_error) \
192 if (stack.index == 0) \
194 if (stack.current->prev == NULL) \
196 stack.current = stack.current->prev; \
197 stack.index = STACK_PAGE_SIZE - 1; \
203 top = &(stack.current->items[stack.index])
205 /* Store a pointer to the next item on the stack. Used to push an item
206 * on to the stack. */
208 #define STACK_NEXT(stack, top, on_error) \
209 if (stack.index == STACK_PAGE_SIZE) \
211 if (stack.current->next == NULL) \
213 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
214 if (stack.current->next == NULL) \
216 stack.current->next->prev = stack.current; \
217 stack.current->next->next = NULL; \
219 stack.current = stack.current->next; \
222 top = &(stack.current->items[stack.index++])
224 /* Store a pointer to the item that is 'count' items back in the
225 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
226 * STACK_TOP(stack, top, on_error). */
228 #define STACK_BACK(stack, top, count, on_error) \
231 item_page_t *current; \
232 current = stack.current; \
233 index = stack.index - (count); \
236 if (current->prev == NULL) \
238 current = current->prev; \
239 index += STACK_PAGE_SIZE; \
241 top = &(current->items[index]); \
244 /* Store a pointer to the top item on the stack. Execute the
245 * 'on_error' code if there are no items on the stack. */
247 #define STACK_TOP(stack, top, on_error) \
248 if (stack.index == 0) \
250 if (stack.current->prev == NULL) \
252 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
256 top = &(stack.current->items[stack.index - 1]); \
259 /* Test to see if the stack is empty */
261 #define STACK_EMPTY(stack) ((stack.index == 0) && \
262 (stack.current->prev == NULL))
264 /* Return the start of register 'reg' */
266 #define GET_REG_START(state, reg) (state.start[reg])
268 /* Return the end of register 'reg' */
270 #define GET_REG_END(state, reg) (state.end[reg])
272 /* Set the start of register 'reg'. If the state of the register needs
273 * saving, push it on the stack. */
275 #define SET_REG_START(state, reg, text, on_error) \
276 if(state.changed[reg] < state.level) \
279 STACK_NEXT(state.stack, item, on_error); \
280 item->reg.num = reg; \
281 item->reg.start = state.start[reg]; \
282 item->reg.end = state.end[reg]; \
283 item->reg.level = state.changed[reg]; \
284 state.changed[reg] = state.level; \
287 state.start[reg] = text
289 /* Set the end of register 'reg'. If the state of the register needs
290 * saving, push it on the stack. */
292 #define SET_REG_END(state, reg, text, on_error) \
293 if(state.changed[reg] < state.level) \
296 STACK_NEXT(state.stack, item, on_error); \
297 item->reg.num = reg; \
298 item->reg.start = state.start[reg]; \
299 item->reg.end = state.end[reg]; \
300 item->reg.level = state.changed[reg]; \
301 state.changed[reg] = state.level; \
304 state.end[reg] = text
306 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
309 STACK_NEXT(state.stack, item, on_error); \
310 item->fail.code = xcode; \
311 item->fail.text = xtext; \
312 item->fail.count = state.count; \
313 item->fail.level = state.level; \
314 item->fail.phantom = 0; \
320 /* Update the last failure point with a new position in the text. */
322 #define UPDATE_FAILURE(state, xtext, on_error) \
325 STACK_BACK(state.stack, item, state.count + 1, on_error); \
326 if (!item->fail.phantom) \
329 STACK_NEXT(state.stack, item2, on_error); \
330 item2->fail.code = item->fail.code; \
331 item2->fail.text = xtext; \
332 item2->fail.count = state.count; \
333 item2->fail.level = state.level; \
334 item2->fail.phantom = 1; \
341 STACK_DISCARD(state.stack, state.count, on_error); \
342 STACK_TOP(state.stack, item, on_error); \
343 item->fail.text = xtext; \
349 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
354 while(state.count > 0) \
356 STACK_PREV(state.stack, item, on_error); \
357 state.start[item->reg.num] = item->reg.start; \
358 state.end[item->reg.num] = item->reg.end; \
359 state.changed[item->reg.num] = item->reg.level; \
362 STACK_PREV(state.stack, item, on_empty); \
363 xcode = item->fail.code; \
364 xtext = item->fail.text; \
365 state.count = item->fail.count; \
366 state.level = item->fail.level; \
369 while (item->fail.text == NULL); \
372 enum regexp_compiled_ops { /* opcodes for compiled regexp */
373 Cend, /* end of pattern reached */
374 Cbol, /* beginning of line */
375 Ceol, /* end of line */
376 Cset, /* character set. Followed by 32 bytes of set. */
377 Cexact, /* followed by a byte to match */
378 Canychar, /* matches any character except newline */
379 Cstart_memory, /* set register start addr (followed by reg number) */
380 Cend_memory, /* set register end addr (followed by reg number) */
381 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
382 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
383 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
384 Cfailure_jump, /* jump to addr on failure */
385 Cupdate_failure_jump, /* update topmost failure point and jump */
386 Cdummy_failure_jump, /* push a dummy failure point and jump */
387 Cbegbuf, /* match at beginning of buffer */
388 Cendbuf, /* match at end of buffer */
389 Cwordbeg, /* match at beginning of word */
390 Cwordend, /* match at end of word */
391 Cwordbound, /* match if at word boundary */
392 Cnotwordbound, /* match if not at word boundary */
393 Csyntaxspec, /* matches syntax code (1 byte follows) */
394 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
398 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
399 Rend, /* special code for end of regexp */
400 Rnormal, /* normal character */
401 Ranychar, /* any character except newline */
402 Rquote, /* the quote character */
403 Rbol, /* match beginning of line */
404 Reol, /* match end of line */
405 Roptional, /* match preceding expression optionally */
406 Rstar, /* match preceding expr zero or more times */
407 Rplus, /* match preceding expr one or more times */
408 Ror, /* match either of alternatives */
409 Ropenpar, /* opening parenthesis */
410 Rclosepar, /* closing parenthesis */
411 Rmemory, /* match memory register */
412 Rextended_memory, /* \vnn to match registers 10-99 */
413 Ropenset, /* open set. Internal syntax hard-coded below. */
414 /* the following are gnu extensions to "normal" regexp syntax */
415 Rbegbuf, /* beginning of buffer */
416 Rendbuf, /* end of buffer */
417 Rwordchar, /* word character */
418 Rnotwordchar, /* not word character */
419 Rwordbeg, /* beginning of word */
420 Rwordend, /* end of word */
421 Rwordbound, /* word bound */
422 Rnotwordbound, /* not word bound */
426 static int re_compile_initialized = 0;
427 static int regexp_syntax = RE_SYNTAX_EGREP;
428 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
429 static unsigned char plain_ops[256];
430 static unsigned char quoted_ops[256];
431 static unsigned char precedences[Rnum_ops];
432 static int regexp_context_indep_ops;
433 static int regexp_ansi_sequences;
435 #define NUM_LEVELS 5 /* number of precedence levels in use */
436 #define MAX_NESTING 100 /* max nesting level of operators */
438 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
440 unsigned char re_syntax_table[256];
442 void re_compile_initialize(void)
446 static int syntax_table_inited = 0;
448 if (!syntax_table_inited) {
449 syntax_table_inited = 1;
450 memset(re_syntax_table, 0, 256);
451 for (a = 'a'; a <= 'z'; a++)
452 re_syntax_table[a] = Sword;
453 for (a = 'A'; a <= 'Z'; a++)
454 re_syntax_table[a] = Sword;
455 for (a = '0'; a <= '9'; a++)
456 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
457 for (a = '0'; a <= '7'; a++)
458 re_syntax_table[a] |= Soctaldigit;
459 for (a = 'A'; a <= 'F'; a++)
460 re_syntax_table[a] |= Shexdigit;
461 for (a = 'a'; a <= 'f'; a++)
462 re_syntax_table[a] |= Shexdigit;
463 re_syntax_table[(int)'_'] = Sword;
464 for (a = 9; a <= 13; a++)
465 re_syntax_table[a] = Swhitespace;
466 re_syntax_table[(int)' '] = Swhitespace;
468 re_compile_initialized = 1;
469 for (a = 0; a < 256; a++) {
470 plain_ops[a] = Rnormal;
471 quoted_ops[a] = Rnormal;
473 for (a = '0'; a <= '9'; a++)
474 quoted_ops[a] = Rmemory;
475 plain_ops[(int)'\134'] = Rquote;
476 if (regexp_syntax & RE_NO_BK_PARENS) {
477 plain_ops[(int)'('] = Ropenpar;
478 plain_ops[(int)')'] = Rclosepar;
480 quoted_ops[(int)'('] = Ropenpar;
481 quoted_ops[(int)')'] = Rclosepar;
483 if (regexp_syntax & RE_NO_BK_VBAR) {
484 plain_ops[(int)'\174'] = Ror;
486 quoted_ops[(int)'\174'] = Ror;
487 plain_ops[(int)'*'] = Rstar;
488 if (regexp_syntax & RE_BK_PLUS_QM) {
489 quoted_ops[(int)'+'] = Rplus;
490 quoted_ops[(int)'?'] = Roptional;
492 plain_ops[(int)'+'] = Rplus;
493 plain_ops[(int)'?'] = Roptional;
495 if (regexp_syntax & RE_NEWLINE_OR) {
496 plain_ops[(int)'\n'] = Ror;
498 plain_ops[(int)'\133'] = Ropenset;
499 plain_ops[(int)'\136'] = Rbol;
500 plain_ops[(int)'$'] = Reol;
501 plain_ops[(int)'.'] = Ranychar;
502 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
503 quoted_ops[(int)'w'] = Rwordchar;
504 quoted_ops[(int)'W'] = Rnotwordchar;
505 quoted_ops[(int)'<'] = Rwordbeg;
506 quoted_ops[(int)'>'] = Rwordend;
507 quoted_ops[(int)'b'] = Rwordbound;
508 quoted_ops[(int)'B'] = Rnotwordbound;
509 quoted_ops[(int)'`'] = Rbegbuf;
510 quoted_ops[(int)'\''] = Rendbuf;
512 if (regexp_syntax & RE_ANSI_HEX) {
513 quoted_ops[(int)'v'] = Rextended_memory;
515 for (a = 0; a < Rnum_ops; a++) {
518 if (regexp_syntax & RE_TIGHT_VBAR) {
519 precedences[Ror] = 3;
520 precedences[Rbol] = 2;
521 precedences[Reol] = 2;
523 precedences[Ror] = 2;
524 precedences[Rbol] = 3;
525 precedences[Reol] = 3;
527 precedences[Rclosepar] = 1;
528 precedences[Rend] = 0;
529 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
530 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
534 int re_set_syntax(int syntax) {
538 regexp_syntax = syntax;
539 re_syntax = syntax; /* Exported copy */
540 re_compile_initialize();
544 static int hex_char_to_decimal(int ch) {
545 if (ch >= '0' && ch <= '9')
547 if (ch >= 'a' && ch <= 'f')
548 return ch - 'a' + 10;
549 if (ch >= 'A' && ch <= 'F')
550 return ch - 'A' + 10;
554 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
555 unsigned char *visited,
556 unsigned char *can_be_null,
557 unsigned char *fastmap)
564 return; /* we have already been here */
567 switch (code[pos++]) {
578 for (a = 0; a < 256; a++)
582 syntaxcode = code[pos++];
583 for (a = 0; a < 256; a++)
584 if (SYNTAX(a) & syntaxcode)
588 syntaxcode = code[pos++];
589 for (a = 0; a < 256; a++)
590 if (!(SYNTAX(a) & syntaxcode))
594 fastmap[(int)'\n'] = 1;
595 if (*can_be_null == 0)
596 *can_be_null = 2; /* can match null, but only at end of buffer */
599 for (a = 0; a < 256 / 8; a++)
600 if (code[pos + a] != 0)
601 for (b = 0; b < 8; b++)
602 if (code[pos + a] & (1 << b))
603 fastmap[(a << 3) + b] = 1;
607 fastmap[(unsigned char)code[pos]] = 1;
610 for (a = 0; a < 256; a++)
619 for (a = 0; a < 256; a++)
624 case Cdummy_failure_jump:
625 case Cupdate_failure_jump:
627 a = (unsigned char)code[pos++];
628 a |= (unsigned char)code[pos++] << 8;
629 pos += (int)SHORT(a);
631 /* argh... the regexp contains empty loops. This is not
632 good, as this may cause a failure stack overflow when
633 matching. Oh well. */
634 /* this path leads nowhere; pursue other paths. */
640 a = (unsigned char)code[pos++];
641 a |= (unsigned char)code[pos++] << 8;
642 a = pos + (int)SHORT(a);
643 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
649 set_error("Unknown regex opcode: memory corrupted?");
655 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
656 int pos, unsigned char *can_be_null,
657 unsigned char *fastmap)
659 unsigned char small_visited[512], *visited;
661 if (used <= (int)sizeof(small_visited))
662 visited = small_visited;
664 visited = (unsigned char *)malloc(used);
669 memset(fastmap, 0, 256);
670 memset(visited, 0, used);
671 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
672 if (visited != small_visited)
677 void re_compile_fastmap(regex_t * bufp)
679 if (!bufp->fastmap || bufp->fastmap_accurate)
681 // assert(bufp->used > 0);
682 if (!re_do_compile_fastmap(bufp, bufp->buffer,
683 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
687 if (bufp->buffer[0] == Cbol)
688 bufp->anchor = 1; /* begline */
689 else if (bufp->buffer[0] == Cbegbuf)
690 bufp->anchor = 2; /* begbuf */
692 bufp->anchor = 0; /* none */
693 bufp->fastmap_accurate = 1;
699 * ... code for operand of star
701 * 2: ... code after star
703 * We change the star_jump to update_failure_jump if we can determine
704 * that it is safe to do so; otherwise we change it to an ordinary
711 * 2: ... code for operand of plus
713 * 3: ... code after plus
715 * For star_jump considerations this is processed identically to star.
719 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
721 unsigned char map[256];
722 unsigned char can_be_null;
728 int num_instructions = 0;
730 a = (unsigned char)*code++;
731 a |= (unsigned char)*code++ << 8;
734 p1 = code + a + 3; /* skip the failure_jump */
735 /* Check that the jump is within the pattern */
736 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
737 set_error("Regex VM jump out of bounds (failure_jump opt)");
740 // assert(p1[-3] == Cfailure_jump);
742 /* p1 points inside loop, p2 points to after loop */
743 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
744 (int)(p2 - bufp->buffer), &can_be_null, map))
745 goto make_normal_jump;
747 /* If we might introduce a new update point inside the
748 * loop, we can't optimize because then update_jump would
749 * update a wrong failure point. Thus we have to be
750 * quite careful here.
753 /* loop until we find something that consumes a character */
771 ch = (unsigned char)*p1++;
773 goto make_normal_jump;
776 for (b = 0; b < 256; b++)
777 if (b != '\n' && map[b])
778 goto make_normal_jump;
781 for (b = 0; b < 256; b++)
782 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
783 goto make_normal_jump;
787 goto make_normal_jump;
791 /* now we know that we can't backtrack. */
792 while (p1 != p2 - 3) {
821 case Cupdate_failure_jump:
822 case Cdummy_failure_jump:
823 goto make_normal_jump;
829 /* make_update_jump: */
831 a += 3; /* jump to after the Cfailure_jump */
832 code[0] = Cupdate_failure_jump;
835 if (num_instructions > 1)
837 // assert(num_instructions == 1);
838 /* if the only instruction matches a single character, we can do
840 p1 = code + 3 + a; /* start of sole instruction */
841 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
842 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
852 static int re_optimize(regex_t * bufp)
884 if (!re_optimize_star_jump(bufp, code)) {
888 case Cupdate_failure_jump:
890 case Cdummy_failure_jump:
901 #define NEXTCHAR(var) \
904 goto ends_prematurely; \
905 (var) = regex[pos]; \
909 #define ALLOC(amount) \
911 if (pattern_offset+(amount) > alloc) \
913 alloc += 256 + (amount); \
914 pattern = (unsigned char *)realloc(pattern, alloc); \
916 goto out_of_memory; \
920 #define STORE(ch) pattern[pattern_offset++] = (ch)
922 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
924 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
926 #define PUSH_LEVEL_STARTS \
927 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
928 starts_base += NUM_LEVELS; \
932 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
934 #define PUT_ADDR(offset,addr) \
936 int disp = (addr) - (offset) - 2; \
937 pattern[(offset)] = disp & 0xff; \
938 pattern[(offset)+1] = (disp>>8) & 0xff; \
941 #define INSERT_JUMP(pos,type,addr) \
943 int a, p = (pos), t = (type), ad = (addr); \
944 for (a = pattern_offset - 1; a >= p; a--) \
945 pattern[a + 3] = pattern[a]; \
948 pattern_offset += 3; \
951 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
955 bufp->allocated = alloc; \
956 bufp->buffer = pattern; \
957 bufp->used = pattern_offset; \
960 #define GETHEX(var) \
962 unsigned char gethex_ch, gethex_value; \
963 NEXTCHAR(gethex_ch); \
964 gethex_value = hex_char_to_decimal(gethex_ch); \
965 if (gethex_value == 16) \
967 NEXTCHAR(gethex_ch); \
968 gethex_ch = hex_char_to_decimal(gethex_ch); \
969 if (gethex_ch == 16) \
971 (var) = gethex_value * 16 + gethex_ch; \
974 #define ANSI_TRANSLATE(ch) \
981 ch = 7; /* audible bell */ \
987 ch = 8; /* backspace */ \
993 ch = 12; /* form feed */ \
999 ch = 10; /* line feed */ \
1005 ch = 13; /* carriage return */ \
1017 ch = 11; /* vertical tab */ \
1020 case 'x': /* hex code */ \
1028 /* other characters passed through */ \
1030 ch = translate[(unsigned char)ch]; \
1036 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1044 int pattern_offset = 0, alloc;
1045 int starts[NUM_LEVELS * MAX_NESTING];
1047 int future_jumps[MAX_NESTING];
1049 unsigned char ch = '\0';
1050 unsigned char *pattern;
1051 unsigned char *translate;
1054 int num_open_registers;
1055 int open_registers[RE_NREGS];
1056 int beginning_context;
1057 int size = strlen((char *)regex);
1059 if (!re_compile_initialized)
1060 re_compile_initialize();
1062 bufp->fastmap_accurate = 0;
1063 bufp->uses_registers = 1;
1064 bufp->num_registers = 1;
1065 translate = bufp->translate;
1066 pattern = bufp->buffer;
1067 alloc = bufp->allocated;
1068 if (alloc == 0 || pattern == NULL) {
1070 pattern = (unsigned char *)malloc(alloc);
1079 num_open_registers = 0;
1082 beginning_context = 1;
1084 /* we use Rend dummy to ensure that pending jumps are updated
1085 (due to low priority of Rend) before exiting the loop. */
1087 while (op != Rend) {
1093 ch = translate[(unsigned char)ch];
1094 op = plain_ops[(unsigned char)ch];
1097 op = quoted_ops[(unsigned char)ch];
1098 if (op == Rnormal && regexp_ansi_sequences)
1102 level = precedences[op];
1103 /* printf("ch='%c' op=%d level=%d current_level=%d
1104 curlevstart=%d\n", ch, op, level, current_level,
1105 CURRENT_LEVEL_START); */
1106 if (level > current_level) {
1107 for (current_level++; current_level < level; current_level++)
1110 } else if (level < current_level) {
1111 current_level = level;
1112 for (; num_jumps > 0 &&
1113 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1114 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1122 store_opcode_and_arg: /* opcode & ch must be set */
1136 set_error("Rquote");
1137 /*NOTREACHED*/ case Rbol:
1138 if (!beginning_context) {
1139 if (regexp_context_indep_ops)
1147 if (!((pos >= size) ||
1148 ((regexp_syntax & RE_NO_BK_VBAR) ?
1149 (regex[pos] == '\174') :
1150 (pos + 1 < size && regex[pos] == '\134' &&
1151 regex[pos + 1] == '\174')) ||
1152 ((regexp_syntax & RE_NO_BK_PARENS) ?
1153 (regex[pos] == ')') :
1154 (pos + 1 < size && regex[pos] == '\134' &&
1155 regex[pos + 1] == ')')))) {
1156 if (regexp_context_indep_ops)
1166 if (beginning_context) {
1167 if (regexp_context_indep_ops)
1172 if (CURRENT_LEVEL_START == pattern_offset)
1173 break; /* ignore empty patterns for ? */
1175 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1179 if (beginning_context) {
1180 if (regexp_context_indep_ops)
1185 if (CURRENT_LEVEL_START == pattern_offset)
1186 break; /* ignore empty patterns for + and * */
1188 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1189 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1190 if (op == Rplus) /* jump over initial failure_jump */
1191 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1192 CURRENT_LEVEL_START + 6);
1196 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1197 if (num_jumps >= MAX_NESTING)
1200 future_jumps[num_jumps++] = pattern_offset;
1208 if (next_register < RE_NREGS) {
1209 bufp->uses_registers = 1;
1211 STORE(Cstart_memory);
1212 STORE(next_register);
1213 open_registers[num_open_registers++] = next_register;
1214 bufp->num_registers++;
1224 if (paren_depth <= 0)
1225 goto parenthesis_error;
1227 current_level = precedences[Ropenpar];
1229 if (paren_depth < num_open_registers) {
1230 bufp->uses_registers = 1;
1233 num_open_registers--;
1234 STORE(open_registers[num_open_registers]);
1239 goto bad_match_register;
1240 if (!(ch >= '0' && ch <= '9')) {
1241 goto bad_match_register;
1243 bufp->uses_registers = 1;
1244 opcode = Cmatch_memory;
1246 goto store_opcode_and_arg;
1247 case Rextended_memory:
1249 if (ch < '0' || ch > '9')
1250 goto bad_match_register;
1252 if (a < '0' || a > '9')
1253 goto bad_match_register;
1254 ch = 10 * (a - '0') + ch - '0';
1255 if (ch == 0 || ch >= RE_NREGS)
1256 goto bad_match_register;
1257 bufp->uses_registers = 1;
1258 opcode = Cmatch_memory;
1259 goto store_opcode_and_arg;
1271 offset = pattern_offset;
1272 for (a = 0; a < 256 / 8; a++)
1276 ch = translate[(unsigned char)ch];
1281 ch = translate[(unsigned char)ch];
1287 while (ch != '\135' || firstchar) {
1289 if (regexp_ansi_sequences && ch == '\134') {
1294 for (a = prev; a <= (int)ch; a++)
1295 SETBIT(pattern, offset, a);
1298 } else if (prev != -1 && ch == '-')
1301 SETBIT(pattern, offset, ch);
1306 ch = translate[(unsigned char)ch];
1309 SETBIT(pattern, offset, '-');
1311 for (a = 0; a < 256 / 8; a++)
1312 pattern[offset + a] ^= 0xff;
1328 opcode = Csyntaxspec;
1330 goto store_opcode_and_arg;
1334 opcode = Cnotsyntaxspec;
1336 goto store_opcode_and_arg;
1350 opcode = Cwordbound;
1355 opcode = Cnotwordbound;
1363 beginning_context = (op == Ropenpar || op == Ror);
1365 if (starts_base != 0)
1366 goto parenthesis_error;
1367 // assert(num_jumps == 0);
1371 if (!re_optimize(bufp))
1372 return "Optimization error";
1377 return "Badly placed special character";
1381 return "Bad match register number";
1385 return "Bad hexadecimal number";
1389 return "Badly placed parenthesis";
1393 return "Out of memory";
1397 return "Regular expression ends prematurely";
1401 return "Regular expression too complex";
1409 #undef CURRENT_LEVEL_START
1410 #undef SET_LEVEL_START
1411 #undef PUSH_LEVEL_STARTS
1412 #undef POP_LEVEL_STARTS
1418 #define PREFETCH if (text == textend) goto fail
1420 #define NEXTCHAR(var) \
1422 var = (unsigned char)*text++; \
1424 var = translate[var]
1427 int regcomp(regex_t * bufp, const char *regex, int cflags)
1429 memset(bufp, 0, sizeof(regex_t));
1430 re_compile_pattern(bufp, (unsigned char *)regex);
1437 int regexec(regex_t * preg, const char *string, size_t nmatch,
1438 regmatch_t pmatch[], int eflags)
1441 int len = strlen(string);
1442 struct re_registers regs;
1443 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1444 /* stat is the start position in the string base 0 where
1445 * the pattern was found or negative if not found.
1447 return stat < 0 ? -1 : 0;
1450 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1452 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1456 void regfree(regex_t * preg)
1460 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1461 regexp_registers_t old_regs)
1463 unsigned char *code;
1464 unsigned char *translate;
1465 unsigned char *text;
1466 unsigned char *textstart;
1467 unsigned char *textend;
1473 unsigned char *regstart;
1474 unsigned char *regend;
1478 // assert(pos >= 0 && size >= 0);
1479 // assert(pos <= size);
1481 text = string + pos;
1483 textend = string + size;
1485 code = bufp->buffer;
1487 translate = bufp->translate;
1489 NEW_STATE(state, bufp->num_registers);
1495 match_end = text - textstart;
1497 old_regs->start[0] = pos;
1498 old_regs->end[0] = match_end;
1499 if (!bufp->uses_registers) {
1500 for (a = 1; a < RE_NREGS; a++) {
1501 old_regs->start[a] = -1;
1502 old_regs->end[a] = -1;
1505 for (a = 1; a < bufp->num_registers; a++) {
1506 if ((GET_REG_START(state, a) == NULL) ||
1507 (GET_REG_END(state, a) == NULL)) {
1508 old_regs->start[a] = -1;
1509 old_regs->end[a] = -1;
1512 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1513 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1515 for (; a < RE_NREGS; a++) {
1516 old_regs->start[a] = -1;
1517 old_regs->end[a] = -1;
1522 return match_end - pos;
1526 if (text == textstart || text[-1] == '\n')
1527 goto continue_matching;
1532 if (text == textend || *text == '\n')
1533 goto continue_matching;
1539 if (code[ch / 8] & (1 << (ch & 7))) {
1541 goto continue_matching;
1548 if (ch != (unsigned char)*code++)
1550 goto continue_matching;
1557 goto continue_matching;
1562 SET_REG_START(state, reg, text, goto error);
1563 goto continue_matching;
1568 SET_REG_END(state, reg, text, goto error);
1569 goto continue_matching;
1574 regstart = GET_REG_START(state, reg);
1575 regend = GET_REG_END(state, reg);
1576 if ((regstart == NULL) || (regend == NULL))
1577 goto fail; /* or should we just match nothing? */
1578 regsize = regend - regstart;
1580 if (regsize > (textend - text))
1583 for (; regstart < regend; regstart++, text++)
1584 if (translate[*regstart] != translate[*text])
1587 for (; regstart < regend; regstart++, text++)
1588 if (*regstart != *text)
1590 goto continue_matching;
1592 case Cupdate_failure_jump:
1594 UPDATE_FAILURE(state, text, goto error);
1595 /* fall to next case */
1597 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1601 a = (unsigned char)*code++;
1602 a |= (unsigned char)*code++ << 8;
1603 code += (int)SHORT(a);
1604 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1605 set_error("Regex VM jump out of bounds (Cjump)");
1609 goto continue_matching;
1611 case Cdummy_failure_jump:
1613 unsigned char *failuredest;
1615 a = (unsigned char)*code++;
1616 a |= (unsigned char)*code++ << 8;
1618 // assert(*code == Cfailure_jump);
1619 b = (unsigned char)code[1];
1620 b |= (unsigned char)code[2] << 8;
1621 failuredest = code + (int)SHORT(b) + 3;
1622 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1624 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1628 PUSH_FAILURE(state, failuredest, NULL, goto error);
1630 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1631 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1635 goto continue_matching;
1639 a = (unsigned char)*code++;
1640 a |= (unsigned char)*code++ << 8;
1642 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1643 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1647 PUSH_FAILURE(state, code + a, text, goto error);
1648 goto continue_matching;
1652 unsigned char *pinst;
1653 a = (unsigned char)*code++;
1654 a |= (unsigned char)*code++ << 8;
1657 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1658 set_error("Regex VM jump out of bounds (Crepeat1)");
1662 /* pinst is sole instruction in loop, and it matches a
1663 * single character. Since Crepeat1 was originally a
1664 * Cupdate_failure_jump, we also know that backtracking
1665 * is useless: so long as the single-character
1666 * expression matches, it must be used. Also, in the
1667 * case of +, we've already matched one character, so +
1668 * can't fail: nothing here can cause a failure. */
1673 while (text < textend) {
1674 ch = translate[(unsigned char)*text];
1675 if (pinst[ch / 8] & (1 << (ch & 7)))
1681 while (text < textend) {
1682 ch = (unsigned char)*text;
1683 if (pinst[ch / 8] & (1 << (ch & 7)))
1693 ch = (unsigned char)*pinst;
1695 while (text < textend && translate[(unsigned char)*text] == ch)
1698 while (text < textend && (unsigned char)*text == ch)
1705 while (text < textend && (unsigned char)*text != '\n')
1711 a = (unsigned char)*pinst;
1713 while (text < textend && (SYNTAX(translate[*text]) & a))
1716 while (text < textend && (SYNTAX(*text) & a))
1721 case Cnotsyntaxspec:
1723 a = (unsigned char)*pinst;
1725 while (text < textend && !(SYNTAX(translate[*text]) & a))
1728 while (text < textend && !(SYNTAX(*text) & a))
1736 set_error("Unknown regex opcode: memory corrupted?");
1740 /* due to the funky way + and * are compiled, the top
1741 * failure- stack entry at this point is actually a
1742 * success entry -- update it & pop it */
1743 UPDATE_FAILURE(state, text, goto error);
1744 goto fail; /* i.e., succeed <wink/sigh> */
1748 if (text == textstart)
1749 goto continue_matching;
1754 if (text == textend)
1755 goto continue_matching;
1760 if (text == textend)
1762 if (!(SYNTAX(*text) & Sword))
1764 if (text == textstart)
1765 goto continue_matching;
1766 if (!(SYNTAX(text[-1]) & Sword))
1767 goto continue_matching;
1772 if (text == textstart)
1774 if (!(SYNTAX(text[-1]) & Sword))
1776 if (text == textend)
1777 goto continue_matching;
1778 if (!(SYNTAX(*text) & Sword))
1779 goto continue_matching;
1784 /* Note: as in gnu regexp, this also matches at the
1785 * beginning and end of buffer. */
1787 if (text == textstart || text == textend)
1788 goto continue_matching;
1789 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1790 goto continue_matching;
1795 /* Note: as in gnu regexp, this never matches at the
1796 * beginning and end of buffer. */
1797 if (text == textstart || text == textend)
1799 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1800 goto continue_matching;
1806 if (!(SYNTAX(ch) & (unsigned char)*code++))
1808 goto continue_matching;
1810 case Cnotsyntaxspec:
1813 if (SYNTAX(ch) & (unsigned char)*code++)
1815 goto continue_matching;
1820 set_error("Unknown regex opcode: memory corrupted?");
1827 #if 0 /* This line is never reached --Guido */
1834 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1836 POP_FAILURE(state, code, text, goto done_matching, goto error);
1837 goto continue_matching;
1840 /* if(translated != NULL) */
1841 /* free(translated); */
1846 /* if (translated != NULL) */
1847 /* free(translated); */
1856 int re_search(regex_t * bufp, unsigned char *string, int size, int pos,
1857 int range, regexp_registers_t regs)
1859 unsigned char *fastmap;
1860 unsigned char *translate;
1861 unsigned char *text;
1862 unsigned char *partstart;
1863 unsigned char *partend;
1866 unsigned char anchor;
1868 // assert(size >= 0 && pos >= 0);
1869 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1871 fastmap = bufp->fastmap;
1872 translate = bufp->translate;
1873 if (fastmap && !bufp->fastmap_accurate) {
1874 re_compile_fastmap(bufp);
1879 anchor = bufp->anchor;
1880 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1896 for (; range >= 0; range--, pos += dir) {
1898 if (dir == 1) { /* searching forwards */
1900 text = string + pos;
1901 partend = string + size;
1904 while (text != partend &&
1905 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1908 while (text != partend && !fastmap[(unsigned char)*text])
1910 pos += text - partstart;
1911 range -= text - partstart;
1912 if (pos == size && bufp->can_be_null == 0)
1914 } else { /* searching backwards */
1915 text = string + pos;
1916 partstart = string + pos - range;
1919 while (text != partstart && !fastmap[(unsigned char)
1920 translate[(unsigned char)*text]])
1923 while (text != partstart && !fastmap[(unsigned char)*text])
1925 pos -= partend - text;
1926 range -= partend - text;
1929 if (anchor == 1) { /* anchored to begline */
1930 if (pos > 0 && (string[pos - 1] != '\n'))
1933 // assert(pos >= 0 && pos <= size);
1934 ret = re_match(bufp, string, size, pos, regs);
1946 ** c-file-style: "python"