3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
31 * This file modified to work with Bacula and C++ by
32 * Kern Sibbald, April 2006
38 #define set_error(x) bufp->errmsg=((char *)(x))
39 #define got_error bufp->errmsg!=NULL
41 /* The original code blithely assumed that sizeof(short) == 2. Not
42 * always true. Original instances of "(short)x" were replaced by
43 * SHORT(x), where SHORT is #defined below. */
45 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
47 /* The stack implementation is taken from an idea by Andrew Kuchling.
48 * It's a doubly linked list of arrays. The advantages of this over a
49 * simple linked list are that the number of mallocs required are
50 * reduced. It also makes it possible to statically allocate enough
51 * space so that small patterns don't ever need to call malloc.
53 * The advantages over a single array is that is periodically
54 * realloced when more space is needed is that we avoid ever copying
57 /* item_t is the basic stack element. Defined as a union of
58 * structures so that both registers, failure points, and counters can
59 * be pushed/popped from the stack. There's nothing built into the
60 * item to keep track of whether a certain stack item is a register, a
61 * failure point, or a counter. */
63 typedef union item_t {
84 #define STACK_PAGE_SIZE 256
85 #define NUM_REGISTERS 256
87 /* A 'page' of stack items. */
89 typedef struct item_page_t {
90 item_t items[STACK_PAGE_SIZE];
91 struct item_page_t *prev;
92 struct item_page_t *next;
96 typedef struct match_state {
97 /* The number of registers that have been pushed onto the stack
98 * since the last failure point. */
102 /* Used to control when registers need to be pushed onto the
107 /* The number of failure points on the stack. */
111 /* Storage for the registers. Each register consists of two
112 * pointers to characters. So register N is represented as
113 * start[N] and end[N]. The pointers must be converted to
114 * offsets from the beginning of the string before returning the
115 * registers to the calling program. */
117 unsigned char *start[NUM_REGISTERS];
118 unsigned char *end[NUM_REGISTERS];
120 /* Keeps track of whether a register has changed recently. */
122 int changed[NUM_REGISTERS];
124 /* Structure to encapsulate the stack. */
126 /* index into the current page. If index == 0 and you need
127 * to pop an item, move to the previous page and set index
128 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
129 * push a page. If index == STACK_PAGE_SIZE and you need
130 * to push a page move to the next page and set index =
131 * 0. If there is no new next page, allocate a new page
132 * and link it in. Otherwise, increment index to push a
136 item_page_t *current; /* Pointer to the current page. */
137 item_page_t first; /* First page is statically allocated. */
141 /* Initialize a state object */
143 /* #define NEW_STATE(state) \ */
144 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
145 /* state.stack.current = &state.stack.first; \ */
146 /* state.stack.first.prev = NULL; \ */
147 /* state.stack.first.next = NULL; \ */
148 /* state.stack.index = 0; \ */
149 /* state.level = 1 */
151 #define NEW_STATE(state, nregs) \
154 for (i = 0; i < nregs; i++) \
156 state.start[i] = NULL; \
157 state.end[i] = NULL; \
158 state.changed[i] = 0; \
160 state.stack.current = &state.stack.first; \
161 state.stack.first.prev = NULL; \
162 state.stack.first.next = NULL; \
163 state.stack.index = 0; \
170 /* Free any memory that might have been malloc'd */
172 #define FREE_STATE(state) \
173 while(state.stack.first.next != NULL) \
175 state.stack.current = state.stack.first.next; \
176 state.stack.first.next = state.stack.current->next; \
177 free(state.stack.current); \
180 /* Discard the top 'count' stack items. */
182 #define STACK_DISCARD(stack, count, on_error) \
183 stack.index -= count; \
184 while (stack.index < 0) \
186 if (stack.current->prev == NULL) \
188 stack.current = stack.current->prev; \
189 stack.index += STACK_PAGE_SIZE; \
192 /* Store a pointer to the previous item on the stack. Used to pop an
193 * item off of the stack. */
195 #define STACK_PREV(stack, top, on_error) \
196 if (stack.index == 0) \
198 if (stack.current->prev == NULL) \
200 stack.current = stack.current->prev; \
201 stack.index = STACK_PAGE_SIZE - 1; \
207 top = &(stack.current->items[stack.index])
209 /* Store a pointer to the next item on the stack. Used to push an item
210 * on to the stack. */
212 #define STACK_NEXT(stack, top, on_error) \
213 if (stack.index == STACK_PAGE_SIZE) \
215 if (stack.current->next == NULL) \
217 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
218 if (stack.current->next == NULL) \
220 stack.current->next->prev = stack.current; \
221 stack.current->next->next = NULL; \
223 stack.current = stack.current->next; \
226 top = &(stack.current->items[stack.index++])
228 /* Store a pointer to the item that is 'count' items back in the
229 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
230 * STACK_TOP(stack, top, on_error). */
232 #define STACK_BACK(stack, top, count, on_error) \
235 item_page_t *current; \
236 current = stack.current; \
237 index = stack.index - (count); \
240 if (current->prev == NULL) \
242 current = current->prev; \
243 index += STACK_PAGE_SIZE; \
245 top = &(current->items[index]); \
248 /* Store a pointer to the top item on the stack. Execute the
249 * 'on_error' code if there are no items on the stack. */
251 #define STACK_TOP(stack, top, on_error) \
252 if (stack.index == 0) \
254 if (stack.current->prev == NULL) \
256 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
260 top = &(stack.current->items[stack.index - 1]); \
263 /* Test to see if the stack is empty */
265 #define STACK_EMPTY(stack) ((stack.index == 0) && \
266 (stack.current->prev == NULL))
268 /* Return the start of register 'reg' */
270 #define GET_REG_START(state, reg) (state.start[reg])
272 /* Return the end of register 'reg' */
274 #define GET_REG_END(state, reg) (state.end[reg])
276 /* Set the start of register 'reg'. If the state of the register needs
277 * saving, push it on the stack. */
279 #define SET_REG_START(state, reg, text, on_error) \
280 if(state.changed[reg] < state.level) \
283 STACK_NEXT(state.stack, item, on_error); \
284 item->reg.num = reg; \
285 item->reg.start = state.start[reg]; \
286 item->reg.end = state.end[reg]; \
287 item->reg.level = state.changed[reg]; \
288 state.changed[reg] = state.level; \
291 state.start[reg] = text
293 /* Set the end of register 'reg'. If the state of the register needs
294 * saving, push it on the stack. */
296 #define SET_REG_END(state, reg, text, on_error) \
297 if(state.changed[reg] < state.level) \
300 STACK_NEXT(state.stack, item, on_error); \
301 item->reg.num = reg; \
302 item->reg.start = state.start[reg]; \
303 item->reg.end = state.end[reg]; \
304 item->reg.level = state.changed[reg]; \
305 state.changed[reg] = state.level; \
308 state.end[reg] = text
310 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
313 STACK_NEXT(state.stack, item, on_error); \
314 item->fail.code = xcode; \
315 item->fail.text = xtext; \
316 item->fail.count = state.count; \
317 item->fail.level = state.level; \
318 item->fail.phantom = 0; \
324 /* Update the last failure point with a new position in the text. */
326 #define UPDATE_FAILURE(state, xtext, on_error) \
329 STACK_BACK(state.stack, item, state.count + 1, on_error); \
330 if (!item->fail.phantom) \
333 STACK_NEXT(state.stack, item2, on_error); \
334 item2->fail.code = item->fail.code; \
335 item2->fail.text = xtext; \
336 item2->fail.count = state.count; \
337 item2->fail.level = state.level; \
338 item2->fail.phantom = 1; \
345 STACK_DISCARD(state.stack, state.count, on_error); \
346 STACK_TOP(state.stack, item, on_error); \
347 item->fail.text = xtext; \
353 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
358 while(state.count > 0) \
360 STACK_PREV(state.stack, item, on_error); \
361 state.start[item->reg.num] = item->reg.start; \
362 state.end[item->reg.num] = item->reg.end; \
363 state.changed[item->reg.num] = item->reg.level; \
366 STACK_PREV(state.stack, item, on_empty); \
367 xcode = item->fail.code; \
368 xtext = item->fail.text; \
369 state.count = item->fail.count; \
370 state.level = item->fail.level; \
373 while (item->fail.text == NULL); \
376 enum regexp_compiled_ops { /* opcodes for compiled regexp */
377 Cend, /* end of pattern reached */
378 Cbol, /* beginning of line */
379 Ceol, /* end of line */
380 Cset, /* character set. Followed by 32 bytes of set. */
381 Cexact, /* followed by a byte to match */
382 Canychar, /* matches any character except newline */
383 Cstart_memory, /* set register start addr (followed by reg number) */
384 Cend_memory, /* set register end addr (followed by reg number) */
385 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
386 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
387 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
388 Cfailure_jump, /* jump to addr on failure */
389 Cupdate_failure_jump, /* update topmost failure point and jump */
390 Cdummy_failure_jump, /* push a dummy failure point and jump */
391 Cbegbuf, /* match at beginning of buffer */
392 Cendbuf, /* match at end of buffer */
393 Cwordbeg, /* match at beginning of word */
394 Cwordend, /* match at end of word */
395 Cwordbound, /* match if at word boundary */
396 Cnotwordbound, /* match if not at word boundary */
397 Csyntaxspec, /* matches syntax code (1 byte follows) */
398 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
402 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
403 Rend, /* special code for end of regexp */
404 Rnormal, /* normal character */
405 Ranychar, /* any character except newline */
406 Rquote, /* the quote character */
407 Rbol, /* match beginning of line */
408 Reol, /* match end of line */
409 Roptional, /* match preceding expression optionally */
410 Rstar, /* match preceding expr zero or more times */
411 Rplus, /* match preceding expr one or more times */
412 Ror, /* match either of alternatives */
413 Ropenpar, /* opening parenthesis */
414 Rclosepar, /* closing parenthesis */
415 Rmemory, /* match memory register */
416 Rextended_memory, /* \vnn to match registers 10-99 */
417 Ropenset, /* open set. Internal syntax hard-coded below. */
418 /* the following are gnu extensions to "normal" regexp syntax */
419 Rbegbuf, /* beginning of buffer */
420 Rendbuf, /* end of buffer */
421 Rwordchar, /* word character */
422 Rnotwordchar, /* not word character */
423 Rwordbeg, /* beginning of word */
424 Rwordend, /* end of word */
425 Rwordbound, /* word bound */
426 Rnotwordbound, /* not word bound */
430 static int re_compile_initialized = 0;
431 static int regexp_syntax = RE_SYNTAX_EGREP;
432 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
433 static unsigned char plain_ops[256];
434 static unsigned char quoted_ops[256];
435 static unsigned char precedences[Rnum_ops];
436 static int regexp_context_indep_ops;
437 static int regexp_ansi_sequences;
439 #define NUM_LEVELS 5 /* number of precedence levels in use */
440 #define MAX_NESTING 100 /* max nesting level of operators */
442 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
444 unsigned char re_syntax_table[256];
446 void re_compile_initialize(void)
450 static int syntax_table_inited = 0;
452 if (!syntax_table_inited) {
453 syntax_table_inited = 1;
454 memset(re_syntax_table, 0, 256);
455 for (a = 'a'; a <= 'z'; a++)
456 re_syntax_table[a] = Sword;
457 for (a = 'A'; a <= 'Z'; a++)
458 re_syntax_table[a] = Sword;
459 for (a = '0'; a <= '9'; a++)
460 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
461 for (a = '0'; a <= '7'; a++)
462 re_syntax_table[a] |= Soctaldigit;
463 for (a = 'A'; a <= 'F'; a++)
464 re_syntax_table[a] |= Shexdigit;
465 for (a = 'a'; a <= 'f'; a++)
466 re_syntax_table[a] |= Shexdigit;
467 re_syntax_table[(int)'_'] = Sword;
468 for (a = 9; a <= 13; a++)
469 re_syntax_table[a] = Swhitespace;
470 re_syntax_table[(int)' '] = Swhitespace;
472 re_compile_initialized = 1;
473 for (a = 0; a < 256; a++) {
474 plain_ops[a] = Rnormal;
475 quoted_ops[a] = Rnormal;
477 for (a = '0'; a <= '9'; a++)
478 quoted_ops[a] = Rmemory;
479 plain_ops[(int)'\134'] = Rquote;
480 if (regexp_syntax & RE_NO_BK_PARENS) {
481 plain_ops[(int)'('] = Ropenpar;
482 plain_ops[(int)')'] = Rclosepar;
484 quoted_ops[(int)'('] = Ropenpar;
485 quoted_ops[(int)')'] = Rclosepar;
487 if (regexp_syntax & RE_NO_BK_VBAR) {
488 plain_ops[(int)'\174'] = Ror;
490 quoted_ops[(int)'\174'] = Ror;
492 plain_ops[(int)'*'] = Rstar;
493 if (regexp_syntax & RE_BK_PLUS_QM) {
494 quoted_ops[(int)'+'] = Rplus;
495 quoted_ops[(int)'?'] = Roptional;
497 plain_ops[(int)'+'] = Rplus;
498 plain_ops[(int)'?'] = Roptional;
500 if (regexp_syntax & RE_NEWLINE_OR) {
501 plain_ops[(int)'\n'] = Ror;
503 plain_ops[(int)'\133'] = Ropenset;
504 plain_ops[(int)'\136'] = Rbol;
505 plain_ops[(int)'$'] = Reol;
506 plain_ops[(int)'.'] = Ranychar;
507 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
508 quoted_ops[(int)'w'] = Rwordchar;
509 quoted_ops[(int)'W'] = Rnotwordchar;
510 quoted_ops[(int)'<'] = Rwordbeg;
511 quoted_ops[(int)'>'] = Rwordend;
512 quoted_ops[(int)'b'] = Rwordbound;
513 quoted_ops[(int)'B'] = Rnotwordbound;
514 quoted_ops[(int)'`'] = Rbegbuf;
515 quoted_ops[(int)'\''] = Rendbuf;
517 if (regexp_syntax & RE_ANSI_HEX) {
518 quoted_ops[(int)'v'] = Rextended_memory;
520 for (a = 0; a < Rnum_ops; a++) {
523 if (regexp_syntax & RE_TIGHT_VBAR) {
524 precedences[Ror] = 3;
525 precedences[Rbol] = 2;
526 precedences[Reol] = 2;
528 precedences[Ror] = 2;
529 precedences[Rbol] = 3;
530 precedences[Reol] = 3;
532 precedences[Rclosepar] = 1;
533 precedences[Rend] = 0;
534 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
535 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
538 int re_set_syntax(int syntax) {
542 regexp_syntax = syntax;
543 re_syntax = syntax; /* Exported copy */
544 re_compile_initialize();
548 static int hex_char_to_decimal(int ch) {
549 if (ch >= '0' && ch <= '9')
551 if (ch >= 'a' && ch <= 'f')
552 return ch - 'a' + 10;
553 if (ch >= 'A' && ch <= 'F')
554 return ch - 'A' + 10;
558 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
559 unsigned char *visited,
560 unsigned char *can_be_null,
561 unsigned char *fastmap)
568 return; /* we have already been here */
571 switch (code[pos++]) {
582 for (a = 0; a < 256; a++)
586 syntaxcode = code[pos++];
587 for (a = 0; a < 256; a++)
588 if (SYNTAX(a) & syntaxcode)
592 syntaxcode = code[pos++];
593 for (a = 0; a < 256; a++)
594 if (!(SYNTAX(a) & syntaxcode))
598 fastmap[(int)'\n'] = 1;
599 if (*can_be_null == 0)
600 *can_be_null = 2; /* can match null, but only at end of buffer */
603 for (a = 0; a < 256 / 8; a++)
604 if (code[pos + a] != 0)
605 for (b = 0; b < 8; b++)
606 if (code[pos + a] & (1 << b))
607 fastmap[(a << 3) + b] = 1;
611 fastmap[(unsigned char)code[pos]] = 1;
614 for (a = 0; a < 256; a++)
623 for (a = 0; a < 256; a++)
628 case Cdummy_failure_jump:
629 case Cupdate_failure_jump:
631 a = (unsigned char)code[pos++];
632 a |= (unsigned char)code[pos++] << 8;
633 pos += (int)SHORT(a);
635 /* argh... the regexp contains empty loops. This is not
636 good, as this may cause a failure stack overflow when
637 matching. Oh well. */
638 /* this path leads nowhere; pursue other paths. */
644 a = (unsigned char)code[pos++];
645 a |= (unsigned char)code[pos++] << 8;
646 a = pos + (int)SHORT(a);
647 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
653 set_error("Unknown regex opcode: memory corrupted?");
659 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
660 int pos, unsigned char *can_be_null,
661 unsigned char *fastmap)
663 unsigned char small_visited[512], *visited;
665 if (used <= (int)sizeof(small_visited))
666 visited = small_visited;
668 visited = (unsigned char *)malloc(used);
673 memset(fastmap, 0, 256);
674 memset(visited, 0, used);
675 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
676 if (visited != small_visited)
681 void re_compile_fastmap(regex_t * bufp)
683 if (!bufp->fastmap || bufp->fastmap_accurate)
685 // assert(bufp->used > 0);
686 if (!re_do_compile_fastmap(bufp, bufp->buffer,
687 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
691 if (bufp->buffer[0] == Cbol)
692 bufp->anchor = 1; /* begline */
693 else if (bufp->buffer[0] == Cbegbuf)
694 bufp->anchor = 2; /* begbuf */
696 bufp->anchor = 0; /* none */
697 bufp->fastmap_accurate = 1;
703 * ... code for operand of star
705 * 2: ... code after star
707 * We change the star_jump to update_failure_jump if we can determine
708 * that it is safe to do so; otherwise we change it to an ordinary
715 * 2: ... code for operand of plus
717 * 3: ... code after plus
719 * For star_jump considerations this is processed identically to star.
723 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
725 unsigned char map[256];
726 unsigned char can_be_null;
732 int num_instructions = 0;
734 a = (unsigned char)*code++;
735 a |= (unsigned char)*code++ << 8;
738 p1 = code + a + 3; /* skip the failure_jump */
739 /* Check that the jump is within the pattern */
740 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
741 set_error("Regex VM jump out of bounds (failure_jump opt)");
744 // assert(p1[-3] == Cfailure_jump);
746 /* p1 points inside loop, p2 points to after loop */
747 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
748 (int)(p2 - bufp->buffer), &can_be_null, map))
749 goto make_normal_jump;
751 /* If we might introduce a new update point inside the
752 * loop, we can't optimize because then update_jump would
753 * update a wrong failure point. Thus we have to be
754 * quite careful here.
757 /* loop until we find something that consumes a character */
775 ch = (unsigned char)*p1++;
777 goto make_normal_jump;
780 for (b = 0; b < 256; b++)
781 if (b != '\n' && map[b])
782 goto make_normal_jump;
785 for (b = 0; b < 256; b++)
786 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
787 goto make_normal_jump;
791 goto make_normal_jump;
795 /* now we know that we can't backtrack. */
796 while (p1 != p2 - 3) {
825 case Cupdate_failure_jump:
826 case Cdummy_failure_jump:
827 goto make_normal_jump;
833 /* make_update_jump: */
835 a += 3; /* jump to after the Cfailure_jump */
836 code[0] = Cupdate_failure_jump;
839 if (num_instructions > 1)
841 // assert(num_instructions == 1);
842 /* if the only instruction matches a single character, we can do
844 p1 = code + 3 + a; /* start of sole instruction */
845 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
846 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
856 static int re_optimize(regex_t * bufp)
888 if (!re_optimize_star_jump(bufp, code)) {
892 case Cupdate_failure_jump:
894 case Cdummy_failure_jump:
905 #define NEXTCHAR(var) \
908 goto ends_prematurely; \
909 (var) = regex[pos]; \
913 #define ALLOC(amount) \
915 if (pattern_offset+(amount) > alloc) \
917 alloc += 256 + (amount); \
918 pattern = (unsigned char *)realloc(pattern, alloc); \
920 goto out_of_memory; \
924 #define STORE(ch) pattern[pattern_offset++] = (ch)
926 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
928 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
930 #define PUSH_LEVEL_STARTS \
931 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
932 starts_base += NUM_LEVELS; \
936 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
938 #define PUT_ADDR(offset,addr) \
940 int disp = (addr) - (offset) - 2; \
941 pattern[(offset)] = disp & 0xff; \
942 pattern[(offset)+1] = (disp>>8) & 0xff; \
945 #define INSERT_JUMP(pos,type,addr) \
947 int a, p = (pos), t = (type), ad = (addr); \
948 for (a = pattern_offset - 1; a >= p; a--) \
949 pattern[a + 3] = pattern[a]; \
952 pattern_offset += 3; \
955 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
959 bufp->allocated = alloc; \
960 bufp->buffer = pattern; \
961 bufp->used = pattern_offset; \
964 #define GETHEX(var) \
966 unsigned char gethex_ch, gethex_value; \
967 NEXTCHAR(gethex_ch); \
968 gethex_value = hex_char_to_decimal(gethex_ch); \
969 if (gethex_value == 16) \
971 NEXTCHAR(gethex_ch); \
972 gethex_ch = hex_char_to_decimal(gethex_ch); \
973 if (gethex_ch == 16) \
975 (var) = gethex_value * 16 + gethex_ch; \
978 #define ANSI_TRANSLATE(ch) \
985 ch = 7; /* audible bell */ \
991 ch = 8; /* backspace */ \
997 ch = 12; /* form feed */ \
1003 ch = 10; /* line feed */ \
1009 ch = 13; /* carriage return */ \
1021 ch = 11; /* vertical tab */ \
1024 case 'x': /* hex code */ \
1032 /* other characters passed through */ \
1034 ch = translate[(unsigned char)ch]; \
1040 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1048 int pattern_offset = 0, alloc;
1049 int starts[NUM_LEVELS * MAX_NESTING];
1051 int future_jumps[MAX_NESTING];
1053 unsigned char ch = '\0';
1054 unsigned char *pattern;
1055 unsigned char *translate;
1058 int num_open_registers;
1059 int open_registers[RE_NREGS];
1060 int beginning_context;
1061 int size = strlen((char *)regex);
1063 if (!re_compile_initialized)
1064 re_compile_initialize();
1066 bufp->fastmap_accurate = 0;
1067 bufp->uses_registers = 1;
1068 bufp->num_registers = 1;
1069 translate = bufp->translate;
1070 pattern = bufp->buffer;
1071 alloc = bufp->allocated;
1072 if (alloc == 0 || pattern == NULL) {
1074 pattern = (unsigned char *)malloc(alloc);
1083 num_open_registers = 0;
1086 beginning_context = 1;
1088 /* we use Rend dummy to ensure that pending jumps are updated
1089 (due to low priority of Rend) before exiting the loop. */
1091 while (op != Rend) {
1097 ch = translate[(unsigned char)ch];
1098 op = plain_ops[(unsigned char)ch];
1101 op = quoted_ops[(unsigned char)ch];
1102 if (op == Rnormal && regexp_ansi_sequences)
1106 level = precedences[op];
1107 /* printf("ch='%c' op=%d level=%d current_level=%d
1108 curlevstart=%d\n", ch, op, level, current_level,
1109 CURRENT_LEVEL_START); */
1110 if (level > current_level) {
1111 for (current_level++; current_level < level; current_level++)
1114 } else if (level < current_level) {
1115 current_level = level;
1116 for (; num_jumps > 0 &&
1117 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1118 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1126 store_opcode_and_arg: /* opcode & ch must be set */
1140 set_error("Rquote");
1141 /*NOTREACHED*/ case Rbol:
1142 if (!beginning_context) {
1143 if (regexp_context_indep_ops)
1151 if (!((pos >= size) ||
1152 ((regexp_syntax & RE_NO_BK_VBAR) ?
1153 (regex[pos] == '\174') :
1154 (pos + 1 < size && regex[pos] == '\134' &&
1155 regex[pos + 1] == '\174')) ||
1156 ((regexp_syntax & RE_NO_BK_PARENS) ?
1157 (regex[pos] == ')') :
1158 (pos + 1 < size && regex[pos] == '\134' &&
1159 regex[pos + 1] == ')')))) {
1160 if (regexp_context_indep_ops)
1170 if (beginning_context) {
1171 if (regexp_context_indep_ops)
1176 if (CURRENT_LEVEL_START == pattern_offset)
1177 break; /* ignore empty patterns for ? */
1179 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1183 if (beginning_context) {
1184 if (regexp_context_indep_ops)
1189 if (CURRENT_LEVEL_START == pattern_offset)
1190 break; /* ignore empty patterns for + and * */
1192 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1193 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1194 if (op == Rplus) /* jump over initial failure_jump */
1195 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1196 CURRENT_LEVEL_START + 6);
1200 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1201 if (num_jumps >= MAX_NESTING)
1204 future_jumps[num_jumps++] = pattern_offset;
1212 if (next_register < RE_NREGS) {
1213 bufp->uses_registers = 1;
1215 STORE(Cstart_memory);
1216 STORE(next_register);
1217 open_registers[num_open_registers++] = next_register;
1218 bufp->num_registers++;
1228 if (paren_depth <= 0)
1229 goto parenthesis_error;
1231 current_level = precedences[Ropenpar];
1233 if (paren_depth < num_open_registers) {
1234 bufp->uses_registers = 1;
1237 num_open_registers--;
1238 STORE(open_registers[num_open_registers]);
1243 goto bad_match_register;
1244 if (!(ch >= '0' && ch <= '9')) {
1245 goto bad_match_register;
1247 bufp->uses_registers = 1;
1248 opcode = Cmatch_memory;
1250 goto store_opcode_and_arg;
1251 case Rextended_memory:
1253 if (ch < '0' || ch > '9')
1254 goto bad_match_register;
1256 if (a < '0' || a > '9')
1257 goto bad_match_register;
1258 ch = 10 * (a - '0') + ch - '0';
1259 if (ch == 0 || ch >= RE_NREGS)
1260 goto bad_match_register;
1261 bufp->uses_registers = 1;
1262 opcode = Cmatch_memory;
1263 goto store_opcode_and_arg;
1275 offset = pattern_offset;
1276 for (a = 0; a < 256 / 8; a++)
1280 ch = translate[(unsigned char)ch];
1285 ch = translate[(unsigned char)ch];
1291 while (ch != '\135' || firstchar) {
1293 if (regexp_ansi_sequences && ch == '\134') {
1298 for (a = prev; a <= (int)ch; a++)
1299 SETBIT(pattern, offset, a);
1302 } else if (prev != -1 && ch == '-')
1305 SETBIT(pattern, offset, ch);
1310 ch = translate[(unsigned char)ch];
1313 SETBIT(pattern, offset, '-');
1315 for (a = 0; a < 256 / 8; a++)
1316 pattern[offset + a] ^= 0xff;
1332 opcode = Csyntaxspec;
1334 goto store_opcode_and_arg;
1338 opcode = Cnotsyntaxspec;
1340 goto store_opcode_and_arg;
1354 opcode = Cwordbound;
1359 opcode = Cnotwordbound;
1367 beginning_context = (op == Ropenpar || op == Ror);
1369 if (starts_base != 0)
1370 goto parenthesis_error;
1371 // assert(num_jumps == 0);
1375 if (!re_optimize(bufp))
1376 return "Optimization error";
1381 return "Badly placed special character";
1385 return "Bad match register number";
1389 return "Bad hexadecimal number";
1393 return "Badly placed parenthesis";
1397 return "Out of memory";
1401 return "Regular expression ends prematurely";
1405 return "Regular expression too complex";
1413 #undef CURRENT_LEVEL_START
1414 #undef SET_LEVEL_START
1415 #undef PUSH_LEVEL_STARTS
1416 #undef POP_LEVEL_STARTS
1422 #define PREFETCH if (text == textend) goto fail
1424 #define NEXTCHAR(var) \
1426 var = (unsigned char)*text++; \
1428 var = translate[var]
1431 int regcomp(regex_t * bufp, const char *regex, int cflags)
1433 memset(bufp, 0, sizeof(regex_t));
1434 re_compile_pattern(bufp, (unsigned char *)regex);
1441 int regexec(regex_t * preg, const char *string, size_t nmatch,
1442 regmatch_t pmatch[], int eflags)
1445 int len = strlen(string);
1446 struct re_registers regs;
1447 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1448 /* stat is the start position in the string base 0 where
1449 * the pattern was found or negative if not found.
1451 return stat < 0 ? -1 : 0;
1454 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1456 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1460 void regfree(regex_t * preg)
1464 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1465 regexp_registers_t old_regs)
1467 unsigned char *code;
1468 unsigned char *translate;
1469 unsigned char *text;
1470 unsigned char *textstart;
1471 unsigned char *textend;
1477 unsigned char *regstart;
1478 unsigned char *regend;
1482 // assert(pos >= 0 && size >= 0);
1483 // assert(pos <= size);
1485 text = string + pos;
1487 textend = string + size;
1489 code = bufp->buffer;
1491 translate = bufp->translate;
1493 NEW_STATE(state, bufp->num_registers);
1499 match_end = text - textstart;
1501 old_regs->start[0] = pos;
1502 old_regs->end[0] = match_end;
1503 if (!bufp->uses_registers) {
1504 for (a = 1; a < RE_NREGS; a++) {
1505 old_regs->start[a] = -1;
1506 old_regs->end[a] = -1;
1509 for (a = 1; a < bufp->num_registers; a++) {
1510 if ((GET_REG_START(state, a) == NULL) ||
1511 (GET_REG_END(state, a) == NULL)) {
1512 old_regs->start[a] = -1;
1513 old_regs->end[a] = -1;
1516 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1517 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1519 for (; a < RE_NREGS; a++) {
1520 old_regs->start[a] = -1;
1521 old_regs->end[a] = -1;
1526 return match_end - pos;
1530 if (text == textstart || text[-1] == '\n')
1531 goto continue_matching;
1536 if (text == textend || *text == '\n')
1537 goto continue_matching;
1543 if (code[ch / 8] & (1 << (ch & 7))) {
1545 goto continue_matching;
1552 if (ch != (unsigned char)*code++)
1554 goto continue_matching;
1561 goto continue_matching;
1566 SET_REG_START(state, reg, text, goto error);
1567 goto continue_matching;
1572 SET_REG_END(state, reg, text, goto error);
1573 goto continue_matching;
1578 regstart = GET_REG_START(state, reg);
1579 regend = GET_REG_END(state, reg);
1580 if ((regstart == NULL) || (regend == NULL))
1581 goto fail; /* or should we just match nothing? */
1582 regsize = regend - regstart;
1584 if (regsize > (textend - text))
1587 for (; regstart < regend; regstart++, text++)
1588 if (translate[*regstart] != translate[*text])
1591 for (; regstart < regend; regstart++, text++)
1592 if (*regstart != *text)
1594 goto continue_matching;
1596 case Cupdate_failure_jump:
1598 UPDATE_FAILURE(state, text, goto error);
1599 /* fall to next case */
1601 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1605 a = (unsigned char)*code++;
1606 a |= (unsigned char)*code++ << 8;
1607 code += (int)SHORT(a);
1608 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1609 set_error("Regex VM jump out of bounds (Cjump)");
1613 goto continue_matching;
1615 case Cdummy_failure_jump:
1617 unsigned char *failuredest;
1619 a = (unsigned char)*code++;
1620 a |= (unsigned char)*code++ << 8;
1622 // assert(*code == Cfailure_jump);
1623 b = (unsigned char)code[1];
1624 b |= (unsigned char)code[2] << 8;
1625 failuredest = code + (int)SHORT(b) + 3;
1626 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1628 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1632 PUSH_FAILURE(state, failuredest, NULL, goto error);
1634 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1635 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1639 goto continue_matching;
1643 a = (unsigned char)*code++;
1644 a |= (unsigned char)*code++ << 8;
1646 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1647 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1651 PUSH_FAILURE(state, code + a, text, goto error);
1652 goto continue_matching;
1656 unsigned char *pinst;
1657 a = (unsigned char)*code++;
1658 a |= (unsigned char)*code++ << 8;
1661 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1662 set_error("Regex VM jump out of bounds (Crepeat1)");
1666 /* pinst is sole instruction in loop, and it matches a
1667 * single character. Since Crepeat1 was originally a
1668 * Cupdate_failure_jump, we also know that backtracking
1669 * is useless: so long as the single-character
1670 * expression matches, it must be used. Also, in the
1671 * case of +, we've already matched one character, so +
1672 * can't fail: nothing here can cause a failure. */
1677 while (text < textend) {
1678 ch = translate[(unsigned char)*text];
1679 if (pinst[ch / 8] & (1 << (ch & 7)))
1685 while (text < textend) {
1686 ch = (unsigned char)*text;
1687 if (pinst[ch / 8] & (1 << (ch & 7)))
1697 ch = (unsigned char)*pinst;
1699 while (text < textend && translate[(unsigned char)*text] == ch)
1702 while (text < textend && (unsigned char)*text == ch)
1709 while (text < textend && (unsigned char)*text != '\n')
1715 a = (unsigned char)*pinst;
1717 while (text < textend && (SYNTAX(translate[*text]) & a))
1720 while (text < textend && (SYNTAX(*text) & a))
1725 case Cnotsyntaxspec:
1727 a = (unsigned char)*pinst;
1729 while (text < textend && !(SYNTAX(translate[*text]) & a))
1732 while (text < textend && !(SYNTAX(*text) & a))
1740 set_error("Unknown regex opcode: memory corrupted?");
1744 /* due to the funky way + and * are compiled, the top
1745 * failure- stack entry at this point is actually a
1746 * success entry -- update it & pop it */
1747 UPDATE_FAILURE(state, text, goto error);
1748 goto fail; /* i.e., succeed <wink/sigh> */
1752 if (text == textstart)
1753 goto continue_matching;
1758 if (text == textend)
1759 goto continue_matching;
1764 if (text == textend)
1766 if (!(SYNTAX(*text) & Sword))
1768 if (text == textstart)
1769 goto continue_matching;
1770 if (!(SYNTAX(text[-1]) & Sword))
1771 goto continue_matching;
1776 if (text == textstart)
1778 if (!(SYNTAX(text[-1]) & Sword))
1780 if (text == textend)
1781 goto continue_matching;
1782 if (!(SYNTAX(*text) & Sword))
1783 goto continue_matching;
1788 /* Note: as in gnu regexp, this also matches at the
1789 * beginning and end of buffer. */
1791 if (text == textstart || text == textend)
1792 goto continue_matching;
1793 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1794 goto continue_matching;
1799 /* Note: as in gnu regexp, this never matches at the
1800 * beginning and end of buffer. */
1801 if (text == textstart || text == textend)
1803 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1804 goto continue_matching;
1810 if (!(SYNTAX(ch) & (unsigned char)*code++))
1812 goto continue_matching;
1814 case Cnotsyntaxspec:
1817 if (SYNTAX(ch) & (unsigned char)*code++)
1819 goto continue_matching;
1824 set_error("Unknown regex opcode: memory corrupted?");
1831 #if 0 /* This line is never reached --Guido */
1838 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1840 POP_FAILURE(state, code, text, goto done_matching, goto error);
1841 goto continue_matching;
1844 /* if(translated != NULL) */
1845 /* free(translated); */
1850 /* if (translated != NULL) */
1851 /* free(translated); */
1860 int re_search(regex_t * bufp, unsigned char *string, int size, int pos,
1861 int range, regexp_registers_t regs)
1863 unsigned char *fastmap;
1864 unsigned char *translate;
1865 unsigned char *text;
1866 unsigned char *partstart;
1867 unsigned char *partend;
1870 unsigned char anchor;
1872 // assert(size >= 0 && pos >= 0);
1873 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1875 fastmap = bufp->fastmap;
1876 translate = bufp->translate;
1877 if (fastmap && !bufp->fastmap_accurate) {
1878 re_compile_fastmap(bufp);
1883 anchor = bufp->anchor;
1884 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1900 for (; range >= 0; range--, pos += dir) {
1902 if (dir == 1) { /* searching forwards */
1904 text = string + pos;
1905 partend = string + size;
1908 while (text != partend &&
1909 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1912 while (text != partend && !fastmap[(unsigned char)*text])
1914 pos += text - partstart;
1915 range -= text - partstart;
1916 if (pos == size && bufp->can_be_null == 0)
1918 } else { /* searching backwards */
1919 text = string + pos;
1920 partstart = string + pos - range;
1923 while (text != partstart && !fastmap[(unsigned char)
1924 translate[(unsigned char)*text]])
1927 while (text != partstart && !fastmap[(unsigned char)*text])
1929 pos -= partend - text;
1930 range -= partend - text;
1933 if (anchor == 1) { /* anchored to begline */
1934 if (pos > 0 && (string[pos - 1] != '\n'))
1937 // assert(pos >= 0 && pos <= size);
1938 ret = re_match(bufp, string, size, pos, regs);
1950 ** c-file-style: "python"