3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
29 * This file modified to work with Bacula and C++ by
30 * Kern Sibbald, April 2006
32 * This file modified to work with REG_ICASE and Bacula by
33 * Eric Bollengier April 2007
36 Bacula® - The Network Backup Solution
38 Copyright (C) 2006-2014 Free Software Foundation Europe e.V.
40 The main author of Bacula is Kern Sibbald, with contributions from many
41 others, a complete list can be found in the file AUTHORS.
43 You may use this file and others of this release according to the
44 license defined in the LICENSE file, which includes the Affero General
45 Public License, v3.0 ("AGPLv3") and some additional permissions and
46 terms pursuant to its AGPLv3 Section 7.
48 Bacula® is a registered trademark of Kern Sibbald.
55 #define set_error(x) bufp->errmsg=((char *)(x))
56 #define got_error bufp->errmsg!=NULL
58 /* The original code blithely assumed that sizeof(short) == 2. Not
59 * always true. Original instances of "(short)x" were replaced by
60 * SHORT(x), where SHORT is #defined below. */
62 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
64 /* The stack implementation is taken from an idea by Andrew Kuchling.
65 * It's a doubly linked list of arrays. The advantages of this over a
66 * simple linked list are that the number of mallocs required are
67 * reduced. It also makes it possible to statically allocate enough
68 * space so that small patterns don't ever need to call malloc.
70 * The advantages over a single array is that is periodically
71 * realloced when more space is needed is that we avoid ever copying
74 /* item_t is the basic stack element. Defined as a union of
75 * structures so that both registers, failure points, and counters can
76 * be pushed/popped from the stack. There's nothing built into the
77 * item to keep track of whether a certain stack item is a register, a
78 * failure point, or a counter. */
80 typedef union item_t {
101 #define B_STACK_PAGE_SIZE 256
102 #define NUM_REGISTERS 256
104 /* A 'page' of stack items. */
106 typedef struct item_page_t {
107 item_t items[B_STACK_PAGE_SIZE];
108 struct item_page_t *prev;
109 struct item_page_t *next;
113 typedef struct match_state {
114 /* The number of registers that have been pushed onto the stack
115 * since the last failure point. */
119 /* Used to control when registers need to be pushed onto the
124 /* The number of failure points on the stack. */
128 /* Storage for the registers. Each register consists of two
129 * pointers to characters. So register N is represented as
130 * start[N] and end[N]. The pointers must be converted to
131 * offsets from the beginning of the string before returning the
132 * registers to the calling program. */
134 unsigned char *start[NUM_REGISTERS];
135 unsigned char *end[NUM_REGISTERS];
137 /* Keeps track of whether a register has changed recently. */
139 int changed[NUM_REGISTERS];
141 /* Structure to encapsulate the stack. */
143 /* index into the current page. If index == 0 and you need
144 * to pop an item, move to the previous page and set index
145 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
146 * push a page. If index == B_STACK_PAGE_SIZE and you need
147 * to push a page move to the next page and set index =
148 * 0. If there is no new next page, allocate a new page
149 * and link it in. Otherwise, increment index to push a
153 item_page_t *current; /* Pointer to the current page. */
154 item_page_t first; /* First page is statically allocated. */
158 /* Initialize a state object */
160 /* #define NEW_STATE(state) \ */
161 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
162 /* state.stack.current = &state.stack.first; \ */
163 /* state.stack.first.prev = NULL; \ */
164 /* state.stack.first.next = NULL; \ */
165 /* state.stack.index = 0; \ */
166 /* state.level = 1 */
168 #define NEW_STATE(state, nregs) \
171 for (i = 0; i < nregs; i++) \
173 state.start[i] = NULL; \
174 state.end[i] = NULL; \
175 state.changed[i] = 0; \
177 state.stack.current = &state.stack.first; \
178 state.stack.first.prev = NULL; \
179 state.stack.first.next = NULL; \
180 state.stack.index = 0; \
187 /* Free any memory that might have been malloc'd */
189 #define FREE_STATE(state) \
190 while(state.stack.first.next != NULL) \
192 state.stack.current = state.stack.first.next; \
193 state.stack.first.next = state.stack.current->next; \
194 free(state.stack.current); \
197 /* Discard the top 'count' stack items. */
199 #define STACK_DISCARD(stack, count, on_error) \
200 stack.index -= count; \
201 while (stack.index < 0) \
203 if (stack.current->prev == NULL) \
205 stack.current = stack.current->prev; \
206 stack.index += B_STACK_PAGE_SIZE; \
209 /* Store a pointer to the previous item on the stack. Used to pop an
210 * item off of the stack. */
212 #define STACK_PREV(stack, top, on_error) \
213 if (stack.index == 0) \
215 if (stack.current->prev == NULL) \
217 stack.current = stack.current->prev; \
218 stack.index = B_STACK_PAGE_SIZE - 1; \
224 top = &(stack.current->items[stack.index])
226 /* Store a pointer to the next item on the stack. Used to push an item
227 * on to the stack. */
229 #define STACK_NEXT(stack, top, on_error) \
230 if (stack.index == B_STACK_PAGE_SIZE) \
232 if (stack.current->next == NULL) \
234 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
235 if (stack.current->next == NULL) \
237 stack.current->next->prev = stack.current; \
238 stack.current->next->next = NULL; \
240 stack.current = stack.current->next; \
243 top = &(stack.current->items[stack.index++])
245 /* Store a pointer to the item that is 'count' items back in the
246 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
247 * STACK_TOP(stack, top, on_error). */
249 #define STACK_BACK(stack, top, count, on_error) \
252 item_page_t *current; \
253 current = stack.current; \
254 index = stack.index - (count); \
257 if (current->prev == NULL) \
259 current = current->prev; \
260 index += B_STACK_PAGE_SIZE; \
262 top = &(current->items[index]); \
265 /* Store a pointer to the top item on the stack. Execute the
266 * 'on_error' code if there are no items on the stack. */
268 #define STACK_TOP(stack, top, on_error) \
269 if (stack.index == 0) \
271 if (stack.current->prev == NULL) \
273 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
277 top = &(stack.current->items[stack.index - 1]); \
280 /* Test to see if the stack is empty */
282 #define STACK_EMPTY(stack) ((stack.index == 0) && \
283 (stack.current->prev == NULL))
285 /* Return the start of register 'reg' */
287 #define GET_REG_START(state, reg) (state.start[reg])
289 /* Return the end of register 'reg' */
291 #define GET_REG_END(state, reg) (state.end[reg])
293 /* Set the start of register 'reg'. If the state of the register needs
294 * saving, push it on the stack. */
296 #define SET_REG_START(state, reg, text, on_error) \
297 if(state.changed[reg] < state.level) \
300 STACK_NEXT(state.stack, item, on_error); \
301 item->reg.num = reg; \
302 item->reg.start = state.start[reg]; \
303 item->reg.end = state.end[reg]; \
304 item->reg.level = state.changed[reg]; \
305 state.changed[reg] = state.level; \
308 state.start[reg] = text
310 /* Set the end of register 'reg'. If the state of the register needs
311 * saving, push it on the stack. */
313 #define SET_REG_END(state, reg, text, on_error) \
314 if(state.changed[reg] < state.level) \
317 STACK_NEXT(state.stack, item, on_error); \
318 item->reg.num = reg; \
319 item->reg.start = state.start[reg]; \
320 item->reg.end = state.end[reg]; \
321 item->reg.level = state.changed[reg]; \
322 state.changed[reg] = state.level; \
325 state.end[reg] = text
327 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
330 STACK_NEXT(state.stack, item, on_error); \
331 item->fail.code = xcode; \
332 item->fail.text = xtext; \
333 item->fail.count = state.count; \
334 item->fail.level = state.level; \
335 item->fail.phantom = 0; \
341 /* Update the last failure point with a new position in the text. */
343 #define UPDATE_FAILURE(state, xtext, on_error) \
346 STACK_BACK(state.stack, item, state.count + 1, on_error); \
347 if (!item->fail.phantom) \
350 STACK_NEXT(state.stack, item2, on_error); \
351 item2->fail.code = item->fail.code; \
352 item2->fail.text = xtext; \
353 item2->fail.count = state.count; \
354 item2->fail.level = state.level; \
355 item2->fail.phantom = 1; \
362 STACK_DISCARD(state.stack, state.count, on_error); \
363 STACK_TOP(state.stack, item, on_error); \
364 item->fail.text = xtext; \
370 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
375 while(state.count > 0) \
377 STACK_PREV(state.stack, item, on_error); \
378 state.start[item->reg.num] = item->reg.start; \
379 state.end[item->reg.num] = item->reg.end; \
380 state.changed[item->reg.num] = item->reg.level; \
383 STACK_PREV(state.stack, item, on_empty); \
384 xcode = item->fail.code; \
385 xtext = item->fail.text; \
386 state.count = item->fail.count; \
387 state.level = item->fail.level; \
390 while (item->fail.text == NULL); \
393 enum regexp_compiled_ops { /* opcodes for compiled regexp */
394 Cend, /* end of pattern reached */
395 Cbol, /* beginning of line */
396 Ceol, /* end of line */
397 Cset, /* character set. Followed by 32 bytes of set. */
398 Cexact, /* followed by a byte to match */
399 Canychar, /* matches any character except newline */
400 Cstart_memory, /* set register start addr (followed by reg number) */
401 Cend_memory, /* set register end addr (followed by reg number) */
402 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
403 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
404 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
405 Cfailure_jump, /* jump to addr on failure */
406 Cupdate_failure_jump, /* update topmost failure point and jump */
407 Cdummy_failure_jump, /* push a dummy failure point and jump */
408 Cbegbuf, /* match at beginning of buffer */
409 Cendbuf, /* match at end of buffer */
410 Cwordbeg, /* match at beginning of word */
411 Cwordend, /* match at end of word */
412 Cwordbound, /* match if at word boundary */
413 Cnotwordbound, /* match if not at word boundary */
414 Csyntaxspec, /* matches syntax code (1 byte follows) */
415 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
419 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
420 Rend, /* special code for end of regexp */
421 Rnormal, /* normal character */
422 Ranychar, /* any character except newline */
423 Rquote, /* the quote character */
424 Rbol, /* match beginning of line */
425 Reol, /* match end of line */
426 Roptional, /* match preceding expression optionally */
427 Rstar, /* match preceding expr zero or more times */
428 Rplus, /* match preceding expr one or more times */
429 Ror, /* match either of alternatives */
430 Ropenpar, /* opening parenthesis */
431 Rclosepar, /* closing parenthesis */
432 Rmemory, /* match memory register */
433 Rextended_memory, /* \vnn to match registers 10-99 */
434 Ropenset, /* open set. Internal syntax hard-coded below. */
435 /* the following are gnu extensions to "normal" regexp syntax */
436 Rbegbuf, /* beginning of buffer */
437 Rendbuf, /* end of buffer */
438 Rwordchar, /* word character */
439 Rnotwordchar, /* not word character */
440 Rwordbeg, /* beginning of word */
441 Rwordend, /* end of word */
442 Rwordbound, /* word bound */
443 Rnotwordbound, /* not word bound */
447 static int re_compile_initialized = 0;
448 static int regexp_syntax = RE_SYNTAX_EGREP;
449 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
450 static unsigned char plain_ops[256];
451 static unsigned char quoted_ops[256];
452 static unsigned char precedences[Rnum_ops];
453 static int regexp_context_indep_ops;
454 static int regexp_ansi_sequences;
456 #define NUM_LEVELS 5 /* number of precedence levels in use */
457 #define MAX_NESTING 100 /* max nesting level of operators */
459 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
461 unsigned char re_syntax_table[256];
463 void re_compile_initialize(void)
467 static int syntax_table_inited = 0;
469 if (!syntax_table_inited) {
470 syntax_table_inited = 1;
471 memset(re_syntax_table, 0, 256);
472 for (a = 'a'; a <= 'z'; a++)
473 re_syntax_table[a] = Sword;
474 for (a = 'A'; a <= 'Z'; a++)
475 re_syntax_table[a] = Sword;
476 for (a = '0'; a <= '9'; a++)
477 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
478 for (a = '0'; a <= '7'; a++)
479 re_syntax_table[a] |= Soctaldigit;
480 for (a = 'A'; a <= 'F'; a++)
481 re_syntax_table[a] |= Shexdigit;
482 for (a = 'a'; a <= 'f'; a++)
483 re_syntax_table[a] |= Shexdigit;
484 re_syntax_table[(int)'_'] = Sword;
485 for (a = 9; a <= 13; a++)
486 re_syntax_table[a] = Swhitespace;
487 re_syntax_table[(int)' '] = Swhitespace;
489 re_compile_initialized = 1;
490 for (a = 0; a < 256; a++) {
491 plain_ops[a] = Rnormal;
492 quoted_ops[a] = Rnormal;
494 for (a = '0'; a <= '9'; a++)
495 quoted_ops[a] = Rmemory;
496 plain_ops[(int)'\134'] = Rquote;
497 if (regexp_syntax & RE_NO_BK_PARENS) {
498 plain_ops[(int)'('] = Ropenpar;
499 plain_ops[(int)')'] = Rclosepar;
501 quoted_ops[(int)'('] = Ropenpar;
502 quoted_ops[(int)')'] = Rclosepar;
504 if (regexp_syntax & RE_NO_BK_VBAR) {
505 plain_ops[(int)'\174'] = Ror;
507 quoted_ops[(int)'\174'] = Ror;
509 plain_ops[(int)'*'] = Rstar;
510 if (regexp_syntax & RE_BK_PLUS_QM) {
511 quoted_ops[(int)'+'] = Rplus;
512 quoted_ops[(int)'?'] = Roptional;
514 plain_ops[(int)'+'] = Rplus;
515 plain_ops[(int)'?'] = Roptional;
517 if (regexp_syntax & RE_NEWLINE_OR) {
518 plain_ops[(int)'\n'] = Ror;
520 plain_ops[(int)'\133'] = Ropenset;
521 plain_ops[(int)'\136'] = Rbol;
522 plain_ops[(int)'$'] = Reol;
523 plain_ops[(int)'.'] = Ranychar;
524 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
525 quoted_ops[(int)'w'] = Rwordchar;
526 quoted_ops[(int)'W'] = Rnotwordchar;
527 quoted_ops[(int)'<'] = Rwordbeg;
528 quoted_ops[(int)'>'] = Rwordend;
529 quoted_ops[(int)'b'] = Rwordbound;
530 quoted_ops[(int)'B'] = Rnotwordbound;
531 quoted_ops[(int)'`'] = Rbegbuf;
532 quoted_ops[(int)'\''] = Rendbuf;
534 if (regexp_syntax & RE_ANSI_HEX) {
535 quoted_ops[(int)'v'] = Rextended_memory;
537 for (a = 0; a < Rnum_ops; a++) {
540 if (regexp_syntax & RE_TIGHT_VBAR) {
541 precedences[Ror] = 3;
542 precedences[Rbol] = 2;
543 precedences[Reol] = 2;
545 precedences[Ror] = 2;
546 precedences[Rbol] = 3;
547 precedences[Reol] = 3;
549 precedences[Rclosepar] = 1;
550 precedences[Rend] = 0;
551 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
552 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
555 int re_set_syntax(int syntax) {
559 regexp_syntax = syntax;
560 re_syntax = syntax; /* Exported copy */
561 re_compile_initialize();
565 static int hex_char_to_decimal(int ch) {
566 if (ch >= '0' && ch <= '9')
568 if (ch >= 'a' && ch <= 'f')
569 return ch - 'a' + 10;
570 if (ch >= 'A' && ch <= 'F')
571 return ch - 'A' + 10;
575 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
576 unsigned char *visited,
577 unsigned char *can_be_null,
578 unsigned char *fastmap)
585 return; /* we have already been here */
588 switch (code[pos++]) {
599 for (a = 0; a < 256; a++)
603 syntaxcode = code[pos++];
604 for (a = 0; a < 256; a++)
605 if (SYNTAX(a) & syntaxcode)
609 syntaxcode = code[pos++];
610 for (a = 0; a < 256; a++)
611 if (!(SYNTAX(a) & syntaxcode))
615 fastmap[(int)'\n'] = 1;
616 if (*can_be_null == 0)
617 *can_be_null = 2; /* can match null, but only at end of buffer */
620 for (a = 0; a < 256 / 8; a++)
621 if (code[pos + a] != 0)
622 for (b = 0; b < 8; b++)
623 if (code[pos + a] & (1 << b))
624 fastmap[(a << 3) + b] = 1;
628 fastmap[(unsigned char)code[pos]] = 1;
631 for (a = 0; a < 256; a++)
640 for (a = 0; a < 256; a++)
645 case Cdummy_failure_jump:
646 case Cupdate_failure_jump:
648 a = (unsigned char)code[pos++];
649 a |= (unsigned char)code[pos++] << 8;
650 pos += (int)SHORT(a);
652 /* argh... the regexp contains empty loops. This is not
653 good, as this may cause a failure stack overflow when
654 matching. Oh well. */
655 /* this path leads nowhere; pursue other paths. */
661 a = (unsigned char)code[pos++];
662 a |= (unsigned char)code[pos++] << 8;
663 a = pos + (int)SHORT(a);
664 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
670 set_error("Unknown regex opcode: memory corrupted?");
676 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
677 int pos, unsigned char *can_be_null,
678 unsigned char *fastmap)
680 unsigned char small_visited[512], *visited;
682 if (used <= (int)sizeof(small_visited))
683 visited = small_visited;
685 visited = (unsigned char *)malloc(used);
690 memset(fastmap, 0, 256);
691 memset(visited, 0, used);
692 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
693 if (visited != small_visited)
698 void re_compile_fastmap(regex_t * bufp)
700 if (!bufp->fastmap || bufp->fastmap_accurate)
702 // assert(bufp->used > 0);
703 if (!re_do_compile_fastmap(bufp, bufp->buffer,
704 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
708 if (bufp->buffer[0] == Cbol)
709 bufp->anchor = 1; /* begline */
710 else if (bufp->buffer[0] == Cbegbuf)
711 bufp->anchor = 2; /* begbuf */
713 bufp->anchor = 0; /* none */
714 bufp->fastmap_accurate = 1;
720 * ... code for operand of star
722 * 2: ... code after star
724 * We change the star_jump to update_failure_jump if we can determine
725 * that it is safe to do so; otherwise we change it to an ordinary
732 * 2: ... code for operand of plus
734 * 3: ... code after plus
736 * For star_jump considerations this is processed identically to star.
740 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
742 unsigned char map[256];
743 unsigned char can_be_null;
749 int num_instructions = 0;
751 a = (unsigned char)*code++;
752 a |= (unsigned char)*code++ << 8;
755 p1 = code + a + 3; /* skip the failure_jump */
756 /* Check that the jump is within the pattern */
757 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
758 set_error("Regex VM jump out of bounds (failure_jump opt)");
761 // assert(p1[-3] == Cfailure_jump);
763 /* p1 points inside loop, p2 points to after loop */
764 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
765 (int)(p2 - bufp->buffer), &can_be_null, map))
766 goto make_normal_jump;
768 /* If we might introduce a new update point inside the
769 * loop, we can't optimize because then update_jump would
770 * update a wrong failure point. Thus we have to be
771 * quite careful here.
774 /* loop until we find something that consumes a character */
792 ch = (unsigned char)*p1++;
794 goto make_normal_jump;
797 for (b = 0; b < 256; b++)
798 if (b != '\n' && map[b])
799 goto make_normal_jump;
802 for (b = 0; b < 256; b++)
803 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
804 goto make_normal_jump;
808 goto make_normal_jump;
812 /* now we know that we can't backtrack. */
813 while (p1 != p2 - 3) {
842 case Cupdate_failure_jump:
843 case Cdummy_failure_jump:
844 goto make_normal_jump;
850 /* make_update_jump: */
852 a += 3; /* jump to after the Cfailure_jump */
853 code[0] = Cupdate_failure_jump;
856 if (num_instructions > 1)
858 // assert(num_instructions == 1);
859 /* if the only instruction matches a single character, we can do
861 p1 = code + 3 + a; /* start of sole instruction */
862 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
863 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
873 static int re_optimize(regex_t * bufp)
905 if (!re_optimize_star_jump(bufp, code)) {
909 case Cupdate_failure_jump:
911 case Cdummy_failure_jump:
922 #define NEXTCHAR(var) \
925 goto ends_prematurely; \
926 (var) = regex[pos]; \
930 #define ALLOC(amount) \
932 if (pattern_offset+(amount) > alloc) \
934 alloc += 256 + (amount); \
935 pattern = (unsigned char *)realloc(pattern, alloc); \
937 goto out_of_memory; \
941 #define STORE(ch) pattern[pattern_offset++] = (ch)
943 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
945 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
947 #define PUSH_LEVEL_STARTS \
948 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
949 starts_base += NUM_LEVELS; \
953 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
955 #define PUT_ADDR(offset,addr) \
957 int disp = (addr) - (offset) - 2; \
958 pattern[(offset)] = disp & 0xff; \
959 pattern[(offset)+1] = (disp>>8) & 0xff; \
962 #define INSERT_JUMP(pos,type,addr) \
964 int a, p = (pos), t = (type), ad = (addr); \
965 for (a = pattern_offset - 1; a >= p; a--) \
966 pattern[a + 3] = pattern[a]; \
969 pattern_offset += 3; \
972 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
976 bufp->allocated = alloc; \
977 bufp->buffer = pattern; \
978 bufp->used = pattern_offset; \
981 #define GETHEX(var) \
983 unsigned char gethex_ch, gethex_value; \
984 NEXTCHAR(gethex_ch); \
985 gethex_value = hex_char_to_decimal(gethex_ch); \
986 if (gethex_value == 16) \
988 NEXTCHAR(gethex_ch); \
989 gethex_ch = hex_char_to_decimal(gethex_ch); \
990 if (gethex_ch == 16) \
992 (var) = gethex_value * 16 + gethex_ch; \
995 #define ANSI_TRANSLATE(ch) \
1002 ch = 7; /* audible bell */ \
1008 ch = 8; /* backspace */ \
1014 ch = 12; /* form feed */ \
1020 ch = 10; /* line feed */ \
1026 ch = 13; /* carriage return */ \
1038 ch = 11; /* vertical tab */ \
1041 case 'x': /* hex code */ \
1049 /* other characters passed through */ \
1051 ch = translate[(unsigned char)ch]; \
1057 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1065 int pattern_offset = 0, alloc;
1066 int starts[NUM_LEVELS * MAX_NESTING];
1068 int future_jumps[MAX_NESTING];
1070 unsigned char ch = '\0';
1071 unsigned char *pattern;
1072 unsigned char *translate;
1075 int num_open_registers;
1076 int open_registers[RE_NREGS];
1077 int beginning_context;
1078 int size = strlen((char *)regex);
1080 if (!re_compile_initialized)
1081 re_compile_initialize();
1083 bufp->fastmap_accurate = 0;
1084 bufp->uses_registers = 1;
1085 bufp->num_registers = 1;
1086 translate = bufp->translate;
1087 pattern = bufp->buffer;
1088 alloc = bufp->allocated;
1089 if (alloc == 0 || pattern == NULL) {
1091 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1100 num_open_registers = 0;
1103 beginning_context = 1;
1105 /* we use Rend dummy to ensure that pending jumps are updated
1106 (due to low priority of Rend) before exiting the loop. */
1108 while (op != Rend) {
1114 ch = translate[(unsigned char)ch];
1115 op = plain_ops[(unsigned char)ch];
1118 op = quoted_ops[(unsigned char)ch];
1119 if (op == Rnormal && regexp_ansi_sequences)
1123 level = precedences[op];
1124 /* printf("ch='%c' op=%d level=%d current_level=%d
1125 curlevstart=%d\n", ch, op, level, current_level,
1126 CURRENT_LEVEL_START); */
1127 if (level > current_level) {
1128 for (current_level++; current_level < level; current_level++)
1131 } else if (level < current_level) {
1132 current_level = level;
1133 for (; num_jumps > 0 &&
1134 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1135 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1143 store_opcode_and_arg: /* opcode & ch must be set */
1157 set_error("Rquote");
1158 /*NOTREACHED*/ case Rbol:
1159 if (!beginning_context) {
1160 if (regexp_context_indep_ops)
1168 if (!((pos >= size) ||
1169 ((regexp_syntax & RE_NO_BK_VBAR) ?
1170 (regex[pos] == '\174') :
1171 (pos + 1 < size && regex[pos] == '\134' &&
1172 regex[pos + 1] == '\174')) ||
1173 ((regexp_syntax & RE_NO_BK_PARENS) ?
1174 (regex[pos] == ')') :
1175 (pos + 1 < size && regex[pos] == '\134' &&
1176 regex[pos + 1] == ')')))) {
1177 if (regexp_context_indep_ops)
1187 if (beginning_context) {
1188 if (regexp_context_indep_ops)
1193 if (CURRENT_LEVEL_START == pattern_offset)
1194 break; /* ignore empty patterns for ? */
1196 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1200 if (beginning_context) {
1201 if (regexp_context_indep_ops)
1206 if (CURRENT_LEVEL_START == pattern_offset)
1207 break; /* ignore empty patterns for + and * */
1209 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1210 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1211 if (op == Rplus) /* jump over initial failure_jump */
1212 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1213 CURRENT_LEVEL_START + 6);
1217 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1218 if (num_jumps >= MAX_NESTING)
1221 future_jumps[num_jumps++] = pattern_offset;
1229 if (next_register < RE_NREGS) {
1230 bufp->uses_registers = 1;
1232 STORE(Cstart_memory);
1233 STORE(next_register);
1234 open_registers[num_open_registers++] = next_register;
1235 bufp->num_registers++;
1245 if (paren_depth <= 0)
1246 goto parenthesis_error;
1248 current_level = precedences[Ropenpar];
1250 if (paren_depth < num_open_registers) {
1251 bufp->uses_registers = 1;
1254 num_open_registers--;
1255 STORE(open_registers[num_open_registers]);
1260 goto bad_match_register;
1261 if (!(ch >= '0' && ch <= '9')) {
1262 goto bad_match_register;
1264 bufp->uses_registers = 1;
1265 opcode = Cmatch_memory;
1267 goto store_opcode_and_arg;
1268 case Rextended_memory:
1270 if (ch < '0' || ch > '9')
1271 goto bad_match_register;
1273 if (a < '0' || a > '9')
1274 goto bad_match_register;
1275 ch = 10 * (a - '0') + ch - '0';
1276 if (ch == 0 || ch >= RE_NREGS)
1277 goto bad_match_register;
1278 bufp->uses_registers = 1;
1279 opcode = Cmatch_memory;
1280 goto store_opcode_and_arg;
1292 offset = pattern_offset;
1293 for (a = 0; a < 256 / 8; a++)
1297 ch = translate[(unsigned char)ch];
1302 ch = translate[(unsigned char)ch];
1308 while (ch != '\135' || firstchar) {
1310 if (regexp_ansi_sequences && ch == '\134') {
1315 for (a = prev; a <= (int)ch; a++)
1316 SETBIT(pattern, offset, a);
1319 } else if (prev != -1 && ch == '-')
1322 SETBIT(pattern, offset, ch);
1327 ch = translate[(unsigned char)ch];
1330 SETBIT(pattern, offset, '-');
1332 for (a = 0; a < 256 / 8; a++)
1333 pattern[offset + a] ^= 0xff;
1349 opcode = Csyntaxspec;
1351 goto store_opcode_and_arg;
1355 opcode = Cnotsyntaxspec;
1357 goto store_opcode_and_arg;
1371 opcode = Cwordbound;
1376 opcode = Cnotwordbound;
1384 beginning_context = (op == Ropenpar || op == Ror);
1386 if (starts_base != 0)
1387 goto parenthesis_error;
1388 // assert(num_jumps == 0);
1392 if (!re_optimize(bufp))
1393 return "Optimization error";
1398 return "Badly placed special character";
1402 return "Bad match register number";
1406 return "Bad hexadecimal number";
1410 return "Badly placed parenthesis";
1414 return "Out of memory";
1418 return "Regular expression ends prematurely";
1422 return "Regular expression too complex";
1430 #undef CURRENT_LEVEL_START
1431 #undef SET_LEVEL_START
1432 #undef PUSH_LEVEL_STARTS
1433 #undef POP_LEVEL_STARTS
1439 #define PREFETCH if (text == textend) goto fail
1441 #define NEXTCHAR(var) \
1443 var = (unsigned char)*text++; \
1445 var = translate[var]
1448 int regcomp(regex_t * bufp, const char *regex, int cflags)
1450 memset(bufp, 0, sizeof(regex_t));
1451 bufp->cflags = cflags;
1452 if (bufp->cflags & REG_ICASE) {
1453 char *p, *lcase = bstrdup(regex);
1454 for( p = lcase; *p ; p++) {
1457 re_compile_pattern(bufp, (unsigned char *)lcase);
1460 re_compile_pattern(bufp, (unsigned char *)regex);
1468 void re_registers_to_regmatch(regexp_registers_t old_regs,
1469 regmatch_t pmatch[],
1474 /* We have to set the last entry to -1 */
1475 nmatch = nmatch - 1;
1476 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1477 pmatch[i].rm_so = old_regs->start[i];
1478 pmatch[i].rm_eo = old_regs->end[i];
1481 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1484 int regexec(regex_t * preg, const char *string, size_t nmatch,
1485 regmatch_t pmatch[], int eflags)
1488 int len = strlen(string);
1489 struct re_registers regs;
1490 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1492 re_registers_to_regmatch(®s, pmatch, nmatch);
1494 /* stat is the start position in the string base 0 where
1495 * the pattern was found or negative if not found.
1497 return stat < 0 ? -1 : 0;
1500 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1502 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1506 void regfree(regex_t * preg)
1509 free_pool_memory(preg->lcase);
1514 preg->buffer = NULL;
1518 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1519 regexp_registers_t old_regs)
1521 unsigned char *code;
1522 unsigned char *translate;
1523 unsigned char *text;
1524 unsigned char *textstart;
1525 unsigned char *textend;
1531 unsigned char *regstart;
1532 unsigned char *regend;
1536 // assert(pos >= 0 && size >= 0);
1537 // assert(pos <= size);
1539 text = string + pos;
1541 textend = string + size;
1543 code = bufp->buffer;
1545 translate = bufp->translate;
1547 NEW_STATE(state, bufp->num_registers);
1553 match_end = text - textstart;
1555 old_regs->start[0] = pos;
1556 old_regs->end[0] = match_end;
1557 if (!bufp->uses_registers) {
1558 for (a = 1; a < RE_NREGS; a++) {
1559 old_regs->start[a] = -1;
1560 old_regs->end[a] = -1;
1563 for (a = 1; a < bufp->num_registers; a++) {
1564 if ((GET_REG_START(state, a) == NULL) ||
1565 (GET_REG_END(state, a) == NULL)) {
1566 old_regs->start[a] = -1;
1567 old_regs->end[a] = -1;
1570 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1571 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1573 for (; a < RE_NREGS; a++) {
1574 old_regs->start[a] = -1;
1575 old_regs->end[a] = -1;
1580 return match_end - pos;
1584 if (text == textstart || text[-1] == '\n')
1585 goto continue_matching;
1590 if (text == textend || *text == '\n')
1591 goto continue_matching;
1597 if (code[ch / 8] & (1 << (ch & 7))) {
1599 goto continue_matching;
1606 if (ch != (unsigned char)*code++)
1608 goto continue_matching;
1615 goto continue_matching;
1620 SET_REG_START(state, reg, text, goto error);
1621 goto continue_matching;
1626 SET_REG_END(state, reg, text, goto error);
1627 goto continue_matching;
1632 regstart = GET_REG_START(state, reg);
1633 regend = GET_REG_END(state, reg);
1634 if ((regstart == NULL) || (regend == NULL))
1635 goto fail; /* or should we just match nothing? */
1636 regsize = regend - regstart;
1638 if (regsize > (textend - text))
1641 for (; regstart < regend; regstart++, text++)
1642 if (translate[*regstart] != translate[*text])
1645 for (; regstart < regend; regstart++, text++)
1646 if (*regstart != *text)
1648 goto continue_matching;
1650 case Cupdate_failure_jump:
1652 UPDATE_FAILURE(state, text, goto error);
1653 /* fall to next case */
1655 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1659 a = (unsigned char)*code++;
1660 a |= (unsigned char)*code++ << 8;
1661 code += (int)SHORT(a);
1662 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1663 set_error("Regex VM jump out of bounds (Cjump)");
1667 goto continue_matching;
1669 case Cdummy_failure_jump:
1671 unsigned char *failuredest;
1673 a = (unsigned char)*code++;
1674 a |= (unsigned char)*code++ << 8;
1676 // assert(*code == Cfailure_jump);
1677 b = (unsigned char)code[1];
1678 b |= (unsigned char)code[2] << 8;
1679 failuredest = code + (int)SHORT(b) + 3;
1680 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1682 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1686 PUSH_FAILURE(state, failuredest, NULL, goto error);
1688 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1689 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1693 goto continue_matching;
1697 a = (unsigned char)*code++;
1698 a |= (unsigned char)*code++ << 8;
1700 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1701 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1705 PUSH_FAILURE(state, code + a, text, goto error);
1706 goto continue_matching;
1710 unsigned char *pinst;
1711 a = (unsigned char)*code++;
1712 a |= (unsigned char)*code++ << 8;
1715 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1716 set_error("Regex VM jump out of bounds (Crepeat1)");
1720 /* pinst is sole instruction in loop, and it matches a
1721 * single character. Since Crepeat1 was originally a
1722 * Cupdate_failure_jump, we also know that backtracking
1723 * is useless: so long as the single-character
1724 * expression matches, it must be used. Also, in the
1725 * case of +, we've already matched one character, so +
1726 * can't fail: nothing here can cause a failure. */
1731 while (text < textend) {
1732 ch = translate[(unsigned char)*text];
1733 if (pinst[ch / 8] & (1 << (ch & 7)))
1739 while (text < textend) {
1740 ch = (unsigned char)*text;
1741 if (pinst[ch / 8] & (1 << (ch & 7)))
1751 ch = (unsigned char)*pinst;
1753 while (text < textend && translate[(unsigned char)*text] == ch)
1756 while (text < textend && (unsigned char)*text == ch)
1763 while (text < textend && (unsigned char)*text != '\n')
1769 a = (unsigned char)*pinst;
1771 while (text < textend && (SYNTAX(translate[*text]) & a))
1774 while (text < textend && (SYNTAX(*text) & a))
1779 case Cnotsyntaxspec:
1781 a = (unsigned char)*pinst;
1783 while (text < textend && !(SYNTAX(translate[*text]) & a))
1786 while (text < textend && !(SYNTAX(*text) & a))
1794 set_error("Unknown regex opcode: memory corrupted?");
1798 /* due to the funky way + and * are compiled, the top
1799 * failure- stack entry at this point is actually a
1800 * success entry -- update it & pop it */
1801 UPDATE_FAILURE(state, text, goto error);
1802 goto fail; /* i.e., succeed <wink/sigh> */
1806 if (text == textstart)
1807 goto continue_matching;
1812 if (text == textend)
1813 goto continue_matching;
1818 if (text == textend)
1820 if (!(SYNTAX(*text) & Sword))
1822 if (text == textstart)
1823 goto continue_matching;
1824 if (!(SYNTAX(text[-1]) & Sword))
1825 goto continue_matching;
1830 if (text == textstart)
1832 if (!(SYNTAX(text[-1]) & Sword))
1834 if (text == textend)
1835 goto continue_matching;
1836 if (!(SYNTAX(*text) & Sword))
1837 goto continue_matching;
1842 /* Note: as in gnu regexp, this also matches at the
1843 * beginning and end of buffer. */
1845 if (text == textstart || text == textend)
1846 goto continue_matching;
1847 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1848 goto continue_matching;
1853 /* Note: as in gnu regexp, this never matches at the
1854 * beginning and end of buffer. */
1855 if (text == textstart || text == textend)
1857 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1858 goto continue_matching;
1864 if (!(SYNTAX(ch) & (unsigned char)*code++))
1866 goto continue_matching;
1868 case Cnotsyntaxspec:
1871 if (SYNTAX(ch) & (unsigned char)*code++)
1873 goto continue_matching;
1878 set_error("Unknown regex opcode: memory corrupted?");
1885 #if 0 /* This line is never reached --Guido */
1892 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1894 POP_FAILURE(state, code, text, goto done_matching, goto error);
1895 goto continue_matching;
1898 /* if(translated != NULL) */
1899 /* free(translated); */
1904 /* if (translated != NULL) */
1905 /* free(translated); */
1914 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1915 int range, regexp_registers_t regs)
1917 unsigned char *fastmap;
1918 unsigned char *translate;
1919 unsigned char *text;
1920 unsigned char *partstart;
1921 unsigned char *partend;
1924 unsigned char anchor;
1925 unsigned char *string = str;
1927 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1928 int len = strlen((const char *)str);
1930 bufp->lcase = get_pool_memory(PM_FNAME);
1932 bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1933 unsigned char *dst = (unsigned char *)bufp->lcase;
1935 *dst++ = tolower(*string++);
1938 string = (unsigned char *)bufp->lcase;
1941 // assert(size >= 0 && pos >= 0);
1942 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1944 fastmap = bufp->fastmap;
1945 translate = bufp->translate;
1946 if (fastmap && !bufp->fastmap_accurate) {
1947 re_compile_fastmap(bufp);
1952 anchor = bufp->anchor;
1953 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1969 for (; range >= 0; range--, pos += dir) {
1971 if (dir == 1) { /* searching forwards */
1973 text = string + pos;
1974 partend = string + size;
1977 while (text != partend &&
1978 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1981 while (text != partend && !fastmap[(unsigned char)*text])
1983 pos += text - partstart;
1984 range -= text - partstart;
1985 if (pos == size && bufp->can_be_null == 0)
1987 } else { /* searching backwards */
1988 text = string + pos;
1989 partstart = string + pos - range;
1992 while (text != partstart && !fastmap[(unsigned char)
1993 translate[(unsigned char)*text]])
1996 while (text != partstart && !fastmap[(unsigned char)*text])
1998 pos -= partend - text;
1999 range -= partend - text;
2002 if (anchor == 1) { /* anchored to begline */
2003 if (pos > 0 && (string[pos - 1] != '\n'))
2006 // assert(pos >= 0 && pos <= size);
2007 ret = re_match(bufp, string, size, pos, regs);
2019 ** c-file-style: "python"