3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
29 * This file modified to work with Bacula and C++ by
30 * Kern Sibbald, April 2006
32 * This file modified to work with REG_ICASE and Bacula by
33 * Eric Bollengier April 2007
36 Bacula(R) - The Network Backup Solution
38 Copyright (C) 2000-2015 Kern Sibbald
39 Copyright (C) 2006-2014 Free Software Foundation Europe e.V.
41 The original author of Bacula is Kern Sibbald, with contributions
42 from many others, a complete list can be found in the file AUTHORS.
44 You may use this file and others of this release according to the
45 license defined in the LICENSE file, which includes the Affero General
46 Public License, v3.0 ("AGPLv3") and some additional permissions and
47 terms pursuant to its AGPLv3 Section 7.
49 This notice must be preserved when any source code is
50 conveyed and/or propagated.
52 Bacula(R) is a registered trademark of Kern Sibbald.
59 #define set_error(x) bufp->errmsg=((char *)(x))
60 #define got_error bufp->errmsg!=NULL
62 /* The original code blithely assumed that sizeof(short) == 2. Not
63 * always true. Original instances of "(short)x" were replaced by
64 * SHORT(x), where SHORT is #defined below. */
66 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
68 /* The stack implementation is taken from an idea by Andrew Kuchling.
69 * It's a doubly linked list of arrays. The advantages of this over a
70 * simple linked list are that the number of mallocs required are
71 * reduced. It also makes it possible to statically allocate enough
72 * space so that small patterns don't ever need to call malloc.
74 * The advantages over a single array is that is periodically
75 * realloced when more space is needed is that we avoid ever copying
78 /* item_t is the basic stack element. Defined as a union of
79 * structures so that both registers, failure points, and counters can
80 * be pushed/popped from the stack. There's nothing built into the
81 * item to keep track of whether a certain stack item is a register, a
82 * failure point, or a counter. */
84 typedef union item_t {
105 #define B_STACK_PAGE_SIZE 256
106 #define NUM_REGISTERS 256
108 /* A 'page' of stack items. */
110 typedef struct item_page_t {
111 item_t items[B_STACK_PAGE_SIZE];
112 struct item_page_t *prev;
113 struct item_page_t *next;
117 typedef struct match_state {
118 /* The number of registers that have been pushed onto the stack
119 * since the last failure point. */
123 /* Used to control when registers need to be pushed onto the
128 /* The number of failure points on the stack. */
132 /* Storage for the registers. Each register consists of two
133 * pointers to characters. So register N is represented as
134 * start[N] and end[N]. The pointers must be converted to
135 * offsets from the beginning of the string before returning the
136 * registers to the calling program. */
138 unsigned char *start[NUM_REGISTERS];
139 unsigned char *end[NUM_REGISTERS];
141 /* Keeps track of whether a register has changed recently. */
143 int changed[NUM_REGISTERS];
145 /* Structure to encapsulate the stack. */
147 /* index into the current page. If index == 0 and you need
148 * to pop an item, move to the previous page and set index
149 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
150 * push a page. If index == B_STACK_PAGE_SIZE and you need
151 * to push a page move to the next page and set index =
152 * 0. If there is no new next page, allocate a new page
153 * and link it in. Otherwise, increment index to push a
157 item_page_t *current; /* Pointer to the current page. */
158 item_page_t first; /* First page is statically allocated. */
162 /* Initialize a state object */
164 /* #define NEW_STATE(state) \ */
165 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
166 /* state.stack.current = &state.stack.first; \ */
167 /* state.stack.first.prev = NULL; \ */
168 /* state.stack.first.next = NULL; \ */
169 /* state.stack.index = 0; \ */
170 /* state.level = 1 */
172 #define NEW_STATE(state, nregs) \
175 for (i = 0; i < nregs; i++) \
177 state.start[i] = NULL; \
178 state.end[i] = NULL; \
179 state.changed[i] = 0; \
181 state.stack.current = &state.stack.first; \
182 state.stack.first.prev = NULL; \
183 state.stack.first.next = NULL; \
184 state.stack.index = 0; \
191 /* Free any memory that might have been malloc'd */
193 #define FREE_STATE(state) \
194 while(state.stack.first.next != NULL) \
196 state.stack.current = state.stack.first.next; \
197 state.stack.first.next = state.stack.current->next; \
198 free(state.stack.current); \
201 /* Discard the top 'count' stack items. */
203 #define STACK_DISCARD(stack, count, on_error) \
204 stack.index -= count; \
205 while (stack.index < 0) \
207 if (stack.current->prev == NULL) \
209 stack.current = stack.current->prev; \
210 stack.index += B_STACK_PAGE_SIZE; \
213 /* Store a pointer to the previous item on the stack. Used to pop an
214 * item off of the stack. */
216 #define STACK_PREV(stack, top, on_error) \
217 if (stack.index == 0) \
219 if (stack.current->prev == NULL) \
221 stack.current = stack.current->prev; \
222 stack.index = B_STACK_PAGE_SIZE - 1; \
228 top = &(stack.current->items[stack.index])
230 /* Store a pointer to the next item on the stack. Used to push an item
231 * on to the stack. */
233 #define STACK_NEXT(stack, top, on_error) \
234 if (stack.index == B_STACK_PAGE_SIZE) \
236 if (stack.current->next == NULL) \
238 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
239 if (stack.current->next == NULL) \
241 stack.current->next->prev = stack.current; \
242 stack.current->next->next = NULL; \
244 stack.current = stack.current->next; \
247 top = &(stack.current->items[stack.index++])
249 /* Store a pointer to the item that is 'count' items back in the
250 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
251 * STACK_TOP(stack, top, on_error). */
253 #define STACK_BACK(stack, top, count, on_error) \
256 item_page_t *current; \
257 current = stack.current; \
258 index = stack.index - (count); \
261 if (current->prev == NULL) \
263 current = current->prev; \
264 index += B_STACK_PAGE_SIZE; \
266 top = &(current->items[index]); \
269 /* Store a pointer to the top item on the stack. Execute the
270 * 'on_error' code if there are no items on the stack. */
272 #define STACK_TOP(stack, top, on_error) \
273 if (stack.index == 0) \
275 if (stack.current->prev == NULL) \
277 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
281 top = &(stack.current->items[stack.index - 1]); \
284 /* Test to see if the stack is empty */
286 #define STACK_EMPTY(stack) ((stack.index == 0) && \
287 (stack.current->prev == NULL))
289 /* Return the start of register 'reg' */
291 #define GET_REG_START(state, reg) (state.start[reg])
293 /* Return the end of register 'reg' */
295 #define GET_REG_END(state, reg) (state.end[reg])
297 /* Set the start of register 'reg'. If the state of the register needs
298 * saving, push it on the stack. */
300 #define SET_REG_START(state, reg, text, on_error) \
301 if(state.changed[reg] < state.level) \
304 STACK_NEXT(state.stack, item, on_error); \
305 item->reg.num = reg; \
306 item->reg.start = state.start[reg]; \
307 item->reg.end = state.end[reg]; \
308 item->reg.level = state.changed[reg]; \
309 state.changed[reg] = state.level; \
312 state.start[reg] = text
314 /* Set the end of register 'reg'. If the state of the register needs
315 * saving, push it on the stack. */
317 #define SET_REG_END(state, reg, text, on_error) \
318 if(state.changed[reg] < state.level) \
321 STACK_NEXT(state.stack, item, on_error); \
322 item->reg.num = reg; \
323 item->reg.start = state.start[reg]; \
324 item->reg.end = state.end[reg]; \
325 item->reg.level = state.changed[reg]; \
326 state.changed[reg] = state.level; \
329 state.end[reg] = text
331 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
334 STACK_NEXT(state.stack, item, on_error); \
335 item->fail.code = xcode; \
336 item->fail.text = xtext; \
337 item->fail.count = state.count; \
338 item->fail.level = state.level; \
339 item->fail.phantom = 0; \
345 /* Update the last failure point with a new position in the text. */
347 #define UPDATE_FAILURE(state, xtext, on_error) \
350 STACK_BACK(state.stack, item, state.count + 1, on_error); \
351 if (!item->fail.phantom) \
354 STACK_NEXT(state.stack, item2, on_error); \
355 item2->fail.code = item->fail.code; \
356 item2->fail.text = xtext; \
357 item2->fail.count = state.count; \
358 item2->fail.level = state.level; \
359 item2->fail.phantom = 1; \
366 STACK_DISCARD(state.stack, state.count, on_error); \
367 STACK_TOP(state.stack, item, on_error); \
368 item->fail.text = xtext; \
374 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
379 while(state.count > 0) \
381 STACK_PREV(state.stack, item, on_error); \
382 state.start[item->reg.num] = item->reg.start; \
383 state.end[item->reg.num] = item->reg.end; \
384 state.changed[item->reg.num] = item->reg.level; \
387 STACK_PREV(state.stack, item, on_empty); \
388 xcode = item->fail.code; \
389 xtext = item->fail.text; \
390 state.count = item->fail.count; \
391 state.level = item->fail.level; \
394 while (item->fail.text == NULL); \
397 enum regexp_compiled_ops { /* opcodes for compiled regexp */
398 Cend, /* end of pattern reached */
399 Cbol, /* beginning of line */
400 Ceol, /* end of line */
401 Cset, /* character set. Followed by 32 bytes of set. */
402 Cexact, /* followed by a byte to match */
403 Canychar, /* matches any character except newline */
404 Cstart_memory, /* set register start addr (followed by reg number) */
405 Cend_memory, /* set register end addr (followed by reg number) */
406 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
407 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
408 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
409 Cfailure_jump, /* jump to addr on failure */
410 Cupdate_failure_jump, /* update topmost failure point and jump */
411 Cdummy_failure_jump, /* push a dummy failure point and jump */
412 Cbegbuf, /* match at beginning of buffer */
413 Cendbuf, /* match at end of buffer */
414 Cwordbeg, /* match at beginning of word */
415 Cwordend, /* match at end of word */
416 Cwordbound, /* match if at word boundary */
417 Cnotwordbound, /* match if not at word boundary */
418 Csyntaxspec, /* matches syntax code (1 byte follows) */
419 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
423 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
424 Rend, /* special code for end of regexp */
425 Rnormal, /* normal character */
426 Ranychar, /* any character except newline */
427 Rquote, /* the quote character */
428 Rbol, /* match beginning of line */
429 Reol, /* match end of line */
430 Roptional, /* match preceding expression optionally */
431 Rstar, /* match preceding expr zero or more times */
432 Rplus, /* match preceding expr one or more times */
433 Ror, /* match either of alternatives */
434 Ropenpar, /* opening parenthesis */
435 Rclosepar, /* closing parenthesis */
436 Rmemory, /* match memory register */
437 Rextended_memory, /* \vnn to match registers 10-99 */
438 Ropenset, /* open set. Internal syntax hard-coded below. */
439 /* the following are gnu extensions to "normal" regexp syntax */
440 Rbegbuf, /* beginning of buffer */
441 Rendbuf, /* end of buffer */
442 Rwordchar, /* word character */
443 Rnotwordchar, /* not word character */
444 Rwordbeg, /* beginning of word */
445 Rwordend, /* end of word */
446 Rwordbound, /* word bound */
447 Rnotwordbound, /* not word bound */
451 static int re_compile_initialized = 0;
452 static int regexp_syntax = RE_SYNTAX_EGREP;
453 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
454 static unsigned char plain_ops[256];
455 static unsigned char quoted_ops[256];
456 static unsigned char precedences[Rnum_ops];
457 static int regexp_context_indep_ops;
458 static int regexp_ansi_sequences;
460 #define NUM_LEVELS 5 /* number of precedence levels in use */
461 #define MAX_NESTING 100 /* max nesting level of operators */
463 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
465 unsigned char re_syntax_table[256];
467 void re_compile_initialize(void)
471 static int syntax_table_inited = 0;
473 if (!syntax_table_inited) {
474 syntax_table_inited = 1;
475 memset(re_syntax_table, 0, 256);
476 for (a = 'a'; a <= 'z'; a++)
477 re_syntax_table[a] = Sword;
478 for (a = 'A'; a <= 'Z'; a++)
479 re_syntax_table[a] = Sword;
480 for (a = '0'; a <= '9'; a++)
481 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
482 for (a = '0'; a <= '7'; a++)
483 re_syntax_table[a] |= Soctaldigit;
484 for (a = 'A'; a <= 'F'; a++)
485 re_syntax_table[a] |= Shexdigit;
486 for (a = 'a'; a <= 'f'; a++)
487 re_syntax_table[a] |= Shexdigit;
488 re_syntax_table[(int)'_'] = Sword;
489 for (a = 9; a <= 13; a++)
490 re_syntax_table[a] = Swhitespace;
491 re_syntax_table[(int)' '] = Swhitespace;
493 re_compile_initialized = 1;
494 for (a = 0; a < 256; a++) {
495 plain_ops[a] = Rnormal;
496 quoted_ops[a] = Rnormal;
498 for (a = '0'; a <= '9'; a++)
499 quoted_ops[a] = Rmemory;
500 plain_ops[(int)'\134'] = Rquote;
501 if (regexp_syntax & RE_NO_BK_PARENS) {
502 plain_ops[(int)'('] = Ropenpar;
503 plain_ops[(int)')'] = Rclosepar;
505 quoted_ops[(int)'('] = Ropenpar;
506 quoted_ops[(int)')'] = Rclosepar;
508 if (regexp_syntax & RE_NO_BK_VBAR) {
509 plain_ops[(int)'\174'] = Ror;
511 quoted_ops[(int)'\174'] = Ror;
513 plain_ops[(int)'*'] = Rstar;
514 if (regexp_syntax & RE_BK_PLUS_QM) {
515 quoted_ops[(int)'+'] = Rplus;
516 quoted_ops[(int)'?'] = Roptional;
518 plain_ops[(int)'+'] = Rplus;
519 plain_ops[(int)'?'] = Roptional;
521 if (regexp_syntax & RE_NEWLINE_OR) {
522 plain_ops[(int)'\n'] = Ror;
524 plain_ops[(int)'\133'] = Ropenset;
525 plain_ops[(int)'\136'] = Rbol;
526 plain_ops[(int)'$'] = Reol;
527 plain_ops[(int)'.'] = Ranychar;
528 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
529 quoted_ops[(int)'w'] = Rwordchar;
530 quoted_ops[(int)'W'] = Rnotwordchar;
531 quoted_ops[(int)'<'] = Rwordbeg;
532 quoted_ops[(int)'>'] = Rwordend;
533 quoted_ops[(int)'b'] = Rwordbound;
534 quoted_ops[(int)'B'] = Rnotwordbound;
535 quoted_ops[(int)'`'] = Rbegbuf;
536 quoted_ops[(int)'\''] = Rendbuf;
538 if (regexp_syntax & RE_ANSI_HEX) {
539 quoted_ops[(int)'v'] = Rextended_memory;
541 for (a = 0; a < Rnum_ops; a++) {
544 if (regexp_syntax & RE_TIGHT_VBAR) {
545 precedences[Ror] = 3;
546 precedences[Rbol] = 2;
547 precedences[Reol] = 2;
549 precedences[Ror] = 2;
550 precedences[Rbol] = 3;
551 precedences[Reol] = 3;
553 precedences[Rclosepar] = 1;
554 precedences[Rend] = 0;
555 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
556 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
559 int re_set_syntax(int syntax) {
563 regexp_syntax = syntax;
564 re_syntax = syntax; /* Exported copy */
565 re_compile_initialize();
569 static int hex_char_to_decimal(int ch) {
570 if (ch >= '0' && ch <= '9')
572 if (ch >= 'a' && ch <= 'f')
573 return ch - 'a' + 10;
574 if (ch >= 'A' && ch <= 'F')
575 return ch - 'A' + 10;
579 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
580 unsigned char *visited,
581 unsigned char *can_be_null,
582 unsigned char *fastmap)
589 return; /* we have already been here */
592 switch (code[pos++]) {
603 for (a = 0; a < 256; a++)
607 syntaxcode = code[pos++];
608 for (a = 0; a < 256; a++)
609 if (SYNTAX(a) & syntaxcode)
613 syntaxcode = code[pos++];
614 for (a = 0; a < 256; a++)
615 if (!(SYNTAX(a) & syntaxcode))
619 fastmap[(int)'\n'] = 1;
620 if (*can_be_null == 0)
621 *can_be_null = 2; /* can match null, but only at end of buffer */
624 for (a = 0; a < 256 / 8; a++)
625 if (code[pos + a] != 0)
626 for (b = 0; b < 8; b++)
627 if (code[pos + a] & (1 << b))
628 fastmap[(a << 3) + b] = 1;
632 fastmap[(unsigned char)code[pos]] = 1;
635 for (a = 0; a < 256; a++)
644 for (a = 0; a < 256; a++)
649 case Cdummy_failure_jump:
650 case Cupdate_failure_jump:
652 a = (unsigned char)code[pos++];
653 a |= (unsigned char)code[pos++] << 8;
654 pos += (int)SHORT(a);
656 /* argh... the regexp contains empty loops. This is not
657 good, as this may cause a failure stack overflow when
658 matching. Oh well. */
659 /* this path leads nowhere; pursue other paths. */
665 a = (unsigned char)code[pos++];
666 a |= (unsigned char)code[pos++] << 8;
667 a = pos + (int)SHORT(a);
668 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
674 set_error("Unknown regex opcode: memory corrupted?");
680 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
681 int pos, unsigned char *can_be_null,
682 unsigned char *fastmap)
684 unsigned char small_visited[512], *visited;
686 if (used <= (int)sizeof(small_visited))
687 visited = small_visited;
689 visited = (unsigned char *)malloc(used);
694 memset(fastmap, 0, 256);
695 memset(visited, 0, used);
696 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
697 if (visited != small_visited)
702 void re_compile_fastmap(regex_t * bufp)
704 if (!bufp->fastmap || bufp->fastmap_accurate)
706 // assert(bufp->used > 0);
707 if (!re_do_compile_fastmap(bufp, bufp->buffer,
708 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
712 if (bufp->buffer[0] == Cbol)
713 bufp->anchor = 1; /* begline */
714 else if (bufp->buffer[0] == Cbegbuf)
715 bufp->anchor = 2; /* begbuf */
717 bufp->anchor = 0; /* none */
718 bufp->fastmap_accurate = 1;
724 * ... code for operand of star
726 * 2: ... code after star
728 * We change the star_jump to update_failure_jump if we can determine
729 * that it is safe to do so; otherwise we change it to an ordinary
736 * 2: ... code for operand of plus
738 * 3: ... code after plus
740 * For star_jump considerations this is processed identically to star.
744 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
746 unsigned char map[256];
747 unsigned char can_be_null;
753 int num_instructions = 0;
755 a = (unsigned char)*code++;
756 a |= (unsigned char)*code++ << 8;
759 p1 = code + a + 3; /* skip the failure_jump */
760 /* Check that the jump is within the pattern */
761 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
762 set_error("Regex VM jump out of bounds (failure_jump opt)");
765 // assert(p1[-3] == Cfailure_jump);
767 /* p1 points inside loop, p2 points to after loop */
768 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
769 (int)(p2 - bufp->buffer), &can_be_null, map))
770 goto make_normal_jump;
772 /* If we might introduce a new update point inside the
773 * loop, we can't optimize because then update_jump would
774 * update a wrong failure point. Thus we have to be
775 * quite careful here.
778 /* loop until we find something that consumes a character */
796 ch = (unsigned char)*p1++;
798 goto make_normal_jump;
801 for (b = 0; b < 256; b++)
802 if (b != '\n' && map[b])
803 goto make_normal_jump;
806 for (b = 0; b < 256; b++)
807 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
808 goto make_normal_jump;
812 goto make_normal_jump;
816 /* now we know that we can't backtrack. */
817 while (p1 != p2 - 3) {
846 case Cupdate_failure_jump:
847 case Cdummy_failure_jump:
848 goto make_normal_jump;
854 /* make_update_jump: */
856 a += 3; /* jump to after the Cfailure_jump */
857 code[0] = Cupdate_failure_jump;
860 if (num_instructions > 1)
862 // assert(num_instructions == 1);
863 /* if the only instruction matches a single character, we can do
865 p1 = code + 3 + a; /* start of sole instruction */
866 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
867 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
877 static int re_optimize(regex_t * bufp)
909 if (!re_optimize_star_jump(bufp, code)) {
913 case Cupdate_failure_jump:
915 case Cdummy_failure_jump:
926 #define NEXTCHAR(var) \
929 goto ends_prematurely; \
930 (var) = regex[pos]; \
934 #define ALLOC(amount) \
936 if (pattern_offset+(amount) > alloc) \
938 alloc += 256 + (amount); \
939 pattern = (unsigned char *)realloc(pattern, alloc); \
941 goto out_of_memory; \
945 #define STORE(ch) pattern[pattern_offset++] = (ch)
947 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
949 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
951 #define PUSH_LEVEL_STARTS \
952 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
953 starts_base += NUM_LEVELS; \
957 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
959 #define PUT_ADDR(offset,addr) \
961 int disp = (addr) - (offset) - 2; \
962 pattern[(offset)] = disp & 0xff; \
963 pattern[(offset)+1] = (disp>>8) & 0xff; \
966 #define INSERT_JUMP(pos,type,addr) \
968 int a, p = (pos), t = (type), ad = (addr); \
969 for (a = pattern_offset - 1; a >= p; a--) \
970 pattern[a + 3] = pattern[a]; \
973 pattern_offset += 3; \
976 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
980 bufp->allocated = alloc; \
981 bufp->buffer = pattern; \
982 bufp->used = pattern_offset; \
985 #define GETHEX(var) \
987 unsigned char gethex_ch, gethex_value; \
988 NEXTCHAR(gethex_ch); \
989 gethex_value = hex_char_to_decimal(gethex_ch); \
990 if (gethex_value == 16) \
992 NEXTCHAR(gethex_ch); \
993 gethex_ch = hex_char_to_decimal(gethex_ch); \
994 if (gethex_ch == 16) \
996 (var) = gethex_value * 16 + gethex_ch; \
999 #define ANSI_TRANSLATE(ch) \
1006 ch = 7; /* audible bell */ \
1012 ch = 8; /* backspace */ \
1018 ch = 12; /* form feed */ \
1024 ch = 10; /* line feed */ \
1030 ch = 13; /* carriage return */ \
1042 ch = 11; /* vertical tab */ \
1045 case 'x': /* hex code */ \
1053 /* other characters passed through */ \
1055 ch = translate[(unsigned char)ch]; \
1061 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1069 int pattern_offset = 0, alloc;
1070 int starts[NUM_LEVELS * MAX_NESTING];
1072 int future_jumps[MAX_NESTING];
1074 unsigned char ch = '\0';
1075 unsigned char *pattern;
1076 unsigned char *translate;
1079 int num_open_registers;
1080 int open_registers[RE_NREGS];
1081 int beginning_context;
1082 int size = strlen((char *)regex);
1084 if (!re_compile_initialized)
1085 re_compile_initialize();
1087 bufp->fastmap_accurate = 0;
1088 bufp->uses_registers = 1;
1089 bufp->num_registers = 1;
1090 translate = bufp->translate;
1091 pattern = bufp->buffer;
1092 alloc = bufp->allocated;
1093 if (alloc == 0 || pattern == NULL) {
1095 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1104 num_open_registers = 0;
1107 beginning_context = 1;
1109 /* we use Rend dummy to ensure that pending jumps are updated
1110 (due to low priority of Rend) before exiting the loop. */
1112 while (op != Rend) {
1118 ch = translate[(unsigned char)ch];
1119 op = plain_ops[(unsigned char)ch];
1122 op = quoted_ops[(unsigned char)ch];
1123 if (op == Rnormal && regexp_ansi_sequences)
1127 level = precedences[op];
1128 /* printf("ch='%c' op=%d level=%d current_level=%d
1129 curlevstart=%d\n", ch, op, level, current_level,
1130 CURRENT_LEVEL_START); */
1131 if (level > current_level) {
1132 for (current_level++; current_level < level; current_level++)
1135 } else if (level < current_level) {
1136 current_level = level;
1137 for (; num_jumps > 0 &&
1138 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1139 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1147 store_opcode_and_arg: /* opcode & ch must be set */
1161 set_error("Rquote");
1162 /*NOTREACHED*/ case Rbol:
1163 if (!beginning_context) {
1164 if (regexp_context_indep_ops)
1172 if (!((pos >= size) ||
1173 ((regexp_syntax & RE_NO_BK_VBAR) ?
1174 (regex[pos] == '\174') :
1175 (pos + 1 < size && regex[pos] == '\134' &&
1176 regex[pos + 1] == '\174')) ||
1177 ((regexp_syntax & RE_NO_BK_PARENS) ?
1178 (regex[pos] == ')') :
1179 (pos + 1 < size && regex[pos] == '\134' &&
1180 regex[pos + 1] == ')')))) {
1181 if (regexp_context_indep_ops)
1191 if (beginning_context) {
1192 if (regexp_context_indep_ops)
1197 if (CURRENT_LEVEL_START == pattern_offset)
1198 break; /* ignore empty patterns for ? */
1200 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1204 if (beginning_context) {
1205 if (regexp_context_indep_ops)
1210 if (CURRENT_LEVEL_START == pattern_offset)
1211 break; /* ignore empty patterns for + and * */
1213 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1214 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1215 if (op == Rplus) /* jump over initial failure_jump */
1216 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1217 CURRENT_LEVEL_START + 6);
1221 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1222 if (num_jumps >= MAX_NESTING)
1225 future_jumps[num_jumps++] = pattern_offset;
1233 if (next_register < RE_NREGS) {
1234 bufp->uses_registers = 1;
1236 STORE(Cstart_memory);
1237 STORE(next_register);
1238 open_registers[num_open_registers++] = next_register;
1239 bufp->num_registers++;
1249 if (paren_depth <= 0)
1250 goto parenthesis_error;
1252 current_level = precedences[Ropenpar];
1254 if (paren_depth < num_open_registers) {
1255 bufp->uses_registers = 1;
1258 num_open_registers--;
1259 STORE(open_registers[num_open_registers]);
1264 goto bad_match_register;
1265 if (!(ch >= '0' && ch <= '9')) {
1266 goto bad_match_register;
1268 bufp->uses_registers = 1;
1269 opcode = Cmatch_memory;
1271 goto store_opcode_and_arg;
1272 case Rextended_memory:
1274 if (ch < '0' || ch > '9')
1275 goto bad_match_register;
1277 if (a < '0' || a > '9')
1278 goto bad_match_register;
1279 ch = 10 * (a - '0') + ch - '0';
1280 if (ch == 0 || ch >= RE_NREGS)
1281 goto bad_match_register;
1282 bufp->uses_registers = 1;
1283 opcode = Cmatch_memory;
1284 goto store_opcode_and_arg;
1296 offset = pattern_offset;
1297 for (a = 0; a < 256 / 8; a++)
1301 ch = translate[(unsigned char)ch];
1306 ch = translate[(unsigned char)ch];
1312 while (ch != '\135' || firstchar) {
1314 if (regexp_ansi_sequences && ch == '\134') {
1319 for (a = prev; a <= (int)ch; a++)
1320 SETBIT(pattern, offset, a);
1323 } else if (prev != -1 && ch == '-')
1326 SETBIT(pattern, offset, ch);
1331 ch = translate[(unsigned char)ch];
1334 SETBIT(pattern, offset, '-');
1336 for (a = 0; a < 256 / 8; a++)
1337 pattern[offset + a] ^= 0xff;
1353 opcode = Csyntaxspec;
1355 goto store_opcode_and_arg;
1359 opcode = Cnotsyntaxspec;
1361 goto store_opcode_and_arg;
1375 opcode = Cwordbound;
1380 opcode = Cnotwordbound;
1388 beginning_context = (op == Ropenpar || op == Ror);
1390 if (starts_base != 0)
1391 goto parenthesis_error;
1392 // assert(num_jumps == 0);
1396 if (!re_optimize(bufp))
1397 return "Optimization error";
1402 return "Badly placed special character";
1406 return "Bad match register number";
1410 return "Bad hexadecimal number";
1414 return "Badly placed parenthesis";
1418 return "Out of memory";
1422 return "Regular expression ends prematurely";
1426 return "Regular expression too complex";
1434 #undef CURRENT_LEVEL_START
1435 #undef SET_LEVEL_START
1436 #undef PUSH_LEVEL_STARTS
1437 #undef POP_LEVEL_STARTS
1443 #define PREFETCH if (text == textend) goto fail
1445 #define NEXTCHAR(var) \
1447 var = (unsigned char)*text++; \
1449 var = translate[var]
1452 int regcomp(regex_t * bufp, const char *regex, int cflags)
1454 memset(bufp, 0, sizeof(regex_t));
1455 bufp->cflags = cflags;
1456 if (bufp->cflags & REG_ICASE) {
1457 char *p, *lcase = bstrdup(regex);
1458 for( p = lcase; *p ; p++) {
1461 re_compile_pattern(bufp, (unsigned char *)lcase);
1464 re_compile_pattern(bufp, (unsigned char *)regex);
1472 void re_registers_to_regmatch(regexp_registers_t old_regs,
1473 regmatch_t pmatch[],
1478 /* We have to set the last entry to -1 */
1479 nmatch = nmatch - 1;
1480 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1481 pmatch[i].rm_so = old_regs->start[i];
1482 pmatch[i].rm_eo = old_regs->end[i];
1485 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1488 int regexec(regex_t * preg, const char *string, size_t nmatch,
1489 regmatch_t pmatch[], int eflags)
1492 int len = strlen(string);
1493 struct re_registers regs;
1494 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1495 if (stat >= 0 && nmatch > 0) {
1496 re_registers_to_regmatch(®s, pmatch, nmatch);
1498 /* stat is the start position in the string base 0 where
1499 * the pattern was found or negative if not found.
1501 return stat < 0 ? -1 : 0;
1504 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1506 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1510 void regfree(regex_t * preg)
1513 free_pool_memory(preg->lcase);
1518 preg->buffer = NULL;
1522 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1523 regexp_registers_t old_regs)
1525 unsigned char *code;
1526 unsigned char *translate;
1527 unsigned char *text;
1528 unsigned char *textstart;
1529 unsigned char *textend;
1535 unsigned char *regstart;
1536 unsigned char *regend;
1540 // assert(pos >= 0 && size >= 0);
1541 // assert(pos <= size);
1543 text = string + pos;
1545 textend = string + size;
1547 code = bufp->buffer;
1549 translate = bufp->translate;
1551 NEW_STATE(state, bufp->num_registers);
1557 match_end = text - textstart;
1559 old_regs->start[0] = pos;
1560 old_regs->end[0] = match_end;
1561 if (!bufp->uses_registers) {
1562 for (a = 1; a < RE_NREGS; a++) {
1563 old_regs->start[a] = -1;
1564 old_regs->end[a] = -1;
1567 for (a = 1; a < bufp->num_registers; a++) {
1568 if ((GET_REG_START(state, a) == NULL) ||
1569 (GET_REG_END(state, a) == NULL)) {
1570 old_regs->start[a] = -1;
1571 old_regs->end[a] = -1;
1574 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1575 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1577 for (; a < RE_NREGS; a++) {
1578 old_regs->start[a] = -1;
1579 old_regs->end[a] = -1;
1584 return match_end - pos;
1588 if (text == textstart || text[-1] == '\n')
1589 goto continue_matching;
1594 if (text == textend || *text == '\n')
1595 goto continue_matching;
1601 if (code[ch / 8] & (1 << (ch & 7))) {
1603 goto continue_matching;
1610 if (ch != (unsigned char)*code++)
1612 goto continue_matching;
1619 goto continue_matching;
1624 SET_REG_START(state, reg, text, goto error);
1625 goto continue_matching;
1630 SET_REG_END(state, reg, text, goto error);
1631 goto continue_matching;
1636 regstart = GET_REG_START(state, reg);
1637 regend = GET_REG_END(state, reg);
1638 if ((regstart == NULL) || (regend == NULL))
1639 goto fail; /* or should we just match nothing? */
1640 regsize = regend - regstart;
1642 if (regsize > (textend - text))
1645 for (; regstart < regend; regstart++, text++)
1646 if (translate[*regstart] != translate[*text])
1649 for (; regstart < regend; regstart++, text++)
1650 if (*regstart != *text)
1652 goto continue_matching;
1654 case Cupdate_failure_jump:
1656 UPDATE_FAILURE(state, text, goto error);
1657 /* fall to next case */
1659 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1663 a = (unsigned char)*code++;
1664 a |= (unsigned char)*code++ << 8;
1665 code += (int)SHORT(a);
1666 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1667 set_error("Regex VM jump out of bounds (Cjump)");
1671 goto continue_matching;
1673 case Cdummy_failure_jump:
1675 unsigned char *failuredest;
1677 a = (unsigned char)*code++;
1678 a |= (unsigned char)*code++ << 8;
1680 // assert(*code == Cfailure_jump);
1681 b = (unsigned char)code[1];
1682 b |= (unsigned char)code[2] << 8;
1683 failuredest = code + (int)SHORT(b) + 3;
1684 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1686 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1690 PUSH_FAILURE(state, failuredest, NULL, goto error);
1692 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1693 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1697 goto continue_matching;
1701 a = (unsigned char)*code++;
1702 a |= (unsigned char)*code++ << 8;
1704 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1705 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1709 PUSH_FAILURE(state, code + a, text, goto error);
1710 goto continue_matching;
1714 unsigned char *pinst;
1715 a = (unsigned char)*code++;
1716 a |= (unsigned char)*code++ << 8;
1719 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1720 set_error("Regex VM jump out of bounds (Crepeat1)");
1724 /* pinst is sole instruction in loop, and it matches a
1725 * single character. Since Crepeat1 was originally a
1726 * Cupdate_failure_jump, we also know that backtracking
1727 * is useless: so long as the single-character
1728 * expression matches, it must be used. Also, in the
1729 * case of +, we've already matched one character, so +
1730 * can't fail: nothing here can cause a failure. */
1735 while (text < textend) {
1736 ch = translate[(unsigned char)*text];
1737 if (pinst[ch / 8] & (1 << (ch & 7)))
1743 while (text < textend) {
1744 ch = (unsigned char)*text;
1745 if (pinst[ch / 8] & (1 << (ch & 7)))
1755 ch = (unsigned char)*pinst;
1757 while (text < textend && translate[(unsigned char)*text] == ch)
1760 while (text < textend && (unsigned char)*text == ch)
1767 while (text < textend && (unsigned char)*text != '\n')
1773 a = (unsigned char)*pinst;
1775 while (text < textend && (SYNTAX(translate[*text]) & a))
1778 while (text < textend && (SYNTAX(*text) & a))
1783 case Cnotsyntaxspec:
1785 a = (unsigned char)*pinst;
1787 while (text < textend && !(SYNTAX(translate[*text]) & a))
1790 while (text < textend && !(SYNTAX(*text) & a))
1798 set_error("Unknown regex opcode: memory corrupted?");
1802 /* due to the funky way + and * are compiled, the top
1803 * failure- stack entry at this point is actually a
1804 * success entry -- update it & pop it */
1805 UPDATE_FAILURE(state, text, goto error);
1806 goto fail; /* i.e., succeed <wink/sigh> */
1810 if (text == textstart)
1811 goto continue_matching;
1816 if (text == textend)
1817 goto continue_matching;
1822 if (text == textend)
1824 if (!(SYNTAX(*text) & Sword))
1826 if (text == textstart)
1827 goto continue_matching;
1828 if (!(SYNTAX(text[-1]) & Sword))
1829 goto continue_matching;
1834 if (text == textstart)
1836 if (!(SYNTAX(text[-1]) & Sword))
1838 if (text == textend)
1839 goto continue_matching;
1840 if (!(SYNTAX(*text) & Sword))
1841 goto continue_matching;
1846 /* Note: as in gnu regexp, this also matches at the
1847 * beginning and end of buffer. */
1849 if (text == textstart || text == textend)
1850 goto continue_matching;
1851 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1852 goto continue_matching;
1857 /* Note: as in gnu regexp, this never matches at the
1858 * beginning and end of buffer. */
1859 if (text == textstart || text == textend)
1861 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1862 goto continue_matching;
1868 if (!(SYNTAX(ch) & (unsigned char)*code++))
1870 goto continue_matching;
1872 case Cnotsyntaxspec:
1875 if (SYNTAX(ch) & (unsigned char)*code++)
1877 goto continue_matching;
1882 set_error("Unknown regex opcode: memory corrupted?");
1889 #if 0 /* This line is never reached --Guido */
1896 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1898 POP_FAILURE(state, code, text, goto done_matching, goto error);
1899 goto continue_matching;
1902 /* if(translated != NULL) */
1903 /* free(translated); */
1908 /* if (translated != NULL) */
1909 /* free(translated); */
1918 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1919 int range, regexp_registers_t regs)
1921 unsigned char *fastmap;
1922 unsigned char *translate;
1923 unsigned char *text;
1924 unsigned char *partstart;
1925 unsigned char *partend;
1928 unsigned char anchor;
1929 unsigned char *string = str;
1931 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1932 int len = strlen((const char *)str);
1934 bufp->lcase = get_pool_memory(PM_FNAME);
1936 bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1937 unsigned char *dst = (unsigned char *)bufp->lcase;
1939 *dst++ = tolower(*string++);
1942 string = (unsigned char *)bufp->lcase;
1945 // assert(size >= 0 && pos >= 0);
1946 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1948 fastmap = bufp->fastmap;
1949 translate = bufp->translate;
1950 if (fastmap && !bufp->fastmap_accurate) {
1951 re_compile_fastmap(bufp);
1956 anchor = bufp->anchor;
1957 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1973 for (; range >= 0; range--, pos += dir) {
1975 if (dir == 1) { /* searching forwards */
1977 text = string + pos;
1978 partend = string + size;
1981 while (text != partend &&
1982 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1985 while (text != partend && !fastmap[(unsigned char)*text])
1987 pos += text - partstart;
1988 range -= text - partstart;
1989 if (pos == size && bufp->can_be_null == 0)
1991 } else { /* searching backwards */
1992 text = string + pos;
1993 partstart = string + pos - range;
1996 while (text != partstart && !fastmap[(unsigned char)
1997 translate[(unsigned char)*text]])
2000 while (text != partstart && !fastmap[(unsigned char)*text])
2002 pos -= partend - text;
2003 range -= partend - text;
2006 if (anchor == 1) { /* anchored to begline */
2007 if (pos > 0 && (string[pos - 1] != '\n'))
2010 // assert(pos >= 0 && pos <= size);
2011 ret = re_match(bufp, string, size, pos, regs);
2023 ** c-file-style: "python"