3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
29 * This file modified to work with Bacula and C++ by
30 * Kern Sibbald, April 2006
32 * This file modified to work with REG_ICASE and Bacula by
33 * Eric Bollengier April 2007
36 Bacula(R) - The Network Backup Solution
38 Copyright (C) 2000-2016 Kern Sibbald
40 The original author of Bacula is Kern Sibbald, with contributions
41 from many others, a complete list can be found in the file AUTHORS.
43 You may use this file and others of this release according to the
44 license defined in the LICENSE file, which includes the Affero General
45 Public License, v3.0 ("AGPLv3") and some additional permissions and
46 terms pursuant to its AGPLv3 Section 7.
48 This notice must be preserved when any source code is
49 conveyed and/or propagated.
51 Bacula(R) is a registered trademark of Kern Sibbald.
58 #define set_error(x) bufp->errmsg=((char *)(x))
59 #define got_error bufp->errmsg!=NULL
61 /* The original code blithely assumed that sizeof(short) == 2. Not
62 * always true. Original instances of "(short)x" were replaced by
63 * SHORT(x), where SHORT is #defined below. */
65 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
67 /* The stack implementation is taken from an idea by Andrew Kuchling.
68 * It's a doubly linked list of arrays. The advantages of this over a
69 * simple linked list are that the number of mallocs required are
70 * reduced. It also makes it possible to statically allocate enough
71 * space so that small patterns don't ever need to call malloc.
73 * The advantages over a single array is that is periodically
74 * realloced when more space is needed is that we avoid ever copying
77 /* item_t is the basic stack element. Defined as a union of
78 * structures so that both registers, failure points, and counters can
79 * be pushed/popped from the stack. There's nothing built into the
80 * item to keep track of whether a certain stack item is a register, a
81 * failure point, or a counter. */
83 typedef union item_t {
104 #define B_STACK_PAGE_SIZE 256
105 #define NUM_REGISTERS 256
107 /* A 'page' of stack items. */
109 typedef struct item_page_t {
110 item_t items[B_STACK_PAGE_SIZE];
111 struct item_page_t *prev;
112 struct item_page_t *next;
116 typedef struct match_state {
117 /* The number of registers that have been pushed onto the stack
118 * since the last failure point. */
122 /* Used to control when registers need to be pushed onto the
127 /* The number of failure points on the stack. */
131 /* Storage for the registers. Each register consists of two
132 * pointers to characters. So register N is represented as
133 * start[N] and end[N]. The pointers must be converted to
134 * offsets from the beginning of the string before returning the
135 * registers to the calling program. */
137 unsigned char *start[NUM_REGISTERS];
138 unsigned char *end[NUM_REGISTERS];
140 /* Keeps track of whether a register has changed recently. */
142 int changed[NUM_REGISTERS];
144 /* Structure to encapsulate the stack. */
146 /* index into the current page. If index == 0 and you need
147 * to pop an item, move to the previous page and set index
148 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
149 * push a page. If index == B_STACK_PAGE_SIZE and you need
150 * to push a page move to the next page and set index =
151 * 0. If there is no new next page, allocate a new page
152 * and link it in. Otherwise, increment index to push a
156 item_page_t *current; /* Pointer to the current page. */
157 item_page_t first; /* First page is statically allocated. */
161 /* Initialize a state object */
163 /* #define NEW_STATE(state) \ */
164 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
165 /* state.stack.current = &state.stack.first; \ */
166 /* state.stack.first.prev = NULL; \ */
167 /* state.stack.first.next = NULL; \ */
168 /* state.stack.index = 0; \ */
169 /* state.level = 1 */
171 #define NEW_STATE(state, nregs) \
174 for (i = 0; i < nregs; i++) \
176 state.start[i] = NULL; \
177 state.end[i] = NULL; \
178 state.changed[i] = 0; \
180 state.stack.current = &state.stack.first; \
181 state.stack.first.prev = NULL; \
182 state.stack.first.next = NULL; \
183 state.stack.index = 0; \
190 /* Free any memory that might have been malloc'd */
192 #define FREE_STATE(state) \
193 while(state.stack.first.next != NULL) \
195 state.stack.current = state.stack.first.next; \
196 state.stack.first.next = state.stack.current->next; \
197 free(state.stack.current); \
200 /* Discard the top 'count' stack items. */
202 #define STACK_DISCARD(stack, count, on_error) \
203 stack.index -= count; \
204 while (stack.index < 0) \
206 if (stack.current->prev == NULL) \
208 stack.current = stack.current->prev; \
209 stack.index += B_STACK_PAGE_SIZE; \
212 /* Store a pointer to the previous item on the stack. Used to pop an
213 * item off of the stack. */
215 #define STACK_PREV(stack, top, on_error) \
216 if (stack.index == 0) \
218 if (stack.current->prev == NULL) \
220 stack.current = stack.current->prev; \
221 stack.index = B_STACK_PAGE_SIZE - 1; \
227 top = &(stack.current->items[stack.index])
229 /* Store a pointer to the next item on the stack. Used to push an item
230 * on to the stack. */
232 #define STACK_NEXT(stack, top, on_error) \
233 if (stack.index == B_STACK_PAGE_SIZE) \
235 if (stack.current->next == NULL) \
237 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
238 if (stack.current->next == NULL) \
240 stack.current->next->prev = stack.current; \
241 stack.current->next->next = NULL; \
243 stack.current = stack.current->next; \
246 top = &(stack.current->items[stack.index++])
248 /* Store a pointer to the item that is 'count' items back in the
249 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
250 * STACK_TOP(stack, top, on_error). */
252 #define STACK_BACK(stack, top, count, on_error) \
255 item_page_t *current; \
256 current = stack.current; \
257 index = stack.index - (count); \
260 if (current->prev == NULL) \
262 current = current->prev; \
263 index += B_STACK_PAGE_SIZE; \
265 top = &(current->items[index]); \
268 /* Store a pointer to the top item on the stack. Execute the
269 * 'on_error' code if there are no items on the stack. */
271 #define STACK_TOP(stack, top, on_error) \
272 if (stack.index == 0) \
274 if (stack.current->prev == NULL) \
276 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
280 top = &(stack.current->items[stack.index - 1]); \
283 /* Test to see if the stack is empty */
285 #define STACK_EMPTY(stack) ((stack.index == 0) && \
286 (stack.current->prev == NULL))
288 /* Return the start of register 'reg' */
290 #define GET_REG_START(state, reg) (state.start[reg])
292 /* Return the end of register 'reg' */
294 #define GET_REG_END(state, reg) (state.end[reg])
296 /* Set the start of register 'reg'. If the state of the register needs
297 * saving, push it on the stack. */
299 #define SET_REG_START(state, reg, text, on_error) \
300 if(state.changed[reg] < state.level) \
303 STACK_NEXT(state.stack, item, on_error); \
304 item->reg.num = reg; \
305 item->reg.start = state.start[reg]; \
306 item->reg.end = state.end[reg]; \
307 item->reg.level = state.changed[reg]; \
308 state.changed[reg] = state.level; \
311 state.start[reg] = text
313 /* Set the end of register 'reg'. If the state of the register needs
314 * saving, push it on the stack. */
316 #define SET_REG_END(state, reg, text, on_error) \
317 if(state.changed[reg] < state.level) \
320 STACK_NEXT(state.stack, item, on_error); \
321 item->reg.num = reg; \
322 item->reg.start = state.start[reg]; \
323 item->reg.end = state.end[reg]; \
324 item->reg.level = state.changed[reg]; \
325 state.changed[reg] = state.level; \
328 state.end[reg] = text
330 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
333 STACK_NEXT(state.stack, item, on_error); \
334 item->fail.code = xcode; \
335 item->fail.text = xtext; \
336 item->fail.count = state.count; \
337 item->fail.level = state.level; \
338 item->fail.phantom = 0; \
344 /* Update the last failure point with a new position in the text. */
346 #define UPDATE_FAILURE(state, xtext, on_error) \
349 STACK_BACK(state.stack, item, state.count + 1, on_error); \
350 if (!item->fail.phantom) \
353 STACK_NEXT(state.stack, item2, on_error); \
354 item2->fail.code = item->fail.code; \
355 item2->fail.text = xtext; \
356 item2->fail.count = state.count; \
357 item2->fail.level = state.level; \
358 item2->fail.phantom = 1; \
365 STACK_DISCARD(state.stack, state.count, on_error); \
366 STACK_TOP(state.stack, item, on_error); \
367 item->fail.text = xtext; \
373 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
378 while(state.count > 0) \
380 STACK_PREV(state.stack, item, on_error); \
381 state.start[item->reg.num] = item->reg.start; \
382 state.end[item->reg.num] = item->reg.end; \
383 state.changed[item->reg.num] = item->reg.level; \
386 STACK_PREV(state.stack, item, on_empty); \
387 xcode = item->fail.code; \
388 xtext = item->fail.text; \
389 state.count = item->fail.count; \
390 state.level = item->fail.level; \
393 while (item->fail.text == NULL); \
396 enum regexp_compiled_ops { /* opcodes for compiled regexp */
397 Cend, /* end of pattern reached */
398 Cbol, /* beginning of line */
399 Ceol, /* end of line */
400 Cset, /* character set. Followed by 32 bytes of set. */
401 Cexact, /* followed by a byte to match */
402 Canychar, /* matches any character except newline */
403 Cstart_memory, /* set register start addr (followed by reg number) */
404 Cend_memory, /* set register end addr (followed by reg number) */
405 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
406 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
407 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
408 Cfailure_jump, /* jump to addr on failure */
409 Cupdate_failure_jump, /* update topmost failure point and jump */
410 Cdummy_failure_jump, /* push a dummy failure point and jump */
411 Cbegbuf, /* match at beginning of buffer */
412 Cendbuf, /* match at end of buffer */
413 Cwordbeg, /* match at beginning of word */
414 Cwordend, /* match at end of word */
415 Cwordbound, /* match if at word boundary */
416 Cnotwordbound, /* match if not at word boundary */
417 Csyntaxspec, /* matches syntax code (1 byte follows) */
418 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
422 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
423 Rend, /* special code for end of regexp */
424 Rnormal, /* normal character */
425 Ranychar, /* any character except newline */
426 Rquote, /* the quote character */
427 Rbol, /* match beginning of line */
428 Reol, /* match end of line */
429 Roptional, /* match preceding expression optionally */
430 Rstar, /* match preceding expr zero or more times */
431 Rplus, /* match preceding expr one or more times */
432 Ror, /* match either of alternatives */
433 Ropenpar, /* opening parenthesis */
434 Rclosepar, /* closing parenthesis */
435 Rmemory, /* match memory register */
436 Rextended_memory, /* \vnn to match registers 10-99 */
437 Ropenset, /* open set. Internal syntax hard-coded below. */
438 /* the following are gnu extensions to "normal" regexp syntax */
439 Rbegbuf, /* beginning of buffer */
440 Rendbuf, /* end of buffer */
441 Rwordchar, /* word character */
442 Rnotwordchar, /* not word character */
443 Rwordbeg, /* beginning of word */
444 Rwordend, /* end of word */
445 Rwordbound, /* word bound */
446 Rnotwordbound, /* not word bound */
450 static int re_compile_initialized = 0;
451 static int regexp_syntax = RE_SYNTAX_EGREP;
452 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
453 static unsigned char plain_ops[256];
454 static unsigned char quoted_ops[256];
455 static unsigned char precedences[Rnum_ops];
456 static int regexp_context_indep_ops;
457 static int regexp_ansi_sequences;
459 #define NUM_LEVELS 5 /* number of precedence levels in use */
460 #define MAX_NESTING 100 /* max nesting level of operators */
462 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
464 unsigned char re_syntax_table[256];
466 void re_compile_initialize(void)
470 static int syntax_table_inited = 0;
472 if (!syntax_table_inited) {
473 syntax_table_inited = 1;
474 memset(re_syntax_table, 0, 256);
475 for (a = 'a'; a <= 'z'; a++)
476 re_syntax_table[a] = Sword;
477 for (a = 'A'; a <= 'Z'; a++)
478 re_syntax_table[a] = Sword;
479 for (a = '0'; a <= '9'; a++)
480 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
481 for (a = '0'; a <= '7'; a++)
482 re_syntax_table[a] |= Soctaldigit;
483 for (a = 'A'; a <= 'F'; a++)
484 re_syntax_table[a] |= Shexdigit;
485 for (a = 'a'; a <= 'f'; a++)
486 re_syntax_table[a] |= Shexdigit;
487 re_syntax_table[(int)'_'] = Sword;
488 for (a = 9; a <= 13; a++)
489 re_syntax_table[a] = Swhitespace;
490 re_syntax_table[(int)' '] = Swhitespace;
492 re_compile_initialized = 1;
493 for (a = 0; a < 256; a++) {
494 plain_ops[a] = Rnormal;
495 quoted_ops[a] = Rnormal;
497 for (a = '0'; a <= '9'; a++)
498 quoted_ops[a] = Rmemory;
499 plain_ops[(int)'\134'] = Rquote;
500 if (regexp_syntax & RE_NO_BK_PARENS) {
501 plain_ops[(int)'('] = Ropenpar;
502 plain_ops[(int)')'] = Rclosepar;
504 quoted_ops[(int)'('] = Ropenpar;
505 quoted_ops[(int)')'] = Rclosepar;
507 if (regexp_syntax & RE_NO_BK_VBAR) {
508 plain_ops[(int)'\174'] = Ror;
510 quoted_ops[(int)'\174'] = Ror;
512 plain_ops[(int)'*'] = Rstar;
513 if (regexp_syntax & RE_BK_PLUS_QM) {
514 quoted_ops[(int)'+'] = Rplus;
515 quoted_ops[(int)'?'] = Roptional;
517 plain_ops[(int)'+'] = Rplus;
518 plain_ops[(int)'?'] = Roptional;
520 if (regexp_syntax & RE_NEWLINE_OR) {
521 plain_ops[(int)'\n'] = Ror;
523 plain_ops[(int)'\133'] = Ropenset;
524 plain_ops[(int)'\136'] = Rbol;
525 plain_ops[(int)'$'] = Reol;
526 plain_ops[(int)'.'] = Ranychar;
527 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
528 quoted_ops[(int)'w'] = Rwordchar;
529 quoted_ops[(int)'W'] = Rnotwordchar;
530 quoted_ops[(int)'<'] = Rwordbeg;
531 quoted_ops[(int)'>'] = Rwordend;
532 quoted_ops[(int)'b'] = Rwordbound;
533 quoted_ops[(int)'B'] = Rnotwordbound;
534 quoted_ops[(int)'`'] = Rbegbuf;
535 quoted_ops[(int)'\''] = Rendbuf;
537 if (regexp_syntax & RE_ANSI_HEX) {
538 quoted_ops[(int)'v'] = Rextended_memory;
540 for (a = 0; a < Rnum_ops; a++) {
543 if (regexp_syntax & RE_TIGHT_VBAR) {
544 precedences[Ror] = 3;
545 precedences[Rbol] = 2;
546 precedences[Reol] = 2;
548 precedences[Ror] = 2;
549 precedences[Rbol] = 3;
550 precedences[Reol] = 3;
552 precedences[Rclosepar] = 1;
553 precedences[Rend] = 0;
554 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
555 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
558 int re_set_syntax(int syntax) {
562 regexp_syntax = syntax;
563 re_syntax = syntax; /* Exported copy */
564 re_compile_initialize();
568 static int hex_char_to_decimal(int ch) {
569 if (ch >= '0' && ch <= '9')
571 if (ch >= 'a' && ch <= 'f')
572 return ch - 'a' + 10;
573 if (ch >= 'A' && ch <= 'F')
574 return ch - 'A' + 10;
578 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
579 unsigned char *visited,
580 unsigned char *can_be_null,
581 unsigned char *fastmap)
588 return; /* we have already been here */
591 switch (code[pos++]) {
602 for (a = 0; a < 256; a++)
606 syntaxcode = code[pos++];
607 for (a = 0; a < 256; a++)
608 if (SYNTAX(a) & syntaxcode)
612 syntaxcode = code[pos++];
613 for (a = 0; a < 256; a++)
614 if (!(SYNTAX(a) & syntaxcode))
618 fastmap[(int)'\n'] = 1;
619 if (*can_be_null == 0)
620 *can_be_null = 2; /* can match null, but only at end of buffer */
623 for (a = 0; a < 256 / 8; a++)
624 if (code[pos + a] != 0)
625 for (b = 0; b < 8; b++)
626 if (code[pos + a] & (1 << b))
627 fastmap[(a << 3) + b] = 1;
631 fastmap[(unsigned char)code[pos]] = 1;
634 for (a = 0; a < 256; a++)
643 for (a = 0; a < 256; a++)
648 case Cdummy_failure_jump:
649 case Cupdate_failure_jump:
651 a = (unsigned char)code[pos++];
652 a |= (unsigned char)code[pos++] << 8;
653 pos += (int)SHORT(a);
655 /* argh... the regexp contains empty loops. This is not
656 good, as this may cause a failure stack overflow when
657 matching. Oh well. */
658 /* this path leads nowhere; pursue other paths. */
664 a = (unsigned char)code[pos++];
665 a |= (unsigned char)code[pos++] << 8;
666 a = pos + (int)SHORT(a);
667 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
673 set_error("Unknown regex opcode: memory corrupted?");
679 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
680 int pos, unsigned char *can_be_null,
681 unsigned char *fastmap)
683 unsigned char small_visited[512], *visited;
685 if (used <= (int)sizeof(small_visited))
686 visited = small_visited;
688 visited = (unsigned char *)malloc(used);
693 memset(fastmap, 0, 256);
694 memset(visited, 0, used);
695 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
696 if (visited != small_visited)
701 void re_compile_fastmap(regex_t * bufp)
703 if (!bufp->fastmap || bufp->fastmap_accurate)
705 // assert(bufp->used > 0);
706 if (!re_do_compile_fastmap(bufp, bufp->buffer,
707 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
711 if (bufp->buffer[0] == Cbol)
712 bufp->anchor = 1; /* begline */
713 else if (bufp->buffer[0] == Cbegbuf)
714 bufp->anchor = 2; /* begbuf */
716 bufp->anchor = 0; /* none */
717 bufp->fastmap_accurate = 1;
723 * ... code for operand of star
725 * 2: ... code after star
727 * We change the star_jump to update_failure_jump if we can determine
728 * that it is safe to do so; otherwise we change it to an ordinary
735 * 2: ... code for operand of plus
737 * 3: ... code after plus
739 * For star_jump considerations this is processed identically to star.
743 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
745 unsigned char map[256];
746 unsigned char can_be_null;
752 int num_instructions = 0;
754 a = (unsigned char)*code++;
755 a |= (unsigned char)*code++ << 8;
758 p1 = code + a + 3; /* skip the failure_jump */
759 /* Check that the jump is within the pattern */
760 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
761 set_error("Regex VM jump out of bounds (failure_jump opt)");
764 // assert(p1[-3] == Cfailure_jump);
766 /* p1 points inside loop, p2 points to after loop */
767 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
768 (int)(p2 - bufp->buffer), &can_be_null, map))
769 goto make_normal_jump;
771 /* If we might introduce a new update point inside the
772 * loop, we can't optimize because then update_jump would
773 * update a wrong failure point. Thus we have to be
774 * quite careful here.
777 /* loop until we find something that consumes a character */
795 ch = (unsigned char)*p1++;
797 goto make_normal_jump;
800 for (b = 0; b < 256; b++)
801 if (b != '\n' && map[b])
802 goto make_normal_jump;
805 for (b = 0; b < 256; b++)
806 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
807 goto make_normal_jump;
811 goto make_normal_jump;
815 /* now we know that we can't backtrack. */
816 while (p1 != p2 - 3) {
845 case Cupdate_failure_jump:
846 case Cdummy_failure_jump:
847 goto make_normal_jump;
853 /* make_update_jump: */
855 a += 3; /* jump to after the Cfailure_jump */
856 code[0] = Cupdate_failure_jump;
859 if (num_instructions > 1)
861 // assert(num_instructions == 1);
862 /* if the only instruction matches a single character, we can do
864 p1 = code + 3 + a; /* start of sole instruction */
865 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
866 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
876 static int re_optimize(regex_t * bufp)
908 if (!re_optimize_star_jump(bufp, code)) {
912 case Cupdate_failure_jump:
914 case Cdummy_failure_jump:
925 #define NEXTCHAR(var) \
928 goto ends_prematurely; \
929 (var) = regex[pos]; \
933 #define ALLOC(amount) \
935 if (pattern_offset+(amount) > alloc) \
937 alloc += 256 + (amount); \
938 pattern = (unsigned char *)realloc(pattern, alloc); \
940 goto out_of_memory; \
944 #define STORE(ch) pattern[pattern_offset++] = (ch)
946 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
948 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
950 #define PUSH_LEVEL_STARTS \
951 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
952 starts_base += NUM_LEVELS; \
956 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
958 #define PUT_ADDR(offset,addr) \
960 int disp = (addr) - (offset) - 2; \
961 pattern[(offset)] = disp & 0xff; \
962 pattern[(offset)+1] = (disp>>8) & 0xff; \
965 #define INSERT_JUMP(pos,type,addr) \
967 int a, p = (pos), t = (type), ad = (addr); \
968 for (a = pattern_offset - 1; a >= p; a--) \
969 pattern[a + 3] = pattern[a]; \
972 pattern_offset += 3; \
975 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
979 bufp->allocated = alloc; \
980 bufp->buffer = pattern; \
981 bufp->used = pattern_offset; \
984 #define GETHEX(var) \
986 unsigned char gethex_ch, gethex_value; \
987 NEXTCHAR(gethex_ch); \
988 gethex_value = hex_char_to_decimal(gethex_ch); \
989 if (gethex_value == 16) \
991 NEXTCHAR(gethex_ch); \
992 gethex_ch = hex_char_to_decimal(gethex_ch); \
993 if (gethex_ch == 16) \
995 (var) = gethex_value * 16 + gethex_ch; \
998 #define ANSI_TRANSLATE(ch) \
1005 ch = 7; /* audible bell */ \
1011 ch = 8; /* backspace */ \
1017 ch = 12; /* form feed */ \
1023 ch = 10; /* line feed */ \
1029 ch = 13; /* carriage return */ \
1041 ch = 11; /* vertical tab */ \
1044 case 'x': /* hex code */ \
1052 /* other characters passed through */ \
1054 ch = translate[(unsigned char)ch]; \
1060 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1068 int pattern_offset = 0, alloc;
1069 int starts[NUM_LEVELS * MAX_NESTING];
1071 int future_jumps[MAX_NESTING];
1073 unsigned char ch = '\0';
1074 unsigned char *pattern;
1075 unsigned char *translate;
1078 int num_open_registers;
1079 int open_registers[RE_NREGS];
1080 int beginning_context;
1081 int size = strlen((char *)regex);
1083 if (!re_compile_initialized)
1084 re_compile_initialize();
1086 bufp->fastmap_accurate = 0;
1087 bufp->uses_registers = 1;
1088 bufp->num_registers = 1;
1089 translate = bufp->translate;
1090 pattern = bufp->buffer;
1091 alloc = bufp->allocated;
1092 if (alloc == 0 || pattern == NULL) {
1094 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1103 num_open_registers = 0;
1106 beginning_context = 1;
1108 /* we use Rend dummy to ensure that pending jumps are updated
1109 (due to low priority of Rend) before exiting the loop. */
1111 while (op != Rend) {
1117 ch = translate[(unsigned char)ch];
1118 op = plain_ops[(unsigned char)ch];
1121 op = quoted_ops[(unsigned char)ch];
1122 if (op == Rnormal && regexp_ansi_sequences)
1126 level = precedences[op];
1127 /* printf("ch='%c' op=%d level=%d current_level=%d
1128 curlevstart=%d\n", ch, op, level, current_level,
1129 CURRENT_LEVEL_START); */
1130 if (level > current_level) {
1131 for (current_level++; current_level < level; current_level++)
1134 } else if (level < current_level) {
1135 current_level = level;
1136 for (; num_jumps > 0 &&
1137 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1138 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1146 store_opcode_and_arg: /* opcode & ch must be set */
1160 set_error("Rquote");
1161 /*NOTREACHED*/ case Rbol:
1162 if (!beginning_context) {
1163 if (regexp_context_indep_ops)
1171 if (!((pos >= size) ||
1172 ((regexp_syntax & RE_NO_BK_VBAR) ?
1173 (regex[pos] == '\174') :
1174 (pos + 1 < size && regex[pos] == '\134' &&
1175 regex[pos + 1] == '\174')) ||
1176 ((regexp_syntax & RE_NO_BK_PARENS) ?
1177 (regex[pos] == ')') :
1178 (pos + 1 < size && regex[pos] == '\134' &&
1179 regex[pos + 1] == ')')))) {
1180 if (regexp_context_indep_ops)
1190 if (beginning_context) {
1191 if (regexp_context_indep_ops)
1196 if (CURRENT_LEVEL_START == pattern_offset)
1197 break; /* ignore empty patterns for ? */
1199 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1203 if (beginning_context) {
1204 if (regexp_context_indep_ops)
1209 if (CURRENT_LEVEL_START == pattern_offset)
1210 break; /* ignore empty patterns for + and * */
1212 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1213 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1214 if (op == Rplus) /* jump over initial failure_jump */
1215 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1216 CURRENT_LEVEL_START + 6);
1220 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1221 if (num_jumps >= MAX_NESTING)
1224 future_jumps[num_jumps++] = pattern_offset;
1232 if (next_register < RE_NREGS) {
1233 bufp->uses_registers = 1;
1235 STORE(Cstart_memory);
1236 STORE(next_register);
1237 open_registers[num_open_registers++] = next_register;
1238 bufp->num_registers++;
1248 if (paren_depth <= 0)
1249 goto parenthesis_error;
1251 current_level = precedences[Ropenpar];
1253 if (paren_depth < num_open_registers) {
1254 bufp->uses_registers = 1;
1257 num_open_registers--;
1258 STORE(open_registers[num_open_registers]);
1263 goto bad_match_register;
1264 if (!(ch >= '0' && ch <= '9')) {
1265 goto bad_match_register;
1267 bufp->uses_registers = 1;
1268 opcode = Cmatch_memory;
1270 goto store_opcode_and_arg;
1271 case Rextended_memory:
1273 if (ch < '0' || ch > '9')
1274 goto bad_match_register;
1276 if (a < '0' || a > '9')
1277 goto bad_match_register;
1278 ch = 10 * (a - '0') + ch - '0';
1279 if (ch == 0 || ch >= RE_NREGS)
1280 goto bad_match_register;
1281 bufp->uses_registers = 1;
1282 opcode = Cmatch_memory;
1283 goto store_opcode_and_arg;
1295 offset = pattern_offset;
1296 for (a = 0; a < 256 / 8; a++)
1300 ch = translate[(unsigned char)ch];
1305 ch = translate[(unsigned char)ch];
1311 while (ch != '\135' || firstchar) {
1313 if (regexp_ansi_sequences && ch == '\134') {
1318 for (a = prev; a <= (int)ch; a++)
1319 SETBIT(pattern, offset, a);
1322 } else if (prev != -1 && ch == '-')
1325 SETBIT(pattern, offset, ch);
1330 ch = translate[(unsigned char)ch];
1333 SETBIT(pattern, offset, '-');
1335 for (a = 0; a < 256 / 8; a++)
1336 pattern[offset + a] ^= 0xff;
1352 opcode = Csyntaxspec;
1354 goto store_opcode_and_arg;
1358 opcode = Cnotsyntaxspec;
1360 goto store_opcode_and_arg;
1374 opcode = Cwordbound;
1379 opcode = Cnotwordbound;
1387 beginning_context = (op == Ropenpar || op == Ror);
1389 if (starts_base != 0)
1390 goto parenthesis_error;
1391 // assert(num_jumps == 0);
1395 if (!re_optimize(bufp))
1396 return "Optimization error";
1401 return "Badly placed special character";
1405 return "Bad match register number";
1409 return "Bad hexadecimal number";
1413 return "Badly placed parenthesis";
1417 return "Out of memory";
1421 return "Regular expression ends prematurely";
1425 return "Regular expression too complex";
1433 #undef CURRENT_LEVEL_START
1434 #undef SET_LEVEL_START
1435 #undef PUSH_LEVEL_STARTS
1436 #undef POP_LEVEL_STARTS
1442 #define PREFETCH if (text == textend) goto fail
1444 #define NEXTCHAR(var) \
1446 var = (unsigned char)*text++; \
1448 var = translate[var]
1451 int regcomp(regex_t * bufp, const char *regex, int cflags)
1453 memset(bufp, 0, sizeof(regex_t));
1454 bufp->cflags = cflags;
1455 if (bufp->cflags & REG_ICASE) {
1456 char *p, *lcase = bstrdup(regex);
1457 for( p = lcase; *p ; p++) {
1460 re_compile_pattern(bufp, (unsigned char *)lcase);
1463 re_compile_pattern(bufp, (unsigned char *)regex);
1471 void re_registers_to_regmatch(regexp_registers_t old_regs,
1472 regmatch_t pmatch[],
1477 /* We have to set the last entry to -1 */
1478 nmatch = nmatch - 1;
1479 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1480 pmatch[i].rm_so = old_regs->start[i];
1481 pmatch[i].rm_eo = old_regs->end[i];
1484 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1487 int regexec(regex_t * preg, const char *string, size_t nmatch,
1488 regmatch_t pmatch[], int eflags)
1491 int len = strlen(string);
1492 struct re_registers regs;
1493 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1494 if (stat >= 0 && nmatch > 0) {
1495 re_registers_to_regmatch(®s, pmatch, nmatch);
1497 /* stat is the start position in the string base 0 where
1498 * the pattern was found or negative if not found.
1500 return stat < 0 ? -1 : 0;
1503 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1505 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1509 void regfree(regex_t * preg)
1512 free_pool_memory(preg->lcase);
1517 preg->buffer = NULL;
1521 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1522 regexp_registers_t old_regs)
1524 unsigned char *code;
1525 unsigned char *translate;
1526 unsigned char *text;
1527 unsigned char *textstart;
1528 unsigned char *textend;
1534 unsigned char *regstart;
1535 unsigned char *regend;
1539 // assert(pos >= 0 && size >= 0);
1540 // assert(pos <= size);
1542 text = string + pos;
1544 textend = string + size;
1546 code = bufp->buffer;
1548 translate = bufp->translate;
1550 NEW_STATE(state, bufp->num_registers);
1556 match_end = text - textstart;
1558 old_regs->start[0] = pos;
1559 old_regs->end[0] = match_end;
1560 if (!bufp->uses_registers) {
1561 for (a = 1; a < RE_NREGS; a++) {
1562 old_regs->start[a] = -1;
1563 old_regs->end[a] = -1;
1566 for (a = 1; a < bufp->num_registers; a++) {
1567 if ((GET_REG_START(state, a) == NULL) ||
1568 (GET_REG_END(state, a) == NULL)) {
1569 old_regs->start[a] = -1;
1570 old_regs->end[a] = -1;
1573 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1574 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1576 for (; a < RE_NREGS; a++) {
1577 old_regs->start[a] = -1;
1578 old_regs->end[a] = -1;
1583 return match_end - pos;
1587 if (text == textstart || text[-1] == '\n')
1588 goto continue_matching;
1593 if (text == textend || *text == '\n')
1594 goto continue_matching;
1600 if (code[ch / 8] & (1 << (ch & 7))) {
1602 goto continue_matching;
1609 if (ch != (unsigned char)*code++)
1611 goto continue_matching;
1618 goto continue_matching;
1623 SET_REG_START(state, reg, text, goto error);
1624 goto continue_matching;
1629 SET_REG_END(state, reg, text, goto error);
1630 goto continue_matching;
1635 regstart = GET_REG_START(state, reg);
1636 regend = GET_REG_END(state, reg);
1637 if ((regstart == NULL) || (regend == NULL))
1638 goto fail; /* or should we just match nothing? */
1639 regsize = regend - regstart;
1641 if (regsize > (textend - text))
1644 for (; regstart < regend; regstart++, text++)
1645 if (translate[*regstart] != translate[*text])
1648 for (; regstart < regend; regstart++, text++)
1649 if (*regstart != *text)
1651 goto continue_matching;
1653 case Cupdate_failure_jump:
1655 UPDATE_FAILURE(state, text, goto error);
1656 /* fall to next case */
1658 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1662 a = (unsigned char)*code++;
1663 a |= (unsigned char)*code++ << 8;
1664 code += (int)SHORT(a);
1665 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1666 set_error("Regex VM jump out of bounds (Cjump)");
1670 goto continue_matching;
1672 case Cdummy_failure_jump:
1674 unsigned char *failuredest;
1676 a = (unsigned char)*code++;
1677 a |= (unsigned char)*code++ << 8;
1679 // assert(*code == Cfailure_jump);
1680 b = (unsigned char)code[1];
1681 b |= (unsigned char)code[2] << 8;
1682 failuredest = code + (int)SHORT(b) + 3;
1683 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1685 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1689 PUSH_FAILURE(state, failuredest, NULL, goto error);
1691 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1692 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1696 goto continue_matching;
1700 a = (unsigned char)*code++;
1701 a |= (unsigned char)*code++ << 8;
1703 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1704 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1708 PUSH_FAILURE(state, code + a, text, goto error);
1709 goto continue_matching;
1713 unsigned char *pinst;
1714 a = (unsigned char)*code++;
1715 a |= (unsigned char)*code++ << 8;
1718 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1719 set_error("Regex VM jump out of bounds (Crepeat1)");
1723 /* pinst is sole instruction in loop, and it matches a
1724 * single character. Since Crepeat1 was originally a
1725 * Cupdate_failure_jump, we also know that backtracking
1726 * is useless: so long as the single-character
1727 * expression matches, it must be used. Also, in the
1728 * case of +, we've already matched one character, so +
1729 * can't fail: nothing here can cause a failure. */
1734 while (text < textend) {
1735 ch = translate[(unsigned char)*text];
1736 if (pinst[ch / 8] & (1 << (ch & 7)))
1742 while (text < textend) {
1743 ch = (unsigned char)*text;
1744 if (pinst[ch / 8] & (1 << (ch & 7)))
1754 ch = (unsigned char)*pinst;
1756 while (text < textend && translate[(unsigned char)*text] == ch)
1759 while (text < textend && (unsigned char)*text == ch)
1766 while (text < textend && (unsigned char)*text != '\n')
1772 a = (unsigned char)*pinst;
1774 while (text < textend && (SYNTAX(translate[*text]) & a))
1777 while (text < textend && (SYNTAX(*text) & a))
1782 case Cnotsyntaxspec:
1784 a = (unsigned char)*pinst;
1786 while (text < textend && !(SYNTAX(translate[*text]) & a))
1789 while (text < textend && !(SYNTAX(*text) & a))
1797 set_error("Unknown regex opcode: memory corrupted?");
1801 /* due to the funky way + and * are compiled, the top
1802 * failure- stack entry at this point is actually a
1803 * success entry -- update it & pop it */
1804 UPDATE_FAILURE(state, text, goto error);
1805 goto fail; /* i.e., succeed <wink/sigh> */
1809 if (text == textstart)
1810 goto continue_matching;
1815 if (text == textend)
1816 goto continue_matching;
1821 if (text == textend)
1823 if (!(SYNTAX(*text) & Sword))
1825 if (text == textstart)
1826 goto continue_matching;
1827 if (!(SYNTAX(text[-1]) & Sword))
1828 goto continue_matching;
1833 if (text == textstart)
1835 if (!(SYNTAX(text[-1]) & Sword))
1837 if (text == textend)
1838 goto continue_matching;
1839 if (!(SYNTAX(*text) & Sword))
1840 goto continue_matching;
1845 /* Note: as in gnu regexp, this also matches at the
1846 * beginning and end of buffer. */
1848 if (text == textstart || text == textend)
1849 goto continue_matching;
1850 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1851 goto continue_matching;
1856 /* Note: as in gnu regexp, this never matches at the
1857 * beginning and end of buffer. */
1858 if (text == textstart || text == textend)
1860 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1861 goto continue_matching;
1867 if (!(SYNTAX(ch) & (unsigned char)*code++))
1869 goto continue_matching;
1871 case Cnotsyntaxspec:
1874 if (SYNTAX(ch) & (unsigned char)*code++)
1876 goto continue_matching;
1881 set_error("Unknown regex opcode: memory corrupted?");
1888 #if 0 /* This line is never reached --Guido */
1895 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1897 POP_FAILURE(state, code, text, goto done_matching, goto error);
1898 goto continue_matching;
1901 /* if(translated != NULL) */
1902 /* free(translated); */
1907 /* if (translated != NULL) */
1908 /* free(translated); */
1917 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1918 int range, regexp_registers_t regs)
1920 unsigned char *fastmap;
1921 unsigned char *translate;
1922 unsigned char *text;
1923 unsigned char *partstart;
1924 unsigned char *partend;
1927 unsigned char anchor;
1928 unsigned char *string = str;
1930 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1931 int len = strlen((const char *)str);
1933 bufp->lcase = get_pool_memory(PM_FNAME);
1935 bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1936 unsigned char *dst = (unsigned char *)bufp->lcase;
1938 *dst++ = tolower(*string++);
1941 string = (unsigned char *)bufp->lcase;
1944 // assert(size >= 0 && pos >= 0);
1945 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1947 fastmap = bufp->fastmap;
1948 translate = bufp->translate;
1949 if (fastmap && !bufp->fastmap_accurate) {
1950 re_compile_fastmap(bufp);
1955 anchor = bufp->anchor;
1956 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1972 for (; range >= 0; range--, pos += dir) {
1974 if (dir == 1) { /* searching forwards */
1976 text = string + pos;
1977 partend = string + size;
1980 while (text != partend &&
1981 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1984 while (text != partend && !fastmap[(unsigned char)*text])
1986 pos += text - partstart;
1987 range -= text - partstart;
1988 if (pos == size && bufp->can_be_null == 0)
1990 } else { /* searching backwards */
1991 text = string + pos;
1992 partstart = string + pos - range;
1995 while (text != partstart && !fastmap[(unsigned char)
1996 translate[(unsigned char)*text]])
1999 while (text != partstart && !fastmap[(unsigned char)*text])
2001 pos -= partend - text;
2002 range -= partend - text;
2005 if (anchor == 1) { /* anchored to begline */
2006 if (pos > 0 && (string[pos - 1] != '\n'))
2009 // assert(pos >= 0 && pos <= size);
2010 ret = re_match(bufp, string, size, pos, regs);
2022 ** c-file-style: "python"