3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
29 * This file modified to work with Bacula and C++ by
30 * Kern Sibbald, April 2006
32 * This file modified to work with REG_ICASE and Bacula by
33 * Eric Bollengier April 2007
36 Bacula® - The Network Backup Solution
38 Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
40 The main author of Bacula is Kern Sibbald, with contributions from
41 many others, a complete list can be found in the file AUTHORS.
42 This program is Free Software; you can redistribute it and/or
43 modify it under the terms of version three of the GNU Affero General Public
44 License as published by the Free Software Foundation and included
47 This program is distributed in the hope that it will be useful, but
48 WITHOUT ANY WARRANTY; without even the implied warranty of
49 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
50 General Public License for more details.
52 You should have received a copy of the GNU Affero General Public License
53 along with this program; if not, write to the Free Software
54 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
57 Bacula® is a registered trademark of Kern Sibbald.
58 The licensor of Bacula is the Free Software Foundation Europe
59 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
60 Switzerland, email:ftf@fsfeurope.org.
67 #define set_error(x) bufp->errmsg=((char *)(x))
68 #define got_error bufp->errmsg!=NULL
70 /* The original code blithely assumed that sizeof(short) == 2. Not
71 * always true. Original instances of "(short)x" were replaced by
72 * SHORT(x), where SHORT is #defined below. */
74 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
76 /* The stack implementation is taken from an idea by Andrew Kuchling.
77 * It's a doubly linked list of arrays. The advantages of this over a
78 * simple linked list are that the number of mallocs required are
79 * reduced. It also makes it possible to statically allocate enough
80 * space so that small patterns don't ever need to call malloc.
82 * The advantages over a single array is that is periodically
83 * realloced when more space is needed is that we avoid ever copying
86 /* item_t is the basic stack element. Defined as a union of
87 * structures so that both registers, failure points, and counters can
88 * be pushed/popped from the stack. There's nothing built into the
89 * item to keep track of whether a certain stack item is a register, a
90 * failure point, or a counter. */
92 typedef union item_t {
113 #define STACK_PAGE_SIZE 256
114 #define NUM_REGISTERS 256
116 /* A 'page' of stack items. */
118 typedef struct item_page_t {
119 item_t items[STACK_PAGE_SIZE];
120 struct item_page_t *prev;
121 struct item_page_t *next;
125 typedef struct match_state {
126 /* The number of registers that have been pushed onto the stack
127 * since the last failure point. */
131 /* Used to control when registers need to be pushed onto the
136 /* The number of failure points on the stack. */
140 /* Storage for the registers. Each register consists of two
141 * pointers to characters. So register N is represented as
142 * start[N] and end[N]. The pointers must be converted to
143 * offsets from the beginning of the string before returning the
144 * registers to the calling program. */
146 unsigned char *start[NUM_REGISTERS];
147 unsigned char *end[NUM_REGISTERS];
149 /* Keeps track of whether a register has changed recently. */
151 int changed[NUM_REGISTERS];
153 /* Structure to encapsulate the stack. */
155 /* index into the current page. If index == 0 and you need
156 * to pop an item, move to the previous page and set index
157 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
158 * push a page. If index == STACK_PAGE_SIZE and you need
159 * to push a page move to the next page and set index =
160 * 0. If there is no new next page, allocate a new page
161 * and link it in. Otherwise, increment index to push a
165 item_page_t *current; /* Pointer to the current page. */
166 item_page_t first; /* First page is statically allocated. */
170 /* Initialize a state object */
172 /* #define NEW_STATE(state) \ */
173 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
174 /* state.stack.current = &state.stack.first; \ */
175 /* state.stack.first.prev = NULL; \ */
176 /* state.stack.first.next = NULL; \ */
177 /* state.stack.index = 0; \ */
178 /* state.level = 1 */
180 #define NEW_STATE(state, nregs) \
183 for (i = 0; i < nregs; i++) \
185 state.start[i] = NULL; \
186 state.end[i] = NULL; \
187 state.changed[i] = 0; \
189 state.stack.current = &state.stack.first; \
190 state.stack.first.prev = NULL; \
191 state.stack.first.next = NULL; \
192 state.stack.index = 0; \
199 /* Free any memory that might have been malloc'd */
201 #define FREE_STATE(state) \
202 while(state.stack.first.next != NULL) \
204 state.stack.current = state.stack.first.next; \
205 state.stack.first.next = state.stack.current->next; \
206 free(state.stack.current); \
209 /* Discard the top 'count' stack items. */
211 #define STACK_DISCARD(stack, count, on_error) \
212 stack.index -= count; \
213 while (stack.index < 0) \
215 if (stack.current->prev == NULL) \
217 stack.current = stack.current->prev; \
218 stack.index += STACK_PAGE_SIZE; \
221 /* Store a pointer to the previous item on the stack. Used to pop an
222 * item off of the stack. */
224 #define STACK_PREV(stack, top, on_error) \
225 if (stack.index == 0) \
227 if (stack.current->prev == NULL) \
229 stack.current = stack.current->prev; \
230 stack.index = STACK_PAGE_SIZE - 1; \
236 top = &(stack.current->items[stack.index])
238 /* Store a pointer to the next item on the stack. Used to push an item
239 * on to the stack. */
241 #define STACK_NEXT(stack, top, on_error) \
242 if (stack.index == STACK_PAGE_SIZE) \
244 if (stack.current->next == NULL) \
246 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
247 if (stack.current->next == NULL) \
249 stack.current->next->prev = stack.current; \
250 stack.current->next->next = NULL; \
252 stack.current = stack.current->next; \
255 top = &(stack.current->items[stack.index++])
257 /* Store a pointer to the item that is 'count' items back in the
258 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
259 * STACK_TOP(stack, top, on_error). */
261 #define STACK_BACK(stack, top, count, on_error) \
264 item_page_t *current; \
265 current = stack.current; \
266 index = stack.index - (count); \
269 if (current->prev == NULL) \
271 current = current->prev; \
272 index += STACK_PAGE_SIZE; \
274 top = &(current->items[index]); \
277 /* Store a pointer to the top item on the stack. Execute the
278 * 'on_error' code if there are no items on the stack. */
280 #define STACK_TOP(stack, top, on_error) \
281 if (stack.index == 0) \
283 if (stack.current->prev == NULL) \
285 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
289 top = &(stack.current->items[stack.index - 1]); \
292 /* Test to see if the stack is empty */
294 #define STACK_EMPTY(stack) ((stack.index == 0) && \
295 (stack.current->prev == NULL))
297 /* Return the start of register 'reg' */
299 #define GET_REG_START(state, reg) (state.start[reg])
301 /* Return the end of register 'reg' */
303 #define GET_REG_END(state, reg) (state.end[reg])
305 /* Set the start of register 'reg'. If the state of the register needs
306 * saving, push it on the stack. */
308 #define SET_REG_START(state, reg, text, on_error) \
309 if(state.changed[reg] < state.level) \
312 STACK_NEXT(state.stack, item, on_error); \
313 item->reg.num = reg; \
314 item->reg.start = state.start[reg]; \
315 item->reg.end = state.end[reg]; \
316 item->reg.level = state.changed[reg]; \
317 state.changed[reg] = state.level; \
320 state.start[reg] = text
322 /* Set the end of register 'reg'. If the state of the register needs
323 * saving, push it on the stack. */
325 #define SET_REG_END(state, reg, text, on_error) \
326 if(state.changed[reg] < state.level) \
329 STACK_NEXT(state.stack, item, on_error); \
330 item->reg.num = reg; \
331 item->reg.start = state.start[reg]; \
332 item->reg.end = state.end[reg]; \
333 item->reg.level = state.changed[reg]; \
334 state.changed[reg] = state.level; \
337 state.end[reg] = text
339 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
342 STACK_NEXT(state.stack, item, on_error); \
343 item->fail.code = xcode; \
344 item->fail.text = xtext; \
345 item->fail.count = state.count; \
346 item->fail.level = state.level; \
347 item->fail.phantom = 0; \
353 /* Update the last failure point with a new position in the text. */
355 #define UPDATE_FAILURE(state, xtext, on_error) \
358 STACK_BACK(state.stack, item, state.count + 1, on_error); \
359 if (!item->fail.phantom) \
362 STACK_NEXT(state.stack, item2, on_error); \
363 item2->fail.code = item->fail.code; \
364 item2->fail.text = xtext; \
365 item2->fail.count = state.count; \
366 item2->fail.level = state.level; \
367 item2->fail.phantom = 1; \
374 STACK_DISCARD(state.stack, state.count, on_error); \
375 STACK_TOP(state.stack, item, on_error); \
376 item->fail.text = xtext; \
382 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
387 while(state.count > 0) \
389 STACK_PREV(state.stack, item, on_error); \
390 state.start[item->reg.num] = item->reg.start; \
391 state.end[item->reg.num] = item->reg.end; \
392 state.changed[item->reg.num] = item->reg.level; \
395 STACK_PREV(state.stack, item, on_empty); \
396 xcode = item->fail.code; \
397 xtext = item->fail.text; \
398 state.count = item->fail.count; \
399 state.level = item->fail.level; \
402 while (item->fail.text == NULL); \
405 enum regexp_compiled_ops { /* opcodes for compiled regexp */
406 Cend, /* end of pattern reached */
407 Cbol, /* beginning of line */
408 Ceol, /* end of line */
409 Cset, /* character set. Followed by 32 bytes of set. */
410 Cexact, /* followed by a byte to match */
411 Canychar, /* matches any character except newline */
412 Cstart_memory, /* set register start addr (followed by reg number) */
413 Cend_memory, /* set register end addr (followed by reg number) */
414 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
415 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
416 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
417 Cfailure_jump, /* jump to addr on failure */
418 Cupdate_failure_jump, /* update topmost failure point and jump */
419 Cdummy_failure_jump, /* push a dummy failure point and jump */
420 Cbegbuf, /* match at beginning of buffer */
421 Cendbuf, /* match at end of buffer */
422 Cwordbeg, /* match at beginning of word */
423 Cwordend, /* match at end of word */
424 Cwordbound, /* match if at word boundary */
425 Cnotwordbound, /* match if not at word boundary */
426 Csyntaxspec, /* matches syntax code (1 byte follows) */
427 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
431 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
432 Rend, /* special code for end of regexp */
433 Rnormal, /* normal character */
434 Ranychar, /* any character except newline */
435 Rquote, /* the quote character */
436 Rbol, /* match beginning of line */
437 Reol, /* match end of line */
438 Roptional, /* match preceding expression optionally */
439 Rstar, /* match preceding expr zero or more times */
440 Rplus, /* match preceding expr one or more times */
441 Ror, /* match either of alternatives */
442 Ropenpar, /* opening parenthesis */
443 Rclosepar, /* closing parenthesis */
444 Rmemory, /* match memory register */
445 Rextended_memory, /* \vnn to match registers 10-99 */
446 Ropenset, /* open set. Internal syntax hard-coded below. */
447 /* the following are gnu extensions to "normal" regexp syntax */
448 Rbegbuf, /* beginning of buffer */
449 Rendbuf, /* end of buffer */
450 Rwordchar, /* word character */
451 Rnotwordchar, /* not word character */
452 Rwordbeg, /* beginning of word */
453 Rwordend, /* end of word */
454 Rwordbound, /* word bound */
455 Rnotwordbound, /* not word bound */
459 static int re_compile_initialized = 0;
460 static int regexp_syntax = RE_SYNTAX_EGREP;
461 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
462 static unsigned char plain_ops[256];
463 static unsigned char quoted_ops[256];
464 static unsigned char precedences[Rnum_ops];
465 static int regexp_context_indep_ops;
466 static int regexp_ansi_sequences;
468 #define NUM_LEVELS 5 /* number of precedence levels in use */
469 #define MAX_NESTING 100 /* max nesting level of operators */
471 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
473 unsigned char re_syntax_table[256];
475 void re_compile_initialize(void)
479 static int syntax_table_inited = 0;
481 if (!syntax_table_inited) {
482 syntax_table_inited = 1;
483 memset(re_syntax_table, 0, 256);
484 for (a = 'a'; a <= 'z'; a++)
485 re_syntax_table[a] = Sword;
486 for (a = 'A'; a <= 'Z'; a++)
487 re_syntax_table[a] = Sword;
488 for (a = '0'; a <= '9'; a++)
489 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
490 for (a = '0'; a <= '7'; a++)
491 re_syntax_table[a] |= Soctaldigit;
492 for (a = 'A'; a <= 'F'; a++)
493 re_syntax_table[a] |= Shexdigit;
494 for (a = 'a'; a <= 'f'; a++)
495 re_syntax_table[a] |= Shexdigit;
496 re_syntax_table[(int)'_'] = Sword;
497 for (a = 9; a <= 13; a++)
498 re_syntax_table[a] = Swhitespace;
499 re_syntax_table[(int)' '] = Swhitespace;
501 re_compile_initialized = 1;
502 for (a = 0; a < 256; a++) {
503 plain_ops[a] = Rnormal;
504 quoted_ops[a] = Rnormal;
506 for (a = '0'; a <= '9'; a++)
507 quoted_ops[a] = Rmemory;
508 plain_ops[(int)'\134'] = Rquote;
509 if (regexp_syntax & RE_NO_BK_PARENS) {
510 plain_ops[(int)'('] = Ropenpar;
511 plain_ops[(int)')'] = Rclosepar;
513 quoted_ops[(int)'('] = Ropenpar;
514 quoted_ops[(int)')'] = Rclosepar;
516 if (regexp_syntax & RE_NO_BK_VBAR) {
517 plain_ops[(int)'\174'] = Ror;
519 quoted_ops[(int)'\174'] = Ror;
521 plain_ops[(int)'*'] = Rstar;
522 if (regexp_syntax & RE_BK_PLUS_QM) {
523 quoted_ops[(int)'+'] = Rplus;
524 quoted_ops[(int)'?'] = Roptional;
526 plain_ops[(int)'+'] = Rplus;
527 plain_ops[(int)'?'] = Roptional;
529 if (regexp_syntax & RE_NEWLINE_OR) {
530 plain_ops[(int)'\n'] = Ror;
532 plain_ops[(int)'\133'] = Ropenset;
533 plain_ops[(int)'\136'] = Rbol;
534 plain_ops[(int)'$'] = Reol;
535 plain_ops[(int)'.'] = Ranychar;
536 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
537 quoted_ops[(int)'w'] = Rwordchar;
538 quoted_ops[(int)'W'] = Rnotwordchar;
539 quoted_ops[(int)'<'] = Rwordbeg;
540 quoted_ops[(int)'>'] = Rwordend;
541 quoted_ops[(int)'b'] = Rwordbound;
542 quoted_ops[(int)'B'] = Rnotwordbound;
543 quoted_ops[(int)'`'] = Rbegbuf;
544 quoted_ops[(int)'\''] = Rendbuf;
546 if (regexp_syntax & RE_ANSI_HEX) {
547 quoted_ops[(int)'v'] = Rextended_memory;
549 for (a = 0; a < Rnum_ops; a++) {
552 if (regexp_syntax & RE_TIGHT_VBAR) {
553 precedences[Ror] = 3;
554 precedences[Rbol] = 2;
555 precedences[Reol] = 2;
557 precedences[Ror] = 2;
558 precedences[Rbol] = 3;
559 precedences[Reol] = 3;
561 precedences[Rclosepar] = 1;
562 precedences[Rend] = 0;
563 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
564 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
567 int re_set_syntax(int syntax) {
571 regexp_syntax = syntax;
572 re_syntax = syntax; /* Exported copy */
573 re_compile_initialize();
577 static int hex_char_to_decimal(int ch) {
578 if (ch >= '0' && ch <= '9')
580 if (ch >= 'a' && ch <= 'f')
581 return ch - 'a' + 10;
582 if (ch >= 'A' && ch <= 'F')
583 return ch - 'A' + 10;
587 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
588 unsigned char *visited,
589 unsigned char *can_be_null,
590 unsigned char *fastmap)
597 return; /* we have already been here */
600 switch (code[pos++]) {
611 for (a = 0; a < 256; a++)
615 syntaxcode = code[pos++];
616 for (a = 0; a < 256; a++)
617 if (SYNTAX(a) & syntaxcode)
621 syntaxcode = code[pos++];
622 for (a = 0; a < 256; a++)
623 if (!(SYNTAX(a) & syntaxcode))
627 fastmap[(int)'\n'] = 1;
628 if (*can_be_null == 0)
629 *can_be_null = 2; /* can match null, but only at end of buffer */
632 for (a = 0; a < 256 / 8; a++)
633 if (code[pos + a] != 0)
634 for (b = 0; b < 8; b++)
635 if (code[pos + a] & (1 << b))
636 fastmap[(a << 3) + b] = 1;
640 fastmap[(unsigned char)code[pos]] = 1;
643 for (a = 0; a < 256; a++)
652 for (a = 0; a < 256; a++)
657 case Cdummy_failure_jump:
658 case Cupdate_failure_jump:
660 a = (unsigned char)code[pos++];
661 a |= (unsigned char)code[pos++] << 8;
662 pos += (int)SHORT(a);
664 /* argh... the regexp contains empty loops. This is not
665 good, as this may cause a failure stack overflow when
666 matching. Oh well. */
667 /* this path leads nowhere; pursue other paths. */
673 a = (unsigned char)code[pos++];
674 a |= (unsigned char)code[pos++] << 8;
675 a = pos + (int)SHORT(a);
676 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
682 set_error("Unknown regex opcode: memory corrupted?");
688 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
689 int pos, unsigned char *can_be_null,
690 unsigned char *fastmap)
692 unsigned char small_visited[512], *visited;
694 if (used <= (int)sizeof(small_visited))
695 visited = small_visited;
697 visited = (unsigned char *)malloc(used);
702 memset(fastmap, 0, 256);
703 memset(visited, 0, used);
704 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
705 if (visited != small_visited)
710 void re_compile_fastmap(regex_t * bufp)
712 if (!bufp->fastmap || bufp->fastmap_accurate)
714 // assert(bufp->used > 0);
715 if (!re_do_compile_fastmap(bufp, bufp->buffer,
716 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
720 if (bufp->buffer[0] == Cbol)
721 bufp->anchor = 1; /* begline */
722 else if (bufp->buffer[0] == Cbegbuf)
723 bufp->anchor = 2; /* begbuf */
725 bufp->anchor = 0; /* none */
726 bufp->fastmap_accurate = 1;
732 * ... code for operand of star
734 * 2: ... code after star
736 * We change the star_jump to update_failure_jump if we can determine
737 * that it is safe to do so; otherwise we change it to an ordinary
744 * 2: ... code for operand of plus
746 * 3: ... code after plus
748 * For star_jump considerations this is processed identically to star.
752 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
754 unsigned char map[256];
755 unsigned char can_be_null;
761 int num_instructions = 0;
763 a = (unsigned char)*code++;
764 a |= (unsigned char)*code++ << 8;
767 p1 = code + a + 3; /* skip the failure_jump */
768 /* Check that the jump is within the pattern */
769 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
770 set_error("Regex VM jump out of bounds (failure_jump opt)");
773 // assert(p1[-3] == Cfailure_jump);
775 /* p1 points inside loop, p2 points to after loop */
776 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
777 (int)(p2 - bufp->buffer), &can_be_null, map))
778 goto make_normal_jump;
780 /* If we might introduce a new update point inside the
781 * loop, we can't optimize because then update_jump would
782 * update a wrong failure point. Thus we have to be
783 * quite careful here.
786 /* loop until we find something that consumes a character */
804 ch = (unsigned char)*p1++;
806 goto make_normal_jump;
809 for (b = 0; b < 256; b++)
810 if (b != '\n' && map[b])
811 goto make_normal_jump;
814 for (b = 0; b < 256; b++)
815 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
816 goto make_normal_jump;
820 goto make_normal_jump;
824 /* now we know that we can't backtrack. */
825 while (p1 != p2 - 3) {
854 case Cupdate_failure_jump:
855 case Cdummy_failure_jump:
856 goto make_normal_jump;
862 /* make_update_jump: */
864 a += 3; /* jump to after the Cfailure_jump */
865 code[0] = Cupdate_failure_jump;
868 if (num_instructions > 1)
870 // assert(num_instructions == 1);
871 /* if the only instruction matches a single character, we can do
873 p1 = code + 3 + a; /* start of sole instruction */
874 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
875 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
885 static int re_optimize(regex_t * bufp)
917 if (!re_optimize_star_jump(bufp, code)) {
921 case Cupdate_failure_jump:
923 case Cdummy_failure_jump:
934 #define NEXTCHAR(var) \
937 goto ends_prematurely; \
938 (var) = regex[pos]; \
942 #define ALLOC(amount) \
944 if (pattern_offset+(amount) > alloc) \
946 alloc += 256 + (amount); \
947 pattern = (unsigned char *)realloc(pattern, alloc); \
949 goto out_of_memory; \
953 #define STORE(ch) pattern[pattern_offset++] = (ch)
955 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
957 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
959 #define PUSH_LEVEL_STARTS \
960 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
961 starts_base += NUM_LEVELS; \
965 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
967 #define PUT_ADDR(offset,addr) \
969 int disp = (addr) - (offset) - 2; \
970 pattern[(offset)] = disp & 0xff; \
971 pattern[(offset)+1] = (disp>>8) & 0xff; \
974 #define INSERT_JUMP(pos,type,addr) \
976 int a, p = (pos), t = (type), ad = (addr); \
977 for (a = pattern_offset - 1; a >= p; a--) \
978 pattern[a + 3] = pattern[a]; \
981 pattern_offset += 3; \
984 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
988 bufp->allocated = alloc; \
989 bufp->buffer = pattern; \
990 bufp->used = pattern_offset; \
993 #define GETHEX(var) \
995 unsigned char gethex_ch, gethex_value; \
996 NEXTCHAR(gethex_ch); \
997 gethex_value = hex_char_to_decimal(gethex_ch); \
998 if (gethex_value == 16) \
1000 NEXTCHAR(gethex_ch); \
1001 gethex_ch = hex_char_to_decimal(gethex_ch); \
1002 if (gethex_ch == 16) \
1004 (var) = gethex_value * 16 + gethex_ch; \
1007 #define ANSI_TRANSLATE(ch) \
1014 ch = 7; /* audible bell */ \
1020 ch = 8; /* backspace */ \
1026 ch = 12; /* form feed */ \
1032 ch = 10; /* line feed */ \
1038 ch = 13; /* carriage return */ \
1050 ch = 11; /* vertical tab */ \
1053 case 'x': /* hex code */ \
1061 /* other characters passed through */ \
1063 ch = translate[(unsigned char)ch]; \
1069 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1077 int pattern_offset = 0, alloc;
1078 int starts[NUM_LEVELS * MAX_NESTING];
1080 int future_jumps[MAX_NESTING];
1082 unsigned char ch = '\0';
1083 unsigned char *pattern;
1084 unsigned char *translate;
1087 int num_open_registers;
1088 int open_registers[RE_NREGS];
1089 int beginning_context;
1090 int size = strlen((char *)regex);
1092 if (!re_compile_initialized)
1093 re_compile_initialize();
1095 bufp->fastmap_accurate = 0;
1096 bufp->uses_registers = 1;
1097 bufp->num_registers = 1;
1098 translate = bufp->translate;
1099 pattern = bufp->buffer;
1100 alloc = bufp->allocated;
1101 if (alloc == 0 || pattern == NULL) {
1103 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1112 num_open_registers = 0;
1115 beginning_context = 1;
1117 /* we use Rend dummy to ensure that pending jumps are updated
1118 (due to low priority of Rend) before exiting the loop. */
1120 while (op != Rend) {
1126 ch = translate[(unsigned char)ch];
1127 op = plain_ops[(unsigned char)ch];
1130 op = quoted_ops[(unsigned char)ch];
1131 if (op == Rnormal && regexp_ansi_sequences)
1135 level = precedences[op];
1136 /* printf("ch='%c' op=%d level=%d current_level=%d
1137 curlevstart=%d\n", ch, op, level, current_level,
1138 CURRENT_LEVEL_START); */
1139 if (level > current_level) {
1140 for (current_level++; current_level < level; current_level++)
1143 } else if (level < current_level) {
1144 current_level = level;
1145 for (; num_jumps > 0 &&
1146 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1147 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1155 store_opcode_and_arg: /* opcode & ch must be set */
1169 set_error("Rquote");
1170 /*NOTREACHED*/ case Rbol:
1171 if (!beginning_context) {
1172 if (regexp_context_indep_ops)
1180 if (!((pos >= size) ||
1181 ((regexp_syntax & RE_NO_BK_VBAR) ?
1182 (regex[pos] == '\174') :
1183 (pos + 1 < size && regex[pos] == '\134' &&
1184 regex[pos + 1] == '\174')) ||
1185 ((regexp_syntax & RE_NO_BK_PARENS) ?
1186 (regex[pos] == ')') :
1187 (pos + 1 < size && regex[pos] == '\134' &&
1188 regex[pos + 1] == ')')))) {
1189 if (regexp_context_indep_ops)
1199 if (beginning_context) {
1200 if (regexp_context_indep_ops)
1205 if (CURRENT_LEVEL_START == pattern_offset)
1206 break; /* ignore empty patterns for ? */
1208 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1212 if (beginning_context) {
1213 if (regexp_context_indep_ops)
1218 if (CURRENT_LEVEL_START == pattern_offset)
1219 break; /* ignore empty patterns for + and * */
1221 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1222 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1223 if (op == Rplus) /* jump over initial failure_jump */
1224 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1225 CURRENT_LEVEL_START + 6);
1229 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1230 if (num_jumps >= MAX_NESTING)
1233 future_jumps[num_jumps++] = pattern_offset;
1241 if (next_register < RE_NREGS) {
1242 bufp->uses_registers = 1;
1244 STORE(Cstart_memory);
1245 STORE(next_register);
1246 open_registers[num_open_registers++] = next_register;
1247 bufp->num_registers++;
1257 if (paren_depth <= 0)
1258 goto parenthesis_error;
1260 current_level = precedences[Ropenpar];
1262 if (paren_depth < num_open_registers) {
1263 bufp->uses_registers = 1;
1266 num_open_registers--;
1267 STORE(open_registers[num_open_registers]);
1272 goto bad_match_register;
1273 if (!(ch >= '0' && ch <= '9')) {
1274 goto bad_match_register;
1276 bufp->uses_registers = 1;
1277 opcode = Cmatch_memory;
1279 goto store_opcode_and_arg;
1280 case Rextended_memory:
1282 if (ch < '0' || ch > '9')
1283 goto bad_match_register;
1285 if (a < '0' || a > '9')
1286 goto bad_match_register;
1287 ch = 10 * (a - '0') + ch - '0';
1288 if (ch == 0 || ch >= RE_NREGS)
1289 goto bad_match_register;
1290 bufp->uses_registers = 1;
1291 opcode = Cmatch_memory;
1292 goto store_opcode_and_arg;
1304 offset = pattern_offset;
1305 for (a = 0; a < 256 / 8; a++)
1309 ch = translate[(unsigned char)ch];
1314 ch = translate[(unsigned char)ch];
1320 while (ch != '\135' || firstchar) {
1322 if (regexp_ansi_sequences && ch == '\134') {
1327 for (a = prev; a <= (int)ch; a++)
1328 SETBIT(pattern, offset, a);
1331 } else if (prev != -1 && ch == '-')
1334 SETBIT(pattern, offset, ch);
1339 ch = translate[(unsigned char)ch];
1342 SETBIT(pattern, offset, '-');
1344 for (a = 0; a < 256 / 8; a++)
1345 pattern[offset + a] ^= 0xff;
1361 opcode = Csyntaxspec;
1363 goto store_opcode_and_arg;
1367 opcode = Cnotsyntaxspec;
1369 goto store_opcode_and_arg;
1383 opcode = Cwordbound;
1388 opcode = Cnotwordbound;
1396 beginning_context = (op == Ropenpar || op == Ror);
1398 if (starts_base != 0)
1399 goto parenthesis_error;
1400 // assert(num_jumps == 0);
1404 if (!re_optimize(bufp))
1405 return "Optimization error";
1410 return "Badly placed special character";
1414 return "Bad match register number";
1418 return "Bad hexadecimal number";
1422 return "Badly placed parenthesis";
1426 return "Out of memory";
1430 return "Regular expression ends prematurely";
1434 return "Regular expression too complex";
1442 #undef CURRENT_LEVEL_START
1443 #undef SET_LEVEL_START
1444 #undef PUSH_LEVEL_STARTS
1445 #undef POP_LEVEL_STARTS
1451 #define PREFETCH if (text == textend) goto fail
1453 #define NEXTCHAR(var) \
1455 var = (unsigned char)*text++; \
1457 var = translate[var]
1460 int regcomp(regex_t * bufp, const char *regex, int cflags)
1462 memset(bufp, 0, sizeof(regex_t));
1463 bufp->cflags = cflags;
1464 if (bufp->cflags & REG_ICASE) {
1465 char *p, *lcase = bstrdup(regex);
1466 for( p = lcase; *p ; p++) {
1469 re_compile_pattern(bufp, (unsigned char *)lcase);
1472 re_compile_pattern(bufp, (unsigned char *)regex);
1480 void re_registers_to_regmatch(regexp_registers_t old_regs,
1481 regmatch_t pmatch[],
1486 /* We have to set the last entry to -1 */
1487 nmatch = nmatch - 1;
1488 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1489 pmatch[i].rm_so = old_regs->start[i];
1490 pmatch[i].rm_eo = old_regs->end[i];
1493 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1496 int regexec(regex_t * preg, const char *string, size_t nmatch,
1497 regmatch_t pmatch[], int eflags)
1500 int len = strlen(string);
1501 struct re_registers regs;
1502 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1504 re_registers_to_regmatch(®s, pmatch, nmatch);
1506 /* stat is the start position in the string base 0 where
1507 * the pattern was found or negative if not found.
1509 return stat < 0 ? -1 : 0;
1512 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1514 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1518 void regfree(regex_t * preg)
1521 free_pool_memory(preg->lcase);
1526 preg->buffer = NULL;
1530 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1531 regexp_registers_t old_regs)
1533 unsigned char *code;
1534 unsigned char *translate;
1535 unsigned char *text;
1536 unsigned char *textstart;
1537 unsigned char *textend;
1543 unsigned char *regstart;
1544 unsigned char *regend;
1548 // assert(pos >= 0 && size >= 0);
1549 // assert(pos <= size);
1551 text = string + pos;
1553 textend = string + size;
1555 code = bufp->buffer;
1557 translate = bufp->translate;
1559 NEW_STATE(state, bufp->num_registers);
1565 match_end = text - textstart;
1567 old_regs->start[0] = pos;
1568 old_regs->end[0] = match_end;
1569 if (!bufp->uses_registers) {
1570 for (a = 1; a < RE_NREGS; a++) {
1571 old_regs->start[a] = -1;
1572 old_regs->end[a] = -1;
1575 for (a = 1; a < bufp->num_registers; a++) {
1576 if ((GET_REG_START(state, a) == NULL) ||
1577 (GET_REG_END(state, a) == NULL)) {
1578 old_regs->start[a] = -1;
1579 old_regs->end[a] = -1;
1582 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1583 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1585 for (; a < RE_NREGS; a++) {
1586 old_regs->start[a] = -1;
1587 old_regs->end[a] = -1;
1592 return match_end - pos;
1596 if (text == textstart || text[-1] == '\n')
1597 goto continue_matching;
1602 if (text == textend || *text == '\n')
1603 goto continue_matching;
1609 if (code[ch / 8] & (1 << (ch & 7))) {
1611 goto continue_matching;
1618 if (ch != (unsigned char)*code++)
1620 goto continue_matching;
1627 goto continue_matching;
1632 SET_REG_START(state, reg, text, goto error);
1633 goto continue_matching;
1638 SET_REG_END(state, reg, text, goto error);
1639 goto continue_matching;
1644 regstart = GET_REG_START(state, reg);
1645 regend = GET_REG_END(state, reg);
1646 if ((regstart == NULL) || (regend == NULL))
1647 goto fail; /* or should we just match nothing? */
1648 regsize = regend - regstart;
1650 if (regsize > (textend - text))
1653 for (; regstart < regend; regstart++, text++)
1654 if (translate[*regstart] != translate[*text])
1657 for (; regstart < regend; regstart++, text++)
1658 if (*regstart != *text)
1660 goto continue_matching;
1662 case Cupdate_failure_jump:
1664 UPDATE_FAILURE(state, text, goto error);
1665 /* fall to next case */
1667 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1671 a = (unsigned char)*code++;
1672 a |= (unsigned char)*code++ << 8;
1673 code += (int)SHORT(a);
1674 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1675 set_error("Regex VM jump out of bounds (Cjump)");
1679 goto continue_matching;
1681 case Cdummy_failure_jump:
1683 unsigned char *failuredest;
1685 a = (unsigned char)*code++;
1686 a |= (unsigned char)*code++ << 8;
1688 // assert(*code == Cfailure_jump);
1689 b = (unsigned char)code[1];
1690 b |= (unsigned char)code[2] << 8;
1691 failuredest = code + (int)SHORT(b) + 3;
1692 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1694 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1698 PUSH_FAILURE(state, failuredest, NULL, goto error);
1700 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1701 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1705 goto continue_matching;
1709 a = (unsigned char)*code++;
1710 a |= (unsigned char)*code++ << 8;
1712 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1713 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1717 PUSH_FAILURE(state, code + a, text, goto error);
1718 goto continue_matching;
1722 unsigned char *pinst;
1723 a = (unsigned char)*code++;
1724 a |= (unsigned char)*code++ << 8;
1727 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1728 set_error("Regex VM jump out of bounds (Crepeat1)");
1732 /* pinst is sole instruction in loop, and it matches a
1733 * single character. Since Crepeat1 was originally a
1734 * Cupdate_failure_jump, we also know that backtracking
1735 * is useless: so long as the single-character
1736 * expression matches, it must be used. Also, in the
1737 * case of +, we've already matched one character, so +
1738 * can't fail: nothing here can cause a failure. */
1743 while (text < textend) {
1744 ch = translate[(unsigned char)*text];
1745 if (pinst[ch / 8] & (1 << (ch & 7)))
1751 while (text < textend) {
1752 ch = (unsigned char)*text;
1753 if (pinst[ch / 8] & (1 << (ch & 7)))
1763 ch = (unsigned char)*pinst;
1765 while (text < textend && translate[(unsigned char)*text] == ch)
1768 while (text < textend && (unsigned char)*text == ch)
1775 while (text < textend && (unsigned char)*text != '\n')
1781 a = (unsigned char)*pinst;
1783 while (text < textend && (SYNTAX(translate[*text]) & a))
1786 while (text < textend && (SYNTAX(*text) & a))
1791 case Cnotsyntaxspec:
1793 a = (unsigned char)*pinst;
1795 while (text < textend && !(SYNTAX(translate[*text]) & a))
1798 while (text < textend && !(SYNTAX(*text) & a))
1806 set_error("Unknown regex opcode: memory corrupted?");
1810 /* due to the funky way + and * are compiled, the top
1811 * failure- stack entry at this point is actually a
1812 * success entry -- update it & pop it */
1813 UPDATE_FAILURE(state, text, goto error);
1814 goto fail; /* i.e., succeed <wink/sigh> */
1818 if (text == textstart)
1819 goto continue_matching;
1824 if (text == textend)
1825 goto continue_matching;
1830 if (text == textend)
1832 if (!(SYNTAX(*text) & Sword))
1834 if (text == textstart)
1835 goto continue_matching;
1836 if (!(SYNTAX(text[-1]) & Sword))
1837 goto continue_matching;
1842 if (text == textstart)
1844 if (!(SYNTAX(text[-1]) & Sword))
1846 if (text == textend)
1847 goto continue_matching;
1848 if (!(SYNTAX(*text) & Sword))
1849 goto continue_matching;
1854 /* Note: as in gnu regexp, this also matches at the
1855 * beginning and end of buffer. */
1857 if (text == textstart || text == textend)
1858 goto continue_matching;
1859 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1860 goto continue_matching;
1865 /* Note: as in gnu regexp, this never matches at the
1866 * beginning and end of buffer. */
1867 if (text == textstart || text == textend)
1869 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1870 goto continue_matching;
1876 if (!(SYNTAX(ch) & (unsigned char)*code++))
1878 goto continue_matching;
1880 case Cnotsyntaxspec:
1883 if (SYNTAX(ch) & (unsigned char)*code++)
1885 goto continue_matching;
1890 set_error("Unknown regex opcode: memory corrupted?");
1897 #if 0 /* This line is never reached --Guido */
1904 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1906 POP_FAILURE(state, code, text, goto done_matching, goto error);
1907 goto continue_matching;
1910 /* if(translated != NULL) */
1911 /* free(translated); */
1916 /* if (translated != NULL) */
1917 /* free(translated); */
1926 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1927 int range, regexp_registers_t regs)
1929 unsigned char *fastmap;
1930 unsigned char *translate;
1931 unsigned char *text;
1932 unsigned char *partstart;
1933 unsigned char *partend;
1936 unsigned char anchor;
1937 unsigned char *string = str;
1939 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1940 int len = strlen((const char *)str);
1942 bufp->lcase = get_pool_memory(PM_FNAME);
1944 bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1945 unsigned char *dst = (unsigned char *)bufp->lcase;
1947 *dst++ = tolower(*string++);
1950 string = (unsigned char *)bufp->lcase;
1953 // assert(size >= 0 && pos >= 0);
1954 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1956 fastmap = bufp->fastmap;
1957 translate = bufp->translate;
1958 if (fastmap && !bufp->fastmap_accurate) {
1959 re_compile_fastmap(bufp);
1964 anchor = bufp->anchor;
1965 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1981 for (; range >= 0; range--, pos += dir) {
1983 if (dir == 1) { /* searching forwards */
1985 text = string + pos;
1986 partend = string + size;
1989 while (text != partend &&
1990 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1993 while (text != partend && !fastmap[(unsigned char)*text])
1995 pos += text - partstart;
1996 range -= text - partstart;
1997 if (pos == size && bufp->can_be_null == 0)
1999 } else { /* searching backwards */
2000 text = string + pos;
2001 partstart = string + pos - range;
2004 while (text != partstart && !fastmap[(unsigned char)
2005 translate[(unsigned char)*text]])
2008 while (text != partstart && !fastmap[(unsigned char)*text])
2010 pos -= partend - text;
2011 range -= partend - text;
2014 if (anchor == 1) { /* anchored to begline */
2015 if (pos > 0 && (string[pos - 1] != '\n'))
2018 // assert(pos >= 0 && pos <= size);
2019 ret = re_match(bufp, string, size, pos, regs);
2031 ** c-file-style: "python"