3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
31 * This file modified to work with Bacula and C++ by
32 * Kern Sibbald, April 2006
34 * This file modified to work with REG_ICASE and Bacula by
35 * Eric Bollengier April 2007
38 Bacula® - The Network Backup Solution
40 Copyright (C) 2006-2008 Free Software Foundation Europe e.V.
42 The main author of Bacula is Kern Sibbald, with contributions from
43 many others, a complete list can be found in the file AUTHORS.
44 This program is Free Software; you can redistribute it and/or
45 modify it under the terms of version two of the GNU General Public
46 License as published by the Free Software Foundation and included
49 This program is distributed in the hope that it will be useful, but
50 WITHOUT ANY WARRANTY; without even the implied warranty of
51 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 General Public License for more details.
54 You should have received a copy of the GNU General Public License
55 along with this program; if not, write to the Free Software
56 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
59 Bacula® is a registered trademark of John Walker.
60 The licensor of Bacula is the Free Software Foundation Europe
61 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
62 Switzerland, email:ftf@fsfeurope.org.
69 #define set_error(x) bufp->errmsg=((char *)(x))
70 #define got_error bufp->errmsg!=NULL
72 /* The original code blithely assumed that sizeof(short) == 2. Not
73 * always true. Original instances of "(short)x" were replaced by
74 * SHORT(x), where SHORT is #defined below. */
76 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
78 /* The stack implementation is taken from an idea by Andrew Kuchling.
79 * It's a doubly linked list of arrays. The advantages of this over a
80 * simple linked list are that the number of mallocs required are
81 * reduced. It also makes it possible to statically allocate enough
82 * space so that small patterns don't ever need to call malloc.
84 * The advantages over a single array is that is periodically
85 * realloced when more space is needed is that we avoid ever copying
88 /* item_t is the basic stack element. Defined as a union of
89 * structures so that both registers, failure points, and counters can
90 * be pushed/popped from the stack. There's nothing built into the
91 * item to keep track of whether a certain stack item is a register, a
92 * failure point, or a counter. */
94 typedef union item_t {
115 #define STACK_PAGE_SIZE 256
116 #define NUM_REGISTERS 256
118 /* A 'page' of stack items. */
120 typedef struct item_page_t {
121 item_t items[STACK_PAGE_SIZE];
122 struct item_page_t *prev;
123 struct item_page_t *next;
127 typedef struct match_state {
128 /* The number of registers that have been pushed onto the stack
129 * since the last failure point. */
133 /* Used to control when registers need to be pushed onto the
138 /* The number of failure points on the stack. */
142 /* Storage for the registers. Each register consists of two
143 * pointers to characters. So register N is represented as
144 * start[N] and end[N]. The pointers must be converted to
145 * offsets from the beginning of the string before returning the
146 * registers to the calling program. */
148 unsigned char *start[NUM_REGISTERS];
149 unsigned char *end[NUM_REGISTERS];
151 /* Keeps track of whether a register has changed recently. */
153 int changed[NUM_REGISTERS];
155 /* Structure to encapsulate the stack. */
157 /* index into the current page. If index == 0 and you need
158 * to pop an item, move to the previous page and set index
159 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
160 * push a page. If index == STACK_PAGE_SIZE and you need
161 * to push a page move to the next page and set index =
162 * 0. If there is no new next page, allocate a new page
163 * and link it in. Otherwise, increment index to push a
167 item_page_t *current; /* Pointer to the current page. */
168 item_page_t first; /* First page is statically allocated. */
172 /* Initialize a state object */
174 /* #define NEW_STATE(state) \ */
175 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
176 /* state.stack.current = &state.stack.first; \ */
177 /* state.stack.first.prev = NULL; \ */
178 /* state.stack.first.next = NULL; \ */
179 /* state.stack.index = 0; \ */
180 /* state.level = 1 */
182 #define NEW_STATE(state, nregs) \
185 for (i = 0; i < nregs; i++) \
187 state.start[i] = NULL; \
188 state.end[i] = NULL; \
189 state.changed[i] = 0; \
191 state.stack.current = &state.stack.first; \
192 state.stack.first.prev = NULL; \
193 state.stack.first.next = NULL; \
194 state.stack.index = 0; \
201 /* Free any memory that might have been malloc'd */
203 #define FREE_STATE(state) \
204 while(state.stack.first.next != NULL) \
206 state.stack.current = state.stack.first.next; \
207 state.stack.first.next = state.stack.current->next; \
208 free(state.stack.current); \
211 /* Discard the top 'count' stack items. */
213 #define STACK_DISCARD(stack, count, on_error) \
214 stack.index -= count; \
215 while (stack.index < 0) \
217 if (stack.current->prev == NULL) \
219 stack.current = stack.current->prev; \
220 stack.index += STACK_PAGE_SIZE; \
223 /* Store a pointer to the previous item on the stack. Used to pop an
224 * item off of the stack. */
226 #define STACK_PREV(stack, top, on_error) \
227 if (stack.index == 0) \
229 if (stack.current->prev == NULL) \
231 stack.current = stack.current->prev; \
232 stack.index = STACK_PAGE_SIZE - 1; \
238 top = &(stack.current->items[stack.index])
240 /* Store a pointer to the next item on the stack. Used to push an item
241 * on to the stack. */
243 #define STACK_NEXT(stack, top, on_error) \
244 if (stack.index == STACK_PAGE_SIZE) \
246 if (stack.current->next == NULL) \
248 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
249 if (stack.current->next == NULL) \
251 stack.current->next->prev = stack.current; \
252 stack.current->next->next = NULL; \
254 stack.current = stack.current->next; \
257 top = &(stack.current->items[stack.index++])
259 /* Store a pointer to the item that is 'count' items back in the
260 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
261 * STACK_TOP(stack, top, on_error). */
263 #define STACK_BACK(stack, top, count, on_error) \
266 item_page_t *current; \
267 current = stack.current; \
268 index = stack.index - (count); \
271 if (current->prev == NULL) \
273 current = current->prev; \
274 index += STACK_PAGE_SIZE; \
276 top = &(current->items[index]); \
279 /* Store a pointer to the top item on the stack. Execute the
280 * 'on_error' code if there are no items on the stack. */
282 #define STACK_TOP(stack, top, on_error) \
283 if (stack.index == 0) \
285 if (stack.current->prev == NULL) \
287 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
291 top = &(stack.current->items[stack.index - 1]); \
294 /* Test to see if the stack is empty */
296 #define STACK_EMPTY(stack) ((stack.index == 0) && \
297 (stack.current->prev == NULL))
299 /* Return the start of register 'reg' */
301 #define GET_REG_START(state, reg) (state.start[reg])
303 /* Return the end of register 'reg' */
305 #define GET_REG_END(state, reg) (state.end[reg])
307 /* Set the start of register 'reg'. If the state of the register needs
308 * saving, push it on the stack. */
310 #define SET_REG_START(state, reg, text, on_error) \
311 if(state.changed[reg] < state.level) \
314 STACK_NEXT(state.stack, item, on_error); \
315 item->reg.num = reg; \
316 item->reg.start = state.start[reg]; \
317 item->reg.end = state.end[reg]; \
318 item->reg.level = state.changed[reg]; \
319 state.changed[reg] = state.level; \
322 state.start[reg] = text
324 /* Set the end of register 'reg'. If the state of the register needs
325 * saving, push it on the stack. */
327 #define SET_REG_END(state, reg, text, on_error) \
328 if(state.changed[reg] < state.level) \
331 STACK_NEXT(state.stack, item, on_error); \
332 item->reg.num = reg; \
333 item->reg.start = state.start[reg]; \
334 item->reg.end = state.end[reg]; \
335 item->reg.level = state.changed[reg]; \
336 state.changed[reg] = state.level; \
339 state.end[reg] = text
341 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
344 STACK_NEXT(state.stack, item, on_error); \
345 item->fail.code = xcode; \
346 item->fail.text = xtext; \
347 item->fail.count = state.count; \
348 item->fail.level = state.level; \
349 item->fail.phantom = 0; \
355 /* Update the last failure point with a new position in the text. */
357 #define UPDATE_FAILURE(state, xtext, on_error) \
360 STACK_BACK(state.stack, item, state.count + 1, on_error); \
361 if (!item->fail.phantom) \
364 STACK_NEXT(state.stack, item2, on_error); \
365 item2->fail.code = item->fail.code; \
366 item2->fail.text = xtext; \
367 item2->fail.count = state.count; \
368 item2->fail.level = state.level; \
369 item2->fail.phantom = 1; \
376 STACK_DISCARD(state.stack, state.count, on_error); \
377 STACK_TOP(state.stack, item, on_error); \
378 item->fail.text = xtext; \
384 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
389 while(state.count > 0) \
391 STACK_PREV(state.stack, item, on_error); \
392 state.start[item->reg.num] = item->reg.start; \
393 state.end[item->reg.num] = item->reg.end; \
394 state.changed[item->reg.num] = item->reg.level; \
397 STACK_PREV(state.stack, item, on_empty); \
398 xcode = item->fail.code; \
399 xtext = item->fail.text; \
400 state.count = item->fail.count; \
401 state.level = item->fail.level; \
404 while (item->fail.text == NULL); \
407 enum regexp_compiled_ops { /* opcodes for compiled regexp */
408 Cend, /* end of pattern reached */
409 Cbol, /* beginning of line */
410 Ceol, /* end of line */
411 Cset, /* character set. Followed by 32 bytes of set. */
412 Cexact, /* followed by a byte to match */
413 Canychar, /* matches any character except newline */
414 Cstart_memory, /* set register start addr (followed by reg number) */
415 Cend_memory, /* set register end addr (followed by reg number) */
416 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
417 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
418 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
419 Cfailure_jump, /* jump to addr on failure */
420 Cupdate_failure_jump, /* update topmost failure point and jump */
421 Cdummy_failure_jump, /* push a dummy failure point and jump */
422 Cbegbuf, /* match at beginning of buffer */
423 Cendbuf, /* match at end of buffer */
424 Cwordbeg, /* match at beginning of word */
425 Cwordend, /* match at end of word */
426 Cwordbound, /* match if at word boundary */
427 Cnotwordbound, /* match if not at word boundary */
428 Csyntaxspec, /* matches syntax code (1 byte follows) */
429 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
433 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
434 Rend, /* special code for end of regexp */
435 Rnormal, /* normal character */
436 Ranychar, /* any character except newline */
437 Rquote, /* the quote character */
438 Rbol, /* match beginning of line */
439 Reol, /* match end of line */
440 Roptional, /* match preceding expression optionally */
441 Rstar, /* match preceding expr zero or more times */
442 Rplus, /* match preceding expr one or more times */
443 Ror, /* match either of alternatives */
444 Ropenpar, /* opening parenthesis */
445 Rclosepar, /* closing parenthesis */
446 Rmemory, /* match memory register */
447 Rextended_memory, /* \vnn to match registers 10-99 */
448 Ropenset, /* open set. Internal syntax hard-coded below. */
449 /* the following are gnu extensions to "normal" regexp syntax */
450 Rbegbuf, /* beginning of buffer */
451 Rendbuf, /* end of buffer */
452 Rwordchar, /* word character */
453 Rnotwordchar, /* not word character */
454 Rwordbeg, /* beginning of word */
455 Rwordend, /* end of word */
456 Rwordbound, /* word bound */
457 Rnotwordbound, /* not word bound */
461 static int re_compile_initialized = 0;
462 static int regexp_syntax = RE_SYNTAX_EGREP;
463 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
464 static unsigned char plain_ops[256];
465 static unsigned char quoted_ops[256];
466 static unsigned char precedences[Rnum_ops];
467 static int regexp_context_indep_ops;
468 static int regexp_ansi_sequences;
470 #define NUM_LEVELS 5 /* number of precedence levels in use */
471 #define MAX_NESTING 100 /* max nesting level of operators */
473 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
475 unsigned char re_syntax_table[256];
477 void re_compile_initialize(void)
481 static int syntax_table_inited = 0;
483 if (!syntax_table_inited) {
484 syntax_table_inited = 1;
485 memset(re_syntax_table, 0, 256);
486 for (a = 'a'; a <= 'z'; a++)
487 re_syntax_table[a] = Sword;
488 for (a = 'A'; a <= 'Z'; a++)
489 re_syntax_table[a] = Sword;
490 for (a = '0'; a <= '9'; a++)
491 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
492 for (a = '0'; a <= '7'; a++)
493 re_syntax_table[a] |= Soctaldigit;
494 for (a = 'A'; a <= 'F'; a++)
495 re_syntax_table[a] |= Shexdigit;
496 for (a = 'a'; a <= 'f'; a++)
497 re_syntax_table[a] |= Shexdigit;
498 re_syntax_table[(int)'_'] = Sword;
499 for (a = 9; a <= 13; a++)
500 re_syntax_table[a] = Swhitespace;
501 re_syntax_table[(int)' '] = Swhitespace;
503 re_compile_initialized = 1;
504 for (a = 0; a < 256; a++) {
505 plain_ops[a] = Rnormal;
506 quoted_ops[a] = Rnormal;
508 for (a = '0'; a <= '9'; a++)
509 quoted_ops[a] = Rmemory;
510 plain_ops[(int)'\134'] = Rquote;
511 if (regexp_syntax & RE_NO_BK_PARENS) {
512 plain_ops[(int)'('] = Ropenpar;
513 plain_ops[(int)')'] = Rclosepar;
515 quoted_ops[(int)'('] = Ropenpar;
516 quoted_ops[(int)')'] = Rclosepar;
518 if (regexp_syntax & RE_NO_BK_VBAR) {
519 plain_ops[(int)'\174'] = Ror;
521 quoted_ops[(int)'\174'] = Ror;
523 plain_ops[(int)'*'] = Rstar;
524 if (regexp_syntax & RE_BK_PLUS_QM) {
525 quoted_ops[(int)'+'] = Rplus;
526 quoted_ops[(int)'?'] = Roptional;
528 plain_ops[(int)'+'] = Rplus;
529 plain_ops[(int)'?'] = Roptional;
531 if (regexp_syntax & RE_NEWLINE_OR) {
532 plain_ops[(int)'\n'] = Ror;
534 plain_ops[(int)'\133'] = Ropenset;
535 plain_ops[(int)'\136'] = Rbol;
536 plain_ops[(int)'$'] = Reol;
537 plain_ops[(int)'.'] = Ranychar;
538 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
539 quoted_ops[(int)'w'] = Rwordchar;
540 quoted_ops[(int)'W'] = Rnotwordchar;
541 quoted_ops[(int)'<'] = Rwordbeg;
542 quoted_ops[(int)'>'] = Rwordend;
543 quoted_ops[(int)'b'] = Rwordbound;
544 quoted_ops[(int)'B'] = Rnotwordbound;
545 quoted_ops[(int)'`'] = Rbegbuf;
546 quoted_ops[(int)'\''] = Rendbuf;
548 if (regexp_syntax & RE_ANSI_HEX) {
549 quoted_ops[(int)'v'] = Rextended_memory;
551 for (a = 0; a < Rnum_ops; a++) {
554 if (regexp_syntax & RE_TIGHT_VBAR) {
555 precedences[Ror] = 3;
556 precedences[Rbol] = 2;
557 precedences[Reol] = 2;
559 precedences[Ror] = 2;
560 precedences[Rbol] = 3;
561 precedences[Reol] = 3;
563 precedences[Rclosepar] = 1;
564 precedences[Rend] = 0;
565 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
566 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
569 int re_set_syntax(int syntax) {
573 regexp_syntax = syntax;
574 re_syntax = syntax; /* Exported copy */
575 re_compile_initialize();
579 static int hex_char_to_decimal(int ch) {
580 if (ch >= '0' && ch <= '9')
582 if (ch >= 'a' && ch <= 'f')
583 return ch - 'a' + 10;
584 if (ch >= 'A' && ch <= 'F')
585 return ch - 'A' + 10;
589 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
590 unsigned char *visited,
591 unsigned char *can_be_null,
592 unsigned char *fastmap)
599 return; /* we have already been here */
602 switch (code[pos++]) {
613 for (a = 0; a < 256; a++)
617 syntaxcode = code[pos++];
618 for (a = 0; a < 256; a++)
619 if (SYNTAX(a) & syntaxcode)
623 syntaxcode = code[pos++];
624 for (a = 0; a < 256; a++)
625 if (!(SYNTAX(a) & syntaxcode))
629 fastmap[(int)'\n'] = 1;
630 if (*can_be_null == 0)
631 *can_be_null = 2; /* can match null, but only at end of buffer */
634 for (a = 0; a < 256 / 8; a++)
635 if (code[pos + a] != 0)
636 for (b = 0; b < 8; b++)
637 if (code[pos + a] & (1 << b))
638 fastmap[(a << 3) + b] = 1;
642 fastmap[(unsigned char)code[pos]] = 1;
645 for (a = 0; a < 256; a++)
654 for (a = 0; a < 256; a++)
659 case Cdummy_failure_jump:
660 case Cupdate_failure_jump:
662 a = (unsigned char)code[pos++];
663 a |= (unsigned char)code[pos++] << 8;
664 pos += (int)SHORT(a);
666 /* argh... the regexp contains empty loops. This is not
667 good, as this may cause a failure stack overflow when
668 matching. Oh well. */
669 /* this path leads nowhere; pursue other paths. */
675 a = (unsigned char)code[pos++];
676 a |= (unsigned char)code[pos++] << 8;
677 a = pos + (int)SHORT(a);
678 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
684 set_error("Unknown regex opcode: memory corrupted?");
690 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
691 int pos, unsigned char *can_be_null,
692 unsigned char *fastmap)
694 unsigned char small_visited[512], *visited;
696 if (used <= (int)sizeof(small_visited))
697 visited = small_visited;
699 visited = (unsigned char *)malloc(used);
704 memset(fastmap, 0, 256);
705 memset(visited, 0, used);
706 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
707 if (visited != small_visited)
712 void re_compile_fastmap(regex_t * bufp)
714 if (!bufp->fastmap || bufp->fastmap_accurate)
716 // assert(bufp->used > 0);
717 if (!re_do_compile_fastmap(bufp, bufp->buffer,
718 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
722 if (bufp->buffer[0] == Cbol)
723 bufp->anchor = 1; /* begline */
724 else if (bufp->buffer[0] == Cbegbuf)
725 bufp->anchor = 2; /* begbuf */
727 bufp->anchor = 0; /* none */
728 bufp->fastmap_accurate = 1;
734 * ... code for operand of star
736 * 2: ... code after star
738 * We change the star_jump to update_failure_jump if we can determine
739 * that it is safe to do so; otherwise we change it to an ordinary
746 * 2: ... code for operand of plus
748 * 3: ... code after plus
750 * For star_jump considerations this is processed identically to star.
754 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
756 unsigned char map[256];
757 unsigned char can_be_null;
763 int num_instructions = 0;
765 a = (unsigned char)*code++;
766 a |= (unsigned char)*code++ << 8;
769 p1 = code + a + 3; /* skip the failure_jump */
770 /* Check that the jump is within the pattern */
771 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
772 set_error("Regex VM jump out of bounds (failure_jump opt)");
775 // assert(p1[-3] == Cfailure_jump);
777 /* p1 points inside loop, p2 points to after loop */
778 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
779 (int)(p2 - bufp->buffer), &can_be_null, map))
780 goto make_normal_jump;
782 /* If we might introduce a new update point inside the
783 * loop, we can't optimize because then update_jump would
784 * update a wrong failure point. Thus we have to be
785 * quite careful here.
788 /* loop until we find something that consumes a character */
806 ch = (unsigned char)*p1++;
808 goto make_normal_jump;
811 for (b = 0; b < 256; b++)
812 if (b != '\n' && map[b])
813 goto make_normal_jump;
816 for (b = 0; b < 256; b++)
817 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
818 goto make_normal_jump;
822 goto make_normal_jump;
826 /* now we know that we can't backtrack. */
827 while (p1 != p2 - 3) {
856 case Cupdate_failure_jump:
857 case Cdummy_failure_jump:
858 goto make_normal_jump;
864 /* make_update_jump: */
866 a += 3; /* jump to after the Cfailure_jump */
867 code[0] = Cupdate_failure_jump;
870 if (num_instructions > 1)
872 // assert(num_instructions == 1);
873 /* if the only instruction matches a single character, we can do
875 p1 = code + 3 + a; /* start of sole instruction */
876 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
877 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
887 static int re_optimize(regex_t * bufp)
919 if (!re_optimize_star_jump(bufp, code)) {
923 case Cupdate_failure_jump:
925 case Cdummy_failure_jump:
936 #define NEXTCHAR(var) \
939 goto ends_prematurely; \
940 (var) = regex[pos]; \
944 #define ALLOC(amount) \
946 if (pattern_offset+(amount) > alloc) \
948 alloc += 256 + (amount); \
949 pattern = (unsigned char *)realloc(pattern, alloc); \
951 goto out_of_memory; \
955 #define STORE(ch) pattern[pattern_offset++] = (ch)
957 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
959 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
961 #define PUSH_LEVEL_STARTS \
962 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
963 starts_base += NUM_LEVELS; \
967 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
969 #define PUT_ADDR(offset,addr) \
971 int disp = (addr) - (offset) - 2; \
972 pattern[(offset)] = disp & 0xff; \
973 pattern[(offset)+1] = (disp>>8) & 0xff; \
976 #define INSERT_JUMP(pos,type,addr) \
978 int a, p = (pos), t = (type), ad = (addr); \
979 for (a = pattern_offset - 1; a >= p; a--) \
980 pattern[a + 3] = pattern[a]; \
983 pattern_offset += 3; \
986 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
990 bufp->allocated = alloc; \
991 bufp->buffer = pattern; \
992 bufp->used = pattern_offset; \
995 #define GETHEX(var) \
997 unsigned char gethex_ch, gethex_value; \
998 NEXTCHAR(gethex_ch); \
999 gethex_value = hex_char_to_decimal(gethex_ch); \
1000 if (gethex_value == 16) \
1002 NEXTCHAR(gethex_ch); \
1003 gethex_ch = hex_char_to_decimal(gethex_ch); \
1004 if (gethex_ch == 16) \
1006 (var) = gethex_value * 16 + gethex_ch; \
1009 #define ANSI_TRANSLATE(ch) \
1016 ch = 7; /* audible bell */ \
1022 ch = 8; /* backspace */ \
1028 ch = 12; /* form feed */ \
1034 ch = 10; /* line feed */ \
1040 ch = 13; /* carriage return */ \
1052 ch = 11; /* vertical tab */ \
1055 case 'x': /* hex code */ \
1063 /* other characters passed through */ \
1065 ch = translate[(unsigned char)ch]; \
1071 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1079 int pattern_offset = 0, alloc;
1080 int starts[NUM_LEVELS * MAX_NESTING];
1082 int future_jumps[MAX_NESTING];
1084 unsigned char ch = '\0';
1085 unsigned char *pattern;
1086 unsigned char *translate;
1089 int num_open_registers;
1090 int open_registers[RE_NREGS];
1091 int beginning_context;
1092 int size = strlen((char *)regex);
1094 if (!re_compile_initialized)
1095 re_compile_initialize();
1097 bufp->fastmap_accurate = 0;
1098 bufp->uses_registers = 1;
1099 bufp->num_registers = 1;
1100 translate = bufp->translate;
1101 pattern = bufp->buffer;
1102 alloc = bufp->allocated;
1103 if (alloc == 0 || pattern == NULL) {
1105 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1114 num_open_registers = 0;
1117 beginning_context = 1;
1119 /* we use Rend dummy to ensure that pending jumps are updated
1120 (due to low priority of Rend) before exiting the loop. */
1122 while (op != Rend) {
1128 ch = translate[(unsigned char)ch];
1129 op = plain_ops[(unsigned char)ch];
1132 op = quoted_ops[(unsigned char)ch];
1133 if (op == Rnormal && regexp_ansi_sequences)
1137 level = precedences[op];
1138 /* printf("ch='%c' op=%d level=%d current_level=%d
1139 curlevstart=%d\n", ch, op, level, current_level,
1140 CURRENT_LEVEL_START); */
1141 if (level > current_level) {
1142 for (current_level++; current_level < level; current_level++)
1145 } else if (level < current_level) {
1146 current_level = level;
1147 for (; num_jumps > 0 &&
1148 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1149 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1157 store_opcode_and_arg: /* opcode & ch must be set */
1171 set_error("Rquote");
1172 /*NOTREACHED*/ case Rbol:
1173 if (!beginning_context) {
1174 if (regexp_context_indep_ops)
1182 if (!((pos >= size) ||
1183 ((regexp_syntax & RE_NO_BK_VBAR) ?
1184 (regex[pos] == '\174') :
1185 (pos + 1 < size && regex[pos] == '\134' &&
1186 regex[pos + 1] == '\174')) ||
1187 ((regexp_syntax & RE_NO_BK_PARENS) ?
1188 (regex[pos] == ')') :
1189 (pos + 1 < size && regex[pos] == '\134' &&
1190 regex[pos + 1] == ')')))) {
1191 if (regexp_context_indep_ops)
1201 if (beginning_context) {
1202 if (regexp_context_indep_ops)
1207 if (CURRENT_LEVEL_START == pattern_offset)
1208 break; /* ignore empty patterns for ? */
1210 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1214 if (beginning_context) {
1215 if (regexp_context_indep_ops)
1220 if (CURRENT_LEVEL_START == pattern_offset)
1221 break; /* ignore empty patterns for + and * */
1223 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1224 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1225 if (op == Rplus) /* jump over initial failure_jump */
1226 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1227 CURRENT_LEVEL_START + 6);
1231 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1232 if (num_jumps >= MAX_NESTING)
1235 future_jumps[num_jumps++] = pattern_offset;
1243 if (next_register < RE_NREGS) {
1244 bufp->uses_registers = 1;
1246 STORE(Cstart_memory);
1247 STORE(next_register);
1248 open_registers[num_open_registers++] = next_register;
1249 bufp->num_registers++;
1259 if (paren_depth <= 0)
1260 goto parenthesis_error;
1262 current_level = precedences[Ropenpar];
1264 if (paren_depth < num_open_registers) {
1265 bufp->uses_registers = 1;
1268 num_open_registers--;
1269 STORE(open_registers[num_open_registers]);
1274 goto bad_match_register;
1275 if (!(ch >= '0' && ch <= '9')) {
1276 goto bad_match_register;
1278 bufp->uses_registers = 1;
1279 opcode = Cmatch_memory;
1281 goto store_opcode_and_arg;
1282 case Rextended_memory:
1284 if (ch < '0' || ch > '9')
1285 goto bad_match_register;
1287 if (a < '0' || a > '9')
1288 goto bad_match_register;
1289 ch = 10 * (a - '0') + ch - '0';
1290 if (ch == 0 || ch >= RE_NREGS)
1291 goto bad_match_register;
1292 bufp->uses_registers = 1;
1293 opcode = Cmatch_memory;
1294 goto store_opcode_and_arg;
1306 offset = pattern_offset;
1307 for (a = 0; a < 256 / 8; a++)
1311 ch = translate[(unsigned char)ch];
1316 ch = translate[(unsigned char)ch];
1322 while (ch != '\135' || firstchar) {
1324 if (regexp_ansi_sequences && ch == '\134') {
1329 for (a = prev; a <= (int)ch; a++)
1330 SETBIT(pattern, offset, a);
1333 } else if (prev != -1 && ch == '-')
1336 SETBIT(pattern, offset, ch);
1341 ch = translate[(unsigned char)ch];
1344 SETBIT(pattern, offset, '-');
1346 for (a = 0; a < 256 / 8; a++)
1347 pattern[offset + a] ^= 0xff;
1363 opcode = Csyntaxspec;
1365 goto store_opcode_and_arg;
1369 opcode = Cnotsyntaxspec;
1371 goto store_opcode_and_arg;
1385 opcode = Cwordbound;
1390 opcode = Cnotwordbound;
1398 beginning_context = (op == Ropenpar || op == Ror);
1400 if (starts_base != 0)
1401 goto parenthesis_error;
1402 // assert(num_jumps == 0);
1406 if (!re_optimize(bufp))
1407 return "Optimization error";
1412 return "Badly placed special character";
1416 return "Bad match register number";
1420 return "Bad hexadecimal number";
1424 return "Badly placed parenthesis";
1428 return "Out of memory";
1432 return "Regular expression ends prematurely";
1436 return "Regular expression too complex";
1444 #undef CURRENT_LEVEL_START
1445 #undef SET_LEVEL_START
1446 #undef PUSH_LEVEL_STARTS
1447 #undef POP_LEVEL_STARTS
1453 #define PREFETCH if (text == textend) goto fail
1455 #define NEXTCHAR(var) \
1457 var = (unsigned char)*text++; \
1459 var = translate[var]
1462 int regcomp(regex_t * bufp, const char *regex, int cflags)
1464 memset(bufp, 0, sizeof(regex_t));
1465 bufp->cflags = cflags;
1466 if (bufp->cflags & REG_ICASE) {
1467 char *p, *lcase = bstrdup(regex);
1468 for( p = lcase; *p ; p++) {
1471 re_compile_pattern(bufp, (unsigned char *)lcase);
1474 re_compile_pattern(bufp, (unsigned char *)regex);
1482 void re_registers_to_regmatch(regexp_registers_t old_regs,
1483 regmatch_t pmatch[],
1488 /* We have to set the last entry to -1 */
1489 nmatch = nmatch - 1;
1490 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1491 pmatch[i].rm_so = old_regs->start[i];
1492 pmatch[i].rm_eo = old_regs->end[i];
1495 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1498 int regexec(regex_t * preg, const char *string, size_t nmatch,
1499 regmatch_t pmatch[], int eflags)
1502 int len = strlen(string);
1503 struct re_registers regs;
1504 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1506 re_registers_to_regmatch(®s, pmatch, nmatch);
1508 /* stat is the start position in the string base 0 where
1509 * the pattern was found or negative if not found.
1511 return stat < 0 ? -1 : 0;
1514 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1516 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1520 void regfree(regex_t * preg)
1523 free_pool_memory(preg->lcase);
1528 preg->buffer = NULL;
1532 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1533 regexp_registers_t old_regs)
1535 unsigned char *code;
1536 unsigned char *translate;
1537 unsigned char *text;
1538 unsigned char *textstart;
1539 unsigned char *textend;
1545 unsigned char *regstart;
1546 unsigned char *regend;
1550 // assert(pos >= 0 && size >= 0);
1551 // assert(pos <= size);
1553 text = string + pos;
1555 textend = string + size;
1557 code = bufp->buffer;
1559 translate = bufp->translate;
1561 NEW_STATE(state, bufp->num_registers);
1567 match_end = text - textstart;
1569 old_regs->start[0] = pos;
1570 old_regs->end[0] = match_end;
1571 if (!bufp->uses_registers) {
1572 for (a = 1; a < RE_NREGS; a++) {
1573 old_regs->start[a] = -1;
1574 old_regs->end[a] = -1;
1577 for (a = 1; a < bufp->num_registers; a++) {
1578 if ((GET_REG_START(state, a) == NULL) ||
1579 (GET_REG_END(state, a) == NULL)) {
1580 old_regs->start[a] = -1;
1581 old_regs->end[a] = -1;
1584 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1585 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1587 for (; a < RE_NREGS; a++) {
1588 old_regs->start[a] = -1;
1589 old_regs->end[a] = -1;
1594 return match_end - pos;
1598 if (text == textstart || text[-1] == '\n')
1599 goto continue_matching;
1604 if (text == textend || *text == '\n')
1605 goto continue_matching;
1611 if (code[ch / 8] & (1 << (ch & 7))) {
1613 goto continue_matching;
1620 if (ch != (unsigned char)*code++)
1622 goto continue_matching;
1629 goto continue_matching;
1634 SET_REG_START(state, reg, text, goto error);
1635 goto continue_matching;
1640 SET_REG_END(state, reg, text, goto error);
1641 goto continue_matching;
1646 regstart = GET_REG_START(state, reg);
1647 regend = GET_REG_END(state, reg);
1648 if ((regstart == NULL) || (regend == NULL))
1649 goto fail; /* or should we just match nothing? */
1650 regsize = regend - regstart;
1652 if (regsize > (textend - text))
1655 for (; regstart < regend; regstart++, text++)
1656 if (translate[*regstart] != translate[*text])
1659 for (; regstart < regend; regstart++, text++)
1660 if (*regstart != *text)
1662 goto continue_matching;
1664 case Cupdate_failure_jump:
1666 UPDATE_FAILURE(state, text, goto error);
1667 /* fall to next case */
1669 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1673 a = (unsigned char)*code++;
1674 a |= (unsigned char)*code++ << 8;
1675 code += (int)SHORT(a);
1676 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1677 set_error("Regex VM jump out of bounds (Cjump)");
1681 goto continue_matching;
1683 case Cdummy_failure_jump:
1685 unsigned char *failuredest;
1687 a = (unsigned char)*code++;
1688 a |= (unsigned char)*code++ << 8;
1690 // assert(*code == Cfailure_jump);
1691 b = (unsigned char)code[1];
1692 b |= (unsigned char)code[2] << 8;
1693 failuredest = code + (int)SHORT(b) + 3;
1694 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1696 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1700 PUSH_FAILURE(state, failuredest, NULL, goto error);
1702 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1703 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1707 goto continue_matching;
1711 a = (unsigned char)*code++;
1712 a |= (unsigned char)*code++ << 8;
1714 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1715 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1719 PUSH_FAILURE(state, code + a, text, goto error);
1720 goto continue_matching;
1724 unsigned char *pinst;
1725 a = (unsigned char)*code++;
1726 a |= (unsigned char)*code++ << 8;
1729 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1730 set_error("Regex VM jump out of bounds (Crepeat1)");
1734 /* pinst is sole instruction in loop, and it matches a
1735 * single character. Since Crepeat1 was originally a
1736 * Cupdate_failure_jump, we also know that backtracking
1737 * is useless: so long as the single-character
1738 * expression matches, it must be used. Also, in the
1739 * case of +, we've already matched one character, so +
1740 * can't fail: nothing here can cause a failure. */
1745 while (text < textend) {
1746 ch = translate[(unsigned char)*text];
1747 if (pinst[ch / 8] & (1 << (ch & 7)))
1753 while (text < textend) {
1754 ch = (unsigned char)*text;
1755 if (pinst[ch / 8] & (1 << (ch & 7)))
1765 ch = (unsigned char)*pinst;
1767 while (text < textend && translate[(unsigned char)*text] == ch)
1770 while (text < textend && (unsigned char)*text == ch)
1777 while (text < textend && (unsigned char)*text != '\n')
1783 a = (unsigned char)*pinst;
1785 while (text < textend && (SYNTAX(translate[*text]) & a))
1788 while (text < textend && (SYNTAX(*text) & a))
1793 case Cnotsyntaxspec:
1795 a = (unsigned char)*pinst;
1797 while (text < textend && !(SYNTAX(translate[*text]) & a))
1800 while (text < textend && !(SYNTAX(*text) & a))
1808 set_error("Unknown regex opcode: memory corrupted?");
1812 /* due to the funky way + and * are compiled, the top
1813 * failure- stack entry at this point is actually a
1814 * success entry -- update it & pop it */
1815 UPDATE_FAILURE(state, text, goto error);
1816 goto fail; /* i.e., succeed <wink/sigh> */
1820 if (text == textstart)
1821 goto continue_matching;
1826 if (text == textend)
1827 goto continue_matching;
1832 if (text == textend)
1834 if (!(SYNTAX(*text) & Sword))
1836 if (text == textstart)
1837 goto continue_matching;
1838 if (!(SYNTAX(text[-1]) & Sword))
1839 goto continue_matching;
1844 if (text == textstart)
1846 if (!(SYNTAX(text[-1]) & Sword))
1848 if (text == textend)
1849 goto continue_matching;
1850 if (!(SYNTAX(*text) & Sword))
1851 goto continue_matching;
1856 /* Note: as in gnu regexp, this also matches at the
1857 * beginning and end of buffer. */
1859 if (text == textstart || text == textend)
1860 goto continue_matching;
1861 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1862 goto continue_matching;
1867 /* Note: as in gnu regexp, this never matches at the
1868 * beginning and end of buffer. */
1869 if (text == textstart || text == textend)
1871 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1872 goto continue_matching;
1878 if (!(SYNTAX(ch) & (unsigned char)*code++))
1880 goto continue_matching;
1882 case Cnotsyntaxspec:
1885 if (SYNTAX(ch) & (unsigned char)*code++)
1887 goto continue_matching;
1892 set_error("Unknown regex opcode: memory corrupted?");
1899 #if 0 /* This line is never reached --Guido */
1906 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1908 POP_FAILURE(state, code, text, goto done_matching, goto error);
1909 goto continue_matching;
1912 /* if(translated != NULL) */
1913 /* free(translated); */
1918 /* if (translated != NULL) */
1919 /* free(translated); */
1928 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1929 int range, regexp_registers_t regs)
1931 unsigned char *fastmap;
1932 unsigned char *translate;
1933 unsigned char *text;
1934 unsigned char *partstart;
1935 unsigned char *partend;
1938 unsigned char anchor;
1939 unsigned char *string = str;
1941 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1942 int len = strlen((const char *)str);
1944 bufp->lcase = get_pool_memory(PM_FNAME);
1946 check_pool_memory_size(bufp->lcase, len+1);
1947 unsigned char *dst = (unsigned char *)bufp->lcase;
1949 *dst++ = tolower(*string++);
1952 string = (unsigned char *)bufp->lcase;
1955 // assert(size >= 0 && pos >= 0);
1956 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1958 fastmap = bufp->fastmap;
1959 translate = bufp->translate;
1960 if (fastmap && !bufp->fastmap_accurate) {
1961 re_compile_fastmap(bufp);
1966 anchor = bufp->anchor;
1967 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1983 for (; range >= 0; range--, pos += dir) {
1985 if (dir == 1) { /* searching forwards */
1987 text = string + pos;
1988 partend = string + size;
1991 while (text != partend &&
1992 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1995 while (text != partend && !fastmap[(unsigned char)*text])
1997 pos += text - partstart;
1998 range -= text - partstart;
1999 if (pos == size && bufp->can_be_null == 0)
2001 } else { /* searching backwards */
2002 text = string + pos;
2003 partstart = string + pos - range;
2006 while (text != partstart && !fastmap[(unsigned char)
2007 translate[(unsigned char)*text]])
2010 while (text != partstart && !fastmap[(unsigned char)*text])
2012 pos -= partend - text;
2013 range -= partend - text;
2016 if (anchor == 1) { /* anchored to begline */
2017 if (pos > 0 && (string[pos - 1] != '\n'))
2020 // assert(pos >= 0 && pos <= size);
2021 ret = re_match(bufp, string, size, pos, regs);
2033 ** c-file-style: "python"