3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
31 * This file modified to work with Bacula and C++ by
32 * Kern Sibbald, April 2006
34 * This file modified to work with REG_ICASE and Bacula by
35 * Eric Bollengier April 2007
38 Bacula® - The Network Backup Solution
40 Copyright (C) 2006-2006 Free Software Foundation Europe e.V.
42 The main author of Bacula is Kern Sibbald, with contributions from
43 many others, a complete list can be found in the file AUTHORS.
44 This program is Free Software; you can redistribute it and/or
45 modify it under the terms of version two of the GNU General Public
46 License as published by the Free Software Foundation and included
49 This program is distributed in the hope that it will be useful, but
50 WITHOUT ANY WARRANTY; without even the implied warranty of
51 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 General Public License for more details.
54 You should have received a copy of the GNU General Public License
55 along with this program; if not, write to the Free Software
56 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
59 Bacula® is a registered trademark of John Walker.
60 The licensor of Bacula is the Free Software Foundation Europe
61 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
62 Switzerland, email:ftf@fsfeurope.org.
69 #define set_error(x) bufp->errmsg=((char *)(x))
70 #define got_error bufp->errmsg!=NULL
72 /* The original code blithely assumed that sizeof(short) == 2. Not
73 * always true. Original instances of "(short)x" were replaced by
74 * SHORT(x), where SHORT is #defined below. */
76 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
78 /* The stack implementation is taken from an idea by Andrew Kuchling.
79 * It's a doubly linked list of arrays. The advantages of this over a
80 * simple linked list are that the number of mallocs required are
81 * reduced. It also makes it possible to statically allocate enough
82 * space so that small patterns don't ever need to call malloc.
84 * The advantages over a single array is that is periodically
85 * realloced when more space is needed is that we avoid ever copying
88 /* item_t is the basic stack element. Defined as a union of
89 * structures so that both registers, failure points, and counters can
90 * be pushed/popped from the stack. There's nothing built into the
91 * item to keep track of whether a certain stack item is a register, a
92 * failure point, or a counter. */
94 typedef union item_t {
115 #define STACK_PAGE_SIZE 256
116 #define NUM_REGISTERS 256
118 /* A 'page' of stack items. */
120 typedef struct item_page_t {
121 item_t items[STACK_PAGE_SIZE];
122 struct item_page_t *prev;
123 struct item_page_t *next;
127 typedef struct match_state {
128 /* The number of registers that have been pushed onto the stack
129 * since the last failure point. */
133 /* Used to control when registers need to be pushed onto the
138 /* The number of failure points on the stack. */
142 /* Storage for the registers. Each register consists of two
143 * pointers to characters. So register N is represented as
144 * start[N] and end[N]. The pointers must be converted to
145 * offsets from the beginning of the string before returning the
146 * registers to the calling program. */
148 unsigned char *start[NUM_REGISTERS];
149 unsigned char *end[NUM_REGISTERS];
151 /* Keeps track of whether a register has changed recently. */
153 int changed[NUM_REGISTERS];
155 /* Structure to encapsulate the stack. */
157 /* index into the current page. If index == 0 and you need
158 * to pop an item, move to the previous page and set index
159 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
160 * push a page. If index == STACK_PAGE_SIZE and you need
161 * to push a page move to the next page and set index =
162 * 0. If there is no new next page, allocate a new page
163 * and link it in. Otherwise, increment index to push a
167 item_page_t *current; /* Pointer to the current page. */
168 item_page_t first; /* First page is statically allocated. */
172 /* Initialize a state object */
174 /* #define NEW_STATE(state) \ */
175 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
176 /* state.stack.current = &state.stack.first; \ */
177 /* state.stack.first.prev = NULL; \ */
178 /* state.stack.first.next = NULL; \ */
179 /* state.stack.index = 0; \ */
180 /* state.level = 1 */
182 #define NEW_STATE(state, nregs) \
185 for (i = 0; i < nregs; i++) \
187 state.start[i] = NULL; \
188 state.end[i] = NULL; \
189 state.changed[i] = 0; \
191 state.stack.current = &state.stack.first; \
192 state.stack.first.prev = NULL; \
193 state.stack.first.next = NULL; \
194 state.stack.index = 0; \
201 /* Free any memory that might have been malloc'd */
203 #define FREE_STATE(state) \
204 while(state.stack.first.next != NULL) \
206 state.stack.current = state.stack.first.next; \
207 state.stack.first.next = state.stack.current->next; \
208 free(state.stack.current); \
211 /* Discard the top 'count' stack items. */
213 #define STACK_DISCARD(stack, count, on_error) \
214 stack.index -= count; \
215 while (stack.index < 0) \
217 if (stack.current->prev == NULL) \
219 stack.current = stack.current->prev; \
220 stack.index += STACK_PAGE_SIZE; \
223 /* Store a pointer to the previous item on the stack. Used to pop an
224 * item off of the stack. */
226 #define STACK_PREV(stack, top, on_error) \
227 if (stack.index == 0) \
229 if (stack.current->prev == NULL) \
231 stack.current = stack.current->prev; \
232 stack.index = STACK_PAGE_SIZE - 1; \
238 top = &(stack.current->items[stack.index])
240 /* Store a pointer to the next item on the stack. Used to push an item
241 * on to the stack. */
243 #define STACK_NEXT(stack, top, on_error) \
244 if (stack.index == STACK_PAGE_SIZE) \
246 if (stack.current->next == NULL) \
248 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
249 if (stack.current->next == NULL) \
251 stack.current->next->prev = stack.current; \
252 stack.current->next->next = NULL; \
254 stack.current = stack.current->next; \
257 top = &(stack.current->items[stack.index++])
259 /* Store a pointer to the item that is 'count' items back in the
260 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
261 * STACK_TOP(stack, top, on_error). */
263 #define STACK_BACK(stack, top, count, on_error) \
266 item_page_t *current; \
267 current = stack.current; \
268 index = stack.index - (count); \
271 if (current->prev == NULL) \
273 current = current->prev; \
274 index += STACK_PAGE_SIZE; \
276 top = &(current->items[index]); \
279 /* Store a pointer to the top item on the stack. Execute the
280 * 'on_error' code if there are no items on the stack. */
282 #define STACK_TOP(stack, top, on_error) \
283 if (stack.index == 0) \
285 if (stack.current->prev == NULL) \
287 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
291 top = &(stack.current->items[stack.index - 1]); \
294 /* Test to see if the stack is empty */
296 #define STACK_EMPTY(stack) ((stack.index == 0) && \
297 (stack.current->prev == NULL))
299 /* Return the start of register 'reg' */
301 #define GET_REG_START(state, reg) (state.start[reg])
303 /* Return the end of register 'reg' */
305 #define GET_REG_END(state, reg) (state.end[reg])
307 /* Set the start of register 'reg'. If the state of the register needs
308 * saving, push it on the stack. */
310 #define SET_REG_START(state, reg, text, on_error) \
311 if(state.changed[reg] < state.level) \
314 STACK_NEXT(state.stack, item, on_error); \
315 item->reg.num = reg; \
316 item->reg.start = state.start[reg]; \
317 item->reg.end = state.end[reg]; \
318 item->reg.level = state.changed[reg]; \
319 state.changed[reg] = state.level; \
322 state.start[reg] = text
324 /* Set the end of register 'reg'. If the state of the register needs
325 * saving, push it on the stack. */
327 #define SET_REG_END(state, reg, text, on_error) \
328 if(state.changed[reg] < state.level) \
331 STACK_NEXT(state.stack, item, on_error); \
332 item->reg.num = reg; \
333 item->reg.start = state.start[reg]; \
334 item->reg.end = state.end[reg]; \
335 item->reg.level = state.changed[reg]; \
336 state.changed[reg] = state.level; \
339 state.end[reg] = text
341 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
344 STACK_NEXT(state.stack, item, on_error); \
345 item->fail.code = xcode; \
346 item->fail.text = xtext; \
347 item->fail.count = state.count; \
348 item->fail.level = state.level; \
349 item->fail.phantom = 0; \
355 /* Update the last failure point with a new position in the text. */
357 #define UPDATE_FAILURE(state, xtext, on_error) \
360 STACK_BACK(state.stack, item, state.count + 1, on_error); \
361 if (!item->fail.phantom) \
364 STACK_NEXT(state.stack, item2, on_error); \
365 item2->fail.code = item->fail.code; \
366 item2->fail.text = xtext; \
367 item2->fail.count = state.count; \
368 item2->fail.level = state.level; \
369 item2->fail.phantom = 1; \
376 STACK_DISCARD(state.stack, state.count, on_error); \
377 STACK_TOP(state.stack, item, on_error); \
378 item->fail.text = xtext; \
384 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
389 while(state.count > 0) \
391 STACK_PREV(state.stack, item, on_error); \
392 state.start[item->reg.num] = item->reg.start; \
393 state.end[item->reg.num] = item->reg.end; \
394 state.changed[item->reg.num] = item->reg.level; \
397 STACK_PREV(state.stack, item, on_empty); \
398 xcode = item->fail.code; \
399 xtext = item->fail.text; \
400 state.count = item->fail.count; \
401 state.level = item->fail.level; \
404 while (item->fail.text == NULL); \
407 enum regexp_compiled_ops { /* opcodes for compiled regexp */
408 Cend, /* end of pattern reached */
409 Cbol, /* beginning of line */
410 Ceol, /* end of line */
411 Cset, /* character set. Followed by 32 bytes of set. */
412 Cexact, /* followed by a byte to match */
413 Canychar, /* matches any character except newline */
414 Cstart_memory, /* set register start addr (followed by reg number) */
415 Cend_memory, /* set register end addr (followed by reg number) */
416 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
417 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
418 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
419 Cfailure_jump, /* jump to addr on failure */
420 Cupdate_failure_jump, /* update topmost failure point and jump */
421 Cdummy_failure_jump, /* push a dummy failure point and jump */
422 Cbegbuf, /* match at beginning of buffer */
423 Cendbuf, /* match at end of buffer */
424 Cwordbeg, /* match at beginning of word */
425 Cwordend, /* match at end of word */
426 Cwordbound, /* match if at word boundary */
427 Cnotwordbound, /* match if not at word boundary */
428 Csyntaxspec, /* matches syntax code (1 byte follows) */
429 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
433 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
434 Rend, /* special code for end of regexp */
435 Rnormal, /* normal character */
436 Ranychar, /* any character except newline */
437 Rquote, /* the quote character */
438 Rbol, /* match beginning of line */
439 Reol, /* match end of line */
440 Roptional, /* match preceding expression optionally */
441 Rstar, /* match preceding expr zero or more times */
442 Rplus, /* match preceding expr one or more times */
443 Ror, /* match either of alternatives */
444 Ropenpar, /* opening parenthesis */
445 Rclosepar, /* closing parenthesis */
446 Rmemory, /* match memory register */
447 Rextended_memory, /* \vnn to match registers 10-99 */
448 Ropenset, /* open set. Internal syntax hard-coded below. */
449 /* the following are gnu extensions to "normal" regexp syntax */
450 Rbegbuf, /* beginning of buffer */
451 Rendbuf, /* end of buffer */
452 Rwordchar, /* word character */
453 Rnotwordchar, /* not word character */
454 Rwordbeg, /* beginning of word */
455 Rwordend, /* end of word */
456 Rwordbound, /* word bound */
457 Rnotwordbound, /* not word bound */
461 static int re_compile_initialized = 0;
462 static int regexp_syntax = RE_SYNTAX_EGREP;
463 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
464 static unsigned char plain_ops[256];
465 static unsigned char quoted_ops[256];
466 static unsigned char precedences[Rnum_ops];
467 static int regexp_context_indep_ops;
468 static int regexp_ansi_sequences;
470 #define NUM_LEVELS 5 /* number of precedence levels in use */
471 #define MAX_NESTING 100 /* max nesting level of operators */
473 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
475 unsigned char re_syntax_table[256];
477 void re_compile_initialize(void)
481 static int syntax_table_inited = 0;
483 if (!syntax_table_inited) {
484 syntax_table_inited = 1;
485 memset(re_syntax_table, 0, 256);
486 for (a = 'a'; a <= 'z'; a++)
487 re_syntax_table[a] = Sword;
488 for (a = 'A'; a <= 'Z'; a++)
489 re_syntax_table[a] = Sword;
490 for (a = '0'; a <= '9'; a++)
491 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
492 for (a = '0'; a <= '7'; a++)
493 re_syntax_table[a] |= Soctaldigit;
494 for (a = 'A'; a <= 'F'; a++)
495 re_syntax_table[a] |= Shexdigit;
496 for (a = 'a'; a <= 'f'; a++)
497 re_syntax_table[a] |= Shexdigit;
498 re_syntax_table[(int)'_'] = Sword;
499 for (a = 9; a <= 13; a++)
500 re_syntax_table[a] = Swhitespace;
501 re_syntax_table[(int)' '] = Swhitespace;
503 re_compile_initialized = 1;
504 for (a = 0; a < 256; a++) {
505 plain_ops[a] = Rnormal;
506 quoted_ops[a] = Rnormal;
508 for (a = '0'; a <= '9'; a++)
509 quoted_ops[a] = Rmemory;
510 plain_ops[(int)'\134'] = Rquote;
511 if (regexp_syntax & RE_NO_BK_PARENS) {
512 plain_ops[(int)'('] = Ropenpar;
513 plain_ops[(int)')'] = Rclosepar;
515 quoted_ops[(int)'('] = Ropenpar;
516 quoted_ops[(int)')'] = Rclosepar;
518 if (regexp_syntax & RE_NO_BK_VBAR) {
519 plain_ops[(int)'\174'] = Ror;
521 quoted_ops[(int)'\174'] = Ror;
523 plain_ops[(int)'*'] = Rstar;
524 if (regexp_syntax & RE_BK_PLUS_QM) {
525 quoted_ops[(int)'+'] = Rplus;
526 quoted_ops[(int)'?'] = Roptional;
528 plain_ops[(int)'+'] = Rplus;
529 plain_ops[(int)'?'] = Roptional;
531 if (regexp_syntax & RE_NEWLINE_OR) {
532 plain_ops[(int)'\n'] = Ror;
534 plain_ops[(int)'\133'] = Ropenset;
535 plain_ops[(int)'\136'] = Rbol;
536 plain_ops[(int)'$'] = Reol;
537 plain_ops[(int)'.'] = Ranychar;
538 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
539 quoted_ops[(int)'w'] = Rwordchar;
540 quoted_ops[(int)'W'] = Rnotwordchar;
541 quoted_ops[(int)'<'] = Rwordbeg;
542 quoted_ops[(int)'>'] = Rwordend;
543 quoted_ops[(int)'b'] = Rwordbound;
544 quoted_ops[(int)'B'] = Rnotwordbound;
545 quoted_ops[(int)'`'] = Rbegbuf;
546 quoted_ops[(int)'\''] = Rendbuf;
548 if (regexp_syntax & RE_ANSI_HEX) {
549 quoted_ops[(int)'v'] = Rextended_memory;
551 for (a = 0; a < Rnum_ops; a++) {
554 if (regexp_syntax & RE_TIGHT_VBAR) {
555 precedences[Ror] = 3;
556 precedences[Rbol] = 2;
557 precedences[Reol] = 2;
559 precedences[Ror] = 2;
560 precedences[Rbol] = 3;
561 precedences[Reol] = 3;
563 precedences[Rclosepar] = 1;
564 precedences[Rend] = 0;
565 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
566 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
569 int re_set_syntax(int syntax) {
573 regexp_syntax = syntax;
574 re_syntax = syntax; /* Exported copy */
575 re_compile_initialize();
579 static int hex_char_to_decimal(int ch) {
580 if (ch >= '0' && ch <= '9')
582 if (ch >= 'a' && ch <= 'f')
583 return ch - 'a' + 10;
584 if (ch >= 'A' && ch <= 'F')
585 return ch - 'A' + 10;
589 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
590 unsigned char *visited,
591 unsigned char *can_be_null,
592 unsigned char *fastmap)
599 return; /* we have already been here */
602 switch (code[pos++]) {
613 for (a = 0; a < 256; a++)
617 syntaxcode = code[pos++];
618 for (a = 0; a < 256; a++)
619 if (SYNTAX(a) & syntaxcode)
623 syntaxcode = code[pos++];
624 for (a = 0; a < 256; a++)
625 if (!(SYNTAX(a) & syntaxcode))
629 fastmap[(int)'\n'] = 1;
630 if (*can_be_null == 0)
631 *can_be_null = 2; /* can match null, but only at end of buffer */
634 for (a = 0; a < 256 / 8; a++)
635 if (code[pos + a] != 0)
636 for (b = 0; b < 8; b++)
637 if (code[pos + a] & (1 << b))
638 fastmap[(a << 3) + b] = 1;
642 fastmap[(unsigned char)code[pos]] = 1;
645 for (a = 0; a < 256; a++)
654 for (a = 0; a < 256; a++)
659 case Cdummy_failure_jump:
660 case Cupdate_failure_jump:
662 a = (unsigned char)code[pos++];
663 a |= (unsigned char)code[pos++] << 8;
664 pos += (int)SHORT(a);
666 /* argh... the regexp contains empty loops. This is not
667 good, as this may cause a failure stack overflow when
668 matching. Oh well. */
669 /* this path leads nowhere; pursue other paths. */
675 a = (unsigned char)code[pos++];
676 a |= (unsigned char)code[pos++] << 8;
677 a = pos + (int)SHORT(a);
678 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
684 set_error("Unknown regex opcode: memory corrupted?");
690 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
691 int pos, unsigned char *can_be_null,
692 unsigned char *fastmap)
694 unsigned char small_visited[512], *visited;
696 if (used <= (int)sizeof(small_visited))
697 visited = small_visited;
699 visited = (unsigned char *)malloc(used);
704 memset(fastmap, 0, 256);
705 memset(visited, 0, used);
706 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
707 if (visited != small_visited)
712 void re_compile_fastmap(regex_t * bufp)
714 if (!bufp->fastmap || bufp->fastmap_accurate)
716 // assert(bufp->used > 0);
717 if (!re_do_compile_fastmap(bufp, bufp->buffer,
718 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
722 if (bufp->buffer[0] == Cbol)
723 bufp->anchor = 1; /* begline */
724 else if (bufp->buffer[0] == Cbegbuf)
725 bufp->anchor = 2; /* begbuf */
727 bufp->anchor = 0; /* none */
728 bufp->fastmap_accurate = 1;
734 * ... code for operand of star
736 * 2: ... code after star
738 * We change the star_jump to update_failure_jump if we can determine
739 * that it is safe to do so; otherwise we change it to an ordinary
746 * 2: ... code for operand of plus
748 * 3: ... code after plus
750 * For star_jump considerations this is processed identically to star.
754 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
756 unsigned char map[256];
757 unsigned char can_be_null;
763 int num_instructions = 0;
765 a = (unsigned char)*code++;
766 a |= (unsigned char)*code++ << 8;
769 p1 = code + a + 3; /* skip the failure_jump */
770 /* Check that the jump is within the pattern */
771 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
772 set_error("Regex VM jump out of bounds (failure_jump opt)");
775 // assert(p1[-3] == Cfailure_jump);
777 /* p1 points inside loop, p2 points to after loop */
778 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
779 (int)(p2 - bufp->buffer), &can_be_null, map))
780 goto make_normal_jump;
782 /* If we might introduce a new update point inside the
783 * loop, we can't optimize because then update_jump would
784 * update a wrong failure point. Thus we have to be
785 * quite careful here.
788 /* loop until we find something that consumes a character */
806 ch = (unsigned char)*p1++;
808 goto make_normal_jump;
811 for (b = 0; b < 256; b++)
812 if (b != '\n' && map[b])
813 goto make_normal_jump;
816 for (b = 0; b < 256; b++)
817 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
818 goto make_normal_jump;
822 goto make_normal_jump;
826 /* now we know that we can't backtrack. */
827 while (p1 != p2 - 3) {
856 case Cupdate_failure_jump:
857 case Cdummy_failure_jump:
858 goto make_normal_jump;
864 /* make_update_jump: */
866 a += 3; /* jump to after the Cfailure_jump */
867 code[0] = Cupdate_failure_jump;
870 if (num_instructions > 1)
872 // assert(num_instructions == 1);
873 /* if the only instruction matches a single character, we can do
875 p1 = code + 3 + a; /* start of sole instruction */
876 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
877 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
887 static int re_optimize(regex_t * bufp)
919 if (!re_optimize_star_jump(bufp, code)) {
923 case Cupdate_failure_jump:
925 case Cdummy_failure_jump:
936 #define NEXTCHAR(var) \
939 goto ends_prematurely; \
940 (var) = regex[pos]; \
944 #define ALLOC(amount) \
946 if (pattern_offset+(amount) > alloc) \
948 alloc += 256 + (amount); \
949 pattern = (unsigned char *)realloc(pattern, alloc); \
951 goto out_of_memory; \
955 #define STORE(ch) pattern[pattern_offset++] = (ch)
957 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
959 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
961 #define PUSH_LEVEL_STARTS \
962 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
963 starts_base += NUM_LEVELS; \
967 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
969 #define PUT_ADDR(offset,addr) \
971 int disp = (addr) - (offset) - 2; \
972 pattern[(offset)] = disp & 0xff; \
973 pattern[(offset)+1] = (disp>>8) & 0xff; \
976 #define INSERT_JUMP(pos,type,addr) \
978 int a, p = (pos), t = (type), ad = (addr); \
979 for (a = pattern_offset - 1; a >= p; a--) \
980 pattern[a + 3] = pattern[a]; \
983 pattern_offset += 3; \
986 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
990 bufp->allocated = alloc; \
991 bufp->buffer = pattern; \
992 bufp->used = pattern_offset; \
995 #define GETHEX(var) \
997 unsigned char gethex_ch, gethex_value; \
998 NEXTCHAR(gethex_ch); \
999 gethex_value = hex_char_to_decimal(gethex_ch); \
1000 if (gethex_value == 16) \
1002 NEXTCHAR(gethex_ch); \
1003 gethex_ch = hex_char_to_decimal(gethex_ch); \
1004 if (gethex_ch == 16) \
1006 (var) = gethex_value * 16 + gethex_ch; \
1009 #define ANSI_TRANSLATE(ch) \
1016 ch = 7; /* audible bell */ \
1022 ch = 8; /* backspace */ \
1028 ch = 12; /* form feed */ \
1034 ch = 10; /* line feed */ \
1040 ch = 13; /* carriage return */ \
1052 ch = 11; /* vertical tab */ \
1055 case 'x': /* hex code */ \
1063 /* other characters passed through */ \
1065 ch = translate[(unsigned char)ch]; \
1071 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1079 int pattern_offset = 0, alloc;
1080 int starts[NUM_LEVELS * MAX_NESTING];
1082 int future_jumps[MAX_NESTING];
1084 unsigned char ch = '\0';
1085 unsigned char *pattern;
1086 unsigned char *translate;
1089 int num_open_registers;
1090 int open_registers[RE_NREGS];
1091 int beginning_context;
1092 int size = strlen((char *)regex);
1094 if (!re_compile_initialized)
1095 re_compile_initialize();
1097 bufp->fastmap_accurate = 0;
1098 bufp->uses_registers = 1;
1099 bufp->num_registers = 1;
1100 translate = bufp->translate;
1101 pattern = bufp->buffer;
1102 alloc = bufp->allocated;
1103 if (alloc == 0 || pattern == NULL) {
1105 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1114 num_open_registers = 0;
1117 beginning_context = 1;
1119 /* we use Rend dummy to ensure that pending jumps are updated
1120 (due to low priority of Rend) before exiting the loop. */
1122 while (op != Rend) {
1128 ch = translate[(unsigned char)ch];
1129 op = plain_ops[(unsigned char)ch];
1132 op = quoted_ops[(unsigned char)ch];
1133 if (op == Rnormal && regexp_ansi_sequences)
1137 level = precedences[op];
1138 /* printf("ch='%c' op=%d level=%d current_level=%d
1139 curlevstart=%d\n", ch, op, level, current_level,
1140 CURRENT_LEVEL_START); */
1141 if (level > current_level) {
1142 for (current_level++; current_level < level; current_level++)
1145 } else if (level < current_level) {
1146 current_level = level;
1147 for (; num_jumps > 0 &&
1148 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1149 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1157 store_opcode_and_arg: /* opcode & ch must be set */
1171 set_error("Rquote");
1172 /*NOTREACHED*/ case Rbol:
1173 if (!beginning_context) {
1174 if (regexp_context_indep_ops)
1182 if (!((pos >= size) ||
1183 ((regexp_syntax & RE_NO_BK_VBAR) ?
1184 (regex[pos] == '\174') :
1185 (pos + 1 < size && regex[pos] == '\134' &&
1186 regex[pos + 1] == '\174')) ||
1187 ((regexp_syntax & RE_NO_BK_PARENS) ?
1188 (regex[pos] == ')') :
1189 (pos + 1 < size && regex[pos] == '\134' &&
1190 regex[pos + 1] == ')')))) {
1191 if (regexp_context_indep_ops)
1201 if (beginning_context) {
1202 if (regexp_context_indep_ops)
1207 if (CURRENT_LEVEL_START == pattern_offset)
1208 break; /* ignore empty patterns for ? */
1210 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1214 if (beginning_context) {
1215 if (regexp_context_indep_ops)
1220 if (CURRENT_LEVEL_START == pattern_offset)
1221 break; /* ignore empty patterns for + and * */
1223 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1224 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1225 if (op == Rplus) /* jump over initial failure_jump */
1226 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1227 CURRENT_LEVEL_START + 6);
1231 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1232 if (num_jumps >= MAX_NESTING)
1235 future_jumps[num_jumps++] = pattern_offset;
1243 if (next_register < RE_NREGS) {
1244 bufp->uses_registers = 1;
1246 STORE(Cstart_memory);
1247 STORE(next_register);
1248 open_registers[num_open_registers++] = next_register;
1249 bufp->num_registers++;
1259 if (paren_depth <= 0)
1260 goto parenthesis_error;
1262 current_level = precedences[Ropenpar];
1264 if (paren_depth < num_open_registers) {
1265 bufp->uses_registers = 1;
1268 num_open_registers--;
1269 STORE(open_registers[num_open_registers]);
1274 goto bad_match_register;
1275 if (!(ch >= '0' && ch <= '9')) {
1276 goto bad_match_register;
1278 bufp->uses_registers = 1;
1279 opcode = Cmatch_memory;
1281 goto store_opcode_and_arg;
1282 case Rextended_memory:
1284 if (ch < '0' || ch > '9')
1285 goto bad_match_register;
1287 if (a < '0' || a > '9')
1288 goto bad_match_register;
1289 ch = 10 * (a - '0') + ch - '0';
1290 if (ch == 0 || ch >= RE_NREGS)
1291 goto bad_match_register;
1292 bufp->uses_registers = 1;
1293 opcode = Cmatch_memory;
1294 goto store_opcode_and_arg;
1306 offset = pattern_offset;
1307 for (a = 0; a < 256 / 8; a++)
1311 ch = translate[(unsigned char)ch];
1316 ch = translate[(unsigned char)ch];
1322 while (ch != '\135' || firstchar) {
1324 if (regexp_ansi_sequences && ch == '\134') {
1329 for (a = prev; a <= (int)ch; a++)
1330 SETBIT(pattern, offset, a);
1333 } else if (prev != -1 && ch == '-')
1336 SETBIT(pattern, offset, ch);
1341 ch = translate[(unsigned char)ch];
1344 SETBIT(pattern, offset, '-');
1346 for (a = 0; a < 256 / 8; a++)
1347 pattern[offset + a] ^= 0xff;
1363 opcode = Csyntaxspec;
1365 goto store_opcode_and_arg;
1369 opcode = Cnotsyntaxspec;
1371 goto store_opcode_and_arg;
1385 opcode = Cwordbound;
1390 opcode = Cnotwordbound;
1398 beginning_context = (op == Ropenpar || op == Ror);
1400 if (starts_base != 0)
1401 goto parenthesis_error;
1402 // assert(num_jumps == 0);
1406 if (!re_optimize(bufp))
1407 return "Optimization error";
1412 return "Badly placed special character";
1416 return "Bad match register number";
1420 return "Bad hexadecimal number";
1424 return "Badly placed parenthesis";
1428 return "Out of memory";
1432 return "Regular expression ends prematurely";
1436 return "Regular expression too complex";
1444 #undef CURRENT_LEVEL_START
1445 #undef SET_LEVEL_START
1446 #undef PUSH_LEVEL_STARTS
1447 #undef POP_LEVEL_STARTS
1453 #define PREFETCH if (text == textend) goto fail
1455 #define NEXTCHAR(var) \
1457 var = (unsigned char)*text++; \
1459 var = translate[var]
1462 int regcomp(regex_t * bufp, const char *regex, int cflags)
1464 memset(bufp, 0, sizeof(regex_t));
1465 bufp->cflags = cflags;
1466 if (bufp->cflags & REG_ICASE) {
1467 char *p, *lcase = bstrdup(regex);
1468 for( p = lcase; *p ; p++) {
1471 re_compile_pattern(bufp, (unsigned char *)lcase);
1474 re_compile_pattern(bufp, (unsigned char *)regex);
1482 void re_registers_to_regmatch(regexp_registers_t old_regs,
1483 regmatch_t pmatch[],
1488 /* We have to set the last entry to -1 */
1489 nmatch = nmatch - 1;
1490 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1491 pmatch[i].rm_so = old_regs->start[i];
1492 pmatch[i].rm_eo = old_regs->end[i];
1495 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1498 int regexec(regex_t * preg, const char *string, size_t nmatch,
1499 regmatch_t pmatch[], int eflags)
1502 int len = strlen(string);
1503 struct re_registers regs;
1504 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1505 re_registers_to_regmatch(®s, pmatch, nmatch);
1506 /* stat is the start position in the string base 0 where
1507 * the pattern was found or negative if not found.
1509 return stat < 0 ? -1 : 0;
1512 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1514 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1518 void regfree(regex_t * preg)
1521 free_pool_memory(preg->lcase);
1526 preg->buffer = NULL;
1530 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1531 regexp_registers_t old_regs)
1533 unsigned char *code;
1534 unsigned char *translate;
1535 unsigned char *text;
1536 unsigned char *textstart;
1537 unsigned char *textend;
1543 unsigned char *regstart;
1544 unsigned char *regend;
1548 // assert(pos >= 0 && size >= 0);
1549 // assert(pos <= size);
1551 text = string + pos;
1553 textend = string + size;
1555 code = bufp->buffer;
1557 translate = bufp->translate;
1559 NEW_STATE(state, bufp->num_registers);
1565 match_end = text - textstart;
1567 old_regs->start[0] = pos;
1568 old_regs->end[0] = match_end;
1569 if (!bufp->uses_registers) {
1570 for (a = 1; a < RE_NREGS; a++) {
1571 old_regs->start[a] = -1;
1572 old_regs->end[a] = -1;
1575 for (a = 1; a < bufp->num_registers; a++) {
1576 if ((GET_REG_START(state, a) == NULL) ||
1577 (GET_REG_END(state, a) == NULL)) {
1578 old_regs->start[a] = -1;
1579 old_regs->end[a] = -1;
1582 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1583 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1585 for (; a < RE_NREGS; a++) {
1586 old_regs->start[a] = -1;
1587 old_regs->end[a] = -1;
1592 return match_end - pos;
1596 if (text == textstart || text[-1] == '\n')
1597 goto continue_matching;
1602 if (text == textend || *text == '\n')
1603 goto continue_matching;
1609 if (code[ch / 8] & (1 << (ch & 7))) {
1611 goto continue_matching;
1618 if (ch != (unsigned char)*code++)
1620 goto continue_matching;
1627 goto continue_matching;
1632 SET_REG_START(state, reg, text, goto error);
1633 goto continue_matching;
1638 SET_REG_END(state, reg, text, goto error);
1639 goto continue_matching;
1644 regstart = GET_REG_START(state, reg);
1645 regend = GET_REG_END(state, reg);
1646 if ((regstart == NULL) || (regend == NULL))
1647 goto fail; /* or should we just match nothing? */
1648 regsize = regend - regstart;
1650 if (regsize > (textend - text))
1653 for (; regstart < regend; regstart++, text++)
1654 if (translate[*regstart] != translate[*text])
1657 for (; regstart < regend; regstart++, text++)
1658 if (*regstart != *text)
1660 goto continue_matching;
1662 case Cupdate_failure_jump:
1664 UPDATE_FAILURE(state, text, goto error);
1665 /* fall to next case */
1667 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1671 a = (unsigned char)*code++;
1672 a |= (unsigned char)*code++ << 8;
1673 code += (int)SHORT(a);
1674 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1675 set_error("Regex VM jump out of bounds (Cjump)");
1679 goto continue_matching;
1681 case Cdummy_failure_jump:
1683 unsigned char *failuredest;
1685 a = (unsigned char)*code++;
1686 a |= (unsigned char)*code++ << 8;
1688 // assert(*code == Cfailure_jump);
1689 b = (unsigned char)code[1];
1690 b |= (unsigned char)code[2] << 8;
1691 failuredest = code + (int)SHORT(b) + 3;
1692 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1694 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1698 PUSH_FAILURE(state, failuredest, NULL, goto error);
1700 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1701 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1705 goto continue_matching;
1709 a = (unsigned char)*code++;
1710 a |= (unsigned char)*code++ << 8;
1712 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1713 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1717 PUSH_FAILURE(state, code + a, text, goto error);
1718 goto continue_matching;
1722 unsigned char *pinst;
1723 a = (unsigned char)*code++;
1724 a |= (unsigned char)*code++ << 8;
1727 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1728 set_error("Regex VM jump out of bounds (Crepeat1)");
1732 /* pinst is sole instruction in loop, and it matches a
1733 * single character. Since Crepeat1 was originally a
1734 * Cupdate_failure_jump, we also know that backtracking
1735 * is useless: so long as the single-character
1736 * expression matches, it must be used. Also, in the
1737 * case of +, we've already matched one character, so +
1738 * can't fail: nothing here can cause a failure. */
1743 while (text < textend) {
1744 ch = translate[(unsigned char)*text];
1745 if (pinst[ch / 8] & (1 << (ch & 7)))
1751 while (text < textend) {
1752 ch = (unsigned char)*text;
1753 if (pinst[ch / 8] & (1 << (ch & 7)))
1763 ch = (unsigned char)*pinst;
1765 while (text < textend && translate[(unsigned char)*text] == ch)
1768 while (text < textend && (unsigned char)*text == ch)
1775 while (text < textend && (unsigned char)*text != '\n')
1781 a = (unsigned char)*pinst;
1783 while (text < textend && (SYNTAX(translate[*text]) & a))
1786 while (text < textend && (SYNTAX(*text) & a))
1791 case Cnotsyntaxspec:
1793 a = (unsigned char)*pinst;
1795 while (text < textend && !(SYNTAX(translate[*text]) & a))
1798 while (text < textend && !(SYNTAX(*text) & a))
1806 set_error("Unknown regex opcode: memory corrupted?");
1810 /* due to the funky way + and * are compiled, the top
1811 * failure- stack entry at this point is actually a
1812 * success entry -- update it & pop it */
1813 UPDATE_FAILURE(state, text, goto error);
1814 goto fail; /* i.e., succeed <wink/sigh> */
1818 if (text == textstart)
1819 goto continue_matching;
1824 if (text == textend)
1825 goto continue_matching;
1830 if (text == textend)
1832 if (!(SYNTAX(*text) & Sword))
1834 if (text == textstart)
1835 goto continue_matching;
1836 if (!(SYNTAX(text[-1]) & Sword))
1837 goto continue_matching;
1842 if (text == textstart)
1844 if (!(SYNTAX(text[-1]) & Sword))
1846 if (text == textend)
1847 goto continue_matching;
1848 if (!(SYNTAX(*text) & Sword))
1849 goto continue_matching;
1854 /* Note: as in gnu regexp, this also matches at the
1855 * beginning and end of buffer. */
1857 if (text == textstart || text == textend)
1858 goto continue_matching;
1859 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1860 goto continue_matching;
1865 /* Note: as in gnu regexp, this never matches at the
1866 * beginning and end of buffer. */
1867 if (text == textstart || text == textend)
1869 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1870 goto continue_matching;
1876 if (!(SYNTAX(ch) & (unsigned char)*code++))
1878 goto continue_matching;
1880 case Cnotsyntaxspec:
1883 if (SYNTAX(ch) & (unsigned char)*code++)
1885 goto continue_matching;
1890 set_error("Unknown regex opcode: memory corrupted?");
1897 #if 0 /* This line is never reached --Guido */
1904 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1906 POP_FAILURE(state, code, text, goto done_matching, goto error);
1907 goto continue_matching;
1910 /* if(translated != NULL) */
1911 /* free(translated); */
1916 /* if (translated != NULL) */
1917 /* free(translated); */
1926 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1927 int range, regexp_registers_t regs)
1929 unsigned char *fastmap;
1930 unsigned char *translate;
1931 unsigned char *text;
1932 unsigned char *partstart;
1933 unsigned char *partend;
1936 unsigned char anchor;
1937 unsigned char *string = str;
1939 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1940 int len = strlen((const char *)str);
1942 bufp->lcase = get_pool_memory(PM_FNAME);
1944 check_pool_memory_size(bufp->lcase, len+1);
1945 unsigned char *dst = (unsigned char *)bufp->lcase;
1947 *dst++ = tolower(*string++);
1950 string = (unsigned char *)bufp->lcase;
1953 // assert(size >= 0 && pos >= 0);
1954 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1956 fastmap = bufp->fastmap;
1957 translate = bufp->translate;
1958 if (fastmap && !bufp->fastmap_accurate) {
1959 re_compile_fastmap(bufp);
1964 anchor = bufp->anchor;
1965 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1981 for (; range >= 0; range--, pos += dir) {
1983 if (dir == 1) { /* searching forwards */
1985 text = string + pos;
1986 partend = string + size;
1989 while (text != partend &&
1990 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1993 while (text != partend && !fastmap[(unsigned char)*text])
1995 pos += text - partstart;
1996 range -= text - partstart;
1997 if (pos == size && bufp->can_be_null == 0)
1999 } else { /* searching backwards */
2000 text = string + pos;
2001 partstart = string + pos - range;
2004 while (text != partstart && !fastmap[(unsigned char)
2005 translate[(unsigned char)*text]])
2008 while (text != partstart && !fastmap[(unsigned char)*text])
2010 pos -= partend - text;
2011 range -= partend - text;
2014 if (anchor == 1) { /* anchored to begline */
2015 if (pos > 0 && (string[pos - 1] != '\n'))
2018 // assert(pos >= 0 && pos <= size);
2019 ret = re_match(bufp, string, size, pos, regs);
2031 ** c-file-style: "python"