3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
31 * This file modified to work with Bacula and C++ by
32 * Kern Sibbald, April 2006
35 Bacula® - The Network Backup Solution
37 Copyright (C) 2006-2006 Free Software Foundation Europe e.V.
39 The main author of Bacula is Kern Sibbald, with contributions from
40 many others, a complete list can be found in the file AUTHORS.
41 This program is Free Software; you can redistribute it and/or
42 modify it under the terms of version two of the GNU General Public
43 License as published by the Free Software Foundation plus additions
44 that are listed in the file LICENSE.
46 This program is distributed in the hope that it will be useful, but
47 WITHOUT ANY WARRANTY; without even the implied warranty of
48 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
49 General Public License for more details.
51 You should have received a copy of the GNU General Public License
52 along with this program; if not, write to the Free Software
53 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
56 Bacula® is a registered trademark of John Walker.
57 The licensor of Bacula is the Free Software Foundation Europe
58 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
59 Switzerland, email:ftf@fsfeurope.org.
66 #define set_error(x) bufp->errmsg=((char *)(x))
67 #define got_error bufp->errmsg!=NULL
69 /* The original code blithely assumed that sizeof(short) == 2. Not
70 * always true. Original instances of "(short)x" were replaced by
71 * SHORT(x), where SHORT is #defined below. */
73 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
75 /* The stack implementation is taken from an idea by Andrew Kuchling.
76 * It's a doubly linked list of arrays. The advantages of this over a
77 * simple linked list are that the number of mallocs required are
78 * reduced. It also makes it possible to statically allocate enough
79 * space so that small patterns don't ever need to call malloc.
81 * The advantages over a single array is that is periodically
82 * realloced when more space is needed is that we avoid ever copying
85 /* item_t is the basic stack element. Defined as a union of
86 * structures so that both registers, failure points, and counters can
87 * be pushed/popped from the stack. There's nothing built into the
88 * item to keep track of whether a certain stack item is a register, a
89 * failure point, or a counter. */
91 typedef union item_t {
112 #define STACK_PAGE_SIZE 256
113 #define NUM_REGISTERS 256
115 /* A 'page' of stack items. */
117 typedef struct item_page_t {
118 item_t items[STACK_PAGE_SIZE];
119 struct item_page_t *prev;
120 struct item_page_t *next;
124 typedef struct match_state {
125 /* The number of registers that have been pushed onto the stack
126 * since the last failure point. */
130 /* Used to control when registers need to be pushed onto the
135 /* The number of failure points on the stack. */
139 /* Storage for the registers. Each register consists of two
140 * pointers to characters. So register N is represented as
141 * start[N] and end[N]. The pointers must be converted to
142 * offsets from the beginning of the string before returning the
143 * registers to the calling program. */
145 unsigned char *start[NUM_REGISTERS];
146 unsigned char *end[NUM_REGISTERS];
148 /* Keeps track of whether a register has changed recently. */
150 int changed[NUM_REGISTERS];
152 /* Structure to encapsulate the stack. */
154 /* index into the current page. If index == 0 and you need
155 * to pop an item, move to the previous page and set index
156 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
157 * push a page. If index == STACK_PAGE_SIZE and you need
158 * to push a page move to the next page and set index =
159 * 0. If there is no new next page, allocate a new page
160 * and link it in. Otherwise, increment index to push a
164 item_page_t *current; /* Pointer to the current page. */
165 item_page_t first; /* First page is statically allocated. */
169 /* Initialize a state object */
171 /* #define NEW_STATE(state) \ */
172 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
173 /* state.stack.current = &state.stack.first; \ */
174 /* state.stack.first.prev = NULL; \ */
175 /* state.stack.first.next = NULL; \ */
176 /* state.stack.index = 0; \ */
177 /* state.level = 1 */
179 #define NEW_STATE(state, nregs) \
182 for (i = 0; i < nregs; i++) \
184 state.start[i] = NULL; \
185 state.end[i] = NULL; \
186 state.changed[i] = 0; \
188 state.stack.current = &state.stack.first; \
189 state.stack.first.prev = NULL; \
190 state.stack.first.next = NULL; \
191 state.stack.index = 0; \
198 /* Free any memory that might have been malloc'd */
200 #define FREE_STATE(state) \
201 while(state.stack.first.next != NULL) \
203 state.stack.current = state.stack.first.next; \
204 state.stack.first.next = state.stack.current->next; \
205 free(state.stack.current); \
208 /* Discard the top 'count' stack items. */
210 #define STACK_DISCARD(stack, count, on_error) \
211 stack.index -= count; \
212 while (stack.index < 0) \
214 if (stack.current->prev == NULL) \
216 stack.current = stack.current->prev; \
217 stack.index += STACK_PAGE_SIZE; \
220 /* Store a pointer to the previous item on the stack. Used to pop an
221 * item off of the stack. */
223 #define STACK_PREV(stack, top, on_error) \
224 if (stack.index == 0) \
226 if (stack.current->prev == NULL) \
228 stack.current = stack.current->prev; \
229 stack.index = STACK_PAGE_SIZE - 1; \
235 top = &(stack.current->items[stack.index])
237 /* Store a pointer to the next item on the stack. Used to push an item
238 * on to the stack. */
240 #define STACK_NEXT(stack, top, on_error) \
241 if (stack.index == STACK_PAGE_SIZE) \
243 if (stack.current->next == NULL) \
245 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
246 if (stack.current->next == NULL) \
248 stack.current->next->prev = stack.current; \
249 stack.current->next->next = NULL; \
251 stack.current = stack.current->next; \
254 top = &(stack.current->items[stack.index++])
256 /* Store a pointer to the item that is 'count' items back in the
257 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
258 * STACK_TOP(stack, top, on_error). */
260 #define STACK_BACK(stack, top, count, on_error) \
263 item_page_t *current; \
264 current = stack.current; \
265 index = stack.index - (count); \
268 if (current->prev == NULL) \
270 current = current->prev; \
271 index += STACK_PAGE_SIZE; \
273 top = &(current->items[index]); \
276 /* Store a pointer to the top item on the stack. Execute the
277 * 'on_error' code if there are no items on the stack. */
279 #define STACK_TOP(stack, top, on_error) \
280 if (stack.index == 0) \
282 if (stack.current->prev == NULL) \
284 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
288 top = &(stack.current->items[stack.index - 1]); \
291 /* Test to see if the stack is empty */
293 #define STACK_EMPTY(stack) ((stack.index == 0) && \
294 (stack.current->prev == NULL))
296 /* Return the start of register 'reg' */
298 #define GET_REG_START(state, reg) (state.start[reg])
300 /* Return the end of register 'reg' */
302 #define GET_REG_END(state, reg) (state.end[reg])
304 /* Set the start of register 'reg'. If the state of the register needs
305 * saving, push it on the stack. */
307 #define SET_REG_START(state, reg, text, on_error) \
308 if(state.changed[reg] < state.level) \
311 STACK_NEXT(state.stack, item, on_error); \
312 item->reg.num = reg; \
313 item->reg.start = state.start[reg]; \
314 item->reg.end = state.end[reg]; \
315 item->reg.level = state.changed[reg]; \
316 state.changed[reg] = state.level; \
319 state.start[reg] = text
321 /* Set the end of register 'reg'. If the state of the register needs
322 * saving, push it on the stack. */
324 #define SET_REG_END(state, reg, text, on_error) \
325 if(state.changed[reg] < state.level) \
328 STACK_NEXT(state.stack, item, on_error); \
329 item->reg.num = reg; \
330 item->reg.start = state.start[reg]; \
331 item->reg.end = state.end[reg]; \
332 item->reg.level = state.changed[reg]; \
333 state.changed[reg] = state.level; \
336 state.end[reg] = text
338 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
341 STACK_NEXT(state.stack, item, on_error); \
342 item->fail.code = xcode; \
343 item->fail.text = xtext; \
344 item->fail.count = state.count; \
345 item->fail.level = state.level; \
346 item->fail.phantom = 0; \
352 /* Update the last failure point with a new position in the text. */
354 #define UPDATE_FAILURE(state, xtext, on_error) \
357 STACK_BACK(state.stack, item, state.count + 1, on_error); \
358 if (!item->fail.phantom) \
361 STACK_NEXT(state.stack, item2, on_error); \
362 item2->fail.code = item->fail.code; \
363 item2->fail.text = xtext; \
364 item2->fail.count = state.count; \
365 item2->fail.level = state.level; \
366 item2->fail.phantom = 1; \
373 STACK_DISCARD(state.stack, state.count, on_error); \
374 STACK_TOP(state.stack, item, on_error); \
375 item->fail.text = xtext; \
381 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
386 while(state.count > 0) \
388 STACK_PREV(state.stack, item, on_error); \
389 state.start[item->reg.num] = item->reg.start; \
390 state.end[item->reg.num] = item->reg.end; \
391 state.changed[item->reg.num] = item->reg.level; \
394 STACK_PREV(state.stack, item, on_empty); \
395 xcode = item->fail.code; \
396 xtext = item->fail.text; \
397 state.count = item->fail.count; \
398 state.level = item->fail.level; \
401 while (item->fail.text == NULL); \
404 enum regexp_compiled_ops { /* opcodes for compiled regexp */
405 Cend, /* end of pattern reached */
406 Cbol, /* beginning of line */
407 Ceol, /* end of line */
408 Cset, /* character set. Followed by 32 bytes of set. */
409 Cexact, /* followed by a byte to match */
410 Canychar, /* matches any character except newline */
411 Cstart_memory, /* set register start addr (followed by reg number) */
412 Cend_memory, /* set register end addr (followed by reg number) */
413 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
414 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
415 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
416 Cfailure_jump, /* jump to addr on failure */
417 Cupdate_failure_jump, /* update topmost failure point and jump */
418 Cdummy_failure_jump, /* push a dummy failure point and jump */
419 Cbegbuf, /* match at beginning of buffer */
420 Cendbuf, /* match at end of buffer */
421 Cwordbeg, /* match at beginning of word */
422 Cwordend, /* match at end of word */
423 Cwordbound, /* match if at word boundary */
424 Cnotwordbound, /* match if not at word boundary */
425 Csyntaxspec, /* matches syntax code (1 byte follows) */
426 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
430 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
431 Rend, /* special code for end of regexp */
432 Rnormal, /* normal character */
433 Ranychar, /* any character except newline */
434 Rquote, /* the quote character */
435 Rbol, /* match beginning of line */
436 Reol, /* match end of line */
437 Roptional, /* match preceding expression optionally */
438 Rstar, /* match preceding expr zero or more times */
439 Rplus, /* match preceding expr one or more times */
440 Ror, /* match either of alternatives */
441 Ropenpar, /* opening parenthesis */
442 Rclosepar, /* closing parenthesis */
443 Rmemory, /* match memory register */
444 Rextended_memory, /* \vnn to match registers 10-99 */
445 Ropenset, /* open set. Internal syntax hard-coded below. */
446 /* the following are gnu extensions to "normal" regexp syntax */
447 Rbegbuf, /* beginning of buffer */
448 Rendbuf, /* end of buffer */
449 Rwordchar, /* word character */
450 Rnotwordchar, /* not word character */
451 Rwordbeg, /* beginning of word */
452 Rwordend, /* end of word */
453 Rwordbound, /* word bound */
454 Rnotwordbound, /* not word bound */
458 static int re_compile_initialized = 0;
459 static int regexp_syntax = RE_SYNTAX_EGREP;
460 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
461 static unsigned char plain_ops[256];
462 static unsigned char quoted_ops[256];
463 static unsigned char precedences[Rnum_ops];
464 static int regexp_context_indep_ops;
465 static int regexp_ansi_sequences;
467 #define NUM_LEVELS 5 /* number of precedence levels in use */
468 #define MAX_NESTING 100 /* max nesting level of operators */
470 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
472 unsigned char re_syntax_table[256];
474 void re_compile_initialize(void)
478 static int syntax_table_inited = 0;
480 if (!syntax_table_inited) {
481 syntax_table_inited = 1;
482 memset(re_syntax_table, 0, 256);
483 for (a = 'a'; a <= 'z'; a++)
484 re_syntax_table[a] = Sword;
485 for (a = 'A'; a <= 'Z'; a++)
486 re_syntax_table[a] = Sword;
487 for (a = '0'; a <= '9'; a++)
488 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
489 for (a = '0'; a <= '7'; a++)
490 re_syntax_table[a] |= Soctaldigit;
491 for (a = 'A'; a <= 'F'; a++)
492 re_syntax_table[a] |= Shexdigit;
493 for (a = 'a'; a <= 'f'; a++)
494 re_syntax_table[a] |= Shexdigit;
495 re_syntax_table[(int)'_'] = Sword;
496 for (a = 9; a <= 13; a++)
497 re_syntax_table[a] = Swhitespace;
498 re_syntax_table[(int)' '] = Swhitespace;
500 re_compile_initialized = 1;
501 for (a = 0; a < 256; a++) {
502 plain_ops[a] = Rnormal;
503 quoted_ops[a] = Rnormal;
505 for (a = '0'; a <= '9'; a++)
506 quoted_ops[a] = Rmemory;
507 plain_ops[(int)'\134'] = Rquote;
508 if (regexp_syntax & RE_NO_BK_PARENS) {
509 plain_ops[(int)'('] = Ropenpar;
510 plain_ops[(int)')'] = Rclosepar;
512 quoted_ops[(int)'('] = Ropenpar;
513 quoted_ops[(int)')'] = Rclosepar;
515 if (regexp_syntax & RE_NO_BK_VBAR) {
516 plain_ops[(int)'\174'] = Ror;
518 quoted_ops[(int)'\174'] = Ror;
520 plain_ops[(int)'*'] = Rstar;
521 if (regexp_syntax & RE_BK_PLUS_QM) {
522 quoted_ops[(int)'+'] = Rplus;
523 quoted_ops[(int)'?'] = Roptional;
525 plain_ops[(int)'+'] = Rplus;
526 plain_ops[(int)'?'] = Roptional;
528 if (regexp_syntax & RE_NEWLINE_OR) {
529 plain_ops[(int)'\n'] = Ror;
531 plain_ops[(int)'\133'] = Ropenset;
532 plain_ops[(int)'\136'] = Rbol;
533 plain_ops[(int)'$'] = Reol;
534 plain_ops[(int)'.'] = Ranychar;
535 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
536 quoted_ops[(int)'w'] = Rwordchar;
537 quoted_ops[(int)'W'] = Rnotwordchar;
538 quoted_ops[(int)'<'] = Rwordbeg;
539 quoted_ops[(int)'>'] = Rwordend;
540 quoted_ops[(int)'b'] = Rwordbound;
541 quoted_ops[(int)'B'] = Rnotwordbound;
542 quoted_ops[(int)'`'] = Rbegbuf;
543 quoted_ops[(int)'\''] = Rendbuf;
545 if (regexp_syntax & RE_ANSI_HEX) {
546 quoted_ops[(int)'v'] = Rextended_memory;
548 for (a = 0; a < Rnum_ops; a++) {
551 if (regexp_syntax & RE_TIGHT_VBAR) {
552 precedences[Ror] = 3;
553 precedences[Rbol] = 2;
554 precedences[Reol] = 2;
556 precedences[Ror] = 2;
557 precedences[Rbol] = 3;
558 precedences[Reol] = 3;
560 precedences[Rclosepar] = 1;
561 precedences[Rend] = 0;
562 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
563 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
566 int re_set_syntax(int syntax) {
570 regexp_syntax = syntax;
571 re_syntax = syntax; /* Exported copy */
572 re_compile_initialize();
576 static int hex_char_to_decimal(int ch) {
577 if (ch >= '0' && ch <= '9')
579 if (ch >= 'a' && ch <= 'f')
580 return ch - 'a' + 10;
581 if (ch >= 'A' && ch <= 'F')
582 return ch - 'A' + 10;
586 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
587 unsigned char *visited,
588 unsigned char *can_be_null,
589 unsigned char *fastmap)
596 return; /* we have already been here */
599 switch (code[pos++]) {
610 for (a = 0; a < 256; a++)
614 syntaxcode = code[pos++];
615 for (a = 0; a < 256; a++)
616 if (SYNTAX(a) & syntaxcode)
620 syntaxcode = code[pos++];
621 for (a = 0; a < 256; a++)
622 if (!(SYNTAX(a) & syntaxcode))
626 fastmap[(int)'\n'] = 1;
627 if (*can_be_null == 0)
628 *can_be_null = 2; /* can match null, but only at end of buffer */
631 for (a = 0; a < 256 / 8; a++)
632 if (code[pos + a] != 0)
633 for (b = 0; b < 8; b++)
634 if (code[pos + a] & (1 << b))
635 fastmap[(a << 3) + b] = 1;
639 fastmap[(unsigned char)code[pos]] = 1;
642 for (a = 0; a < 256; a++)
651 for (a = 0; a < 256; a++)
656 case Cdummy_failure_jump:
657 case Cupdate_failure_jump:
659 a = (unsigned char)code[pos++];
660 a |= (unsigned char)code[pos++] << 8;
661 pos += (int)SHORT(a);
663 /* argh... the regexp contains empty loops. This is not
664 good, as this may cause a failure stack overflow when
665 matching. Oh well. */
666 /* this path leads nowhere; pursue other paths. */
672 a = (unsigned char)code[pos++];
673 a |= (unsigned char)code[pos++] << 8;
674 a = pos + (int)SHORT(a);
675 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
681 set_error("Unknown regex opcode: memory corrupted?");
687 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
688 int pos, unsigned char *can_be_null,
689 unsigned char *fastmap)
691 unsigned char small_visited[512], *visited;
693 if (used <= (int)sizeof(small_visited))
694 visited = small_visited;
696 visited = (unsigned char *)malloc(used);
701 memset(fastmap, 0, 256);
702 memset(visited, 0, used);
703 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
704 if (visited != small_visited)
709 void re_compile_fastmap(regex_t * bufp)
711 if (!bufp->fastmap || bufp->fastmap_accurate)
713 // assert(bufp->used > 0);
714 if (!re_do_compile_fastmap(bufp, bufp->buffer,
715 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
719 if (bufp->buffer[0] == Cbol)
720 bufp->anchor = 1; /* begline */
721 else if (bufp->buffer[0] == Cbegbuf)
722 bufp->anchor = 2; /* begbuf */
724 bufp->anchor = 0; /* none */
725 bufp->fastmap_accurate = 1;
731 * ... code for operand of star
733 * 2: ... code after star
735 * We change the star_jump to update_failure_jump if we can determine
736 * that it is safe to do so; otherwise we change it to an ordinary
743 * 2: ... code for operand of plus
745 * 3: ... code after plus
747 * For star_jump considerations this is processed identically to star.
751 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
753 unsigned char map[256];
754 unsigned char can_be_null;
760 int num_instructions = 0;
762 a = (unsigned char)*code++;
763 a |= (unsigned char)*code++ << 8;
766 p1 = code + a + 3; /* skip the failure_jump */
767 /* Check that the jump is within the pattern */
768 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
769 set_error("Regex VM jump out of bounds (failure_jump opt)");
772 // assert(p1[-3] == Cfailure_jump);
774 /* p1 points inside loop, p2 points to after loop */
775 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
776 (int)(p2 - bufp->buffer), &can_be_null, map))
777 goto make_normal_jump;
779 /* If we might introduce a new update point inside the
780 * loop, we can't optimize because then update_jump would
781 * update a wrong failure point. Thus we have to be
782 * quite careful here.
785 /* loop until we find something that consumes a character */
803 ch = (unsigned char)*p1++;
805 goto make_normal_jump;
808 for (b = 0; b < 256; b++)
809 if (b != '\n' && map[b])
810 goto make_normal_jump;
813 for (b = 0; b < 256; b++)
814 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
815 goto make_normal_jump;
819 goto make_normal_jump;
823 /* now we know that we can't backtrack. */
824 while (p1 != p2 - 3) {
853 case Cupdate_failure_jump:
854 case Cdummy_failure_jump:
855 goto make_normal_jump;
861 /* make_update_jump: */
863 a += 3; /* jump to after the Cfailure_jump */
864 code[0] = Cupdate_failure_jump;
867 if (num_instructions > 1)
869 // assert(num_instructions == 1);
870 /* if the only instruction matches a single character, we can do
872 p1 = code + 3 + a; /* start of sole instruction */
873 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
874 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
884 static int re_optimize(regex_t * bufp)
916 if (!re_optimize_star_jump(bufp, code)) {
920 case Cupdate_failure_jump:
922 case Cdummy_failure_jump:
933 #define NEXTCHAR(var) \
936 goto ends_prematurely; \
937 (var) = regex[pos]; \
941 #define ALLOC(amount) \
943 if (pattern_offset+(amount) > alloc) \
945 alloc += 256 + (amount); \
946 pattern = (unsigned char *)realloc(pattern, alloc); \
948 goto out_of_memory; \
952 #define STORE(ch) pattern[pattern_offset++] = (ch)
954 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
956 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
958 #define PUSH_LEVEL_STARTS \
959 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
960 starts_base += NUM_LEVELS; \
964 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
966 #define PUT_ADDR(offset,addr) \
968 int disp = (addr) - (offset) - 2; \
969 pattern[(offset)] = disp & 0xff; \
970 pattern[(offset)+1] = (disp>>8) & 0xff; \
973 #define INSERT_JUMP(pos,type,addr) \
975 int a, p = (pos), t = (type), ad = (addr); \
976 for (a = pattern_offset - 1; a >= p; a--) \
977 pattern[a + 3] = pattern[a]; \
980 pattern_offset += 3; \
983 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
987 bufp->allocated = alloc; \
988 bufp->buffer = pattern; \
989 bufp->used = pattern_offset; \
992 #define GETHEX(var) \
994 unsigned char gethex_ch, gethex_value; \
995 NEXTCHAR(gethex_ch); \
996 gethex_value = hex_char_to_decimal(gethex_ch); \
997 if (gethex_value == 16) \
999 NEXTCHAR(gethex_ch); \
1000 gethex_ch = hex_char_to_decimal(gethex_ch); \
1001 if (gethex_ch == 16) \
1003 (var) = gethex_value * 16 + gethex_ch; \
1006 #define ANSI_TRANSLATE(ch) \
1013 ch = 7; /* audible bell */ \
1019 ch = 8; /* backspace */ \
1025 ch = 12; /* form feed */ \
1031 ch = 10; /* line feed */ \
1037 ch = 13; /* carriage return */ \
1049 ch = 11; /* vertical tab */ \
1052 case 'x': /* hex code */ \
1060 /* other characters passed through */ \
1062 ch = translate[(unsigned char)ch]; \
1068 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1076 int pattern_offset = 0, alloc;
1077 int starts[NUM_LEVELS * MAX_NESTING];
1079 int future_jumps[MAX_NESTING];
1081 unsigned char ch = '\0';
1082 unsigned char *pattern;
1083 unsigned char *translate;
1086 int num_open_registers;
1087 int open_registers[RE_NREGS];
1088 int beginning_context;
1089 int size = strlen((char *)regex);
1091 if (!re_compile_initialized)
1092 re_compile_initialize();
1094 bufp->fastmap_accurate = 0;
1095 bufp->uses_registers = 1;
1096 bufp->num_registers = 1;
1097 translate = bufp->translate;
1098 pattern = bufp->buffer;
1099 alloc = bufp->allocated;
1100 if (alloc == 0 || pattern == NULL) {
1102 pattern = (unsigned char *)malloc(alloc);
1111 num_open_registers = 0;
1114 beginning_context = 1;
1116 /* we use Rend dummy to ensure that pending jumps are updated
1117 (due to low priority of Rend) before exiting the loop. */
1119 while (op != Rend) {
1125 ch = translate[(unsigned char)ch];
1126 op = plain_ops[(unsigned char)ch];
1129 op = quoted_ops[(unsigned char)ch];
1130 if (op == Rnormal && regexp_ansi_sequences)
1134 level = precedences[op];
1135 /* printf("ch='%c' op=%d level=%d current_level=%d
1136 curlevstart=%d\n", ch, op, level, current_level,
1137 CURRENT_LEVEL_START); */
1138 if (level > current_level) {
1139 for (current_level++; current_level < level; current_level++)
1142 } else if (level < current_level) {
1143 current_level = level;
1144 for (; num_jumps > 0 &&
1145 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1146 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1154 store_opcode_and_arg: /* opcode & ch must be set */
1168 set_error("Rquote");
1169 /*NOTREACHED*/ case Rbol:
1170 if (!beginning_context) {
1171 if (regexp_context_indep_ops)
1179 if (!((pos >= size) ||
1180 ((regexp_syntax & RE_NO_BK_VBAR) ?
1181 (regex[pos] == '\174') :
1182 (pos + 1 < size && regex[pos] == '\134' &&
1183 regex[pos + 1] == '\174')) ||
1184 ((regexp_syntax & RE_NO_BK_PARENS) ?
1185 (regex[pos] == ')') :
1186 (pos + 1 < size && regex[pos] == '\134' &&
1187 regex[pos + 1] == ')')))) {
1188 if (regexp_context_indep_ops)
1198 if (beginning_context) {
1199 if (regexp_context_indep_ops)
1204 if (CURRENT_LEVEL_START == pattern_offset)
1205 break; /* ignore empty patterns for ? */
1207 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1211 if (beginning_context) {
1212 if (regexp_context_indep_ops)
1217 if (CURRENT_LEVEL_START == pattern_offset)
1218 break; /* ignore empty patterns for + and * */
1220 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1221 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1222 if (op == Rplus) /* jump over initial failure_jump */
1223 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1224 CURRENT_LEVEL_START + 6);
1228 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1229 if (num_jumps >= MAX_NESTING)
1232 future_jumps[num_jumps++] = pattern_offset;
1240 if (next_register < RE_NREGS) {
1241 bufp->uses_registers = 1;
1243 STORE(Cstart_memory);
1244 STORE(next_register);
1245 open_registers[num_open_registers++] = next_register;
1246 bufp->num_registers++;
1256 if (paren_depth <= 0)
1257 goto parenthesis_error;
1259 current_level = precedences[Ropenpar];
1261 if (paren_depth < num_open_registers) {
1262 bufp->uses_registers = 1;
1265 num_open_registers--;
1266 STORE(open_registers[num_open_registers]);
1271 goto bad_match_register;
1272 if (!(ch >= '0' && ch <= '9')) {
1273 goto bad_match_register;
1275 bufp->uses_registers = 1;
1276 opcode = Cmatch_memory;
1278 goto store_opcode_and_arg;
1279 case Rextended_memory:
1281 if (ch < '0' || ch > '9')
1282 goto bad_match_register;
1284 if (a < '0' || a > '9')
1285 goto bad_match_register;
1286 ch = 10 * (a - '0') + ch - '0';
1287 if (ch == 0 || ch >= RE_NREGS)
1288 goto bad_match_register;
1289 bufp->uses_registers = 1;
1290 opcode = Cmatch_memory;
1291 goto store_opcode_and_arg;
1303 offset = pattern_offset;
1304 for (a = 0; a < 256 / 8; a++)
1308 ch = translate[(unsigned char)ch];
1313 ch = translate[(unsigned char)ch];
1319 while (ch != '\135' || firstchar) {
1321 if (regexp_ansi_sequences && ch == '\134') {
1326 for (a = prev; a <= (int)ch; a++)
1327 SETBIT(pattern, offset, a);
1330 } else if (prev != -1 && ch == '-')
1333 SETBIT(pattern, offset, ch);
1338 ch = translate[(unsigned char)ch];
1341 SETBIT(pattern, offset, '-');
1343 for (a = 0; a < 256 / 8; a++)
1344 pattern[offset + a] ^= 0xff;
1360 opcode = Csyntaxspec;
1362 goto store_opcode_and_arg;
1366 opcode = Cnotsyntaxspec;
1368 goto store_opcode_and_arg;
1382 opcode = Cwordbound;
1387 opcode = Cnotwordbound;
1395 beginning_context = (op == Ropenpar || op == Ror);
1397 if (starts_base != 0)
1398 goto parenthesis_error;
1399 // assert(num_jumps == 0);
1403 if (!re_optimize(bufp))
1404 return "Optimization error";
1409 return "Badly placed special character";
1413 return "Bad match register number";
1417 return "Bad hexadecimal number";
1421 return "Badly placed parenthesis";
1425 return "Out of memory";
1429 return "Regular expression ends prematurely";
1433 return "Regular expression too complex";
1441 #undef CURRENT_LEVEL_START
1442 #undef SET_LEVEL_START
1443 #undef PUSH_LEVEL_STARTS
1444 #undef POP_LEVEL_STARTS
1450 #define PREFETCH if (text == textend) goto fail
1452 #define NEXTCHAR(var) \
1454 var = (unsigned char)*text++; \
1456 var = translate[var]
1459 int regcomp(regex_t * bufp, const char *regex, int cflags)
1461 memset(bufp, 0, sizeof(regex_t));
1462 re_compile_pattern(bufp, (unsigned char *)regex);
1469 int regexec(regex_t * preg, const char *string, size_t nmatch,
1470 regmatch_t pmatch[], int eflags)
1473 int len = strlen(string);
1474 struct re_registers regs;
1475 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1476 /* stat is the start position in the string base 0 where
1477 * the pattern was found or negative if not found.
1479 return stat < 0 ? -1 : 0;
1482 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1484 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1488 void regfree(regex_t * preg)
1492 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1493 regexp_registers_t old_regs)
1495 unsigned char *code;
1496 unsigned char *translate;
1497 unsigned char *text;
1498 unsigned char *textstart;
1499 unsigned char *textend;
1505 unsigned char *regstart;
1506 unsigned char *regend;
1510 // assert(pos >= 0 && size >= 0);
1511 // assert(pos <= size);
1513 text = string + pos;
1515 textend = string + size;
1517 code = bufp->buffer;
1519 translate = bufp->translate;
1521 NEW_STATE(state, bufp->num_registers);
1527 match_end = text - textstart;
1529 old_regs->start[0] = pos;
1530 old_regs->end[0] = match_end;
1531 if (!bufp->uses_registers) {
1532 for (a = 1; a < RE_NREGS; a++) {
1533 old_regs->start[a] = -1;
1534 old_regs->end[a] = -1;
1537 for (a = 1; a < bufp->num_registers; a++) {
1538 if ((GET_REG_START(state, a) == NULL) ||
1539 (GET_REG_END(state, a) == NULL)) {
1540 old_regs->start[a] = -1;
1541 old_regs->end[a] = -1;
1544 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1545 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1547 for (; a < RE_NREGS; a++) {
1548 old_regs->start[a] = -1;
1549 old_regs->end[a] = -1;
1554 return match_end - pos;
1558 if (text == textstart || text[-1] == '\n')
1559 goto continue_matching;
1564 if (text == textend || *text == '\n')
1565 goto continue_matching;
1571 if (code[ch / 8] & (1 << (ch & 7))) {
1573 goto continue_matching;
1580 if (ch != (unsigned char)*code++)
1582 goto continue_matching;
1589 goto continue_matching;
1594 SET_REG_START(state, reg, text, goto error);
1595 goto continue_matching;
1600 SET_REG_END(state, reg, text, goto error);
1601 goto continue_matching;
1606 regstart = GET_REG_START(state, reg);
1607 regend = GET_REG_END(state, reg);
1608 if ((regstart == NULL) || (regend == NULL))
1609 goto fail; /* or should we just match nothing? */
1610 regsize = regend - regstart;
1612 if (regsize > (textend - text))
1615 for (; regstart < regend; regstart++, text++)
1616 if (translate[*regstart] != translate[*text])
1619 for (; regstart < regend; regstart++, text++)
1620 if (*regstart != *text)
1622 goto continue_matching;
1624 case Cupdate_failure_jump:
1626 UPDATE_FAILURE(state, text, goto error);
1627 /* fall to next case */
1629 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1633 a = (unsigned char)*code++;
1634 a |= (unsigned char)*code++ << 8;
1635 code += (int)SHORT(a);
1636 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1637 set_error("Regex VM jump out of bounds (Cjump)");
1641 goto continue_matching;
1643 case Cdummy_failure_jump:
1645 unsigned char *failuredest;
1647 a = (unsigned char)*code++;
1648 a |= (unsigned char)*code++ << 8;
1650 // assert(*code == Cfailure_jump);
1651 b = (unsigned char)code[1];
1652 b |= (unsigned char)code[2] << 8;
1653 failuredest = code + (int)SHORT(b) + 3;
1654 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1656 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1660 PUSH_FAILURE(state, failuredest, NULL, goto error);
1662 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1663 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1667 goto continue_matching;
1671 a = (unsigned char)*code++;
1672 a |= (unsigned char)*code++ << 8;
1674 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1675 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1679 PUSH_FAILURE(state, code + a, text, goto error);
1680 goto continue_matching;
1684 unsigned char *pinst;
1685 a = (unsigned char)*code++;
1686 a |= (unsigned char)*code++ << 8;
1689 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1690 set_error("Regex VM jump out of bounds (Crepeat1)");
1694 /* pinst is sole instruction in loop, and it matches a
1695 * single character. Since Crepeat1 was originally a
1696 * Cupdate_failure_jump, we also know that backtracking
1697 * is useless: so long as the single-character
1698 * expression matches, it must be used. Also, in the
1699 * case of +, we've already matched one character, so +
1700 * can't fail: nothing here can cause a failure. */
1705 while (text < textend) {
1706 ch = translate[(unsigned char)*text];
1707 if (pinst[ch / 8] & (1 << (ch & 7)))
1713 while (text < textend) {
1714 ch = (unsigned char)*text;
1715 if (pinst[ch / 8] & (1 << (ch & 7)))
1725 ch = (unsigned char)*pinst;
1727 while (text < textend && translate[(unsigned char)*text] == ch)
1730 while (text < textend && (unsigned char)*text == ch)
1737 while (text < textend && (unsigned char)*text != '\n')
1743 a = (unsigned char)*pinst;
1745 while (text < textend && (SYNTAX(translate[*text]) & a))
1748 while (text < textend && (SYNTAX(*text) & a))
1753 case Cnotsyntaxspec:
1755 a = (unsigned char)*pinst;
1757 while (text < textend && !(SYNTAX(translate[*text]) & a))
1760 while (text < textend && !(SYNTAX(*text) & a))
1768 set_error("Unknown regex opcode: memory corrupted?");
1772 /* due to the funky way + and * are compiled, the top
1773 * failure- stack entry at this point is actually a
1774 * success entry -- update it & pop it */
1775 UPDATE_FAILURE(state, text, goto error);
1776 goto fail; /* i.e., succeed <wink/sigh> */
1780 if (text == textstart)
1781 goto continue_matching;
1786 if (text == textend)
1787 goto continue_matching;
1792 if (text == textend)
1794 if (!(SYNTAX(*text) & Sword))
1796 if (text == textstart)
1797 goto continue_matching;
1798 if (!(SYNTAX(text[-1]) & Sword))
1799 goto continue_matching;
1804 if (text == textstart)
1806 if (!(SYNTAX(text[-1]) & Sword))
1808 if (text == textend)
1809 goto continue_matching;
1810 if (!(SYNTAX(*text) & Sword))
1811 goto continue_matching;
1816 /* Note: as in gnu regexp, this also matches at the
1817 * beginning and end of buffer. */
1819 if (text == textstart || text == textend)
1820 goto continue_matching;
1821 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1822 goto continue_matching;
1827 /* Note: as in gnu regexp, this never matches at the
1828 * beginning and end of buffer. */
1829 if (text == textstart || text == textend)
1831 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1832 goto continue_matching;
1838 if (!(SYNTAX(ch) & (unsigned char)*code++))
1840 goto continue_matching;
1842 case Cnotsyntaxspec:
1845 if (SYNTAX(ch) & (unsigned char)*code++)
1847 goto continue_matching;
1852 set_error("Unknown regex opcode: memory corrupted?");
1859 #if 0 /* This line is never reached --Guido */
1866 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1868 POP_FAILURE(state, code, text, goto done_matching, goto error);
1869 goto continue_matching;
1872 /* if(translated != NULL) */
1873 /* free(translated); */
1878 /* if (translated != NULL) */
1879 /* free(translated); */
1888 int re_search(regex_t * bufp, unsigned char *string, int size, int pos,
1889 int range, regexp_registers_t regs)
1891 unsigned char *fastmap;
1892 unsigned char *translate;
1893 unsigned char *text;
1894 unsigned char *partstart;
1895 unsigned char *partend;
1898 unsigned char anchor;
1900 // assert(size >= 0 && pos >= 0);
1901 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1903 fastmap = bufp->fastmap;
1904 translate = bufp->translate;
1905 if (fastmap && !bufp->fastmap_accurate) {
1906 re_compile_fastmap(bufp);
1911 anchor = bufp->anchor;
1912 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1928 for (; range >= 0; range--, pos += dir) {
1930 if (dir == 1) { /* searching forwards */
1932 text = string + pos;
1933 partend = string + size;
1936 while (text != partend &&
1937 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1940 while (text != partend && !fastmap[(unsigned char)*text])
1942 pos += text - partstart;
1943 range -= text - partstart;
1944 if (pos == size && bufp->can_be_null == 0)
1946 } else { /* searching backwards */
1947 text = string + pos;
1948 partstart = string + pos - range;
1951 while (text != partstart && !fastmap[(unsigned char)
1952 translate[(unsigned char)*text]])
1955 while (text != partstart && !fastmap[(unsigned char)*text])
1957 pos -= partend - text;
1958 range -= partend - text;
1961 if (anchor == 1) { /* anchored to begline */
1962 if (pos > 0 && (string[pos - 1] != '\n'))
1965 // assert(pos >= 0 && pos <= size);
1966 ret = re_match(bufp, string, size, pos, regs);
1978 ** c-file-style: "python"