📄 gnuregex.c
字号:
* correct places in the new one. If extending the buffer results in it * being larger than MAX_BUF_SIZE, then flag memory exhausted. */#define EXTEND_BUFFER() \ do { \ unsigned char *old_buffer = bufp->buffer; \ if (bufp->allocated == MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ bufp->allocated = MAX_BUF_SIZE; \ bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ if (bufp->buffer == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ if (old_buffer != bufp->buffer) \ { \ b = (b - old_buffer) + bufp->buffer; \ begalt = (begalt - old_buffer) + bufp->buffer; \ if (fixup_alt_jump) \ fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ if (laststart) \ laststart = (laststart - old_buffer) + bufp->buffer; \ if (pending_exact) \ pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ } \ } while (0)/* Since we have one byte reserved for the register number argument to * {start,stop}_memory, the maximum number of groups we can report * things about is what fits in that byte. */#define MAX_REGNUM 255/* But patterns can have more than `MAX_REGNUM' registers. We just * ignore the excess. */typedef unsigned regnum_t;/* Macros for the compile stack. *//* Since offsets can go either forwards or backwards, this type needs to * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */typedef int pattern_offset_t;typedef struct { pattern_offset_t begalt_offset; pattern_offset_t fixup_alt_jump; pattern_offset_t inner_group_offset; pattern_offset_t laststart_offset; regnum_t regnum;} compile_stack_elt_t;typedef struct { compile_stack_elt_t *stack; unsigned size; unsigned avail; /* Offset of next open position. */} compile_stack_type;#define INIT_COMPILE_STACK_SIZE 32#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)/* The next available element. */#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])/* Set the bit for character C in a list. */#define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern. */#define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while (ISDIGIT (c)) \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ }#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */#define IS_CHAR_CLASS(string) \ (STREQ (string, "alpha") || STREQ (string, "upper") \ || STREQ (string, "lower") || STREQ (string, "digit") \ || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank"))/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. * Returns one of error codes defined in `regex.h', or zero for success. * * Assumes the `allocated' (and perhaps `buffer') and `translate' * fields are set in BUFP on entry. * * If it succeeds, results are put in BUFP (if it returns an error, the * contents of BUFP are undefined): * `buffer' is the compiled pattern; * `syntax' is set to SYNTAX; * `used' is set to the length of the compiled pattern; * `fastmap_accurate' is zero; * `re_nsub' is the number of subexpressions in PATTERN; * `not_bol' and `not_eol' are zero; * * The `fastmap' and `newline_anchor' fields are neither * examined nor set. */static reg_errcode_tregex_compile(pattern, size, syntax, bufp) const char *pattern; int size; reg_syntax_t syntax; struct re_pattern_buffer *bufp;{ /* We fetch characters from PATTERN here. Even though PATTERN is * `char *' (i.e., signed), we declare these variables as unsigned, so * they can be reliably used as array indices. */ register unsigned char c, c1; /* A random tempory spot in PATTERN. */ const char *p1; /* Points to the end of the buffer, where we should append. */ register unsigned char *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ const char *p = pattern; const char *pend = pattern + size; /* How to translate the characters in the pattern. */ char *translate = bufp->translate; /* Address of the count-byte of the most recently inserted `exactn' * command. This makes it possible to tell if a new exact-match * character can be added to that command or if the character requires * a new `exactn' command. */ unsigned char *pending_exact = 0; /* Address of start of the most recently finished expression. * This tells, e.g., postfix * where to find the start of its * operand. Reset at the beginning of groups and alternatives. */ unsigned char *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ unsigned char *begalt; /* Place in the uncompiled pattern (i.e., the {) to * which to go back if the interval is invalid. */ const char *beg_interval; /* Address of the place where a forward jump should go to the end of * the containing expression. Each alternative of an `or' -- except the * last -- ends with a forward jump of this sort. */ unsigned char *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the * matching close-group on the compile stack, so the same register * number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0;#ifdef DEBUG DEBUG_PRINT1("\nCompiling pattern: "); if (debug) { unsigned debug_count; for (debug_count = 0; debug_count < size; debug_count++) printchar(pattern[debug_count]); putchar('\n'); }#endif /* DEBUG */ /* Initialize the compile stack. */ compile_stack.stack = TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) return REG_ESPACE; compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; /* Initialize the pattern buffer. */ bufp->syntax = syntax; bufp->fastmap_accurate = 0; bufp->not_bol = bufp->not_eol = 0; /* Set `used' to zero, so that if we return an error, the pattern * printer (for debugging) will think there's no pattern. We reset it * at the end. */ bufp->used = 0; /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0;#if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once();#endif if (bufp->allocated == 0) { if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc * enough space. This loses if buffer's address is bogus, but * that is the user's responsibility. */ RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char); } else { /* Caller did not allocate a buffer. Do it for them. */ bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char); } if (!bufp->buffer) return REG_ESPACE; bufp->allocated = INIT_BUF_SIZE; } begalt = b = bufp->buffer; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) { PATFETCH(c); switch (c) { case '^': { if ( /* If at start of pattern, it's an operator. */ p == pattern + 1 /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p(pattern, p, syntax)) BUF_PUSH(begline); else goto normal_char; } break; case '$': { if ( /* If at end of pattern, it's an operator. */ p == pend /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p(p, pend, syntax)) BUF_PUSH(endline); else goto normal_char; } break; case '+': case '?': if ((syntax & RE_BK_PLUS_QM) || (syntax & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern... */ if (!laststart) { if (syntax & RE_CONTEXT_INVALID_OPS) return REG_BADRPT; else if (!(syntax & RE_CONTEXT_INDEP_OPS)) goto normal_char; } { /* Are we optimizing this jump? */ boolean keep_string_p = false; /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; /* If there is a sequence of repetition chars, collapse it * down to just one (the right one). We can't combine * interval operators with these because of, e.g., `a{2}*', * which should only match an even number of `a's. */ for (;;) { zero_times_ok |= c != '+'; many_times_ok |= c != '?'; if (p == pend) break; PATFETCH(c); if (c == '*' || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))); else if (syntax & RE_BK_PLUS_QM && c == '\\') { if (p == pend) return REG_EESCAPE; PATFETCH(c1); if (!(c1 == '+' || c1 == '?')) { PATUNFETCH; PATUNFETCH; break; } c = c1; } else { PATUNFETCH; break; } /* If we get here, we found another repeat character. */ } /* Star, etc. applied to an empty pattern is equivalent * to an empty pattern. */ if (!laststart) break; /* Now we know whether or not zero matches is allowed * and also whether or not two or more matches is allowed. */ if (many_times_ok) { /* More than one repetition is allowed, so put in at the * end a backward relative jump from `b' to before the next * jump we're going to put in below (which jumps from * laststart to after this jump). * * But if we are at the `*' in the exact sequence `.*\n', * insert an unconditional jump backwards to the ., * instead of the beginning of the loop. This way we only * push a failure point once, instead of every time * through the loop. */ assert(p - 1 > pattern); /* Allocate the space for the jump. */ GET_BUFFER_SPACE(3); /* We know we are not at the first character of the pattern, * because laststart was nonzero. And we've already * incremented `p', by the way, to be the character after * the `*'. Do we have to do something analogous here * for null bytes, because of RE_DOT_NOT_NULL? */ if (TRANSLATE(*(p - 2)) == TRANSLATE('.') && zero_times_ok && p < pend && TRANSLATE(*p) == TRANSLATE('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP(jump, b, laststart); keep_string_p = true; } else /* Anything else. */ STORE_JUMP(maybe_pop_jump, b, laststart - 3); /* We've added more stuff to the buffer. */ b += 3; } /* On failure, jump from laststart to b + 3, which will be the * end of the buffer after this jump is inserted. */ GET_BUFFER_SPACE(3); INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump : on_failure_jump, laststart, b + 3); pending_exact = 0; b += 3; if (!zero_times_ok) { /* At least one repetition is required, so insert a * `dummy_failure_jump' before the initial * `on_failure_jump' instruction of the loop. This * effects a skip over that instruction the first time * we hit that loop. */ GET_BUFFER_SPACE(3); INSERT_JUMP(dummy_failure_jump, laststart, laststart + 6); b += 3; } } break; case '.': laststart = b; BUF_PUSH(anychar); break; case '[': { boolean had_char_class = false; if (p == pend) return REG_EBRACK; /* Ensure that we have enough space to push a charset: the * opcode, the length count, and the bitset; 34 bytes in all. */ GET_BUFFER_SPACE(34); laststart = b; /* We test `*p == '^' twice, instead of using an if * statement, so we only need one BUF_PUSH. */ BUF_PUSH(*p == '^' ? charset_not : charset); if (*p == '^') p++; /* Remember the first position in the bracket expression. */ p1 = p; /* Push the number of bytes in the bitmap. */ BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH); /* Clear the whole map. */ memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH); /* charset_not matches newline according to a syntax bit. */ if ((re_opcode_t) b[-2] == charset_not && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT('\n'); /* Read in characters and ranges, setting map bits. */ for (;;) { if (p == pend) return REG_EBRACK; PATFETCH(c); /* \ might escape characters inside [...] and [^...]. */ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { if (p == pend) return REG_EESCAPE; PATFETCH(c1); SET_LIST_BIT(c1); continue; } /* Could be the end of the bracket expression. If it's * not (i.e., when the bracket expression is `[]' so * far), the ']' character bit gets set way below. */ if (c == ']' && p != p1 + 1) break; /* Look ahead to see if it's a range when the last thing * was a character class. */ if (had_char_class && c == '-' && *p != ']') return REG_ERANGE; /* Look ahead to see if it's a range when the last thing * was a character: if this is a hyphen not at the * beginning or the end of a list, then it's the range * operator. */ if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') && *p != ']') { reg_errcode_t ret = compile_range(&p, pend, translate, syntax, b); if (ret != REG_NOERROR)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -