📄 x_regex.c
字号:
/* Store a jump with opcode OP at LOC to location TO. We store a relative address offset by the three bytes the jump itself occupies. */#define STORE_JUMP(op, loc, to) \ store_op1 (op, loc, (UChar *)(to) - (UChar *)(loc) - 3)/* Likewise, for a two-argument jump. */#define STORE_JUMP2(op, loc, to, arg) \ store_op2 (op, loc, (UChar *)(to) - (UChar *)(loc) - 3, (Int32)arg)/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */#define INSERT_JUMP(op, loc, to) \ insert_op1 (op, (UChar *) (loc), (UChar *)(to) - (UChar *)(loc) - 3, b)/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */#define INSERT_JUMP2(op, loc, to, arg) \ insert_op2 (op, (UChar *) (loc), (UChar *)(to) - (UChar *)(loc) - 3, arg, b)/* This is not an arbitrary limit: the arguments which represent offsets into the pattern are two bytes long. So if 2^16 bytes turns out to be too small, many things would have to change. */#define MAX_BUF_SIZE (1L << 16)/* Extend the buffer by twice its current size via realloc and reset the pointers that pointed into the old block to point to the correct places in the new one. If extending the buffer results in it being larger than MAX_BUF_SIZE, then flag memory exhausted. */#define EXTEND_BUFFER() \ do { \ UChar *old_buffer = bufp->buffer; \ if (bufp->allocated == MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ bufp->allocated = MAX_BUF_SIZE; \ bufp->buffer = (UChar *) realloc_forced (bufp->buffer, bufp->allocated);\ if (bufp->buffer == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ if (old_buffer != bufp->buffer) \ { \ b = (b - old_buffer) + bufp->buffer; \ begalt = (begalt - old_buffer) + bufp->buffer; \ if (fixup_alt_jump) \ fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ if (laststart) \ laststart = (laststart - old_buffer) + bufp->buffer; \ if (pending_exact) \ pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ } \ } while (0)/* Since we have one byte reserved for the register number argument to {start,stop}_memory, the maximum number of groups we can report things about is what fits in that byte. */#define MAX_REGNUM 255/* But patterns can have more than `MAX_REGNUM' registers. We just ignore the excess. */typedef Uns32 regnum_t;/* Macros for the compile stack. *//* Since offsets can go either forwards or backwards, this type needs to be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */typedef Int32 pattern_offset_t;typedef struct{ pattern_offset_t begalt_offset; pattern_offset_t fixup_alt_jump; pattern_offset_t inner_group_offset; pattern_offset_t laststart_offset; regnum_t regnum;} compile_stack_elt_t;typedef struct{ compile_stack_elt_t *stack; Uns32 size; Uns32 avail; /* Offset of next open position. */} compile_stack_type;#define INIT_COMPILE_STACK_SIZE 32#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)/* The next available element. */#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])/* Set the bit for character C in a list. */#define SET_LIST_BIT(c) \ (b[((UChar) (c)) / BYTEWIDTH] \ |= 1 << (((UChar) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern. */#define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while (ISDIGIT (c)) \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ } #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */#define IS_CHAR_CLASS(string) \ (STREQ (string, "alpha") || STREQ (string, "upper") \ || STREQ (string, "lower") || STREQ (string, "digit") \ || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank"))/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. Returns one of error codes defined in `regex.h', or zero for success. Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. If it succeeds, results are put in BUFP (if it returns an error, the contents of BUFP are undefined): `buffer' is the compiled pattern; `syntax' is set to SYNTAX; `used' is set to the length of the compiled pattern; `fastmap_accurate' is zero; `re_nsub' is the number of subexpressions in PATTERN; `not_bol' and `not_eol' are zero; The `fastmap' and `newline_anchor' fields are neither examined nor set. */static reg_errcode_tregex_compile (pattern, size, syntax, bufp) UChar *pattern; Int32 size; reg_syntax_t syntax; struct re_pattern_buffer *bufp;{ /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ register UChar c, c1; /* A random tempory spot in PATTERN. */ UChar *p1; /* Points to the end of the buffer, where we should append. */ register UChar *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ UChar *p = pattern; UChar *pend = pattern + size; /* How to translate the characters in the pattern. */ UChar *translate = bufp->translate; /* Address of the count-byte of the most recently inserted `exactn' command. This makes it possible to tell if a new exact-match character can be added to that command or if the character requires a new `exactn' command. */ UChar *pending_exact = 0; /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its operand. Reset at the beginning of groups and alternatives. */ UChar *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ UChar *begalt; /* Place in the uncompiled pattern (i.e., the {) to which to go back if the interval is invalid. */ UChar *beg_interval; /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ UChar *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the matching close-group on the compile stack, so the same register number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0;#ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) { Uns32 debug_count; for (debug_count = 0; debug_count < size; debug_count++) printchar (pattern[debug_count]); putchar ('\n'); }#endif /* DEBUG */ /* Initialize the compile stack. */ compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) return REG_ESPACE; compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; /* Initialize the pattern buffer. */ bufp->syntax = syntax; bufp->fastmap_accurate = 0; bufp->not_bol = bufp->not_eol = 0; /* Set `used' to zero, so that if we return an error, the pattern printer (for debugging) will think there's no pattern. We reset it at the end. */ bufp->used = 0; /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0; #if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once ();#endif if (bufp->allocated == 0) { if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc enough space. This loses if buffer's address is bogus, but that is the user's responsibility. */ RETALLOC (bufp->buffer, INIT_BUF_SIZE, UChar); } else { /* Caller did not allocate a buffer. Do it for them. */ bufp->buffer = TALLOC (INIT_BUF_SIZE, UChar); } if (!bufp->buffer) return REG_ESPACE; bufp->allocated = INIT_BUF_SIZE; } begalt = b = bufp->buffer; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) { PATFETCH (c); switch (c) { case '^': { if ( /* If at start of pattern, it's an operator. */ p == pattern + 1 /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p (pattern, p, syntax)) BUF_PUSH (begline); else goto normal_char; } break; case '$': { if ( /* If at end of pattern, it's an operator. */ p == pend /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p (p, pend, syntax)) BUF_PUSH (endline); else goto normal_char; } break; case '+': case '?': if ((syntax & RE_BK_PLUS_QM) || (syntax & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern... */ if (!laststart) { if (syntax & RE_CONTEXT_INVALID_OPS) return REG_BADRPT; else if (!(syntax & RE_CONTEXT_INDEP_OPS)) goto normal_char; } { /* Are we optimizing this jump? */ boolean keep_string_p = false; /* 1 means zero (many) matches is allowed. */ UChar zero_times_ok = 0, many_times_ok = 0; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine interval operators with these because of, e.g., `a{2}*', which should only match an even number of `a's. */ for (;;) { zero_times_ok |= c != '+'; many_times_ok |= c != '?'; if (p == pend) break; PATFETCH (c); if (c == '*' || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) ; else if (syntax & RE_BK_PLUS_QM && c == '\\') { if (p == pend) return REG_EESCAPE; PATFETCH (c1); if (!(c1 == '+' || c1 == '?')) { PATUNFETCH; PATUNFETCH; break; } c = c1; } else { PATUNFETCH; break; } /* If we get here, we found another repeat character. */ } /* Star, etc. applied to an empty pattern is equivalent to an empty pattern. */ if (!laststart) break; /* Now we know whether or not zero matches is allowed and also whether or not two or more matches is allowed. */ if (many_times_ok) { /* More than one repetition is allowed, so put in at the end a backward relative jump from `b' to before the next jump we're going to put in below (which jumps from laststart to after this jump). But if we are at the `*' in the exact sequence `.*\n', insert an unconditional jump backwards to the ., instead of the beginning of the loop. This way we only push a failure point once, instead of every time through the loop. */ assert (p - 1 > pattern); /* Allocate the space for the jump. */ GET_BUFFER_SPACE (3); /* We know we are not at the first character of the pattern, because laststart was nonzero. And we've already incremented `p', by the way, to be the character after the `*'. Do we have to do something analogous here for null bytes, because of RE_DOT_NOT_NULL? */ if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') && zero_times_ok && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP (jump, b, laststart); keep_string_p = true; } else /* Anything else. */ STORE_JUMP (maybe_pop_jump, b, laststart - 3); /* We've added more stuff to the buffer. */ b += 3; } /* On failure, jump from laststart to b + 3, which will be the end of the buffer after this jump is inserted. */ GET_BUFFER_SPACE (3); INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump : on_failure_jump, laststart, b + 3); pending_exact = 0; b += 3; if (!zero_times_ok) { /* At least one repetition is required, so insert a `dummy_failure_jump' before the initial `on_failure_jump' instruction of the loop. This effects a skip over that instruction the first time we hit that loop. */ GET_BUFFER_SPACE (3); INSERT_JUMP (dummy_failure_jump, laststart, (laststart + 6)); b += 3; } } break; case '.': laststart = b; BUF_PUSH (anychar); break; case '[': { boolean had_char_class = false; if (p == pend) return REG_EBRACK; /* Ensure that we have enough space to push a charset: the opcode, the length count, and the bitset; 34 bytes in all. */ GET_BUFFER_SPACE (34); laststart = b; /* We test `*p == '^' twice, instead of using an if statement, so we only need one BUF_PUSH. */ BUF_PUSH (*p == '^' ? charset_not : charset); if (*p == '^') p++; /* Remember the first position in the bracket expression. */ p1 = p; /* Push the number of bytes in the bitmap. */ BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); /* Clear the whole map. */ bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); /* charset_not matches newline according to a syntax bit. */ if ((re_opcode_t) b[-2] == charset_not && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT ('\n'); /* Read in characters and ranges, setting map bits. */ for (;;) { if (p == pend) return REG_EBRACK;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -