📄 regex.c
字号:
{start,stop}_memory, the maximum number of groups we can report things about is what fits in that byte. */#define MAX_REGNUM 255/* But patterns can have more than `MAX_REGNUM' registers. We just ignore the excess. */typedef unsigned regnum_t;/* Macros for the compile stack. *//* Since offsets can go either forwards or backwards, this type needs to be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. *//* int may be not enough when sizeof(int) == 2. */typedef long pattern_offset_t;typedef struct{ pattern_offset_t begalt_offset; pattern_offset_t fixup_alt_jump; pattern_offset_t inner_group_offset; pattern_offset_t laststart_offset; regnum_t regnum;} compile_stack_elt_t;typedef struct{ compile_stack_elt_t *stack; unsigned size; unsigned avail; /* Offset of next open position. */} compile_stack_type;#define INIT_COMPILE_STACK_SIZE 32#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)/* The next available element. */#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])/* Set the bit for character C in a list. */#define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern. */#define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while (ISDIGIT (c)) \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ }#if defined _LIBC || WIDE_CHAR_SUPPORT/* The GNU C library provides support for user-defined character classes and the functions from ISO C amendement 1. */# ifdef CHARCLASS_NAME_MAX# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX# else/* This shouldn't happen but some implementation might still have this problem. Use a reasonable default value. */# define CHAR_CLASS_MAX_LENGTH 256# endif# ifdef _LIBC# define IS_CHAR_CLASS(string) __wctype (string)# else# define IS_CHAR_CLASS(string) wctype (string)# endif#else# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */# define IS_CHAR_CLASS(string) \ (STREQ (string, "alpha") || STREQ (string, "upper") \ || STREQ (string, "lower") || STREQ (string, "digit") \ || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank"))#endif#ifndef MATCH_MAY_ALLOCATE/* If we cannot allocate large objects within re_match_2_internal, we make the fail stack and register vectors global. The fail stack, we grow to the maximum size when a regexp is compiled. The register vectors, we adjust in size each time we compile a regexp, according to the number of registers it needs. */static fail_stack_type fail_stack;/* Size with which the following vectors are currently allocated. That is so we can make them bigger as needed, but never make them smaller. */static int regs_allocated_size;static const char ** regstart, ** regend;static const char ** old_regstart, ** old_regend;static const char **best_regstart, **best_regend;static register_info_type *reg_info;static const char **reg_dummy;static register_info_type *reg_info_dummy;/* Make the register vectors big enough for NUM_REGS registers, but don't make them smaller. */staticregex_grow_registers (num_regs) int num_regs;{ if (num_regs > regs_allocated_size) { RETALLOC_IF (regstart, num_regs, const char *); RETALLOC_IF (regend, num_regs, const char *); RETALLOC_IF (old_regstart, num_regs, const char *); RETALLOC_IF (old_regend, num_regs, const char *); RETALLOC_IF (best_regstart, num_regs, const char *); RETALLOC_IF (best_regend, num_regs, const char *); RETALLOC_IF (reg_info, num_regs, register_info_type); RETALLOC_IF (reg_dummy, num_regs, const char *); RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); regs_allocated_size = num_regs; }}#endif /* not MATCH_MAY_ALLOCATE */static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type compile_stack, regnum_t regnum));/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. Returns one of error codes defined in `regex.h', or zero for success. Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. If it succeeds, results are put in BUFP (if it returns an error, the contents of BUFP are undefined): `buffer' is the compiled pattern; `syntax' is set to SYNTAX; `used' is set to the length of the compiled pattern; `fastmap_accurate' is zero; `re_nsub' is the number of subexpressions in PATTERN; `not_bol' and `not_eol' are zero; The `fastmap' and `newline_anchor' fields are neither examined nor set. *//* Return, freeing storage we allocated. */#define FREE_STACK_RETURN(value) \ return (free (compile_stack.stack), value)static reg_errcode_tregex_compile (pattern, size, syntax, bufp) const char *pattern; size_t size; reg_syntax_t syntax; struct re_pattern_buffer *bufp;{ /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ register unsigned char c, c1; /* A random temporary spot in PATTERN. */ const char *p1; /* Points to the end of the buffer, where we should append. */ register unsigned char *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ const char *p = pattern; const char *pend = pattern + size; /* How to translate the characters in the pattern. */ RE_TRANSLATE_TYPE translate = bufp->translate; /* Address of the count-byte of the most recently inserted `exactn' command. This makes it possible to tell if a new exact-match character can be added to that command or if the character requires a new `exactn' command. */ unsigned char *pending_exact = 0; /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its operand. Reset at the beginning of groups and alternatives. */ unsigned char *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ unsigned char *begalt; /* Place in the uncompiled pattern (i.e., the {) to which to go back if the interval is invalid. */ const char *beg_interval; /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ unsigned char *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the matching close-group on the compile stack, so the same register number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0;#ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) { unsigned debug_count; for (debug_count = 0; debug_count < size; debug_count++) putchar (pattern[debug_count]); putchar ('\n'); }#endif /* DEBUG */ /* Initialize the compile stack. */ compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) return REG_ESPACE; compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; /* Initialize the pattern buffer. */ bufp->syntax = syntax; bufp->fastmap_accurate = 0; bufp->not_bol = bufp->not_eol = 0; /* Set `used' to zero, so that if we return an error, the pattern printer (for debugging) will think there's no pattern. We reset it at the end. */ bufp->used = 0; /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0;#if !defined emacs && !defined SYNTAX_TABLE /* Initialize the syntax table. */ init_syntax_once ();#endif if (bufp->allocated == 0) { if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc enough space. This loses if buffer's address is bogus, but that is the user's responsibility. */ RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); } else { /* Caller did not allocate a buffer. Do it for them. */ bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); } if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); bufp->allocated = INIT_BUF_SIZE; } begalt = b = bufp->buffer; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) { PATFETCH (c); switch (c) { case '^': { if ( /* If at start of pattern, it's an operator. */ p == pattern + 1 /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p (pattern, p, syntax)) BUF_PUSH (begline); else goto normal_char; } break; case '$': { if ( /* If at end of pattern, it's an operator. */ p == pend /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p (p, pend, syntax)) BUF_PUSH (endline); else goto normal_char; } break; case '+': case '?': if ((syntax & RE_BK_PLUS_QM) || (syntax & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern... */ if (!laststart) { if (syntax & RE_CONTEXT_INVALID_OPS) FREE_STACK_RETURN (REG_BADRPT); else if (!(syntax & RE_CONTEXT_INDEP_OPS)) goto normal_char; } { /* Are we optimizing this jump? */ boolean keep_string_p = false; /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine interval operators with these because of, e.g., `a{2}*', which should only match an even number of `a's. */ for (;;) { zero_times_ok |= c != '+'; many_times_ok |= c != '?'; if (p == pend) break; PATFETCH (c); if (c == '*' || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) ; else if (syntax & RE_BK_PLUS_QM && c == '\\') { if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); PATFETCH (c1); if (!(c1 == '+' || c1 == '?')) { PATUNFETCH; PATUNFETCH; break; } c = c1; } else { PATUNFETCH; break; } /* If we get here, we found another repeat character. */ } /* Star, etc. applied to an empty pattern is equivalent to an empty pattern. */ if (!laststart) break; /* Now we know whether or not zero matches is allowed and also whether or not two or more matches is allowed. */ if (many_times_ok) { /* More than one repetition is allowed, so put in at the end a backward relative jump from `b' to before the next jump we're going to put in below (which jumps from laststart to after this jump). But if we are at the `*' in the exact sequence `.*\n', insert an unconditional jump backwards to the ., instead of the beginning of the loop. This way we only push a failure point once, instead of every time through the loop. */ assert (p - 1 > pattern); /* Allocate the space for the jump. */ GET_BUFFER_SPACE (3); /* We know we are not at the first character of the pattern, because laststart was nonzero. And we've already incremented `p', by the wa
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -