📄 regex.c
字号:
/* Structure to manage work area for range table. */struct range_table_work_area{ int *table; /* actual work area. */ int allocated; /* allocated size for work area in bytes. */ int used; /* actually used size in words. */};/* Make sure that WORK_AREA can hold more N multibyte characters. */#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \ do { \ if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \ { \ (work_area).allocated += 16 * sizeof (int); \ if ((work_area).table) \ (work_area).table \ = (int *) realloc ((work_area).table, (work_area).allocated); \ else \ (work_area).table \ = (int *) malloc ((work_area).allocated); \ if ((work_area).table == 0) \ FREE_STACK_RETURN (REG_ESPACE); \ } \ } while (0)/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ do { \ EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \ (work_area).table[(work_area).used++] = (range_start); \ (work_area).table[(work_area).used++] = (range_end); \ } while (0)/* Free allocated memory for WORK_AREA. */#define FREE_RANGE_TABLE_WORK_AREA(work_area) \ do { \ if ((work_area).table) \ free ((work_area).table); \ } while (0)#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0)#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])/* Set the bit for character C in a list. */#define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern. */#define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while (ISDIGIT (c)) \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ }#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */#define IS_CHAR_CLASS(string) \ (STREQ (string, "alpha") || STREQ (string, "upper") \ || STREQ (string, "lower") || STREQ (string, "digit") \ || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank"))#ifndef MATCH_MAY_ALLOCATE/* If we cannot allocate large objects within re_match_2_internal, we make the fail stack and register vectors global. The fail stack, we grow to the maximum size when a regexp is compiled. The register vectors, we adjust in size each time we compile a regexp, according to the number of registers it needs. */static fail_stack_type fail_stack;/* Size with which the following vectors are currently allocated. That is so we can make them bigger as needed, but never make them smaller. */static int regs_allocated_size;static const char ** regstart, ** regend;static const char ** old_regstart, ** old_regend;static const char **best_regstart, **best_regend;static register_info_type *reg_info;static const char **reg_dummy;static register_info_type *reg_info_dummy;/* Make the register vectors big enough for NUM_REGS registers, but don't make them smaller. */staticregex_grow_registers (num_regs) int num_regs;{ if (num_regs > regs_allocated_size) { RETALLOC_IF (regstart, num_regs, const char *); RETALLOC_IF (regend, num_regs, const char *); RETALLOC_IF (old_regstart, num_regs, const char *); RETALLOC_IF (old_regend, num_regs, const char *); RETALLOC_IF (best_regstart, num_regs, const char *); RETALLOC_IF (best_regend, num_regs, const char *); RETALLOC_IF (reg_info, num_regs, register_info_type); RETALLOC_IF (reg_dummy, num_regs, const char *); RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); regs_allocated_size = num_regs; }}#endif /* not MATCH_MAY_ALLOCATE *//* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. Returns one of error codes defined in `regex.h', or zero for success. Assumes the `allocated' (and perhaps `buffer') and `translate' fields are set in BUFP on entry. If it succeeds, results are put in BUFP (if it returns an error, the contents of BUFP are undefined): `buffer' is the compiled pattern; `syntax' is set to SYNTAX; `used' is set to the length of the compiled pattern; `fastmap_accurate' is zero; `re_nsub' is the number of subexpressions in PATTERN; `not_bol' and `not_eol' are zero; The `fastmap' and `newline_anchor' fields are neither examined nor set. *//* Return, freeing storage we allocated. */#define FREE_STACK_RETURN(value) \ do { \ FREE_RANGE_TABLE_WORK_AREA (range_table_work); \ free (compile_stack.stack); \ return value; \ } while (0)static reg_errcode_tregex_compile (pattern, size, syntax, bufp) const char *pattern; int size; reg_syntax_t syntax; struct re_pattern_buffer *bufp;{ /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ register unsigned int c, c1; /* A random temporary spot in PATTERN. */ const char *p1; /* Points to the end of the buffer, where we should append. */ register unsigned char *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */#ifdef AIX /* `const' makes AIX compiler fail. */ char *p = pattern;#else const char *p = pattern;#endif const char *pend = pattern + size; /* How to translate the characters in the pattern. */ RE_TRANSLATE_TYPE translate = bufp->translate; /* Address of the count-byte of the most recently inserted `exactn' command. This makes it possible to tell if a new exact-match character can be added to that command or if the character requires a new `exactn' command. */ unsigned char *pending_exact = 0; /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its operand. Reset at the beginning of groups and alternatives. */ unsigned char *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ unsigned char *begalt; /* Place in the uncompiled pattern (i.e., the {) to which to go back if the interval is invalid. */ const char *beg_interval; /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ unsigned char *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the matching close-group on the compile stack, so the same register number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0; /* Work area for range table of charset. */ struct range_table_work_area range_table_work;#ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) { unsigned debug_count; for (debug_count = 0; debug_count < size; debug_count++) putchar (pattern[debug_count]); putchar ('\n'); }#endif /* DEBUG */ /* Initialize the compile stack. */ compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) return REG_ESPACE; compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; range_table_work.table = 0; range_table_work.allocated = 0; /* Initialize the pattern buffer. */ bufp->syntax = syntax; bufp->fastmap_accurate = 0; bufp->not_bol = bufp->not_eol = 0; /* Set `used' to zero, so that if we return an error, the pattern printer (for debugging) will think there's no pattern. We reset it at the end. */ bufp->used = 0; /* Always count groups, whether or not bufp->no_sub is set. */ bufp->re_nsub = 0;#ifdef emacs /* bufp->multibyte is set before regex_compile is called, so don't alter it. */#else /* not emacs */ /* Nothing is recognized as a multibyte character. */ bufp->multibyte = 0;#endif#if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once ();#endif if (bufp->allocated == 0) { if (bufp->buffer) { /* If zero allocated, but buffer is non-null, try to realloc enough space. This loses if buffer's address is bogus, but that is the user's responsibility. */ RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); } else { /* Caller did not allocate a buffer. Do it for them. */ bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); } if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); bufp->allocated = INIT_BUF_SIZE; } begalt = b = bufp->buffer; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) { PATFETCH (c); switch (c) { case '^': { if ( /* If at start of pattern, it's an operator. */ p == pattern + 1 /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p (pattern, p, syntax)) BUF_PUSH (begline); else goto normal_char; } break; case '$': { if ( /* If at end of pattern, it's an operator. */ p == pend /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p (p, pend, syntax)) BUF_PUSH (endline); else goto normal_char; } break; case '+': case '?': if ((syntax & RE_BK_PLUS_QM) || (syntax & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern... */ if (!laststart) { if (syntax & RE_CONTEXT_INVALID_OPS) FREE_STACK_RETURN (REG_BADRPT); else if (!(syntax & RE_CONTEXT_INDEP_OPS)) goto normal_char; } { /* Are we optimizing this jump? */ boolean keep_string_p = false; /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine interval operators with these because of, e.g., `a{2}*', which should only match an even number of `a's. */ for (;;) { zero_times_ok |= c != '+'; many_times_ok |= c != '?'; if (p == pend) break; PATFETCH (c); if (c == '*' || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) ; else if (syntax & RE_BK_PLUS_QM && c == '\\') { if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); PATFETCH (c1); if (!(c1 == '+' || c1 == '?')) { PATUNFETCH; PATUNFETCH; break; } c = c1; } else { PATUNFETCH; break; } /* If we get here, we found another repeat character. */ } /* Star, etc. applied to an empty pattern is equivalent to an empty pattern. */ if (!laststart) break; /* Now we know whether or not zero matches is allowed and also whether or not two or more matches is allowed. */ if (many_times_ok) { /* More than one repetition is allowed, so put in at the end a backward relative jump from `b' to before the next jump we're going to put in below (which jumps from laststart to after this jump). But if we are at the `*' in the exact sequence `.*\n', insert an unconditional jump backwards to the ., instead of the beginning of the loop. This way we only push a failure point once, instead of every time through the loop. */ assert (p - 1 > pattern); /* Allocate the space for the jump. */ GET_BUFFER_SPACE (3); /* We know we are not at the first character of the pattern, because laststart was nonzero. And we've already incremented `p', by the way, to be the character after the `*'. Do we have to do something analogous here for null bytes, because of RE_DOT_NOT_NULL? */ if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.') && zero_times_ok && p < pend && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n') && !(syntax & RE_DOT_NEWLINE)) { /* We have .*\n. */ STORE_JUMP (jump, b, laststart); keep_string_p = true; } else /* Anything else. */ STORE_JUMP (maybe_pop_jump, b, lasts
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -