📄 regex.c
字号:
#define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern. */#define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while ('0' <= c && c <= '9') \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ }#if defined _LIBC || WIDE_CHAR_SUPPORT/* The GNU C library provides support for user-defined character classes and the functions from ISO C amendement 1. */# ifdef CHARCLASS_NAME_MAX# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX# else/* This shouldn't happen but some implementation might still have this problem. Use a reasonable default value. */# define CHAR_CLASS_MAX_LENGTH 256# endif# ifdef _LIBC# define IS_CHAR_CLASS(string) __wctype (string)# else# define IS_CHAR_CLASS(string) wctype (string)# endif#else# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */# define IS_CHAR_CLASS(string) \ (STREQ (string, "alpha") || STREQ (string, "upper") \ || STREQ (string, "lower") || STREQ (string, "digit") \ || STREQ (string, "alnum") || STREQ (string, "xdigit") \ || STREQ (string, "space") || STREQ (string, "print") \ || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank"))#endif#ifndef MATCH_MAY_ALLOCATE/* If we cannot allocate large objects within re_match_2_internal, we make the fail stack and register vectors global. The fail stack, we grow to the maximum size when a regexp is compiled. The register vectors, we adjust in size each time we compile a regexp, according to the number of registers it needs. */static fail_stack_type fail_stack;/* Size with which the following vectors are currently allocated. That is so we can make them bigger as needed, but never make them smaller. */static int regs_allocated_size;static const char **regstart, **regend;static const char **old_regstart, **old_regend;static const char **best_regstart, **best_regend;static register_info_type *reg_info;static const char **reg_dummy;static register_info_type *reg_info_dummy;/* Make the register vectors big enough for NUM_REGS registers, but don't make them smaller. */static regex_grow_registers(num_regs)int num_regs;{ if (num_regs > regs_allocated_size) { RETALLOC_IF(regstart, num_regs, const char *); RETALLOC_IF(regend, num_regs, const char *); RETALLOC_IF(old_regstart, num_regs, const char *); RETALLOC_IF(old_regend, num_regs, const char *); RETALLOC_IF(best_regstart, num_regs, const char *); RETALLOC_IF(best_regend, num_regs, const char *); RETALLOC_IF(reg_info, num_regs, register_info_type); RETALLOC_IF(reg_dummy, num_regs, const char *); RETALLOC_IF(reg_info_dummy, num_regs, register_info_type); regs_allocated_size = num_regs; }}#endif /* not MATCH_MAY_ALLOCATE *//* Subroutines for `regex_compile'. *//* Store OP at LOC followed by two-byte integer parameter ARG. */static inline void store_op1(op, loc, arg)re_opcode_t op;unsigned char *loc;int arg;{ *loc = (unsigned char) op; STORE_NUMBER(loc + 1, arg);}/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */static void store_op2(op, loc, arg1, arg2)re_opcode_t op;unsigned char *loc;int arg1, arg2;{ *loc = (unsigned char) op; STORE_NUMBER(loc + 1, arg1); STORE_NUMBER(loc + 3, arg2);}/* Copy the bytes from LOC to END to open up three bytes of space at LOC for OP followed by two-byte integer parameter ARG. */static void insert_op1(op, loc, arg, end)re_opcode_t op;unsigned char *loc;int arg;unsigned char *end;{ register unsigned char *pfrom = end; register unsigned char *pto = end + 3; while (pfrom != loc) *--pto = *--pfrom; store_op1(op, loc, arg);}/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */static void insert_op2(op, loc, arg1, arg2, end)re_opcode_t op;unsigned char *loc;int arg1, arg2;unsigned char *end;{ register unsigned char *pfrom = end; register unsigned char *pto = end + 5; while (pfrom != loc) *--pto = *--pfrom; store_op2(op, loc, arg1, arg2);}/* P points to just after a ^ in PATTERN. Return true if that ^ comes after an alternative or a begin-subexpression. We assume there is at least one character before the ^. */static boolean at_begline_loc_p(pattern, p, syntax)const char *pattern, *p;reg_syntax_t syntax;{ const char *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return /* After a subexpression? */ (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) /* After an alternative? */ || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));}/* The dual of at_begline_loc_p. This one is for $. We assume there is at least one character after the $, i.e., `P < PEND'. */static boolean at_endline_loc_p(p, pend, syntax)const char *p, *pend;reg_syntax_t syntax;{ const char *next = p; boolean next_backslash = *next == '\\'; const char *next_next = p + 1 < pend ? p + 1 : 0; return /* Before a subexpression? */ (syntax & RE_NO_BK_PARENS ? *next == ')' : next_backslash && next_next && *next_next == ')') /* Before an alternative? */ || (syntax & RE_NO_BK_VBAR ? *next == '|' : next_backslash && next_next && *next_next == '|');}/* Returns true if REGNUM is in one of COMPILE_STACK's elements and false if it's not. */static boolean group_in_compile_stack _RE_ARGS((compile_stack_type compile_stack, regnum_t regnum));static boolean group_in_compile_stack(compile_stack, regnum)compile_stack_type compile_stack;regnum_t regnum;{ int this_element; for (this_element = compile_stack.avail - 1; this_element >= 0; this_element--) if (compile_stack.stack[this_element].regnum == regnum) return true; return false;}/* Read the ending character of a range (in a bracket expression) from the uncompiled pattern *P_PTR (which ends at PEND). We assume the starting character is in `P[-2]'. (`P[-1]' is the character `-'.) Then we set the translation of all bits between the starting and ending characters (inclusive) in the compiled pattern B. Return an error code. We use these short variable names so we can use the same macros as `regex_compile' itself. */static reg_errcode_t compile_range(p_ptr, pend, translate, syntax, b)const char **p_ptr, *pend;RE_TRANSLATE_TYPE translate;reg_syntax_t syntax;unsigned char *b;{ unsigned this_char; const char *p = *p_ptr; reg_errcode_t ret; char range_start[2]; char range_end[2]; char ch[2]; if (p == pend) return REG_ERANGE; /* Fetch the endpoints without translating them; the appropriate translation is done in the bit-setting loop below. */ range_start[0] = p[-2]; range_start[1] = '\0'; range_end[0] = p[0]; range_end[1] = '\0'; /* Have to increment the pointer into the pattern string, so the caller isn't still at the ending character. */ (*p_ptr)++; /* Report an error if the range is empty and the syntax prohibits this. */ ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; /* Here we see why `this_char' has to be larger than an `unsigned char' -- we would otherwise go into an infinite loop, since all characters <= 0xff. */ ch[1] = '\0'; for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) { ch[0] = this_char; if (strcoll(range_start, ch) <= 0 && strcoll(ch, range_end) <= 0) { SET_LIST_BIT(TRANSLATE(this_char)); ret = REG_NOERROR; } } return ret;}/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible characters can start a string that matches the pattern. This fastmap is used by re_search to skip quickly over impossible starting points. The caller must supply the address of a (1 << BYTEWIDTH)-byte data area as BUFP->fastmap. We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in the pattern buffer. Returns 0 if we succeed, -2 if an internal error. */int re_compile_fastmap(bufp)struct re_pattern_buffer *bufp;{ int j, k;#ifdef MATCH_MAY_ALLOCATE fail_stack_type fail_stack;#endif#ifndef REGEX_MALLOC char *destination;#endif register char *fastmap = bufp->fastmap; unsigned char *pattern = bufp->buffer; unsigned char *p = pattern; register unsigned char *pend = pattern + bufp->used;#ifdef REL_ALLOC /* This holds the pointer to the failure stack, when it is allocated relocatably. */ fail_stack_elt_t *failure_stack_ptr;#endif /* Assume that each path through the pattern can be null until proven otherwise. We set this false at the bottom of switch statement, to which we get only if a particular path doesn't match the empty string. */ boolean path_can_be_null = true; /* We aren't doing a `succeed_n' to begin with. */ boolean succeed_n_p = false; assert(fastmap != NULL && p != NULL); INIT_FAIL_STACK(); bzero(fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ bufp->fastmap_accurate = 1; /* It will be when we're done. */ bufp->can_be_null = 0; while (1) { if (p == pend || *p == succeed) { /* We have reached the (effective) end of pattern. */ if (!FAIL_STACK_EMPTY()) { bufp->can_be_null |= path_can_be_null; /* Reset for next path. */ path_can_be_null = true; p = fail_stack.stack[--fail_stack.avail].pointer; continue; } else break; } /* We should never be about to go beyond the end of the pattern. */ assert(p < pend); switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) { /* I guess the idea here is to simply not bother with a fastmap if a backreference is used, since it's too hard to figure out the fastmap for the corresponding group. Setting `can_be_null' stops `re_search_2' from using the fastmap, so that is all we do. */ case duplicate: bufp->can_be_null = 1; goto done; /* Following are the cases which match a character. These end with `break'. */ case exactn: fastmap[p[1]] = 1; break; case charset: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) fastmap[j] = 1; break; case charset_not: /* Chars beyond end of map must be allowed. */ for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) fastmap[j] = 1; break; case wordchar: for (j = 0; j < (1 << BYTEWIDTH); j++) if (SYNTAX(j) == Sword) fastmap[j] = 1; break; case notwordchar: for (j = 0; j < (1 << BYTEWIDTH); j++) if (SYNTAX(j) != Sword) fastmap[j] = 1; break; case anychar: { int fastmap_newline = fastmap['\n']; /* `.' matches anything ... */ for (j = 0; j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; /* ... except perhaps newline. */ if (!(bufp->syntax & RE_DOT_NEWLINE)) fastmap['\n'] = fastmap_newline; /* Return if we have already set `can_be_null'; if we have, then the fastmap is irrelevant. Something's wrong here. */ else if (bufp->can_be_null) goto done; /* Otherwise, have to check alternative paths. */ break; }#ifdef emacs case syntaxspec: k = *p++; for (j = 0; j < (1 << BYTEWIDTH); j++) if (SYNTAX
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -