📄 regularexp.c
字号:
* Process main body of regex or process a parenthesized "thing". * * * * Caller must absorb opening parenthesis. * * * * Combining parenthesis handling with the base level of regular * * expression is a trifle forced, but the need to tie the tails of the * * branches to what follows makes it hard to avoid. * *----------------------------------------------------------------------*/static unsigned char * chunk (int paren, int *flag_param, len_range *range_param) { register unsigned char *ret_val = NULL; register unsigned char *this_branch; register unsigned char *ender = NULL; register int this_paren = 0; int flags_local, first = 1, zero_width, i; int old_sensitive = Is_Case_Insensitive; int old_newline = Match_Newline; len_range range_local; int look_only = 0; unsigned char *emit_look_behind_bounds = NULL; *flag_param = HAS_WIDTH; /* Tentatively. */ range_param->lower = 0; /* Idem */ range_param->upper = 0; /* Make an OPEN node, if parenthesized. */ if (paren == PAREN) { if (Total_Paren >= NSUBEXP) { sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP); REG_FAIL (Error_Text); } this_paren = Total_Paren; Total_Paren++; ret_val = emit_node (OPEN + this_paren); } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) { *flag_param = WORST; /* Look ahead is zero width. */ look_only = 1; ret_val = emit_node (paren); } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) { *flag_param = WORST; /* Look behind is zero width. */ look_only = 1; /* We'll overwrite the zero length later on, so we save the ptr */ ret_val = emit_special (paren, 0, 0); emit_look_behind_bounds = ret_val + NODE_SIZE; } else if (paren == INSENSITIVE) { Is_Case_Insensitive = 1; } else if (paren == SENSITIVE) { Is_Case_Insensitive = 0; } else if (paren == NEWLINE) { Match_Newline = 1; } else if (paren == NO_NEWLINE) { Match_Newline = 0; } /* Pick up the branches, linking them together. */ do { this_branch = alternative (&flags_local, &range_local); if (this_branch == NULL) return (NULL); if (first) { first = 0; *range_param = range_local; if (ret_val == NULL) ret_val = this_branch; } else if (range_param->lower >= 0) { if (range_local.lower >= 0) { if (range_local.lower < range_param->lower) range_param->lower = range_local.lower; if (range_local.upper > range_param->upper) range_param->upper = range_local.upper; } else { range_param->lower = -1; /* Branches have different lengths */ range_param->upper = -1; } } tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */ /* If any alternative could be zero width, consider the whole parenthisized thing to be zero width. */ if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH; /* Are there more alternatives to process? */ if (*Reg_Parse != '|') break; Reg_Parse++; } while (1); /* Make a closing node, and hook it on the end. */ if (paren == PAREN) { ender = emit_node (CLOSE + this_paren); } else if (paren == NO_PAREN) { ender = emit_node (END); } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) { ender = emit_node (LOOK_AHEAD_CLOSE); } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) { ender = emit_node (LOOK_BEHIND_CLOSE); } else { ender = emit_node (NOTHING); } tail (ret_val, ender); /* Hook the tails of the branch alternatives to the closing node. */ for (this_branch = ret_val; this_branch != NULL; ) { branch_tail (this_branch, NODE_SIZE, ender); this_branch = next_ptr (this_branch); } /* Check for proper termination. */ if (paren != NO_PAREN && *Reg_Parse++ != ')') { REG_FAIL ("missing right parenthesis \')\'"); } else if (paren == NO_PAREN && *Reg_Parse != '\0') { if (*Reg_Parse == ')') { REG_FAIL ("missing left parenthesis \'(\'"); } else { REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */ } } /* Check whether look behind has a fixed size */ if (emit_look_behind_bounds) { if (range_param->lower < 0) { REG_FAIL ("look-behind does not have a bounded size"); } if (range_param->upper > 65535L) { REG_FAIL ("max. look-behind size is too large (>65535)") } if (Code_Emit_Ptr != &Compute_Size) { *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower); *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower); *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper); *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper); } } /* For look ahead/behind, the length must be set to zero again */ if (look_only) { range_param->lower = 0; range_param->upper = 0; } zero_width = 0; /* Set a bit in Closed_Parens to let future calls to function `back_ref' know that we have closed this set of parentheses. */ if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) { SET_BIT (Closed_Parens, this_paren); /* Determine if a parenthesized expression is modified by a quantifier that can have zero width. */ if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') { zero_width++; } else if (*(Reg_Parse) == '{' && Brace_Char == '{') { if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') { zero_width++; } else if (*(Reg_Parse + 1) == '0') { i = 2; while (*(Reg_Parse + i) == '0') i++; if (*(Reg_Parse + i) == ',') zero_width++; } } } /* If this set of parentheses is known to never match the empty string, set a bit in Paren_Has_Width to let future calls to function back_ref know that this set of parentheses has non-zero width. This will allow star (*) or question (?) quantifiers to be aplied to a back-reference that refers to this set of parentheses. */ if ((*flag_param & HAS_WIDTH) && paren == PAREN && !zero_width && this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) { SET_BIT (Paren_Has_Width, this_paren); } Is_Case_Insensitive = old_sensitive; Match_Newline = old_newline; return (ret_val);}/*----------------------------------------------------------------------* * alternative * * Processes one alternative of an '|' operator. Connects the NEXT * pointers of each regex atom together sequentialy. *----------------------------------------------------------------------*/static unsigned char * alternative (int *flag_param, len_range *range_param) { register unsigned char *ret_val; register unsigned char *chain; register unsigned char *latest; int flags_local; len_range range_local; *flag_param = WORST; /* Tentatively. */ range_param->lower = 0; /* Idem */ range_param->upper = 0; ret_val = emit_node (BRANCH); chain = NULL; /* Loop until we hit the start of the next alternative, the end of this set of alternatives (end of parentheses), or the end of the regex. */ while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') { latest = piece (&flags_local, &range_local); if (latest == NULL) return (NULL); /* Something went wrong. */ *flag_param |= flags_local & HAS_WIDTH; if (range_local.lower < 0) { /* Not a fixed length */ range_param->lower = -1; range_param->upper = -1; } else if (range_param->lower >= 0) { range_param->lower += range_local.lower; range_param->upper += range_local.upper; } if (chain != NULL) { /* Connect the regex atoms together sequentialy. */ tail (chain, latest); } chain = latest; } if (chain == NULL) { /* Loop ran zero times. */ (void) emit_node (NOTHING); } return (ret_val);}/*----------------------------------------------------------------------* * piece - something followed by possible '*', '+', '?', or "{m,n}" * * Note that the branching code sequences used for the general cases of * *, +. ?, and {m,n} are somewhat optimized: they use the same * NOTHING node as both the endmarker for their branch list and the * body of the last branch. It might seem that this node could be * dispensed with entirely, but the endmarker role is not redundant. *----------------------------------------------------------------------*/static unsigned char * piece (int *flag_param, len_range *range_param) { register unsigned char *ret_val; register unsigned char *next; register unsigned char op_code; unsigned long min_max [2] = {REG_ZERO, REG_INFINITY}; int flags_local, i, brace_present = 0; int lazy = 0, comma_present = 0; int digit_present [2] = {0,0}; len_range range_local; ret_val = atom (&flags_local, &range_local); if (ret_val == NULL) return (NULL); /* Something went wrong. */ op_code = *Reg_Parse; if (!IS_QUANTIFIER (op_code)) { *flag_param = flags_local; *range_param = range_local; return (ret_val); } else if (op_code == '{') { /* {n,m} quantifier present */ brace_present++; Reg_Parse++; /* This code will allow specifying a counting range in any of the following forms: {m,n} between m and n. {,n} same as {0,n} or between 0 and infinity. {m,} same as {m,0} or between m and infinity. {m} same as {m,m} or exactly m. {,} same as {0,0} or between 0 and infinity or just '*'. {} same as {0,0} or between 0 and infinity or just '*'. Note that specifying a max of zero, {m,0} is not allowed in the regex itself, but it is implemented internally that way to support '*', '+', and {min,} constructs and signals an unlimited number. */ for (i = 0; i < 2; i++) { /* Look for digits of number and convert as we go. The numeric maximum value for max and min of 65,535 is due to using 2 bytes to store each value in the compiled regex code. */ while (isdigit (*Reg_Parse)) { /* (6553 * 10 + 6) > 65535 (16 bit max) */ if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) || (min_max [i] <= 6552UL)) { min_max [i] = (min_max [i] * 10UL) + (unsigned long) (*Reg_Parse - '0'); Reg_Parse++; digit_present [i]++; } else { if (i == 0) { sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -