📄 regexec.c
字号:
/* Extended regular expression matching and search library. Copyright (C) 2002, 2003 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, re_string_t *input, int n);static void match_ctx_clean (re_match_context_t *mctx);static void match_ctx_free (re_match_context_t *cache);static void match_ctx_free_subtops (re_match_context_t *mctx);static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, int str_idx, int from, int to);static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx);static void match_ctx_clear_flag (re_match_context_t *mctx);static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx);static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx);static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, re_dfastate_t **limited_sts, int last_node, int last_str_idx, int check_subexp);static reg_errcode_t re_search_internal (const regex_t *preg, const char *string, int length, int start, int range, int stop, size_t nmatch, regmatch_t pmatch[], int eflags);static int re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1, int length1, const char *string2, int length2, int start, int range, struct re_registers *regs, int stop, int ret_len);static int re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length, int start, int range, int stop, struct re_registers *regs, int ret_len);static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs, int regs_allocated);static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err, const regex_t *preg, const re_match_context_t *mctx, int idx);static reg_errcode_t prune_impossible_nodes (const regex_t *preg, re_match_context_t *mctx);static int check_matching (const regex_t *preg, re_match_context_t *mctx, int fl_search, int fl_longest_match);static int check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context);static int check_halt_state_context (const regex_t *preg, const re_dfastate_t *state, const re_match_context_t *mctx, int idx);static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node, int cur_idx, int nmatch);static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs, const re_match_context_t *mctx, int *pidx, int node, re_node_set *eps_via_nodes, struct re_fail_stack_t *fs);static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int *dests, int nregs, regmatch_t *regs, re_node_set *eps_via_nodes);static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, regmatch_t *regs, re_node_set *eps_via_nodes);static reg_errcode_t set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, regmatch_t *pmatch, int fl_backtrack);static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);#ifdef RE_ENABLE_I18Nstatic int sift_states_iter_mb (const regex_t *preg, const re_match_context_t *mctx, re_sift_context_t *sctx, int node_idx, int str_idx, int max_str_idx);#endif /* RE_ENABLE_I18N */static reg_errcode_t sift_states_backward (const regex_t *preg, re_match_context_t *mctx, re_sift_context_t *sctx);static reg_errcode_t update_cur_sifted_state (const regex_t *preg, re_match_context_t *mctx, re_sift_context_t *sctx, int str_idx, re_node_set *dest_nodes);static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, re_node_set *dest_nodes, const re_node_set *candidates);static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes, const re_node_set *and_nodes);static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits, re_match_context_t *mctx, int dst_node, int dst_idx, int src_node, int src_idx);static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx, int limit, re_node_set *eclosures, int subexp_idx, int node, int str_idx);static reg_errcode_t check_subexp_limits (re_dfa_t *dfa, re_node_set *dest_nodes, const re_node_set *candidates, re_node_set *limits, struct re_backref_cache_entry *bkref_ents, int str_idx);static reg_errcode_t sift_states_bkref (const regex_t *preg, re_match_context_t *mctx, re_sift_context_t *sctx, int str_idx, re_node_set *dest_nodes);static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx, int next_state_log_idx);static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src, int num);static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg, re_match_context_t *mctx, re_dfastate_t *state, int fl_search);static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa, re_match_context_t *mctx, re_node_set *cur_nodes, int str_idx);static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg, re_dfastate_t *pstate, int fl_search, re_match_context_t *mctx);#ifdef RE_ENABLE_I18Nstatic reg_errcode_t transit_state_mb (const regex_t *preg, re_dfastate_t *pstate, re_match_context_t *mctx);#endif /* RE_ENABLE_I18N */static reg_errcode_t transit_state_bkref (const regex_t *preg, re_node_set *nodes, re_match_context_t *mctx);static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx, int bkref_node, int bkref_str_idx);static reg_errcode_t get_subexp_sub (const regex_t *preg, re_match_context_t *mctx, re_sub_match_top_t *sub_top, re_sub_match_last_t *sub_last, int bkref_node, int bkref_str);static int find_subexp_node (re_dfa_t *dfa, re_node_set *nodes, int subexp_idx, int fl_open);static reg_errcode_t check_arrival (const regex_t *preg, re_match_context_t *mctx, state_array_t *path, int top_node, int top_str, int last_node, int last_str, int fl_open);static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg, re_dfa_t *dfa, re_match_context_t *mctx, int str_idx, re_node_set *cur_nodes, re_node_set *next_nodes);static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa, re_node_set *cur_nodes, int ex_subexp, int fl_open);static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, re_node_set *dst_nodes, int target, int ex_subexp, int fl_open);static reg_errcode_t expand_bkref_cache (const regex_t *preg, re_match_context_t *mctx, re_node_set *cur_nodes, int cur_str, int last_str, int subexp_num, int fl_open);static re_dfastate_t **build_trtable (const regex_t *dfa, const re_dfastate_t *state, int fl_search);#ifdef RE_ENABLE_I18Nstatic int check_node_accept_bytes (const regex_t *preg, int node_idx, const re_string_t *input, int idx);# ifdef _LIBCstatic unsigned int find_collation_sequence_value (const unsigned char *mbs, size_t name_len);# endif /* _LIBC */#endif /* RE_ENABLE_I18N */static int group_nodes_into_DFAstates (const regex_t *dfa, const re_dfastate_t *state, re_node_set *states_node, bitset *states_ch);static int check_node_accept (const regex_t *preg, const re_token_t *node, const re_match_context_t *mctx, int idx);static reg_errcode_t extend_buffers (re_match_context_t *mctx);/* Entry point for POSIX code. *//* regexec searches for a given pattern, specified by PREG, in the string STRING. If NMATCH is zero or REG_NOSUB was set in the cflags argument to `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at least NMATCH elements, and we set them to the offsets of the corresponding matched substrings. EFLAGS specifies `execution flags' which affect matching: if REG_NOTBOL is set, then ^ does not match at the beginning of the string; if REG_NOTEOL is set, then $ does not match at the end. We return 0 if we find a match and REG_NOMATCH if not. */intregexec (preg, string, nmatch, pmatch, eflags) const regex_t *__restrict preg; const char *__restrict string; size_t nmatch; regmatch_t pmatch[]; int eflags;{ reg_errcode_t err; int length = strlen (string); if (preg->no_sub) err = re_search_internal (preg, string, length, 0, length, length, 0, NULL, eflags); else err = re_search_internal (preg, string, length, 0, length, length, nmatch, pmatch, eflags); return err != REG_NOERROR;}#ifdef _LIBCweak_alias (__regexec, regexec)#endif/* Entry points for GNU code. *//* re_match, re_search, re_match_2, re_search_2 The former two functions operate on STRING with length LENGTH, while the later two operate on concatenation of STRING1 and STRING2 with lengths LENGTH1 and LENGTH2, respectively. re_match() matches the compiled pattern in BUFP against the string, starting at index START. re_search() first tries matching at index START, then it tries to match starting from index START + 1, and so on. The last start position tried is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same way as re_match().) The parameter STOP of re_{match,search}_2 specifies that no match exceeding the first STOP characters of the concatenation of the strings should be concerned. If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match and all groups is stroed in REGS. (For the "_2" variants, the offsets are computed relative to the concatenation, not relative to the individual strings.) On success, re_match* functions return the length of the match, re_search* return the position of the start of the match. Return value -1 means no match was found and -2 indicates an internal error. */intre_match (bufp, string, length, start, regs) struct re_pattern_buffer *bufp; const char *string; int length, start; struct re_registers *regs;{ return re_search_stub (bufp, string, length, start, 0, length, regs, 1);}#ifdef _LIBCweak_alias (__re_match, re_match)#endifintre_search (bufp, string, length, start, range, regs) struct re_pattern_buffer *bufp; const char *string; int length, start, range; struct re_registers *regs;{ return re_search_stub (bufp, string, length, start, range, length, regs, 0);}#ifdef _LIBCweak_alias (__re_search, re_search)#endifintre_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) struct re_pattern_buffer *bufp; const char *string1, *string2; int length1, length2, start, stop; struct re_registers *regs;{ return re_search_2_stub (bufp, string1, length1, string2, length2, start, 0, regs, stop, 1);}#ifdef _LIBCweak_alias (__re_match_2, re_match_2)#endifintre_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) struct re_pattern_buffer *bufp; const char *string1, *string2; int length1, length2, start, range, stop; struct re_registers *regs;{ return re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, stop, 0);}#ifdef _LIBCweak_alias (__re_search_2, re_search_2)#endifstatic intre_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, stop, ret_len) struct re_pattern_buffer *bufp; const char *string1, *string2; int length1, length2, start, range, stop, ret_len; struct re_registers *regs;{ const char *str; int rval; int len = length1 + length2; int free_str = 0; if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) return -2; /* Concatenate the strings. */ if (length2 > 0) if (length1 > 0) { char *s = re_malloc (char, len); if (BE (s == NULL, 0)) return -2; memcpy (s, string1, length1); memcpy (s + length1, string2, length2); str = s; free_str = 1; } else str = string2; else str = string1; rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len); if (free_str) re_free ((char *) str); return rval;}/* The parameters have the same meaning as those of re_search. Additional parameters: If RET_LEN is nonzero the length of the match is returned (re_match style); otherwise the position of the match is returned. */static intre_search_stub (bufp, string, length, start, range, stop, regs, ret_len) struct re_pattern_buffer *bufp; const char *string; int length, start, range, stop, ret_len; struct re_registers *regs;{ reg_errcode_t result; regmatch_t *pmatch; int nregs, rval; int eflags = 0; /* Check for out-of-range. */ if (BE (start < 0 || start > length, 0)) return -1; if (BE (start + range > length, 0)) range = length - start; else if (BE (start + range < 0, 0)) range = -start; eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; /* Compile fastmap if we haven't yet. */ if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) re_compile_fastmap (bufp); if (BE (bufp->no_sub, 0)) regs = NULL; /* We need at least 1 register. */ if (regs == NULL) nregs = 1; else if (BE (bufp->regs_allocated == REGS_FIXED && regs->num_regs < bufp->re_nsub + 1, 0)) { nregs = regs->num_regs; if (BE (nregs < 1, 0)) { /* Nothing can be copied to regs. */ regs = NULL; nregs = 1; } } else nregs = bufp->re_nsub + 1; pmatch = re_malloc (regmatch_t, nregs); if (BE (pmatch == NULL, 0)) return -2; result = re_search_internal (bufp, string, length, start, range, stop, nregs, pmatch, eflags); rval = 0; /* I hope we needn't fill ther regs with -1's when no match was found. */ if (result != REG_NOERROR) rval = -1; else if (regs != NULL) { /* If caller wants register contents data back, copy them. */ bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, bufp->regs_allocated); if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) rval = -2; } if (BE (rval == 0, 1)) { if (ret_len)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -