📄 regcomp.c
字号:
}#endif if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) return REG_ESPACE; return REG_NOERROR;}/* Initialize WORD_CHAR table, which indicate which character is "word". In this case "word" means that it is the word construction character used by some operators like "\<", "\>", etc. */static voidinternal_functioninit_word_char (re_dfa_t *dfa){ int i, j, ch; dfa->word_ops_used = 1; for (i = 0, ch = 0; i < BITSET_WORDS; ++i) for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') dfa->word_char[i] |= (bitset_word_t) 1 << j;}/* Free the work area which are only used while compiling. */static voidfree_workarea_compile (regex_t *preg){ re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_storage_t *storage, *next; for (storage = dfa->str_tree_storage; storage; storage = next) { next = storage->next; re_free (storage); } dfa->str_tree_storage = NULL; dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; dfa->str_tree = NULL; re_free (dfa->org_indices); dfa->org_indices = NULL;}/* Create initial states for all contexts. */static reg_errcode_tcreate_initial_state (re_dfa_t *dfa){ Idx first, i; reg_errcode_t err; re_node_set init_nodes; /* Initial states have the epsilon closure of the node which is the first node of the regular expression. */ first = dfa->str_tree->first->node_idx; dfa->init_node = first; err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); if (BE (err != REG_NOERROR, 0)) return err; /* The back-references which are in initial states can epsilon transit, since in this case all of the subexpressions can be null. Then we add epsilon closures of the nodes which are the next nodes of the back-references. */ if (dfa->nbackref > 0) for (i = 0; i < init_nodes.nelem; ++i) { Idx node_idx = init_nodes.elems[i]; re_token_type_t type = dfa->nodes[node_idx].type; Idx clexp_idx; if (type != OP_BACK_REF) continue; for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) { re_token_t *clexp_node; clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; if (clexp_node->type == OP_CLOSE_SUBEXP && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) break; } if (clexp_idx == init_nodes.nelem) continue; if (type == OP_BACK_REF) { Idx dest_idx = dfa->edests[node_idx].elems[0]; if (!re_node_set_contains (&init_nodes, dest_idx)) { re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); i = 0; } } } /* It must be the first time to invoke acquire_state. */ dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); /* We don't check ERR here, since the initial state must not be NULL. */ if (BE (dfa->init_state == NULL, 0)) return err; if (dfa->init_state->has_constraint) { dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, CONTEXT_WORD); dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, CONTEXT_NEWLINE); dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, &init_nodes, CONTEXT_NEWLINE | CONTEXT_BEGBUF); if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL || dfa->init_state_begbuf == NULL, 0)) return err; } else dfa->init_state_word = dfa->init_state_nl = dfa->init_state_begbuf = dfa->init_state; re_node_set_free (&init_nodes); return REG_NOERROR;}#ifdef RE_ENABLE_I18N/* If it is possible to do searching in single byte encoding instead of UTF-8 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change DFA nodes where needed. */static voidoptimize_utf8 (re_dfa_t *dfa){ Idx node; int i; bool mb_chars = false; bool has_period = false; for (node = 0; node < dfa->nodes_len; ++node) switch (dfa->nodes[node].type) { case CHARACTER: if (dfa->nodes[node].opr.c >= ASCII_CHARS) mb_chars = true; break; case ANCHOR: switch (dfa->nodes[node].opr.ctx_type) { case LINE_FIRST: case LINE_LAST: case BUF_FIRST: case BUF_LAST: break; default: /* Word anchors etc. cannot be handled. */ return; } break; case OP_PERIOD: has_period = true; break; case OP_BACK_REF: case OP_ALT: case END_OF_RE: case OP_DUP_ASTERISK: case OP_OPEN_SUBEXP: case OP_CLOSE_SUBEXP: break; case COMPLEX_BRACKET: return; case SIMPLE_BRACKET: /* Just double check. */ { int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 ? 0 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) { if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) return; rshift = 0; } } break; default: abort (); } if (mb_chars || has_period) for (node = 0; node < dfa->nodes_len; ++node) { if (dfa->nodes[node].type == CHARACTER && dfa->nodes[node].opr.c >= ASCII_CHARS) dfa->nodes[node].mb_partial = 0; else if (dfa->nodes[node].type == OP_PERIOD) dfa->nodes[node].type = OP_UTF8_PERIOD; } /* The search can be in single byte locale. */ dfa->mb_cur_max = 1; dfa->is_utf8 = 0; dfa->has_mb_node = dfa->nbackref > 0 || has_period;}#endif/* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". */static reg_errcode_tanalyze (regex_t *preg){ re_dfa_t *dfa = (re_dfa_t *) preg->buffer; reg_errcode_t ret; /* Allocate arrays. */ dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL || dfa->eclosures == NULL, 0)) return REG_ESPACE; dfa->subexp_map = re_malloc (Idx, preg->re_nsub); if (dfa->subexp_map != NULL) { Idx i; for (i = 0; i < preg->re_nsub; i++) dfa->subexp_map[i] = i; preorder (dfa->str_tree, optimize_subexps, dfa); for (i = 0; i < preg->re_nsub; i++) if (dfa->subexp_map[i] != i) break; if (i == preg->re_nsub) { free (dfa->subexp_map); dfa->subexp_map = NULL; } } ret = postorder (dfa->str_tree, lower_subexps, preg); if (BE (ret != REG_NOERROR, 0)) return ret; ret = postorder (dfa->str_tree, calc_first, dfa); if (BE (ret != REG_NOERROR, 0)) return ret; preorder (dfa->str_tree, calc_next, dfa); ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); if (BE (ret != REG_NOERROR, 0)) return ret; ret = calc_eclosure (dfa); if (BE (ret != REG_NOERROR, 0)) return ret; /* We only need this during the prune_impossible_nodes pass in regexec.c; skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) || dfa->nbackref) { dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); if (BE (dfa->inveclosures == NULL, 0)) return REG_ESPACE; ret = calc_inveclosure (dfa); } return ret;}/* Our parse trees are very unbalanced, so we cannot use a stack to implement parse tree visits. Instead, we use parent pointers and some hairy code in these two functions. */static reg_errcode_tpostorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), void *extra){ bin_tree_t *node, *prev; for (node = root; ; ) { /* Descend down the tree, preferably to the left (or to the right if that's the only child). */ while (node->left || node->right) if (node->left) node = node->left; else node = node->right; do { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; if (node->parent == NULL) return REG_NOERROR; prev = node; node = node->parent; } /* Go up while we have a node that is reached from the right. */ while (node->right == prev || node->right == NULL); node = node->right; }}static reg_errcode_tpreorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), void *extra){ bin_tree_t *node; for (node = root; ; ) { reg_errcode_t err = fn (extra, node); if (BE (err != REG_NOERROR, 0)) return err; /* Go to the left node, or up and to the right. */ if (node->left) node = node->left; else { bin_tree_t *prev = NULL; while (node->right == prev || node->right == NULL) { prev = node; node = node->parent; if (!node) return REG_NOERROR; } node = node->right; } }}/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell re_search_internal to map the inner one's opr.idx to this one's. Adjust backreferences as well. Requires a preorder visit. */static reg_errcode_toptimize_subexps (void *extra, bin_tree_t *node){ re_dfa_t *dfa = (re_dfa_t *) extra; if (node->token.type == OP_BACK_REF && dfa->subexp_map) { int idx = node->token.opr.idx; node->token.opr.idx = dfa->subexp_map[idx]; dfa->used_bkref_map |= 1 << node->token.opr.idx; } else if (node->token.type == SUBEXP && node->left && node->left->token.type == SUBEXP) { Idx other_idx = node->left->token.opr.idx; node->left = node->left->left; if (node->left) node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; if (other_idx < BITSET_WORD_BITS) dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); } return REG_NOERROR;}/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */static reg_errcode_tlower_subexps (void *extra, bin_tree_t *node){ regex_t *preg = (regex_t *) extra; reg_errcode_t err = REG_NOERROR; if (node->left && node->left->token.type == SUBEXP) { node->left = lower_subexp (&err, preg, node->left); if (node->left) node->left->parent = node; } if (node->right && node->right->token.type == SUBEXP) { node->right = lower_subexp (&err, preg, node->right); if (node->right) node->right->parent = node; } return err;}static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node){ re_dfa_t *dfa = (re_dfa_t *) preg->buffer; bin_tree_t *body = node->left; bin_tree_t *op, *cls, *tree1, *tree; if (preg->no_sub /* We do not optimize empty subexpressions, because otherwise we may have bad CONCAT nodes with NULL children. This is obviously not very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL && (node->token.opr.idx >= BITSET_WORD_BITS || !(dfa->used_bkref_map & ((bitset_word_t) 1 << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; tree = create_tree (dfa, op, tree1, CONCAT); if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) { *err = REG_ESPACE; return NULL; } op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; return tree;}/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton nodes. Requires a postorder visit. */static reg_errcode_tcalc_first (void *extra, bin_tree_t *node){ re_dfa_t *dfa = (re_dfa_t *) extra; if (node->token.type == CONCAT) { node->first = node->left->first; node->node_idx = node->left->node_idx; } else { node->first = node; node->node_idx = re_dfa_add_node (dfa, node->token); if (BE (node->node_idx == REG_MISSING, 0)) return REG_ESPACE; } return REG_NOERROR;}/* Pass 2: compute NEXT on the tree. Preorder visit. */static reg_errcode_tcalc_next (void *extra, bin_tree_t *node){ switch (node->token.type) { case OP_DUP_ASTERISK: node->left->next = node; break; case CONCAT: node->left->next = node->right->first; node->right->next = node->next; break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -