📄 dfa.c
字号:
|| !(syntax_bits & RE_NEWLINE_ALT)) goto normal_char; laststart = 1; return lasttok = OR; case '(': if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; ++parens; laststart = 1; return lasttok = LPAREN; case ')': if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) goto normal_char; if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; --parens; laststart = 0; return lasttok = RPAREN; case '.': if (backslash) goto normal_char;#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ laststart = 0; return lasttok = ANYCHAR; }#endif /* MBS_SUPPORT */ zeroset(ccl); notset(ccl); if (!(syntax_bits & RE_DOT_NEWLINE)) clrbit(eolbyte, ccl); if (syntax_bits & RE_DOT_NOT_NULL) clrbit('\0', ccl); laststart = 0; return lasttok = CSET + charclass_index(ccl); case 'w': case 'W': if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; zeroset(ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) if (IS_WORD_CONSTITUENT(c2)) setbit(c2, ccl); if (c == 'W') notset(ccl); laststart = 0; return lasttok = CSET + charclass_index(ccl); case '[': if (backslash) goto normal_char; laststart = 0;#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { /* In multibyte environment a bracket expression may contain multibyte characters, which must be treated as characters (not bytes). So we parse it by parse_bracket_exp_mb(). */ parse_bracket_exp_mb(); return lasttok = MBCSET; }#endif zeroset(ccl); FETCH(c, _("Unbalanced [")); if (c == '^') { FETCH(c, _("Unbalanced [")); invert = 1; } else invert = 0; do { /* Nobody ever said this had to be fast. :-) Note that if we're looking at some other [:...:] construct, we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) for (c1 = 0; prednames[c1].name; ++c1) if (looking_at(prednames[c1].name)) { int (*pred) PARAMS ((int)) = prednames[c1].pred; for (c2 = 0; c2 < NOTCHAR; ++c2) if ((*pred)(c2)) setbit_case_fold (c2, ccl); lexptr += strlen(prednames[c1].name); lexleft -= strlen(prednames[c1].name); FETCH(c1, _("Unbalanced [")); goto skip; } if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH(c, _("Unbalanced [")); FETCH(c1, _("Unbalanced [")); if (c1 == '-') { FETCH(c2, _("Unbalanced [")); if (c2 == ']') { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ --lexptr; ++lexleft; } else { if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH(c2, _("Unbalanced [")); FETCH(c1, _("Unbalanced [")); if (!hard_LC_COLLATE) { for (; c <= c2; c++) setbit_case_fold (c, ccl); } else { /* POSIX locales are painful - leave the decision to libc */ char expr[6] = { '[', c, '-', c2, ']', '\0' }; regex_t re; if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { for (c = 0; c < NOTCHAR; ++c) { char buf[2] = { c, '\0' }; regmatch_t mat; if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR && mat.rm_so == 0 && mat.rm_eo == 1) setbit_case_fold (c, ccl); } regfree (&re); } } continue; } } setbit_case_fold (c, ccl); skip: ; } while ((c = c1) != ']'); if (invert) { notset(ccl); if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit(eolbyte, ccl); } return lasttok = CSET + charclass_index(ccl); default: normal_char: laststart = 0; if (case_fold && ISALPHA(c)) { zeroset(ccl); setbit_case_fold (c, ccl); return lasttok = CSET + charclass_index(ccl); } return c; } } /* The above loop should consume at most a backslash and some other character. */ abort(); return END; /* keeps pedantic compilers happy. */}/* Recursive descent parser for regular expressions. */static token tok; /* Lookahead token. */static int depth; /* Current depth of a hypothetical stack holding deferred productions. This is used to determine the depth that will be required of the real stack later on in dfaanalyze(). *//* Add the given token to the parse tree, maintaining the depth count and updating the maximum depth if necessary. */static voidaddtok (token t){#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, dfa->tindex); /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ if (t == MBCSET) dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; else if (t < NOTCHAR) dfa->multibyte_prop[dfa->tindex] = (cur_mb_len == 1)? 3 /* single-byte char */ : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ else /* It may be unnecesssary, but it is safer to treat other symbols as singlebyte characters. */ dfa->multibyte_prop[dfa->tindex] = 3; }#endif REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); dfa->tokens[dfa->tindex++] = t; switch (t) { case QMARK: case STAR: case PLUS: break; case CAT: case OR: case ORTOP: --depth; break; default: ++dfa->nleaves; case EMPTY: ++depth; break; } if (depth > dfa->depth) dfa->depth = depth;}/* The grammar understood by the parser is as follows. regexp: regexp OR branch branch branch: branch closure closure closure: closure QMARK closure STAR closure PLUS closure REPMN atom atom: <normal character> <multibyte character> ANYCHAR MBCSET CSET BACKREF BEGLINE ENDLINE BEGWORD ENDWORD LIMWORD NOTLIMWORD CRANGE LPAREN regexp RPAREN <empty> The parser builds a parse tree in postfix form in an array of tokens. */static voidatom (void){ if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD#ifdef MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok(tok); tok = lex();#ifdef MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. e.g. We construct following tree from "<mb1><mb2>". <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */ if (MB_CUR_MAX > 1) { while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) { addtok(tok); addtok(CAT); tok = lex(); } }#endif /* MBS_SUPPORT */ } else if (tok == CRANGE) { /* A character range like "[a-z]" in a locale other than "C" or "POSIX". This range might any sequence of one or more characters. Unfortunately the POSIX locale primitives give us no practical way to find what character sequences might be matched. Treat this approximately like "(.\1)" -- i.e. match one character, and then punt to the full matcher. */ charclass ccl; zeroset (ccl); notset (ccl); addtok (CSET + charclass_index (ccl)); addtok (BACKREF); addtok (CAT); tok = lex (); } else if (tok == LPAREN) { tok = lex(); regexp(0); if (tok != RPAREN) dfaerror(_("Unbalanced (")); tok = lex(); } else addtok(EMPTY);}/* Return the number of tokens in the given subexpression. */static intnsubtoks (int tindex){ int ntoks1; switch (dfa->tokens[tindex - 1]) { default: return 1; case QMARK: case STAR: case PLUS: return 1 + nsubtoks(tindex - 1); case CAT: case OR: case ORTOP: ntoks1 = nsubtoks(tindex - 1); return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); }}/* Copy the given subexpression to the top of the tree. */static voidcopytoks (int tindex, int ntokens){ int i; for (i = 0; i < ntokens; ++i) addtok(dfa->tokens[tindex + i]);}static voidclosure (void){ int tindex, ntokens, i; atom(); while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) if (tok == REPMN) { ntokens = nsubtoks(dfa->tindex); tindex = dfa->tindex - ntokens; if (maxrep < 0) addtok(PLUS); if (minrep == 0) addtok(QMARK); for (i = 1; i < minrep; ++i) { copytoks(tindex, ntokens); addtok(CAT); } for (; i < maxrep; ++i) { copytoks(tindex, ntokens); addtok(QMARK); addtok(CAT); } tok = lex(); } else { addtok(tok); tok = lex(); }}static voidbranch (void){ closure(); while (tok != RPAREN && tok != OR && tok >= 0) { closure(); addtok(CAT); }}static voidregexp (int toplevel){ branch(); while (tok == OR) { tok = lex(); branch(); if (toplevel) addtok(ORTOP); else addtok(OR); }}/* Main entry point for the parser. S is a string to be parsed, len is the length of the string, so s can include NUL characters. D is a pointer to the struct dfa to parse into. */voiddfaparse (char const *s, size_t len, struct dfa *d){ dfa = d; lexstart = lexptr = s; lexleft = len; lasttok = END; laststart = 1; parens = 0;#if ENABLE_NLS hard_LC_COLLATE = hard_locale (LC_COLLATE);#endif#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { cur_mb_index = 0; cur_mb_len = 0; memset(&mbs, 0, sizeof(mbstate_t)); }#endif /* MBS_SUPPORT */ if (! syntax_bits_set) dfaerror(_("No syntax specified")); tok = lex(); depth = d->depth; regexp(1); if (tok != END) dfaerror(_("Unbalanced )")); addtok(END - d->nregexps); addtok(CAT); if (d->nregexps) addtok(ORTOP); ++d->nregexps;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -