📄 dfa.c
字号:
int chars_al, range_sts_al, range_ends_al, ch_classes_al, equivs_al, coll_elems_al; REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, dfa->mbcsets_alloc, dfa->nmbcsets + 1); /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. We will update dfa->multibyte_prop in addtok(), because we can't decide the index in dfa->tokens[]. */ /* Initialize work are */ work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); chars_al = 1; range_sts_al = range_ends_al = 0; ch_classes_al = equivs_al = coll_elems_al = 0; MALLOC(work_mbc->chars, wchar_t, chars_al); work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; work_mbc->nequivs = work_mbc->ncoll_elems = 0; work_mbc->chars = work_mbc->ch_classes = NULL; work_mbc->range_sts = work_mbc->range_ends = NULL; work_mbc->equivs = work_mbc->coll_elems = NULL; wc = fetch_wc(_("Unbalanced [")); if (wc == L'^') { wc = fetch_wc(_("Unbalanced [")); work_mbc->invert = 1; } else work_mbc->invert = 0; do { wc1 = -1; /* mark wc1 is not initialized". */ /* Note that if we're looking at some other [:...:] construct, we just treat it as a bunch of ordinary characters. We can do this because we assume regex has checked for syntax errors before dfa is ever called. */ if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) {#define BRACKET_BUFFER_SIZE 128 char str[BRACKET_BUFFER_SIZE]; wc1 = wc; wc = fetch_wc(_("Unbalanced [")); /* If pattern contains `[[:', `[[.', or `[[='. */ if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) { unsigned char c; unsigned char delim = (unsigned char)wc; int len = 0; for (;;) { if (! lexleft) dfaerror (_("Unbalanced [")); c = (unsigned char) *lexptr++; --lexleft; if ((c == delim && *lexptr == ']') || lexleft == 0) break; if (len < BRACKET_BUFFER_SIZE) str[len++] = c; else /* This is in any case an invalid class name. */ str[0] = '\0'; } str[len] = '\0'; if (lexleft == 0) { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 2); work_mbc->chars[work_mbc->nchars++] = L'['; work_mbc->chars[work_mbc->nchars++] = delim; break; } if (--lexleft, *lexptr++ != ']') dfaerror (_("Unbalanced [")); if (delim == ':') /* build character class. */ { wctype_t wt; /* Query the character class as wctype_t. */ wt = wctype (str); if (ch_classes_al == 0) MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, ch_classes_al, work_mbc->nch_classes + 1); work_mbc->ch_classes[work_mbc->nch_classes++] = wt; } else if (delim == '=' || delim == '.') { char *elem; MALLOC(elem, char, len + 1); strncpy(elem, str, len + 1); if (delim == '=') /* build equivalent class. */ { if (equivs_al == 0) MALLOC(work_mbc->equivs, char*, ++equivs_al); REALLOC_IF_NECESSARY(work_mbc->equivs, char*, equivs_al, work_mbc->nequivs + 1); work_mbc->equivs[work_mbc->nequivs++] = elem; } if (delim == '.') /* build collating element. */ { if (coll_elems_al == 0) MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, coll_elems_al, work_mbc->ncoll_elems + 1); work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; } } wc = -1; } else /* We treat '[' as a normal character here. */ { wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ } } else { if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) wc = fetch_wc(("Unbalanced [")); } if (wc1 == -1) wc1 = fetch_wc(_("Unbalanced [")); if (wc1 == L'-') /* build range characters. */ { wc2 = fetch_wc(_("Unbalanced [")); if (wc2 == L']') { /* In the case [x-], the - is an ordinary hyphen, which is left in c1, the lookahead character. */ lexptr -= cur_mb_len; lexleft += cur_mb_len; wc2 = wc; } else { if (wc2 == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) wc2 = fetch_wc(_("Unbalanced [")); wc1 = fetch_wc(_("Unbalanced [")); } if (range_sts_al == 0) { MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); } REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, range_sts_al, work_mbc->nranges + 1); work_mbc->range_sts[work_mbc->nranges] = wc; REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, range_ends_al, work_mbc->nranges + 1); work_mbc->range_ends[work_mbc->nranges++] = wc2; } else if (wc != -1) /* build normal characters. */ { REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, work_mbc->nchars + 1); work_mbc->chars[work_mbc->nchars++] = wc; } } while ((wc = wc1) != L']');}#endif /* MBS_SUPPORT */#ifdef __STDC__#define FUNC(F, P) static int F(int c) { return P(c); }#else#define FUNC(F, P) static int F(c) int c; { return P(c); }#endifFUNC(is_alpha, ISALPHA)FUNC(is_upper, ISUPPER)FUNC(is_lower, ISLOWER)FUNC(is_digit, ISDIGIT)FUNC(is_xdigit, ISXDIGIT)FUNC(is_space, ISSPACE)FUNC(is_punct, ISPUNCT)FUNC(is_alnum, ISALNUM)FUNC(is_print, ISPRINT)FUNC(is_graph, ISGRAPH)FUNC(is_cntrl, ISCNTRL)static intis_blank (int c){ return (c == ' ' || c == '\t');}/* The following list maps the names of the Posix named character classes to predicate functions that determine whether a given character is in the class. The leading [ has already been eaten by the lexical analyzer. */static struct { const char *name; int (*pred) PARAMS ((int));} const prednames[] = { { ":alpha:]", is_alpha }, { ":upper:]", is_upper }, { ":lower:]", is_lower }, { ":digit:]", is_digit }, { ":xdigit:]", is_xdigit }, { ":space:]", is_space }, { ":punct:]", is_punct }, { ":alnum:]", is_alnum }, { ":print:]", is_print }, { ":graph:]", is_graph }, { ":cntrl:]", is_cntrl }, { ":blank:]", is_blank }, { 0 }};/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')static intlooking_at (char const *s){ size_t len; len = strlen(s); if (lexleft < len) return 0; return strncmp(s, lexptr, len) == 0;}static tokenlex (void){ unsigned c, c1, c2; int backslash = 0, invert; charclass ccl; int i; /* Basic plan: We fetch a character. If it's a backslash, we set the backslash flag and go through the loop again. On the plus side, this avoids having a duplicate of the main switch inside the backslash case. On the minus side, it means that just about every case begins with "if (backslash) ...". */ for (i = 0; i < 2; ++i) { FETCH(c, 0);#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1 && cur_mb_index) /* If this is a part of a multi-byte character, we must treat this byte data as a normal character. e.g. In case of SJIS encoding, some character contains '\', but they must not be backslash. */ goto normal_char;#endif /* MBS_SUPPORT */ switch (c) { case '\\': if (backslash) goto normal_char; if (lexleft == 0) dfaerror(_("Unfinished \\ escape")); backslash = 1; break; case '^': if (backslash) goto normal_char; if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS || lasttok == END || lasttok == LPAREN || lasttok == OR) return lasttok = BEGLINE; goto normal_char; case '$': if (backslash) goto normal_char; if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS || lexleft == 0 || (syntax_bits & RE_NO_BK_PARENS ? lexleft > 0 && *lexptr == ')' : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') || (syntax_bits & RE_NO_BK_VBAR ? lexleft > 0 && *lexptr == '|' : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') || ((syntax_bits & RE_NEWLINE_ALT) && lexleft > 0 && *lexptr == '\n')) return lasttok = ENDLINE; goto normal_char; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (backslash && !(syntax_bits & RE_NO_BK_REFS)) { laststart = 0; return lasttok = BACKREF; } goto normal_char; case '`': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = BEGLINE; /* FIXME: should be beginning of string */ goto normal_char; case '\'': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = ENDLINE; /* FIXME: should be end of string */ goto normal_char; case '<': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = BEGWORD; goto normal_char; case '>': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = ENDWORD; goto normal_char; case 'b': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = LIMWORD; goto normal_char; case 'B': if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = NOTLIMWORD; goto normal_char; case '?': if (syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) goto normal_char; return lasttok = QMARK; case '*': if (backslash) goto normal_char; if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) goto normal_char; return lasttok = STAR; case '+': if (syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) goto normal_char; if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) goto normal_char; return lasttok = PLUS; case '{': if (!(syntax_bits & RE_INTERVALS)) goto normal_char; if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) goto normal_char; if (syntax_bits & RE_NO_BK_BRACES) { /* Scan ahead for a valid interval; if it's not valid, treat it as a literal '{'. */ int lo = -1, hi = -1; char const *p = lexptr; char const *lim = p + lexleft; for (; p != lim && ISASCIIDIGIT (*p); p++) lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; if (p != lim && *p == ',') while (++p != lim && ISASCIIDIGIT (*p)) hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; else hi = lo; if (p == lim || *p != '}' || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) goto normal_char; } minrep = 0; /* Cases: {M} - exact count {M,} - minimum count, maximum is infinity {M,N} - M through N */ FETCH(c, _("unfinished repeat count")); if (ISASCIIDIGIT (c)) { minrep = c - '0'; for (;;) { FETCH(c, _("unfinished repeat count")); if (! ISASCIIDIGIT (c)) break; minrep = 10 * minrep + c - '0'; } } else dfaerror(_("malformed repeat count")); if (c == ',') { FETCH (c, _("unfinished repeat count")); if (! ISASCIIDIGIT (c)) maxrep = -1; else { maxrep = c - '0'; for (;;) { FETCH (c, _("unfinished repeat count")); if (! ISASCIIDIGIT (c)) break; maxrep = 10 * maxrep + c - '0'; } if (0 <= maxrep && maxrep < minrep) dfaerror (_("malformed repeat count")); } } else maxrep = minrep; if (!(syntax_bits & RE_NO_BK_BRACES)) { if (c != '\\') dfaerror(_("malformed repeat count")); FETCH(c, _("unfinished repeat count")); } if (c != '}') dfaerror(_("malformed repeat count")); laststart = 0; return lasttok = REPMN; case '|': if (syntax_bits & RE_LIMITED_OPS) goto normal_char; if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) goto normal_char; laststart = 1; return lasttok = OR; case '\n': if (syntax_bits & RE_LIMITED_OPS || backslash
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -