📄 dfa.c
字号:
/* Recursive descent parser for regular expressions. */static _token tok; /* Lookahead token. */static depth; /* Current depth of a hypothetical stack holding deferred productions. This is used to determine the depth that will be required of the real stack later on in reganalyze(). *//* Add the given token to the parse tree, maintaining the depth count and updating the maximum depth if necessary. */static voidaddtok(t) _token t;{ REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex); reg->tokens[reg->tindex++] = t; switch (t) { case _QMARK: case _STAR: case _PLUS: break; case _CAT: case _OR: --depth; break; default: ++reg->nleaves; case _EMPTY: ++depth; break; } if (depth > reg->depth) reg->depth = depth;}/* The grammar understood by the parser is as follows. start: regexp _ALLBEGLINE regexp regexp _ALLENDLINE _ALLBEGLINE regexp _ALLENDLINE regexp: regexp _OR branch branch branch: branch closure closure closure: closure _QMARK closure _STAR closure _PLUS atom atom: <normal character> _SET _BACKREF _BEGLINE _ENDLINE _BEGWORD _ENDWORD _LIMWORD _NOTLIMWORD <empty> The parser builds a parse tree in postfix form in an array of tokens. */#ifdef __STDC__static void regexp(void);#elsestatic void regexp();#endifstatic voidatom(){ if (tok >= 0 && (tok < _NOTCHAR || tok >= _SET || tok == _BACKREF || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD)) { addtok(tok); tok = lex(); } else if (tok == _LPAREN) { tok = lex(); regexp(); if (tok != _RPAREN) reg_error("Unbalanced ("); tok = lex(); } else addtok(_EMPTY);}static voidclosure(){ atom(); while (tok == _QMARK || tok == _STAR || tok == _PLUS) { addtok(tok); tok = lex(); }}static voidbranch(){ closure(); while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0) { closure(); addtok(_CAT); }}static voidregexp(){ branch(); while (tok == _OR) { tok = lex(); branch(); addtok(_OR); }}/* Main entry point for the parser. S is a string to be parsed, len is the length of the string, so s can include NUL characters. R is a pointer to the struct regexp to parse into. */voidregparse(s, len, r) const char *s; size_t len; struct regexp *r;{ reg = r; lexstart = lexptr = s; lexleft = len; caret_allowed = 1; closure_allowed = 0; if (! syntax_bits_set) reg_error("No syntax specified"); tok = lex(); depth = r->depth; if (tok == _ALLBEGLINE) { addtok(_BEGLINE); tok = lex(); regexp(); addtok(_CAT); } else regexp(); if (tok == _ALLENDLINE) { addtok(_ENDLINE); addtok(_CAT); tok = lex(); } if (tok != _END) reg_error("Unbalanced )"); addtok(_END - r->nregexps); addtok(_CAT); if (r->nregexps) addtok(_OR); ++r->nregexps;}/* Some primitives for operating on sets of positions. *//* Copy one set to another; the destination must be large enough. */static voidcopy(src, dst) const _position_set *src; _position_set *dst;{ int i; for (i = 0; i < src->nelem; ++i) dst->elems[i] = src->elems[i]; dst->nelem = src->nelem;}/* Insert a position in a set. Position sets are maintained in sorted order according to index. If position already exists in the set with the same index then their constraints are logically or'd together. S->elems must point to an array large enough to hold the resulting set. */static voidinsert(p, s) _position p; _position_set *s;{ int i; _position t1, t2; for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) ; if (i < s->nelem && p.index == s->elems[i].index) s->elems[i].constraint |= p.constraint; else { t1 = p; ++s->nelem; while (i < s->nelem) { t2 = s->elems[i]; s->elems[i++] = t1; t1 = t2; } }}/* Merge two sets of positions into a third. The result is exactly as if the positions of both sets were inserted into an initially empty set. */static voidmerge(s1, s2, m) _position_set *s1; _position_set *s2; _position_set *m;{ int i = 0, j = 0; m->nelem = 0; while (i < s1->nelem && j < s2->nelem) if (s1->elems[i].index > s2->elems[j].index) m->elems[m->nelem++] = s1->elems[i++]; else if (s1->elems[i].index < s2->elems[j].index) m->elems[m->nelem++] = s2->elems[j++]; else { m->elems[m->nelem] = s1->elems[i++]; m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; } while (i < s1->nelem) m->elems[m->nelem++] = s1->elems[i++]; while (j < s2->nelem) m->elems[m->nelem++] = s2->elems[j++];}/* Delete a position from a set. */static voiddelete(p, s) _position p; _position_set *s;{ int i; for (i = 0; i < s->nelem; ++i) if (p.index == s->elems[i].index) break; if (i < s->nelem) for (--s->nelem; i < s->nelem; ++i) s->elems[i] = s->elems[i + 1];}/* Find the index of the state corresponding to the given position set with the given preceding context, or create a new state if there is no such state. Newline and letter tell whether we got here on a newline or letter, respectively. */static intstate_index(r, s, newline, letter) struct regexp *r; _position_set *s; int newline; int letter;{ int lhash = 0; int constraint; int i, j; newline = newline ? 1 : 0; letter = letter ? 1 : 0; for (i = 0; i < s->nelem; ++i) lhash ^= s->elems[i].index + s->elems[i].constraint; /* Try to find a state that exactly matches the proposed one. */ for (i = 0; i < r->sindex; ++i) { if (lhash != r->states[i].hash || s->nelem != r->states[i].elems.nelem || newline != r->states[i].newline || letter != r->states[i].letter) continue; for (j = 0; j < s->nelem; ++j) if (s->elems[j].constraint != r->states[i].elems.elems[j].constraint || s->elems[j].index != r->states[i].elems.elems[j].index) break; if (j == s->nelem) return i; } /* We'll have to create a new state. */ REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex); r->states[i].hash = lhash; MALLOC(r->states[i].elems.elems, _position, s->nelem); copy(s, &r->states[i].elems); r->states[i].newline = newline; r->states[i].letter = letter; r->states[i].backref = 0; r->states[i].constraint = 0; r->states[i].first_end = 0; for (j = 0; j < s->nelem; ++j) if (r->tokens[s->elems[j].index] < 0) { constraint = s->elems[j].constraint; if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) r->states[i].constraint |= constraint; if (! r->states[i].first_end) r->states[i].first_end = r->tokens[s->elems[j].index]; } else if (r->tokens[s->elems[j].index] == _BACKREF) { r->states[i].constraint = _NO_CONSTRAINT; r->states[i].backref = 1; } ++r->sindex; return i;}/* Find the epsilon closure of a set of positions. If any position of the set contains a symbol that matches the empty string in some context, replace that position with the elements of its follow labeled with an appropriate constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */static voidepsclosure(s, r) _position_set *s; struct regexp *r;{ int i, j; int *visited; _position p, old; MALLOC(visited, int, r->tindex); for (i = 0; i < r->tindex; ++i) visited[i] = 0; for (i = 0; i < s->nelem; ++i) if (r->tokens[s->elems[i].index] >= _NOTCHAR && r->tokens[s->elems[i].index] != _BACKREF && r->tokens[s->elems[i].index] < _SET) { old = s->elems[i]; p.constraint = old.constraint; delete(s->elems[i], s); if (visited[old.index]) { --i; continue; } visited[old.index] = 1; switch (r->tokens[old.index]) { case _BEGLINE: p.constraint &= _BEGLINE_CONSTRAINT; break; case _ENDLINE: p.constraint &= _ENDLINE_CONSTRAINT; break; case _BEGWORD: p.constraint &= _BEGWORD_CONSTRAINT; break; case _ENDWORD: p.constraint &= _ENDWORD_CONSTRAINT; break; case _LIMWORD: p.constraint &= _ENDWORD_CONSTRAINT; break; case _NOTLIMWORD: p.constraint &= _NOTLIMWORD_CONSTRAINT; break; default: break; } for (j = 0; j < r->follows[old.index].nelem; ++j) { p.index = r->follows[old.index].elems[j].index; insert(p, s); } /* Force rescan to start at the beginning. */ i = -1; } free(visited);}/* Perform bottom-up analysis on the parse tree, computing various functions. Note that at this point, we're pretending constructs like \< are real characters rather than constraints on what can follow them. Nullable: A node is nullable if it is at the root of a regexp that can match the empty string. * _EMPTY leaves are nullable. * No other leaf is nullable. * A _QMARK or _STAR node is nullable. * A _PLUS node is nullable if its argument is nullable. * A _CAT node is nullable if both its arguments are nullable. * An _OR node is nullable if either argument is nullable. Firstpos: The firstpos of a node is the set of positions (nonempty leaves) that could correspond to the first character of a string matching the regexp rooted at the given node. * _EMPTY leaves have empty firstpos. * The firstpos of a nonempty leaf is that leaf itself. * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its argument. * The firstpos of a _CAT node is the firstpos of the left argument, union the firstpos of the right if the left argument is nullable. * The firstpos of an _OR node is the union of firstpos of each argument. Lastpos: The lastpos of a node is the set of positions that could correspond to the last character of a string matching the regexp at the given node. * _EMPTY leaves have empty lastpos. * The lastpos of a nonempty leaf is that leaf itself. * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its argument. * The lastpos of a _CAT node is the lastpos of its right argument, union the lastpos of the left if the right argument is nullable. * The lastpos of an _OR node is the union of the lastpos of each argument. Follow: The follow of a position is the set of positions that could correspond to the character following a character matching the node in a string matching the regexp. At this point we consider special symbols that match the empty string in some context to be just normal characters. Later, if we find that a special symbol is in a follow set, we will replace it with the elements of its follow, labeled with an appropriate constraint. * Every node in the firstpos of the argument of a _STAR or _PLUS node is in the follow of every node in the lastpos. * Every node in the firstpos of the second argument of a _CAT node is in the follow of every node in the lastpos of the first argument. Because of the postfix representation of the parse tree, the depth-first analysis is conveniently done by a linear scan with the aid of a stack. Sets are stored as arrays of the elements, obeying a stack-like allocation scheme; the number of elements in each set deeper in the stack can be used to determine the address of a particular set's array. */voidreganalyze(r, searchflag) struct regexp *r; int searchflag;{ int *nullable; /* Nullable stack. */ int *nfirstpos; /* Element count stack for firstpos sets. */ _position *firstpos; /* Array where firstpos elements are stored. */ int *nlastpos; /* Element count stack for lastpos sets. */ _position *lastpos; /* Array where lastpos elements are stored. */ int *nalloc; /* Sizes of arrays allocated to follow sets. */ _position_set tmp; /* Temporary set for merging sets. */ _position_set merged; /* Result of merging sets. */ int wants_newline; /* True if some position wants newline info. */ int *o_nullable; int *o_nfirst, *o_nlast; _position *o_firstpos, *o_lastpos; int i, j; _position *pos; r->searchflag = searchflag; MALLOC(nullable, int, r->depth); o_nullable = nullable; MALLOC(nfirstpos, int, r->depth); o_nfirst = nfirstpos; MALLOC(firstpos, _position, r->nleaves); o_firstpos = firstpos, firstpos += r->nleaves; MALLOC(nlastpos, int, r->depth); o_nlast = nlastpos; MALLOC(lastpos, _position, r->nleaves); o_lastpos = lastpos, lastpos += r->nleaves; MALLOC(nalloc, int, r->tindex); for (i = 0; i < r->tindex; ++i) nalloc[i] = 0; MALLOC(merged.elems, _position, r->nleaves); CALLOC(r->follows, _position_set, r->tindex); for (i = 0; i < r->tindex; ++i) switch (r->tokens[i]) { case _EMPTY: /* The empty set is nullable. */ *nullable++ = 1; /* The firstpos and lastpos of the empty leaf are both empty. */ *nfirstpos++ = *nlastpos++ = 0; break; case _STAR: case _PLUS: /* Every element in the firstpos of the argument is in the follow of every element in the lastpos. */ tmp.nelem = nfirstpos[-1]; tmp.elems = firstpos; pos = lastpos; for (j = 0; j < nlastpos[-1]; ++j) { merge(&tmp, &r->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, nalloc[pos[j].index], merged.nelem - 1); copy(&merged, &r->follows[pos[j].index]); } case _QMARK: /* A _QMARK or _STAR node is automatically nullable. */ if (r->tokens[i] != _PLUS) nullable[-1] = 1; break; case _CAT: /* Every element in the firstpos of the second argument is in the follow of every element in the lastpos of the first argument. */ tmp.nelem = nfirstpos[-1]; tmp.elems = firstpos; pos = lastpos + nlastpos[-1]; for (j = 0; j < nlastpos[-2]; ++j) { merge(&tmp, &r->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, nalloc[pos[j].index], merged.nelem - 1); copy(&merged, &r->follows[pos[j].index]); } /* The firstpos of a _CAT node is the firstpos of the first argument, union that of the second argument if the first is nullable. */ if (nullable[-2]) nfirstpos[-2] += nfirstpos[-1]; else firstpos += nfirstpos[-1]; --nfirstpos; /* The lastpos of a _CAT node is the lastpos of the second argument, union that of the first argument if the second is nullable. */ if (nullable[-1]) nlastpos[-2] += nlastpos[-1]; else { pos = lastpos + nlastpos[-2]; for (j = nlastpos[-1] - 1; j >= 0; --j) pos[j] = lastpos[j]; lastpos += nlastpos[-2]; nlastpos[-2] = nlastpos[-1]; } --nlastpos; /* A _CAT node is nullable if both arguments are nullable. */ nullable[-2] = nullable[-1] && nullable[-2]; --nullable;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -