📄 dfa.c
字号:
/* Some primitives for operating on sets of positions. *//* Copy one set to another; the destination must be large enough. */static voidcopy (position_set const *src, position_set *dst){ int i; for (i = 0; i < src->nelem; ++i) dst->elems[i] = src->elems[i]; dst->nelem = src->nelem;}/* Insert a position in a set. Position sets are maintained in sorted order according to index. If position already exists in the set with the same index then their constraints are logically or'd together. S->elems must point to an array large enough to hold the resulting set. */static voidinsert (position p, position_set *s){ int i; position t1, t2; for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) continue; if (i < s->nelem && p.index == s->elems[i].index) s->elems[i].constraint |= p.constraint; else { t1 = p; ++s->nelem; while (i < s->nelem) { t2 = s->elems[i]; s->elems[i++] = t1; t1 = t2; } }}/* Merge two sets of positions into a third. The result is exactly as if the positions of both sets were inserted into an initially empty set. */static voidmerge (position_set const *s1, position_set const *s2, position_set *m){ int i = 0, j = 0; m->nelem = 0; while (i < s1->nelem && j < s2->nelem) if (s1->elems[i].index > s2->elems[j].index) m->elems[m->nelem++] = s1->elems[i++]; else if (s1->elems[i].index < s2->elems[j].index) m->elems[m->nelem++] = s2->elems[j++]; else { m->elems[m->nelem] = s1->elems[i++]; m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; } while (i < s1->nelem) m->elems[m->nelem++] = s1->elems[i++]; while (j < s2->nelem) m->elems[m->nelem++] = s2->elems[j++];}/* Delete a position from a set. */static voiddelete (position p, position_set *s){ int i; for (i = 0; i < s->nelem; ++i) if (p.index == s->elems[i].index) break; if (i < s->nelem) for (--s->nelem; i < s->nelem; ++i) s->elems[i] = s->elems[i + 1];}/* Find the index of the state corresponding to the given position set with the given preceding context, or create a new state if there is no such state. Newline and letter tell whether we got here on a newline or letter, respectively. */static intstate_index (struct dfa *d, position_set const *s, int newline, int letter){ int hash = 0; int constraint; int i, j; newline = newline ? 1 : 0; letter = letter ? 1 : 0; for (i = 0; i < s->nelem; ++i) hash ^= s->elems[i].index + s->elems[i].constraint; /* Try to find a state that exactly matches the proposed one. */ for (i = 0; i < d->sindex; ++i) { if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem || newline != d->states[i].newline || letter != d->states[i].letter) continue; for (j = 0; j < s->nelem; ++j) if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint || s->elems[j].index != d->states[i].elems.elems[j].index) break; if (j == s->nelem) return i; } /* We'll have to create a new state. */ REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); d->states[i].hash = hash; MALLOC(d->states[i].elems.elems, position, s->nelem); copy(s, &d->states[i].elems); d->states[i].newline = newline; d->states[i].letter = letter; d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0;#ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) d->states[i].mbps.nelem = 0;#endif for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { constraint = s->elems[j].constraint; if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) d->states[i].constraint |= constraint; if (! d->states[i].first_end) d->states[i].first_end = d->tokens[s->elems[j].index]; } else if (d->tokens[s->elems[j].index] == BACKREF) { d->states[i].constraint = NO_CONSTRAINT; d->states[i].backref = 1; } ++d->sindex; return i;}/* Find the epsilon closure of a set of positions. If any position of the set contains a symbol that matches the empty string in some context, replace that position with the elements of its follow labeled with an appropriate constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */static voidepsclosure (position_set *s, struct dfa const *d){ int i, j; int *visited; position p, old; MALLOC(visited, int, d->tindex); for (i = 0; i < d->tindex; ++i) visited[i] = 0; for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF#ifdef MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; p.constraint = old.constraint; delete(s->elems[i], s); if (visited[old.index]) { --i; continue; } visited[old.index] = 1; switch (d->tokens[old.index]) { case BEGLINE: p.constraint &= BEGLINE_CONSTRAINT; break; case ENDLINE: p.constraint &= ENDLINE_CONSTRAINT; break; case BEGWORD: p.constraint &= BEGWORD_CONSTRAINT; break; case ENDWORD: p.constraint &= ENDWORD_CONSTRAINT; break; case LIMWORD: p.constraint &= LIMWORD_CONSTRAINT; break; case NOTLIMWORD: p.constraint &= NOTLIMWORD_CONSTRAINT; break; default: break; } for (j = 0; j < d->follows[old.index].nelem; ++j) { p.index = d->follows[old.index].elems[j].index; insert(p, s); } /* Force rescan to start at the beginning. */ i = -1; } free(visited);}/* Perform bottom-up analysis on the parse tree, computing various functions. Note that at this point, we're pretending constructs like \< are real characters rather than constraints on what can follow them. Nullable: A node is nullable if it is at the root of a regexp that can match the empty string. * EMPTY leaves are nullable. * No other leaf is nullable. * A QMARK or STAR node is nullable. * A PLUS node is nullable if its argument is nullable. * A CAT node is nullable if both its arguments are nullable. * An OR node is nullable if either argument is nullable. Firstpos: The firstpos of a node is the set of positions (nonempty leaves) that could correspond to the first character of a string matching the regexp rooted at the given node. * EMPTY leaves have empty firstpos. * The firstpos of a nonempty leaf is that leaf itself. * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its argument. * The firstpos of a CAT node is the firstpos of the left argument, union the firstpos of the right if the left argument is nullable. * The firstpos of an OR node is the union of firstpos of each argument. Lastpos: The lastpos of a node is the set of positions that could correspond to the last character of a string matching the regexp at the given node. * EMPTY leaves have empty lastpos. * The lastpos of a nonempty leaf is that leaf itself. * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its argument. * The lastpos of a CAT node is the lastpos of its right argument, union the lastpos of the left if the right argument is nullable. * The lastpos of an OR node is the union of the lastpos of each argument. Follow: The follow of a position is the set of positions that could correspond to the character following a character matching the node in a string matching the regexp. At this point we consider special symbols that match the empty string in some context to be just normal characters. Later, if we find that a special symbol is in a follow set, we will replace it with the elements of its follow, labeled with an appropriate constraint. * Every node in the firstpos of the argument of a STAR or PLUS node is in the follow of every node in the lastpos. * Every node in the firstpos of the second argument of a CAT node is in the follow of every node in the lastpos of the first argument. Because of the postfix representation of the parse tree, the depth-first analysis is conveniently done by a linear scan with the aid of a stack. Sets are stored as arrays of the elements, obeying a stack-like allocation scheme; the number of elements in each set deeper in the stack can be used to determine the address of a particular set's array. */voiddfaanalyze (struct dfa *d, int searchflag){ int *nullable; /* Nullable stack. */ int *nfirstpos; /* Element count stack for firstpos sets. */ position *firstpos; /* Array where firstpos elements are stored. */ int *nlastpos; /* Element count stack for lastpos sets. */ position *lastpos; /* Array where lastpos elements are stored. */ int *nalloc; /* Sizes of arrays allocated to follow sets. */ position_set tmp; /* Temporary set for merging sets. */ position_set merged; /* Result of merging sets. */ int wants_newline; /* True if some position wants newline info. */ int *o_nullable; int *o_nfirst, *o_nlast; position *o_firstpos, *o_lastpos; int i, j; position *pos;#ifdef DEBUG fprintf(stderr, "dfaanalyze:\n"); for (i = 0; i < d->tindex; ++i) { fprintf(stderr, " %d:", i); prtok(d->tokens[i]); } putc('\n', stderr);#endif d->searchflag = searchflag; MALLOC(nullable, int, d->depth); o_nullable = nullable; MALLOC(nfirstpos, int, d->depth); o_nfirst = nfirstpos; MALLOC(firstpos, position, d->nleaves); o_firstpos = firstpos, firstpos += d->nleaves; MALLOC(nlastpos, int, d->depth); o_nlast = nlastpos; MALLOC(lastpos, position, d->nleaves); o_lastpos = lastpos, lastpos += d->nleaves; MALLOC(nalloc, int, d->tindex); for (i = 0; i < d->tindex; ++i) nalloc[i] = 0; MALLOC(merged.elems, position, d->nleaves); CALLOC(d->follows, position_set, d->tindex); for (i = 0; i < d->tindex; ++i)#ifdef DEBUG { /* Nonsyntactic #ifdef goo... */#endif switch (d->tokens[i]) { case EMPTY: /* The empty set is nullable. */ *nullable++ = 1; /* The firstpos and lastpos of the empty leaf are both empty. */ *nfirstpos++ = *nlastpos++ = 0; break; case STAR: case PLUS: /* Every element in the firstpos of the argument is in the follow of every element in the lastpos. */ tmp.nelem = nfirstpos[-1]; tmp.elems = firstpos; pos = lastpos; for (j = 0; j < nlastpos[-1]; ++j) { merge(&tmp, &d->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, nalloc[pos[j].index], merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } case QMARK: /* A QMARK or STAR node is automatically nullable. */ if (d->tokens[i] != PLUS) nullable[-1] = 1; break; case CAT: /* Every element in the firstpos of the second argument is in the follow of every element in the lastpos of the first argument. */ tmp.nelem = nfirstpos[-1]; tmp.elems = firstpos; pos = lastpos + nlastpos[-1]; for (j = 0; j < nlastpos[-2]; ++j) { merge(&tmp, &d->follows[pos[j].index], &merged); REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, nalloc[pos[j].index], merged.nelem - 1); copy(&merged, &d->follows[pos[j].index]); } /* The firstpos of a CAT node is the firstpos of the first argument, union that of the second argument if the first is nullable. */ if (nullable[-2]) nfirstpos[-2] += nfirstpos[-1]; else firstpos += nfirstpos[-1]; --nfirstpos; /* The lastpos of a CAT node is the lastpos of the second argument, union that of the first argument if the second is nullable. */ if (nullable[-1]) nlastpos[-2] += nlastpos[-1]; else { pos = lastpos + nlastpos[-2]; for (j = nlastpos[-1] - 1; j >= 0; --j) pos[j] = lastpos[j]; lastpos += nlastpos[-2]; nlastpos[-2] = nlastpos[-1]; } --nlastpos; /* A CAT node is nullable if both arguments are nullable. */ nullable[-2] = nullable[-1] && nullable[-2]; --nullable; break; case OR: case ORTOP: /* The firstpos is the union of the firstpos of each argument. */ nfirstpos[-2] += nfirstpos[-1]; --nfirstpos; /* The lastpos is the union of the lastpos of each argument. */ nlastpos[-2] += nlastpos[-1]; --nlastpos; /* An OR node is nullable if either argument is nullable. */ nullable[-2] = nullable[-1] || nullable[-2]; --nullable; break; default: /* Anything else is a nonempty position. (Note that special constructs like \< are treated as nonempty strings here; an "epsilon closure" effectively makes them nullable later. Backreferences have to get a real position so we can detect transitions on them later. But they are nullable. */ *nullable++ = d->tokens[i] == BACKREF; /* This position is in its own firstpos and lastpos. */ *nfirstpos++ = *nlastpos++ = 1; --firstpos, --lastpos; firstpos->index = lastpos->index = i; firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; /* Allocate the follow set for this position. */ nalloc[i] = 1; MALLOC(d->follows[i].elems, position, nalloc[i]); break; }#ifdef DEBUG /* ... balance the above nonsyntactic #ifdef goo... */ fprintf(stderr, "node %d:", i); prtok(d->tokens[i]); putc('\n', stderr); fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); fprintf(stderr, " firstpos:"); for (j = nfirstpos[-1] - 1; j >= 0; --j) { fprintf(stderr, " %d:", firstpos[j].index); prtok(d->tokens[firstpos[j].index]); } fprintf(stderr, "\n lastpos:"); for (j = nlastpos[-1] - 1; j >= 0; --j) { fprintf(stderr, " %d:", lastpos[j].index); prtok(d->tokens[lastpos[j].index]); } putc('\n', stderr); }#endif /* For each follow set that is the follow set of a real position, replace it with its epsilon closure. */ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF#ifdef MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET#endif || d->tokens[i] >= CSET) {#ifdef DEBUG fprintf(stderr, "follows(%d:", i); prtok(d->tokens[i]); fprintf(stderr, "):"); for (j = d->follows[i].nelem - 1; j >= 0; --j) { fprintf(stderr, " %d:", d->follows[i].elems[j].index);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -