📄 regcomp.c
字号:
rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); /* and ^* and \A* too -- not always necessary, but harmless */ newarc(nfa, PLAIN, nfa->bos[0], pre, pre); newarc(nfa, PLAIN, nfa->bos[1], pre, pre); } /* * Now here's the subtle part. Because many REs have no lookback * constraints, often knowing when you were in the pre state tells you * little; it's the next state(s) that are informative. But some of them * may have other inarcs, i.e. it may be possible to make actual progress * and then return to one of them. We must de-optimize such cases, * splitting each such state into progress and no-progress states. */ /* first, make a list of the states */ slist = NULL; for (a = pre->outs; a != NULL; a = a->outchain) { s = a->to; for (b = s->ins; b != NULL; b = b->inchain) if (b->from != pre) break; if (b != NULL) { /* must be split */ if (s->tmp == NULL) { /* if not already in the list */ /* (fixes bugs 505048, 230589, */ /* 840258, 504785) */ s->tmp = slist; slist = s; } } } /* do the splits */ for (s = slist; s != NULL; s = s2) { s2 = newstate(nfa); copyouts(nfa, s, s2); for (a = s->ins; a != NULL; a = b) { b = a->inchain; if (a->from != pre) { cparc(nfa, a, a->from, s2); freearc(nfa, a); } } s2 = s->tmp; s->tmp = NULL; /* clean up while we're at it */ }}/* * parse - parse an RE * * This is actually just the top level, which parses a bunch of branches * tied together with '|'. They appear in the tree as the left children * of a chain of '|' subres. */static struct subre *parse(struct vars * v, int stopper, /* EOS or ')' */ int type, /* LACON (lookahead subRE) or PLAIN */ struct state * init, /* initial state */ struct state * final) /* final state */{ struct state *left; /* scaffolding for branch */ struct state *right; struct subre *branches; /* top level */ struct subre *branch; /* current branch */ struct subre *t; /* temporary */ int firstbranch; /* is this the first branch? */ assert(stopper == ')' || stopper == EOS); branches = subre(v, '|', LONGER, init, final); NOERRN(); branch = branches; firstbranch = 1; do { /* a branch */ if (!firstbranch) { /* need a place to hang it */ branch->right = subre(v, '|', LONGER, init, final); NOERRN(); branch = branch->right; } firstbranch = 0; left = newstate(v->nfa); right = newstate(v->nfa); NOERRN(); EMPTYARC(init, left); EMPTYARC(right, final); NOERRN(); branch->left = parsebranch(v, stopper, type, left, right, 0); NOERRN(); branch->flags |= UP(branch->flags | branch->left->flags); if ((branch->flags & ~branches->flags) != 0) /* new flags */ for (t = branches; t != branch; t = t->right) t->flags |= branch->flags; } while (EAT('|')); assert(SEE(stopper) || SEE(EOS)); if (!SEE(stopper)) { assert(stopper == ')' && SEE(EOS)); ERR(REG_EPAREN); } /* optimize out simple cases */ if (branch == branches) { /* only one branch */ assert(branch->right == NULL); t = branch->left; branch->left = NULL; freesubre(v, branches); branches = t; } else if (!MESSY(branches->flags)) { /* no interesting innards */ freesubre(v, branches->left); branches->left = NULL; freesubre(v, branches->right); branches->right = NULL; branches->op = '='; } return branches;}/* * parsebranch - parse one branch of an RE * * This mostly manages concatenation, working closely with parseqatom(). * Concatenated things are bundled up as much as possible, with separate * ',' nodes introduced only when necessary due to substructure. */static struct subre *parsebranch(struct vars * v, int stopper, /* EOS or ')' */ int type, /* LACON (lookahead subRE) or PLAIN */ struct state * left, /* leftmost state */ struct state * right, /* rightmost state */ int partial) /* is this only part of a branch? */{ struct state *lp; /* left end of current construct */ int seencontent; /* is there anything in this branch yet? */ struct subre *t; lp = left; seencontent = 0; t = subre(v, '=', 0, left, right); /* op '=' is tentative */ NOERRN(); while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) { if (seencontent) { /* implicit concat operator */ lp = newstate(v->nfa); NOERRN(); moveins(v->nfa, right, lp); } seencontent = 1; /* NB, recursion in parseqatom() may swallow rest of branch */ parseqatom(v, stopper, type, lp, right, t); } if (!seencontent) { /* empty branch */ if (!partial) NOTE(REG_UUNSPEC); assert(lp == left); EMPTYARC(left, right); } return t;}/* * parseqatom - parse one quantified atom or constraint of an RE * * The bookkeeping near the end cooperates very closely with parsebranch(); * in particular, it contains a recursion that can involve parsing the rest * of the branch, making this function's name somewhat inaccurate. */static voidparseqatom(struct vars * v, int stopper, /* EOS or ')' */ int type, /* LACON (lookahead subRE) or PLAIN */ struct state * lp, /* left state to hang it on */ struct state * rp, /* right state to hang it on */ struct subre * top) /* subtree top */{ struct state *s; /* temporaries for new states */ struct state *s2;#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) int m, n; struct subre *atom; /* atom's subtree */ struct subre *t; int cap; /* capturing parens? */ int pos; /* positive lookahead? */ int subno; /* capturing-parens or backref number */ int atomtype; int qprefer; /* quantifier short/long preference */ int f; struct subre **atomp; /* where the pointer to atom is */ /* initial bookkeeping */ atom = NULL; assert(lp->nouts == 0); /* must string new code */ assert(rp->nins == 0); /* between lp and rp */ subno = 0; /* just to shut lint up */ /* an atom or constraint... */ atomtype = v->nexttype; switch (atomtype) { /* first, constraints, which end by returning */ case '^': ARCV('^', 1); if (v->cflags & REG_NLANCH) ARCV(BEHIND, v->nlcolor); NEXT(); return; break; case '$': ARCV('$', 1); if (v->cflags & REG_NLANCH) ARCV(AHEAD, v->nlcolor); NEXT(); return; break; case SBEGIN: ARCV('^', 1); /* BOL */ ARCV('^', 0); /* or BOS */ NEXT(); return; break; case SEND: ARCV('$', 1); /* EOL */ ARCV('$', 0); /* or EOS */ NEXT(); return; break; case '<': wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); return; break; case '>': wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; break; case WBDRY: wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; break; case NWBDRY: wordchrs(v); /* does NEXT() */ s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); word(v, AHEAD, s, rp); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); return; break; case LACON: /* lookahead constraint */ pos = v->nextvalue; NEXT(); s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); t = parse(v, ')', LACON, s, s2); freesubre(v, t); /* internal structure irrelevant */ assert(SEE(')') || ISERR()); NEXT(); n = newlacon(v, s, s2, pos); NOERR(); ARCV(LACON, n); return; break; /* then errors, to get them out of the way */ case '*': case '+': case '?': case '{': ERR(REG_BADRPT); return; break; default: ERR(REG_ASSERT); return; break; /* then plain characters, and minor variants on that theme */ case ')': /* unbalanced paren */ if ((v->cflags & REG_ADVANCED) != REG_EXTENDED) { ERR(REG_EPAREN); return; } /* legal in EREs due to specification botch */ NOTE(REG_UPBOTCH); /* fallthrough into case PLAIN */ case PLAIN: onechr(v, v->nextvalue, lp, rp); okcolors(v->nfa, v->cm); NOERR(); NEXT(); break; case '[': if (v->nextvalue == 1) bracket(v, lp, rp); else cbracket(v, lp, rp); assert(SEE(']') || ISERR()); NEXT(); break; case '.': rainbow(v->nfa, v->cm, PLAIN, (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp); NEXT(); break; /* and finally the ugly stuff */ case '(': /* value flags as capturing or non */ cap = (type == LACON) ? 0 : v->nextvalue; if (cap) { v->nsubexp++; subno = v->nsubexp; if ((size_t) subno >= v->nsubs) moresubs(v, subno); assert((size_t) subno < v->nsubs); } else atomtype = PLAIN; /* something that's not '(' */ NEXT(); /* need new endpoints because tree will contain pointers */ s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); EMPTYARC(lp, s); EMPTYARC(s2, rp); NOERR(); atom = parse(v, ')', PLAIN, s, s2); assert(SEE(')') || ISERR()); NEXT(); NOERR(); if (cap) { v->subs[subno] = atom; t = subre(v, '(', atom->flags | CAP, lp, rp); NOERR(); t->subno = subno; t->left = atom; atom = t; } /* postpone everything else pending possible {0} */ break; case BACKREF: /* the Feature From The Black Lagoon */ INSIST(type != LACON, REG_ESUBREG); INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); NOERR(); assert(v->nextvalue > 0); atom = subre(v, 'b', BACKR, lp, rp); subno = v->nextvalue; atom->subno = subno; EMPTYARC(lp, rp); /* temporarily, so there's something */ NEXT(); break; } /* ...and an atom may be followed by a quantifier */ switch (v->nexttype) { case '*': m = 0; n = INFINITY; qprefer = (v->nextvalue) ? LONGER : SHORTER; NEXT(); break; case '+': m = 1; n = INFINITY; qprefer = (v->nextvalue) ? LONGER : SHORTER; NEXT(); break; case '?': m = 0; n = 1; qprefer = (v->nextvalue) ? LONGER : SHORTER; NEXT(); break; case '{': NEXT(); m = scannum(v); if (EAT(',')) { if (SEE(DIGIT)) n = scannum(v); else n = INFINITY; if (m > n) { ERR(REG_BADBR); return; } /* {m,n} exercises preference, even if it's {m,m} */ qprefer = (v->nextvalue) ? LONGER : SHORTER; } else { n = m; /* {m} passes operand's preference through */ qprefer = 0; } if (!SEE('}')) { /* catches errors too */ ERR(REG_BADBR); return; } NEXT(); break; default: /* no quantifier */ m = n = 1; qprefer = 0; break; } /* annoying special case: {0} or {0,0} cancels everything */ if (m == 0 && n == 0) { if (atom != NULL) freesubre(v, atom); if (atomtype == '(') v->subs[subno] = NULL; delsub(v->nfa, lp, rp); EMPTYARC(lp, rp); return; } /* if not a messy case, avoid hard part */ assert(!MESSY(top->flags)); f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) { if (!(m == 1 && n == 1)) repeat(v, lp, rp, m, n); if (atom != NULL) freesubre(v, atom); top->flags = f; return; } /* * hard part: something messy That is, capturing parens, back reference, * short/long clash, or an atom with substructure containing one of those. */ /* now we'll need a subre for the contents even if they're boring */ if (atom == NULL) { atom = subre(v, '=', 0, lp, rp); NOERR(); } /* * prepare a general-purpose state skeleton * * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / / * [lp] ----> [s2] ----bypass--------------------- * * where bypass is an empty, and prefix is some repetitions of atom */ s = newstate(v->nfa); /* first, new endpoints for the atom */ s2 = newstate(v->nfa); NOERR(); moveouts(v->nfa, lp, s); moveins(v->nfa, rp, s2); NOERR(); atom->begin = s; atom->end = s2; s = newstate(v->nfa); /* and spots for prefix and bypass */ s2 = newstate(v->nfa); NOERR(); EMPTYARC(lp, s); EMPTYARC(lp, s2); NOERR(); /* break remaining subRE into x{...} and what follows */ t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); t->left = atom; atomp = &t->left; /* here we should recurse... but we must postpone that to the end */ /* split top into prefix and remaining */ assert(top->op == '=' && top->left == NULL && top->right == NULL); top->left = subre(v, '=', top->flags, top->begin, lp); top->op = '.'; top->right = t; /* if it's a backref, now is the time to replicate the subNFA */ if (atomtype == BACKREF) { assert(atom->begin->nouts == 1); /* just the EMPTY */ delsub(v->nfa, atom->begin, atom->end); assert(v->subs[subno] != NULL); /* and here's why the recursion got postponed: it must */ /* wait until the skeleton is filled in, because it may */ /* hit a backref that wants to copy the filled-in skeleton */ dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, atom->begin, atom->end); NOERR(); } /* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */ if (m == 0) { EMPTYARC(s2, atom->end); /* the bypass */ assert(PREF(qprefer) != 0); f = COMBINE(qprefer, atom->flags); t = subre(v, '|', f, lp, atom->end); NOERR(); t->left = atom; t->right = subre(v, '|', PREF(f), s2, atom->end); NOERR(); t->right->left = subre(v, '=', 0, s2, atom->end); NOERR(); *atomp = t; atomp = &t->left; m = 1; } /* deal with the rest of the quantifier */ if (atomtype == BACKREF) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -