regcomp.c

来自「tcl是工具命令语言」· C语言 代码 · 共 2,176 行 · 第 1/4 页

C
2,176
字号
	 * and no-progress states.	 */	/* first, make a list of the states */	slist = NULL;	for (a = pre->outs; a != NULL; a = a->outchain) {		s = a->to;		for (b = s->ins; b != NULL; b = b->inchain)			if (b->from != pre)				break;		if (b != NULL) {		/* must be split */			s->tmp = slist;			slist = s;		}	}	/* do the splits */	for (s = slist; s != NULL; s = s2) {		s2 = newstate(nfa);		copyouts(nfa, s, s2);		for (a = s->ins; a != NULL; a = b) {			b = a->inchain;			if (a->from != pre) {				cparc(nfa, a, a->from, s2);				freearc(nfa, a);			}		}		s2 = s->tmp;		s->tmp = NULL;		/* clean up while we're at it */	}}/* - parse - parse an RE * This is actually just the top level, which parses a bunch of branches * tied together with '|'.  They appear in the tree as the left children * of a chain of '|' subres. ^ static struct subre *parse(struct vars *, int, int, struct state *, ^ 	struct state *); */static struct subre *parse(v, stopper, type, init, final)struct vars *v;int stopper;			/* EOS or ')' */int type;			/* LACON (lookahead subRE) or PLAIN */struct state *init;		/* initial state */struct state *final;		/* final state */{	struct state *left;	/* scaffolding for branch */	struct state *right;	struct subre *branches;	/* top level */	struct subre *branch;	/* current branch */	struct subre *t;	/* temporary */	int firstbranch;	/* is this the first branch? */	assert(stopper == ')' || stopper == EOS);	branches = subre(v, '|', LONGER, init, final);	NOERRN();	branch = branches;	firstbranch = 1;	do {	/* a branch */		if (!firstbranch) {			/* need a place to hang it */			branch->right = subre(v, '|', LONGER, init, final);			NOERRN();			branch = branch->right;		}		firstbranch = 0;		left = newstate(v->nfa);		right = newstate(v->nfa);		NOERRN();		EMPTYARC(init, left);		EMPTYARC(right, final);		NOERRN();		branch->left = parsebranch(v, stopper, type, left, right, 0);		NOERRN();		branch->flags |= UP(branch->flags | branch->left->flags);		if ((branch->flags &~ branches->flags) != 0)	/* new flags */			for (t = branches; t != branch; t = t->right)				t->flags |= branch->flags;	} while (EAT('|'));	assert(SEE(stopper) || SEE(EOS));	if (!SEE(stopper)) {		assert(stopper == ')' && SEE(EOS));		ERR(REG_EPAREN);	}	/* optimize out simple cases */	if (branch == branches) {	/* only one branch */		assert(branch->right == NULL);		t = branch->left;		branch->left = NULL;		freesubre(v, branches);		branches = t;	} else if (!MESSY(branches->flags)) {	/* no interesting innards */		freesubre(v, branches->left);		branches->left = NULL;		freesubre(v, branches->right);		branches->right = NULL;		branches->op = '=';	}	return branches;}/* - parsebranch - parse one branch of an RE * This mostly manages concatenation, working closely with parseqatom(). * Concatenated things are bundled up as much as possible, with separate * ',' nodes introduced only when necessary due to substructure. ^ static struct subre *parsebranch(struct vars *, int, int, struct state *, ^ 	struct state *, int); */static struct subre *parsebranch(v, stopper, type, left, right, partial)struct vars *v;int stopper;			/* EOS or ')' */int type;			/* LACON (lookahead subRE) or PLAIN */struct state *left;		/* leftmost state */struct state *right;		/* rightmost state */int partial;			/* is this only part of a branch? */{	struct state *lp;	/* left end of current construct */	int seencontent;	/* is there anything in this branch yet? */	struct subre *t;	lp = left;	seencontent = 0;	t = subre(v, '=', 0, left, right);	/* op '=' is tentative */	NOERRN();	while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {		if (seencontent) {	/* implicit concat operator */			lp = newstate(v->nfa);			NOERRN();			moveins(v->nfa, right, lp);		}		seencontent = 1;		/* NB, recursion in parseqatom() may swallow rest of branch */		parseqatom(v, stopper, type, lp, right, t);	}	if (!seencontent) {		/* empty branch */		if (!partial)			NOTE(REG_UUNSPEC);		assert(lp == left);		EMPTYARC(left, right);	}	return t;}/* - parseqatom - parse one quantified atom or constraint of an RE * The bookkeeping near the end cooperates very closely with parsebranch(); * in particular, it contains a recursion that can involve parsing the rest * of the branch, making this function's name somewhat inaccurate. ^ static VOID parseqatom(struct vars *, int, int, struct state *, ^ 	struct state *, struct subre *); */static VOIDparseqatom(v, stopper, type, lp, rp, top)struct vars *v;int stopper;			/* EOS or ')' */int type;			/* LACON (lookahead subRE) or PLAIN */struct state *lp;		/* left state to hang it on */struct state *rp;		/* right state to hang it on */struct subre *top;		/* subtree top */{	struct state *s;	/* temporaries for new states */	struct state *s2;#	define	ARCV(t, val)	newarc(v->nfa, t, val, lp, rp)	int m, n;	struct subre *atom;	/* atom's subtree */	struct subre *t;	int cap;		/* capturing parens? */	int pos;		/* positive lookahead? */	int subno;		/* capturing-parens or backref number */	int atomtype;	int qprefer;		/* quantifier short/long preference */	int f;	struct subre **atomp;	/* where the pointer to atom is */	/* initial bookkeeping */	atom = NULL;	assert(lp->nouts == 0);	/* must string new code */	assert(rp->nins == 0);	/*  between lp and rp */	subno = 0;		/* just to shut lint up */	/* an atom or constraint... */	atomtype = v->nexttype;	switch (atomtype) {	/* first, constraints, which end by returning */	case '^':		ARCV('^', 1);		if (v->cflags&REG_NLANCH)			ARCV(BEHIND, v->nlcolor);		NEXT();		return;		break;	case '$':		ARCV('$', 1);		if (v->cflags&REG_NLANCH)			ARCV(AHEAD, v->nlcolor);		NEXT();		return;		break;	case SBEGIN:		ARCV('^', 1);	/* BOL */		ARCV('^', 0);	/* or BOS */		NEXT();		return;		break;	case SEND:		ARCV('$', 1);	/* EOL */		ARCV('$', 0);	/* or EOS */		NEXT();		return;		break;	case '<':		wordchrs(v);	/* does NEXT() */		s = newstate(v->nfa);		NOERR();		nonword(v, BEHIND, lp, s);		word(v, AHEAD, s, rp);		return;		break;	case '>':		wordchrs(v);	/* does NEXT() */		s = newstate(v->nfa);		NOERR();		word(v, BEHIND, lp, s);		nonword(v, AHEAD, s, rp);		return;		break;	case WBDRY:		wordchrs(v);	/* does NEXT() */		s = newstate(v->nfa);		NOERR();		nonword(v, BEHIND, lp, s);		word(v, AHEAD, s, rp);		s = newstate(v->nfa);		NOERR();		word(v, BEHIND, lp, s);		nonword(v, AHEAD, s, rp);		return;		break;	case NWBDRY:		wordchrs(v);	/* does NEXT() */		s = newstate(v->nfa);		NOERR();		word(v, BEHIND, lp, s);		word(v, AHEAD, s, rp);		s = newstate(v->nfa);		NOERR();		nonword(v, BEHIND, lp, s);		nonword(v, AHEAD, s, rp);		return;		break;	case LACON:	/* lookahead constraint */		pos = v->nextvalue;		NEXT();		s = newstate(v->nfa);		s2 = newstate(v->nfa);		NOERR();		t = parse(v, ')', LACON, s, s2);		freesubre(v, t);	/* internal structure irrelevant */		assert(SEE(')') || ISERR());		NEXT();		n = newlacon(v, s, s2, pos);		NOERR();		ARCV(LACON, n);		return;		break;	/* then errors, to get them out of the way */	case '*':	case '+':	case '?':	case '{':		ERR(REG_BADRPT);		return;		break;	default:		ERR(REG_ASSERT);		return;		break;	/* then plain characters, and minor variants on that theme */	case ')':		/* unbalanced paren */		if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {			ERR(REG_EPAREN);			return;		}		/* legal in EREs due to specification botch */		NOTE(REG_UPBOTCH);		/* fallthrough into case PLAIN */	case PLAIN:		onechr(v, v->nextvalue, lp, rp);		okcolors(v->nfa, v->cm);		NOERR();		NEXT();		break;	case '[':		if (v->nextvalue == 1)			bracket(v, lp, rp);		else			cbracket(v, lp, rp);		assert(SEE(']') || ISERR());		NEXT();		break;	case '.':		rainbow(v->nfa, v->cm, PLAIN,				(v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,				lp, rp);		NEXT();		break;	/* and finally the ugly stuff */	case '(':	/* value flags as capturing or non */		cap = (type == LACON) ? 0 : v->nextvalue;		if (cap) {			v->nsubexp++;			subno = v->nsubexp;			if ((size_t)subno >= v->nsubs)				moresubs(v, subno);			assert((size_t)subno < v->nsubs);		} else			atomtype = PLAIN;	/* something that's not '(' */		NEXT();		/* need new endpoints because tree will contain pointers */		s = newstate(v->nfa);		s2 = newstate(v->nfa);		NOERR();		EMPTYARC(lp, s);		EMPTYARC(s2, rp);		NOERR();		atom = parse(v, ')', PLAIN, s, s2);		assert(SEE(')') || ISERR());		NEXT();		NOERR();		if (cap) {			v->subs[subno] = atom;			t = subre(v, '(', atom->flags|CAP, lp, rp);			NOERR();			t->subno = subno;			t->left = atom;			atom = t;		}		/* postpone everything else pending possible {0} */		break;	case BACKREF:	/* the Feature From The Black Lagoon */		INSIST(type != LACON, REG_ESUBREG);		INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);		INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);		NOERR();		assert(v->nextvalue > 0);		atom = subre(v, 'b', BACKR, lp, rp);		subno = v->nextvalue;		atom->subno = subno;		EMPTYARC(lp, rp);	/* temporarily, so there's something */		NEXT();		break;	}	/* ...and an atom may be followed by a quantifier */	switch (v->nexttype) {	case '*':		m = 0;		n = INFINITY;		qprefer = (v->nextvalue) ? LONGER : SHORTER;		NEXT();		break;	case '+':		m = 1;		n = INFINITY;		qprefer = (v->nextvalue) ? LONGER : SHORTER;		NEXT();		break;	case '?':		m = 0;		n = 1;		qprefer = (v->nextvalue) ? LONGER : SHORTER;		NEXT();		break;	case '{':		NEXT();		m = scannum(v);		if (EAT(',')) {			if (SEE(DIGIT))				n = scannum(v);			else				n = INFINITY;			if (m > n) {				ERR(REG_BADBR);				return;			}			/* {m,n} exercises preference, even if it's {m,m} */			qprefer = (v->nextvalue) ? LONGER : SHORTER;		} else {			n = m;			/* {m} passes operand's preference through */			qprefer = 0;		}		if (!SEE('}')) {	/* catches errors too */			ERR(REG_BADBR);			return;		}		NEXT();		break;	default:		/* no quantifier */		m = n = 1;		qprefer = 0;		break;	}	/* annoying special case:  {0} or {0,0} cancels everything */	if (m == 0 && n == 0) {		if (atom != NULL)			freesubre(v, atom);		if (atomtype == '(')			v->subs[subno] = NULL;		delsub(v->nfa, lp, rp);		EMPTYARC(lp, rp);		return;	}	/* if not a messy case, avoid hard part */	assert(!MESSY(top->flags));	f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {		if (!(m == 1 && n == 1))			repeat(v, lp, rp, m, n);		if (atom != NULL)			freesubre(v, atom);		top->flags = f;		return;	}	/*	 * hard part:  something messy	 * That is, capturing parens, back reference, short/long clash, or	 * an atom with substructure containing one of those.	 */	/* now we'll need a subre for the contents even if they're boring */	if (atom == NULL) {		atom = subre(v, '=', 0, lp, rp);		NOERR();	}	/*	 * prepare a general-purpose state skeleton	 *	 *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]	 *   /                                            /	 * [lp] ----> [s2] ----bypass---------------------	 *	 * where bypass is an empty, and prefix is some repetitions of atom	 */	s = newstate(v->nfa);		/* first, new endpoints for the atom */	s2 = newstate(v->nfa);	NOERR();	moveouts(v->nfa, lp, s);	moveins(v->nfa, rp, s2);	NOERR();	atom->begin = s;	atom->end = s2;	s = newstate(v->nfa);		/* and spots for prefix and bypass */	s2 = newstate(v->nfa);	NOERR();	EMPTYARC(lp, s);	EMPTYARC(lp, s2);	NOERR();	/* break remaining subRE into x{...} and what follows */	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);	t->left = atom;	atomp = &t->left;	/* here we should recurse... but we must postpone that to the end */	/* split top into prefix and remaining */	assert(top->op == '=' && top->left == NULL && top->right == NULL);	top->left = subre(v, '=', top->flags, top->begin, lp);	top->op = '.';	top->right = t;	/* if it's a backref, now is the time to replicate the subNFA */	if (atomtype == BACKREF) {		assert(atom->begin->nouts == 1);	/* just the EMPTY */		delsub(v->nfa, atom->begin, atom->end);		assert(v->subs[subno] != NULL);		/* and here's why the recursion got postponed:  it must */		/* wait until the skeleton is filled in, because it may */		/* hit a backref that wants to copy the filled-in skeleton */		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,						atom->begin, atom->end);		NOERR();	}	/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */	if (m == 0) {		EMPTYARC(s2, atom->end);		/* the bypass */		assert(PREF(qprefer) != 0);		f = COMBINE(qprefer, atom->flags);		t = subre(v, '|', f, lp, atom->end);		NOERR();		t->left = atom;		t->right = subre(v, '|', PREF(f), s2, atom->end);		NOERR();		t->right->left = subre(v, '=', 0, s2, atom->end);		NOERR();		*atomp = t;		atomp = &t->left;		m = 1;	}	/* deal with the rest of the quantifier */	if (atomtype == BACKREF) {		/* special case:  backrefs have internal quantifiers */		EMPTYARC(s, atom->begin);	/* empty prefix */		/* just stuff everything into atom */		repeat(v, atom->begin, atom->end, m, n);		atom->min = (short)m;		atom->max = (short)n;		atom->flags |= COMBINE(qprefer, atom->flags);	} else if (m == 1 && n == 1) {		/* no/vacuous quantifier:  done */		EMPTYARC(s, atom->begin);	/* empty prefix */	} else {		/* turn x{m,n} into x{m-1,n-1}x, with capturing */		/*  parens in only second x */		dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);		assert(m >= 1 && m != INFINITY && n >= 1);		repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);		f = COMBINE(qprefer, atom->flags);		t = subre(v, '.', f, s, atom->end);	/* prefix and atom */		NOERR();		t->left = subre(v, '=', PREF(f), s, atom->begin);		NOERR();		t->right = atom;		*atomp = t;	}	/* and finally, look after that postponed recursion */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?