regcomp.c

来自「从一个开源软件中摘取的正则表达式模块」· C语言代码 · 共 2,243 行 · 第 1/4 页
2,243 行
		rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);		/* and ^* and \A* too -- not always necessary, but harmless */		newarc(nfa, PLAIN, nfa->bos[0], pre, pre);		newarc(nfa, PLAIN, nfa->bos[1], pre, pre);	}	/*	 * Now here's the subtle part.  Because many REs have no lookback	 * constraints, often knowing when you were in the pre state tells you	 * little; it's the next state(s) that are informative.  But some of them	 * may have other inarcs, i.e. it may be possible to make actual progress	 * and then return to one of them.	We must de-optimize such cases,	 * splitting each such state into progress and no-progress states.	 */	/* first, make a list of the states */	slist = NULL;	for (a = pre->outs; a != NULL; a = a->outchain)	{		s = a->to;		for (b = s->ins; b != NULL; b = b->inchain)			if (b->from != pre)				break;		if (b != NULL)		{						/* must be split */			if (s->tmp == NULL)			{					/* if not already in the list */				/* (fixes bugs 505048, 230589, */				/* 840258, 504785) */				s->tmp = slist;				slist = s;			}		}	}	/* do the splits */	for (s = slist; s != NULL; s = s2)	{		s2 = newstate(nfa);		copyouts(nfa, s, s2);		for (a = s->ins; a != NULL; a = b)		{			b = a->inchain;			if (a->from != pre)			{				cparc(nfa, a, a->from, s2);				freearc(nfa, a);			}		}		s2 = s->tmp;		s->tmp = NULL;			/* clean up while we're at it */	}}/* * parse - parse an RE * * This is actually just the top level, which parses a bunch of branches * tied together with '|'.	They appear in the tree as the left children * of a chain of '|' subres. */static struct subre *parse(struct vars * v,	  int stopper,				/* EOS or ')' */	  int type,					/* LACON (lookahead subRE) or PLAIN */	  struct state * init,		/* initial state */	  struct state * final)		/* final state */{	struct state *left;			/* scaffolding for branch */	struct state *right;	struct subre *branches;		/* top level */	struct subre *branch;		/* current branch */	struct subre *t;			/* temporary */	int			firstbranch;	/* is this the first branch? */	assert(stopper == ')' || stopper == EOS);	branches = subre(v, '|', LONGER, init, final);	NOERRN();	branch = branches;	firstbranch = 1;	do	{							/* a branch */		if (!firstbranch)		{			/* need a place to hang it */			branch->right = subre(v, '|', LONGER, init, final);			NOERRN();			branch = branch->right;		}		firstbranch = 0;		left = newstate(v->nfa);		right = newstate(v->nfa);		NOERRN();		EMPTYARC(init, left);		EMPTYARC(right, final);		NOERRN();		branch->left = parsebranch(v, stopper, type, left, right, 0);		NOERRN();		branch->flags |= UP(branch->flags | branch->left->flags);		if ((branch->flags & ~branches->flags) != 0)	/* new flags */			for (t = branches; t != branch; t = t->right)				t->flags |= branch->flags;	} while (EAT('|'));	assert(SEE(stopper) || SEE(EOS));	if (!SEE(stopper))	{		assert(stopper == ')' && SEE(EOS));		ERR(REG_EPAREN);	}	/* optimize out simple cases */	if (branch == branches)	{							/* only one branch */		assert(branch->right == NULL);		t = branch->left;		branch->left = NULL;		freesubre(v, branches);		branches = t;	}	else if (!MESSY(branches->flags))	{							/* no interesting innards */		freesubre(v, branches->left);		branches->left = NULL;		freesubre(v, branches->right);		branches->right = NULL;		branches->op = '=';	}	return branches;}/* * parsebranch - parse one branch of an RE * * This mostly manages concatenation, working closely with parseqatom(). * Concatenated things are bundled up as much as possible, with separate * ',' nodes introduced only when necessary due to substructure. */static struct subre *parsebranch(struct vars * v,			int stopper,		/* EOS or ')' */			int type,			/* LACON (lookahead subRE) or PLAIN */			struct state * left,	/* leftmost state */			struct state * right,		/* rightmost state */			int partial)		/* is this only part of a branch? */{	struct state *lp;			/* left end of current construct */	int			seencontent;	/* is there anything in this branch yet? */	struct subre *t;	lp = left;	seencontent = 0;	t = subre(v, '=', 0, left, right);	/* op '=' is tentative */	NOERRN();	while (!SEE('|') && !SEE(stopper) && !SEE(EOS))	{		if (seencontent)		{						/* implicit concat operator */			lp = newstate(v->nfa);			NOERRN();			moveins(v->nfa, right, lp);		}		seencontent = 1;		/* NB, recursion in parseqatom() may swallow rest of branch */		parseqatom(v, stopper, type, lp, right, t);	}	if (!seencontent)	{							/* empty branch */		if (!partial)			NOTE(REG_UUNSPEC);		assert(lp == left);		EMPTYARC(left, right);	}	return t;}/* * parseqatom - parse one quantified atom or constraint of an RE * * The bookkeeping near the end cooperates very closely with parsebranch(); * in particular, it contains a recursion that can involve parsing the rest * of the branch, making this function's name somewhat inaccurate. */static voidparseqatom(struct vars * v,		   int stopper,			/* EOS or ')' */		   int type,			/* LACON (lookahead subRE) or PLAIN */		   struct state * lp,	/* left state to hang it on */		   struct state * rp,	/* right state to hang it on */		   struct subre * top)	/* subtree top */{	struct state *s;			/* temporaries for new states */	struct state *s2;#define  ARCV(t, val)	 newarc(v->nfa, t, val, lp, rp)	int			m,				n;	struct subre *atom;			/* atom's subtree */	struct subre *t;	int			cap;			/* capturing parens? */	int			pos;			/* positive lookahead? */	int			subno;			/* capturing-parens or backref number */	int			atomtype;	int			qprefer;		/* quantifier short/long preference */	int			f;	struct subre **atomp;		/* where the pointer to atom is */	/* initial bookkeeping */	atom = NULL;	assert(lp->nouts == 0);		/* must string new code */	assert(rp->nins == 0);		/* between lp and rp */	subno = 0;					/* just to shut lint up */	/* an atom or constraint... */	atomtype = v->nexttype;	switch (atomtype)	{			/* first, constraints, which end by returning */		case '^':			ARCV('^', 1);			if (v->cflags & REG_NLANCH)				ARCV(BEHIND, v->nlcolor);			NEXT();			return;			break;		case '$':			ARCV('$', 1);			if (v->cflags & REG_NLANCH)				ARCV(AHEAD, v->nlcolor);			NEXT();			return;			break;		case SBEGIN:			ARCV('^', 1);		/* BOL */			ARCV('^', 0);		/* or BOS */			NEXT();			return;			break;		case SEND:			ARCV('$', 1);		/* EOL */			ARCV('$', 0);		/* or EOS */			NEXT();			return;			break;		case '<':			wordchrs(v);		/* does NEXT() */			s = newstate(v->nfa);			NOERR();			nonword(v, BEHIND, lp, s);			word(v, AHEAD, s, rp);			return;			break;		case '>':			wordchrs(v);		/* does NEXT() */			s = newstate(v->nfa);			NOERR();			word(v, BEHIND, lp, s);			nonword(v, AHEAD, s, rp);			return;			break;		case WBDRY:			wordchrs(v);		/* does NEXT() */			s = newstate(v->nfa);			NOERR();			nonword(v, BEHIND, lp, s);			word(v, AHEAD, s, rp);			s = newstate(v->nfa);			NOERR();			word(v, BEHIND, lp, s);			nonword(v, AHEAD, s, rp);			return;			break;		case NWBDRY:			wordchrs(v);		/* does NEXT() */			s = newstate(v->nfa);			NOERR();			word(v, BEHIND, lp, s);			word(v, AHEAD, s, rp);			s = newstate(v->nfa);			NOERR();			nonword(v, BEHIND, lp, s);			nonword(v, AHEAD, s, rp);			return;			break;		case LACON:				/* lookahead constraint */			pos = v->nextvalue;			NEXT();			s = newstate(v->nfa);			s2 = newstate(v->nfa);			NOERR();			t = parse(v, ')', LACON, s, s2);			freesubre(v, t);	/* internal structure irrelevant */			assert(SEE(')') || ISERR());			NEXT();			n = newlacon(v, s, s2, pos);			NOERR();			ARCV(LACON, n);			return;			break;			/* then errors, to get them out of the way */		case '*':		case '+':		case '?':		case '{':			ERR(REG_BADRPT);			return;			break;		default:			ERR(REG_ASSERT);			return;			break;			/* then plain characters, and minor variants on that theme */		case ')':				/* unbalanced paren */			if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)			{				ERR(REG_EPAREN);				return;			}			/* legal in EREs due to specification botch */			NOTE(REG_UPBOTCH);			/* fallthrough into case PLAIN */		case PLAIN:			onechr(v, v->nextvalue, lp, rp);			okcolors(v->nfa, v->cm);			NOERR();			NEXT();			break;		case '[':			if (v->nextvalue == 1)				bracket(v, lp, rp);			else				cbracket(v, lp, rp);			assert(SEE(']') || ISERR());			NEXT();			break;		case '.':			rainbow(v->nfa, v->cm, PLAIN,					(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,					lp, rp);			NEXT();			break;			/* and finally the ugly stuff */		case '(':				/* value flags as capturing or non */			cap = (type == LACON) ? 0 : v->nextvalue;			if (cap)			{				v->nsubexp++;				subno = v->nsubexp;				if ((size_t) subno >= v->nsubs)					moresubs(v, subno);				assert((size_t) subno < v->nsubs);			}			else				atomtype = PLAIN;		/* something that's not '(' */			NEXT();			/* need new endpoints because tree will contain pointers */			s = newstate(v->nfa);			s2 = newstate(v->nfa);			NOERR();			EMPTYARC(lp, s);			EMPTYARC(s2, rp);			NOERR();			atom = parse(v, ')', PLAIN, s, s2);			assert(SEE(')') || ISERR());			NEXT();			NOERR();			if (cap)			{				v->subs[subno] = atom;				t = subre(v, '(', atom->flags | CAP, lp, rp);				NOERR();				t->subno = subno;				t->left = atom;				atom = t;			}			/* postpone everything else pending possible {0} */			break;		case BACKREF:			/* the Feature From The Black Lagoon */			INSIST(type != LACON, REG_ESUBREG);			INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);			INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);			NOERR();			assert(v->nextvalue > 0);			atom = subre(v, 'b', BACKR, lp, rp);			subno = v->nextvalue;			atom->subno = subno;			EMPTYARC(lp, rp);	/* temporarily, so there's something */			NEXT();			break;	}	/* ...and an atom may be followed by a quantifier */	switch (v->nexttype)	{		case '*':			m = 0;			n = INFINITY;			qprefer = (v->nextvalue) ? LONGER : SHORTER;			NEXT();			break;		case '+':			m = 1;			n = INFINITY;			qprefer = (v->nextvalue) ? LONGER : SHORTER;			NEXT();			break;		case '?':			m = 0;			n = 1;			qprefer = (v->nextvalue) ? LONGER : SHORTER;			NEXT();			break;		case '{':			NEXT();			m = scannum(v);			if (EAT(','))			{				if (SEE(DIGIT))					n = scannum(v);				else					n = INFINITY;				if (m > n)				{					ERR(REG_BADBR);					return;				}				/* {m,n} exercises preference, even if it's {m,m} */				qprefer = (v->nextvalue) ? LONGER : SHORTER;			}			else			{				n = m;				/* {m} passes operand's preference through */				qprefer = 0;			}			if (!SEE('}'))			{					/* catches errors too */				ERR(REG_BADBR);				return;			}			NEXT();			break;		default:				/* no quantifier */			m = n = 1;			qprefer = 0;			break;	}	/* annoying special case:  {0} or {0,0} cancels everything */	if (m == 0 && n == 0)	{		if (atom != NULL)			freesubre(v, atom);		if (atomtype == '(')			v->subs[subno] = NULL;		delsub(v->nfa, lp, rp);		EMPTYARC(lp, rp);		return;	}	/* if not a messy case, avoid hard part */	assert(!MESSY(top->flags));	f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))	{		if (!(m == 1 && n == 1))			repeat(v, lp, rp, m, n);		if (atom != NULL)			freesubre(v, atom);		top->flags = f;		return;	}	/*	 * hard part:  something messy That is, capturing parens, back reference,	 * short/long clash, or an atom with substructure containing one of those.	 */	/* now we'll need a subre for the contents even if they're boring */	if (atom == NULL)	{		atom = subre(v, '=', 0, lp, rp);		NOERR();	}	/*	 * prepare a general-purpose state skeleton	 *	 * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] / /	 * [lp] ----> [s2] ----bypass---------------------	 *	 * where bypass is an empty, and prefix is some repetitions of atom	 */	s = newstate(v->nfa);		/* first, new endpoints for the atom */	s2 = newstate(v->nfa);	NOERR();	moveouts(v->nfa, lp, s);	moveins(v->nfa, rp, s2);	NOERR();	atom->begin = s;	atom->end = s2;	s = newstate(v->nfa);		/* and spots for prefix and bypass */	s2 = newstate(v->nfa);	NOERR();	EMPTYARC(lp, s);	EMPTYARC(lp, s2);	NOERR();	/* break remaining subRE into x{...} and what follows */	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);	t->left = atom;	atomp = &t->left;	/* here we should recurse... but we must postpone that to the end */	/* split top into prefix and remaining */	assert(top->op == '=' && top->left == NULL && top->right == NULL);	top->left = subre(v, '=', top->flags, top->begin, lp);	top->op = '.';	top->right = t;	/* if it's a backref, now is the time to replicate the subNFA */	if (atomtype == BACKREF)	{		assert(atom->begin->nouts == 1);		/* just the EMPTY */		delsub(v->nfa, atom->begin, atom->end);		assert(v->subs[subno] != NULL);		/* and here's why the recursion got postponed:  it must */		/* wait until the skeleton is filled in, because it may */		/* hit a backref that wants to copy the filled-in skeleton */		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,			   atom->begin, atom->end);		NOERR();	}	/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */	if (m == 0)	{		EMPTYARC(s2, atom->end);	/* the bypass */		assert(PREF(qprefer) != 0);		f = COMBINE(qprefer, atom->flags);		t = subre(v, '|', f, lp, atom->end);		NOERR();		t->left = atom;		t->right = subre(v, '|', PREF(f), s2, atom->end);		NOERR();		t->right->left = subre(v, '=', 0, s2, atom->end);		NOERR();		*atomp = t;		atomp = &t->left;		m = 1;	}	/* deal with the rest of the quantifier */	if (atomtype == BACKREF)	{
regcomp.c - 源码说明

本页面展示了「从一个开源软件中摘取的正则表达式模块」中的 regcomp.c 源码文件，采用 C语言编程语言编写，共 2,243 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与开源软件相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?