regcomp.c

来自「Open source for regula expre」· C语言代码 · 共 1,604 行 · 第 1/3 页
1,604 行
#include <sys/types.h>#include <stdio.h>#include <string.h>#include <ctype.h>#include <limits.h>#include <stdlib.h>#include <regex.h>#include "utils.h"#include "regex2.h"#include "cclass.h"#include "cname.h"/* * parse structure, passed up and down to avoid global variables and * other clumsinesses */struct parse {	char *next;		/* next character in RE */	char *end;		/* end of string (-> NUL normally) */	int error;		/* has an error been seen? */	sop *strip;		/* malloced strip */	sopno ssize;		/* malloced strip size (allocated) */	sopno slen;		/* malloced strip length (used) */	int ncsalloc;		/* number of csets allocated */	struct re_guts *g;#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */	sopno pend[NPAREN];	/* -> ) ([0] unused) */};#include "regcomp.ih"static char nuls[10];		/* place to point scanner in event of error *//* * macros for use with parse structure * BEWARE:  these know that the parse structure is named `p' !!! */#define	PEEK()	(*p->next)#define	PEEK2()	(*(p->next+1))#define	MORE()	(p->next < p->end)#define	MORE2()	(p->next+1 < p->end)#define	SEE(c)	(MORE() && PEEK() == (c))#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)#define	NEXT()	(p->next++)#define	NEXT2()	(p->next += 2)#define	NEXTn(n)	(p->next += (n))#define	GETNEXT()	(*p->next++)#define	SETERROR(e)	seterr(p, (e))#define	REQUIRE(co, e)	((co) || SETERROR(e))#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)#define	HERE()		(p->slen)#define	THERE()		(p->slen - 1)#define	THERETHERE()	(p->slen - 2)#define	DROP(n)	(p->slen -= (n))#ifndef NDEBUGstatic int never = 0;		/* for use in asserts; shuts lint up */#else#define	never	0		/* some <assert.h>s have bugs too */#endif/* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); = #define	REG_BASIC	0000 = #define	REG_EXTENDED	0001 = #define	REG_ICASE	0002 = #define	REG_NOSUB	0004 = #define	REG_NEWLINE	0010 = #define	REG_NOSPEC	0020 = #define	REG_PEND	0040 = #define	REG_DUMP	0200 */int				/* 0 success, otherwise REG_something */regcomp(preg, pattern, cflags)regex_t *preg;const char *pattern;int cflags;{	struct parse pa;	register struct re_guts *g;	register struct parse *p = &pa;	register int i;	register size_t len;#ifdef REDEBUG#	define	GOODFLAGS(f)	(f)#else#	define	GOODFLAGS(f)	((f)&~REG_DUMP)#endif	cflags = GOODFLAGS(cflags);	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))		return(REG_INVARG);	if (cflags&REG_PEND) {		if (preg->re_endp < pattern)			return(REG_INVARG);		len = preg->re_endp - pattern;	} else		len = strlen((char *)pattern);	/* do the mallocs early so failure handling is easy */	g = (struct re_guts *)malloc(sizeof(struct re_guts) +							(NC-1)*sizeof(cat_t));	if (g == NULL)		return(REG_ESPACE);	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */	p->strip = (sop *)malloc(p->ssize * sizeof(sop));	p->slen = 0;	if (p->strip == NULL) {		free((char *)g);		return(REG_ESPACE);	}	/* set things up */	p->g = g;	p->next = (char *)pattern;	/* convenience; we do not modify it */	p->end = p->next + len;	p->error = 0;	p->ncsalloc = 0;	for (i = 0; i < NPAREN; i++) {		p->pbegin[i] = 0;		p->pend[i] = 0;	}	g->csetsize = NC;	g->sets = NULL;	g->setbits = NULL;	g->ncsets = 0;	g->cflags = cflags;	g->iflags = 0;	g->nbol = 0;	g->neol = 0;	g->must = NULL;	g->mlen = 0;	g->nsub = 0;	g->ncategories = 1;	/* category 0 is "everything else" */	g->categories = &g->catspace[-(CHAR_MIN)];	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));	g->backrefs = 0;	/* do it */	EMIT(OEND, 0);	g->firststate = THERE();	if (cflags&REG_EXTENDED)		p_ere(p, OUT);	else if (cflags&REG_NOSPEC)		p_str(p);	else		p_bre(p, OUT, OUT);	EMIT(OEND, 0);	g->laststate = THERE();	/* tidy up loose ends and fill things in */	categorize(p, g);	stripsnug(p, g);	findmust(p, g);	g->nplus = pluscount(p, g);	g->magic = MAGIC2;	preg->re_nsub = g->nsub;	preg->re_g = g;	preg->re_magic = MAGIC1;#ifndef REDEBUG	/* not debugging, so can't rely on the assert() in regexec() */	if (g->iflags&BAD)		SETERROR(REG_ASSERT);#endif	/* win or lose, we're done */	if (p->error != 0)	/* lose */		regfree(preg);	return(p->error);}/* - p_ere - ERE parser top level, concatenation and alternation == static void p_ere(register struct parse *p, int stop); */static voidp_ere(p, stop)register struct parse *p;int stop;			/* character this ERE should end at */{	register char c;	register sopno prevback;	register sopno prevfwd;	register sopno conc;	register int first = 1;		/* is this the first alternative? */	for (;;) {		/* do a bunch of concatenated expressions */		conc = HERE();		while (MORE() && (c = PEEK()) != '|' && c != stop)			p_ere_exp(p);		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */		if (!EAT('|'))			break;		/* NOTE BREAK OUT */		if (first) {			INSERT(OCH_, conc);	/* offset is wrong */			prevfwd = conc;			prevback = conc;			first = 0;		}		ASTERN(OOR1, prevback);		prevback = THERE();		AHEAD(prevfwd);			/* fix previous offset */		prevfwd = HERE();		EMIT(OOR2, 0);			/* offset is very wrong */	}	if (!first) {		/* tail-end fixups */		AHEAD(prevfwd);		ASTERN(O_CH, prevback);	}	assert(!MORE() || SEE(stop));}/* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op == static void p_ere_exp(register struct parse *p); */static voidp_ere_exp(p)register struct parse *p;{	register char c;	register sopno pos;	register int count;	register int count2;	register sopno subno;	int wascaret = 0;	assert(MORE());		/* caller should have ensured this */	c = GETNEXT();	pos = HERE();	switch (c) {	case '(':		REQUIRE(MORE(), REG_EPAREN);		p->g->nsub++;		subno = p->g->nsub;		if (subno < NPAREN)			p->pbegin[subno] = HERE();		EMIT(OLPAREN, subno);		if (!SEE(')'))			p_ere(p, ')');		if (subno < NPAREN) {			p->pend[subno] = HERE();			assert(p->pend[subno] != 0);		}		EMIT(ORPAREN, subno);		MUSTEAT(')', REG_EPAREN);		break;#ifndef POSIX_MISTAKE	case ')':		/* happens only if no current unmatched ( */		/*		 * You may ask, why the ifndef?  Because I didn't notice		 * this until slightly too late for 1003.2, and none of the		 * other 1003.2 regular-expression reviewers noticed it at		 * all.  So an unmatched ) is legal POSIX, at least until		 * we can get it fixed.		 */		SETERROR(REG_EPAREN);		break;#endif	case '^':		EMIT(OBOL, 0);		p->g->iflags |= USEBOL;		p->g->nbol++;		wascaret = 1;		break;	case '$':		EMIT(OEOL, 0);		p->g->iflags |= USEEOL;		p->g->neol++;		break;	case '|':		SETERROR(REG_EMPTY);		break;	case '*':	case '+':	case '?':		SETERROR(REG_BADRPT);		break;	case '.':		if (p->g->cflags&REG_NEWLINE)			nonnewline(p);		else			EMIT(OANY, 0);		break;	case '[':		p_bracket(p);		break;	case '\\':		REQUIRE(MORE(), REG_EESCAPE);		c = GETNEXT();		ordinary(p, c);		break;	case '{':		/* okay as ordinary except if digit follows */		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);		/* FALLTHROUGH */	default:		ordinary(p, c);		break;	}	if (!MORE())		return;	c = PEEK();	/* we call { a repetition if followed by a digit */	if (!( c == '*' || c == '+' || c == '?' ||				(c == '{' && MORE2() && isdigit(PEEK2())) ))		return;		/* no repetition, we're done */	NEXT();	REQUIRE(!wascaret, REG_BADRPT);	switch (c) {	case '*':	/* implemented as +? */		/* this case does not require the (y|) trick, noKLUDGE */		INSERT(OPLUS_, pos);		ASTERN(O_PLUS, pos);		INSERT(OQUEST_, pos);		ASTERN(O_QUEST, pos);		break;	case '+':		INSERT(OPLUS_, pos);		ASTERN(O_PLUS, pos);		break;	case '?':		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */		INSERT(OCH_, pos);		/* offset slightly wrong */		ASTERN(OOR1, pos);		/* this one's right */		AHEAD(pos);			/* fix the OCH_ */		EMIT(OOR2, 0);			/* offset very wrong... */		AHEAD(THERE());			/* ...so fix it */		ASTERN(O_CH, THERETHERE());		break;	case '{':		count = p_count(p);		if (EAT(',')) {			if (isdigit(PEEK())) {				count2 = p_count(p);				REQUIRE(count <= count2, REG_BADBR);			} else		/* single number with comma */				count2 = INFINITY;		} else		/* just a single number */			count2 = count;		repeat(p, pos, count, count2);		if (!EAT('}')) {	/* error heuristics */			while (MORE() && PEEK() != '}')				NEXT();			REQUIRE(MORE(), REG_EBRACE);			SETERROR(REG_BADBR);		}		break;	}	if (!MORE())		return;	c = PEEK();	if (!( c == '*' || c == '+' || c == '?' ||				(c == '{' && MORE2() && isdigit(PEEK2())) ) )		return;	SETERROR(REG_BADRPT);}/* - p_str - string (no metacharacters) "parser" == static void p_str(register struct parse *p); */static voidp_str(p)register struct parse *p;{	REQUIRE(MORE(), REG_EMPTY);	while (MORE())		ordinary(p, GETNEXT());}/* - p_bre - BRE parser top level, anchoring and concatenation == static void p_bre(register struct parse *p, register int end1, \ ==	register int end2); * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first * taken as an ordinary character and then revised to be an anchor.  The * only undesirable side effect is that '$' gets included as a character * category in such cases.  This is fairly harmless; not worth fixing. * The amount of lookahead needed to avoid this kludge is excessive. */static voidp_bre(p, end1, end2)register struct parse *p;register int end1;		/* first terminating character */register int end2;		/* second terminating character */{	register sopno start = HERE();	register int first = 1;			/* first subexpression? */	register int wasdollar = 0;	if (EAT('^')) {		EMIT(OBOL, 0);		p->g->iflags |= USEBOL;		p->g->nbol++;	}	while (MORE() && !SEETWO(end1, end2)) {		wasdollar = p_simp_re(p, first);		first = 0;	}	if (wasdollar) {	/* oops, that was a trailing anchor */		DROP(1);		EMIT(OEOL, 0);		p->g->iflags |= USEEOL;		p->g->neol++;	}	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */}/* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition == static int p_simp_re(register struct parse *p, int starordinary); */static int			/* was the simple RE an unbackslashed $? */p_simp_re(p, starordinary)register struct parse *p;int starordinary;		/* is a leading * an ordinary character? */{	register int c;	register int count;	register int count2;	register sopno pos;	register int i;	register sopno subno;#	define	BACKSL	(1<<CHAR_BIT)	pos = HERE();		/* repetion op, if any, covers from here */	assert(MORE());		/* caller should have ensured this */	c = GETNEXT();	if (c == '\\') {		REQUIRE(MORE(), REG_EESCAPE);		c = BACKSL | (unsigned char)GETNEXT();	}	switch (c) {	case '.':		if (p->g->cflags&REG_NEWLINE)			nonnewline(p);		else			EMIT(OANY, 0);		break;	case '[':		p_bracket(p);		break;	case BACKSL|'{':		SETERROR(REG_BADRPT);		break;	case BACKSL|'(':		p->g->nsub++;		subno = p->g->nsub;		if (subno < NPAREN)			p->pbegin[subno] = HERE();		EMIT(OLPAREN, subno);		/* the MORE here is an error heuristic */		if (MORE() && !SEETWO('\\', ')'))			p_bre(p, '\\', ')');		if (subno < NPAREN) {			p->pend[subno] = HERE();			assert(p->pend[subno] != 0);		}		EMIT(ORPAREN, subno);		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);		break;	case BACKSL|')':	/* should not get here -- must be user */	case BACKSL|'}':		SETERROR(REG_EPAREN);		break;	case BACKSL|'1':	case BACKSL|'2':	case BACKSL|'3':	case BACKSL|'4':	case BACKSL|'5':	case BACKSL|'6':	case BACKSL|'7':	case BACKSL|'8':	case BACKSL|'9':		i = (c&~BACKSL) - '0';		assert(i < NPAREN);		if (p->pend[i] != 0) {			assert(i <= p->g->nsub);			EMIT(OBACK_, i);			assert(p->pbegin[i] != 0);			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);			assert(OP(p->strip[p->pend[i]]) == ORPAREN);			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);			EMIT(O_BACK, i);		} else			SETERROR(REG_ESUBREG);		p->g->backrefs = 1;		break;	case '*':		REQUIRE(starordinary, REG_BADRPT);		/* FALLTHROUGH */	default:		ordinary(p, (char)c);	/* takes off BACKSL, if any */		break;	}	if (EAT('*')) {		/* implemented as +? */		/* this case does not require the (y|) trick, noKLUDGE */		INSERT(OPLUS_, pos);		ASTERN(O_PLUS, pos);		INSERT(OQUEST_, pos);		ASTERN(O_QUEST, pos);	} else if (EATTWO('\\', '{')) {		count = p_count(p);		if (EAT(',')) {			if (MORE() && isdigit(PEEK())) {				count2 = p_count(p);				REQUIRE(count <= count2, REG_BADBR);			} else		/* single number with comma */
regcomp.c - 源码说明

本页面展示了「Open source for regula expre」中的 regcomp.c 源码文件，采用 C语言编程语言编写，共 1,604 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与source相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?