regc_lex.c

来自「从一个开源软件中摘取的正则表达式模块」· C语言代码 · 共 1,145 行 · 第 1/2 页
1,145 行
/* * lexical analyzer * This file is #included by regcomp.c. * * Copyright (c) 1998, 1999 Henry Spencer.	All rights reserved. * * Development of this software was funded, in part, by Cray Research Inc., * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics * Corporation, none of whom are responsible for the results.  The author * thanks all of them. * * Redistribution and use in source and binary forms -- with or without * modification -- are permitted for any purpose, provided that * redistributions in source form retain this entire copyright notice and * indicate the origin and nature of any modifications. * * I'd appreciate being given credit for this package in the documentation * of software which uses it, but that is not a requirement. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $PostgreSQL: pgsql/src/backend/regex/regc_lex.c,v 1.7 2008/01/03 20:47:55 tgl Exp $ * *//* scanning macros (know about v) */#define ATEOS()		(v->now >= v->stop)#define HAVE(n)		(v->stop - v->now >= (n))#define NEXT1(c)	(!ATEOS() && *v->now == CHR(c))#define NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))#define NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \						*(v->now+1) == CHR(b) && \						*(v->now+2) == CHR(c))#define SET(c)		(v->nexttype = (c))#define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))#define RET(c)		return (SET(c), 1)#define RETV(c, n)	return (SETV(c, n), 1)#define FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */#define LASTTYPE(t) (v->lasttype == (t))/* lexical contexts */#define L_ERE	1				/* mainline ERE/ARE */#define L_BRE	2				/* mainline BRE */#define L_Q 3					/* REG_QUOTE */#define L_EBND	4				/* ERE/ARE bound */#define L_BBND	5				/* BRE bound */#define L_BRACK 6				/* brackets */#define L_CEL	7				/* collating element */#define L_ECL	8				/* equivalence class */#define L_CCL	9				/* character class */#define INTOCON(c)	(v->lexcon = (c))#define INCON(con)	(v->lexcon == (con))/* construct pointer past end of chr array */#define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))/* * lexstart - set up lexical stuff, scan leading options */static voidlexstart(struct vars * v){	prefixes(v);				/* may turn on new type bits etc. */	NOERR();	if (v->cflags & REG_QUOTE)	{		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));		INTOCON(L_Q);	}	else if (v->cflags & REG_EXTENDED)	{		assert(!(v->cflags & REG_QUOTE));		INTOCON(L_ERE);	}	else	{		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));		INTOCON(L_BRE);	}	v->nexttype = EMPTY;		/* remember we were at the start */	next(v);					/* set up the first token */}/* * prefixes - implement various special prefixes */static voidprefixes(struct vars * v){	/* literal string doesn't get any of this stuff */	if (v->cflags & REG_QUOTE)		return;	/* initial "***" gets special things */	if (HAVE(4) && NEXT3('*', '*', '*'))		switch (*(v->now + 3))		{			case CHR('?'):		/* "***?" error, msg shows version */				ERR(REG_BADPAT);				return;			/* proceed no further */				break;			case CHR('='):		/* "***=" shifts to literal string */				NOTE(REG_UNONPOSIX);				v->cflags |= REG_QUOTE;				v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);				v->now += 4;				return;			/* and there can be no more prefixes */				break;			case CHR(':'):		/* "***:" shifts to AREs */				NOTE(REG_UNONPOSIX);				v->cflags |= REG_ADVANCED;				v->now += 4;				break;			default:			/* otherwise *** is just an error */				ERR(REG_BADRPT);				return;				break;		}	/* BREs and EREs don't get embedded options */	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)		return;	/* embedded options (AREs only) */	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))	{		NOTE(REG_UNONPOSIX);		v->now += 2;		for (; !ATEOS() && iscalpha(*v->now); v->now++)			switch (*v->now)			{				case CHR('b'):	/* BREs (but why???) */					v->cflags &= ~(REG_ADVANCED | REG_QUOTE);					break;				case CHR('c'):	/* case sensitive */					v->cflags &= ~REG_ICASE;					break;				case CHR('e'):	/* plain EREs */					v->cflags |= REG_EXTENDED;					v->cflags &= ~(REG_ADVF | REG_QUOTE);					break;				case CHR('i'):	/* case insensitive */					v->cflags |= REG_ICASE;					break;				case CHR('m'):	/* Perloid synonym for n */				case CHR('n'):	/* \n affects ^ $ . [^ */					v->cflags |= REG_NEWLINE;					break;				case CHR('p'):	/* ~Perl, \n affects . [^ */					v->cflags |= REG_NLSTOP;					v->cflags &= ~REG_NLANCH;					break;				case CHR('q'):	/* literal string */					v->cflags |= REG_QUOTE;					v->cflags &= ~REG_ADVANCED;					break;				case CHR('s'):	/* single line, \n ordinary */					v->cflags &= ~REG_NEWLINE;					break;				case CHR('t'):	/* tight syntax */					v->cflags &= ~REG_EXPANDED;					break;				case CHR('w'):	/* weird, \n affects ^ $ only */					v->cflags &= ~REG_NLSTOP;					v->cflags |= REG_NLANCH;					break;				case CHR('x'):	/* expanded syntax */					v->cflags |= REG_EXPANDED;					break;				default:					ERR(REG_BADOPT);					return;			}		if (!NEXT1(')'))		{			ERR(REG_BADOPT);			return;		}		v->now++;		if (v->cflags & REG_QUOTE)			v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);	}}/* * lexnest - "call a subroutine", interpolating string at the lexical level * * Note, this is not a very general facility.  There are a number of * implicit assumptions about what sorts of strings can be subroutines. */static voidlexnest(struct vars * v,		chr *beginp,			/* start of interpolation */		chr *endp)				/* one past end of interpolation */{	assert(v->savenow == NULL); /* only one level of nesting */	v->savenow = v->now;	v->savestop = v->stop;	v->now = beginp;	v->stop = endp;}/* * string constants to interpolate as expansions of things like \d */static chr	backd[] = {			/* \d */	CHR('['), CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']'), CHR(']')};static chr	backD[] = {			/* \D */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']'), CHR(']')};static chr	brbackd[] = {		/* \d within brackets */	CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']')};static chr	backs[] = {			/* \s */	CHR('['), CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']'), CHR(']')};static chr	backS[] = {			/* \S */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']'), CHR(']')};static chr	brbacks[] = {		/* \s within brackets */	CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']')};static chr	backw[] = {			/* \w */	CHR('['), CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_'), CHR(']')};static chr	backW[] = {			/* \W */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_'), CHR(']')};static chr	brbackw[] = {		/* \w within brackets */	CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_')};/* * lexword - interpolate a bracket expression for word characters * Possibly ought to inquire whether there is a "word" character class. */static voidlexword(struct vars * v){	lexnest(v, backw, ENDOF(backw));}/* * next - get next token */static int						/* 1 normal, 0 failure */next(struct vars * v){	chr			c;	/* errors yield an infinite sequence of failures */	if (ISERR())		return 0;				/* the error has set nexttype to EOS */	/* remember flavor of last token */	v->lasttype = v->nexttype;	/* REG_BOSONLY */	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))	{		/* at start of a REG_BOSONLY RE */		RETV(SBEGIN, 0);		/* same as \A */	}	/* if we're nested and we've hit end, return to outer level */	if (v->savenow != NULL && ATEOS())	{		v->now = v->savenow;		v->stop = v->savestop;		v->savenow = v->savestop = NULL;	}	/* skip white space etc. if appropriate (not in literal or []) */	if (v->cflags & REG_EXPANDED)		switch (v->lexcon)		{			case L_ERE:			case L_BRE:			case L_EBND:			case L_BBND:				skip(v);				break;		}	/* handle EOS, depending on context */	if (ATEOS())	{		switch (v->lexcon)		{			case L_ERE:			case L_BRE:			case L_Q:				RET(EOS);				break;			case L_EBND:			case L_BBND:				FAILW(REG_EBRACE);				break;			case L_BRACK:			case L_CEL:			case L_ECL:			case L_CCL:				FAILW(REG_EBRACK);				break;		}		assert(NOTREACHED);	}	/* okay, time to actually get a character */	c = *v->now++;	/* deal with the easy contexts, punt EREs to code below */	switch (v->lexcon)	{		case L_BRE:				/* punt BREs to separate function */			return brenext(v, c);			break;		case L_ERE:				/* see below */			break;		case L_Q:				/* literal strings are easy */			RETV(PLAIN, c);			break;		case L_BBND:			/* bounds are fairly simple */		case L_EBND:			switch (c)			{				case CHR('0'):				case CHR('1'):				case CHR('2'):				case CHR('3'):				case CHR('4'):				case CHR('5'):				case CHR('6'):				case CHR('7'):				case CHR('8'):				case CHR('9'):					RETV(DIGIT, (chr) DIGITVAL(c));					break;				case CHR(','):					RET(',');					break;				case CHR('}'):	/* ERE bound ends with } */					if (INCON(L_EBND))					{						INTOCON(L_ERE);						if ((v->cflags & REG_ADVF) && NEXT1('?'))						{							v->now++;							NOTE(REG_UNONPOSIX);							RETV('}', 0);						}						RETV('}', 1);					}					else						FAILW(REG_BADBR);					break;				case CHR('\\'):	/* BRE bound ends with \} */					if (INCON(L_BBND) && NEXT1('}'))					{						v->now++;						INTOCON(L_BRE);						RET('}');					}					else						FAILW(REG_BADBR);					break;				default:					FAILW(REG_BADBR);					break;			}			assert(NOTREACHED);			break;		case L_BRACK:			/* brackets are not too hard */			switch (c)			{				case CHR(']'):					if (LASTTYPE('['))						RETV(PLAIN, c);					else					{						INTOCON((v->cflags & REG_EXTENDED) ?								L_ERE : L_BRE);						RET(']');					}					break;				case CHR('\\'):					NOTE(REG_UBBS);					if (!(v->cflags & REG_ADVF))						RETV(PLAIN, c);					NOTE(REG_UNONPOSIX);					if (ATEOS())						FAILW(REG_EESCAPE);					(DISCARD) lexescape(v);					switch (v->nexttype)					{			/* not all escapes okay here */						case PLAIN:							return 1;							break;						case CCLASS:							switch (v->nextvalue)							{								case 'd':									lexnest(v, brbackd, ENDOF(brbackd));									break;								case 's':									lexnest(v, brbacks, ENDOF(brbacks));									break;								case 'w':									lexnest(v, brbackw, ENDOF(brbackw));									break;								default:									FAILW(REG_EESCAPE);									break;							}							/* lexnest done, back up and try again */							v->nexttype = v->lasttype;							return next(v);							break;					}					/* not one of the acceptable escapes */					FAILW(REG_EESCAPE);					break;				case CHR('-'):					if (LASTTYPE('[') || NEXT1(']'))						RETV(PLAIN, c);					else						RETV(RANGE, c);					break;				case CHR('['):					if (ATEOS())						FAILW(REG_EBRACK);					switch (*v->now++)					{						case CHR('.'):							INTOCON(L_CEL);							/* might or might not be locale-specific */							RET(COLLEL);							break;						case CHR('='):							INTOCON(L_ECL);							NOTE(REG_ULOCALE);							RET(ECLASS);							break;						case CHR(':'):							INTOCON(L_CCL);							NOTE(REG_ULOCALE);							RET(CCLASS);							break;						default:		/* oops */							v->now--;							RETV(PLAIN, c);							break;					}					assert(NOTREACHED);					break;				default:					RETV(PLAIN, c);					break;			}			assert(NOTREACHED);			break;		case L_CEL:				/* collating elements are easy */			if (c == CHR('.') && NEXT1(']'))			{				v->now++;				INTOCON(L_BRACK);				RETV(END, '.');			}			else				RETV(PLAIN, c);			break;		case L_ECL:				/* ditto equivalence classes */			if (c == CHR('=') && NEXT1(']'))			{				v->now++;				INTOCON(L_BRACK);				RETV(END, '=');			}			else				RETV(PLAIN, c);			break;		case L_CCL:				/* ditto character classes */			if (c == CHR(':') && NEXT1(']'))			{				v->now++;				INTOCON(L_BRACK);				RETV(END, ':');			}			else				RETV(PLAIN, c);			break;		default:			assert(NOTREACHED);			break;	}	/* that got rid of everything except EREs and AREs */	assert(INCON(L_ERE));	/* deal with EREs and AREs, except for backslashes */	switch (c)	{		case CHR('|'):			RET('|');			break;		case CHR('*'):			if ((v->cflags & REG_ADVF) && NEXT1('?'))			{				v->now++;				NOTE(REG_UNONPOSIX);				RETV('*', 0);			}			RETV('*', 1);			break;		case CHR('+'):			if ((v->cflags & REG_ADVF) && NEXT1('?'))			{				v->now++;				NOTE(REG_UNONPOSIX);				RETV('+', 0);			}			RETV('+', 1);			break;		case CHR('?'):			if ((v->cflags & REG_ADVF) && NEXT1('?'))			{				v->now++;				NOTE(REG_UNONPOSIX);				RETV('?', 0);			}			RETV('?', 1);			break;		case CHR('{'):			/* bounds start or plain character */			if (v->cflags & REG_EXPANDED)				skip(v);			if (ATEOS() || !iscdigit(*v->now))			{				NOTE(REG_UBRACES);				NOTE(REG_UUNSPEC);				RETV(PLAIN, c);			}			else			{
regc_lex.c - 源码说明

本页面展示了「从一个开源软件中摘取的正则表达式模块」中的 regc_lex.c 源码文件，采用 C语言编程语言编写，共 1,145 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与开源软件相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?