regc_lex.c

来自「tcl是工具命令语言」· C语言 代码 · 共 1,062 行 · 第 1/2 页

C
1,062
字号
/* * lexical analyzer * This file is #included by regcomp.c. * * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved. *  * Development of this software was funded, in part, by Cray Research Inc., * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics * Corporation, none of whom are responsible for the results.  The author * thanks all of them.  *  * Redistribution and use in source and binary forms -- with or without * modification -- are permitted for any purpose, provided that * redistributions in source form retain this entire copyright notice and * indicate the origin and nature of any modifications. *  * I'd appreciate being given credit for this package in the documentation * of software which uses it, but that is not a requirement. *  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *//* scanning macros (know about v) */#define	ATEOS()		(v->now >= v->stop)#define	HAVE(n)		(v->stop - v->now >= (n))#define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))#define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))#define	NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \						*(v->now+1) == CHR(b) && \						*(v->now+2) == CHR(c))#define	SET(c)		(v->nexttype = (c))#define	SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))#define	RET(c)		return (SET(c), 1)#define	RETV(c, n)	return (SETV(c, n), 1)#define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */#define	LASTTYPE(t)	(v->lasttype == (t))/* lexical contexts */#define	L_ERE	1	/* mainline ERE/ARE */#define	L_BRE	2	/* mainline BRE */#define	L_Q	3	/* REG_QUOTE */#define	L_EBND	4	/* ERE/ARE bound */#define	L_BBND	5	/* BRE bound */#define	L_BRACK	6	/* brackets */#define	L_CEL	7	/* collating element */#define	L_ECL	8	/* equivalence class */#define	L_CCL	9	/* character class */#define	INTOCON(c)	(v->lexcon = (c))#define	INCON(con)	(v->lexcon == (con))/* construct pointer past end of chr array */#define	ENDOF(array)	((array) + sizeof(array)/sizeof(chr))/* - lexstart - set up lexical stuff, scan leading options ^ static VOID lexstart(struct vars *); */static VOIDlexstart(v)struct vars *v;{	prefixes(v);			/* may turn on new type bits etc. */	NOERR();	if (v->cflags&REG_QUOTE) {		assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));		INTOCON(L_Q);	} else if (v->cflags&REG_EXTENDED) {		assert(!(v->cflags&REG_QUOTE));		INTOCON(L_ERE);	} else {		assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));		INTOCON(L_BRE);	}	v->nexttype = EMPTY;		/* remember we were at the start */	next(v);			/* set up the first token */}/* - prefixes - implement various special prefixes ^ static VOID prefixes(struct vars *); */static VOIDprefixes(v)struct vars *v;{	/* literal string doesn't get any of this stuff */	if (v->cflags&REG_QUOTE)		return;	/* initial "***" gets special things */		if (HAVE(4) && NEXT3('*', '*', '*'))		switch (*(v->now + 3)) {		case CHR('?'):		/* "***?" error, msg shows version */			ERR(REG_BADPAT);			return;		/* proceed no further */			break;		case CHR('='):		/* "***=" shifts to literal string */			NOTE(REG_UNONPOSIX);			v->cflags |= REG_QUOTE;			v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);			v->now += 4;			return;		/* and there can be no more prefixes */			break;		case CHR(':'):		/* "***:" shifts to AREs */			NOTE(REG_UNONPOSIX);			v->cflags |= REG_ADVANCED;			v->now += 4;			break;		default:		/* otherwise *** is just an error */			ERR(REG_BADRPT);			return;			break;		}	/* BREs and EREs don't get embedded options */	if ((v->cflags&REG_ADVANCED) != REG_ADVANCED)		return;	/* embedded options (AREs only) */	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {		NOTE(REG_UNONPOSIX);		v->now += 2;		for (; !ATEOS() && iscalpha(*v->now); v->now++)			switch (*v->now) {			case CHR('b'):		/* BREs (but why???) */				v->cflags &= ~(REG_ADVANCED|REG_QUOTE);				break;			case CHR('c'):		/* case sensitive */				v->cflags &= ~REG_ICASE;				break;			case CHR('e'):		/* plain EREs */				v->cflags |= REG_EXTENDED;				v->cflags &= ~(REG_ADVF|REG_QUOTE);				break;			case CHR('i'):		/* case insensitive */				v->cflags |= REG_ICASE;				break;			case CHR('m'):		/* Perloid synonym for n */			case CHR('n'):		/* \n affects ^ $ . [^ */				v->cflags |= REG_NEWLINE;				break;			case CHR('p'):		/* ~Perl, \n affects . [^ */				v->cflags |= REG_NLSTOP;				v->cflags &= ~REG_NLANCH;				break;			case CHR('q'):		/* literal string */				v->cflags |= REG_QUOTE;				v->cflags &= ~REG_ADVANCED;				break;			case CHR('s'):		/* single line, \n ordinary */				v->cflags &= ~REG_NEWLINE;				break;			case CHR('t'):		/* tight syntax */				v->cflags &= ~REG_EXPANDED;				break;			case CHR('w'):		/* weird, \n affects ^ $ only */				v->cflags &= ~REG_NLSTOP;				v->cflags |= REG_NLANCH;				break;			case CHR('x'):		/* expanded syntax */				v->cflags |= REG_EXPANDED;				break;			default:				ERR(REG_BADOPT);				return;			}		if (!NEXT1(')')) {			ERR(REG_BADOPT);			return;		}		v->now++;		if (v->cflags&REG_QUOTE)			v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);	}}/* - lexnest - "call a subroutine", interpolating string at the lexical level * Note, this is not a very general facility.  There are a number of * implicit assumptions about what sorts of strings can be subroutines. ^ static VOID lexnest(struct vars *, chr *, chr *); */static VOIDlexnest(v, beginp, endp)struct vars *v;chr *beginp;				/* start of interpolation */chr *endp;				/* one past end of interpolation */{	assert(v->savenow == NULL);	/* only one level of nesting */	v->savenow = v->now;	v->savestop = v->stop;	v->now = beginp;	v->stop = endp;}/* * string constants to interpolate as expansions of things like \d */static chr backd[] = {		/* \d */	CHR('['), CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']'), CHR(']')};static chr backD[] = {		/* \D */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']'), CHR(']')};static chr brbackd[] = {	/* \d within brackets */	CHR('['), CHR(':'),	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),	CHR(':'), CHR(']')};static chr backs[] = {		/* \s */	CHR('['), CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']'), CHR(']')};static chr backS[] = {		/* \S */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']'), CHR(']')};static chr brbacks[] = {	/* \s within brackets */	CHR('['), CHR(':'),	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),	CHR(':'), CHR(']')};static chr backw[] = {		/* \w */	CHR('['), CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_'), CHR(']')};static chr backW[] = {		/* \W */	CHR('['), CHR('^'), CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_'), CHR(']')};static chr brbackw[] = {	/* \w within brackets */	CHR('['), CHR(':'),	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),	CHR(':'), CHR(']'), CHR('_')};/* - lexword - interpolate a bracket expression for word characters * Possibly ought to inquire whether there is a "word" character class. ^ static VOID lexword(struct vars *); */static VOIDlexword(v)struct vars *v;{	lexnest(v, backw, ENDOF(backw));}/* - next - get next token ^ static int next(struct vars *); */static int			/* 1 normal, 0 failure */next(v)struct vars *v;{	chr c;	/* errors yield an infinite sequence of failures */	if (ISERR())		return 0;	/* the error has set nexttype to EOS */	/* remember flavor of last token */	v->lasttype = v->nexttype;	/* REG_BOSONLY */	if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {		/* at start of a REG_BOSONLY RE */		RETV(SBEGIN, 0);		/* same as \A */	}	/* if we're nested and we've hit end, return to outer level */	if (v->savenow != NULL && ATEOS()) {		v->now = v->savenow;		v->stop = v->savestop;		v->savenow = v->savestop = NULL;	}	/* skip white space etc. if appropriate (not in literal or []) */	if (v->cflags&REG_EXPANDED)		switch (v->lexcon) {		case L_ERE:		case L_BRE:		case L_EBND:		case L_BBND:			skip(v);			break;		}	/* handle EOS, depending on context */	if (ATEOS()) {		switch (v->lexcon) {		case L_ERE:		case L_BRE:		case L_Q:			RET(EOS);			break;		case L_EBND:		case L_BBND:			FAILW(REG_EBRACE);			break;		case L_BRACK:		case L_CEL:		case L_ECL:		case L_CCL:			FAILW(REG_EBRACK);			break;		}		assert(NOTREACHED);	}	/* okay, time to actually get a character */	c = *v->now++;	/* deal with the easy contexts, punt EREs to code below */	switch (v->lexcon) {	case L_BRE:			/* punt BREs to separate function */		return brenext(v, c);		break;	case L_ERE:			/* see below */		break;	case L_Q:			/* literal strings are easy */		RETV(PLAIN, c);		break;	case L_BBND:			/* bounds are fairly simple */	case L_EBND:		switch (c) {		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):		case CHR('8'): case CHR('9'):			RETV(DIGIT, (chr)DIGITVAL(c));			break;		case CHR(','):			RET(',');			break;		case CHR('}'):		/* ERE bound ends with } */			if (INCON(L_EBND)) {				INTOCON(L_ERE);				if ((v->cflags&REG_ADVF) && NEXT1('?')) {					v->now++;					NOTE(REG_UNONPOSIX);					RETV('}', 0);				}				RETV('}', 1);			} else				FAILW(REG_BADBR);			break;		case CHR('\\'):		/* BRE bound ends with \} */			if (INCON(L_BBND) && NEXT1('}')) {				v->now++;				INTOCON(L_BRE);				RET('}');			} else				FAILW(REG_BADBR);			break;		default:			FAILW(REG_BADBR);			break;		}		assert(NOTREACHED);		break;	case L_BRACK:			/* brackets are not too hard */		switch (c) {		case CHR(']'):			if (LASTTYPE('['))				RETV(PLAIN, c);			else {				INTOCON((v->cflags&REG_EXTENDED) ?							L_ERE : L_BRE);				RET(']');			}			break;		case CHR('\\'):			NOTE(REG_UBBS);			if (!(v->cflags&REG_ADVF))				RETV(PLAIN, c);			NOTE(REG_UNONPOSIX);			if (ATEOS())				FAILW(REG_EESCAPE);			(DISCARD)lexescape(v);			switch (v->nexttype) {	/* not all escapes okay here */			case PLAIN:				return 1;				break;			case CCLASS:				switch (v->nextvalue) {				case 'd':					lexnest(v, brbackd, ENDOF(brbackd));					break;				case 's':					lexnest(v, brbacks, ENDOF(brbacks));					break;				case 'w':					lexnest(v, brbackw, ENDOF(brbackw));					break;				default:					FAILW(REG_EESCAPE);					break;				}				/* lexnest done, back up and try again */				v->nexttype = v->lasttype;				return next(v);				break;			}			/* not one of the acceptable escapes */			FAILW(REG_EESCAPE);			break;		case CHR('-'):			if (LASTTYPE('[') || NEXT1(']'))				RETV(PLAIN, c);			else				RETV(RANGE, c);			break;		case CHR('['):			if (ATEOS())				FAILW(REG_EBRACK);			switch (*v->now++) {			case CHR('.'):				INTOCON(L_CEL);				/* might or might not be locale-specific */				RET(COLLEL);				break;			case CHR('='):				INTOCON(L_ECL);				NOTE(REG_ULOCALE);				RET(ECLASS);				break;			case CHR(':'):				INTOCON(L_CCL);				NOTE(REG_ULOCALE);				RET(CCLASS);				break;			default:			/* oops */				v->now--;				RETV(PLAIN, c);				break;			}			assert(NOTREACHED);			break;		default:			RETV(PLAIN, c);			break;		}		assert(NOTREACHED);		break;	case L_CEL:			/* collating elements are easy */		if (c == CHR('.') && NEXT1(']')) {			v->now++;			INTOCON(L_BRACK);			RETV(END, '.');		} else			RETV(PLAIN, c);		break;	case L_ECL:			/* ditto equivalence classes */		if (c == CHR('=') && NEXT1(']')) {			v->now++;			INTOCON(L_BRACK);			RETV(END, '=');		} else			RETV(PLAIN, c);		break;	case L_CCL:			/* ditto character classes */		if (c == CHR(':') && NEXT1(']')) {			v->now++;			INTOCON(L_BRACK);			RETV(END, ':');		} else			RETV(PLAIN, c);		break;	default:		assert(NOTREACHED);		break;	}	/* that got rid of everything except EREs and AREs */	assert(INCON(L_ERE));	/* deal with EREs and AREs, except for backslashes */	switch (c) {	case CHR('|'):		RET('|');		break;	case CHR('*'):		if ((v->cflags&REG_ADVF) && NEXT1('?')) {			v->now++;			NOTE(REG_UNONPOSIX);			RETV('*', 0);		}		RETV('*', 1);		break;	case CHR('+'):		if ((v->cflags&REG_ADVF) && NEXT1('?')) {			v->now++;			NOTE(REG_UNONPOSIX);			RETV('+', 0);		}		RETV('+', 1);		break;	case CHR('?'):		if ((v->cflags&REG_ADVF) && NEXT1('?')) {			v->now++;			NOTE(REG_UNONPOSIX);			RETV('?', 0);		}		RETV('?', 1);		break;	case CHR('{'):		/* bounds start or plain character */		if (v->cflags&REG_EXPANDED)			skip(v);		if (ATEOS() || !iscdigit(*v->now)) {			NOTE(REG_UBRACES);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?