regc_lex.c

来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C语言 代码 · 共 1,062 行 · 第 1/2 页

C
1,062
字号
			NOTE(REG_UUNSPEC);
			RETV(PLAIN, c);
		} else {
			NOTE(REG_UBOUNDS);
			INTOCON(L_EBND);
			RET('{');
		}
		assert(NOTREACHED);
		break;
	case CHR('('):		/* parenthesis, or advanced extension */
		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
			NOTE(REG_UNONPOSIX);
			v->now++;
			switch (*v->now++) {
			case CHR(':'):		/* non-capturing paren */
				RETV('(', 0);
				break;
			case CHR('#'):		/* comment */
				while (!ATEOS() && *v->now != CHR(')'))
					v->now++;
				if (!ATEOS())
					v->now++;
				assert(v->nexttype == v->lasttype);
				return next(v);
				break;
			case CHR('='):		/* positive lookahead */
				NOTE(REG_ULOOKAHEAD);
				RETV(LACON, 1);
				break;
			case CHR('!'):		/* negative lookahead */
				NOTE(REG_ULOOKAHEAD);
				RETV(LACON, 0);
				break;
			default:
				FAILW(REG_BADRPT);
				break;
			}
			assert(NOTREACHED);
		}
		if (v->cflags&REG_NOSUB)
			RETV('(', 0);		/* all parens non-capturing */
		else
			RETV('(', 1);
		break;
	case CHR(')'):
		if (LASTTYPE('(')) {
			NOTE(REG_UUNSPEC);
		}
		RETV(')', c);
		break;
	case CHR('['):		/* easy except for [[:<:]] and [[:>:]] */
		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
				*(v->now+1) == CHR(':') &&
				(*(v->now+2) == CHR('<') ||
						*(v->now+2) == CHR('>')) &&
				*(v->now+3) == CHR(':') &&
				*(v->now+4) == CHR(']') &&
				*(v->now+5) == CHR(']')) {
			c = *(v->now+2);
			v->now += 6;
			NOTE(REG_UNONPOSIX);
			RET((c == CHR('<')) ? '<' : '>');
		}
		INTOCON(L_BRACK);
		if (NEXT1('^')) {
			v->now++;
			RETV('[', 0);
		}
		RETV('[', 1);
		break;
	case CHR('.'):
		RET('.');
		break;
	case CHR('^'):
		RET('^');
		break;
	case CHR('$'):
		RET('$');
		break;
	case CHR('\\'):		/* mostly punt backslashes to code below */
		if (ATEOS())
			FAILW(REG_EESCAPE);
		break;
	default:		/* ordinary character */
		RETV(PLAIN, c);
		break;
	}

	/* ERE/ARE backslash handling; backslash already eaten */
	assert(!ATEOS());
	if (!(v->cflags&REG_ADVF)) {	/* only AREs have non-trivial escapes */
		if (iscalnum(*v->now)) {
			NOTE(REG_UBSALNUM);
			NOTE(REG_UUNSPEC);
		}
		RETV(PLAIN, *v->now++);
	}
	(DISCARD)lexescape(v);
	if (ISERR())
		FAILW(REG_EESCAPE);
	if (v->nexttype == CCLASS) {	/* fudge at lexical level */
		switch (v->nextvalue) {
		case 'd':	lexnest(v, backd, ENDOF(backd)); break;
		case 'D':	lexnest(v, backD, ENDOF(backD)); break;
		case 's':	lexnest(v, backs, ENDOF(backs)); break;
		case 'S':	lexnest(v, backS, ENDOF(backS)); break;
		case 'w':	lexnest(v, backw, ENDOF(backw)); break;
		case 'W':	lexnest(v, backW, ENDOF(backW)); break;
		default:
			assert(NOTREACHED);
			FAILW(REG_ASSERT);
			break;
		}
		/* lexnest done, back up and try again */
		v->nexttype = v->lasttype;
		return next(v);
	}
	/* otherwise, lexescape has already done the work */
	return !ISERR();
}

/*
 - lexescape - parse an ARE backslash escape (backslash already eaten)
 * Note slightly nonstandard use of the CCLASS type code.
 ^ static int lexescape(struct vars *);
 */
static int			/* not actually used, but convenient for RETV */
lexescape(v)
struct vars *v;
{
	chr c;
	static chr alert[] = {
		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
	};
	static chr esc[] = {
		CHR('E'), CHR('S'), CHR('C')
	};
	chr *save;

	assert(v->cflags&REG_ADVF);

	assert(!ATEOS());
	c = *v->now++;
	if (!iscalnum(c))
		RETV(PLAIN, c);

	NOTE(REG_UNONPOSIX);
	switch (c) {
	case CHR('a'):
		RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
		break;
	case CHR('A'):
		RETV(SBEGIN, 0);
		break;
	case CHR('b'):
		RETV(PLAIN, CHR('\b'));
		break;
	case CHR('B'):
		RETV(PLAIN, CHR('\\'));
		break;
	case CHR('c'):
		NOTE(REG_UUNPORT);
		if (ATEOS())
			FAILW(REG_EESCAPE);
		RETV(PLAIN, (chr)(*v->now++ & 037));
		break;
	case CHR('d'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 'd');
		break;
	case CHR('D'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 'D');
		break;
	case CHR('e'):
		NOTE(REG_UUNPORT);
		RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
		break;
	case CHR('f'):
		RETV(PLAIN, CHR('\f'));
		break;
	case CHR('m'):
		RET('<');
		break;
	case CHR('M'):
		RET('>');
		break;
	case CHR('n'):
		RETV(PLAIN, CHR('\n'));
		break;
	case CHR('r'):
		RETV(PLAIN, CHR('\r'));
		break;
	case CHR('s'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 's');
		break;
	case CHR('S'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 'S');
		break;
	case CHR('t'):
		RETV(PLAIN, CHR('\t'));
		break;
	case CHR('u'):
		c = lexdigits(v, 16, 4, 4);
		if (ISERR())
			FAILW(REG_EESCAPE);
		RETV(PLAIN, c);
		break;
	case CHR('U'):
		c = lexdigits(v, 16, 8, 8);
		if (ISERR())
			FAILW(REG_EESCAPE);
		RETV(PLAIN, c);
		break;
	case CHR('v'):
		RETV(PLAIN, CHR('\v'));
		break;
	case CHR('w'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 'w');
		break;
	case CHR('W'):
		NOTE(REG_ULOCALE);
		RETV(CCLASS, 'W');
		break;
	case CHR('x'):
		NOTE(REG_UUNPORT);
		c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
		if (ISERR())
			FAILW(REG_EESCAPE);
		RETV(PLAIN, c);
		break;
	case CHR('y'):
		NOTE(REG_ULOCALE);
		RETV(WBDRY, 0);
		break;
	case CHR('Y'):
		NOTE(REG_ULOCALE);
		RETV(NWBDRY, 0);
		break;
	case CHR('Z'):
		RETV(SEND, 0);
		break;
	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
	case CHR('9'):
		save = v->now;
		v->now--;	/* put first digit back */
		c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
		if (ISERR())
			FAILW(REG_EESCAPE);
		/* ugly heuristic (first test is "exactly 1 digit?") */
		if (v->now - save == 0 || (int)c <= v->nsubexp) {
			NOTE(REG_UBACKREF);
			RETV(BACKREF, (chr)c);
		}
		/* oops, doesn't look like it's a backref after all... */
		v->now = save;
		/* and fall through into octal number */
	case CHR('0'):
		NOTE(REG_UUNPORT);
		v->now--;	/* put first digit back */
		c = lexdigits(v, 8, 1, 3);
		if (ISERR())
			FAILW(REG_EESCAPE);
		RETV(PLAIN, c);
		break;
	default:
		assert(iscalpha(c));
		FAILW(REG_EESCAPE);	/* unknown alphabetic escape */
		break;
	}
	assert(NOTREACHED);
}

/*
 - lexdigits - slurp up digits and return chr value
 ^ static chr lexdigits(struct vars *, int, int, int);
 */
static chr			/* chr value; errors signalled via ERR */
lexdigits(v, base, minlen, maxlen)
struct vars *v;
int base;
int minlen;
int maxlen;
{
	uchr n;			/* unsigned to avoid overflow misbehavior */
	int len;
	chr c;
	int d;
	CONST uchr ub = (uchr) base;

	n = 0;
	for (len = 0; len < maxlen && !ATEOS(); len++) {
		c = *v->now++;
		switch (c) {
		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
		case CHR('8'): case CHR('9'):
			d = DIGITVAL(c);
			break;
		case CHR('a'): case CHR('A'): d = 10; break;
		case CHR('b'): case CHR('B'): d = 11; break;
		case CHR('c'): case CHR('C'): d = 12; break;
		case CHR('d'): case CHR('D'): d = 13; break;
		case CHR('e'): case CHR('E'): d = 14; break;
		case CHR('f'): case CHR('F'): d = 15; break;
		default:
			v->now--;	/* oops, not a digit at all */
			d = -1;
			break;
		}

		if (d >= base) {	/* not a plausible digit */
			v->now--;
			d = -1;
		}
		if (d < 0)
			break;		/* NOTE BREAK OUT */
		n = n*ub + (uchr)d;
	}
	if (len < minlen)
		ERR(REG_EESCAPE);

	return (chr)n;
}

/*
 - brenext - get next BRE token
 * This is much like EREs except for all the backslashes and the
 * context-dependency of some things.
 * pc is the character to classify (presumably already consumed by the
 * caller -- confirm there); any further input is taken from v->now.
 * Unescaped characters are dealt with by the first switch, backslash
 * sequences by the code after it.  Tokens are reported through the
 * RET/RETV macros and failures through FAILW, all of which are defined
 * outside this function.
 ^ static int brenext(struct vars *, pchr);
 */
static int			/* 1 normal, 0 failure */
brenext(v, pc)
struct vars *v;
pchr pc;
{
	chr c = (chr)pc;

	switch (c) {
	case CHR('*'):
		/* ordinary chr if the previous token was EMPTY, '(' or '^' */
		if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
			RETV(PLAIN, c);
		RET('*');
		break;
	case CHR('['):
		/* "[[:<:]]" and "[[:>:]]" become '<' and '>' tokens */
		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
				*(v->now+1) == CHR(':') &&
				(*(v->now+2) == CHR('<') ||
						*(v->now+2) == CHR('>')) &&
				*(v->now+3) == CHR(':') &&
				*(v->now+4) == CHR(']') &&
				*(v->now+5) == CHR(']')) {
			c = *(v->now+2);
			v->now += 6;
			NOTE(REG_UNONPOSIX);
			RET((c == CHR('<')) ? '<' : '>');
		}
		/* ordinary bracket expression; value 0 if it opens with '^' */
		INTOCON(L_BRACK);
		if (NEXT1('^')) {
			v->now++;
			RETV('[', 0);
		}
		RETV('[', 1);
		break;
	case CHR('.'):
		RET('.');
		break;
	case CHR('^'):
		/* a '^' token only right after EMPTY or '(', else ordinary */
		if (LASTTYPE(EMPTY))
			RET('^');
		if (LASTTYPE('(')) {
			NOTE(REG_UUNSPEC);
			RET('^');
		}
		RETV(PLAIN, c);
		break;
	case CHR('$'):
		/* in expanded form, look past white space and comments first */
		if (v->cflags&REG_EXPANDED)
			skip(v);
		/* a '$' token only at end of input or just before "\)" */
		if (ATEOS())
			RET('$');
		if (NEXT2('\\', ')')) {
			NOTE(REG_UUNSPEC);
			RET('$');
		}
		RETV(PLAIN, c);
		break;
	case CHR('\\'):
		break;		/* handled after the switch */
	default:
		RETV(PLAIN, c);
		break;
	}

	/* only a backslash gets this far */
	assert(c == CHR('\\'));

	/* a backslash with nothing after it is an error */
	if (ATEOS())
		FAILW(REG_EESCAPE);

	c = *v->now++;		/* the escaped chr */
	switch (c) {
	case CHR('{'):		/* "\{" opens a bound */
		INTOCON(L_BBND);
		NOTE(REG_UBOUNDS);
		RET('{');
		break;
	case CHR('('):		/* "\(" is the '(' token, with value 1 */
		RETV('(', 1);
		break;
	case CHR(')'):		/* "\)" is the ')' token */
		RETV(')', c);
		break;
	case CHR('<'):		/* "\<": same token as [[:<:]] */
		NOTE(REG_UNONPOSIX);
		RET('<');
		break;
	case CHR('>'):		/* "\>": same token as [[:>:]] */
		NOTE(REG_UNONPOSIX);
		RET('>');
		break;
	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
	case CHR('9'):
		/* a single nonzero digit is a back reference */
		NOTE(REG_UBACKREF);
		RETV(BACKREF, (chr)DIGITVAL(c));
		break;
	default:
		/* any other escaped chr stands for itself */
		if (iscalnum(c)) {
			NOTE(REG_UBSALNUM);
			NOTE(REG_UUNSPEC);
		}
		RETV(PLAIN, c);
		break;
	}

	assert(NOTREACHED);
}

/*
 - skip - skip white space and comments in expanded form
 * Advances v->now over any mixture of white space (as judged by iscspace)
 * and comments, where a comment runs from a '#' up to, but not including,
 * the next newline, or to end of input.  REG_UNONPOSIX is noted if
 * anything at all was skipped.
 * Precondition (asserted): REG_EXPANDED is set in v->cflags.
 ^ static VOID skip(struct vars *);
 */
static VOID
skip(v)
struct vars *v;
{
	chr *origin = v->now;
	int more = 1;		/* cleared once neither kind of filler is next */

	assert(v->cflags&REG_EXPANDED);

	while (more) {
		/* white space first */
		for (; !ATEOS() && iscspace(*v->now); v->now++)
			continue;

		if (!ATEOS() && *v->now == CHR('#')) {
			assert(NEXT1('#'));
			/* comment: stop at the newline, the loop above eats it */
			for (; !ATEOS() && *v->now != CHR('\n'); v->now++)
				continue;
		} else {
			more = 0;
		}
	}

	if (origin != v->now) {
		NOTE(REG_UNONPOSIX);
	}
}

/*
 - newline - return the chr for a newline
 * This helps confine use of CHR to this source file.
 ^ static chr newline(NOPARMS);
 */
static chr
newline()
{
	return CHR('\n');
}

/*
 - ch - return the chr sequence for regc_locale.c's fake collating element ch
 * Exists so that use of CHR stays confined to this source file.  The
 * sequence lives in static storage; the caller is expected to know its
 * length already.  Compiled only when REG_DEBUG is defined.
 ^ #ifdef REG_DEBUG
 ^ static chr *ch(NOPARMS);
 ^ #endif
 */
#ifdef REG_DEBUG
static chr *
ch()
{
	static chr seq[] = { CHR('c'), CHR('h'), CHR('\0') };

	return &seq[0];
}
#endif

/*
 - chrnamed - return the chr known by a given (chr string) name
 * The code is a bit clumsy, but this routine gets only such specialized
 * use that it hardly matters.
 * The name is looked up with element(); a failure of that lookup is kept
 * out of v->err and simply makes the routine return lastresort.  The
 * element is then turned into a chr via range(); if that yields nothing
 * usable, lastresort is returned as well.
 ^ static chr chrnamed(struct vars *, chr *, chr *, pchr);
 */
static chr
chrnamed(v, startp, endp, lastresort)
struct vars *v;
chr *startp;			/* start of name */
chr *endp;			/* just past end of name */
pchr lastresort;		/* what to return if name lookup fails */
{
	celt c;
	int errsave;
	int e;
	struct cvec *cv;

	/* do the lookup with a clean error slate, then put the old one back */
	errsave = v->err;
	v->err = 0;
	c = element(v, startp, endp);
	e = v->err;
	v->err = errsave;

	if (e != 0)
		return (chr)lastresort;

	cv = range(v, c, c, 0);
	/*
	 * range() is defined elsewhere; do not dereference a null result
	 * (NOTE(review): presumably what it returns on failure -- confirm).
	 */
	if (!cv || cv->nchrs == 0)
		return (chr)lastresort;
	return cv->chrs[0];
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?