regexec.c

来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C语言 代码 · 共 1,039 行 · 第 1/2 页

C
1,039
字号

	assert(n > 0);
	if ((size_t)n >= v->nmatch)
		return;

	MDEBUG(("setting %d\n", n));
	v->pmatch[n].rm_so = OFF(begin);
	v->pmatch[n].rm_eo = OFF(end);
}

/*
 - dissect - determine subexpression matches (uncomplicated case)
 ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
dissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	assert(t != NULL);
	MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));

	switch (t->op) {
	case '=':		/* terminal node */
		assert(t->left == NULL && t->right == NULL);
		return REG_OKAY;	/* no action, parent did the work */
		break;
	case '|':		/* alternation */
		assert(t->left != NULL);
		return altdissect(v, t, begin, end);
		break;
	case 'b':		/* back ref -- shouldn't be calling us! */
		return REG_ASSERT;
		break;
	case '.':		/* concatenation */
		assert(t->left != NULL && t->right != NULL);
		return condissect(v, t, begin, end);
		break;
	case '(':		/* capturing */
		assert(t->left != NULL && t->right == NULL);
		assert(t->subno > 0);
		subset(v, t, begin, end);
		return dissect(v, t->left, begin, end);
		break;
	default:
		return REG_ASSERT;
		break;
	}
}

/*
 - condissect - determine concatenation subexpression matches (uncomplicated)
 ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
condissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	struct dfa *d;
	struct dfa *d2;
	chr *mid;
	int i;
	int shorter = (t->left->flags&SHORTER) ? 1 : 0;
	chr *stop = (shorter) ? end : begin;

	assert(t->op == '.');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(t->right != NULL && t->right->cnfa.nstates > 0);

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
	NOERR();
	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
	if (ISERR()) {
		assert(d2 == NULL);
		freedfa(d);
		return v->err;
	}

	/* pick a tentative midpoint */
	if (shorter)
		mid = shortest(v, d, begin, begin, end, (chr **)NULL,
								(int *)NULL);
	else
		mid = longest(v, d, begin, end, (int *)NULL);
	if (mid == NULL) {
		freedfa(d);
		freedfa(d2);
		return REG_ASSERT;
	}
	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));

	/* iterate until satisfaction or failure */
	while (longest(v, d2, mid, end, (int *)NULL) != end) {
		/* that midpoint didn't work, find a new one */
		if (mid == stop) {
			/* all possibilities exhausted! */
			MDEBUG(("no midpoint!\n"));
			freedfa(d);
			freedfa(d2);
			return REG_ASSERT;
		}
		if (shorter)
			mid = shortest(v, d, begin, mid+1, end, (chr **)NULL,
								(int *)NULL);
		else
			mid = longest(v, d, begin, mid-1, (int *)NULL);
		if (mid == NULL) {
			/* failed to find a new one! */
			MDEBUG(("failed midpoint!\n"));
			freedfa(d);
			freedfa(d2);
			return REG_ASSERT;
		}
		MDEBUG(("new midpoint %ld\n", LOFF(mid)));
	}

	/* satisfaction */
	MDEBUG(("successful\n"));
	freedfa(d);
	freedfa(d2);
	i = dissect(v, t->left, begin, mid);
	if (i != REG_OKAY)
		return i;
	return dissect(v, t->right, mid, end);
}

/*
 - altdissect - determine alternative subexpression matches (uncomplicated)
 ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
altdissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	struct dfa *d;
	int i;

	assert(t != NULL);
	assert(t->op == '|');

	for (i = 0; t != NULL; t = t->right, i++) {
		MDEBUG(("trying %dth\n", i));
		assert(t->left != NULL && t->left->cnfa.nstates > 0);
		d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
		if (ISERR())
			return v->err;
		if (longest(v, d, begin, end, (int *)NULL) == end) {
			MDEBUG(("success\n"));
			freedfa(d);
			return dissect(v, t->left, begin, end);
		}
		freedfa(d);
	}
	return REG_ASSERT;	/* none of them matched?!? */
}

/*
 - cdissect - determine subexpression matches (with complications)
 * The retry memory stores the offset of the trial midpoint from begin, 
 * plus 1 so that 0 uniquely means "clean slate".
 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
cdissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	int er;

	assert(t != NULL);
	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));

	switch (t->op) {
	case '=':		/* terminal node */
		assert(t->left == NULL && t->right == NULL);
		return REG_OKAY;	/* no action, parent did the work */
		break;
	case '|':		/* alternation */
		assert(t->left != NULL);
		return caltdissect(v, t, begin, end);
		break;
	case 'b':		/* back ref -- shouldn't be calling us! */
		assert(t->left == NULL && t->right == NULL);
		return cbrdissect(v, t, begin, end);
		break;
	case '.':		/* concatenation */
		assert(t->left != NULL && t->right != NULL);
		return ccondissect(v, t, begin, end);
		break;
	case '(':		/* capturing */
		assert(t->left != NULL && t->right == NULL);
		assert(t->subno > 0);
		er = cdissect(v, t->left, begin, end);
		if (er == REG_OKAY)
			subset(v, t, begin, end);
		return er;
		break;
	default:
		return REG_ASSERT;
		break;
	}
}

/*
 - ccondissect - concatenation subexpression matches (with complications)
 * The retry memory stores the offset of the trial midpoint from begin, 
 * plus 1 so that 0 uniquely means "clean slate".
 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
ccondissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	struct dfa *d;
	struct dfa *d2;
	chr *mid;
	int er;

	assert(t->op == '.');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(t->right != NULL && t->right->cnfa.nstates > 0);

	if (t->left->flags&SHORTER)		/* reverse scan */
		return crevdissect(v, t, begin, end);

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
		return v->err;
	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR()) {
		freedfa(d);
		return v->err;
	}
	MDEBUG(("cconcat %d\n", t->retry));

	/* pick a tentative midpoint */
	if (v->mem[t->retry] == 0) {
		mid = longest(v, d, begin, end, (int *)NULL);
		if (mid == NULL) {
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
		v->mem[t->retry] = (mid - begin) + 1;
	} else {
		mid = begin + (v->mem[t->retry] - 1);
		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
	}

	/* iterate until satisfaction or failure */
	for (;;) {
		/* try this midpoint on for size */
		er = cdissect(v, t->left, begin, mid);
		if (er == REG_OKAY &&
				longest(v, d2, mid, end, (int *)NULL) == end &&
				(er = cdissect(v, t->right, mid, end)) == 
								REG_OKAY)
			break;			/* NOTE BREAK OUT */
		if (er != REG_OKAY && er != REG_NOMATCH) {
			freedfa(d);
			freedfa(d2);
			return er;
		}

		/* that midpoint didn't work, find a new one */
		if (mid == begin) {
			/* all possibilities exhausted */
			MDEBUG(("%d no midpoint\n", t->retry));
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		mid = longest(v, d, begin, mid-1, (int *)NULL);
		if (mid == NULL) {
			/* failed to find a new one */
			MDEBUG(("%d failed midpoint\n", t->retry));
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
		v->mem[t->retry] = (mid - begin) + 1;
		zapmem(v, t->left);
		zapmem(v, t->right);
	}

	/* satisfaction */
	MDEBUG(("successful\n"));
	freedfa(d);
	freedfa(d2);
	return REG_OKAY;
}

/*
 - crevdissect - determine backref shortest-first subexpression matches
 * The retry memory stores the offset of the trial midpoint from begin, 
 * plus 1 so that 0 uniquely means "clean slate".
 ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
crevdissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	struct dfa *d;
	struct dfa *d2;
	chr *mid;
	int er;

	assert(t->op == '.');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(t->right != NULL && t->right->cnfa.nstates > 0);
	assert(t->left->flags&SHORTER);

	/* concatenation -- need to split the substring between parts */
	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
		return v->err;
	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR()) {
		freedfa(d);
		return v->err;
	}
	MDEBUG(("crev %d\n", t->retry));

	/* pick a tentative midpoint */
	if (v->mem[t->retry] == 0) {
		mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL);
		if (mid == NULL) {
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
		v->mem[t->retry] = (mid - begin) + 1;
	} else {
		mid = begin + (v->mem[t->retry] - 1);
		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
	}

	/* iterate until satisfaction or failure */
	for (;;) {
		/* try this midpoint on for size */
		er = cdissect(v, t->left, begin, mid);
		if (er == REG_OKAY &&
				longest(v, d2, mid, end, (int *)NULL) == end &&
				(er = cdissect(v, t->right, mid, end)) == 
								REG_OKAY)
			break;			/* NOTE BREAK OUT */
		if (er != REG_OKAY && er != REG_NOMATCH) {
			freedfa(d);
			freedfa(d2);
			return er;
		}

		/* that midpoint didn't work, find a new one */
		if (mid == end) {
			/* all possibilities exhausted */
			MDEBUG(("%d no midpoint\n", t->retry));
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL);
		if (mid == NULL) {
			/* failed to find a new one */
			MDEBUG(("%d failed midpoint\n", t->retry));
			freedfa(d);
			freedfa(d2);
			return REG_NOMATCH;
		}
		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
		v->mem[t->retry] = (mid - begin) + 1;
		zapmem(v, t->left);
		zapmem(v, t->right);
	}

	/* satisfaction */
	MDEBUG(("successful\n"));
	freedfa(d);
	freedfa(d2);
	return REG_OKAY;
}

/*
 - cbrdissect - determine backref subexpression matches
 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
cbrdissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	int i;
	int n = t->subno;
	size_t len;
	chr *paren;
	chr *p;
	chr *stop;
	int min = t->min;
	int max = t->max;

	assert(t != NULL);
	assert(t->op == 'b');
	assert(n >= 0);
	assert((size_t)n < v->nmatch);

	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));

	if (v->pmatch[n].rm_so == -1)
		return REG_NOMATCH;
	paren = v->start + v->pmatch[n].rm_so;
	len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;

	/* no room to maneuver -- retries are pointless */
	if (v->mem[t->retry])
		return REG_NOMATCH;
	v->mem[t->retry] = 1;

	/* special-case zero-length string */
	if (len == 0) {
		if (begin == end)
			return REG_OKAY;
		return REG_NOMATCH;
	}

	/* and too-short string */
	assert(end >= begin);
	if ((size_t)(end - begin) < len)
		return REG_NOMATCH;
	stop = end - len;

	/* count occurrences */
	i = 0;
	for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
		if ((*v->g->compare)(paren, p, len) != 0)
				break;
		i++;
	}
	MDEBUG(("cbackref found %d\n", i));

	/* and sort it out */
	if (p != end)			/* didn't consume all of it */
		return REG_NOMATCH;
	if (min <= i && (i <= max || max == INFINITY))
		return REG_OKAY;
	return REG_NOMATCH;		/* out of range */
}

/*
 - caltdissect - determine alternative subexpression matches (w. complications)
 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
 */
static int			/* regexec return code */
caltdissect(v, t, begin, end)
struct vars *v;
struct subre *t;
chr *begin;			/* beginning of relevant substring */
chr *end;			/* end of same */
{
	struct dfa *d;
	int er;
#	define	UNTRIED	0	/* not yet tried at all */
#	define	TRYING	1	/* top matched, trying submatches */
#	define	TRIED	2	/* top didn't match or submatches exhausted */

	if (t == NULL)
		return REG_NOMATCH;
	assert(t->op == '|');
	if (v->mem[t->retry] == TRIED)
		return caltdissect(v, t->right, begin, end);

	MDEBUG(("calt n%d\n", t->retry));
	assert(t->left != NULL);

	if (v->mem[t->retry] == UNTRIED) {
		d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
		if (ISERR())
			return v->err;
		if (longest(v, d, begin, end, (int *)NULL) != end) {
			freedfa(d);
			v->mem[t->retry] = TRIED;
			return caltdissect(v, t->right, begin, end);
		}
		freedfa(d);
		MDEBUG(("calt matched\n"));
		v->mem[t->retry] = TRYING;
	}

	er = cdissect(v, t->left, begin, end);
	if (er != REG_NOMATCH)
		return er;

	v->mem[t->retry] = TRIED;
	return caltdissect(v, t->right, begin, end);
}



#include "rege_dfa.c"

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?