regcomp.c

来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C语言 代码 · 共 2,180 行 · 第 1/5 页

C
2,180
字号
/*
 * re_*comp and friends - compile REs
 * This file #includes several others (see the bottom).
 *
 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
 * 
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results.  The author
 * thanks all of them. 
 * 
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 * 
 * I'd appreciate being given credit for this package in the documentation
 * of software which uses it, but that is not a requirement.
 * 
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "regguts.h"

/*
 * forward declarations, up here so forward datatypes etc. are defined early
 */
/* =====^!^===== begin forwards =====^!^===== */
/* automatically gathered by fwd; do not hand-edit */
/* === regcomp.c === */
int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
static VOID moresubs _ANSI_ARGS_((struct vars *, int));
static int freev _ANSI_ARGS_((struct vars *, int));
static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
static int scannum _ANSI_ARGS_((struct vars *));
static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static chr *scanplain _ANSI_ARGS_((struct vars *));
static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
static VOID wordchrs _ANSI_ARGS_((struct vars *));
static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
static int numst _ANSI_ARGS_((struct subre *, int));
static VOID markst _ANSI_ARGS_((struct subre *));
static VOID cleanst _ANSI_ARGS_((struct vars *));
static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
static VOID freelacons _ANSI_ARGS_((struct subre *, int));
static VOID rfree _ANSI_ARGS_((regex_t *));
static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
/* === regc_lex.c === */
static VOID lexstart _ANSI_ARGS_((struct vars *));
static VOID prefixes _ANSI_ARGS_((struct vars *));
static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
static VOID lexword _ANSI_ARGS_((struct vars *));
static int next _ANSI_ARGS_((struct vars *));
static int lexescape _ANSI_ARGS_((struct vars *));
static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
static int brenext _ANSI_ARGS_((struct vars *, pchr));
static VOID skip _ANSI_ARGS_((struct vars *));
static chr newline _ANSI_ARGS_((NOPARMS));
#ifdef REG_DEBUG
static chr *ch _ANSI_ARGS_((NOPARMS));
#endif
static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
/* === regc_color.c === */
static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
static VOID freecm _ANSI_ARGS_((struct colormap *));
static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
static color maxcolor _ANSI_ARGS_((struct colormap *));
static color newcolor _ANSI_ARGS_((struct colormap *));
static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
static color pseudocolor _ANSI_ARGS_((struct colormap *));
static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
#ifdef REG_DEBUG
static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
#endif
/* === regc_nfa.c === */
static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
static VOID freenfa _ANSI_ARGS_((struct nfa *));
static struct state *newstate _ANSI_ARGS_((struct nfa *));
static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
static VOID specialcolors _ANSI_ARGS_((struct nfa *));
static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
static int push _ANSI_ARGS_((struct nfa *, struct arc *));
#define	INCOMPATIBLE	1	/* destroys arc */
#define	SATISFIED	2	/* constraint satisfied */
#define	COMPATIBLE	3	/* compatible but not satisfied yet */
static int combine _ANSI_ARGS_((struct arc *, struct arc *));
static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID cleanup _ANSI_ARGS_((struct nfa *));
static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static long analyze _ANSI_ARGS_((struct nfa *));
static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
#ifdef REG_DEBUG
static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
#endif
static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
#ifdef REG_DEBUG
static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
#endif
/* === regc_cvec.c === */
static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
static int haschr _ANSI_ARGS_((struct cvec *, pchr));
static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
static VOID freecvec _ANSI_ARGS_((struct cvec *));
/* === regc_locale.c === */
static int nmcces _ANSI_ARGS_((struct vars *));
static int nleaders _ANSI_ARGS_((struct vars *));
static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
static int before _ANSI_ARGS_((celt, celt));
static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */



/* internal variables, bundled for easy passing around */
struct vars {
	regex_t *re;
	chr *now;		/* scan pointer into string */
	chr *stop;		/* end of string */
	chr *savenow;		/* saved now and stop for "subroutine call" */
	chr *savestop;
	int err;		/* error code (0 if none) */
	int cflags;		/* copy of compile flags */
	int lasttype;		/* type of previous token */
	int nexttype;		/* type of next token */
	chr nextvalue;		/* value (if any) of next token */
	int lexcon;		/* lexical context type (see lex.c) */
	int nsubexp;		/* subexpression count */
	struct subre **subs;	/* subRE pointer vector */
	size_t nsubs;		/* length of vector */
	struct subre *sub10[10];	/* initial vector, enough for most */
	struct nfa *nfa;	/* the NFA */
	struct colormap *cm;	/* character color map */
	color nlcolor;		/* color of newline */
	struct state *wordchrs;	/* state in nfa holding word-char outarcs */
	struct subre *tree;	/* subexpression tree */
	struct subre *treechain;	/* all tree nodes allocated */
	struct subre *treefree;		/* any free tree nodes */
	int ntree;		/* number of tree nodes */
	struct cvec *cv;	/* interface cvec */
	struct cvec *cv2;	/* utility cvec */
	struct cvec *mcces;	/* collating-element information */
#		define	ISCELEADER(v,c)	(v->mcces != NULL && haschr(v->mcces, (c)))
	struct state *mccepbegin;	/* in nfa, start of MCCE prototypes */
	struct state *mccepend;	/* in nfa, end of MCCE prototypes */
	struct subre *lacons;	/* lookahead-constraint vector */
	int nlacons;		/* size of lacons */
};

/* parsing macros; most know that `v' is the struct vars pointer */
#define	NEXT()	(next(v))		/* advance by one token */
#define	SEE(t)	(v->nexttype == (t))	/* is next token this? */
#define	EAT(t)	(SEE(t) && next(v))	/* if next is this, swallow it */
#define	VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
#define	ISERR()	VISERR(v)
#define	VERR(vv,e)	((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
							((vv)->err = (e)))
#define	ERR(e)	VERR(v, e)		/* record an error */
#define	NOERR()	{if (ISERR()) return;}	/* if error seen, return */
#define	NOERRN()	{if (ISERR()) return NULL;}	/* NOERR with retval */
#define	NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
#define	INSIST(c, e)	((c) ? 0 : ERR(e))	/* if condition false, error */
#define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
#define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)

/* token type codes, some also used as NFA arc types */
#define	EMPTY	'n'		/* no token present */
#define	EOS	'e'		/* end of string */
#define	PLAIN	'p'		/* ordinary character */
#define	DIGIT	'd'		/* digit (in bound) */
#define	BACKREF	'b'		/* back reference */
#define	COLLEL	'I'		/* start of [. */
#define	ECLASS	'E'		/* start of [= */
#define	CCLASS	'C'		/* start of [: */
#define	END	'X'		/* end of [. [= [: */
#define	RANGE	'R'		/* - within [] which might be range delim. */
#define	LACON	'L'		/* lookahead constraint subRE */
#define	AHEAD	'a'		/* color-lookahead arc */
#define	BEHIND	'r'		/* color-lookbehind arc */
#define	WBDRY	'w'		/* word boundary constraint */
#define	NWBDRY	'W'		/* non-word-boundary constraint */
#define	SBEGIN	'A'		/* beginning of string (even if not BOL) */
#define	SEND	'Z'		/* end of string (even if not EOL) */
#define	PREFER	'P'		/* length preference */

/* is an arc colored, and hence on a color chain? */
#define	COLORED(a)	((a)->type == PLAIN || (a)->type == AHEAD || \
							(a)->type == BEHIND)



/* static function list */
static struct fns functions = {
	rfree,			/* regfree insides */
};



/*
 - compile - compile regular expression
 ^ int compile(regex_t *, CONST chr *, size_t, int);
 */
int
compile(re, string, len, flags)
regex_t *re;
CONST chr *string;
size_t len;
int flags;
{
	struct vars var;
	struct vars *v = &var;
	struct guts *g;
	int i;
	size_t j;
	FILE *debug = (flags&REG_PROGRESS) ? stdout : (FILE *)NULL;
#	define	CNOERR()	{ if (ISERR()) return freev(v, v->err); }

	/* sanity checks */

	if (re == NULL || string == NULL)
		return REG_INVARG;
	if ((flags&REG_QUOTE) &&
			(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
		return REG_INVARG;
	if (!(flags&REG_EXTENDED) && (flags&REG_ADVF))
		return REG_INVARG;

	/* initial setup (after which freev() is callable) */
	v->re = re;
	v->now = (chr *)string;
	v->stop = v->now + len;
	v->savenow = v->savestop = NULL;
	v->err = 0;
	v->cflags = flags;
	v->nsubexp = 0;
	v->subs = v->sub10;
	v->nsubs = 10;
	for (j = 0; j < v->nsubs; j++)
		v->subs[j] = NULL;
	v->nfa = NULL;
	v->cm = NULL;
	v->nlcolor = COLORLESS;
	v->wordchrs = NULL;
	v->tree = NULL;
	v->treechain = NULL;
	v->treefree = NULL;
	v->cv = NULL;
	v->cv2 = NULL;
	v->mcces = NULL;
	v->lacons = NULL;
	v->nlacons = 0;
	re->re_magic = REMAGIC;
	re->re_info = 0;		/* bits get set during parse */
	re->re_csize = sizeof(chr);
	re->re_guts = NULL;
	re->re_fns = VS(&functions);

	/* more complex setup, malloced things */
	re->re_guts = VS(MALLOC(sizeof(struct guts)));
	if (re->re_guts == NULL)
		return freev(v, REG_ESPACE);
	g = (struct guts *)re->re_guts;
	g->tree = NULL;
	initcm(v, &g->cmap);
	v->cm = &g->cmap;
	g->lacons = NULL;
	g->nlacons = 0;
	ZAPCNFA(g->search);
	v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
	CNOERR();
	v->cv = newcvec(100, 20, 10);
	if (v->cv == NULL)
		return freev(v, REG_ESPACE);
	i = nmcces(v);
	if (i > 0) {
		v->mcces = newcvec(nleaders(v), 0, i);
		CNOERR();
		v->mcces = allmcces(v, v->mcces);
		leaders(v, v->mcces);
		addmcce(v->mcces, (chr *)NULL, (chr *)NULL);	/* dummy */
	}
	CNOERR();

	/* parsing */
	lexstart(v);			/* also handles prefixes */
	if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
		/* assign newline a unique color */
		v->nlcolor = subcolor(v->cm, newline());
		okcolors(v->nfa, v->cm);
	}
	CNOERR();
	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
	assert(SEE(EOS));		/* even if error; ISERR() => SEE(EOS) */
	CNOERR();
	assert(v->tree != NULL);

	/* finish setup of nfa and its subre tree */
	specialcolors(v->nfa);
	CNOERR();
	if (debug != NULL) {
		fprintf(debug, "\n\n\n========= RAW ==========\n");
		dumpnfa(v->nfa, debug);
		dumpst(v->tree, debug, 1);
	}
	optst(v, v->tree);
	v->ntree = numst(v->tree, 1);
	markst(v->tree);
	cleanst(v);
	if (debug != NULL) {
		fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
		dumpst(v->tree, debug, 1);
	}

	/* build compacted NFAs for tree and lacons */
	re->re_info |= nfatree(v, v->tree, debug);
	CNOERR();
	assert(v->nlacons == 0 || v->lacons != NULL);
	for (i = 1; i < v->nlacons; i++) {
		if (debug != NULL)
			fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
		nfanode(v, &v->lacons[i], debug);
	}
	CNOERR();
	if (v->tree->flags&SHORTER)
		NOTE(REG_USHORTEST);

	/* build compacted NFAs for tree, lacons, fast search */
	if (debug != NULL)
		fprintf(debug, "\n\n\n========= SEARCH ==========\n");
	/* can sacrifice main NFA now, so use it as work area */
	(DISCARD)optimize(v->nfa, debug);
	CNOERR();
	makesearch(v, v->nfa);
	CNOERR();
	compact(v->nfa, &g->search);
	CNOERR();

	/* looks okay, package it up */
	re->re_nsub = v->nsubexp;
	v->re = NULL;			/* freev no longer frees re */
	g->magic = GUTSMAGIC;
	g->cflags = v->cflags;
	g->info = re->re_info;
	g->nsub = re->re_nsub;
	g->tree = v->tree;
	v->tree = NULL;
	g->ntree = v->ntree;
	g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
	g->lacons = v->lacons;
	v->lacons = NULL;
	g->nlacons = v->nlacons;

	if (flags&REG_DUMP)
		dump(re, stdout);

	assert(v->err == 0);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?