regcomp.c
来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C语言 代码 · 共 2,180 行 · 第 1/5 页
C
2,180 行
/*
* re_*comp and friends - compile REs
* This file #includes several others (see the bottom).
*
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
*
* Development of this software was funded, in part, by Cray Research Inc.,
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
* Corporation, none of whom are responsible for the results. The author
* thanks all of them.
*
* Redistribution and use in source and binary forms -- with or without
* modification -- are permitted for any purpose, provided that
* redistributions in source form retain this entire copyright notice and
* indicate the origin and nature of any modifications.
*
* I'd appreciate being given credit for this package in the documentation
* of software which uses it, but that is not a requirement.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include "regguts.h"
/*
* forward declarations, up here so forward datatypes etc. are defined early
*/
/* =====^!^===== begin forwards =====^!^===== */
/* automatically gathered by fwd; do not hand-edit */
/* === regcomp.c === */
int compile _ANSI_ARGS_((regex_t *, CONST chr *, size_t, int));
static VOID moresubs _ANSI_ARGS_((struct vars *, int));
static int freev _ANSI_ARGS_((struct vars *, int));
static VOID makesearch _ANSI_ARGS_((struct vars *, struct nfa *));
static struct subre *parse _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
static struct subre *parsebranch _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, int));
static VOID parseqatom _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *, struct subre *));
static VOID nonword _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
static VOID word _ANSI_ARGS_((struct vars *, int, struct state *, struct state *));
static int scannum _ANSI_ARGS_((struct vars *));
static VOID repeat _ANSI_ARGS_((struct vars *, struct state *, struct state *, int, int));
static VOID bracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static VOID cbracket _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static VOID brackpart _ANSI_ARGS_((struct vars *, struct state *, struct state *));
static chr *scanplain _ANSI_ARGS_((struct vars *));
static VOID leaders _ANSI_ARGS_((struct vars *, struct cvec *));
static VOID onechr _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID dovec _ANSI_ARGS_((struct vars *, struct cvec *, struct state *, struct state *));
static celt nextleader _ANSI_ARGS_((struct vars *, pchr, pchr));
static VOID wordchrs _ANSI_ARGS_((struct vars *));
static struct subre *subre _ANSI_ARGS_((struct vars *, int, int, struct state *, struct state *));
static VOID freesubre _ANSI_ARGS_((struct vars *, struct subre *));
static VOID freesrnode _ANSI_ARGS_((struct vars *, struct subre *));
static VOID optst _ANSI_ARGS_((struct vars *, struct subre *));
static int numst _ANSI_ARGS_((struct subre *, int));
static VOID markst _ANSI_ARGS_((struct subre *));
static VOID cleanst _ANSI_ARGS_((struct vars *));
static long nfatree _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static long nfanode _ANSI_ARGS_((struct vars *, struct subre *, FILE *));
static int newlacon _ANSI_ARGS_((struct vars *, struct state *, struct state *, int));
static VOID freelacons _ANSI_ARGS_((struct subre *, int));
static VOID rfree _ANSI_ARGS_((regex_t *));
static VOID dump _ANSI_ARGS_((regex_t *, FILE *));
static VOID dumpst _ANSI_ARGS_((struct subre *, FILE *, int));
static VOID stdump _ANSI_ARGS_((struct subre *, FILE *, int));
static char *stid _ANSI_ARGS_((struct subre *, char *, size_t));
/* === regc_lex.c === */
static VOID lexstart _ANSI_ARGS_((struct vars *));
static VOID prefixes _ANSI_ARGS_((struct vars *));
static VOID lexnest _ANSI_ARGS_((struct vars *, chr *, chr *));
static VOID lexword _ANSI_ARGS_((struct vars *));
static int next _ANSI_ARGS_((struct vars *));
static int lexescape _ANSI_ARGS_((struct vars *));
static chr lexdigits _ANSI_ARGS_((struct vars *, int, int, int));
static int brenext _ANSI_ARGS_((struct vars *, pchr));
static VOID skip _ANSI_ARGS_((struct vars *));
static chr newline _ANSI_ARGS_((NOPARMS));
#ifdef REG_DEBUG
static chr *ch _ANSI_ARGS_((NOPARMS));
#endif
static chr chrnamed _ANSI_ARGS_((struct vars *, chr *, chr *, pchr));
/* === regc_color.c === */
static VOID initcm _ANSI_ARGS_((struct vars *, struct colormap *));
static VOID freecm _ANSI_ARGS_((struct colormap *));
static VOID cmtreefree _ANSI_ARGS_((struct colormap *, union tree *, int));
static color setcolor _ANSI_ARGS_((struct colormap *, pchr, pcolor));
static color maxcolor _ANSI_ARGS_((struct colormap *));
static color newcolor _ANSI_ARGS_((struct colormap *));
static VOID freecolor _ANSI_ARGS_((struct colormap *, pcolor));
static color pseudocolor _ANSI_ARGS_((struct colormap *));
static color subcolor _ANSI_ARGS_((struct colormap *, pchr c));
static color newsub _ANSI_ARGS_((struct colormap *, pcolor));
static VOID subrange _ANSI_ARGS_((struct vars *, pchr, pchr, struct state *, struct state *));
static VOID subblock _ANSI_ARGS_((struct vars *, pchr, struct state *, struct state *));
static VOID okcolors _ANSI_ARGS_((struct nfa *, struct colormap *));
static VOID colorchain _ANSI_ARGS_((struct colormap *, struct arc *));
static VOID uncolorchain _ANSI_ARGS_((struct colormap *, struct arc *));
static int singleton _ANSI_ARGS_((struct colormap *, pchr c));
static VOID rainbow _ANSI_ARGS_((struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *));
static VOID colorcomplement _ANSI_ARGS_((struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *));
#ifdef REG_DEBUG
static VOID dumpcolors _ANSI_ARGS_((struct colormap *, FILE *));
static VOID fillcheck _ANSI_ARGS_((struct colormap *, union tree *, int, FILE *));
static VOID dumpchr _ANSI_ARGS_((pchr, FILE *));
#endif
/* === regc_nfa.c === */
static struct nfa *newnfa _ANSI_ARGS_((struct vars *, struct colormap *, struct nfa *));
static VOID freenfa _ANSI_ARGS_((struct nfa *));
static struct state *newstate _ANSI_ARGS_((struct nfa *));
static struct state *newfstate _ANSI_ARGS_((struct nfa *, int flag));
static VOID dropstate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID freestate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID destroystate _ANSI_ARGS_((struct nfa *, struct state *));
static VOID newarc _ANSI_ARGS_((struct nfa *, int, pcolor, struct state *, struct state *));
static struct arc *allocarc _ANSI_ARGS_((struct nfa *, struct state *));
static VOID freearc _ANSI_ARGS_((struct nfa *, struct arc *));
static struct arc *findarc _ANSI_ARGS_((struct state *, int, pcolor));
static VOID cparc _ANSI_ARGS_((struct nfa *, struct arc *, struct state *, struct state *));
static VOID moveins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID copyins _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID moveouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID copyouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID cloneouts _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, int));
static VOID delsub _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID deltraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID dupnfa _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *, struct state *));
static VOID duptraverse _ANSI_ARGS_((struct nfa *, struct state *, struct state *));
static VOID cleartraverse _ANSI_ARGS_((struct nfa *, struct state *));
static VOID specialcolors _ANSI_ARGS_((struct nfa *));
static long optimize _ANSI_ARGS_((struct nfa *, FILE *));
static VOID pullback _ANSI_ARGS_((struct nfa *, FILE *));
static int pull _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID pushfwd _ANSI_ARGS_((struct nfa *, FILE *));
static int push _ANSI_ARGS_((struct nfa *, struct arc *));
#define INCOMPATIBLE 1 /* destroys arc */
#define SATISFIED 2 /* constraint satisfied */
#define COMPATIBLE 3 /* compatible but not satisfied yet */
static int combine _ANSI_ARGS_((struct arc *, struct arc *));
static VOID fixempties _ANSI_ARGS_((struct nfa *, FILE *));
static int unempty _ANSI_ARGS_((struct nfa *, struct arc *));
static VOID cleanup _ANSI_ARGS_((struct nfa *));
static VOID markreachable _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static VOID markcanreach _ANSI_ARGS_((struct nfa *, struct state *, struct state *, struct state *));
static long analyze _ANSI_ARGS_((struct nfa *));
static VOID compact _ANSI_ARGS_((struct nfa *, struct cnfa *));
static VOID carcsort _ANSI_ARGS_((struct carc *, struct carc *));
static VOID freecnfa _ANSI_ARGS_((struct cnfa *));
static VOID dumpnfa _ANSI_ARGS_((struct nfa *, FILE *));
#ifdef REG_DEBUG
static VOID dumpstate _ANSI_ARGS_((struct state *, FILE *));
static VOID dumparcs _ANSI_ARGS_((struct state *, FILE *));
static int dumprarcs _ANSI_ARGS_((struct arc *, struct state *, FILE *, int));
static VOID dumparc _ANSI_ARGS_((struct arc *, struct state *, FILE *));
#endif
static VOID dumpcnfa _ANSI_ARGS_((struct cnfa *, FILE *));
#ifdef REG_DEBUG
static VOID dumpcstate _ANSI_ARGS_((int, struct carc *, struct cnfa *, FILE *));
#endif
/* === regc_cvec.c === */
static struct cvec *newcvec _ANSI_ARGS_((int, int, int));
static struct cvec *clearcvec _ANSI_ARGS_((struct cvec *));
static VOID addchr _ANSI_ARGS_((struct cvec *, pchr));
static VOID addrange _ANSI_ARGS_((struct cvec *, pchr, pchr));
static VOID addmcce _ANSI_ARGS_((struct cvec *, chr *, chr *));
static int haschr _ANSI_ARGS_((struct cvec *, pchr));
static struct cvec *getcvec _ANSI_ARGS_((struct vars *, int, int, int));
static VOID freecvec _ANSI_ARGS_((struct cvec *));
/* === regc_locale.c === */
static int nmcces _ANSI_ARGS_((struct vars *));
static int nleaders _ANSI_ARGS_((struct vars *));
static struct cvec *allmcces _ANSI_ARGS_((struct vars *, struct cvec *));
static celt element _ANSI_ARGS_((struct vars *, chr *, chr *));
static struct cvec *range _ANSI_ARGS_((struct vars *, celt, celt, int));
static int before _ANSI_ARGS_((celt, celt));
static struct cvec *eclass _ANSI_ARGS_((struct vars *, celt, int));
static struct cvec *cclass _ANSI_ARGS_((struct vars *, chr *, chr *, int));
static struct cvec *allcases _ANSI_ARGS_((struct vars *, pchr));
static int cmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
static int casecmp _ANSI_ARGS_((CONST chr *, CONST chr *, size_t));
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */
/* internal variables, bundled for easy passing around */
struct vars {
regex_t *re;
chr *now; /* scan pointer into string */
chr *stop; /* end of string */
chr *savenow; /* saved now and stop for "subroutine call" */
chr *savestop;
int err; /* error code (0 if none) */
int cflags; /* copy of compile flags */
int lasttype; /* type of previous token */
int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see lex.c) */
int nsubexp; /* subexpression count */
struct subre **subs; /* subRE pointer vector */
size_t nsubs; /* length of vector */
struct subre *sub10[10]; /* initial vector, enough for most */
struct nfa *nfa; /* the NFA */
struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */
struct state *wordchrs; /* state in nfa holding word-char outarcs */
struct subre *tree; /* subexpression tree */
struct subre *treechain; /* all tree nodes allocated */
struct subre *treefree; /* any free tree nodes */
int ntree; /* number of tree nodes */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
struct cvec *mcces; /* collating-element information */
# define ISCELEADER(v,c) (v->mcces != NULL && haschr(v->mcces, (c)))
struct state *mccepbegin; /* in nfa, start of MCCE prototypes */
struct state *mccepend; /* in nfa, end of MCCE prototypes */
struct subre *lacons; /* lookahead-constraint vector */
int nlacons; /* size of lacons */
};
/* parsing macros; most know that `v' is the struct vars pointer */
#define NEXT() (next(v)) /* advance by one token */
#define SEE(t) (v->nexttype == (t)) /* is next token this? */
#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
#define ISERR() VISERR(v)
#define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
((vv)->err = (e)))
#define ERR(e) VERR(v, e) /* record an error */
#define NOERR() {if (ISERR()) return;} /* if error seen, return */
#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
/* token type codes, some also used as NFA arc types */
#define EMPTY 'n' /* no token present */
#define EOS 'e' /* end of string */
#define PLAIN 'p' /* ordinary character */
#define DIGIT 'd' /* digit (in bound) */
#define BACKREF 'b' /* back reference */
#define COLLEL 'I' /* start of [. */
#define ECLASS 'E' /* start of [= */
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
#define RANGE 'R' /* - within [] which might be range delim. */
#define LACON 'L' /* lookahead constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
#define BEHIND 'r' /* color-lookbehind arc */
#define WBDRY 'w' /* word boundary constraint */
#define NWBDRY 'W' /* non-word-boundary constraint */
#define SBEGIN 'A' /* beginning of string (even if not BOL) */
#define SEND 'Z' /* end of string (even if not EOL) */
#define PREFER 'P' /* length preference */
/* is an arc colored, and hence on a color chain? */
#define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \
(a)->type == BEHIND)
/* static function list */
static struct fns functions = {
rfree, /* regfree insides */
};
/*
- compile - compile regular expression
^ int compile(regex_t *, CONST chr *, size_t, int);
*/
int
compile(re, string, len, flags)
regex_t *re;
CONST chr *string;
size_t len;
int flags;
{
struct vars var;
struct vars *v = &var;
struct guts *g;
int i;
size_t j;
FILE *debug = (flags®_PROGRESS) ? stdout : (FILE *)NULL;
# define CNOERR() { if (ISERR()) return freev(v, v->err); }
/* sanity checks */
if (re == NULL || string == NULL)
return REG_INVARG;
if ((flags®_QUOTE) &&
(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
return REG_INVARG;
if (!(flags®_EXTENDED) && (flags®_ADVF))
return REG_INVARG;
/* initial setup (after which freev() is callable) */
v->re = re;
v->now = (chr *)string;
v->stop = v->now + len;
v->savenow = v->savestop = NULL;
v->err = 0;
v->cflags = flags;
v->nsubexp = 0;
v->subs = v->sub10;
v->nsubs = 10;
for (j = 0; j < v->nsubs; j++)
v->subs[j] = NULL;
v->nfa = NULL;
v->cm = NULL;
v->nlcolor = COLORLESS;
v->wordchrs = NULL;
v->tree = NULL;
v->treechain = NULL;
v->treefree = NULL;
v->cv = NULL;
v->cv2 = NULL;
v->mcces = NULL;
v->lacons = NULL;
v->nlacons = 0;
re->re_magic = REMAGIC;
re->re_info = 0; /* bits get set during parse */
re->re_csize = sizeof(chr);
re->re_guts = NULL;
re->re_fns = VS(&functions);
/* more complex setup, malloced things */
re->re_guts = VS(MALLOC(sizeof(struct guts)));
if (re->re_guts == NULL)
return freev(v, REG_ESPACE);
g = (struct guts *)re->re_guts;
g->tree = NULL;
initcm(v, &g->cmap);
v->cm = &g->cmap;
g->lacons = NULL;
g->nlacons = 0;
ZAPCNFA(g->search);
v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
CNOERR();
v->cv = newcvec(100, 20, 10);
if (v->cv == NULL)
return freev(v, REG_ESPACE);
i = nmcces(v);
if (i > 0) {
v->mcces = newcvec(nleaders(v), 0, i);
CNOERR();
v->mcces = allmcces(v, v->mcces);
leaders(v, v->mcces);
addmcce(v->mcces, (chr *)NULL, (chr *)NULL); /* dummy */
}
CNOERR();
/* parsing */
lexstart(v); /* also handles prefixes */
if ((v->cflags®_NLSTOP) || (v->cflags®_NLANCH)) {
/* assign newline a unique color */
v->nlcolor = subcolor(v->cm, newline());
okcolors(v->nfa, v->cm);
}
CNOERR();
v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
CNOERR();
assert(v->tree != NULL);
/* finish setup of nfa and its subre tree */
specialcolors(v->nfa);
CNOERR();
if (debug != NULL) {
fprintf(debug, "\n\n\n========= RAW ==========\n");
dumpnfa(v->nfa, debug);
dumpst(v->tree, debug, 1);
}
optst(v, v->tree);
v->ntree = numst(v->tree, 1);
markst(v->tree);
cleanst(v);
if (debug != NULL) {
fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
dumpst(v->tree, debug, 1);
}
/* build compacted NFAs for tree and lacons */
re->re_info |= nfatree(v, v->tree, debug);
CNOERR();
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++) {
if (debug != NULL)
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
nfanode(v, &v->lacons[i], debug);
}
CNOERR();
if (v->tree->flags&SHORTER)
NOTE(REG_USHORTEST);
/* build compacted NFAs for tree, lacons, fast search */
if (debug != NULL)
fprintf(debug, "\n\n\n========= SEARCH ==========\n");
/* can sacrifice main NFA now, so use it as work area */
(DISCARD)optimize(v->nfa, debug);
CNOERR();
makesearch(v, v->nfa);
CNOERR();
compact(v->nfa, &g->search);
CNOERR();
/* looks okay, package it up */
re->re_nsub = v->nsubexp;
v->re = NULL; /* freev no longer frees re */
g->magic = GUTSMAGIC;
g->cflags = v->cflags;
g->info = re->re_info;
g->nsub = re->re_nsub;
g->tree = v->tree;
v->tree = NULL;
g->ntree = v->ntree;
g->compare = (v->cflags®_ICASE) ? casecmp : cmp;
g->lacons = v->lacons;
v->lacons = NULL;
g->nlacons = v->nlacons;
if (flags®_DUMP)
dump(re, stdout);
assert(v->err == 0);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?