📄 regcomp.c
字号:
/* * re_*comp and friends - compile REs * This file #includes several others (see the bottom). * * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. * * Development of this software was funded, in part, by Cray Research Inc., * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics * Corporation, none of whom are responsible for the results. The author * thanks all of them. * * Redistribution and use in source and binary forms -- with or without * modification -- are permitted for any purpose, provided that * redistributions in source form retain this entire copyright notice and * indicate the origin and nature of any modifications. * * I'd appreciate being given credit for this package in the documentation * of software which uses it, but that is not a requirement. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $PostgreSQL: pgsql/src/backend/regex/regcomp.c,v 1.45 2007/10/06 16:05:54 tgl Exp $ * */#include "regguts.h"/* * forward declarations, up here so forward datatypes etc. are defined early *//* === regcomp.c === */static void moresubs(struct vars *, int);static int freev(struct vars *, int);static void makesearch(struct vars *, struct nfa *);static struct subre *parse(struct vars *, int, int, struct state *, struct state *);static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);static void nonword(struct vars *, int, struct state *, struct state *);static void word(struct vars *, int, struct state *, struct state *);static int scannum(struct vars *);static void repeat(struct vars *, struct state *, struct state *, int, int);static void bracket(struct vars *, struct state *, struct state *);static void cbracket(struct vars *, struct state *, struct state *);static void brackpart(struct vars *, struct state *, struct state *);static chr *scanplain(struct vars *);static void leaders(struct vars *, struct cvec *);static void onechr(struct vars *, chr, struct state *, struct state *);static void dovec(struct vars *, struct cvec *, struct state *, struct state *);static celt nextleader(struct vars *, chr, chr);static void wordchrs(struct vars *);static struct subre *subre(struct vars *, int, int, struct state *, struct state *);static void freesubre(struct vars *, struct subre *);static void freesrnode(struct vars *, struct subre *);static void optst(struct vars *, struct subre *);static int numst(struct subre *, int);static void markst(struct subre *);static void cleanst(struct vars *);static long nfatree(struct vars *, struct subre *, FILE *);static long nfanode(struct vars *, struct subre *, FILE *);static int newlacon(struct vars *, struct state *, struct state *, int);static void freelacons(struct subre *, int);static void rfree(regex_t *);#ifdef REG_DEBUGstatic void dump(regex_t *, FILE *);static void dumpst(struct subre *, FILE *, int);static void stdump(struct subre *, FILE *, int);static char *stid(struct subre *, char *, size_t);#endif/* === regc_lex.c === */static void lexstart(struct vars *);static void prefixes(struct vars *);static void lexnest(struct vars *, chr *, chr *);static void lexword(struct vars *);static int next(struct vars *);static int lexescape(struct vars *);static chr lexdigits(struct vars *, int, int, int);static int brenext(struct vars *, chr);static void skip(struct vars *);static chr newline(void);static chr chrnamed(struct vars *, chr *, chr *, chr);/* === regc_color.c === */static void initcm(struct vars *, struct colormap *);static void freecm(struct colormap *);static void cmtreefree(struct colormap *, union tree *, int);static color setcolor(struct colormap *, chr, pcolor);static color maxcolor(struct colormap *);static color newcolor(struct colormap *);static void freecolor(struct colormap *, pcolor);static color pseudocolor(struct colormap *);static color subcolor(struct colormap *, chr c);static color newsub(struct colormap *, pcolor);static void subrange(struct vars *, chr, chr, struct state *, struct state *);static void subblock(struct vars *, chr, struct state *, struct state *);static void okcolors(struct nfa *, struct colormap *);static void colorchain(struct colormap *, struct arc *);static void uncolorchain(struct colormap *, struct arc *);static int singleton(struct colormap *, chr c);static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);#ifdef REG_DEBUGstatic void dumpcolors(struct colormap *, FILE *);static void fillcheck(struct colormap *, union tree *, int, FILE *);static void dumpchr(chr, FILE *);#endif/* === regc_nfa.c === */static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);static void freenfa(struct nfa *);static struct state *newstate(struct nfa *);static struct state *newfstate(struct nfa *, int flag);static void dropstate(struct nfa *, struct state *);static void freestate(struct nfa *, struct state *);static void destroystate(struct nfa *, struct state *);static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);static struct arc *allocarc(struct nfa *, struct state *);static void freearc(struct nfa *, struct arc *);static struct arc *findarc(struct state *, int, pcolor);static void cparc(struct nfa *, struct arc *, struct state *, struct state *);static void moveins(struct nfa *, struct state *, struct state *);static void copyins(struct nfa *, struct state *, struct state *);static void moveouts(struct nfa *, struct state *, struct state *);static void copyouts(struct nfa *, struct state *, struct state *);static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);static void delsub(struct nfa *, struct state *, struct state *);static void deltraverse(struct nfa *, struct state *, struct state *);static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);static void duptraverse(struct nfa *, struct state *, struct state *);static void cleartraverse(struct nfa *, struct state *);static void specialcolors(struct nfa *);static long optimize(struct nfa *, FILE *);static void pullback(struct nfa *, FILE *);static int pull(struct nfa *, struct arc *);static void pushfwd(struct nfa *, FILE *);static int push(struct nfa *, struct arc *);#define INCOMPATIBLE 1 /* destroys arc */#define SATISFIED 2 /* constraint satisfied */#define COMPATIBLE 3 /* compatible but not satisfied yet */static int combine(struct arc *, struct arc *);static void fixempties(struct nfa *, FILE *);static int unempty(struct nfa *, struct arc *);static void cleanup(struct nfa *);static void markreachable(struct nfa *, struct state *, struct state *, struct state *);static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);static long analyze(struct nfa *);static void compact(struct nfa *, struct cnfa *);static void carcsort(struct carc *, struct carc *);static void freecnfa(struct cnfa *);static void dumpnfa(struct nfa *, FILE *);#ifdef REG_DEBUGstatic void dumpstate(struct state *, FILE *);static void dumparcs(struct state *, FILE *);static int dumprarcs(struct arc *, struct state *, FILE *, int);static void dumparc(struct arc *, struct state *, FILE *);static void dumpcnfa(struct cnfa *, FILE *);static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);#endif/* === regc_cvec.c === */static struct cvec *newcvec(int, int, int);static struct cvec *clearcvec(struct cvec *);static void addchr(struct cvec *, chr);static void addrange(struct cvec *, chr, chr);static void addmcce(struct cvec *, chr *, chr *);static int haschr(struct cvec *, chr);static struct cvec *getcvec(struct vars *, int, int, int);static void freecvec(struct cvec *);/* === regc_locale.c === */static int pg_wc_isdigit(pg_wchar c);static int pg_wc_isalpha(pg_wchar c);static int pg_wc_isalnum(pg_wchar c);static int pg_wc_isupper(pg_wchar c);static int pg_wc_islower(pg_wchar c);static int pg_wc_isgraph(pg_wchar c);static int pg_wc_isprint(pg_wchar c);static int pg_wc_ispunct(pg_wchar c);static int pg_wc_isspace(pg_wchar c);static pg_wchar pg_wc_toupper(pg_wchar c);static pg_wchar pg_wc_tolower(pg_wchar c);static int nmcces(struct vars *);static int nleaders(struct vars *);static struct cvec *allmcces(struct vars *, struct cvec *);static celt element(struct vars *, chr *, chr *);static struct cvec *range(struct vars *, celt, celt, int);static int before(celt, celt);static struct cvec *eclass(struct vars *, celt, int);static struct cvec *cclass(struct vars *, chr *, chr *, int);static struct cvec *allcases(struct vars *, chr);static int cmp(const chr *, const chr *, size_t);static int casecmp(const chr *, const chr *, size_t);/* internal variables, bundled for easy passing around */struct vars{ regex_t *re; chr *now; /* scan pointer into string 指向扫描的字符的指针*/ chr *stop; /* end of string 字符串结尾*/ chr *savenow; /* saved now and stop for "subroutine call" */ chr *savestop; int err; /* error code (0 if none) */ int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ chr nextvalue; /* value (if any) of next token */ int lexcon; /* lexical context type (see lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ size_t nsubs; /* length of vector */ struct subre *sub10[10]; /* initial vector, enough for most */ struct nfa *nfa; /* the NFA */ struct colormap *cm; /* character color map */ color nlcolor; /* color of newline */ struct state *wordchrs; /* state in nfa holding word-char outarcs */ struct subre *tree; /* subexpression tree */ struct subre *treechain; /* all tree nodes allocated */ struct subre *treefree; /* any free tree nodes */ int ntree; /* number of tree nodes */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ struct cvec *mcces; /* collating-element information */#define ISCELEADER(v,c) ((v)->mcces != NULL && haschr((v)->mcces, (c))) struct state *mccepbegin; /* in nfa, start of MCCE prototypes */ struct state *mccepend; /* in nfa, end of MCCE prototypes */ struct subre *lacons; /* lookahead-constraint vector */ int nlacons; /* size of lacons */};/* parsing macros; most know that `v' is the struct vars pointer */#define NEXT() (next(v)) /* advance by one token */#define SEE(t) (v->nexttype == (t)) /* is next token this? */#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */#define ISERR() VISERR(v)#define VERR(vv,e) ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\ ((vv)->err = (e)))#define ERR(e) VERR(v, e) /* record an error */#define NOERR() {if (ISERR()) return;} /* if error seen, return */#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */#define INSIST(c, e) ((c) ? 0 : ERR(e)) /* if condition false, error */#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)/* token type codes, some also used as NFA arc types */#define EMPTY 'n' /* no token present */#define EOS 'e' /* end of string */#define PLAIN 'p' /* ordinary character */#define DIGIT 'd' /* digit (in bound) */#define BACKREF 'b' /* back reference */#define COLLEL 'I' /* start of [. */#define ECLASS 'E' /* start of [= */#define CCLASS 'C' /* start of [: */#define END 'X' /* end of [. [= [: */#define RANGE 'R' /* - within [] which might be range delim. */#define LACON 'L' /* lookahead constraint subRE */#define AHEAD 'a' /* color-lookahead arc */#define BEHIND 'r' /* color-lookbehind arc */#define WBDRY 'w' /* word boundary constraint */#define NWBDRY 'W' /* non-word-boundary constraint */#define SBEGIN 'A' /* beginning of string (even if not BOL) */#define SEND 'Z' /* end of string (even if not EOL) */#define PREFER 'P' /* length preference *//* is an arc colored, and hence on a color chain? */#define COLORED(a) ((a)->type == PLAIN || (a)->type == AHEAD || \ (a)->type == BEHIND)/* static function list */static struct fns functions = { rfree, /* regfree insides */};/* * pg_regcomp - compile regular expression */intpg_regcomp(regex_t *re, const chr *string, size_t len, int flags){ struct vars var; struct vars *v = &var; struct guts *g; int i; size_t j;#ifdef REG_DEBUG FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;#else FILE *debug = (FILE *) NULL;#endif#define CNOERR() { if (ISERR()) return freev(v, v->err); } /* sanity checks */ if (re == NULL || string == NULL) return REG_INVARG; if ((flags & REG_QUOTE) && (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))) return REG_INVARG; if (!(flags & REG_EXTENDED) && (flags & REG_ADVF)) return REG_INVARG; /* initial setup (after which freev() is callable) */ v->re = re; v->now = (chr *) string; v->stop = v->now + len; v->savenow = v->savestop = NULL; v->err = 0; v->cflags = flags; v->nsubexp = 0; v->subs = v->sub10; v->nsubs = 10; for (j = 0; j < v->nsubs; j++) v->subs[j] = NULL; v->nfa = NULL; v->cm = NULL; v->nlcolor = COLORLESS; v->wordchrs = NULL; v->tree = NULL; v->treechain = NULL; v->treefree = NULL; v->cv = NULL; v->cv2 = NULL; v->mcces = NULL; v->lacons = NULL; v->nlacons = 0; re->re_magic = REMAGIC; re->re_info = 0; /* bits get set during parse */ re->re_csize = sizeof(chr); re->re_guts = NULL; re->re_fns = VS(&functions); /* more complex setup, malloced things */ re->re_guts = VS(MALLOC(sizeof(struct guts))); if (re->re_guts == NULL) return freev(v, REG_ESPACE); g = (struct guts *) re->re_guts; g->tree = NULL; initcm(v, &g->cmap); v->cm = &g->cmap; g->lacons = NULL; g->nlacons = 0; ZAPCNFA(g->search); v->nfa = newnfa(v, v->cm, (struct nfa *) NULL); CNOERR(); v->cv = newcvec(100, 20, 10); if (v->cv == NULL) return freev(v, REG_ESPACE); i = nmcces(v); if (i > 0) { v->mcces = newcvec(nleaders(v), 0, i); CNOERR(); v->mcces = allmcces(v, v->mcces); leaders(v, v->mcces); addmcce(v->mcces, (chr *) NULL, (chr *) NULL); /* dummy */ } CNOERR(); /* parsing */ lexstart(v); /* also handles prefixes */ if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH)) { /* assign newline a unique color */ v->nlcolor = subcolor(v->cm, newline()); okcolors(v->nfa, v->cm); } CNOERR(); v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final); assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ CNOERR(); assert(v->tree != NULL); /* finish setup of nfa and its subre tree */ specialcolors(v->nfa); CNOERR();#ifdef REG_DEBUG if (debug != NULL) { fprintf(debug, "\n\n\n========= RAW ==========\n"); dumpnfa(v->nfa, debug); dumpst(v->tree, debug, 1); }#endif optst(v, v->tree); v->ntree = numst(v->tree, 1); markst(v->tree); cleanst(v);#ifdef REG_DEBUG if (debug != NULL) { fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); dumpst(v->tree, debug, 1); }#endif /* build compacted NFAs for tree and lacons */ re->re_info |= nfatree(v, v->tree, debug); CNOERR(); assert(v->nlacons == 0 || v->lacons != NULL); for (i = 1; i < v->nlacons; i++) {#ifdef REG_DEBUG if (debug != NULL) fprintf(debug, "\n\n\n========= LA%d ==========\n", i);#endif nfanode(v, &v->lacons[i], debug); } CNOERR(); if (v->tree->flags & SHORTER) NOTE(REG_USHORTEST); /* build compacted NFAs for tree, lacons, fast search */#ifdef REG_DEBUG if (debug != NULL) fprintf(debug, "\n\n\n========= SEARCH ==========\n");#endif /* can sacrifice main NFA now, so use it as work area */ (DISCARD) optimize(v->nfa, debug); CNOERR(); makesearch(v, v->nfa); CNOERR(); compact(v->nfa, &g->search); CNOERR(); /* looks okay, package it up */ re->re_nsub = v->nsubexp; v->re = NULL; /* freev no longer frees re */ g->magic = GUTSMAGIC; g->cflags = v->cflags; g->info = re->re_info; g->nsub = re->re_nsub; g->tree = v->tree; v->tree = NULL; g->ntree = v->ntree; g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp; g->lacons = v->lacons; v->lacons = NULL; g->nlacons = v->nlacons;#ifdef REG_DEBUG if (flags & REG_DUMP) dump(re, stdout);#endif assert(v->err == 0); return freev(v, 0);}/* * moresubs - enlarge subRE vector */static voidmoresubs(struct vars * v, int wanted) /* want enough room for this one */{ struct subre **p; size_t n; assert(wanted > 0 && (size_t) wanted >= v->nsubs); n = (size_t) wanted *3 / 2 + 1; if (v->subs == v->sub10) { p = (struct subre **) MALLOC(n * sizeof(struct subre *)); if (p != NULL) memcpy(VS(p), VS(v->subs), v->nsubs * sizeof(struct subre *)); } else p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *)); if (p == NULL) { ERR(REG_ESPACE); return; } v->subs = p; for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) *p = NULL; assert(v->nsubs == n); assert((size_t) wanted < v->nsubs);}/* * freev - free vars struct's substructures where necessary * * Optionally does error-number setting, and always returns error code * (if any), to make error-handling code terser. */static intfreev(struct vars * v, int err){ if (v->re != NULL) rfree(v->re); if (v->subs != v->sub10) FREE(v->subs); if (v->nfa != NULL) freenfa(v->nfa); if (v->tree != NULL) freesubre(v, v->tree); if (v->treechain != NULL) cleanst(v); if (v->cv != NULL) freecvec(v->cv); if (v->cv2 != NULL) freecvec(v->cv2); if (v->mcces != NULL) freecvec(v->mcces); if (v->lacons != NULL) freelacons(v->lacons, v->nlacons); ERR(err); /* nop if err==0 */ return v->err;}/* * makesearch - turn an NFA into a search NFA (implicit prepend of .*?) * NFA must have been optimize()d already. */static voidmakesearch(struct vars * v, struct nfa * nfa){ struct arc *a; struct arc *b; struct state *pre = nfa->pre; struct state *s; struct state *s2; struct state *slist; /* no loops are needed if it's anchored */ for (a = pre->outs; a != NULL; a = a->outchain) { assert(a->type == PLAIN); if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) break; } if (a != NULL) { /* add implicit .* in front */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -