📄 regcomp.c
字号:
/* NOTE: this is derived from Henry Spencer's regexp code, and should not * confused with the original package (see point 3 below). Thanks, Henry! *//* Additional note: this code is very heavily munged from Henry's version * in places. In some spots I've traded clarity for efficiency, so don't * blame Henry for some of the lack of readability. *//* $RCSfile: regcomp.c,v $$Revision: 4.0.1.5 $$Date: 92/06/08 15:23:36 $ * * $Log: regcomp.c,v $ * Revision 4.0.1.5 92/06/08 15:23:36 lwall * patch20: Perl now distinguishes overlapped copies from non-overlapped * patch20: /^stuff/ wrongly assumed an implicit $* == 1 * patch20: /x{0}/ was wrongly interpreted as /x{0,}/ * patch20: added \W, \S and \D inside /[...]/ * * Revision 4.0.1.4 91/11/05 22:55:14 lwall * patch11: Erratum * * Revision 4.0.1.3 91/11/05 18:22:28 lwall * patch11: minimum match length calculation in regexp is now cumulative * patch11: initial .* in pattern had dependency on value of $* * patch11: certain patterns made use of garbage pointers from uncleared memory * patch11: prepared for ctype implementations that don't define isascii() * * Revision 4.0.1.2 91/06/07 11:48:24 lwall * patch4: new copyright notice * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx" * patch4: // wouldn't use previous pattern if it started with a null character * * Revision 4.0.1.1 91/04/12 09:04:45 lwall * patch1: random cleanup in cpp namespace * * Revision 4.0 91/03/20 01:39:01 lwall * 4.0 baseline. * *//*SUPPRESS 112*//* * regcomp and regexec -- regsub and regerror are not used in perl * * Copyright (c) 1986 by University of Toronto. * Written by Henry Spencer. Not derived from licensed software. * * Permission is granted to anyone to use this software for any * purpose on any computer system, and to redistribute it freely, * subject to the following restrictions: * * 1. The author is not responsible for the consequences of use of * this software, no matter how awful, even if they arise * from defects in it. * * 2. The origin of this software must not be misrepresented, either * by explicit claim or by omission. * * 3. Altered versions must be plainly marked as such, and must not * be misrepresented as being the original software. * * **** Alterations to Henry's code are... **** **** Copyright (c) 1991, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions. Serious changes in * regular-expression syntax might require a total rethink. */#include "EXTERN.h"#include "perl.h"#include "INTERN.h"#include "regcomp.h"#ifdef MSDOS# if defined(BUGGY_MSC6) /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ # pragma optimize("a",off) /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ # pragma optimize("w",on )# endif /* BUGGY_MSC6 */#endif /* MSDOS */#ifndef STATIC#define STATIC static#endif#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s)))#ifdef atarist#define PERL_META "^$.[()|?+*\\"#else#define META "^$.[()|?+*\\"#endif#ifdef SPSTART#undef SPSTART /* dratted cpp namespace... */#endif/* * Flags to be passed up and down. */#define HASWIDTH 01 /* Known never to match null string. */#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */#define SPSTART 04 /* Starts with * or +. */#define WORST 0 /* Worst case. *//* * Global work variables for regcomp(). */static char *regprecomp; /* uncompiled string. */static char *regparse; /* Input-scan pointer. */static char *regxend; /* End of input for compile */static int regnpar; /* () count. */static char *regcode; /* Code-emit pointer; ®dummy = don't. */static long regsize; /* Code size. */static int regfold;static int regsawbracket; /* Did we do {d,d} trick? */static int regsawback; /* Did we see \1, ...? *//* * Forward declarations for regcomp()'s friends. */STATIC int regcurly();STATIC char *reg();STATIC char *regbranch();STATIC char *regpiece();STATIC char *regatom();STATIC char *regclass();STATIC char *regnode();STATIC char *reganode();STATIC void regc();STATIC void reginsert();STATIC void regtail();STATIC void regoptail();/* - regcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) [NB: not true in perl] * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. [I'll say.] */regexp *regcomp(exp,xend,fold)char *exp;char *xend;int fold;{ register regexp *r; register char *scan; register STR *longish; STR *longest; register int len; register char *first; int flags; int backish; int backest; int curback; int minlen; int sawplus = 0; int sawopen = 0; if (exp == NULL) fatal("NULL regexp argument"); /* First pass: determine size, legality. */ regfold = fold; regparse = exp; regxend = xend; regprecomp = nsavestr(exp,xend-exp); regsawbracket = 0; regsawback = 0; regnpar = 1; regsize = 0L; regcode = ®dummy; regc((char)MAGIC); if (reg(0, &flags) == NULL) { Safefree(regprecomp); regprecomp = Nullch; return(NULL); } /* Small enough for pointer-storage convention? */ if (regsize >= 32767L) /* Probably could be 65535L. */ FAIL("regexp too big"); /* Allocate space. */ Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp); if (r == NULL) FAIL("regexp out of space"); /* Second pass: emit code. */ if (regsawbracket) Copy(regprecomp,exp,xend-exp,char); r->prelen = xend-exp; r->precomp = regprecomp; r->subbeg = r->subbase = NULL; regparse = exp; regnpar = 1; regcode = r->program; regc((char)MAGIC); if (reg(0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ r->regstart = Nullstr; /* Worst-case defaults. */ r->reganch = 0; r->regmust = Nullstr; r->regback = -1; r->regstclass = Nullch; scan = r->program+1; /* First BRANCH. */ if (OP(regnext(scan)) == END) {/* Only one top-level choice. */ scan = NEXTOPER(scan); first = scan; while ((OP(first) == OPEN && (sawopen = 1)) || (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || (OP(first) == PLUS) || (OP(first) == CURLY && ARG1(first) > 0) ) { if (OP(first) == PLUS) sawplus = 1; else first += regarglen[OP(first)]; first = NEXTOPER(first); } /* Starting-point info. */ again: if (OP(first) == EXACTLY) { r->regstart = str_make(OPERAND(first)+1,*OPERAND(first)); if (r->regstart->str_cur > !(sawstudy|fold)) fbmcompile(r->regstart,fold); } else if ((exp = index(simple,OP(first))) && exp > simple) r->regstclass = first; else if (OP(first) == BOUND || OP(first) == NBOUND) r->regstclass = first; else if (OP(first) == BOL) { r->reganch = ROPT_ANCH; first = NEXTOPER(first); goto again; } else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) && !(r->reganch & ROPT_ANCH) ) { /* turn .* into ^.* with an implied $*=1 */ r->reganch = ROPT_ANCH | ROPT_IMPLICIT; first = NEXTOPER(first); goto again; } if (sawplus && (!sawopen || !regsawback)) r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */#ifdef DEBUGGING if (debug & 512) fprintf(stderr,"first %d next %d offset %d\n", OP(first), OP(NEXTOPER(first)), first - scan);#endif /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. * [Now we resolve ties in favor of the earlier string if * it happens that curback has been invalidated, since the * earlier string may buy us something the later one won't.] */ longish = str_make("",0); longest = str_make("",0); len = 0; minlen = 0; curback = 0; backish = 0; backest = 0; while (OP(scan) != END) { if (OP(scan) == BRANCH) { if (OP(regnext(scan)) == BRANCH) { curback = -30000; while (OP(scan) == BRANCH) scan = regnext(scan); } else /* single branch is ok */ scan = NEXTOPER(scan); } if (OP(scan) == EXACTLY) { char *t; first = scan; while (OP(t = regnext(scan)) == CLOSE) scan = t; minlen += *OPERAND(first); if (curback - backish == len) { str_ncat(longish, OPERAND(first)+1, *OPERAND(first)); len += *OPERAND(first); curback += *OPERAND(first); first = regnext(scan); } else if (*OPERAND(first) >= len + (curback >= 0)) { len = *OPERAND(first); str_nset(longish, OPERAND(first)+1,len); backish = curback; curback += len; first = regnext(scan); } else curback += *OPERAND(first); } else if (index(varies,OP(scan))) { curback = -30000; len = 0; if (longish->str_cur > longest->str_cur) { str_sset(longest,longish); backest = backish; } str_nset(longish,"",0); if (OP(scan) == PLUS && index(simple,OP(NEXTOPER(scan)))) minlen++; else if (OP(scan) == CURLY && index(simple,OP(NEXTOPER(scan)+4))) minlen += ARG1(scan); } else if (index(simple,OP(scan))) { curback++; minlen++; len = 0; if (longish->str_cur > longest->str_cur) { str_sset(longest,longish); backest = backish; } str_nset(longish,"",0); } scan = regnext(scan); } /* Prefer earlier on tie, unless we can tail match latter */ if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) { str_sset(longest,longish); backest = backish; } else str_nset(longish,"",0); if (longest->str_cur && (!r->regstart || !fbminstr((unsigned char*) r->regstart->str_ptr, (unsigned char *) r->regstart->str_ptr + r->regstart->str_cur, longest) ) ) { r->regmust = longest; if (backest < 0) backest = -1; r->regback = backest; if (longest->str_cur > !(sawstudy || fold || OP(first) == EOL) ) fbmcompile(r->regmust,fold); r->regmust->str_u.str_useful = 100; if (OP(first) == EOL && longish->str_cur) r->regmust->str_pok |= SP_TAIL; } else { str_free(longest); longest = Nullstr; } str_free(longish); } r->do_folding = fold; r->nparens = regnpar - 1; r->minlen = minlen; Newz(1002, r->startp, regnpar, char*); Newz(1002, r->endp, regnpar, char*);#ifdef DEBUGGING if (debug & 512) regdump(r);#endif return(r);}/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */static char *reg(paren, flagp)int paren; /* Parenthesized? */int *flagp;{ register char *ret; register char *br; register char *ender; register int parno; int flags; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { parno = regnpar; regnpar++; ret = reganode(OPEN, parno); } else ret = NULL; /* Pick up the branches, linking them together. */ br = regbranch(&flags); if (br == NULL) return(NULL); if (ret != NULL) regtail(ret, br); /* OPEN -> first. */ else ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; while (*regparse == '|') { regparse++; br = regbranch(&flags); if (br == NULL) return(NULL); regtail(ret, br); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ if (paren) ender = reganode(CLOSE, parno); else ender = regnode(END); regtail(ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); /* Check for proper termination. */ if (paren && *regparse++ != ')') { FAIL("unmatched () in regexp"); } else if (!paren && regparse < regxend) { if (*regparse == ')') { FAIL("unmatched () in regexp"); } else FAIL("junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } return(ret);}/* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */static char *regbranch(flagp)int *flagp;{ register char *ret; register char *chain; register char *latest; int flags; *flagp = WORST; /* Tentatively. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -