⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regcomp.c

📁 早期freebsd实现
💻 C
📖 第 1 页 / 共 3 页
字号:
/* NOTE: this is derived from Henry Spencer's regexp code, and should not * confused with the original package (see point 3 below).  Thanks, Henry! *//* Additional note: this code is very heavily munged from Henry's version * in places.  In some spots I've traded clarity for efficiency, so don't * blame Henry for some of the lack of readability. *//* $RCSfile: regcomp.c,v $$Revision: 4.0.1.5 $$Date: 92/06/08 15:23:36 $ * * $Log:	regcomp.c,v $ * Revision 4.0.1.5  92/06/08  15:23:36  lwall * patch20: Perl now distinguishes overlapped copies from non-overlapped * patch20: /^stuff/ wrongly assumed an implicit $* == 1 * patch20: /x{0}/ was wrongly interpreted as /x{0,}/ * patch20: added \W, \S and \D inside /[...]/ *  * Revision 4.0.1.4  91/11/05  22:55:14  lwall * patch11: Erratum *  * Revision 4.0.1.3  91/11/05  18:22:28  lwall * patch11: minimum match length calculation in regexp is now cumulative * patch11: initial .* in pattern had dependency on value of $* * patch11: certain patterns made use of garbage pointers from uncleared memory * patch11: prepared for ctype implementations that don't define isascii() *  * Revision 4.0.1.2  91/06/07  11:48:24  lwall * patch4: new copyright notice * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx" * patch4: // wouldn't use previous pattern if it started with a null character *  * Revision 4.0.1.1  91/04/12  09:04:45  lwall * patch1: random cleanup in cpp namespace *  * Revision 4.0  91/03/20  01:39:01  lwall * 4.0 baseline. *  *//*SUPPRESS 112*//* * regcomp and regexec -- regsub and regerror are not used in perl * *	Copyright (c) 1986 by University of Toronto. *	Written by Henry Spencer.  Not derived from licensed software. * *	Permission is granted to anyone to use this software for any *	purpose on any computer system, and to redistribute it freely, *	subject to the following restrictions: * *	1. The author is not responsible for the consequences of use of *		this software, no matter how awful, even if they arise *		from defects in it. * *	2. The origin of this software must not be misrepresented, either *		by explicit claim or by omission. * *	3. Altered versions must be plainly marked as such, and must not *		be misrepresented as being the original software. * * ****    Alterations to Henry's code are... **** ****    Copyright (c) 1991, Larry Wall **** ****    You may distribute under the terms of either the GNU General Public ****    License or the Artistic License, as specified in the README file. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions.  Serious changes in * regular-expression syntax might require a total rethink. */#include "EXTERN.h"#include "perl.h"#include "INTERN.h"#include "regcomp.h"#ifdef MSDOS# if defined(BUGGY_MSC6) /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ # pragma optimize("a",off) /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ # pragma optimize("w",on )# endif /* BUGGY_MSC6 */#endif /* MSDOS */#ifndef STATIC#define	STATIC	static#endif#define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')#define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \	((*s) == '{' && regcurly(s)))#ifdef atarist#define	PERL_META	"^$.[()|?+*\\"#else#define	META	"^$.[()|?+*\\"#endif#ifdef SPSTART#undef SPSTART		/* dratted cpp namespace... */#endif/* * Flags to be passed up and down. */#define	HASWIDTH	01	/* Known never to match null string. */#define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */#define	SPSTART		04	/* Starts with * or +. */#define	WORST		0	/* Worst case. *//* * Global work variables for regcomp(). */static char *regprecomp;		/* uncompiled string. */static char *regparse;		/* Input-scan pointer. */static char *regxend;		/* End of input for compile */static int regnpar;		/* () count. */static char *regcode;		/* Code-emit pointer; &regdummy = don't. */static long regsize;		/* Code size. */static int regfold;static int regsawbracket;	/* Did we do {d,d} trick? */static int regsawback;		/* Did we see \1, ...? *//* * Forward declarations for regcomp()'s friends. */STATIC int regcurly();STATIC char *reg();STATIC char *regbranch();STATIC char *regpiece();STATIC char *regatom();STATIC char *regclass();STATIC char *regnode();STATIC char *reganode();STATIC void regc();STATIC void reginsert();STATIC void regtail();STATIC void regoptail();/* - regcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code.  So we cheat:  we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it.  (Note that it has to be in * one piece because free() must be able to free it all.) [NB: not true in perl] * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp.  [I'll say.] */regexp *regcomp(exp,xend,fold)char *exp;char *xend;int fold;{	register regexp *r;	register char *scan;	register STR *longish;	STR *longest;	register int len;	register char *first;	int flags;	int backish;	int backest;	int curback;	int minlen;	int sawplus = 0;	int sawopen = 0;	if (exp == NULL)		fatal("NULL regexp argument");	/* First pass: determine size, legality. */	regfold = fold;	regparse = exp;	regxend = xend;	regprecomp = nsavestr(exp,xend-exp);	regsawbracket = 0;	regsawback = 0;	regnpar = 1;	regsize = 0L;	regcode = &regdummy;	regc((char)MAGIC);	if (reg(0, &flags) == NULL) {		Safefree(regprecomp);		regprecomp = Nullch;		return(NULL);	}	/* Small enough for pointer-storage convention? */	if (regsize >= 32767L)		/* Probably could be 65535L. */		FAIL("regexp too big");	/* Allocate space. */	Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);	if (r == NULL)		FAIL("regexp out of space");	/* Second pass: emit code. */	if (regsawbracket)	    Copy(regprecomp,exp,xend-exp,char);	r->prelen = xend-exp;	r->precomp = regprecomp;	r->subbeg = r->subbase = NULL;	regparse = exp;	regnpar = 1;	regcode = r->program;	regc((char)MAGIC);	if (reg(0, &flags) == NULL)		return(NULL);	/* Dig out information for optimizations. */	r->regstart = Nullstr;	/* Worst-case defaults. */	r->reganch = 0;	r->regmust = Nullstr;	r->regback = -1;	r->regstclass = Nullch;	scan = r->program+1;			/* First BRANCH. */	if (OP(regnext(scan)) == END) {/* Only one top-level choice. */		scan = NEXTOPER(scan);		first = scan;		while ((OP(first) == OPEN && (sawopen = 1)) ||		    (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||		    (OP(first) == PLUS) ||		    (OP(first) == CURLY && ARG1(first) > 0) ) {			if (OP(first) == PLUS)			    sawplus = 1;			else			    first += regarglen[OP(first)];			first = NEXTOPER(first);		}		/* Starting-point info. */	    again:		if (OP(first) == EXACTLY) {			r->regstart =			    str_make(OPERAND(first)+1,*OPERAND(first));			if (r->regstart->str_cur > !(sawstudy|fold))				fbmcompile(r->regstart,fold);		}		else if ((exp = index(simple,OP(first))) && exp > simple)			r->regstclass = first;		else if (OP(first) == BOUND || OP(first) == NBOUND)			r->regstclass = first;		else if (OP(first) == BOL) {			r->reganch = ROPT_ANCH;			first = NEXTOPER(first);		    	goto again;		}		else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) &&			 !(r->reganch & ROPT_ANCH) ) {			/* turn .* into ^.* with an implied $*=1 */			r->reganch = ROPT_ANCH | ROPT_IMPLICIT;			first = NEXTOPER(first);		    	goto again;		}		if (sawplus && (!sawopen || !regsawback))		    r->reganch |= ROPT_SKIP;	/* x+ must match 1st of run */#ifdef DEBUGGING		if (debug & 512)		    fprintf(stderr,"first %d next %d offset %d\n",		      OP(first), OP(NEXTOPER(first)), first - scan);#endif		/*		 * If there's something expensive in the r.e., find the		 * longest literal string that must appear and make it the		 * regmust.  Resolve ties in favor of later strings, since		 * the regstart check works with the beginning of the r.e.		 * and avoiding duplication strengthens checking.  Not a		 * strong reason, but sufficient in the absence of others.		 * [Now we resolve ties in favor of the earlier string if		 * it happens that curback has been invalidated, since the		 * earlier string may buy us something the later one won't.]		 */		longish = str_make("",0);		longest = str_make("",0);		len = 0;		minlen = 0;		curback = 0;		backish = 0;		backest = 0;		while (OP(scan) != END) {			if (OP(scan) == BRANCH) {			    if (OP(regnext(scan)) == BRANCH) {				curback = -30000;				while (OP(scan) == BRANCH)				    scan = regnext(scan);			    }			    else	/* single branch is ok */				scan = NEXTOPER(scan);			}			if (OP(scan) == EXACTLY) {			    char *t;			    first = scan;			    while (OP(t = regnext(scan)) == CLOSE)				scan = t;			    minlen += *OPERAND(first);			    if (curback - backish == len) {				str_ncat(longish, OPERAND(first)+1,				    *OPERAND(first));				len += *OPERAND(first);				curback += *OPERAND(first);				first = regnext(scan);			    }			    else if (*OPERAND(first) >= len + (curback >= 0)) {				len = *OPERAND(first);				str_nset(longish, OPERAND(first)+1,len);				backish = curback;				curback += len;				first = regnext(scan);			    }			    else				curback += *OPERAND(first);			}			else if (index(varies,OP(scan))) {			    curback = -30000;			    len = 0;			    if (longish->str_cur > longest->str_cur) {				str_sset(longest,longish);				backest = backish;			    }			    str_nset(longish,"",0);			    if (OP(scan) == PLUS &&			      index(simple,OP(NEXTOPER(scan))))				minlen++;			    else if (OP(scan) == CURLY &&			      index(simple,OP(NEXTOPER(scan)+4)))				minlen += ARG1(scan);			}			else if (index(simple,OP(scan))) {			    curback++;			    minlen++;			    len = 0;			    if (longish->str_cur > longest->str_cur) {				str_sset(longest,longish);				backest = backish;			    }			    str_nset(longish,"",0);			}			scan = regnext(scan);		}		/* Prefer earlier on tie, unless we can tail match latter */		if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {		    str_sset(longest,longish);		    backest = backish;		}		else		    str_nset(longish,"",0);		if (longest->str_cur		    &&		    (!r->regstart		     ||		     !fbminstr((unsigned char*) r->regstart->str_ptr,			  (unsigned char *) r->regstart->str_ptr			    + r->regstart->str_cur,			  longest)		    )		   )		{			r->regmust = longest;			if (backest < 0)				backest = -1;			r->regback = backest;			if (longest->str_cur			  > !(sawstudy || fold || OP(first) == EOL) )				fbmcompile(r->regmust,fold);			r->regmust->str_u.str_useful = 100;			if (OP(first) == EOL && longish->str_cur)			    r->regmust->str_pok |= SP_TAIL;		}		else {			str_free(longest);			longest = Nullstr;		}		str_free(longish);	}	r->do_folding = fold;	r->nparens = regnpar - 1;	r->minlen = minlen;	Newz(1002, r->startp, regnpar, char*);	Newz(1002, r->endp, regnpar, char*);#ifdef DEBUGGING	if (debug & 512)		regdump(r);#endif	return(r);}/* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */static char *reg(paren, flagp)int paren;			/* Parenthesized? */int *flagp;{	register char *ret;	register char *br;	register char *ender;	register int parno;	int flags;	*flagp = HASWIDTH;	/* Tentatively. */	/* Make an OPEN node, if parenthesized. */	if (paren) {		parno = regnpar;		regnpar++;		ret = reganode(OPEN, parno);	} else		ret = NULL;	/* Pick up the branches, linking them together. */	br = regbranch(&flags);	if (br == NULL)		return(NULL);	if (ret != NULL)		regtail(ret, br);	/* OPEN -> first. */	else		ret = br;	if (!(flags&HASWIDTH))		*flagp &= ~HASWIDTH;	*flagp |= flags&SPSTART;	while (*regparse == '|') {		regparse++;		br = regbranch(&flags);		if (br == NULL)			return(NULL);		regtail(ret, br);	/* BRANCH -> BRANCH. */		if (!(flags&HASWIDTH))			*flagp &= ~HASWIDTH;		*flagp |= flags&SPSTART;	}	/* Make a closing node, and hook it on the end. */	if (paren)	    ender = reganode(CLOSE, parno);	else	    ender = regnode(END);	regtail(ret, ender);	/* Hook the tails of the branches to the closing node. */	for (br = ret; br != NULL; br = regnext(br))		regoptail(br, ender);	/* Check for proper termination. */	if (paren && *regparse++ != ')') {		FAIL("unmatched () in regexp");	} else if (!paren && regparse < regxend) {		if (*regparse == ')') {			FAIL("unmatched () in regexp");		} else			FAIL("junk on end of regexp");	/* "Can't happen". */		/* NOTREACHED */	}	return(ret);}/* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */static char *regbranch(flagp)int *flagp;{	register char *ret;	register char *chain;	register char *latest;	int flags;	*flagp = WORST;		/* Tentatively. */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -