regcomp.c

来自「Newlib 嵌入式 C库标准实现代码」· C语言代码 · 共 2,089 行 · 第 1/4 页
2,089 行
/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 *	The Regents of the University of California.  All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software *    must display the following acknowledgement: *	This product includes software developed by the University of *	California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors *    may be used to endorse or promote products derived from this software *    without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94 */#if defined(LIBC_SCCS) && !defined(lint)static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";#endif /* LIBC_SCCS and not lint */#include <sys/cdefs.h>#include <sys/types.h>#include <stdio.h>#include <string.h>#include <ctype.h>#include <limits.h>#include <stdlib.h>#include <regex.h>#include "collate.h"#include "utils.h"#include "regex2.h"#include "cclass.h"#include "cname.h"/* * parse structure, passed up and down to avoid global variables and * other clumsinesses */struct parse {	char *next;		/* next character in RE */	char *end;		/* end of string (-> NUL normally) */	int error;		/* has an error been seen? */	sop *strip;		/* malloced strip */	sopno ssize;		/* malloced strip size (allocated) */	sopno slen;		/* malloced strip length (used) */	int ncsalloc;		/* number of csets allocated */	struct re_guts *g;#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */	sopno pend[NPAREN];	/* -> ) ([0] unused) */};/* ========= begin header generated by ./mkh ========= */#ifdef __cplusplusextern "C" {#endif/* === regcomp.c === */static void p_ere(struct parse *p, int stop);static void p_ere_exp(struct parse *p);static void p_str(struct parse *p);static void p_bre(struct parse *p, int end1, int end2);static int p_simp_re(struct parse *p, int starordinary);static int p_count(struct parse *p);static void p_bracket(struct parse *p);static void p_b_term(struct parse *p, cset *cs);static void p_b_cclass(struct parse *p, cset *cs);static void p_b_eclass(struct parse *p, cset *cs);static char p_b_symbol(struct parse *p);static char p_b_coll_elem(struct parse *p, int endc);static char othercase(int ch);static void bothcases(struct parse *p, int ch);static void ordinary(struct parse *p, int ch);static void nonnewline(struct parse *p);static void repeat(struct parse *p, sopno start, int from, int to);static int seterr(struct parse *p, int e);static cset *allocset(struct parse *p);static void freeset(struct parse *p, cset *cs);static int freezeset(struct parse *p, cset *cs);static int firstch(struct parse *p, cset *cs);static int nch(struct parse *p, cset *cs);static void mcadd(struct parse *p, cset *cs, char *cp);#if usedstatic void mcsub(cset *cs, char *cp);static int mcin(cset *cs, char *cp);static char *mcfind(cset *cs, char *cp);#endifstatic void mcinvert(struct parse *p, cset *cs);static void mccase(struct parse *p, cset *cs);static int isinsets(struct re_guts *g, int c);static int samesets(struct re_guts *g, int c1, int c2);static void categorize(struct parse *p, struct re_guts *g);static sopno dupl(struct parse *p, sopno start, sopno finish);static void doemit(struct parse *p, sop op, size_t opnd);static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);static void dofwd(struct parse *p, sopno pos, sop value);static void enlarge(struct parse *p, sopno size);static void stripsnug(struct parse *p, struct re_guts *g);static void findmust(struct parse *p, struct re_guts *g);static int altoffset(sop *scan, int offset, int mccs);static void computejumps(struct parse *p, struct re_guts *g);static void computematchjumps(struct parse *p, struct re_guts *g);static sopno pluscount(struct parse *p, struct re_guts *g);#ifdef __cplusplus}#endif/* ========= end header generated by ./mkh ========= */static char nuls[10];		/* place to point scanner in event of error *//* * macros for use with parse structure * BEWARE:  these know that the parse structure is named `p' !!! */#define	PEEK()	(*p->next)#define	PEEK2()	(*(p->next+1))#define	MORE()	(p->next < p->end)#define	MORE2()	(p->next+1 < p->end)#define	SEE(c)	(MORE() && PEEK() == (c))#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)#define	NEXT()	(p->next++)#define	NEXT2()	(p->next += 2)#define	NEXTn(n)	(p->next += (n))#define	GETNEXT()	(*p->next++)#define	SETERROR(e)	seterr(p, (e))#define	REQUIRE(co, e)	((co) || SETERROR(e))#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)#define	HERE()		(p->slen)#define	THERE()		(p->slen - 1)#define	THERETHERE()	(p->slen - 2)#define	DROP(n)	(p->slen -= (n))#ifndef NDEBUGstatic int never = 0;		/* for use in asserts; shuts lint up */#else#define	never	0		/* some <assert.h>s have bugs too */#endif/* Macro used by computejump()/computematchjump() */#define MIN(a,b)	((a)<(b)?(a):(b))/* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); = #define	REG_BASIC	0000 = #define	REG_EXTENDED	0001 = #define	REG_ICASE	0002 = #define	REG_NOSUB	0004 = #define	REG_NEWLINE	0010 = #define	REG_NOSPEC	0020 = #define	REG_PEND	0040 = #define	REG_DUMP	0200 */int				/* 0 success, otherwise REG_something */regcomp(preg, pattern, cflags)regex_t *preg;const char *pattern;int cflags;{	struct parse pa;	struct re_guts *g;	struct parse *p = &pa;	int i;	size_t len;#ifdef REDEBUG#	define	GOODFLAGS(f)	(f)#else#	define	GOODFLAGS(f)	((f)&~REG_DUMP)#endif	cflags = GOODFLAGS(cflags);	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))		return(REG_INVARG);	if (cflags&REG_PEND) {		if (preg->re_endp < pattern)			return(REG_INVARG);		len = preg->re_endp - pattern;	} else		len = strlen((char *)pattern);	/* do the mallocs early so failure handling is easy */	g = (struct re_guts *)malloc(sizeof(struct re_guts) +							(NC-1)*sizeof(cat_t));	if (g == NULL)		return(REG_ESPACE);	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */	p->strip = (sop *)malloc(p->ssize * sizeof(sop));	p->slen = 0;	if (p->strip == NULL) {		free((char *)g);		return(REG_ESPACE);	}	/* set things up */	p->g = g;	p->next = (char *)pattern;	/* convenience; we do not modify it */	p->end = p->next + len;	p->error = 0;	p->ncsalloc = 0;	for (i = 0; i < NPAREN; i++) {		p->pbegin[i] = 0;		p->pend[i] = 0;	}	g->csetsize = NC;	g->sets = NULL;	g->setbits = NULL;	g->ncsets = 0;	g->cflags = cflags;	g->iflags = 0;	g->nbol = 0;	g->neol = 0;	g->must = NULL;	g->moffset = -1;	g->charjump = NULL;	g->matchjump = NULL;	g->mlen = 0;	g->nsub = 0;	g->ncategories = 1;	/* category 0 is "everything else" */	g->categories = &g->catspace[-(CHAR_MIN)];	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));	g->backrefs = 0;	/* do it */	EMIT(OEND, 0);	g->firststate = THERE();	if (cflags&REG_EXTENDED)		p_ere(p, OUT);	else if (cflags&REG_NOSPEC)		p_str(p);	else		p_bre(p, OUT, OUT);	EMIT(OEND, 0);	g->laststate = THERE();	/* tidy up loose ends and fill things in */	categorize(p, g);	stripsnug(p, g);	findmust(p, g);	/* only use Boyer-Moore algorithm if the pattern is bigger	 * than three characters	 */	if(g->mlen > 3) {		computejumps(p, g);		computematchjumps(p, g);		if(g->matchjump == NULL && g->charjump != NULL) {			free(g->charjump);			g->charjump = NULL;		}	}	g->nplus = pluscount(p, g);	g->magic = MAGIC2;	preg->re_nsub = g->nsub;	preg->re_g = g;	preg->re_magic = MAGIC1;#ifndef REDEBUG	/* not debugging, so can't rely on the assert() in regexec() */	if (g->iflags&BAD)		SETERROR(REG_ASSERT);#endif	/* win or lose, we're done */	if (p->error != 0)	/* lose */		regfree(preg);	return(p->error);}/* - p_ere - ERE parser top level, concatenation and alternation == static void p_ere(struct parse *p, int stop); */static voidp_ere(p, stop)struct parse *p;int stop;			/* character this ERE should end at */{	char c;	sopno prevback;	sopno prevfwd;	sopno conc;	int first = 1;		/* is this the first alternative? */	for (;;) {		/* do a bunch of concatenated expressions */		conc = HERE();		while (MORE() && (c = PEEK()) != '|' && c != stop)			p_ere_exp(p);		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */		if (!EAT('|'))			break;		/* NOTE BREAK OUT */		if (first) {			INSERT(OCH_, conc);	/* offset is wrong */			prevfwd = conc;			prevback = conc;			first = 0;		}		ASTERN(OOR1, prevback);		prevback = THERE();		AHEAD(prevfwd);			/* fix previous offset */		prevfwd = HERE();		EMIT(OOR2, 0);			/* offset is very wrong */	}	if (!first) {		/* tail-end fixups */		AHEAD(prevfwd);		ASTERN(O_CH, prevback);	}	assert(!MORE() || SEE(stop));}/* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op == static void p_ere_exp(struct parse *p); */static voidp_ere_exp(p)struct parse *p;{	char c;	sopno pos;	int count;	int count2;	sopno subno;	int wascaret = 0;	assert(MORE());		/* caller should have ensured this */	c = GETNEXT();	pos = HERE();	switch (c) {	case '(':		(void)REQUIRE(MORE(), REG_EPAREN);		p->g->nsub++;		subno = p->g->nsub;		if (subno < NPAREN)			p->pbegin[subno] = HERE();		EMIT(OLPAREN, subno);		if (!SEE(')'))			p_ere(p, ')');		if (subno < NPAREN) {			p->pend[subno] = HERE();			assert(p->pend[subno] != 0);		}		EMIT(ORPAREN, subno);		(void)MUSTEAT(')', REG_EPAREN);		break;#ifndef POSIX_MISTAKE	case ')':		/* happens only if no current unmatched ( */		/*		 * You may ask, why the ifndef?  Because I didn't notice		 * this until slightly too late for 1003.2, and none of the		 * other 1003.2 regular-expression reviewers noticed it at		 * all.  So an unmatched ) is legal POSIX, at least until		 * we can get it fixed.		 */		SETERROR(REG_EPAREN);		break;#endif	case '^':		EMIT(OBOL, 0);		p->g->iflags |= USEBOL;		p->g->nbol++;		wascaret = 1;		break;	case '$':		EMIT(OEOL, 0);		p->g->iflags |= USEEOL;		p->g->neol++;		break;	case '|':		SETERROR(REG_EMPTY);		break;	case '*':	case '+':	case '?':		SETERROR(REG_BADRPT);		break;	case '.':		if (p->g->cflags&REG_NEWLINE)			nonnewline(p);		else			EMIT(OANY, 0);		break;	case '[':		p_bracket(p);		break;	case '\\':		(void)REQUIRE(MORE(), REG_EESCAPE);		c = GETNEXT();		ordinary(p, c);		break;	case '{':		/* okay as ordinary except if digit follows */		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);		/* FALLTHROUGH */	default:		ordinary(p, c);		break;	}	if (!MORE())		return;	c = PEEK();	/* we call { a repetition if followed by a digit */	if (!( c == '*' || c == '+' || c == '?' ||				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))		return;		/* no repetition, we're done */	NEXT();	(void)REQUIRE(!wascaret, REG_BADRPT);	switch (c) {	case '*':	/* implemented as +? */		/* this case does not require the (y|) trick, noKLUDGE */		INSERT(OPLUS_, pos);		ASTERN(O_PLUS, pos);		INSERT(OQUEST_, pos);		ASTERN(O_QUEST, pos);		break;	case '+':		INSERT(OPLUS_, pos);		ASTERN(O_PLUS, pos);		break;	case '?':		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */		INSERT(OCH_, pos);		/* offset slightly wrong */		ASTERN(OOR1, pos);		/* this one's right */		AHEAD(pos);			/* fix the OCH_ */		EMIT(OOR2, 0);			/* offset very wrong... */		AHEAD(THERE());			/* ...so fix it */		ASTERN(O_CH, THERETHERE());		break;	case '{':		count = p_count(p);		if (EAT(',')) {			if (isdigit((uch)PEEK())) {				count2 = p_count(p);				(void)REQUIRE(count <= count2, REG_BADBR);			} else		/* single number with comma */				count2 = INFINITY;		} else		/* just a single number */			count2 = count;		repeat(p, pos, count, count2);		if (!EAT('}')) {	/* error heuristics */			while (MORE() && PEEK() != '}')				NEXT();			(void)REQUIRE(MORE(), REG_EBRACE);			SETERROR(REG_BADBR);		}		break;	}	if (!MORE())		return;	c = PEEK();	if (!( c == '*' || c == '+' || c == '?' ||				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )		return;	SETERROR(REG_BADRPT);}/* - p_str - string (no metacharacters) "parser" == static void p_str(struct parse *p); */static voidp_str(p)struct parse *p;{	(void)REQUIRE(MORE(), REG_EMPTY);	while (MORE())		ordinary(p, GETNEXT());}/* - p_bre - BRE parser top level, anchoring and concatenation == static void p_bre(struct parse *p, int end1, \ ==	int end2); * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first * taken as an ordinary character and then revised to be an anchor.  The * only undesirable side effect is that '$' gets included as a character * category in such cases.  This is fairly harmless; not worth fixing. * The amount of lookahead needed to avoid this kludge is excessive. */static void
regcomp.c - 源码说明

本页面展示了「Newlib 嵌入式 C库标准实现代码」中的 regcomp.c 源码文件，采用 C语言编程语言编写，共 2,089 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Newlib相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?