📄 regex.c

📁 <B>Digital的Unix操作系统VAX 4.2源码</B>
💻 C
字号:
/* @(#)regex.c	4.1 (Berkeley) 12/21/80 */#/* * routines to do regular expression matching * * Entry points: * *	re_comp(s) *		char *s; *	 ... returns 0 if the string s was compiled successfully, *		     a pointer to an error message otherwise. *	     If passed 0 or a null string returns without changing *           the currently compiled re (see note 11 below). * *	re_exec(s) *		char *s; *	 ... returns 1 if the string s matches the last compiled regular *		       expression,  *		     0 if the string s failed to match the last compiled *		       regular expression, and *		    -1 if the compiled regular expression was invalid  *		       (indicating an internal error). * * The strings passed to both re_comp and re_exec may have trailing or * embedded newline characters; they are terminated by nulls. * * The identity of the author of these routines is lost in antiquity; * this is essentially the same as the re code in the original V6 ed. * * The regular expressions recognized are described below. This description * is essentially the same as that for ed. * *	A regular expression specifies a set of strings of characters. *	A member of this set of strings is said to be matched by *	the regular expression.  In the following specification for *	regular expressions the word `character' means any character but NUL. * *	1.  Any character except a special character matches itself. *	    Special characters are the regular expression delimiter plus *	    \ [ . and sometimes ^ * $. *	2.  A . matches any character. *	3.  A \ followed by any character except a digit or ( ) *	    matches that character. *	4.  A nonempty string s bracketed [s] (or [^s]) matches any *	    character in (or not in) s. In s, \ has no special meaning, *	    and ] may only appear as the first letter. A substring  *	    a-b, with a and b in ascending ASCII order, stands for *	    the inclusive range of ASCII characters. *	5.  A regular expression of form 1-4 followed by * matches a *	    sequence of 0 or more matches of the regular expression. *	6.  A regular expression, x, of form 1-8, bracketed \(x\) *	    matches what x matches. *	7.  A \ followed by a digit n matches a copy of the string that the *	    bracketed regular expression beginning with the nth \( matched. *	8.  A regular expression of form 1-8, x, followed by a regular *	    expression of form 1-7, y matches a match for x followed by *	    a match for y, with the x match being as long as possible *	    while still permitting a y match. *	9.  A regular expression of form 1-8 preceded by ^ (or followed *	    by $), is constrained to matches that begin at the left *	    (or end at the right) end of a line. *	10. A regular expression of form 1-9 picks out the longest among *	    the leftmost matches in a line. *	11. An empty regular expression stands for a copy of the last *	    regular expression encountered. *//* * constants for re's */#define	CBRA	1#define	CCHR	2#define	CDOT	4#define	CCL	6#define	NCCL	8#define	CDOL	10#define	CEOF	11#define	CKET	12#define	CBACK	18#define	CSTAR	01#define	ESIZE	512#define	NBRA	9static	char	expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];static	char	circf;/* * compile the regular expression argument into a dfa */char *re_comp(sp)	register char	*sp;{	register int	c;	register char	*ep = expbuf;	int	cclcnt, numbra = 0;	char	*lastep = 0;	char	bracket[NBRA];	char	*bracketp = &bracket[0];	static	char	*retoolong = "Regular expression too long";#define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }	if (sp == 0 || *sp == '\0') {		if (*ep == 0)			return("No previous regular expression");		return(0);	}	if (*sp == '^') {		circf = 1;		sp++;	}	else		circf = 0;	for (;;) {		if (ep >= &expbuf[ESIZE])			comerr(retoolong);		if ((c = *sp++) == '\0') {			if (bracketp != bracket)				comerr("unmatched \\(");			*ep++ = CEOF;			*ep++ = 0;			return(0);		}		if (c != '*')			lastep = ep;		switch (c) {		case '.':			*ep++ = CDOT;			continue;		case '*':			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)				goto defchar;			*lastep |= CSTAR;			continue;		case '$':			if (*sp != '\0')				goto defchar;			*ep++ = CDOL;			continue;		case '[':			*ep++ = CCL;			*ep++ = 0;			cclcnt = 1;			if ((c = *sp++) == '^') {				c = *sp++;				ep[-2] = NCCL;			}			do {				if (c == '\0')					comerr("missing ]");				if (c == '-' && ep [-1] != 0) {					if ((c = *sp++) == ']') {						*ep++ = '-';						cclcnt++;						break;					}					while (ep[-1] < c) {						*ep = ep[-1] + 1;						ep++;						cclcnt++;						if (ep >= &expbuf[ESIZE])							comerr(retoolong);					}				}				*ep++ = c;				cclcnt++;				if (ep >= &expbuf[ESIZE])					comerr(retoolong);			} while ((c = *sp++) != ']');			lastep[1] = cclcnt;			continue;		case '\\':			if ((c = *sp++) == '(') {				if (numbra >= NBRA)					comerr("too many \\(\\) pairs");				*bracketp++ = numbra;				*ep++ = CBRA;				*ep++ = numbra++;				continue;			}			if (c == ')') {				if (bracketp <= bracket)					comerr("unmatched \\)");				*ep++ = CKET;				*ep++ = *--bracketp;				continue;			}			if (c >= '1' && c < ('1' + NBRA)) {				*ep++ = CBACK;				*ep++ = c - '1';				continue;			}			*ep++ = CCHR;			*ep++ = c;			continue;		defchar:		default:			*ep++ = CCHR;			*ep++ = c;		}	}}/*  * match the argument string against the compiled re */intre_exec(p1)	register char	*p1;{	register char	*p2 = expbuf;	register int	c;	int	rv;	for (c = 0; c < NBRA; c++) {		braslist[c] = 0;		braelist[c] = 0;	}	if (circf)		return((advance(p1, p2)));	/*	 * fast check for first character	 */	if (*p2 == CCHR) {		c = p2[1];		do {			if (*p1 != c)				continue;			if (rv = advance(p1, p2))				return(rv);		} while (*p1++);		return(0);	}	/*	 * regular algorithm	 */	do		if (rv = advance(p1, p2))			return(rv);	while (*p1++);	return(0);}/*  * try to match the next thing in the dfa */static	intadvance(lp, ep)	register char	*lp, *ep;{	register char	*curlp;	int	ct, i;	int	rv;	for (;;)		switch (*ep++) {		case CCHR:			if (*ep++ == *lp++)				continue;			return(0);		case CDOT:			if (*lp++)				continue;			return(0);		case CDOL:			if (*lp == '\0')				continue;			return(0);		case CEOF:			return(1);		case CCL:			if (cclass(ep, *lp++, 1)) {				ep += *ep;				continue;			}			return(0);		case NCCL:			if (cclass(ep, *lp++, 0)) {				ep += *ep;				continue;			}			return(0);		case CBRA:			braslist[*ep++] = lp;			continue;		case CKET:			braelist[*ep++] = lp;			continue;		case CBACK:			if (braelist[i = *ep++] == 0)				return(-1);			if (backref(i, lp)) {				lp += braelist[i] - braslist[i];				continue;			}			return(0);		case CBACK|CSTAR:			if (braelist[i = *ep++] == 0)				return(-1);			curlp = lp;			ct = braelist[i] - braslist[i];			while (backref(i, lp))				lp += ct;			while (lp >= curlp) {				if (rv = advance(lp, ep))					return(rv);				lp -= ct;			}			continue;		case CDOT|CSTAR:			curlp = lp;			while (*lp++)				;			goto star;		case CCHR|CSTAR:			curlp = lp;			while (*lp++ == *ep)				;			ep++;			goto star;		case CCL|CSTAR:		case NCCL|CSTAR:			curlp = lp;			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))				;			ep += *ep;			goto star;		star:			do {				lp--;				if (rv = advance(lp, ep))					return(rv);			} while (lp > curlp);			return(0);		default:			return(-1);		}}backref(i, lp)	register int	i;	register char	*lp;{	register char	*bp;	bp = braslist[i];	while (*bp++ == *lp++)		if (bp >= braelist[i])			return(1);	return(0);}intcclass(set, c, af)	register char	*set, c;	int	af;{	register int	n;	if (c == 0)		return(0);	n = *set++;	while (--n)		if (*set++ == c)			return(af);	return(! af);}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -