📄 regexp.cpp

📁 CodezBank is just a small application that stores source code snippets organized in a hierarhichal m
💻 CPP
📖 第 1 页 / 共 3 页
字号:
	if (tester.Size() >= 0x7fffL)	// Probably could be 0xffffL. 
	{
		regerror(REGERR_TO_BIG);
		return NULL;
	}

	m_programSize = tester.Size();
	// Allocate space. 
	program = new TCHAR[m_programSize];

	CRegCompiler comp( exp, program );
	// Second pass: emit code. 
	if (comp.reg(0, &flags) == NULL)
		return false;

	scan = program + 1;			// First BRANCH. 
	if (OP(regnext(scan)) == END)
	{		// Only one top-level choice. 
		scan = OPERAND(scan);

		// Starting-point info. 
		if (OP(scan) == EXACTLY)
			regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			reganch = 1;

		// If there's something expensive in the r.e., find the
		// longest literal string that must appear and make it the
		// regmust.  Resolve ties in favor of later strings, since
		// the regstart check works with the beginning of the r.e.
		// and avoiding duplication strengthens checking.  Not a
		// strong reason, but sufficient in the absence of others.
		 
		if (flags&SPSTART) 
		{
			LPTSTR longest = NULL;
			size_t len = 0;

			for (; scan != NULL; scan = regnext(scan))
				if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
				{
					longest = OPERAND(scan);
					len = _tcslen(OPERAND(scan));
				}
			regmust = longest;
			regmlen = (int)len;
		}
	}

	return true;
}

regexp * regexp::getCopy()
{
	return new regexp( *this );
}

// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.

LPTSTR CRegCompilerBase::reg( int paren, int *flagp )
{
	LPTSTR ret;
	LPTSTR br;
	LPTSTR ender;
	int parno = 0;
	int flags;

	*flagp = HASWIDTH;	// Tentatively. 

	if (paren)
	{
		// Make an OPEN node. 
		if (regnpar >= Regexp::NSUBEXP)
		{
			regerror(REGERR_TO_MANY_PAREN);
			return NULL;
		}
		parno = regnpar;
		regnpar++;
		ret = regnode(OPEN+parno);
	}

	// Pick up the branches, linking them together. 
	br = regbranch(&flags);
	if (br == NULL)
		return(NULL);
	if (paren)
		regtail(ret, br);	// OPEN -> first. 
	else
		ret = br;
	*flagp &= ~(~flags&HASWIDTH);	// Clear bit if bit 0. 
	*flagp |= flags&SPSTART;
	while (*regparse == '|')
	{
		regparse++;
		br = regbranch(&flags);
		if (br == NULL)
			return(NULL);
		regtail(ret, br);	// BRANCH -> BRANCH. 
		*flagp &= ~(~flags&HASWIDTH);
		*flagp |= flags&SPSTART;
	}

	// Make a closing node, and hook it on the end. 
	ender = regnode((paren) ? CLOSE+parno : END);
	regtail(ret, ender);

	// Hook the tails of the branches to the closing node. 
	for (br = ret; br != NULL; br = regnext(br))
		regoptail(br, ender);

	// Check for proper termination. 
	if (paren && *regparse++ != ')')
	{
		regerror( REGERR_UNTERMINATED_PAREN );
		return NULL;
	}
	else if (!paren && *regparse != '\0')
	{
		if (*regparse == ')')
		{
			regerror( REGERR_UNMATCHED_PAREN );
			return NULL;
		}
		else
		{
			regerror( REGERR_INTERNAL_ERROR_JUNK );
			return NULL;
		}
		// NOTREACHED 
	}

	return(ret);
}

// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.

LPTSTR CRegCompilerBase::regbranch(int *flagp)
{
	LPTSTR ret;
	LPTSTR chain;
	LPTSTR latest;
	int flags;
	int c;

	*flagp = WORST;				// Tentatively. 

	ret = regnode(BRANCH);
	chain = NULL;
	while ((c = *regparse) != '\0' && c != '|' && c != ')')
	{
		latest = regpiece(&flags);
		if (latest == NULL)
			return(NULL);
		*flagp |= flags&HASWIDTH;
		if (chain == NULL)		// First piece. 
			*flagp |= flags&SPSTART;
		else
			regtail(chain, latest);
		chain = latest;
	}
	if (chain == NULL)			// Loop ran zero times. 
		(void) regnode(NOTHING);

	return(ret);
}

// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized:  they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.

LPTSTR CRegCompilerBase::regpiece(int *flagp)
{
	LPTSTR ret;
	TCHAR op;
	LPTSTR next;
	int flags;

	ret = regatom(&flags);
	if (ret == NULL)
		return(NULL);

	op = *regparse;
	if (!ISREPN(op))
	{
		*flagp = flags;
		return(ret);
	}

	if (!(flags&HASWIDTH) && op != _T( '?' ))
	{
		regerror( REGERR_OP_COULD_BE_EMPTY );
		return NULL;
	}
	switch (op)
	{
		case _T( '*' ):	*flagp = WORST|SPSTART;				break;
		case _T( '+' ):	*flagp = WORST|SPSTART|HASWIDTH;	break;
		case _T( '?' ):	*flagp = WORST;						break;
	}

	if (op == _T( '*' ) && (flags&SIMPLE))
		reginsert(STAR, ret);
	else if (op == _T( '*' ))
	{
		// Emit x* as (x&|), where & means "self". 
		reginsert(BRANCH, ret);				// Either x 
		regoptail(ret, regnode(BACK));		// and loop 
		regoptail(ret, ret);				// back 
		regtail(ret, regnode(BRANCH));		// or 
		regtail(ret, regnode(NOTHING));		// null. 
	}
	else if (op == _T( '+' ) && (flags&SIMPLE))
		reginsert(PLUS, ret);
	else if (op == _T( '+' ))
	{
		// Emit x+ as x(&|), where & means "self". 
		next = regnode(BRANCH);				// Either 
		regtail(ret, next);
		regtail(regnode(BACK), ret);		// loop back 
		regtail(next, regnode(BRANCH));		// or 
		regtail(ret, regnode(NOTHING));		// null. 
	}
	else if (op == _T( '?' ))
	{
		// Emit x? as (x|) 
		reginsert(BRANCH, ret);				// Either x 
		regtail(ret, regnode(BRANCH));		// or 
		next = regnode(NOTHING);			// null. 
		regtail(ret, next);
		regoptail(ret, next);
	}
	regparse++;
	if (ISREPN(*regparse))
	{
		regerror( REGERR_NESTED_OP );
		return NULL;
	}

	return(ret);
}

// regatom - the lowest level
//
// Optimization:  gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run.  Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.

LPTSTR CRegCompilerBase::regatom(int * flagp)
{
	LPTSTR ret;
	int flags;

	*flagp = WORST;		// Tentatively. 

	switch ( *regparse++ )
	{
		// FIXME: these chars only have meaning at beg/end of pat?
		case '^':
			ret = regnode(BOL);
			break;
		case '$':
			ret = regnode(EOL);
			break;
		case '.':
			ret = regnode(ANY);
			*flagp |= HASWIDTH|SIMPLE;
			break;
		case '[':
		{
			int range;
			int rangeend;
			int c;

			if (*regparse == '^')
			{	// Complement of range. 
				ret = regnode(ANYBUT);
				regparse++;
			}
			else
				ret = regnode(ANYOF);
			if ((c = *regparse) == ']' || c == '-')
			{
				regc(c);
				regparse++;
			}
			while ((c = *regparse++ ) != '\0' && c != ']')
			{
				if (c != '-')
					regc(c);
				else if ((c = *regparse) == ']' || c == '\0')
					regc('-');
				else
				{
					range = (TCHAR)*(regparse-2);
					rangeend = (TCHAR)c;
					if (range > rangeend)
					{
						regerror( REGERR_INVALID_RANGE );
						return NULL;
					}
					for (range++; range <= rangeend; range++)
						regc(range);
					regparse++;
				}
			}
			regc('\0');
			if (c != ']')
			{
				regerror( REGERR_UNMATCHED_BRACE );
				return NULL;
			}
			*flagp |= HASWIDTH|SIMPLE;
			break;
		}
		case '(':
			ret = reg(1, &flags);
			if (ret == NULL)
				return(NULL);
			*flagp |= flags&(HASWIDTH|SPSTART);
			break;
		case '\0':
		case '|':
		case ')':
			// supposed to be caught earlier 
			regerror( REGERR_INTERNAL_UNEXPECTED_CHAR );
			return NULL;
		case '?':
		case '+':
		case '*':
			{
				regerror( REGERR_OP_FOLLOWS_NOTHING );
				return NULL;
			}
		case '\\':
			switch (*regparse++)
			{
				case '\0':
				{
					regerror( REGERR_TRAILING_ESC );
					return NULL;
				}
				case '<':
					ret = regnode(WORDA);
					break;
				case '>':
					ret = regnode(WORDZ);
					break;
				/* FIXME: Someday handle \1, \2, ... */
				default:
					/* Handle general quoted chars in exact-match routine */
					goto de_fault;
			}
			break;
	de_fault:
	default:
		// Encode a string of characters to be matched exactly.
		//
		// This is a bit tricky due to quoted chars and due to
		// '*', '+', and '?' taking the SINGLE char previous
		// as their operand.
		//
		// On entry, the char at regparse[-1] is going to go
		// into the string, no matter what it is.  (It could be
		// following a \ if we are entered from the '\' case.)
		// 
		// Basic idea is to pick up a good char in  ch  and
		// examine the next char.  If it's *+? then we twiddle.
		// If it's \ then we frozzle.  If it's other magic char
		// we push  ch  and terminate the string.  If none of the
		// above, we push  ch  on the string and go around again.
		//
		//  regprev  is used to remember where "the current char"
		// starts in the string, if due to a *+? we need to back
		// up and put the current char in a separate, 1-char, string.
		// When  regprev  is NULL,  ch  is the only char in the
		// string; this is used in *+? handling, and in setting
		// flags |= SIMPLE at the end.
		{
			TCHAR *regprev;
			register TCHAR ch;

			regparse--;			/* Look at cur char */
			ret = regnode(EXACTLY);
			for ( regprev = 0 ; ; ) {
				ch = *regparse++;	/* Get current char */
				switch (*regparse) {	/* look at next one */

				default:
					regc(ch);	/* Add cur to string */
					break;

				case '.': case '[': case '(':
				case ')': case '|': case '\n':
				case '$': case '^':
				case '\0':
				/* FIXME, $ and ^ should not always be magic */
				magic:
					regc(ch);	/* dump cur char */
					goto done;	/* and we are done */

				case '?': case '+': case '*':
					if (!regprev) 	/* If just ch in str, */
						goto magic;	/* use it */
					/* End mult-char string one early */
					regparse = regprev; /* Back up parse */
					goto done;

				case '\\':
					regc(ch);	/* Cur char OK */
					switch (regparse[1]){ /* Look after \ */
					case '\0':
					case '<':
					case '>':
					/* FIXME: Someday handle \1, \2, ... */
						goto done; /* Not quoted */
					default:
						/* Backup point is \, scan							 * point is after it. */
						regprev = regparse;
						regparse++; 
						continue;	/* NOT break; */
					}
				}
				regprev = regparse;	/* Set backup point */
			}
		done:
			regc('\0');
			*flagp |= HASWIDTH;
			if (!regprev)		/* One char? */
				*flagp |= SIMPLE;
		}
		break;
	}

	return(ret);
}

////////////////////////////////////////////////////////////////////////////////
// regexec and friends

// Work-variable struct for regexec().

class CRegExecutor : public CRegProgramAccessor
{
	friend bool regexp::regexec( LPCTSTR str );

	LPTSTR reginput;		// String-input pointer. 
	LPTSTR regbol;			// Beginning of input, for ^ check. 
	LPTSTR * regstartp;		// Pointer to startp array. 
	LPTSTR * regendp;		// Ditto for endp.

	regexp * prog;
public:
	CRegExecutor( regexp * prog, LPTSTR string );
protected:
	bool regtry( LPTSTR string );
	bool regmatch( LPTSTR prog );
	size_t regrepeat( LPTSTR node );
};

CRegExecutor::CRegExecutor( regexp * p, LPTSTR string )
	: regbol( string ),
	regstartp( p->startp ),
	regendp( p->endp ),
	prog(p)
{
}

#ifdef _RE_DEBUG
int regnarrate = 0;
#endif

// regexec - match a regexp against a string

bool regexp::regexec( LPCTSTR str )
{
	LPTSTR string = (LPTSTR)str;	// avert const poisoning 

	// Be paranoid. 
	if ( string == NULL )
	{
		regerror( REGERR_NULLARG );
		return false;
	}

	// Check validity of program. 
	if (*program != MAGIC)
	{
		regerror( REGERR_CORRUPTED );
		return false;
	}

	// If there is a "must appear" string, look for it. 
	if ( regmust != NULL && _tcsstr( string, regmust ) == NULL )
		return false;

	CRegExecutor executor( this, string );

	// Simplest case:  anchored match need be tried only once. 
	if ( reganch )
		return( executor.regtry( string ) );

	// Messy cases:  unanchored match. 
	if ( regstart != '\0' )
	{
		// We know what TCHAR it must start with. 
		for ( LPTSTR s = string; s != NULL; s = _tcschr( s+1 , regstart ) )
			if ( executor.regtry( s) )
				return true;
		return false;
	}
	else
	{
		// We don't -- general case. 
		for ( LPTSTR s = string; ! executor.regtry( s ); s++ )
			if (*s == '\0')
				return false;
	}
	return true;
}

// regtry - try match at specific point
bool CRegExecutor::regtry( LPTSTR string )
{
	int i;
	LPTSTR * stp;
	LPTSTR * enp;

	reginput = string;

	stp = prog->startp;
	enp = prog->endp;
	for (i = Regexp::NSUBEXP; i > 0; i--)
	{
		*stp++ = NULL;
		*enp++ = NULL;
	}
	if ( regmatch( prog->program + 1 ) )
	{
		prog->startp[0] = string;
		prog->endp[0] = reginput;
		return true;
	}
	else
		return false;
}

// regmatch - main matching routine
//
// Conceptually the strategy is simple:  check to see whether the current
// node matches, call self recursively to see whether the rest matches,
// and then act accordingly.  In practice we make some effort to avoid
// recursion, in particular by going through "ordinary" nodes (that don't
// need to know whether the rest of the match failed) by a loop instead of
// by recursion.

bool CRegExecutor::regmatch( LPTSTR prog )
{
	LPTSTR scan;	// Current node. 
	LPTSTR next;		// Next node. 

#ifdef _RE_DEBUG
	if (prog != NULL && regnarrate)
		_ftprintf(stderr, _T( "%s(\n" ), regprop(prog));
#endif
	for (scan = prog; scan != NULL; scan = next)
	{
#ifdef _RE_DEBUG
💿 文件大小 167 K
👤 上传用户 nassdaq
📂 所属分类金融证券系统
🏷️ 相关标签

#hierarhichal #application #CodezBank #organized
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -