⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexp.cpp

📁 检测语法的Edit类
💻 CPP
📖 第 1 页 / 共 2 页
字号:
////////////////////////////////////////////////////////////////////////////////
// RegExp.cpp
////////////////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "RegExp.h"


// definition	number	opnd?	meaning 
#define	END		0		// no	End of program. 
#define	BOL		1		// no	Match beginning of line. 
#define	EOL		2		// no	Match end of line. 
#define	ANY		3		// no	Match any character. 
#define	ANYOF	4		// str	Match any of these. 
#define	ANYBUT	5		// str	Match any but one of these. 
#define	BRANCH	6		// node	Match this, or the next..\&. 
#define	BACK	7		// no	"next" ptr points backward. 
#define	EXACTLY	8		// str	Match this string. 
#define	NOTHING	9		// no	Match empty string. 
#define	STAR	10		// node	Match this 0 or more times. 
#define	PLUS	11		// node	Match this 1 or more times. 
#define	OPEN	20		// no	Sub-RE starts here. 
						//		OPEN+1 is number 1, etc. 
#define	CLOSE	30		// no	Analogous to OPEN. 

// Utility definitions.
 
#define	FAIL(m)		{ regerror(m); return(NULL); }
#define	ISREPN(c)	((c) == _T('*') || (c) == _T('+') || (c) == _T('?'))
#define	META		"^$.[()|?+*\\"

// Flags to be passed up and down.
 
#define	HASWIDTH	01	// Known never to match null string. 
#define	SIMPLE		02	// Simple enough to be STAR/PLUS operand. 
#define	SPSTART		04	// Starts with * or +. 
#define	WORST		0	// Worst case. 


CRegExp::CRegExp()
{
	bCompiled = FALSE;
	program = NULL;
	sFoundText = NULL;

	for( int i = 0; i < NSUBEXP; i++ )
	{
		startp[i] = NULL;
		endp[i] = NULL;
	}
}

CRegExp::~CRegExp()
{
	delete program;
	delete sFoundText;
}


CRegExp* CRegExp::RegComp(const TCHAR *exp)
{
	TCHAR *scan;
	int flags;

	if (exp == NULL)
		return NULL;
	
	bCompiled = TRUE;

	// First pass: determine size, legality. 
	bEmitCode = FALSE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regsize = 0L;
	regdummy[0] = NOTHING;
	regdummy[1] = regdummy[2] = 0;
	regcode = regdummy;
	if (reg(0, &flags) == NULL)
		return(NULL);

	// Allocate space. 
	delete program;
	program = new TCHAR[regsize];
	memset( program, 0, regsize * sizeof(TCHAR) );

	if (program == NULL)
		return NULL;

	// Second pass: emit code. 
	bEmitCode = TRUE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regcode = program;
	if (reg(0, &flags) == NULL)
		return NULL;

	// Dig out information for optimizations. 
	regstart = _T('\0');		// Worst-case defaults. 
	reganch = 0;
	regmust = NULL;
	regmlen = 0;
	scan = program;		// First BRANCH. 
	if (OP(regnext(scan)) == END) 
	{	
		// Only one top-level choice. 
		scan = OPERAND(scan);

		// Starting-point info. 
		if (OP(scan) == EXACTLY)
			regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			reganch = 1;

		// If there's something expensive in the r.e., find the
		// longest literal string that must appear and make it the
		// regmust.  Resolve ties in favor of later strings, since
		// the regstart check works with the beginning of the r.e.
		// and avoiding duplication strengthens checking.  Not a
		// strong reason, but sufficient in the absence of others.
		 
		if (flags&SPSTART) 
		{
			char *longest = NULL;
			size_t len = 0;

			for (; scan != NULL; scan = regnext(scan))
				if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len) 
				{
					longest = OPERAND(scan);
					len = _tcslen(OPERAND(scan));
				}
			regmust = longest;
			regmlen = (int)len;
		}
	}

	return this;
}

// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.
 


TCHAR *CRegExp::reg(int paren, int *flagp)
{
	char *ret;
	char *br;
	char *ender;
	int parno;
	int flags;

	*flagp = HASWIDTH;	// Tentatively. 

	if (paren) 
	{
		// Make an OPEN node. 
		if (regnpar >= NSUBEXP)
		{
			TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
			return NULL;
		}
		parno = regnpar;
		regnpar++;
		ret = regnode(OPEN+parno);
	}

	// Pick up the branches, linking them together. 
	br = regbranch(&flags);
	if (br == NULL)
		return(NULL);
	if (paren)
		regtail(ret, br);	// OPEN -> first. 
	else
		ret = br;
	*flagp &= ~(~flags&HASWIDTH);	// Clear bit if bit 0. 
	*flagp |= flags&SPSTART;
	while (*regparse == _T('|')) {
		regparse++;
		br = regbranch(&flags);
		if (br == NULL)
			return(NULL);
		regtail(ret, br);	// BRANCH -> BRANCH. 
		*flagp &= ~(~flags&HASWIDTH);
		*flagp |= flags&SPSTART;
	}

	// Make a closing node, and hook it on the end. 
	ender = regnode((paren) ? CLOSE+parno : END);
	regtail(ret, ender);

	// Hook the tails of the branches to the closing node. 
	for (br = ret; br != NULL; br = regnext(br))
		regoptail(br, ender);

	// Check for proper termination. 
	if (paren && *regparse++ != _T(')')) 
	{
		TRACE0("unterminated ()\n");
		return NULL;
	} 
	else if (!paren && *regparse != _T('\0')) 
	{
		if (*regparse == _T(')')) 
		{
			TRACE0("unmatched ()\n");
			return NULL;
		} 
		else
		{
			TRACE0("internal error: junk on end\n");
			return NULL;
		}
		// NOTREACHED 
	}

	return(ret);
}




//
// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.
 
TCHAR *CRegExp::regbranch(int *flagp)
{
	TCHAR *ret;
	TCHAR *chain;
	TCHAR *latest;
	int flags;
	int c;

	*flagp = WORST;				// Tentatively. 

	ret = regnode(BRANCH);
	chain = NULL;
	while ((c = *regparse) != _T('\0') && c != _T('|') && c != _T(')')) {
		latest = regpiece(&flags);
		if (latest == NULL)
			return(NULL);
		*flagp |= flags&HASWIDTH;
		if (chain == NULL)		// First piece. 
			*flagp |= flags&SPSTART;
		else
			regtail(chain, latest);
		chain = latest;
	}
	if (chain == NULL)			// Loop ran zero times. 
		(void) regnode(NOTHING);

	return(ret);
}

//
// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized:  they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.
 
TCHAR *CRegExp::regpiece(int *flagp)
{
	TCHAR *ret;
	TCHAR op;
	TCHAR *next;
	int flags;

	ret = regatom(&flags);
	if (ret == NULL)
		return(NULL);

	op = *regparse;
	if (!ISREPN(op)) {
		*flagp = flags;
		return(ret);
	}

	if (!(flags&HASWIDTH) && op != _T('?'))
	{
		TRACE0("*+ operand could be empty\n");
		return NULL;
	}

	switch (op) {
	case _T('*'):	*flagp = WORST|SPSTART;			break;
	case _T('+'):	*flagp = WORST|SPSTART|HASWIDTH;	break;
	case _T('?'):	*flagp = WORST;				break;
	}

	if (op == _T('*') && (flags&SIMPLE))
		reginsert(STAR, ret);
	else if (op == _T('*')) {
		// Emit x* as (x&|), where & means "self". 
		reginsert(BRANCH, ret);		// Either x 
		regoptail(ret, regnode(BACK));	// and loop 
		regoptail(ret, ret);		// back 
		regtail(ret, regnode(BRANCH));	// or 
		regtail(ret, regnode(NOTHING));	// null. 
	} else if (op == _T('+') && (flags&SIMPLE))
		reginsert(PLUS, ret);
	else if (op == _T('+')) {
		// Emit x+ as x(&|), where & means "self". 
		next = regnode(BRANCH);		// Either 
		regtail(ret, next);
		regtail(regnode(BACK), ret);	// loop back 
		regtail(next, regnode(BRANCH));	// or 
		regtail(ret, regnode(NOTHING));	// null. 
	} else if (op == _T('?')) {
		// Emit x? as (x|) 
		reginsert(BRANCH, ret);		// Either x 
		regtail(ret, regnode(BRANCH));	// or 
		next = regnode(NOTHING);		// null. 
		regtail(ret, next);
		regoptail(ret, next);
	}
	regparse++;
	if (ISREPN(*regparse))
	{
		TRACE0("nested *?+\n");
		return NULL;
	}

	return(ret);
}

//
// regatom - the lowest level
//
// Optimization:  gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run.  Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.
 
TCHAR *CRegExp::regatom(int *flagp)
{
	TCHAR *ret;
	int flags;

	*flagp = WORST;		// Tentatively. 

	switch (*regparse++) {
	case _T('^'):
		ret = regnode(BOL);
		break;
	case _T('$'):
		ret = regnode(EOL);
		break;
	case _T('.'):
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case _T('['): {
		int range;
		int rangeend;
		int c;

		if (*regparse == _T('^')) {	// Complement of range. 
			ret = regnode(ANYBUT);
			regparse++;
		} else
			ret = regnode(ANYOF);
		if ((c = *regparse) == _T(']') || c == _T('-')) {
			regc(c);
			regparse++;
		}
		while ((c = *regparse++) != _T('\0') && c != _T(']')) {
			if (c != _T('-'))
				regc(c);
			else if ((c = *regparse) == _T(']') || c == _T('\0'))
				regc(_T('-'));
			else 
			{
				range = (unsigned) (TCHAR)*(regparse-2);
				rangeend = (unsigned) (TCHAR)c;
				if (range > rangeend)
				{
					TRACE0("invalid [] range\n");
					return NULL;
				}
				for (range++; range <= rangeend; range++)
					regc(range);
				regparse++;
			}
		}
		regc(_T('\0'));
		if (c != _T(']'))
		{
			TRACE0("unmatched []\n");
			return NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
		}
	case _T('('):
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case _T('\0'):
	case _T('|'):
	case _T(')'):
		// supposed to be caught earlier 
		TRACE0("internal error: \\0|) unexpected\n");
		return NULL;
		break;
	case _T('?'):
	case _T('+'):
	case _T('*'):
		TRACE0("?+* follows nothing\n");
		return NULL;
		break;
	case _T('\\'):
		if (*regparse == _T('\0'))
		{
			TRACE0("trailing \\\n");
			return NULL;
		}
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc(_T('\0'));
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
		size_t len;
		TCHAR ender;

		regparse--;
		len = _tcscspn(regparse, META);
		if (len == 0)
		{
			TRACE0("internal error: strcspn 0\n");
			return NULL;
		}
		ender = *(regparse+len);
		if (len > 1 && ISREPN(ender))
			len--;		// Back off clear of ?+* operand. 
		*flagp |= HASWIDTH;
		if (len == 1)
			*flagp |= SIMPLE;
		ret = regnode(EXACTLY);
		for (; len > 0; len--)
			regc(*regparse++);
		regc(_T('\0'));
		break;
		}
	}

	return(ret);
}



// reginsert - insert an operator in front of already-emitted operand
//
// Means relocating the operand.
 
void CRegExp::reginsert(TCHAR op, TCHAR *opnd)
{
	TCHAR *place;

	if (!bEmitCode) {
		regsize += 3;
		return;
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -