⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regexp.cpp

📁 Crimson编辑器的英文版,完成从韩文版变成英文版的移植,并且附带可执行文件和注册表文件,无需原先的安装包,是改写编辑器的最理想选择.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
////////////////////////////////////////////////////////////////////////////////
// RegExp.cpp
////////////////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "RegExp.h"


// definition	number	opnd?	meaning
#define	END		0		// no	End of program.
#define	BOL		1		// no	Match beginning of line.
#define	EOL		2		// no	Match end of line.
#define	ANY		3		// no	Match any character.
#define	ANYOF	4		// str	Match any of these.
#define	ANYBUT	5		// str	Match any but one of these.
#define	BRANCH	6		// node	Match this, or the next..\&.
#define	BACK	7		// no	"next" ptr points backward.
#define	EXACTLY	8		// str	Match this string.
#define	NOTHING	9		// no	Match empty string.
#define	STAR	10		// node	Match this 0 or more times.
#define	PLUS	11		// node	Match this 1 or more times.
#define	OPEN	20		// no	Sub-RE starts here.
						//		OPEN+1 is number 1, etc.
#define	CLOSE	30		// no	Analogous to OPEN.

// Utility definitions.

#define	FAIL(m)		{ regerror(m); return(NULL); }
#define	ISREPN(c)	((c) == _T('*') || (c) == _T('+') || (c) == _T('?'))
#define	META		"^$.[()|?+*\\"

// Flags to be passed up and down.

#define	HASWIDTH	01	// Known never to match null string.
#define	SIMPLE		02	// Simple enough to be STAR/PLUS operand.
#define	SPSTART		04	// Starts with * or +.
#define	WORST		0	// Worst case.


CRegExp::CRegExp()
{
	bCompiled = FALSE;
	program = NULL;
	sFoundText = NULL;

	for( int i = 0; i < NSUBEXP; i++ )
	{
		startp[i] = NULL;
		endp[i] = NULL;
	}
}

CRegExp::~CRegExp()
{
	delete program;
	delete sFoundText;
}


CRegExp* CRegExp::RegComp(const TCHAR *exp)
{
	TCHAR *scan;
	int flags;

	if (exp == NULL)
		return NULL;

	bCompiled = TRUE;

	// First pass: determine size, legality.
	bEmitCode = FALSE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regsize = 0L;
	regdummy[0] = NOTHING;
	regdummy[1] = regdummy[2] = 0;
	regcode = regdummy;
	if (reg(0, &flags) == NULL)
		return(NULL);

	// Allocate space.
	delete program;
	program = new TCHAR[regsize];
	memset( program, 0, regsize * sizeof(TCHAR) );

	if (program == NULL)
		return NULL;

	// Second pass: emit code.
	bEmitCode = TRUE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regcode = program;
	if (reg(0, &flags) == NULL)
		return NULL;

	// Dig out information for optimizations.
	regstart = _T('\0');		// Worst-case defaults.
	reganch = 0;
	regmust = NULL;
	regmlen = 0;
	scan = program;		// First BRANCH.
	if (OP(regnext(scan)) == END)
	{
		// Only one top-level choice.
		scan = OPERAND(scan);

		// Starting-point info.
		if (OP(scan) == EXACTLY)
			regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			reganch = 1;

		// If there's something expensive in the r.e., find the
		// longest literal string that must appear and make it the
		// regmust.  Resolve ties in favor of later strings, since
		// the regstart check works with the beginning of the r.e.
		// and avoiding duplication strengthens checking.  Not a
		// strong reason, but sufficient in the absence of others.

		if (flags&SPSTART)
		{
			char *longest = NULL;
			size_t len = 0;

			for (; scan != NULL; scan = regnext(scan))
				if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
				{
					longest = OPERAND(scan);
					len = _tcslen(OPERAND(scan));
				}
			regmust = longest;
			regmlen = (int)len;
		}
	}

	return this;
}

// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.



TCHAR *CRegExp::reg(int paren, int *flagp)
{
	char *ret;
	char *br;
	char *ender;
	int parno;
	int flags;

	*flagp = HASWIDTH;	// Tentatively.

	if (paren)
	{
		// Make an OPEN node.
		if (regnpar >= NSUBEXP)
		{
			TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
			return NULL;
		}
		parno = regnpar;
		regnpar++;
		ret = regnode(OPEN+parno);
	}

	// Pick up the branches, linking them together.
	br = regbranch(&flags);
	if (br == NULL)
		return(NULL);
	if (paren)
		regtail(ret, br);	// OPEN -> first.
	else
		ret = br;
	*flagp &= ~(~flags&HASWIDTH);	// Clear bit if bit 0.
	*flagp |= flags&SPSTART;
	while (*regparse == _T('|')) {
		regparse++;
		br = regbranch(&flags);
		if (br == NULL)
			return(NULL);
		regtail(ret, br);	// BRANCH -> BRANCH.
		*flagp &= ~(~flags&HASWIDTH);
		*flagp |= flags&SPSTART;
	}

	// Make a closing node, and hook it on the end.
	ender = regnode((paren) ? CLOSE+parno : END);
	regtail(ret, ender);

	// Hook the tails of the branches to the closing node.
	for (br = ret; br != NULL; br = regnext(br))
		regoptail(br, ender);

	// Check for proper termination.
	if (paren && *regparse++ != _T(')'))
	{
		TRACE0("unterminated ()\n");
		return NULL;
	}
	else if (!paren && *regparse != _T('\0'))
	{
		if (*regparse == _T(')'))
		{
			TRACE0("unmatched ()\n");
			return NULL;
		}
		else
		{
			TRACE0("internal error: junk on end\n");
			return NULL;
		}
		// NOTREACHED
	}

	return(ret);
}




//
// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.

TCHAR *CRegExp::regbranch(int *flagp)
{
	TCHAR *ret;
	TCHAR *chain;
	TCHAR *latest;
	int flags;
	int c;

	*flagp = WORST;				// Tentatively.

	ret = regnode(BRANCH);
	chain = NULL;
	while ((c = *regparse) != _T('\0') && c != _T('|') && c != _T(')')) {
		latest = regpiece(&flags);
		if (latest == NULL)
			return(NULL);
		*flagp |= flags&HASWIDTH;
		if (chain == NULL)		// First piece.
			*flagp |= flags&SPSTART;
		else
			regtail(chain, latest);
		chain = latest;
	}
	if (chain == NULL)			// Loop ran zero times.
		(void) regnode(NOTHING);

	return(ret);
}

//
// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized:  they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.

TCHAR *CRegExp::regpiece(int *flagp)
{
	TCHAR *ret;
	TCHAR op;
	TCHAR *next;
	int flags;

	ret = regatom(&flags);
	if (ret == NULL)
		return(NULL);

	op = *regparse;
	if (!ISREPN(op)) {
		*flagp = flags;
		return(ret);
	}

	if (!(flags&HASWIDTH) && op != _T('?'))
	{
		TRACE0("*+ operand could be empty\n");
		return NULL;
	}

	switch (op) {
	case _T('*'):	*flagp = WORST|SPSTART;			break;
	case _T('+'):	*flagp = WORST|SPSTART|HASWIDTH;	break;
	case _T('?'):	*flagp = WORST;				break;
	}

	if (op == _T('*') && (flags&SIMPLE))
		reginsert(STAR, ret);
	else if (op == _T('*')) {
		// Emit x* as (x&|), where & means "self".
		reginsert(BRANCH, ret);		// Either x
		regoptail(ret, regnode(BACK));	// and loop
		regoptail(ret, ret);		// back
		regtail(ret, regnode(BRANCH));	// or
		regtail(ret, regnode(NOTHING));	// null.
	} else if (op == _T('+') && (flags&SIMPLE))
		reginsert(PLUS, ret);
	else if (op == _T('+')) {
		// Emit x+ as x(&|), where & means "self".
		next = regnode(BRANCH);		// Either
		regtail(ret, next);
		regtail(regnode(BACK), ret);	// loop back
		regtail(next, regnode(BRANCH));	// or
		regtail(ret, regnode(NOTHING));	// null.
	} else if (op == _T('?')) {
		// Emit x? as (x|)
		reginsert(BRANCH, ret);		// Either x
		regtail(ret, regnode(BRANCH));	// or
		next = regnode(NOTHING);		// null.
		regtail(ret, next);
		regoptail(ret, next);
	}
	regparse++;
	if (ISREPN(*regparse))
	{
		TRACE0("nested *?+\n");
		return NULL;
	}

	return(ret);
}

//
// regatom - the lowest level
//
// Optimization:  gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run.  Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.

TCHAR *CRegExp::regatom(int *flagp)
{
	TCHAR *ret;
	int flags;

	*flagp = WORST;		// Tentatively.

	switch (*regparse++) {
	case _T('^'):
		ret = regnode(BOL);
		break;
	case _T('$'):
		ret = regnode(EOL);
		break;
	case _T('.'):
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case _T('['): {
		int range;
		int rangeend;
		int c;

		if (*regparse == _T('^')) {	// Complement of range.
			ret = regnode(ANYBUT);
			regparse++;
		} else
			ret = regnode(ANYOF);
		if ((c = *regparse) == _T(']') || c == _T('-')) {
			regc(c);
			regparse++;
		}
		while ((c = *regparse++) != _T('\0') && c != _T(']')) {
			if (c != _T('-'))
				regc(c);
			else if ((c = *regparse) == _T(']') || c == _T('\0'))
				regc(_T('-'));
			else
			{
				range = (unsigned) (TCHAR)*(regparse-2);
				rangeend = (unsigned) (TCHAR)c;
				if (range > rangeend)
				{
					TRACE0("invalid [] range\n");
					return NULL;
				}
				for (range++; range <= rangeend; range++)
					regc(range);
				regparse++;
			}
		}
		regc(_T('\0'));
		if (c != _T(']'))
		{
			TRACE0("unmatched []\n");
			return NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
		}
	case _T('('):
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case _T('\0'):
	case _T('|'):
	case _T(')'):
		// supposed to be caught earlier
		TRACE0("internal error: \\0|) unexpected\n");
		return NULL;
		break;
	case _T('?'):
	case _T('+'):
	case _T('*'):
		TRACE0("?+* follows nothing\n");
		return NULL;
		break;
	case _T('\\'):
		if (*regparse == _T('\0'))
		{
			TRACE0("trailing \\\n");
			return NULL;
		}
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc(_T('\0'));
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
		size_t len;
		TCHAR ender;

		regparse--;
		len = _tcscspn(regparse, META);
		if (len == 0)
		{
			TRACE0("internal error: strcspn 0\n");
			return NULL;
		}
		ender = *(regparse+len);
		if (len > 1 && ISREPN(ender))
			len--;		// Back off clear of ?+* operand.
		*flagp |= HASWIDTH;
		if (len == 1)
			*flagp |= SIMPLE;
		ret = regnode(EXACTLY);
		for (; len > 0; len--)
			regc(*regparse++);
		regc(_T('\0'));
		break;
		}
	}

	return(ret);
}



// reginsert - insert an operator in front of already-emitted operand
//
// Means relocating the operand.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -