📄 regexp.cpp
字号:
////////////////////////////////////////////////////////////////////////////////
// RegExp.cpp
////////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "RegExp.h"
// definition number opnd? meaning
#define END 0 // no End of program.
#define BOL 1 // no Match beginning of line.
#define EOL 2 // no Match end of line.
#define ANY 3 // no Match any character.
#define ANYOF 4 // str Match any of these.
#define ANYBUT 5 // str Match any but one of these.
#define BRANCH 6 // node Match this, or the next..\&.
#define BACK 7 // no "next" ptr points backward.
#define EXACTLY 8 // str Match this string.
#define NOTHING 9 // no Match empty string.
#define STAR 10 // node Match this 0 or more times.
#define PLUS 11 // node Match this 1 or more times.
#define OPEN 20 // no Sub-RE starts here.
// OPEN+1 is number 1, etc.
#define CLOSE 30 // no Analogous to OPEN.
// Utility definitions.
#define FAIL(m) { regerror(m); return(NULL); }
#define ISREPN(c) ((c) == _T('*') || (c) == _T('+') || (c) == _T('?'))
#define META "^$.[()|?+*\\"
// Flags to be passed up and down.
#define HASWIDTH 01 // Known never to match null string.
#define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
#define SPSTART 04 // Starts with * or +.
#define WORST 0 // Worst case.
CRegExp::CRegExp()
{
bCompiled = FALSE;
program = NULL;
sFoundText = NULL;
for( int i = 0; i < NSUBEXP; i++ )
{
startp[i] = NULL;
endp[i] = NULL;
}
}
CRegExp::~CRegExp()
{
delete program;
delete sFoundText;
}
CRegExp* CRegExp::RegComp(const TCHAR *exp)
{
TCHAR *scan;
int flags;
if (exp == NULL)
return NULL;
bCompiled = TRUE;
// First pass: determine size, legality.
bEmitCode = FALSE;
regparse = (TCHAR *)exp;
regnpar = 1;
regsize = 0L;
regdummy[0] = NOTHING;
regdummy[1] = regdummy[2] = 0;
regcode = regdummy;
if (reg(0, &flags) == NULL)
return(NULL);
// Allocate space.
delete program;
program = new TCHAR[regsize];
memset( program, 0, regsize * sizeof(TCHAR) );
if (program == NULL)
return NULL;
// Second pass: emit code.
bEmitCode = TRUE;
regparse = (TCHAR *)exp;
regnpar = 1;
regcode = program;
if (reg(0, &flags) == NULL)
return NULL;
// Dig out information for optimizations.
regstart = _T('\0'); // Worst-case defaults.
reganch = 0;
regmust = NULL;
regmlen = 0;
scan = program; // First BRANCH.
if (OP(regnext(scan)) == END)
{
// Only one top-level choice.
scan = OPERAND(scan);
// Starting-point info.
if (OP(scan) == EXACTLY)
regstart = *OPERAND(scan);
else if (OP(scan) == BOL)
reganch = 1;
// If there's something expensive in the r.e., find the
// longest literal string that must appear and make it the
// regmust. Resolve ties in favor of later strings, since
// the regstart check works with the beginning of the r.e.
// and avoiding duplication strengthens checking. Not a
// strong reason, but sufficient in the absence of others.
if (flags&SPSTART)
{
char *longest = NULL;
size_t len = 0;
for (; scan != NULL; scan = regnext(scan))
if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
{
longest = OPERAND(scan);
len = _tcslen(OPERAND(scan));
}
regmust = longest;
regmlen = (int)len;
}
}
return this;
}
// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.
TCHAR *CRegExp::reg(int paren, int *flagp)
{
char *ret;
char *br;
char *ender;
int parno;
int flags;
*flagp = HASWIDTH; // Tentatively.
if (paren)
{
// Make an OPEN node.
if (regnpar >= NSUBEXP)
{
TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
return NULL;
}
parno = regnpar;
regnpar++;
ret = regnode(OPEN+parno);
}
// Pick up the branches, linking them together.
br = regbranch(&flags);
if (br == NULL)
return(NULL);
if (paren)
regtail(ret, br); // OPEN -> first.
else
ret = br;
*flagp &= ~(~flags&HASWIDTH); // Clear bit if bit 0.
*flagp |= flags&SPSTART;
while (*regparse == _T('|')) {
regparse++;
br = regbranch(&flags);
if (br == NULL)
return(NULL);
regtail(ret, br); // BRANCH -> BRANCH.
*flagp &= ~(~flags&HASWIDTH);
*flagp |= flags&SPSTART;
}
// Make a closing node, and hook it on the end.
ender = regnode((paren) ? CLOSE+parno : END);
regtail(ret, ender);
// Hook the tails of the branches to the closing node.
for (br = ret; br != NULL; br = regnext(br))
regoptail(br, ender);
// Check for proper termination.
if (paren && *regparse++ != _T(')'))
{
TRACE0("unterminated ()\n");
return NULL;
}
else if (!paren && *regparse != _T('\0'))
{
if (*regparse == _T(')'))
{
TRACE0("unmatched ()\n");
return NULL;
}
else
{
TRACE0("internal error: junk on end\n");
return NULL;
}
// NOTREACHED
}
return(ret);
}
//
// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.
TCHAR *CRegExp::regbranch(int *flagp)
{
TCHAR *ret;
TCHAR *chain;
TCHAR *latest;
int flags;
int c;
*flagp = WORST; // Tentatively.
ret = regnode(BRANCH);
chain = NULL;
while ((c = *regparse) != _T('\0') && c != _T('|') && c != _T(')')) {
latest = regpiece(&flags);
if (latest == NULL)
return(NULL);
*flagp |= flags&HASWIDTH;
if (chain == NULL) // First piece.
*flagp |= flags&SPSTART;
else
regtail(chain, latest);
chain = latest;
}
if (chain == NULL) // Loop ran zero times.
(void) regnode(NOTHING);
return(ret);
}
//
// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized: they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.
TCHAR *CRegExp::regpiece(int *flagp)
{
TCHAR *ret;
TCHAR op;
TCHAR *next;
int flags;
ret = regatom(&flags);
if (ret == NULL)
return(NULL);
op = *regparse;
if (!ISREPN(op)) {
*flagp = flags;
return(ret);
}
if (!(flags&HASWIDTH) && op != _T('?'))
{
TRACE0("*+ operand could be empty\n");
return NULL;
}
switch (op) {
case _T('*'): *flagp = WORST|SPSTART; break;
case _T('+'): *flagp = WORST|SPSTART|HASWIDTH; break;
case _T('?'): *flagp = WORST; break;
}
if (op == _T('*') && (flags&SIMPLE))
reginsert(STAR, ret);
else if (op == _T('*')) {
// Emit x* as (x&|), where & means "self".
reginsert(BRANCH, ret); // Either x
regoptail(ret, regnode(BACK)); // and loop
regoptail(ret, ret); // back
regtail(ret, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
} else if (op == _T('+') && (flags&SIMPLE))
reginsert(PLUS, ret);
else if (op == _T('+')) {
// Emit x+ as x(&|), where & means "self".
next = regnode(BRANCH); // Either
regtail(ret, next);
regtail(regnode(BACK), ret); // loop back
regtail(next, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
} else if (op == _T('?')) {
// Emit x? as (x|)
reginsert(BRANCH, ret); // Either x
regtail(ret, regnode(BRANCH)); // or
next = regnode(NOTHING); // null.
regtail(ret, next);
regoptail(ret, next);
}
regparse++;
if (ISREPN(*regparse))
{
TRACE0("nested *?+\n");
return NULL;
}
return(ret);
}
//
// regatom - the lowest level
//
// Optimization: gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run. Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.
TCHAR *CRegExp::regatom(int *flagp)
{
TCHAR *ret;
int flags;
*flagp = WORST; // Tentatively.
switch (*regparse++) {
case _T('^'):
ret = regnode(BOL);
break;
case _T('$'):
ret = regnode(EOL);
break;
case _T('.'):
ret = regnode(ANY);
*flagp |= HASWIDTH|SIMPLE;
break;
case _T('['): {
int range;
int rangeend;
int c;
if (*regparse == _T('^')) { // Complement of range.
ret = regnode(ANYBUT);
regparse++;
} else
ret = regnode(ANYOF);
if ((c = *regparse) == _T(']') || c == _T('-')) {
regc(c);
regparse++;
}
while ((c = *regparse++) != _T('\0') && c != _T(']')) {
if (c != _T('-'))
regc(c);
else if ((c = *regparse) == _T(']') || c == _T('\0'))
regc(_T('-'));
else
{
range = (unsigned) (TCHAR)*(regparse-2);
rangeend = (unsigned) (TCHAR)c;
if (range > rangeend)
{
TRACE0("invalid [] range\n");
return NULL;
}
for (range++; range <= rangeend; range++)
regc(range);
regparse++;
}
}
regc(_T('\0'));
if (c != _T(']'))
{
TRACE0("unmatched []\n");
return NULL;
}
*flagp |= HASWIDTH|SIMPLE;
break;
}
case _T('('):
ret = reg(1, &flags);
if (ret == NULL)
return(NULL);
*flagp |= flags&(HASWIDTH|SPSTART);
break;
case _T('\0'):
case _T('|'):
case _T(')'):
// supposed to be caught earlier
TRACE0("internal error: \\0|) unexpected\n");
return NULL;
break;
case _T('?'):
case _T('+'):
case _T('*'):
TRACE0("?+* follows nothing\n");
return NULL;
break;
case _T('\\'):
if (*regparse == _T('\0'))
{
TRACE0("trailing \\\n");
return NULL;
}
ret = regnode(EXACTLY);
regc(*regparse++);
regc(_T('\0'));
*flagp |= HASWIDTH|SIMPLE;
break;
default: {
size_t len;
TCHAR ender;
regparse--;
len = _tcscspn(regparse, META);
if (len == 0)
{
TRACE0("internal error: strcspn 0\n");
return NULL;
}
ender = *(regparse+len);
if (len > 1 && ISREPN(ender))
len--; // Back off clear of ?+* operand.
*flagp |= HASWIDTH;
if (len == 1)
*flagp |= SIMPLE;
ret = regnode(EXACTLY);
for (; len > 0; len--)
regc(*regparse++);
regc(_T('\0'));
break;
}
}
return(ret);
}
// reginsert - insert an operator in front of already-emitted operand
//
// Means relocating the operand.
void CRegExp::reginsert(TCHAR op, TCHAR *opnd)
{
TCHAR *place;
if (!bEmitCode) {
regsize += 3;
return;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -