📄 regexp.cpp
字号:
if (tester.Size() >= 0x7fffL) // Probably could be 0xffffL.
{
regerror(REGERR_TO_BIG);
return NULL;
}
m_programSize = tester.Size();
// Allocate space.
program = new TCHAR[m_programSize];
CRegCompiler comp( exp, program );
// Second pass: emit code.
if (comp.reg(0, &flags) == NULL)
return false;
scan = program + 1; // First BRANCH.
if (OP(regnext(scan)) == END)
{ // Only one top-level choice.
scan = OPERAND(scan);
// Starting-point info.
if (OP(scan) == EXACTLY)
regstart = *OPERAND(scan);
else if (OP(scan) == BOL)
reganch = 1;
// If there's something expensive in the r.e., find the
// longest literal string that must appear and make it the
// regmust. Resolve ties in favor of later strings, since
// the regstart check works with the beginning of the r.e.
// and avoiding duplication strengthens checking. Not a
// strong reason, but sufficient in the absence of others.
if (flags&SPSTART)
{
LPTSTR longest = NULL;
size_t len = 0;
for (; scan != NULL; scan = regnext(scan))
if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
{
longest = OPERAND(scan);
len = _tcslen(OPERAND(scan));
}
regmust = longest;
regmlen = (int)len;
}
}
return true;
}
regexp * regexp::getCopy()
{
return new regexp( *this );
}
// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.
LPTSTR CRegCompilerBase::reg( int paren, int *flagp )
{
LPTSTR ret;
LPTSTR br;
LPTSTR ender;
int parno = 0;
int flags;
*flagp = HASWIDTH; // Tentatively.
if (paren)
{
// Make an OPEN node.
if (regnpar >= Regexp::NSUBEXP)
{
regerror(REGERR_TO_MANY_PAREN);
return NULL;
}
parno = regnpar;
regnpar++;
ret = regnode(OPEN+parno);
}
// Pick up the branches, linking them together.
br = regbranch(&flags);
if (br == NULL)
return(NULL);
if (paren)
regtail(ret, br); // OPEN -> first.
else
ret = br;
*flagp &= ~(~flags&HASWIDTH); // Clear bit if bit 0.
*flagp |= flags&SPSTART;
while (*regparse == '|')
{
regparse++;
br = regbranch(&flags);
if (br == NULL)
return(NULL);
regtail(ret, br); // BRANCH -> BRANCH.
*flagp &= ~(~flags&HASWIDTH);
*flagp |= flags&SPSTART;
}
// Make a closing node, and hook it on the end.
ender = regnode((paren) ? CLOSE+parno : END);
regtail(ret, ender);
// Hook the tails of the branches to the closing node.
for (br = ret; br != NULL; br = regnext(br))
regoptail(br, ender);
// Check for proper termination.
if (paren && *regparse++ != ')')
{
regerror( REGERR_UNTERMINATED_PAREN );
return NULL;
}
else if (!paren && *regparse != '\0')
{
if (*regparse == ')')
{
regerror( REGERR_UNMATCHED_PAREN );
return NULL;
}
else
{
regerror( REGERR_INTERNAL_ERROR_JUNK );
return NULL;
}
// NOTREACHED
}
return(ret);
}
// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.
LPTSTR CRegCompilerBase::regbranch(int *flagp)
{
LPTSTR ret;
LPTSTR chain;
LPTSTR latest;
int flags;
int c;
*flagp = WORST; // Tentatively.
ret = regnode(BRANCH);
chain = NULL;
while ((c = *regparse) != '\0' && c != '|' && c != ')')
{
latest = regpiece(&flags);
if (latest == NULL)
return(NULL);
*flagp |= flags&HASWIDTH;
if (chain == NULL) // First piece.
*flagp |= flags&SPSTART;
else
regtail(chain, latest);
chain = latest;
}
if (chain == NULL) // Loop ran zero times.
(void) regnode(NOTHING);
return(ret);
}
// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized: they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.
LPTSTR CRegCompilerBase::regpiece(int *flagp)
{
LPTSTR ret;
TCHAR op;
LPTSTR next;
int flags;
ret = regatom(&flags);
if (ret == NULL)
return(NULL);
op = *regparse;
if (!ISREPN(op))
{
*flagp = flags;
return(ret);
}
if (!(flags&HASWIDTH) && op != _T( '?' ))
{
regerror( REGERR_OP_COULD_BE_EMPTY );
return NULL;
}
switch (op)
{
case _T( '*' ): *flagp = WORST|SPSTART; break;
case _T( '+' ): *flagp = WORST|SPSTART|HASWIDTH; break;
case _T( '?' ): *flagp = WORST; break;
}
if (op == _T( '*' ) && (flags&SIMPLE))
reginsert(STAR, ret);
else if (op == _T( '*' ))
{
// Emit x* as (x&|), where & means "self".
reginsert(BRANCH, ret); // Either x
regoptail(ret, regnode(BACK)); // and loop
regoptail(ret, ret); // back
regtail(ret, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
}
else if (op == _T( '+' ) && (flags&SIMPLE))
reginsert(PLUS, ret);
else if (op == _T( '+' ))
{
// Emit x+ as x(&|), where & means "self".
next = regnode(BRANCH); // Either
regtail(ret, next);
regtail(regnode(BACK), ret); // loop back
regtail(next, regnode(BRANCH)); // or
regtail(ret, regnode(NOTHING)); // null.
}
else if (op == _T( '?' ))
{
// Emit x? as (x|)
reginsert(BRANCH, ret); // Either x
regtail(ret, regnode(BRANCH)); // or
next = regnode(NOTHING); // null.
regtail(ret, next);
regoptail(ret, next);
}
regparse++;
if (ISREPN(*regparse))
{
regerror( REGERR_NESTED_OP );
return NULL;
}
return(ret);
}
// regatom - the lowest level
//
// Optimization: gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run. Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.
LPTSTR CRegCompilerBase::regatom(int * flagp)
{
LPTSTR ret;
int flags;
*flagp = WORST; // Tentatively.
switch ( *regparse++ )
{
// FIXME: these chars only have meaning at beg/end of pat?
case '^':
ret = regnode(BOL);
break;
case '$':
ret = regnode(EOL);
break;
case '.':
ret = regnode(ANY);
*flagp |= HASWIDTH|SIMPLE;
break;
case '[':
{
int range;
int rangeend;
int c;
if (*regparse == '^')
{ // Complement of range.
ret = regnode(ANYBUT);
regparse++;
}
else
ret = regnode(ANYOF);
if ((c = *regparse) == ']' || c == '-')
{
regc(c);
regparse++;
}
while ((c = *regparse++ ) != '\0' && c != ']')
{
if (c != '-')
regc(c);
else if ((c = *regparse) == ']' || c == '\0')
regc('-');
else
{
range = (TCHAR)*(regparse-2);
rangeend = (TCHAR)c;
if (range > rangeend)
{
regerror( REGERR_INVALID_RANGE );
return NULL;
}
for (range++; range <= rangeend; range++)
regc(range);
regparse++;
}
}
regc('\0');
if (c != ']')
{
regerror( REGERR_UNMATCHED_BRACE );
return NULL;
}
*flagp |= HASWIDTH|SIMPLE;
break;
}
case '(':
ret = reg(1, &flags);
if (ret == NULL)
return(NULL);
*flagp |= flags&(HASWIDTH|SPSTART);
break;
case '\0':
case '|':
case ')':
// supposed to be caught earlier
regerror( REGERR_INTERNAL_UNEXPECTED_CHAR );
return NULL;
case '?':
case '+':
case '*':
{
regerror( REGERR_OP_FOLLOWS_NOTHING );
return NULL;
}
case '\\':
switch (*regparse++)
{
case '\0':
{
regerror( REGERR_TRAILING_ESC );
return NULL;
}
case '<':
ret = regnode(WORDA);
break;
case '>':
ret = regnode(WORDZ);
break;
/* FIXME: Someday handle \1, \2, ... */
default:
/* Handle general quoted chars in exact-match routine */
goto de_fault;
}
break;
de_fault:
default:
// Encode a string of characters to be matched exactly.
//
// This is a bit tricky due to quoted chars and due to
// '*', '+', and '?' taking the SINGLE char previous
// as their operand.
//
// On entry, the char at regparse[-1] is going to go
// into the string, no matter what it is. (It could be
// following a \ if we are entered from the '\' case.)
//
// Basic idea is to pick up a good char in ch and
// examine the next char. If it's *+? then we twiddle.
// If it's \ then we frozzle. If it's other magic char
// we push ch and terminate the string. If none of the
// above, we push ch on the string and go around again.
//
// regprev is used to remember where "the current char"
// starts in the string, if due to a *+? we need to back
// up and put the current char in a separate, 1-char, string.
// When regprev is NULL, ch is the only char in the
// string; this is used in *+? handling, and in setting
// flags |= SIMPLE at the end.
{
TCHAR *regprev;
register TCHAR ch;
regparse--; /* Look at cur char */
ret = regnode(EXACTLY);
for ( regprev = 0 ; ; ) {
ch = *regparse++; /* Get current char */
switch (*regparse) { /* look at next one */
default:
regc(ch); /* Add cur to string */
break;
case '.': case '[': case '(':
case ')': case '|': case '\n':
case '$': case '^':
case '\0':
/* FIXME, $ and ^ should not always be magic */
magic:
regc(ch); /* dump cur char */
goto done; /* and we are done */
case '?': case '+': case '*':
if (!regprev) /* If just ch in str, */
goto magic; /* use it */
/* End mult-char string one early */
regparse = regprev; /* Back up parse */
goto done;
case '\\':
regc(ch); /* Cur char OK */
switch (regparse[1]){ /* Look after \ */
case '\0':
case '<':
case '>':
/* FIXME: Someday handle \1, \2, ... */
goto done; /* Not quoted */
default:
/* Backup point is \, scan * point is after it. */
regprev = regparse;
regparse++;
continue; /* NOT break; */
}
}
regprev = regparse; /* Set backup point */
}
done:
regc('\0');
*flagp |= HASWIDTH;
if (!regprev) /* One char? */
*flagp |= SIMPLE;
}
break;
}
return(ret);
}
////////////////////////////////////////////////////////////////////////////////
// regexec and friends
// Work-variable struct for regexec().
class CRegExecutor : public CRegProgramAccessor
{
friend bool regexp::regexec( LPCTSTR str );
LPTSTR reginput; // String-input pointer.
LPTSTR regbol; // Beginning of input, for ^ check.
LPTSTR * regstartp; // Pointer to startp array.
LPTSTR * regendp; // Ditto for endp.
regexp * prog;
public:
CRegExecutor( regexp * prog, LPTSTR string );
protected:
bool regtry( LPTSTR string );
bool regmatch( LPTSTR prog );
size_t regrepeat( LPTSTR node );
};
CRegExecutor::CRegExecutor( regexp * p, LPTSTR string )
: regbol( string ),
regstartp( p->startp ),
regendp( p->endp ),
prog(p)
{
}
#ifdef _RE_DEBUG
int regnarrate = 0;
#endif
// regexec - match a regexp against a string
bool regexp::regexec( LPCTSTR str )
{
LPTSTR string = (LPTSTR)str; // avert const poisoning
// Be paranoid.
if ( string == NULL )
{
regerror( REGERR_NULLARG );
return false;
}
// Check validity of program.
if (*program != MAGIC)
{
regerror( REGERR_CORRUPTED );
return false;
}
// If there is a "must appear" string, look for it.
if ( regmust != NULL && _tcsstr( string, regmust ) == NULL )
return false;
CRegExecutor executor( this, string );
// Simplest case: anchored match need be tried only once.
if ( reganch )
return( executor.regtry( string ) );
// Messy cases: unanchored match.
if ( regstart != '\0' )
{
// We know what TCHAR it must start with.
for ( LPTSTR s = string; s != NULL; s = _tcschr( s+1 , regstart ) )
if ( executor.regtry( s) )
return true;
return false;
}
else
{
// We don't -- general case.
for ( LPTSTR s = string; ! executor.regtry( s ); s++ )
if (*s == '\0')
return false;
}
return true;
}
// regtry - try match at specific point
bool CRegExecutor::regtry( LPTSTR string )
{
int i;
LPTSTR * stp;
LPTSTR * enp;
reginput = string;
stp = prog->startp;
enp = prog->endp;
for (i = Regexp::NSUBEXP; i > 0; i--)
{
*stp++ = NULL;
*enp++ = NULL;
}
if ( regmatch( prog->program + 1 ) )
{
prog->startp[0] = string;
prog->endp[0] = reginput;
return true;
}
else
return false;
}
// regmatch - main matching routine
//
// Conceptually the strategy is simple: check to see whether the current
// node matches, call self recursively to see whether the rest matches,
// and then act accordingly. In practice we make some effort to avoid
// recursion, in particular by going through "ordinary" nodes (that don't
// need to know whether the rest of the match failed) by a loop instead of
// by recursion.
bool CRegExecutor::regmatch( LPTSTR prog )
{
LPTSTR scan; // Current node.
LPTSTR next; // Next node.
#ifdef _RE_DEBUG
if (prog != NULL && regnarrate)
_ftprintf(stderr, _T( "%s(\n" ), regprop(prog));
#endif
for (scan = prog; scan != NULL; scan = next)
{
#ifdef _RE_DEBUG
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -