📄 cs_regexp.cpp
字号:
}
cvars.regparse++;
if ( ISMULT ( *cvars.regparse ) )
FAIL ( "nested *?+" );
return ( ret );
}
/*
 - regatom - the lowest level
 *
 * Parses one atom of the pattern, starting at cvars.regparse, and emits
 * the node for it through regnode()/regc().  cvars.regparse is advanced
 * past whatever was consumed.
 *
 * Optimization: gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 *
 * cvars - compile state (pattern cursor and code emission position).
 * flagp - out parameter: set to WORST on entry, then HASWIDTH, SIMPLE
 *         and/or SPSTART are or'ed in depending on the atom parsed.
 *
 * Returns the location of the emitted node, or NULL when the
 * parenthesised sub-expression fails to compile.
 *
 * NOTE(review): FAIL is a macro defined outside this excerpt; the code
 * below is written as if it reports the message and leaves the function
 * -- confirm against its definition.
 */
char *
regexp::regatom ( compiler_vars& cvars, int* flagp )
{
    char *ret = NULL;   /* Never returned uninitialised, whatever FAIL does. */
    int flags;

    *flagp = WORST; /* Tentatively. */
    switch ( *cvars.regparse++ )
    {
    case '^':
        ret = regnode ( cvars, BOL );
        break;
    case '$':
        ret = regnode ( cvars, EOL );
        break;
    case '.':
        ret = regnode ( cvars, ANY );
        *flagp |= HASWIDTH | SIMPLE;
        break;
    case '[':
    {
        int rclass;
        int classend;
        if ( *cvars.regparse == '^' )
        {
            /* Complement of range. */
            ret = regnode ( cvars, ANYBUT );
            cvars.regparse++;
        }
        else
            ret = regnode ( cvars, ANYOF );
        /* A ']' or '-' in first position is an ordinary set member. */
        if ( *cvars.regparse == ']' || *cvars.regparse == '-' )
            regc ( cvars, *cvars.regparse++ );
        while ( *cvars.regparse != '\0' && *cvars.regparse != ']' )
        {
            if ( *cvars.regparse == '-' )
            {
                cvars.regparse++;
                if ( *cvars.regparse == ']' || *cvars.regparse == '\0' )
                    regc ( cvars, '-' );    /* Trailing '-' is literal. */
                else
                {
                    /*
                     * Range "a-z": the low end (the character before the
                     * '-') has already been emitted, so start one above
                     * it and emit up to and including the high end.
                     */
                    rclass = UCHARAT ( cvars.regparse - 2 ) + 1;
                    classend = UCHARAT ( cvars.regparse );
                    if ( rclass > classend + 1 )
                        FAIL ( "invalid [] range" );
                    for ( ; rclass <= classend; rclass++ )
                        regc ( cvars, rclass );
                    cvars.regparse++;
                }
            }
            else
                regc ( cvars, *cvars.regparse++ );
        }
        regc ( cvars, '\0' );   /* Terminate the emitted set. */
        if ( *cvars.regparse != ']' )
            FAIL ( "unmatched []" );
        cvars.regparse++;
        *flagp |= HASWIDTH | SIMPLE;
    }
    break;
    case '(':
        ret = reg ( cvars, 1, &flags );
        if ( ret == NULL )
            return ( NULL );
        *flagp |= flags & ( HASWIDTH | SPSTART );
        break;
    case '\0':
    case '|':
    case ')':
        FAIL ( "internal urp" ); /* Supposed to be caught earlier. */
        break;
    case '?':
    case '+':
    case '*':
        FAIL ( "?+* follows nothing" );
        break;
    case '\\':
        if ( *cvars.regparse == '\0' )
            FAIL ( "trailing \\" );
        /* The escaped character becomes a one-character EXACTLY node. */
        ret = regnode ( cvars, EXACTLY );
        regc ( cvars, *cvars.regparse++ );
        regc ( cvars, '\0' );
        *flagp |= HASWIDTH | SIMPLE;
        break;
    default:
    {
        int len;
        char ender;
        cvars.regparse--;   /* Un-consume the character the switch took. */
        len = strcspn ( cvars.regparse, META );
        if ( len <= 0 )
            FAIL ( "internal disaster" );
        ender = *( cvars.regparse + len );
        if ( len > 1 && ISMULT ( ender ) )
            len--; /* Back off clear of ?+* operand. */
        *flagp |= HASWIDTH;
        if ( len == 1 )
            *flagp |= SIMPLE;
        ret = regnode ( cvars, EXACTLY );
        while ( len > 0 )
        {
            regc ( cvars, *cvars.regparse++ );
            len--;
        }
        regc ( cvars, '\0' );
    }
    break;
    }
    return ( ret );
}
/*
 - regnode - emit a node
 *
 * A node header is three bytes: the opcode followed by a two-byte "next"
 * field, which is written as zero here.
 *
 * cvars - compile state; regcode is the emission position.
 * op    - opcode to store in the node.
 *
 * Returns the location of the node (the value of cvars.regcode on entry).
 *
 * While cvars.regcode points at regdummy nothing is written; only
 * cvars.regsize grows by the 3 bytes the node would occupy (presumably
 * the size-counting pass -- confirm against the compile routine).
 */
char * /* Location. */
regexp::regnode ( compiler_vars& cvars, char op )
{
    char *ret = cvars.regcode;

    if ( ret == &regdummy )
    {
        cvars.regsize += 3;
        return ( ret );
    }

    char *ptr = ret;
    *ptr++ = op;
    *ptr++ = '\0'; /* Null "next" pointer. */
    *ptr++ = '\0';
    cvars.regcode = ptr;
    return ( ret );
}
/*
 - regc - emit (if appropriate) a byte of code
 *
 * cvars - compile state.
 * b     - byte to append at cvars.regcode.
 *
 * While cvars.regcode points at regdummy the byte is not stored; it is
 * only counted in cvars.regsize.
 */
void
regexp::regc ( compiler_vars& cvars, char b )
{
    if ( cvars.regcode != &regdummy )
        *cvars.regcode++ = b;
    else
        cvars.regsize++;
}
/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand: every byte from opnd up to the current
 * emission position is moved up by three, and a fresh three-byte node
 * header (op plus a zero "next" field) is written where the operand
 * used to start.
 *
 * cvars - compile state; regcode is advanced by 3.
 * op    - opcode of the node to insert.
 * opnd  - start of the already-emitted operand.
 *
 * While cvars.regcode points at regdummy nothing is moved; only
 * cvars.regsize grows by 3.
 */
void
regexp::reginsert ( compiler_vars& cvars, char op, char* opnd )
{
    if ( cvars.regcode == &regdummy )
    {
        cvars.regsize += 3;
        return;
    }

    char *src = cvars.regcode;
    cvars.regcode += 3;
    char *dst = cvars.regcode;
    /* Copy backwards so the overlapping regions do not clobber each other. */
    while ( src > opnd )
        *--dst = *--src;

    char *place = opnd; /* Op node, where operand used to be. */
    *place++ = op;
    *place++ = '\0';
    *place++ = '\0';
}
/*
 - regtail - set the next-pointer at the end of a node chain
 *
 * Follows regnext() from p until it returns NULL, then stores the
 * distance between that last node and val in the node's two-byte "next"
 * field, high byte first.  For a BACK node the distance is measured
 * backwards (scan - val); otherwise forwards (val - scan).
 *
 * cvars - compile state (not used by this routine).
 * p     - first node of the chain; nothing happens if it is regdummy.
 * val   - node the chain's tail should point at.
 */
void
regexp::regtail ( compiler_vars& cvars, char* p, char* val )
{
    if ( p == &regdummy )
        return;

    /* Find last node. */
    char *scan = p;
    for (;;)
    {
        char *temp = regnext ( scan );
        if ( temp == NULL )
            break;
        scan = temp;
    }

    int offset;
    if ( OP ( scan ) == BACK )
        offset = scan - val;
    else
        offset = val - scan;
    *( scan + 1 ) = ( offset >> 8 ) & 0377;
    *( scan + 2 ) = offset & 0377;
}
/*
 - regoptail - regtail on operand of first argument; nop if operandless
 *
 * cvars - compile state, passed through to regtail().
 * p     - node whose operand chain should be terminated; ignored when
 *         NULL, when it is regdummy, or when it is not a BRANCH node.
 * val   - node the operand chain's tail should point at.
 */
void
regexp::regoptail ( compiler_vars& cvars, char* p, char* val )
{
    /* "Operandless" and "op != BRANCH" are synonymous in practice. */
    if ( p == NULL || p == &regdummy || OP ( p ) != BRANCH )
        return;
    regtail ( cvars, OPERAND ( p ), val );
}
/*
* regexec and friends
*/
#ifdef DEBUG
/* Debug builds only.  When regnarrate is nonzero, regmatch() prints the
 * regprop() text of each node it visits to stderr. */
int regnarrate = 0;
/* Declared here, defined elsewhere; not called in this part of the file. */
void regdump ( regexp* r );
/* Returns text describing the node at op (printed with %s by regmatch()). */
char* regprop ( char* op );
#endif
/*
 - regexec - match a regexp against a string
 *
 * string - NUL-terminated subject text.
 *
 * Returns true when the compiled program matches somewhere in string,
 * false when it does not, when string is NULL, or when the program does
 * not start with the MAGIC byte.
 *
 * Fast rejections come first (the "must appear" substring), then a
 * single attempt for anchored patterns, then a scan over candidate
 * start positions.
 */
bool
regexp::exec ( const char* string )
{
    // Be paranoid...
    if ( string == NULL )
    {
        FAIL ( "NULL parameter" );
        return false;
    }

    exec_vars evars;

    // Check validity of program.
    if ( UCHARAT ( program ) != (unsigned char) MAGIC )
    {
        FAIL ( "corrupted program" );
        return false;
    }

    char *subject = const_cast<char *> ( string );

    // If there is a "must appear" string, give up early when it is absent.
    if ( regmust != NULL )
    {
        char *hit = strchr ( subject, regmust [ 0 ] );
        while ( hit != NULL && strncmp ( hit, regmust, regmlen ) != 0 )
            hit = strchr ( hit + 1, regmust [ 0 ] );
        if ( hit == NULL )
            return false; // Not present.
    }

    // Mark beginning of line for ^ .
    evars.regbol = subject;

    // Simplest case: anchored match need be tried only once.
    if ( reganch )
        return ( regtry ( evars, subject ) != 0 );

    // Messy cases: unanchored match.
    if ( regstart != '\0' )
    {
        // We know what char it must start with: try only at its occurrences.
        for ( char *cursor = strchr ( subject, regstart );
              cursor != NULL;
              cursor = strchr ( cursor + 1, regstart ) )
        {
            if ( regtry ( evars, cursor ) )
                return true;
        }
    }
    else
    {
        // We don't -- try every position, the terminating NUL included.
        for ( char *cursor = subject; ; ++cursor )
        {
            if ( regtry ( evars, cursor ) )
                return true;
            if ( *cursor == '\0' )
                break;
        }
    }

    /* Failure. */
    return false;
}
/*
 - regtry - try match at specific point
 *
 * evars  - per-match state; reginput, regstartp and regendp are reset
 *          here before matching begins.
 * string - position in the subject at which to attempt the match.
 *
 * Clears all NSUBEXP entries of startp/endp, then runs regmatch() on the
 * program (skipping its first byte).  On success entry 0 records the
 * span of the whole match: from string up to the final evars.reginput.
 */
int /* 0 failure, 1 success */
regexp::regtry ( exec_vars& evars, char* string )
{
    evars.reginput = string;
    evars.regstartp = startp;
    evars.regendp = endp;

    for ( int slot = 0; slot < NSUBEXP; ++slot )
    {
        startp [ slot ] = NULL;
        endp [ slot ] = NULL;
    }

    if ( !regmatch ( evars, program + 1 ) )
        return ( 0 );

    startp [ 0 ] = string;
    endp [ 0 ] = evars.reginput;
    return ( 1 );
}
/*
- regmatch - main matching routine
*
* Conceptually the strategy is simple: check to see whether the current
* node matches, call self recursively to see whether the rest matches,
* and then act accordingly. In practice we make some effort to avoid
* recursion, in particular by going through "ordinary" nodes (that don't
* need to know whether the rest of the match failed) by a loop instead of
* by recursion.
*/
int /* 0 failure, 1 success */
regexp::regmatch ( exec_vars& evars, char* prog )
{
char *scan; /* Current node. */
char *next; /* Next node. */
scan = prog;
#ifdef DEBUG
if ( scan != NULL && regnarrate )
fprintf ( stderr, "%s(\n", regprop ( scan ) );
#endif
while ( scan != NULL )
{
#ifdef DEBUG
if ( regnarrate )
fprintf ( stderr, "%s...\n", regprop ( scan ) );
#endif
next = regnext ( scan );
switch ( OP ( scan ) )
{
case BOL:
if ( evars.reginput != evars.regbol )
return ( 0 );
break;
case EOL:
if ( *evars.reginput != '\0' )
return ( 0 );
break;
case ANY:
if ( *evars.reginput == '\0' )
return ( 0 );
evars.reginput++;
break;
case EXACTLY:
{
int len;
const char *opnd;
opnd = OPERAND ( scan );
/* Inline the first character, for speed. */
if ( *opnd != *evars.reginput )
return ( 0 );
len = strlen ( opnd );
if ( len > 1 && strncmp ( opnd, evars.reginput, len ) != 0 )
return ( 0 );
evars.reginput += len;
}
break;
case ANYOF:
if ( *evars.reginput == '\0' ||
strchr ( OPERAND ( scan ), *evars.reginput ) == NULL )
return ( 0 );
evars.reginput++;
break;
case ANYBUT:
if ( *evars.reginput == '\0' ||
strchr ( OPERAND ( scan ), *evars.reginput ) != NULL )
return ( 0 );
evars.reginput++;
break;
case NOTHING:
break;
case BACK:
break;
case OPEN + 1:
case OPEN + 2:
case OPEN + 3:
case OPEN + 4:
case OPEN + 5:
case OPEN + 6:
case OPEN + 7:
case OPEN + 8:
case OPEN + 9:
{
register int no;
register char *save;
no = OP ( scan ) - OPEN;
save = evars.reginput;
if ( regmatch ( evars, next ) )
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -