📄 nfa.c
字号:
if( !inquote )
{
while( *Input == '{' ) /* Macro expansion required }*/
{
*++sp = Input ; /* Stack current input string. */
Input = get_macro( sp ); /* Use macro body as input string */
if( TOOHIGH(stack,sp) )
parse_err(E_MACDEPTH); /* Stack overflow, abort */
}
}
if( *Input == '"' )
{ /* At either start and end of a quoted */
inquote = ~inquote; /* string. All characters are treated as */
if( !*++Input ) /* literals while inquote is true). */
{
Current_tok = EOS ;
Lexeme = '\0';
goto exit;
}
}
saw_esc = (*Input == '\\');
if( !inquote )
{
if( isspace(*Input) )
{
Current_tok = EOS ;
Lexeme = '\0';
goto exit;
}
Lexeme = esc( &Input );
}
else
{
if( saw_esc && Input[1] == '"' )
{
Input += 2; /* Skip the escaped character */
Lexeme = '"';
}
else
Lexeme = *Input++ ;
}
Current_tok = (inquote || saw_esc) ? L : Tokmap[Lexeme] ;
exit:
return Current_tok;
}
/*--------------------------------------------------------------
* The Parser:
* A simple recursive descent parser that creates a Thompson NFA for
* a regular expression. The access routine [thompson()] is at the
* bottom. The NFA is created as a directed graph, with each node
* containing pointer's to the next node. Since the structures are
* allocated from an array, the machine can also be considered
* as an array where the state number is the array index.
*/
PRIVATE NFA *machine()
{
NFA *start;
NFA *p;
ENTER("machine");
p = start = new();
p->next = rule();
while( !MATCH(END_OF_INPUT) )
{
p->next2 = new();
p = p->next2;
p->next = rule();
}
LEAVE("machine");
return start;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
PRIVATE NFA *rule()
{
/* rule --> expr EOS action
* ^expr EOS action
* expr$ EOS action
*
* action --> <tabs> <string of characters>
* epsilon
*/
NFA *start = NULL;
NFA *end = NULL;
int anchor = NONE;
ENTER("rule");
if( MATCH( AT_BOL ) )
{
start = new() ;
start->edge = '\n' ;
anchor |= START ;
advance();
expr( &start->next, &end );
}
else
expr( &start, &end );
if( MATCH( AT_EOL ) )
{
/* pattern followed by a carriage-return or linefeed (use a
* character class).
*/
advance();
end->next = new() ;
end->edge = CCL ;
if( !( end->bitset = newset()) )
parse_err( E_MEM ); /* doesn't return */
ADD( end->bitset, '\n' );
if( !Unix )
ADD( end->bitset, '\r' );
end = end->next ;
anchor |= END ;
}
while( isspace(*Input) )
Input++ ;
end->accept = save( Input );
end->anchor = anchor;
advance(); /* skip past EOS */
LEAVE("rule");
return start;
}
PRIVATE void expr( startp, endp )
NFA **startp, **endp ;
{
/* Because a recursive descent compiler can't handle left recursion,
* the productions:
*
* expr -> expr OR cat_expr
* | cat_expr
*
* must be translated into:
*
* expr -> cat_expr expr'
* expr' -> OR cat_expr expr'
* epsilon
*
* which can be implemented with this loop:
*
* cat_expr
* while( match(OR) )
* cat_expr
* do the OR
*/
NFA *e2_start = NULL; /* expression to right of | */
NFA *e2_end = NULL;
NFA *p;
ENTER("expr");
cat_expr( startp, endp );
while( MATCH( OR ) )
{
advance();
cat_expr( &e2_start, &e2_end );
p = new();
p->next2 = e2_start ;
p->next = *startp ;
*startp = p;
p = new();
(*endp)->next = p;
e2_end ->next = p;
*endp = p;
}
LEAVE("expr");
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
PRIVATE void cat_expr( startp, endp )
NFA **startp, **endp ;
{
/* The same translations that were needed in the expr rules are needed again
* here:
*
* cat_expr -> cat_expr | factor
* factor
*
* is translated to:
*
* cat_expr -> factor cat_expr'
* cat_expr' -> | factor cat_expr'
* epsilon
*/
NFA *e2_start, *e2_end;
ENTER("cat_expr");
if( first_in_cat( Current_tok ) )
factor( startp, endp );
while( first_in_cat( Current_tok ) )
{
factor( &e2_start, &e2_end );
memcpy( *endp, e2_start, sizeof(NFA));
discard( e2_start );
*endp = e2_end;
}
LEAVE("cat_expr");
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -*/
PRIVATE int first_in_cat( tok )
TOKEN tok;
{
switch( tok )
{
case CLOSE_PAREN:
case AT_EOL:
case OR:
case EOS: return 0;
case CLOSURE:
case PLUS_CLOSE:
case OPTIONAL: parse_err( E_CLOSE ); return 0;
case CCL_END: parse_err( E_BRACKET ); return 0;
case AT_BOL: parse_err( E_BOL ); return 0;
}
return 1;
}
PRIVATE void factor( startp, endp )
NFA **startp, **endp;
{
/* factor --> term* | term+ | term?
*/
NFA *start, *end;
ENTER("factor");
term( startp, endp );
if( MATCH(CLOSURE) || MATCH(PLUS_CLOSE) || MATCH(OPTIONAL) )
{
start = new() ;
end = new() ;
start->next = *startp ;
(*endp)->next = end ;
if( MATCH(CLOSURE) || MATCH(OPTIONAL) ) /* * or ? */
start->next2 = end;
if( MATCH(CLOSURE) || MATCH(PLUS_CLOSE) ) /* * or + */
(*endp)->next2 = *startp;
*startp = start ;
*endp = end ;
advance();
}
LEAVE("factor");
}
PRIVATE void term( startp, endp )
NFA **startp, **endp;
{
/* Process the term productions:
*
* term --> [...] | [^...] | [] | [^] | . | (expr) | <character>
*
* The [] is nonstandard. It matches a space, tab, formfeed, or newline,
* but not a carriage return (\r). All of these are single nodes in the
* NFA.
*/
NFA *start;
int c;
ENTER("term");
if( MATCH( OPEN_PAREN ) )
{
advance();
expr( startp, endp );
if( MATCH( CLOSE_PAREN ) )
advance();
else
parse_err( E_PAREN ); /* doesn't return */
}
else
{
*startp = start = new();
*endp = start->next = new();
if( !( MATCH( ANY ) || MATCH( CCL_START) ))
{
start->edge = Lexeme;
advance();
}
else
{
start->edge = CCL;
if( !( start->bitset = newset()) )
parse_err( E_MEM ); /* doesn't return */
if( MATCH( ANY ) ) /* dot (.) */
{
ADD( start->bitset, '\n' );
if( !Unix )
ADD( start->bitset, '\r' );
COMPLEMENT( start->bitset );
}
else
{
advance();
if( MATCH( AT_BOL ) ) /* Negative character class */
{
advance();
ADD( start->bitset, '\n' ); /* Don't include \n in class */
if( !Unix )
ADD( start->bitset, '\r' );
COMPLEMENT( start->bitset );
}
if( ! MATCH( CCL_END ) )
dodash( start->bitset );
else /* [] or [^] */
for( c = 0; c <= ' '; ++c )
ADD( start->bitset, c );
}
advance();
}
}
LEAVE("term");
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
PRIVATE void dodash( set )
SET *set; /* Pointer to ccl character set */
{
register int first;
if( MATCH( DASH ) ) /* Treat [-...] as a literal dash */
{ /* But print warning. */
warning( W_STARTDASH );
ADD( set, Lexeme );
advance();
}
for(; !MATCH( EOS ) && !MATCH( CCL_END ) ; advance() )
{
if( ! MATCH( DASH ) )
{
first = Lexeme;
ADD( set, Lexeme );
}
else /* looking at a dash */
{
advance();
if( MATCH( CCL_END ) ) /* Treat [...-] as literal */
{
warning( W_ENDDASH );
ADD( set, '-' );
}
else
{
for(; first <= Lexeme ; first++ )
ADD( set, first );
}
}
}
}
PUBLIC NFA *thompson( input_function, max_state, start_state )
char *(*input_function) P((void));
int *max_state;
NFA **start_state;
{
/* Access routine to this module. Return a pointer to a NFA transition
* table that represents the regular expression pointed to by expr or
* NULL if there's not enough memory. Modify *max_state to reflect the
* largest state number used. This number will probably be a larger
* number than the total number of states. Modify *start_state to point
* to the start state. This pointer is garbage if thompson() returned 0.
* The memory for the table is fetched from malloc(); use free() to
* discard it.
*/
CLEAR_STACK();
Ifunct = input_function;
Current_tok = EOS; /* Load first token */
advance();
Nstates = 0;
Next_alloc = 0;
*start_state = machine(); /* Manufacture the NFA */
*max_state = Next_alloc ; /* Max state # in NFA */
if( Verbose > 1 )
print_nfa( Nfa_states, *max_state, *start_state );
if( Verbose )
{
printf("%d/%d NFA states used.\n", *max_state, NFA_MAX );
printf("%s/%d bytes used for accept strings.\n\n", save(NULL), STR_MAX);
}
return Nfa_states;
}
#ifdef MAIN
#define ALLOC
#include "globals.h"
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
PRIVATE pnode( nfa ) /* For debugging, print a single NFA structure */
NFA *nfa;
{
if( !nfa )
printf("(NULL)");
else
{
printf( "+--Structure at 0x%04x-------------------\n", nfa );
printf( "|edge = 0x%x (%c)\n", nfa->edge, nfa->edge );
printf( "|next = 0x%x\n", nfa->next );
printf( "|next2 = 0x%x\n", nfa->next2 );
printf( "|accept = <%s>\n", nfa->accept );
printf( "|bitset = " );
printccl( nfa->bitset );
printf("\n");
printf( "+----------------------------------------\n" );
}
}
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
PRIVATE char *getline()
{
static char buf[80];
printf("%d: ", Lineno++ );
gets( buf );
return *buf ? buf : NULL ;
}
/*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
main( argc, argv )
char **argv;
{
NFA *nfa, *start_state;
int max_state;
Verbose = 2;
nfa = thompson(getline, &max_state, &start_state);
}
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -