📄 myreg.cpp
字号:
memset( sets, 0, sizeof(SET*)*REG_INPUT_TABLE_SIZE );
// union target of same input
while( n = *r++ ) {
c = n->input;
while ( *c ) {
offset = (int)(*c++);
sets[offset] = union_set( inter->factory, n->follow_set, sets[offset] );
}
}
// for each target
for( offset = REG_MAX_CHAR; offset > 0; offset-- ) {
if( sets[offset] ) {
// printf ( "Check dstate for input %c(%x)\n", offset, offset );
// check if it is a new target
nds = find_dstate( dfa->hash_table, sets[offset] );
// add new target
if ( nds == NULL ) {
nds = add_new_dstate( dfa, sets[offset] );
}
new_dtran( ds, nds , (char) offset );
}
}
// find next unmarked dstate
while( unmarked < dfa->count ) {
if( (ds = dfa->dstates[unmarked])->marked == 0 ) {
// printf( "Goto Unmarked DSTATE: %x\n", ds );
break;
}
unmarked++;
}
}
return dfa;
}
/*
Enlarge the node pool of the interp.
Double the pool size
*/
void enlarge_node_pool( REG_INTERP* inter ) {
inter->node_pool_size *= 2;
inter->nodes = (REG_NODE*)realloc( inter->nodes, inter->node_pool_size );
if ( inter->nodes == NULL ) {
perror( "Fail to enlarge the node pool" );
}
}
/*
Initialize the reg node
*/
void init_reg_node( REG_NODE* n ) {
memset( n, 0, sizeof(REG_NODE) );
n->input = (char*) malloc(sizeof(char)*DEFAULT_INPUT_POOL_SIZE);
memset( n->input, 0, sizeof(char)*DEFAULT_INPUT_POOL_SIZE);
n->input_size = DEFAULT_INPUT_POOL_SIZE;
n->input_count = 0;
n->input[0] = 0;
}
// reg node input operations
void append_reg_node_input( REG_NODE* n, char start, char end ) {
// Make sure that start < end. If not swap them.
if ( end < start ) {
swap( start, end );
}
// New count
int count = end - start + 1 + n->input_count;
if ( count > n->input_size - 1 ) {
// Enlarge the input pool
n->input_size = ( count / DEFAULT_INPUT_POOL_SIZE + 1 ) * DEFAULT_INPUT_POOL_SIZE;
n->input = (char*) realloc( n->input, n->input_size*sizeof(char) );
}
// write at the tail
char* w = n->input + n->input_count;
for ( char i = start; i <= end; i++ ) {
*w++ = (char)i;
}
*w = 0;
// Change count after append input
n->input_count = count;
}
void append_reg_node_input( REG_NODE* n, char c ) {
append_reg_node_input( n, c, c );
}
void set_reg_node_input( REG_NODE* n, char c ) {
n->input[0] = c;
n->input[1] = 0;
n->input_count = 1;
}
void set_reg_node_input( REG_NODE* n, char start, char end ) {
n->input_count = 0;
n->input[0] = 0;
append_reg_node_input( n , start, end );
}
/*
<re> ::= <expr> { <expr> }
| <re> '|' <re>
<expr> ::= <term>
| <term> '*'
<term> ::= <label>
| '(' <re> ')'
<label> ::= <symbol>
| '[' <range> { <range> } ']'
| '[' ']' { <range> } ']'
| '[' '^' <range> { <range> } ']'
| '[' '^' ']' { <range> } ']'
<range> ::= <symbol>
| <symbol> '-' <symbol>
<symbol> ::= '.'
| 0 .. n (any element of alphabet)
| '\' <symbol>
*/
REG_NODE* re( REG_INTERP* inter, REG_CHAR_TYPE end_mask ) {
//printf ( "re\n" );
/*
<re> ::= <expr> { <expr> }
| <re> '|' <re>
*/
// current sub root
REG_NODE* n = expr( inter );
char c = 0;
// char type
REG_CHAR_TYPE t = 0;
int end = 0;
inter->sub_re_count++;
do {
c = LOOKAHEAD(inter);
t = GET_REG_CHAR_TYPE( c );
// check if the re is end or not
if ( IS_END_MARK( t ) ) {
if ( c == 0 ) {
// end of the string, always abort
if ( end_mask != END_MARK ) {
printf("Unexpected abort at the end of the string!!!");
exit(-1);
}
end = 1;
break;
} else {
if ( end_mask & t ) {
// this is the end char we are expecting
// end here
end = 1;
break;
}
}
}
// check if the re is followed by operator OR
if ( c == '|' ) {
READ(inter);
end = 0;
break;
}
n = oper_join( inter, n, expr( inter ) );
} while( 1 );
if ( end ) {
// <re> ::= <expr> { <expr> } ends
return n;
} else {
// <re> ::= <re> '|' <re>
n = oper_or( inter, n, re( inter, end_mask ) );
}
return n;
}
REG_NODE* expr( REG_INTERP* inter ) {
//printf ( "expr\n" );
/*
<expr> ::= <term>
| <term> '*'
| <term> '+'
*/
REG_NODE* n = term( inter );
if ( LOOKAHEAD(inter) == '*' ) {
// <term> '*'
n = oper_aster( inter, n );
READ(inter);
} else {
if ( LOOKAHEAD(inter) == '+' ) {
// <term> '+'
n = oper_plus( inter, n );
READ(inter);
}
}
return n;
}
REG_NODE* term( REG_INTERP* inter ) {
//printf ( "term\n" );
/*
<term> ::= <label>
| '(' <re> ')'
*/
REG_NODE* n = NULL;
if ( LOOKAHEAD( inter ) == '(' ) {
READ( inter );
n = re( inter, END_BY_PAREN | END_MARK );
if ( LOOKAHEAD( inter ) == ')' ) {
READ( inter );
} else {
// unexpected end
perror( "Unexpected end!!!" );
exit(-1);
}
} else {
n = label( inter );
}
return n;
}
REG_NODE* label( REG_INTERP* inter ) {
//printf ("label\n" );
/*
<label> ::= <symbol>
| '[' <range> { <range> } ']'
| '[' '^' <range> { <range> } ']'
*/
// support symbol only
// currently
REG_NODE* s = NULL;
if ( LOOKAHEAD(inter) == '[' ) {
READ(inter);
// <range>
if ( LOOKAHEAD(inter) == '^' ) {
// skip ^
READ(inter);
// anti range
s = anti_ranges( inter );
} else {
// range
s = ranges( inter );
}
// skip ]
READ(inter);
} else {
s = symbol( inter );
}
return s;
}
REG_NODE* symbol( REG_INTERP* inter ) {
//printf ("symbol\n" );
/*
<symbol> ::= '.'
| 0 .. n (any element of alphabet)
| '\' <symbol>
*/
REG_NODE* n = NULL;
char c = 0;
REG_CHAR_TYPE t = 0;
if ( get_symbol_char( inter, c, t ) ) {
if ( IS_WILDCHAR( t ) ) {
n = wildchar_node( inter, c );
} else {
n = char_node( inter, c );
}
} else {
printf( "Fail to get symbol text!!!\n" );
}
return n;
}
int escape_char( REG_INTERP* inter, char& c_out ) {
/*
\n
\t
\x[hex numbers]
\0[octal numbers]
\\
\[oper char]
*/
assert( '\\' == READ( inter ) );
char c = READ(inter);
REG_CHAR_TYPE t = 0;
char v = 0;
switch ( c ) {
case 'n':
// new line
c_out = '\n';
break;
case 't':
// tab
c_out = '\t';
break;
case '\\':
c_out = '\\';
break;
case 'x':
// hex
c_out = 0;
t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
while( IS_HEX(t) ) {
c = READ(inter);
if( c >= '0' && c <= '9' ) {
v = c - '0';
}
if ( c >= 'a' && c <='f' ) {
v = c - 'a' + 10;
}
if ( c >= 'A' && c <= 'F') {
v = c - 'A' + 10;
}
c_out = ( c_out << 4 ) + v;
t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
}
break;
case 0:
// oct
c_out = 0;
t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
while( IS_OCT(t) ) {
c = READ(inter);
v = c - '0';
c_out = ( c_out << 3 ) + v;
t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
}
break;
default:
c_out = c;
}
return 1;
}
int get_symbol_char( REG_INTERP* inter, char& c_out, REG_CHAR_TYPE& t_out ) {
if ( LOOKAHEAD(inter) == '\\' ) {
// this is an escape char
t_out = REG_TEXT;
return escape_char( inter, c_out );
} else {
// other char
c_out = READ( inter );
t_out = GET_REG_CHAR_TYPE(c_out);
}
return 1;
}
#define MAX_RANGE_LIST_SIZE 128
REG_NODE* anti_ranges( REG_INTERP* inter ) {
char start = 0, end = 0;
char range_list[MAX_RANGE_LIST_SIZE];
range_list[0] = 0;
REG_CHAR_TYPE t = 0;
REG_NODE* n = NULL;
// <ranges> ::= <ranges> <range>
while( range( inter, start, end, t ) ) {
if ( IS_WILDCHAR( t ) ) {
// reset the range
insert_range( range_list, '\n', '\n' );
} else {
// reverse
if ( start > 1 ) {
insert_range( range_list, 1, start );
}
if ( end < REG_MAX_CHAR ) {
insert_range( range_list, end + 1, REG_MAX_CHAR );
}
}
}
if ( range_list[0] == 0 ) {
// empty range, illegal
printf( "Empty range is not allowed" );
exit(-1);
} else {
merge_range_list( range_list );
n = range_node( inter, range_list );
}
return n;
}
REG_NODE* ranges( REG_INTERP* inter ) {
char start = 0, end = 0;
char range_list[MAX_RANGE_LIST_SIZE];
range_list[0] = 0;
REG_CHAR_TYPE t = 0;
REG_NODE* n = NULL;
// <ranges> ::= <ranges> <range>
while( range( inter, start, end, t ) ) {
if ( IS_WILDCHAR( t ) ) {
// reset the range
insert_range( range_list, 1, (char)((int)('\n') -1 ) );
insert_range( range_list, (char)((int)('\n') +1 ), REG_MAX_CHAR );
} else {
insert_range( range_list, start, end );
}
}
if ( range_list[0] == 0 ) {
// empty range, illegal
printf( "Empty range is not allowed" );
exit(-1);
} else {
merge_range_list( range_list );
n = range_node( inter, range_list );
}
return n;
}
void insert_range( char* range_list, char start, char end ) {
size_t len = strlen( range_list );
char* r = range_list;
char* tail = range_list + len;
if ( len ) {
// scan from back
// *r -- current start
// *(r+1) -- current end
// *(r+2) -- next start
// *(r+3) -- next end
while( *r ) {
// the start point in unique
if ( *r == start ) {
if ( *(r+1) < end ) {
// enlarge the range
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -