⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 myreg.cpp

📁 我的一个利用有限状态机的正则表达式的实现。
💻 CPP
📖 第 1 页 / 共 3 页
字号:
		memset( sets, 0, sizeof(SET*)*REG_INPUT_TABLE_SIZE );

		// union target of same input
		while( n = *r++ ) {
			c = n->input;
			while ( *c ) {
				offset = (int)(*c++);
				sets[offset] = union_set( inter->factory, n->follow_set, sets[offset] );
			}
		}

		// for each target
		for( offset = REG_MAX_CHAR; offset > 0; offset-- ) {
			if( sets[offset] ) {
				// printf ( "Check dstate for input %c(%x)\n", offset, offset );
				// check if it is a new target
				nds = find_dstate( dfa->hash_table, sets[offset] );
				// add new target
				if ( nds == NULL ) {
					nds = add_new_dstate( dfa, sets[offset] );					
				}
				new_dtran( ds, nds , (char) offset );				
			}
		}

		// find next unmarked dstate
		while( unmarked < dfa->count ) {
			if( (ds = dfa->dstates[unmarked])->marked == 0 )  {
				// printf( "Goto Unmarked DSTATE: %x\n", ds );	
				break;
			}
			unmarked++;
		}		
	}

	return dfa;
}

/*
    Enlarge the node pool of the interp.
    Double the pool size
*/
void enlarge_node_pool( REG_INTERP* inter ) {
    inter->node_pool_size *= 2;
    inter->nodes = (REG_NODE*)realloc( inter->nodes, inter->node_pool_size );
    if ( inter->nodes == NULL ) {
        perror( "Fail to enlarge the node pool" );
    }
}

/*
	Initialize the reg node
*/
void init_reg_node( REG_NODE* n ) {
	memset( n, 0, sizeof(REG_NODE) );
	n->input = (char*) malloc(sizeof(char)*DEFAULT_INPUT_POOL_SIZE);
	memset( n->input, 0, sizeof(char)*DEFAULT_INPUT_POOL_SIZE);
	n->input_size = DEFAULT_INPUT_POOL_SIZE;
	n->input_count = 0;
	n->input[0] = 0;
}

// reg node input operations
void append_reg_node_input( REG_NODE* n, char start, char end ) {
	// Make sure that start < end. If not swap them.
	if ( end < start ) {
		swap( start, end );
	}
	// New count
	int count = end - start + 1 + n->input_count;

	if ( count > n->input_size - 1 ) {
		// Enlarge the input pool
		n->input_size = ( count / DEFAULT_INPUT_POOL_SIZE + 1 ) * DEFAULT_INPUT_POOL_SIZE;
		n->input = (char*) realloc( n->input, n->input_size*sizeof(char) );
	}
	// write at the tail
	char* w = n->input + n->input_count;
	
	for ( char i = start; i <= end; i++ ) {
		*w++ = (char)i;
	}
	*w = 0;
	// Change count after append input
	n->input_count = count;
}

void append_reg_node_input( REG_NODE* n, char c ) {
	append_reg_node_input( n, c, c );
}


void set_reg_node_input( REG_NODE* n, char c ) {
	n->input[0] = c;
	n->input[1] = 0;
	n->input_count = 1;
}

void set_reg_node_input( REG_NODE* n, char start, char end ) {
	n->input_count = 0;
	n->input[0] = 0;
	append_reg_node_input( n , start, end );
}

/*
<re>    ::= <expr> { <expr> }
         |  <re> '|' <re>
<expr>  ::= <term>
         |  <term> '*'
<term>  ::= <label>
         |  '(' <re> ')'
<label> ::= <symbol>
         | '[' <range> { <range> } ']'
         | '[' ']' { <range> } ']'
         | '[' '^' <range> { <range> } ']'
         | '[' '^' ']' { <range> } ']'
<range> ::= <symbol>
         | <symbol> '-' <symbol>
 
<symbol> ::= '.'
          | 0 .. n (any element of alphabet)
          | '\' <symbol>
*/


REG_NODE* re( REG_INTERP* inter, REG_CHAR_TYPE end_mask ) {
	//printf ( "re\n" );
/* 
<re>    ::= <expr> { <expr> }
         |  <re> '|' <re>
*/
    // current sub root
    REG_NODE* n = expr( inter );
    char c = 0;
    // char type
    REG_CHAR_TYPE t = 0;    
    int end = 0;
	inter->sub_re_count++;
    do {
        c = LOOKAHEAD(inter);
        t = GET_REG_CHAR_TYPE( c );
        // check if the re is end or not
        if ( IS_END_MARK( t ) ) {
            if ( c == 0 ) {
                // end of the string, always abort
                if ( end_mask != END_MARK ) {
                    printf("Unexpected abort at the end of the string!!!");
                    exit(-1);
                }
                end = 1;
                break;   
            } else {
                if ( end_mask & t ) {
                    // this is the end char we are expecting
                    // end here
                    end = 1;
                    break;
                }
            }
        }
        // check if the re is followed by operator OR
        if ( c == '|' ) {
            READ(inter);
            end = 0;
            break;   
        }
		n = oper_join( inter, n, expr( inter ) );
    } while( 1 );
    
    if ( end ) {
        // <re> ::= <expr> { <expr> } ends
        return n;
    } else {
        // <re> ::= <re> '|' <re>
        n = oper_or( inter, n, re( inter, end_mask ) );
    }
    
    return n;
}

REG_NODE* expr( REG_INTERP* inter ) {
	//printf ( "expr\n" );
/*
<expr>  ::= <term>
         |  <term> '*'
		 |  <term> '+'
*/
	REG_NODE* n = term( inter );
	if ( LOOKAHEAD(inter) == '*' ) {
		// <term> '*'
		n = oper_aster( inter, n );
		READ(inter);
	} else {		
		if ( LOOKAHEAD(inter) == '+' ) {
			// <term> '+'
			n = oper_plus( inter, n );
			READ(inter);
		}
	}
	return n;
}

REG_NODE* term( REG_INTERP* inter ) {

	//printf ( "term\n" );
/*
<term>  ::= <label>
         |  '(' <re> ')'
*/
	REG_NODE* n = NULL;
	if ( LOOKAHEAD( inter ) == '(' ) {
		READ( inter );
		n = re( inter, END_BY_PAREN | END_MARK );
		if ( LOOKAHEAD( inter ) == ')' ) {
			READ( inter );
		} else {
			// unexpected end
			perror( "Unexpected end!!!" );
			exit(-1);
		}
	} else {
		n = label( inter );
	}

	return n;
}

REG_NODE* label( REG_INTERP* inter ) {
	//printf ("label\n" );
/*
<label> ::= <symbol>
         | '[' <range> { <range> } ']'
         | '[' '^' <range> { <range> } ']'         
*/
	// support symbol only
	// currently

	REG_NODE* s = NULL;
	if ( LOOKAHEAD(inter) == '[' ) {
		READ(inter);
		// <range>
		if ( LOOKAHEAD(inter) == '^' ) {
			// skip ^
			READ(inter);
			// anti range
			s = anti_ranges( inter );
		} else {
			// range
			s = ranges( inter );
		}
		// skip ]
		READ(inter);
	} else {
		s = symbol( inter );
	}

	return s;
}


REG_NODE* symbol( REG_INTERP* inter ) {
	//printf ("symbol\n" );
/*
<symbol> ::= '.'
          | 0 .. n (any element of alphabet)
          | '\' <symbol>
*/

	REG_NODE* n = NULL;
	char c = 0;
	REG_CHAR_TYPE t = 0;
	
	if ( get_symbol_char( inter, c, t ) ) {
		if ( IS_WILDCHAR( t ) ) {
			n = wildchar_node( inter, c );
		} else {
			n = char_node( inter, c );
		}
	} else {
		printf( "Fail to get symbol text!!!\n" );
	}

	return n;
}

int escape_char( REG_INTERP* inter, char& c_out ) {
	/*
		\n
		\t
		\x[hex numbers]
		\0[octal numbers]
		\\
		\[oper char]
	*/

	assert( '\\' == READ( inter ) );
	char c = READ(inter);
	REG_CHAR_TYPE t = 0;
	char v = 0;
	switch ( c ) {
	case 'n':
		// new line
		c_out = '\n';
		break;
	case 't':
		// tab
		c_out = '\t';
		break;
	case '\\':
		c_out = '\\';
		break;
	case 'x':
		// hex
		c_out = 0;
		t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
		while( IS_HEX(t) ) {
			c = READ(inter);
			if( c >= '0' && c <= '9' ) {
				v = c - '0';
			}
			if ( c >= 'a' && c <='f' ) {
				v = c - 'a' + 10;
			}
			if ( c >= 'A' && c <= 'F') {
				v = c - 'A' + 10;
			}
			c_out = ( c_out << 4 ) + v;
			t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
		}
		break;
	case 0:
		// oct
		c_out = 0;
		t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
		while( IS_OCT(t) ) {
			c = READ(inter);
			v = c - '0';
			c_out = ( c_out << 3 ) + v;
			t = GET_REG_CHAR_TYPE( LOOKAHEAD( inter ) );
		}
		break;
	default:
		c_out = c;
	}

	return 1;
}

int get_symbol_char( REG_INTERP* inter, char& c_out, REG_CHAR_TYPE& t_out ) {
	if ( LOOKAHEAD(inter) == '\\' ) {
		// this is an escape char
		t_out = REG_TEXT;
		return escape_char( inter, c_out );
	} else {
		// other char
		c_out = READ( inter );
		t_out = GET_REG_CHAR_TYPE(c_out);
	}
	return 1;
}

#define MAX_RANGE_LIST_SIZE 128

REG_NODE* anti_ranges( REG_INTERP* inter ) {
	char start = 0, end = 0;
	char range_list[MAX_RANGE_LIST_SIZE];
	range_list[0] = 0;
	REG_CHAR_TYPE t = 0;
	REG_NODE* n = NULL;
	// <ranges> ::= <ranges> <range>
	while( range( inter, start, end, t ) ) {
		if ( IS_WILDCHAR( t ) ) {
			// reset the range
			insert_range( range_list, '\n', '\n' );
		} else {
			// reverse
			if ( start > 1 ) {
				insert_range( range_list, 1, start );
			}
			if ( end < REG_MAX_CHAR ) {
				insert_range( range_list, end + 1, REG_MAX_CHAR );
			}
		}
	}

	if ( range_list[0] == 0 ) {
		// empty range, illegal
		printf( "Empty range is not allowed" );
		exit(-1);
	} else {
		merge_range_list( range_list );
		n = range_node( inter, range_list );
	}

	return n;
}


REG_NODE* ranges( REG_INTERP* inter ) {
	char start = 0, end = 0;
	char range_list[MAX_RANGE_LIST_SIZE];
	range_list[0] = 0;
	REG_CHAR_TYPE t = 0;
	REG_NODE* n = NULL;
	// <ranges> ::= <ranges> <range>
	while( range( inter, start, end, t ) ) {
		if ( IS_WILDCHAR( t ) ) {
			// reset the range
			insert_range( range_list, 1, (char)((int)('\n') -1 ) );
			insert_range( range_list, (char)((int)('\n') +1 ), REG_MAX_CHAR );
		} else {
			insert_range( range_list, start, end );
		}
	}	

	if ( range_list[0] == 0 ) {
		// empty range, illegal
		printf( "Empty range is not allowed" );
		exit(-1);
	} else {
		merge_range_list( range_list );
		n = range_node( inter, range_list );
	}

	return n;
}

void insert_range( char* range_list, char start, char end ) {

	size_t len = strlen( range_list );
	char* r = range_list;
	char* tail = range_list + len;

	if ( len ) {
		// scan from back	
		// *r -- current start
		// *(r+1) -- current end
		// *(r+2) -- next start
		// *(r+3) -- next end
		while( *r ) {
			// the start point in unique 
			if ( *r == start ) {
				if ( *(r+1) < end ) {
					// enlarge the range

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -