⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 myreg.cpp

📁 我的一个利用有限状态机的正则表达式的实现。
💻 CPP
📖 第 1 页 / 共 3 页
字号:
					*(r+1) = end;					
				}
				return;
			}
			// insert point?
			if ( *r > start ) { break; }
			// goto next
			r+=2;
		}

		if ( *r ) {
			// move the rest to the tail
			while( tail >= r ) {
				*(tail+2) = *tail;
				*(tail+3) = *(tail+1);
				tail -= 2;
			}
		}
	} 

	// write the range to the insert point
	len += 2;
	*r = start;
	*(r+1) = end;	
	*(range_list + len ) = 0;
}

void merge_range_list( char* range_list ) {

	assert( strlen( range_list ) > 0 );

	char tmp_range_list[MAX_RANGE_LIST_SIZE];
	char* r = range_list;
	char* w = tmp_range_list;
	char s1 = 0, s2 = 0, e1 = 0, e2 = 0;
	s1 = *r++;
	e1 = *r++;	

	while( *r ) {
		s2 = *r++;
		e2 = *r++;
		// according to the insert operation, s2 is always less than s1, and there is no same s
		// there is three situation ( in x axis )
		// s1,e1,s2,e2  -- goto [s2,e2]
		// s1,s2,e1,e2	-- replace e1 with e2
		// s1,s2,e2,e1  -- ignore [s2,e2]
		if ( s2 > e1 ) { 
			// write current range
			*w++ = s1;
			*w++ = e1;
			// goto [s2,e2]
			s1 = s2;
			e1 = e2;
			continue;
		} 
		if ( s2 <= e1 && e1 <= e2 ) {
			e1 = e2;
			continue;
		}
	}
	// write the last range
	*w++ = s1;
	*w++ = e1;
	*w = 0;

	printf( "Src ranges %s\n", range_list );
	printf( "Merged ranges %s\n", tmp_range_list );

	strcpy( range_list, tmp_range_list );	
}

/*
<range> ::= <symbol>		 
         | <symbol> '-' <symbol>	     
*/
int range( REG_INTERP* inter, char& start_out, char& end_out, REG_CHAR_TYPE& t_out ) {

	REG_CHAR_TYPE t = 0;

	if( LOOKAHEAD(inter) == ']' ) return 0;

	get_symbol_char ( inter, start_out, t );
	if ( LOOKAHEAD(inter) == '-' ) {
		READ(inter);
		// range text
		get_symbol_char( inter, end_out, t );
		t_out = REG_TEXT;
		if( end_out < start_out ) {
			swap( start_out, end_out );
		}
	} else {
		// this could be wildchar
		t_out = t;
		end_out = start_out;
	}

	printf( "Range %c %c\n", start_out, end_out );
	
	return 1;
}



inline REG_NODE* range_node( REG_INTERP* inter, char* list ) {

	ENLARGE_POOL_WHEN_NECESSARY(inter)
    REG_NODE* n = GET_FREE_NODE(inter);
    
    n->type = REG_TEXT | REG_RANGE_INPUT;
	n->nullable = 0;

	char* r = list;
	char c1 = 0, c2 = 0;
	while ( *r ) {
		c1 = *r++;
		c2 = *r++;
		if ( c1 == c2 ) {
			append_reg_node_input( n, c1 );
		} else {
			append_reg_node_input( n, c1, c2 );
		}
	}

	n->first_set = new_set( inter->factory, n );
    n->last_set = n->first_set;
	// empty set
	n->follow_set = NULL;

	return n;
}

inline REG_NODE* char_node( REG_INTERP* inter, char c ) {

	//printf ("char_node %c\n", c );

    ENLARGE_POOL_WHEN_NECESSARY(inter)
    REG_NODE* n = GET_FREE_NODE(inter);
    
    n->type = REG_TEXT;
	set_reg_node_input( n, c );    
    n->nullable = 0;

    n->first_set = new_set( inter->factory, n );
    n->last_set = n->first_set;
	// empty set
	n->follow_set = NULL;
	
    return n;
}

inline REG_NODE* wildchar_node( REG_INTERP* inter, char c ) {
	
	char range[] = { 1, '\n'-1, '\n'+1, REG_MAX_CHAR, 0 };
    // REG_NODE* n = char_node( inter, c );
    // n->type = REG_WILDCHAR;
    // set_reg_node_input( n, c );
    // n->nullable = 0;

	// n->first_set = new_set( n );
    // n->last_set = n->first_set;
	// empty set
	// n->follow_set = NULL;
    
    // return n;
	return range_node( inter, range );
}

inline REG_NODE* oper( REG_INTERP* inter, char op_char, REG_NODE* l, REG_NODE* r ) {
    ENLARGE_POOL_WHEN_NECESSARY(inter)
    REG_NODE* op = GET_FREE_NODE(inter);
    // assign binary operator
	set_reg_node_input( op, op_char );    
    op->left = l;
    op->right = r;
    op->type = REG_OPER;
	if ( l ) { l->parent = op; }
	if ( r ) { r->parent = op; }
	// empty follow set by default
	op->follow_set = NULL;
	
    return op;
}

inline REG_NODE* oper_join( REG_INTERP* inter, REG_NODE* l, REG_NODE* r ) {
	printf ( "join %x %x\n", l, r );
		
	assert( l != NULL && r != NULL );
	assert( l != r );

	// get operator node 
    REG_NODE* op = oper( inter, '.', l , r );
    // nullable is ture if both side is nullable
    op->nullable = l->nullable && r->nullable;
    // operator will always have a set
    // calculate the first set
    if ( l->nullable ) {
        // union the first set
        op->first_set = union_set( inter->factory, l->first_set, r->first_set );
    } else {		
        // use the first set of l only
        op->first_set = l->first_set;        
    }
    // calculate the last set
    if ( r->nullable ) {
        // union the last set
        op->last_set = union_set( inter->factory, l->last_set, r->last_set );
    } else {
        // use the last set of r only
        op->last_set = r->last_set;        
    }

	// calculate followset
	// the followset of every node in the last set of l contains the first set of r
	// if the node is char node, the first set and the last set is itself
	
	REG_NODE** n = (REG_NODE**)l->last_set->set;
	while( *n ) {
		(*n)->follow_set = union_set( inter->factory, (*n)->follow_set, r->first_set );
		n++;
	}
    
    return op;
}

inline REG_NODE* oper_or( REG_INTERP* inter, REG_NODE* l, REG_NODE* r ) {
	printf ( "or %x %x\n", l, r );

	assert( l != NULL && r != NULL );
	assert( l != r );

    // get operator node
    REG_NODE* op = oper( inter, '|', l , r );
    // nullable is true if any side is nullable
    op->nullable = l->nullable || r->nullable;    
    
	// calculate first set and last set
    // union the first set
    op->first_set = union_set( inter->factory, l->first_set, r->first_set );
	// union the last set
	op->last_set = union_set( inter->factory, l->last_set, r->last_set );
    
    return op;
}

inline REG_NODE* oper_aster( REG_INTERP* inter, REG_NODE* l ) {
	printf ( "aster %x\n", l );
	REG_NODE* op = oper( inter, '*', l, NULL );
    op->nullable = 1;
    
    op->first_set = l->first_set;
    op->last_set = l->last_set;
    
	// calculate followset
	// the followset of every node in the last set of l contains the first set of l
	// if the node is char node, the first set and the last set is itself
	REG_NODE** n = (REG_NODE**)l->last_set->set;
	// REG_NODE** fl = NULL;
	while( *n ) {
		(*n)->follow_set = union_set( inter->factory, (*n)->follow_set, l->first_set );
		n++;
	}
    
    return op;
}

inline REG_NODE* oper_plus( REG_INTERP* inter, REG_NODE* l ) {
	printf ( "plus %x\n", l );
	REG_NODE* op = oper( inter, '*', l, NULL );
    op->nullable = 0;	// different from the aster node, it not nullable
    
    op->first_set = l->first_set;
    op->last_set = l->last_set;
    
	// calculate followset
	// the followset of every node in the last set of l contains the first set of l
	// if the node is char node, the first set and the last set is itself
	REG_NODE** n = (REG_NODE**)l->last_set->set;
	// REG_NODE** fl = NULL;
	while( *n ) {
		(*n)->follow_set = union_set( inter->factory, (*n)->follow_set, l->first_set );
		n++;
	}
    
    return op;
}

// print node to stdout
void PrintNode( REG_NODE* n ) {

	printf( "Node: %x\n", n );
	printf( "TYPE: %x %s %s\n", n->type,
			( IS_TEXT( n->type ) ? "REG_TEXT" : "" ), 
			( IS_OPER( n->type ) ? "REG_OPER" : "" )
		  );
	printf( "PARENT: %x\n", n->parent );
    printf( "LEFT: %x\nRIGHT: %x\n", n->left, n->right );
	printf( "INPUT: %s\n", n->input );
    printf( "NULLABLE: %s\n", ( n->nullable ? "true" : "false" ) );
    
    if ( n->first_set ) {
		printf( "FIRST SET\n" );
		PrintSet( n->first_set );
	} else {
		printf( "No first set\n" );
	}
	if ( n->last_set ) {
		printf ( "LAST SET\n" );
		PrintSet( n->last_set );
	} else {
		printf( "No last set\n" );
	}

	if ( n->follow_set ) {
		printf( "FOLLOWER SET\n" );
		PrintSet( n->follow_set );
	} else {
		printf( "No follower set\n" );
	}
}


void PrintREXP( REG_NODE* root ) {

	if ( IS_OPER(root->type) && root->input[0] == '*' ) {
		printf ( "(" );
	}

	if ( root->left ) {
		PrintREXP( root->left );
	}

	if ( IS_TEXT(root->type) ) {
		printf( "%s", root->input );
	} 

	if ( IS_OPER( root->type ) ) {
		switch ( root->input[0] ) {
		case '*':
			printf( ")*\n" );
			break;
		case '|':
			printf( "\t|\n" );
			break;
		case '.':
			break;
		}
	}

	if ( root->right ) {
		PrintREXP( root->right );
	}	
}


void traverse( REG_NODE* root, int level ) {
	if ( root->left ) {
		traverse( root->left, level + 1 );
	} 
	if ( root->right ) {
		traverse( root->right, level + 1 );
	}
	printf( "LEVEL %x\n", level );
	PrintNode( root );
	printf( "============================================\n" );
}


// compile a regular expression
REG_INTERP* CompileReg( const char* text ) {

	char* p = NULL;

	if ( text[0] != '^' ) {
		// no force start mark
		p = (char*) malloc( strlen( text ) + 3 );
		p[0] = '.';
		p[1] = '*';
		strcpy( p+2, text );
	} else {
		p = (char*)text;
	}

	REG_INTERP* inter = new_reg_interp( p );
	
	PrintREXP( get_syntax_tree( inter ) );
	printf("\n");
	traverse( inter->root, 0 );

	get_dfa( inter );

	if ( p != text ) { free( p ); }

	return inter;
}

// match the regular expression
REG_EXPORT int MatchReg( REG_INTERP* inter, char* text ) {

	REG_DFA* dfa = inter->dfa;
	DSTATE* ds = dfa->dstates[0];
	DTRAN* dt = NULL;
	
	// printf( "Begin match %s\n", text );
	// printf( "Start at %x\n", ds );

	char* p = text;	
	char c = 0;
	while( c = *p++ ) {
		// printf( "Input %c\n", c );
		dt = find_dtran( ds, c );
		if ( dt ) {
			// there is target
			ds = dt->tar;			
			// printf( "Goto %x according to dtran %x\n", ds, dt );
		} else {
			// the string match iff there is no dtran and the state is end state
			// printf( "There is no dtran for input %c.\n", c );
			// printf( "The end state is %x(%s)\n", ds, (ds->end ? "END" : "NONE_END" ) );
			return ds->end;			
		}
	}

	// printf( "No more input\n" );
	// printf( "The end state is %x(%s)\n", ds, (ds->end ? "END" : "NONE_END" ) );
	
	return ( c == 0 && ds->end );	
}

// release the complied regular expression instance
REG_EXPORT ReleaseReg( REG_INTERP* inter ) {

	release_dfa( inter->dfa );

	for( size_t i = 0; i< inter->node_count; i++ ) {
		release_reg_node( &inter->nodes[i] );
	}
	free( inter->nodes );


	release_set_factory( inter->factory );

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -