📄 suffix_tree.c

📁 Language, Script, and Encoding Identification with String Kernel Classifiers
💻 C
📖 第 1 页 / 共 3 页
字号:
		else		{			(*chars_found)	= str_len;			(*edge_pos)		= str_len-1;		}#ifdef STATISTICS		counter++;#endif		return node;	}	else	{		/*find minimum out of edge length and string length, 		and scan it*/		if(str_len < length)			length = str_len;		for(*edge_pos=1, *chars_found=1; *edge_pos<length; (*chars_found)++,(*edge_pos)++)		{#ifdef STATISTICS			counter++;#endif			/*Compare current characters of the string and 			the edge. If equal - continue.*/			if(tree->tree_string[node->edge_label_start+*edge_pos] != tree->tree_string[str.begin+*edge_pos])			{				(*edge_pos)--;				return node;			}		}	}	/*The loop has advanced *edge_pos one too much.*/	(*edge_pos)--;	if((*chars_found) < str_len)		/*search is not done yet*/		*search_done = 0;	return node;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used inconstruction process only, and not for after-construction search of substrings. It is tailored to enable skipping (when we know a suffix is in the tree (when following a suffix link) we can avoid comparing all symbols of the edge by skipping its length immediately and thus save atomic operations - see Ukkonen's algorithm, skip trick). This function, in contradiction to the function trace_single_edge, 'sees' the whole picture, meaning itsearches a string in the whole tree and not just in a specific edge.  Input : The string, given in indice of the main string.  Output: (by value) the node where tracing has stopped.        (by reference) the edge position where last match occured,		the string position where last match occured, number of 		characters found, a flag for signaling whether search 		is done.*//**************************************************************/NODE* trace_string(SUFFIX_TREE*		tree, 				   /*node to start from*/				   NODE				*node,							   /*string to trace*/				   PATH				str,							   /*last matching position in edge*/				   DBL_WORD			*edge_pos,						   /*last matching position in tree string*/				   DBL_WORD			*chars_found,					   /*skip or not*/				   SKIP_TYPE		type)			{	/*This variable will be 1 when search is done. It is a 	return value from function trace_single_edge.*/	int	  search_done = 0;	/*This variable will hold the number of matching characters 	found in the current edge. It is a return value from 	function trace_single_edge.*/	DBL_WORD edge_chars_found;	*chars_found = 0;	while(search_done == 0)	{		*edge_pos		 = 0;		edge_chars_found = 0;		node			 = trace_single_edge(tree, node, str, edge_pos, &edge_chars_found, type, &search_done);		str.begin		+= edge_chars_found;		*chars_found	+= edge_chars_found;	}	return node;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used for substring search after tree construction is done. It simply traverses down the tree starting from the root until either the searched string is fully found ot one non-matching character is found.  In this function skipping is not enabled because we don't know wether the string is in the tree or not (see function trace_string above).  Input : The tree, the string W, given in actual characters           and the length of W.  Output: If the substring is found - returns the index of           the starting position of the substring in the 		  tree source string. If the substring is not           found - returns ST_ERROR.*//**************************************************************/DBL_WORD ST_FindSubstring( /*the suffix array*/			   SUFFIX_TREE*	tree,					   /*the substring to find*/			   unsigned char*	W,						   /*the length of W*/			   DBL_WORD	P)			{	/*Starts with the root's son that has the first character	of W as its incoming edge first character.*/	NODE* node = find_son(tree, tree->root, W[0]);	DBL_WORD k,j = 0, node_label_end;	/*Scan nodes down from the root until a leaf is reached 	or the substring is found.*/	while(node!=0)	{		k=node->edge_label_start;		node_label_end = get_node_label_end(tree,node);				/*Scan a single edge - compare each character with 		the searched string.*/		while(j<P && k<=node_label_end && tree->tree_string[k] == W[j])		{			j++;			k++;#ifdef STATISTICS			counter++;#endif		}				/*checking which of the stopping conditions are true*/		if(j == P)		{			/*W was found - it is a substring. Return its path			starting index.*/			return node->path_position;		}		else if(k > node_label_end)			/*Current edge is found to match, continue to 			next edge*/			node = find_son(tree, node, W[j]);		else		{			/*One non-matching symbols is found - W is not			a substring*/			return ST_ERROR;		}	}	return ST_ERROR;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used for substring search after tree construction is done. It simply traverses down the tree starting from the root until either the searched string is fully found ot one non-matching character is found.  In this function skipping is not enabled because we don't know wether the string is in the tree or not (see function trace_string above).  Input : The tree, the string W, given in actual characters           and the length of W.  Output: If the substring is found - returns all indices of           the starting position of the substrings found in the 	  tree source string. If the substring is not           found - returns ST_ERROR.*//**************************************************************/DBL_WORD ST_FindAllSubstring(    /*the suffix array*/				   SUFFIX_TREE*	  tree,				   /*the substring to find*/				   unsigned char* W,				   /*the length of W*/				   DBL_WORD	  P){	/*Starts with the root's son that has the first character	of W as its incoming edge first character.*/	NODE* node = find_son(tree, tree->root, W[0]);	DBL_WORD k,j = 0, node_label_end;	/*Scan nodes down from the root until a leaf is reached 	or the substring is found.*/	while(node!=0)	{		k=node->edge_label_start;		node_label_end = get_node_label_end(tree,node);				/*Scan a single edge - compare each character with 		the searched string.*/		while(j<P && k<=node_label_end && tree->tree_string[k] == W[j])		{printf("> %c\n",tree->tree_string[k]);			j++;			k++;#ifdef STATISTICS			counter++;#endif		}				/*checking which of the stopping conditions are true*/		if(j == P)		{			/*W was found - it is a substring. Print its path			starting index and the indices of all the nodes			below it.*///printf("* printChildren %d %d\n",j,P);			printChildren( node );//printf("**\n");			printNode( node );			return 0;		}		else if(k > node_label_end)			/*Current edge is found to match, continue to 			next edge*/			node = find_son(tree, node, W[j]);		else		{			/*One non-matching symbols is found - W is not			a substring*/			return ST_ERROR;		}	}	return ST_ERROR;}void printChildren( NODE* node ) {	NODE* child;	/* Print the path_positions of the immediate child nodes */		child = node->sons;	while ( child != 0 ) {//printf("**** path_position=%lu\n", child->path_position );//printf("**** father->path_position=%lu\n", child->father->path_position );		if ( !(child->path_position == child->father->path_position) ) {			/* Print the path_position of child *///printf(" child != father\n" );			printNode( child );		}		if ( child->sons != 0 ) {//printf(".. son != 0 \n" );			printChildren( child );		}//printf(".. go right\n" );		/* Move to the next node */		child = child->right_sibling;	}	return;}void printNode( NODE* node ) {	if ( node != 0 ) {		DBL_WORD temp = node->path_position;		printf( "%lu\n", temp );	}}	/**************************************************************//**************************************************************//*Follows the suffix link of the source node according to Ukkonen's rules. Input : The tree, and pos. pos is a combination of the source         node and the position in its incoming edge where 		suffix ends.Output: The destination node that represents the longest         suffix of node's path. Example: if node represents         the path "abcde" then it returns the node that         represents "bcde".*//**************************************************************/void follow_suffix_link(SUFFIX_TREE* tree, POS* pos){	/*gama is the string between node and its father, in case	node doesn't have a suffix link.*/	PATH		gama;					/*dummy argument for trace_string function.*/	DBL_WORD	chars_found = 0;			if(pos->node == tree->root)	{		return;	}	/*If node has no suffix link yet or in the middle of an	edge - remember the edge between the node and its father	(gama) and follow its father's suffix link (it must have	one by Ukkonen's lemma). After following, trace down 	gama - it must exist in the tree (and thus can use the 	skip trick - see trace_string function description).*/	if(pos->node->suffix_link == 0 || is_last_char_in_edge(tree,pos->node,pos->edge_pos) == 0)	{		/*if the node's father is the root, than no use 		following it's link (it is linked to itself). Tracing		from the root (like in the naive algorithm) is required		and is done by the calling function SEA uppon recieving		a return value of tree->root from this function.*/		if(pos->node->father == tree->root)		{			pos->node = tree->root;			return;		}				/*Store gama - the indices of node's incoming edge.*/		gama.begin		= pos->node->edge_label_start;		gama.end		= pos->node->edge_label_start + pos->edge_pos;		/*Follow father's suffix link*/		pos->node		= pos->node->father->suffix_link;		/*down-walk gama back to suffix_link's son*/		pos->node		= trace_string(tree, pos->node, gama, &(pos->edge_pos), &chars_found, skip);	}	else	{		/*If a suffix link exists - just follow it.*/		pos->node		= pos->node->suffix_link;		pos->edge_pos	= get_node_label_length(tree,pos->node)-1;	}}/**************************************************************//**************************************************************//*Creates a suffix link between node and the node 'link' which represents its largest suffix. The function could be avoided but is needed to monitor the creation of suffix links when debuging or changing the tree.Input : The node to link from, the node to link to.Output: None.*//**************************************************************/void create_suffix_link(NODE* node, NODE* link){	node->suffix_link = link;}/**************************************************************//**************************************************************//*Single-Phase-Algorithm (see Ukkonen's algorithm). Ensure that a certain extension is in the tree.1. Follows the current node's suffix link.2. Check whether the rest of the extension is in the tree.3. If it is - reports the calling function SPA of rule 3 (= current   phase is done).4. If it's not - inserts it by applying rule 2.  Input : The tree, pos - the node and position in its incoming           edge where extension begins, str - the starting and 		  ending indices of the extension, a flag indicating 		  whether the last phase ended by rule 3 (last extension		  of the last phase already existed in the tree - and if		  so, the current phase starts at not following the 		  suffix link of the fisrt extension).  Output: The rule that was applied to that extension. Can be 3          (phase is done) or 2 (a new leaf was created).*//**************************************************************/void   SEA(	SUFFIX_TREE*	tree, 			POS*            pos,			PATH			str, 			DBL_WORD*		rule_applied,			char			after_rule_3){	DBL_WORD		chars_found = 0,path_pos = str.begin;	NODE			*tmp; #ifdef DEBUG		ST_PrintTree(tree);	printf("extension: %lu  phase+1: %lu",str.begin, str.end);	if(after_rule_3 == 0)		printf("   followed from (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);	else		printf("   starting at (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);#endif#ifdef STATISTICS	counter++;#endif	/*follow suffix link only if it's not the first extension 	after rule 3 was applied*/	if(after_rule_3 == 0)		follow_suffix_link(tree, pos);#ifdef DEBUG	#ifdef STATISTICS	if(after_rule_3 == 0)		printf("to (%lu,%lu | %lu). counter: %lu\n", pos->node->edge_label_start, get_node_label_end(tree,pos->node),pos->edge_pos,counter);	else		printf(". counter: %lu\n", counter);#endif#endif	/*if node is root - trace whole string starting from 	the root, else - trace last character only.*/	if(pos->node == tree->root)	{		pos->node = trace_string(tree, tree->root, str, &(pos->edge_pos), &chars_found, no_skip);	}	else	{		str.begin = str.end;		chars_found = 0;		/*consider 2 cases: 				1. last character matched is the last of its edge		*/		if(is_last_char_in_edge(tree,pos->node,pos->edge_pos))		{			/*trace only last symbol of str, search in the 			NEXT edge (node).*/			tmp = find_son(tree, pos->node, tree->tree_string[str.end]);			if(tmp != 0)			{				pos->node		= tmp;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -