📄 suffix_tree.c
字号:
/**************************************************************//***************************************************************Suffix Tree Version 2.0by: Dotan Tsadok.Instructor: Mr. Shlomo Yona.University of Haifa, Israel.August 2002.This is the implementation file SuffixTree.c.This code is an Open Source implementation of Ukkonen's algorithmfor constructing a suffix tree over a string in time and spacecomplexity O(length of the string). The code is written under strictANSI C restrictions. It may be used freely for non-profit uses.Commercial use is forbidden unless premission granted from the author.For a complete understanding of the code see Ukkonen's algorithm andreadme.txt file. The general pseudo code is:n = length of the string.ST_CreateTree: Calls n times to SPA. SPA (Single Phase Algorithm): Increase the variable e (virtual end of all leaves). Calls SEA starting with the first extension that does not already exist in the tree and ending at the first extension that already exists. SEA (Single Extension Algorithm): follow suffix link. check if current suffix exists in the tree. if it does not - apply rule 2 and then create a new suffix link. apply_rule_2: create a new leaf and maybe a new internal node as well. create_node: Create a new node or a leaf.For implementation interpretations see Basic Ideas paragraphin the Developement section of the readme.txt file.An example of the implementations of a node and its sons using linkedlists instead of arrays: (1) | | | (2)--(3)--(4)--(5)(2) is the only son of (1) (call it the first son). Other sons of (1)are connected using a linked lists starting from (2) and going to theright. (3) is the right sibling of (2) (and (2) is the left siblingof (3)), (4) is the right sibling of (3) etc. The father field ofall (2), (3), (4) and (5) points to (1), but the son field of (1)points only to (2).*//**************************************************************//**************************************************************/#include "stdlib.h"#include "stdio.h"#include "string.h"#include "suffix_tree.h"/**************************************************************//**************************************************************//*See function body.*/void ST_PrintTree(SUFFIX_TREE* tree);/*See function body.*/void ST_PrintFullNode(SUFFIX_TREE* tree, NODE* node);/**************************************************************//**************************************************************//*used in function trace_string for skipping (Ukkonen's Skip Trick).*/typedef enum SKIP_TYPE {skip, no_skip} SKIP_TYPE;/*used in function apply_rule_2 - two types of rule 2 - see function for more details.*/typedef enum RULE_2_TYPE {new_son, split} RULE_2_TYPE;/*signals whether last matching position is the last one of the current edge.*/typedef enum LAST_POS_TYPE {last_char_in_edge, other_char} LAST_POS_TYPE;/**************************************************************//**************************************************************//*Symbols an error return value for some functions. Initializedin ST_CreateTree.*/DBL_WORD ST_ERROR;/*Used for statistic measures of speed.*/DBL_WORD counter;/*Used for statistic measures of space*/DBL_WORD heap;/*Used to mark the node that has no suffix link yet.By Ukkonen, it will have one by theend og the current phase.*/NODE* suffixless;/**************************************************************//**************************************************************//*Define STATISTICS in order to view measures of speed and space while constructing and searching the suffix tree. Measures will be printed on the screen.*//*#define STATISTICS*//*Define DEBUG in order to view debug printouts to the screen while constructing and searching the suffix tree.*//*#define DEBUG*//**************************************************************//**************************************************************//**/typedef struct SUFFIXTREEPATH{ DBL_WORD begin; DBL_WORD end;} PATH;/**/typedef struct SUFFIXTREEPOS{ NODE* node; DBL_WORD edge_pos;}POS;/**************************************************************//**************************************************************//*Creates a node with the given init field-values. Input : The father of the node, the starting and ending indices of the incloming edge to that node, the path starting position of the node. Output: A pointer to that node.*//**************************************************************/NODE* create_node(NODE* father, DBL_WORD start, DBL_WORD end, DBL_WORD position){ /*Allocate a node.*/ NODE* node = (NODE*)malloc(sizeof(NODE)); if(node == 0) { printf("\nOut of memory.\n"); exit(0); }#ifdef STATISTICS heap+=sizeof(NODE);#endif /*Initialize its fields. For detailed description of the fields see SuffixTree.h*/ node->sons = 0; node->right_sibling = 0; node->left_sibling = 0; node->suffix_link = 0; node->father = father; node->path_position = position; node->edge_label_start = start; node->edge_label_end = end; return node;}/**************************************************************//**************************************************************//*Finds son of node that starts with a certain character. Input : the tree, the node to start searching from and the character to be searched in the sons. Output: A pointer to the found son, 0 if no such son.*//**************************************************************/NODE* find_son(SUFFIX_TREE* tree, NODE* node, unsigned char character){ /*point to the first son.*/ node = node->sons; /*scan all sons (all right siblings of the first son) for their first character (it has to match the character given as input to this function.*/ while(node != 0 && tree->tree_string[node->edge_label_start] != character) {#ifdef STATISTICS counter++;#endif node = node->right_sibling; } return node;}/**************************************************************//**************************************************************//*Returns the end index of the incoming edge to that node.This function is needed because for leaves the end index is not relevant, instead we must look at the variable e (the global virtual end of all leaves). Never refer directly to a leaf'send-index. Input : the tree, the node its end index we need. Output: The end index of that node (meaning the end index of the node's incoming edge).*//**************************************************************/DBL_WORD get_node_label_end(SUFFIX_TREE* tree, NODE* node){ /*If it's a leaf - return e.*/ if(node->sons == 0) return tree->e; /*If it's not a leaf - return it's real end.*/ return node->edge_label_end;}/**************************************************************//**************************************************************//*Returns the length of the incoming edge to that node. Uses get_node_label_end (see above). Input : The tree and the node its length we need. Output: the length of that node.*//**************************************************************/DBL_WORD get_node_label_length(SUFFIX_TREE* tree, NODE* node){ /*Calculate and return the lentgh of the node.*/ return get_node_label_end(tree, node) - node->edge_label_start + 1;}/**************************************************************//**************************************************************//*Returns 1 if edge_pos is the last position in node's incoming edge. Input : The tree, the node to be checked and the position in its incoming edge. Output: the length of that node.*//**************************************************************/char is_last_char_in_edge(SUFFIX_TREE* tree, NODE* node, DBL_WORD edge_pos){ if(edge_pos == get_node_label_length(tree,node)-1) return 1; return 0;}/**************************************************************//**************************************************************//*Connect right_sib as the right sibling of left_sib and vice versa. Input : The two nodes to be connected. Output: None.*//**************************************************************/void connect_siblings(NODE* left_sib, NODE* right_sib){ /*Connect the right node as the right sibling of the left node.*/ if(left_sib != 0) left_sib->right_sibling = right_sib; /*Connect the left node as the left sibling of the right node.*/ if(right_sib != 0) right_sib->left_sibling = left_sib;}/**************************************************************//**************************************************************//*Apply extension rule 2 - 2 cases: (1) (1) 1. A new son (leaf 4) is added to / \ -> / | \ a node that already has sons. (2) (3) (2)(3)(4) 2. An edge is split and a new | | leaf (2) and an internal node | (3) (3) are added. | -> / \ (1) (1) (2) Input : See parameters. Output: A pointer to the newly created leaf (new_son case) or internal node (split case).*//**************************************************************/NODE* apply_extension_rule_2(/*node 1 (see drawings)*/ NODE* node, /*start index of node 2's incoming edge*/ DBL_WORD edge_label_begin, /*end index of node 2's incoming edge*/ DBL_WORD edge_label_end, /*path start index of node 2*/ DBL_WORD path_pos, /*position in node 1's incoming edge where split is to be performed*/ DBL_WORD edge_pos, /*can be 'new_son' or 'split'*/ RULE_2_TYPE type) { NODE *new_leaf, *new_internal, *son; if(type == new_son) { /*-------new_son-------*/#ifdef DEBUG printf("rule 2: new leaf (%lu,%lu)\n",edge_label_begin,edge_label_end);#endif /*create a new leaf (4) with the characters of the extension*/ new_leaf = create_node(node, edge_label_begin , edge_label_end, path_pos); /*connect new_leaf (4) as the new son of node (1).*/ son = node->sons; while(son->right_sibling != 0) son = son->right_sibling; connect_siblings(son, new_leaf); /*return (4)*/ return new_leaf; } /*-------split-------*/#ifdef DEBUG printf("rule 2: split (%lu,%lu)\n",edge_label_begin,edge_label_end);#endif /*create a new internal node (3) at the split point.*/ new_internal = create_node(node->father, node->edge_label_start , node->edge_label_start+edge_pos, node->path_position); /*Update the node (1) incoming edge starting index (it now starts where node (3) incoming edge ends).*/ node->edge_label_start += edge_pos+1; /*create a new leaf (2) with the characters of the extension.*/ new_leaf = create_node(new_internal, edge_label_begin , edge_label_end, path_pos); /*connect new_internal (3) where node (1) was.*/ /*connect (3) with (1)'s left sibling*/ connect_siblings(node->left_sibling, new_internal); /*connect (3) with (1)'s right sibling*/ connect_siblings(new_internal, node->right_sibling); node->left_sibling = 0; /*connect (3) with (1)'s father*/ if(new_internal->father->sons == node) new_internal->father->sons = new_internal; /*connect new_leaf (2) and node (1) as sons of new_internal (3).*/ new_internal->sons = node; node->father = new_internal; connect_siblings(node, new_leaf); /*return (3)*/ return new_internal;}/**************************************************************//**************************************************************//*Traces for a string in a given node's OUTcoming edge. Itsearches only in the given edge and not other ones. Search stops when either whole string was found in the given edge, a part of the string was found but the edge ended (and the next edge must be searched too - performed by function trace_string) or one non-matching character was found. Input : The string to be searched, given in indices of the main string. Output: (by value) the node where tracing has stopped. (by reference) the edge position where last match occured, the string position where last match occured, number of characters found, a flag for signaling whether search is done, and a flag to signal whether search stopped at a last character of an edge.*//**************************************************************/NODE* trace_single_edge( SUFFIX_TREE* tree, /*node to start from*/ NODE *node, /*string to trace*/ PATH str, /*last matching position in edge*/ DBL_WORD *edge_pos, /*last matching position in tree source string*/ DBL_WORD *chars_found, /*skip or no_skip*/ SKIP_TYPE type, /*1 if search is done, 0 if not*/ int *search_done) { NODE* cont_node; DBL_WORD length,str_len; /*Set default return values*/ *search_done = 1; *edge_pos = 0; /*Search for the first character of the string in the outcoming edge of node*/ cont_node = find_son(tree, node, tree->tree_string[str.begin]); if(cont_node == 0) { /*search is done, string not found*/ *edge_pos = get_node_label_length(tree,node)-1; *chars_found = 0; return node; } /*Found first character - prepare for continuing the search*/ node = cont_node; length = get_node_label_length(tree,node); str_len = str.end - str.begin + 1; /*compare edge length and string length.*/ /*if edge is shorter then the string being searched and skipping is enabled - skip edge*/ if(type == skip) { if(length <= str_len) { (*chars_found) = length; (*edge_pos) = length-1; if(length < str_len) *search_done = 0; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -