📄 suffix_tree.c
字号:
/******************************************************************************Suffix Tree Version 2.1AUTHORSDotan TsadokInstructor: Mr. Shlomo Yona, University of Haifa, Israel. December 2002.Current maintainer: Shlomo Yona <shlomo@cs.haifa.ac.il>COPYRIGHTCopyright 2002-2003 Shlomo YonaLICENSEThis library is free software; you can redistribute it and/or modify itunder the same terms as Perl itself.DESCRIPTION OF THIS FILE:This is the implementation file suffix_tree.c implementing the header filesuffix_tree.h.This code is an Open Source implementation of Ukkonen's algorithm forconstructing a suffix tree over a string in time and space complexityO(length of the string). The code is written under strict ANSI C.For a complete understanding of the code see Ukkonen's algorithm and thereadme.txt file.The general pseudo code is:n = length of the string.ST_CreateTree: Calls n times to SPA (Single Phase Algorithm). SPA: Increase the variable e (virtual end of all leaves). Calls SEA (Single Extension Algorithm) starting with the first extension that does not already exist in the tree and ending at the first extension that already exists. SEA : Follow suffix link. Check if current suffix exists in the tree. If it does not - apply rule 2 and then create a new suffix link. apply_rule_2: Create a new leaf and maybe a new internal node as well. create_node: Create a new node or a leaf.For implementation interpretations see Basic Ideas paragraph in the Developementsection of the readme.txt file.An example of the implementations of a node and its sons using linked listsinstead of arrays: (1) | | | (2)--(3)--(4)--(5)(2) is the only son of (1) (call it the first son). Other sons of (1) areconnected using a linked lists starting from (2) and going to the right. (3) isthe right sibling of (2) (and (2) is the left sibling of (3)), (4) is the rightsibling of (3), etc.The father field of all (2), (3), (4) and (5) points to (1), but the son fieldof (1) points only to (2).*******************************************************************************/#include "stdlib.h"#include "stdio.h"#include "string.h"#include "suffix_tree.h"/* See function body */void ST_PrintTree(SUFFIX_TREE* tree);/* See function body */void ST_PrintFullNode(SUFFIX_TREE* tree, NODE* node);/* Used in function trace_string for skipping (Ukkonen's Skip Trick). */typedef enum SKIP_TYPE {skip, no_skip} SKIP_TYPE;/* Used in function apply_rule_2 - two types of rule 2 - see function for more details.*/typedef enum RULE_2_TYPE {new_son, split} RULE_2_TYPE;/* Signals whether last matching position is the last one of the current edge */typedef enum LAST_POS_TYPE {last_char_in_edge, other_char} LAST_POS_TYPE;/* Used for statistic measures of speed. */DBL_WORD counter;/* Used for statistic measures of space. */DBL_WORD heap;/* Used to mark the node that has no suffix link yet. By Ukkonen, it will have one by the end of the current phase. */NODE* suffixless;typedef struct SUFFIXTREEPATH{ DBL_WORD begin; DBL_WORD end;} PATH;typedef struct SUFFIXTREEPOS{ NODE* node; DBL_WORD edge_pos;}POS;/******************************************************************************//* Define STATISTICS in order to view measures of speed and space while constructing and searching the suffix tree. Measures will be printed on the screen.*//* #define STATISTICS *//* Define DEBUG in order to view debug printouts to the screen while constructing and searching the suffix tree.*//* #define DEBUG *//******************************************************************************//* create_node : Creates a node with the given init field-values. Input : The father of the node, the starting and ending indices of the incloming edge to that node, the path starting position of the node. Output: A pointer to that node.*/NODE* create_node(NODE* father, DBL_WORD start, DBL_WORD end, DBL_WORD position){ /*Allocate a node.*/ NODE* node = (NODE*)malloc(sizeof(NODE)); if(node == 0) { printf("\nOut of memory.\n"); exit(0); }#ifdef STATISTICS heap+=sizeof(NODE);#endif /* Initialize node fields. For detailed description of the fields see suffix_tree.h */ node->sons = 0; node->right_sibling = 0; node->left_sibling = 0; node->suffix_link = 0; node->father = father; node->path_position = position; node->edge_label_start = start; node->edge_label_end = end; return node;}/******************************************************************************//* find_son : Finds son of node that starts with a certain character. Input : the tree, the node to start searching from and the character to be searched in the sons. Output: A pointer to the found son, 0 if no such son.*/NODE* find_son(SUFFIX_TREE* tree, NODE* node, char character){ /* Point to the first son. */ node = node->sons; /* scan all sons (all right siblings of the first son) for their first character (it has to match the character given as input to this function. */ while(node != 0 && tree->tree_string[node->edge_label_start] != character) {#ifdef STATISTICS counter++;#endif node = node->right_sibling; } return node;}/******************************************************************************//* get_node_label_end : Returns the end index of the incoming edge to that node. This function is needed because for leaves the end index is not relevant, instead we must look at the variable "e" (the global virtual end of all leaves). Never refer directly to a leaf's end-index. Input : the tree, the node its end index we need. Output: The end index of that node (meaning the end index of the node's incoming edge).*/DBL_WORD get_node_label_end(SUFFIX_TREE* tree, NODE* node){ /* If it's a leaf - return e */ if(node->sons == 0) return tree->e; /* If it's not a leaf - return its real end */ return node->edge_label_end;}/******************************************************************************//* get_node_label_length : Returns the length of the incoming edge to that node. Uses get_node_label_end (see above). Input : The tree and the node its length we need. Output: the length of that node.*/DBL_WORD get_node_label_length(SUFFIX_TREE* tree, NODE* node){ /* Calculate and return the lentgh of the node */ return get_node_label_end(tree, node) - node->edge_label_start + 1;}/******************************************************************************//* is_last_char_in_edge : Returns 1 if edge_pos is the last position in node's incoming edge. Input : The tree, the node to be checked and the position in its incoming edge. Output: the length of that node.*/char is_last_char_in_edge(SUFFIX_TREE* tree, NODE* node, DBL_WORD edge_pos){ if(edge_pos == get_node_label_length(tree,node)-1) return 1; return 0;}/******************************************************************************//* connect_siblings : Connect right_sib as the right sibling of left_sib and vice versa. Input : The two nodes to be connected. Output: None.*/void connect_siblings(NODE* left_sib, NODE* right_sib){ /* Connect the right node as the right sibling of the left node */ if(left_sib != 0) left_sib->right_sibling = right_sib; /* Connect the left node as the left sibling of the right node */ if(right_sib != 0) right_sib->left_sibling = left_sib;}/******************************************************************************//* apply_extension_rule_2 : Apply "extension rule 2" in 2 cases: 1. A new son (leaf 4) is added to a node that already has sons: (1) (1) / \ -> / | \ (2) (3) (2)(3)(4) 2. An edge is split and a new leaf (2) and an internal node (3) are added: | | | (3) | -> / \ (1) (1) (2) Input : See parameters. Output: A pointer to the newly created leaf (new_son case) or internal node (split case).*/NODE* apply_extension_rule_2( /* Node 1 (see drawings) */ NODE* node, /* Start index of node 2's incoming edge */ DBL_WORD edge_label_begin, /* End index of node 2's incoming edge */ DBL_WORD edge_label_end, /* Path start index of node 2 */ DBL_WORD path_pos, /* Position in node 1's incoming edge where split is to be performed */ DBL_WORD edge_pos, /* Can be 'new_son' or 'split' */ RULE_2_TYPE type) { NODE *new_leaf, *new_internal, *son; /*-------new_son-------*/ if(type == new_son) {#ifdef DEBUG printf("rule 2: new leaf (%lu,%lu)\n",edge_label_begin,edge_label_end);#endif /* Create a new leaf (4) with the characters of the extension */ new_leaf = create_node(node, edge_label_begin , edge_label_end, path_pos); /* Connect new_leaf (4) as the new son of node (1) */ son = node->sons; while(son->right_sibling != 0) son = son->right_sibling; connect_siblings(son, new_leaf); /* return (4) */ return new_leaf; } /*-------split-------*/#ifdef DEBUG printf("rule 2: split (%lu,%lu)\n",edge_label_begin,edge_label_end);#endif /* Create a new internal node (3) at the split point */ new_internal = create_node( node->father, node->edge_label_start, node->edge_label_start+edge_pos, node->path_position); /* Update the node (1) incoming edge starting index (it now starts where node (3) incoming edge ends) */ node->edge_label_start += edge_pos+1; /* Create a new leaf (2) with the characters of the extension */ new_leaf = create_node( new_internal, edge_label_begin, edge_label_end, path_pos); /* Connect new_internal (3) where node (1) was */ /* Connect (3) with (1)'s left sibling */ connect_siblings(node->left_sibling, new_internal); /* connect (3) with (1)'s right sibling */ connect_siblings(new_internal, node->right_sibling); node->left_sibling = 0; /* Connect (3) with (1)'s father */ if(new_internal->father->sons == node) new_internal->father->sons = new_internal; /* Connect new_leaf (2) and node (1) as sons of new_internal (3) */ new_internal->sons = node; node->father = new_internal; connect_siblings(node, new_leaf); /* return (3) */ return new_internal;}/******************************************************************************//* trace_single_edge : Traces for a string in a given node's OUTcoming edge. It searches only in the given edge and not other ones. Search stops when either whole string was found in the given edge, a part of the string was found but the edge ended (and the next edge must be searched too - performed by function trace_string) or one non-matching character was found. Input : The string to be searched, given in indices of the main string. Output: (by value) the node where tracing has stopped. (by reference) the edge position where last match occured, the string position where last match occured, number of characters found, a flag for signaling whether search is done, and a flag to signal whether search stopped at a last character of an edge.*/NODE* trace_single_edge( SUFFIX_TREE* tree, /* Node to start from */ NODE* node,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -