📄 suffix_tree.c
字号:
/* String to trace */ PATH str, /* Last matching position in edge */ DBL_WORD* edge_pos, /* Last matching position in tree source string */ DBL_WORD* chars_found, /* Skip or no_skip*/ SKIP_TYPE type, /* 1 if search is done, 0 if not */ int* search_done) { NODE* cont_node; DBL_WORD length,str_len; /* Set default return values */ *search_done = 1; *edge_pos = 0; /* Search for the first character of the string in the outcoming edge of node */ cont_node = find_son(tree, node, tree->tree_string[str.begin]); if(cont_node == 0) { /* Search is done, string not found */ *edge_pos = get_node_label_length(tree,node)-1; *chars_found = 0; return node; } /* Found first character - prepare for continuing the search */ node = cont_node; length = get_node_label_length(tree,node); str_len = str.end - str.begin + 1; /* Compare edge length and string length. */ /* If edge is shorter then the string being searched and skipping is enabled - skip edge */ if(type == skip) { if(length <= str_len) { (*chars_found) = length; (*edge_pos) = length-1; if(length < str_len) *search_done = 0; } else { (*chars_found) = str_len; (*edge_pos) = str_len-1; }#ifdef STATISTICS counter++;#endif return node; } else { /* Find minimum out of edge length and string length, and scan it */ if(str_len < length) length = str_len; for(*edge_pos=1, *chars_found=1; *edge_pos<length; (*chars_found)++,(*edge_pos)++) {#ifdef STATISTICS counter++;#endif /* Compare current characters of the string and the edge. If equal - continue */ if(tree->tree_string[node->edge_label_start+*edge_pos] != tree->tree_string[str.begin+*edge_pos]) { (*edge_pos)--; return node; } } } /* The loop has advanced *edge_pos one too much */ (*edge_pos)--; if((*chars_found) < str_len) /* Search is not done yet */ *search_done = 0; return node;}/******************************************************************************//* trace_string : Traces for a string in the tree. This function is used in construction process only, and not for after-construction search of substrings. It is tailored to enable skipping (when we know a suffix is in the tree (when following a suffix link) we can avoid comparing all symbols of the edge by skipping its length immediately and thus save atomic operations - see Ukkonen's algorithm, skip trick). This function, in contradiction to the function trace_single_edge, 'sees' the whole picture, meaning it searches a string in the whole tree and not just in a specific edge. Input : The string, given in indice of the main string. Output: (by value) the node where tracing has stopped. (by reference) the edge position where last match occured, the string position where last match occured, number of characters found, a flag for signaling whether search is done.*/NODE* trace_string( SUFFIX_TREE* tree, /* Node to start from */ NODE* node, /* String to trace */ PATH str, /* Last matching position in edge */ DBL_WORD* edge_pos, /* Last matching position in tree string */ DBL_WORD* chars_found, /* skip or not */ SKIP_TYPE type) { /* This variable will be 1 when search is done. It is a return value from function trace_single_edge */ int search_done = 0; /* This variable will hold the number of matching characters found in the current edge. It is a return value from function trace_single_edge */ DBL_WORD edge_chars_found; *chars_found = 0; while(search_done == 0) { *edge_pos = 0; edge_chars_found = 0; node = trace_single_edge(tree, node, str, edge_pos, &edge_chars_found, type, &search_done); str.begin += edge_chars_found; *chars_found += edge_chars_found; } return node;}/******************************************************************************//* ST_FindSubstring : See suffix_tree.h for description.*/DBL_WORD ST_FindSubstring( /* The suffix array */ SUFFIX_TREE* tree, /* The substring to find */ char* W, /* The length of W */ DBL_WORD P) { /* Starts with the root's son that has the first character of W as its incoming edge first character */ NODE* node = find_son(tree, tree->root, W[0]); DBL_WORD k,j = 0, node_label_end; /* Scan nodes down from the root untill a leaf is reached or the substring is found */ while(node!=0) { k=node->edge_label_start; node_label_end = get_node_label_end(tree,node); /* Scan a single edge - compare each character with the searched string */ while(j<P && k<=node_label_end && tree->tree_string[k] == W[j]) { j++; k++;#ifdef STATISTICS counter++;#endif } /* Checking which of the stopping conditions are true */ if(j == P) { /* W was found - it is a substring. Return its path starting index */ return node->path_position; } else if(k > node_label_end) /* Current edge is found to match, continue to next edge */ node = find_son(tree, node, W[j]); else { /* One non-matching symbols is found - W is not a substring */ return ST_ERROR; } } return ST_ERROR;}/******************************************************************************//* follow_suffix_link : Follows the suffix link of the source node according to Ukkonen's rules. Input : The tree, and pos. pos is a combination of the source node and the position in its incoming edge where suffix ends. Output: The destination node that represents the longest suffix of node's path. Example: if node represents the path "abcde" then it returns the node that represents "bcde".*/void follow_suffix_link(SUFFIX_TREE* tree, POS* pos){ /* gama is the string between node and its father, in case node doesn't have a suffix link */ PATH gama; /* dummy argument for trace_string function */ DBL_WORD chars_found = 0; if(pos->node == tree->root) { return; } /* If node has no suffix link yet or in the middle of an edge - remember the edge between the node and its father (gama) and follow its father's suffix link (it must have one by Ukkonen's lemma). After following, trace down gama - it must exist in the tree (and thus can use the skip trick - see trace_string function description) */ if(pos->node->suffix_link == 0 || is_last_char_in_edge(tree,pos->node,pos->edge_pos) == 0) { /* If the node's father is the root, than no use following it's link (it is linked to itself). Tracing from the root (like in the naive algorithm) is required and is done by the calling function SEA uppon recieving a return value of tree->root from this function */ if(pos->node->father == tree->root) { pos->node = tree->root; return; } /* Store gama - the indices of node's incoming edge */ gama.begin = pos->node->edge_label_start; gama.end = pos->node->edge_label_start + pos->edge_pos; /* Follow father's suffix link */ pos->node = pos->node->father->suffix_link; /* Down-walk gama back to suffix_link's son */ pos->node = trace_string(tree, pos->node, gama, &(pos->edge_pos), &chars_found, skip); } else { /* If a suffix link exists - just follow it */ pos->node = pos->node->suffix_link; pos->edge_pos = get_node_label_length(tree,pos->node)-1; }}/******************************************************************************//* create_suffix_link : Creates a suffix link between node and the node 'link' which represents its largest suffix. The function could be avoided but is needed to monitor the creation of suffix links when debuging or changing the tree. Input : The node to link from, the node to link to. Output: None.*/void create_suffix_link(NODE* node, NODE* link){ node->suffix_link = link;}/******************************************************************************//* SEA : Single-Extension-Algorithm (see Ukkonen's algorithm). Ensure that a certain extension is in the tree. 1. Follows the current node's suffix link. 2. Check whether the rest of the extension is in the tree. 3. If it is - reports the calling function SPA of rule 3 (= current phase is done). 4. If it's not - inserts it by applying rule 2. Input : The tree, pos - the node and position in its incoming edge where extension begins, str - the starting and ending indices of the extension, a flag indicating whether the last phase ended by rule 3 (last extension of the last phase already existed in the tree - and if so, the current phase starts at not following the suffix link of the first extension). Output: The rule that was applied to that extension. Can be 3 (phase is done) or 2 (a new leaf was created).*/void SEA( SUFFIX_TREE* tree, POS* pos, PATH str, DBL_WORD* rule_applied, char after_rule_3){ DBL_WORD chars_found = 0 , path_pos = str.begin; NODE* tmp; #ifdef DEBUG ST_PrintTree(tree); printf("extension: %lu phase+1: %lu",str.begin, str.end); if(after_rule_3 == 0) printf(" followed from (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos); else printf(" starting at (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);#endif#ifdef STATISTICS counter++;#endif /* Follow suffix link only if it's not the first extension after rule 3 was applied */ if(after_rule_3 == 0) follow_suffix_link(tree, pos);#ifdef DEBUG #ifdef STATISTICS if(after_rule_3 == 0) printf("to (%lu,%lu | %lu). counter: %lu\n", pos->node->edge_label_start, get_node_label_end(tree,pos->node),pos->edge_pos,counter); else printf(". counter: %lu\n", counter);#endif#endif /* If node is root - trace whole string starting from the root, else - trace last character only */ if(pos->node == tree->root) { pos->node = trace_string(tree, tree->root, str, &(pos->edge_pos), &chars_found, no_skip); } else { str.begin = str.end; chars_found = 0; /* Consider 2 cases: 1. last character matched is the last of its edge */ if(is_last_char_in_edge(tree,pos->node,pos->edge_pos)) { /* Trace only last symbol of str, search in the NEXT edge (node) */ tmp = find_son(tree, pos->node, tree->tree_string[str.end]); if(tmp != 0) { pos->node = tmp; pos->edge_pos = 0; chars_found = 1; } } /* 2. last character matched is NOT the last of its edge */ else { /* Trace only last symbol of str, search in the CURRENT edge (node) */ if(tree->tree_string[pos->node->edge_label_start+pos->edge_pos+1] == tree->tree_string[str.end]) { pos->edge_pos++; chars_found = 1; } } } /* If whole string was found - rule 3 applies */ if(chars_found == str.end - str.begin + 1) { *rule_applied = 3; /* If there is an internal node that has no suffix link yet (only one may exist) - create a suffix link from it to the father-node of the current position in the tree (pos) */ if(suffixless != 0) { create_suffix_link(suffixless, pos->node->father); /* Marks that no internal node with no suffix link exists */ suffixless = 0; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -