📄 suffix_tree.c
字号:
else { (*chars_found) = str_len; (*edge_pos) = str_len-1; }#ifdef STATISTICS counter++;#endif return node; } else { /*find minimum out of edge length and string length, and scan it*/ if(str_len < length) length = str_len; for(*edge_pos=1, *chars_found=1; *edge_pos<length; (*chars_found)++,(*edge_pos)++) {#ifdef STATISTICS counter++;#endif /*Compare current characters of the string and the edge. If equal - continue.*/ if(tree->tree_string[node->edge_label_start+*edge_pos] != tree->tree_string[str.begin+*edge_pos]) { (*edge_pos)--; return node; } } } /*The loop has advanced *edge_pos one too much.*/ (*edge_pos)--; if((*chars_found) < str_len) /*search is not done yet*/ *search_done = 0; return node;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used inconstruction process only, and not for after-construction search of substrings. It is tailored to enable skipping (when we know a suffix is in the tree (when following a suffix link) we can avoid comparing all symbols of the edge by skipping its length immediately and thus save atomic operations - see Ukkonen's algorithm, skip trick). This function, in contradiction to the function trace_single_edge, 'sees' the whole picture, meaning itsearches a string in the whole tree and not just in a specific edge. Input : The string, given in indice of the main string. Output: (by value) the node where tracing has stopped. (by reference) the edge position where last match occured, the string position where last match occured, number of characters found, a flag for signaling whether search is done.*//**************************************************************/NODE* trace_string(SUFFIX_TREE* tree, /*node to start from*/ NODE *node, /*string to trace*/ PATH str, /*last matching position in edge*/ DBL_WORD *edge_pos, /*last matching position in tree string*/ DBL_WORD *chars_found, /*skip or not*/ SKIP_TYPE type) { /*This variable will be 1 when search is done. It is a return value from function trace_single_edge.*/ int search_done = 0; /*This variable will hold the number of matching characters found in the current edge. It is a return value from function trace_single_edge.*/ DBL_WORD edge_chars_found; *chars_found = 0; while(search_done == 0) { *edge_pos = 0; edge_chars_found = 0; node = trace_single_edge(tree, node, str, edge_pos, &edge_chars_found, type, &search_done); str.begin += edge_chars_found; *chars_found += edge_chars_found; } return node;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used for substring search after tree construction is done. It simply traverses down the tree starting from the root until either the searched string is fully found ot one non-matching character is found. In this function skipping is not enabled because we don't know wether the string is in the tree or not (see function trace_string above). Input : The tree, the string W, given in actual characters and the length of W. Output: If the substring is found - returns the index of the starting position of the substring in the tree source string. If the substring is not found - returns ST_ERROR.*//**************************************************************/DBL_WORD ST_FindSubstring( /*the suffix array*/ SUFFIX_TREE* tree, /*the substring to find*/ unsigned char* W, /*the length of W*/ DBL_WORD P) { /*Starts with the root's son that has the first character of W as its incoming edge first character.*/ NODE* node = find_son(tree, tree->root, W[0]); DBL_WORD k,j = 0, node_label_end; /*Scan nodes down from the root until a leaf is reached or the substring is found.*/ while(node!=0) { k=node->edge_label_start; node_label_end = get_node_label_end(tree,node); /*Scan a single edge - compare each character with the searched string.*/ while(j<P && k<=node_label_end && tree->tree_string[k] == W[j]) { j++; k++;#ifdef STATISTICS counter++;#endif } /*checking which of the stopping conditions are true*/ if(j == P) { /*W was found - it is a substring. Return its path starting index.*/ return node->path_position; } else if(k > node_label_end) /*Current edge is found to match, continue to next edge*/ node = find_son(tree, node, W[j]); else { /*One non-matching symbols is found - W is not a substring*/ return ST_ERROR; } } return ST_ERROR;}/**************************************************************//**************************************************************//*Traces for a string in the tree. This function is used for substring search after tree construction is done. It simply traverses down the tree starting from the root until either the searched string is fully found ot one non-matching character is found. In this function skipping is not enabled because we don't know wether the string is in the tree or not (see function trace_string above). Input : The tree, the string W, given in actual characters and the length of W. Output: If the substring is found - returns all indices of the starting position of the substrings found in the tree source string. If the substring is not found - returns ST_ERROR.*//**************************************************************/DBL_WORD ST_FindAllSubstring( /*the suffix array*/ SUFFIX_TREE* tree, /*the substring to find*/ unsigned char* W, /*the length of W*/ DBL_WORD P){ /*Starts with the root's son that has the first character of W as its incoming edge first character.*/ NODE* node = find_son(tree, tree->root, W[0]); DBL_WORD k,j = 0, node_label_end; /*Scan nodes down from the root until a leaf is reached or the substring is found.*/ while(node!=0) { k=node->edge_label_start; node_label_end = get_node_label_end(tree,node); /*Scan a single edge - compare each character with the searched string.*/ while(j<P && k<=node_label_end && tree->tree_string[k] == W[j]) {printf("> %c\n",tree->tree_string[k]); j++; k++;#ifdef STATISTICS counter++;#endif } /*checking which of the stopping conditions are true*/ if(j == P) { /*W was found - it is a substring. Print its path starting index and the indices of all the nodes below it.*///printf("* printChildren %d %d\n",j,P); printChildren( node );//printf("**\n"); printNode( node ); return 0; } else if(k > node_label_end) /*Current edge is found to match, continue to next edge*/ node = find_son(tree, node, W[j]); else { /*One non-matching symbols is found - W is not a substring*/ return ST_ERROR; } } return ST_ERROR;}void printChildren( NODE* node ) { NODE* child; /* Print the path_positions of the immediate child nodes */ child = node->sons; while ( child != 0 ) {//printf("**** path_position=%lu\n", child->path_position );//printf("**** father->path_position=%lu\n", child->father->path_position ); if ( !(child->path_position == child->father->path_position) ) { /* Print the path_position of child *///printf(" child != father\n" ); printNode( child ); } if ( child->sons != 0 ) {//printf(".. son != 0 \n" ); printChildren( child ); }//printf(".. go right\n" ); /* Move to the next node */ child = child->right_sibling; } return;}void printNode( NODE* node ) { if ( node != 0 ) { DBL_WORD temp = node->path_position; printf( "%lu\n", temp ); }} /**************************************************************//**************************************************************//*Follows the suffix link of the source node according to Ukkonen's rules. Input : The tree, and pos. pos is a combination of the source node and the position in its incoming edge where suffix ends.Output: The destination node that represents the longest suffix of node's path. Example: if node represents the path "abcde" then it returns the node that represents "bcde".*//**************************************************************/void follow_suffix_link(SUFFIX_TREE* tree, POS* pos){ /*gama is the string between node and its father, in case node doesn't have a suffix link.*/ PATH gama; /*dummy argument for trace_string function.*/ DBL_WORD chars_found = 0; if(pos->node == tree->root) { return; } /*If node has no suffix link yet or in the middle of an edge - remember the edge between the node and its father (gama) and follow its father's suffix link (it must have one by Ukkonen's lemma). After following, trace down gama - it must exist in the tree (and thus can use the skip trick - see trace_string function description).*/ if(pos->node->suffix_link == 0 || is_last_char_in_edge(tree,pos->node,pos->edge_pos) == 0) { /*if the node's father is the root, than no use following it's link (it is linked to itself). Tracing from the root (like in the naive algorithm) is required and is done by the calling function SEA uppon recieving a return value of tree->root from this function.*/ if(pos->node->father == tree->root) { pos->node = tree->root; return; } /*Store gama - the indices of node's incoming edge.*/ gama.begin = pos->node->edge_label_start; gama.end = pos->node->edge_label_start + pos->edge_pos; /*Follow father's suffix link*/ pos->node = pos->node->father->suffix_link; /*down-walk gama back to suffix_link's son*/ pos->node = trace_string(tree, pos->node, gama, &(pos->edge_pos), &chars_found, skip); } else { /*If a suffix link exists - just follow it.*/ pos->node = pos->node->suffix_link; pos->edge_pos = get_node_label_length(tree,pos->node)-1; }}/**************************************************************//**************************************************************//*Creates a suffix link between node and the node 'link' which represents its largest suffix. The function could be avoided but is needed to monitor the creation of suffix links when debuging or changing the tree.Input : The node to link from, the node to link to.Output: None.*//**************************************************************/void create_suffix_link(NODE* node, NODE* link){ node->suffix_link = link;}/**************************************************************//**************************************************************//*Single-Phase-Algorithm (see Ukkonen's algorithm). Ensure that a certain extension is in the tree.1. Follows the current node's suffix link.2. Check whether the rest of the extension is in the tree.3. If it is - reports the calling function SPA of rule 3 (= current phase is done).4. If it's not - inserts it by applying rule 2. Input : The tree, pos - the node and position in its incoming edge where extension begins, str - the starting and ending indices of the extension, a flag indicating whether the last phase ended by rule 3 (last extension of the last phase already existed in the tree - and if so, the current phase starts at not following the suffix link of the fisrt extension). Output: The rule that was applied to that extension. Can be 3 (phase is done) or 2 (a new leaf was created).*//**************************************************************/void SEA( SUFFIX_TREE* tree, POS* pos, PATH str, DBL_WORD* rule_applied, char after_rule_3){ DBL_WORD chars_found = 0,path_pos = str.begin; NODE *tmp; #ifdef DEBUG ST_PrintTree(tree); printf("extension: %lu phase+1: %lu",str.begin, str.end); if(after_rule_3 == 0) printf(" followed from (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos); else printf(" starting at (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);#endif#ifdef STATISTICS counter++;#endif /*follow suffix link only if it's not the first extension after rule 3 was applied*/ if(after_rule_3 == 0) follow_suffix_link(tree, pos);#ifdef DEBUG #ifdef STATISTICS if(after_rule_3 == 0) printf("to (%lu,%lu | %lu). counter: %lu\n", pos->node->edge_label_start, get_node_label_end(tree,pos->node),pos->edge_pos,counter); else printf(". counter: %lu\n", counter);#endif#endif /*if node is root - trace whole string starting from the root, else - trace last character only.*/ if(pos->node == tree->root) { pos->node = trace_string(tree, tree->root, str, &(pos->edge_pos), &chars_found, no_skip); } else { str.begin = str.end; chars_found = 0; /*consider 2 cases: 1. last character matched is the last of its edge */ if(is_last_char_in_edge(tree,pos->node,pos->edge_pos)) { /*trace only last symbol of str, search in the NEXT edge (node).*/ tmp = find_son(tree, pos->node, tree->tree_string[str.end]); if(tmp != 0) { pos->node = tmp;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -