⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 suffix_tree.c

📁 suffix tree suffix tree suffix tree
💻 C
📖 第 1 页 / 共 3 页
字号:
                      /* String to trace */                      PATH            str,                               /* Last matching position in edge */                      DBL_WORD*       edge_pos,                            /* Last matching position in tree source string */                      DBL_WORD*       chars_found,                         /* Skip or no_skip*/                      SKIP_TYPE       type,                                /* 1 if search is done, 0 if not */                      int*            search_done)   {   NODE*      cont_node;   DBL_WORD   length,str_len;   /* Set default return values */   *search_done = 1;   *edge_pos    = 0;   /* Search for the first character of the string in the outcoming edge of      node */   cont_node = find_son(tree, node, tree->tree_string[str.begin]);   if(cont_node == 0)   {      /* Search is done, string not found */      *edge_pos = get_node_label_length(tree,node)-1;      *chars_found = 0;      return node;   }      /* Found first character - prepare for continuing the search */   node    = cont_node;   length  = get_node_label_length(tree,node);   str_len = str.end - str.begin + 1;   /* Compare edge length and string length. */   /* If edge is shorter then the string being searched and skipping is      enabled - skip edge */   if(type == skip)   {      if(length <= str_len)      {         (*chars_found)   = length;         (*edge_pos)      = length-1;         if(length < str_len)            *search_done  = 0;      }      else      {         (*chars_found)   = str_len;         (*edge_pos)      = str_len-1;      }#ifdef STATISTICS      counter++;#endif      return node;   }   else   {      /* Find minimum out of edge length and string length, and scan it */      if(str_len < length)         length = str_len;      for(*edge_pos=1, *chars_found=1; *edge_pos<length; (*chars_found)++,(*edge_pos)++)      {#ifdef STATISTICS         counter++;#endif         /* Compare current characters of the string and the edge. If equal - 	    continue */         if(tree->tree_string[node->edge_label_start+*edge_pos] != tree->tree_string[str.begin+*edge_pos])         {            (*edge_pos)--;            return node;         }      }   }   /* The loop has advanced *edge_pos one too much */   (*edge_pos)--;   if((*chars_found) < str_len)      /* Search is not done yet */      *search_done = 0;   return node;}/******************************************************************************//*   trace_string :   Traces for a string in the tree. This function is used in construction   process only, and not for after-construction search of substrings. It is   tailored to enable skipping (when we know a suffix is in the tree (when   following a suffix link) we can avoid comparing all symbols of the edge by   skipping its length immediately and thus save atomic operations - see   Ukkonen's algorithm, skip trick).   This function, in contradiction to the function trace_single_edge, 'sees' the   whole picture, meaning it searches a string in the whole tree and not just in   a specific edge.   Input : The string, given in indice of the main string.   Output: (by value) the node where tracing has stopped.           (by reference) the edge position where last match occured, the string	   position where last match occured, number of characters found, a flag	   for signaling whether search is done.*/NODE* trace_string(                      SUFFIX_TREE*    tree,                       /* Node to start from */                      NODE*           node,                               /* String to trace */                      PATH            str,                               /* Last matching position in edge */                      DBL_WORD*       edge_pos,                            /* Last matching position in tree string */                      DBL_WORD*       chars_found,                      /* skip or not */                      SKIP_TYPE       type)         {   /* This variable will be 1 when search is done.      It is a return value from function trace_single_edge */   int      search_done = 0;   /* This variable will hold the number of matching characters found in the      current edge. It is a return value from function trace_single_edge */   DBL_WORD edge_chars_found;   *chars_found = 0;   while(search_done == 0)   {      *edge_pos        = 0;      edge_chars_found = 0;      node = trace_single_edge(tree, node, str, edge_pos, &edge_chars_found, type, &search_done);      str.begin       += edge_chars_found;      *chars_found    += edge_chars_found;   }   return node;}/******************************************************************************//*   ST_FindSubstring :   See suffix_tree.h for description.*/DBL_WORD ST_FindSubstring(                      /* The suffix array */                      SUFFIX_TREE*    tree,                            /* The substring to find */                      char*  W,                               /* The length of W */                      DBL_WORD        P)         {   /* Starts with the root's son that has the first character of W as its      incoming edge first character */   NODE* node   = find_son(tree, tree->root, W[0]);   DBL_WORD k,j = 0, node_label_end;   /* Scan nodes down from the root untill a leaf is reached or the substring is      found */   while(node!=0)   {      k=node->edge_label_start;      node_label_end = get_node_label_end(tree,node);            /* Scan a single edge - compare each character with the searched string */      while(j<P && k<=node_label_end && tree->tree_string[k] == W[j])      {         j++;         k++;#ifdef STATISTICS         counter++;#endif      }            /* Checking which of the stopping conditions are true */      if(j == P)      {         /* W was found - it is a substring. Return its path starting index */         return node->path_position;      }      else if(k > node_label_end)         /* Current edge is found to match, continue to next edge */         node = find_son(tree, node, W[j]);      else      {         /* One non-matching symbols is found - W is not a substring */         return ST_ERROR;      }   }   return ST_ERROR;}/******************************************************************************//*   follow_suffix_link :   Follows the suffix link of the source node according to Ukkonen's rules.    Input : The tree, and pos. pos is a combination of the source node and the            position in its incoming edge where suffix ends.   Output: The destination node that represents the longest suffix of node's            path. Example: if node represents the path "abcde" then it returns            the node that represents "bcde".*/void follow_suffix_link(SUFFIX_TREE* tree, POS* pos){   /* gama is the string between node and its father, in case node doesn't have      a suffix link */   PATH      gama;               /* dummy argument for trace_string function */   DBL_WORD  chars_found = 0;         if(pos->node == tree->root)   {      return;   }   /* If node has no suffix link yet or in the middle of an edge - remember the      edge between the node and its father (gama) and follow its father's suffix      link (it must have one by Ukkonen's lemma). After following, trace down       gama - it must exist in the tree (and thus can use the skip trick - see       trace_string function description) */   if(pos->node->suffix_link == 0 || is_last_char_in_edge(tree,pos->node,pos->edge_pos) == 0)   {      /* If the node's father is the root, than no use following it's link (it          is linked to itself). Tracing from the root (like in the naive          algorithm) is required and is done by the calling function SEA uppon          recieving a return value of tree->root from this function */      if(pos->node->father == tree->root)      {         pos->node = tree->root;         return;      }            /* Store gama - the indices of node's incoming edge */      gama.begin      = pos->node->edge_label_start;      gama.end      = pos->node->edge_label_start + pos->edge_pos;      /* Follow father's suffix link */      pos->node      = pos->node->father->suffix_link;      /* Down-walk gama back to suffix_link's son */      pos->node      = trace_string(tree, pos->node, gama, &(pos->edge_pos), &chars_found, skip);   }   else   {      /* If a suffix link exists - just follow it */      pos->node      = pos->node->suffix_link;      pos->edge_pos   = get_node_label_length(tree,pos->node)-1;   }}/******************************************************************************//*   create_suffix_link :   Creates a suffix link between node and the node 'link' which represents its    largest suffix. The function could be avoided but is needed to monitor the    creation of suffix links when debuging or changing the tree.   Input : The node to link from, the node to link to.   Output: None.*/void create_suffix_link(NODE* node, NODE* link){   node->suffix_link = link;}/******************************************************************************//*   SEA :   Single-Extension-Algorithm (see Ukkonen's algorithm). Ensure that a certain    extension is in the tree.   1. Follows the current node's suffix link.   2. Check whether the rest of the extension is in the tree.   3. If it is - reports the calling function SPA of rule 3 (= current phase is       done).   4. If it's not - inserts it by applying rule 2.   Input : The tree, pos - the node and position in its incoming edge where            extension begins, str - the starting and ending indices of the            extension, a flag indicating whether the last phase ended by rule 3            (last extension of the last phase already existed in the tree - and            if so, the current phase starts at not following the suffix link of            the first extension).   Output: The rule that was applied to that extension. Can be 3 (phase is done)           or 2 (a new leaf was created).*/void SEA(                      SUFFIX_TREE*   tree,                       POS*           pos,                      PATH           str,                       DBL_WORD*      rule_applied,                      char           after_rule_3){   DBL_WORD   chars_found = 0 , path_pos = str.begin;   NODE*      tmp; #ifdef DEBUG      ST_PrintTree(tree);   printf("extension: %lu  phase+1: %lu",str.begin, str.end);   if(after_rule_3 == 0)      printf("   followed from (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);   else      printf("   starting at (%lu,%lu | %lu) ", pos->node->edge_label_start, get_node_label_end(tree,pos->node), pos->edge_pos);#endif#ifdef STATISTICS   counter++;#endif   /* Follow suffix link only if it's not the first extension after rule 3 was applied */   if(after_rule_3 == 0)      follow_suffix_link(tree, pos);#ifdef DEBUG   #ifdef STATISTICS   if(after_rule_3 == 0)      printf("to (%lu,%lu | %lu). counter: %lu\n", pos->node->edge_label_start, get_node_label_end(tree,pos->node),pos->edge_pos,counter);   else      printf(". counter: %lu\n", counter);#endif#endif   /* If node is root - trace whole string starting from the root, else - trace last character only */   if(pos->node == tree->root)   {      pos->node = trace_string(tree, tree->root, str, &(pos->edge_pos), &chars_found, no_skip);   }   else   {      str.begin = str.end;      chars_found = 0;      /* Consider 2 cases:         1. last character matched is the last of its edge */      if(is_last_char_in_edge(tree,pos->node,pos->edge_pos))      {         /* Trace only last symbol of str, search in the  NEXT edge (node) */         tmp = find_son(tree, pos->node, tree->tree_string[str.end]);         if(tmp != 0)         {            pos->node      = tmp;            pos->edge_pos   = 0;            chars_found      = 1;         }      }      /* 2. last character matched is NOT the last of its edge */      else      {         /* Trace only last symbol of str, search in the CURRENT edge (node) */         if(tree->tree_string[pos->node->edge_label_start+pos->edge_pos+1] == tree->tree_string[str.end])         {            pos->edge_pos++;            chars_found   = 1;         }      }   }   /* If whole string was found - rule 3 applies */   if(chars_found == str.end - str.begin + 1)   {      *rule_applied = 3;      /* If there is an internal node that has no suffix link yet (only one may          exist) - create a suffix link from it to the father-node of the          current position in the tree (pos) */      if(suffixless != 0)      {         create_suffix_link(suffixless, pos->node->father);         /* Marks that no internal node with no suffix link exists */         suffixless = 0;      }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -