📄 suffixtree.cpp
字号:
#include "stdafx.h"
//#include "suffixtree.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
SuffixTree::SuffixTree(char *str)
:root(0), top(0)
{
m_nStrLen = strlen(str) + 1;
m_cpInternalStr = new char[m_nStrLen+2];
m_cpInternalStr[0]='\0';
sprintf(m_cpInternalStr+1, "%s#", str);
}
void SuffixTree::constuct()
{
int i;
top = __allocNode(); //创建top结点
root = __allocNode(); //创建root结点
if(top==0 || root == 0)
{
printf("__allocNode error!\n");
exit(1);
}
#ifdef DEBUG
cout<<"top结点地址为:"<<top<<endl;
cout<<"root结点地址为:"<<root<<endl;
#endif
root->m_tpSuffixLink=top;
root->m_nEdgeLabelStartPos=-1;
root->m_nEdgeLabelEndPos=-1;
root->m_bIsLeaf=false;
top-> m_tpChild = root;
top->m_tpMostRightChild = root;
globalS=root;
globalK=1;
i=0;
labelOfEnd=0;
while(m_cpInternalStr[i+1]!='\0')
{
i++;
labelOfEnd++;
#ifdef DEBUG
cout<<"当前字符: "<<m_cpInternalStr[i]<<endl;
#endif
update(i);
canonize(globalS,i);
#ifdef DEBUG
cout<<"--结束结点为: ( "<<globalS<<" : "<<globalK<<" , "<<i<<" )"<<endl;
#endif
}
return;
}
void SuffixTree::update(int i)
{
//(globalS, (globalK , i-1 ) is the canonical reference pair for the active point
//cout<<"--开始活动结点为: ( "<<globalS<<" : "<<globalK<<" , "<<i-1<<" )"<<endl;
tagSuffixNode *r;
tagSuffixNode *oldr=root;
r=testAndSplit(i-1,m_cpInternalStr[i]);
while(!endPoint)
{
#ifdef DEBUG
cout<<"----"<<m_cpInternalStr[i]<<"--transition不存在, 创建叶子结点"<<endl;
#endif
tagSuffixNode* info = __allocNode();
info->m_nEdgeLabelStartPos = i;
info->m_nEdgeLabelEndPos = m_nStrLen;
info->m_nActiveLen = r->m_nActiveLen + (info->m_nEdgeLabelEndPos - info->m_nEdgeLabelStartPos + 1);
#ifdef DEBUG
cout<<"-----创建的叶子结点地址为: "<<info<<" ";
cout<<"-----新插入的叶子结点开始结束标记为: "<<info->m_nEdgeLabelStartPos<<" , "<< info->m_nEdgeLabelEndPos<<endl;
#endif
if(r->m_tpChild == 0)
{
r->m_tpChild = info;
r->m_tpMostRightChild = info;
}
else
{
r->m_tpMostRightChild->m_tpRightSibling = info;
info->m_tpLeftSibling = r->m_tpMostRightChild;
info->m_tpRightSibling = 0;
r->m_tpMostRightChild=info;
}
#ifdef DEBUG
cout<<"--继续寻找结束结点!"<<endl;
#endif
if(oldr!=root)
{
oldr->m_tpSuffixLink=r;
#ifdef DEBUG
cout<<oldr<<"--->"<<r<<endl;
#endif
}
oldr=r;
#ifdef DEBUG
cout<<"globalS->m_tpSuffixLink=:"<<globalS->m_tpSuffixLink<<" globalK=:"<<globalK<<endl;
#endif
canonize(globalS->m_tpSuffixLink,i-1);
#ifdef DEBUG
cout<<"--->后缀链接为: ( "<<globalS<<" : "<<globalK<<" , "<<i-1<<" ) "<<endl;
#endif
r=testAndSplit(i-1,m_cpInternalStr[i]);
}
if(oldr!=root)
{
oldr->m_tpSuffixLink=globalS;
#ifdef DEBUG
cout<<oldr<<"--->"<<globalS<<endl;
#endif
}
return;
}
tagSuffixNode* SuffixTree::testAndSplit(int p,char t) //p=i-1,t为string[i]
{
#ifdef DEBUG
cout<<"--活动结点为: ( "<<globalS<<" : "<<globalK<<" , "<<p<<" )"<<endl;
#endif
if(globalK<=p) //当前活动结点为隐式结点
{
#ifdef DEBUG
cout<<"----当前活动结点为隐式结点"<<endl;
#endif
tagSuffixNode *s2;
int k2,p2;
s2=__findTTransition(globalS,m_cpInternalStr[globalK]);
k2=s2->m_nEdgeLabelStartPos;
p2=s2->m_nEdgeLabelEndPos;
#ifdef DEBUG
cout<<"s--s2 t-transition:< "<<k2<<" , "<<p2<<" > "<<endl;
#endif
if(t == m_cpInternalStr[k2+p-globalK+1]) // t = string[ k2+p-globalK+1],即(globalS, (globalK , i-1) )的下一字符为t,存在正确的儿子
{
endPoint=true; //记为找到结束结点,退出当前字符的加入
#ifdef DEBUG
cout<<"----找到结束结点,退出当前字符的加入"<<endl;
#endif
return globalS;
}
else
{
endPoint=false;
#ifdef DEBUG
cout<<"----不存在正确的儿子,隐式结点显式化"<<endl;
#endif
tagSuffixNode* parent = globalS; //若不存在正确的儿子,则活动结点(s,(k,i-1))显式化,并插入叶子结点
tagSuffixNode* r = __allocNode(); //new branch node 新的分支结点
r->m_bIsLeaf = false;
r->m_nEdgeLabelStartPos = k2;
r->m_nEdgeLabelEndPos = k2+p-globalK;
r->m_nActiveLen = parent->m_nActiveLen + (r->m_nEdgeLabelEndPos - r->m_nEdgeLabelStartPos + 1);
#ifdef DEBUG
cout<<"----创建的分支结点地址为: "<<r<<"起始,结束标记为: "<<r->m_nEdgeLabelStartPos<<" , "<<r->m_nEdgeLabelEndPos<<endl;
#endif
//original node
s2->m_nEdgeLabelStartPos = k2+p-globalK+1;
#ifdef DEBUG
cout<<"----原始的的分支结点地址为: "<< s2<<" 起始,结束标记为: "<<s2->m_nEdgeLabelStartPos<<" , "<<s2->m_nEdgeLabelEndPos<<endl;
#endif
r->m_tpRightSibling = parent->m_tpChild; //将r加入的s的孩子中,使child指针指向r
parent->m_tpChild->m_tpLeftSibling = r;
parent->m_tpChild = r;
s2->m_tpLeftSibling->m_tpRightSibling = s2->m_tpRightSibling;
if( s2->m_tpRightSibling != 0)
{
s2->m_tpRightSibling->m_tpLeftSibling = s2->m_tpLeftSibling;
}
else
{
parent->m_tpMostRightChild = s2->m_tpLeftSibling;
}
s2->m_tpLeftSibling=0; //将原先结点s2加入到分支结点后,为第一个结点,记得将s2的左兄弟指针清零
r->m_tpChild = s2;
r->m_tpMostRightChild = s2;
return r;
}
}
else
{
#ifdef DEBUG
cout<<"----当前活动结点为显式结点"<<endl;
#endif
tagSuffixNode *child;
child=__findTTransition(globalS,t);
if(child == 0) //不存在t-transition
{
endPoint=false;
return globalS;
}
else //存在t-transition, 结束
{
#ifdef DEBUG
cout<<"----存在"<<t<<"-transition, 找到结束结点,退出当前字符加入. "<<"边标记"<<child->m_nEdgeLabelStartPos<<" "<<child->m_nEdgeLabelEndPos<<endl;
#endif
endPoint=true;
return globalS;
}
}
}
void SuffixTree::canonize(tagSuffixNode* suffix , int p)
{
if(p<globalK)
{
#ifdef DEBUG
cout<<"----结点已为规范化"<<endl;
#endif
globalS=suffix;
return;
}
else
{
#ifdef DEBUG
cout<<"----对结点规范化"<<endl;
#endif
tagSuffixNode *s2;
int k2,p2;
s2=__findTTransition(suffix,m_cpInternalStr[globalK]);
#ifdef DEBUG
if(s2!=0)
cout<<"----"<<m_cpInternalStr[globalK]<<"--transition,地址为: "<<s2<<endl;
#endif
k2=s2->m_nEdgeLabelStartPos;
if(s2->m_bIsLeaf)
{
p2=labelOfEnd;
}
else
p2=s2->m_nEdgeLabelEndPos;
while( p2-k2 <= p-globalK )
{
globalK=globalK+p2-k2+1; //全局变量globalS,globalK仅在此代码段可能发生变化
suffix=s2;
#ifdef DEBUG
cout<<"suffix在此改变为: "<<suffix<<endl;
cout<<"globalK在此改变为: "<<globalK<<endl;
#endif
if(globalK<=p)
{
s2=__findTTransition(suffix,m_cpInternalStr[globalK]);
k2=s2->m_nEdgeLabelStartPos;
if(s2->m_bIsLeaf)
{
p2=labelOfEnd;
}
else
p2=s2->m_nEdgeLabelEndPos;
}
}
globalS=suffix;
return;
}
}
int SuffixTree::search(char *str)
{
if(str == 0)
return -1;
//TODO
//添加处理过程
int len = strlen(str);
tagSuffixNode* node = 0;
tagSuffixNode* cur = root;
for(int i=0; i<len; )
{
node = __findTTransition(cur, str[i]);
if(node == 0)
{
break;
}
else
{
int edgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
int remain = len - i;
if( remain <= edgeLen )
{
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
{
return -1;
}
else
{
return node->m_nEdgeLabelEndPos - node->m_nActiveLen + 1;
}
}
else
{
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
{
return -1;
}
else
{
i += edgeLen;
cur = node;
}
}
}
}
return -1;
}
int SuffixTree::numberOfSubchar(char *str)
{
if(str == 0)
return -1;
//TODO
//添加处理过程
int len = strlen(str);
tagSuffixNode* node = 0;
tagSuffixNode* cur = root;
for(int i=0; i<len; )
{
node = __findTTransition(cur, str[i]);
if(node == 0)
{
break;
}
else
{
int edgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
int remain = len - i;
if( remain <= edgeLen )
{
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
{
return -1;
}
else
{
return findNumberOfLeaf(node);
}
}
else
{
if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
{
return -1;
}
else
{
i += edgeLen;
cur = node;
}
}
}
}
return -1;
}
int SuffixTree::findNumberOfLeaf( tagSuffixNode* node )
{
int total=0;
if(node->m_bIsLeaf)
{
total=1;
return total;
}
tagSuffixNode* child=node->m_tpChild;
while( child != NULL)
{
total+= findNumberOfLeaf(child);
child=child->m_tpRightSibling;
}
return total;
}
tagSuffixNode* SuffixTree::getRoot()
{
return root;
}
tagSuffixNode* SuffixTree:: __allocNode()
{
tagSuffixNode* node = new tagSuffixNode;
node->init();
return node;
}
tagSuffixNode* SuffixTree::__findTTransition(tagSuffixNode* node, char c)
{
//返回node的存在c-transition的子结点child
//assert(node != 0);
if(node == 0)
{
printf("__findTTransition error\n");
exit(1);
}
//若node为top结点,则其总是存在到root的transition
if(node == top)
return root;
tagSuffixNode* child = node->m_tpChild;
while(child != 0)
{
if( m_cpInternalStr[child->m_nEdgeLabelStartPos] == c)
return child;
child = child->m_tpRightSibling;
}
return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -