⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 suffixtree.cpp

📁 后缀树的Ukkon算法实现
💻 CPP
字号:

#include "stdafx.h"
//#include "suffixtree.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;





SuffixTree::SuffixTree(char *str)
    :root(0), top(0)
{
    m_nStrLen = strlen(str) + 1;
    m_cpInternalStr = new char[m_nStrLen+2];
	m_cpInternalStr[0]='\0';
	sprintf(m_cpInternalStr+1, "%s#", str);
}

void SuffixTree::constuct()
{
	int i;

	 top = __allocNode();													//创建top结点
	 root = __allocNode();													//创建root结点
    if(top==0 || root == 0)
    {
        printf("__allocNode error!\n");
        exit(1);
    }

#ifdef DEBUG
	cout<<"top结点地址为:"<<top<<endl;
	cout<<"root结点地址为:"<<root<<endl;
#endif

	root->m_tpSuffixLink=top;
	root->m_nEdgeLabelStartPos=-1;
	root->m_nEdgeLabelEndPos=-1;
	root->m_bIsLeaf=false;

	top-> m_tpChild = root;
    top->m_tpMostRightChild = root;

	globalS=root;												
	globalK=1;
	i=0;
	labelOfEnd=0;
	while(m_cpInternalStr[i+1]!='\0')
	{
		i++;
		labelOfEnd++;

#ifdef DEBUG
		cout<<"当前字符:  "<<m_cpInternalStr[i]<<endl;		
#endif

		update(i);	

		canonize(globalS,i);

#ifdef DEBUG
		cout<<"--结束结点为:  ( "<<globalS<<" : "<<globalK<<" , "<<i<<" )"<<endl;	
#endif

	}
	return;
}


void SuffixTree::update(int i)
{
	//(globalS,  (globalK ,  i-1 ) is the canonical reference pair for the active point

	//cout<<"--开始活动结点为: ( "<<globalS<<" : "<<globalK<<" , "<<i-1<<" )"<<endl;

	tagSuffixNode *r;
	tagSuffixNode *oldr=root;
	r=testAndSplit(i-1,m_cpInternalStr[i]);	

	while(!endPoint)
	{

#ifdef DEBUG
		  cout<<"----"<<m_cpInternalStr[i]<<"--transition不存在, 创建叶子结点"<<endl;
#endif

		  tagSuffixNode* info = __allocNode();			
          info->m_nEdgeLabelStartPos = i;
          info->m_nEdgeLabelEndPos = m_nStrLen;

		 info->m_nActiveLen = r->m_nActiveLen + (info->m_nEdgeLabelEndPos - info->m_nEdgeLabelStartPos + 1);

#ifdef DEBUG
		 cout<<"-----创建的叶子结点地址为:   "<<info<<"   ";
		 cout<<"-----新插入的叶子结点开始结束标记为:  "<<info->m_nEdgeLabelStartPos<<" , "<< info->m_nEdgeLabelEndPos<<endl;
#endif

		  
		 if(r->m_tpChild == 0)
		 {
			  r->m_tpChild = info;
			  r->m_tpMostRightChild = info;
		 }
		 else
		 {			
			r->m_tpMostRightChild->m_tpRightSibling = info;
			info->m_tpLeftSibling = r->m_tpMostRightChild;
			info->m_tpRightSibling = 0;
			r->m_tpMostRightChild=info;
		 }

#ifdef DEBUG
		cout<<"--继续寻找结束结点!"<<endl;
#endif

		if(oldr!=root)
		{
			oldr->m_tpSuffixLink=r;

#ifdef DEBUG
			cout<<oldr<<"--->"<<r<<endl;
#endif
		}
		oldr=r;

#ifdef DEBUG
		cout<<"globalS->m_tpSuffixLink=:"<<globalS->m_tpSuffixLink<<"    globalK=:"<<globalK<<endl;
#endif

		canonize(globalS->m_tpSuffixLink,i-1);

#ifdef DEBUG
		cout<<"--->后缀链接为: ( "<<globalS<<" : "<<globalK<<" , "<<i-1<<" ) "<<endl;
#endif

		r=testAndSplit(i-1,m_cpInternalStr[i]);
	}

	if(oldr!=root)
	{
		oldr->m_tpSuffixLink=globalS;

#ifdef DEBUG
		cout<<oldr<<"--->"<<globalS<<endl;
#endif

	}

	return;
}


tagSuffixNode* SuffixTree::testAndSplit(int p,char t)				//p=i-1,t为string[i]
{

#ifdef DEBUG
	cout<<"--活动结点为: ( "<<globalS<<" : "<<globalK<<" , "<<p<<" )"<<endl;
#endif

	if(globalK<=p) //当前活动结点为隐式结点
	{
#ifdef DEBUG
		cout<<"----当前活动结点为隐式结点"<<endl;
#endif

		tagSuffixNode *s2;
		int k2,p2;
		s2=__findTTransition(globalS,m_cpInternalStr[globalK]);
		k2=s2->m_nEdgeLabelStartPos;				
		p2=s2->m_nEdgeLabelEndPos;

#ifdef DEBUG
		cout<<"s--s2 t-transition:< "<<k2<<" , "<<p2<<" > "<<endl;
#endif

		if(t == m_cpInternalStr[k2+p-globalK+1])		// t = string[ k2+p-globalK+1],即(globalS, (globalK , i-1) )的下一字符为t,存在正确的儿子
		{
			endPoint=true;										//记为找到结束结点,退出当前字符的加入

#ifdef DEBUG
			cout<<"----找到结束结点,退出当前字符的加入"<<endl;
#endif
			return globalS;
		}
		else
		{
					endPoint=false;

#ifdef DEBUG
					cout<<"----不存在正确的儿子,隐式结点显式化"<<endl;
#endif
				

					tagSuffixNode* parent = globalS;					//若不存在正确的儿子,则活动结点(s,(k,i-1))显式化,并插入叶子结点
                
                    tagSuffixNode* r = __allocNode();				//new branch node 新的分支结点

                    r->m_bIsLeaf = false;
                    r->m_nEdgeLabelStartPos = k2;
                    r->m_nEdgeLabelEndPos =   k2+p-globalK;
					r->m_nActiveLen = parent->m_nActiveLen + (r->m_nEdgeLabelEndPos - r->m_nEdgeLabelStartPos + 1);

#ifdef DEBUG
					cout<<"----创建的分支结点地址为:   "<<r<<"起始,结束标记为: "<<r->m_nEdgeLabelStartPos<<" , "<<r->m_nEdgeLabelEndPos<<endl;
#endif

              
                    //original node
                    s2->m_nEdgeLabelStartPos = k2+p-globalK+1;

#ifdef DEBUG
					cout<<"----原始的的分支结点地址为:   "<< s2<<"   起始,结束标记为: "<<s2->m_nEdgeLabelStartPos<<" , "<<s2->m_nEdgeLabelEndPos<<endl;
#endif
				
                    r->m_tpRightSibling = parent->m_tpChild;					//将r加入的s的孩子中,使child指针指向r
                    parent->m_tpChild->m_tpLeftSibling = r;
                    parent->m_tpChild = r;

                    s2->m_tpLeftSibling->m_tpRightSibling = s2->m_tpRightSibling;
                    if( s2->m_tpRightSibling != 0)
                    {
                        s2->m_tpRightSibling->m_tpLeftSibling = s2->m_tpLeftSibling;
                    }
                    else
                    {
                        parent->m_tpMostRightChild = s2->m_tpLeftSibling;
                    }

					s2->m_tpLeftSibling=0;				//将原先结点s2加入到分支结点后,为第一个结点,记得将s2的左兄弟指针清零
                    r->m_tpChild = s2;
                    r->m_tpMostRightChild = s2;
                            
					return r;

		}

	}
	else				
	{
#ifdef DEBUG
		cout<<"----当前活动结点为显式结点"<<endl;
#endif

		tagSuffixNode *child;
		child=__findTTransition(globalS,t);

		if(child == 0)			//不存在t-transition
		{	
			endPoint=false;
			return globalS;
		}
		else																//存在t-transition, 结束
		{
#ifdef DEBUG
			cout<<"----存在"<<t<<"-transition, 找到结束结点,退出当前字符加入.    "<<"边标记"<<child->m_nEdgeLabelStartPos<<"   "<<child->m_nEdgeLabelEndPos<<endl;
#endif
			endPoint=true;
			return globalS;	
		}

	}
}


void SuffixTree::canonize(tagSuffixNode* suffix , int p)
{
	if(p<globalK)
	{
#ifdef DEBUG
		cout<<"----结点已为规范化"<<endl;
#endif

		globalS=suffix;
		return;
	}
	else
	{

#ifdef DEBUG
		cout<<"----对结点规范化"<<endl;
#endif

		tagSuffixNode *s2;
		int k2,p2;
		s2=__findTTransition(suffix,m_cpInternalStr[globalK]);

#ifdef DEBUG
		if(s2!=0)
			cout<<"----"<<m_cpInternalStr[globalK]<<"--transition,地址为:   "<<s2<<endl;
#endif

		k2=s2->m_nEdgeLabelStartPos;

		if(s2->m_bIsLeaf)
		{
			p2=labelOfEnd;
		}
		else
			p2=s2->m_nEdgeLabelEndPos;

		
		while( p2-k2 <= p-globalK )
		{
			globalK=globalK+p2-k2+1;				//全局变量globalS,globalK仅在此代码段可能发生变化
			suffix=s2;
#ifdef DEBUG
			cout<<"suffix在此改变为:  "<<suffix<<endl;
			cout<<"globalK在此改变为:  "<<globalK<<endl;
#endif

			if(globalK<=p)
			{
				s2=__findTTransition(suffix,m_cpInternalStr[globalK]);
				k2=s2->m_nEdgeLabelStartPos;
				if(s2->m_bIsLeaf)
				{
					p2=labelOfEnd;
				}
				else
					p2=s2->m_nEdgeLabelEndPos;
			}
		}

		globalS=suffix;
		return;
	}
}

int SuffixTree::search(char *str)
{
    if(str == 0)
        return -1;
    //TODO
    //添加处理过程

    int len = strlen(str);

    tagSuffixNode* node = 0;
    tagSuffixNode* cur = root;
    for(int i=0; i<len; )
    {
        node = __findTTransition(cur, str[i]);
        if(node == 0)
        {
            break;
        }
        else
        {
            int edgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
            int remain = len - i;
            if( remain <= edgeLen )
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
                {
                    return -1;
                }
                else
                {
                    return node->m_nEdgeLabelEndPos - node->m_nActiveLen + 1;
                }
            }
            else
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
                {
                    return -1;
                }
                else
                {
                    i += edgeLen;
                    cur = node;
                }
            }
        }
    }
    return -1;
}


int SuffixTree::numberOfSubchar(char *str)
{
	if(str == 0)
        return -1;
    //TODO
    //添加处理过程

    int len = strlen(str);

    tagSuffixNode* node = 0;
    tagSuffixNode* cur = root;
    for(int i=0; i<len; )
    {
        node = __findTTransition(cur, str[i]);
        if(node == 0)
        {
            break;
        }
        else
        {
            int edgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
            int remain = len - i;
            if( remain <= edgeLen )
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
                {
                    return -1;
                }
                else
                {
                    return findNumberOfLeaf(node);
                }
            }
            else
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
                {
                    return -1;
                }
                else
                {
                    i += edgeLen;
                    cur = node;
                }
            }
        }
    }
    return -1;

    
}

int SuffixTree::findNumberOfLeaf( tagSuffixNode* node )
{
	int total=0;
	if(node->m_bIsLeaf)
	{
		total=1;
		return total;
	}
	 tagSuffixNode* child=node->m_tpChild;
	 while( child != NULL)
	 {
		 total+= findNumberOfLeaf(child);
		 child=child->m_tpRightSibling;
	 }
	 return total;

}

tagSuffixNode* SuffixTree::getRoot()
{
	return root;
}

tagSuffixNode*  SuffixTree:: __allocNode()
{
    tagSuffixNode* node = new tagSuffixNode;
    node->init();
    return node;
}

tagSuffixNode*  SuffixTree::__findTTransition(tagSuffixNode* node, char c)
{
	//返回node的存在c-transition的子结点child
	
	//assert(node != 0);
	if(node == 0)
	{
		printf("__findTTransition error\n");
		exit(1);
	}
	//若node为top结点,则其总是存在到root的transition
	if(node == top)
		return root;

    tagSuffixNode* child = node->m_tpChild;
    while(child != 0)
    {
        if( m_cpInternalStr[child->m_nEdgeLabelStartPos] == c)
            return child;

        child = child->m_tpRightSibling;
    }
    return 0;
}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -