⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 suffixtree.cpp

📁 后缀数组的算法实现
💻 CPP
字号:
#include "suffixtree.h"
#include <list>
#include <vector>
#include <cstring>
#include <cassert>
#include <cstdio>
using namespace std;

CSuffixTree::CSuffixTree(char *str)
    :m_nActiveLen(0),     
     m_tpRoot(0),    
     m_tpActiveNode(0)
{
    m_nStrLen = strlen(str) + 1;
    m_cpInternalStr = new char[m_nStrLen+1];

    sprintf(m_cpInternalStr, "%s#", str);
#ifdef DEBUG    
    m_cIndex = 'A';
#endif    
}

CSuffixTree::~CSuffixTree()
{
    deleteTree();
    delete[] m_cpInternalStr;
}

void CSuffixTree::deleteTree()
{
    list<tagSuffixNode*>::iterator iter = m_tagNodeList.begin();
    for(; iter != m_tagNodeList.end(); ++iter)
    {
        delete *iter;
    }

    m_tagNodeList.clear();
}


void CSuffixTree::reset(char *str)
{
    deleteTree();

    int tmp = strlen(str);
    if(tmp+1 > m_nStrLen)
    {
        m_nStrLen = tmp+1;
        delete[] m_cpInternalStr;
        m_cpInternalStr = new char[m_nStrLen+1];
    }
    else
    {
        m_nStrLen = tmp+1;
    }
    sprintf(m_cpInternalStr, "%s#", str);

    m_nActiveLen = 0;
    m_nStrLen = 0;
    m_tpActiveNode = 0;
    m_tpRoot = 0;

    m_tagFromNodeVec.clear();
    //reconstruct tree for new string
    construct();
}
void CSuffixTree::construct()
{
    m_tpRoot = __allocNode();
    if(m_tpRoot == 0)
    {
#ifdef DEBUG
        printf("__allocNode error!\n");
#endif
        return;
    }
    m_nActiveLen = 0;
    m_tpRoot->m_nActiveLen = 0;
    //initial active node
    m_tpActiveNode = m_tpRoot;

    m_tagToNodeVec.push_back(m_tpRoot);
    
    for(int i = 0; i<m_nStrLen; i++)
    {
#ifdef DEBUG
        printf("\ninsert %s\n", &(m_cpInternalStr[i]));
        printf("active node:[%c]\n", m_tpActiveNode->m_cIndex);
#endif
        //判断是否有后继节点
        if(m_tpActiveNode->m_tpSuffixLink != 0)
        {
            m_tpActiveNode = m_tpActiveNode->m_tpSuffixLink;
            m_nActiveLen--;
        }
        int pos = i;
        
#ifdef DEBUG        
        printf("new active node:[%c]\n", m_tpActiveNode->m_cIndex);
        printf("pos:%d\n", pos);
        printf("active length:%d\n", m_nActiveLen);
#endif        

        while(true)
        {
            //查找以当前后缀串的开始字符开始的子节点
            tagSuffixNode* node = __findChildBeginWithChar(m_tpActiveNode, m_cpInternalStr[i+m_nActiveLen]);
            //未找到以suffix[i]的开始字符开头的节点
            if(node == 0)
            {
#ifdef DEBUG
                printf("__findChildBeginWithChar null\n");
#endif                
                tagSuffixNode* child = __allocNode();

                //设置开始、结束位置的下标
                child->m_nEdgeLabelStartPos = pos;
                child->m_nEdgeLabelEndPos = m_nStrLen-1;
                //设置叶节点代表的后缀串的长度
                child->m_nActiveLen = m_nStrLen - i;
                //设置父节点
                child->m_tpParent = m_tpActiveNode;
                if( m_tpActiveNode->m_tpChild == 0)
                {
                    m_tpActiveNode->m_tpChild = child;
                    m_tpActiveNode->m_tpMostRightChild = child;
                }
                else
                {
                    m_tpActiveNode->m_tpMostRightChild->m_tpRightSibling = child;
                    child->m_tpLeftSibling = m_tpActiveNode->m_tpMostRightChild;
                    m_tpActiveNode->m_tpMostRightChild = child;
                }

                break;
            }
            else
            {
#ifdef DEBUG
                printf("__findChildBeginWithChar ok\n");
                printf("node start:%d\n", node->m_nEdgeLabelStartPos);
                printf("node end:%d\n", node->m_nEdgeLabelEndPos);
                printf("node index:%c\n", node->m_cIndex);
#endif
                //TODO
                //确定 m_nMinDistance
                int edgeStart = node->m_nEdgeLabelStartPos;
                int strStart = i + m_nActiveLen;
                bool split = false;
            
                for(; edgeStart<=node->m_nEdgeLabelEndPos; edgeStart++, strStart++)
                {
                    if( m_cpInternalStr[edgeStart] != m_cpInternalStr[strStart])
                    {
                        split = true;
                        break;
                    }
                }
                if(!split)
                {
                    m_tpActiveNode = node;
                    m_nActiveLen += node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
                    pos += node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
                    continue;
                }
                else
                {
                    tagSuffixNode* parent = node->m_tpParent;
                    //new branch node
                    tagSuffixNode* branch = __allocNode();
                    branch->m_bIsLeaf = false;
                    branch->m_nEdgeLabelStartPos = node->m_nEdgeLabelStartPos;
                    branch->m_nEdgeLabelEndPos = edgeStart-1;
                    branch->m_nActiveLen = parent->m_nActiveLen + (branch->m_nEdgeLabelEndPos - branch->m_nEdgeLabelStartPos + 1);
                    //original node
                    node->m_nEdgeLabelStartPos = edgeStart;

                    tagSuffixNode* info = __allocNode();
                    info->m_nEdgeLabelStartPos = strStart;
                    info->m_nEdgeLabelEndPos = m_nStrLen - 1;
                    info->m_nActiveLen = branch->m_nActiveLen + (info->m_nEdgeLabelEndPos - info->m_nEdgeLabelStartPos + 1);

                    branch->m_tpParent = parent;
                    branch->m_tpRightSibling = parent->m_tpChild;
                    parent->m_tpChild->m_tpLeftSibling = branch;
                    parent->m_tpChild = branch;

                    node->m_tpLeftSibling->m_tpRightSibling = node->m_tpRightSibling;
                    if( node->m_tpRightSibling != 0)
                    {
                        node->m_tpRightSibling->m_tpLeftSibling = node->m_tpLeftSibling;
                    }
                    else
                    {
                        parent->m_tpMostRightChild = node->m_tpLeftSibling;
                    }

                    branch->m_tpChild = info;
                    branch->m_tpMostRightChild = node;

                    info->m_tpRightSibling = node;
                    info->m_tpParent = branch;
                    
                    node->m_tpParent = branch;
                    node->m_tpLeftSibling = info;
                    node->m_tpRightSibling = 0;

                    
                    //设置节点的suffix link

                    bool setSuffix = false;
                    vector<tagSuffixNode*>::iterator iter = m_tagToNodeVec.begin();
                    for(; iter != m_tagToNodeVec.end(); ++iter)
                    {
                        tagSuffixNode* tmp = *iter;
                        if( strncmp( &(m_cpInternalStr[branch->m_nEdgeLabelEndPos - branch->m_nActiveLen + 2]), &(m_cpInternalStr[tmp->m_nEdgeLabelEndPos - tmp->m_nActiveLen + 1]), branch->m_nActiveLen - 1) == 0)
                        {
                            branch->m_tpSuffixLink = tmp;
                            setSuffix = true;
                            break;
#ifdef DEBUG
                            printf("branch[%c] suffix_link is branch[%c]\n", tmp->m_cIndex, branch->m_cIndex);
#endif                                
                        }
                    }
                    m_tagToNodeVec.push_back(branch);
                    
                    vector<tagSuffixNode*>::iterator iter2 = m_tagFromNodeVec.begin();
                    for(; iter2 != m_tagFromNodeVec.end(); ++iter2)
                    {
                        tagSuffixNode* tmp = *iter2;
                        //找到后缀节点,结束循环
                        if( strncmp( &(m_cpInternalStr[tmp->m_nEdgeLabelEndPos - tmp->m_nActiveLen + 2]), &(m_cpInternalStr[branch->m_nEdgeLabelEndPos - branch->m_nActiveLen + 1]), tmp->m_nActiveLen - 1) == 0)
                        {
                            tmp->m_tpSuffixLink = branch;
                            m_tagFromNodeVec.erase(iter2);
#ifdef DEBUG
                            printf("branch[%c] suffix_link is branch[%c]\n", tmp->m_cIndex, branch->m_cIndex);
#endif
                            break;
                        }
                    }

                    if( !setSuffix )
                    {
                        m_tagFromNodeVec.push_back(branch);
                    }

                    //已经将新的后缀字符串插入到树中,这时可以结束while了。
                    break;
                }
            }
        }
        //test
#ifdef DEBUG
        printf("print\n");
        printTree();
#endif        
    }
}

int CSuffixTree::search(char *str)
{
    if(str == 0)
        return -1;
    //TODO
    //添加处理过程

    int len = strlen(str);

    tagSuffixNode* node = 0;
    tagSuffixNode* cur = m_tpRoot;
    for(int i=0; i<len; )
    {
        node = __findChildBeginWithChar(cur, str[i]);
        if(node == 0)
        {
            break;
        }
        else
        {
            int edgeLen = node->m_nEdgeLabelEndPos - node->m_nEdgeLabelStartPos + 1;
            int remain = len - i;
            if( remain <= edgeLen )
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), remain) != 0)
                {
                    return -1;
                }
                else
                {
                    return node->m_nEdgeLabelEndPos - node->m_nActiveLen + 1;
                }
            }
            else
            {
                if( strncmp(&(str[i]), &(m_cpInternalStr[node->m_nEdgeLabelStartPos]), edgeLen) != 0)
                {
                    return -1;
                }
                else
                {
                    i += edgeLen;
                    cur = node;
                }
            }
        }
    }
    return -1;
}

tagSuffixNode* CSuffixTree::__allocNode()
{
    tagSuffixNode* node = new tagSuffixNode;

    m_tagNodeList.push_back(node);

    node->init();
#ifdef DEBUG    
    node->m_cIndex = m_cIndex;
    m_cIndex++;
#endif    
    return node;
}

tagSuffixNode* CSuffixTree::__findChildBeginWithChar(tagSuffixNode* node, char c)
{
    assert(node != 0);

    tagSuffixNode* child = node->m_tpChild;
    while(child != 0)
    {
        if( m_cpInternalStr[child->m_nEdgeLabelStartPos] == c)
            return child;

        child = child->m_tpRightSibling;
    }

    return 0;
}

void CSuffixTree::test()
{
    //printf("%s\n", m_cpInternalStr);
    printTree();
}
void CSuffixTree::printTree()
{
#ifdef DEBUG
    printf("[A]\n");
#endif
    __printHelper(m_tpRoot, 0);
}
void CSuffixTree::__printHelper(tagSuffixNode* node, int level)
{
    int start = node->m_nEdgeLabelStartPos;
    int end = node->m_nEdgeLabelEndPos;
    tagSuffixNode* child = node->m_tpChild;
    if( level > 0 )
    {
        for(int i=0; i<level; i++)
        {
            printf(" |");
        }
        printf(" + ");
        for(int j = start; j <= end; j++)
        {
            printf("%c", m_cpInternalStr[j]);
        }
#ifdef DEBUG        
         printf("[%c]", node->m_cIndex);
//         printf("[%d:%d:%d]", node->m_nEdgeLabelStartPos, node->m_nEdgeLabelEndPos, node->m_nActiveLen);
#endif        
        printf("\n");
    }

    while( child != 0 )
    {
        __printHelper(child, level+1);
        child = child->m_tpRightSibling;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -