⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractwords.cs

📁 KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法 * 主要功能: 中英文分词
💻 CS
📖 第 1 页 / 共 2 页
字号:
                            {
                                m_GameNodes.Add(obj);
                            }
                        }
                    }
                    deep = 0;
                    nodes.Clear();
                }
            }
            else
            {
                nodes.Add(begin);
                deep++;

                T_WordInfo last = (T_WordInfo)words[begin];

                bool nextStep = false;
                bool reach = false;
                int endPos = last.Position + last.Word.Length - 1;

                int oldDeep = deep;
                int oldSpace = spaceNum;

                for (int i = begin + 1; i <= end; i++)
                {
                    T_WordInfo cur = (T_WordInfo)words[i];

                    if (endPos < cur.Position + cur.Word.Length - 1)
                    {
                        endPos = cur.Position + cur.Word.Length - 1;
                    }


                    if (last.Position + last.Word.Length <= cur.Position)
                    {

                        nextStep = true;

                        if (reach)
                        {
                            reach = false;
                            spaceNum = oldSpace;
                            deep = oldDeep;
                            nodes.RemoveAt(nodes.Count - 1);
                        }

                        spaceNum += cur.Position - (last.Position + last.Word.Length);
                        List<int> oneNodes;
                        oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);

                        if (oneNodes != null)
                        {
                            bool select = false;

                            if (m_MinSpace > spaceNum ||
                                (m_MinSpace == spaceNum && deep < m_MinDeep))
                            {
                                select = true;
                            }
                            else if (m_MinDeep == deep && m_MinSpace == spaceNum)
                            {
                                if (m_CompareByPos != null && m_MinSpace == 0)
                                {
                                    select = m_CompareByPos(words, m_GameNodes, oneNodes);
                                }
                                else
                                {
                                    select = CompareGroup(words, m_GameNodes, oneNodes, MatchDirection);
                                }
                            }


                            if (select)
                            {
                                reach = true;
                                nextStep = false;
                                m_MinDeep = deep;
                                m_MinSpace = spaceNum;
                                m_GameNodes.Clear();
                                foreach (int obj in oneNodes)
                                {
                                    m_GameNodes.Add(obj);
                                }
                            }
                            else
                            {
                                spaceNum = oldSpace;
                                deep = oldDeep;
                                nodes.RemoveRange(deep, nodes.Count - deep);
                            }
                        }
                        else
                        {
                            spaceNum = oldSpace;
                            deep = oldDeep;
                            nodes.RemoveRange(deep , nodes.Count - deep);
                        }
                    }
                }

                if (!nextStep)
                {
                    spaceNum += endPos - (last.Position + last.Word.Length-1);

                    List<int> ret = new List<int>();

                    foreach (int obj in nodes)
                    {
                        ret.Add(obj);
                    }

                    return ret;
                }


            }

            return null;
        }

        /// <summary>
        /// Extracts every dictionary word from the text via <c>ExtractFullText</c>,
        /// then, for each run of overlapping words, keeps only the words whose
        /// indices <c>GameTree</c> leaves in <c>m_GameNodes</c>.
        /// </summary>
        /// <param name="fullText">The full text to segment.</param>
        /// <returns>
        /// The selected words in the order they were chosen; an empty list when
        /// no word was extracted.
        /// </returns>
        public List<T_WordInfo> ExtractFullTextMaxMatch(String fullText)
        {
            List<T_WordInfo> candidates = ExtractFullText(fullText);
            List<T_WordInfo> selected = new List<T_WordInfo>();

            int groupStart = 0;

            while (groupStart < candidates.Count)
            {
                // Grow [groupStart, groupEnd] while the next word starts at or before
                // the furthest end position covered by the group so far.
                int groupEnd = groupStart;
                int coveredEnd = 0;

                // The "<= 16" bound caps the group size: deep nesting is rare, and when
                // it happens the group is cut short so the game tree does not have to
                // traverse too many levels and slow the system down.
                while (groupEnd < candidates.Count - 1 && groupEnd - groupStart <= 16)
                {
                    T_WordInfo member = candidates[groupEnd];
                    int memberEnd = member.Position + member.Word.Length - 1;

                    if (memberEnd > coveredEnd)
                    {
                        coveredEnd = memberEnd;
                    }

                    if (candidates[groupEnd + 1].Position > coveredEnd)
                    {
                        // The next word begins after everything in this group.
                        break;
                    }

                    groupEnd++;
                }

                if (groupEnd == groupStart)
                {
                    // A word that overlaps nothing is taken as-is.
                    selected.Add(candidates[groupStart]);
                    groupStart++;
                    continue;
                }

                // Reset the shared search state; the large initial minimums mean the
                // first candidate GameTree compares against them is smaller.
                int spaceNum = 0;
                int deep = 0;
                m_GameNodes = new List<int>();
                m_MinDeep = 65535;
                m_MinSpace = 65535 * 256;

                GameTree(candidates, new List<int>(), true, groupStart, groupEnd, ref spaceNum, ref deep);

                foreach (int chosen in m_GameNodes)
                {
                    selected.Add(candidates[chosen]);
                }

                groupStart = groupEnd + 1;
            }

            return selected;
        }


        /// <summary>
        /// Scans the text with the word DFA and collects every dictionary word
        /// found, including words that overlap or are contained in other words.
        /// </summary>
        /// <param name="fullText">The full text to scan; may be null or empty.</param>
        /// <returns>
        /// The matched words in discovery order (by start position, and for the
        /// same start position in the order the DFA reports them); an empty list
        /// when the text is null/empty or nothing matches.
        /// </returns>
        public List<T_WordInfo> ExtractFullText(String fullText)
        {
            List<T_WordInfo> words = new List<T_WordInfo>();

            if (String.IsNullOrEmpty(fullText))
            {
                return words;
            }

            T_DfaUnit cur = null;   // current DFA state; null means no match in progress
            bool find = false;      // true while a match that started at pos is in progress
            int pos = 0;            // start index of the match in progress
            int i = 0;

            while (true)
            {
                if (i >= fullText.Length)
                {
                    if (!find)
                    {
                        break;
                    }

                    // The text ended while a match was still in progress. Handle it
                    // like a failed transition and rescan from the character after the
                    // match start; otherwise words beginning inside the unfinished
                    // match (e.g. "b" inside a trailing "ab") would be missed.
                    // pos grows on every backtrack, so the loop terminates.
                    find = false;
                    cur = null;
                    i = pos + 1;
                    continue;
                }

                // NOTE(review): Next(null, ch) presumably starts from the DFA's
                // initial state - confirm against the DFA implementation.
                cur = m_WordDfa.Next(cur, fullText[i]);
                if (cur != null && !find)
                {
                    // First accepted character: remember where this match starts.
                    pos = i;
                    find = true;
                }

                if (find)
                {
                    if (cur == null)
                    {
                        // Transition failed; contained words may exist, so backtrack
                        // to the character after the match start.
                        find = false;
                        i = pos + 1;
                        continue;
                    }
                    else if (cur.QuitWord != null)
                    {
                        // The current state completes a dictionary word: record it.
                        T_WordInfo wordInfo = new T_WordInfo();
                        wordInfo.Word = cur.QuitWord;
                        wordInfo.Position = pos;
                        wordInfo.Rank = m_WordDfa.GetRank(wordInfo.Word);
                        wordInfo.Tag = cur.Tag;
                        words.Add(wordInfo);

                        if (cur.Childs == null)
                        {
                            // No longer word can extend this one; contained words may
                            // exist, so backtrack to the character after the match start.
                            find = false;
                            cur = null;
                            i = pos + 1;
                            continue;
                        }
                    }
                }

                i++;
            }

            return words;
        }



    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -