📄 simpledictseg.cs

📁 KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法 * 主要功能: 中英文分词
💻 CS
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                    foreach (T_WordInfo wordInfo in words)
                    {
                        
                        if (lastPos < wordInfo.Position)
                        {
/*
                            String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);

                            InsertWordToArray(unMatchWord, retWords);
*/
                            //中间有未匹配词，将单个字逐个加入
                            for (int j = lastPos; j < wordInfo.Position; j++)
                            {
                                InsertWordToArray(word[j].ToString(), retWords);
                            }
 
                        }
                         

                        lastPos = wordInfo.Position + wordInfo.Word.Length ;

                        //统计中文姓名的后缀
                        if (AutoStudy && lstIsName)
                        {
                            T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                            if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
                            {
                                m_MatchNameRule.AddBefore(wordInfo.Word);
                            }

                            lstIsName = false;
                        }

                        //统计中文姓名的前缀
                        //如总统，主席等
                        if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
                        {
                            if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
                            {
                                T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                                m_MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                            }

                            lstIsName = true;
                        }


                        InsertWordToArray(wordInfo.Word, retWords);


                    }

                    if (lastPos < word.Length)
                    {
                        //尾部有未匹配词，将单个字逐个加入
                        for (int j = lastPos; j < word.Length; j++)
                        {
                            InsertWordToArray(word[j].ToString(), retWords);
                        }

                        //InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
                    }
                }

                i++;
            }

            return retWords;
        }

        /// <summary>
        /// Counts one sighting of a candidate unknown word in m_UnknownWordsDictMgr and,
        /// once its count exceeds UnknownWordsThreshold (and AutoInsertUnknownWords is set),
        /// adds it to, or merges it into, the dictionary held by m_DictMgr.
        /// </summary>
        /// <param name="word">Candidate word; only words of length 2 or 3 are counted.</param>
        /// <param name="Pos">Part-of-speech flag(s) OR-ed into the candidate's record.</param>
        private void TrafficUnknownWord(String word, T_POS Pos)
        {
            // Words shorter than 2 or longer than 3 characters are ignored.
            bool trackable = word.Length > 1 && word.Length <= 3;
            if (!trackable)
            {
                return;
            }

            T_DictStruct candidate = m_UnknownWordsDictMgr.GetWord(word);

            if (candidate == null)
            {
                // First sighting: create the record with a count of 1.
                m_UnknownWordsDictMgr.InsertWord(word, 1, (int)Pos);
                return;
            }

            // A blocked unknown word is represented by a part-of-speech value of 0;
            // such a word is never counted or inserted.
            if (candidate.Pos == 0)
            {
                return;
            }

            candidate.Pos |= (int)Pos;
            candidate.Frequency++;

            bool promote = candidate.Frequency > UnknownWordsThreshold && AutoInsertUnknownWords;
            if (!promote)
            {
                return;
            }

            T_DictStruct existing = m_DictMgr.GetWord(word);
            if (existing != null)
            {
                // Already present in m_DictMgr: fold the accumulated flags and count into it.
                existing.Pos |= candidate.Pos;
                existing.Frequency += candidate.Frequency;
            }
            else
            {
                m_DictMgr.InsertWord(word, candidate.Frequency, candidate.Pos);

                // NOTE(review): the tag registered here is the unknown-word record, whose
                // Frequency is reset to 0 just below - confirm this sharing is intended.
                m_ExtractWords.InsertWordToDfa(word, candidate);
                m_POS.AddWordPos(word, candidate.Pos);
            }

            // Restart counting for this candidate.
            candidate.Frequency = 0;
        }

        /// <summary>
        /// Recovers unknown words: every run of consecutive tokens accepted by
        /// m_POS.IsUnknowOneCharWord is concatenated into a single token. When AutoStudy
        /// is on, candidate words are also reported to TrafficUnknownWord for counting.
        /// </summary>
        /// <param name="words">Token list to scan; it is read but not modified here.</param>
        /// <returns>A new list containing the merged tokens.</returns>
        private List<String> RecoverUnknowWord(List<String> words)
        {
            List<String> result = new List<String>();

            int cur = 0;
            // Index up to which runs of single characters in 0x4e00-0x9fa5 have been counted.
            int counted = 0;

            while (cur < words.Count)
            {
                String token = words[cur];

                // The final token is copied through as-is and ends the scan.
                if (cur == words.Count - 1)
                {
                    result.Add(token);
                    break;
                }

                if (!m_POS.IsUnknowOneCharWord(token))
                {
                    // Join the consecutive single characters into one word, assume it is
                    // an unknown word, and count it (each run is counted only once).
                    if (AutoStudy && counted <= cur && token.Length == 1 && token[0] >= 0x4e00 && token[0] <= 0x9fa5)
                    {
                        counted = cur + 1;
                        String run = token;

                        if (counted < words.Count)
                        {
                            while (counted < words.Count)
                            {
                                String following = words[counted];
                                if (following.Length != 1 || following[0] < 0x4e00 || following[0] > 0x9fa5)
                                {
                                    break;
                                }

                                run += following;
                                counted++;
                            }

                            TrafficUnknownWord(run, T_POS.POS_A_NZ);
                        }
                    }

                    result.Add(token);
                    cur++;
                    continue;
                }

                // Concatenate this token with every directly following token that
                // IsUnknowOneCharWord also accepts. cur is in range on entry because
                // the last index was handled above.
                String joined = token;
                cur++;

                while (cur < words.Count && m_POS.IsUnknowOneCharWord(words[cur]))
                {
                    joined += words[cur];
                    cur++;
                }

                if (AutoStudy)
                {
                    TrafficUnknownWord(joined, T_POS.POS_A_NZ);

                    // Also count the longer word formed by appending the single
                    // characters in 0x4e00-0x9fa5 that directly follow the run.
                    if (counted < cur && token[0] >= 0x4e00 && token[0] <= 0x9fa5)
                    {
                        counted = cur;

                        if (counted < words.Count)
                        {
                            String extended = joined;

                            while (counted < words.Count)
                            {
                                String following = words[counted];
                                if (following.Length != 1 || following[0] < 0x4e00 || following[0] > 0x9fa5)
                                {
                                    break;
                                }

                                extended += following;
                                counted++;
                            }

                            TrafficUnknownWord(extended, T_POS.POS_A_NZ);
                        }
                    }
                }

                // cur already points past the merged run, so it is not advanced again.
                result.Add(joined);
            }

            return result;
        }


        /// <summary>
        /// Segments the text without filtering stop words: runs PreSegment, applies the
        /// rules in m_Rules, then passes the result through RecoverUnknowWord.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>The list of words produced by RecoverUnknowWord.</returns>
        private List<String> SegmentNoStopWord(String str)
        {
            List<String> tokens = PreSegment(str);
            List<String> ruled = new List<String>();

            int cursor = 0;
            while (cursor < tokens.Count)
            {
                bool handled = false;

                foreach (IRule rule in m_Rules)
                {
                    // The MatchName rule is skipped while m_MatchName is false.
                    if (!m_MatchName && rule is MatchName)
                    {
                        continue;
                    }

                    // A positive return value is taken as the index to resume from.
                    int resumeAt = rule.ProcRule(tokens, cursor, ruled);
                    if (resumeAt > 0)
                    {
                        cursor = resumeAt;
                        handled = true;
                        break;
                    }
                }

                if (!handled)
                {
                    // No rule returned a positive index: copy the token through unchanged.
                    ruled.Add(tokens[cursor]);
                    cursor++;
                }
            }

            List<String> recovered = RecoverUnknowWord(ruled);

            if (!AutoStudy)
            {
                return recovered;
            }

            // Increment Frequency on the tag of every output word that m_ExtractWords knows.
            foreach (String word in recovered)
            {
                T_DictStruct entry = (T_DictStruct)m_ExtractWords.GetTag(word);

                if (entry != null)
                {
                    entry.Frequency++;
                }
            }

            return recovered;
        }

        /// <summary>
        /// Periodically saves the latest dictionary and statistics: when AutoStudy is on
        /// and more than AutoSaveInterval seconds have passed since m_LastSaveTime,
        /// updates m_LastSaveTime and calls SaveDict.
        /// </summary>
        private void SaveDictOnTime()
        {
            if (AutoStudy)
            {
                TimeSpan sinceLastSave = DateTime.Now - m_LastSaveTime;
                bool due = sinceLastSave.TotalSeconds > AutoSaveInterval;

                if (due)
                {
                    // The timestamp is updated before the save is attempted.
                    m_LastSaveTime = DateTime.Now;
                    SaveDict();
                }
            }
        }

        /// <summary>
        /// Segments the text and returns a list of word-info records.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// One T_WordInfo per kept word. Position is the summed length of all preceding
        /// segmented words, including stop words that were filtered out.
        /// NOTE(review): this equals the offset in <paramref name="str"/> only if
        /// segmentation drops no characters - confirm against PreSegment.
        /// </returns>
        public List<T_WordInfo> SegmentToWordInfos(String str)
        {
            // Save the dictionary if the auto-save interval has elapsed.
            SaveDictOnTime();

            List<String> tokens = SegmentNoStopWord(str);

            List<T_WordInfo> infos = new List<T_WordInfo>();
            int offset = 0;

            foreach (String token in tokens)
            {
                // A token is dropped when filtering is on and either stop-word table has it.
                bool isStopWord = m_FilterStopWords
                    && (m_ChsStopwordTbl[token] != null || m_EngStopwordTbl[token] != null);

                if (!isStopWord)
                {
                    T_WordInfo info = new T_WordInfo();
                    info.Word = token;
                    info.Position = offset;
                    infos.Add(info);
                }

                // Dropped tokens still advance the running position.
                offset += token.Length;
            }

            return infos;
        }

        /// <summary>
        /// Segments the text and returns only the list of words.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// The segmented words. When m_FilterStopWords is set, words found in
        /// m_ChsStopwordTbl or m_EngStopwordTbl are left out.
        /// </returns>
        public List<String> Segment(String str)
        {
            // Save the dictionary if the auto-save interval has elapsed.
            SaveDictOnTime();

            List<String> tokens = SegmentNoStopWord(str);

            if (!m_FilterStopWords)
            {
                return tokens;
            }

            List<String> kept = new List<String>();

            foreach (String token in tokens)
            {
                // Keep the token only when neither stop-word table contains it.
                if (m_ChsStopwordTbl[token] == null && m_EngStopwordTbl[token] == null)
                {
                    kept.Add(token);
                }
            }

            return kept;
        }

        #endregion
    }

}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -