📄 simpledictseg.cs
字号:
foreach (T_WordInfo wordInfo in words)
{
if (lastPos < wordInfo.Position)
{
/*
String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);
InsertWordToArray(unMatchWord, retWords);
*/
//中间有未匹配词,将单个字逐个加入
for (int j = lastPos; j < wordInfo.Position; j++)
{
InsertWordToArray(word[j].ToString(), retWords);
}
}
lastPos = wordInfo.Position + wordInfo.Word.Length ;
//统计中文姓名的后缀
if (AutoStudy && lstIsName)
{
T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
{
m_MatchNameRule.AddBefore(wordInfo.Word);
}
lstIsName = false;
}
//统计中文姓名的前缀
//如总统,主席等
if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
{
if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
{
T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
m_MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
}
lstIsName = true;
}
InsertWordToArray(wordInfo.Word, retWords);
}
if (lastPos < word.Length)
{
//尾部有未匹配词,将单个字逐个加入
for (int j = lastPos; j < word.Length; j++)
{
InsertWordToArray(word[j].ToString(), retWords);
}
//InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
}
}
i++;
}
return retWords;
}
/// <summary>
/// Records one sighting of a candidate unknown word in the unknown-word
/// dictionary and, once it has been seen often enough, promotes it into the
/// main dictionary.
/// </summary>
/// <param name="word">Candidate text; only lengths 2 and 3 are tracked.</param>
/// <param name="Pos">Part-of-speech flag to OR into the candidate's entry.</param>
private void TrafficUnknownWord(String word, T_POS Pos)
{
    // Candidates shorter than two or longer than three characters are ignored.
    int length = word.Length;
    if (!(length >= 2 && length <= 3))
    {
        return;
    }

    int posFlag = (int)Pos;
    T_DictStruct candidate = m_UnknownWordsDictMgr.GetWord(word);

    // First sighting: create the entry with a frequency of 1 and stop.
    if (candidate == null)
    {
        m_UnknownWordsDictMgr.InsertWord(word, 1, posFlag);
        return;
    }

    // A part-of-speech value of 0 marks a blocked unknown word; never count it.
    if (candidate.Pos == 0)
    {
        return;
    }

    candidate.Pos |= posFlag;
    candidate.Frequency++;

    // Promotion needs both the threshold to be exceeded and auto-insert enabled.
    if (!(candidate.Frequency > UnknownWordsThreshold && AutoInsertUnknownWords))
    {
        return;
    }

    T_DictStruct existing = m_DictMgr.GetWord(word);
    if (existing != null)
    {
        // Already in the main dictionary: merge flags and accumulated count.
        existing.Pos |= candidate.Pos;
        existing.Frequency += candidate.Frequency;
    }
    else
    {
        m_DictMgr.InsertWord(word, candidate.Frequency, candidate.Pos);
        // NOTE(review): the tag registered here is the unknown-word entry, whose
        // Frequency is reset to 0 just below - confirm that this is intended
        // rather than the entry newly inserted into m_DictMgr.
        m_ExtractWords.InsertWordToDfa(word, candidate);
        m_POS.AddWordPos(word, candidate.Pos);
    }

    // Restart the count for this candidate after a promotion/merge.
    candidate.Frequency = 0;
}
/// <summary>
/// Recovers unknown (out-of-dictionary) words: each run of consecutive tokens
/// for which <c>m_POS.IsUnknowOneCharWord</c> returns true is merged into a
/// single word. When <c>AutoStudy</c> is on, candidate words are also passed
/// to <c>TrafficUnknownWord</c> so that their statistics are updated.
/// </summary>
/// <param name="words">Token list to scan; it is only read here.</param>
/// <returns>A new list holding the merged words; all other tokens are copied through unchanged.</returns>
private List<String> RecoverUnknowWord(List<String> words)
{
List<String> retWords = new List<String>();
// i: read cursor over words.
int i = 0;
// j: end of the last statistics look-ahead; the j < i / j <= i tests below
// appear to keep the same run from being counted again - TODO confirm.
int j = 0;
while (i < words.Count)
{
String w = (String)words[i];
// The last token has no successor to merge with: copy it as-is and stop.
if (i == words.Count-1)
{
retWords.Add(w);
break;
}
if (m_POS.IsUnknowOneCharWord(w))
{
// Start a merged word and absorb every following token that also qualifies.
String word = w;
i++;
// words[i] is in range on entry: i was not the last index before the increment.
while (m_POS.IsUnknowOneCharWord(words[i]))
{
word += (String)words[i];
i++;
if (i >= words.Count)
{
break;
}
}
if (AutoStudy)
{
TrafficUnknownWord(word, T_POS.POS_A_NZ);
// Join all consecutive single characters into one word, assume it is an
// unknown word, and count it. 0x4e00-0x9fa5 is the code-point range used
// here to recognise a Chinese character.
if (j < i && w[0] >= 0x4e00 && w[0] <= 0x9fa5)
{
// Look ahead from the end of the merged run without moving i.
j = i;
if (j < words.Count)
{
String longWord = word;
while (words[j].Length == 1 && words[j][0] >= 0x4e00 && words[j][0] <= 0x9fa5)
{
longWord += words[j];
j++;
if (j >= words.Count)
{
break;
}
}
// NOTE(review): when the loop above appends nothing, longWord equals word
// and the same text is reported a second time - confirm this is intended.
TrafficUnknownWord(longWord, T_POS.POS_A_NZ);
}
}
}
retWords.Add(word);
// i already points past the merged run, so skip the i++ at the bottom.
continue;
}
else
{
if (AutoStudy)
{
// Join all consecutive single characters into one word, assume it is an
// unknown word, and count it (statistics only; the output keeps w as-is).
if (j <= i && w.Length == 1 && w[0] >= 0x4e00 && w[0] <= 0x9fa5)
{
j = i + 1;
String word = w;
if (j < words.Count)
{
while (words[j].Length == 1 && words[j][0] >= 0x4e00 && words[j][0] <= 0x9fa5)
{
word += words[j];
j++;
if (j >= words.Count)
{
break;
}
}
// If nothing was appended, word is still one character long.
TrafficUnknownWord(word, T_POS.POS_A_NZ);
}
}
}
retWords.Add(w);
}
i++;
}
return retWords;
}
/// <summary>
/// Segments the input text without filtering stop words.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>The words produced after rule processing and unknown-word recovery.</returns>
private List<String> SegmentNoStopWord(String str)
{
    List<String> tokens = PreSegment(str);
    List<String> merged = new List<String>();

    int cursor = 0;
    while (cursor < tokens.Count)
    {
        // Offer the current position to each rule in turn; the first rule that
        // returns a positive index has consumed input and decides where to resume.
        bool handled = false;
        foreach (IRule rule in m_Rules)
        {
            // The name-matching rule only takes part when name matching is enabled.
            if (m_MatchName || !(rule is MatchName))
            {
                int resumeAt = rule.ProcRule(tokens, cursor, merged);
                if (resumeAt > 0)
                {
                    cursor = resumeAt;
                    handled = true;
                    break;
                }
            }
        }

        // No rule applied: pass the token through unchanged.
        if (!handled)
        {
            merged.Add(tokens[cursor]);
            cursor++;
        }
    }

    List<String> result = RecoverUnknowWord(merged);
    if (!AutoStudy)
    {
        return result;
    }

    // Learning mode: bump the frequency of every output word that has a tag.
    foreach (String token in result)
    {
        T_DictStruct entry = (T_DictStruct)m_ExtractWords.GetTag(token);
        if (entry == null)
        {
            continue;
        }
        entry.Frequency++;
    }
    return result;
}
/// <summary>
/// Periodically saves the latest dictionary and statistics. Does nothing
/// unless <c>AutoStudy</c> is enabled.
/// </summary>
private void SaveDictOnTime()
{
    if (AutoStudy)
    {
        double elapsedSeconds = (DateTime.Now - m_LastSaveTime).TotalSeconds;
        if (elapsedSeconds > AutoSaveInterval)
        {
            // Stamp the save time before saving, as the interval is measured from here.
            m_LastSaveTime = DateTime.Now;
            SaveDict();
        }
    }
}
/// <summary>
/// Segments the text and returns a list of word-info records.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>
/// One <c>T_WordInfo</c> per emitted word. <c>Position</c> is the summed length
/// of all words that came before it in the segmentation output, including
/// stop words that were filtered out.
/// </returns>
public List<T_WordInfo> SegmentToWordInfos(String str)
{
    // Save the dictionary on schedule.
    SaveDictOnTime();

    List<T_WordInfo> infos = new List<T_WordInfo>();
    int offset = 0;
    foreach (String token in SegmentNoStopWord(str))
    {
        // The stop-word tables are only consulted when filtering is enabled.
        bool isStopWord = m_FilterStopWords
            && (m_ChsStopwordTbl[token] != null || m_EngStopwordTbl[token] != null);

        if (!isStopWord)
        {
            T_WordInfo info = new T_WordInfo();
            info.Word = token;
            info.Position = offset;
            infos.Add(info);
        }

        // Filtered words still advance the running offset.
        offset += token.Length;
    }
    return infos;
}
/// <summary>
/// Segments the text and returns only the list of words.
/// </summary>
/// <param name="str">Text to segment.</param>
/// <returns>
/// The segmented words; when stop-word filtering is enabled, words found in
/// either stop-word table are left out.
/// </returns>
public List<String> Segment(String str)
{
    // Save the dictionary on schedule.
    SaveDictOnTime();

    List<String> tokens = SegmentNoStopWord(str);
    if (m_FilterStopWords)
    {
        List<String> kept = new List<String>();
        foreach (String token in tokens)
        {
            // Keep a word only if neither stop-word table contains it.
            if (m_ChsStopwordTbl[token] == null && m_EngStopwordTbl[token] == null)
            {
                kept.Add(token);
            }
        }
        return kept;
    }

    // Filtering disabled: hand back the segmentation result itself.
    return tokens;
}
#endregion
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -