📄 simpledictseg.cs
字号:
itemName.Value, e1.Message));
}
}
}
catch (Exception e)
{
WriteLog(String.Format("Load config fail, errmsg:{0}", e.Message));
}
}
/// <summary>
/// Saves the current configuration to an XML configuration file.
/// </summary>
/// <remarks>
/// Writes a <c>KTDictSeg</c> root element that contains, for every entry returned by
/// <c>GetCfgItems()</c>, an XML comment followed by an <c>Item</c> element carrying
/// <c>Name</c> and <c>Value</c> attributes. Failures while writing or closing are
/// logged through <c>WriteLog</c> and are not rethrown. A failure to open the file
/// (thrown by the writer's constructor) still propagates to the caller.
/// </remarks>
/// <param name="fileName">Path of the configuration file to write.</param>
public void SaveConfig(String fileName)
{
    System.Xml.XmlTextWriter writer = new System.Xml.XmlTextWriter(fileName, Encoding.UTF8);
    try
    {
        try
        {
            writer.Formatting = System.Xml.Formatting.Indented;
            writer.WriteStartDocument();
            writer.WriteStartElement("KTDictSeg");
            foreach (CfgItem item in GetCfgItems())
            {
                writer.WriteComment(item.Comment);
                writer.WriteStartElement("Item");
                writer.WriteAttributeString("Name", item.Pi.Name);

                // A property whose current value is null is stored as an empty string
                // instead of aborting the whole save with a NullReferenceException.
                object value = item.Pi.GetValue(this, null);
                writer.WriteAttributeString("Value", value == null ? String.Empty : value.ToString());

                writer.WriteEndElement(); //Item
            }
            writer.WriteEndElement(); //KTDictSeg
            writer.WriteEndDocument();
            writer.Flush();
        }
        finally
        {
            // Always release the underlying file, even when one of the writes above
            // threw; previously the handle stayed open on every failure path.
            writer.Close();
        }
    }
    catch (Exception e)
    {
        WriteLog(String.Format("Save config fail, errmsg:{0}", e.Message));
    }
}
#endregion
/// <summary>
/// Writes one line, prefixed with the current local date and time, to the file
/// named by <c>LogFileName</c> using <c>CFile.WriteLine</c>.
/// </summary>
/// <param name="log">Message text placed after the timestamp.</param>
private void WriteLog(String log)
{
    try
    {
        CFile.WriteLine(
            LogFileName,
            String.Format("{0} {1}", DateTime.Now, log),
            "utf-8");
    }
    catch
    {
        // Any failure while logging is swallowed so that WriteLog itself never throws.
    }
}
/// <summary>
/// Adds up the <c>Frequency</c> of every word selected by <paramref name="list"/>.
/// </summary>
/// <param name="words">Word list; the <c>Tag</c> of each selected word is cast to <c>T_DictStruct</c>.</param>
/// <param name="list">Indexes into <paramref name="words"/>.</param>
/// <returns>The sum of the selected words' frequencies (0 for an empty index list).</returns>
double GetFreqWeight(List<T_WordInfo> words, List<int> list)
{
    double total = 0;
    foreach (int wordIndex in list)
    {
        T_DictStruct entry = (T_DictStruct)words[wordIndex].Tag;
        total += entry.Frequency;
    }
    return total;
}
/// <summary>
/// Counts the adjacent word pairs in a candidate for which
/// <c>m_PosBinRule.Match</c> returns true.
/// </summary>
/// <param name="words">Word list that the indexes refer to.</param>
/// <param name="list">Indexes into <paramref name="words"/>, in candidate order.</param>
/// <returns>Number of matching neighbouring pairs (0 when fewer than two indexes are given).</returns>
int GetPosWeight(List<T_WordInfo> words, List<int> list)
{
    int matches = 0;
    for (int next = 1; next < list.Count; next++)
    {
        T_WordInfo left = words[list[next - 1]];
        T_WordInfo right = words[list[next]];
        if (m_PosBinRule.Match(left.Word, right.Word))
        {
            matches += 1;
        }
    }
    return matches;
}
/// <summary>
/// Compares two candidates first by their <c>GetPosWeight</c> score and, when the
/// scores are equal, by their <c>GetFreqWeight</c> total.
/// </summary>
/// <param name="words">Word list that both index lists refer to.</param>
/// <param name="pre">Indexes of the first candidate.</param>
/// <param name="cur">Indexes of the second candidate.</param>
/// <returns>true when <paramref name="cur"/> scores strictly higher than <paramref name="pre"/>; otherwise false.</returns>
bool CompareByPos(List<T_WordInfo> words, List<int> pre, List<int> cur)
{
    int preScore = GetPosWeight(words, pre);
    int curScore = GetPosWeight(words, cur);
    if (preScore != curScore)
    {
        return preScore < curScore;
    }

    // Part-of-speech scores are tied: decide on total word frequency instead.
    double preFrequency = GetFreqWeight(words, pre);
    double curFrequency = GetFreqWeight(words, cur);
    return preFrequency < curFrequency;
}
/// <summary>
/// Chooses between two candidates, giving priority to word frequency.
/// </summary>
/// <remarks>
/// When exactly one candidate contains a word longer than one character, that
/// candidate wins regardless of frequency. Otherwise the candidates are compared
/// by the frequency of their least frequent word.
/// </remarks>
/// <param name="words">Word list that both index lists refer to.</param>
/// <param name="pre">Indexes of the first candidate.</param>
/// <param name="cur">Indexes of the second candidate.</param>
/// <returns>true when <paramref name="cur"/> wins over <paramref name="pre"/>; otherwise false.</returns>
private bool SelectByFreq(List<T_WordInfo> words, List<int> pre, List<int> cur)
{
    double minPreFreq;
    int maxPreLength;
    GetMinFreqAndMaxLength(words, pre, out minPreFreq, out maxPreLength);

    double minCurFreq;
    int maxCurLength;
    GetMinFreqAndMaxLength(words, cur, out minCurFreq, out maxCurLength);

    // A candidate built only from single characters is never preferred over one
    // that contains a longer word, so frequency is ignored in that case.
    bool preHasLongWord = maxPreLength > 1;
    bool curHasLongWord = maxCurLength > 1;
    if (preHasLongWord != curHasLongWord)
    {
        return curHasLongWord;
    }

    return minCurFreq > minPreFreq;
}

/// <summary>
/// Finds the lowest frequency and the longest word length among the selected words.
/// </summary>
/// <param name="words">Word list that the indexes refer to.</param>
/// <param name="indexes">Indexes into <paramref name="words"/>.</param>
/// <param name="minFreq">Lowest frequency found; positive infinity when no index is given.</param>
/// <param name="maxLength">Longest word length found; 0 when no index is given.</param>
private static void GetMinFreqAndMaxLength(List<T_WordInfo> words, List<int> indexes, out double minFreq, out int maxLength)
{
    // Positive infinity is a true upper bound. The former fixed sentinel of
    // 1000000000 silently capped the minimum for frequencies at or above it.
    minFreq = double.PositiveInfinity;
    maxLength = 0;
    foreach (int index in indexes)
    {
        double freq = ((T_DictStruct)words[index].Tag).Frequency;
        if (freq < minFreq)
        {
            minFreq = freq;
        }

        int length = words[index].Word.Length;
        if (length > maxLength)
        {
            maxLength = length;
        }
    }
}
/// <summary>
/// Creates the rule objects, each constructed with <c>m_POS</c>, and stores them in
/// <c>m_Rules</c> in the order: number merging, POS pair rule, name matching.
/// </summary>
private void InitRules()
{
    IRule[] rules = new IRule[3];
    m_Rules = rules;

    // The POS pair rule is also kept in its own field because other methods call it directly.
    PosBinRule posBinRule = new PosBinRule(m_POS);
    m_PosBinRule = posBinRule;

    rules[0] = new MergeNumRule(m_POS);
    rules[1] = posBinRule;

    MatchName matchName = new MatchName(m_POS);
    m_MatchNameRule = matchName;
    rules[2] = matchName;
}
/// <summary>
/// Initialises a new instance: sets m_MatchName and m_FilterStopWords to false,
/// the match direction to left-to-right, creates the word extractor and the CPOS
/// object, records the current time in m_LastSaveTime and builds the rule set.
/// </summary>
public CSimpleDictSeg()
{
m_MatchName = false;
m_FilterStopWords = false;
m_MatchDirection = T_Direction.LeftToRight;
m_ExtractWords = new CExtractWords();
// Hand CompareByPos to the extractor as its CompareByPosEvent callback.
m_ExtractWords.CompareByPosEvent = CompareByPos;
m_POS = new CPOS();
m_LastSaveTime = DateTime.Now;
// InitRules passes m_POS to every rule it constructs, so it must run after m_POS is assigned.
InitRules();
}
/// <summary>
/// Merges consecutive tokens that together form a decimal number.
/// </summary>
/// <remarks>
/// Scanning begins at <paramref name="start"/> and continues while each token starts
/// with an ASCII digit, a full-width digit (U+FF10..U+FF19) or, at most once, a '.'.
/// Only the first character of a token is inspected. An empty token or any other
/// leading character ends the scan.
/// </remarks>
/// <param name="words">Token list; every element is cast to <c>string</c>.</param>
/// <param name="start">Index of the first token to examine.</param>
/// <param name="end">On return, the index of the first token that was not merged.</param>
/// <returns>The concatenation of the merged tokens; an empty string when none qualified.</returns>
private String MergeFloat(ArrayList words, int start, ref int end)
{
    StringBuilder merged = new StringBuilder();
    bool dotSeen = false;
    end = start;
    int i;
    for (i = start; i < words.Count; i++)
    {
        string word = (string)words[i];
        if (word == "")
        {
            break;
        }

        char first = word[0];

        // The previous code tested the ASCII range '0'..'9' twice, leaving the second
        // clause without effect. The second range is presumably the full-width digits;
        // it is written with escapes so text normalisation cannot collapse it again.
        bool isDigit = (first >= '0' && first <= '9')
            || (first >= '\uFF10' && first <= '\uFF19');

        if (!isDigit)
        {
            // Only a single decimal point is allowed inside one number.
            if (first != '.' || dotSeen)
            {
                break;
            }
            dotSeen = true;
        }

        merged.Append(word);
    }
    end = i;
    return merged.ToString();
}
/// <summary>
/// Merges consecutive tokens that look like the pieces of an e-mail address.
/// </summary>
/// <remarks>
/// Only the first character of each token is inspected. Tokens starting with an
/// ASCII letter or digit are always accepted. A token starting with '@' is accepted
/// once. A token starting with '.' is accepted unless another '.' token was accepted
/// since the last letter/digit token. An empty token or anything else ends the scan.
/// </remarks>
/// <param name="words">Token list; every element is cast to <c>string</c>.</param>
/// <param name="start">Index of the first token to examine.</param>
/// <param name="end">On return, the index of the first token that was not merged.</param>
/// <returns>The concatenation of the merged tokens; an empty string when none qualified.</returns>
private String MergeEmail(ArrayList words, int start, ref int end)
{
    StringBuilder merged = new StringBuilder();
    bool atSeen = false;
    bool dotBlocked = false; // true after a '.' token until the next letter/digit token
    end = start;

    int pos = start;
    while (pos < words.Count)
    {
        string token = (string)words[pos];
        if (token.Length == 0)
        {
            break;
        }

        char lead = token[0];
        bool asciiLetter = (lead >= 'a' && lead <= 'z') || (lead >= 'A' && lead <= 'Z');
        bool asciiDigit = lead >= '0' && lead <= '9';

        if (asciiLetter || asciiDigit)
        {
            dotBlocked = false;
        }
        else if (lead == '@' && !atSeen)
        {
            atSeen = true;
        }
        else if (lead == '.' && !dotBlocked)
        {
            dotBlocked = true;
        }
        else
        {
            break;
        }

        merged.Append(token);
        pos++;
    }

    end = pos;
    return merged.ToString();
}
/// <summary>
/// Merges tokens into a special English term.
/// If the dictionary contains special English terms such as U.S.A, C++ or C#,
/// the English words and letters produced by the initial segmentation have to
/// be joined back together.
/// </summary>
/// <param name="extractWords">Extractor whose <c>ExtractFullText</c> is applied to the joined text.</param>
/// <param name="words">Token list; every element is cast to <c>string</c>.</param>
/// <param name="start">Index of the first token to examine.</param>
/// <param name="end">Set to the index of the first token that was not merged, but only when a term is returned.</param>
/// <returns>
/// The joined text when <c>ExtractFullText</c> returns exactly one word whose length
/// equals that of the joined text; otherwise null (and <paramref name="end"/> is untouched).
/// </returns>
private String MergeEnglishSpecialWord(CExtractWords extractWords, ArrayList words, int start, ref int end)
{
    StringBuilder buffer = new StringBuilder();
    int stop = start;
    while (stop < words.Count)
    {
        string token = (string)words[stop];

        // An empty token, or a separator such as space / carriage return / line feed, ends the scan.
        if (token.Trim().Length == 0)
        {
            break;
        }

        // A token that starts with a Chinese character (U+4E00..U+9FA5) ends the scan.
        char lead = token[0];
        if (lead >= 0x4e00 && lead <= 0x9fa5)
        {
            break;
        }

        buffer.Append(token);
        stop++;
    }

    String candidate = buffer.ToString();
    List<T_WordInfo> found = extractWords.ExtractFullText(candidate);

    if (found.Count != 1)
    {
        return null;
    }

    if (found[0].Word.Length != candidate.Length)
    {
        return null;
    }

    end = stop;
    return candidate;
}
#region 维护停用词
/// <summary>
/// 从停用词字典中加载停用词
/// 停用词字典的格式:
/// 文本文件格式,一个词占一行
/// </summary>
/// <param name="chsFileName">中文停用词</param>
/// <param name="engFileName">英文停用词</param>
/// <remarks>对文件存取的异常不做异常处理,由调用者进行异常处理</remarks>
public void LoadStopwordsDict(String chsFileName, String engFileName)
{
int numChrStop = 0;//统计中文停用词数目,并作为Value值插入哈希表
int numEngStop = 0;//统计英文停用词数目,并作为Value值插入哈希表
try
{
StreamReader swChrFile = new StreamReader(chsFileName, Encoding.GetEncoding("UTF-8"));
StreamReader swEngFile = new StreamReader(engFileName, Encoding.GetEncoding("UTF-8"));
//加载中文停用词
while (!swChrFile.EndOfStream)
{
//按行读取中文停用词
string strChrStop = swChrFile.ReadLine();
//如果哈希表中不包括该停用词则添加到哈希表中
if (!m_ChsStopwordTbl.Contains(strChrStop))
{
m_ChsStopwordTbl.Add(strChrStop, numChrStop);
numChrStop++;
}
}
//加载英文停用词
while (!swEngFile.EndOfStream)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -