📄 simpledictseg.cs
字号:
{
//Read the English stop-word file line by line
string strEngStop = swEngFile.ReadLine();
//如果哈希表中不包括该停用词则添加到哈希表中
if (!m_EngStopwordTbl.Contains(strEngStop))
{
m_EngStopwordTbl.Add(strEngStop, numEngStop);
numEngStop++;
}
}
swChrFile.Close();
swEngFile.Close();
}
catch
{
throw;
}
}
/// <summary>
/// Writes every key of the Chinese stop-word table to a text file, one word per line.
/// </summary>
/// <param name="fileName">Path of the file to write; an existing file is overwritten.</param>
/// <remarks>
/// File-access exceptions are not handled here and propagate to the caller.
/// The stream and the writer are disposed even when a write fails, so the
/// file handle is not left open after an error.
/// </remarks>
public void SaveChsStopwordDict(String fileName)
{
    // FileMode.Create makes a new file or truncates an existing one.
    using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("UTF-8")))
    {
        // Only the keys (the words themselves) are persisted; the stored values are not written.
        foreach (DictionaryEntry entry in m_ChsStopwordTbl)
        {
            sw.WriteLine(entry.Key.ToString());
        }
    }
}
/// <summary>
/// Writes every key of the English stop-word table to a text file, one word per line.
/// </summary>
/// <param name="fileName">Path of the file to write; an existing file is overwritten.</param>
/// <remarks>
/// File-access exceptions are not handled here and propagate to the caller.
/// The stream and the writer are disposed even when a write fails, so the
/// file handle is not left open after an error.
/// </remarks>
public void SaveEngStopwordDict(String fileName)
{
    // FileMode.Create makes a new file or truncates an existing one.
    using (FileStream fs = new FileStream(fileName, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("UTF-8")))
    {
        // Only the keys (the words themselves) are persisted; the stored values are not written.
        foreach (DictionaryEntry entry in m_EngStopwordTbl)
        {
            sw.WriteLine(entry.Key.ToString());
        }
    }
}
/// <summary>
/// Adds a word to the Chinese stop-word table.
/// </summary>
/// <param name="word">Stop word to add.</param>
/// <remarks>A word that is already in the table is left untouched.</remarks>
public void AddChsStopword(String word)
{
    bool alreadyKnown = m_ChsStopwordTbl.Contains(word);
    if (!alreadyKnown)
    {
        // The value stored with the word is the table size before the insert.
        int nextIndex = m_ChsStopwordTbl.Count;
        m_ChsStopwordTbl.Add(word, nextIndex);
    }
}
/// <summary>
/// Removes a word from the Chinese stop-word table.
/// </summary>
/// <param name="word">Stop word to remove.</param>
public void DelChsStopword(String word)
{
// Original author's note: a word that is not in the table results in no action.
// NOTE(review): this relies on the table's Remove semantics; the table's type is not visible here - confirm.
m_ChsStopwordTbl.Remove(word);
}
/// <summary>
/// Adds a word to the English stop-word table.
/// </summary>
/// <param name="word">Stop word to add.</param>
/// <remarks>A word that is already in the table is left untouched.</remarks>
public void AddEngStopword(String word)
{
    bool alreadyKnown = m_EngStopwordTbl.Contains(word);
    if (!alreadyKnown)
    {
        // The value stored with the word is the table size before the insert.
        int nextIndex = m_EngStopwordTbl.Count;
        m_EngStopwordTbl.Add(word, nextIndex);
    }
}
/// <summary>
/// Removes a word from the English stop-word table.
/// </summary>
/// <param name="word">Stop word to remove.</param>
public void DelEngStopword(String word)
{
// Original author's note: a word that is not in the table results in no action.
// NOTE(review): this relies on the table's Remove semantics; the table's type is not visible here - confirm.
m_EngStopwordTbl.Remove(word);
}
#endregion
#region 加载字典
/// <summary>
/// Loads the dictionaries without clearing the stored word frequencies.
/// </summary>
public void LoadDict()
{
    const bool clearFrequencies = false;
    LoadDict(clearFrequencies);
}
/// <summary>
/// Loads the name statistics, the main dictionary and the unknown-word
/// dictionary from the dictionary directory.
/// </summary>
/// <param name="clear">
/// When true, the frequency of every loaded word is set to zero and the
/// name statistics are cleared.
/// </param>
public void LoadDict(bool clear)
{
    string nameTrafficFile = m_DictPath + "Name.dct";
    string mainDictFile = m_DictPath + "Dict.dct";
    string unknownWordsFile = m_DictPath + "UnknownWords.dct";

    // Name prefix/suffix statistics table.
    m_MatchNameRule.LoadNameTraffic(nameTrafficFile);

    // Main dictionary; the dictionary manager is pointed at the freshly loaded data.
    m_Dict = Dict.LoadFromBinFileEx(mainDictFile);
    m_DictMgr.Dict = m_Dict;

    foreach (T_DictStruct entry in m_Dict.Dicts)
    {
        if (clear)
        {
            entry.Frequency = 0;
        }

        // Hand each entry to the word extractor and to the part-of-speech table.
        m_ExtractWords.InsertWordToDfa(entry.Word, entry);
        m_POS.AddWordPos(entry.Word, entry.Pos);
    }

    // Unknown-word statistics dictionary; start with an empty one when no file exists.
    if (!File.Exists(unknownWordsFile))
    {
        m_UnknownWordsDict = new T_DictFile();
    }
    else
    {
        m_UnknownWordsDict = Dict.LoadFromBinFileEx(unknownWordsFile);
    }
    m_UnknownWordsDictMgr.Dict = m_UnknownWordsDict;

    if (clear)
    {
        m_MatchNameRule.ClearNameTraffic();
    }

    m_MatchNameRule.TrafficUnknownWordHandle = TrafficUnknownWord;
}
/// <summary>
/// Saves the name statistics, the main dictionary and the unknown-word
/// dictionary to the dictionary directory.
/// </summary>
public void SaveDict()
{
    string nameTrafficFile = m_DictPath + "Name.dct";
    m_MatchNameRule.SaveNameTraffic(nameTrafficFile);

    // Copy the frequency held by the word extractor's tag back into each
    // dictionary entry before the dictionary is written.
    foreach (T_DictStruct entry in m_Dict.Dicts)
    {
        T_DictStruct tagged = (T_DictStruct)m_ExtractWords.GetTag(entry.Word);
        if (tagged != null)
        {
            entry.Frequency = tagged.Frequency;
        }
    }

    string mainDictFile = m_DictPath + "Dict.dct";
    Dict.SaveToBinFileEx(mainDictFile, m_Dict);

    string unknownWordsFile = m_DictPath + "UnknownWords.dct";
    Dict.SaveToBinFileEx(unknownWordsFile, m_UnknownWordsDict);
}
#endregion
#region 分词属性
bool m_MatchName;

/// <summary>
/// Gets or sets whether Chinese personal names are matched.
/// </summary>
public bool MatchName
{
    get { return this.m_MatchName; }
    set { this.m_MatchName = value; }
}
T_Direction m_MatchDirection;

/// <summary>
/// Gets or sets the matching direction.
/// </summary>
/// <remarks>
/// The original author's note states that the default is left-to-right
/// (forward) matching. NOTE(review): the field has no initializer here, so
/// this depends on the zero value of T_Direction - confirm.
/// </remarks>
public T_Direction MatchDirection
{
    get { return this.m_MatchDirection; }
    set { this.m_MatchDirection = value; }
}
bool m_FilterStopWords;

/// <summary>
/// Gets or sets whether stop-word filtering is enabled.
/// </summary>
/// <remarks>
/// Setting the property to true loads the stop-word files from the
/// dictionary directory when either stop-word table is still empty.
/// </remarks>
public bool FilterStopWords
{
    get { return this.m_FilterStopWords; }
    set
    {
        // The table counts are only inspected when filtering is being switched on.
        if (value && (m_ChsStopwordTbl.Count == 0 || m_EngStopwordTbl.Count == 0))
        {
            LoadStopwordsDict(m_DictPath + CHS_STOP_WORD_FILENAME, m_DictPath + ENG_STOP_WORD_FILENAME);
        }

        this.m_FilterStopWords = value;
    }
}
#endregion
#region 分词
/// <summary>
/// Appends a word to the end of a word list.
/// </summary>
/// <param name="word">Word to append.</param>
/// <param name="arr">List that receives the word.</param>
private void InsertWordToArray(String word, List<String> arr)
{
    // Inserting at index Count places the item after the current last element.
    arr.Insert(arr.Count, word);
}
/// <summary>
/// 预分词
/// </summary>
/// <param name="str">要分词的句子</param>
/// <returns>预分词后的字符串输出</returns>
private List<String> PreSegment(String str)
{
ArrayList initSeg = new ArrayList();
if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
{
return new List<String>();
}
List<String> retWords = new List<String>();
int i = 0;
m_ExtractWords.MatchDirection = MatchDirection;
while (i < initSeg.Count)
{
String word = (String)initSeg[i];
if (word == "")
{
word = " ";
}
if (i < initSeg.Count - 1)
{
bool mergeOk = false;
if (((word[0] >= '0' && word[0] <= '9') ||(word[0] >= '0' && word[0] <= '9')) &&
((word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9') ||
(word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9'))
)
{
//合并浮点数
word = MergeFloat(initSeg, i, ref i);
mergeOk = true;
}
else if ((word[0] >= 'a' && word[0] <= 'z') ||
(word[0] >= 'A' && word[0] <= 'Z')
)
{
//合并成英文专业名词
String specialEnglish = MergeEnglishSpecialWord(m_ExtractWords, initSeg, i, ref i);
if (specialEnglish != null)
{
InsertWordToArray(specialEnglish, retWords);
continue;
}
//合并邮件地址
if ((String)initSeg[i + 1] != "")
{
if (((String)initSeg[i + 1])[0] == '@')
{
word = MergeEmail(initSeg, i, ref i);
mergeOk = true;
}
}
}
if (mergeOk)
{
InsertWordToArray(word, retWords);
continue;
}
}
if (word[0] < 0x4e00 || word[0] > 0x9fa5)
{
//英文或符号,直接加入
InsertWordToArray(word, retWords);
}
else
{
List<T_WordInfo> words = m_ExtractWords.ExtractFullTextMaxMatch(word);
int lastPos = 0;
bool lstIsName = false; //前一个词是人名
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -