📄 seglist.cs
字号:
string strline = reader.ReadLine();
SegList list;
Hashtable child = new Hashtable();
long i = 0;
while (strline != null && strline.Trim() != "")
{
i++;
strChar1 = strline.Substring(0, 1);
strChar2 = strline.Substring(1, 1);
if (!htWords.ContainsKey(strChar1))
{
father = new Hashtable();
htWords.Add(strChar1, father);
}
else
{
father = (Hashtable)htWords[strChar1];
}
if (!father.ContainsKey(strChar2))
{
list = new SegList();
if (strline.Length > 2)
list.Add(strline.Substring(2));
else
list.Add("null");
father.Add(strChar2, list);
}
else
{
list = (SegList)father[strChar2];
if (strline.Length > 2)
{
list.Add(strline.Substring(2));
}
else
{
list.Add("null");
}
father[strChar2] = list;
}
htWords[strChar1] = father;
strline = reader.ReadLine();
}
try
{
reader.Close();
}
catch
{ }
SetCache("jcms_dict", htWords);
}
htWords = (Hashtable)GetCache("jcms_dict");
alNoise = LoadWords(NoisePath, alNoise);
alNumber = LoadWords(NumberPath, alNumber);
alWord = LoadWords(WordPath, alWord);
alPrefix = LoadWords(PrefixPath, alPrefix);
TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
}
/// <summary>
/// Loads a UTF-8 text file into a new ArrayList, one entry per line.
/// </summary>
/// <param name="strPath">Path of the text file to read.</param>
/// <param name="list">
/// Not read: a fresh list is always created and returned. The parameter is
/// kept so that existing callers keep compiling.
/// </param>
/// <returns>
/// A new ArrayList holding every line of the file in file order; empty lines
/// are stored as empty strings.
/// </returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    ArrayList words = new ArrayList();

    // The using statement closes the reader even when ReadLine throws
    // part-way through the file, so the file handle is never leaked.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string strline;
        while ((strline = reader.ReadLine()) != null)
        {
            words.Add(strline);
        }
    }

    return words;
}
/// <summary>
/// Writes the word list to the console, one word per line.
/// Each line is the first-level key, the second-level key and one element
/// of the SegList stored under those keys, concatenated in that order.
/// </summary>
public void OutWords()
{
    foreach (DictionaryEntry firstLevel in htWords)
    {
        // Every first-level value is itself a table keyed by the second character.
        Hashtable secondLevelTable = (Hashtable)firstLevel.Value;
        foreach (DictionaryEntry secondLevel in secondLevelTable)
        {
            SegList tails = (SegList)secondLevel.Value;
            for (int index = 0; index < tails.Count; index++)
            {
                string firstKey = firstLevel.Key.ToString();
                string secondKey = secondLevel.Key.ToString();
                Console.WriteLine(firstKey + secondKey + tails.GetElem(index).ToString());
            }
        }
    }
}
/// <summary>
/// Writes each element of an ArrayList to the console on its own line.
/// </summary>
/// <param name="list">Items to print; a null list prints nothing.</param>
public void OutArrayList(ArrayList list)
{
    if (list != null)
    {
        foreach (object item in list)
        {
            Console.WriteLine(item.ToString());
        }
    }
}
/// <summary>
/// 分词过程,不支持回车
/// </summary>
/// <param name="strText">要分词的文本</param>
/// <returns>分词后的文本</returns>
public string SegmentText(string strText)
{
strText = (strText + "$").Trim();
if (htWords == null) return strText;
if (strText.Length < 3) return strText;
DateTime start = DateTime.Now;
int length = 0;
int preFix = 0;
bool word = false;
bool number = false;
string reText = "";
string strPrefix = "";
string strLastChar = "";
string strLastWords = Separator;
for (int i = 0; i < strText.Length - 1; i++)
{
#region 对于每一个字的处理过程
string strChar1 = strText.Substring(i, 1);
string strChar2 = strText.Substring(i + 1, 1).Trim();
bool yes;
SegList l;
Hashtable h;
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
if (strChar1 == " ")
{
if ((number || word) && strLastChar != Separator) reText += this.Separator;
yes = true;
}
else
yes = false;
int CharType = GetCharType(strChar1);
switch (CharType)
{
case 1:
#region 如果是数字,如果数字的上一位是字母要和后面的数字分开
if (word)
{
reText += Separator;
}
word = false;
number = true;
strLastWords = "";
break;
#endregion
case 2:
case 5:
#region 如果是字母
if (number)
strLastWords = Separator;
else
strLastWords = "";
word = true;
number = false;
break;
#endregion
case 3:
case 4:
#region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
//上一个字是否为字母
if (word) reText += Separator;
#region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
if (number && CharType != 4)
{
h = (Hashtable)htWords["n"];
if (h.ContainsKey(strChar1))
{
l = (SegList)h[strChar1];
if (l.Contains(strChar2))
{
reText += strChar1 + strChar2 + Separator;
yes = true;
i++;
}
else if (l.Contains("null"))
{
reText += strChar1 + Separator;
yes = true;
}
}
else
reText += Separator;
}
#endregion
//非汉字数字的汉字
if (CharType == 3)
{
word = false;
number = false;
strLastWords = Separator;
}
else
{
word = false;
number = true;
strLastWords = "";
}
//第二级哈希表取出
h = (Hashtable)htWords[strChar1];
//第二级哈希表是否包含关键字
if (h.ContainsKey(strChar2))
{
#region 第二级包含关键字
//取出ArrayList对象
l = (SegList)h[strChar2];
//遍历每一个对象 看是否能组合成词
for (int j = 0; j < l.Count; j++)
{
bool have = false;
string strChar3 = l.GetElem(j).ToString();
//对于每一个取出的词进行检测,看是否匹配,长度保护
if ((strChar3.Length + i + 2) < strText.Length)
{
//向i+2后取出m长度的字
string strChar = strText.Substring(i + 2, strChar3.Length).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = 0;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + 1;
have = true;
yes = true;
break;
}
}
else if ((strChar3.Length + i + 2) == strText.Length)
{
string strChar = strText.Substring(i + 2).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = 0;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + 1;
have = true;
yes = true;
break;
}
}
if (!have && j == l.Count - 1 && l.Contains("null") && !yes)
{
if (preFix == 1)
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else if (preFix > 1)
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else
{
if (CharType == 4) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
break;
}
else if (have)
{
break;
}
}
#endregion
//如果没有匹配还可能有一种情况,这个词语只有两个字,以这两个字开头的词语不存在
if (!yes && l.Contains("null"))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -