📄 seglist.cs
字号:
{
if (preFix == 1)
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else if (preFix > 1)
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else
{
if (CharType == 4) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
}
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
if (CharType == 4 && GetCharType(strLastChar) == 4)
{
number = true;
}
else if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
break;
default:
#region 未知字符,可能是生僻字,也可能是标点符合之类
if (word && !yes)
{
reText += Separator;
}
else if (number && !yes)
{
reText += Separator;
}
number = false;
word = false;
strLastWords = this.Separator;
break;
#endregion
}
if (!yes && number || !yes && word)
{
reText += strChar1;
yes = true;
}
if (!yes)
{
#region 处理姓名问题
if (preFix == 0)
{
if (alPrefix.Contains(strChar1 + strChar2))
{
i++;
strPrefix = strChar1 + strChar2;
preFix++;
}
else if (alPrefix.Contains(strChar1))
{
if (!number)
{
strPrefix = strChar1;
preFix++;
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
else
{
if (preFix == 3)
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
else if (preFix > 0)
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
}
else
{
if (preFix == 3)
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
else if (preFix > 0)
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
}
}
#endregion
}
length = i;
#endregion
}
#region 最后防止最后一个字的丢失
if (length < strText.Length - 1)
{
string strLastChar1 = strText.Substring(strText.Length - 1).Trim();
string strLastChar2 = strText.Substring(strText.Length - 2).Trim();
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
if (preFix != 0)
{
reText += strPrefix + strLastChar1;
}
else
{
switch (GetCharType(strLastChar1))
{
case 1:
if (strLastChar1 != "." && strLastChar1 != ".")
reText += strLastChar1;
else
reText += Separator + strLastChar1;
break;
case 2:
case 5:
if (alWord.Contains(strLastChar2))
reText += strLastChar1;
break;
case 3:
case 4:
if ((number || word) && strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
default:
if (strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
}
}
if (reText.Length > 0) strLastChar = (reText.Substring(reText.Length - 1));
if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
return reText.Replace(" $", ""); //这里包含一个字的,则去掉
}
/// <summary>
/// Segmentation overload that can process the input line by line.
/// </summary>
/// <param name="strText">Text to segment.</param>
/// <param name="Enter">
/// When true, the text is split on '\n', every piece is segmented on its own and
/// "\r\n" is appended after each segmented piece. When false, the call is simply
/// forwarded to <c>SegmentText(strText)</c>.
/// </param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    string[] strArr = strText.Split('\n');

    // Accumulate in a builder: the previous "reText +=" re-copied the whole
    // result for every input line, which is quadratic for long texts.
    System.Text.StringBuilder sbResult = new System.Text.StringBuilder();
    for (int i = 0; i < strArr.Length; i++)
    {
        sbResult.Append(SegmentText(strArr[i]));
        sbResult.Append("\r\n");
    }

    // Record the elapsed time of the whole multi-line run.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return sbResult.ToString();
}
#region 判断字符类型
/// <summary>
/// Returns a numeric type code for <paramref name="p_Char"/>.
/// Per the original note: 0 = unknown, 1 = digit, 2 = letter, 3 = Chinese character,
/// 4 = Chinese numeral. The code is built additively: 1 if alNumber contains the
/// value, replaced by 2 if alWord contains it, plus 3 if it is a key of htWords
/// (so 5 is also possible: in alWord and a key of htWords).
/// </summary>
/// <param name="p_Char">The string to classify.</param>
/// <returns>An integer in the range 0..5 as described above.</returns>
private int GetCharType(string p_Char)
{
    // alWord takes precedence over alNumber when both lists contain the value.
    int listCode;
    if (alWord.Contains(p_Char))
    {
        listCode = 2;
    }
    else if (alNumber.Contains(p_Char))
    {
        listCode = 1;
    }
    else
    {
        listCode = 0;
    }

    // Being a key of htWords shifts the code up by three.
    int dictionaryOffset = htWords.ContainsKey(p_Char) ? 3 : 0;
    return listCode + dictionaryOffset;
}
#endregion
#region 对加载的词典排序并重新写入
/// <summary>
/// Sorts the loaded dictionary and writes it back to disk.
/// Equivalent to calling <c>SortDic(false)</c>.
/// </summary>
public void SortDic()
{
    // Forward to the main overload with its Reload argument set to false.
    this.SortDic(false);
}
/// <summary>
/// Sorts every word list held in htWords and rewrites the dictionary file at DicPath
/// (UTF-8, overwriting the existing content).
/// </summary>
/// <param name="Reload">When true, InitWordDics() is called after the file has been written.</param>
public void SortDic(bool Reload)
{
    DateTime start = DateTime.Now;

    // "using" guarantees the file handle is released even when sorting or
    // writing throws; previously the writer stayed open on any exception.
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        IDictionaryEnumerator idEnumerator1 = htWords.GetEnumerator();
        while (idEnumerator1.MoveNext())
        {
            IDictionaryEnumerator idEnumerator2 = ((Hashtable)idEnumerator1.Value).GetEnumerator();
            while (idEnumerator2.MoveNext())
            {
                SegList aa = (SegList)idEnumerator2.Value;
                aa.Sort();

                // Outer key + inner key start every line written for this list.
                string strHead = idEnumerator1.Key.ToString() + idEnumerator2.Key.ToString();
                for (int i = 0; i < aa.Count; i++)
                {
                    string strTail = aa.GetElem(i).ToString();
                    // An element whose text is "null" is written as the two keys only.
                    if (strTail == "null")
                        sw.WriteLine(strHead);
                    else
                        sw.WriteLine(strHead + strTail);
                }
            }
        }
    }

    // Reload only after the writer has been closed and the file is complete.
    if (Reload) InitWordDics();

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
#endregion
/// <summary>
/// Removes lines that are exact duplicates from the dictionary file at DicPath.
/// Marked by the original author as currently unused.
/// </summary>
/// <remarks>
/// Reading stops at the first blank line or at end of file, and only the lines read
/// up to that point are written back. The file is rewritten in Hashtable enumeration
/// order, so the original line order is not preserved.
/// NOTE(review): stopping at the first blank line drops anything after it - confirm
/// that the dictionary format never contains blank lines.
/// </remarks>
/// <returns>The number of duplicate lines found.</returns>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;
    Hashtable htOptimize = new Hashtable();

    // The reader must be closed before the same path is reopened for writing.
    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
                htOptimize.Add(strline, null);
            else
                l++;

            // Advance to the next line; without this the loop never terminated.
            strline = reader.ReadLine();
        }
    }

    Console.WriteLine("ready");

    // A failure to flush/close now surfaces instead of being silently swallowed,
    // because it would mean the rewritten dictionary is incomplete.
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        IDictionaryEnumerator ide = htOptimize.GetEnumerator();
        while (ide.MoveNext())
            sw.WriteLine(ide.Key.ToString());
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
#endregion
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -