📄 matchnamerule.cs
字号:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
using FTAlgorithm.General;
namespace KTDictSeg
{
/// <summary>
/// Per-word statistics used for Chinese personal-name detection: how often a
/// given word was seen before and after a name.
/// </summary>
[Serializable]
class T_ChsNameWordTraffic
{
/// <summary>
/// The word these counters belong to.
/// </summary>
public String Word;
/// <summary>
/// Number of times the word occurred before a personal name.
/// </summary>
public long Before;
/// <summary>
/// Number of times the word occurred after a personal name.
/// </summary>
public long After;
}
/// <summary>
/// Aggregate personal-name statistics: overall totals plus the list of
/// per-word entries.
/// </summary>
[Serializable]
class T_ChsNameTraffic
{
/// <summary>
/// Number of distinct words that occurred before a personal name.
/// </summary>
public int BeforeWordCount;
/// <summary>
/// Sum of the occurrence counts of all words seen before a personal name.
/// </summary>
public long BeforeCount;
/// <summary>
/// Number of distinct words that occurred after a personal name.
/// </summary>
public int AfterWordCount;
/// <summary>
/// Sum of the occurrence counts of all words seen after a personal name.
/// </summary>
public long AfterCount;
// Per-word entries; starts out as an empty list.
public List<T_ChsNameWordTraffic> Words = new List<T_ChsNameWordTraffic>();
}
/// <summary>
/// Collects and queries statistics about the words that occur directly before
/// and after Chinese personal names.
/// </summary>
/// <remarks>
/// Only words of at least two characters whose first character lies in the
/// range U+4E00..U+9FA5 are tracked. The aggregate counters in the underlying
/// <c>T_ChsNameTraffic</c> are kept consistent with the per-word counters:
/// the "word count" fields hold the number of distinct words whose
/// corresponding counter is greater than zero.
/// </remarks>
class CChsNameTraffic
{
    // Inclusive bounds of the character range accepted for the first character of a word.
    const char FirstHanChar = '\u4e00';
    const char LastHanChar = '\u9fa5';

    // A word counts as a strong name hint when its count reaches this multiple of the average.
    const int AverageMultiplier = 5;

    // Initialised eagerly so the instance is usable before Clear() or Load() is called.
    T_ChsNameTraffic m_Traffic = new T_ChsNameTraffic();
    Dictionary<String, T_ChsNameWordTraffic> m_Table = new Dictionary<String, T_ChsNameWordTraffic>();

    /// <summary>
    /// Discards all collected statistics.
    /// </summary>
    public void Clear()
    {
        m_Table = new Dictionary<String, T_ChsNameWordTraffic>();
        m_Traffic = new T_ChsNameTraffic();
    }

    /// <summary>
    /// Serializes the current statistics with <c>CSerialization.SerializeBinary</c>
    /// and writes the resulting stream to <paramref name="fileName"/> through
    /// <c>CFile.WriteStream</c>.
    /// </summary>
    /// <param name="fileName">Path of the file to write.</param>
    public void Save(String fileName)
    {
        Stream s = CSerialization.SerializeBinary(m_Traffic);
        s.Position = 0;
        CFile.WriteStream(fileName, (MemoryStream)s);
    }

    /// <summary>
    /// Loads statistics from <paramref name="fileName"/>, drops entries that are not
    /// valid words and recomputes the aggregate counters from the remaining entries.
    /// </summary>
    /// <remarks>
    /// If the file does not exist the current state is left untouched. If reading or
    /// deserializing fails for any reason the statistics are reset to an empty set
    /// (best effort; the error is intentionally not propagated).
    /// </remarks>
    /// <param name="fileName">Path of the statistics file.</param>
    public void Load(String fileName)
    {
        if (!File.Exists(fileName))
        {
            // Statistics file does not exist: keep whatever is currently in memory.
            return;
        }

        try
        {
            MemoryStream s = CFile.ReadFileToStream(fileName);
            s.Position = 0;
            object obj;
            CSerialization.DeserializeBinary(s, out obj);
            T_ChsNameTraffic loaded = (T_ChsNameTraffic)obj;

            // The stored totals are not trusted; they are rebuilt from the kept entries.
            loaded.AfterCount = 0;
            loaded.AfterWordCount = 0;
            loaded.BeforeCount = 0;
            loaded.BeforeWordCount = 0;

            List<T_ChsNameWordTraffic> kept = new List<T_ChsNameWordTraffic>();
            foreach (T_ChsNameWordTraffic wordTraffic in loaded.Words)
            {
                if (!IsValidAffix(wordTraffic.Word))
                {
                    continue;
                }

                kept.Add(wordTraffic);
                if (wordTraffic.After > 0)
                {
                    loaded.AfterCount += wordTraffic.After;
                    loaded.AfterWordCount++;
                }
                if (wordTraffic.Before > 0)
                {
                    // The original code added After here, which corrupted BeforeCount.
                    loaded.BeforeCount += wordTraffic.Before;
                    loaded.BeforeWordCount++;
                }
            }

            loaded.Words = kept;
            m_Traffic = loaded;
        }
        catch
        {
            // Unreadable or incompatible file: start from an empty statistics set.
            m_Traffic = new T_ChsNameTraffic();
        }

        // Rebuild the lookup table from scratch so no entry from a previous state survives.
        m_Table = new Dictionary<String, T_ChsNameWordTraffic>();
        foreach (T_ChsNameWordTraffic wordTraffic in m_Traffic.Words)
        {
            m_Table[wordTraffic.Word] = wordTraffic;
        }
    }

    /// <summary>
    /// Records one occurrence of <paramref name="word"/> directly before a personal name.
    /// Words that are null, shorter than two characters or do not start with a
    /// character in U+4E00..U+9FA5 are ignored.
    /// </summary>
    /// <param name="word">The word seen before the name.</param>
    public void AddBefore(String word)
    {
        if (!IsValidAffix(word))
        {
            return;
        }

        T_ChsNameWordTraffic wordTraffic = GetOrAddWord(word);
        if (wordTraffic.Before == 0)
        {
            // First "before" hit for this word (it may already be known as an "after" word).
            m_Traffic.BeforeWordCount++;
        }
        wordTraffic.Before++;
        m_Traffic.BeforeCount++;
    }

    /// <summary>
    /// Records one occurrence of <paramref name="word"/> directly after a personal name.
    /// Words that are null, shorter than two characters or do not start with a
    /// character in U+4E00..U+9FA5 are ignored.
    /// </summary>
    /// <param name="word">The word seen after the name.</param>
    public void AddAfter(String word)
    {
        if (!IsValidAffix(word))
        {
            return;
        }

        T_ChsNameWordTraffic wordTraffic = GetOrAddWord(word);
        if (wordTraffic.After == 0)
        {
            // First "after" hit for this word (it may already be known as a "before" word).
            m_Traffic.AfterWordCount++;
        }
        wordTraffic.After++;
        m_Traffic.AfterCount++;
    }

    /// <summary>
    /// Compares the "after" counts of two words.
    /// </summary>
    /// <param name="fst">First word; an unknown word counts as zero.</param>
    /// <param name="sec">Second word.</param>
    /// <returns>
    /// true when <paramref name="sec"/> is known and its "after" count is strictly
    /// greater than that of <paramref name="fst"/>; otherwise false.
    /// </returns>
    public bool CompareTwoWords(String fst, String sec)
    {
        T_ChsNameWordTraffic cw1 = GetWordTraffic(fst);
        T_ChsNameWordTraffic cw2 = GetWordTraffic(sec);
        if (cw2 == null)
        {
            return false;
        }

        long after1 = cw1 == null ? 0 : cw1.After;
        return after1 < cw2.After;
    }

    /// <summary>
    /// Looks up the statistics entry of a word.
    /// </summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>The entry, or null when the word is null or not tracked.</returns>
    public T_ChsNameWordTraffic GetWordTraffic(String word)
    {
        if (word == null)
        {
            return null;
        }

        T_ChsNameWordTraffic wordTraffic;
        if (m_Table.TryGetValue(word, out wordTraffic))
        {
            return wordTraffic;
        }
        return null;
    }

    /// <summary>
    /// Decides from the word following a candidate whether the candidate may be a name.
    /// </summary>
    /// <param name="afterWord">The word that follows the candidate name.</param>
    /// <returns>
    /// true when the word is tracked and its "after" count is at least five times the
    /// average "after" count per word; false otherwise, including when no word has
    /// been seen after a name yet.
    /// </returns>
    public bool MaybeNameByAfter(String afterWord)
    {
        T_ChsNameWordTraffic wordTraffic = GetWordTraffic(afterWord);
        if (wordTraffic == null)
        {
            return false;
        }
        return ReachesThreshold(wordTraffic.After, m_Traffic.AfterCount, m_Traffic.AfterWordCount);
    }

    /// <summary>
    /// Decides from the word preceding a candidate whether the candidate may be a name.
    /// </summary>
    /// <param name="beforeWord">The word that precedes the candidate name.</param>
    /// <returns>
    /// true when the word is tracked and its "before" count is at least five times the
    /// average "before" count per word; false otherwise, including when no word has
    /// been seen before a name yet.
    /// </returns>
    public bool MaybeNameByBefore(String beforeWord)
    {
        T_ChsNameWordTraffic wordTraffic = GetWordTraffic(beforeWord);
        if (wordTraffic == null)
        {
            return false;
        }
        return ReachesThreshold(wordTraffic.Before, m_Traffic.BeforeCount, m_Traffic.BeforeWordCount);
    }

    // True when the word is non-null, has at least two characters and starts with a
    // character in FirstHanChar..LastHanChar.
    static bool IsValidAffix(String word)
    {
        if (word == null || word.Length <= 1)
        {
            return false;
        }
        return word[0] >= FirstHanChar && word[0] <= LastHanChar;
    }

    // True when occurrences >= AverageMultiplier * (total / distinctWords), using the same
    // integer arithmetic as before; false instead of dividing by zero when distinctWords is 0.
    static bool ReachesThreshold(long occurrences, long total, int distinctWords)
    {
        if (distinctWords <= 0)
        {
            return false;
        }
        return occurrences >= total * AverageMultiplier / distinctWords;
    }

    // Returns the entry for word, creating and registering a zeroed entry when it is new.
    T_ChsNameWordTraffic GetOrAddWord(String word)
    {
        T_ChsNameWordTraffic wordTraffic;
        if (!m_Table.TryGetValue(word, out wordTraffic))
        {
            wordTraffic = new T_ChsNameWordTraffic();
            wordTraffic.Word = word;
            wordTraffic.Before = 0;
            wordTraffic.After = 0;
            m_Traffic.Words.Add(wordTraffic);
            m_Table[word] = wordTraffic;
        }
        return wordTraffic;
    }
}
/// <summary>
/// Rule that matches personal names.
/// </summary>
class MatchName : IRule
{
// Callback signature: receives a word and a T_POS value
// (presumably its part of speech — confirm against T_POS).
public delegate void TrafficUnknownWordFunc(String word, T_POS Pos);
// Name-context statistics; created in the constructor.
CChsNameTraffic m_ChsNameTraffic;
// Created in the constructor from the supplied CPOS instance.
PosBinRule m_PosBinRule;
// The CPOS instance passed to the constructor.
CPOS m_Pos;
// Backing field of the TrafficUnknownWordHandle property.
TrafficUnknownWordFunc m_TrafficUnknownWordHandle;
/// <summary>
/// Chinese family names. The table holds a first group marked as obviously
/// ambiguous, a second group marked as not obviously ambiguous, and
/// two-character compound surnames at the end.
/// </summary>
/// <remarks>
/// NOTE(review): "柯" and "郁" each appear twice in the second group. This is
/// harmless as long as the table is filled through the indexer, but it would
/// throw with Hashtable.Add.
/// NOTE(review): the first group contains "候" while the second contains "侯";
/// "候" may be a typo for the surname "侯" — confirm before changing.
/// </remarks>
static string[] FAMILY_NAMES = {
// Surnames with obvious ambiguity (presumably characters that also occur as ordinary words — confirm)
"王","张","黄","周","徐",
"胡","高","林","马","于",
"程","傅","曾","叶","余",
"夏","钟","田","任","方",
"石","熊","白","毛","江",
"史","候","龙","万","段",
"雷","钱","汤","易","常",
"武","赖","文", "查",
// Surnames without obvious ambiguity
"赵", "肖", "孙", "李",
"吴", "郑", "冯", "陈",
"褚", "卫", "蒋", "沈",
"韩", "杨", "朱", "秦",
"尤", "许", "何", "吕",
"施", "桓", "孔", "曹",
"严", "华", "金", "魏",
"陶", "姜", "戚", "谢",
"邹", "喻", "柏", "窦",
"苏", "潘", "葛", "奚",
"范", "彭", "鲁", "韦",
"昌", "俞", "袁", "酆",
"鲍", "唐", "费", "廉",
"岑", "薛", "贺", "倪",
"滕", "殷", "罗", "毕",
"郝", "邬", "卞", "康",
"卜", "顾", "孟", "穆",
"萧", "尹", "姚", "邵",
"湛", "汪", "祁", "禹",
"狄", "贝", "臧", "伏",
"戴", "宋", "茅", "庞",
"纪", "舒", "屈", "祝",
"董", "梁", "杜", "阮",
"闵", "贾", "娄", "颜",
"郭", "邱", "骆", "蔡",
"樊", "凌", "霍", "虞",
"柯", "昝", "卢", "柯",
"缪", "宗", "丁", "贲",
"邓", "郁", "杭", "洪",
"崔", "龚", "嵇", "邢",
"滑", "裴", "陆", "荣",
"荀", "惠", "甄", "芮",
"羿", "储", "靳", "汲",
"邴", "糜", "隗", "侯",
"宓", "蓬", "郗", "仲",
"栾", "钭", "历", "戎",
"刘", "詹", "幸", "韶",
"郜", "黎", "蓟", "溥",
"蒲", "邰", "鄂", "咸",
"卓", "蔺", "屠", "乔",
"郁", "胥", "苍", "莘",
"翟", "谭", "贡", "劳",
"冉", "郦", "雍", "璩",
"桑", "桂", "濮", "扈",
"冀", "浦", "庄", "晏",
"瞿", "阎", "慕", "茹",
"习", "宦", "艾", "容",
"慎", "戈", "廖", "庾",
"衡", "耿", "弘", "匡",
"阙", "殳", "沃", "蔚",
"夔", "隆", "巩", "聂",
"晁", "敖", "融", "訾",
"辛", "阚", "毋", "乜",
"鞠", "丰", "蒯", "荆",
"竺", "盍", "单", "欧",
"司马", "上官", "欧阳",
"夏侯", "诸葛", "闻人",
"东方", "赫连", "皇甫",
"尉迟", "公羊", "澹台",
"公冶", "宗政", "濮阳",
"淳于", "单于", "太叔",
"申屠", "公孙", "仲孙",
"轩辕", "令狐", "徐离",
"宇文", "长孙", "慕容",
"司徒", "司空", "万俟"};
// Lookup table keyed by the entries of FAMILY_NAMES; static, and re-created
// and refilled every time a MatchName instance is constructed.
static Hashtable m_FamilyNameTbl;
// Backing field of AutoStudy; off by default.
bool m_AutoStudy = false;
/// <summary>
/// Gets or sets whether automatic learning is enabled.
/// </summary>
/// <remarks>
/// The property only stores the flag; where it is consumed is not visible in
/// this part of the file.
/// </remarks>
public bool AutoStudy
{
get
{
return m_AutoStudy;
}
set
{
m_AutoStudy = value;
}
}
/// <summary>
/// Gets or sets the <c>TrafficUnknownWordFunc</c> callback held by this instance.
/// </summary>
/// <remarks>
/// NOTE(review): presumably invoked for words not found in the dictionary —
/// the call site is not visible here; confirm.
/// </remarks>
public TrafficUnknownWordFunc TrafficUnknownWordHandle
{
get
{
return m_TrafficUnknownWordHandle;
}
set
{
m_TrafficUnknownWordHandle = value;
}
}
/// <summary>
/// Creates a name matcher for the given <c>CPOS</c> instance and rebuilds the
/// static surname lookup table from <c>FAMILY_NAMES</c>.
/// </summary>
/// <param name="pos">
/// Stored in <c>m_Pos</c> and passed on to the <c>PosBinRule</c> constructor.
/// </param>
public MatchName(CPOS pos)
{
    m_PosBinRule = new PosBinRule(pos);
    m_Pos = pos;
    m_ChsNameTraffic = new CChsNameTraffic();

    // Fill a fresh table and then store it in the static field.
    // The indexer (not Add) is required: FAMILY_NAMES contains duplicate entries.
    Hashtable surnames = new Hashtable();
    for (int i = 0; i < FAMILY_NAMES.Length; i++)
    {
        surnames[FAMILY_NAMES[i]] = true;
    }
    m_FamilyNameTbl = surnames;
}
#region ChsNameTraffic 相关函数
/// <summary>
/// 清除姓名统计文件
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -