📄 simpledictseg.cs
字号:
/***************************************************************************************
* KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法
* 主要功能: 中英文分词,未登录词识别,多元歧义自动识别,全角字符识别能力
* 主要性能指标:
* 分词准确度:90%以上(有待专家的权威评测)
* 处理速度: 600KBytes/s
*
* 版本: V1.2.02
* Copyright(c) 2007 http://www.kaitoo.com
* 作者:肖波
* 授权: 开源GPL
* 公司网站: http://www.kaitoo.com
* 个人博客: http://blog.csdn.net/eaglet; http://www.cnblogs.com/eaglet
* 联系方式: blog.eaglet@gmail.com
* ***************************************************************************************/
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.IO;
using System.Reflection;
using FTAlgorithm;
using FTAlgorithm.General;
namespace KTDictSeg
{
/// <summary>
/// 简单字典分词
/// </summary>
public class CSimpleDictSeg
{
// File names of the Chinese and English stop-word lists.
const string CHS_STOP_WORD_FILENAME = "chsstopwords.txt";
const string ENG_STOP_WORD_FILENAME = "engstopwords.txt";
IRule[] m_Rules ;
// Hash table of Chinese stop words
Hashtable m_ChsStopwordTbl = new Hashtable();
// Hash table of English stop words
Hashtable m_EngStopwordTbl = new Hashtable();
CExtractWords m_ExtractWords;
// Regular-expression alternation. Branches, in order:
//   digits followed by '%'; 1-2 digits followed by 月 (month) or 日 (day);
//   1-4 digits followed by 年 (year); a digits-digits-digits date;
//   a run of digits; one character outside the listed letter/digit/CJK ranges;
//   a run of letters; a run of characters in \u4e00-\u9fa5.
// NOTE(review): the duplicated ranges (a-zA-Za-zA-Z, 0-90-9) look like full-width
// ranges that were normalised to ASCII somewhere - confirm against the upstream file.
const string PATTERNS = @"[0-9\d]+\%|[0-9\d]{1,2}月|[0-9\d]{1,2}日|[0-9\d]{1,4}年|"+
@"[0-9\d]{1,4}-[0-9\d]{1,2}-[0-9\d]{1,2}|" +
@"[0-9\d]+|[^a-zA-Za-zA-Z0-90-9\u4e00-\u9fa5]|[a-zA-Za-zA-Z]+|[\u4e00-\u9fa5]+";
//const string PATTERNS = @"[a-zA-Z]+|\d+|[\u4e00-\u9fa5]+";
String m_DictPath;
CPOS m_POS;
PosBinRule m_PosBinRule;
MatchName m_MatchNameRule;
/// <summary>
/// Dictionary
/// </summary>
T_DictFile m_Dict;
/// <summary>
/// Dictionary manager
/// </summary>
DictManage.DictMgr m_DictMgr = new DictManage.DictMgr();
/// <summary>
/// Statistics dictionary for unregistered (out-of-dictionary) words.
/// Used to record how often unregistered words occur and their part of speech.
/// Currently it mainly tracks unregistered words whose part of speech is unknown
/// and unknown personal names.
/// </summary>
T_DictFile m_UnknownWordsDict ;
/// <summary>
/// Dictionary manager for the unregistered-word dictionary
/// </summary>
DictManage.DictMgr m_UnknownWordsDictMgr = new DictManage.DictMgr();
private int m_UnknownWordsThreshold = 100;
private bool m_FreqFirst = true;
bool m_AutoStudy = false;
bool m_AutoInsertUnknownWords = false;
DateTime m_LastSaveTime; // Time the dictionary and statistics were last saved
int m_AutoSaveInterval = 24 * 3600; // Seconds between automatic saves of the latest dictionary and statistics; only effective when AutoStudy = true
String m_LogFileName = "KTDictSeg.log";
#region Public property
/// <summary>
/// Threshold for unregistered (out-of-dictionary) words: once the collected
/// count for such a word exceeds this value, the word is automatically added
/// to the dictionary. Assigned values below 1 are stored as 1.
/// </summary>
public int UnknownWordsThreshold
{
    get { return m_UnknownWordsThreshold; }
    set
    {
        // Clamp to a minimum of 1.
        m_UnknownWordsThreshold = (value < 1) ? 1 : value;
    }
}
/// <summary>
/// Whether unregistered words that exceed the statistics threshold are
/// inserted automatically.
/// </summary>
public bool AutoInsertUnknownWords
{
    get { return m_AutoInsertUnknownWords; }
    set { m_AutoInsertUnknownWords = value; }
}
/// <summary>
/// Give priority to word frequency.
/// If a long word is made up of several shorter words and the long word has
/// the lower frequency, the long word is ignored.
/// Example: if 中央酒店 is less frequent than both 中央 and 酒店, then 中央酒店
/// is ignored.
/// </summary>
public bool FreqFirst
{
    get { return m_FreqFirst; }
    set
    {
        m_FreqFirst = value;

        // NOTE(review): m_ExtractWords is dereferenced without a null check -
        // confirm it is always initialised before this setter can run.
        if (!value)
        {
            // Frequency-based selection switched off: detach the callback.
            m_ExtractWords.SelectByFreqEvent = null;
            return;
        }

        // Frequency-based selection switched on: attach the callback.
        m_ExtractWords.SelectByFreqEvent = SelectByFreq;
    }
}
/// <summary>
/// Automatic learning switch. The value is stored locally and also forwarded
/// to the name-matching rule.
/// </summary>
public bool AutoStudy
{
    get { return m_AutoStudy; }
    set
    {
        // Local flag first, then propagate to the name-matching rule.
        // NOTE(review): m_MatchNameRule is dereferenced without a null check -
        // confirm it is always initialised before this setter can run.
        m_AutoStudy = value;
        m_MatchNameRule.AutoStudy = value;
    }
}
/// <summary>
/// Number of seconds between automatic saves of the latest dictionary and
/// statistics; only effective when AutoStudy = true.
/// Assigned values of 1 or less are stored as 1.
/// </summary>
public int AutoSaveInterval
{
    get { return m_AutoSaveInterval; }
    set
    {
        // Anything not greater than 1 collapses to the minimum of 1 second.
        m_AutoSaveInterval = (value > 1) ? value : 1;
    }
}
/// <summary>
/// Path where the dictionary files are located.
/// </summary>
public String DictPath
{
    get { return m_DictPath; }
    set { m_DictPath = value; }
}
/// <summary>
/// Name of the log file.
/// </summary>
public String LogFileName
{
    get { return m_LogFileName; }
    set { m_LogFileName = value; }
}
/// <summary>
/// Part-of-speech object (read-only).
/// </summary>
public CPOS Pos
{
    get { return m_POS; }
}
#endregion
#region 配置文件
/// <summary>
/// Converts a string to a boxed value of the requested type by delegating to
/// the matching <c>System.Convert.ToXxx(string)</c> method.
/// </summary>
/// <param name="In">Text to convert.</param>
/// <param name="destType">
/// Target type. Handled: bool, byte, char, DateTime, decimal, double, short,
/// int, long, sbyte, float, string, ushort, uint and ulong.
/// </param>
/// <returns>
/// The converted value; for a string target the input is returned unchanged.
/// </returns>
/// <exception cref="System.Exception">
/// <paramref name="destType"/> is not one of the handled types. Failures
/// raised by the System.Convert methods themselves are not caught here.
/// </exception>
private object Convert(String In, Type destType)
{
    // System.Convert is fully qualified throughout because this method is
    // itself named Convert.
    if (destType.Equals(typeof(bool))) return System.Convert.ToBoolean(In);
    if (destType.Equals(typeof(byte))) return System.Convert.ToByte(In);
    if (destType.Equals(typeof(char))) return System.Convert.ToChar(In);
    if (destType.Equals(typeof(DateTime))) return System.Convert.ToDateTime(In);
    if (destType.Equals(typeof(decimal))) return System.Convert.ToDecimal(In);
    if (destType.Equals(typeof(double))) return System.Convert.ToDouble(In);
    if (destType.Equals(typeof(short))) return System.Convert.ToInt16(In);
    if (destType.Equals(typeof(int))) return System.Convert.ToInt32(In);
    if (destType.Equals(typeof(long))) return System.Convert.ToInt64(In);
    if (destType.Equals(typeof(sbyte))) return System.Convert.ToSByte(In);
    if (destType.Equals(typeof(float))) return System.Convert.ToSingle(In);
    if (destType.Equals(typeof(string))) return In;
    if (destType.Equals(typeof(ushort))) return System.Convert.ToUInt16(In);
    if (destType.Equals(typeof(uint))) return System.Convert.ToUInt32(In);
    if (destType.Equals(typeof(ulong))) return System.Convert.ToUInt64(In);

    // No handled type matched.
    throw new Exception(String.Format("Unknown type:{0}", destType.Name));
}
/// <summary>
/// Pairs a reflected property with a descriptive comment for one
/// configuration item.
/// </summary>
class CfgItem
{
    /// <summary>Descriptive comment for the item.</summary>
    public String Comment;

    /// <summary>Reflected property for the item.</summary>
    public PropertyInfo Pi;

    /// <summary>
    /// Creates an item from a property and its comment.
    /// </summary>
    /// <param name="pi">Reflected property; stored as given.</param>
    /// <param name="comment">Descriptive comment; stored as given.</param>
    public CfgItem(PropertyInfo pi, String comment)
    {
        this.Pi = pi;
        this.Comment = comment;
    }
}
/// <summary>
/// Builds the list of configuration items: nine properties of this
/// instance's runtime type, looked up by name, each paired with its
/// descriptive comment.
/// </summary>
/// <returns>
/// A nine-element array in a fixed order. An entry's Pi is null when
/// GetProperty finds no property with that name.
/// </returns>
CfgItem[] GetCfgItems()
{
    // Resolve the runtime type once and reuse it for every lookup.
    Type ownType = this.GetType();

    return new CfgItem[]
    {
        new CfgItem(ownType.GetProperty("UnknownWordsThreshold"), "未登录词阈值,当统计超过这个值时,自动将未登录词加入到字典中"),
        new CfgItem(ownType.GetProperty("AutoInsertUnknownWords"), "自动插入超过统计阈值的未登录词"),
        new CfgItem(ownType.GetProperty("FreqFirst"), "优先判断词频,如果一个长的单词由多个短的单词组成,而长的单词词频较低则忽略长的单词。如 中央酒店的词频比中央和酒店的词频都要低,则忽略中央酒店。"),
        new CfgItem(ownType.GetProperty("AutoStudy"), "自动统计姓名前后缀,自动统计未登录词,自动统计词频"),
        new CfgItem(ownType.GetProperty("AutoSaveInterval"), "间隔多少秒自动保存最新的字典和统计信息,AutoStudy = true时有效"),
        new CfgItem(ownType.GetProperty("DictPath"), "字典文件所在路径"),
        new CfgItem(ownType.GetProperty("LogFileName"), "日志文件名"),
        new CfgItem(ownType.GetProperty("MatchName"), "是否匹配汉语人名"),
        new CfgItem(ownType.GetProperty("FilterStopWords"), "是否过滤停用词")
    };
}
/// <summary>
/// 从配置文件加载配置
/// </summary>
/// <param name="fileName">配置文件名</param>
public void LoadConfig(String fileName)
{
System.Xml.XmlDocument doc = new System.Xml.XmlDocument();
try
{
doc.Load(fileName);
System.Xml.XmlNodeList list = doc.GetElementsByTagName("Item");
System.Xml.XmlAttribute itemName = null;
foreach (System.Xml.XmlNode node in list)
{
try
{
itemName = node.Attributes["Name"];
System.Xml.XmlAttribute value = node.Attributes["Value"];
if (itemName == null || value == null)
{
continue;
}
PropertyInfo pi = GetType().GetProperty(itemName.Value);
pi.SetValue(this, Convert(value.Value, pi.PropertyType), null);
}
catch (Exception e1)
{
WriteLog(String.Format("Load Item={0} fail, errmsg:{1}",
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -