📄 extractwords.cs
字号:
/***************************************************************************************
* KTDictSeg 简介: KTDictSeg 是由KaiToo搜索开发的一款基于字典的简单中英文分词算法
* 主要功能: 中英文分词,未登录词识别,多元歧义自动识别,全角字符识别能力
* 主要性能指标:
* 分词准确度:90%以上(有待专家的权威评测)
* 处理速度: 600KBytes/s
*
* 版本: V1.2.02
* Copyright(c) 2007 http://www.kaitoo.com
* 作者:肖波
* 授权: 开源GPL
* 公司网站: http://www.kaitoo.com
* 个人博客: http://blog.csdn.net/eaglet; http://www.cnblogs.com/eaglet
* 联系方式: blog.eaglet@gmail.com
* ***************************************************************************************/
using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Diagnostics;
namespace FTAlgorithm
{
/// <summary>
/// Matching direction. CExtractWords stores one of these values in MatchDirection
/// and passes it to CompareGroup to decide from which end node lists are walked.
/// </summary>
public enum T_Direction
{
/// <summary>
/// From left to right.
/// </summary>
LeftToRight = 0,
/// <summary>
/// From right to left.
/// </summary>
RightToLeft = 1,
}
/// <summary>
/// Describes one word occurrence: its text, where it starts, its weight and an attached tag.
/// </summary>
public class T_WordInfo
{
    /// <summary>
    /// The word text.
    /// </summary>
    public string Word;

    /// <summary>
    /// Position of the word's first character within the full text.
    /// </summary>
    public int Position;

    /// <summary>
    /// Weight level of the word.
    /// </summary>
    public int Rank;

    /// <summary>
    /// Tag associated with the word.
    /// </summary>
    public object Tag;
}
/// <summary>
/// Callback that decides between two candidate node lists.
/// CExtractWords.GameTree invokes it (when one is assigned) for a new candidate whose
/// deep and spaceNum both equal the recorded minimums and the recorded minimum space is 0,
/// passing the currently selected list as <paramref name="pre"/> and the new candidate as
/// <paramref name="cur"/>; a true result marks the new candidate as selected.
/// </summary>
/// <param name="words">The word list being processed.</param>
/// <param name="pre">Currently selected node list (presumably indices into words, as in CompareGroup - confirm).</param>
/// <param name="cur">New candidate node list (presumably indices into words, as in CompareGroup - confirm).</param>
/// <returns>True when <paramref name="cur"/> should be selected.</returns>
public delegate bool CompareByPosFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);
/// <summary>
/// Callback that can veto or confirm a candidate node list.
/// CExtractWords.GameTree invokes it (when one is assigned) for a candidate that already beats
/// the recorded minimums while the recorded minimum space is 0; its result replaces the
/// default decision of selecting the candidate.
/// NOTE(review): the name suggests the decision is frequency based - confirm against the implementations.
/// </summary>
/// <param name="words">The word list being processed.</param>
/// <param name="pre">Currently selected node list.</param>
/// <param name="cur">New candidate node list.</param>
/// <returns>True when <paramref name="cur"/> should be selected.</returns>
public delegate bool SelectByFreqFunc(List<T_WordInfo> words, List<int> pre, List<int> cur);
/// <summary>
/// 从全文中提取指定的单词,及其位置
/// </summary>
public class CExtractWords
{
// Word store that GetTag and InsertWordToDfa delegate to.
private CWordDfa m_WordDfa;
// Node list of the candidate currently selected by GameTree; passed to the callbacks as "pre".
private List<int> m_GameNodes;
// spaceNum value recorded for the currently selected candidate.
private int m_MinSpace;
// deep value recorded for the currently selected candidate.
private int m_MinDeep;
// Backing field of MatchDirection.
private T_Direction m_MatchDirection;
// Backing field of CompareByPosEvent.
private CompareByPosFunc m_CompareByPos;
// Backing field of SelectByFreqEvent.
private SelectByFreqFunc m_SelectByFreq;
/// <summary>
/// Gets or sets the optional comparison callback. GameTree calls it, when it is not null,
/// for a candidate that ties the recorded minimums while the recorded minimum space is 0;
/// otherwise CompareGroup is used for the tie.
/// </summary>
public CompareByPosFunc CompareByPosEvent
{
    get { return m_CompareByPos; }
    set { m_CompareByPos = value; }
}
/// <summary>
/// Gets or sets the optional selection callback. GameTree calls it, when it is not null,
/// for a candidate that beats the recorded minimums while the recorded minimum space is 0,
/// and uses its result as the selection decision.
/// </summary>
public SelectByFreqFunc SelectByFreqEvent
{
    get { return m_SelectByFreq; }
    set { m_SelectByFreq = value; }
}
/// <summary>
/// Gets or sets the matching direction. The constructor initialises it to
/// T_Direction.LeftToRight, and GameTree forwards it to CompareGroup.
/// </summary>
public T_Direction MatchDirection
{
    get { return m_MatchDirection; }
    set { m_MatchDirection = value; }
}
/// <summary>
/// Creates an extractor with an empty word store and left-to-right matching.
/// </summary>
public CExtractWords()
{
    this.m_WordDfa = new CWordDfa();
    this.m_MatchDirection = T_Direction.LeftToRight;
}
/// <summary>
/// Looks up the tag for a word by delegating to the underlying word store.
/// </summary>
/// <param name="word">Word to look up.</param>
/// <returns>The value returned by the word store's GetTag for <paramref name="word"/>.</returns>
public object GetTag(String word)
{
    object tag = m_WordDfa.GetTag(word);
    return tag;
}
/// <summary>
/// Registers a word together with its tag by delegating to the underlying word store.
/// </summary>
/// <param name="word">Word to insert.</param>
/// <param name="tag">Tag to associate with the word.</param>
public void InsertWordToDfa(String word, object tag)
{
    this.m_WordDfa.InsertWordToDfa(word, tag);
}
/// <summary>
/// Compares two node lists position by position and reports whether <paramref name="cur"/>
/// wins: at the first index where the referenced words differ in length, the list holding
/// the longer word wins.
/// </summary>
/// <param name="words">Word list that the entries of both node lists index into.</param>
/// <param name="pre">Node list to compare against.</param>
/// <param name="cur">Candidate node list.</param>
/// <param name="direction">
/// LeftToRight walks indices upward from 0; RightToLeft walks downward from the last index
/// of <paramref name="cur"/>. The same index is used for both lists.
/// </param>
/// <returns>
/// True when, at the first differing index, the word from <paramref name="cur"/> is longer.
/// False when it is shorter, when every compared pair has equal length, when the index
/// runs past the end of <paramref name="pre"/>, or when the direction is neither enum value.
/// </returns>
private bool CompareGroup(List<T_WordInfo> words, List<int> pre, List<int> cur, T_Direction direction)
{
    bool leftToRight = direction == T_Direction.LeftToRight;
    bool rightToLeft = direction == T_Direction.RightToLeft;

    // Any direction other than LeftToRight starts at the last index of cur and steps down.
    int idx = leftToRight ? 0 : cur.Count - 1;
    int step = leftToRight ? 1 : -1;

    while ((leftToRight && idx < cur.Count) || (rightToLeft && idx >= 0))
    {
        // Nothing left in pre to compare against at this index: no winner.
        if (idx >= pre.Count)
        {
            return false;
        }

        int preId = pre[idx];
        int curId = cur[idx];
        int curLength = words[curId].Word.Length;
        int preLength = words[preId].Word.Length;

        // The first pair with different lengths decides the outcome.
        if (curLength != preLength)
        {
            return curLength > preLength;
        }

        idx += step;
    }

    return false;
}
/// <summary>
/// 博弈树
/// </summary>
/// <param name="words"></param>
/// <param name="nodes"></param>
/// <param name="init"></param>
/// <param name="begin"></param>
/// <param name="end"></param>
/// <param name="spaceNum"></param>
/// <param name="deep"></param>
/// <returns></returns>
private List<int> GameTree(List<T_WordInfo> words, List<int> nodes, bool init, int begin, int end, ref int spaceNum, ref int deep)
{
if (init)
{
int startPos = ((T_WordInfo)words[begin]).Position;
for (int i = begin; i <= end ; i++)
{
T_WordInfo wordInfo = (T_WordInfo)words[i];
spaceNum = wordInfo.Position - startPos;
deep = 0;
List<int> oneNodes;
if (i == end)
{
oneNodes = new List<int>();
oneNodes.Add(i);
deep++;
}
else
{
oneNodes = GameTree(words, nodes, false, i, end, ref spaceNum, ref deep);
}
if (oneNodes != null)
{
bool select = false;
if (m_MinSpace > spaceNum ||
(m_MinSpace == spaceNum && deep < m_MinDeep))
{
select = true;
if (m_MinSpace == 0)
{
if (SelectByFreqEvent != null)
{
select = SelectByFreqEvent(words, m_GameNodes, oneNodes);
}
}
}
else if (m_MinDeep == deep && m_MinSpace == spaceNum)
{
if (m_CompareByPos != null && m_MinSpace == 0)
{
select = m_CompareByPos(words, m_GameNodes, oneNodes);
}
else
{
select = CompareGroup(words, m_GameNodes, oneNodes, MatchDirection);
}
}
if (select)
{
m_MinDeep = deep;
m_MinSpace = spaceNum;
m_GameNodes.Clear();
foreach (int obj in oneNodes)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -