📄 segment.cs
字号:
/***********************************************************************************
* ICTCLAS简介:计算所汉语词法分析系统ICTCLAS
* Institute of Computing Technology, Chinese Lexical Analysis System
* 功能有:中文分词;词性标注;未登录词识别。
* 分词正确率高达97.58%(973专家评测结果),
* 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
* 处理速度为31.5Kbytes/s。
* 著作权: Copyright(c)2002-2005中科院计算所 职务著作权人:张华平
* 遵循协议:自然语言处理开放资源许可证1.0
* Email: zhanghp@software.ict.ac.cn
* Homepage:www.i3s.ac.cn
*
*----------------------------------------------------------------------------------
*
* Copyright (c) 2000, 2001
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
*
*----------------------------------------------------------------------------------
*
* SharpICTCLAS:.net平台下的ICTCLAS
* 是由河北理工大学经管学院吕震宇根据Free版ICTCLAS改编而成,
* 并对原有代码做了部分重写与调整
*
* Email: zhenyulu@163.com
* Blog: http://www.cnblogs.com/zhenyulu
*
***********************************************************************************/
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace SharpICTCLAS
{
public class Segment
{
// Dictionaries supplied through the constructor; BiSegment refuses to run while either is null.
private WordDictionary biDict, coreDict;
// Atoms of the sentence most recently passed to BiSegment (the result of AtomSegment).
public List<AtomNode> atomSegment;
// Word net built by GenerateWordNet in BiSegment; BiOptimumSegment overwrites it with the previous m_graphOptimum.
public RowFirstDynamicArray<ChainContent> segGraph;
// Result of BiGraphGenerate over segGraph, stored by BiSegment.
public ColumnFirstDynamicArray<ChainContent> biGraphResult;
// Graph handed to GenerateWord while results are built; BiOptimumSegment feeds it into its own BiGraphGenerate call.
public RowFirstDynamicArray<ChainContent> m_graphOptimum;
public List<WordResult[]> m_pWordSeg; // holds multiple segmentation results
// NOTE(review): never raised in the visible part of this file; presumably fired by the On* helper methods - confirm.
public event SegmentEventHandler OnSegmentEvent;
#region 构造函数
//====================================================================
// Func Name  : Segment (constructor)
// Description: Stores the two dictionaries used by the segmentation
//              passes. Nothing is validated here; BiSegment throws
//              when either dictionary is still null.
// Parameters : biDict  : dictionary kept in the biDict field
//              coreDict: dictionary kept in the coreDict field
//====================================================================
public Segment(WordDictionary biDict, WordDictionary coreDict)
{
    this.coreDict = coreDict;
    this.biDict = biDict;
}
#endregion
#region BiSegment Method
//====================================================================
// Func Name  : BiSegment
// Description: First segmentation pass over a sentence. Splits the
//              sentence into atoms, builds the word net and the
//              bi-gram graph, runs the N-shortest-path search and
//              turns every path into a word sequence. Intermediate
//              results are kept in atomSegment, segGraph,
//              biGraphResult, m_graphOptimum and m_pWordSeg.
// Parameters : sSentence : the sentence to segment (must not be null)
//              smoothPara: forwarded to BiGraphGenerate
//              nKind     : forwarded to NShortPath.Calculate
// Returns    : number of entries stored in m_pWordSeg
// Throws     : InvalidOperationException when biDict or coreDict is
//              null; ArgumentNullException when sSentence is null
//====================================================================
public int BiSegment(string sSentence, double smoothPara, int nKind)
{
    // A missing dictionary is an invalid object state, so report it with a
    // specific exception type (still catchable as System.Exception).
    if (biDict == null || coreDict == null)
        throw new InvalidOperationException("biDict 或 coreDict 尚未初始化!");

    // Fail here with a clear message instead of a NullReferenceException
    // raised from inside AtomSegment.
    if (sSentence == null)
        throw new ArgumentNullException("sSentence");

    //--- Atom segmentation
    atomSegment = AtomSegment(sSentence);
    OnAtomSegment(atomSegment);

    //--- Look up the dictionary and store every candidate word in the graph
    segGraph = GenerateWordNet(atomSegment, coreDict);
    OnGenSegGraph(segGraph);

    //--- Build all pairwise (bi-gram) combinations
    biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
    OnGenBiSegGraph(biGraphResult);

    //--- N-shortest-path search yields several candidate segmentations
    NShortPath.Calculate(biGraphResult, nKind);
    List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
    OnNShortPath(spResult, segGraph);

    m_pWordSeg = new List<WordResult[]>();
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    for (int i = 0; i < spResult.Count; i++)
    {
        WordLinkedArray linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
        WordResult[] tmpResult = GenerateWord(spResult[i], linkedArray, m_graphOptimum);

        // Paths for which GenerateWord returns null are skipped.
        if (tmpResult != null)
            m_pWordSeg.Add(tmpResult);
    }

    OnBeforeOptimize(m_pWordSeg);
    return m_pWordSeg.Count;
}
#endregion
#region BiOptimumSegment Method
//====================================================================
// Func Name  : BiOptimumSegment
// Description: Second pass. Builds a bi-gram graph from the current
//              m_graphOptimum, runs the N-shortest-path search on it
//              and regenerates m_pWordSeg. segGraph is replaced by
//              the old m_graphOptimum and m_graphOptimum is rebuilt
//              while the new results are generated.
// Parameters : nResultCount  : forwarded to NShortPath.Calculate
//              dSmoothingPara: forwarded to BiGraphGenerate
// Returns    : number of entries stored in m_pWordSeg
//====================================================================
public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
{
    // Generate the bi-word link net from the graph left by the previous pass.
    ColumnFirstDynamicArray<ChainContent> optimumBiGraph =
        BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
    OnGenBiOptimumSegGraph(optimumBiGraph);

    NShortPath.Calculate(optimumBiGraph, nResultCount);
    List<int[]> shortestPaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

    // The previous optimum graph becomes the working graph; a fresh optimum
    // graph is filled while the words are generated below.
    m_pWordSeg = new List<WordResult[]>();
    segGraph = m_graphOptimum;
    m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

    foreach (int[] path in shortestPaths)
    {
        WordLinkedArray pathWords = BiPath2LinkedArray(path, segGraph, atomSegment);
        WordResult[] words = GenerateWord(path, pathWords, m_graphOptimum);
        if (words == null)
            continue;

        m_pWordSeg.Add(words);
    }

    return m_pWordSeg.Count;
}
#endregion
#region AtomSegment Method
//====================================================================
// Func Name  : AtomSegment
// Description: Cuts sSentence into atoms. A leading
//              Predefine.SENTENCE_BEGIN and a trailing
//              Predefine.SENTENCE_END each become one atom; in
//              between, characters typed CT_CHINESE, CT_INDEX,
//              CT_DELIMITER or CT_OTHER become one atom each
//              (whitespace-only ones are dropped), and consecutive
//              characters of the same CT_SINGLE / CT_NUM type are
//              merged into a single atom.
// Parameters : sSentence: the sentence to cut (must not be null)
// Returns    : the atoms in sentence order
// Throws     : ArgumentNullException when sSentence is null
//====================================================================
public static List<AtomNode> AtomSegment(string sSentence)
{
    if (sSentence == null)
        throw new ArgumentNullException("sSentence");

    List<AtomNode> atomSegment = new List<AtomNode>();
    AtomNode tmpEnd = null;
    int startIndex = 0, length = sSentence.Length, pCur = 0, nCurType;

    // Sentence-begin marker. The markers are fixed tokens rather than
    // linguistic text, so they are matched ordinally; a culture-sensitive
    // match may succeed on text whose character count differs from the
    // marker's, which would make the offsets below wrong.
    if (sSentence.StartsWith(Predefine.SENTENCE_BEGIN, StringComparison.Ordinal))
    {
        atomSegment.Add(new AtomNode(Predefine.SENTENCE_BEGIN, Predefine.CT_SENTENCE_BEGIN));
        startIndex = Predefine.SENTENCE_BEGIN.Length;
        length -= startIndex;
    }

    // Sentence-end marker: remembered here, appended after all other atoms.
    // The length test makes sure the end marker lies entirely behind the
    // begin marker, so 'length' can never become negative.
    if (length >= Predefine.SENTENCE_END.Length
        && sSentence.EndsWith(Predefine.SENTENCE_END, StringComparison.Ordinal))
    {
        tmpEnd = new AtomNode(Predefine.SENTENCE_END, Predefine.CT_SENTENCE_END);
        length -= Predefine.SENTENCE_END.Length;
    }

    //==============================================================================================
    // by zhenyulu:
    //
    // TODO: use a set of regular expressions to pre-extract complete components of the
    //       sentence (percentages, dates, e-mail addresses, URLs, ...)
    //==============================================================================================
    char[] charArray = sSentence.ToCharArray(startIndex, length);
    int[] charTypeArray = new int[charArray.Length];

    // Build the character-type array, one entry per character.
    for (int i = 0; i < charArray.Length; i++)
    {
        char c = charArray[i];
        bool hasNext = i < (charArray.Length - 1);
        charTypeArray[i] = Utility.charType(c);

        // A '.' directly followed by a CT_NUM character counts as part of that number.
        if (c == '.' && hasNext && Utility.charType(charArray[i + 1]) == Predefine.CT_NUM)
            charTypeArray[i] = Predefine.CT_NUM;
        // Otherwise a '.' followed by an ASCII digit is typed CT_SINGLE.
        // NOTE(review): reached only when Utility.charType does not report CT_NUM for that digit - confirm.
        else if (c == '.' && hasNext && charArray[i + 1] >= '0' && charArray[i + 1] <= '9')
            charTypeArray[i] = Predefine.CT_SINGLE;
        // Letters are folded into CT_SINGLE so they join the CT_SINGLE runs below.
        else if (charTypeArray[i] == Predefine.CT_LETTER)
            charTypeArray[i] = Predefine.CT_SINGLE;
    }

    // Cut the atoms according to the character-type array.
    while (pCur < charArray.Length)
    {
        nCurType = charTypeArray[pCur];
        if (nCurType == Predefine.CT_CHINESE || nCurType == Predefine.CT_INDEX ||
            nCurType == Predefine.CT_DELIMITER || nCurType == Predefine.CT_OTHER)
        {
            // One atom per character; characters that trim to nothing are dropped.
            if (charArray[pCur].ToString().Trim().Length != 0)
                atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
        // Single-byte characters, digits, or a '.' followed by a digit: keep
        // taking characters for as long as the type stays the same.
        else if (nCurType == Predefine.CT_SINGLE || nCurType == Predefine.CT_NUM)
        {
            int runEnd = pCur + 1;
            while (runEnd < charArray.Length && charTypeArray[runEnd] == nCurType)
                runEnd++;

            atomSegment.Add(new AtomNode(new string(charArray, pCur, runEnd - pCur), nCurType));
            pCur = runEnd;
        }
        // Every other type: the character is an atom of its own.
        else
        {
            atomSegment.Add(new AtomNode(charArray[pCur].ToString(), nCurType));
            pCur++;
        }
    }

    // Append the end marker, if the sentence had one.
    if (tmpEnd != null)
        atomSegment.Add(tmpEnd);

    return atomSegment;
}
#endregion
#region GenerateWordNet Method
//====================================================================
// Func Name : GenerateWordNet
// Description: Generate the segmentation word net according
// the original sentence
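// Parameters : atomSegment: atoms of the sentence, as produced by AtomSegment
// coreDict : core dictionary
// (the bOriginalFreq option mentioned in earlier versions is not a parameter of this signature)
// Returns : RowFirstDynamicArray<ChainContent> (the generated word net)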
//====================================================================
public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
{
string sWord = "", sMaxMatchWord;
int nPOSRet, nPOS, nTotalFreq;
double dValue = 0;
RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
m_segGraph.SetEmpty();
// 将原子部分存入m_segGraph
for (int i = 0; i < atomSegment.Count; i++)//Init the cost array
{
if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
else
{
sWord = atomSegment[i].sWord;//init the word
dValue = Predefine.MAX_FREQUENCE;
switch (atomSegment[i].nPOS)
{
case Predefine.CT_INDEX:
case Predefine.CT_NUM:
nPOS = -27904;//'m'*256
sWord = "未##数";
dValue = 0;
break;
case Predefine.CT_DELIMITER:
nPOS = 30464;//'w'*256;
break;
case Predefine.CT_LETTER:
nPOS = -28280; // -'n' * 256 - 'x';
dValue = 0;
sWord = "未##串";
break;
case Predefine.CT_SINGLE://12021-2129-3121
if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$")) //匹配浮点数
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -