📄 span.cs
字号:
/***********************************************************************************
* ICTCLAS简介:计算所汉语词法分析系统ICTCLAS
* Institute of Computing Technology, Chinese Lexical Analysis System
* 功能有:中文分词;词性标注;未登录词识别。
* 分词正确率高达97.58%(973专家评测结果),
* 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
* 处理速度为31.5Kbytes/s。
* 著作权: Copyright(c)2002-2005中科院计算所 职务著作权人:张华平
* 遵循协议:自然语言处理开放资源许可证1.0
* Email: zhanghp@software.ict.ac.cn
* Homepage:www.i3s.ac.cn
*
*----------------------------------------------------------------------------------
*
* Copyright (c) 2000, 2001
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)、
*
*----------------------------------------------------------------------------------
*
* SharpICTCLAS:.net平台下的ICTCLAS
* 是由河北理工大学经管学院吕震宇根据Free版ICTCLAS改编而成,
* 并对原有代码做了部分重写与调整
*
* Email: zhenyulu@163.com
* Blog: http://www.cnblogs.com/zhenyulu
*
***********************************************************************************/
using System;
using System.Collections.Generic;
using System.Text;
namespace SharpICTCLAS
{
public class Span
{
#region Public Fields
//Number of unknown words recorded so far; zeroed by the constructor and by Reset(false)
public int m_nUnknownWordsCount;
//Start and ending position of each unknown word (two ints per entry)
public int[,] m_nUnknownWords = new int[Predefine.MAX_UNKNOWN_PER_SENTENCE, 2];
//The possibility of each unknown word (one entry per unknown word)
public double[] m_dWordsPossibility = new double[Predefine.MAX_UNKNOWN_PER_SENTENCE];
//Context statistics: loaded through LoadContext and queried by Disamb via GetContextPossibility
public ContextStat m_context = new ContextStat(); //context
#endregion
#region Private Fields
//Tagging type; Reset(false) uses it to choose the begin tag (0 for TT_NORMAL, 100 otherwise)
private TAG_TYPE m_tagType; //The type of tagging
//Start position; Reset copies it into m_nWordPosition[1] and Reset(false) zeroes it
private int m_nStartPos;
//Best tag chosen for each word; GetBestPOS fills it and terminates the sequence with -1
private int[] m_nBestTag = new int[Predefine.MAX_WORDS_PER_SENTENCE];
//Record the Best Tag (refers to m_nBestTag above)
//Word texts; slot 0 is set to null, and GetBestPOS treats a null entry as a virtual ending
private string[] m_sWords = new string[Predefine.MAX_WORDS_PER_SENTENCE];
//Position recorded for each word; slot 1 is seeded with m_nStartPos on Reset
private int[] m_nWordPosition = new int[Predefine.MAX_WORDS_PER_SENTENCE];
//Candidate tags per word; each row is scanned until a negative value (-1 is used as the end flag)
private int[,] m_nTags = new int[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
//For word i and candidate tag j: index of the best candidate tag of word i-1 (written by Disamb)
private int[,] m_nBestPrev = new int[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
//Fee of each candidate tag; Disamb adds the minimum fee of the path leading to it
private double[,] m_dFrequency = new double[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
//Number of slots in use, counting slot 0 (set to 1 by the constructor and by Reset)
private int m_nCurLength;
#endregion
#region 构造函数
/// <summary>
/// Creates a span in its initial state: normal tagging type, a single begin slot
/// (slot 0) holding the begin tag, and no unknown words recorded.
/// </summary>
public Span()
{
    // Set the default tagging type FIRST. The original tested m_tagType before
    // assigning it, so the begin tag depended on the enum's zero value instead
    // of on the declared default.
    m_tagType = TAG_TYPE.TT_NORMAL;

    // Begin tag for TT_NORMAL (Reset(false) uses 100 for any other tagging type).
    m_nTags[0, 0] = 0;
    // -1 is the end flag of the candidate-tag list of slot 0.
    m_nTags[0, 1] = -1;
    m_dFrequency[0, 0] = 0;

    // Slot 0 is the virtual begin slot, so the current length starts at 1.
    m_nCurLength = 1;
    m_nUnknownWordsCount = 0;
    m_nStartPos = 0;
    m_nWordPosition[1] = 0;
    m_sWords[0] = null;
}
#endregion
#region Disamb Method
/// <summary>
/// For every candidate tag of every word after slot 0, selects the candidate tag
/// of the previous word that gives the lowest accumulated fee, records its index
/// in m_nBestPrev and adds that fee to m_dFrequency.
/// The fee of a transition is -log(context possibility) plus the fee already
/// accumulated on the previous tag.
/// </summary>
/// <returns>Always true.</returns>
private bool Disamb()
{
    int i, j, k, nMinCandidate;
    double dMinFee = 0, dTmp;
    // Number of tag columns actually allocated; bounds the scans below so a row
    // without a negative terminator cannot run past the end of the array.
    int nMaxTags = m_nTags.GetLength(1);
    // Sentinel meaning "no previous candidate chosen yet".
    int nUnset = Predefine.MAX_POS_PER_WORD + 1;

    for (i = 1; i < m_nCurLength; i++)
    {
        //For every word
        for (j = 0; j < nMaxTags && m_nTags[i, j] >= 0; j++)
        {
            //For every candidate tag of the current word
            nMinCandidate = nUnset;
            for (k = 0; k < nMaxTags && m_nTags[i - 1, k] >= 0; k++)
            {
                //For every candidate tag of the previous word
                dTmp = -Math.Log(m_context.GetContextPossibility(0, m_nTags[i - 1, k], m_nTags[i, j]));
                dTmp += m_dFrequency[i - 1, k]; //Add the fees

                // Accept the first candidate unconditionally, then only cheaper ones.
                // The original tested "nMinCandidate > 10", which is only equivalent
                // to "unset" when MAX_POS_PER_WORD is exactly 10; otherwise a chosen
                // index above 10 was overwritten by every later candidate.
                if (nMinCandidate == nUnset || dTmp < dMinFee)
                {
                    nMinCandidate = k;
                    dMinFee = dTmp;
                }
            }
            // NOTE(review): when the previous word has no candidate tag at all,
            // nMinCandidate stays at the sentinel and dMinFee keeps its previous
            // value, exactly as in the original code - confirm this cannot happen.
            m_nBestPrev[i, j] = nMinCandidate; //The best previous for j
            m_dFrequency[i, j] = m_dFrequency[i, j] + dMinFee;
        }
    }
    return true;
}
#endregion
#region Reset Method
/// <summary>
/// Parameterless overload: resets the span in "continue" mode, i.e. the begin
/// slot is seeded from the last slot of the previous run (see Reset(bool)).
/// </summary>
/// <returns>The result of Reset(true).</returns>
private bool Reset()
{
    bool continueFromPrevious = true;
    return Reset(continueFromPrevious);
}
/// <summary>
/// Re-initialises the span so that only the begin slot (slot 0) is in use.
/// </summary>
/// <param name="bContinue">
/// true: seed slot 0 with the first tag and fee of the last slot currently in use;
/// false: start from scratch (begin tag chosen from the tagging type, unknown-word
/// count and start position cleared).
/// </param>
/// <returns>Always true.</returns>
private bool Reset(bool bContinue)
{
    if (bContinue)
    {
        // Carry over the POS and fee of the last slot of the previous sentence.
        int lastSlot = m_nCurLength - 1;
        m_nTags[0, 0] = m_nTags[lastSlot, 0];
        m_dFrequency[0, 0] = m_dFrequency[lastSlot, 0];
    }
    else
    {
        // Begin tag: 0 for normal tagging, 100 for every other tagging type.
        m_nTags[0, 0] = (m_tagType != TAG_TYPE.TT_NORMAL) ? 100 : 0;
        m_nUnknownWordsCount = 0;
        m_dFrequency[0, 0] = 0;
        m_nStartPos = 0;
    }

    // -1 is the end flag of slot 0's candidate-tag list.
    m_nTags[0, 1] = -1;
    m_nCurLength = 1;
    m_nWordPosition[1] = m_nStartPos;
    m_sWords[0] = null;
    return true;
}
#endregion
#region LoadContext Method
/// <summary>
/// Loads the context statistics used by this span from the given file by
/// delegating to m_context.Load.
/// </summary>
/// <param name="sFilename">File name passed through to the context loader.</param>
/// <returns>Whatever m_context.Load returns.</returns>
public bool LoadContext(string sFilename)
{
    bool loaded = m_context.Load(sFilename);
    return loaded;
}
#endregion
#region GetBestPOS Method
/// <summary>
/// Runs Disamb and then walks backwards from the last slot in use, following
/// m_nBestPrev, to write the chosen tag of every non-null word into m_nBestTag.
/// The tag sequence is terminated with -1.
/// </summary>
/// <returns>Always true.</returns>
private bool GetBestPOS()
{
    Disamb();

    // Start from candidate 0 of the last slot and follow the back-pointers.
    int tagIndex = 0;
    int wordIndex = m_nCurLength - 1;
    while (wordIndex > 0)
    {
        // A null word is a virtual ending: it has no best tag of its own.
        bool isRealWord = m_sWords[wordIndex] != null;
        if (isRealWord)
        {
            m_nBestTag[wordIndex] = m_nTags[wordIndex, tagIndex];
        }
        tagIndex = m_nBestPrev[wordIndex, tagIndex];
        wordIndex--;
    }

    // Put the -1 terminator on the virtual ending slot when there is one,
    // otherwise right after the last word.
    int lastSlot = m_nCurLength - 1;
    int terminatorIndex = (m_sWords[lastSlot] == null) ? lastSlot : m_nCurLength;
    m_nBestTag[terminatorIndex] = -1;
    return true;
}
#endregion
#region PersonRecognize Method
public bool PersonRecognize(WordDictionary personDict)
{
StringBuilder sb = new StringBuilder();
int i;
string sPOS = "z", sPersonName;
string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
double[] dFactor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
/*------------------------------------
About parameter:
BBCD 343 0.003606
BBC 2 0.000021
BBE 125 0.001314
BBZ 30 0.000315
BCD 62460 0.656624
BEE 0 0.000000
BE 13899 0.146116
BG 869 0.009136
BXD 4 0.000042
BZ 3707 0.038971
CD 8596 0.090367
EE 26 0.000273
FB 871 0.009157
Y 3265 0.034324
XD 926 0.009735
The person recognition patterns set
BBCD:姓+姓+名1+名2;
BBE: 姓+姓+单名;
BBZ: 姓+姓+双名成词;
BCD: 姓+名1+名2;
BE: 姓+单名;
BEE: 姓+单名+单名;韩磊磊
BG: 姓+后缀
BXD: 姓+姓双名首字成词+双名末字
BZ: 姓+双名成词;
B: 姓
CD: 名1+名2;
EE: 单名+单名;
FB: 前缀+姓
XD: 姓双名首字成词+双名末字
Y: 姓单名成词
------------------------------------*/
int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };
//Convert to string from POS
sb.Append('z');
for (i = 1; m_nBestTag[i] > -1; i++)
sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
sPOS = sb.ToString();
int j = 1, k, nPos; //Find the proper pattern from the first POS
int nLittleFreqCount; //Counter for the person name role with little frequecy
bool bMatched = false;
while (j < i)
{
bMatched = false;
for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
{
if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 &&
string.Compare(m_sWords[j - 1], "·") != 0 && string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
{
//Find the proper pattern k
if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
{
//Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效;
continue;
}
/*
if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
{//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同,规则失效.如:韩磊磊
continue;
}
if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
{//Rule 3 for exclusion: 若姓后不是后缀,规则失效.如:江主席、刘大娘
continue;
}
*/
//Get the possible name
nPos = j; //Record the person position in the tag sequence
sPersonName = null;
nLittleFreqCount = 0; //Record the number of role with little frequency
while (nPos < j + nPatternLen[k])
{
//Get the possible person name
//
if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -