📄 cspan.java
字号:
package com.gftech.ictclas4j.tag;
import com.gftech.common.GFCommon;
import com.gftech.common.GFString;
import com.gftech.ictclas4j.utility.CContextStat;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
import com.gftech.ictclas4j.utility.Utility;
public class CSpan {
// Tagging mode; selects the branch taken in POSTagging and is changed through SetTagType.
private Final.TAG_TYPE m_tagType;
// Running position; GetFrom sets it to the end position of the last word it stored.
private int m_nStartPos;
// Best tag chosen for each word; readers in this class scan it until they hit -1.
private int[] m_nBestTag = new int[Final.MAX_WORDS_PER_SENTENCE];
// Bytes of each word of the current span; index 0 is a sentinel whose first byte is 0.
private byte[][] m_sWords = new byte[Final.MAX_WORDS_PER_SENTENCE][Final.WORD_MAXLENGTH];
// Cumulative word positions: entry i+1 is entry i plus the length of word i (see GetFrom).
private int[] m_nWordPosition = new int[Final.MAX_WORDS_PER_SENTENCE];
// Candidate tags per word; m_nTags[0][0] is the begin tag and m_nTags[0][1] is -1.
private int[][] m_nTags = new int[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];
// Not referenced in the visible part of this file.
// NOTE(review): presumably the best-predecessor table of the tagger -- confirm.
private byte[][] m_nBestPrev = new byte[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];
// Starts at 1; POSTagging uses it as the exclusive upper bound when storing best tags.
private int m_nCurLength;
// Per word and per candidate tag: the log-scale value computed in GetFrom.
private double[][] m_dFrequency = new double[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];
// The number of unknown words recorded so far.
public int m_nUnknownIndex;
// The start ([n][0]) and ending ([n][1]) position of each recorded unknown word.
public int[][] m_nUnknownWords = new int[Final.MAX_UNKNOWN_PER_SENTENCE][2];
// The possibility value stored for each recorded unknown word.
public double[] m_dWordsPossibility = new double[Final.MAX_UNKNOWN_PER_SENTENCE];
// Context statistics; dereferenced by LoadContext, ComputePossibility and GetFrom.
// NOTE(review): not assigned by the constructor -- confirm who initializes it.
public CContextStat m_context;
/**
 * Creates a span in its initial state with the default tagging type.
 *
 * <p>The tagging type is assigned <em>before</em> the begin tag is chosen. Reading the
 * field first would see {@code null}, which compares unequal to {@code TT_NORMAL} and
 * would select the non-normal begin tag (100) even though the default type is normal.
 *
 * <p>NOTE(review): {@code m_context} is not assigned here although LoadContext and
 * ComputePossibility dereference it -- confirm that callers set it.
 */
public CSpan() {
    // Default tagging type; must be in place before the begin tag is derived from it.
    m_tagType = Final.TAG_TYPE.TT_NORMAL;

    // Begin tag of the sentinel word at index 0: 0 for normal tagging, 100 otherwise.
    m_nTags[0][0] = (m_tagType != Final.TAG_TYPE.TT_NORMAL) ? 100 : 0;
    m_nTags[0][1] = -1; // terminates the tag list of the sentinel word
    m_dFrequency[0][0] = 0;

    m_nCurLength = 1;
    m_nUnknownIndex = 0;
    m_nStartPos = 0;
    m_nWordPosition[1] = 0;
    m_sWords[0][0] = 0; // sentinel word is the empty string
}
/**
 * Walks the best-tag sequence (from index 1 until the first negative tag) and records
 * every run that starts with tag 1 or tag 2 as an unknown-word candidate.
 *
 * <p>A run started by tag 1 consists of further tag-1 words, then tag-2 words, then
 * tag-3 words. A run started by tag 2 consists of further tag-2 words, then tag-3 words.
 * For each run the start/end positions go into {@code m_nUnknownWords} and the score
 * {@code ComputePossibility(...) + log(penalty)} goes into {@code m_dWordsPossibility}.
 *
 * <p>The penalty starts at 1.0 and is never reset inside this method, so it keeps
 * growing across the runs found in one call.
 *
 * @param dictCore  not used by this method
 * @param placeDict dictionary handed to ComputePossibility
 * @return always {@code true}
 */
public boolean PlaceRecognize(CDictionary dictCore, CDictionary placeDict) {
    int runStart = 1;
    int runEnd = 1;
    int cursor = 1;
    double penalty = 1.0;

    while (m_nBestTag[cursor] > -1) {
        final int trigger = m_nBestTag[cursor];
        if (trigger == 1 || trigger == 2) {
            runStart = cursor;
            runEnd = runStart + 1;
            if (trigger == 1) {
                // Further tag-1 words; every one beyond the first extra costs a penalty.
                for (; m_nBestTag[runEnd] == 1; runEnd++) {
                    if (runEnd > runStart + 1) {
                        penalty += 1.0;
                    }
                }
            } else {
                // A run that starts directly with tag 2 is penalized once up front.
                penalty += 1.0;
            }
            // Tag-2 words extend the run without penalty.
            while (m_nBestTag[runEnd] == 2) {
                runEnd++;
            }
            // Tag-3 words: the first one is free, each further one costs a penalty.
            final int tailStart = runEnd;
            for (; m_nBestTag[runEnd] == 3; runEnd++) {
                if (runEnd > tailStart) {
                    penalty += 1.0;
                }
            }
        }

        // runStart/runEnd survive across iterations; they differ only right after a run
        // has been found, because runStart is pulled up to runEnd once it is recorded.
        if (runEnd > runStart) {
            m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[runStart];
            m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[runEnd];
            // The scored range covers indices runStart..runEnd inclusive.
            m_dWordsPossibility[m_nUnknownIndex++] =
                    ComputePossibility(runStart, runEnd - runStart + 1, placeDict) + Math.log(penalty);
            runStart = runEnd;
        }

        // Jump past the run if there was one, otherwise step to the next word.
        cursor = Math.max(cursor + 1, runEnd);
    }
    return true;
}
/**
 * Matches the best-tag sequence against a fixed list of person-name role patterns and
 * records every match as an unknown-word candidate.
 *
 * <p>Each best tag is turned into a letter ({@code tag + 'A'}); the resulting string is
 * scanned left to right, and at each position the patterns are tried in list order. A
 * match stores the start/end positions in {@code m_nUnknownWords} and the score
 * {@code -log(dFactor[k]) + ComputePossibility(...)} in {@code m_dWordsPossibility}.
 *
 * <p>Role letters used by the patterns (from the original author's notes):
 * B surname; C first character of a two-character given name; D last character of a
 * two-character given name; E single-character given name; F prefix; G suffix;
 * X surname and first given-name character forming a word; Y surname and
 * single-character given name forming a word; Z two-character given name forming a word.
 *
 * <p>The name-separator byte is derived with an explicit GB2312 charset, because the
 * word bytes are decoded as GB2312 elsewhere in this class; the platform default
 * charset would yield a different byte on non-GB platforms.
 *
 * @param personDict dictionary used for frequency lookups and handed to ComputePossibility
 * @return always {@code true}
 */
public boolean PersonRecognize(CDictionary personDict) {
    // First byte of the name separator in the encoding of the stored words.
    // NOTE(review): only the lead byte is compared, so any word starting with the same
    // lead byte is treated as a separator -- confirm whether a full comparison is wanted.
    final byte separatorLead = "·".getBytes(java.nio.charset.Charset.forName("GB2312"))[0];

    byte[] sPOS = new byte[Final.MAX_WORDS_PER_SENTENCE];
    java.util.Arrays.fill(sPOS, (byte) 'z');
    byte[] sPersonName = new byte[100];

    // The person recognition pattern set; the empty pattern terminates the list.
    String[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE",
            "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
    // Factor per pattern, in the same order as sPatterns.
    double[] dFactor = {
            // BBCD     BBC       BBE       BBZ       BCD       BEE       BE        BG
            0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
            // BXD      BZ        CDCD CD       EE        FB        Y         XD
            0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    // Length of each pattern; the trailing 0 stops the pattern loop.
    int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    // Convert the best tags into a letter string; i ends one past the last tagged word.
    int i;
    for (i = 1; m_nBestTag[i] > -1; i++) {
        sPOS[i] = (byte) (m_nBestTag[i] + 'A');
    }
    sPOS[i] = 0;

    int j = 1; // current position in the letter string
    while (j < i) {
        boolean bMatched = false;
        for (int k = 0; !bMatched && nPatternLen[k] > 0; k++) {
            // Pattern k must match at j, and neither neighbour may start with the separator.
            if (Utility.strncmp(sPatterns[k].getBytes(), 0,
                            GFCommon.bytesCopy(sPOS, j, nPatternLen[k]), nPatternLen[k])
                    && m_sWords[j - 1][0] != separatorLead
                    && m_sWords[j + nPatternLen[k]][0] != separatorLead) {

                // Exclusion rule 1: prefix + surname followed by a given-name or suffix
                // role (E, C or G) -- the prefix + surname pattern does not apply.
                if ("FB".equals(sPatterns[k])
                        && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G')) {
                    continue;
                }
                // (The original source carries two more exclusion rules that are disabled:
                // one requiring the two E words of BEE/EE to be identical, and one
                // requiring a suffix after a lone surname.)

                // Walk the matched words: count low-frequency roles and copy the word
                // bytes into the name buffer.
                // NOTE(review): nLittleFreqCount is only read by a disabled rule, and each
                // copy targets the same buffer with the full word-array length -- this
                // looks like it overwrites rather than appends; confirm bytesCopy semantics.
                int nLittleFreqCount = 0;
                sPersonName[0] = 0;
                for (int nPos = j; nPos < j + nPatternLen[k]; nPos++) {
                    if (m_nBestTag[nPos] < 4
                            && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos])
                                    < Final.LITTLE_FREQUENCY) {
                        nLittleFreqCount++;
                    }
                    GFCommon.bytesCopy(sPersonName, m_sWords[nPos], 0, m_sWords[nPos].length);
                }

                // CDCD (given1 + given2 + given1 + given2) is itself an exclusion pattern:
                // it never yields a name. When the candidate contains characters used in
                // foreign names, the scan position additionally skips ahead; otherwise
                // the remaining patterns are tried at the same position.
                if ("CDCD".equals(sPatterns[k])) {
                    if (Utility.GetForeignCharCount(sPersonName) > 0) {
                        j += nPatternLen[k] - 1;
                    }
                    continue;
                }

                // Record the candidate: start/end positions and score.
                m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + nPatternLen[k]];
                m_dWordsPossibility[m_nUnknownIndex] = -Math.log(dFactor[k])
                        + ComputePossibility(j, nPatternLen[k], personDict);
                m_nUnknownIndex += 1;

                j += nPatternLen[k];
                bMatched = true;
            }
        }
        if (!bMatched) {
            // No pattern matched here; move on by one word.
            j += 1;
        }
    }
    return true;
}
/**
 * POS tagging with Hidden Markov Model.
 *
 * <p>Repeatedly loads a stretch of words with GetFrom, picks the best tags with
 * GetBestPOS, and then acts according to the current tag type: normal tagging writes
 * the best tag (and, where applicable, the tag-specific frequency) back into
 * {@code pWordItems}; the other types run person or place recognition.
 *
 * @param pWordItems  word items; scanning stops at an item whose first word byte is 0
 * @param dictCore    core dictionary
 * @param dictUnknown unknown-word recognition dictionary
 * @return always {@code true}
 */
public boolean POSTagging(TagWordResult[] pWordItems, CDictionary dictCore,
        CDictionary dictUnknown) {
    Reset(false);

    int next = 0;
    while (next > -1 && pWordItems[next].sWord[0] != 0) {
        final int base = next; // index of the first item of this stretch
        next = GetFrom(pWordItems, base, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType) {
        case TT_NORMAL:
            // Store the best tag of every word of the stretch.
            for (int j = 1; m_nBestTag[j] != -1 && j < m_nCurLength; j++) {
                TagWordResult item = pWordItems[j + base - 1];
                item.nHandle = m_nBestTag[j];
                // A word with a positive value that exists in the core dictionary gets
                // its frequency under the chosen tag as the new value.
                if (item.dValue > 0 && dictCore.IsExist(item.sWord, -1)) {
                    item.dValue = dictCore.GetFrequency(item.sWord, m_nBestTag[j]);
                }
            }
            break;
        case TT_PERSON:
            PersonRecognize(dictUnknown);
            break;
        case TT_PLACE:
        case TT_TRANS_PERSON:
            // Place names and transliterated person names share one recognizer.
            PlaceRecognize(dictCore, dictUnknown);
            break;
        default:
            break;
        }

        // NOTE(review): this resets a fresh, discarded instance rather than this one --
        // looks like a porting slip; confirm whether this object should be reset here.
        CSpan scratch = new CSpan();
        scratch.Reset(false);
    }
    return true;
}
/**
 * Set the tag type.
 *
 * @param nType tagging type that POSTagging and GetFrom consult from now on
 */
public void SetTagType(Final.TAG_TYPE nType) {
    this.m_tagType = nType;
}
/**
 * Loads the context statistics by delegating to {@code m_context.Load}.
 *
 * @param sFilename file name passed through unchanged
 * @return the result reported by {@code m_context.Load}
 */
public boolean LoadContext(String sFilename) {
    final boolean loaded = this.m_context.Load(sFilename);
    return loaded;
}
/**
 * Sums, over a range of words, the log-ratio between how often a tag occurs in the
 * context statistics and how often the word carries that tag in the given dictionary.
 *
 * <p>For each word the term is
 * {@code log(contextFrequency(tag) + 1) - log(wordFrequency(word, tag) + 1)},
 * using the word's best tag.
 *
 * @param nStartPos index of the first word of the range
 * @param nLength   number of words in the range
 * @param dict      dictionary queried for the word/tag frequency
 * @return the sum of the per-word terms; 0 for an empty range
 */
protected double ComputePossibility(int nStartPos, int nLength,
        CDictionary dict) {
    double total = 0;
    final int limit = nStartPos + nLength;
    int pos = nStartPos;
    while (pos < limit) {
        final int tag = m_nBestTag[pos];
        // How often this word occurs with its best tag.
        final int wordFreq = dict.GetFrequency(m_sWords[pos], tag);
        // How likely the tag itself is, relative to the word carrying it.
        final double tagTerm = Math.log((double) (m_context.GetFrequency(0, tag) + 1))
                - Math.log((double) (wordFreq + 1));
        total += tagTerm;
        // (A tag-to-tag context term existed in the original source but is disabled.)
        pos++;
    }
    return total;
}
protected int GetFrom(TagWordResult[] pWordItems, int nIndex,
CDictionary dictCore, CDictionary dictUnknown) {
int nCount=0;
int[] aPOS=new int[Final.MAX_POS_PER_WORD];
int[] aFreq=new int[Final.MAX_POS_PER_WORD];
int nFreq=0,j,nRetPos=0,nWordsIndex=0;
boolean bSplit=false;//Need to split in Transliteration recognition
int i=1,nPOSCount;
byte[] sCurWord=new byte[Final.WORD_MAXLENGTH];//Current word
nWordsIndex=i+nIndex-1;
for(;i<Final.MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
{
if(m_tagType==Final.TAG_TYPE.TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
{//store current word
GFCommon.bytesCopy(m_sWords[i],pWordItems[nWordsIndex].sWord,0,pWordItems[nWordsIndex].sWord.length);
m_nWordPosition[i+1]=m_nWordPosition[i]+ m_sWords[i].length;
}
else
{
if(!bSplit)
{
GFCommon.bytesCopy(m_sWords[i],pWordItems[nWordsIndex].sWord,0,2);//store current word
m_sWords[i][2]=0;
bSplit=true;
}
else
{
int nLen= pWordItems[nWordsIndex].sWord.length-2;
byte[] bt=GFCommon.bytesCopy(pWordItems[nWordsIndex].sWord,0,2);
GFCommon.bytesCopy(m_sWords[i],bt,0,nLen);//store current word
m_sWords[i][nLen]=0;
bSplit=false;
}
m_nWordPosition[i+1]=m_nWordPosition[i]+ m_sWords[i].length;
}
//Record the position of current word
m_nStartPos=m_nWordPosition[i+1];
//Move the Start POS to the ending
if(m_tagType!=Final.TAG_TYPE.TT_NORMAL)
{
//Get the POSs from the unknown recognition dictionary
GFCommon.bytesCopy( sCurWord,m_sWords[i],0,m_sWords[i].length);
if(m_tagType==Final.TAG_TYPE.TT_TRANS_PERSON&&i>0&&Utility.charType( m_sWords[i-1][0],m_sWords[i-1][1])==Final.CT_CHINESE)
{
if(m_sWords[i][0]=='.'&&m_sWords[i][1]==0)
GFCommon.bytesCopy(sCurWord,".".getBytes(),0,".".getBytes().length);
else if(m_sWords[i][0]=='-'&&m_sWords[i][1]==0)
GFCommon.bytesCopy(sCurWord,"-".getBytes(),0,"-".getBytes().length);
}
dictUnknown.GetHandle(sCurWord, nCount,aPOS,aFreq);
nPOSCount=nCount+1;
for(j=0;j<nCount;j++)
{//Get the POS set of sCurWord in the unknown dictionary
m_nTags[i][j]=aPOS[j];
m_dFrequency[i][j]=-Math.log((double)(1+aFreq[j]))+Math.log((double)(m_context.GetFrequency(0,aPOS[j])+nPOSCount));
}
//Get the POS set of sCurWord in the core dictionary
//We ignore the POS in the core dictionary and recognize them as other (0).
//We add their frequency to get the possibility as POS 0
if(GFString.getChineseString(m_sWords[i],"gb2312").indexOf("始##始")==0)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -