📄 span.cpp
字号:
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Filename: Span.cpp
* Abstract:
* implementation of the CSpan class.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-4-23
*
* Notes: Tagging with Hidden Markov Model
*
****************************************************************************/
#include "stdafx.h"
#include "Span.h"
#include "..\\Segment\\Segment.h"
#include "..\\Utility\\Utility.h"
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Builds an empty span that contains only the virtual sentence-begin item
// at index 0.
//
// The tagging type is assigned FIRST. The previous version tested m_tagType
// before any value had been stored in it, so the begin tag depended on an
// indeterminate value. With the default type TT_NORMAL the begin tag is 0,
// which is what Reset(false) also produces for TT_NORMAL.
CSpan::CSpan()
{
    m_tagType=TT_NORMAL;//Default tagging type
    m_nTags[0][0]=0;//Begin tag for TT_NORMAL
    m_nTags[0][1]=-1;//-1 terminates the candidate-tag list of item 0
    m_dFrequency[0][0]=0;
    m_nCurLength=1;//Only the virtual begin item is present
    m_nUnknownIndex=0;
    m_nStartPos=0;
    m_nWordPosition[1]=0;
    m_sWords[0][0]=0;//The begin item has an empty word string
}
// Destructor: the body is intentionally empty, no cleanup is performed here.
CSpan::~CSpan() {}
// Forward pass of the tag search over the current span.
//
// For every word i (from 1) and every candidate tag j of that word, picks the
// candidate k of the previous word that minimises
//     -log(context possibility of (tag k -> tag j)) + m_dFrequency[i-1][k]
// stores k in m_nBestPrev[i][j] and adds the minimal cost to m_dFrequency[i][j].
// Candidate lists in m_nTags are terminated by a negative value.
//
// Fixes over the previous version:
//  * "no predecessor chosen yet" is detected by comparing against the sentinel
//    itself instead of the magic test "nMinCandidate>10", which made every
//    predecessor after index 10 win regardless of its cost;
//  * dMinFee is initialised, so nothing indeterminate is added when the
//    previous word has no candidate tag at all.
//
// Always returns true.
bool CSpan::Disamb()
{
    int i,j,k,nMinCandidate;
    double dMinFee,dTmp;
    const int nNoCandidate=MAX_POS_PER_WORD+1;//Sentinel: no predecessor selected yet
    for(i=1;i<m_nCurLength;i++)//For every word
    {
        for(j=0;m_nTags[i][j]>=0;j++)//For every candidate tag of word i
        {
            nMinCandidate=nNoCandidate;
            dMinFee=0.0;
            for(k=0;m_nTags[i-1][k]>=0;k++)//For every candidate tag of the previous word
            {
                dTmp=-log(m_context.GetContextPossibility(0,m_nTags[i-1][k],m_nTags[i][j]));
                dTmp+=m_dFrequency[i-1][k];//Add the accumulated fee of the predecessor
                if(nMinCandidate==nNoCandidate||dTmp<dMinFee)//Keep the minimum fee
                {
                    nMinCandidate=k;
                    dMinFee=dTmp;
                }
            }
            //NOTE(review): when the previous word has no candidate tag the
            //sentinel itself is stored here, exactly as before.
            m_nBestPrev[i][j]=nMinCandidate;//The best previous candidate for j
            m_dFrequency[i][j]=m_dFrequency[i][j]+dMinFee;
        }
    }
    return true;
}
// Re-initialises the span so that only the virtual begin item (index 0) remains.
//
// bContinue == true : item 0 inherits the first tag and its fee from the last
//                     item of the span being discarded; m_nStartPos and
//                     m_nUnknownIndex are kept.
// bContinue == false: item 0 gets the begin tag (0 for TT_NORMAL, 100 for any
//                     other tagging type) with fee 0, and m_nStartPos and
//                     m_nUnknownIndex are cleared.
//
// Always returns true.
bool CSpan::Reset(bool bContinue)
{
    if(bContinue)
    {
        //Carry over the first tag of the last item and its fee
        const int nLast=m_nCurLength-1;
        m_nTags[0][0]=m_nTags[nLast][0];
        m_dFrequency[0][0]=m_dFrequency[nLast][0];
    }
    else
    {
        //Start from scratch with the begin tag of the active tagging type
        m_nTags[0][0]=(m_tagType==TT_NORMAL)?0:100;
        m_nUnknownIndex=0;
        m_dFrequency[0][0]=0;
        m_nStartPos=0;
    }

    m_nTags[0][1]=-1;//End flag of the candidate list of item 0
    m_nCurLength=1;
    m_nWordPosition[1]=m_nStartPos;
    m_sWords[0][0]=0;//Item 0 carries an empty word
    return true;
}
// Loads the context data used by the tagger from sFilename by forwarding to
// m_context.Load(); the result of that call is returned unchanged.
bool CSpan::LoadContext(char *sFilename)
{
    const bool bLoaded=m_context.Load(sFilename);
    return bLoaded;
}
// Scans the best-tag sequence for the template "b (c|d)*" and records each
// match as an unknown word.
//
// The tags m_nBestTag[1..] (terminated by a negative value) are mapped to the
// letters 'a'+tag. Every 'b' followed by any run of 'c'/'d' is one match; its
// start and end positions (taken from m_nWordPosition) are appended to
// m_nUnknownWords and m_nUnknownIndex is advanced.
//
// Fixes over the previous version:
//  * the index i is declared outside the for statement because it is still
//    needed after the loop (using it afterwards is ill-formed in standard C++);
//  * filling sPOS stops before the buffer end;
//  * characters are no longer compared against NULL; the redundant pointer and
//    terminator tests were dropped (strchr only returns a pointer to a 'b').
//
// NOTE(review): m_nUnknownIndex is not checked against the capacity of
// m_nUnknownWords; its size is not visible from here.
//
// Always returns true.
bool CSpan::UnknownMatch()
{//Find the template
    char sPOS[MAX_WORDS_PER_SENTENCE]="z";
    int nStart,nEnd;
    int i;
    for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&m_nBestTag[i]>-1;i++)
        sPOS[i]=static_cast<char>(m_nBestTag[i]+'a');
    sPOS[i]=0;
    char *pFind=strchr(sPOS+1,'b');
    char *pFindEnd;
    while(pFind!=NULL)
    {
        pFindEnd=pFind+1;
        while(*pFindEnd=='c'||*pFindEnd=='d')//Extend over the trailing c/d run
            pFindEnd=pFindEnd+1;
        nStart=static_cast<int>(pFind-sPOS);
        nEnd=static_cast<int>(pFindEnd-sPOS);
        //Save the unknown word position
        m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
        m_nUnknownWords[m_nUnknownIndex++][1]=m_nWordPosition[nEnd];
        pFind=strchr(pFindEnd,'b');
    }
    return true;
}
// Runs Disamb() and then walks the m_nBestPrev back-pointers from the last
// item to item 1, writing the selected tag of every item with a non-empty
// word into m_nBestTag. The tag list is closed with -1: at index
// m_nCurLength-1 when the last item has an empty word (virtual ending),
// otherwise at index m_nCurLength.
//
// Always returns true.
bool CSpan::GetBestPOS()
{
    Disamb();

    int nWord=m_nCurLength-1;
    int nChoice=0;//Candidate index selected for the item currently visited
    while(nWord>0)
    {
        if(m_sWords[nWord][0]!=0)
        {//Not virtual ending: record the best POS
            m_nBestTag[nWord]=m_nTags[nWord][nChoice];
        }
        nChoice=m_nBestPrev[nWord][nChoice];
        --nWord;
    }

    //Set the end of POS tagging
    const int nLast=m_nCurLength-1;
    const int nTerminator=(m_sWords[nLast][0]==0)?nLast:m_nCurLength;
    m_nBestTag[nTerminator]=-1;
    return true;
}
// Splits every item whose best tag is 21 or 22 into two consecutive items.
//
// The span is scanned from the last item down to item 1. For each hit, all
// items behind it are shifted one slot to the right (word, best tag and word
// position), m_nCurLength grows by one, and the word is cut into a first and a
// last part:
//   tag 21: the last part is the trailing 4 bytes when the word is longer than
//           4 bytes and unlistDict.IsExist(part,-1) holds, otherwise the
//           trailing 2 bytes; the parts are tagged 11 and 1.
//   tag 22: the first part is the leading 4 bytes when the word is longer than
//           4 bytes and unlistDict.IsExist(part,-1) holds, otherwise the
//           leading 2 bytes; the last part is tagged 12 and the first part 1,
//           4 or 3 depending on the dictionary checks below.
// Always returns true.
//
// NOTE(review): the 2/4-byte cuts presumably correspond to one/two double-byte
// characters - confirm against the text encoding used by the callers.
// NOTE(review): sFirstPart/sLastPart are 50-byte buffers filled without any
// length check, and nLenWord-2 is unsigned arithmetic that wraps for words
// shorter than 2 bytes.
// NOTE(review): the shift loop starts at m_nCurLength-1, so entries stored at
// index m_nCurLength itself are overwritten rather than moved.
bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
{//Split the word with POS 21 and 22
int i=m_nCurLength-1,j;
unsigned int nLenWord,nLenPart;
char sFirstPart[50],sLastPart[50];
int nFirstPOS,nLastPOS;
for(;i>0;i--)
{
if(m_nBestTag[i]==21||m_nBestTag[i]==22)
{//Found a POS which needs to be split
for(j=m_nCurLength-1;j>i;j--)
{//Move the following POS, words and positions one slot to the right
strcpy(m_sWords[j+1],m_sWords[j]);
m_nBestTag[j+1]=m_nBestTag[j];
m_nWordPosition[j+1]=m_nWordPosition[j];
}
m_nCurLength+=1;//The length increment
/*
CSegment segment;
segment.Segment(m_sWords[i],unlistDict,1);
*/
//Generate new segment words and POS
if(m_nBestTag[i]==21)
{//Tag 21: cut off the trailing component; the parts become tags 11 and 1
nLenWord=strlen(m_sWords[i]);
if(nLenWord>4)//Try the trailing 4 bytes first, fall back to the trailing 2
{
strcpy(sLastPart,m_sWords[i]+nLenWord-4);
if(!unlistDict.IsExist(sLastPart,-1))
strcpy(sLastPart,m_sWords[i]+nLenWord-2);
}
else
{
strcpy(sLastPart,m_sWords[i]+nLenWord-2);
}
nLenPart=strlen(sLastPart);
if(nLenPart<nLenWord)
{//The first part is whatever precedes the last part
strncpy(sFirstPart,m_sWords[i],nLenWord-nLenPart);
sFirstPart[nLenWord-nLenPart]=0;
}
else
{//The last part covers the whole word: force a cut 2 bytes before the end
strncpy(sFirstPart,m_sWords[i],nLenWord-2);
sFirstPart[nLenWord-2]=0;
strncpy(sLastPart,m_sWords[i]+nLenWord-2,2);
sLastPart[2]=0;
}
nFirstPOS=11;
nLastPOS=1;
}
else
{//Tag 22: cut off the leading component; the last part becomes tag 12
nLenWord=strlen(m_sWords[i]);
if(nLenWord>4)//Try the leading 4 bytes first, fall back to the leading 2
{
strncpy(sFirstPart,m_sWords[i],4);
sFirstPart[4]=0;
if(!unlistDict.IsExist(sFirstPart,-1))
sFirstPart[2]=0;
}
else
{
strncpy(sFirstPart,m_sWords[i],2);
sFirstPart[2]=0;
}
nLenPart=strlen(sFirstPart);
if(nLenPart<nLenWord)
{//The last part is whatever follows the first part
strncpy(sLastPart,m_sWords[i]+nLenPart,nLenWord-nLenPart);
sLastPart[nLenWord-nLenPart]=0;
}
else
{//The first part covers the whole word: force a cut after the first 2 bytes
strncpy(sFirstPart,m_sWords[i],2);
sFirstPart[2]=0;
strncpy(sLastPart,m_sWords[i]+2,nLenWord-2);
sLastPart[nLenWord-2]=0;
}
if(unlistDict.IsExist(sFirstPart,1)&&m_nBestTag[i-1]==5)
//Example given by the original author: "Xiao Chen said:"
nFirstPOS=1;
//NOTE(review): for i==1 the second operand below would index m_sWords[-1];
//it is only evaluated when IsExist(m_sWords[0],1) is true.
else if(unlistDict.IsExist(m_sWords[i-1],1)&&!unlistDict.IsExist(m_sWords[i-2],1))
nFirstPOS=4;
else
nFirstPOS=3;
nLastPOS=12;
}
//Store both halves in slots i and i+1 and fix the position of the second one
strcpy(m_sWords[i],sFirstPart);
m_nBestTag[i]=nFirstPOS;
strcpy(m_sWords[i+1],sLastPart);
m_nBestTag[i+1]=nLastPOS;
m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(sFirstPart);
}
}
return true;
}
// Matches the best-tag sequence against a fixed set of person-name role
// patterns and records every accepted match as an unknown word.
//
// The tags m_nBestTag[1..] (terminated by a negative value) are mapped to the
// letters 'A'+tag. At each position j the patterns are tried in table order;
// an accepted match stores its start/end word positions in m_nUnknownWords,
// its score log(dFactor[k])+ComputePossibility(...) in m_dWordsPossibility,
// advances m_nUnknownIndex and skips past the match. Several exclusion rules
// reject a candidate (see the comments in the loop).
//
// Fixes over the previous version:
//  * the index i is declared outside the for statement because it is used
//    after the loop (ill-formed in standard C++ otherwise);
//  * filling sPOS stops before the buffer end;
//  * the candidate name is assembled with a bounded strncat so the 100-byte
//    sPersonName buffer cannot be overrun.
//
// NOTE(review): m_nUnknownIndex is not checked against the capacity of
// m_nUnknownWords / m_dWordsPossibility; their sizes are not visible here.
//
// Always returns true.
bool CSpan::PersonRecognize(CDictionary &personDict)
{
    char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
    char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
                          "BG", "BXD","BZ", "CDCD","CD","EE",
                          "FB", "Y","XD",""};
    double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
                      0.0160,0.0011,0.0011,0,0.0160,0.0011,
                      0.0160,0.0011,0.0011,0
                     };
    //About the parameters (figures recorded by the original author):
    /*
    Given Name: 486 0.0160
    Surname+postfix:484 0.0160
    m_lPerson2Num:6265 0.2055
    m_lPerson3Num: 23184 0.7614
    m_lPerson4Num:32 0.0011
    */
    //The person recognition patterns set
    //BBCD: surname + surname + given-name char 1 + given-name char 2
    //BBE:  surname + surname + single-char given name
    //BBZ:  surname + surname + two-char given name that forms a word
    //BCD:  surname + given-name char 1 + given-name char 2
    //BE:   surname + single-char given name
    //BEE:  surname + single-char name + single-char name (e.g. "Han Leilei")
    //BG:   surname + suffix
    //BXD:  surname + (surname and first given-name char forming a word) + last given-name char
    //BZ:   surname + two-char given name that forms a word
    //B:    surname
    //CD:   given-name char 1 + given-name char 2
    //EE:   single-char name + single-char name
    //FB:   prefix + surname
    //XD:   (surname and first given-name char forming a word) + last given-name char
    //Y:    surname and single-char given name forming a word
    int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
    int i;
    for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&m_nBestTag[i]>-1;i++)//Convert to string from POS
        sPOS[i]=static_cast<char>(m_nBestTag[i]+'A');
    sPOS[i]=0;
    int j=1,k,nPos;//Find the proper pattern from the first POS
    int nLittleFreqCount;//Counter for the person name roles with little frequency
    bool bMatched=false;
    while(j<i)
    {
        bMatched=false;
        for(k=0;!bMatched&&nPatternLen[k]>0;k++)
        {
            if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
            {//Find the proper pattern k
                if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
                {//Exclusion rule 1: prefix + surname followed by a given-name/suffix role:
                 //the rule (prefix + surname) does not apply
                    continue;
                }
                /* Disabled exclusion rules kept from the original:
                if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                {//Rule 2: for BEE / EE the rule is void when the two E words differ
                    continue;
                }
                if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                {//Rule 3: a lone surname is void unless it is followed by a suffix
                    continue;
                }
                */
                //Get the possible name
                nPos=j;//Record the person position in the tag sequence
                sPersonName[0]=0;
                nLittleFreqCount=0;//Record the number of roles with little frequency
                while(nPos<j+nPatternLen[k])
                {//Assemble the candidate name and count its low-frequency roles
                    if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
                        nLittleFreqCount++;//The counter increase
                    //Bounded append: never write past the end of sPersonName
                    strncat(sPersonName,m_sWords[nPos],sizeof(sPersonName)-strlen(sPersonName)-1);
                    nPos+=1;
                }
                if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                {//Exclusion of foreign names:
                 //if every character is a foreign-name character the rule is void
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CDCD")==0)
                {//CDCD is itself an exclusion rule (e.g. a transliterated name inside a sentence).
                 //It applies when the candidate contains foreign-name characters;
                 //otherwise the exclusion is void and the position is left untouched.
                    if(GetForeignCharCount(sPersonName)>0)
                        j+=nPatternLen[k]-1;
                    continue;
                }
                if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                {//A CD candidate made only of foreign-name characters is skipped
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                    //All roles (or three of them) appear with low frequency:
                    //ignore the candidate
                    continue;
                m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
                //Multiply by the pattern factor (addition in log space)
                m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
                m_nUnknownIndex+=1;
                j+=nPatternLen[k];
                bMatched=true;
            }
        }
        if(!bMatched)//Not matched, add j by 1
            j+=1;
    }
    return true;
}
int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
{
int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
int nFreq=0,j,nRetPos=0,nWordsIndex=0;
bool bSplit=false;//Need to split in Transliteration recognition
int i=1;
nWordsIndex=i+nIndex-1;
for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
{
if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
{
strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
}
else
{
if(!bSplit)
{
strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store current word
m_sWords[i][2]=0;
bSplit=true;
}
else
{
unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store current word
m_sWords[i][nLen]=0;
bSplit=false;
}
m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
}
//Record the position of current word
m_nStartPos=m_nWordPosition[i+1];
//Move the Start POS to the ending
if(m_tagType!=TT_NORMAL)
{
//Get the POSs from the unknown recognition dictionary
dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
for(j=0;j<nCount;j++)
{//Get the POS set of sCurWord in the unknown dictionary
m_nTags[i][j]=aPOS[j];
m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
}
//Get the POS set of sCurWord in the core dictionary
//We ignore the POS in the core dictionary and recognize them as other (0).
//We add their frequency to get the possibility as POS 0
dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
nFreq=0;
for(int k=0;k<nCount;k++)
{
nFreq+=aFreq[k];
}
if(nCount>0)
{
m_nTags[i][j]=0;
//m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
j++;
}
}
else//For normal POS tagging
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -