📄 span.cpp
字号:
{//No ambuguity, so we can break from the loop
i++;
m_sWords[i][0]=0;
break;
}
if(!bSplit)
nWordsIndex++;
}
if(pWordItems[nWordsIndex].sWord[0]==0)
nRetPos=-1;//Reaching ending
if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
{//Set end for words like "张/华/平"
if(m_tagType!=TT_NORMAL)
m_nTags[i][0]=101;
else
m_nTags[i][0]=1;
m_dFrequency[i][0]=0;
m_sWords[i][0]=0;//Set virtual ending
m_nTags[i++][1]=-1;
}
m_nCurLength=i;//The current word count
if(nRetPos!=-1)
return nWordsIndex+1;//Next start position
return -1;//Reaching ending
}
//Set the tag type
void CSpan::SetTagType(enum TAG_TYPE nType)
{
m_tagType=nType;
}
//POS tagging with Hidden Markov Model
//pWordItems: the word items to tag; the list ends at the first item whose sWord is empty
//dictCore: core dictionary; dictUnknown: unknown-word recognition dictionary
//Always returns true
bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
{
	int i=0,j,nStartPos;
	Reset(false);
	//Process the items segment by segment until GetFrom reports a negative
	//position or the terminating (empty) item is reached
	while(i>-1&&pWordItems[i].sWord[0]!=0)
	{
		nStartPos=i;//Start position of this segment
		i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
		GetBestPOS();
		switch(m_tagType)
		{
		case TT_NORMAL://normal POS tagging
			//Test the bound before reading m_nBestTag[j], so that the array is
			//never read at an index that has not been checked against m_nCurLength
			for(j=1;j<m_nCurLength&&m_nBestTag[j]!=-1;j++)
			{
				//Slot j of the span corresponds to item j+nStartPos-1 of the caller's list
				const int nItem=j+nStartPos-1;
				//Store the best POS tagging
				pWordItems[nItem].nHandle=m_nBestTag[j];
				//Items whose dValue is not positive keep it unchanged; otherwise, if the
				//word exists in the core dictionary, dValue becomes its frequency under the chosen tag
				if(pWordItems[nItem].dValue>0&&dictCore.IsExist(pWordItems[nItem].sWord,-1))
					pWordItems[nItem].dValue=dictCore.GetFrequency(pWordItems[nItem].sWord,m_nBestTag[j]);
			}
			break;
		case TT_PERSON://Person recognition
			PersonRecognize(dictUnknown);
			break;
		case TT_PLACE://Place name recognition
		case TT_TRANS_PERSON://Transliteration Person
			PlaceRecognize(dictCore,dictUnknown);
			break;
		default:
			break;
		}
		Reset();
	}
	return true;
}
//Guess the POS of No. nIndex word item
bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
{
int j=0,i=nIndex,nCharType;
unsigned int nLen;
switch(m_tagType)
{
case TT_NORMAL:
break;
case TT_PERSON:
j=0;
if(CC_Find("××",m_sWords[nIndex]))
{
m_nTags[i][j]=6;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
}
else
{
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
nLen=strlen(m_sWords[nIndex]);
if(nLen>=4)
{
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
}
else if(nLen==2)
{
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
nCharType=charType((unsigned char *)m_sWords[nIndex]);
if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
{
m_nTags[i][j]=1;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
m_nTags[i][j]=2;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
m_nTags[i][j]=3;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
m_nTags[i][j]=4;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
}
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
}
}
break;
case TT_PLACE:
j=0;
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
nLen=strlen(m_sWords[nIndex]);
if(nLen>=4)
{
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
}
else if(nLen==2)
{
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
nCharType=charType((unsigned char *)m_sWords[nIndex]);
if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
{
m_nTags[i][j]=1;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
m_nTags[i][j]=2;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)+1);
m_nTags[i][j]=3;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)+1);
m_nTags[i][j]=4;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,4)+1);
}
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*8);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*8);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*8);
}
break;
case TT_TRANS_PERSON:
j=0;
nLen=strlen(m_sWords[nIndex]);
m_nTags[i][j]=0;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
{
if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
{
m_nTags[i][j]=1;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)+1);
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)+1);
m_nTags[i][j]=2;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
m_nTags[i][j]=3;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*2+1);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*2+1);
}
m_nTags[i][j]=41;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
m_nTags[i][j]=42;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
m_nTags[i][j]=43;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
}
else if(nLen>=4)
{
m_nTags[i][j]=41;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
m_nTags[i][j]=42;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
m_nTags[i][j]=43;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
}
else if(nLen==2)
{
nCharType=charType((unsigned char *)m_sWords[nIndex]);
if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
{
m_nTags[i][j]=1;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,1)*2+1);
m_nTags[i][j]=2;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,2)*2+1);
m_nTags[i][j]=3;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,3)*2+1);
m_nTags[i][j]=30;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,30)*8+1);
m_nTags[i][j]=11;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,11)*4+1);
m_nTags[i][j]=12;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,12)*4+1);
m_nTags[i][j]=13;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,13)*4+1);
m_nTags[i][j]=21;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,21)*2+1);
m_nTags[i][j]=22;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,22)*2+1);
m_nTags[i][j]=23;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,23)*2+1);
}
m_nTags[i][j]=41;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,41)*8);
m_nTags[i][j]=42;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,42)*8);
m_nTags[i][j]=43;
m_dFrequency[i][j++]=(double)1/(double)(m_context.GetFrequency(0,43)*8);
}
break;
default:
break;
}
*pSubIndex=j;
return true;
}
//Score the nLength word items that start at nStartPos.
//Each item contributes
//  log(m_context.GetFrequency(0,best tag)+1) - log(dict.GetFrequency(word,best tag)+1)
//and the contributions are summed. No context-transition term is included.
//nStartPos: first index into m_sWords/m_nBestTag
//nLength: number of consecutive items to include
//dict: dictionary queried for the frequency of each word under its best tag
//Returns the accumulated sum
ELEMENT_TYPE CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
{
	ELEMENT_TYPE dTotal=0,dTerm;
	int nWordFreq;//frequency of the word under its best tag
	const int nStop=nStartPos+nLength;
	int nCur=nStartPos;
	while(nCur<nStop)
	{
		nWordFreq=dict.GetFrequency(m_sWords[nCur],m_nBestTag[nCur]);
		dTerm=log((double)(m_context.GetFrequency(0,m_nBestTag[nCur])+1))-log((double)(nWordFreq+1));
		dTotal+=dTerm;
		nCur++;
	}
	return dTotal;
}
//DEL bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
//DEL {
//DEL char sPOS[MAX_WORDS_PER_SENTENCE]="Z";
//DEL int nStart=1,nEnd=1,i=1;
//DEL while(m_nBestTag[i]>-1)
//DEL {
//DEL if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
//DEL {
//DEL nStart=i;
//DEL nEnd=nStart+1;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==30)//3,13,23
//DEL nEnd++;
//DEL }
//DEL else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//1,11,21 Trigger the recognition
//DEL {
//DEL nStart=i;
//DEL nEnd=nStart+1;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==30)//3,13,23
//DEL nEnd++;
//DEL }
//DEL if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
//DEL {
//DEL m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
//DEL m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
//DEL m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
//DEL nStart=nEnd;
//DEL }
//DEL
//DEL if(i<nEnd)
//DEL i=nEnd;
//DEL else
//DEL i=i+1;
//DEL }
//DEL return true;
//DEL }
//Scan m_nBestTag from index 1 until a negative tag and record candidate spans.
//A span is opened at an item tagged 1 or 2 and is extended over a run of 1s
//(only when opened by 1), then a run of 2s, then a run of 3s.
//Each recorded span stores m_nWordPosition of its first index and of the index
//just past it in m_nUnknownWords, and its score in m_dWordsPossibility.
//dictCore: not used by this routine
//placeDict: dictionary passed to ComputePossibility
//Always returns true
bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
{
	int nFirst=1,nPast=1,nCur=1,nMark;
	//Penalty; it is never reset, so it keeps growing over the whole call
	//NOTE(review): confirm that carrying the penalty from one span to the next is intended
	double dPenalty=1.0;
	while(m_nBestTag[nCur]>-1)
	{
		const bool bOpenedByOne=(m_nBestTag[nCur]==1);
		if(bOpenedByOne||m_nBestTag[nCur]==2)
		{
			nFirst=nCur;
			nPast=nFirst+1;
			if(bOpenedByOne)
			{
				//Run of 1s: the first following 1 is free, each further one costs 1.0
				for(;m_nBestTag[nPast]==1;nPast++)
				{
					if(nPast>nFirst+1)
						dPenalty+=1.0;
				}
			}
			else
				dPenalty+=1.0;//A span opened by tag 2 costs 1.0
			//Run of 2s
			while(m_nBestTag[nPast]==2)
				nPast++;
			//Run of 3s: the first 3 is free, each further one costs 1.0
			for(nMark=nPast;m_nBestTag[nPast]==3;nPast++)
			{
				if(nPast>nMark)
					dPenalty+=1.0;
			}
		}
		if(nPast>nFirst)
		{
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nFirst];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nPast];
			//The scored length is nPast-nFirst+1, i.e. it also covers the item at nPast
			m_dWordsPossibility[m_nUnknownIndex]=ComputePossibility(nFirst,nPast-nFirst+1,placeDict)+log(dPenalty);
			m_nUnknownIndex++;
			nFirst=nPast;//Prevents the same span from being recorded again
		}
		//Jump past the span just handled, or step one item forward
		nCur=(nCur<nPast)?nPast:nCur+1;
	}
	return true;
}
//DEL bool CSpan::TransPersonRecognize(CDictionary &dictCore, CDictionary &transDict)
//DEL {
//DEL int nStart=1,nEnd=1,i=1;
//DEL while(m_nBestTag[i]>-1)
//DEL {
//DEL if(m_nBestTag[i]==1)//1,11,21 Trigger the recognition
//DEL {
//DEL nStart=i;
//DEL nEnd=nStart+1;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
//DEL nEnd++;
//DEL }
//DEL else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
//DEL {
//DEL nStart=i;
//DEL nEnd=nStart+1;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
//DEL nEnd++;
//DEL while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL nEnd++;
//DEL }
//DEL if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
//DEL {
//DEL m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
//DEL m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
//DEL m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
//DEL nStart=nEnd;
//DEL }
//DEL
//DEL if(i<nEnd)
//DEL i=nEnd;
//DEL else
//DEL i=i+1;
//DEL }
//DEL return true;
//DEL }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -