📄 segment(sen).c
字号:
}
/*
 * Read one element from the queue.
 *
 * bModify != 0 : read the head element and unlink it from the queue.
 * bModify == 0 : read the element at the browse cursor (m_pLastAccess) and
 *                advance the cursor; when bFirstGet is non-zero the cursor is
 *                first rewound to the head.
 *
 * npValue / npIndex / epWeight receive nParent / nIndex / eWeight of the
 * element; any of them may be a null pointer to skip that field.
 *
 * Returns 1 when an element was read, -1 when the selected position is NULL.
 *
 * NOTE(review): the unlinked head node is not released here - confirm which
 * code owns the node's storage.
 */
int Pop(pQueue p,unsigned int *npValue,unsigned int *npIndex,double *epWeight, int bModify,int bFirstGet)
{
    PQUEUE_ELEMENT pNode;

    /* Rewind the browse cursor only in non-destructive mode. */
    if (!bModify && bFirstGet) {
        p->m_pLastAccess = p->m_pHead;
    }

    pNode = bModify ? p->m_pHead : p->m_pLastAccess;
    if (pNode == NULL) {
        return -1;
    }

    if (npValue != NULL) {
        *npValue = pNode->nParent;
    }
    if (npIndex != NULL) {
        *npIndex = pNode->nIndex;
    }
    if (epWeight != NULL) {
        *epWeight = pNode->eWeight;
    }

    /* Step past the element that was just read. */
    if (bModify) {
        p->m_pHead = pNode->next;
    } else {
        p->m_pLastAccess = pNode->next;
    }
    return 1;
}
/*
 * Report whether the queue has nothing left to read.
 *
 * When bBrowsed equals TRUE exactly, the browse cursor (m_pLastAccess) is
 * tested; for every other value the head pointer is tested.
 * Returns non-zero when the tested pointer is NULL.
 */
int IsEmpty(pQueue p,int bBrowsed)
{
    if (bBrowsed != TRUE) {
        return (p->m_pHead == NULL);
    }
    return (p->m_pLastAccess == NULL);
}
/*
 * Report whether the queue holds exactly one element, i.e. the head exists
 * and has no successor. Returns 1 in that case, 0 otherwise.
 */
int IsSingle(pQueue p)
{
    if (p->m_pHead == NULL) {
        return 0;
    }
    return (p->m_pHead->next == NULL);
}
/*
 * Put the queue into its empty state: both the head pointer and the browse
 * cursor are set to NULL. Nothing is released.
 */
void IQueue(pQueue p)
{
    p->m_pLastAccess = NULL;
    p->m_pHead = NULL;
}
/*
 * Tear down a span: hand m_context to free() and clear the pointer so that a
 * second call passes NULL to free().
 */
void USpan(pSpan p)
{
    free(p->m_context);
    p->m_context = NULL;
}
/*
 * Forward pass of the tag disambiguation.
 *
 * For every word i (starting at 1) and every candidate tag j of that word
 * (candidate lists end at the first negative tag), find the candidate k of
 * the previous word that minimises
 *
 *     -log(GetContextPossibility(ctx, 0, tag[i-1][k], tag[i][j]))
 *         + m_dFrequency[i-1][k]
 *
 * The winning k is stored in m_nBestPrev[i][j] and its cost is added to
 * m_dFrequency[i][j], so m_dFrequency accumulates path costs word by word.
 *
 * Always returns TRUE.
 *
 * NOTE(review): if the previous word has no candidate at all, m_nBestPrev
 * receives the sentinel MAX_POS_PER_WORD+1 and the cost of the preceding
 * (i, j) iteration is reused - confirm callers never produce that state.
 */
int Disamb(pSpan p)
{
    int i,j,k,nMinCandidate;
    double dMinFee=0,dTmp;

    for(i=1;i<p->m_nCurLength;i++)
    {
        for(j=0;p->m_nTags[i][j]>=0;j++)
        {
            /* Sentinel: no predecessor chosen yet for this (i, j). */
            nMinCandidate=MAX_POS_PER_WORD+1;
            for(k=0;p->m_nTags[i-1][k]>=0;k++)
            {
                dTmp=-log(GetContextPossibility(p->m_context,0,p->m_nTags[i-1][k],p->m_nTags[i][j]));
                dTmp+=p->m_dFrequency[i-1][k];
                /*
                 * Accept the first predecessor unconditionally, later ones
                 * only when cheaper. The sentinel test must compare against
                 * MAX_POS_PER_WORD: a fixed threshold of 10 would treat any
                 * already chosen index above 10 as "nothing chosen" and let
                 * every later predecessor overwrite it regardless of cost.
                 */
                if(nMinCandidate>MAX_POS_PER_WORD||dTmp<dMinFee)
                {
                    nMinCandidate=k;
                    dMinFee=dTmp;
                }
            }
            p->m_nBestPrev[i][j]=nMinCandidate;
            p->m_dFrequency[i][j]=p->m_dFrequency[i][j]+dMinFee;
        }
    }
    return TRUE;
}
/*
 * Re-initialise the span so that only slot 0 is occupied.
 *
 * bContinue == 0 : start from scratch. Slot 0 gets tag 100 when m_tagType is
 *                  not TT_NORMAL, otherwise tag 0; its cost, the unknown-word
 *                  counter and the start position are zeroed.
 * bContinue != 0 : carry on from the previous span. Slot 0 inherits the first
 *                  tag and cost of the last slot (m_nCurLength-1);
 *                  m_nStartPos and m_nUnknownIndex are left as they are.
 *
 * In both cases the slot-0 tag list is terminated with -1, m_nCurLength
 * becomes 1, m_nWordPosition[1] is set to m_nStartPos and the slot-0 word is
 * made the empty string.
 *
 * Always returns TRUE.
 */
int Reset(pSpan p,int bContinue)
{
    if (bContinue) {
        /* Must read the old last slot before m_nCurLength is overwritten. */
        p->m_nTags[0][0] = p->m_nTags[p->m_nCurLength-1][0];
        p->m_dFrequency[0][0] = p->m_dFrequency[p->m_nCurLength-1][0];
    } else {
        p->m_nTags[0][0] = (p->m_tagType != TT_NORMAL) ? 100 : 0;
        p->m_nUnknownIndex = 0;
        p->m_dFrequency[0][0] = 0;
        p->m_nStartPos = 0;
    }

    p->m_nTags[0][1] = -1;
    p->m_nCurLength = 1;
    p->m_nWordPosition[1] = p->m_nStartPos;
    p->m_sWords[0][0] = 0;
    return TRUE;
}
/*
 * Load the span's context data from sFilename by delegating to
 * LoadContextState() on m_context; that function's result is returned as is.
 */
int LoadContext(pSpan p,char *sFilename)
{
    return LoadContextState(p->m_context, sFilename);
}
/*
 * Run Disamb() and then walk the m_nBestPrev links backwards to fill
 * m_nBestTag.
 *
 * The walk starts at the last slot (m_nCurLength-1) with candidate index 0
 * and stops before slot 0. A slot whose word is the empty string keeps its
 * previous m_nBestTag value, but its back-link is still followed.
 * Afterwards m_nBestTag is terminated with -1: at m_nCurLength-1 when the
 * last word is empty, otherwise at m_nCurLength.
 *
 * Always returns TRUE.
 */
int GetBestPOS(pSpan p)
{
    int nSlot;
    int nCandidate = 0;
    int nTerminator;

    Disamb(p);

    nSlot = p->m_nCurLength - 1;
    while (nSlot > 0) {
        if (p->m_sWords[nSlot][0] != 0) {
            p->m_nBestTag[nSlot] = p->m_nTags[nSlot][nCandidate];
        }
        nCandidate = p->m_nBestPrev[nSlot][nCandidate];
        nSlot--;
    }

    if (p->m_sWords[p->m_nCurLength-1][0] == 0) {
        nTerminator = p->m_nCurLength - 1;
    } else {
        nTerminator = p->m_nCurLength;
    }
    p->m_nBestTag[nTerminator] = -1;
    return TRUE;
}
/*
 * Sum, over the nLength words starting at nStartPos, the value
 *
 *     log(GetContextFrequency(ctx, 0, bestTag) + 1)
 *         - log(GetFrequency(dict, word, bestTag) + 1)
 *
 * where bestTag is m_nBestTag[] of the word. Returns the sum (0 when
 * nLength is not positive).
 */
double ComputePossibility(pSpan p,int nStartPos,int nLength,pDictionary dict)
{
    double dTotal = 0;
    int nWord = nStartPos;

    while (nWord < nStartPos + nLength) {
        int nWordFreq;
        double dLogContext, dLogWord, dTerm;

        /* Dictionary lookup first, then the context lookup. */
        nWordFreq = GetFrequency(dict, p->m_sWords[nWord], p->m_nBestTag[nWord]);
        dLogContext = log((double)(GetContextFrequency(p->m_context, 0, p->m_nBestTag[nWord]) + 1));
        dLogWord = log((double)(nWordFreq + 1));

        dTerm = dLogContext - dLogWord;
        dTotal += dTerm;
        nWord++;
    }
    return dTotal;
}
/*
 * Scan the best-tag sequence for person-name patterns and record every hit
 * as an unknown word.
 *
 * Each best tag t (for slots 1.. up to the first negative tag) is mapped to
 * the letter 'A'+t, giving a string that is matched against the fixed
 * pattern table below, in table order, at the current position j. A hit is
 * rejected when the word just before or just after it equals the separator
 * literal used below.
 *
 * For an accepted hit the function stores in slot m_nUnknownIndex:
 *   m_nUnknownWords[..][0]  = m_nWordPosition[j]
 *   m_nUnknownWords[..][1]  = m_nWordPosition[j + patternLength]
 *   m_dWordsPossibility[..] = -log(factor) + ComputePossibility(...)
 * then increments m_nUnknownIndex and skips past the matched words.
 *
 * Always returns TRUE.
 *
 * NOTE(review): m_nUnknownIndex is not checked against the capacity of
 * m_nUnknownWords here - confirm the capacity is enforced elsewhere.
 * NOTE(review): the separator literal looks like a mis-encoded multi-byte
 * character; it is kept byte-for-byte - verify against the original file
 * encoding.
 */
int PersonRecognize(pSpan p,pDictionary personDict)
{
    /* Pattern table; the entry with length 0 terminates it. */
    static const char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
        "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
    static const double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,
        0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0
    };
    static const int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
    char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
    size_t nNameLen,nWordLen;
    int i,k,nPos,nLittleFreqCount;
    int j=1;
    int bMatched=FALSE;

    /*
     * Build the tag-letter string. The index bound keeps both the loop body
     * and the terminator write below inside sPOS.
     */
    for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&p->m_nBestTag[i]>-1;i++)
        sPOS[i]=p->m_nBestTag[i]+'A';
    sPOS[i]=0;

    while(j<i)
    {
        bMatched=FALSE;
        for(k=0;!bMatched&&nPatternLen[k]>0;k++)
        {
            if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(p->m_sWords[j-1],"?¤")!=0&&strcmp(p->m_sWords[j+nPatternLen[k]],"?¤")!=0)
            {
                /* "FB" is not accepted when followed by E, C or G. */
                if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
                    continue;

                /*
                 * Concatenate the matched words. The buffer is fixed-size,
                 * so a word is appended only when it fits completely
                 * (including the terminator); appending whole words only
                 * avoids cutting a multi-byte character in half.
                 */
                nPos=j;
                sPersonName[0]=0;
                nNameLen=0;
                nLittleFreqCount=0;   /* counted but not consulted below */
                while(nPos<j+nPatternLen[k])
                {
                    if(p->m_nBestTag[nPos]<4&&GetFrequency(personDict,p->m_sWords[nPos],p->m_nBestTag[nPos])<LITTLE_FREQUENCY)
                        nLittleFreqCount++;
                    nWordLen=strlen(p->m_sWords[nPos]);
                    if(nNameLen+nWordLen<sizeof(sPersonName))
                    {
                        memcpy(sPersonName+nNameLen,p->m_sWords[nPos],nWordLen+1);
                        nNameLen+=nWordLen;
                    }
                    nPos+=1;
                }

                /*
                 * "CDCD" never produces a result. When the name contains
                 * foreign characters the scan position is moved forward, and
                 * the remaining patterns are then tried at that new position.
                 */
                if(strcmp(sPatterns[k],"CDCD")==0)
                {
                    if(GetForeignCharCount(sPersonName)>0)
                        j+=nPatternLen[k]-1;
                    continue;
                }

                p->m_nUnknownWords[p->m_nUnknownIndex][0]=p->m_nWordPosition[j];
                p->m_nUnknownWords[p->m_nUnknownIndex][1]=p->m_nWordPosition[j+nPatternLen[k]];
                p->m_dWordsPossibility[p->m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(p,j,nPatternLen[k],personDict);
                p->m_nUnknownIndex+=1;
                j+=nPatternLen[k];
                bMatched=TRUE;
            }
        }
        if(!bMatched)
            j+=1;
    }
    return TRUE;
}
/*
 * Append one guessed candidate to word slot nWord: the tag nTag with the
 * value 1 / (GetContextFrequency(ctx, 0, nTag) * nScale + nBias).
 * *pCount is the next free candidate index and is advanced by one.
 */
static void AddGuessedTag(pSpan p,int nWord,int *pCount,int nTag,int nScale,int nBias)
{
    p->m_nTags[nWord][*pCount]=nTag;
    p->m_dFrequency[nWord][*pCount]=(double)1/(double)(GetContextFrequency(p->m_context,0,nTag)*nScale+nBias);
    ++*pCount;
}

/* Append the consecutive tags nFirstTag..nLastTag, all with the same divisor form. */
static void AddGuessedTagRange(pSpan p,int nWord,int *pCount,int nFirstTag,int nLastTag,int nScale,int nBias)
{
    int nTag;

    for(nTag=nFirstTag;nTag<=nLastTag;nTag++)
        AddGuessedTag(p,nWord,pCount,nTag,nScale,nBias);
}

/*
 * Produce guessed tag candidates for the word in slot nIndex.
 *
 * Which candidates are written to m_nTags[nIndex][] / m_dFrequency[nIndex][]
 * depends on m_tagType and on the word itself (its strlen() length and the
 * results of CC_Find / charType / IsAllChinese / IsAllLetter). For TT_NORMAL
 * and unrecognised tag types nothing is written.
 *
 * The number of candidates written is stored in *pSubIndex; no terminating
 * entry is appended here.
 *
 * Always returns TRUE.
 *
 * NOTE(review): candidates added with bias 0 divide by the bare scaled
 * context frequency, so a frequency of 0 yields a division by zero - confirm
 * the context data rules that out.
 */
int GuessPOS(pSpan p,int nIndex,int *pSubIndex)
{
    int nCount=0,nCharType;
    unsigned int nLen;

    switch(p->m_tagType)
    {
    case TT_NORMAL:
        break;

    case TT_PERSON:
        if(CC_Find("××",p->m_sWords[nIndex]))
        {
            AddGuessedTag(p,nIndex,&nCount,6,1,1);
            break;
        }
        AddGuessedTag(p,nIndex,&nCount,0,1,1);
        nLen=strlen(p->m_sWords[nIndex]);
        if(nLen>=4)
        {
            /* Tag 0 is entered a second time here, as a separate candidate. */
            AddGuessedTag(p,nIndex,&nCount,0,1,1);
            AddGuessedTagRange(p,nIndex,&nCount,11,13,8,0);
        }
        else if(nLen==2)
        {
            AddGuessedTag(p,nIndex,&nCount,0,1,1);
            nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
            if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
                AddGuessedTagRange(p,nIndex,&nCount,1,4,1,1);
            AddGuessedTagRange(p,nIndex,&nCount,11,13,8,0);
        }
        break;

    case TT_PLACE:
        AddGuessedTag(p,nIndex,&nCount,0,1,1);
        nLen=strlen(p->m_sWords[nIndex]);
        if(nLen>=4)
        {
            AddGuessedTagRange(p,nIndex,&nCount,11,13,8,0);
        }
        else if(nLen==2)
        {
            AddGuessedTag(p,nIndex,&nCount,0,1,1);
            nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
            if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
                AddGuessedTagRange(p,nIndex,&nCount,1,4,1,1);
            AddGuessedTagRange(p,nIndex,&nCount,11,13,8,0);
        }
        break;

    case TT_TRANS_PERSON:
        nLen=strlen(p->m_sWords[nIndex]);
        AddGuessedTag(p,nIndex,&nCount,0,1,1);
        if(!IsAllChinese((unsigned char *)p->m_sWords[nIndex]))
        {
            if(IsAllLetter((unsigned char *)p->m_sWords[nIndex]))
            {
                /* Order matters: it fixes each candidate's index. */
                AddGuessedTag(p,nIndex,&nCount,1,1,1);
                AddGuessedTag(p,nIndex,&nCount,11,1,1);
                AddGuessedTag(p,nIndex,&nCount,2,2,1);
                AddGuessedTag(p,nIndex,&nCount,3,2,1);
                AddGuessedTag(p,nIndex,&nCount,12,2,1);
                AddGuessedTag(p,nIndex,&nCount,13,2,1);
            }
            AddGuessedTagRange(p,nIndex,&nCount,41,43,8,0);
        }
        else if(nLen>=4)
        {
            AddGuessedTagRange(p,nIndex,&nCount,41,43,8,0);
        }
        else if(nLen==2)
        {
            nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
            if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
            {
                AddGuessedTagRange(p,nIndex,&nCount,1,3,2,1);
                AddGuessedTag(p,nIndex,&nCount,30,8,1);
                AddGuessedTagRange(p,nIndex,&nCount,11,13,4,1);
                AddGuessedTagRange(p,nIndex,&nCount,21,23,2,1);
            }
            AddGuessedTagRange(p,nIndex,&nCount,41,43,8,0);
        }
        break;

    default:
        break;
    }

    *pSubIndex=nCount;
    return TRUE;
}
int GetFrom(pSpan p,PWORD_RESULT pWordItems,int nIndex,pDictionary dictCore, pDictionary dictUnknown)
{
int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
int nFreq=0,j,nRetPos=0,nWordsIndex=0;
int bSplit=FALSE;
int k,i=1,nPOSCount;
char sCurWord[WORD_MAXLENGTH];
unsigned int nLen;
nWordsIndex=i+nIndex-1;
for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
{
if(p->m_tagType==TT_NORMAL||!IsExist(dictUnknown,pWordItems[nWordsIndex].sWord,44))
{
strcpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord);
p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
}
else
{
if(!bSplit)
{
strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord,2);
p->m_sWords[i][2]=0;
bSplit=TRUE;
}
else
{
nLen=strlen(pWordItems[nWordsIndex].sWord+2);
strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);
p->m_sWords[i][nLen]=0;
bSplit=FALSE;
}
p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
}
p->m_nStartPos=p->m_nWordPosition[i+1];
if(p->m_tagType!=TT_NORMAL) {
strcpy(sCurWord,p->m_sWords[i]);
if(p->m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)p->m_sWords[i-1])==CT_CHINESE)
{
if(p->m_sWords[i][0]=='.'&&p->m_sWords[i][1]==0)
strcpy(sCurWord,"£?");
else if(p->m_sWords[i][0]=='-'&&p->m_sWords[i][1]==0)
strcpy(sCurWord,"£-");
}
GetHandle(dictUnknown,sCurWord,&nCount,aPOS,aFreq);
nPOSCount=nCount+1;
for(j=0;j<nCount;j++)
{
p->m_nTags[i][j]=aPOS[j];
p->m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(GetContextFrequency(p->m_context,0,aPOS[j])+nPOSCount));
}
if(strcmp(p->m_sWords[i],"?##?")==0)
{
p->m_nTags[i][j]=100;
p->m_dFrequency[i][j]=0;
j++;
}
else if(strcmp(p->m_sWords[i],"?##?")==0)
{
p->m_nTags[i][j]=101;
p->m_dFrequency[i][j]=0;
j++;
}
else
{
GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
nFreq=0;
for(k=0;k<nCount;k++)
{
nFreq+=aFreq[k];
}
if(nCount>0)
{
p->m_nTags[i][j]=0;
p->m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(GetContextFrequency(p->m_context,0,0)+nPOSCount));
j++;
}
}
}
else
{
j=0;
if(pWordItems[nWordsIndex].nHandle>0)
{
p->m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
p->m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+1));
if(p->m_dFrequency[i][j]<0)
p->m_dFrequency[i][j]=0;
j++;
}
else
{
if(pWordItems[nWordsIndex].nHandle<0)
{
p->m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
p->m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
}
GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
nPOSCount=nCount;
for(;j<nCount;j++)
{
p->m_nTags[i][j]=aPOS[j];
p->m_dFrequency[i][j]=-log(1+aFreq[j])+log(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+nPOSCount);
}
}
}
if(j==0)
{
GuessPOS(p,i,&j);
}
p->m_nTags[i][j]=-1;
if(j==1&&p->m_nTags[i][j]!=CT_SENTENCE_BEGIN)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -