📄 cspan.java
字号:
m_nTags[i][j]=100;
m_dFrequency[i][j]=0;
j++;
}
else if(GFString.getChineseString( m_sWords[i],"gb2312").indexOf("末##末")==0)
{
m_nTags[i][j]=101;
m_dFrequency[i][j]=0;
j++;
}
else
{
dictCore.GetHandle(m_sWords[i], nCount,aPOS,aFreq);
nFreq=0;
for(int k=0;k<nCount;k++)
{
nFreq+=aFreq[k];
}
if(nCount>0)
{
m_nTags[i][j]=0;
//m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
m_dFrequency[i][j]=-Math.log((double)(1+nFreq))+Math.log((double)(m_context.GetFrequency(0,0)+nPOSCount));
j++;
}
}
}
else//For normal POS tagging
{
j=0;
//Get the POSs from the unknown recognition dictionary
if(pWordItems[nWordsIndex].nHandle>0)
{//The word has is only one POS value
//We have record its POS and nFrequncy in the items.
m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
m_dFrequency[i][j]=-Math.log(pWordItems[nWordsIndex].dValue)+Math.log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
if(m_dFrequency[i][j]<0)//Not permit the value less than 0
m_dFrequency[i][j]=0;
j++;
}
else
{//The word has multiple POSs, we should retrieve the information from Core Dictionary
if(pWordItems[nWordsIndex].nHandle<0)
{//The word has is only one POS value
//We have record its POS and nFrequncy in the items.
/*
if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
{
char sWordOrg[100],sPostfix[10];
double dRatio=0.6925;//The ratio which transliteration as a person name
PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
if(sPostfix[0]!=0)
dRatio=0.01;
m_nTags[i][j]='n'*256+'r';
m_dFrequency[i][j]=-Math.log(dRatio)+pWordItems[nWordsIndex].dValue;
//m_dFrequency[i][j]=Math.log(dRatio)+pWordItems[nWordsIndex].dValue-Math.log(m_context.GetFrequency(0,m_nTags[i][j]))+Math.log(MAX_FREQUENCE);
//P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
j++;
m_nTags[i][j]='n'*256+'s';
m_dFrequency[i][j]=-Math.log(1-dRatio)+pWordItems[nWordsIndex].dValue;
//m_dFrequency[i][j]=Math.log(1-dRatio)+pWordItems[nWordsIndex].dValue-Math.log(m_context.GetFrequency(0,m_nTags[i][j]))+Math.log(MAX_FREQUENCE);
j++;
}
else//Unknown words such as Chinese person name or place name
{
*/
m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
//}
}
dictCore.GetHandle(m_sWords[i], nCount,aPOS,aFreq);
nPOSCount=nCount;
for(;j<nCount;j++)
{//Get the POS set of sCurWord in the unknown dictionary
m_nTags[i][j]=aPOS[j];
m_dFrequency[i][j]=-Math.log(1+aFreq[j])+Math.log(m_context.GetFrequency(0,m_nTags[i][j])+nPOSCount);
}
}
}
if(j==0)
{//We donot know the POS, so we have to guess them according lexical knowledge
GuessPOS(i, j);//Guess the POS of current word
}
m_nTags[i][j]=-1;//Set the ending POS
if(j==1&&m_nTags[i][j]!=Final.CT_SENTENCE_BEGIN)//No ambuguity
{//No ambuguity, so we can break from the loop
i++;
m_sWords[i][0]=0;
break;
}
if(!bSplit)
nWordsIndex++;
}
if(pWordItems[nWordsIndex].sWord[0]==0)
nRetPos=-1;//Reaching ending
if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
{//Set end for words like "张/华/平"
if(m_tagType!=Final.TAG_TYPE.TT_NORMAL)
m_nTags[i][0]=101;
else
m_nTags[i][0]=1;
m_dFrequency[i][0]=0;
m_sWords[i][0]=0;//Set virtual ending
m_nTags[i++][1]=-1;
}
m_nCurLength=i;//The current word count
if(nRetPos!=-1)
return nWordsIndex+1;//Next start position
return -1;//Reaching ending
}
// Get words from the word items, start from nIndex, Function for unknown
// words recognition
/**
 * Fills the candidate list of word {@code nIndex} with guessed tag codes and
 * weights, chosen according to the current {@code m_tagType} and to the
 * length / character class of the word.
 *
 * Each guess is written as a pair: {@code m_nTags[nIndex][slot]} receives the
 * tag code and {@code m_dFrequency[nIndex][slot]} receives
 * {@code 1 / (m_context.GetFrequency(0, tag) * scale + smooth)}. Slots are
 * filled from 0 upwards. No terminating entry is written here.
 *
 * NOTE(review): other visible code stores {@code -Math.log(...)} style costs
 * in {@code m_dFrequency}, while this method stores plain reciprocals -
 * confirm that this is intended.
 *
 * @param nIndex    index of the word whose candidates are to be guessed
 * @param pSubIndex unused. Java passes {@code int} by value, so the number of
 *                  guesses written cannot be reported through this parameter;
 *                  the caller's variable is never changed by this method.
 *                  TODO: report the count another way (e.g. a return value)
 *                  once callers can be updated.
 * @return always {@code true}
 */
protected boolean GuessPOS(int nIndex, int pSubIndex) {
    int j = 0; // next free candidate slot for word nIndex
    int nCharType;
    int nLen;
    switch (m_tagType) {
    case TT_NORMAL:
        // No guessing is done for normal tagging.
        break;
    case TT_PERSON:
        if (Utility.CC_Find("××".getBytes(), m_sWords[nIndex])) {
            j = appendGuess(nIndex, j, 6, 1, 1);
        } else {
            j = appendGuess(nIndex, j, 0, 1, 1);
            nLen = m_sWords[nIndex].length;
            if (nLen >= 4) {
                // Tag 0 is deliberately written a second time, as in the original code.
                j = appendGuess(nIndex, j, 0, 1, 1);
                j = appendGuesses(nIndex, j, new int[] { 11, 12, 13 }, 8, 0);
            } else if (nLen == 2) {
                j = appendGuess(nIndex, j, 0, 1, 1);
                nCharType = Utility.charType(m_sWords[nIndex][0], m_sWords[nIndex][1]);
                if (nCharType == Final.CT_OTHER || nCharType == Final.CT_CHINESE) {
                    j = appendGuesses(nIndex, j, new int[] { 1, 2, 3, 4 }, 1, 1);
                }
                j = appendGuesses(nIndex, j, new int[] { 11, 12, 13 }, 8, 0);
            }
        }
        break;
    case TT_PLACE:
        j = appendGuess(nIndex, j, 0, 1, 1);
        nLen = m_sWords[nIndex].length;
        if (nLen >= 4) {
            j = appendGuesses(nIndex, j, new int[] { 11, 12, 13 }, 8, 0);
        } else if (nLen == 2) {
            j = appendGuess(nIndex, j, 0, 1, 1);
            nCharType = Utility.charType(m_sWords[nIndex][0], m_sWords[nIndex][1]);
            if (nCharType == Final.CT_OTHER || nCharType == Final.CT_CHINESE) {
                j = appendGuesses(nIndex, j, new int[] { 1, 2, 3, 4 }, 1, 1);
            }
            j = appendGuesses(nIndex, j, new int[] { 11, 12, 13 }, 8, 0);
        }
        break;
    case TT_TRANS_PERSON:
        nLen = m_sWords[nIndex].length;
        j = appendGuess(nIndex, j, 0, 1, 1);
        if (!Utility.IsAllChinese(m_sWords[nIndex])) {
            if (Utility.IsAllLetter(m_sWords[nIndex])) {
                j = appendGuesses(nIndex, j, new int[] { 1, 11 }, 1, 1);
                j = appendGuesses(nIndex, j, new int[] { 2, 3, 12, 13 }, 2, 1);
            }
            j = appendGuesses(nIndex, j, new int[] { 41, 42, 43 }, 8, 0);
        } else if (nLen >= 4) {
            j = appendGuesses(nIndex, j, new int[] { 41, 42, 43 }, 8, 0);
        } else if (nLen == 2) {
            nCharType = Utility.charType(m_sWords[nIndex][0], m_sWords[nIndex][1]);
            if (nCharType == Final.CT_OTHER || nCharType == Final.CT_CHINESE) {
                j = appendGuesses(nIndex, j, new int[] { 1, 2, 3 }, 2, 1);
                j = appendGuess(nIndex, j, 30, 8, 1);
                j = appendGuesses(nIndex, j, new int[] { 11, 12, 13 }, 4, 1);
                j = appendGuesses(nIndex, j, new int[] { 21, 22, 23 }, 2, 1);
            }
            j = appendGuesses(nIndex, j, new int[] { 41, 42, 43 }, 8, 0);
        }
        break;
    default:
        break;
    }
    // The slot count j is intentionally not assigned to pSubIndex: that store
    // had no effect on the caller (see the @param note above).
    return true;
}

/**
 * Writes one guessed candidate into slot {@code nSlot} of word {@code nWord}:
 * the tag code, and the weight
 * {@code 1 / (m_context.GetFrequency(0, nTag) * nScale + nSmooth)}.
 *
 * @return the next free slot, i.e. {@code nSlot + 1}
 */
private int appendGuess(int nWord, int nSlot, int nTag, int nScale, int nSmooth) {
    m_nTags[nWord][nSlot] = nTag;
    m_dFrequency[nWord][nSlot] = (double) 1 / (double) (m_context.GetFrequency(0, nTag) * nScale + nSmooth);
    return nSlot + 1;
}

/**
 * Writes one guessed candidate per entry of {@code aTags}, in order, all with
 * the same {@code nScale} / {@code nSmooth}, starting at slot {@code nSlot}.
 *
 * @return the next free slot after the last candidate written
 */
private int appendGuesses(int nWord, int nSlot, int[] aTags, int nScale, int nSmooth) {
    for (int n = 0; n < aTags.length; n++) {
        nSlot = appendGuess(nWord, nSlot, aTags[n], nScale, nSmooth);
    }
    return nSlot;
}
/**
 * Runs {@code Disamb()} and then follows the back-pointers in
 * {@code m_nBestPrev} from the last word down to word 1, storing the selected
 * tag of every real word in {@code m_nBestTag}. Finally writes {@code -1}
 * into {@code m_nBestTag} as the end marker.
 *
 * A word whose first byte is 0 is the virtual ending; it gets no tag of its
 * own but its back-pointer is still followed.
 *
 * @return always {@code true}
 */
protected boolean GetBestPOS() {
    Disamb();
    // i walks the words backwards; j is the candidate index selected for word i
    // (candidate 0 for the last word, then whatever its back-pointer names).
    for (int i = m_nCurLength - 1, j = 0; i > 0; i--) {
        // Fix: the test used to be "== 0", which recorded a tag only for the
        // virtual ending and skipped every real word.
        if (m_sWords[i][0] != 0) {// Not virtual ending
            m_nBestTag[i] = m_nTags[i][j];// Record the best tag of this word
        }
        j = m_nBestPrev[i][j];
    }
    // Place the end marker right after the last real word: on the virtual
    // ending's slot if there is one, otherwise one past the last word.
    int nEnd = m_nCurLength;
    if (m_sWords[m_nCurLength - 1][0] == 0) {
        nEnd = m_nCurLength - 1;
    }
    m_nBestTag[nEnd] = -1;
    return true;
}
/**
 * Re-initialises word slot 0 so that a new stretch of words can be loaded
 * behind it.
 *
 * When {@code bContinue} is {@code false}, slot 0 gets the begin tag (0 for
 * {@code TT_NORMAL}, 100 for every other tag type) with weight 0, and
 * {@code m_nUnknownIndex} and {@code m_nStartPos} are set back to 0. When it
 * is {@code true}, slot 0 instead takes over the first tag and weight of the
 * last word currently held.
 *
 * In both cases slot 0 ends up with exactly one candidate (the list is closed
 * with -1), an empty word (first byte 0), and {@code m_nCurLength} becomes 1.
 *
 * @param bContinue {@code true} to carry over from the words currently held,
 *                  {@code false} to start fresh
 * @return always {@code true}
 */
protected boolean Reset(boolean bContinue) {
    if (bContinue) {
        // Carry the first candidate of the last held word into slot 0.
        final int nLast = m_nCurLength - 1;
        m_nTags[0][0] = m_nTags[nLast][0];
        m_dFrequency[0][0] = m_dFrequency[nLast][0];
    } else {
        // Fresh start: begin tag depends on the tagging mode.
        if (m_tagType == Final.TAG_TYPE.TT_NORMAL) {
            m_nTags[0][0] = 0;
        } else {
            m_nTags[0][0] = 100;
        }
        m_nUnknownIndex = 0;
        m_dFrequency[0][0] = 0;
        m_nStartPos = 0;
    }
    m_nTags[0][1] = -1; // close the candidate list of slot 0
    m_nCurLength = 1;
    m_nWordPosition[1] = m_nStartPos;
    m_sWords[0][0] = 0; // slot 0 carries no word text
    return true;
}
/**
 * For every candidate tag of every word from index 1 upwards, picks the
 * cheapest candidate of the previous word to come from, and accumulates the
 * cost.
 *
 * The cost of coming from previous candidate {@code k} is
 * {@code -Math.log(m_context.GetContextPossibility(0, prevTag, curTag))} plus
 * {@code m_dFrequency[i - 1][k]}. The index of the cheapest {@code k} is
 * stored in {@code m_nBestPrev[i][j]} and its cost is added to
 * {@code m_dFrequency[i][j]}. Candidate lists are read up to the first
 * negative tag.
 *
 * If the previous word has no candidates at all, {@code m_nBestPrev[i][j]}
 * is set to {@code Final.MAX_POS_PER_WORD + 1} and nothing is added to
 * {@code m_dFrequency[i][j]}.
 *
 * @return always {@code true}
 */
protected boolean Disamb() {
    for (int i = 1; i < m_nCurLength; i++) {// For every word
        for (int j = 0; m_nTags[i][j] >= 0; j++) {// For every candidate tag of word i
            int nMinCandidate = Final.MAX_POS_PER_WORD + 1; // "none found" marker
            // Reset per tag so that a cost left over from another tag is never
            // added when the previous word has no candidates.
            double dMinFee = 0;
            for (int k = 0; m_nTags[i - 1][k] >= 0; k++) {
                double dTmp = -Math.log(m_context.GetContextPossibility(0,
                        m_nTags[i - 1][k], m_nTags[i][j]));
                dTmp += m_dFrequency[i - 1][k];// Add the cost accumulated so far
                // Fix: the first candidate used to be detected with
                // "nMinCandidate > 10". Once a predecessor with index above 10
                // had been chosen, that test stayed true and every later
                // candidate replaced it regardless of cost.
                if (k == 0 || dTmp < dMinFee) {
                    nMinCandidate = k;
                    dMinFee = dTmp;
                }
            }
            m_nBestPrev[i][j] = (byte) nMinCandidate;// The best previous candidate for j
            m_dFrequency[i][j] = m_dFrequency[i][j] + dMinFee;
        }
    }
    return true;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -