cspan.java

来自「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 738 行 · 第 1/2 页
JAVA
738 行
					m_nTags[i][j]=100;
	   				m_dFrequency[i][j]=0;
					j++;
				}
				else if(GFString.getChineseString( m_sWords[i],"gb2312").indexOf("末##末")==0)
				{
					m_nTags[i][j]=101;
	   				m_dFrequency[i][j]=0;
					j++;
				}
				else
				{
					dictCore.GetHandle(m_sWords[i], nCount,aPOS,aFreq);
					nFreq=0;
					for(int k=0;k<nCount;k++) 
					{
						nFreq+=aFreq[k];
					}
					if(nCount>0)
					{
						m_nTags[i][j]=0;
						//m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
						m_dFrequency[i][j]=-Math.log((double)(1+nFreq))+Math.log((double)(m_context.GetFrequency(0,0)+nPOSCount));
						j++;
					}
				}
			}
			else//For normal POS tagging
			{
				j=0;
				//Get the POSs from the unknown recognition dictionary
				if(pWordItems[nWordsIndex].nHandle>0)
				{//The word has  is only one POS value
				 //We have record its POS and nFrequncy in the items.
					m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
					m_dFrequency[i][j]=-Math.log(pWordItems[nWordsIndex].dValue)+Math.log((double)(m_context.GetFrequency(0,m_nTags[i][j])+1));
					if(m_dFrequency[i][j]<0)//Not permit the value less than 0
						m_dFrequency[i][j]=0;
					j++;
				}
				else
				{//The word has multiple POSs, we should retrieve the information from Core Dictionary 
					
					if(pWordItems[nWordsIndex].nHandle<0)
					{//The word has  is only one POS value
					 //We have record its POS and nFrequncy in the items.
					/*
						if(pWordItems[nWordsIndex].nHandle==-'t'*256-'t')//tt
						{
							char sWordOrg[100],sPostfix[10];
							double dRatio=0.6925;//The ratio which transliteration as a person name 
							PostfixSplit(pWordItems[nWordsIndex].sWord,sWordOrg,sPostfix);
							if(sPostfix[0]!=0)
									dRatio=0.01;
							m_nTags[i][j]='n'*256+'r';
							m_dFrequency[i][j]=-Math.log(dRatio)+pWordItems[nWordsIndex].dValue;
							//m_dFrequency[i][j]=Math.log(dRatio)+pWordItems[nWordsIndex].dValue-Math.log(m_context.GetFrequency(0,m_nTags[i][j]))+Math.log(MAX_FREQUENCE);
							//P(W|R)=P(WRT)/P(RT)=P(R)*P(W|T)/P(R|T)
							j++;
							m_nTags[i][j]='n'*256+'s';
							m_dFrequency[i][j]=-Math.log(1-dRatio)+pWordItems[nWordsIndex].dValue;
							//m_dFrequency[i][j]=Math.log(1-dRatio)+pWordItems[nWordsIndex].dValue-Math.log(m_context.GetFrequency(0,m_nTags[i][j]))+Math.log(MAX_FREQUENCE);
							j++;
						}
						else//Unknown words such as Chinese person name or place name
						{
					*/
						m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
						m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;
					//}
					}
					dictCore.GetHandle(m_sWords[i], nCount,aPOS,aFreq);
					nPOSCount=nCount;
					for(;j<nCount;j++) 
					{//Get the POS set of sCurWord in the unknown dictionary
						m_nTags[i][j]=aPOS[j];
	   					m_dFrequency[i][j]=-Math.log(1+aFreq[j])+Math.log(m_context.GetFrequency(0,m_nTags[i][j])+nPOSCount);
					}
				}
			}
			if(j==0)
			{//We donot know the POS, so we have to guess them according lexical knowledge
				GuessPOS(i, j);//Guess the POS of current word
			}
			m_nTags[i][j]=-1;//Set the ending POS 
			if(j==1&&m_nTags[i][j]!=Final.CT_SENTENCE_BEGIN)//No ambuguity
			{//No ambuguity, so we can break from the loop
				i++;
				m_sWords[i][0]=0;
				break;
			}
			if(!bSplit)
				nWordsIndex++;
		}
		if(pWordItems[nWordsIndex].sWord[0]==0)
			nRetPos=-1;//Reaching ending

		if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
		{//Set end for words like "张/华/平"
			if(m_tagType!=Final.TAG_TYPE.TT_NORMAL)
			       m_nTags[i][0]=101;
			else
			       m_nTags[i][0]=1;
			
			m_dFrequency[i][0]=0;
		    m_sWords[i][0]=0;//Set virtual ending
			m_nTags[i++][1]=-1;
		}
		m_nCurLength=i;//The current word count
		if(nRetPos!=-1)
			return nWordsIndex+1;//Next start position
		return -1;//Reaching ending
	}

	// Guess candidate POS tags for the word at nIndex when no tag could be
	// looked up for it; used during unknown-word recognition
	/**
	 * Fills in guessed candidate tags for the word at {@code nIndex}.
	 *
	 * <p>Candidates are written to {@code m_nTags[nIndex][0..]} and a matching
	 * weight to {@code m_dFrequency[nIndex][0..]}. Which tags are proposed
	 * depends on {@code m_tagType}, on the byte length of the word and on a few
	 * character-class checks from {@code Utility}. Nothing is written for
	 * {@code TT_NORMAL} or for tag types not listed in the switch.
	 *
	 * <p>Every weight has the form
	 * {@code 1 / (m_context.GetFrequency(0, tag) * scale + bias)}.
	 * NOTE(review): the visible caller stores {@code -log(...)} style costs in
	 * the same table, so the scales may not match - confirm.
	 *
	 * @param nIndex    row of {@code m_sWords} / {@code m_nTags} to fill
	 * @param pSubIndex receives the number of candidates written, but only
	 *                  locally: Java passes {@code int} by value, so the caller
	 *                  never sees the count (see the TODO at the end)
	 * @return always {@code true}
	 */
	protected boolean GuessPOS(int nIndex, int  pSubIndex) {
		int nFilled = 0; // next free candidate slot for this word
		int nLen;        // byte length of the word
		switch (m_tagType)
		{
		case TT_NORMAL:
			break;
		case TT_PERSON:
			if (Utility.CC_Find("××".getBytes(), m_sWords[nIndex]))
			{
				nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 6);
				break;
			}
			nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 0);
			nLen = m_sWords[nIndex].length;
			if (nLen >= 4 || nLen == 2)
			{
				// Tag 0 is entered a second time here (kept as found).
				nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 0);
				if (nLen == 2 && leadsWithOtherOrChinese(nIndex))
				{
					nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 1, 2, 3, 4);
				}
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 11, 12, 13);
			}
			break;
		case TT_PLACE:
			nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 0);
			nLen = m_sWords[nIndex].length;
			if (nLen >= 4)
			{
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 11, 12, 13);
			}
			else if (nLen == 2)
			{
				// Tag 0 is entered a second time here (kept as found).
				nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 0);
				if (leadsWithOtherOrChinese(nIndex))
				{
					nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 1, 2, 3, 4);
				}
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 11, 12, 13);
			}
			break;
		case TT_TRANS_PERSON:
			nLen = m_sWords[nIndex].length;
			nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 0);
			if (!Utility.IsAllChinese(m_sWords[nIndex]))
			{
				if (Utility.IsAllLetter(m_sWords[nIndex]))
				{
					nFilled = appendGuessedTags(nIndex, nFilled, 1, 1, 1, 11);
					nFilled = appendGuessedTags(nIndex, nFilled, 2, 1, 2, 3, 12, 13);
				}
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 41, 42, 43);
			}
			else if (nLen >= 4)
			{
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 41, 42, 43);
			}
			else if (nLen == 2)
			{
				if (leadsWithOtherOrChinese(nIndex))
				{
					nFilled = appendGuessedTags(nIndex, nFilled, 2, 1, 1, 2, 3);
					nFilled = appendGuessedTags(nIndex, nFilled, 8, 1, 30);
					nFilled = appendGuessedTags(nIndex, nFilled, 4, 1, 11, 12, 13);
					nFilled = appendGuessedTags(nIndex, nFilled, 2, 1, 21, 22, 23);
				}
				nFilled = appendGuessedTags(nIndex, nFilled, 8, 0, 41, 42, 43);
			}
			break;
		default:
			break;
		}
		//TODO: this assignment only changes the local copy of the parameter;
		// the candidate count is not visible to the caller.
		pSubIndex = nFilled;
		return true;
	}

	/**
	 * Appends each tag in {@code tags} to the candidate list of word
	 * {@code nRow}, starting at slot {@code nCol}, with weight
	 * {@code 1 / (m_context.GetFrequency(0, tag) * nScale + nBias)}.
	 *
	 * @return the slot index following the last candidate written
	 */
	private int appendGuessedTags(int nRow, int nCol, int nScale, int nBias, int... tags) {
		for (int nTag : tags)
		{
			m_nTags[nRow][nCol] = nTag;
			m_dFrequency[nRow][nCol] = (double) 1 / (double) (m_context.GetFrequency(0, nTag) * nScale + nBias);
			nCol++;
		}
		return nCol;
	}

	/**
	 * Classifies the first two bytes of word {@code nIndex} with
	 * {@code Utility.charType} and reports whether the result is
	 * {@code Final.CT_OTHER} or {@code Final.CT_CHINESE}.
	 */
	private boolean leadsWithOtherOrChinese(int nIndex) {
		int nCharType = Utility.charType(m_sWords[nIndex][0], m_sWords[nIndex][1]);
		return nCharType == Final.CT_OTHER || nCharType == Final.CT_CHINESE;
	}

	/**
	 * Runs {@link #Disamb()} and then traces the back-pointer table from the
	 * last entry to the first, copying the selected tag of each real word into
	 * {@code m_nBestTag}.
	 *
	 * <p>The trace starts at candidate 0 of the last entry and follows
	 * {@code m_nBestPrev} backwards. Entries whose first byte in
	 * {@code m_sWords} is 0 are virtual markers; they are stepped through but
	 * get no tag. The tag list is closed with {@code -1}: at
	 * {@code m_nCurLength}, or one slot earlier when the last entry is a
	 * virtual ending.
	 *
	 * @return always {@code true}
	 */
	protected boolean GetBestPOS() {
		Disamb();
		for (int i = m_nCurLength - 1, j = 0; i > 0; i--)
		{
			// Fix: the test used to be "== 0", which recorded a tag only for
			// the virtual ending and skipped every real word.
			if (m_sWords[i][0] != 0) {// Not virtual ending
				m_nBestTag[i] = m_nTags[i][j];// Record the best POS
			}
			j = m_nBestPrev[i][j];// Step to the chosen candidate of word i-1
		}
		int nEnd = m_nCurLength;// Set the end of POS tagging
		if (m_sWords[m_nCurLength - 1][0] == 0)
			nEnd = m_nCurLength - 1;// Last entry is a virtual ending
		m_nBestTag[nEnd] = -1;
		return true;
	}

	/**
	 * Re-initialises the tagging tables so that entry 0 acts as the begin
	 * marker of the next run.
	 *
	 * <p>When {@code bContinue} is set, entry 0 takes over the first tag and
	 * weight of the last entry of the previous run. Otherwise entry 0 gets a
	 * begin tag (100 for any tag type other than {@code TT_NORMAL}, 0 for
	 * {@code TT_NORMAL}) with weight 0, and the unknown-word index and start
	 * position are reset to 0.
	 *
	 * @param bContinue {@code true} to carry the previous run's last tag over
	 * @return always {@code true}
	 */
	protected boolean Reset(boolean bContinue) {
		if (bContinue) {
			// Carry over the last POS of the previous sentence; must be read
			// before m_nCurLength is reset below.
			final int nLast = m_nCurLength - 1;
			m_nTags[0][0] = m_nTags[nLast][0];
			m_dFrequency[0][0] = m_dFrequency[nLast][0];
		} else {
			// Fresh start: pick the begin tag for the current tag type.
			m_nTags[0][0] = (m_tagType != Final.TAG_TYPE.TT_NORMAL) ? 100 : 0;
			m_nUnknownIndex = 0;
			m_dFrequency[0][0] = 0;
			m_nStartPos = 0;
		}
		// Entry 0 holds exactly one candidate; -1 is the end flag.
		m_nTags[0][1] = -1;
		m_nCurLength = 1;
		m_nWordPosition[1] = m_nStartPos;
		m_sWords[0][0] = 0;
		return true;
	}

	/**
	 * Forward pass of the tag disambiguation (minimum-cost path search).
	 *
	 * <p>For every entry {@code i >= 1} and each of its candidate tags
	 * {@code j} (the list ends at the first negative tag), this picks the
	 * candidate {@code k} of entry {@code i-1} that minimises
	 * {@code -log(GetContextPossibility(0, prevTag, curTag)) + m_dFrequency[i-1][k]}.
	 * The winning {@code k} is stored in {@code m_nBestPrev[i][j]} and the
	 * minimum cost is added to {@code m_dFrequency[i][j]}.
	 *
	 * <p>Changes from the previous version:
	 * <ul>
	 * <li>"No candidate chosen yet" is tracked with an explicit {@code -1}
	 * sentinel instead of comparing against the hard-coded value 10.</li>
	 * <li>The running minimum is local to each {@code (i, j)} pair, so a stale
	 * fee from an earlier pair can no longer leak in.</li>
	 * <li>If the previous entry has no candidates at all, back-pointer 0 and
	 * fee 0 are used instead of an out-of-range index.</li>
	 * </ul>
	 *
	 * @return always {@code true}
	 */
	protected boolean Disamb() {
		for (int i = 1; i < m_nCurLength; i++)// For every word
		{
			for (int j = 0; m_nTags[i][j] >= 0; j++)// For every candidate tag
			{
				int nMinCandidate = -1;// -1: no predecessor examined yet
				double dMinFee = 0;
				for (int k = 0; m_nTags[i - 1][k] >= 0; k++) {
					// Transition cost from previous tag k to current tag j
					double dTmp = -Math.log(m_context.GetContextPossibility(0,
							m_nTags[i - 1][k], m_nTags[i][j]));
					dTmp += m_dFrequency[i - 1][k];// Add the accumulated fee
					if (nMinCandidate < 0 || dTmp < dMinFee)// Keep the minimum
					{
						nMinCandidate = k;
						dMinFee = dTmp;
					}
				}
				if (nMinCandidate < 0) {
					// Previous entry has an empty candidate list: fall back to
					// slot 0 with no added fee so the trace stays in bounds.
					nMinCandidate = 0;
					dMinFee = 0;
				}
				m_nBestPrev[i][j] = (byte) nMinCandidate;// Best previous for j
				m_dFrequency[i][j] = m_dFrequency[i][j] + dMinFee;
			}
		}

		return true;
	}

}
cspan.java - 源码说明

本页面展示了「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」中的 cspan.java 源码文件，采用 Java 编程语言编写，共 738 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ICTCLAS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?