⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 span.cpp

📁 计算所汉语词法分析系统ICTCLAS.分词正确率高达97.58%(973专家组评测)
💻 CPP
📖 第 1 页 / 共 2 页
字号:
		{//No ambuguity, so we can break from the loop
			i++;
			m_sWords[i][0]=0;
			break;
		}
		if(!bSplit)
			nWordsIndex++;
	}
	if(pWordItems[nWordsIndex].sWord[0]==0)
		nRetPos=-1;//Reaching ending

	if(m_nTags[i-1][1]!=-1)//||m_sWords[i][0]==0
	{//Set end for words like "张/华/平"
		if(m_tagType!=TT_NORMAL)
		       m_nTags[i][0]=101;
		else
		       m_nTags[i][0]=1;
		
		m_dFrequency[i][0]=0;
	    m_sWords[i][0]=0;//Set virtual ending
		m_nTags[i++][1]=-1;
	}
	m_nCurLength=i;//The current word count
	if(nRetPos!=-1)
		return nWordsIndex+1;//Next start position
	return -1;//Reaching ending
}


//Set the tag type
void CSpan::SetTagType(enum TAG_TYPE  nType)
{
	m_tagType=nType;
}
//POS tagging with Hidden Markov Model
//pWordItems:  the word items to tag; the sequence ends at the first item whose sWord is empty
//dictCore:    core dictionary
//dictUnknown: unknown-word recognition dictionary
//Always returns true.
bool CSpan::POSTagging(PWORD_RESULT pWordItems,CDictionary &dictCore,CDictionary &dictUnknown)
{
	int i=0,j,nStartPos,nItem;
	Reset(false);
	//GetFrom() hands back the next start position, or -1 once the end has been reached
	while(i>-1&&pWordItems[i].sWord[0]!=0)
	{
		nStartPos=i;//Start position of this segment
		i=GetFrom(pWordItems,nStartPos,dictCore,dictUnknown);
		GetBestPOS();
		switch(m_tagType)
		{
		case TT_NORMAL://normal POS tagging
			//Test the bound first so that m_nBestTag is never read at or beyond m_nCurLength
			for(j=1;j<m_nCurLength&&m_nBestTag[j]!=-1;j++)
			{//Store the best POS tag of each word in the segment
				nItem=j+nStartPos-1;//m_nBestTag[1] belongs to the item at nStartPos
				pWordItems[nItem].nHandle=m_nBestTag[j];
				//Items whose dValue is not positive are left as they are
				//(the original note here reads "let '。' be 0").
				//A word present in the core dictionary gets its frequency under the chosen tag.
				if(pWordItems[nItem].dValue>0&&dictCore.IsExist(pWordItems[nItem].sWord,-1))
					pWordItems[nItem].dValue=dictCore.GetFrequency(pWordItems[nItem].sWord,m_nBestTag[j]);
			}
			break;
		case TT_PERSON://Person recognition
			PersonRecognize(dictUnknown);
			break;
		case TT_PLACE://Place name recognition
		case TT_TRANS_PERSON://Transliterated person names share the place recognizer
			PlaceRecognize(dictCore,dictUnknown);
			break;
		default:
			break;
		}
		Reset();
	}
	return true;
}
//Guess the candidate tags of word item No. nIndex.
//Depending on m_tagType, writes candidate tags into m_nTags[nIndex][0..] and, next to
//each, the reciprocal of a scaled context frequency into m_dFrequency[nIndex][0..].
//nIndex:    index of the word in m_sWords
//pSubIndex: receives the number of candidates written (0 for TT_NORMAL and unknown types)
//Always returns true.
bool CSpan::GuessPOS(int nIndex,int *pSubIndex)
{
	//Tag groups that several branches emit in the same order
	static const int aHeadTags[4]={1,2,3,4};//weight 1/(freq+1)
	static const int aTailTags[3]={11,12,13};//weight 1/(freq*8)
	static const int aTransTailTags[3]={41,42,43};//weight 1/(freq*8)
	//{tag,scale} pairs, weight 1/(freq*scale+1)
	static const int aLetterTags[6][2]={{1,1},{11,1},{2,2},{3,2},{12,2},{13,2}};
	static const int aTransCharTags[10][2]={{1,2},{2,2},{3,2},{30,8},{11,4},{12,4},{13,4},{21,2},{22,2},{23,2}};

	int nCount=0,k,nCharType;
	unsigned int nLen;

	switch(m_tagType)
	{
	case TT_PERSON:
		if(CC_Find("××",m_sWords[nIndex]))
		{//The word is found by CC_Find against the marker string: single candidate, tag 6
			m_nTags[nIndex][nCount]=6;
			m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,6)+1);
			nCount++;
			break;
		}
		m_nTags[nIndex][nCount]=0;
		m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
		nCount++;
		nLen=strlen(m_sWords[nIndex]);
		if(nLen>=4||nLen==2)
		{
			//Both length classes add a second tag-0 candidate
			m_nTags[nIndex][nCount]=0;
			m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
			nCount++;
			if(nLen==2)
			{
				nCharType=charType((unsigned char *)m_sWords[nIndex]);
				if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
				{
					for(k=0;k<4;k++)
					{
						m_nTags[nIndex][nCount]=aHeadTags[k];
						m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aHeadTags[k])+1);
						nCount++;
					}
				}
			}
			for(k=0;k<3;k++)
			{
				m_nTags[nIndex][nCount]=aTailTags[k];
				m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aTailTags[k])*8);
				nCount++;
			}
		}
		break;
	case TT_PLACE:
		m_nTags[nIndex][nCount]=0;
		m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
		nCount++;
		nLen=strlen(m_sWords[nIndex]);
		if(nLen>=4||nLen==2)
		{
			if(nLen==2)
			{
				//Only the 2-byte case repeats tag 0 here
				m_nTags[nIndex][nCount]=0;
				m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
				nCount++;
				nCharType=charType((unsigned char *)m_sWords[nIndex]);
				if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
				{
					for(k=0;k<4;k++)
					{
						m_nTags[nIndex][nCount]=aHeadTags[k];
						m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aHeadTags[k])+1);
						nCount++;
					}
				}
			}
			for(k=0;k<3;k++)
			{
				m_nTags[nIndex][nCount]=aTailTags[k];
				m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aTailTags[k])*8);
				nCount++;
			}
		}
		break;
	case TT_TRANS_PERSON:
		nLen=strlen(m_sWords[nIndex]);
		m_nTags[nIndex][nCount]=0;
		m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,0)+1);
		nCount++;
		if(!IsAllChinese((unsigned char *)m_sWords[nIndex]))
		{
			if(IsAllLetter((unsigned char *)m_sWords[nIndex]))
			{
				for(k=0;k<6;k++)
				{
					m_nTags[nIndex][nCount]=aLetterTags[k][0];
					m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aLetterTags[k][0])*aLetterTags[k][1]+1);
					nCount++;
				}
			}
		}
		else if(nLen==2)
		{
			nCharType=charType((unsigned char *)m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(k=0;k<10;k++)
				{
					m_nTags[nIndex][nCount]=aTransCharTags[k][0];
					m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aTransCharTags[k][0])*aTransCharTags[k][1]+1);
					nCount++;
				}
			}
		}
		else if(nLen<4)
			break;//All-Chinese word of any other short length: tag 0 is the only candidate
		//Every remaining path ends with tags 41,42,43
		for(k=0;k<3;k++)
		{
			m_nTags[nIndex][nCount]=aTransTailTags[k];
			m_dFrequency[nIndex][nCount]=(double)1/(double)(m_context.GetFrequency(0,aTransTailTags[k])*8);
			nCount++;
		}
		break;
	case TT_NORMAL:
	default:
		//Nothing is guessed in these modes
		break;
	}
	*pSubIndex=nCount;
	return true;
}

//Score the words in [nStartPos, nStartPos+nLength).
//For every word the term
//    log(context frequency of its best tag + 1) - log(frequency of the word under that tag in dict + 1)
//is added to the result.
//nStartPos: index of the first word in m_sWords / m_nBestTag
//nLength:   number of words to include
//dict:      dictionary queried for the word/tag frequency
//Returns the accumulated sum (0 when nLength is not positive).
ELEMENT_TYPE  CSpan::ComputePossibility(int nStartPos,int nLength,CDictionary &dict)
{
	ELEMENT_TYPE dTotal=0,dTerm;
	int nWordFreq;//How often the word carries its best tag in dict
	double dLogTag,dLogWord;
	const int nStop=nStartPos+nLength;
	int nPos=nStartPos;
	while(nPos<nStop)
	{
		nWordFreq=dict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos]);
		dLogTag=log((double)(m_context.GetFrequency(0,m_nBestTag[nPos])+1));
		dLogWord=log((double)(nWordFreq+1));
		dTerm=dLogTag-dLogWord;
		dTotal+=dTerm;
		//A context term, log(GetContextPossibility(0,tag[i],tag[i+1])+1) between
		//adjacent best tags, used to be subtracted here as well; it is disabled.
		nPos++;
	}
	return dTotal;
}
//DEL bool CSpan::TransRecognize(CDictionary &dictCore,CDictionary &transDict)
//DEL {
//DEL   char sPOS[MAX_WORDS_PER_SENTENCE]="Z";
//DEL   int nStart=1,nEnd=1,i=1;
//DEL   while(m_nBestTag[i]>-1)
//DEL   {
//DEL 	  if(m_nBestTag[i]==1||m_nBestTag[i]==11||m_nBestTag[i]==21)//1,11,21 Trigger the recognition
//DEL 	  {
//DEL 		nStart=i;
//DEL 		nEnd=nStart+1;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==30)//3,13,23
//DEL 			nEnd++;
//DEL 	  }
//DEL 	  else if(m_nBestTag[i]==2||m_nBestTag[i]==12||m_nBestTag[i]==22)//1,11,21 Trigger the recognition
//DEL 	  {
//DEL 		nStart=i;
//DEL 		nEnd=nStart+1;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==30)//3,13,23
//DEL 			nEnd++;
//DEL 	  }
//DEL 	  if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
//DEL 	  {
//DEL 			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
//DEL 			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
//DEL 			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
//DEL 			nStart=nEnd;
//DEL 	  }
//DEL 
//DEL 	  if(i<nEnd)
//DEL 		  i=nEnd;
//DEL 	  else
//DEL 		  i=i+1;
//DEL   }
//DEL   return true;
//DEL }
//Scan m_nBestTag (starting at index 1, stopping at the first negative tag) for
//runs of tags 1,2,3 and record each run as an unknown word.
//A run is triggered by tag 1 (followed by further 1s, then 2s, then 3s) or by
//tag 2 (followed by further 2s, then 3s). For each run the start/end word positions go to
//m_nUnknownWords and ComputePossibility()+log(penalty) goes to m_dWordsPossibility.
//The penalty starts at 1.0 and is never reset, so it keeps growing across runs.
//dictCore:  not used by this function
//placeDict: dictionary handed to ComputePossibility()
//Always returns true.
bool CSpan::PlaceRecognize(CDictionary &dictCore,CDictionary &placeDict)
{
	int nFirst=1,nLast=1,nCur=1,nMark,nTrigger;
	double dPenalty=1.0;//Penalty value, accumulated over the whole scan
	while(m_nBestTag[nCur]>-1)
	{
		nTrigger=m_nBestTag[nCur];
		if(nTrigger==1||nTrigger==2)
		{
			if(nTrigger==2)
				dPenalty+=1.0;//A run without a leading tag 1 is penalized once
			nFirst=nCur;
			nLast=nFirst+1;
			if(nTrigger==1)
			{//Further 1s; every one after the second adds a penalty
				for(;m_nBestTag[nLast]==1;nLast++)
				{
					if(nLast>nFirst+1)
						dPenalty+=1.0;
				}
			}
			//Any number of 2s
			while(m_nBestTag[nLast]==2)
				nLast++;
			//Trailing 3s; every one after the first adds a penalty
			nMark=nLast;
			for(;m_nBestTag[nLast]==3;nLast++)
			{
				if(nLast>nMark)
					dPenalty+=1.0;
			}
		}
		if(nLast>nFirst)
		{//Record the run that has just been delimited
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nFirst];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nLast];
			m_dWordsPossibility[m_nUnknownIndex]=ComputePossibility(nFirst,nLast-nFirst+1,placeDict)+log(dPenalty);
			m_nUnknownIndex++;
			nFirst=nLast;//Prevents the same run from being recorded again
		}
		//Jump past the run, or advance by one word when nothing was consumed
		nCur=(nCur<nLast)?nLast:nCur+1;
	}
	return true;
}

//DEL bool CSpan::TransPersonRecognize(CDictionary &dictCore, CDictionary &transDict)
//DEL {
//DEL   int nStart=1,nEnd=1,i=1;
//DEL   while(m_nBestTag[i]>-1)
//DEL   {
//DEL 	  if(m_nBestTag[i]==1)//1,11,21 Trigger the recognition
//DEL 	  {
//DEL 		nStart=i;
//DEL 		nEnd=nStart+1;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart])//1,11,21
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+2)//3,13,23
//DEL 			nEnd++;
//DEL 	  }
//DEL 	  else if(m_nBestTag[i]==2)//1,11,21 Trigger the recognition
//DEL 	  {
//DEL 		nStart=i;
//DEL 		nEnd=nStart+1;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart])//2,12,22
//DEL 			nEnd++;
//DEL 		while(m_nBestTag[nEnd]==m_nBestTag[nStart]+1)//2,12,22
//DEL 			nEnd++;
//DEL 	  }
//DEL 	  if(nEnd>nStart&&!IsAllNum((unsigned char *)m_sWords[nStart])&&(nEnd>nStart+2||(nEnd==nStart+2&&(m_nBestTag[nEnd-1]!=30||strlen(m_sWords[nStart])>2))||(nEnd==nStart+1&&strlen(m_sWords[nStart])>2&&!dictCore.IsExist(m_sWords[nStart],-1))))
//DEL 	  {
//DEL 			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
//DEL 			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[nEnd];
//DEL 			m_dWordsPossibility[m_nUnknownIndex++]=ComputePossibility(nStart,nEnd-nStart+1,transDict);
//DEL 			nStart=nEnd;
//DEL 	  }
//DEL 
//DEL 	  if(i<nEnd)
//DEL 		  i=nEnd;
//DEL 	  else
//DEL 		  i=i+1;
//DEL   }
//DEL   return true;
//DEL }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -