📄 segtofile(sen).c

📁 为自然语言处理领域的中文分词程序
💻 C
📖 第 1 页 / 共 5 页
字号:
	 {
		 /*得到最佳位置*/
		 p->m_nBestTag[i]=p->m_nTags[i][j];
	 }
	 j=p->m_nBestPrev[i][j];
  }
  nEnd=p->m_nCurLength;
  if(p->m_sWords[p->m_nCurLength-1][0]==0)
	  nEnd=p->m_nCurLength-1;
  p->m_nBestTag[nEnd]=-1;
  return TRUE;
}

/**************************************************************************************
  46. Compute the weight of a run of words.

  p         : tagging structure; p->m_sWords, p->m_nBestTag and p->m_context are read
  nStartPos : index of the first word of the run
  nLength   : number of words in the run
  dict      : dictionary queried for the frequency of each word under its best tag

  For every index n in [nStartPos, nStartPos+nLength) the term
      log(GetContextFrequency(p->m_context,0,tag)+1) - log(GetFrequency(dict,word,tag)+1)
  is accumulated, with word = p->m_sWords[n] and tag = p->m_nBestTag[n].
  Returns the accumulated sum (0 when the run is empty).
***************************************************************************************/
double  ComputePossibility(pSpan p,int nStartPos,int nLength,pDictionary dict)
{
	double dTotal=0,dTerm;
	int nWordFreq;
	int nWord=nStartPos;
	int nStop=nStartPos+nLength;

	while(nWord<nStop)
	{
		/* Frequency of this word carrying its best tag, looked up in dict */
		nWordFreq=GetFrequency(dict,p->m_sWords[nWord],p->m_nBestTag[nWord]);
		/* log(context frequency of the tag + 1) minus log(word/tag frequency + 1) */
		dTerm=log((double)(GetContextFrequency(p->m_context,0,p->m_nBestTag[nWord])+1))-log((double)(nWordFreq+1));
		dTotal+=dTerm;
		nWord++;
	}
	return dTotal;
}

/**************************************************************************************
  47. Person-name recognition.

  p          : tagging structure; p->m_nBestTag must be terminated by a negative value
  personDict : person-name dictionary used for the frequency look-ups

  The best tags are turned into a string of letters (tag + 'A') and scanned from
  position 1 for the role patterns listed in sPatterns.  Each accepted match is
  appended to p->m_nUnknownWords / p->m_dWordsPossibility and p->m_nUnknownIndex is
  advanced.  Always returns TRUE.
***************************************************************************************/
int PersonRecognize(pSpan p,pDictionary personDict)
{
  /* Role letters of the sentence; slot 0 holds the placeholder 'z' */
  char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
  /* Person-name role patterns, terminated by the empty pattern */
  char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
	                    "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};

  /* Weight of each pattern (same order as sPatterns) */
  double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,

					0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0};
  /* Length of each pattern (same order as sPatterns); 0 ends the table */
  int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
  int i,k,nPos,nLittleFreqCount;
  int j=1;
  int bMatched=FALSE;
  int bNameFull;
  size_t nNameLen,nWordLen;

  /* Best tag of every word, shifted by 'A'.  The index is bounded so that the
     terminator below always lands inside sPOS even if no negative tag is found. */
  for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&p->m_nBestTag[i]>-1;i++)
	sPOS[i]=p->m_nBestTag[i]+'A';
  sPOS[i]=0;

  while(j<i)
  {
	bMatched=FALSE;

	for(k=0;!bMatched&&nPatternLen[k]>0;k++)
	{   
		/* Pattern k must match at j, and neither neighbouring word may be the separator.
		   NOTE(review): the literal "?¤" looks like a mis-encoded two-byte character;
		   confirm it against a pristine copy of the file. */
		if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(p->m_sWords[j-1],"?¤")!=0&&strcmp(p->m_sWords[j+nPatternLen[k]],"?¤")!=0)
		{   
			/* Pattern "FB" is rejected when the following role is E, C or G */
			if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))			
				continue;			
			nPos=j;
			sPersonName[0]=0;
			nNameLen=0;
			bNameFull=FALSE;
			nLittleFreqCount=0;
            /* Walk the matched words: count low-frequency roles and build the name */
			while(nPos<j+nPatternLen[k])
			{
				/* NOTE(review): nLittleFreqCount is counted but not read afterwards in this function */
				if(p->m_nBestTag[nPos]<4&&GetFrequency(personDict,p->m_sWords[nPos],p->m_nBestTag[nPos])<LITTLE_FREQUENCY)
					nLittleFreqCount++;
				/* Append the word only if it fits completely (terminator included);
				   once a word does not fit, nothing more is appended, so the buffer
				   always holds a prefix of the name made of whole words. */
				nWordLen=strlen(p->m_sWords[nPos]);
				if(!bNameFull&&nWordLen<sizeof(sPersonName)-nNameLen)
				{
					memcpy(sPersonName+nNameLen,p->m_sWords[nPos],nWordLen+1);
					nNameLen+=nWordLen;
				}
				else
					bNameFull=TRUE;
				nPos+=1;
			}
			/* "CDCD" never yields a name; when the text contains foreign characters
			   the scan position skips ahead before the next pattern is tried */
			if(strcmp(sPatterns[k],"CDCD")==0)
			{
				if(GetForeignCharCount(sPersonName)>0)
					j+=nPatternLen[k]-1;
				continue;
			}
			/* Record the candidate: start offset, end offset and weight */
     		p->m_nUnknownWords[p->m_nUnknownIndex][0]=p->m_nWordPosition[j];
			p->m_nUnknownWords[p->m_nUnknownIndex][1]=p->m_nWordPosition[j+nPatternLen[k]];
			p->m_dWordsPossibility[p->m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(p,j,nPatternLen[k],personDict);
			p->m_nUnknownIndex+=1;
			j+=nPatternLen[k];
			bMatched=TRUE;
		}
	}
    /* No pattern matched at j: move on to the next word */
    if(!bMatched)
		j+=1;
  }
  return TRUE;
}

/**************************************************************************************
  48. Guess the candidate tags of a word that has none.

  p         : tagging structure; p->m_tagType selects the rule set, p->m_sWords[nIndex]
              is the word, p->m_nTags[nIndex] / p->m_dFrequency[nIndex] receive the guesses
  nIndex    : index of the word
  pSubIndex : receives the number of candidates written (0 for TT_NORMAL and for
              unknown tag types)

  Always returns TRUE.
***************************************************************************************/

/* Store one guessed tag in the next free slot of word nWord and advance the slot.
   The stored value is 1/(GetContextFrequency(p->m_context,0,nTag)*nScale+nBias).
   NOTE(review): with nBias==0 the denominator is zero whenever the context
   frequency is zero. */
static void AppendGuessedTag(pSpan p,int nWord,int *pnSlot,int nTag,int nScale,int nBias)
{
	p->m_nTags[nWord][*pnSlot]=nTag;
	p->m_dFrequency[nWord][*pnSlot]=(double)1/(double)(GetContextFrequency(p->m_context,0,nTag)*nScale+nBias);
	*pnSlot+=1;
}

int GuessPOS(pSpan p,int nIndex,int *pSubIndex)
{
	int nSlot=0,nTag,nCharType;
	unsigned int nLen;

	switch(p->m_tagType)
	{
	case TT_PERSON:
		/* A word for which CC_Find("××",word) succeeds gets the single tag 6 */
		if(CC_Find("××",p->m_sWords[nIndex]))
		{
			AppendGuessedTag(p,nIndex,&nSlot,6,1,1);
			break;
		}
		AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
		nLen=strlen(p->m_sWords[nIndex]);
		if(nLen>=4)
		{
			/* Tag 0 is stored a second time here, exactly as before */
			AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
		}
		else if(nLen==2)
		{
			AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(nTag=1;nTag<=4;nTag++)
					AppendGuessedTag(p,nIndex,&nSlot,nTag,1,1);
			}
		}
		else
			break;
		/* Both length classes end with tags 11, 12, 13 */
		for(nTag=11;nTag<=13;nTag++)
			AppendGuessedTag(p,nIndex,&nSlot,nTag,8,0);
		break;
	case TT_PLACE:
		AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
		nLen=strlen(p->m_sWords[nIndex]);
		if(nLen==2)
		{
			AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(nTag=1;nTag<=4;nTag++)
					AppendGuessedTag(p,nIndex,&nSlot,nTag,1,1);
			}
		}
		else if(nLen<4)
			break;
		/* Reached for length 2 and for length >= 4 */
		for(nTag=11;nTag<=13;nTag++)
			AppendGuessedTag(p,nIndex,&nSlot,nTag,8,0);
		break;
	case TT_TRANS_PERSON:
		nLen=strlen(p->m_sWords[nIndex]);
		AppendGuessedTag(p,nIndex,&nSlot,0,1,1);
		if(!IsAllChinese((unsigned char *)p->m_sWords[nIndex]))
		{
			if(IsAllLetter((unsigned char *)p->m_sWords[nIndex]))
			{
				AppendGuessedTag(p,nIndex,&nSlot,1,1,1);
				AppendGuessedTag(p,nIndex,&nSlot,11,1,1);
				AppendGuessedTag(p,nIndex,&nSlot,2,2,1);
				AppendGuessedTag(p,nIndex,&nSlot,3,2,1);
				AppendGuessedTag(p,nIndex,&nSlot,12,2,1);
				AppendGuessedTag(p,nIndex,&nSlot,13,2,1);
			}
		}
		else if(nLen==2)
		{
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(nTag=1;nTag<=3;nTag++)
					AppendGuessedTag(p,nIndex,&nSlot,nTag,2,1);
				AppendGuessedTag(p,nIndex,&nSlot,30,8,1);
				for(nTag=11;nTag<=13;nTag++)
					AppendGuessedTag(p,nIndex,&nSlot,nTag,4,1);
				for(nTag=21;nTag<=23;nTag++)
					AppendGuessedTag(p,nIndex,&nSlot,nTag,2,1);
			}
		}
		else if(nLen<4)
			break;
		/* Reached for non-Chinese words, and for Chinese words of length 2 or >= 4 */
		for(nTag=41;nTag<=43;nTag++)
			AppendGuessedTag(p,nIndex,&nSlot,nTag,8,0);
		break;
	case TT_NORMAL:
	default:
		/* No guessing rules: zero candidates are reported */
		break;
	}
	*pSubIndex=nSlot;
	return TRUE;
}

/**************************************************************************************
  49. Load a run of words into the tagging structure.

  p           : tagging structure that is filled (m_sWords, m_nWordPosition, m_nTags,
                m_dFrequency, m_nStartPos, m_nCurLength)
  pWordItems  : segmented words; the list ends at the first item whose sWord is empty
  nIndex      : index in pWordItems of the first word to load
  dictCore    : core dictionary
  dictUnknown : dictionary consulted when p->m_tagType is not TT_NORMAL

  Words are stored from slot 1 onward.  Loading stops at the end of the word list,
  when MAX_WORDS_PER_SENTENCE is reached, or right after a word that received exactly
  one tag other than CT_SENTENCE_BEGIN.

  Returns -1 when the end of pWordItems was reached, otherwise nWordsIndex+1.
***************************************************************************************/
int GetFrom(pSpan p,PWORD_RESULT pWordItems,int nIndex,pDictionary dictCore, pDictionary dictUnknown)
{
	int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
	int nFreq=0,j,nRetPos=0,nWordsIndex=0;
	int bSplit=FALSE;
    int k,i=1,nPOSCount;
	char sCurWord[WORD_MAXLENGTH];
	unsigned int nLen;
	/* i starts at 1, so this is simply nIndex */
	nWordsIndex=i+nIndex-1;

	for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
	{
		if(p->m_tagType==TT_NORMAL||!IsExist(dictUnknown,pWordItems[nWordsIndex].sWord,44))
        {
			/* Ordinary case: copy the whole word and extend the position table */
			strcpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord);
   		    p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		else
		{
			/* The word is in dictUnknown with handle 44: it is loaded as two slots,
			   its first two bytes now and the remainder on the next iteration */
			if(!bSplit)
			{
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord,2);
				p->m_sWords[i][2]=0;
				bSplit=TRUE;
			}
			else
			{
			    nLen=strlen(pWordItems[nWordsIndex].sWord+2);
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);
				p->m_sWords[i][nLen]=0;
				bSplit=FALSE;
			}
   		    p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		p->m_nStartPos=p->m_nWordPosition[i+1];
		if(p->m_tagType!=TT_NORMAL)		{
			/* Role tagging: candidates come from dictUnknown */
			strcpy(sCurWord,p->m_sWords[i]);
			if(p->m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)p->m_sWords[i-1])==CT_CHINESE)
			{
				/* After a Chinese word, a lone "." or "-" is looked up under a
				   replacement key.
				   NOTE(review): the two replacement literals look mis-encoded;
				   confirm them against a pristine copy of the file. */
				if(p->m_sWords[i][0]=='.'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"￡?");
				else if(p->m_sWords[i][0]=='-'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"￡-");
			}
			GetHandle(dictUnknown,sCurWord,&nCount,aPOS,aFreq);
			nPOSCount=nCount+1;
			for(j=0;j<nCount;j++)
			{
				p->m_nTags[i][j]=aPOS[j];
				p->m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(GetContextFrequency(p->m_context,0,aPOS[j])+nPOSCount));
			}
			if(strcmp(p->m_sWords[i],"?##?")==0)
			{
				p->m_nTags[i][j]=100;
   				p->m_dFrequency[i][j]=0;
				j++;
			}
			/* NOTE(review): this literal is identical to the one tested just above,
			   so this branch cannot be taken as written.  Both literals look like
			   they lost their original non-ASCII bytes; restore them from a pristine
			   copy of the file. */
			else if(strcmp(p->m_sWords[i],"?##?")==0)
			{
				p->m_nTags[i][j]=101;
   				p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				/* A word known to the core dictionary also gets tag 0, weighted by
				   its total frequency over all core tags */
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nFreq=0;
				for(k=0;k<nCount;k++)
				{
					nFreq+=aFreq[k];
				}
				if(nCount>0)
				{
					p->m_nTags[i][j]=0;

					p->m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(GetContextFrequency(p->m_context,0,0)+nPOSCount));
					j++;
				}
			}
		}
		else
		{
			/* Normal tagging */
			j=0;
			if(pWordItems[nWordsIndex].nHandle>0)
			{
				/* The word item already carries its tag; the cost is clamped at 0 */
				p->m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
				p->m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+1));
				if(p->m_dFrequency[i][j]<0)
					p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				if(pWordItems[nWordsIndex].nHandle<0)
				{
					/* A negative handle stores its absolute value as the first tag */
					p->m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
					p->m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;

				}
				/* Remaining slots come from the core dictionary; the loop resumes at
				   the current j, so with a negative handle entry 0 of aPOS is not copied */
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nPOSCount=nCount;
				for(;j<nCount;j++)
				{
					p->m_nTags[i][j]=aPOS[j];
					p->m_dFrequency[i][j]=-log(1+aFreq[j])+log(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+nPOSCount);
				}
			}
		}
		/* No candidate at all: fall back to guessing */
		if(j==0)
		{
			GuessPOS(p,i,&j);
		}
		/* Terminate the candidate list of this word */
		p->m_nTags[i][j]=-1;
		/* A word with exactly one candidate ends the run unless that candidate is the
		   sentence-begin tag.  The candidate is in slot 0; slot j was just set to -1
		   above, so testing slot j would not look at the word's tag at all. */
		if(j==1&&p->m_nTags[i][0]!=CT_SENTENCE_BEGIN)
		{
			i++;
			p->m_sWords[i][0]=0;
			break;
		}
		/* While a split is pending the same word item is visited again */
		if(!bSplit)
			nWordsIndex++;
	}
	if(pWordItems[nWordsIndex].sWord[0]==0)
		nRetPos=-1;

	/* The last loaded word has more than one candidate: append an empty closing
	   word carrying a single tag (101 for role tagging, 1 for normal tagging) */
	if(p->m_nTags[i-1][1]!=-1)
	{
		if(p->m_tagType!=TT_NORMAL)
		       p->m_nTags[i][0]=101;
		else
		       p->m_nTags[i][0]=1;

		p->m_dFrequency[i][0]=0;
	    p->m_sWords[i][0]=0;
		p->m_nTags[i++][1]=-1;
	}
	p->m_nCurLength=i;
	if(nRetPos!=-1)
		return nWordsIndex+1;
	return -1;
}


/*49.设置标注类型，正常、人名、地名或翻译*/
void SetTagType(pSpan p,enum TAG_TYPE  nType)
{
💿 文件大小 2941 K
👤 上传用户 lqlm521
📂 所属分类其他
🏷️ 相关标签

#自然语言处理 #分 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -