📄 segment(sen).c

📁 为自然语言处理领域的中文分词程序
💻 C
📖 第 1 页 / 共 5 页
字号:
}

/*
 * Pop - read the next queue element, either consuming it or just browsing.
 *
 *   p          queue to read from
 *   npValue    if non-NULL, receives the element's nParent
 *   npIndex    if non-NULL, receives the element's nIndex
 *   epWeight   if non-NULL, receives the element's eWeight
 *   bModify    non-zero: read the head element and advance m_pHead past it;
 *              zero: read through the m_pLastAccess cursor, m_pHead untouched
 *   bFirstGet  browsing only: non-zero rewinds the cursor to the head first
 *
 * Returns 1 when an element was read, -1 when there was none.
 *
 * NOTE(review): in the bModify path the node is only unlinked, never
 * released here. If nodes are heap-allocated by the enqueue routine (not
 * visible in this part of the file) this leaks one node per call - confirm
 * who owns the node.
 */
int Pop(pQueue p,unsigned int *npValue,unsigned int *npIndex,double *epWeight, int  bModify,int  bFirstGet)
{
	PQUEUE_ELEMENT pNode;

	/* Select the element to read. */
	if(bModify)
	{
		pNode=p->m_pHead;
	}
	else
	{
		if(bFirstGet)
			p->m_pLastAccess=p->m_pHead;
		pNode=p->m_pLastAccess;
	}
	if(pNode==NULL)
		return -1;

	/* Copy out only the fields the caller asked for. */
	if(npValue!=NULL)
		*npValue=pNode->nParent;
	if(npIndex!=NULL)
		*npIndex=pNode->nIndex;
	if(epWeight!=NULL)
		*epWeight=pNode->eWeight;

	/* Step past the element on whichever pointer was used to reach it. */
	if(bModify)
		p->m_pHead=pNode->next;
	else
		p->m_pLastAccess=pNode->next;
	return 1;
}

/*
 * IsEmpty - test whether the queue has nothing left to read.
 *
 * When bBrowsed equals TRUE the browse cursor (m_pLastAccess) is tested;
 * for any other value the head pointer (m_pHead) is tested.
 * Returns non-zero when the tested pointer is NULL.
 */
int IsEmpty(pQueue p,int bBrowsed)
{
	if(bBrowsed!=TRUE)
		return (p->m_pHead==NULL);
	return (p->m_pLastAccess==NULL);
}


/*
 * IsSingle - returns non-zero when the queue holds exactly one element,
 * i.e. the head exists and has no successor.
 */
int IsSingle(pQueue p)
{
	if(p->m_pHead==NULL)
		return 0;
	return (p->m_pHead->next==NULL);
}

/*
 * IQueue - put a queue into its empty state: no head element and no
 * browse cursor. Nothing is released; the pointers are simply cleared.
 */
void IQueue(pQueue p)
{
	p->m_pLastAccess=NULL;
	p->m_pHead=NULL;
}

/*
 * USpan - release the span's context object and clear the pointer so a
 * later call does not free it twice.
 */
void USpan(pSpan p)
{
	void *pContext=p->m_context;

	p->m_context=NULL;
	free(pContext);
}

/*
 * Disamb - forward pass of the tag disambiguation over the current span.
 *
 * For every word i (1 .. m_nCurLength-1) and every candidate tag j of that
 * word (the tag list ends at the first negative entry), picks the candidate
 * k of the previous word that minimises
 *     -log(GetContextPossibility(tag[i-1][k] -> tag[i][j])) + m_dFrequency[i-1][k]
 * records k in m_nBestPrev[i][j], and adds that minimum cost to
 * m_dFrequency[i][j].
 *
 * Always returns TRUE.
 *
 * NOTE(review): if the previous word has no candidates at all, the sentinel
 * MAX_POS_PER_WORD+1 is stored in m_nBestPrev and the cost of the preceding
 * (i,j) pair is reused - presumably callers guarantee at least one tag per
 * word; verify.
 */
int Disamb(pSpan p)
{
	int i,j,k,nMinCandidate;
	double dMinFee=0,dTmp;
	for(i=1;i<p->m_nCurLength;i++)
	{
		for(j=0;p->m_nTags[i][j]>=0;j++)
		{
			/* Sentinel meaning "no predecessor chosen yet". */
			nMinCandidate=MAX_POS_PER_WORD+1;
			for(k=0;p->m_nTags[i-1][k]>=0;k++)
			{
				dTmp=-log(GetContextPossibility(p->m_context,0,p->m_nTags[i-1][k],p->m_nTags[i][j]));
				dTmp+=p->m_dFrequency[i-1][k];
				/*
				 * Test for the sentinel itself rather than a hard-coded 10:
				 * with the literal, any best index above 10 was treated as
				 * "unset" and overwritten by the next candidate regardless
				 * of its cost.
				 */
				if(nMinCandidate>MAX_POS_PER_WORD||dTmp<dMinFee)
				{
					nMinCandidate=k;
					dMinFee=dTmp;
				}
			}
			p->m_nBestPrev[i][j]=nMinCandidate;
			p->m_dFrequency[i][j]=p->m_dFrequency[i][j]+dMinFee;
		}
	}
	return TRUE;
}

/*
 * Reset - re-initialise the span so that it contains only slot 0.
 *
 *   bContinue  zero: start from scratch - slot 0 gets tag 100 when the tag
 *              type is not TT_NORMAL and tag 0 otherwise, with zero cost;
 *              the unknown-word index and start position are cleared.
 *              non-zero: slot 0 inherits the first tag and cost of the last
 *              slot of the previous contents; the unknown-word index and
 *              start position are kept.
 *
 * In both cases the tag list of slot 0 is terminated with -1, the length
 * becomes 1, word position 1 is set to the start position and word 0 is
 * made the empty string. Always returns TRUE.
 */
int Reset(pSpan p,int bContinue)
{
	if(bContinue)
	{
		/* Must read the old length before it is overwritten below. */
		p->m_nTags[0][0]=p->m_nTags[p->m_nCurLength-1][0];
		p->m_dFrequency[0][0]=p->m_dFrequency[p->m_nCurLength-1][0];
	}
	else
	{
		p->m_nTags[0][0]=(p->m_tagType!=TT_NORMAL)?100:0;
		p->m_dFrequency[0][0]=0;
		p->m_nUnknownIndex=0;
		p->m_nStartPos=0;
	}

	p->m_nTags[0][1]=-1;
	p->m_sWords[0][0]=0;
	p->m_nWordPosition[1]=p->m_nStartPos;
	p->m_nCurLength=1;
	return TRUE;
}

/*
 * LoadContext - load the span's context data from sFilename by delegating
 * to LoadContextState; that call's result is returned unchanged.
 */
int LoadContext(pSpan p,char *sFilename)
{
	int nResult=LoadContextState(p->m_context,sFilename);

	return nResult;
}

/*
 * GetBestPOS - run Disamb() and then trace the best tag path backwards.
 *
 * Starting at the last slot with candidate index 0, each non-empty word
 * gets m_nBestTag[i] = the tag at the current candidate index, and the
 * candidate index for the previous slot is taken from m_nBestPrev.
 * m_nBestTag is then terminated with -1 at m_nCurLength, or one slot
 * earlier when the last word is the empty string. Always returns TRUE.
 */
int GetBestPOS(pSpan p)
{
	int nWord,nCandidate=0,nEnd;

	Disamb(p);

	nWord=p->m_nCurLength-1;
	while(nWord>0)
	{
		if(p->m_sWords[nWord][0]!=0)
			p->m_nBestTag[nWord]=p->m_nTags[nWord][nCandidate];
		nCandidate=p->m_nBestPrev[nWord][nCandidate];
		nWord--;
	}

	if(p->m_sWords[p->m_nCurLength-1][0]==0)
		nEnd=p->m_nCurLength-1;
	else
		nEnd=p->m_nCurLength;
	p->m_nBestTag[nEnd]=-1;
	return TRUE;
}

/*
 * ComputePossibility - sum a log-ratio score over nLength consecutive words.
 *
 * For each word i in [nStartPos, nStartPos+nLength) adds
 *     log(contextFrequency(bestTag[i]) + 1) - log(dictFrequency(word[i], bestTag[i]) + 1)
 * where the context frequency comes from p->m_context and the dictionary
 * frequency from dict. Returns the accumulated sum (0 when nLength <= 0).
 */
double  ComputePossibility(pSpan p,int nStartPos,int nLength,pDictionary dict)
{
	double dTotal=0,dTerm;
	int nWord,nWordFreq;
	int nStop=nStartPos+nLength;

	for(nWord=nStartPos;nWord<nStop;nWord++)
	{
		nWordFreq=GetFrequency(dict,p->m_sWords[nWord],p->m_nBestTag[nWord]);
		dTerm=log((double)(GetContextFrequency(p->m_context,0,p->m_nBestTag[nWord])+1))-log((double)(nWordFreq+1));
		dTotal+=dTerm;
	}
	return dTotal;
}

/*
 * PersonRecognize - scan the best-tag sequence for person-name patterns and
 * record each match as an unknown word.
 *
 * The best tags (m_nBestTag[1..], terminated by a negative value) are turned
 * into a string by adding 'A' to each tag; the patterns in sPatterns are
 * then matched against that string, in table order, at every position.
 * For each accepted match the byte range of the matched words is stored in
 * m_nUnknownWords, a score (-log(pattern factor) + ComputePossibility over
 * the matched words) in m_dWordsPossibility, m_nUnknownIndex is advanced and
 * scanning resumes after the match.
 *
 *   p           span holding words, best tags and the unknown-word tables
 *   personDict  dictionary used for the frequency lookups
 *
 * Always returns TRUE.
 *
 * NOTE(review): m_nUnknownIndex is incremented without a visible bound on
 * the size of m_nUnknownWords / m_dWordsPossibility - confirm capacity.
 */
int PersonRecognize(pSpan p,pDictionary personDict)
{
	char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];

	/* Pattern table; the three arrays are parallel and end at the "" / 0 entry. */
	static const char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE","BG",
	                                   "BXD","BZ", "CDCD","CD","EE", "FB", "Y","XD",""};
	static const double dFactor[]={0.003606,0.000021,0.001314,0.000315,0.656624, 0.000021,0.146116,0.009136,
	                               0.000042,0.038971,0,0.090367,0.000273,0.009157,0.034324,0.009735,0
	};
	static const int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};
	int i,k,nPos,nLittleFreqCount;
	int j=1;
	int bMatched=FALSE;

	/*
	 * Build the tag string. The bound keeps both the stores and the
	 * terminator written after the loop inside sPOS.
	 */
	for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&p->m_nBestTag[i]>-1;i++)
		sPOS[i]=p->m_nBestTag[i]+'A';
	sPOS[i]=0;

	while(j<i)
	{
		bMatched=FALSE;
		for(k=0;!bMatched&&nPatternLen[k]>0;k++)
		{
			/*
			 * NOTE(review): the "?¤" literal looks like a mis-encoded
			 * multi-byte separator character; it is kept byte-for-byte
			 * here - verify against the original source encoding.
			 */
			if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(p->m_sWords[j-1],"?¤")!=0&&strcmp(p->m_sWords[j+nPatternLen[k]],"?¤")!=0)
			{
				/* "FB" is rejected when the following tag letter is E, C or G. */
				if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
					continue;
				nPos=j;
				sPersonName[0]=0;
				/* Counted but not consulted anywhere in this function. */
				nLittleFreqCount=0;
				while(nPos<j+nPatternLen[k])
				{
					if(p->m_nBestTag[nPos]<4&&GetFrequency(personDict,p->m_sWords[nPos],p->m_nBestTag[nPos])<LITTLE_FREQUENCY)
						nLittleFreqCount++;
					/* Bounded append: never write past sPersonName, always NUL-terminated. */
					strncat(sPersonName,p->m_sWords[nPos],sizeof(sPersonName)-strlen(sPersonName)-1);
					nPos+=1;
				}
				if(strcmp(sPatterns[k],"CDCD")==0)
				{
					/*
					 * "CDCD" never produces a result itself; when the text
					 * contains foreign characters the scan position is moved
					 * forward before the remaining patterns are tried.
					 */
					if(GetForeignCharCount(sPersonName)>0)
						j+=nPatternLen[k]-1;
					continue;
				}
				p->m_nUnknownWords[p->m_nUnknownIndex][0]=p->m_nWordPosition[j];
				p->m_nUnknownWords[p->m_nUnknownIndex][1]=p->m_nWordPosition[j+nPatternLen[k]];
				p->m_dWordsPossibility[p->m_nUnknownIndex]=-log(dFactor[k])+ComputePossibility(p,j,nPatternLen[k],personDict);
				p->m_nUnknownIndex+=1;
				j+=nPatternLen[k];
				bMatched=TRUE;
			}
		}
		if(!bMatched)
			j+=1;
	}
	return TRUE;
}


/*
 * Function-local helper: append candidate tag `tag` to word nIndex with
 * weight 1 / (GetContextFrequency(tag) * mul + add) and advance j.
 * Relies on the locals p, nIndex and j of GuessPOS; undefined again right
 * after the function.
 */
#define GUESSPOS_ADD(tag,mul,add) \
	do { \
		p->m_nTags[nIndex][j]=(tag); \
		p->m_dFrequency[nIndex][j]=(double)1/(double)(GetContextFrequency(p->m_context,0,(tag))*(mul)+(add)); \
		j++; \
	} while(0)

/*
 * GuessPOS - fill in guessed candidate tags for a word that has none.
 *
 *   p          span whose m_nTags / m_dFrequency row nIndex is filled
 *   nIndex     index of the word in the span
 *   pSubIndex  receives the number of candidates written
 *
 * Which tags are proposed depends on p->m_tagType, on the byte length of
 * the word (>=4, ==2, or other) and on character-class tests of the word.
 * For TT_NORMAL and unrecognised tag types nothing is written and
 * *pSubIndex is 0. The tag list is NOT terminated here. Always returns TRUE.
 *
 * NOTE(review): the "mul 8, add 0" weights divide by the raw context
 * frequency times 8, so a zero frequency yields a division by zero -
 * presumably those tags always have non-zero counts; verify.
 */
int GuessPOS(pSpan p,int nIndex,int *pSubIndex)
{
	int j=0,t,nCharType;
	unsigned int nLen;

	switch(p->m_tagType)
	{
	case TT_PERSON:
		if(CC_Find("××",p->m_sWords[nIndex]))
		{
			GUESSPOS_ADD(6,1,1);
			break;
		}
		GUESSPOS_ADD(0,1,1);
		nLen=strlen(p->m_sWords[nIndex]);
		if(nLen>=4)
		{
			/* Tag 0 is intentionally listed a second time, as in the original table. */
			GUESSPOS_ADD(0,1,1);
			for(t=11;t<=13;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		else if(nLen==2)
		{
			GUESSPOS_ADD(0,1,1);
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(t=1;t<=4;t++)
				{
					GUESSPOS_ADD(t,1,1);
				}
			}
			for(t=11;t<=13;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		break;

	case TT_PLACE:
		GUESSPOS_ADD(0,1,1);
		nLen=strlen(p->m_sWords[nIndex]);
		if(nLen>=4)
		{
			for(t=11;t<=13;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		else if(nLen==2)
		{
			GUESSPOS_ADD(0,1,1);
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(t=1;t<=4;t++)
				{
					GUESSPOS_ADD(t,1,1);
				}
			}
			for(t=11;t<=13;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		break;

	case TT_TRANS_PERSON:
		nLen=strlen(p->m_sWords[nIndex]);
		GUESSPOS_ADD(0,1,1);
		if(!IsAllChinese((unsigned char *)p->m_sWords[nIndex]))
		{
			if(IsAllLetter((unsigned char *)p->m_sWords[nIndex]))
			{
				GUESSPOS_ADD(1,1,1);
				GUESSPOS_ADD(11,1,1);
				GUESSPOS_ADD(2,2,1);
				GUESSPOS_ADD(3,2,1);
				GUESSPOS_ADD(12,2,1);
				GUESSPOS_ADD(13,2,1);
			}
			for(t=41;t<=43;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		else if(nLen>=4)
		{
			for(t=41;t<=43;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		else if(nLen==2)
		{
			nCharType=charType((unsigned char *)p->m_sWords[nIndex]);
			if(nCharType==CT_OTHER||nCharType==CT_CHINESE)
			{
				for(t=1;t<=3;t++)
				{
					GUESSPOS_ADD(t,2,1);
				}
				GUESSPOS_ADD(30,8,1);
				for(t=11;t<=13;t++)
				{
					GUESSPOS_ADD(t,4,1);
				}
				for(t=21;t<=23;t++)
				{
					GUESSPOS_ADD(t,2,1);
				}
			}
			for(t=41;t<=43;t++)
			{
				GUESSPOS_ADD(t,8,0);
			}
		}
		break;

	case TT_NORMAL:
	default:
		/* No guesses for these tag types. */
		break;
	}

	*pSubIndex=j;
	return TRUE;
}
#undef GUESSPOS_ADD

int GetFrom(pSpan p,PWORD_RESULT pWordItems,int nIndex,pDictionary dictCore, pDictionary dictUnknown)
{
	int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
	int nFreq=0,j,nRetPos=0,nWordsIndex=0;
	int bSplit=FALSE;
    int k,i=1,nPOSCount;
	char sCurWord[WORD_MAXLENGTH];
	unsigned int nLen;
	nWordsIndex=i+nIndex-1;
	for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
	{
		if(p->m_tagType==TT_NORMAL||!IsExist(dictUnknown,pWordItems[nWordsIndex].sWord,44))
        {
			strcpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord);
   		    p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		else
		{
			if(!bSplit)
			{
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord,2);
				p->m_sWords[i][2]=0;
				bSplit=TRUE;
			}
			else
			{
			    nLen=strlen(pWordItems[nWordsIndex].sWord+2);
				strncpy(p->m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);
				p->m_sWords[i][nLen]=0;
				bSplit=FALSE;
			}
   		    p->m_nWordPosition[i+1]=p->m_nWordPosition[i]+strlen(p->m_sWords[i]);
		}
		p->m_nStartPos=p->m_nWordPosition[i+1];
		if(p->m_tagType!=TT_NORMAL)		{
			strcpy(sCurWord,p->m_sWords[i]);
			if(p->m_tagType==TT_TRANS_PERSON&&i>0&&charType((unsigned char*)p->m_sWords[i-1])==CT_CHINESE)
			{
				if(p->m_sWords[i][0]=='.'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"￡?");
				else if(p->m_sWords[i][0]=='-'&&p->m_sWords[i][1]==0)
					strcpy(sCurWord,"￡-");
			}
			GetHandle(dictUnknown,sCurWord,&nCount,aPOS,aFreq);
			nPOSCount=nCount+1;
			for(j=0;j<nCount;j++)
			{
				p->m_nTags[i][j]=aPOS[j];
				p->m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(GetContextFrequency(p->m_context,0,aPOS[j])+nPOSCount));
			}
			if(strcmp(p->m_sWords[i],"?##?")==0)
			{
				p->m_nTags[i][j]=100;
   				p->m_dFrequency[i][j]=0;
				j++;
			}
			else if(strcmp(p->m_sWords[i],"?##?")==0)
			{
				p->m_nTags[i][j]=101;
   				p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nFreq=0;
				for(k=0;k<nCount;k++)
				{
					nFreq+=aFreq[k];
				}
				if(nCount>0)
				{
					p->m_nTags[i][j]=0;

					p->m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(GetContextFrequency(p->m_context,0,0)+nPOSCount));
					j++;
				}
			}
		}
		else
		{
			j=0;
			if(pWordItems[nWordsIndex].nHandle>0)
			{
				p->m_nTags[i][j]=pWordItems[nWordsIndex].nHandle;
				p->m_dFrequency[i][j]=-log(pWordItems[nWordsIndex].dValue)+log((double)(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+1));
				if(p->m_dFrequency[i][j]<0)
					p->m_dFrequency[i][j]=0;
				j++;
			}
			else
			{
				if(pWordItems[nWordsIndex].nHandle<0)
				{
					p->m_nTags[i][j]=-pWordItems[nWordsIndex].nHandle;
					p->m_dFrequency[i][j++]=pWordItems[nWordsIndex].dValue;

				}
				GetHandle(dictCore,p->m_sWords[i],&nCount,aPOS,aFreq);
				nPOSCount=nCount;
				for(;j<nCount;j++)
				{
					p->m_nTags[i][j]=aPOS[j];
					p->m_dFrequency[i][j]=-log(1+aFreq[j])+log(GetContextFrequency(p->m_context,0,p->m_nTags[i][j])+nPOSCount);
				}
			}
		}
		if(j==0)
		{
			GuessPOS(p,i,&j);
		}
		p->m_nTags[i][j]=-1;
		if(j==1&&p->m_nTags[i][j]!=CT_SENTENCE_BEGIN)
💿 文件大小 2941 K
👤 上传用户 lqlm521
📂 所属分类其他
🏷️ 相关标签

#自然语言处理 #分 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -