📄 segment(sen).c

📁 为自然语言处理领域的中文分词程序
💻 C
📖 第 1 页 / 共 5 页
字号:
		{
			i++;
			p->m_sWords[i][0]=0;
			break;
		}
		if(!bSplit)
			nWordsIndex++;
	}
	if(pWordItems[nWordsIndex].sWord[0]==0)
		nRetPos=-1;

	if(p->m_nTags[i-1][1]!=-1)
	{
		if(p->m_tagType!=TT_NORMAL)
		       p->m_nTags[i][0]=101;
		else
		       p->m_nTags[i][0]=1;

		p->m_dFrequency[i][0]=0;
	    p->m_sWords[i][0]=0;
		p->m_nTags[i++][1]=-1;
	}
	p->m_nCurLength=i;
	if(nRetPos!=-1)
		return nWordsIndex+1;
	return -1;
}

/*
 * SetTagType - store a new tag type in a span.
 *
 * p:     span whose m_tagType field is overwritten.
 * nType: value written to p->m_tagType.
 */
void SetTagType(pSpan p,enum TAG_TYPE  nType)
{
	(*p).m_tagType = nType;
}

/*
 * PlaceRecognize - scan p->m_nBestTag for runs of tags 1/2/3 and record each
 * run as an unknown-word candidate.
 *
 * Walks m_nBestTag from index 1 until an entry <= -1 is found.  A run is
 * started by tag 1 or tag 2 and extends over: further 1s (only when started
 * by 1), then 2s, then 3s.  For every run, the start/end word positions are
 * stored in m_nUnknownWords and the score
 * ComputePossibility(...)+log(penalty) is stored in m_dWordsPossibility.
 *
 * The penalty starts at 1.0 and is never reset between runs; it grows by 1.0
 * for each repeated tag 1 after the second, for each run started by tag 2,
 * and for each tag 3 after the first.
 *
 * dictCore is not used by this function.
 * Always returns TRUE.
 */
int PlaceRecognize(pSpan p,pDictionary dictCore,pDictionary placeDict)
{
	int nCur=1;
	int nFirst=1;
	int nLast=1;
	int nMark;
	double dPenalty=1.0;

	while(p->m_nBestTag[nCur]>-1)
	{
		int nTag=p->m_nBestTag[nCur];

		if(nTag==1||nTag==2)
		{
			nFirst=nCur;
			nLast=nFirst+1;
			if(nTag==1)
			{
				/* Extend over repeated tag 1; repeats beyond the second are penalised. */
				for(;p->m_nBestTag[nLast]==1;nLast++)
				{
					if(nLast>nFirst+1)
						dPenalty+=1.0;
				}
			}
			else
			{
				/* A run that starts directly with tag 2 is penalised once. */
				dPenalty+=1.0;
			}
			for(;p->m_nBestTag[nLast]==2;nLast++)
				;
			nMark=nLast;
			/* Extend over tag 3; every tag 3 after the first is penalised. */
			for(;p->m_nBestTag[nLast]==3;nLast++)
			{
				if(nLast>nMark)
					dPenalty+=1.0;
			}
		}

		if(nLast>nFirst)
		{
			p->m_nUnknownWords[p->m_nUnknownIndex][0]=p->m_nWordPosition[nFirst];
			p->m_nUnknownWords[p->m_nUnknownIndex][1]=p->m_nWordPosition[nLast];
			p->m_dWordsPossibility[p->m_nUnknownIndex++]=ComputePossibility(p,nFirst,nLast-nFirst+1,placeDict)+log(dPenalty);
			nFirst=nLast;
		}

		/* Jump past the run just handled, or step one tag forward. */
		nCur=(nCur<nLast)?nLast:nCur+1;
	}
	return TRUE;
}

/*
 * POSTagging - tag the word list pWordItems chunk by chunk.
 *
 * The span is reset with Reset(p,FALSE), then the loop repeatedly:
 *   - loads the next chunk with GetFrom(), which returns the index to resume
 *     from (the loop stops on a value <= -1 or on an empty word);
 *   - calls GetBestPOS();
 *   - depending on p->m_tagType either copies the chosen tags (and, for
 *     entries with dValue>0 that exist in dictCore, the matching frequency)
 *     back into pWordItems, or runs PersonRecognize / PlaceRecognize;
 *   - calls Reset(p,TRUE).
 *
 * Always returns TRUE.
 */
int POSTagging(pSpan p,PWORD_RESULT pWordItems,pDictionary dictCore,pDictionary dictUnknown)
{
	int nNext=0;
	int nBase;
	int k;

	Reset(p,FALSE);
	for(;;)
	{
		if(!(nNext>-1&&pWordItems[nNext].sWord[0]!=0))
			break;

		nBase=nNext;
		nNext=GetFrom(p,pWordItems,nBase,dictCore,dictUnknown);
		GetBestPOS(p);

		switch(p->m_tagType)
		{
		case TT_NORMAL:
			/* Slot k of the span corresponds to word k+nBase-1 of the input. */
			for(k=1;p->m_nBestTag[k]!=-1&&k<p->m_nCurLength;k+=1)
			{
				PWORD_RESULT pItem=&pWordItems[k+nBase-1];

				pItem->nHandle=p->m_nBestTag[k];
				if(pItem->dValue>0&&IsExist(dictCore,pItem->sWord,-1))
					pItem->dValue=GetFrequency(dictCore,pItem->sWord,p->m_nBestTag[k]);
			}
			break;
		case TT_PERSON:
			PersonRecognize(p,dictUnknown);
			break;
		case TT_PLACE:
		case TT_TRANS_PERSON:
			PlaceRecognize(p,dictCore,dictUnknown);
			break;
		default:
			break;
		}

		Reset(p,TRUE);
	}
	return TRUE;
}

/*
 * ISpan - initialise a span object.
 *
 * Sets the tag type to TT_NORMAL, writes the begin entry in slot 0
 * (tag 0, terminator -1, frequency 0, empty word), resets the length,
 * unknown-word index and start position, and allocates and initialises
 * the context state.
 *
 * The previous version chose the begin tag (100 or 0) by reading
 * p->m_tagType before this initialiser had ever assigned it, i.e. it read
 * an indeterminate value.  Because the function always finishes with
 * m_tagType == TT_NORMAL, the tag type is now set first and the matching
 * begin tag 0 is stored unconditionally.
 *
 * If the context allocation fails, p->m_context is left NULL and
 * IContextState is not called.
 */
void ISpan(pSpan p)
{
	p->m_tagType=TT_NORMAL;

	/* Begin entry for the TT_NORMAL mode selected above. */
	p->m_nTags[0][0]=0;
	p->m_nTags[0][1]=-1;
	p->m_dFrequency[0][0]=0;
	p->m_sWords[0][0]=0;

	p->m_nCurLength=1;
	p->m_nUnknownIndex=0;
	p->m_nStartPos=0;
	p->m_nWordPosition[1]=0;

	p->m_context=(pContextState)malloc(sizeof(struct ContextState));
	if(p->m_context!=NULL)
		IContextState(p->m_context);
}

/*
 * AtomSegment - split a sentence into atoms.
 *
 * Fills p->m_sAtom[], p->m_nAtomLength[] and p->m_nAtomPOS[] and stores the
 * number of atoms in p->m_nAtomCount.  The SENTENCE_BEGIN / SENTENCE_END
 * markers become atoms of their own.  A byte with the high bit set is
 * combined with the following byte into one two-byte character.  Characters
 * of type CT_CHINESE, CT_INDEX, CT_DELIMITER or CT_OTHER always close the
 * current atom; characters of any other type are accumulated until the type
 * of the next character differs or the sentence ends.  A '.' followed by a
 * numeric character is treated as CT_NUM.
 *
 * p:         segmentation graph receiving the atoms.
 * sSentence: NUL-terminated input sentence.
 * Returns TRUE.
 *
 * Changes from the previous version:
 *   - the lead-byte test no longer relies on plain char being signed
 *     (it was `sChar[0]<0`, which is never true where char is unsigned);
 *   - strlen() results are computed once instead of on every loop test;
 *   - a lead byte that is the very last byte of the sentence no longer
 *     advances the index past the terminator.
 *
 * NOTE(review): the sizes of m_sAtom and its rows are not visible here, so
 * the strcpy/strcat calls remain unbounded - confirm against the struct.
 */
int  AtomSegment(pSegGraph p,char *sSentence)
{
	unsigned int i=0,j=0;
	unsigned int nCurType,nNextType;
	unsigned int nSentenceLen=(unsigned int)strlen(sSentence);
	unsigned int nBeginLen=(unsigned int)strlen(SENTENCE_BEGIN);
	unsigned int nEndLen=(unsigned int)strlen(SENTENCE_END);
	char sChar[3];

	sChar[2]=0;
	p->m_sAtom[j][0]=0;
	p->m_nAtomLength[j]=0;
	if(strncmp(sSentence,SENTENCE_BEGIN,nBeginLen)==0)
	{
		strcpy(p->m_sAtom[j],SENTENCE_BEGIN);
		p->m_nAtomLength[j]=nBeginLen;
		p->m_nAtomPOS[j]=CT_SENTENCE_BEGIN;
		i+=p->m_nAtomLength[j];
		j+=1;
		p->m_sAtom[j][0]=0;
		p->m_nAtomLength[j]=0;
	}
	while(i<nSentenceLen)
	{
		if(strncmp(sSentence+i,SENTENCE_END,nEndLen)==0)
		{
			strcpy(p->m_sAtom[j],SENTENCE_END);
			p->m_nAtomLength[j]=nEndLen;
			p->m_nAtomPOS[j]=CT_SENTENCE_END;
			i+=p->m_nAtomLength[j];
			j+=1;
			p->m_sAtom[j][0]=0;
			p->m_nAtomLength[j]=0;
			continue;
		}
		sChar[0]=*(sSentence+i);
		sChar[1]=0;
		i+=1;
		/* High bit set: two-byte character, take the trail byte if there is one. */
		if((((unsigned char)sChar[0])&0x80)!=0&&i<nSentenceLen)
		{
			sChar[1]=*(sSentence+i);
			i+=1;
		}
		strcat(p->m_sAtom[j],sChar);
		nCurType=charType((unsigned char *)sChar);
		/* A dot directly followed by a number belongs to that number. */
		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
			nCurType=CT_NUM;
		p->m_nAtomPOS[j]=nCurType;
		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
		{
			/* These types are always single-character atoms. */
			p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
			j+=1;
			p->m_sAtom[j][0]=0;
		}
		else
		{
			/* Other types keep accumulating while the next character has the same type. */
			nNextType=255;
			if(i<nSentenceLen)
				nNextType=charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType||i==nSentenceLen)
			{
				p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
				j+=1;
				p->m_sAtom[j][0]=0;
			}
		}
	}
	p->m_nAtomCount=j;
	return TRUE;
}


/*
 * GenerateWordNet - build the word graph for a sentence.
 *
 * Steps:
 *   1. Empty p->m_segGraph (when it is non-NULL) and atomise the sentence
 *      with AtomSegment().
 *   2. Add one edge (i,i+1) per atom.  Chinese atoms get log(MAX_FREQUENCE)
 *      (or 0 and the atom text when bOriginalFreq is set); other atoms get a
 *      POS value chosen from their atom type and, for numbers and letter
 *      strings, a placeholder word.
 *   3. For every start atom i, keep appending following atoms to sWord while
 *      GetMaxMatch() succeeds, and add an edge (i,j) each time the
 *      accumulated word is exactly the match returned by the dictionary.
 *
 * p:             segmentation graph to fill.
 * sSentence:     NUL-terminated input sentence.
 * dictCore:      dictionary used for matching and frequencies.
 * bOriginalFreq: non-zero stores raw frequencies and word text in the graph;
 *                zero stores -log(freq+1)+log(MAX_FREQUENCE) and no text.
 * Returns TRUE.
 *
 * Change from the previous version: copies into the fixed-size sWord buffer
 * are bounded.  Atoms are copied with truncation to WORD_MAXLENGTH-1 bytes,
 * and word extension stops before an append would overflow sWord.  Input
 * that fitted before is handled exactly as before.
 *
 * NOTE(review): the strlen(sWord)==4 / strncmp(...,2) tests assume each
 * Chinese character in the literals occupies two bytes (presumably a
 * GBK/GB2312-encoded source file) - confirm the file encoding.
 * NOTE(review): nMatchHandle/nMatchFreq hold 20 entries; whether GetHandle
 * can report more is not visible here.
 */
int GenerateWordNet(pSegGraph p,char *sSentence,pDictionary dictCore,int bOriginalFreq)
{
	unsigned int j,i=0;
	char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	double dValue=0;
	p->m_nAtomCount=0;
	if(p->m_segGraph){
		SetEmpty(p->m_segGraph);
	}
	AtomSegment(p,sSentence);
	/* Pass 1: one edge per atom. */
	for(i=0;i<p->m_nAtomCount;i++)
	{
		if(p->m_nAtomPOS[i]==CT_CHINESE)
		{
			if(!bOriginalFreq)
				SetElement(p->m_segGraph,i,i+1,log(MAX_FREQUENCE),0,0);
			else
				SetElement(p->m_segGraph,i,i+1,0,0,p->m_sAtom[i]);
		}
		else
		{
			/* Bounded copy: strncpy does not terminate on truncation, so terminate explicitly. */
			strncpy(sWord,p->m_sAtom[i],sizeof(sWord)-1);
			sWord[sizeof(sWord)-1]=0;
			dValue=MAX_FREQUENCE;
			switch(p->m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS=-27904;
				strcpy(sWord,"未##数");
				dValue=0;
				break;
			case CT_DELIMITER:
				nPOS=30464;
				break;
			case CT_LETTER:
				nPOS=-'n'*256-'x';
				dValue=0;
				strcpy(sWord,"未##串");
				break;
			case CT_SINGLE:
				/* Made only of sign/digit characters: number placeholder, otherwise string placeholder. */
				if(GetCharCount("+-1234567890",p->m_sAtom[i])==(int)strlen(p->m_sAtom[i]))
				{
					nPOS=-27904;
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=-'n'*256-'x';
					strcpy(sWord,"未##串");
				}
				dValue=0;
				break;
			default:
				nPOS=p->m_nAtomPOS[i];
				break;
			}
			if(!bOriginalFreq)
				SetElement(p->m_segGraph,i,i+1,0,nPOS,0);
			else
				SetElement(p->m_segGraph,i,i+1,dValue,nPOS,sWord);
		}
	}
	/* Pass 2: multi-atom dictionary words starting at each atom. */
	i=0;
	while(i<p->m_nAtomCount)
	{
		strncpy(sWord,p->m_sAtom[i],sizeof(sWord)-1);
		sWord[sizeof(sWord)-1]=0;
		j=i+1;
		if(strcmp(sWord,"月")==0&&strcmp(p->m_sAtom[i+1],"份")==0)
			j+=1;
		while(j<=p->m_nAtomCount&&GetMaxMatch(dictCore,sWord,sWordMatch,&nHandleTemp))
		{
			if(strcmp(sWordMatch,sWord)==0)
			{
				nTotalFreq=0;

				GetHandle(dictCore,sWord,&nMatchCount,nMatchHandle,nMatchFreq);
				for(k=0;k<nMatchCount;k++)
				{
					nTotalFreq+=nMatchFreq[k];
				}
				/* After a number, a 4-byte word starting with 年/月 whose second
				   character is in the listed set ends the extension for this start atom. */
				if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)p->m_sAtom[i-1])
					||IsAllChineseNum(p->m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0
					||strncmp(sWord,"月",2)==0))
				{
					if(CC_Find("末内中底前间初",sWord+2))
						break;
				}
				if(nMatchCount==1)
				{
					if(!bOriginalFreq)
						SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0],0);
					else
						SetElement(p->m_segGraph,i,j,nTotalFreq,nMatchHandle[0],sWord);
				}
				else
				{
					if(!bOriginalFreq)
						SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0,0);
					else
						SetElement(p->m_segGraph,i,j,nTotalFreq,0,sWord);
				}
			}
			/* Stop extending rather than write past the end of sWord. */
			if(strlen(sWord)+strlen(p->m_sAtom[j])>=sizeof(sWord))
				break;
			strcat(sWord,p->m_sAtom[j++]);
		}
		i+=1;
	}
	return TRUE;
}


/*
 * USegGraph - release the dynamic array owned by a segmentation graph.
 *
 * Frees the block pointed to by p->m_segGraph and leaves the field NULL.
 *
 * NOTE(review): only the array object itself is freed here; whether the
 * elements stored in it need a separate release (e.g. via SetEmpty) cannot
 * be told from this function - confirm against the DynamicArray code.
 */
void USegGraph(pSegGraph p)
{
	pDynamicArray pGraph=p->m_segGraph;

	p->m_segGraph=NULL;
	free(pGraph);
}

/*
 * ISegGraph - allocate and initialise the dynamic array of a segmentation
 * graph.
 *
 * On success p->m_segGraph points to a new array initialised with
 * IDynamicArray(...,FALSE) and switched to row-first mode with
 * SetRowFirst(...,TRUE).
 *
 * If the allocation fails, p->m_segGraph is left NULL and the initialisers
 * are not called (previously the NULL pointer was handed to IDynamicArray).
 */
void ISegGraph(pSegGraph p)
{
	p->m_segGraph=(pDynamicArray)malloc(sizeof(struct DynamicArray));
	if(p->m_segGraph==NULL)
		return;
	IDynamicArray(p->m_segGraph,FALSE);
	SetRowFirst(p->m_segGraph,TRUE);
}

/*
 * ShortPath - compute, for every node 1..m_nVertex-1, up to m_nValueKind
 * best path weights and the parent entries that produce them.
 *
 * For each node the incoming edges are read from p->m_apCost.  Every
 * (parent, rank) combination with a finite weight is pushed on a work
 * queue, then the queue is popped to fill p->m_pWeight[node-1][] and
 * p->m_pParent[node-1][].  Entries that share the weight of the current
 * rank are added to that rank's parent queue; a larger weight opens the
 * next rank.
 *
 * Always returns 1.
 *
 * Changes from the previous version (memory leaks):
 *   - pEdgeList was malloc'd and then immediately overwritten through the
 *     out-parameter of GetElementValue(); it now starts as NULL.
 *   - the work queue was malloc'd once per node and never released; it is
 *     now a local object whose remaining entries are popped before the
 *     next node is processed.
 *
 * NOTE(review): draining assumes Pop(...,TRUE,TRUE) removes and releases
 * the head entry, as its use in the main loop suggests - confirm against
 * the Queue implementation.
 */
int ShortPath(pNShortPath p)
{
	unsigned int nPreNode,i,nIndex,nCurNode;
	double eWeight;
	PARRAY_CHAIN pEdgeList=NULL;
	struct Queue queWork;

	for(nCurNode=1;nCurNode<p->m_nVertex;nCurNode++)
	{
		IQueue(&queWork);
		eWeight=GetElementValue(p->m_apCost,-1,nCurNode,NULL,&pEdgeList);
		/* Collect candidate weights from every edge that ends at nCurNode. */
		while(pEdgeList!=0 && pEdgeList->col==nCurNode)
		{
			nPreNode=pEdgeList->row;
			eWeight=pEdgeList->value;
			for(i=0;i<p->m_nValueKind;i++)
			{
				if(nPreNode>0)
				{
					/* Ranks are filled in order, so the first infinite one ends the list. */
					if(p->m_pWeight[nPreNode-1][i]==INFINITE_VALUE)
						break;
					Push(&queWork,nPreNode,i,eWeight+p->m_pWeight[nPreNode-1][i]);
				}
				else
				{
					/* Node 0 has no stored weights: the edge weight is the path weight. */
					Push(&queWork,nPreNode,i,eWeight);
					break;
				}
			}
			pEdgeList=pEdgeList->next;
		}
		for(i=0;i<p->m_nValueKind;i++)
		{
			p->m_pWeight[nCurNode-1][i]=INFINITE_VALUE;
		}
		i=0;
		while(i<p->m_nValueKind&&Pop(&queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
		{
			if(p->m_pWeight[nCurNode-1][i]==INFINITE_VALUE)
				p->m_pWeight[nCurNode-1][i]=eWeight;
			else if(p->m_pWeight[nCurNode-1][i]<eWeight)
			{
				i++;
				if(i==p->m_nValueKind)
					break;
				p->m_pWeight[nCurNode-1][i]=eWeight;
			}
			Push(&p->m_pParent[nCurNode-1][i],nPreNode,nIndex,0);
		}
		/* Discard candidates that did not make it into the best ranks. */
		while(Pop(&queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
			;
	}
	return 1;
}

/*
 * GetPaths - enumerate paths that end at (nNode,nIndex) by following the
 * parent queues in p->m_pParent, writing each path into nResult.
 *
 * Each found path is stored as a list of node numbers in
 * nResult[p->m_nResultCount], terminated by -1, and p->m_nResultCount is
 * incremented.  Enumeration stops when MAX_SEGMENT_NUM results exist, when
 * bBest is set and one path has been produced, or when the working queue
 * is empty.
 *
 * p:       path object holding m_pParent and m_nResultCount.
 * nNode:   end node of the paths.
 * nIndex:  rank of the end node's parent queue to start from.
 * nResult: output rows, one per path.
 * bBest:   non-zero to stop after the first path.
 *
 * Change from the previous version (memory leak): the working queue was
 * malloc'd before the result-count guard and never released on any return
 * path.  It is now a local object, set up only after the guard, and its
 * remaining entries are popped on every exit.  The traversal itself is
 * unchanged.
 *
 * NOTE(review): draining assumes Pop(...,TRUE,TRUE) removes and releases
 * the head entry - confirm against the Queue implementation.
 */
void GetPaths(pNShortPath p,unsigned int nNode,unsigned int nIndex,int **nResult,int bBest)
{
	struct Queue queResult;
	int bFirstGet;
	unsigned int nCurNode,nCurIndex,nParentNode;
	unsigned int nParentIndex,nResultIndex=0;
	unsigned int nDropNode,nDropIndex;

	if(p->m_nResultCount>=MAX_SEGMENT_NUM)
		return ;
	IQueue(&queResult);
	nResult[p->m_nResultCount][nResultIndex]=-1;
	Push(&queResult,nNode,nIndex,0);
	nCurNode=nNode;
	nCurIndex=nIndex;
	while(!IsEmpty(&queResult,FALSE))
	{
		/* Walk towards node 0, stacking every node visited on the way. */
		while(nCurNode>0)
		{
			if(Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,TRUE)!=-1)
			{
				nCurNode=nParentNode;
				nCurIndex=nParentIndex;
			}
			if(nCurNode>0)
				Push(&queResult,nCurNode,nCurIndex,0);
		}
		if(nCurNode==0)
		{
			/* Reached the start: emit node 0 followed by the stacked nodes. */
			nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
			bFirstGet=TRUE;
			nParentNode=nCurNode;
			while(Pop(&queResult,&nCurNode,&nCurIndex,0,FALSE,bFirstGet)!=-1)
			{
				nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
				bFirstGet=FALSE;
				nParentNode=nCurNode;
			}
			nResult[p->m_nResultCount][nResultIndex]=-1;
			p->m_nResultCount+=1;
			if(p->m_nResultCount>=MAX_SEGMENT_NUM)
				goto cleanup;
			nResultIndex=0;
			nResult[p->m_nResultCount][nResultIndex]=-1;
			if(bBest)
				goto cleanup;
		}
		/* Backtrack: drop stacked nodes whose parent queue offers no further alternative. */
		Pop(&queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
		while((IsEmpty(&queResult,FALSE))==FALSE&&(IsSingle(&p->m_pParent[nCurNode-1][nCurIndex])||IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)))
		{
			Pop(&queResult,&nCurNode,&nCurIndex,0,TRUE,TRUE);
			Pop(&queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
		}
		if(IsEmpty(&queResult,FALSE)==FALSE&&IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)==FALSE)
		{
			/* Continue with the next alternative parent of the node on top. */
			Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,FALSE);
			nCurNode=nParentNode;
			nCurIndex=nParentIndex;
			if(nCurNode>0)
				Push(&queResult,nCurNode,nCurIndex,0);
		}
	}
cleanup:
	/* Release whatever is still stacked in the working queue. */
	while(Pop(&queResult,&nDropNode,&nDropIndex,0,TRUE,TRUE)!=-1)
		;
}
/*
 * OutputPath - collect the computed paths into nResult.
 *
 * Resets p->m_nResultCount, then calls GetPaths() for every rank of the
 * last node whose weight is below INFINITE_VALUE.  A graph with fewer than
 * two vertices yields the single fixed result {0,1}.
 *
 * p:       path object; ShortPath() is expected to have filled m_pWeight.
 * nResult: output rows, one per path.
 * bBest:   non-zero to stop after the first path.
 * npCount: receives the number of paths written.
 * Always returns 1.
 *
 * Change from the previous version: *npCount is set to 0 up front, so the
 * caller gets a defined count when no rank of the last node has a finite
 * weight (previously *npCount was left untouched in that case).
 */
int OutputPath( pNShortPath p,int **nResult,int bBest,int *npCount)
{
  unsigned int i;
  p->m_nResultCount=0;
  *npCount=0;
  if(p->m_nVertex<2)
  {
	  nResult[0][0]=0;
	  nResult[0][1]=1;
	  *npCount=1;
	  return 1;
  }
  for(i=0;i<p->m_nValueKind&&p->m_pWeight[p->m_nVertex-2][i]<INFINITE_VALUE;i++)
  {
	  GetPaths(p,p->m_nVertex-1,i,nResult,bBest);
	  *npCount=p->m_nResultCount;
	  if(nResult[i][0]!=-1&&bBest)
		  return 1;
	  if(p->m_nResultCount>=MAX_SEGMENT_NUM)
		  return 1;
  }
  return 1;
}


void UNShortPath(struct NShortPath *p)
{    
	 unsigned int i;
	 for(i=0;i<p->m_nVertex-1;i++) 
	 {
		 free(p->m_pWeight[i]);
		 p->m_pWeight[i]=NULL;
         free( p->m_pParent[i]);
		 p->m_pParent[i]=NULL;		
	 }
	 free(p->m_pWeight);
	 p->m_pWeight=NULL;
	 free(p->m_pParent);
💿 文件大小 2941 K
👤 上传用户 lqlm521
📂 所属分类:其他
🏷️ 相关标签

#自然语言处理 #分 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -