⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 seg12_13(file).c

📁 为自然语言处理领域的中文分词程序
💻 C
📖 第 1 页 / 共 5 页
字号:
	{
		nStartPos=i;
		i=GetFrom(p,pWordItems,nStartPos,dictCore,dictUnknown);
		GetBestPOS(p);
		switch(p->m_tagType)
		{
		case TT_NORMAL:
			j=1;
			while(p->m_nBestTag[j]!=-1&&j<p->m_nCurLength)
			{
				pWordItems[j+nStartPos-1].nHandle=p->m_nBestTag[j];

				if(pWordItems[j+nStartPos-1].dValue>0&&IsExist(dictCore,pWordItems[j+nStartPos-1].sWord,-1))
					pWordItems[j+nStartPos-1].dValue=GetFrequency(dictCore,pWordItems[j+nStartPos-1].sWord,p->m_nBestTag[j]);
				j+=1;
			}
			break;
		case TT_PERSON:
			PersonRecognize(p,dictUnknown);
			break;
		case TT_PLACE:
		case TT_TRANS_PERSON:
			PlaceRecognize(p,dictCore,dictUnknown);
			break;
		default:
			break;
		}
		Reset(p,TRUE);
	}
	return TRUE;
}

/*
 * ISpan - put a Span into its initial state.
 *
 * p: Span to initialise. Nothing in *p is read before it has been written
 *    here, so the caller may pass freshly malloc'ed memory (as IUnknowWord
 *    does).
 *
 * Allocates p->m_context with malloc and initialises it with
 * IContextState; m_context is left NULL if the allocation fails.
 */
void ISpan(pSpan p)
{
	/*
	 * Set the tag type first. The previous code tested m_tagType before
	 * assigning it, which read an indeterminate value and could leave
	 * m_nTags[0][0] at 100 although the type ends up as TT_NORMAL.
	 */
	p->m_tagType=TT_NORMAL;
	p->m_nTags[0][0]=0;	/* value selected for TT_NORMAL */
	p->m_nTags[0][1]=-1;

	p->m_context=(pContextState)malloc(sizeof(struct ContextState));
	if(p->m_context!=NULL)
		IContextState(p->m_context);

	p->m_dFrequency[0][0]=0;
	p->m_nCurLength=1;
	p->m_nUnknownIndex=0;
	p->m_nStartPos=0;
	p->m_nWordPosition[1]=0;
	p->m_sWords[0][0]=0;
}

/*
 * AtomSegment - cut sSentence into atoms stored in p->m_sAtom[].
 *
 * p:         segment graph receiving the atoms; for each atom j the text is
 *            written to m_sAtom[j], its byte length to m_nAtomLength[j] and
 *            its character type to m_nAtomPOS[j]. m_nAtomCount is set to the
 *            number of completed atoms.
 * sSentence: NUL-terminated input text.
 *
 * SENTENCE_BEGIN (only at the very start) and SENTENCE_END markers become
 * atoms of their own. Characters typed CT_CHINESE, CT_INDEX, CT_DELIMITER or
 * CT_OTHER each form one atom; characters of any other type are merged with
 * the following characters as long as charType reports the same type.
 *
 * Returns TRUE.
 *
 * NOTE(review): writes into m_sAtom[] are not bounds-checked because the
 * array dimensions are not visible here - confirm against the struct.
 */
int  AtomSegment(pSegGraph p,char *sSentence)
{
	unsigned int i=0,j=0;
	unsigned int nCurType,nNextType;
	/* The sentence is not modified below, so its length is computed once. */
	unsigned int nLen=(unsigned int)strlen(sSentence);
	char sChar[3];

	sChar[2]=0;
	p->m_sAtom[j][0]=0;
	p->m_nAtomLength[j]=0;

	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
	{
		strcpy(p->m_sAtom[j],SENTENCE_BEGIN);
		p->m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
		p->m_nAtomPOS[j]=CT_SENTENCE_BEGIN;
		i+=p->m_nAtomLength[j];
		j+=1;
		p->m_sAtom[j][0]=0;
		p->m_nAtomLength[j]=0;
	}

	while(i<nLen)
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{
			strcpy(p->m_sAtom[j],SENTENCE_END);
			p->m_nAtomLength[j]=strlen(SENTENCE_END);
			p->m_nAtomPOS[j]=CT_SENTENCE_END;
			i+=p->m_nAtomLength[j];
			j+=1;
			p->m_sAtom[j][0]=0;
			p->m_nAtomLength[j]=0;
			continue;
		}

		/* Fetch one character: one byte, or two when the high bit is set. */
		sChar[0]=sSentence[i];
		sChar[1]=0;
		i+=1;
		/*
		 * Compare as unsigned char: the signedness of plain char is
		 * implementation-defined, so "sChar[0]<0" never fires where char
		 * is unsigned.
		 */
		if((unsigned char)sChar[0]>=0x80)
		{
			sChar[1]=sSentence[i];
			i+=1;
		}

		strcat(p->m_sAtom[j],sChar);
		nCurType=charType((unsigned char *)sChar);
		/* A '.' directly followed by a number character is typed CT_NUM. */
		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
			nCurType=CT_NUM;
		p->m_nAtomPOS[j]=nCurType;

		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
		{
			/* These types always end the atom after a single character. */
			p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
			j+=1;
			p->m_sAtom[j][0]=0;
		}
		else
		{
			/* Other types: end the atom when the next type differs or input ends. */
			nNextType=255;
			if(i<nLen)
				nNextType=charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType||i==nLen)
			{
				p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
				j+=1;
				p->m_sAtom[j][0]=0;
			}
		}
	}
	p->m_nAtomCount=j;
	return TRUE;
}


/*
 * GenerateWordNet - atomise sSentence and record candidate words as
 * elements of p->m_segGraph.
 *
 * p:             segment graph; m_nAtomCount is reset, m_segGraph is emptied
 *                when non-NULL, then AtomSegment fills the atom arrays.
 * sSentence:     NUL-terminated sentence to segment.
 * dictCore:      dictionary queried through GetMaxMatch / GetHandle.
 * bOriginalFreq: zero     - elements carry log-scaled values and no word text;
 *                non-zero - elements carry the raw value and the word text.
 *
 * Pass 1 adds one element (i,i+1) per atom. Pass 2 starts at every atom i,
 * keeps appending the following atoms to sWord while GetMaxMatch succeeds,
 * and adds an element (i,j) whenever the match equals sWord exactly.
 *
 * Returns TRUE.
 */
int GenerateWordNet(pSegGraph p,char *sSentence,pDictionary dictCore,int bOriginalFreq)
{
	unsigned int j,i=0;
 	char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	double dValue=0;
	p->m_nAtomCount=0;
	if(p->m_segGraph){
		SetEmpty(p->m_segGraph);
	}
	AtomSegment(p,sSentence);

	/* Pass 1: one element per atom. */
    for(i=0;i<p->m_nAtomCount;i++)
    {
		if(p->m_nAtomPOS[i]==CT_CHINESE)
		{
			if(!bOriginalFreq)
				SetElement(p->m_segGraph,i,i+1,log(MAX_FREQUENCE),0,0);
			else
				SetElement(p->m_segGraph,i,i+1,0,0,p->m_sAtom[i]);
		}
		else
		{
			/* Non-Chinese atoms: choose POS code, stored word and value by type. */
			strcpy(sWord,p->m_sAtom[i]);
			dValue=MAX_FREQUENCE;
			switch(p->m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS=-27904;
				strcpy(sWord,"未##数");
				dValue=0;
				break;
			case CT_DELIMITER:
				nPOS=30464;
				break;
			case CT_LETTER:
				nPOS=-'n'*256-'x';
				dValue=0;
				strcpy(sWord,"未##串");
				break;
			case CT_SINGLE:
				/* Atoms made only of digits and signs use the number placeholder. */
				if(GetCharCount("+-1234567890",p->m_sAtom[i])==(int)strlen(p->m_sAtom[i]))
				{
					nPOS=-27904;
					strcpy(sWord,"未##数");
				}
				else
				{
					nPOS=-'n'*256-'x';
					strcpy(sWord,"未##串");
				}
				dValue=0;
				break;
			default:
				nPOS=p->m_nAtomPOS[i];
				break;
			}
			if(!bOriginalFreq)
				SetElement(p->m_segGraph,i,i+1,0,nPOS,0);
			else
				SetElement(p->m_segGraph,i,i+1,dValue,nPOS,sWord);
		}
    }

	/* Pass 2: multi-atom words found in the dictionary. */
	i=0;
	while(i<p->m_nAtomCount)
	{
		strcpy(sWord,p->m_sAtom[i]);
		j=i+1;
		/* "月" directly followed by "份": advance the end index by one. */
		if(strcmp(sWord,"月")==0&&strcmp(p->m_sAtom[i+1],"份")==0)
			j+=1;
		while(j<=p->m_nAtomCount&&GetMaxMatch(dictCore,sWord,sWordMatch,&nHandleTemp))
		{
			if(strcmp(sWordMatch,sWord)==0)
			{
				nTotalFreq=0;

				GetHandle(dictCore,sWord,&nMatchCount,nMatchHandle,nMatchFreq);
				for(k=0;k<nMatchCount;k++)
				{
					nTotalFreq+=nMatchFreq[k];
				}
				/*
				 * After an all-number atom, a 4-byte word that starts with
				 * "年" or "月" and ends with one of the listed characters
				 * stops the extension from this start position.
				 */
				if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)p->m_sAtom[i-1])
					||IsAllChineseNum(p->m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0
					||strncmp(sWord,"月",2)==0))
				{
					if(CC_Find("末内中底前间初",sWord+2))
						break;
				}
				if(nMatchCount==1)
				{
					/* A single handle: it is stored with the element. */
					if(!bOriginalFreq)
						SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0],0);
					else
						SetElement(p->m_segGraph,i,j,nTotalFreq,nMatchHandle[0],sWord);
				}
				else
				{
					/* Several handles: the element is stored with handle 0. */
					if(!bOriginalFreq)
						SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0,0);
					else
						SetElement(p->m_segGraph,i,j,nTotalFreq,0,sWord);
				}
			}
			/*
			 * Stop extending once the next atom would not fit in sWord;
			 * the unchecked strcat could write past the end of the buffer.
			 */
			if(strlen(sWord)+strlen(p->m_sAtom[j])>=sizeof(sWord))
				break;
			strcat(sWord,p->m_sAtom[j++]);
		}
		i+=1;
	}
	return TRUE;
}


/*
 * USegGraph - free the DynamicArray object owned by the segment graph and
 * clear the pointer so it cannot be used again.
 *
 * NOTE(review): only the DynamicArray struct itself is freed here, whereas
 * UNShortPath calls UDynamicArray before free - confirm whether the array
 * contents also need releasing at this point.
 */
void USegGraph(pSegGraph p)
{
	pDynamicArray pGraph=p->m_segGraph;

	p->m_segGraph=NULL;
	free(pGraph);
}

/*
 * ISegGraph - allocate and initialise the DynamicArray behind the segment
 * graph.
 *
 * p: segment graph whose m_segGraph member is (re)assigned.
 *
 * On allocation failure m_segGraph is left NULL and nothing else is done;
 * previously the NULL pointer was handed straight to IDynamicArray.
 */
void ISegGraph(pSegGraph p)
{
	p->m_segGraph=(pDynamicArray)malloc(sizeof(struct DynamicArray));
	if(p->m_segGraph==NULL)
		return;
	IDynamicArray(p->m_segGraph,FALSE);
	SetRowFirst(p->m_segGraph,TRUE);
}

/*
 * ShortPath - fill m_pWeight and m_pParent for every vertex 1..m_nVertex-1.
 *
 * For each vertex nCurNode the entries of m_apCost whose col equals nCurNode
 * are walked; for every such entry the candidate weight (entry value plus the
 * stored weight of the predecessor, or the entry value alone when the
 * predecessor is vertex 0) is pushed onto a work queue. The queue is then
 * popped and up to m_nValueKind distinct weights are written to
 * m_pWeight[nCurNode-1][], with the matching (predecessor, index) pairs
 * pushed onto m_pParent[nCurNode-1][].
 *
 * Returns 1.
 *
 * Changes from the earlier version: the throw-away malloc for pEdgeList
 * (immediately overwritten through the out-parameter) and the per-vertex
 * malloc of the work queue were never freed. The edge pointer now starts as
 * NULL and the work queue is a local that is drained after each vertex.
 */
int ShortPath(pNShortPath p)
{
	unsigned int nPreNode,i,nIndex,nCurNode;
	double eWeight;
	PARRAY_CHAIN pEdgeList=NULL;
	struct Queue queWork;

	for(nCurNode=1;nCurNode<p->m_nVertex;nCurNode++)
	{
		IQueue(&queWork);

		/* Position pEdgeList for column nCurNode; the returned value is unused. */
		eWeight=GetElementValue(p->m_apCost,-1,nCurNode,NULL,&pEdgeList);
		while(pEdgeList!=0 && pEdgeList->col==nCurNode)
		{
			nPreNode=pEdgeList->row;
			eWeight=pEdgeList->value;
			for(i=0;i<p->m_nValueKind;i++)
			{
				if(nPreNode>0)
				{
					/* Predecessor has no further stored weights. */
					if(p->m_pWeight[nPreNode-1][i]==INFINITE_VALUE)
						break;
					Push(&queWork,nPreNode,i,eWeight+p->m_pWeight[nPreNode-1][i]);
				}
				else
				{
					/* Vertex 0 contributes the edge weight only, once. */
					Push(&queWork,nPreNode,i,eWeight);
					break;
				}
			}
			pEdgeList=pEdgeList->next;
		}

		for(i=0;i<p->m_nValueKind;i++)
		{
			p->m_pWeight[nCurNode-1][i]=INFINITE_VALUE;
		}

		i=0;
		while(i<p->m_nValueKind&&Pop(&queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
		{
			if(p->m_pWeight[nCurNode-1][i]==INFINITE_VALUE)
				p->m_pWeight[nCurNode-1][i]=eWeight;
			else if(p->m_pWeight[nCurNode-1][i]<eWeight)
			{
				/* A larger weight opens the next slot, if one is left. */
				i++;
				if(i==p->m_nValueKind)
					break;
				p->m_pWeight[nCurNode-1][i]=eWeight;
			}
			Push(&p->m_pParent[nCurNode-1][i],nPreNode,nIndex,0);
		}

		/*
		 * Remove whatever is still queued so nothing outlives this
		 * iteration. Relies on Pop(...,TRUE,TRUE) removing one item and
		 * returning -1 once empty, as the loop above already does.
		 */
		while(Pop(&queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
			;
	}
	return 1;
}

/*
 * GetPaths - write the paths that end at (nNode, nIndex) into nResult.
 *
 * p:       N-shortest-path state; m_pParent is read, m_nResultCount is
 *          incremented for every path written.
 * nNode:   vertex the paths end at.
 * nIndex:  index into m_pParent[nNode-1][] to start from.
 * nResult: output rows; row m_nResultCount receives the vertex numbers of a
 *          path starting with 0, terminated by -1.
 * bBest:   non-zero - stop after the first path.
 *
 * Nothing is written once m_nResultCount has reached MAX_SEGMENT_NUM.
 *
 * Changes from the earlier version: the working queue was malloc'ed and
 * never released on any exit path (including the early return taken right
 * after allocating it). It is now a local that is drained before returning.
 */
void GetPaths(pNShortPath p,unsigned int nNode,unsigned int nIndex,int **nResult,int bBest)
{
	struct Queue queStorage;
	pQueue queResult=&queStorage;
	int bFirstGet;
	unsigned int nCurNode,nCurIndex,nParentNode;
	unsigned int nParentIndex,nResultIndex=0;

	if(p->m_nResultCount>=MAX_SEGMENT_NUM)
		return ;

	IQueue(queResult);
	nResult[p->m_nResultCount][nResultIndex]=-1;
	Push(queResult,nNode,nIndex,0);
	nCurNode=nNode;
	nCurIndex=nIndex;

	while(!IsEmpty(queResult,FALSE))
	{
		/* Follow parent links towards vertex 0, queueing each vertex passed. */
		while(nCurNode>0)
		{
			if(Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,TRUE)!=-1)
			{
				nCurNode=nParentNode;
				nCurIndex=nParentIndex;
			}
			if(nCurNode>0)
				Push(queResult,nCurNode,nCurIndex,0);
		}

		if(nCurNode==0)
		{
			/* Reached vertex 0: copy the queued vertices out as one path. */
			nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
			bFirstGet=TRUE;
			nParentNode=nCurNode;
			while(Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,bFirstGet)!=-1)
			{
				nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
				bFirstGet=FALSE;
				nParentNode=nCurNode;
			}
			nResult[p->m_nResultCount][nResultIndex]=-1;
			p->m_nResultCount+=1;
			if(p->m_nResultCount>=MAX_SEGMENT_NUM)
				break;
			nResultIndex=0;
			nResult[p->m_nResultCount][nResultIndex]=-1;
			if(bBest)
				break;
		}

		/* Drop queued vertices until one with another parent still to visit. */
		Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
		while((IsEmpty(queResult,FALSE))==FALSE&&(IsSingle(&p->m_pParent[nCurNode-1][nCurIndex])||IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)))
		{
			Pop(queResult,&nCurNode,&nCurIndex,0,TRUE,TRUE);
			Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
		}
		if(IsEmpty(queResult,FALSE)==FALSE&&IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)==FALSE)
		{
			/* Continue from the next parent of that vertex. */
			Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,FALSE);
			nCurNode=nParentNode;
			nCurIndex=nParentIndex;
			if(nCurNode>0)
				Push(queResult,nCurNode,nCurIndex,0);
		}
	}

	/*
	 * Empty the working queue before it goes out of scope. Uses the same
	 * removing form of Pop as the loop above.
	 */
	while(Pop(queResult,&nCurNode,&nCurIndex,0,TRUE,TRUE)!=-1)
		;
}
/*
 * OutputPath - collect result paths for the last vertex into nResult.
 *
 * p:       N-shortest-path state; m_nResultCount is reset to 0 first.
 * nResult: output rows filled by GetPaths.
 * bBest:   non-zero - return as soon as the row for the current index holds
 *          a path.
 * npCount: receives m_nResultCount after every GetPaths call; it is not
 *          written when no index has a weight below INFINITE_VALUE.
 *
 * With fewer than two vertices the single result {0, 1} is stored in row 0
 * and *npCount becomes 1.
 *
 * Returns 1.
 */
int OutputPath( pNShortPath p,int **nResult,int bBest,int *npCount)
{
	unsigned int nRank=0;

	p->m_nResultCount=0;

	if(p->m_nVertex < 2)
	{
		nResult[0][0]=0;
		nResult[0][1]=1;
		*npCount=1;
		return 1;
	}

	while(nRank<p->m_nValueKind)
	{
		/* Stop at the first index of the last vertex without a finite weight. */
		if(!(p->m_pWeight[p->m_nVertex-2][nRank]<INFINITE_VALUE))
			break;

		GetPaths(p,p->m_nVertex-1,nRank,nResult,bBest);
		*npCount=p->m_nResultCount;

		if(nResult[nRank][0]!=-1&&bBest)
			break;
		if(p->m_nResultCount>=MAX_SEGMENT_NUM)
			break;

		nRank++;
	}
	return 1;
}


/*
 * UNShortPath - release everything reachable from an NShortPath.
 *
 * Frees each of the m_nVertex-1 weight rows and parent-queue rows, then the
 * two row tables, then calls UDynamicArray on m_apCost and frees it. Every
 * pointer is set to NULL after it has been freed.
 *
 * NOTE(review): m_apCost is supplied by the caller of INShortPath yet is
 * freed here - confirm the caller does not release it as well.
 */
void UNShortPath(struct NShortPath *p)
{
	unsigned int nRow=0;

	while(nRow<p->m_nVertex-1)
	{
		free(p->m_pWeight[nRow]);
		p->m_pWeight[nRow]=NULL;

		free(p->m_pParent[nRow]);
		p->m_pParent[nRow]=NULL;

		nRow++;
	}

	free(p->m_pWeight);
	p->m_pWeight=NULL;

	free(p->m_pParent);
	p->m_pParent=NULL;

	UDynamicArray(p->m_apCost);
	free(p->m_apCost);
	p->m_apCost=NULL;
}

/*
 * INShortPath - set up an NShortPath over the cost array apCost.
 *
 * p:          object to initialise.
 * apCost:     cost array; the pointer is stored in m_apCost, not copied.
 * nValueKind: number of weights / parent queues kept per vertex.
 *
 * m_nVertex becomes the larger of apCost->m_nCol and apCost->m_nRow, plus
 * one. For each of the m_nVertex-1 rows an array of nValueKind queues and an
 * array of nValueKind doubles is allocated.
 *
 * Every queue of every row is initialised with IQueue. The earlier version
 * initialised only the first queue of each row, although ShortPath and
 * GetPaths index m_pParent[row][k] for all k below nValueKind.
 *
 * NOTE(review): allocation failures are not handled; this function has no
 * way of reporting them to the caller.
 */
void  INShortPath(struct NShortPath *p,pDynamicArray apCost,unsigned int nValueKind)
{
	unsigned int i,k;

	p->m_apCost=apCost;
	p->m_nValueKind=nValueKind;

	p->m_nVertex=apCost->m_nCol+1;
	if(p->m_nVertex<apCost->m_nRow+1)
		p->m_nVertex=apCost->m_nRow+1;

	p->m_pParent=(pQueue*)malloc(sizeof(pQueue)*(p->m_nVertex-1));
	p->m_pWeight=(double**)malloc(sizeof(double*)*(p->m_nVertex-1));

	for(i=0;i<p->m_nVertex-1;i++)
	{
		p->m_pParent[i]=(pQueue)malloc(sizeof(struct Queue)*(nValueKind));
		for(k=0;k<nValueKind;k++)
			IQueue(&p->m_pParent[i][k]);
		p->m_pWeight[i]=(double*)malloc(sizeof(double)*nValueKind);
	}
}


/*
 * IUnknowWord - initialise an unknown-word recogniser.
 *
 * Clears m_sUnknownFlags, then allocates and initialises the role-tagging
 * Span (ISpan) and the recogniser's own Dictionary (IDictionary).
 */
void IUnknowWord(pUnknowWord p)
{
	pSpan pRoleTag;
	pDictionary pDict;

	p->m_sUnknownFlags[0]='\0';

	pRoleTag=(pSpan)malloc(sizeof(struct Span));
	p->m_roleTag=pRoleTag;
	ISpan(pRoleTag);

	pDict=(pDictionary) malloc(sizeof(struct Dictionary));
	p->m_dict=pDict;
	IDictionary(pDict);
}

/*
 * UUnknowWord - tear down an unknown-word recogniser.
 *
 * Runs USpan on the role-tagging Span and frees it, then runs UDictionary on
 * the Dictionary and frees it. Both members are left as null pointers.
 */
void UUnknowWord(pUnknowWord p)
{
	pSpan pRoleTag=p->m_roleTag;
	pDictionary pDict;

	USpan(pRoleTag);
	free(pRoleTag);
	p->m_roleTag=NULL;

	pDict=p->m_dict;
	UDictionary(pDict);
	free(pDict);
	p->m_dict=NULL;
}
/*
 * Recognition - tag the segmentation result and write recognised unknown
 * words into graphOptimum.
 *
 * p:              recogniser; its Span and Dictionary are passed to
 *                 POSTagging, and m_nPOS / m_sUnknownFlags label the
 *                 elements that are written.
 * pWordSegResult: segmentation result handed to POSTagging.
 * graphOptimum:   array that is read with GetElement and updated with
 *                 SetElement.
 * graphSeg:       supplies m_nAtomCount and m_nAtomLength for converting
 *                 the positions in m_nUnknownWords into atom indices.
 * dictCore:       core dictionary handed to POSTagging.
 *
 * The running position and atom index carry over from one unknown word to
 * the next. An element is overwritten only when its current value is greater
 * than the word's m_dWordsPossibility entry.
 *
 * Returns TRUE.
 */
int Recognition(pUnknowWord p,PWORD_RESULT pWordSegResult, pDynamicArray graphOptimum,pSegGraph graphSeg,pDictionary dictCore)
{
	int iWord;
	int nBytePos=0,iAtom=0,nAtomStart,nAtomEnd,nPOSOriginal;
	double dValue;
	pSpan pTagger;

	POSTagging(p->m_roleTag,pWordSegResult,dictCore,p->m_dict);
	pTagger=p->m_roleTag;

	for(iWord=0;iWord<pTagger->m_nUnknownIndex;iWord++)
	{
		/* Advance to the atom at which the word begins. */
		while((unsigned int)iAtom<graphSeg->m_nAtomCount&&nBytePos<pTagger->m_nUnknownWords[iWord][0])
		{
			nBytePos+=graphSeg->m_nAtomLength[iAtom];
			iAtom++;
		}
		nAtomStart=iAtom;

		/* Advance to the atom just past the word's end. */
		while((unsigned int)iAtom<graphSeg->m_nAtomCount&&nBytePos<pTagger->m_nUnknownWords[iWord][1])
		{
			nBytePos+=graphSeg->m_nAtomLength[iAtom];
			iAtom++;
		}
		nAtomEnd=iAtom;

		/* Nothing to record when the word covers no atom. */
		if(nAtomStart>=nAtomEnd)
			continue;

		GetElement(graphOptimum,nAtomStart,nAtomEnd,&dValue,&nPOSOriginal,0);
		if(dValue>pTagger->m_dWordsPossibility[iWord])
			SetElement(graphOptimum,nAtomStart,nAtomEnd,pTagger->m_dWordsPossibility[iWord],p->m_nPOS,p->m_sUnknownFlags);
	}
	return TRUE;
}

/*
 * Configure - load the dictionary and context files for a recogniser and
 * select its tag type.
 *
 * p:           recogniser to configure.
 * sConfigFile: path prefix; "<prefix>.dct" is passed to LoadDicFile and
 *              "<prefix>.ctx" to LoadContext.
 * type:        tag type applied through SetTagType; it also selects m_nPOS
 *              and, for the person/place types, m_sUnknownFlags.
 *
 * Returns TRUE, or FALSE when the buffer for the file name cannot be
 * allocated (nothing has been loaded or changed in that case).
 *
 * The file name used to be assembled in a fixed 100-byte stack array with
 * strcpy/strcat, which overflowed for longer prefixes; the buffer is now
 * sized from the prefix length.
 *
 * NOTE(review): the results of LoadDicFile / LoadContext are not checked,
 * as before.
 */
int Configure(pUnknowWord p,char *sConfigFile,enum TAG_TYPE type)
{
	size_t nBaseLen=strlen(sConfigFile);
	/* Both suffixes have the same length; sizeof includes the terminator. */
	char *sFilename=(char *)malloc(nBaseLen+sizeof(".dct"));

	if(sFilename==NULL)
		return FALSE;

	strcpy(sFilename,sConfigFile);
	strcat(sFilename,".dct");
	LoadDicFile(p->m_dict,sFilename,FALSE);

	/* Reuse the buffer: overwrite the suffix in place. */
	strcpy(sFilename+nBaseLen,".ctx");
	LoadContext(p->m_roleTag,sFilename);

	free(sFilename);

	SetTagType(p->m_roleTag,type);
	switch(type)
	{
		case TT_PERSON:
		case TT_TRANS_PERSON:
			p->m_nPOS=-28274;
			strcpy(p->m_sUnknownFlags,"未##人");
			break;
		case TT_PLACE:
			p->m_nPOS=-28275;
			strcpy(p->m_sUnknownFlags,"未##地");
			break;
		default :
			p->m_nPOS=0;
			break;
	}
	return TRUE;
}

int IsGivenName(pUnknowWord p,char *sName)
{
	char sFirstChar[3],sSecondChar[3];
	double dGivenNamePossibility=0,dSingleNamePossibility=0;
	if(strlen(sName)!=4)
		return FALSE;		
	strncpy(sFirstChar,sName,2);
	sFirstChar[2]=0;
	strncpy(sSecondChar,sName+2,2);
	sSecondChar[2]=0;	 
	dGivenNamePossibility+=log((double)GetFrequency(p->m_dict,sFirstChar,2)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,2)+1.0);
	dGivenNamePossi

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -