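/*
 * segment(sen).c
 *
 * Chinese word segmentation program for natural language processing.
 * Language: C
 * (Listing page 1 of 5 as labelled by the hosting site; pager: previous page, 1 2 3 4 5.)
 */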
	 p->m_pParent=NULL;
	 UDynamicArray(p->m_apCost);
	 free(p->m_apCost);
	 p->m_apCost=NULL;
}

/*
 * INShortPath - initialise an N-shortest-path solver over a cost array.
 *
 * p          : solver to initialise; its m_apCost, m_nValueKind, m_nVertex,
 *              m_pParent and m_pWeight fields are overwritten.
 * apCost     : cost array; the pointer is stored in p, not copied.
 * nValueKind : number of queue/weight slots allocated for every vertex.
 *
 * The vertex count is max(m_nCol, m_nRow) + 1.  Per-vertex storage is
 * allocated for m_nVertex-1 entries.
 *
 * Every one of the nValueKind queues of a vertex is initialised with
 * IQueue(); previously only the first one was, which left the remaining
 * queues as raw malloc() memory whenever nValueKind was greater than 1.
 *
 * NOTE(review): allocation results are not checked and this function has
 * no way to report failure to its caller.
 */
void  INShortPath(struct NShortPath *p,pDynamicArray apCost,unsigned int nValueKind)
{
	unsigned int i,k;
	p->m_apCost=apCost;
	p->m_nValueKind=nValueKind;
	/* vertex count: the larger of the two dimensions, plus one */
	p->m_nVertex=apCost->m_nCol+1;
	if(p->m_nVertex<apCost->m_nRow+1)
		p->m_nVertex=apCost->m_nRow+1;
	p->m_pParent=(pQueue*)malloc(sizeof(pQueue)*(p->m_nVertex-1));
	p->m_pWeight=(double**)malloc(sizeof(double*)*(p->m_nVertex-1));
	for(i=0;i<p->m_nVertex-1;i++)
	{
		p->m_pParent[i]=(pQueue)malloc(sizeof(struct Queue)*(nValueKind));
		/* initialise each queue in the per-vertex array, not just the first */
		for(k=0;k<nValueKind;k++)
			IQueue(p->m_pParent[i]+k);
		p->m_pWeight[i]=(double*)malloc(sizeof(double)*nValueKind);
	}
}


/*
 * IUnknowWord - initialise an unknown-word recogniser.
 *
 * Clears the unknown-word flag string, then allocates and initialises the
 * role tagger (m_roleTag) and the private dictionary (m_dict).
 * Release with UUnknowWord().
 */
void IUnknowWord(pUnknowWord p)
{
	pSpan pTagger;
	pDictionary pDict;

	p->m_sUnknownFlags[0]='\0';

	pTagger=(pSpan)malloc(sizeof(struct Span));
	p->m_roleTag=pTagger;
	ISpan(pTagger);

	pDict=(pDictionary)malloc(sizeof(struct Dictionary));
	p->m_dict=pDict;
	IDictionary(pDict);
}

/*
 * UUnknowWord - release what IUnknowWord() set up.
 *
 * Tears down and frees the role tagger, then the dictionary, and clears
 * both pointers in p.
 */
void UUnknowWord(pUnknowWord p)
{
	/* role tagger */
	USpan(p->m_roleTag);
	free(p->m_roleTag);
	p->m_roleTag=NULL;

	/* private dictionary */
	UDictionary(p->m_dict);
	free(p->m_dict);
	p->m_dict=NULL;
}
/*
 * Recognition - tag a segmentation result and record the unknown words
 * found by the role tagger in the optimum graph.
 *
 * POSTagging() is run on pWordSegResult first.  For every entry i below
 * m_roleTag->m_nUnknownIndex, the atoms of graphSeg are walked, summing
 * m_nAtomLength, until the running position reaches m_nUnknownWords[i][0]
 * (first atom) and then m_nUnknownWords[i][1] (one past the last atom).
 * If that atom range is non-empty and the value currently stored in
 * graphOptimum for it is greater than m_dWordsPossibility[i], the element
 * is replaced with the new value, p->m_nPOS and p->m_sUnknownFlags.
 *
 * The atom cursor and running position are NOT reset between entries.
 * Always returns TRUE.
 */
int Recognition(pUnknowWord p,PWORD_RESULT pWordSegResult, pDynamicArray graphOptimum,pSegGraph graphSeg,pDictionary dictCore)
{
	int nWord;
	int nAtom=0;
	int nRunningPos=0;
	int nFirstAtom,nPastLastAtom,nOldPOS;
	double dOldValue;

	POSTagging(p->m_roleTag,pWordSegResult,dictCore,p->m_dict);

	for(nWord=0;nWord<p->m_roleTag->m_nUnknownIndex;nWord++)
	{
		/* skip atoms that lie before the start of this unknown word */
		for(;(unsigned int)nAtom<graphSeg->m_nAtomCount&&nRunningPos<p->m_roleTag->m_nUnknownWords[nWord][0];nAtom++)
			nRunningPos+=graphSeg->m_nAtomLength[nAtom];
		nFirstAtom=nAtom;

		/* consume atoms up to the end of this unknown word */
		for(;(unsigned int)nAtom<graphSeg->m_nAtomCount&&nRunningPos<p->m_roleTag->m_nUnknownWords[nWord][1];nAtom++)
			nRunningPos+=graphSeg->m_nAtomLength[nAtom];
		nPastLastAtom=nAtom;

		if(nFirstAtom>=nPastLastAtom)
			continue;

		GetElement(graphOptimum,nFirstAtom,nPastLastAtom,&dOldValue,&nOldPOS,0);
		/* only overwrite when the stored value is larger than the new one */
		if(dOldValue>p->m_roleTag->m_dWordsPossibility[nWord])
		{
			SetElement(graphOptimum,nFirstAtom,nPastLastAtom,p->m_roleTag->m_dWordsPossibility[nWord],p->m_nPOS,p->m_sUnknownFlags);
		}
	}
	return TRUE;
}

/*
 * Configure - load the dictionary and context files for a recogniser and
 * select its tag type.
 *
 * p           : recogniser to configure.
 * sConfigFile : base path; ".dct" is appended for the dictionary file and
 *               ".ctx" for the context file.
 * type        : tag type passed to SetTagType(); also selects m_nPOS and
 *               m_sUnknownFlags (left untouched for types other than
 *               TT_PERSON, TT_TRANS_PERSON and TT_PLACE).
 *
 * Returns TRUE on success.  Returns FALSE, without touching p, when
 * sConfigFile is NULL or too long for the base path plus a four-character
 * extension to fit the local file name buffer; previously such a path
 * overflowed the buffer.
 *
 * NOTE(review): the results of LoadDicFile() and LoadContext() are ignored.
 */
int Configure(pUnknowWord p,char *sConfigFile,enum TAG_TYPE type)
{
	char sFilename[100];

	/* room is needed for the base name, a 4-char extension and the NUL */
	if(sConfigFile==NULL||strlen(sConfigFile)+4>=sizeof(sFilename))
		return FALSE;

	strcpy(sFilename,sConfigFile);
	strcat(sFilename,".dct");
	LoadDicFile(p->m_dict,sFilename,FALSE);

	strcpy(sFilename,sConfigFile);
	strcat(sFilename,".ctx");
	LoadContext(p->m_roleTag,sFilename);

	SetTagType(p->m_roleTag,type);
	switch(type)
	{
		case TT_PERSON:
		case TT_TRANS_PERSON:
			/* -28274 == -('n'*256+'r'); presumably the person-name POS code - confirm */
			p->m_nPOS=-28274;
			strcpy(p->m_sUnknownFlags,"未##人");
			break;
		case TT_PLACE:
			/* -28275 == -('n'*256+'s'); presumably the place-name POS code - confirm */
			p->m_nPOS=-28275;
			strcpy(p->m_sUnknownFlags,"未##地");
			break;
		default :
			p->m_nPOS=0;
			break;
	}
	return TRUE;
}

/*
 * IsGivenName - decide whether a 4-byte string scores better as a
 * two-part given name than as the alternative "single name" reading.
 *
 * Returns FALSE immediately unless strlen(sName) is exactly 4.  Otherwise
 * the string is split into two 2-byte halves (presumably two double-byte
 * characters - confirm against the file's encoding) and two log scores are
 * accumulated from dictionary frequencies and context statistics:
 *   - "given name" score uses handles/tags 2 and 3;
 *   - "single name" score uses handles/tags 1 and 4.
 * Returns FALSE when the single-name score is >= the given-name score,
 * TRUE otherwise.
 */
int IsGivenName(pUnknowWord p,char *sName)
{
	char sHead[3],sTail[3];
	double dGivenScore=0,dSingleScore=0;

	if(strlen(sName)!=4)
		return FALSE;

	/* length is exactly 4, so each half is a plain 2-byte copy */
	memcpy(sHead,sName,2);
	sHead[2]='\0';
	memcpy(sTail,sName+2,2);
	sTail[2]='\0';

	/* score of reading the two halves with tags 2 and 3 */
	dGivenScore+=log((double)GetFrequency(p->m_dict,sHead,2)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,2)+1.0);
	dGivenScore+=log((double)GetFrequency(p->m_dict,sTail,3)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,3)+1.0);
	dGivenScore+=log(GetContextPossibility(p->m_roleTag->m_context,0,2,3)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,2)+1.0);

	/* score of reading the two halves with tags 1 and 4 */
	dSingleScore+=log((double)GetFrequency(p->m_dict,sHead,1)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,1)+1.0);
	dSingleScore+=log((double)GetFrequency(p->m_dict,sTail,4)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,4)+1.0);
	dSingleScore+=log(GetContextPossibility(p->m_roleTag->m_context,0,1,4)+1.0)-log(GetContextFrequency (p->m_roleTag->m_context,0,1)+1.0);

	return (dSingleScore>=dGivenScore)?FALSE:TRUE;
}
/*
 * GenerateWord - convert segmentation route nIndex into a word list.
 *
 * nSegRoute[nIndex] is read as a list of vertex numbers terminated by -1.
 * The main loop runs while the current and next entries are both not -1
 * and strictly increasing.  For each step the atoms between the two
 * vertices are concatenated into a word; runs that IsAllNum() or
 * IsAllChineseNum() accept are merged with the following route steps and
 * then classified (number / time / punctuation).  Each resulting word is
 * stored in p->m_pWordSeg[nIndex][k] and mirrored into p->m_graphOptimum
 * with SetElement().  The word list is terminated by an entry whose sWord
 * is empty and whose nHandle is -1.  Always returns TRUE.
 *
 * Magic POS values used below: 30464 == 'w'*256, 27904 == 'm'*256,
 * 29696 == 't'*256 (presumably punctuation, numeral and time tags -
 * confirm against the POS table).
 *
 * NOTE(review): sAtom, sNumCandidate and sCurWord are fixed-size buffers
 * filled with unchecked strcpy/strcat.
 * NOTE(review): i is unsigned and is decremented in several branches; the
 * code relies on the later i++ to bring it back.
 * NOTE(review): in the innermost else-branch, when the word ends with one
 * of the separator characters but is not longer than its first character,
 * neither sCurWord nor nPOS is assigned before the final SetElement().
 * NOTE(review): the "sWord+strlen(sWord)-2" and "sWord+nLen-2" expressions
 * assume the word is at least 2 bytes long.
 */
int GenerateWord(pSegment p,int **nSegRoute, int nIndex)
{
	unsigned int i,k,nLen;
	int j,nStartVertex,nEndVertex,nPOS;
	char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100];
	double fValue;
	/* i indexes the route, k indexes the output word list */
	i=0;
	k=0;;
	while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
	{
		nStartVertex=nSegRoute[nIndex][i];
		j=nStartVertex; 
		nEndVertex=nSegRoute[nIndex][i+1]; 
		nPOS=0;
		/* fetch the value and POS stored on this edge of the segment graph */
		GetElement(p->m_graphSeg->m_segGraph,nStartVertex,nEndVertex,&fValue,&nPOS,0);
		/* build the word text from the atoms spanned by this edge */
		sAtom[0]=0;
		while(j<nEndVertex)
		{ 
			strcat(sAtom,p->m_graphSeg->m_sAtom[j]);
			j++;
		}
		p->m_pWordSeg[nIndex][k].sWord[0]=0; 
		strcpy(sNumCandidate,sAtom);
		/* while the accumulated text is all-numeric, accept it as the word and append the next route step */
		while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
		{ 
		  
			strcpy(p->m_pWordSeg[nIndex][k].sWord,sNumCandidate);			 
			i++; 
			sAtom[0]=0;			
			while(j<nSegRoute[nIndex][i+1])
			{ 
				strcat(sAtom,p->m_graphSeg->m_sAtom[j]);
				j++;
			}
			strcat(sNumCandidate,sAtom);
		}
		/* on exit sAtom holds the first step that broke the numeric run (or the original word if there was no run) */
		nLen=strlen(p->m_pWordSeg[nIndex][k].sWord);
		if(nLen==4&&CC_Find("第上成±—＋∶·．／",p->m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",p->m_pWordSeg[nIndex][k].sWord[0]))
		{ 
			/* the "number" is just a lone prefix/sign/separator: keep it as an ordinary word and step the route back */
			strcpy(sCurWord,p->m_pWordSeg[nIndex][k].sWord); 
			i--;
		}
		else if(p->m_pWordSeg[nIndex][k].sWord[0]==0) 
		{
			/* no numeric run was collected: the word is the plain atom text */
			strcpy(p->m_pWordSeg[nIndex][k].sWord,sAtom);			 
			strcpy(sCurWord,sAtom); 
		}
		else
		{ 
			/* a numeric run was collected */
			if(strcmp("－－",p->m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",p->m_pWordSeg[nIndex][k].sWord)==0||p->m_pWordSeg[nIndex][k].sWord[0]=='-'&&p->m_pWordSeg[nIndex][k].sWord[1]==0) 
			{
				/* the run is only a dash: tag with 30464 and step the route back */
				nPOS=30464; 
				i--; 
			}
			else
			{ 
				/* sInitChar receives the first character of the word: one byte, or two when the first byte is negative as a char */
				char sInitChar[3];
				unsigned int nCharIndex=0; 
				sInitChar[nCharIndex]=p->m_pWordSeg[nIndex][k].sWord[nCharIndex];
				if(sInitChar[nCharIndex]<0)
				{
					nCharIndex+=1;
					sInitChar[nCharIndex]=p->m_pWordSeg[nIndex][k].sWord[nCharIndex];
				}
				nCharIndex+=1;
				sInitChar[nCharIndex]='\0';
				/* previous word tagged 27904/29696 and this one starts with a dash followed by more text: split the dash off */
				if(k>0&&(abs(p->m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(p->m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(p->m_pWordSeg[nIndex][k].sWord)>nCharIndex))
				{ 				    
					/* slot k+1 gets the text after the dash; slot k keeps only the dash, tagged 30464 with value 0 */
					strcpy(p->m_pWordSeg[nIndex][k+1].sWord,p->m_pWordSeg[nIndex][k].sWord+nCharIndex);
					p->m_pWordSeg[nIndex][k+1].dValue=p->m_pWordSeg[nIndex][k].dValue;
					p->m_pWordSeg[nIndex][k+1].nHandle=27904;
					p->m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
					p->m_pWordSeg[nIndex][k].dValue=0;
					p->m_pWordSeg[nIndex][k].nHandle=30464; 
					SetElement(p->m_graphOptimum,nStartVertex,nStartVertex+1,p->m_pWordSeg[nIndex][k].dValue,p->m_pWordSeg[nIndex][k].nHandle,p->m_pWordSeg[nIndex][k].sWord);
					nStartVertex+=1;
					k+=1;
				}
				nLen=strlen(p->m_pWordSeg[nIndex][k].sWord);
				/* number followed by a 2-byte unit found in the list below, or by "月份": append the unit and tag -29696 */
				if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
				{ 
					strcat(p->m_pWordSeg[nIndex][k].sWord,sAtom);
					strcpy(sCurWord,"未##时");
					nPOS=-29696; 
				}
				else if(strcmp(sAtom,"年")==0)
				{
					 /* number followed by "年": a time only if IsYearTime() accepts the number */
					 if(IsYearTime(p->m_pWordSeg[nIndex][k].sWord)) 
					 { 
						strcat(p->m_pWordSeg[nIndex][k].sWord,sAtom);
						strcpy(sCurWord,"未##时");
						nPOS=-29696; 
					 }
					 else
					 {
						/* otherwise a plain number; the following step is left for the next iteration */
						strcpy(sCurWord,"未##数");
						nPOS=-27904; 
						i--; 
					 }
				}
     			else
				{					 
					/* word whose last two bytes are "点": tag -29696 */
					if(strcmp(p->m_pWordSeg[nIndex][k].sWord+strlen(p->m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
					{
						strcpy(sCurWord,"未##时");
						nPOS=-29696; 
					}	
					else 
					{
						/* no trailing separator: a plain number */
						if(!CC_Find("∶·．／",p->m_pWordSeg[nIndex][k].sWord+nLen-2)&&p->m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&p->m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
						{
							strcpy(sCurWord,"未##数");
							nPOS=-27904; 
						}
						else if(nLen>strlen(sInitChar))
						{ 
							/* trailing separator: cut it off (1 byte for '.' or '/', otherwise 2 bytes) and step back once more */
							if(p->m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||p->m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
								p->m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
							else
								p->m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
							strcpy(sCurWord,"未##数");
							nPOS=-27904; 
							i--;
						}
					}
					/* the step that ended the numeric run was not consumed: revisit it next iteration */
					i--; 
				}
			}
			/* merged words carry value 0; the end vertex is re-read because i may have moved */
			fValue=0;
			nEndVertex=nSegRoute[nIndex][i+1]; 
		}
		/* record the word and mirror it into the optimum graph */
		p->m_pWordSeg[nIndex][k].nHandle=nPOS; 
		p->m_pWordSeg[nIndex][k].dValue=fValue; 
		SetElement(p->m_graphOptimum,nStartVertex,nEndVertex,fValue,nPOS,sCurWord);		 
		i++; 
		k++; 
	}
	/* terminate the word list */
	p->m_pWordSeg[nIndex][k].sWord[0]=0;
	p->m_pWordSeg[nIndex][k].nHandle=-1; 
	return TRUE;
} 

/*
 * InSegment - segment sSentence with the N-shortest-path search run
 * directly on the segment graph.
 *
 * Steps: allocate MAX_SEGMENT_NUM zero-filled route buffers of
 * MAX_SENTENCE_LEN/2 ints, build the word net, run the shortest-path
 * solver for nResultCount results, empty the optimum graph, then call
 * GenerateWord() for each of the p->m_nSegmentCount routes.  The route
 * buffers are freed before returning.  Always returns TRUE.
 *
 * NOTE(review): the solver is handed p->m_graphSeg->m_segGraph as its cost
 * array.  If UNShortPath() releases the cost array it was given, the graph
 * is gone before GenerateWord() reads it - confirm ownership.
 * NOTE(review): allocation results are not checked.
 */
int InSegment(pSegment p,char *sSentence,pDictionary dictCore,int nResultCount)
{
	int nRoute;
	int **ppRoutes;
	pNShortPath pSolver;

	/* route storage, zero-initialised */
	ppRoutes=(int**)malloc(sizeof(int*)*MAX_SEGMENT_NUM);
	nRoute=0;
	while(nRoute<MAX_SEGMENT_NUM)
	{
		ppRoutes[nRoute]=(int*)malloc((MAX_SENTENCE_LEN/2)*sizeof(int));
		memset(ppRoutes[nRoute],0,MAX_SENTENCE_LEN/2*sizeof(int));
		nRoute++;
	}

	SetRowFirst(p->m_graphSeg->m_segGraph,FALSE);
	SetRowFirst(p->m_graphOptimum,FALSE);
	GenerateWordNet(p->m_graphSeg,sSentence,dictCore,FALSE);

	/* shortest-path search over the segment graph */
	pSolver=(pNShortPath)malloc(sizeof(struct NShortPath));
	INShortPath(pSolver,p->m_graphSeg->m_segGraph,nResultCount);
	ShortPath(pSolver);
	OutputPath(pSolver,ppRoutes,FALSE,&p->m_nSegmentCount);
	SetEmpty(p->m_graphOptimum);
	UNShortPath(pSolver);
	free(pSolver);
	pSolver=0;

	/* one word list per route found */
	for(nRoute=0;nRoute<p->m_nSegmentCount;nRoute++)
	{
		GenerateWord(p,ppRoutes,nRoute);
	}

	for(nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
	{
		free(ppRoutes[nRoute]);
	}
	free(ppRoutes);
	ppRoutes=0;
	return TRUE;
}

/*
 * BiGraphGenerate - build the bigram (word-pair) graph from a word graph.
 *
 * p               : segmenter; m_nWordCount and m_npWordPosMapTable are
 *                   rebuilt here (any previous table is freed).
 * aWord           : input word graph, walked as a linked chain.
 * aBinaryWordNet  : output graph; receives one element per adjacent word
 *                   pair, indexed by the words' positions in the table.
 * dSmoothingPara  : interpolation weight between the unigram term and the
 *                   bigram term of the cost formula.
 * DictBinary      : dictionary queried for "word1 + WORD_SEGMENTER + word2".
 * DictCore        : dictionary queried for the frequency of words whose
 *                   nPOS is negative.
 *
 * Each word is mapped to the key row*MAX_SENTENCE_LEN+col; the table of
 * keys is later searched with BinarySearch() to turn a word into an index.
 *
 * Returns TRUE on success (including an empty word graph) and FALSE when
 * the position table cannot be allocated.
 *
 * Changes from the previous version:
 *  - the position table is never written when it was not allocated, and
 *    writes stop at m_nWordCount entries;
 *  - a word pair too long for the sTwoWords buffer is no longer copied
 *    into it (which overflowed the buffer); its pair frequency is taken
 *    as 0 instead.  NOTE(review): this assumes no dictionary key can be
 *    WORD_MAXLENGTH bytes or longer - confirm;
 *  - the loop-invariant dTemp is computed once.
 */
int BiGraphGenerate(pSegment p,pDynamicArray aWord, pDynamicArray aBinaryWordNet,double dSmoothingPara,pDictionary DictBinary,pDictionary DictCore)
{
	PARRAY_CHAIN pTail,pCur,pNextWords;
	unsigned int nCurWordIndex,nNextWordIndex,nWordIndex=0;
	unsigned int nTwoWordsFreq=0;
	double dCurFreqency,dValue,dTemp;
	char sTwoWords[WORD_MAXLENGTH];
	memset(sTwoWords,0,WORD_MAXLENGTH);
	p->m_nWordCount=GetTail(aWord,&pTail);
	if(p->m_npWordPosMapTable)
	{
		free(p->m_npWordPosMapTable);
		p->m_npWordPosMapTable=0;
	}
	if(p->m_nWordCount>0)
		p->m_npWordPosMapTable=(int *)malloc(sizeof(int)*p->m_nWordCount);
	if(!p->m_npWordPosMapTable)
	{
		/* no words: nothing to generate; words but no table: allocation failed */
		return (p->m_nWordCount>0)?FALSE:TRUE;
	}
	/* record the position key of every word, in chain order */
	pCur=GetHead(aWord);
	while(pCur!=0&&nWordIndex<(unsigned int)p->m_nWordCount)
	{
		p->m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col;
		pCur=pCur->next;
	}
	dTemp=(double)1/MAX_FREQUENCE;
	pCur=GetHead(aWord);
	while(pCur!=0)
	{
		/* non-negative POS: use the stored value; negative POS: look the word up */
		if(pCur->nPOS>=0)
			dCurFreqency=pCur->value;
		else
			dCurFreqency=GetFrequency(DictCore,pCur->sWord,2);
		/* find the words that start where the current word ends */
		GetElementValue(aWord,pCur->col,-1,pCur,&pNextWords);
		while(pNextWords&&pNextWords->row==pCur->col)
		{
			if(strlen(pCur->sWord)+strlen(WORD_SEGMENTER)+strlen(pNextWords->sWord)<sizeof(sTwoWords))
			{
				strcpy(sTwoWords,pCur->sWord);
				strcat(sTwoWords,WORD_SEGMENTER);
				strcat(sTwoWords,pNextWords->sWord);
				nTwoWordsFreq=GetFrequency(DictBinary,sTwoWords,3);
			}
			else
			{
				/* pair does not fit the key buffer: treat it as unseen */
				nTwoWordsFreq=0;
			}
			/* smoothed negative log cost of the transition */
			dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp));
			if(pCur->nPOS<0)
				dValue+=pCur->value;
			nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,p->m_npWordPosMapTable,p->m_nWordCount);
			nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,p->m_npWordPosMapTable,p->m_nWordCount);
			SetElement(aBinaryWordNet,nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS,0);
			pNextWords=pNextWords->next;
		}
		pCur=pCur->next;
	}
	return TRUE;
}

/*
 * BiPath2UniPath - rewrite, in place, a path of word indices into a path
 * of vertex numbers.
 *
 * npPath is a -1 terminated list of indices into p->m_npWordPosMapTable.
 * Each index below p->m_nWordCount is replaced by its table entry divided
 * by MAX_SENTENCE_LEN.  If the last table entry read is greater than 0,
 * its remainder modulo MAX_SENTENCE_LEN is appended.  The list is then
 * terminated with -1 again.
 *
 * Returns FALSE when the position table has not been built, TRUE otherwise.
 */
int BiPath2UniPath(pSegment p,int *npPath)
{
	int nPos;
	int nPacked=-1;

	if(p->m_npWordPosMapTable==0)
		return FALSE;

	for(nPos=0;npPath[nPos]!=-1&&npPath[nPos]<p->m_nWordCount;nPos++)
	{
		nPacked=p->m_npWordPosMapTable[npPath[nPos]];
		npPath[nPos]=nPacked/MAX_SENTENCE_LEN;
	}

	/* close the path with the remainder part of the last entry */
	if(nPacked>0)
	{
		npPath[nPos]=nPacked%MAX_SENTENCE_LEN;
		nPos++;
	}
	npPath[nPos]=-1;
	return TRUE;
}
/*
 * BiSegment - segment sSentence using the bigram word graph.
 *
 * p              : segmenter; its segment graph, optimum graph, word lists
 *                  and m_nSegmentCount are updated.
 * sSentence      : NUL-terminated input sentence.
 * dSmoothingPara : smoothing weight forwarded to BiGraphGenerate().
 * dictCore       : core dictionary.
 * dictBinary     : word-pair dictionary.
 * nResultCount   : number of paths requested from the solver.
 *
 * Returns TRUE on success, FALSE if working memory cannot be allocated
 * (in which case p is left untouched).
 *
 * Changes from the previous version:
 *  - each route buffer holds strlen(sSentence)+10 ints and ALL of them are
 *    initialised to -1.  Before, the buffer was sized with sizeof(int*)
 *    but only (len/2)*sizeof(int) bytes were initialised, so where a
 *    pointer is wider than an int the tail of the buffer held garbage
 *    instead of -1 terminators.  On such platforms the new size is not
 *    smaller than the old one;
 *  - every allocation is checked before use.
 *
 * NOTE(review): aBiwordsNet is not released here; presumably UNShortPath()
 * releases the cost array it was initialised with - confirm.
 */
int BiSegment(pSegment p,char *sSentence, double dSmoothingPara, pDictionary dictCore, pDictionary dictBinary, unsigned int nResultCount)
{
	int **nSegRoute;
	unsigned int nLen;
	int i;
	pNShortPath sp;
	pDynamicArray aBiwordsNet;

	nLen=strlen(sSentence)+10;

	/* grab all fixed-size working memory first so that failure needs only free() */
	nSegRoute=(int **)malloc(sizeof(int*)*MAX_SEGMENT_NUM);
	aBiwordsNet=(pDynamicArray)malloc(sizeof(struct DynamicArray));
	sp=(pNShortPath)malloc(sizeof(struct NShortPath));
	if(nSegRoute==NULL||aBiwordsNet==NULL||sp==NULL)
	{
		free(nSegRoute);
		free(aBiwordsNet);
		free(sp);
		return FALSE;
	}
	for(i=0;i<MAX_SEGMENT_NUM;i++)
	{
		nSegRoute[i]=(int *)malloc(sizeof(int)*nLen);
		if(nSegRoute[i]==NULL)
		{
			while(i>0)
				free(nSegRoute[--i]);
			free(nSegRoute);
			free(aBiwordsNet);
			free(sp);
			return FALSE;
		}
		/* all-ones bytes: every int in the buffer reads as -1 */
		memset(nSegRoute[i],-1,sizeof(int)*nLen);
	}

	GenerateWordNet(p->m_graphSeg,sSentence,dictCore,TRUE);
	IDynamicArray(aBiwordsNet,FALSE);
	BiGraphGenerate(p,p->m_graphSeg->m_segGraph,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);

	/* shortest paths over the bigram graph */
	INShortPath(sp,aBiwordsNet,nResultCount);
	ShortPath(sp);
	OutputPath(sp,nSegRoute,FALSE,&p->m_nSegmentCount);
	UNShortPath(sp);
	free(sp);
	sp=NULL;

	SetEmpty(p->m_graphOptimum);
	i=0;
	while(i<p->m_nSegmentCount)
	{
		/* word-index path -> vertex path -> word list */
		BiPath2UniPath(p,nSegRoute[i]);
		GenerateWord(p,nSegRoute,i);
		i++;
	}

	for(i=0;i<MAX_SEGMENT_NUM;i++)
	{
		free(nSegRoute[i]);
		nSegRoute[i]=NULL;
	}
	free(nSegRoute);
	nSegRoute=NULL;
	aBiwordsNet=NULL;
	return TRUE;
}

/*
 * BiOptimumSegment - re-segment using the bigram graph built from the
 * current optimum graph.
 *
 * Allocates MAX_SEGMENT_NUM route buffers of MAX_SENTENCE_LEN/2 ints
 * (filled with -1), builds the bigram graph from p->m_graphOptimum, runs
 * the shortest-path solver for nResultCount results and regenerates the
 * word lists for each of the p->m_nSegmentCount routes.
 * Always returns TRUE.
 *
 * NOTE(review): p->m_graphSeg->m_segGraph is overwritten with the
 * m_graphOptimum pointer and m_graphOptimum is emptied right after, so
 * both names refer to the same, just-emptied array while GenerateWord()
 * runs, and the previous m_segGraph pointer is dropped - confirm this is
 * intended.
 * NOTE(review): aBiwordsNet is not released here; presumably UNShortPath()
 * releases the cost array it was initialised with - confirm.
 * NOTE(review): allocation results are not checked.
 */
int BiOptimumSegment(pSegment p,unsigned int nResultCount,double dSmoothingPara, pDictionary dictBinary, pDictionary dictCore)
{
	int **ppRoutes;
	int nRoute;
	pNShortPath pSolver;
	pDynamicArray pBiNet;

	/* route storage, every int preset to -1 */
	ppRoutes=(int **)malloc(sizeof(int*)*MAX_SEGMENT_NUM);
	nRoute=0;
	while(nRoute<MAX_SEGMENT_NUM)
	{
		ppRoutes[nRoute]=(int *)malloc(sizeof(int)*(MAX_SENTENCE_LEN/2));
		memset(ppRoutes[nRoute],-1,MAX_SENTENCE_LEN/2*sizeof(int));
		nRoute++;
	}

	/* bigram graph from the optimum graph */
	pBiNet=(pDynamicArray)malloc(sizeof(struct DynamicArray));
	IDynamicArray(pBiNet,FALSE);
	BiGraphGenerate(p,p->m_graphOptimum,pBiNet,dSmoothingPara,dictBinary,dictCore);

	/* shortest paths over the bigram graph */
	pSolver=(pNShortPath)malloc(sizeof (struct NShortPath));
	INShortPath(pSolver,pBiNet,nResultCount);
	ShortPath(pSolver);
	OutputPath(pSolver,ppRoutes,FALSE,&p->m_nSegmentCount);
	UNShortPath(pSolver);
	free(pSolver);
	pSolver=0;

	p->m_graphSeg->m_segGraph=p->m_graphOptimum;
	SetEmpty(p->m_graphOptimum);

	for(nRoute=0;nRoute<p->m_nSegmentCount;nRoute++)
	{
		/* word-index path -> vertex path -> word list */
		BiPath2UniPath(p,ppRoutes[nRoute]);
		GenerateWord(p,ppRoutes,nRoute);
	}

	for(nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
	{
		free(ppRoutes[nRoute]);
		ppRoutes[nRoute]=0;
	}
	free(ppRoutes);
	ppRoutes=0;
	pBiNet=0;
	return TRUE;
}

/*
 * ISegment - initialise a segmenter.
 *
 * Allocates MAX_SEGMENT_NUM word-result lists of MAX_WORDS entries each,
 * creates and initialises the segment graph and the optimum graph, and
 * clears the word position table and word count.
 *
 * NOTE(review): allocation results are not checked and the word-result
 * memory is left uninitialised.
 */
void ISegment(pSegment p)
{
	int nSlot;

	/* no position table until the bigram graph is first generated */
	p->m_npWordPosMapTable=0;
	p->m_nWordCount=0;

	/* one result list per candidate segmentation */
	p->m_pWordSeg=(PWORD_RESULT *)malloc(MAX_SEGMENT_NUM*sizeof(PWORD_RESULT));
	nSlot=0;
	while(nSlot<MAX_SEGMENT_NUM)
	{
		p->m_pWordSeg[nSlot]=(PWORD_RESULT)malloc(sizeof(WORD_RESULT)*MAX_WORDS);
		nSlot++;
	}

	p->m_graphSeg=(pSegGraph)malloc(sizeof(struct SegGraph));
	ISegGraph(p->m_graphSeg);

	p->m_graphOptimum=(pDynamicArray)malloc(sizeof(struct DynamicArray));
	IDynamicArray(p->m_graphOptimum,TRUE);
	SetRowFirst(p->m_graphOptimum,TRUE);
}

/*
 * USegment - release the resources held by a segmenter.
 *
 * Frees every word-result list and the list array, tears down and frees
 * the segment graph, and frees the word position table that
 * BiGraphGenerate() allocates (previously that table was leaked on
 * teardown).  The released pointers are cleared and the word count reset.
 *
 * NOTE(review): p->m_graphOptimum, allocated in ISegment(), is still not
 * released here.  BiOptimumSegment() can make m_graphSeg->m_segGraph point
 * at the same array, so freeing it here could be a double free depending
 * on what USegGraph() releases - confirm ownership before adding that.
 */
void USegment(pSegment p)
{
	int i;
	for(i=0;i<MAX_SEGMENT_NUM;i++)
	{
		free(p->m_pWordSeg[i]);
		p->m_pWordSeg[i]=0;
	}
	free(p->m_pWordSeg);
	p->m_pWordSeg=0;
	USegGraph(p->m_graphSeg);
	free(p->m_graphSeg);
	p->m_graphSeg=0;
	/* free(NULL) is a no-op, so no guard is needed when no table was built */
	free(p->m_npWordPosMapTable);
	p->m_npWordPosMapTable=0;
	p->m_nWordCount=0;
}

v
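/*
 * Pager: previous page, 1 2 3 4 5
 * File size: 2941 K
 * Uploaded by: lqlm521
 * Category: other
 * Tags: #natural-language-processing #segmentation #program
 *
 * Keyboard shortcuts of the hosting page:
 *   copy code        Ctrl + C
 *   search code      Ctrl + F
 *   full screen      F11
 *   switch theme     Ctrl + Shift + D
 *   show shortcuts   ?
 *   larger font      Ctrl + =
 *   smaller font     Ctrl + -
 */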