📄 segment(sen).c
字号:
p->m_pParent=NULL;
UDynamicArray(p->m_apCost);
free(p->m_apCost);
p->m_apCost=NULL;
}
/*
 * INShortPath - set up an NShortPath object over a cost array.
 *
 * p:          object to initialise; m_apCost, m_nValueKind, m_nVertex,
 *             m_pParent and m_pWeight are all overwritten.
 * apCost:     cost array; only the pointer is stored, nothing is copied.
 * nValueKind: stored in m_nValueKind and used as the element count of
 *             every per-vertex queue and weight allocation.
 *
 * m_nVertex becomes the larger of (m_nCol + 1) and (m_nRow + 1), and
 * (m_nVertex - 1) parent/weight slots are allocated.  Allocation results
 * are not checked.
 */
void INShortPath(struct NShortPath *p,pDynamicArray apCost,unsigned int nValueKind)
{
    unsigned int nSlot;

    p->m_apCost = apCost;
    p->m_nValueKind = nValueKind;

    /* Start from the column count and widen to the row count if larger. */
    p->m_nVertex = apCost->m_nCol + 1;
    if (p->m_nVertex < apCost->m_nRow + 1)
    {
        p->m_nVertex = apCost->m_nRow + 1;
    }

    p->m_pParent = (pQueue *)malloc(sizeof(pQueue) * (p->m_nVertex - 1));
    p->m_pWeight = (double **)malloc(sizeof(double *) * (p->m_nVertex - 1));

    nSlot = 0;
    while (nSlot < p->m_nVertex - 1)
    {
        /* Room for nValueKind queues; IQueue is called on the first one only. */
        p->m_pParent[nSlot] = (pQueue)malloc(sizeof(struct Queue) * (nValueKind));
        IQueue(p->m_pParent[nSlot]);
        p->m_pWeight[nSlot] = (double *)malloc(sizeof(double) * nValueKind);
        nSlot++;
    }
}
/*
 * IUnknowWord - initialise an unknown-word recogniser.
 *
 * Clears the flag string and gives the object a freshly allocated and
 * initialised Span (m_roleTag) and Dictionary (m_dict).  Allocation
 * results are not checked.
 */
void IUnknowWord(pUnknowWord p)
{
    pSpan pNewSpan;
    pDictionary pNewDict;

    p->m_sUnknownFlags[0] = '\0';

    pNewSpan = (pSpan)malloc(sizeof(struct Span));
    p->m_roleTag = pNewSpan;
    ISpan(p->m_roleTag);

    pNewDict = (pDictionary)malloc(sizeof(struct Dictionary));
    p->m_dict = pNewDict;
    IDictionary(p->m_dict);
}
/*
 * UUnknowWord - tear down an unknown-word recogniser.
 *
 * Finalises and frees the Span and the Dictionary owned by p, leaving
 * both members cleared to 0.
 */
void UUnknowWord(pUnknowWord p)
{
    /* role-tagging span */
    USpan(p->m_roleTag);
    free(p->m_roleTag);
    p->m_roleTag = 0;

    /* private dictionary */
    UDictionary(p->m_dict);
    free(p->m_dict);
    p->m_dict = 0;
}
/*
 * Recognition - write recognised unknown words into the optimum graph.
 *
 * Runs POSTagging() on the segmentation result, then for every entry of
 * m_roleTag->m_nUnknownWords[] converts its [0]/[1] positions into an atom
 * index range by accumulating graphSeg->m_nAtomLength[].  When the range
 * is non-empty and the value currently stored in graphOptimum for that
 * range is greater than the entry's m_dWordsPossibility, the element is
 * replaced with that possibility, p->m_nPOS and p->m_sUnknownFlags.
 *
 * The atom cursor and the accumulated position are carried over from one
 * entry to the next; they are never reset inside the loop.
 *
 * Returns TRUE unconditionally.
 */
int Recognition(pUnknowWord p,PWORD_RESULT pWordSegResult, pDynamicArray graphOptimum,pSegGraph graphSeg,pDictionary dictCore)
{
    int nWord;
    int nPos = 0, nAtom = 0;
    int nFirstAtom, nLastAtom, nStoredPOS;
    double dStoredValue;

    POSTagging(p->m_roleTag, pWordSegResult, dictCore, p->m_dict);

    nWord = 0;
    while (nWord < p->m_roleTag->m_nUnknownIndex)
    {
        /* Advance to the atom at which the word starts. */
        for (; (unsigned int)nAtom < graphSeg->m_nAtomCount && nPos < p->m_roleTag->m_nUnknownWords[nWord][0]; nAtom++)
        {
            nPos += graphSeg->m_nAtomLength[nAtom];
        }
        nFirstAtom = nAtom;

        /* Advance to the atom just past the end of the word. */
        for (; (unsigned int)nAtom < graphSeg->m_nAtomCount && nPos < p->m_roleTag->m_nUnknownWords[nWord][1]; nAtom++)
        {
            nPos += graphSeg->m_nAtomLength[nAtom];
        }
        nLastAtom = nAtom;

        if (nFirstAtom < nLastAtom)
        {
            GetElement(graphOptimum, nFirstAtom, nLastAtom, &dStoredValue, &nStoredPOS, 0);
            if (dStoredValue > p->m_roleTag->m_dWordsPossibility[nWord])
            {
                SetElement(graphOptimum, nFirstAtom, nLastAtom, p->m_roleTag->m_dWordsPossibility[nWord], p->m_nPOS, p->m_sUnknownFlags);
            }
        }
        nWord++;
    }
    return TRUE;
}
/*
 * Configure - load the dictionary and context files of a recogniser and
 * select its tag type.
 *
 * p:           recogniser to configure.
 * sConfigFile: path prefix; "<prefix>.dct" is passed to LoadDicFile() and
 *              "<prefix>.ctx" to LoadContext().
 * type:        tag type forwarded to SetTagType(); it also selects the
 *              POS code and the flag string stored in p.
 *
 * Returns TRUE on success.  Returns FALSE, without touching p, when
 * sConfigFile is NULL or too long for the local file-name buffer once the
 * 4-character suffix and the terminator are added.  The original code
 * copied the prefix unchecked and could overrun the 100-byte buffer.
 *
 * For tag types other than TT_PERSON, TT_TRANS_PERSON and TT_PLACE only
 * m_nPOS is reset (to 0); m_sUnknownFlags is left as it was.
 */
int Configure(pUnknowWord p,char *sConfigFile,enum TAG_TYPE type)
{
    char sFilename[100];

    /* ".dct"/".ctx" add 4 bytes, plus 1 for the terminating NUL. */
    if (sConfigFile == NULL || strlen(sConfigFile) + 5 > sizeof(sFilename))
    {
        return FALSE;
    }

    strcpy(sFilename, sConfigFile);
    strcat(sFilename, ".dct");
    LoadDicFile(p->m_dict, sFilename, FALSE);

    strcpy(sFilename, sConfigFile);
    strcat(sFilename, ".ctx");
    LoadContext(p->m_roleTag, sFilename);

    SetTagType(p->m_roleTag, type);
    switch (type)
    {
    case TT_PERSON:
    case TT_TRANS_PERSON:
        p->m_nPOS = -28274;
        strcpy(p->m_sUnknownFlags, "未##人");
        break;
    case TT_PLACE:
        p->m_nPOS = -28275;
        strcpy(p->m_sUnknownFlags, "未##地");
        break;
    default:
        p->m_nPOS = 0;
        break;
    }
    return TRUE;
}
/*
 * IsGivenName - decide whether a 4-byte string scores higher as a
 * two-part given name than as the alternative reading.
 *
 * sName must be exactly 4 bytes long (strlen), otherwise FALSE is
 * returned.  It is split into two 2-byte halves.  Two log-scores are
 * built from dictionary frequencies (GetFrequency) and context
 * statistics (GetContextFrequency / GetContextPossibility):
 *   - one using handles 2 and 3 for the halves,
 *   - one using handles 1 and 4 for the halves.
 * Returns FALSE when the 1/4 score is >= the 2/3 score, TRUE otherwise.
 */
int IsGivenName(pUnknowWord p,char *sName)
{
    char sFirstChar[3], sSecondChar[3];
    double dGivenNamePossibility = 0, dSingleNamePossibility = 0;
    double dTerm;

    if (strlen(sName) != 4)
    {
        return FALSE;
    }

    /* Length is exactly 4, so all four bytes are non-NUL and can be copied directly. */
    sFirstChar[0] = sName[0];
    sFirstChar[1] = sName[1];
    sFirstChar[2] = 0;
    sSecondChar[0] = sName[2];
    sSecondChar[1] = sName[3];
    sSecondChar[2] = 0;

    /* Score with handles 2 and 3. */
    dTerm = log((double)GetFrequency(p->m_dict,sFirstChar,2)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,2)+1.0);
    dGivenNamePossibility += dTerm;
    dTerm = log((double)GetFrequency(p->m_dict,sSecondChar,3)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,3)+1.0);
    dGivenNamePossibility += dTerm;
    dTerm = log(GetContextPossibility(p->m_roleTag->m_context,0,2,3)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,2)+1.0);
    dGivenNamePossibility += dTerm;

    /* Score with handles 1 and 4. */
    dTerm = log((double)GetFrequency(p->m_dict,sFirstChar,1)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,1)+1.0);
    dSingleNamePossibility += dTerm;
    dTerm = log((double)GetFrequency(p->m_dict,sSecondChar,4)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,4)+1.0);
    dSingleNamePossibility += dTerm;
    dTerm = log(GetContextPossibility(p->m_roleTag->m_context,0,1,4)+1.0)-log(GetContextFrequency(p->m_roleTag->m_context,0,1)+1.0);
    dSingleNamePossibility += dTerm;

    return (dSingleNamePossibility >= dGivenNamePossibility) ? FALSE : TRUE;
}
/*
 * GenerateWord - turn one vertex route into a list of words.
 *
 * Walks nSegRoute[nIndex] as a sequence of consecutive vertex pairs.  For
 * each pair the text of the span is built by concatenating
 * p->m_graphSeg->m_sAtom[] entries, and its value/POS are read from
 * p->m_graphSeg->m_segGraph.  Consecutive spans that IsAllNum() or
 * IsAllChineseNum() accept are merged into a single candidate, which is
 * then classified by the branches below.  Every resulting word is stored
 * in p->m_pWordSeg[nIndex][] and also written to p->m_graphOptimum with
 * SetElement().
 *
 * p:         segment object (reads m_graphSeg, writes m_pWordSeg[nIndex]
 *            and m_graphOptimum).
 * nSegRoute: table of routes; row nIndex is read until an entry or its
 *            successor is -1, or until the entries stop increasing.
 * nIndex:    row of nSegRoute / m_pWordSeg to process.
 *
 * The output list is terminated by an entry with an empty sWord and
 * nHandle == -1.  Always returns TRUE.
 *
 * Constants: 30464 == 'w'*256, 27904 == 'm'*256, 29696 == 't'*256
 * (presumably POS tags packed as char*256 -- confirm against the POS
 * encoding used by the dictionary).
 */
int GenerateWord(pSegment p,int **nSegRoute, int nIndex)
{
unsigned int i,k,nLen;
int j,nStartVertex,nEndVertex,nPOS;
char sAtom[WORD_MAXLENGTH],sNumCandidate[100],sCurWord[100];
double fValue;
/* i indexes the route, k indexes the output word list. */
i=0;
k=0;;
while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
{
nStartVertex=nSegRoute[nIndex][i];
j=nStartVertex;
nEndVertex=nSegRoute[nIndex][i+1];
nPOS=0;
GetElement(p->m_graphSeg->m_segGraph,nStartVertex,nEndVertex,&fValue,&nPOS,0);
/* Build the text of the span [nStartVertex, nEndVertex) from its atoms. */
sAtom[0]=0;
while(j<nEndVertex)
{
strcat(sAtom,p->m_graphSeg->m_sAtom[j]);
j++;
}
p->m_pWordSeg[nIndex][k].sWord[0]=0;
strcpy(sNumCandidate,sAtom);
/*
 * While the accumulated candidate is still numeric, accept it as the
 * current word, move on to the next route span and append that span's
 * text to the candidate.  On exit sWord holds the longest accepted
 * candidate (empty if none was accepted) and sAtom holds the span that
 * was appended last.
 */
while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
{
strcpy(p->m_pWordSeg[nIndex][k].sWord,sNumCandidate);
i++;
sAtom[0]=0;
while(j<nSegRoute[nIndex][i+1])
{
strcat(sAtom,p->m_graphSeg->m_sAtom[j]);
j++;
}
strcat(sNumCandidate,sAtom);
}
nLen=strlen(p->m_pWordSeg[nIndex][k].sWord);
/*
 * A 4-byte word for which CC_Find succeeds on the listed characters, or
 * a 1-byte word that is one of "+-./": keep the POS/value read from the
 * graph and step i back by one (it is incremented again at the bottom
 * of the loop).
 */
if(nLen==4&&CC_Find("第上成±—+∶·./",p->m_pWordSeg[nIndex][k].sWord)||nLen==1&&strchr("+-./",p->m_pWordSeg[nIndex][k].sWord[0]))
{
strcpy(sCurWord,p->m_pWordSeg[nIndex][k].sWord);
i--;
}
/* Nothing numeric was accepted: the span itself is the word. */
else if(p->m_pWordSeg[nIndex][k].sWord[0]==0)
{
strcpy(p->m_pWordSeg[nIndex][k].sWord,sAtom);
strcpy(sCurWord,sAtom);
}
else
{
/* The accepted word is just a dash ("--", "—" or "-"). */
if(strcmp("--",p->m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",p->m_pWordSeg[nIndex][k].sWord)==0||p->m_pWordSeg[nIndex][k].sWord[0]=='-'&&p->m_pWordSeg[nIndex][k].sWord[1]==0)
{
/*
 * NOTE(review): sCurWord is not assigned on this path, so the
 * SetElement call at the bottom of the loop receives whatever the
 * buffer held before (indeterminate on the first iteration) -- confirm
 * whether sWord should be copied into it here.
 */
nPOS=30464;
i--;
}
else
{
/*
 * Extract the leading character of the word into sInitChar: two bytes
 * when the first byte is negative as a char, one byte otherwise.
 * nCharIndex ends up as that character's byte length.
 */
char sInitChar[3];
unsigned int nCharIndex=0;
sInitChar[nCharIndex]=p->m_pWordSeg[nIndex][k].sWord[nCharIndex];
if(sInitChar[nCharIndex]<0)
{
nCharIndex+=1;
sInitChar[nCharIndex]=p->m_pWordSeg[nIndex][k].sWord[nCharIndex];
}
nCharIndex+=1;
sInitChar[nCharIndex]='\0';
/*
 * If the previous word's handle is +/-27904 or +/-29696 and this word
 * starts with a dash followed by more text, split it: the remainder is
 * moved to slot k+1 (handle 27904), the dash stays in slot k (handle
 * 30464, value 0) and is written to the optimum graph as a one-vertex
 * span; processing then continues on slot k+1.
 */
if(k>0&&(abs(p->m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(p->m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(p->m_pWordSeg[nIndex][k].sWord)>nCharIndex))
{
strcpy(p->m_pWordSeg[nIndex][k+1].sWord,p->m_pWordSeg[nIndex][k].sWord+nCharIndex);
p->m_pWordSeg[nIndex][k+1].dValue=p->m_pWordSeg[nIndex][k].dValue;
p->m_pWordSeg[nIndex][k+1].nHandle=27904;
p->m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
p->m_pWordSeg[nIndex][k].dValue=0;
p->m_pWordSeg[nIndex][k].nHandle=30464;
SetElement(p->m_graphOptimum,nStartVertex,nStartVertex+1,p->m_pWordSeg[nIndex][k].dValue,p->m_pWordSeg[nIndex][k].nHandle,p->m_pWordSeg[nIndex][k].sWord);
nStartVertex+=1;
k+=1;
}
nLen=strlen(p->m_pWordSeg[nIndex][k].sWord);
/*
 * The span following the number is a 2-byte unit found in "月日时分秒",
 * or "月份": append it to the word and label the result "未##时" with
 * POS -29696.
 */
if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
{
strcat(p->m_pWordSeg[nIndex][k].sWord,sAtom);
strcpy(sCurWord,"未##时");
nPOS=-29696;
}
/* Followed by "年": a time expression only if IsYearTime accepts the number. */
else if(strcmp(sAtom,"年")==0)
{
if(IsYearTime(p->m_pWordSeg[nIndex][k].sWord))
{
strcat(p->m_pWordSeg[nIndex][k].sWord,sAtom);
strcpy(sCurWord,"未##时");
nPOS=-29696;
}
else
{
/* Plain number; step i back so the following span is processed on its own. */
strcpy(sCurWord,"未##数");
nPOS=-27904;
i--;
}
}
else
{
/*
 * NOTE(review): the "-2" offsets below assume the word is at least
 * two bytes long; for a shorter word the pointer would start before
 * the buffer -- confirm that this cannot happen here.
 */
/* Word ends with "点": labelled as a time expression. */
if(strcmp(p->m_pWordSeg[nIndex][k].sWord+strlen(p->m_pWordSeg[nIndex][k].sWord)-2,"点")==0)
{
strcpy(sCurWord,"未##时");
nPOS=-29696;
}
else
{
/* No trailing separator (none of "∶·./", '.' or '/'): plain number. */
if(!CC_Find("∶·./",p->m_pWordSeg[nIndex][k].sWord+nLen-2)&&p->m_pWordSeg[nIndex][k].sWord[nLen-1]!='.'&&p->m_pWordSeg[nIndex][k].sWord[nLen-1]!='/')
{
strcpy(sCurWord,"未##数");
nPOS=-27904;
}
/*
 * Trailing separator and the word is longer than its first character:
 * cut the separator off (1 byte for '.' or '/', otherwise 2 bytes),
 * label the rest as a number and step i back once more.
 */
else if(nLen>strlen(sInitChar))
{
if(p->m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||p->m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
p->m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
else
p->m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
strcpy(sCurWord,"未##数");
nPOS=-27904;
i--;
}
/* NOTE(review): if neither branch above is taken, sCurWord and nPOS keep their previous contents. */
}
/* The span appended last was not consumed; step i back to it. */
i--;
}
}
/* Merged/relabelled words get value 0 and end at the vertex following the adjusted i. */
fValue=0;
nEndVertex=nSegRoute[nIndex][i+1];
}
/* Record the word in slot k and mirror it into the optimum graph. */
p->m_pWordSeg[nIndex][k].nHandle=nPOS;
p->m_pWordSeg[nIndex][k].dValue=fValue;
SetElement(p->m_graphOptimum,nStartVertex,nEndVertex,fValue,nPOS,sCurWord);
i++;
k++;
}
/* Terminator entry: empty word, handle -1. */
p->m_pWordSeg[nIndex][k].sWord[0]=0;
p->m_pWordSeg[nIndex][k].nHandle=-1;
return TRUE;
}
/*
 * InSegment - segment a sentence using the atom/word graph and the
 * N-shortest-path search.
 *
 * p:            segment object; m_graphSeg, m_graphOptimum,
 *               m_nSegmentCount and m_pWordSeg are updated.
 * sSentence:    text to segment.
 * dictCore:     dictionary handed to GenerateWordNet().
 * nResultCount: number of paths requested from the path search.
 *
 * Steps: build the word net, run ShortPath()/OutputPath() into a
 * temporary route table (MAX_SEGMENT_NUM rows of MAX_SENTENCE_LEN/2 ints,
 * zero-filled), empty m_graphOptimum, then call GenerateWord() once per
 * reported segmentation.  Returns TRUE unconditionally; allocation
 * results are not checked.
 *
 * NOTE(review): the path object receives p->m_graphSeg->m_segGraph as its
 * cost array and UNShortPath() is called before GenerateWord() reads that
 * graph -- confirm that UNShortPath() does not release the array it was
 * given.
 */
int InSegment(pSegment p,char *sSentence,pDictionary dictCore,int nResultCount)
{
    int nRoute;
    int **nSegRoute;
    pNShortPath sp;

    /* Temporary route table. */
    nSegRoute = (int **)malloc(sizeof(int *) * MAX_SEGMENT_NUM);
    nRoute = 0;
    while (nRoute < MAX_SEGMENT_NUM)
    {
        nSegRoute[nRoute] = (int *)malloc((MAX_SENTENCE_LEN/2) * sizeof(int));
        memset(nSegRoute[nRoute], 0, MAX_SENTENCE_LEN/2*sizeof(int));
        nRoute++;
    }

    SetRowFirst(p->m_graphSeg->m_segGraph, FALSE);
    SetRowFirst(p->m_graphOptimum, FALSE);
    GenerateWordNet(p->m_graphSeg, sSentence, dictCore, FALSE);

    /* Path search over the segmentation graph. */
    sp = (pNShortPath)malloc(sizeof(struct NShortPath));
    INShortPath(sp, p->m_graphSeg->m_segGraph, nResultCount);
    ShortPath(sp);
    OutputPath(sp, nSegRoute, FALSE, &p->m_nSegmentCount);
    SetEmpty(p->m_graphOptimum);
    UNShortPath(sp);
    free(sp);
    sp = 0;

    /* One word list per reported route. */
    for (nRoute = 0; nRoute < p->m_nSegmentCount; nRoute++)
    {
        GenerateWord(p, nSegRoute, nRoute);
    }

    nRoute = 0;
    while (nRoute < MAX_SEGMENT_NUM)
    {
        free(nSegRoute[nRoute]);
        nRoute++;
    }
    free(nSegRoute);
    nSegRoute = 0;
    return TRUE;
}
/*
 * BiGraphGenerate - build the word-pair (bigram) cost graph from a word
 * graph.
 *
 * p:               segment object; m_nWordCount and m_npWordPosMapTable
 *                  are rebuilt (any previous table is freed first).
 * aWord:           source word graph, walked as a chain from GetHead().
 * aBinaryWordNet:  destination graph; one element is set per pair of
 *                  adjacent words.
 * dSmoothingPara:  weight of the single-word term in the cost formula.
 * DictBinary:      dictionary queried for the pair frequency (handle 3).
 * DictCore:        dictionary queried for the word frequency (handle 2)
 *                  when the word's nPOS is negative.
 *
 * Every word is given an index equal to its position in the chain; the
 * table stores row*MAX_SENTENCE_LEN+col for each index and BinarySearch()
 * maps a packed position back to its index.  For each word, the chain
 * entries returned by GetElementValue() whose row equals the word's col
 * are treated as its successors.
 *
 * Returns TRUE unconditionally.  The table allocation is not checked and
 * the concatenation into the local WORD_MAXLENGTH buffer is unbounded.
 */
int BiGraphGenerate(pSegment p,pDynamicArray aWord, pDynamicArray aBinaryWordNet,double dSmoothingPara,pDictionary DictBinary,pDictionary DictCore)
{
    PARRAY_CHAIN pTail, pWord, pFollower;
    unsigned int nFromIndex, nToIndex, nFilled = 0;
    unsigned int nPairFreq = 0;
    double dWordFreq, dCost, dFloor;
    char sPair[WORD_MAXLENGTH];

    memset(sPair, 0, WORD_MAXLENGTH);

    /* Rebuild the index -> packed position table. */
    p->m_nWordCount = GetTail(aWord, &pTail);
    if (p->m_npWordPosMapTable)
    {
        free(p->m_npWordPosMapTable);
        p->m_npWordPosMapTable = 0;
    }
    if (p->m_nWordCount > 0)
    {
        p->m_npWordPosMapTable = (int *)malloc(sizeof(int) * p->m_nWordCount);
    }
    for (pWord = GetHead(aWord); pWord != 0; pWord = pWord->next)
    {
        p->m_npWordPosMapTable[nFilled] = pWord->row*MAX_SENTENCE_LEN+pWord->col;
        nFilled++;
    }

    /* Emit one edge per (word, successor) pair. */
    for (pWord = GetHead(aWord); pWord != 0; pWord = pWord->next)
    {
        if (pWord->nPOS >= 0)
        {
            dWordFreq = pWord->value;
        }
        else
        {
            dWordFreq = GetFrequency(DictCore, pWord->sWord, 2);
        }

        GetElementValue(aWord, pWord->col, -1, pWord, &pFollower);
        for (; pFollower && pFollower->row == pWord->col; pFollower = pFollower->next)
        {
            /* Dictionary key: first word, separator, second word. */
            strcpy(sPair, pWord->sWord);
            strcat(sPair, WORD_SEGMENTER);
            strcat(sPair, pFollower->sWord);
            nPairFreq = GetFrequency(DictBinary, sPair, 3);

            /* Negative log of the smoothed probability. */
            dFloor = (double)1/MAX_FREQUENCE;
            dCost = -log(dSmoothingPara*(1+dWordFreq)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dFloor)*nPairFreq/(1+dWordFreq)+dFloor));
            if (pWord->nPOS < 0)
            {
                dCost += pWord->value;
            }

            nFromIndex = BinarySearch(pWord->row*MAX_SENTENCE_LEN+pWord->col, p->m_npWordPosMapTable, p->m_nWordCount);
            nToIndex = BinarySearch(pFollower->row*MAX_SENTENCE_LEN+pFollower->col, p->m_npWordPosMapTable, p->m_nWordCount);
            SetElement(aBinaryWordNet, nFromIndex, nToIndex, dCost, pWord->nPOS, 0);
        }
    }
    return TRUE;
}
/*
 * BiPath2UniPath - convert, in place, a path of word indices into a path
 * of vertex numbers.
 *
 * p:      segment object supplying m_npWordPosMapTable / m_nWordCount.
 * npPath: -1 terminated list of word indices; overwritten with the
 *         result, which is again -1 terminated.
 *
 * Each index is replaced by packed/MAX_SENTENCE_LEN of its table entry.
 * Conversion stops at -1 or at the first index that is not below
 * m_nWordCount.  If the last packed value seen is greater than 0, its
 * packed%MAX_SENTENCE_LEN is appended, so the output can be one element
 * longer than the input.
 *
 * Returns FALSE when no table has been built, TRUE otherwise.
 */
int BiPath2UniPath(pSegment p,int *npPath)
{
    int nStep;
    int nPacked = -1;

    if (p->m_npWordPosMapTable == 0)
    {
        return FALSE;
    }

    for (nStep = 0; npPath[nStep] != -1 && npPath[nStep] < p->m_nWordCount; nStep++)
    {
        nPacked = p->m_npWordPosMapTable[npPath[nStep]];
        npPath[nStep] = nPacked / MAX_SENTENCE_LEN;
    }

    /* Close the path with the column of the last word. */
    if (nPacked > 0)
    {
        npPath[nStep] = nPacked % MAX_SENTENCE_LEN;
        nStep++;
    }
    npPath[nStep] = -1;
    return TRUE;
}
/*
 * BiSegment - segment a sentence using the word-pair (bigram) graph.
 *
 * p:              segment object; m_graphSeg, m_graphOptimum,
 *                 m_nSegmentCount, m_pWordSeg and the word position table
 *                 are updated.
 * sSentence:      text to segment.
 * dSmoothingPara: smoothing weight forwarded to BiGraphGenerate().
 * dictCore:       core dictionary.
 * dictBinary:     word-pair dictionary.
 * nResultCount:   number of paths requested from the path search.
 *
 * Steps: build the word net, derive the bigram graph, run the path
 * search over it, then for every reported route map word indices back to
 * vertices (BiPath2UniPath) and build the word list (GenerateWord).
 * Returns TRUE unconditionally; allocation results are not checked.
 *
 * Route buffers: the original allocated sizeof(int*)*(nLen/2) bytes per
 * route but filled only sizeof(int)*(nLen/2) bytes with the -1 sentinel,
 * so the element type was wrong and, where pointers are wider than int,
 * the tail of each buffer was left uninitialised.  Each route now holds
 * nLen ints, all preset to -1.  This is never smaller than the original
 * allocation as long as sizeof(int*) <= 2*sizeof(int).
 *
 * NOTE(review): aBiwordsNet is not freed here; presumably UNShortPath()
 * releases the cost array it was given -- confirm.
 */
int BiSegment(pSegment p,char *sSentence, double dSmoothingPara, pDictionary dictCore, pDictionary dictBinary, unsigned int nResultCount)
{
    int **nSegRoute;
    unsigned int nLen;
    int i;
    pNShortPath sp;
    pDynamicArray aBiwordsNet;

    nSegRoute = (int **)malloc(sizeof(int *) * MAX_SEGMENT_NUM);
    nLen = strlen(sSentence) + 10;
    for (i = 0; i < MAX_SEGMENT_NUM; i++)
    {
        /* Allocation size and sentinel fill now cover the same nLen ints. */
        nSegRoute[i] = (int *)malloc(sizeof(int) * nLen);
        memset(nSegRoute[i], -1, sizeof(int) * nLen);
    }

    GenerateWordNet(p->m_graphSeg, sSentence, dictCore, TRUE);

    aBiwordsNet = (pDynamicArray)malloc(sizeof(struct DynamicArray));
    IDynamicArray(aBiwordsNet, FALSE);
    BiGraphGenerate(p, p->m_graphSeg->m_segGraph, aBiwordsNet, dSmoothingPara, dictBinary, dictCore);

    sp = (pNShortPath)malloc(sizeof(struct NShortPath));
    INShortPath(sp, aBiwordsNet, nResultCount);
    ShortPath(sp);
    OutputPath(sp, nSegRoute, FALSE, &p->m_nSegmentCount);
    UNShortPath(sp);
    free(sp);
    sp = NULL;

    SetEmpty(p->m_graphOptimum);
    i = 0;
    while (i < p->m_nSegmentCount)
    {
        /* Word indices -> vertex numbers, then vertex route -> words. */
        BiPath2UniPath(p, nSegRoute[i]);
        GenerateWord(p, nSegRoute, i);
        i++;
    }

    for (i = 0; i < MAX_SEGMENT_NUM; i++)
    {
        free(nSegRoute[i]);
        nSegRoute[i] = NULL;
    }
    free(nSegRoute);
    nSegRoute = NULL;
    aBiwordsNet = NULL;
    return TRUE;
}
/*
 * BiOptimumSegment - run the bigram path search over the optimum graph.
 *
 * p:              segment object; m_graphOptimum is the input graph and
 *                 is emptied and refilled; m_nSegmentCount, m_pWordSeg
 *                 and the word position table are updated.
 * nResultCount:   number of paths requested from the path search.
 * dSmoothingPara: smoothing weight forwarded to BiGraphGenerate().
 * dictBinary:     word-pair dictionary.
 * dictCore:       core dictionary.
 *
 * Returns TRUE unconditionally; allocation results are not checked.
 *
 * NOTE(review): m_graphSeg->m_segGraph is overwritten with the
 * m_graphOptimum pointer (the previous pointer value is dropped) and the
 * shared array is then emptied before GenerateWord() reads m_segGraph --
 * confirm this aliasing is intended.
 * NOTE(review): aBiwordsNet is not freed here; presumably UNShortPath()
 * releases the cost array it was given -- confirm.
 */
int BiOptimumSegment(pSegment p,unsigned int nResultCount,double dSmoothingPara, pDictionary dictBinary, pDictionary dictCore)
{
    int **nSegRoute;
    int nRoute;
    pNShortPath sp;
    pDynamicArray aBiwordsNet;

    /* Route table, every entry preset to the -1 sentinel. */
    nSegRoute = (int **)malloc(sizeof(int *) * MAX_SEGMENT_NUM);
    nRoute = 0;
    while (nRoute < MAX_SEGMENT_NUM)
    {
        nSegRoute[nRoute] = (int *)malloc(sizeof(int) * (MAX_SENTENCE_LEN/2));
        memset(nSegRoute[nRoute], -1, MAX_SENTENCE_LEN/2*sizeof(int));
        nRoute++;
    }

    aBiwordsNet = (pDynamicArray)malloc(sizeof(struct DynamicArray));
    IDynamicArray(aBiwordsNet, FALSE);
    BiGraphGenerate(p, p->m_graphOptimum, aBiwordsNet, dSmoothingPara, dictBinary, dictCore);

    sp = (pNShortPath)malloc(sizeof(struct NShortPath));
    INShortPath(sp, aBiwordsNet, nResultCount);
    ShortPath(sp);
    OutputPath(sp, nSegRoute, FALSE, &p->m_nSegmentCount);
    UNShortPath(sp);
    free(sp);
    sp = 0;

    p->m_graphSeg->m_segGraph = p->m_graphOptimum;
    SetEmpty(p->m_graphOptimum);

    for (nRoute = 0; nRoute < p->m_nSegmentCount; nRoute++)
    {
        BiPath2UniPath(p, nSegRoute[nRoute]);
        GenerateWord(p, nSegRoute, nRoute);
    }

    nRoute = 0;
    while (nRoute < MAX_SEGMENT_NUM)
    {
        free(nSegRoute[nRoute]);
        nSegRoute[nRoute] = 0;
        nRoute++;
    }
    free(nSegRoute);
    nSegRoute = 0;
    aBiwordsNet = 0;
    return TRUE;
}
/*
 * ISegment - initialise a segment object.
 *
 * Allocates MAX_SEGMENT_NUM result rows of MAX_WORDS entries each, a
 * segmentation graph (ISegGraph) and the optimum graph (IDynamicArray
 * with TRUE, then SetRowFirst TRUE), and clears the word position table
 * and word count.  Allocation results are not checked.
 */
void ISegment(pSegment p)
{
    int nRow = 0;

    p->m_pWordSeg = (PWORD_RESULT *)malloc(MAX_SEGMENT_NUM*sizeof(PWORD_RESULT));
    while (nRow < MAX_SEGMENT_NUM)
    {
        p->m_pWordSeg[nRow] = (PWORD_RESULT)malloc(sizeof(WORD_RESULT)*MAX_WORDS);
        nRow++;
    }

    p->m_graphSeg = (pSegGraph)malloc(sizeof(struct SegGraph));
    ISegGraph(p->m_graphSeg);

    p->m_graphOptimum = (pDynamicArray)malloc(sizeof(struct DynamicArray));
    IDynamicArray(p->m_graphOptimum, TRUE);

    p->m_npWordPosMapTable = 0;
    p->m_nWordCount = 0;
    SetRowFirst(p->m_graphOptimum, TRUE);
}
/*
 * USegment - release the resources held by a segment object.
 *
 * Frees every result row and the row table, the word position table
 * built by BiGraphGenerate(), and the segmentation graph (after
 * USegGraph).  The freed members are cleared to 0.
 *
 * The original teardown never released m_npWordPosMapTable, so the table
 * leaked once any bigram segmentation had run.  free() on a null pointer
 * is a no-op, so no guard is needed when the table was never built.
 *
 * NOTE(review): m_graphOptimum, allocated in ISegment(), is still not
 * released here.  BiOptimumSegment() stores the same pointer in
 * m_graphSeg->m_segGraph, so freeing it in this function could be a
 * double free if USegGraph() already releases m_segGraph -- confirm
 * ownership before adding that.
 */
void USegment(pSegment p)
{
    int i;

    for (i = 0; i < MAX_SEGMENT_NUM; i++)
    {
        free(p->m_pWordSeg[i]);
        p->m_pWordSeg[i] = 0;
    }
    free(p->m_pWordSeg);
    p->m_pWordSeg = 0;

    /* Word index -> position table from BiGraphGenerate(). */
    free(p->m_npWordPosMapTable);
    p->m_npWordPosMapTable = 0;
    p->m_nWordCount = 0;

    USegGraph(p->m_graphSeg);
    free(p->m_graphSeg);
    p->m_graphSeg = 0;
}
v
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -