📄 segment(sen).c
字号:
{
i++;
p->m_sWords[i][0]=0;
break;
}
if(!bSplit)
nWordsIndex++;
}
if(pWordItems[nWordsIndex].sWord[0]==0)
nRetPos=-1;
if(p->m_nTags[i-1][1]!=-1)
{
if(p->m_tagType!=TT_NORMAL)
p->m_nTags[i][0]=101;
else
p->m_nTags[i][0]=1;
p->m_dFrequency[i][0]=0;
p->m_sWords[i][0]=0;
p->m_nTags[i++][1]=-1;
}
p->m_nCurLength=i;
if(nRetPos!=-1)
return nWordsIndex+1;
return -1;
}
/*
 * SetTagType - store a new tag type in a span.
 *
 * p:     span whose m_tagType field is overwritten.
 * nType: value written to p->m_tagType.
 */
void SetTagType(pSpan p,enum TAG_TYPE nType)
{
    (*p).m_tagType = nType;
}
/*
 * PlaceRecognize - scan the best-tag sequence of a span and record candidate
 * unknown-word ranges together with a score.
 *
 * The scan starts at index 1 of p->m_nBestTag and stops at the first entry
 * that is not greater than -1. A tag of 1 or 2 opens a candidate:
 *   - tag 1: swallow a run of 1s, then a run of 2s, then a run of 3s;
 *   - tag 2: swallow a run of 2s, then a run of 3s.
 * Each candidate stores the word positions of its first index and of the
 * index just past its end in p->m_nUnknownWords, and its score
 * (ComputePossibility(...) + log(penalty)) in p->m_dWordsPossibility, and
 * advances p->m_nUnknownIndex.
 *
 * p:         span being analysed.
 * dictCore:  not referenced by this function.
 * placeDict: dictionary forwarded to ComputePossibility().
 *
 * Returns TRUE unconditionally.
 */
int PlaceRecognize(pSpan p,pDictionary dictCore,pDictionary placeDict)
{
    int nFirst = 1;
    int nLast = 1;
    int nPos = 1;
    int nMark;
    /* The penalty is accumulated over the whole call; it is never reset
     * between candidates. */
    double dPenalty = 1.0;

    while (p->m_nBestTag[nPos] > -1) {
        const int bTagOne = (p->m_nBestTag[nPos] == 1);
        const int bTagTwo = !bTagOne && (p->m_nBestTag[nPos] == 2);

        if (bTagOne || bTagTwo) {
            nFirst = nPos;
            nLast = nFirst + 1;

            if (bTagOne) {
                /* Run of 1s: every 1 after the second one costs a penalty. */
                for (; p->m_nBestTag[nLast] == 1; nLast++) {
                    if (nLast > nFirst + 1) {
                        dPenalty += 1.0;
                    }
                }
            } else {
                /* A candidate that starts directly on a 2 is penalised once. */
                dPenalty += 1.0;
            }

            /* Run of 2s. */
            while (p->m_nBestTag[nLast] == 2) {
                nLast++;
            }

            /* Run of 3s: every 3 after the first one costs a penalty. */
            nMark = nLast;
            for (; p->m_nBestTag[nLast] == 3; nLast++) {
                if (nLast > nMark) {
                    dPenalty += 1.0;
                }
            }
        }

        if (nLast > nFirst) {
            p->m_nUnknownWords[p->m_nUnknownIndex][0] = p->m_nWordPosition[nFirst];
            p->m_nUnknownWords[p->m_nUnknownIndex][1] = p->m_nWordPosition[nLast];
            p->m_dWordsPossibility[p->m_nUnknownIndex++]=ComputePossibility(p,nFirst,nLast-nFirst+1,placeDict)+log(dPenalty);
            nFirst = nLast;
        }

        /* Jump past a consumed candidate, otherwise step by one. */
        nPos = (nPos < nLast) ? nLast : nPos + 1;
    }
    return TRUE;
}
/*
 * POSTagging - tag a word list segment by segment.
 *
 * Repeats until GetFrom() returns a negative index or the next word item has
 * an empty sWord:
 *   1. GetFrom() loads a segment starting at the current index and returns
 *      the index to continue from;
 *   2. GetBestPOS() computes p->m_nBestTag for the segment;
 *   3. depending on p->m_tagType, either the best tags are written back to
 *      the word items (TT_NORMAL), or PersonRecognize() (TT_PERSON) or
 *      PlaceRecognize() (TT_PLACE, TT_TRANS_PERSON) is run;
 *   4. Reset(p, TRUE) prepares the span for the next segment.
 *
 * p:           span used as working state.
 * pWordItems:  word list terminated by an item whose sWord is empty.
 * dictCore:    dictionary used for IsExist()/GetFrequency() and passed on.
 * dictUnknown: dictionary passed to GetFrom() and the recognizers.
 *
 * Returns TRUE unconditionally.
 */
int POSTagging(pSpan p,PWORD_RESULT pWordItems,pDictionary dictCore,pDictionary dictUnknown)
{
    int nCursor = 0;   /* next word item to read; a negative value ends the loop */
    int nSegStart;     /* index of the first word item of the current segment */
    int nTagIdx;

    Reset(p, FALSE);
    while (nCursor > -1 && pWordItems[nCursor].sWord[0] != 0) {
        nSegStart = nCursor;
        nCursor = GetFrom(p, pWordItems, nSegStart, dictCore, dictUnknown);
        GetBestPOS(p);

        if (p->m_tagType == TT_NORMAL) {
            /* Tag slot k (k >= 1) belongs to word item k + nSegStart - 1. */
            for (nTagIdx = 1;
                 p->m_nBestTag[nTagIdx] != -1 && nTagIdx < p->m_nCurLength;
                 nTagIdx += 1) {
                PWORD_RESULT pItem = &pWordItems[nTagIdx + nSegStart - 1];

                pItem->nHandle = p->m_nBestTag[nTagIdx];
                /* Items with a positive value that exist in the core
                 * dictionary get their value replaced by the frequency
                 * looked up for the chosen tag. */
                if (pItem->dValue > 0 && IsExist(dictCore, pItem->sWord, -1)) {
                    pItem->dValue = GetFrequency(dictCore, pItem->sWord, p->m_nBestTag[nTagIdx]);
                }
            }
        } else if (p->m_tagType == TT_PERSON) {
            PersonRecognize(p, dictUnknown);
        } else if (p->m_tagType == TT_PLACE || p->m_tagType == TT_TRANS_PERSON) {
            PlaceRecognize(p, dictCore, dictUnknown);
        }

        Reset(p, TRUE);
    }
    return TRUE;
}
/*
 * ISpan - initialise a span structure.
 *
 * Sets the tag type to TT_NORMAL, writes the begin entry of the tag table
 * (tag 0, terminator -1, frequency 0, empty word), resets the length,
 * unknown-word index and start position, and allocates and initialises the
 * context state.
 *
 * p: span to initialise; its previous contents are not read.
 *
 * On allocation failure p->m_context is left NULL and IContextState() is not
 * called; every other field is still initialised.
 */
void ISpan(pSpan p)
{
    /* Set the tag type before anything derives from it. The begin tag used to
     * be chosen by reading m_tagType before it had been assigned, i.e. from
     * indeterminate memory on a fresh structure. The type always ends up as
     * TT_NORMAL here, so the matching begin tag is 0. */
    p->m_tagType=TT_NORMAL;
    p->m_nTags[0][0]=0;

    p->m_context=(pContextState)malloc(sizeof(struct ContextState));
    if(p->m_context!=NULL)
        IContextState(p->m_context);

    p->m_nTags[0][1]=-1;
    p->m_dFrequency[0][0]=0;
    p->m_nCurLength=1;
    p->m_nUnknownIndex=0;
    p->m_nStartPos=0;
    p->m_nWordPosition[1]=0;
    p->m_sWords[0][0]=0;
}
/*
 * AtomSegment - split a sentence into atoms stored in p.
 *
 * For every atom j the function fills p->m_sAtom[j] (text),
 * p->m_nAtomLength[j] (byte length) and p->m_nAtomPOS[j] (character type),
 * and finally stores the number of atoms in p->m_nAtomCount.
 *
 * Rules implemented below:
 *   - a leading SENTENCE_BEGIN marker and any SENTENCE_END marker become one
 *     atom each (types CT_SENTENCE_BEGIN / CT_SENTENCE_END);
 *   - a byte with the high bit set is taken as the first byte of a two-byte
 *     character and is read together with the following byte;
 *   - characters typed CT_CHINESE, CT_INDEX, CT_DELIMITER or CT_OTHER form
 *     an atom on their own;
 *   - characters of any other type are merged with their successors for as
 *     long as the next character has the same type;
 *   - a '.' directly followed by a digit is typed CT_NUM.
 *
 * p:         segmentation state receiving the atoms.
 * sSentence: NUL-terminated input; it is only read.
 *
 * Returns TRUE unconditionally.
 *
 * NOTE(review): nothing here bounds j or the atom text against the capacity
 * of the p->m_sAtom arrays; confirm the caller limits the sentence length.
 */
int AtomSegment(pSegGraph p,char *sSentence)
{
    unsigned int i=0;                 /* byte offset into sSentence */
    unsigned int j=0;                 /* index of the atom being built */
    unsigned int nCurType,nNextType;
    /* The sentence is never written here, so its length is computed once
     * instead of on every loop test. */
    const size_t nSentenceLen=strlen(sSentence);
    const size_t nBeginLen=strlen(SENTENCE_BEGIN);
    const size_t nEndLen=strlen(SENTENCE_END);
    char sChar[3];                    /* current character: 1 or 2 bytes + NUL */

    sChar[2]=0;
    p->m_sAtom[j][0]=0;
    p->m_nAtomLength[j]=0;
    if(strncmp(sSentence,SENTENCE_BEGIN,nBeginLen)==0)
    {
        strcpy(p->m_sAtom[j],SENTENCE_BEGIN);
        p->m_nAtomLength[j]=nBeginLen;
        p->m_nAtomPOS[j]=CT_SENTENCE_BEGIN;
        i+=p->m_nAtomLength[j];
        j+=1;
        p->m_sAtom[j][0]=0;
        p->m_nAtomLength[j]=0;
    }
    while(i<nSentenceLen)
    {
        if(strncmp(sSentence+i,SENTENCE_END,nEndLen)==0)
        {
            strcpy(p->m_sAtom[j],SENTENCE_END);
            p->m_nAtomLength[j]=nEndLen;
            p->m_nAtomPOS[j]=CT_SENTENCE_END;
            i+=p->m_nAtomLength[j];
            j+=1;
            p->m_sAtom[j][0]=0;
            p->m_nAtomLength[j]=0;
            continue;
        }
        sChar[0]=*(sSentence+i);
        sChar[1]=0;
        i+=1;
        /* Test the high bit through unsigned char: whether plain char is
         * signed is implementation-defined, so "sChar[0] < 0" never fires on
         * platforms where char is unsigned. */
        if((unsigned char)sChar[0]>=0x80)
        {
            /* Only consume a second byte that exists; a lead byte at the very
             * end of the string must not move i past the terminator. */
            if(i<nSentenceLen)
            {
                sChar[1]=*(sSentence+i);
                i+=1;
            }
        }
        strcat(p->m_sAtom[j],sChar);
        nCurType=charType((unsigned char *)sChar);
        /* A '.' followed by a digit is part of a number. */
        if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
            nCurType=CT_NUM;
        p->m_nAtomPOS[j]=nCurType;
        if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
        {
            /* These types always close the atom after one character. */
            p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
            j+=1;
            p->m_sAtom[j][0]=0;
        }
        else
        {
            /* Other types are merged while the next character has the same
             * type; 255 stands for "no next character". */
            nNextType=255;
            if(i<nSentenceLen)
                nNextType=charType((unsigned char *)(sSentence+i));
            if(nNextType!=nCurType||i==nSentenceLen)
            {
                p->m_nAtomLength[j]=strlen(p->m_sAtom[j]);
                j+=1;
                p->m_sAtom[j][0]=0;
            }
        }
    }
    p->m_nAtomCount=j;
    return TRUE;
}
/*
 * GenerateWordNet - fill p->m_segGraph with candidate word edges for a
 * sentence.
 *
 * Steps visible in this function:
 *   1. Empty the graph (when p->m_segGraph is non-null) and run
 *      AtomSegment() on the sentence.
 *   2. Add one edge i -> i+1 for every atom.
 *   3. For every start atom i, grow a candidate string by appending the
 *      following atoms for as long as GetMaxMatch() succeeds, adding an edge
 *      i -> j whenever the candidate equals the match it returned.
 *
 * p:             segmentation state (atoms, atom types, m_segGraph).
 * sSentence:     input sentence, handed to AtomSegment().
 * dictCore:      dictionary queried through GetMaxMatch() and GetHandle().
 * bOriginalFreq: zero stores log-based edge values with a null word pointer;
 *                non-zero stores the raw value together with the word text.
 *
 * Returns TRUE unconditionally.
 *
 * NOTE(review): p->m_segGraph is null-checked before SetEmpty() but passed to
 * SetElement() unconditionally afterwards; confirm SetElement tolerates null.
 * NOTE(review): sWord is filled with unbounded strcpy/strcat; nothing here
 * checks the result against WORD_MAXLENGTH.
 */
int GenerateWordNet(pSegGraph p,char *sSentence,pDictionary dictCore,int bOriginalFreq)
{
unsigned int j,i=0;
char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
int nHandleTemp,k,nPOS;
int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
double dValue=0;
p->m_nAtomCount=0;
if(p->m_segGraph){
SetEmpty(p->m_segGraph);
}
AtomSegment(p,sSentence);
/* Pass 1: one edge per atom, from node i to node i+1. */
for(i=0;i<p->m_nAtomCount;i++)
{
if(p->m_nAtomPOS[i]==CT_CHINESE)
{
/* CT_CHINESE atom: value log(MAX_FREQUENCE) with no word text, or value 0
 * with the atom text, depending on bOriginalFreq; the POS argument is 0. */
if(!bOriginalFreq)
SetElement(p->m_segGraph,i,i+1,log(MAX_FREQUENCE),0,0);
else
SetElement(p->m_segGraph,i,i+1,0,0,p->m_sAtom[i]);
}
else
{
/* Any other atom: start from the atom text and MAX_FREQUENCE; the cases
 * below may replace the text with a placeholder and the value with 0. */
strcpy(sWord,p->m_sAtom[i]);
dValue=MAX_FREQUENCE;
switch(p->m_nAtomPOS[i])
{
case CT_INDEX:
case CT_NUM:
/* -27904 equals -('m'*256) in ASCII; presumably a POS handle for
 * numerals - TODO confirm. */
nPOS=-27904;
strcpy(sWord,"未##数");
dValue=0;
break;
case CT_DELIMITER:
/* 30464 equals 'w'*256 in ASCII; presumably a POS handle for punctuation -
 * TODO confirm. The atom text and MAX_FREQUENCE are kept. */
nPOS=30464;
break;
case CT_LETTER:
nPOS=-'n'*256-'x';
dValue=0;
strcpy(sWord,"未##串");
break;
case CT_SINGLE:
/* Uses the number placeholder when GetCharCount() over the set
 * "+-1234567890" equals the atom's byte length, otherwise the string
 * placeholder. Presumably GetCharCount counts the atom's characters that
 * belong to the set - confirm. */
if(GetCharCount("+-1234567890",p->m_sAtom[i])==(int)strlen(p->m_sAtom[i]))
{
nPOS=-27904;
strcpy(sWord,"未##数");
}
else
{
nPOS=-'n'*256-'x';
strcpy(sWord,"未##串");
}
dValue=0;
break;
default:
/* Remaining types use the atom type itself as the POS value. */
nPOS=p->m_nAtomPOS[i];
break;
}
/* Without original frequencies the edge value is 0 and no text is stored,
 * so sWord and dValue computed above are not used on that path. */
if(!bOriginalFreq)
SetElement(p->m_segGraph,i,i+1,0,nPOS,0);
else
SetElement(p->m_segGraph,i,i+1,dValue,nPOS,sWord);
}
}
/* Pass 2: multi-atom candidates starting at every atom i. */
i=0;
while(i<p->m_nAtomCount)
{
strcpy(sWord,p->m_sAtom[i]);
j=i+1;
/* When the atom is "月" and the next atom is "份", j starts at i+2 instead
 * of i+1. */
if(strcmp(sWord,"月")==0&&strcmp(p->m_sAtom[i+1],"份")==0)
j+=1;
/* Presumably GetMaxMatch() succeeds while sWord is a prefix of some
 * dictionary entry and copies that entry to sWordMatch - confirm. */
while(j<=p->m_nAtomCount&&GetMaxMatch(dictCore,sWord,sWordMatch,&nHandleTemp))
{
if(strcmp(sWordMatch,sWord)==0)
{
/* Exact match: sum the frequencies of all handles reported for the word. */
nTotalFreq=0;
GetHandle(dictCore,sWord,&nMatchCount,nMatchHandle,nMatchFreq);
for(k=0;k<nMatchCount;k++)
{
nTotalFreq+=nMatchFreq[k];
}
/* A 4-byte candidate that follows an all-digit or Chinese-numeral atom,
 * begins with "年" or "月" (first 2 bytes compared) and whose remainder is
 * found by CC_Find in the listed characters ends the extension for this
 * start atom without adding an edge.
 * NOTE(review): the byte counts 4 and 2 assume the literals and the input
 * use a 2-byte-per-character encoding - confirm the file's encoding. */
if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)p->m_sAtom[i-1])
||IsAllChineseNum(p->m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0
||strncmp(sWord,"月",2)==0))
{
if(CC_Find("末内中底前间初",sWord+2))
break;
}
/* A single handle is stored as the edge's POS; several handles store 0. */
if(nMatchCount==1)
{
if(!bOriginalFreq)
SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),nMatchHandle[0],0);
else
SetElement(p->m_segGraph,i,j,nTotalFreq,nMatchHandle[0],sWord);
}
else
{
if(!bOriginalFreq)
SetElement(p->m_segGraph,i,j,-log(nTotalFreq+1)+log(MAX_FREQUENCE),0,0);
else
SetElement(p->m_segGraph,i,j,nTotalFreq,0,sWord);
}
}
/* Extend the candidate by atom j, then advance j. */
strcat(sWord,p->m_sAtom[j++]);
}
i+=1;
}
return TRUE;
}
/*
 * USegGraph - release the graph object owned by p.
 *
 * Frees the memory p->m_segGraph points to and leaves the field NULL.
 *
 * p: segmentation state whose m_segGraph is released.
 */
void USegGraph(pSegGraph p)
{
    pDynamicArray pGraph = p->m_segGraph;

    p->m_segGraph = NULL;
    free(pGraph);
}
/*
 * ISegGraph - allocate and initialise the graph object of p.
 *
 * Allocates a DynamicArray, stores it in p->m_segGraph, initialises it with
 * IDynamicArray(..., FALSE) and then calls SetRowFirst(..., TRUE).
 *
 * p: segmentation state receiving the new graph.
 *
 * On allocation failure p->m_segGraph is left NULL and the initialisers are
 * not called.
 */
void ISegGraph(pSegGraph p)
{
    p->m_segGraph=(pDynamicArray)malloc(sizeof(struct DynamicArray));
    /* Never hand a failed allocation to the initialisers. */
    if(p->m_segGraph==NULL)
        return;
    IDynamicArray(p->m_segGraph,FALSE);
    SetRowFirst(p->m_segGraph,TRUE);
}
/*
 * ShortPath - compute, for every node, up to m_nValueKind path weights and
 * the predecessors that produce them.
 *
 * For each node nCurNode in 1 .. m_nVertex-1:
 *   1. walk the edges whose column is nCurNode (obtained through
 *      GetElementValue() with row -1) and push one candidate
 *      (predecessor, weight slot, accumulated weight) per usable slot of the
 *      predecessor onto a work queue;
 *   2. reset p->m_pWeight[nCurNode-1][*] to INFINITE_VALUE;
 *   3. pop the candidates, storing distinct weights into successive slots of
 *      p->m_pWeight[nCurNode-1] and pushing the matching predecessor onto
 *      p->m_pParent[nCurNode-1][slot].
 *
 * p: shortest-path state (m_apCost, m_nVertex, m_nValueKind, m_pWeight,
 *    m_pParent).
 *
 * Returns 1 on success and 0 if the work queue cannot be allocated.
 *
 * NOTE(review): step 3 only yields the best weights if Pop() hands the
 * candidates back in ascending weight order - confirm against Queue.
 */
int ShortPath(pNShortPath p)
{
    unsigned int nPreNode,i,nIndex,nCurNode;
    double eWeight;
    /* GetElementValue() receives &pEdgeList and the result is then used as a
     * chain cursor, so no node is allocated here; the node that used to be
     * allocated up front was simply lost once the pointer was replaced. */
    PARRAY_CHAIN pEdgeList=NULL;
    pQueue queWork;

    for(nCurNode=1;nCurNode<p->m_nVertex;nCurNode++)
    {
        queWork=(pQueue)malloc(sizeof(struct Queue));
        if(queWork==NULL)
            return 0;
        IQueue(queWork);
        eWeight=GetElementValue(p->m_apCost,-1,nCurNode,NULL,&pEdgeList);
        while(pEdgeList!=0 && pEdgeList->col==nCurNode)
        {
            nPreNode=pEdgeList->row;
            eWeight=pEdgeList->value;
            for(i=0;i<p->m_nValueKind;i++)
            {
                if(nPreNode>0)
                {
                    /* Slots of the predecessor are filled front to back, so
                     * the first infinite one ends its list. */
                    if(p->m_pWeight[nPreNode-1][i]==INFINITE_VALUE)
                        break;
                    Push(queWork,nPreNode,i,eWeight+p->m_pWeight[nPreNode-1][i]);
                }
                else
                {
                    /* Node 0 has no weight row: a single candidate carrying
                     * the bare edge weight. */
                    Push(queWork,nPreNode,i,eWeight);
                    break;
                }
            }
            pEdgeList=pEdgeList->next;
        }
        for(i=0;i<p->m_nValueKind;i++)
        {
            p->m_pWeight[nCurNode-1][i]=INFINITE_VALUE;
        }
        i=0;
        while(i<p->m_nValueKind&&Pop(queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
        {
            if(p->m_pWeight[nCurNode-1][i]==INFINITE_VALUE)
                p->m_pWeight[nCurNode-1][i]=eWeight;
            else if(p->m_pWeight[nCurNode-1][i]<eWeight)
            {
                /* A larger weight opens the next slot. */
                i++;
                if(i==p->m_nValueKind)
                    break;
                p->m_pWeight[nCurNode-1][i]=eWeight;
            }
            Push(&p->m_pParent[nCurNode-1][i],nPreNode,nIndex,0);
        }
        /* Remove whatever is still queued, then release the queue object
         * itself; it used to be abandoned on every iteration. Presumably a
         * removing Pop() also releases the entry's storage - confirm. */
        while(Pop(queWork,&nPreNode,&nIndex,&eWeight,TRUE,TRUE)!=-1)
            ;
        free(queWork);
    }
    return 1;
}
/*
 * GetPaths - write the paths that end in (nNode, nIndex) into nResult.
 *
 * Follows the predecessor queues p->m_pParent[node-1][slot] back to node 0.
 * Every completed path is stored as a list of node numbers terminated by -1
 * in nResult[p->m_nResultCount], and p->m_nResultCount is incremented.
 *
 * p:       shortest-path state (m_pParent, m_nResultCount).
 * nNode:   last node of the paths.
 * nIndex:  weight slot of nNode to start from.
 * nResult: rows receiving the paths; row p->m_nResultCount is used next.
 * bBest:   non-zero stops after the first path.
 *
 * Nothing is done when MAX_SEGMENT_NUM paths are already stored or when the
 * working queue cannot be allocated. The working queue is released on every
 * exit path.
 */
void GetPaths(pNShortPath p,unsigned int nNode,unsigned int nIndex,int **nResult,int bBest)
{
    pQueue queResult;
    int bFirstGet;
    unsigned int nCurNode,nCurIndex,nParentNode;
    unsigned int nParentIndex,nResultIndex=0;
    unsigned int nDrainNode,nDrainIndex;

    /* Check the capacity before allocating anything, so this early exit has
     * nothing to release. */
    if(p->m_nResultCount>=MAX_SEGMENT_NUM)
        return ;
    queResult=(pQueue)malloc(sizeof(struct Queue));
    if(queResult==NULL)
        return ;
    IQueue(queResult);
    nResult[p->m_nResultCount][nResultIndex]=-1;
    Push(queResult,nNode,nIndex,0);
    nCurNode=nNode;
    nCurIndex=nIndex;
    while(!IsEmpty(queResult,FALSE))
    {
        /* Descend along the current predecessor of each node until node 0. */
        while(nCurNode>0)
        {
            if(Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,TRUE)!=-1)
            {
                nCurNode=nParentNode;
                nCurIndex=nParentIndex;
            }
            if(nCurNode>0)
                Push(queResult,nCurNode,nCurIndex,0);
        }
        if(nCurNode==0)
        {
            /* Emit node 0 followed by the queued nodes, then the terminator. */
            nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
            bFirstGet=TRUE;
            nParentNode=nCurNode;
            while(Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,bFirstGet)!=-1)
            {
                nResult[p->m_nResultCount][nResultIndex++]=nCurNode;
                bFirstGet=FALSE;
                nParentNode=nCurNode;
            }
            nResult[p->m_nResultCount][nResultIndex]=-1;
            p->m_nResultCount+=1;
            if(p->m_nResultCount>=MAX_SEGMENT_NUM)
                goto cleanup;
            nResultIndex=0;
            nResult[p->m_nResultCount][nResultIndex]=-1;
            if(bBest)
                goto cleanup;
        }
        /* Backtrack: drop queued nodes whose predecessor queue offers no
         * further alternative, then move on to the next alternative. */
        Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
        while((IsEmpty(queResult,FALSE))==FALSE&&(IsSingle(&p->m_pParent[nCurNode-1][nCurIndex])||IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)))
        {
            Pop(queResult,&nCurNode,&nCurIndex,0,TRUE,TRUE);
            Pop(queResult,&nCurNode,&nCurIndex,0,FALSE,TRUE);
        }
        if(IsEmpty(queResult,FALSE)==FALSE&&IsEmpty(&p->m_pParent[nCurNode-1][nCurIndex],TRUE)==FALSE)
        {
            Pop(&p->m_pParent[nCurNode-1][nCurIndex],&nParentNode,&nParentIndex,0,FALSE,FALSE);
            nCurNode=nParentNode;
            nCurIndex=nParentIndex;
            if(nCurNode>0)
                Push(queResult,nCurNode,nCurIndex,0);
        }
    }
cleanup:
    /* Remove whatever is still queued, then release the queue object itself;
     * it used to be abandoned on every return. Presumably a removing Pop()
     * also releases the entry's storage - confirm. */
    while(Pop(queResult,&nDrainNode,&nDrainIndex,0,TRUE,TRUE)!=-1)
        ;
    free(queResult);
}
/*
 * OutputPath - collect the computed paths into nResult.
 *
 * Resets p->m_nResultCount, then calls GetPaths() for the last node
 * (m_nVertex-1) once per weight slot whose weight is below INFINITE_VALUE,
 * copying the running path count to *npCount after every call.
 *
 * With fewer than two vertices the single path {0, 1} is written to
 * nResult[0] and *npCount is set to 1.
 *
 * p:       shortest-path state.
 * nResult: rows receiving the paths.
 * bBest:   non-zero stops as soon as the row of the current slot holds a path.
 * npCount: receives the number of paths stored; it is not written when no
 *          slot of the last node has a finite weight.
 *
 * Returns 1 unconditionally.
 */
int OutputPath( pNShortPath p,int **nResult,int bBest,int *npCount)
{
    unsigned int nSlot = 0;

    p->m_nResultCount = 0;

    /* Degenerate graph: a single hard-wired path. */
    if (p->m_nVertex < 2) {
        nResult[0][0] = 0;
        nResult[0][1] = 1;
        *npCount = 1;
        return 1;
    }

    while (nSlot < p->m_nValueKind
           && p->m_pWeight[p->m_nVertex - 2][nSlot] < INFINITE_VALUE) {
        GetPaths(p, p->m_nVertex - 1, nSlot, nResult, bBest);
        *npCount = p->m_nResultCount;

        /* Stop on the first usable path in best mode, or when the result
         * table is full. */
        if ((nResult[nSlot][0] != -1 && bBest)
            || p->m_nResultCount >= MAX_SEGMENT_NUM) {
            break;
        }
        nSlot++;
    }
    return 1;
}
void UNShortPath(struct NShortPath *p)
{
unsigned int i;
for(i=0;i<p->m_nVertex-1;i++)
{
free(p->m_pWeight[i]);
p->m_pWeight[i]=NULL;
free( p->m_pParent[i]);
p->m_pParent[i]=NULL;
}
free(p->m_pWeight);
p->m_pWeight=NULL;
free(p->m_pParent);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -