📄 segment.cpp
字号:
//DEL {
//DEL int i=0;
//DEL char sTempBuffer[WORD_MAXLENGTH];
//DEL sResult[0]=0;
//DEL if(nIndex<0||nIndex>=m_nSegmentCount)
//DEL return false;
//DEL while(m_WordSeg[nIndex][i].sWord[0]!=0)
//DEL {
//DEL sprintf(sTempBuffer,"%s/%c%c",m_WordSeg[nIndex][i].sWord,m_WordSeg[nIndex][i].nHandle/256,m_WordSeg[nIndex][i].nHandle%256);
//DEL strcat(sResult,sTempBuffer);
//DEL strcat(sResult," ");
//DEL i++;
//DEL }
//DEL return true;
//DEL }
//Word segmentation based on the optimum segmentation graph.
//Per the original note, this is run after unknown word recognition.
//
//int nResultCount: the result count handed to CNShortPath
//Side effects: updates m_nSegmentCount, copies m_graphOptimum into
//m_graphSeg.m_segGraph and then empties m_graphOptimum.
//Always returns true.
bool CSegment::OptimumSegmet(int nResultCount)
{
    //The segmentation routes: MAX_SEGMENT_NUM rows of MAX_SENTENCE_LEN/2 ints
    int **nSegRoute=new int*[MAX_SEGMENT_NUM];
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        nSegRoute[nRoute]=new int[MAX_SENTENCE_LEN/2];
        //Pre-fill with -1 (the end-of-route marker tested by BiPath2UniPath)
        //so no slot is ever read uninitialised.
        memset(nSegRoute[nRoute],-1,MAX_SENTENCE_LEN/2*sizeof(int));
    }

    CNShortPath sp(&m_graphOptimum,nResultCount);
    sp.ShortPath();
    sp.Output(nSegRoute,false,&m_nSegmentCount);

    m_graphSeg.m_segGraph=m_graphOptimum;
    m_graphOptimum.SetEmpty();//Set graph optimum empty

    //Generate words according to each segmentation route.
    //The index is declared here, not reused from the allocation loop:
    //a for-init variable is out of scope after its loop in standard C++.
    for(int nSeg=0;nSeg<m_nSegmentCount;nSeg++)
    {
        GenerateWord(nSegRoute,nSeg);
    }

    //free the memory
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        delete [] nSegRoute[nRoute];//free the pointer memory
    }
    delete [] nSegRoute;//free the pointer array
    return true;
}
//Counts the entries of pItem that precede the first entry whose sWord
//is empty (first byte is 0). That empty entry acts as the terminator.
//
//PWORD_RESULT pItem: the result array to scan; must contain a terminator
//Returns the number of non-empty leading entries.
int CSegment::GetResultCount(PWORD_RESULT pItem)
{
    int nWords=0;
    for(;pItem[nWords].sWord[0]!=0;++nWords)
    {
        //nothing to do: the loop header does the counting
    }
    return nWords;
}
//Copies the last non-empty word of pItem into sWordRet.
//
//PWORD_RESULT pItem: result array terminated by an entry with empty sWord
//char *sWordRet: output buffer; set to "" when pItem has no words.
//                The caller must make it large enough for any word in pItem.
//Returns true when sWordRet is left EMPTY (no word was found), false otherwise.
//NOTE(review): the return value looks inverted relative to the function
//name - confirm against callers before changing it.
bool CSegment::GetLastWord(PWORD_RESULT pItem, char *sWordRet)
{
    sWordRet[0]=0;
    //Each non-empty entry overwrites the previous one, so the last one wins.
    for(int nIdx=0;pItem[nIdx].sWord[0]!=0;++nIdx)
    {
        strcpy(sWordRet,pItem[nIdx].sWord);
    }
    return sWordRet[0]==0;
}
//Judge whether sNum is a number string that can denote a year.
//
//char *sNum: NUL-terminated number string
//Returns true as soon as one of the patterns below matches, else false.
//NOTE(review): the length tests (nLen/2, sNum+2) treat each Chinese
//character as two bytes - presumably a double-byte encoding such as GBK;
//confirm the source/input encoding.
bool CSegment::IsYearTime(char *sNum)
{
    const unsigned int nLen=strlen(sNum);
    unsigned char *pBytes=(unsigned char *)sNum;

    //First two bytes of sNum, as a string of their own
    char sHead[3];
    strncpy(sHead,sNum,2);
    sHead[2]=0;

    //Single-byte strings of length 4, or of length 2 starting above '4'
    //(the original examples are the year forms "1992" and "90")
    if(IsAllSingleByte(pBytes))
    {
        if(nLen==4)
            return true;
        if(nLen==2&&sNum[0]>'4')
            return true;
    }

    //IsAllNum strings of 6+ bytes, or of 4 bytes whose leading two bytes
    //are found in "56789" by CC_Find
    if(IsAllNum(pBytes))
    {
        if(nLen>=6)
            return true;
        if(nLen==4&&CC_Find("56789",sHead))
            return true;
    }

    //Every two-byte unit is counted as one of the listed numerals
    if(GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖",sNum)==(int)nLen/2&&nLen>=3)
        return true;

    //8 bytes with exactly two of the listed "thousand"/"zero" characters
    //(the original example is the Chinese numeral form of the year 2002)
    if(nLen==8&&GetCharCount("千仟零○",sNum)==2)
        return true;

    //2 bytes containing exactly one "thousand" character
    if(nLen==2&&GetCharCount("千仟",sNum)==1)
        return true;

    //4 bytes: one character from the first list, then one from the second
    if(nLen==4&&GetCharCount("甲乙丙丁戊己庚辛壬癸",sNum)==1&&GetCharCount("子丑寅卯辰巳午未申酉戌亥",sNum+2)==1)
        return true;

    return false;
}
//Builds the net of links between adjacent words used for bigram segmentation.
//
//CDynamicArray &aWord: the words array (input)
//CDynamicArray &aBinaryWordNet: the net between words (output); one element
//                               is set for each pair of adjacent words
//double dSmoothingPara: the parameter of data smoothing (0<a<1)
//CDictionary &DictBinary: the binary dictionary
//CDictionary &DictCore: the Core dictionary
//Side effects: sets m_nWordCount and rebuilds m_npWordPosMapTable.
//Always returns true.
bool CSegment::BiGraphGenerate(CDynamicArray &aWord, CDynamicArray &aBinaryWordNet,double dSmoothingPara,CDictionary &DictBinary,CDictionary &DictCore)
{
    PARRAY_CHAIN pTail,pCur,pNextWords;//Temp buffer
    unsigned int nWordIndex=0,nTwoWordsFreq=0,nCurWordIndex,nNextWordIndex;
    //nWordIndex: the index number of current word
    double dCurFreqency,dValue;
    char sTwoWords[WORD_MAXLENGTH];
    //Loop-invariant values, computed once
    const double dTemp=(double)1/MAX_FREQUENCE;
    const size_t nSegmenterLen=strlen(WORD_SEGMENTER);

    m_nWordCount=aWord.GetTail(&pTail);//Get tail element and return the words count
    if(m_npWordPosMapTable)
    {//free buffer
        delete [] m_npWordPosMapTable;
        m_npWordPosMapTable=0;
    }
    if(m_nWordCount>0)//Word count is greater than 0
        m_npWordPosMapTable=new int[m_nWordCount];//Record the position of possible words

    //Set the position map of words: each word is keyed by row*MAX_SENTENCE_LEN+col
    for(pCur=aWord.GetHead();pCur!=NULL;pCur=pCur->next)
    {
        m_npWordPosMapTable[nWordIndex++]=pCur->row*MAX_SENTENCE_LEN+pCur->col;
    }

    for(pCur=aWord.GetHead();pCur!=NULL;pCur=pCur->next)
    {
        if(pCur->nPOS>=0)//It's not an unknown words
            dCurFreqency=pCur->value;
        else//Unknown words
            dCurFreqency=DictCore.GetFrequency(pCur->sWord,2);

        aWord.GetElement(pCur->col,-1,pCur,&pNextWords);//Get next words which begin with pCur->col
        while(pNextWords&&pNextWords->row==pCur->col)//Next words
        {
            //Build the key "<current word><WORD_SEGMENTER><next word>".
            //The combined length is checked first: an unbounded
            //strcpy/strcat into sTwoWords would overflow the buffer when
            //the two words together do not fit.
            if(strlen(pCur->sWord)+nSegmenterLen+strlen(pNextWords->sWord)<sizeof(sTwoWords))
            {
                strcpy(sTwoWords,pCur->sWord);
                strcat(sTwoWords,WORD_SEGMENTER);
                strcat(sTwoWords,pNextWords->sWord);
                //Two linked Words frequency
                nTwoWordsFreq=DictBinary.GetFrequency(sTwoWords,3);
            }
            else
            {
                //NOTE(review): a key that cannot fit is treated as unseen;
                //this assumes the binary dictionary holds no key of
                //WORD_MAXLENGTH bytes or more - confirm.
                nTwoWordsFreq=0;
            }

            //Smoothing
            dValue=-log(dSmoothingPara*(1+dCurFreqency)/(MAX_FREQUENCE+80000)+(1-dSmoothingPara)*((1-dTemp)*nTwoWordsFreq/(1+dCurFreqency)+dTemp));
            //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            if(pCur->nPOS<0)//Unknown words: P(Wi|Ci);while known words:1
                dValue+=pCur->value;

            //Get the position index of current word in the position map table
            nCurWordIndex=BinarySearch(pCur->row*MAX_SENTENCE_LEN+pCur->col,m_npWordPosMapTable,m_nWordCount);
            nNextWordIndex=BinarySearch(pNextWords->row*MAX_SENTENCE_LEN+pNextWords->col,m_npWordPosMapTable,m_nWordCount);
            aBinaryWordNet.SetElement(nCurWordIndex,nNextWordIndex,dValue,pCur->nPOS);

            pNextWords=pNextWords->next;//Get next word
        }
    }
    return true;
}
//Segments sSentence using the biword link net.
//
//char *sSentence: the NUL-terminated sentence to segment
//double dSmoothingPara: smoothing parameter passed to BiGraphGenerate
//CDictionary &dictCore: the core dictionary
//CDictionary &dictBinary: the binary dictionary
//unsigned int nResultCount: the result count handed to CNShortPath
//Side effects: regenerates m_graphSeg's word net, updates m_nSegmentCount
//and empties m_graphOptimum.
//Always returns true.
bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)
{
    //Each route buffer holds (strlen(sSentence)+10)/2 ints
    const unsigned int nLen=strlen(sSentence)+10;

    //The segmentation routes, pre-filled with -1 (the end-of-route marker)
    int **nSegRoute=new int*[MAX_SEGMENT_NUM];
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        nSegRoute[nRoute]=new int[nLen/2];
        memset(nSegRoute[nRoute],-1,nLen/2*sizeof(int));
    }

    m_graphSeg.GenerateWordNet(sSentence,dictCore,true);//Generate words array

    //Generate the biword link net
    CDynamicArray aBiwordsNet;
    BiGraphGenerate(m_graphSeg.m_segGraph,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);

    CNShortPath sp(&aBiwordsNet,nResultCount);
    sp.ShortPath();
    sp.Output(nSegRoute,false,&m_nSegmentCount);

    m_graphOptimum.SetEmpty();//Set graph optimum empty

    //The index is declared here, not reused from the allocation loop:
    //a for-init variable is out of scope after its loop in standard C++.
    for(int nSeg=0;nSeg<m_nSegmentCount;nSeg++)
    {
        //Path convert to unipath
        BiPath2UniPath(nSegRoute[nSeg]);
        //Generate word according to the segmentation route
        GenerateWord(nSegRoute,nSeg);
    }

    //free the memory
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        delete [] nSegRoute[nRoute];//free the pointer memory
    }
    delete [] nSegRoute;//free the pointer array
    return true;
}
//BiPath convert to unipath, in place.
//
//int *npPath: on entry, indices into m_npWordPosMapTable terminated by -1;
//             on exit, the row (value/MAX_SENTENCE_LEN) of each mapped
//             entry, then the col (value%MAX_SENTENCE_LEN) of the last
//             mapped entry, then -1. The buffer may therefore need one
//             more slot than the input used.
//Returns false, leaving npPath untouched, when m_npWordPosMapTable is not
//set; true otherwise.
bool CSegment::BiPath2UniPath(int *npPath)
{
    if(!m_npWordPosMapTable)
        return false;

    int nPos=0;
    int nLastPacked=-1;//packed row/col value of the most recently mapped word

    //Stop at the -1 terminator or at the first index outside the table
    for(;npPath[nPos]!=-1&&npPath[nPos]<m_nWordCount;++nPos)
    {
        nLastPacked=m_npWordPosMapTable[npPath[nPos]];
        npPath[nPos]=nLastPacked/MAX_SENTENCE_LEN;
    }

    //Append the col of the last word only when its packed value is positive
    if(nLastPacked>0)
    {
        npPath[nPos]=nLastPacked%MAX_SENTENCE_LEN;
        ++nPos;
    }
    npPath[nPos]=-1;
    return true;
}
//Bigram-based segmentation over the optimum segmentation graph.
//
//unsigned int nResultCount: the result count handed to CNShortPath
//double dSmoothingPara: smoothing parameter passed to BiGraphGenerate
//CDictionary &dictBinary: the binary dictionary
//CDictionary &dictCore: the core dictionary
//Side effects: updates m_nSegmentCount, copies m_graphOptimum into
//m_graphSeg.m_segGraph and then empties m_graphOptimum.
//Always returns true.
bool CSegment::BiOptimumSegment(unsigned int nResultCount,double dSmoothingPara, CDictionary &dictBinary, CDictionary &dictCore)
{
    //The segmentation routes, pre-filled with -1 (the end-of-route marker)
    int **nSegRoute=new int*[MAX_SEGMENT_NUM];
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        nSegRoute[nRoute]=new int[MAX_SENTENCE_LEN/2];
        memset(nSegRoute[nRoute],-1,MAX_SENTENCE_LEN/2*sizeof(int));
    }

    //Generate the biword link net
    CDynamicArray aBiwordsNet;
    BiGraphGenerate(m_graphOptimum,aBiwordsNet,dSmoothingPara,dictBinary,dictCore);

    CNShortPath sp(&aBiwordsNet,nResultCount);
    sp.ShortPath();
    sp.Output(nSegRoute,false,&m_nSegmentCount);

    m_graphSeg.m_segGraph=m_graphOptimum;
    m_graphOptimum.SetEmpty();//Set graph optimum empty

    //The index is declared here, not reused from the allocation loop:
    //a for-init variable is out of scope after its loop in standard C++.
    for(int nSeg=0;nSeg<m_nSegmentCount;nSeg++)
    {
        //Path convert to unipath
        BiPath2UniPath(nSegRoute[nSeg]);
        //Generate word according to the segmentation route
        GenerateWord(nSegRoute,nSeg);
    }

    //free the memory
    for(int nRoute=0;nRoute<MAX_SEGMENT_NUM;nRoute++)
    {
        delete [] nSegRoute[nRoute];//free the pointer memory
    }
    delete [] nSegRoute;//free the pointer array
    return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -