// seggraph.cpp
// (Code-viewer page header residue: "Font size:" -- not part of the original source.)
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Filename: SegGraph.cpp
* Abstract:
* Implementation of the Word Segmentation Directed Graph.
*
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-1-8
*
* Notes:
*
*
****************************************************************************/
// SegGraph.cpp: implementation of the CSegGraph class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "SegGraph.h"
#include "..\\Utility\\Utility.h"
#include <string.h>
#include <math.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Default constructor. Performs no explicit initialization of its own.
// NOTE(review): m_nAtomCount is not set here; GenerateWordNet() and
// GenerateWordArray() both reset it to 0 before use.
CSegGraph::CSegGraph()
{
}
// Destructor. Has an empty body; nothing is released explicitly here.
CSegGraph::~CSegGraph()
{
}
// Generate the word net for a sentence, i.e. list every possible word as an
// edge (start atom, end atom) in m_segGraph, weighted with a log-scaled cost.
//
// sSentence: NUL-terminated sentence to segment; it is split into atoms by
//            AtomSegment() first.
// dictCore:  dictionary queried through GetMaxMatch() and GetHandle().
// Returns:   always true.
//
// Edge weights written through m_segGraph.SetElement():
//   - single Chinese-character atom: log(MAX_FREQUENCE), POS 0
//   - any other single atom:         0, POS derived from the atom type
//   - dictionary word (atoms i..j-1): log(MAX_FREQUENCE) - log(total freq + 1)
//
// NOTE(review): nMatchHandle/nMatchFreq hold 20 entries; this assumes
// GetHandle() never reports more than 20 matches -- confirm against CDictionary.
bool CSegGraph::GenerateWordNet(char *sSentence,CDictionary &dictCore)
{
	unsigned int i=0,j;
	char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;
	//Loop-invariant cost of the least frequent word; made explicitly double
	const double dMaxCost=log(static_cast<double>(MAX_FREQUENCE));

	m_nAtomCount=0;
	m_segGraph.SetEmpty();//Set segmentation graph empty
	AtomSegment(sSentence);//Atomic segmentation, fills m_sAtom/m_nAtomPOS/m_nAtomCount

	for(i=0;i<m_nAtomCount;i++)//Init the cost array with one edge per atom
	{
		if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese char
			m_segGraph.SetElement(i,i+1,dMaxCost,0,m_sAtom[i]);//init the link with the maximum value
		else//Other atom
		{
			//POS is packed as <first letter>*256 + <second letter>
			switch(m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS='m'*256;
				break;
			case CT_DELIMITER:
				nPOS='w'*256;
				break;
			case CT_LETTER:
				nPOS='s'*256+'s';
				break;
			case CT_SINGLE://12021-2129-3121
				if(m_sAtom[i][0]<='9'&&m_sAtom[i][0]>='0')
					nPOS='m'*256;
				else
					nPOS='s'*256+'s';
				break;
			default:
				nPOS=m_nAtomPOS[i];//'?'*256;
				break;
			}
			m_segGraph.SetElement(i,i+1,0,nPOS,m_sAtom[i]);//init the link with minimum
		}
	}

	for(i=0;i<m_nAtomCount;i++)//Try every atom as the start of a word
	{
		//A start atom that does not fit in sWord would overflow the buffer;
		//presumably no dictionary entry is that long, so skip the lookup.
		if(strlen(m_sAtom[i])>=sizeof(sWord))
			continue;
		strcpy(sWord,m_sAtom[i]);//Get the current atom
		j=i+1;
		while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
		{//Keep extending while the dictionary still has an entry starting with sWord
			if(strcmp(sWordMatch,sWord)==0)//find the current word
			{
				nTotalFreq=0;
				dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
				for(k=0;k<nMatchCount;k++)//Add the frequency
				{
					nTotalFreq+=nMatchFreq[k];
				}
				//Adding a rule to exclude some words to be formed.
				//The 2-byte compares and the length test of 4 assume a
				//double-byte encoding (presumably GB2312/GBK) -- confirm.
				if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
				{//1年内、1999年末
					if(CC_Find("末内中底前间初",sWord+2))
						break;
				}
				if(nMatchCount==1)//The possible word has only one POS, store it
					m_segGraph.SetElement(i,j,-log(static_cast<double>(nTotalFreq+1))+dMaxCost,nMatchHandle[0],sWord);
				else
					m_segGraph.SetElement(i,j,-log(static_cast<double>(nTotalFreq+1))+dMaxCost,0,sWord);
			}
			if(j>=m_nAtomCount)//No atom left to append; do not read past the last atom
				break;
			if(strlen(sWord)+strlen(m_sAtom[j])>=sizeof(sWord))//Next candidate would overflow sWord
				break;
			strcat(sWord,m_sAtom[j]);
			j++;
		}
	}
	return true;
}
// Split sSentence into atoms and record them in m_sAtom, m_nAtomLength and
// m_nAtomPOS; the number of atoms is stored in m_nAtomCount.
//
// sSentence: NUL-terminated sentence; a byte with the high bit set is taken
//            as the lead byte of a two-byte character.
// Returns:   always true.
//
// Atom rules, as implemented below:
//   - SENTENCE_BEGIN at the very start and SENTENCE_END anywhere form their own atoms;
//   - CT_CHINESE, CT_INDEX, CT_DELIMITER and CT_OTHER characters are one atom each;
//   - any other type (number, single char, letter) is merged with following
//     characters of the same type;
//   - a '.' directly followed by a digit is typed as CT_NUM.
//
// NOTE(review): neither the atom index j nor the length of an individual atom
// is checked against the capacity of m_sAtom, whose declaration is not visible
// here -- confirm that callers bound the sentence length and run lengths.
bool CSegGraph::AtomSegment(char *sSentence)
{
	unsigned int i=0,j=0,nCurType,nNextType;
	//i is the pointer of sentence string
	//j is the pointer of pAtoms
	const size_t nSentenceLen=strlen(sSentence);//Hoisted: the sentence is only read here
	char sChar[3];
	sChar[2]=0;//Set the char ending

	m_sAtom[j][0]=0;//Set the first word as null
	m_nAtomLength[j]=0;
	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
	{
		strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence beginning
		m_nAtomLength[j]=strlen(SENTENCE_BEGIN);
		m_nAtomPOS[j]=CT_SENTENCE_BEGIN;//init
		i+=m_nAtomLength[j];
		j+=1;
		m_sAtom[j][0]=0;//Start the next atom empty
		m_nAtomLength[j]=0;
	}
	while(i<nSentenceLen)
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{
			strcpy(m_sAtom[j],SENTENCE_END);//The sentence-end marker is its own atom
			m_nAtomLength[j]=strlen(SENTENCE_END);
			m_nAtomPOS[j]=CT_SENTENCE_END;//init
			i+=m_nAtomLength[j];
			j+=1;
			m_sAtom[j][0]=0;//Start the next atom empty
			m_nAtomLength[j]=0;
			continue;
		}
		sChar[0]=sSentence[i];//Get the char with first byte
		sChar[1]=0;
		i+=1;
		//Two byte char: test the high bit explicitly so the result does not
		//depend on whether plain char is signed, and never step past the
		//terminator when the lead byte is the last byte of the sentence.
		if((static_cast<unsigned char>(sChar[0])&0x80)!=0&&i<nSentenceLen)
		{
			sChar[1]=sSentence[i];//Get the char with second byte
			i+=1;
		}
		strcat(m_sAtom[j],sChar);
		nCurType=charType((unsigned char *)sChar);
		if(sChar[0]=='.'&&(charType((unsigned char *)sSentence+i)==CT_NUM||(*(sSentence+i)>='0'&&*(sSentence+i)<='9')))
			nCurType=CT_NUM;//Digit after . indicate . as a point in the numeric
		m_nAtomPOS[j]=nCurType;
		//Record its property, just convenience for continuous processing
		if(nCurType==CT_CHINESE||nCurType==CT_INDEX||nCurType==CT_DELIMITER||nCurType==CT_OTHER)
		{//Chinese char, index number, delimiter and other is treated as atom
			m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
			j+=1;//Skip to next atom
			m_sAtom[j][0]=0;//init
		}
		else
		{//Number, single char, letter
			nNextType=255;//Used when there is no next char
			if(i<nSentenceLen)
				nNextType=charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType||i>=nSentenceLen)
			{//Reaching end or next char type is different from current char
				m_nAtomLength[j]=strlen(m_sAtom[j]);//Save its length
				j+=1;
				m_sAtom[j][0]=0;//init
			}
		}
	}
	m_nAtomCount=j;//The count of segmentation atoms
	return true;
}
// Generate the word array for a sentence, i.e. list every possible word as an
// edge (start atom, end atom) in m_segGraph, weighted with a raw frequency.
//
// sSentence: NUL-terminated sentence to segment; it is split into atoms by
//            AtomSegment() first.
// dictCore:  dictionary queried through GetMaxMatch() and GetHandle().
// Returns:   always true.
//
// Edge weights written through m_segGraph.SetElement():
//   - single Chinese-character atom:  0, POS 0
//   - any other single atom:          MAX_FREQUENCE, POS derived from the atom type
//   - dictionary word (atoms i..j-1): sum of the frequencies reported by GetHandle()
//
// NOTE(review): nMatchHandle/nMatchFreq hold 20 entries; this assumes
// GetHandle() never reports more than 20 matches -- confirm against CDictionary.
bool CSegGraph::GenerateWordArray(char *sSentence, CDictionary &dictCore)
{
	unsigned int i=0,j;
	char sWord[WORD_MAXLENGTH]="",sWordMatch[WORD_MAXLENGTH];
	int nHandleTemp,k,nPOS;
	int nMatchFreq[20],nMatchHandle[20],nTotalFreq,nMatchCount;

	m_nAtomCount=0;
	m_segGraph.SetEmpty();//Set segmentation graph empty
	AtomSegment(sSentence);//Atomic segmentation, fills m_sAtom/m_nAtomPOS/m_nAtomCount

	for(i=0;i<m_nAtomCount;i++)//Init the frequency array with one edge per atom
	{
		if(m_nAtomPOS[i]==CT_CHINESE)//The atom is a Chinese char
			m_segGraph.SetElement(i,i+1,0,0,m_sAtom[i]);//init the link with frequency 0
		else//Other atom
		{
			//POS is packed as <first letter>*256 + <second letter>
			switch(m_nAtomPOS[i])
			{
			case CT_INDEX:
			case CT_NUM:
				nPOS='m'*256;
				break;
			case CT_DELIMITER:
				nPOS='w'*256;
				break;
			case CT_LETTER:
				nPOS='s'*256+'s';
				break;
			case CT_SINGLE://12021-2129-3121
				if(m_sAtom[i][0]<='9'&&m_sAtom[i][0]>='0')
					nPOS='m'*256;
				else
					nPOS='s'*256+'s';
				break;
			default:
				nPOS=m_nAtomPOS[i];//'?'*256;
				break;
			}
			m_segGraph.SetElement(i,i+1,MAX_FREQUENCE,nPOS,m_sAtom[i]);//init the link with the maximum frequency
		}
	}

	for(i=0;i<m_nAtomCount;i++)//Try every atom as the start of a word
	{
		//A start atom that does not fit in sWord would overflow the buffer;
		//presumably no dictionary entry is that long, so skip the lookup.
		if(strlen(m_sAtom[i])>=sizeof(sWord))
			continue;
		strcpy(sWord,m_sAtom[i]);//Get the current atom
		j=i+1;
		while(j<=m_nAtomCount&&dictCore.GetMaxMatch(sWord,sWordMatch,&nHandleTemp))
		{//Keep extending while the dictionary still has an entry starting with sWord
			if(strcmp(sWordMatch,sWord)==0)//find the current word
			{
				nTotalFreq=0;
				dictCore.GetHandle(sWord,&nMatchCount,nMatchHandle,nMatchFreq);
				for(k=0;k<nMatchCount;k++)//Add the frequency
				{
					nTotalFreq+=nMatchFreq[k];
				}
				//Adding a rule to exclude some words to be formed.
				//The 2-byte compares and the length test of 4 assume a
				//double-byte encoding (presumably GB2312/GBK) -- confirm.
				if(strlen(sWord)==4&&i>=1&&(IsAllNum((unsigned char *)m_sAtom[i-1])||IsAllChineseNum(m_sAtom[i-1]))&&(strncmp(sWord,"年",2)==0||strncmp(sWord,"月",2)==0))
				{//1年内、1999年末
					if(CC_Find("末内中底前间初",sWord+2))
						break;
				}
				if(nMatchCount==1)//The possible word has only one POS, store it
					m_segGraph.SetElement(i,j,nTotalFreq,nMatchHandle[0],sWord);
				else
					m_segGraph.SetElement(i,j,nTotalFreq,0,sWord);
			}
			if(j>=m_nAtomCount)//No atom left to append; do not read past the last atom
				break;
			if(strlen(sWord)+strlen(m_sAtom[j])>=sizeof(sWord))//Next candidate would overflow sWord
				break;
			strcat(sWord,m_sAtom[j]);
			j++;
		}
	}
	return true;
}
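// ---- Code-viewer page footer residue (not part of the original source) ----
// Keyboard shortcuts:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -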