// cdevidesentence.cpp
// (code-viewer page residue: "Font size:")
// CDevideSentence.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "string.h"
#include "CDevideSentence.h"
#include "usedll.h"
///////////////////CDevideSentence////////////////////////////////////////////////////////
/// Creates a sentence splitter that owns no buffers yet and has its error flag cleared.
CDevideSentence::CDevideSentence()
{
    // The error flag starts out cleared.
    bOccuredError = 0;

    // Both buffers stay unset until inputSentence() allocates them.
    this->szResult = NULL;
    this->szSource = NULL;
}
/// Releases the source copy and the result buffer owned by this object.
CDevideSentence::~CDevideSentence()
{
    // Both buffers are allocated with new[] in inputSentence(), so they must be
    // released with delete[] (scalar delete on an array is undefined behaviour).
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] szSource;
    delete[] szResult;
    szSource = NULL;
    szResult = NULL;
}
unsigned char* CDevideSentence::getResult()
{
return this->szResult;
}
/// Stores a private copy of the sentence and allocates the output buffer.
///
/// @param para_szSentence  NUL-terminated input sentence.
///
/// On return bOccuredError is 0 when both buffers are ready, and 1 when the
/// input is NULL or empty or a buffer could not be allocated. Buffers from a
/// previous call are always released first.
void CDevideSentence::inputSentence(unsigned char *para_szSentence)
{
    // Buffers come from new[], so release them with delete[], and reset the
    // pointers so that an early exit never leaves them dangling (which would
    // lead to a double delete on the next call or in the destructor).
    delete[] szSource;
    delete[] szResult;
    szSource=NULL;
    szResult=NULL;

    // Start every call from a clean state; otherwise one failed call would make
    // all later calls fail as well.
    this->bOccuredError=0;

    size_t strLen=0;
    if(para_szSentence!=NULL)
    {
        strLen=strlen((char*)para_szSentence);
    }
    if(strLen>0)
    {
        szSource=new unsigned char[strLen+1];
        strcpy((char*)szSource,(char*)para_szSentence);
        // devideSentence() emits at most one separator per input byte (e.g. a
        // lone single-byte character becomes "char + separator"), so the output
        // needs up to 2*strLen bytes plus one byte for the terminating NUL.
        szResult=new unsigned char[strLen*2+1];
        if(szResult!=NULL)
        {
            szResult[0]=0;// result is a valid empty string until filled in
        }
    }
    // Empty/NULL input or a failed allocation leaves a NULL buffer behind.
    if(szSource==NULL||szResult==NULL) this->bOccuredError=1;
}
/// Tests whether the two bytes at para_acChr form a double-byte Chinese character.
///
/// Two byte ranges are accepted:
///  - GB2312 Hanzi:   lead byte 0xB0..0xF7, trail byte 0xA1..0xFE
///  - extended Hanzi: lead byte 0x81..0xFE, trail byte 0x40..0xA0
/// NOTE(review): the extended range does not exclude trail byte 0x7F — confirm
/// whether that is intended.
///
/// @param para_acChr  pointer to the lead byte; the trail byte is read only when
///                    the lead byte lies in one of the ranges above.
/// @return 1 if the byte pair is in one of the ranges, 0 otherwise.
int CDevideSentence::isChineseChr(unsigned char *para_acChr)
{
    int rtCode=0;
    // The logical-AND operators had been mangled by HTML entity decoding
    // ("&&para" -> "&¶"); they are restored here.
    if((para_acChr[0]>=0xb0&&para_acChr[0]<0xf8&&para_acChr[1]>0xa0&&para_acChr[1]<0xff)/*GB2312 Hanzi*/
     ||(para_acChr[0]>=0x81&&para_acChr[0]<0xff&&para_acChr[1]>=0x40&&para_acChr[1]<=0xa0)/*extended Hanzi*/)
    {
        rtCode=1;
    }
    return rtCode;
}
/// Segments a sentence into words, writing the separated text to szResult.
///
/// The sentence is scanned for runs that consist only of Chinese double-byte
/// characters (as judged by isChineseChr). Each run is handed to
/// CDevideWordSeg::devideWordSeg; every other character is copied through and
/// followed by a separator.
///
/// @param para_szSentence  NUL-terminated input sentence.
/// @param para_cSeparator  byte inserted between output tokens.
/// @return 0 on success, -1 if the input could not be stored or a run could not
///         be segmented.
int CDevideSentence::devideSentence(unsigned char *para_szSentence,unsigned char para_cSeparator)
{
    CDevideWordSeg cDevideWordSeg;
    this->inputSentence(para_szSentence);
    this->chrDevideCharacter=para_cSeparator;
    if(this->bOccuredError==1)
    {
        return -1;
    }

    int nSourcePos=0;// start of the Chinese run currently being collected
    int nResultPos=0;// next write position in szResult
    int nScanPos=0;  // current scan position in szSource

    while(this->szSource[nScanPos]!=0)
    {
        if(isChineseChr(szSource+nScanPos)!=0)
        {// Chinese character: extend the current run
            nScanPos+=2;
            continue;
        }

        // A character outside the Chinese ranges ends the current run.
        unsigned char storecurrentchr=szSource[nScanPos];
        // A byte >= 0x80 is treated as the lead byte of a double-byte character,
        // but only if a trail byte really follows. Without this check a
        // truncated lead byte at the end of the string made the scan step past
        // the terminating NUL and read out of bounds.
        int isWide=0;
        if(storecurrentchr>=0x80&&szSource[nScanPos+1]!=0)
        {
            isWide=1;
        }

        // Temporarily terminate the run so it can be segmented on its own.
        szSource[nScanPos]=0;
        if(0!=cDevideWordSeg.devideWordSeg(szSource+nSourcePos/*Chinese run to segment*/,szResult+nResultPos/*NUL-terminated separated output*/,chrDevideCharacter/*separator*/))
        {
            return -1;
        }

        // Find the end of what devideWordSeg just wrote.
        int endposofresult=nResultPos;
        while(szResult[endposofresult]!=0) endposofresult++;

        // An empty run means two non-Chinese characters in a row; the previous
        // one already emitted its trailing separator, so do not add another.
        if(szSource[nSourcePos]!=0)
        {
            szResult[endposofresult++]=chrDevideCharacter;
        }
        szResult[endposofresult++]=storecurrentchr;
        if(isWide==1)
        {
            szResult[endposofresult++]=szSource[++nScanPos];
        }
        szResult[endposofresult++]=chrDevideCharacter;

        nResultPos=endposofresult;
        nScanPos++;
        nSourcePos=nScanPos;
    }

    // Segment the trailing run (possibly empty); this also NUL-terminates the
    // result. A failure here is reported instead of being silently ignored.
    if(0!=cDevideWordSeg.devideWordSeg(szSource+nSourcePos/*Chinese run to segment*/,szResult+nResultPos/*NUL-terminated separated output*/,chrDevideCharacter/*separator*/))
    {
        return -1;
    }
    return 0;
}
//////////////////////////CDevideWordSeg/////////////////////////////////////////////////////////
/// Fills pTable[0] with, for every character position, the index of the last
/// character of the longest dictionary word that starts there.
///
/// For each start position the candidate length runs from min(10, remaining
/// characters) down to 2; the first candidate for which getWord reports a
/// frequency > 0 wins. If none does, the position maps to itself (a
/// one-character word). `source` is treated as a string of double-byte
/// characters and is patched with a temporary NUL during each lookup.
void CDevideWordSeg::buildDifferentMeaningInfo()
{
    //int getWord(char*/*word to look up*/,unsigned short*/*returned part of speech*/,int*/*returned frequency*/);//returns the word position (>0) on success, -1 on failure
    const int maxWordChars=10;// longest candidate word, in double-byte characters
    int length=strlen((char*)source)/2;// source length in double-byte characters

    for(int startpos=0;startpos<length;startpos++)
    {
        int j=maxWordChars;
        if((length-startpos)<j) j=length-startpos;

        // Default: no multi-character word found, the character stands alone.
        pTable[0][startpos]=startpos;

        for(;j>1;j--)
        {
            // Cut the string after j characters for the lookup, then restore it.
            unsigned char *cut=source+startpos*2+j*2;
            unsigned char storechar=*cut;
            *cut=0;
            // Initialise the outputs so that a lookup which does not write them
            // cannot leave an indeterminate value behind.
            unsigned short attr=0;
            int frequancy=0;
            getWord((char*)source+startpos*2,&attr,&frequancy);
            *cut=storechar;

            if(frequancy>0)
            {// frequency above zero: this is a dictionary word
                pTable[0][startpos]=j-1+startpos;
                break;
            }
        }
    }
    // Disabled post-processing pass, kept for reference.
/*    for(startpos=0;startpos<length;startpos++)
    {
        if(startpos==pTable[0][startpos])
        {
            pTable[0][startpos]=-1;
            startpos++;
        }
        int maxpos=pTable[0][startpos];
        for(int i=startpos;i<=maxpos;i++)
        {
            if(pTable[0][startpos]>maxpos)
            {
                maxpos=pTable[0][startpos];
            }
        }
        if(startpos==maxpos)
        {
            pTable[0][startpos]=-1;
        }
        startpos=maxpos;
    }
*/
}
/// Sums the frequencies reported by getWord for every contiguous substring of a field.
///
/// @param para_acSource  start of the field (double-byte characters); the byte
///                       just past each candidate is patched with a temporary
///                       NUL during its lookup and restored afterwards.
/// @param para_nLen      field length in double-byte characters.
/// @return the accumulated frequency over all substrings [i..j], 0 <= i <= j < para_nLen.
int CDevideWordSeg::getSumFrequancy(unsigned char *para_acSource,int para_nLen)
{
    int rtCode=0;
    for(int i=0;i<para_nLen;i++)
    {
        unsigned char *wordStart=para_acSource+i*2;
        for(int j=i;j<para_nLen;j++)
        {
            unsigned char *cut=para_acSource+(j+1)*2;
            unsigned char storechar=*cut;
            *cut=0;
            // Reset the outputs before every lookup so that a lookup which does
            // not write them contributes 0 instead of an indeterminate value or
            // the previous substring's frequency.
            unsigned short attr=0;
            int frequancy=0;
            getWord((char*)wordStart,&attr,&frequancy);
            *cut=storechar;
            rtCode+=frequancy;
        }
    }
    return rtCode;
}
// Recursively enumerates the segmentations of the field [para_nStart, para_nEnd]
// (indices in double-byte characters) and keeps the candidates ranked in
// pTable[1]..pTable[9] by the probability stored in afPropability[1..9].
//
// para_nStart / para_nEnd       : bounds of the whole field; passed through unchanged.
// para_nStartpos / para_nEndpos : the part that still has to be segmented.
//
// Working rows: pTable[10] holds the candidate being built, as one integer label
// per character (a change of label between neighbouring cells marks a word
// boundary); pTable[11] supplies the denominator of each probability factor.
//
// When para_nStartpos > para_nEndpos the candidate is complete: its probability
// is accumulated in afPropability[10], the candidate is inserted into rows 1-9
// and afPropability[10] is reset to 0. Otherwise every prefix [startpos..i] of
// the remaining part is tried as the next word and the function recurses on the
// rest, always with para_nEnd as the new end position.
//
// NOTE(review): attr and frequancy are not initialised before getWord is called;
// this relies on getWord always writing the frequency - confirm.
// NOTE(review): the divisions by pTable[11][i-1] are not guarded against a zero
// denominator - confirm that it cannot be 0 here.
void CDevideWordSeg::devideDifferentMeaningWordSeg(int para_nStart,int para_nEnd,int para_nStartpos,int para_nEndpos)//segment an ambiguous field
{//the candidate segmentation is staged in pTable[10]; its probability is then computed and it is inserted into rows 1-9
unsigned short attr;
int frequancy;
int startpos=para_nStartpos;
int endpos=para_nEndpos;
if(startpos>endpos)
{//one candidate segmentation is complete: compute its probability and insert it at the proper position
// para_nStart,para_nEnd;
int tempfrequancy=0;//label (pTable[10] value) of the word currently being scanned
tempfrequancy=pTable[10][para_nStart];
for(int i=para_nStart;i<=para_nEnd+1;i++)
{
if(i==para_nEnd+1)
{//past the last character: account for the final word, then stop
int fenzi=pTable[10][i-1];//fenzi = numerator of this word's factor
if(pTable[10][i-1]==0)
{
fenzi=1;//a zero label would zero the whole product, so use 1 instead
}
if(this->afPropability[10]==0)
{//first factor of the product
this->afPropability[10]=(double)fenzi/(double)pTable[11][i-1];
}
else
{
this->afPropability[10]*=(double)fenzi/(double)pTable[11][i-1];
}
break;
}
if(pTable[10][i]!=tempfrequancy)
{//label changed: the word ending at i-1 is complete, multiply in its factor
tempfrequancy=pTable[10][i];
int fenzi=pTable[10][i-1];
if(pTable[10][i-1]==0)
{
fenzi=1;
}
if(this->afPropability[10]==0)
{
this->afPropability[10]=(double)fenzi/(double)pTable[11][i-1];
}
else
{
this->afPropability[10]*=(double)fenzi/(double)pTable[11][i-1];
}
}
}
//insert into the proper position among rows 1-9
int pos=1;
while(pos<9&&this->afPropability[10]<this->afPropability[pos]) pos++;
//now this->afPropability[10]>=afPropability[pos]; (or pos has reached 9)
if(this->afPropability[1]>0)
{//at least one candidate is already stored: shift rows pos..8 down by one (row 9 is overwritten)
for(int kk=8;kk>=pos;kk--)
{
memcpy(pTable[kk+1]+para_nStart,pTable[kk]+para_nStart,sizeof(int)*(para_nEnd-para_nStart+1));
this->afPropability[kk+1]=this->afPropability[kk];
}
}
memcpy(pTable[pos]+para_nStart,pTable[10]+para_nStart,sizeof(int)*(para_nEnd-para_nStart+1));
this->afPropability[pos]=this->afPropability[10];
this->afPropability[10]=0;//reset the accumulator for the next candidate
}
//try every prefix [startpos..i] as the next word (the loop body does not run when startpos>endpos)
for(int i=startpos;i<=endpos;i++)
{
//look up the characters startpos..i by temporarily terminating the string after character i
unsigned char storechar=*(source+(i+1)*2);
*(source+(i+1)*2)=0;
getWord((char*)source+para_nStartpos*2,&attr,&frequancy);
int rtval=frequancy;//the frequency is used as the label of this word
*(source+(i+1)*2)=storechar;
if(startpos>0)
{
if(pTable[10][startpos-1]==rtval)
{//same label as the preceding cell: nudge it so the boundary stays visible
if(rtval>0)
{
rtval--;
}
else
{
rtval++;
}
}
}
for(int k=startpos;k<=i;k++)
{
if(rtval<2)
{//labels below 2: each cell gets (label of the previous cell + 1) % 2, cell 0 gets 0
if(k>0)
{
rtval=pTable[10][k-1]+1;
rtval%=2;
pTable[10][k]=rtval;
}
else
{
pTable[10][k]=0;
}
}
else
{
pTable[10][k]=rtval;
}
}
//segment the rest of the field, starting right after this word
devideDifferentMeaningWordSeg(para_nStart,para_nEnd,i+1,para_nEnd);
}
}
/// Segments one run of double-byte Chinese characters into words.
///
/// @param para_szSource   NUL-terminated run to segment; it is patched
///                        temporarily during dictionary lookups.
/// @param para_szResult   output buffer; receives the characters of the run
///                        with a separator at every word boundary, NUL-terminated.
/// @param para_cSeparator separator byte.
/// @return 0 on success, -1 if the object is in an error state or the work
///         tables could not be enlarged.
///
/// NOTE(review): when bOccuredError is already set on entry, the output is still
/// produced from whatever pTable[1] currently holds — confirm that callers
/// discard the result when -1 is returned.
int CDevideWordSeg::devideWordSeg(unsigned char *para_szSource,unsigned char *para_szResult,unsigned char para_cSeparator)
{
    int rtCode=0;
    source=para_szSource;// run to segment, NUL-terminated
    result=para_szResult;// receives the segmented output, NUL-terminated
    unsigned char separator=para_cSeparator;
    int length=strlen((char*)source)/2;// run length in double-byte characters

    // Grow the work tables until they can hold the run.
    while(length>=this->nMaxLen)
    {
        this->addMem();
        if(this->bOccuredError==1)
        {// allocation failed
            return -1;
        }
    }

    if(bOccuredError==0)
    {
        this->buildDifferentMeaningInfo();
        for(int startpos=0;startpos<length;startpos++)
        {
            // Extend the field while a word starting inside it reaches further
            // right; the result [startpos, maxpos] is one ambiguous field.
            int maxpos=pTable[0][startpos];
            for(int i=startpos;i<=maxpos;i++)
            {
                if(pTable[0][i]>maxpos)
                {
                    maxpos=pTable[0][i];
                }
            }

            // Total frequency of the field, stored in row 11 for each of its cells.
            int sumfrequancy=this->getSumFrequancy(source+startpos*2,maxpos-startpos+1);
            // k is declared once for both loops below. The original used it
            // outside the scope of its for-statement, which is ill-formed in
            // standard C++.
            int k;
            for(k=startpos;k<=maxpos;k++)
            {
                pTable[11][k]=sumfrequancy;
            }

            this->devideDifferentMeaningWordSeg(startpos,maxpos,startpos,maxpos);

            for(k=0;k<12;k++) this->afPropability[k]=0;// reset the per-field probabilities
            startpos=maxpos;// continue after this field
        }
    }
    else
    {
        rtCode=-1;
    }

    // pTable[1]..pTable[9] now hold the ranked segmentations; row 1 is written
    // out, with a separator wherever its label changes between two characters.
    int resultpos=0;
    int tempfrequancy=pTable[1][0];
    for(int i=0;i<length;i++)
    {
        if(pTable[1][i]!=tempfrequancy)
        {
            tempfrequancy=pTable[1][i];
            result[resultpos++]=separator;
        }
        result[resultpos++]=source[i*2];
        result[resultpos++]=source[i*2+1];
    }
    result[resultpos++]=0;
    return rtCode;
}
/// Allocates the twelve work rows and loads the dictionary access library.
///
/// Loads "wordlibaccess.dll", resolves "loadLib_dll" and uses it to load
/// "wordlib.bin", then resolves "getWord_dll". Any failure sets bOccuredError
/// to 1.
CDevideWordSeg::CDevideWordSeg()
{
    //int getWord(char*/*word to look up*/,unsigned short*/*returned part of speech*/,int*/*returned frequency*/);//returns the word position (>0) on success, -1 on failure
    bOccuredError=0;
    nMaxLen=200;
    for(int i=0;i<12;i++)
    {
        pTable[i]=new int[nMaxLen];
        if(pTable[i]==NULL)
        {
            bOccuredError=1;
        }
        this->afPropability[i]=0;
    }

    // Give the function pointers a defined value on every failure path.
    loadLib=NULL;
    getWord=NULL;

    hInstance=LoadLibrary("wordlibaccess.dll");// library that gives access to the dictionary
    if(hInstance==NULL)
    {
        this->bOccuredError=1;
        return;
    }

    loadLib=(loadLib_dll)GetProcAddress(hInstance,"loadLib_dll");// dictionary loader
    if(loadLib==NULL)
    {// previously this failure was ignored, leaving the object looking usable
        this->bOccuredError=1;
        return;
    }

    if(loadLib("wordlib.bin")==-1)
    {// the dictionary could not be loaded
        this->bOccuredError=1;
        return;
    }

    getWord=(getWord_dll)GetProcAddress(hInstance,"getWord_dll");// word lookup function
    if(getWord==NULL)
    {
        this->bOccuredError=1;
    }
}
/// Releases the work rows and unloads the dictionary access library.
CDevideWordSeg::~CDevideWordSeg()
{
    for(int i=0;i<12;i++)
    {
        // The rows are allocated with new[], so they must be released with
        // delete[]; delete[] on a null pointer is a no-op.
        delete[] pTable[i];
        pTable[i]=NULL;
    }
    if(hInstance)
    {
        FreeLibrary(hInstance);
        hInstance=NULL;
    }
}
/// Doubles the capacity (nMaxLen) of every work row.
///
/// The old row contents are discarded, not copied. Sets bOccuredError to 1 if
/// an allocation yields NULL.
void CDevideWordSeg::addMem()
{
    // All twelve rows must grow. The original stopped at row 10, leaving row 11
    // at its old size even though it is indexed by character position like the
    // others, which overflowed it for long inputs.
    for(int i=0;i<12;i++)
    {
        // Rows are allocated with new[], so release them with delete[].
        delete[] pTable[i];
        pTable[i]=new int[nMaxLen*2];
        if(pTable[i]==NULL)
        {
            bOccuredError=1;
        }
    }
    nMaxLen*=2;
}
////////////////////////////////////////////////////////////////////////////////////////////
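// ⌨️ Keyboard shortcuts (code-viewer page residue, not part of the program)
// Copy code
// Ctrl + C
// Search code
// Ctrl + F
// Full-screen mode
// F11
// Switch theme
// Ctrl + Shift + D
// Show shortcuts
// ?
// Increase font size
// Ctrl + =
// Decrease font size
// Ctrl + -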