📄 new_ssps.cpp
字号:
/***************************************************************
* 工程: 自然语言处理综合系统
* 作者: CISTR BUPT
* 修改者 李卫
* 描述: 单扫描算法快速分词(SSPS)(含词典管理)
* 主要函数:见头文件
* 版本: 1.0
* 修改: 增加DF统计,封装特征抽取功能
* 参考文献:IMFS1.0相关资料
**************************************************************/
#define __int16 short int
#define __int32 int
#include "ssps.h"
#include <fstream>
#include "CStatDir.h"
#include "stdlib.h"
#include "direct.h"
#include "io.h"
#define DIRSEPSTRING "\\"
// Constructs the segmenter with D = 127 and E = 15 and clears both
// dictionary index tables so that every slot starts as an empty word list.
CSSPS::CSSPS():D(127),E(15)
{
    // Two-byte (hanzi) index: 94 x 94 slots addressed by (lead, trail) byte.
    for(int row=0;row<94;++row)
    {
        for(int col=0;col<94;++col)
        {
            WIndexcom[row][col].WList=NULL;
            WIndexcom[row][col].WCount=0;
        }
    }
    // Single-byte index (characters, digits, ...): 128 slots.
    for(int code=0;code<128;++code)
    {
        CIndexcom[code].WList=NULL;
        CIndexcom[code].WCount=0;
    }
}
//-----------------------------------------------------------//
// Purpose: store the feature-selection settings, open the segmentation
//          dictionary and load the keyword table.
// Params:
//   DicFileName     dictionary file name (without extension), handed to DicOpenDic
//   method_in       feature-selection method, stored in `method`
//   kwdnum_in       number of features selected per class, stored in `kwdnum`
//   vecfiledir_in   directory of the training vector files, stored in `vecfiledir`
//   testfiledir_in  directory of the test vector files, stored in `testfiledir`
// Calls:   DicOpenDic, G_LoadKeyWords
// Returns: false when DicOpenDic reports a positive error code; otherwise
//          whatever G_LoadKeyWords returns.
//----------------------------------------------------------//
bool CSSPS::Init(const string& DicFileName,EvsMethod method_in,unsigned int kwdnum_in ,const string& vecfiledir_in,const string& testfiledir_in)
{
    // Record the configuration before anything else: GetMethoStr() below
    // reads `method`, and G_LoadKeyWords is given `vecfiledir`.
    testfiledir = testfiledir_in;
    vecfiledir  = vecfiledir_in;
    kwdnum      = kwdnum_in;
    method      = method_in;

    const int openStatus = DicOpenDic(DicFileName.c_str());
    if(openStatus <= 0)
    {
        // Dictionary is available: load the keyword table for this method.
        return G_LoadKeyWords(GetMethoStr(),allwords,kwdnum_in,vecfiledir);
    }
    return false;
}
//-----------------------------------------------------------//
// Purpose: switch to another feature-selection method and reload the
//          keyword table for it.
// Params:
//   method_in   feature-selection method, stored in `method`
//   kwdnum_in   number of features selected per class, stored in `kwdnum`
// Calls:   G_LoadKeyWords
// Returns: the result of G_LoadKeyWords.
//----------------------------------------------------------//
bool CSSPS::ChangeEvsMethod(EvsMethod method_in,unsigned int kwdnum_in)
{
    kwdnum = kwdnum_in;
    method = method_in;   // must be set before GetMethoStr() is evaluated below

    const bool reloaded = G_LoadKeyWords(GetMethoStr(),allwords,kwdnum_in,vecfiledir);
    return reloaded;
}
// Destructor. It performs no cleanup: the call to DicCloseDic() is disabled,
// so the dictionary word lists are not released here.
CSSPS::~CSSPS()
{
    //DicCloseDic();
}
//-----------------------------------------------------------//
// Purpose: map the current feature-selection method (`method`) to its
//          string form.
// Returns: "chi" for CHI_EVS; "df" for DF_EVS and for any other value.
//----------------------------------------------------------//
const string CSSPS::GetMethoStr()const
{
    if(method==CHI_EVS)
    {
        return "chi";
    }
    // DF_EVS and every unrecognised method share the "df" name.
    return "df";
}
//-----------------------------------------------------------//
// Purpose: look up the names of the current classes.
// Params:
//   namevec  (out) set that receives the names; filled by FindClassNames
// Calls:   FindClassNames, with `vecfiledir` and the literal "vec"
// Returns: the value returned by FindClassNames (the number of classes,
//          according to the original note).
//----------------------------------------------------------//
unsigned short CSSPS::GetClassNames(set<string>& namevec)
{
    const unsigned short classTotal = FindClassNames(namevec,vecfiledir,"vec");
    return classTotal;
}
//-----------------------------------------------------------//
// Purpose: vectorise the training texts and run feature selection.
// Params:
//   TrainFileDir  corpus directory
//   ClassName     class name
// Calls:   CountDf (output directed at `vecfiledir`), UpdateAllData
// Returns: the result of UpdateAllData; the result of CountDf is not used.
//----------------------------------------------------------//
bool CSSPS::TrainFiles(const char *TrainFileDir,const char *ClassName)
{
    const char* const vectorDir = vecfiledir.c_str();
    CountDf(TrainFileDir,ClassName,vectorDir);

    return UpdateAllData();
}
//-----------------------------------------------------------//
// Purpose: vectorise the test texts.
// Params:
//   TrainFileDir  corpus directory
//   ClassName     class name
// Calls:   CountDf (output directed at `testfiledir`), G_UpdateDocs
// Returns: always true; the results of CountDf and G_UpdateDocs are not used.
//----------------------------------------------------------//
bool CSSPS::TrainTestFiles(const char *TrainFileDir,const char *ClassName)
{
    const char* const testDir = testfiledir.c_str();
    CountDf(TrainFileDir,ClassName,testDir);

    G_UpdateDocs(GetMethoStr(),allwords,testfiledir);
    return true;
}
//-----------------------------------------------------------//
// Purpose: refresh all derived data, covering every vectorisation step.
// Calls:   G_UpdateKeyWords, G_LoadKeyWords, G_UpdateDocs, G_UpdateIDVector
// Returns: false when the keyword table cannot be updated or reloaded
//          (nothing else is attempted in that case); true otherwise, even
//          if G_UpdateIDVector fails - that failure is only reported.
//----------------------------------------------------------//
bool CSSPS::UpdateAllData()
{
    // G_LoadKeyWords is evaluated only when G_UpdateKeyWords succeeded.
    const bool keywordsReady =
        G_UpdateKeyWords(GetMethoStr(),allwords,kwdnum,vecfiledir)
        && G_LoadKeyWords(GetMethoStr(),allwords,kwdnum,vecfiledir);
    if(!keywordsReady)
    {
        printf("数据无法更新");      // "data cannot be updated"
        return false;
    }

    // Re-vectorise the training documents first, then the test documents.
    G_UpdateDocs(GetMethoStr(),allwords,vecfiledir);
    G_UpdateDocs(GetMethoStr(),allwords,testfiledir);

    if(!G_UpdateIDVector(GetMethoStr(),allwords,vecfiledir))
    {
        printf("数据未完全更新");    // "data not completely updated"
    }
    return true;
}
//-----------------------------------------------------------//
// 功能:打开分词词典
// 参数: (入口)DicFileName 词典文件名(basenane)
//** 0 : OK!
//** 1 : CAN NOT OPEN THE *.LEX FILE
//** 2 : CAN NOT OPEN THE *.PTR FILE
//** 3 : Error of the format of the word file.
//** 4 : Error on the size of file Ssps.ptr!
//----------------------------------------------------------//
int CSSPS::DicOpenDic(const char *DicFileName)
{
__int32 NWords=0;
char fn[300];
//打开词典正文文件:
//strcpy(strrchr(DicFileName,'.'),".lex");
strcpy(fn,DicFileName);
changesuffix(fn,"lex");
if(!(fword=fopen(fn,"rb")))
{
printf("文件Ssps.lex无法打开!");
return 1;
}
//打开词典指针文件:
//strcpy(strrchr(DicFileName,'.'),".ptr");
strcpy(fn,DicFileName);
changesuffix(fn,"ptr");
if(!(fwptr=fopen(fn,"rb")))
{
printf("词典指针文件“Ssps.ptr”无法打开!");
return 2;
}
// printf("Loading current dictionary...\n");
unsigned long dwFileLength=filelength1(fileno(fwptr));//取指针文件长度
unsigned char sHanzi[2];//汉字
unsigned __int16 count;
unsigned __int32 offset;
__int16 x,y,z;
while(dwFileLength>0)//一直读到文件尾
{
fread(sHanzi,2,1,fwptr);//读汉字(词的首字)
struct aWORDdic* wwwp;
struct aWORDdic* tail=(aWORDdic*)emalloc(sizeof(aWORDdic));
if(sHanzi[0] & 128) //首字节表明是汉字词
{
if(sHanzi[0] == 46 && sHanzi[1] == 89)
{
cout<<"attention here!"<<endl;
}
x=sHanzi[0]-0xa1;
y=sHanzi[1]-0xa1;
fread(&count,2,1,fwptr);
WIndexcom[x][y].WCount=count;
NWords+=count;
//read the numer of words
fread(&offset,4,1,fwptr);// skip the 4 bytes offset
dwFileLength -= 8;//读了8个字节
WIndexcom[x][y].WList=tail;
while(count--) //read a list of words from the word file
{
wwwp=(aWORDdic *)emalloc(sizeof(aWORDdic));
if((wwwp->Len=fgetc(fword))==EOF)//读到文件尾或有其它异常
{
printf("Error of the format of the word file.\n");
return 3;
}
wwwp->sCIYU=(unsigned char*)emalloc(wwwp->Len+1);//note here
fread(wwwp->sCIYU,wwwp->Len,1,fword);//接着将词语读入
wwwp->sCIYU[wwwp->Len]=0;//note here
wwwp->pos = (unsigned char*)emalloc(9);
fread(wwwp->pos,8,1,fword);
wwwp->pos[8] = '\0';
wwwp->ptrNext=NULL;
tail->ptrNext=wwwp;
tail=wwwp;
}//end of read a list of words
struct aWORDdic * temp=WIndexcom[x][y].WList;
WIndexcom[x][y].WList=temp->ptrNext;
free(temp);
}
else//首字节表明是ASCII词s
{
z=sHanzi[0];
fseek(fwptr,-1, SEEK_CUR);
fread(&count,2,1,fwptr);
CIndexcom[z].WCount=count;
NWords+=count;
//read the numer of words
fread(&offset,4,1,fwptr);// skip the 4 bytes offset
dwFileLength -= 7;//读了8个字节
CIndexcom[z].WList=tail;
while(count--) //read a list of words from the word file
{
wwwp=(aWORDdic *)emalloc(sizeof(aWORDdic));
if((wwwp->Len=fgetc(fword))==EOF)//读到文件尾或有其它异常
{
printf("Error of the format of the word file.\n");
return 3;
}
wwwp->sCIYU=(unsigned char*)emalloc(wwwp->Len+1);//note here
fread(wwwp->sCIYU,wwwp->Len,1,fword);//接着将词语读入
wwwp->sCIYU[wwwp->Len]=0;//note here
wwwp->pos = (unsigned char*)emalloc(9);
fread(wwwp->pos,8,1,fword);
wwwp->pos[8] = 0;
wwwp->ptrNext=NULL;
tail->ptrNext=wwwp;
tail=wwwp;
}//end of read a list of words
struct aWORDdic * temp=CIndexcom[z].WList;
CIndexcom[z].WList=temp->ptrNext;
free(temp);
}
}//end of dwFileLength
fclose(fwptr);// close the 指针文件
fclose(fword);// close the words file
if(dwFileLength)
{
printf("Error on the size of file Ssps.ptr!!");
return 4;
}
// printf("Total %d words in dictionary.....\n",NWords);
return 0;
}//end of opendic
//-----------------------------------------------------------//
// Purpose: release the in-memory dictionary: free every word list held in
//          CIndexcom / WIndexcom, reset those slots to empty, and close the
//          dictionary file handles if they are still set.
// Params:  none
// Returns: always 0
//----------------------------------------------------------//
int CSSPS::DicCloseDic()
{
    struct aWORDdic* Tptr,*freep;

    // Single-byte index.
    for(int z=0;z<128;z++)
    {
        // Slots with a zero count are left untouched, list pointer included.
        if(!CIndexcom[z].WCount) continue;
        Tptr=CIndexcom[z].WList;
        while(Tptr)
        {
            freep=Tptr;
            Tptr=Tptr->ptrNext;
            free(freep->sCIYU);
            free(freep->pos);   // allocated next to sCIYU at every visible node-creation site
            free(freep);
        }//end of while
        CIndexcom[z].WList = NULL;
        CIndexcom[z].WCount = 0;
    }//end of for;

    // Two-byte (hanzi) index. All 94 rows are walked: the constructor and
    // WInsert treat the table as 94 x 94, so stopping earlier leaks rows.
    for(int x=0;x<94;x++)
        for(int y=0;y<94;y++)
        {
            if(!WIndexcom[x][y].WCount) continue;
            Tptr=WIndexcom[x][y].WList;
            while(Tptr)
            {
                freep=Tptr;
                Tptr=Tptr->ptrNext;
                free(freep->sCIYU);
                free(freep->pos);
                free(freep);
            }//end of while
            WIndexcom[x][y].WList = NULL;
            WIndexcom[x][y].WCount = 0;
        }//end of double for;

    // Close each handle at most once and forget it afterwards, so that a
    // second call cannot close the same stream again.
    // NOTE(review): this is only safe if whoever closed these handles earlier
    // also reset them to NULL - verify against DicOpenDic.
    if(fwptr)
    {
        fclose(fwptr);
        fwptr=NULL;
    }
    if(fword)
    {
        fclose(fword);
        fword=NULL;
    }
    return 0;
}
//-----------------------------------------------------------//
// 功能:给词典中添加新词
// 参数: (入口)DicFileName 词典文件名(basenane)
//** 0 : ok!
//** 1 : some error
//----------------------------------------------------------//
int CSSPS::WInsert(unsigned char* cWord,unsigned char* cPos)
{
int x,y;
int flag;
if(!cWord) return -1;
struct aWORDdic* Wptr;
struct aWORDdic* IPos=NULL;
struct aWORDdic* IWord;
if(cWord[0] & 128) //首字节表明是汉字词
{
x=cWord[0]-0xa1;
y=cWord[1]-0xa1;
if(x<0||y<0 ||x>=94||y>=94 )
{
printf("%s is beyond GB2312!",cWord);
return 1;
}
Wptr=WIndexcom[x][y].WList;
if(!Wptr)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->ptrNext=NULL;
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
IWord->pos[8] = '\0';
WIndexcom[x][y].WList=IWord;
WIndexcom[x][y].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
return 0;
}
while(1)
{
flag=strcmp((char*)cWord,(char*)Wptr->sCIYU);
if(!flag)
{
// AlreadyExisted++;
//#ifdef Debug
printf("%s already exists.\n",cWord);
//#endif
break;
}
if(flag>0)
{
IPos=Wptr;
Wptr=Wptr->ptrNext;
if(!Wptr)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->ptrNext=NULL;
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
IPos->ptrNext=IWord;
WIndexcom[x][y].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
return 0;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -