📄 new_ssps.cpp

📁 贝叶斯公式
💻 CPP
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/***************************************************************
*  工程: 	自然语言处理综合系统
*  作者：	CISTR BUPT
*  修改者	李卫
*  描述:	单扫描算法快速分词（SSPS）（含词典管理）
*  主要函数：见头文件
*  版本:	1.0
*  修改:	增加DF统计，封装特征抽取功能
*  参考文献：IMFS1.0相关资料
**************************************************************/
#define __int16 short int
#define __int32 int
#include "ssps.h"
#include <fstream>

#include "CStatDir.h"
#include "stdlib.h" 
#include "direct.h" 
#include "io.h"
#define DIRSEPSTRING "\\"

// Builds an empty segmenter: D and E are constructed with 127 and 15, and
// every slot of both word-index tables starts out with no word list.
CSSPS::CSSPS():D(127),E(15){
	// Two-byte index table: 94 x 94 slots.
	for(int row=0;row<94;++row){
		for(int col=0;col<94;++col){
			WIndexcom[row][col].WList=NULL;
			WIndexcom[row][col].WCount=0;
		}
	}

	// Pointer index table (described as a global variable by the original
	// comment) for characters, digits and similar entries: 128 slots.
	int slot=0;
	while(slot<128){
		CIndexcom[slot].WList=NULL;
		CIndexcom[slot].WCount=0;
		++slot;
	}
}

//-----------------------------------------------------------//
//   Purpose: store the configuration, open the dictionary and
//            load the feature words
//   Parameters:
//   (in)   DicFileName     dictionary file name (without extension)
//          method_in       feature-selection method
//          kwdnum_in       number of features to extract per class
//          vecfiledir_in   directory of the training vector files
//          testfiledir_in  directory of the test vector files
//   Calls:   DicOpenDic, G_LoadKeyWords
//   (out)  the member allwords is handed to G_LoadKeyWords
//          (presumably filled with the feature-word table - confirm
//          against G_LoadKeyWords)
//   Returns: false when DicOpenDic reports a positive error code,
//            otherwise the result of G_LoadKeyWords
//----------------------------------------------------------//
bool CSSPS::Init(const string& DicFileName,EvsMethod method_in,unsigned int kwdnum_in ,const string& vecfiledir_in,const string& testfiledir_in)
{
	// Record the configuration first: GetMethoStr() below reads `method`.
	method=method_in;
	kwdnum=kwdnum_in;
	vecfiledir=vecfiledir_in;
	testfiledir=testfiledir_in;

	// DicOpenDic returns 0 on success and a positive code on failure.
	const bool dictionaryFailed=DicOpenDic(DicFileName.c_str())>0;
	if(!dictionaryFailed){
		return G_LoadKeyWords(GetMethoStr(),allwords,kwdnum_in,vecfiledir);
	}
	return false;
}

//-----------------------------------------------------------//
//   Purpose: switch the feature-selection method and reload
//   Parameters:
//   (in)   method_in    feature-selection method
//          kwdnum_in    number of features to extract per class
//   Calls:   G_LoadKeyWords
//   Returns: the result of G_LoadKeyWords
//----------------------------------------------------------//

bool CSSPS::ChangeEvsMethod(EvsMethod method_in,unsigned int kwdnum_in)
{
	this->method=method_in;
	this->kwdnum=kwdnum_in;

	// The tag is derived from the method that was just stored.
	const string methodTag=GetMethoStr();
	return G_LoadKeyWords(methodTag,allwords,kwdnum_in,vecfiledir);
}


// Destructor. The call to DicCloseDic() below is commented out, so this
// body performs no cleanup of its own.
CSSPS::~CSSPS(){
	//DicCloseDic();
}
//-----------------------------------------------------------//
//   Purpose: convert the feature-selection method id to its
//            string tag
//   Returns: "chi" for CHI_EVS, "df" for DF_EVS and for any
//            other value
//----------------------------------------------------------//

const string CSSPS::GetMethoStr()const{
	// Only CHI_EVS has a tag of its own; everything else maps to "df".
	if(method==CHI_EVS){
		return string("chi");
	}
	return string("df");
}
//-----------------------------------------------------------//
//   Purpose: look up the current class names
//   Parameters:
//   (out)  namevec    set receiving the names
//   Calls:   FindClassNames (on vecfiledir with the "vec" tag)
//   Returns: number of classes
//----------------------------------------------------------//
unsigned short CSSPS::GetClassNames(set<string>& namevec){
	const unsigned short classCount=FindClassNames(namevec,vecfiledir,"vec");
	return classCount;
}

//-----------------------------------------------------------//
//   Purpose: vectorise the training texts and run feature
//            extraction
//   Parameters:
//   (in)   TrainFileDir    corpus directory
//          ClassName       class name
//   Calls:   CountDf (with vecfiledir), UpdateAllData
//   Returns: the result of UpdateAllData
//----------------------------------------------------------//
bool CSSPS::TrainFiles(const char *TrainFileDir,const char *ClassName){
	const char* const trainVectorDir=vecfiledir.c_str();
	CountDf(TrainFileDir,ClassName,trainVectorDir);

	const bool refreshed=UpdateAllData();
	return refreshed;
}
//-----------------------------------------------------------//
//   功能：	测试文本的向量化
//   参数： 
//  （入口）TrainFileDir	语料路径
//			ClassName		类别名称
//	调用函数：	CountDf、G_UpdateDocs
//    返回：true false
//----------------------------------------------------------//
bool CSSPS::TrainTestFiles(const char *TrainFileDir,const char *ClassName){
  CountDf(TrainFileDir,ClassName,testfiledir.c_str());
  G_UpdateDocs(GetMethoStr(),allwords,testfiledir);
  return true;
}


//-----------------------------------------------------------//
//   Purpose: refresh the data; covers every vectorisation step
//   Parameters: none
//   Calls:   G_UpdateKeyWords, G_LoadKeyWords, G_UpdateDocs,
//            G_UpdateIDVector
//   Returns: false when the keywords cannot be updated or
//            reloaded; true otherwise, even when
//            G_UpdateIDVector fails (a message is printed)
//----------------------------------------------------------//
bool CSSPS::UpdateAllData(){
	// The keywords must be rebuilt and then reloaded; the reload is
	// attempted only when the rebuild succeeded.
	const bool keywordsReady=
		G_UpdateKeyWords(GetMethoStr(),allwords,kwdnum,vecfiledir)
		&& G_LoadKeyWords(GetMethoStr(),allwords,kwdnum,vecfiledir);
	if(!keywordsReady){
		printf("数据无法更新");
		return false;
	}

	// Re-vectorise the training documents, then the test documents.
	G_UpdateDocs(GetMethoStr(),allwords,vecfiledir);
	G_UpdateDocs(GetMethoStr(),allwords,testfiledir);

	// A failed ID-vector update is reported but does not fail the call.
	const bool idVectorUpdated=G_UpdateIDVector(GetMethoStr(),allwords,vecfiledir);
	if(!idVectorUpdated){
		printf("数据未完全更新");
	}
	return true;
}

//-----------------------------------------------------------//
//   功能：打开分词词典
//   参数： （入口）DicFileName 词典文件名（basenane）
//**		0 : OK!
//**		1 : CAN NOT OPEN THE *.LEX FILE
//**		2 : CAN NOT OPEN THE *.PTR FILE
//**		3 : Error of the format of the word file.
//**		4 : Error on the size of file Ssps.ptr!
//----------------------------------------------------------//
int CSSPS::DicOpenDic(const char *DicFileName)
{
	__int32 NWords=0;
	char fn[300];
	

		//打开词典正文文件：
		//strcpy(strrchr(DicFileName,'.'),".lex");
		strcpy(fn,DicFileName);
		changesuffix(fn,"lex");
		if(!(fword=fopen(fn,"rb")))
		{	
			printf("文件Ssps.lex无法打开！");
			return 1;
		}
		
		//打开词典指针文件：
		//strcpy(strrchr(DicFileName,'.'),".ptr");
		strcpy(fn,DicFileName);
		changesuffix(fn,"ptr");
		if(!(fwptr=fopen(fn,"rb")))
		{	
			printf("词典指针文件“Ssps.ptr”无法打开！");
			return 2;
		}
	
	//	printf("Loading current dictionary...\n");

		unsigned long dwFileLength=filelength1(fileno(fwptr));//取指针文件长度
		unsigned char sHanzi[2];//汉字
		unsigned __int16 count;
		unsigned __int32 offset;
		__int16 x,y,z;

		while(dwFileLength>0)//一直读到文件尾
		{	
			fread(sHanzi,2,1,fwptr);//读汉字（词的首字）
			struct aWORDdic* wwwp;
			struct aWORDdic* tail=(aWORDdic*)emalloc(sizeof(aWORDdic));
			if(sHanzi[0] & 128) //首字节表明是汉字词
		        {
					if(sHanzi[0] == 46 && sHanzi[1] == 89)
					{
						cout<<"attention here!"<<endl;
					}
		        x=sHanzi[0]-0xa1;
			    y=sHanzi[1]-0xa1;
				
			    fread(&count,2,1,fwptr);
			    WIndexcom[x][y].WCount=count;
			    NWords+=count;
				//read the numer of words

			    fread(&offset,4,1,fwptr);// skip the 4 bytes offset
			
			    dwFileLength -= 8;//读了8个字节
			    WIndexcom[x][y].WList=tail;

			    while(count--) //read a list of words from the word file
			    {
				wwwp=(aWORDdic *)emalloc(sizeof(aWORDdic));

 				if((wwwp->Len=fgetc(fword))==EOF)//读到文件尾或有其它异常
				{
					printf("Error of the format of the word file.\n");
					return 3;
				}
				
				wwwp->sCIYU=(unsigned char*)emalloc(wwwp->Len+1);//note here
				fread(wwwp->sCIYU,wwwp->Len,1,fword);//接着将词语读入
				wwwp->sCIYU[wwwp->Len]=0;//note here
				wwwp->pos = (unsigned char*)emalloc(9);
				fread(wwwp->pos,8,1,fword);
				wwwp->pos[8] = '\0';
				wwwp->ptrNext=NULL;	

				tail->ptrNext=wwwp;
				tail=wwwp;
			   }//end of read a list of words

			   struct aWORDdic * temp=WIndexcom[x][y].WList;
			   WIndexcom[x][y].WList=temp->ptrNext;
			   free(temp);
		        }
		       else//首字节表明是ASCII词s
		       {
			   z=sHanzi[0];
			   fseek(fwptr,-1, SEEK_CUR);
			   fread(&count,2,1,fwptr);
			   CIndexcom[z].WCount=count;
			   NWords+=count;
				//read the numer of words

			   fread(&offset,4,1,fwptr);// skip the 4 bytes offset
			
			   dwFileLength -= 7;//读了8个字节
			   CIndexcom[z].WList=tail;

			   while(count--) //read a list of words from the word file
			   {
				wwwp=(aWORDdic *)emalloc(sizeof(aWORDdic));

 				if((wwwp->Len=fgetc(fword))==EOF)//读到文件尾或有其它异常
				{
					printf("Error of the format of the word file.\n");
					return 3;
				}
				
				wwwp->sCIYU=(unsigned char*)emalloc(wwwp->Len+1);//note here
				fread(wwwp->sCIYU,wwwp->Len,1,fword);//接着将词语读入
				wwwp->sCIYU[wwwp->Len]=0;//note here
				wwwp->pos = (unsigned char*)emalloc(9);
				fread(wwwp->pos,8,1,fword);
				wwwp->pos[8] = 0;

				wwwp->ptrNext=NULL;	

				tail->ptrNext=wwwp;
				tail=wwwp;
			   }//end of read a list of words

			   struct aWORDdic * temp=CIndexcom[z].WList;
			   CIndexcom[z].WList=temp->ptrNext;
			   free(temp);
		       }

			
			
		}//end of dwFileLength	
	
		fclose(fwptr);// close the 指针文件
		fclose(fword);// close the words file	
				
		if(dwFileLength)
		{
			printf("Error on the size of file Ssps.ptr!!");
			return 4;
		}
		

// 		printf("Total %d words in dictionary.....\n",NWords);

	return 0;
}//end of opendic
//-----------------------------------------------------------//
//   Purpose: close the segmentation dictionary: free both
//            in-memory index tables and close the dictionary
//            files if they are still open
//   Parameters: none
//   Returns: always 0
//----------------------------------------------------------//
int CSSPS::DicCloseDic()
{
		struct aWORDdic* node;
		struct aWORDdic* next;

		// Single-byte table.
		for(int z=0;z<128;z++)
		{
			if(!CIndexcom[z].WCount) continue;
			for(node=CIndexcom[z].WList;node;node=next)
			{
				next=node->ptrNext;
				free(node->sCIYU);
				// Each node owns its pos buffer as well: DicOpenDic and
				// WInsert allocate one per node.
				free(node->pos);
				free(node);
			}
			CIndexcom[z].WList = NULL;
			CIndexcom[z].WCount = 0;
		}

		// Two-byte table. All 94 rows are walked, matching the 94 x 94 extent
		// the constructor initialises and WInsert accepts.
		for(int x=0;x<94;x++)
			for(int y=0;y<94;y++)
			{
				if(!WIndexcom[x][y].WCount) continue;
				for(node=WIndexcom[x][y].WList;node;node=next)
				{
					next=node->ptrNext;
					free(node->sCIYU);
					free(node->pos);
					free(node);
				}
				WIndexcom[x][y].WList = NULL;
				WIndexcom[x][y].WCount = 0;
			}

		// Close only handles that are set, and clear them afterwards so a
		// second call does not close the same stream again.
		if(fwptr)
		{
			fclose(fwptr);
			fwptr=NULL;
		}
		if(fword)
		{
			fclose(fword);
			fword=NULL;
		}
		return 0;
}

//-----------------------------------------------------------//
//   功能：给词典中添加新词
//   参数： （入口）cWord 待添加的词语；cPos 写入该词 pos 字段的字符串
//    返回：
//**		-1 : cWord 为空指针
//**		0 : ok!
//**		1 : some error
//----------------------------------------------------------//
int CSSPS::WInsert(unsigned char* cWord,unsigned char* cPos)
{
	int x,y;
	int flag;

	if(!cWord) return -1;   
	struct aWORDdic* Wptr;
	struct aWORDdic* IPos=NULL;
	struct aWORDdic* IWord;
	if(cWord[0] & 128) //首字节表明是汉字词
	{
	   x=cWord[0]-0xa1;
	   y=cWord[1]-0xa1;

	   if(x<0||y<0 ||x>=94||y>=94 )
	   {
           printf("%s is beyond GB2312!",cWord);
		   return 1;
	   }
	   Wptr=WIndexcom[x][y].WList;

	   if(!Wptr)
	  {	
		IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
		IWord->ptrNext=NULL;
		IWord->Len=strlen((char*)cWord);
		IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
		strcpy((char*)IWord->sCIYU,(char*)cWord);
		IWord->pos=(unsigned char*)emalloc(9);
		strcpy((char*)IWord->pos,(char*)cPos);
		IWord->pos[8] = '\0';
		
		WIndexcom[x][y].WList=IWord;
		WIndexcom[x][y].WCount++;

	//	Num_Ins++;

//#ifdef Debug
		printf("%s\n",cWord);
//#endif
		return 0;
	  }

	  while(1)
	 {
		flag=strcmp((char*)cWord,(char*)Wptr->sCIYU);
		
		if(!flag)
		{
	//		AlreadyExisted++;
        
//#ifdef Debug
			printf("%s already exists.\n",cWord);
//#endif			

			break;
		}

		if(flag>0) 
		{
			IPos=Wptr;
			Wptr=Wptr->ptrNext;
			
			if(!Wptr)
			{
				IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
				IWord->ptrNext=NULL;
				IWord->Len=strlen((char*)cWord);
				IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
				strcpy((char*)IWord->sCIYU,(char*)cWord);
				IWord->pos=(unsigned char*)emalloc(9);
				strcpy((char*)IWord->pos,(char*)cPos);
		
				IPos->ptrNext=IWord;
				WIndexcom[x][y].WCount++;

	//			Num_Ins++;

//#ifdef Debug
				printf("%s\n",cWord);
//#endif
				
				return 0;
			}
		}
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -