📄 new_ssps.cpp

📁 贝叶斯公式
💻 CPP
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                {
                	if(!CIndexcom[z].WCount) continue;
			
				fputc(z,fwptr);
				fwrite(&CIndexcom[z].WCount,2,1,fwptr);//count
				
				NWords+=CIndexcom[z].WCount;

				offset=ftell(fword);
				fwrite((void *)&offset,4,1,fwptr);//offset

				Tptr=CIndexcom[z].WList;
				while(Tptr)
				{
					fputc(Tptr->Len,fword);

					fwrite(Tptr->sCIYU,Tptr->Len,1,fword);
					fwrite(Tptr->pos,8,1,fword);
					
				
					freep=Tptr;
					Tptr=Tptr->ptrNext;

					free(freep->sCIYU);
					free(freep->pos);
					free(freep);
					
				}//end of while
				CIndexcom[z].WList = NULL;
				CIndexcom[z].WCount = 0;
			}//end of double for;

                	
		for(int x=0;x<94;x++)
			for(int y=0;y<94;y++)
			{
				if(!WIndexcom[x][y].WCount) continue;
			
				fputc(x+0xa1,fwptr);
				fputc(y+0xa1,fwptr);//Chinese ch
				fwrite(&WIndexcom[x][y].WCount,2,1,fwptr);//count
				
				NWords+=WIndexcom[x][y].WCount;

				offset=ftell(fword);
				fwrite((void *)&offset,4,1,fwptr);//offset

				Tptr=WIndexcom[x][y].WList;
				while(Tptr)
				{
					fputc(Tptr->Len,fword);

					fwrite(Tptr->sCIYU,Tptr->Len,1,fword);
					fwrite(Tptr->pos,8,1,fword);
				
					freep=Tptr;
					Tptr=Tptr->ptrNext;

					free(freep->sCIYU);
					free(freep->pos);
					free(freep);
					
				}//end of while
				WIndexcom[x][y].WList = NULL;
				WIndexcom[x][y].WCount = 0;
			}//end of double for;

// 		printf("Total %d words in dictionary.\n",NWords);

		fclose(fwptr);
		fclose(fword);
		fclose(fptr);
		return 0;
}


//-----------------------------------------------------------//
//   Purpose: Record one occurrence of a feature word in resmap.
//            The first occurrence of a word stores a count of 1;
//            every later call with the same word adds 1 to it.
//   Parameters:
//   (in)     start   pointer to the first byte of the word; it may point
//                    into a much longer string, hence the explicit length
//            len     number of bytes to take from start
//   (in/out) resmap  table mapping each feature word to its count
//   Returns: nothing; the call is a no-op when start is NULL or len is 0
//----------------------------------------------------------//
void CSSPS::insertwordtomap(const unsigned char*start,unsigned int len, map<string,unsigned int>&resmap)const{
	if(start==NULL || len==0)
		return;

	const char *text=(const char *)start;

	// The key ends at the first NUL byte inside the len-byte window (or after
	// len bytes when there is none), so embedded NULs never become part of a key.
	const void *nul=memchr(text,0,len);
	const size_t keylen=(nul!=NULL) ? (size_t)((const char *)nul-text) : (size_t)len;

	// operator[] value-initialises a missing entry to 0, so a single increment
	// covers both the first and the repeated occurrence with one lookup, and no
	// manually managed scratch buffer can leak if the map throws.
	++resmap[string(text,keylen)];
}

//-----------------------------------------------------------//
//   Purpose: Segment one text in a single pass against the in-memory
//            dictionary and count the words that were recognised.
//   Parameters:
//   (in)   sBuffer  NUL-terminated input text
//   (out)  resmap   table mapping each recorded word to its count in the text
//   Returns: number of words recorded (repeats included)
//   Notes:   Only double-byte matches longer than 2 bytes are recorded.
//            Positions starting with a single-byte (high bit clear) character
//            always advance by one byte and are never recorded.
//----------------------------------------------------------//
unsigned int CSSPS::GetVecInDoc(const unsigned char* sBuffer,map<string,unsigned int>&resmap)const{
	int recorded=0;
	const int textLen=strlen((const char*)sBuffer);
	int cursor=0;

	while(cursor<textLen) {
		const bool singleByte=(sBuffer[cursor] & 128)==0;
		struct aWORDdic* entry=NULL;

		if(singleByte) {
			// Single-byte characters are indexed directly by their byte value.
			entry=CIndexcom[sBuffer[cursor]].WList;
			if(entry==NULL) {
				cursor+=1;
				continue;
			}
		}
		else {
			// Double-byte characters are indexed by both bytes, each offset by 0xa1,
			// into the 94 x 94 table; anything outside that range is skipped.
			const int row=sBuffer[cursor]-0xa1;
			const int col=sBuffer[cursor+1]-0xa1;
			if(row<0 || col<0 || row>=94 || col>=94) {
				cursor+=2;
				continue;
			}
			entry=WIndexcom[row][col].WList;
			if(entry==NULL) {
				cursor+=2;
				continue;
			}
		}

		// Walk the candidate list until it ends or an entry compares greater
		// than the text, remembering the length of the latest entry that matched.
		// NOTE(review): presumably the list is sorted so that the last match is
		// the longest one - confirm against the dictionary loader.
		unsigned char matchedLen=2;
		for(; entry!=NULL; entry=entry->ptrNext) {
			const int order=strncmp((char *)(sBuffer+cursor),(const char *)entry->sCIYU,entry->Len);
			if(order<0)
				break;
			if(order==0)
				matchedLen=entry->Len;
		}

		if(singleByte) {
			// The walk result is not used for single-byte positions.
			cursor+=1;
			continue;
		}

		if(matchedLen>2) {
			insertwordtomap((const unsigned char *)(sBuffer+cursor),matchedLen,resmap);
			recorded++;
		}
		cursor+=matchedLen;
	}

	return recorded;
}

//-----------------------------------------------------------//
//   功能：	单次快速分词的同时获取文本count
//   参数： 
//  （入口）const char *TrainFileDir	训练文件路径
//			const char *ClassName		类别名称
//			const char *ResultVecFileDir输出结果文件路径
//			文件输入TrainFileDir下的文本文件
//  （出口）ClassName.vec;	所有的特征对应的文档数
//			OutFileName.tmw	每篇文档中词和出现次数对应列表 
//			以上两文本均夷词作为key，是特征抽取的数据源
//	 调用函数：GetVecInDoc
//    返回：true false
//----------------------------------------------------------//
//this function will call GetVecInDoc set onceonly = true
unsigned int CSSPS::CountDf(const char *TrainFileDir,const char *ClassName,const char *ResultVecFileDir){
	
	bool dirnotin=false;
	long hFile;
	unsigned int Filecount=0;
	string FileNameTmp="";
	map<string,unsigned int> resmap;
	map<string,unsigned int>::const_iterator itcm;
	string OutFileName = string(ResultVecFileDir)+DIRSEPSTRING+string(ClassName);
	ofstream fout_tmw;
	if(ResultVecFileDir!=NULL){
		string OutFileName2 =OutFileName+".tmw";
		fout_tmw.open(OutFileName2.c_str());
		if(!fout_tmw){
			printf("输出文件无法写入！");
			return 0;
		}
	}
	
	//获取当前目录 
	char pszCurrentPATH[_MAX_PATH];
	getcwd(pszCurrentPATH,_MAX_PATH); 

	//构造遍历文件夹类对象 
	CStatDir statdir; 
	//设置要遍历的目录 
	if (!statdir.SetInitDir(TrainFileDir)) 
	{ 
		puts("训练语料目录不存在。"); 
		return 0; 
	} 
	//开始遍历 
	statdir.BeginBrowse("*.*"); 
	//统计结果中，子目录个数不含 . 及 .. 
// 	printf("训练语料文件总数: %d\n子目录总数: %d\n",statdir.GetFileCount(), statdir.GetSubdirCount()); 
	
	//处理每篇语料
	string tmpName;
	int nFileCount = statdir.m_vsFileName.size();
	for(int i=0;i<nFileCount;i++)
	{
		tmpName = statdir.m_vsFileName[i];
//		printf("%s\n",(char *)tmpName.c_str());
		ifstream Fin(tmpName.c_str());
		FILE *stream = fopen(tmpName.c_str(),"r");
		if(stream == NULL) printf("the file is not opened\n");
		if(!Fin) continue;
		string StrBuffer;
		getline(Fin,StrBuffer,'\0');
		map<string,unsigned int> resmap_one;
		unsigned int totalwordcount = GetVecInDoc((const unsigned char *)(StrBuffer.c_str()),resmap_one);
		if(fout_tmw) fout_tmw<<totalwordcount;
		for(map<string,unsigned int>::const_iterator itc=resmap_one.begin();itc!=resmap_one.end();itc++){
			insertwordtomap((const unsigned char *)(itc->first.c_str()),itc->first.size(),resmap);
			if(fout_tmw)	fout_tmw<<"\t"<<itc->first<<"\t"<<itc->second;
		}
		if(fout_tmw)	fout_tmw<<endl;
		Fin.close();
		Filecount++;
// 		printf(">");

	}
	//返回原来的目录 
	if (_chdir(pszCurrentPATH) != 0) 
		return 0; 
	//如果目录的最后一个字母不是'\',则在最后加上一个'\' 
	int len=strlen(pszCurrentPATH); 
	if (pszCurrentPATH[len-1] != '\\') 
		strcat(pszCurrentPATH,"\\"); 




/*
	string tmpName1=string(TrainFileDir)+DIRSEPSTRING+"*.*";
	struct _finddata_t  TrainFile;
	if((hFile = _findfirst(tmpName1.c_str(), &TrainFile )) == -1L){
		printf("训练语料路径无法访问！");
		return 0;
	}
	do
	{
		string tmpName;
		FileNameTmp=TrainFile.name;
		if(FileNameTmp=="." || FileNameTmp=="..")	continue;

		tmpName=string(TrainFileDir)+DIRSEPSTRING+FileNameTmp.c_str();
		ifstream Fin(tmpName.c_str());
		if(!Fin) continue;
		string StrBuffer;
		getline(Fin,StrBuffer,'\0');
		map<string,unsigned int> resmap_one;
		unsigned int totalwordcount = GetVecInDoc((const unsigned char *)(StrBuffer.c_str()),resmap_one);
		if(fout_tmw) fout_tmw<<totalwordcount;
		for(map<string,unsigned int>::const_iterator itc=resmap_one.begin();itc!=resmap_one.end();itc++){
			insertwordtomap((const unsigned char *)(itc->first.c_str()),itc->first.size(),resmap);
			if(fout_tmw)	fout_tmw<<"\t"<<itc->first<<"\t"<<itc->second;
		}
		if(fout_tmw)	fout_tmw<<endl;
		Fin.close();
		Filecount++;
		printf(">");

	}while(! _findnext( hFile, &TrainFile )  );
	_findclose( hFile );


*/

// 	printf("训练语料%d\n",Filecount);
	if(!Filecount) return 0;


	string resultvecfile =string(ResultVecFileDir)+DIRSEPSTRING+string(ClassName)+".vec";
	ofstream fout((char *)resultvecfile.c_str() );
	if(!fout){
			printf("无法输出训练结果！");
			return 0;
	}
	fout<<Filecount<<endl;
	for(map<string,unsigned int>::const_iterator itc=resmap.begin();itc!=resmap.end();itc++){
		fout<<itc->first<<"\t"<<itc->second<<endl;
	}

	if(fout_tmw){
		fout_tmw.close();
	}


	return true;

}
//****************************************************************//
//功能：获得一个字符串的DF格式，即：总词数+词ID+ 词数
//参数：(入口)instr字符串
//       （出口）特定输出格式的字符串
//调用函数：getVec
//******************************************************************//
//string CSSPS::CountOneDf(string & instr){
//	
//
//}
//-----------------------------------------------------------//
//   Purpose: Turn a string into its vector text form.
//   Parameters:
//   (in)   strin   input text
//   Calls:   GetVecInDoc
//   Returns: one line "<total word count> <id>:<count> <id>:<count> ...",
//            terminated by a newline; words that have no entry in allwords
//            are left out, the id printed is the value stored in allwords
//----------------------------------------------------------//
string CSSPS::GetVecStr(const string&strin)const{
	map<string,unsigned int> counts;
	const unsigned int total=GetVecInDoc((const unsigned char*)(strin.c_str()),counts);

	stringstream line;
	line<<total;

	map<string,unsigned int>::const_iterator word=counts.begin();
	while(word!=counts.end()){
		const map<string,unsigned int>::const_iterator known=allwords.find(word->first);
		if(known!=allwords.end())
			line<<" "<<known->second<<":"<<word->second;
		++word;
	}

	line<<endl;
	return line.str();
}
//-----------------------------------------------------------//
//   Purpose: Read a text file and turn its content into vector text form.
//   Parameters:
//   (in)   filename   file name including its path
//   Calls:   GetVecStr
//   Returns: the result of GetVecStr on the file content (read up to the
//            first NUL byte or end of file); "" when the file cannot be
//            opened, after printing a message to standard output
//----------------------------------------------------------//
string CSSPS::GetVecStrFromFile(const string&filename)const{
	ifstream input(filename.c_str());
	if(input){
		string text;
		getline(input,text,'\0');
		return GetVecStr(text);
	}

	cout<<"file can not open"<<endl;
	return "";
}
//-----------------------------------------------------------//
//   Purpose: Look a word up in the in-memory dictionary and fetch the pos
//            field stored with it.
//   Parameters:
//   (in)   word   first byte of the word to look up
//          len    number of bytes of word to compare
//   (out)  pos    receives a copy of the matching entry's pos string; may be
//                 NULL when the caller only needs the yes/no answer
//                 (assumes the stored pos is NUL-terminated and the buffer is
//                 large enough - TODO confirm against the dictionary loader)
//   Returns: true when an entry matches, false otherwise (including word==NULL)
//   NOTE(review): only the first len bytes are compared, so a longer entry
//            that starts with word also counts as a match; confirm whether an
//            exact-length match is wanted.
//----------------------------------------------------------//
bool CSSPS::exist(unsigned char* word,unsigned char len,unsigned char* pos){
	if(word==NULL)
		return false;

	aWORDdic* wordlist = NULL;
	if(word[0] & 128){
		// Double-byte lead: both bytes, offset by 0xa1, index the 94 x 94 table,
		// so the valid range of each index is 0..93.
		const int x = word[0] - 0xa1;
		const int y = word[1] - 0xa1;
		if(x<0||x>=94||y<0||y>=94){
			cout<<"this is not a word or a asc code\n";
			return false;
		}
		wordlist = WIndexcom[x][y].WList;
	}
	else{
		// Single-byte words are filed under their first byte, the same index
		// GetVecInDoc uses for CIndexcom.
		wordlist = CIndexcom[word[0]].WList;
	}

	for(; wordlist != NULL; wordlist = wordlist->ptrNext){
		if(strncmp((char*)wordlist->sCIYU,(const char *)word,len)==0){
			// Copy the result out; assigning to the pos parameter itself would
			// never reach the caller.
			if(pos!=NULL)
				strcpy((char*)pos,(const char*)wordlist->pos);
			return true;
		}
	}
	return false;
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -