📄 cnaivebayes.cpp

📁 贝叶斯公式
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
	string FileNameTmp="";

	//如果目录的最后一个字母不是'\',则在最后加上一个'\' 
	int len=sTestFilesPath.size(); 
	if (sTestFilesPath.at(len-1) != '\\') 
		sTestFilesPath += "\\";

	string TempName = sTestFilesPath+"*.txt";
	string sText="";
    int nClassID=0;

	long hFile;

	struct _finddata_t  TestFile;

    if((hFile = _findfirst(TempName.c_str(), &TestFile )) == -1L)
	{
    	cout<<"路径无法访问！"<<endl;
        return ;
	}
	

	string Path="";

	//重新写类别文件
	for(int i=0;i<m_nClassNum;i++)
	{
		char szTempResFile[200];
		sprintf(szTempResFile,"Res\\C%d.txt",i);
		DeleteFile(szTempResFile);
	}

	CSSPS ssps;
	ssps.Init("Zssps");

	do
	{
      
		FileNameTmp=TestFile.name;//训练文件的名称
 		Path=sTestFilesPath+FileNameTmp;//获取文件路径
		string sSrcContent="";	
		ifstream fin1(Path.c_str());
		getline(fin1,sSrcContent,'\0');
		string sContent = ssps.GetVecStr (sSrcContent);
		nClassID=TestTermFile(sContent);

		 //写文件Ci.txt
		char szResFile[20];
        memset(szResFile,0,20);
		sprintf(szResFile,"Res\\C%d.txt",nClassID);
		ofstream Fou;
		Fou.open(szResFile,ios::out|ios::app);
		Fou << Path << endl;
		Fou.close();

	}while(! _findnext( hFile, &TestFile ) );

}

//-----------------------------------------------------------------------------------------//
//   Purpose:  Classify one document that is given as a vector string.
//   Input:    sContent - tokens separated by single spaces. The text before the first
//             space is skipped (the previous code parsed it as the document's word count
//             but never used it). Every following token has the form "key:weight"; only
//             the integer key in front of ':' is used, the weight is ignored.
//   Returns:  Index of the selected class. m_pnResNum is incremented for that class.
//             Returns 0 without touching any array when there are no classes.
//----------------------------------------------------------------------------------------//
int CNaiveBayes::TestTermFile(string sContent)
{
	// With no classes there is nothing to score; the old code read pro[0] and
	// m_pnResNum[0] out of bounds here. 0 is the value it would have returned.
	if (m_nClassNum <= 0)
	{
		return 0;
	}

	// Per-class running sum of log-probabilities.
	double *pro = new double[m_nClassNum];
	for (int c = 0; c < m_nClassNum; c++)
	{
		pro[c] = 0.0;
	}

	// Skip everything up to and including the first space. If there is no space
	// at all, the whole string is treated as a single token (same as before).
	string::size_type tokenStart = sContent.find_first_of(' ');
	tokenStart = (tokenStart == string::npos) ? 0 : tokenStart + 1;

	bool bLastToken = false;
	while (!bLastToken)
	{
		const string::size_type tokenEnd = sContent.find_first_of(' ', tokenStart);
		bLastToken = (tokenEnd == string::npos);

		const string word = bLastToken
			? sContent.substr(tokenStart)
			: sContent.substr(tokenStart, tokenEnd - tokenStart);
		if (!bLastToken)
		{
			tokenStart = tokenEnd + 1;
		}

		// NOTE(review): an empty or non-numeric token parses as key 0 via atoi,
		// exactly as in the previous code; confirm whether the vector string
		// can contain such tokens (e.g. a trailing space).
		const string str_key = word.substr(0, word.find_first_of(':'));
		const int key = atoi(str_key.c_str());

		// Each m_ppfTrainRes row holds m_nFeatureNum entries; a key outside that
		// range used to be read out of bounds, now the token is ignored.
		if (key < 0 || key >= m_nFeatureNum)
		{
			continue;
		}

		for (int i = 0; i < m_nClassNum; i++)
		{
			pro[i] = pro[i] + log(m_ppfTrainRes[i][key]);
		}
	}

	// Add the log of the class prior once per class.
	for (int i = 0; i < m_nClassNum; i++)
	{
		pro[i] = pro[i] + log(m_pfPrC[i]);
	}

	// Pick the class with the largest score. A score of exactly 0 is special:
	// a candidate scoring 0 never replaces the current best, and once the
	// current best is 0 it is never replaced (condition kept unchanged).
	double t = pro[0];
	int max_pro_num = 0;
	for (int s = 1; s < m_nClassNum; s++)
	{
		const double k = pro[s];
		if (t < k && k != 0 && t != 0)
		{
			max_pro_num = s;
			t = k;
		}
	}

	// Release the score buffer; it was leaked on every call before.
	delete [] pro;

	m_pnResNum[max_pro_num]++;

	return max_pro_num;
}

//-----------------------------------------------------------------------------------------//
//   功能：       读入类的先验概率及对应该类的特征项的先验概率。
//----------------------------------------------------------------------------------------//

void CNaiveBayes::PrwFRead()
{
	
	string PrcF="Pr\\Prc.txt";
	
	ifstream prcf(PrcF.c_str());
	string strPrC="";
	string classID="";//存储类序号
	string classfreV="";//存储类的先验概率
	int clsID=0;
	int pos1=0;
	int pos2=0;
    
	while(getline(prcf,strPrC,'\n')!=NULL)//读取行
	{
     	pos1=0;
		pos2=0;
		
		if((pos2= strPrC.find_first_of( ' ',pos1 ))!=-1)
		{
			classID=strPrC.substr(pos1,pos2-pos1);
            const char *cID=classID.c_str();
			clsID=atoi(cID);
			pos2++;
		}
	   if((pos1= strPrC.find_first_of( ' ',pos2))!=-1)
		{
			pos1++;
			classfreV=strPrC.substr(pos1,-1);
			const char *cFV=classfreV.c_str();
			m_pfPrC[clsID]=atof(cFV);
		//	cout<<clsID<<" "<<m_pfPrC[clsID]<<endl;
	   }
	}
	
	
	char PrwDir[20];
	memset(PrwDir,' ',20*sizeof(char));

	for (int i = 0 ; i < m_nClassNum ; i++ )
	{
		sprintf(PrwDir,"Pr\\PrW in C%d.txt",i);
		ifstream prwf(PrwDir);
		//ifstream prwf(PrwF.c_str());
		string strPrW="";
		string feaNum="";
		string fesVal="";
		int featID=0;
		float featVal;
		
		
		while(getline(prwf,strPrW,'\n')!=NULL)//读取行
		{
			pos1=0;
			pos2=0;
			
			if((pos2= strPrW.find_first_of( '\t',pos1 ))!=-1)
			{
				feaNum=strPrW.substr(pos1,pos2-pos1);
				
				pos2++;
			}
			const char *feaN=feaNum.c_str();
			featID=atoi(feaN);
			//cout<<featID<<"  ";
			fesVal=strPrW.substr(pos2,-1);
			//cout<<fesVal.c_str()<<endl;
			const char *feaV=fesVal.c_str();
			//featVal=atof(feaV);
			m_ppfTrainRes[i][featID]=atof(feaV);
			//  cout<<featID<<" "<<m_ppfTrainRes[i][featID]<<endl;
			
		}  
	}
	
}


//-----------------------------------------------------------------------------------------//
//   Purpose:  Initialise the classifier: load the class list, optionally run
//             pre-training, read the feature count and allocate the zero-filled
//             per-class feature table m_ppfTrainRes[m_nClassNum][m_nFeatureNum].
//   Input:    bPreTrain        - when true, PreTrain(sTrainFilesPath) is run first.
//             sTrainFilesPath  - directory handed to PreTrain.
//   Returns:  m_nClassNum.
//----------------------------------------------------------------------------------------//
int CNaiveBayes::InitPara(bool bPreTrain,string sTrainFilesPath)
{
	InitClassInfo();

	if (bPreTrain)
	{
		PreTrain(sTrainFilesPath);
	}

	// The feature count is the text before the first space in _all_words.lst.
	// If the file cannot be read, sTemp stays empty and the count becomes 0.
	ifstream wfile("..\\Dic\\DF\\_all_words.lst");
	string sTemp="";
	getline(wfile,sTemp,' ');
	m_nFeatureNum=atoi(sTemp.c_str());

	// A negative count must never reach new[]; treat it as "no features".
	if (m_nFeatureNum < 0)
	{
		m_nFeatureNum = 0;
	}

	// One zero-initialised row of feature values per class.
	m_ppfTrainRes = new float *[m_nClassNum];
	for(int i=0;i<m_nClassNum;i++)
	{
		m_ppfTrainRes[i]=new float[m_nFeatureNum];
		memset(m_ppfTrainRes[i],0,m_nFeatureNum*sizeof(float));
	}

    return m_nClassNum;
}

// Reads "class.lst": the first line holds the number of classes, each following
// line one class name. Allocates m_psClassName, m_pnTrainNum and m_pfPrC with
// m_nClassNum entries each, fills m_psClassName and m_mapClassName2ID from the
// name lines (at most m_nClassNum of them), and zero-fills the two numeric arrays.
// Returns m_nClassNum.
int CNaiveBayes::InitClassInfo()
{
	ifstream cfile("class.lst");
	string sTemp="";
	getline(cfile,sTemp,'\n');
	m_nClassNum = atoi(sTemp.c_str());

	// A negative count must never reach new[]; treat it as "no classes".
	if (m_nClassNum < 0)
	{
		m_nClassNum = 0;
	}

	m_psClassName = new string[m_nClassNum];
	m_pnTrainNum = new int[m_nClassNum];
	// Previously left uninitialised; start every entry at 0 so that later
	// reads never see indeterminate values.
	memset(m_pnTrainNum,0,m_nClassNum*sizeof(int));

	// The index is checked BEFORE a line is stored, so a count of 0 no longer
	// writes one name past the end of the array. Testing the stream itself
	// (not "!= NULL") compiles before and after C++11.
	int nClassIndex=0;
	while(nClassIndex<m_nClassNum && getline(cfile,sTemp,'\n'))
	{
		m_psClassName[nClassIndex]=sTemp;
		m_mapClassName2ID[sTemp] = nClassIndex;
		nClassIndex++;
	}

	m_pfPrC = new double[m_nClassNum];
    memset(m_pfPrC,0,m_nClassNum*sizeof(double));

	return m_nClassNum;
}

// Runs CSSPS::TrainFiles once per class on the directory
// "<sTrainFilesPath>\<class name>", passing the class name as second argument.
// Returns false when sTrainFilesPath is empty (nothing is trained), true otherwise.
bool CNaiveBayes::PreTrain(string sTrainFilesPath)
{
	CSSPS ssps;
	ssps.Init("Zssps");	

	// An empty path used to make at(len-1) throw std::out_of_range; report it
	// through the return value instead.
	if (sTrainFilesPath.empty())
	{
		return false;
	}

	// Make sure the directory ends with a backslash before appending class names.
	if (sTrainFilesPath[sTrainFilesPath.size()-1] != '\\') 
	{
		sTrainFilesPath += "\\";
	}

	for(int nClassIndex=0;nClassIndex<m_nClassNum;nClassIndex++)
	{
		const string sSubTrainFilesPath = sTrainFilesPath + m_psClassName[nClassIndex];
		ssps.TrainFiles(sSubTrainFilesPath.c_str(), m_psClassName[nClassIndex].c_str());
	}

	return true;
}

// Classifies a single file: reads its content up to the first NUL byte (or end
// of file), converts it with CSSPS::GetVecStr and passes the result to
// TestTermFile. Returns the class index reported by TestTermFile.
// NOTE(review): a file that cannot be opened is not reported; the content is
// then empty and is classified anyway.
int CNaiveBayes::TestAFile(string sTestFilePath)
{
	string rawText;
	ifstream input(sTestFilePath.c_str());
	getline(input, rawText, '\0');

	CSSPS converter;
	converter.Init("Zssps");
	string vectorText = converter.GetVecStr(rawText);

	return TestTermFile(vectorText);
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -