⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 evaluate.cpp

📁 贝叶斯学习算法源码
💻 CPP
📖 第 1 页 / 共 2 页
字号:
#include<iostream.h>
#include <direct.h>
#include <string.h>
#include <io.h>
#include<stdio.h>
#include<fstream.h>

#include "evaluate.h"

// Constructor.
// Loads stored state through ReadInfo() and the stop-word list from
// "StopWord.dat", then configures this instance according to the current
// value of EObjectCnt:
//   0     -> test directory "<Path>Test_Junk",    flag = 0
//   1     -> test directory "<Path>Test_NorMail", flag = 1
//   other -> no directory is set up; WriteInfo("KeyInfo.dat") is called
// EObjectCnt is incremented once per construction.
CEvaluate::CEvaluate()
{
#if 1
	ReadInfo();
	m_StopWordNum = ReadStopList("StopWord.dat");

	// Pick the test sub-directory for this instance (0 means "none").
	const char *subDir = 0;
	if (EObjectCnt == 0)
	{
		// Only the first instance snapshots the set sizes as integers.
		m_FeedSetFileNum[0] = (int)m_SetFileNum[0];
		m_FeedSetFileNum[1] = (int)m_SetFileNum[1];
		subDir = "Test_Junk";
	}
	else if (EObjectCnt == 1)
	{
		subDir = "Test_NorMail";
	}

	if (subDir == 0)
	{
		WriteInfo("KeyInfo.dat");
	}
	else
	{
		// NOTE(review): the 40 is presumably the size of m_MailDir - confirm in evaluate.h.
		memset(m_MailDir, 0, 40);
		strcpy(m_MailDir, Path);
		strcat(m_MailDir, subDir);

		// The browse root is the same directory, with a trailing backslash.
		strcpy(m_szInitDir, m_MailDir);
		int len = strlen(m_szInitDir);
		if (m_szInitDir[len - 1] != '\\')
			strcat(m_szInitDir, "\\");

		flag = (EObjectCnt == 0) ? 0 : 1;
	}

	EObjectCnt++;

#endif
}

// Reads at most Num feature records from "<Path>Sort_accord_Gain.dat" into
// feature[] and then passes the records actually read to MergesortInString().
//
// feature : caller-owned array with room for at least Num records.
// Num     : maximum number of records to load; nothing is read when Num <= 0.
// Returns 0 in all cases. The number of records loaded is not reported.
// Terminates the process with exit(1) if the file cannot be opened.
//
// Side effect: m_MailDir is overwritten with the path of the feature file.
int CEvaluate::SelectFeature(Feature feature[], int Num)
{
	strcpy(m_MailDir,Path);
	strcat(m_MailDir,"Sort_accord_Gain.dat");
	ifstream inCredit(m_MailDir,ios::in );

	if(!inCredit)
	{
		cerr << "Input File could not be opened"  << endl;
		exit(1);
	}

	int i = 0;
	Feature Credit;

	// The capacity test comes first so that feature[i] is never written once
	// i has reached Num (this also covers Num <= 0) and no extra record is
	// consumed from the stream.
	while(i < Num
		&& (inCredit >> Credit.str >> Credit.gain >> Credit.NormalMail
			>> Credit.NormalMail_Prior >> Credit.UnNormalMail >> Credit.UnNormalMail_Prior))
	{
		CopyFeature(feature[i],Credit);
		i ++;
	}

	// Only the i records that were read take part in the sort.
	MergesortInString(feature,i);

	return 0;
}

// Linear search: returns 1 if Des equals one of the first num keyword
// strings in Src.keys, otherwise 0 (also 0 when num <= 0).
int CEvaluate::SeekKeyWords( FileNode Src,const char *Des,int num)
{
	int idx = 0;
	while (idx < num)
	{
		if (strcmp(Src.keys[idx].str, Des) == 0)
			return 1;
		++idx;
	}
	return 0;
}

// Pre-processes one mail file into a sorted list of distinct keywords.
//
// Step 1: copies `filename` to "Cal.dat", lower-casing letters and replacing
//         every other character with a space.
// Step 2: re-reads "Cal.dat" word by word and stores in filenode.keys each
//         word that is 3..38 characters long, not already stored, and not
//         rejected by Del_StopWord(). At most 1199 words are stored.
// Step 3: sorts the stored words with ShellSortKeyWords().
//
// filenode.filename is set to "Cal.dat" and filenode.flag to -1.
// Returns the number of keywords stored, or -1 if filename is null or either
// file cannot be opened.
int CEvaluate::PreDealFile(const char *filename, FileNode &filenode)
{
	if(filename == 0)
		return -1;

	FILE *r = fopen(filename,"r");       // input file
	if( r == 0 )
		return -1;

	FILE *w = fopen("Cal.dat","w+");     // output file, read back below
	if( w == 0 )
	{
		fclose(r);                       // do not leak the input handle
		return -1;
	}

	// fgetc() yields an unsigned-char value (or EOF), which is the only
	// argument range for which isalpha()/tolower() are defined.
	int ch;
	while ((ch = fgetc(r)) != EOF)
	{
		if (isalpha(ch))
			fputc(tolower(ch), w);
		else
			fputc(' ', w);
	}
	fclose(r);

	rewind(w);

	memset(filenode.filename,0,40);
	strcpy(filenode.filename,"Cal.dat");
	filenode.flag = -1;

	char str[100];
	int i = 0;

	// Indices 0..1198 are the only ones ever filled.
	// "%99s" keeps fscanf from writing past the end of str.
	while (i <= 1198 && fscanf(w,"%99s",str) == 1)
	{
		int Len = strlen(str);
		if(Len >= 39)
		{
			if (Len == 99)
			{
				// The token filled the buffer: discard the rest of it so the
				// tail is not mistaken for a separate word.
				int tail;
				while ((tail = fgetc(w)) != EOF && tail != ' ')
					;
			}
			// Over-long tokens are reported and skipped.
			cout << filename << "  " << str << endl;
			continue;
		}

		if(Len >= 3  && SeekKeyWords( filenode,str,i)  == 0  && !Del_StopWord(str,m_StopWordNum))
		{
			memset(filenode.keys[i].str,0,40);
			strcpy(filenode.keys[i].str,str);
			i++;
		}
	}

	ShellSortKeyWords(filenode,i);
	fclose(w);

	return i;
}


// Shell sort of the first N keyword strings in filenode.keys into ascending
// strcmp order. Only the str member of each entry is exchanged.
// Always returns 0.
int CEvaluate::ShellSortKeyWords(FileNode &filenode, int N)
{
	for (int gap = N / 2; gap > 0; gap /= 2)
	{
		for (int i = gap; i < N; i++)
		{
			// Gapped insertion: move element i down its chain until it meets
			// an entry that is not greater than it.
			for (int j = i - gap; j >= 0; j -= gap)
			{
				// Everything further down this chain is already ordered, so
				// the first in-order pair ends the insertion. The result is
				// the same as scanning the whole chain, without the wasted
				// comparisons.
				if (strcmp(filenode.keys[j].str, filenode.keys[j + gap].str) <= 0)
					break;

				char s[40];
				strcpy(s, filenode.keys[j].str);
				strcpy(filenode.keys[j].str, filenode.keys[j + gap].str);
				strcpy(filenode.keys[j + gap].str, s);
			}
		}
	}

	return 0;
}


// Binary search for Src among the first N entries of filenode.keys, which
// must be in ascending strcmp order.
// Returns 1 if an equal string is found, otherwise 0.
int CEvaluate::Binary(const char *Src, FileNode filenode, int N)
{
	int first = 0;
	int last = N - 1;

	while (first <= last)
	{
		const int middle = (first + last) / 2;
		const int order = strcmp(Src, filenode.keys[middle].str);

		if (order == 0)
			return 1;

		if (order < 0)
			last = middle - 1;
		else
			first = middle + 1;
	}

	return 0;
}

// Shell sort of the first N records of M into ascending strcmp order of
// their str member. Records are exchanged through CopyFeature().
// Always returns 0.
int CEvaluate::ShellSortAccordStr(Feature M[], int N)
{
	for (int gap = N / 2; gap > 0; gap /= 2)
	{
		for (int i = gap; i < N; i++)
		{
			// Gapped insertion: move element i down its chain until it meets
			// a record that is not greater than it.
			for (int j = i - gap; j >= 0; j -= gap)
			{
				// The rest of the chain is already ordered, so the first
				// in-order pair ends the insertion. The result is the same
				// as scanning the whole chain.
				if (strcmp(M[j].str, M[j + gap].str) <= 0)
					break;

				Feature x;
				CopyFeature(x, M[j]);
				CopyFeature(M[j], M[j + gap]);
				CopyFeature(M[j + gap], x);
			}
		}
	}

	return 0;
}

// Classifies one mail file and updates the accuracy counter.
//
// str : path of the mail file.
// Num : number of feature records to load and evaluate.
//
// The file is reduced to a sorted keyword list by PreDealFile(); Num feature
// records are loaded by SelectFeature(). For every feature a per-class log
// likelihood term is accumulated: index 0 uses UnNormalMail_Prior, index 1
// uses NormalMail_Prior. The class with the larger log score (likelihood plus
// log of its share of m_SetFileNum) wins; ties go to class 1.
// If the winning class equals `flag`, m_nAccuracy is incremented; otherwise
// FeedBack("Cal.dat", flag) is called.
//
// Returns -1 if str is null or no keyword could be extracted (the file name
// is then printed), otherwise 0.
//
// NOTE(review): SelectFeature() does not report how many records it loaded.
// If the feature file holds fewer than Num records, the remaining entries of
// feature[] are evaluated with whatever new Feature[Num] left in them.
int CEvaluate::Classify(const char *str, int Num)
{
	if(str == 0)
		return -1;
	FileNode filenode;
	int Cnt = PreDealFile(str,filenode);
	if(Cnt < 1)
	{
		cout << str << endl;
		return -1;
	}
	Feature *feature = new Feature[Num];
	if(feature == 0)
		return 0;
	SelectFeature(feature,Num);

	// Scores stay in log space. Exponentiating the sums, as the product form
	// would require, can underflow to 0 for both classes when many features
	// are used, which makes the comparison meaningless.
	double logScore[2] = {0.00, 0.00};
	for(int i = 0;i < Num; i ++)
	{
		// k is 1.0 when the feature word occurs in the mail, else 0.0.
		const double k = (double)(Binary(feature[i].str,filenode,Cnt));
		logScore[0] += log(
			k * feature[i].UnNormalMail_Prior
			+ (1.00 - k) * (1.00 - feature[i].UnNormalMail_Prior)
			);
		logScore[1] += log(
			k * feature[i].NormalMail_Prior
			+ (1.00 - k) * (1.00 - feature[i].NormalMail_Prior)
			);
	}

	// Add the log of each class's share of the training set.
	logScore[0] += log((double)(m_SetFileNum[0] /(m_SetFileNum[0] + m_SetFileNum[1])));
	logScore[1] += log((double)(m_SetFileNum[1] /(m_SetFileNum[0] + m_SetFileNum[1])));

	const int temp = (logScore[0] > logScore[1]) ? 0 : 1;

	if(temp == flag)               // count correctly classified mails
	{
		m_nAccuracy ++;
	}
	else
	{
		FeedBack("Cal.dat",flag);   // feed the misclassified mail back
	}

	delete[] feature;              // allocated with new[]
	return 0;
}


// Resolves `dir` to a full path in m_szInitDir, makes it the current
// directory, and ensures the stored path ends with a backslash.
// Returns false if the path cannot be resolved or entered, true otherwise.
bool CEvaluate::SetInitDir(const char *dir)
{
	const bool resolved = (_fullpath(m_szInitDir, dir, _MAX_PATH) != NULL);

	// _chdir is attempted only when the path was resolved.
	if (!resolved || _chdir(m_szInitDir) != 0)
		return false;

	int len = strlen(m_szInitDir);
	const bool endsWithSeparator = (m_szInitDir[len - 1] == '\\');
	if (!endsWithSeparator)
		strcat(m_szInitDir, "\\");

	return true;
}

// Starts browsing at m_szInitDir for files matching `filespec`.
// Returns whatever BrowseDir() returns.
bool CEvaluate::BeginBrowse(const char *filespec)
{
	const bool completed = BrowseDir(m_szInitDir, filespec);
	return completed;
}

// Per-file callback used by BrowseDir(): counts the file and classifies it
// with TestKeyWordNumber features. The result of Classify() is ignored.
// Always returns true.
bool CEvaluate::ProcessFile(const char *filename)
{
	++m_nFileCount;
	(void)Classify(filename, TestKeyWordNumber);
	return true;
}

// Changes into `dir` and calls ProcessFile() for every non-directory entry
// matching `filespec`.
//
// dir      : directory path; it is prepended verbatim to each file name, so
//            it must already end with a path separator.
// filespec : pattern passed to _findfirst().
//
// Returns false as soon as ProcessFile() returns false, otherwise true
// (including when nothing matches). The result of _chdir() is not checked.
bool CEvaluate::BrowseDir(const char *dir, const char *filespec)
{
	_chdir(dir);

	// NOTE(review): current Microsoft CRTs return intptr_t from _findfirst;
	// `long` is kept to match the toolchain this file targets - confirm before
	// building for 64-bit.
	_finddata_t fileinfo;
	long hFile = _findfirst(filespec,&fileinfo);
	if (hFile == -1)
		return true;

	bool completed = true;
	do
	{
		if (!(fileinfo.attrib & _A_SUBDIR))
		{
			// Skip names that would overflow the fixed-size path buffer.
			if (strlen(dir) + strlen(fileinfo.name) >= _MAX_PATH)
			{
				cerr << "Path too long, skipped: " << dir << fileinfo.name << endl;
			}
			else
			{
				char filename[_MAX_PATH];
				strcpy(filename,dir);
				strcat(filename,fileinfo.name);

				if (!ProcessFile(filename))
				{
					completed = false;
					break;
				}
			}
		}
	} while (_findnext(hFile,&fileinfo) == 0);

	// Release the search handle on every exit path, including the early stop.
	_findclose(hFile);

	return completed;
}


// Returns the current value of m_nAccuracy, the counter Classify()
// increments for each mail whose predicted class equals `flag`.
int CEvaluate::GetAccuracy()
{
	const int correctCount = m_nAccuracy;
	return correctCount;
}


// Bottom-up merge sort of the first Num records of feature[] by their str
// member. Each pass merges runs of length k into a scratch array with
// MergeInString() and copies the result back; k doubles every pass. The
// resulting order is whatever MergeInString() produces.
//
// Returns 0 on success (including Num <= 1, where there is nothing to sort),
// -1 if the scratch array could not be allocated.
int CEvaluate::MergesortInString(Feature feature[], int Num)
{
	// Nothing to sort. This also keeps a negative Num away from new[].
	if (Num <= 1)
		return 0;

	Feature *swap = new Feature[Num];
	if(swap == 0)
		return -1;

	for (int k = 1; k < Num; k = k * 2)
	{
		MergeInString(feature,swap,k,Num);

		// Copy the merged pass back so the next pass reads from feature[].
		for (int i = 0; i < Num; i ++)
		{
			strcpy(feature[i].str,swap[i].str);
			feature[i].gain = swap[i].gain;
			feature[i].NormalMail = swap[i].NormalMail;
			feature[i].NormalMail_Prior = swap[i].NormalMail_Prior;
			feature[i].UnNormalMail = swap[i].UnNormalMail;
			feature[i].UnNormalMail_Prior = swap[i].UnNormalMail_Prior;
		}
	}

	delete[] swap;   // allocated with new[]
	return 0;
}

int CEvaluate::MergeInString(Feature x[], Feature swap[], int k, int n)
{
	
	int i,j,l1,u1,l2,u2,m;
	l1 = 0;
	m = 0;
	while(l1 + k < n)
	{
		l2 = l1 + k;
		u1 = l2 - 1;
		u2 = (l2 + k - 1 <= n - 1) ? (l2 + k - 1) :(n - 1);	

		for( i  = l1,j = l2; i <= u1 && j <= u2 ;m++)
		{
			if(strcmp(x[i].str,x[j].str) >= 0)
			{
				strcpy(swap[m].str,x[i].str);
				swap[m].gain = x[i].gain;
				swap[m].NormalMail = x[i].NormalMail;			
				swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
				swap[m].UnNormalMail = x[i].UnNormalMail;
				swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
				i ++;
			}
			else
			{
				strcpy(swap[m].str,x[j].str);
				swap[m].gain = x[j].gain;
				swap[m].NormalMail = x[j].NormalMail;			
				swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
				swap[m].UnNormalMail = x[j].UnNormalMail;
				swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
				 j++;
			}
		}

		while(i <=  u1)
		{
			strcpy(swap[m].str,x[i].str);
			swap[m].gain = x[i].gain;
			swap[m].NormalMail = x[i].NormalMail;			
			swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
			swap[m].UnNormalMail = x[i].UnNormalMail;
			swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
			m ++;
			i ++;
		}
		while(j <= u2)
		{
			strcpy(swap[m].str,x[j].str);
			swap[m].gain = x[j].gain;
			swap[m].NormalMail = x[j].NormalMail;			
			swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
			swap[m].UnNormalMail = x[j].UnNormalMail;
			swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
			m ++;
			j ++;
		}
		l1 = u2 + 1;	
	}
	for( i =l1;i < n; i ++,m++)
	{
		strcpy(swap[m].str,x[i].str);
		swap[m].gain = x[i].gain;
		swap[m].NormalMail = x[i].NormalMail;			
		swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
		swap[m].UnNormalMail = x[i].UnNormalMail;
		swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
	}
	return 0;

}

int CEvaluate::MergeInKeyWords(FileNode &filenode, int N)
{
	KeyWord *swap = new KeyWord[1200];
	if(swap == 0)
		return -1;

	int k = 1,i;
	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -