⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 browsedir.cpp

📁 贝叶斯学习算法源码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include<iostream.h>
#include <direct.h>
#include <string.h>
#include <io.h>
#include<stdio.h>
#include<fstream.h>

#include "browsedir.h"
static int Flag = 0;   // Number of files processed so far; Split() increments it once per call.
static int Flagtemp = 0;   // Value of Flag captured by the CBrowseDir constructor; Split() writes Flag - Flagtemp as each record's Sequence.
static int WordLine = 0;  // Number of word forms in the lexicon (per the original comment; not referenced in the visible code).
static int ObjectCnt = 0; // Incremented by each CBrowseDir construction made while m_switch == 0; selects that constructor's setup case.


#if 1
// Appends a trailing '\\' to the NUL-terminated path held in `dir`, a buffer
// of `capacity` bytes. The path is left untouched when it is empty, already
// ends with '\\', or has no room left for one more character.
static void EnsureTrailingBackslash(char *dir, size_t capacity)
{
	size_t len = strlen(dir);
	if (len == 0 || dir[len - 1] == '\\')
		return;
	if (len + 1 < capacity)
	{
		dir[len] = '\\';
		dir[len + 1] = '\0';
	}
}

// Sets up the start directory (m_szInitDir) and, for the first two objects
// built while m_switch == 0, the name of the data file that Split() writes to
// (RestoreFileName).
//
// m_switch == 0:
//   ObjectCnt 0 -> start in the current working directory,
//                  RestoreFileName = Path + "Spam_data1.dat"
//   ObjectCnt 1 -> start in the current working directory,
//                  RestoreFileName = Path + "Normal_data2.dat"
//   ObjectCnt 2 -> start in Path
//   otherwise   -> m_szInitDir is left as it was
//   ObjectCnt is incremented afterwards in every case.
// m_switch == 1:
//   start in Path; ObjectCnt is not touched.
//
// NOTE(review): m_switch and Path are presumably filled in by GetSignal();
// confirm against its definition.
CBrowseDir::CBrowseDir()
{
	GetSignal();
	// Remember how many files were processed before this object existed, so
	// Split() can number this object's files starting from 1.
	Flagtemp = Flag;
	ClassCnt ++;
	m_StopWordNum = ReadStopList("StopWord.dat");

	if (m_switch == 0)
	{
		switch (ObjectCnt)
		{
		case 0:
		case 1:
			// A failed getcwd leaves the buffer indeterminate; fall back to an
			// empty path instead of scanning garbage.
			if (getcwd(m_szInitDir,_MAX_PATH) == NULL)
				m_szInitDir[0] = '\0';
			EnsureTrailingBackslash(m_szInitDir,_MAX_PATH);

			// strcpy writes the terminator itself, so no prior clearing of
			// RestoreFileName is needed.
			// NOTE(review): the size of RestoreFileName is not visible here;
			// these copies are unbounded.
			strcpy(RestoreFileName,Path);
			strcat(RestoreFileName,(ObjectCnt == 0) ? "Spam_data1.dat" : "Normal_data2.dat");
			break;
		case 2:
			// NOTE(review): unbounded copy; assumes Path fits in m_szInitDir.
			strcpy(m_szInitDir,Path);
			EnsureTrailingBackslash(m_szInitDir,_MAX_PATH);
			cout << " Route " << endl;
			break;
		default:
			break;
		}

		ObjectCnt ++;
	}
	else if (m_switch == 1)
	{
		// NOTE(review): unbounded copy; assumes Path fits in m_szInitDir.
		strcpy(m_szInitDir,Path);
		EnsureTrailingBackslash(m_szInitDir,_MAX_PATH);
		cout << " Route " << endl;
	}

	m_nFileCount = 0;
	m_nSubdirCount = 0;
}

#endif


// Sets the directory that BeginBrowse() will start from.
//
// dir may be relative; it is resolved to an absolute path and stored in
// m_szInitDir with a trailing '\\'.
//
// Returns false when the path cannot be resolved, the directory cannot be
// entered, or there is no room left in the buffer for the trailing '\\'.
// Side effect: on success the process's current directory is changed to dir.
bool CBrowseDir::SetInitDir(const char *dir)
{
	// Resolve dir to an absolute path first.
	if (_fullpath(m_szInitDir,dir,_MAX_PATH) == NULL)
		return false;

	// Entering the directory doubles as the existence check.
	if (_chdir(m_szInitDir) != 0)
		return false;

	// Make sure the stored path ends with '\\' without reading before the
	// start of the buffer or writing past its _MAX_PATH bytes.
	size_t len = strlen(m_szInitDir);
	if (len == 0)
		return false;
	if (m_szInitDir[len-1] != '\\')
	{
		if (len + 1 >= static_cast<size_t>(_MAX_PATH))
			return false;
		m_szInitDir[len] = '\\';
		m_szInitDir[len+1] = '\0';
	}

	return true;
}


// Starts the traversal at m_szInitDir.
//
// The start directory itself is reported to ProcessDir() (with no parent)
// before BrowseDir() walks it for files matching filespec.
// Returns whatever BrowseDir() returns.
bool CBrowseDir::BeginBrowse(const char *filespec)
{
	ProcessDir(m_szInitDir, NULL);

	const bool completed = BrowseDir(m_szInitDir, filespec);
	return completed;
}


// Recursively walks dir (which must end with '\\').
//
// Pass 1 hands every non-directory entry matching filespec to ProcessFile().
// Pass 2 reports every subdirectory except "." and ".." to ProcessDir() and
// recurses into it.
//
// Returns false as soon as ProcessFile() or a recursive call returns false,
// true otherwise. The search handle is always closed before returning.
// Side effects: changes the process's current directory; overwrites FName and
// tempFName.
bool CBrowseDir::BrowseDir(const char *dir,const char *filespec)
{
	_chdir(dir);

	long hFile;
	_finddata_t fileinfo;
	bool ok = true;

	// Pass 1: files in dir that match filespec.
	if ((hFile=_findfirst(filespec,&fileinfo)) != -1)
	{
		do
		{
			if (!(fileinfo.attrib & _A_SUBDIR))
			{
				char filename[_MAX_PATH];
				strcpy(filename,dir);

				// Build the output directory in FName: Restore, a '\\', then
				// dir without its first three characters.
				// NOTE(review): the skipped characters are presumably a drive
				// prefix such as "C:\" - confirm with the callers.
				strcpy(FName,Restore);
				int Len = strlen(FName);
				if (Len == 0 || FName[Len - 1] != '\\')
					strcat(FName,"\\");

				// Never read past the end of a dir shorter than the prefix.
				if (strlen(filename) >= 3)
					strcat(FName,&filename[3]);
				CreateDir(FName);        // create the output directory

				strcat(filename,fileinfo.name);
				strcpy(tempFName,fileinfo.name);
				// ProcessFile() passes FName on to Split(), so it must hold
				// the output file path by now.
				strcat(FName,fileinfo.name);

				if (!ProcessFile(filename))
				{
					ok = false;
					break;
				}
			}
		} while (_findnext(hFile,&fileinfo) == 0);

		_findclose(hFile);
		if (!ok)
			return false;
	}

	// Pass 2: subdirectories of dir.
	_chdir(dir);
	if ((hFile=_findfirst("*.*",&fileinfo)) != -1)
	{
		do
		{
			if ((fileinfo.attrib & _A_SUBDIR))
			{
				if (strcmp(fileinfo.name,".") != 0 && strcmp(fileinfo.name,"..") != 0)
				{
					char subdir[_MAX_PATH];
					strcpy(subdir,dir);
					strcat(subdir,fileinfo.name);

					// NOTE(review): this calls CreateDir on the source
					// subdirectory that was just enumerated - confirm intent.
					strcpy(FName,subdir);
					CreateDir(FName);

					strcat(subdir,"\\");
					ProcessDir(subdir,dir);
					if (!BrowseDir(subdir,filespec))
					{
						ok = false;
						break;
					}
				}
			}
		} while (_findnext(hFile,&fileinfo) == 0);

		_findclose(hFile);
	}

	return ok;
}


bool CBrowseDir::ProcessFile(const char *filename)
{
	m_nFileCount++;		
	int k = 0;
	Item M[2000];

	Split(filename, FName, M,2000);


	return true;
}

// Called once for every directory entered; only counts it.
// Neither currentdir nor parentdir is used.
void CBrowseDir::ProcessDir(const char *currentdir,const char *parentdir)
{
	++m_nSubdirCount;
}

// Looks for Des among the first num entries of Sour.
// On the first entry whose str equals Des, increments that entry's freq and
// returns 1. Returns 0 when there is no match (including num <= 0).
int CBrowseDir::Seek( Item *Sour,const char *Des,int num)   
{
	for (int idx = 0; idx < num; ++idx)
	{
		Item &entry = Sour[idx];
		if (strcmp(entry.str, Des) == 0)
		{
			entry.freq = entry.freq + 1;
			return 1;
		}
	}

	return 0;
}

// Extracts the words of one file and appends its key words to the data file
// named by RestoreFileName.
//
//   path    - file to read; handed to PreDeal() together with M and num.
//   outFile - not used.
//   M       - scratch array that PreDeal() fills.
//   num     - passed to PreDeal() (presumably the capacity of M - TODO confirm).
//
// One line is written per kept word:
//   str freq density Sequence FileCnt flag
// Always returns 0. Terminates the process with exit(1) if the data file
// cannot be opened.
int CBrowseDir::Split(const char * path, char * outFile,Item M[], int num ) 
{
	++ Flag;    // count of files processed so far
	int i = 0;
	int k = PreDeal(path,M,num);

	// Records from every processed file accumulate in the same data file.
	// ios::app appends under both the classic and the standard iostream
	// library, whereas ios::out | ios::ate truncates under the standard one.
	ofstream outCredit(RestoreFileName,ios::out | ios::app);
	
	if(!outCredit)
	{
		cerr << "File could not be opened"  << endl;
		exit(1);
	}

	// Totals over the words InfoLessWord() does not reject: number of
	// distinct words and their summed frequency.
	WordofUnique = 0;
	WordNumber = 0;
	for( i = 0;i < k;i++ )
	{
		if(InfoLessWord(M[i].str) == 0)
		{
			WordofUnique++;
			WordNumber += M[i].freq;
		}
	}

	// Keep at most KeyNum entries: order all k by ShellSortInFreq()
	// (presumably by frequency - TODO confirm), then sort the kept prefix
	// with ShellSort().
	ShellSortInFreq(M,k);
	int Count = ((k <= KeyNum)? k: KeyNum);
	ShellSort(M,Count);

	for( i = 0;i < Count;i++ )
	{
		if(InfoLessWord(M[i].str) == 0  )
		{			
			// Share of this word among all counted words of the file.
			M[i].density = (double)(M[i].freq/(double)WordNumber); 
			// Index of this file among those handled since the constructor
			// captured Flagtemp.
			M[i].Sequence = Flag - Flagtemp;
			M[i].FileCnt = 1;
			M[i].flag = 0;
			outCredit << M[i].str <<' ' << M[i].freq << ' ' 
				<<M[i].density << ' ' << M[i].Sequence <<' '
				<< M[i].FileCnt <<' ' <<M[i].flag << '\n';
		}
	}
	
	return 0;
}

// Reports whether pszDirName names an existing directory.
//
// One trailing '\\' or '/' is ignored. Returns false for NULL, an empty
// string, a path that does not fit in _MAX_PATH, a path that cannot be found,
// or a path that exists but is not a directory.
bool DirExist(const char *pszDirName)
{
	if (pszDirName == NULL)
		return false;

	// Reject input that would underflow the index below or overflow the copy.
	size_t nLen = strlen(pszDirName);
	if (nLen == 0 || nLen >= static_cast<size_t>(_MAX_PATH))
		return false;

	char _szDir[_MAX_PATH];
	strcpy(_szDir, pszDirName);
	if( (_szDir[nLen-1] == '\\') || (_szDir[nLen-1] == '/') )
	{
		_szDir[nLen-1] = '\0';
	}

	_finddata_t fileinfo;
	long hFind = _findfirst(_szDir, &fileinfo);
	if (hFind == -1)
	{
		return false;
	}

	bool bIsDir = (fileinfo.attrib & _A_SUBDIR) != 0;
	_findclose(hFind);
	return bIsDir;
}



// Creates the directory pszDirName, creating any missing parent directories
// on the way, so multi-level paths can be created in one call.
//
// One trailing '\\' or '/' is ignored. Both '\\' and '/' separate path
// components. Each directory this call creates gets FILE_ATTRIBUTE_NORMAL.
// Returns true when the directory exists afterwards (or already existed);
// false for NULL, an empty path, a path that does not fit in _MAX_PATH, or
// when the final directory still does not exist.
bool CreateDir(const char *pszDirName)
{
	if (pszDirName == NULL)
		return false;

	size_t nLen = strlen(pszDirName);
	if (nLen == 0 || nLen >= static_cast<size_t>(_MAX_PATH))
		return false;

	if( DirExist(pszDirName) )
		return true;

	char _szDir[_MAX_PATH];
	strcpy(_szDir, pszDirName);
	if( _szDir[nLen-1] == '\\' || _szDir[nLen-1] == '/' )
	{
		_szDir[--nLen] = '\0';
	}
	if (nLen == 0)
		return false;

	// Try to create every prefix that ends just before a separator. The
	// buffer is cut in place at the separator and restored afterwards, so no
	// second buffer is needed. Failures (prefix already exists, drive name,
	// empty prefix) are expected and ignored.
	for (size_t idx = 0; idx < nLen; idx++)
	{
		const char ch = _szDir[idx];
		if (ch != '\\' && ch != '/')
			continue;

		_szDir[idx] = '\0';
		if( CreateDirectory(_szDir, NULL) )
		{
			SetFileAttributes(_szDir, FILE_ATTRIBUTE_NORMAL);
		}
		_szDir[idx] = ch;
	}

	// Finally the full path itself.
	if( CreateDirectory(_szDir, NULL) )
	{
		SetFileAttributes(_szDir, FILE_ATTRIBUTE_NORMAL);
	}

	return DirExist(_szDir);
}  


// Tells whether Word should be ignored as uninformative.
// Returns true when Word is shorter than three characters or equals
// (case-sensitively) one of the built-in stop words; false otherwise.
bool CBrowseDir::InfoLessWord(const char *Word)     
{
	// Kept in ascending strcmp order so the table can be binary-searched.
	static const char *const kStopWords[16] = {" ","an","and","at","in","is","it","on","or","subject","that","the","these","this","those","to"};

	if (strlen(Word) < 3)
		return true;

	// Binary search over the half-open window [first, last). Entry 0 is left
	// out of the window; it is shorter than three characters and can never
	// match a word that reached this point.
	int first = 1;
	int last = 16;
	while (first < last)
	{
		const int probe = first + (last - first) / 2;
		const int order = strcmp(Word, kStopWords[probe]);

		if (order == 0)
			return true;

		if (order < 0)
			last = probe;
		else
			first = probe + 1;
	}

	return false;
}



// Sorts the first N entries of M into ascending strcmp order of their str
// field, using Shell sort with gaps N/2, N/4, ..., 1. Entries are exchanged
// through Copy(). Always returns 0.
int CBrowseDir::ShellSort(Item M[], int N)
{
	for (int gap = N / 2; gap > 0; gap /= 2)
	{
		for (int i = gap; i < N; i++)
		{
			// Sink M[i] down its gap-spaced chain. Everything below it in the
			// chain is already ordered, so the first pair found in order means
			// no later comparison could swap either; stopping there performs
			// exactly the same swaps as scanning the whole chain.
			for (int j = i - gap; j >= 0; j -= gap)
			{
				if (strcmp(M[j].str,M[j + gap].str) <= 0)
					break;

				Item x;
				Copy(x,M[j]);
				Copy(M[j],M[j + gap]);
				Copy(M[j + gap],x);
			}
		}
	}

	return 0;
}


int CBrowseDir::mergesort( char *file )
{
	Item *M = new Item[70000];
	Item *swap = new Item[70000];
	if (M == 0 || swap == 0)
		return -1;
	
	int count = 0,countCur = 0;  //变量countCur计算当前数组的有效长度
	
	ifstream inCredit(RestoreFileName,ios::in );
	if(!inCredit)
	{
		cerr << "Input File could not be opened"  << endl;
		exit(1);
	}
	Item Credit;
	
	while(inCredit >> Credit.str >> Credit.freq >> Credit.density 
		>> Credit.Sequence >>Credit.FileCnt >>Credit.flag)
	{
		Copy(M[countCur],Credit);
		countCur++;
	}
	
	count = countCur;
	int i ,k;
	k = 1;
	while( k < count)
	{
		merge(M,swap,k,count - 1);
		for( i = 0;i < count - 1;i ++)
		{
			Copy(M[i],swap[i]);
		}
		cout << endl;
		 k = k * 2;
	}

	/************************************************************************/

	char str[100];
	memset(str,0,100);
	strcpy(str,RestoreFileName);
	str[strlen(str) - 4] = 0;
	strcat(str,"temp.dat");
	ofstream outCredit(str,ios::out);
	if(!outCredit)
	{
		cerr << "Output File could not be opened."  << endl;
		exit(1);
	}

	int CN = 0;             //计算实际合并后的数据数目
	
	for( countCur = 0;countCur < count;countCur ++)
	{
		if( atof(M[countCur].str) > 0.0000001)
			M[countCur].flag = 1;
		else
			for(int temp = 1;temp <= 30;temp ++)
			{
				if(strcmp(M[countCur].str,M[countCur + temp].str) == 0  
					&& M[countCur].flag == 0 && M[countCur + temp].flag == 0  
					&& (countCur + temp < i))
				{
					M[countCur].density += M[countCur + temp].density;
					M[countCur ].freq += M[countCur + temp].freq;
					
					if(M[countCur].Sequence != M[countCur + temp].Sequence)
						M[countCur ].FileCnt ++;

					M[countCur + temp].flag = 1;
				}
			}
			
			if(M[countCur].flag == 0)
			{
				CN ++;
				outCredit << M[countCur].str <<' ' << M[countCur].freq << ' ' 
					<<M[countCur].density << ' ' << M[countCur].Sequence <<' '
					<< M[countCur].FileCnt <<' ' <<M[countCur].flag << '\n';
			}
	}	

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -