📄 browsedir.cpp
字号:
#include<iostream.h>
#include <direct.h>
#include <string.h>
#include <io.h>
#include<stdio.h>
#include<fstream.h>
#include "browsedir.h"
static int Flag = 0; // number of files processed so far; incremented once per Split() call
static int Flagtemp = 0; // value of Flag captured by the CBrowseDir constructor; Split() writes Flag - Flagtemp as a record's Sequence
static int WordLine = 0; // number of word entries in the lexicon (not referenced in the visible part of this file)
static int ObjectCnt = 0; // number of CBrowseDir objects constructed while m_switch == 0; selects the constructor's setup case
#if 1
// Appends a trailing '\\' to `path` unless it is empty or already ends with one.
// An empty path is left empty so that no byte before the buffer is ever read.
static void AppendTrailingBackslash(char *path)
{
    size_t len = strlen(path);
    if (len > 0 && path[len - 1] != '\\')
        strcat(path, "\\");
}

/*
 * Default constructor.
 *
 * Calls GetSignal(), snapshots the processed-file counter (Flag) into
 * Flagtemp, increments ClassCnt and loads the stop-word list from
 * "StopWord.dat". It then initialises the start directory m_szInitDir:
 *
 *   m_switch == 0, ObjectCnt == 0 : m_szInitDir = current working directory,
 *                                   RestoreFileName = Path + "Spam_data1.dat"
 *   m_switch == 0, ObjectCnt == 1 : m_szInitDir = current working directory,
 *                                   RestoreFileName = Path + "Normal_data2.dat"
 *   m_switch == 0, ObjectCnt == 2 : m_szInitDir = Path
 *   m_switch == 0, ObjectCnt >= 3 : nothing is set up
 *   m_switch == 1                 : m_szInitDir = Path
 *
 * ObjectCnt is incremented on every construction with m_switch == 0.
 * NOTE(review): m_switch is read right after GetSignal(); presumably that
 * call sets it - confirm.
 */
CBrowseDir::CBrowseDir()
{
    GetSignal();
    Flagtemp = Flag;
    ClassCnt ++;
    m_StopWordNum = ReadStopList("StopWord.dat");

    if (m_switch == 0)
    {
        switch (ObjectCnt)
        {
        case 0:
        case 1:
            // If the working directory cannot be obtained, fall back to an
            // empty string instead of leaving the buffer undefined.
            if (getcwd(m_szInitDir, _MAX_PATH) == NULL)
                m_szInitDir[0] = '\0';
            AppendTrailingBackslash(m_szInitDir);

            // strcpy fully (re)initialises RestoreFileName, so no prior
            // clearing is needed; the old memset measured the buffer with
            // strlen() before anything had been written to it.
            strcpy(RestoreFileName, Path);
            strcat(RestoreFileName,
                   (ObjectCnt == 0) ? "Spam_data1.dat" : "Normal_data2.dat");
            break;

        case 2:
            strcpy(m_szInitDir, Path);
            AppendTrailingBackslash(m_szInitDir);
            cout << " Route " << endl;
            break;

        default:
            break;
        }
        ObjectCnt ++;
    }
    else if (m_switch == 1)
    {
        strcpy(m_szInitDir, Path);
        AppendTrailingBackslash(m_szInitDir);
        cout << " Route " << endl;
    }

    m_nFileCount = 0;
    m_nSubdirCount = 0;
}
#endif
/*
 * Sets the directory from which browsing starts.
 *
 * `dir` is converted to an absolute path stored in m_szInitDir, the process
 * changes into that directory to verify it exists, and a trailing '\\' is
 * appended when missing.
 *
 * Returns false when the path cannot be resolved or entered, true otherwise.
 */
bool CBrowseDir::SetInitDir(const char *dir)
{
    // Resolve to an absolute path first; only then probe it with _chdir.
    const bool resolved = (_fullpath(m_szInitDir, dir, _MAX_PATH) != NULL);
    if (!resolved || _chdir(m_szInitDir) != 0)
        return false;

    // Entry names are appended directly to this path, so it must end in '\\'.
    const int lastIndex = strlen(m_szInitDir) - 1;
    const bool hasSeparator = (m_szInitDir[lastIndex] == '\\');
    if (!hasSeparator)
        strcat(m_szInitDir, "\\");

    return true;
}
/*
 * Starts a traversal at m_szInitDir.
 *
 * The start directory itself is first reported through ProcessDir (with no
 * parent), then BrowseDir walks it using `filespec` as the file pattern.
 *
 * Returns whatever BrowseDir returns.
 */
bool CBrowseDir::BeginBrowse(const char *filespec)
{
    const char *const noParent = NULL;
    ProcessDir(m_szInitDir, noParent);

    const bool completed = BrowseDir(m_szInitDir, filespec);
    return completed;
}
/*
 * Recursively walks `dir`.
 *
 * Pass 1: for every non-directory entry matching `filespec`, builds the
 *         output path in FName (Restore + '\\' + `dir` without its first
 *         three characters + entry name), creates the output directory,
 *         stores the bare entry name in tempFName and calls ProcessFile
 *         with the full source path.
 * Pass 2: for every sub-directory other than "." and "..", calls CreateDir
 *         on it, reports it through ProcessDir and recurses into it.
 *
 * `dir` must end with a path separator because entry names are appended
 * to it directly.
 *
 * Returns false as soon as ProcessFile or a nested BrowseDir returns false,
 * true otherwise. The search handle is closed on every exit path.
 */
bool CBrowseDir::BrowseDir(const char *dir,const char *filespec)
{
    // NOTE(review): newer CRTs return intptr_t from _findfirst; long is kept
    // to match the toolchain this file targets - confirm before porting.
    long hFile;
    _finddata_t fileinfo;
    bool ok = true;

    _chdir(dir);
    if ((hFile = _findfirst(filespec, &fileinfo)) != -1)
    {
        do
        {
            if (!(fileinfo.attrib & _A_SUBDIR))
            {
                char filename[_MAX_PATH];
                strcpy(filename, dir);

                // Build the mirrored output directory under Restore.
                strcpy(FName, Restore);
                int Len = strlen(FName);
                if (Len > 0 && FName[Len - 1] != '\\')
                    strcat(FName, "\\");
                // Skip the first three characters of the source directory
                // (presumably the "X:\" drive prefix - confirm). Guarded so a
                // shorter `dir` never reads past its terminator.
                if (strlen(filename) > 3)
                    strcat(FName, &filename[3]);
                CreateDir(FName); // create the output directory

                strcat(filename, fileinfo.name);
                strcpy(tempFName, fileinfo.name);
                strcat(FName, fileinfo.name);

                if (!ProcessFile(filename))
                {
                    // Leave the loop instead of returning so the handle
                    // below is still released.
                    ok = false;
                    break;
                }
            }
        } while (_findnext(hFile, &fileinfo) == 0);
        _findclose(hFile);
    }
    if (!ok)
        return false;

    _chdir(dir);
    if ((hFile = _findfirst("*.*", &fileinfo)) != -1)
    {
        do
        {
            if ((fileinfo.attrib & _A_SUBDIR))
            {
                if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
                {
                    char subdir[_MAX_PATH];
                    strcpy(subdir, dir);
                    strcat(subdir, fileinfo.name);

                    strcpy(FName, subdir);
                    CreateDir(FName);

                    strcat(subdir, "\\");
                    ProcessDir(subdir, dir);
                    if (!BrowseDir(subdir, filespec))
                    {
                        ok = false;
                        break;
                    }
                }
            }
        } while (_findnext(hFile, &fileinfo) == 0);
        _findclose(hFile);
    }
    return ok;
}
bool CBrowseDir::ProcessFile(const char *filename)
{
m_nFileCount++;
int k = 0;
Item M[2000];
Split(filename, FName, M,2000);
return true;
}
/*
 * Called once for every directory entered during browsing.
 *
 * Only counts the directory; neither `currentdir` nor `parentdir` is used.
 */
void CBrowseDir::ProcessDir(const char *currentdir,const char *parentdir)
{
    ++m_nSubdirCount;
}
/*
 * Looks for the string `Des` among the first `num` entries of `Sour`.
 *
 * On the first match the entry's freq is increased by one and 1 is returned.
 * Returns 0 when there is no match (including when `num` is 0).
 */
int CBrowseDir::Seek( Item *Sour,const char *Des,int num)
{
    int idx = 0;
    while (idx < num)
    {
        Item &entry = Sour[idx];
        if (strcmp(entry.str, Des) == 0)
        {
            entry.freq = entry.freq + 1;
            return 1;
        }
        ++idx;
    }
    return 0;
}
/*
 * Extracts the key words of one text file and appends them to the result
 * file named by RestoreFileName.
 *
 * path    - file to analyse; handed to PreDeal, which fills M and returns
 *           the number of entries it produced.
 * outFile - not used by this function.
 * M       - caller-supplied scratch table.
 * num     - capacity of M, passed on to PreDeal.
 *
 * Steps:
 *   1. Increment the processed-file counter Flag.
 *   2. Count the entries that are not rejected by InfoLessWord
 *      (WordofUnique) and sum their frequencies (WordNumber).
 *   3. Call ShellSortInFreq on all entries (presumably orders them by
 *      frequency - confirm), keep at most KeyNum of them and sort those by
 *      word with ShellSort.
 *   4. For every kept entry that passes InfoLessWord, write one line:
 *      "str freq density Sequence FileCnt flag".
 *
 * Always returns 0. Terminates the process with exit(1) when the result
 * file cannot be opened.
 */
int CBrowseDir::Split(const char * path, char * outFile,Item M[], int num )
{
    ++ Flag; // one more text processed

    int i;
    int k = PreDeal(path, M, num);

    // Every processed file adds its records after the ones already written.
    // ios::app states that directly; out|ate truncates the file under the
    // standard library's fstream semantics, which would discard the records
    // of all previously processed files.
    ofstream outCredit(RestoreFileName, ios::out | ios::app);
    if (!outCredit)
    {
        cerr << "File could not be opened" << endl;
        exit(1);
    }

    WordofUnique = 0;
    WordNumber = 0;
    for (i = 0; i < k; i++)
    {
        if (InfoLessWord(M[i].str) == 0)
        {
            WordofUnique++;
            WordNumber += M[i].freq;
        }
    }

    ShellSortInFreq(M, k);
    int Count = ((k <= KeyNum) ? k : KeyNum);
    ShellSort(M, Count);

    for (i = 0; i < Count; i++)
    {
        if (InfoLessWord(M[i].str) == 0)
        {
            // Any entry reaching this point was also counted above, so
            // WordNumber already includes its frequency.
            M[i].density = (double)(M[i].freq / (double)WordNumber);
            M[i].Sequence = Flag - Flagtemp; // index of this file since the object was constructed
            M[i].FileCnt = 1;
            M[i].flag = 0;
            outCredit << M[i].str << ' ' << M[i].freq << ' '
                      << M[i].density << ' ' << M[i].Sequence << ' '
                      << M[i].FileCnt << ' ' << M[i].flag << '\n';
        }
    }
    return 0;
}
bool DirExist(const char *pszDirName)
{
_finddata_t fileinfo;
char _szDir[_MAX_PATH];
strcpy(_szDir, pszDirName);
int nLen = strlen(_szDir);
if( (_szDir[nLen-1] == '\\') || (_szDir[nLen-1] == '/') )
{
_szDir[nLen-1] = '\0';
}
long hFind = _findfirst(_szDir, &fileinfo);
if (hFind == -1)
{
return false;
}
if( fileinfo.attrib & _A_SUBDIR )
{
_findclose(hFind);
return true;
}
_findclose(hFind);
return false;
}
// 创建目录,包含子目录,可以创建多级子目录
bool CreateDir(const char *pszDirName)
{
bool bRet = false;
char _szDir[_MAX_PATH];
char _szTmp[_MAX_DIR];
int nLen = 0;
int idx ;
if( (DirExist(pszDirName)) == true )
return true;
strcpy(_szDir, pszDirName);
nLen = strlen(_szDir);
if( _szDir[nLen-1] == '\\' || _szDir[nLen-1] == '/' )
{
_szDir[nLen-1] = '\0';
}
nLen = strlen(_szDir);
memset(_szTmp, 0, _MAX_DIR);
char _str[2];
for(idx = 0; idx < nLen; idx++)
{
if(_szDir[idx] != '\\' )
{
_str[0] = _szDir[idx];
_str[1] = 0;
strcat(_szTmp, _str);
}
else
{
bRet = (bool)CreateDirectory(_szTmp, NULL);
if( bRet )
{
SetFileAttributes(_szTmp, FILE_ATTRIBUTE_NORMAL);
}
_str[0] = _szDir[idx];
_str[1] = 0;
strcat(_szTmp, _str);
}
if( idx == nLen-1 )
{
bRet = (bool)CreateDirectory(_szTmp, NULL);
if( bRet )
{
SetFileAttributes(_szTmp, FILE_ATTRIBUTE_NORMAL);
}
}
}
if( DirExist(_szTmp) )
return true;
return false;
}
/*
 * Tells whether `Word` carries no information for classification.
 *
 * A word is rejected (true) when it is shorter than three characters or
 * when it appears in the built-in stop-word table. The comparison is
 * case-sensitive. Returns false for every other word.
 */
bool CBrowseDir::InfoLessWord(const char *Word)
{
    // Sorted in strcmp order so it can be binary-searched.
    static const char *const kStopWords[16] = {
        " ", "an", "and", "at", "in", "is", "it", "on",
        "or", "subject", "that", "the", "these", "this", "those", "to"
    };

    if (strlen(Word) < 3)
        return true;

    // Binary search over entries 1..15; entry 0 is never probed.
    int first = 1;
    int last = 15;
    while (first <= last)
    {
        const int probe = (first + last) / 2;
        const int order = strcmp(Word, kStopWords[probe]);
        if (order == 0)
            return true;

        if (order < 0)
            last = probe - 1;
        else
            first = probe + 1;
    }
    return false;
}
/*
 * Sorts the first N entries of M into ascending order of their str field
 * (strcmp order) using Shell sort with gaps N/2, N/4, ..., 1.
 *
 * Always returns 0.
 */
int CBrowseDir::ShellSort(Item M[], int N)
{
    for (int gap = N / 2; gap > 0; gap /= 2)
    {
        for (int i = gap; i < N; i++)
        {
            // Sink element i towards the front of its gap-spaced chain.
            for (int j = i - gap; j >= 0; j -= gap)
            {
                // The chain in front of i is already ordered for this gap,
                // so the first in-order pair means the element has reached
                // its place; further comparisons could not swap anything.
                if (strcmp(M[j].str, M[j + gap].str) <= 0)
                    break;

                Item x;
                Copy(x, M[j]);
                Copy(M[j], M[j + gap]);
                Copy(M[j + gap], x);
            }
        }
    }
    return 0;
}
int CBrowseDir::mergesort( char *file )
{
Item *M = new Item[70000];
Item *swap = new Item[70000];
if (M == 0 || swap == 0)
return -1;
int count = 0,countCur = 0; //变量countCur计算当前数组的有效长度
ifstream inCredit(RestoreFileName,ios::in );
if(!inCredit)
{
cerr << "Input File could not be opened" << endl;
exit(1);
}
Item Credit;
while(inCredit >> Credit.str >> Credit.freq >> Credit.density
>> Credit.Sequence >>Credit.FileCnt >>Credit.flag)
{
Copy(M[countCur],Credit);
countCur++;
}
count = countCur;
int i ,k;
k = 1;
while( k < count)
{
merge(M,swap,k,count - 1);
for( i = 0;i < count - 1;i ++)
{
Copy(M[i],swap[i]);
}
cout << endl;
k = k * 2;
}
/************************************************************************/
char str[100];
memset(str,0,100);
strcpy(str,RestoreFileName);
str[strlen(str) - 4] = 0;
strcat(str,"temp.dat");
ofstream outCredit(str,ios::out);
if(!outCredit)
{
cerr << "Output File could not be opened." << endl;
exit(1);
}
int CN = 0; //计算实际合并后的数据数目
for( countCur = 0;countCur < count;countCur ++)
{
if( atof(M[countCur].str) > 0.0000001)
M[countCur].flag = 1;
else
for(int temp = 1;temp <= 30;temp ++)
{
if(strcmp(M[countCur].str,M[countCur + temp].str) == 0
&& M[countCur].flag == 0 && M[countCur + temp].flag == 0
&& (countCur + temp < i))
{
M[countCur].density += M[countCur + temp].density;
M[countCur ].freq += M[countCur + temp].freq;
if(M[countCur].Sequence != M[countCur + temp].Sequence)
M[countCur ].FileCnt ++;
M[countCur + temp].flag = 1;
}
}
if(M[countCur].flag == 0)
{
CN ++;
outCredit << M[countCur].str <<' ' << M[countCur].freq << ' '
<<M[countCur].density << ' ' << M[countCur].Sequence <<' '
<< M[countCur].FileCnt <<' ' <<M[countCur].flag << '\n';
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -