📄 evaluate.cpp
字号:
#include<iostream.h>
#include <direct.h>
#include <string.h>
#include <io.h>
#include<stdio.h>
#include<fstream.h>
#include "evaluate.h"
// Constructs an evaluator whose role depends on the value of EObjectCnt
// (declared outside this block) at construction time:
//   0 -> test directory is Path + "Test_Junk",    flag = 0
//   1 -> test directory is Path + "Test_NorMail", flag = 1
//   any other value -> no directory is prepared; WriteInfo("KeyInfo.dat")
//                      is called instead.
// ReadInfo() and the stop-word list are loaded first in every case, and
// EObjectCnt is incremented last.
CEvaluate:: CEvaluate()
{
#if 1
    ReadInfo();
    m_StopWordNum = ReadStopList("StopWord.dat");

    // Pick the sub-directory and expected label for this instance.
    const char *subDir = 0;
    int label = -1;
    switch (EObjectCnt)
    {
    case 0:
        // Only the first instance snapshots the training-set sizes.
        m_FeedSetFileNum[0] = (int)m_SetFileNum[0];
        m_FeedSetFileNum[1] = (int)m_SetFileNum[1];
        subDir = "Test_Junk";
        label = 0;
        break;
    case 1:
        subDir = "Test_NorMail";
        label = 1;
        break;
    default:
        break;
    }

    if (subDir == 0)
    {
        WriteInfo("KeyInfo.dat");
    }
    else
    {
        // Build "<Path><subDir>" and mirror it into m_szInitDir with a
        // guaranteed trailing backslash.
        memset(m_MailDir, 0, 40);
        strcpy(m_MailDir, Path);
        strcat(m_MailDir, subDir);
        strcpy(m_szInitDir, m_MailDir);
        const int last = (int)strlen(m_szInitDir) - 1;
        if (m_szInitDir[last] != '\\')
            strcat(m_szInitDir, "\\");
        flag = label;
    }

    EObjectCnt ++;
#endif
}
// Loads up to Num feature records from "<Path>Sort_accord_Gain.dat" into
// feature[] and then reorders the loaded records with MergesortInString().
//
// Each record is read as six whitespace-separated fields, in this order:
//   str  gain  NormalMail  NormalMail_Prior  UnNormalMail  UnNormalMail_Prior
//
// Parameters:
//   feature - caller-owned array with room for at least Num elements.
//   Num     - maximum number of records to load; nothing is written when
//             Num <= 0.
// Returns 0. If the file cannot be opened the process is terminated with
// exit(1) after printing a message to cerr.
//
// NOTE(review): the number of records actually loaded is not reported to the
// caller; if the file holds fewer than Num records, the tail of feature[] is
// left untouched.
// NOTE(review): "inCredit >> Credit.str" has no width limit; the size of
// Feature::str is not visible here, so an over-long token in the data file
// could overrun it - confirm against evaluate.h.
int CEvaluate::SelectFeature(Feature feature[], int Num)
{
    // m_MailDir is reused as scratch space for the data-file path.
    strcpy(m_MailDir, Path);
    strcat(m_MailDir, "Sort_accord_Gain.dat");
    ifstream inCredit(m_MailDir, ios::in);
    if (!inCredit)
    {
        cerr << "Input File could not be opened" << endl;
        exit(1);
    }

    int loaded = 0;
    Feature Credit;
    // Check the bound BEFORE reading so that feature[Num] is never written
    // (the previous code stored feature[0] even when Num was 0).
    while (loaded < Num
           && (inCredit >> Credit.str >> Credit.gain >> Credit.NormalMail
                        >> Credit.NormalMail_Prior >> Credit.UnNormalMail
                        >> Credit.UnNormalMail_Prior))
    {
        CopyFeature(feature[loaded], Credit);
        loaded++;
    }

    MergesortInString(feature, loaded);
    return 0;
}
// Linear search: returns 1 if Des already appears among the first num
// entries of Src.keys, otherwise 0 (also 0 when num <= 0).
int CEvaluate::SeekKeyWords( FileNode Src,const char *Des,int num)
{
    for (int idx = 0; idx < num; ++idx)
    {
        if (strcmp(Src.keys[idx].str, Des) == 0)
            return 1;
    }
    return 0;
}
// Pre-processes one mail file into a sorted list of distinct keywords.
//
// Pass 1 copies the file into "Cal.dat" (created in the current directory),
// lower-casing letters and replacing every other byte with a space.
// Pass 2 re-reads "Cal.dat" word by word and stores in filenode.keys each
// word that
//   - is at least 3 and fewer than 39 characters long,
//   - is not already stored (SeekKeyWords), and
//   - is not rejected by Del_StopWord().
// Words of 39 or more characters are printed to cout together with the file
// name and skipped. At most 1199 keywords are kept. The keywords are finally
// sorted with ShellSortKeyWords().
//
// filenode.filename is set to "Cal.dat" and filenode.flag to -1.
//
// Returns the number of keywords stored, or -1 if filename is null or either
// file cannot be opened.
int CEvaluate::PreDealFile(const char *filename, FileNode &filenode)
{
    if (filename == 0)
        return -1;

    FILE *in = fopen(filename, "r");        // input mail
    if (in == 0)
        return -1;

    FILE *work = fopen("Cal.dat", "w+");    // normalised working copy
    if (work == 0)
    {
        fclose(in);                         // do not leak the input handle
        return -1;
    }

    // Pass 1: normalise. fgetc() yields 0..255 (or EOF), which is the value
    // range isalpha()/tolower() are defined for; a plain negative char is not.
    int ch;
    while ((ch = fgetc(in)) != EOF)
    {
        if (isalpha(ch))
            fputc(tolower(ch), work);
        else
            fputc(' ', work);
    }
    fclose(in);

    // Switching a "w+" stream from writing to reading requires a
    // repositioning call; rewind() provides it.
    rewind(work);

    memset(filenode.filename, 0, 40);
    strcpy(filenode.filename, "Cal.dat");
    filenode.flag = -1;

    // NOTE(review): 1199 matches the original "i > 1198" cut-off; presumably
    // filenode.keys holds 1200 entries - confirm against evaluate.h.
    const int kMaxKeys = 1199;
    const int kWordBufLen = 100;
    char word[kWordBufLen];
    int count = 0;

    // "%99s" bounds the read to the buffer (99 characters + terminator).
    while (count < kMaxKeys && fscanf(work, "%99s", word) == 1)
    {
        const int len = static_cast<int>(strlen(word));
        if (len >= 39)
        {
            if (len == kWordBufLen - 1)
            {
                // The token may continue past the buffer; discard the rest so
                // its tail is not mistaken for a separate word. The working
                // file contains only letters and spaces.
                int rest;
                while ((rest = fgetc(work)) != EOF && rest != ' ')
                {
                }
            }
            cout << filename << " " << word << endl;
            continue;
        }

        if (len >= 3
            && SeekKeyWords(filenode, word, count) == 0
            && !Del_StopWord(word, m_StopWordNum))
        {
            memset(filenode.keys[count].str, 0, 40);
            strcpy(filenode.keys[count].str, word);
            count++;
        }
    }

    ShellSortKeyWords(filenode, count);
    fclose(work);
    return count;
}
// Sorts the first N keyword strings of filenode.keys into ascending strcmp
// order using gap passes (gap = N/2, N/4, ..., 1). Only the str member of
// each key is exchanged. Always returns 0.
int CEvaluate::ShellSortKeyWords(FileNode &filenode, int N)
{
    int step = N / 2;
    while (step > 0)
    {
        for (int upper = step; upper < N; ++upper)
        {
            // Walk down the gap-chain that ends at 'upper', swapping any
            // adjacent pair that is out of order.
            int lower = upper - step;
            while (lower >= 0)
            {
                char *first = filenode.keys[lower].str;
                char *second = filenode.keys[lower + step].str;
                if (strcmp(first, second) > 0)
                {
                    char held[40];
                    strcpy(held, first);
                    strcpy(first, second);
                    strcpy(second, held);
                }
                lower -= step;
            }
        }
        step /= 2;
    }
    return 0;
}
// Binary search for Src among the first N entries of filenode.keys, which
// must be in ascending strcmp order. Returns 1 if found, otherwise 0.
int CEvaluate::Binary(const char *Src, FileNode filenode, int N)
{
    int lo = 0;
    int hi = N - 1;
    while (lo <= hi)
    {
        const int mid = (lo + hi) / 2;
        const int order = strcmp(Src, filenode.keys[mid].str);
        if (order == 0)
            return 1;
        if (order < 0)
            hi = mid - 1;   // target sorts before the probe
        else
            lo = mid + 1;   // target sorts after the probe
    }
    return 0;
}
// Sorts the first N elements of M into ascending strcmp order of their str
// member, using gap passes (gap = N/2, N/4, ..., 1). Whole elements are
// exchanged through CopyFeature(). Always returns 0.
int CEvaluate::ShellSortAccordStr(Feature M[], int N)
{
    int step = N / 2;
    while (step > 0)
    {
        for (int upper = step; upper < N; ++upper)
        {
            for (int lower = upper - step; lower >= 0; lower -= step)
            {
                // Pair already in order: nothing to exchange.
                if (strcmp(M[lower].str, M[lower + step].str) <= 0)
                    continue;

                Feature held;
                CopyFeature(held, M[lower]);
                CopyFeature(M[lower], M[lower + step]);
                CopyFeature(M[lower + step], held);
            }
        }
        step /= 2;
    }
    return 0;
}
// Classifies one mail file and updates the accuracy counter.
//
// The file is tokenised with PreDealFile(), Num features are loaded with
// SelectFeature(), and a score is accumulated per class:
//   class 0 uses feature[i].UnNormalMail_Prior and m_SetFileNum[0]
//   class 1 uses feature[i].NormalMail_Prior   and m_SetFileNum[1]
// For each feature, k is 1 if the feature word occurs in the mail and 0
// otherwise, and the term added is log(k*p + (1-k)*(1-p)).
//
// The predicted class is 0 when the class-0 score is strictly greater,
// otherwise 1. If the prediction equals the member 'flag', m_nAccuracy is
// incremented; otherwise FeedBack("Cal.dat", flag) is called.
//
// Parameters:
//   str - path of the mail file.
//   Num - number of features to use; must be positive.
// Returns -1 if str is null, Num is not positive, or the file yields no
// keywords (its name is then printed to cout); 0 otherwise.
//
// NOTE(review): SelectFeature() does not report how many records it loaded;
// if the feature file holds fewer than Num records, the remaining elements
// of the array are used as default-constructed by new[].
int CEvaluate::Classify(const char *str, int Num)
{
    // A non-positive Num would make the array allocation below invalid.
    if (str == 0 || Num <= 0)
        return -1;

    FileNode filenode;
    const int keyCount = PreDealFile(str, filenode);
    if (keyCount < 1)
    {
        cout << str << endl;
        return -1;
    }

    Feature *feature = new Feature[Num];
    if (feature == 0)
        return 0;
    SelectFeature(feature, Num);

    // Scores are kept as log-probabilities throughout. Converting back with
    // exp() can underflow to 0 for both classes when many features are
    // summed, which made the comparison below always choose class 1.
    double logScore[2] = {0.00, 0.00};
    for (int i = 0; i < Num; i++)
    {
        const double present = (double)(Binary(feature[i].str, filenode, keyCount));
        const double absent = 1.00 - present;
        logScore[0] += log(present * feature[i].UnNormalMail_Prior
                           + absent * (1.00 - feature[i].UnNormalMail_Prior));
        logScore[1] += log(present * feature[i].NormalMail_Prior
                           + absent * (1.00 - feature[i].NormalMail_Prior));
    }

    // Class priors from the training-set sizes (same expressions as before,
    // now added in the log domain instead of multiplied after exp()).
    const double prior0 = m_SetFileNum[0] / (m_SetFileNum[0] + m_SetFileNum[1]);
    const double prior1 = m_SetFileNum[1] / (m_SetFileNum[0] + m_SetFileNum[1]);
    logScore[0] += log(prior0);
    logScore[1] += log(prior1);

    const int predicted = (logScore[0] > logScore[1]) ? 0 : 1;

    if (predicted == flag)
    {
        m_nAccuracy ++;             // count correctly classified mails
    }
    else
    {
        FeedBack("Cal.dat", flag);  // feed the misclassified mail back
    }

    delete[] feature;               // allocated with new[]
    return 0;
}
// Resolves dir to an absolute path in m_szInitDir, makes it the current
// working directory, and ensures the stored path ends with a backslash.
// Returns false if the path cannot be resolved or entered, true otherwise.
bool CEvaluate::SetInitDir(const char *dir)
{
    const bool resolved = (_fullpath(m_szInitDir, dir, _MAX_PATH) != NULL);
    // _chdir is only attempted once the path has been resolved.
    if (!resolved || _chdir(m_szInitDir) != 0)
        return false;

    const int last = (int)strlen(m_szInitDir) - 1;
    if (m_szInitDir[last] != '\\')
        strcat(m_szInitDir, "\\");
    return true;
}
// Starts a directory scan in m_szInitDir for entries matching filespec.
// Returns whatever BrowseDir() returns.
bool CEvaluate::BeginBrowse(const char *filespec)
{
return BrowseDir(m_szInitDir,filespec);
}
// Called by BrowseDir() for each non-directory entry it finds: counts the
// file and classifies it using TestKeyWordNumber features.
// Always returns true; the result of Classify() is ignored.
bool CEvaluate::ProcessFile(const char *filename)
{
m_nFileCount++;
Classify(filename,TestKeyWordNumber);
return true;
}
bool CEvaluate::BrowseDir(const char *dir, const char *filespec)
{
_chdir(dir);
long hFile;
_finddata_t fileinfo;
if ((hFile=_findfirst(filespec,&fileinfo)) != -1)
{
do
{
if (!(fileinfo.attrib & _A_SUBDIR))
{
char filename[_MAX_PATH];
strcpy(filename,dir);
strcat(filename,fileinfo.name);
if (!ProcessFile(filename))
return false;
}
} while (_findnext(hFile,&fileinfo) == 0);
_findclose(hFile);
}
return true;
}
// Returns m_nAccuracy, the counter that Classify() increments each time its
// prediction matches this object's flag.
int CEvaluate::GetAccuracy()
{
return m_nAccuracy;
}
// Bottom-up merge sort of the first Num elements of feature[] by their str
// member. Runs of length 1, 2, 4, ... are merged by MergeInString() into a
// scratch array and copied back after each pass. MergeInString() takes the
// left element when it compares >= the right one, so the final order is
// descending by strcmp.
//
// Returns 0 on success, -1 if the scratch array could not be allocated.
int CEvaluate::MergesortInString(Feature feature[], int Num)
{
    // Zero or one element is already sorted; this also keeps a non-positive
    // Num away from the array allocation below.
    if (Num <= 1)
        return 0;

    Feature *swap = new Feature[Num];
    if (swap == 0)
        return -1;

    for (int runLen = 1; runLen < Num; runLen *= 2)
    {
        MergeInString(feature, swap, runLen, Num);
        for (int i = 0; i < Num; i++)
        {
            strcpy(feature[i].str, swap[i].str);
            feature[i].gain = swap[i].gain;
            feature[i].NormalMail = swap[i].NormalMail;
            feature[i].NormalMail_Prior = swap[i].NormalMail_Prior;
            feature[i].UnNormalMail = swap[i].UnNormalMail;
            feature[i].UnNormalMail_Prior = swap[i].UnNormalMail_Prior;
        }
    }

    delete[] swap;      // allocated with new[]
    return 0;
}
int CEvaluate::MergeInString(Feature x[], Feature swap[], int k, int n)
{
int i,j,l1,u1,l2,u2,m;
l1 = 0;
m = 0;
while(l1 + k < n)
{
l2 = l1 + k;
u1 = l2 - 1;
u2 = (l2 + k - 1 <= n - 1) ? (l2 + k - 1) :(n - 1);
for( i = l1,j = l2; i <= u1 && j <= u2 ;m++)
{
if(strcmp(x[i].str,x[j].str) >= 0)
{
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
i ++;
}
else
{
strcpy(swap[m].str,x[j].str);
swap[m].gain = x[j].gain;
swap[m].NormalMail = x[j].NormalMail;
swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
swap[m].UnNormalMail = x[j].UnNormalMail;
swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
j++;
}
}
while(i <= u1)
{
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
m ++;
i ++;
}
while(j <= u2)
{
strcpy(swap[m].str,x[j].str);
swap[m].gain = x[j].gain;
swap[m].NormalMail = x[j].NormalMail;
swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
swap[m].UnNormalMail = x[j].UnNormalMail;
swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
m ++;
j ++;
}
l1 = u2 + 1;
}
for( i =l1;i < n; i ++,m++)
{
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
}
return 0;
}
int CEvaluate::MergeInKeyWords(FileNode &filenode, int N)
{
KeyWord *swap = new KeyWord[1200];
if(swap == 0)
return -1;
int k = 1,i;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -