📄 browsedir.cpp
字号:
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
i ++;
}
else
{
//Copy(swap[m],x[j]);
strcpy(swap[m].str,x[j].str);
swap[m].gain = x[j].gain;
swap[m].NormalMail = x[j].NormalMail;
swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
swap[m].UnNormalMail = x[j].UnNormalMail;
swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
j++;
}
}
while(i <= u1)
{
//Copy(swap[m],x[i]);
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
m ++;
i ++;
}
while(j <= u2)
{
//Copy(swap[m],x[j]);
strcpy(swap[m].str,x[j].str);
swap[m].gain = x[j].gain;
swap[m].NormalMail = x[j].NormalMail;
swap[m].NormalMail_Prior = x[j].NormalMail_Prior;
swap[m].UnNormalMail = x[j].UnNormalMail;
swap[m].UnNormalMail_Prior = x[j].UnNormalMail_Prior;
m ++;
j ++;
}
l1 = u2 + 1;
}
for( i =l1;i < n; i ++,m++)
{
//Copy(swap[m],x[i]);
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
swap[m].UnNormalMail_Prior = x[i].UnNormalMail_Prior;
}
return 0;
}
// Sorts feature[0..Num-1] with a bottom-up merge sort.
//
// Each pass calls MergeInString() to merge adjacent runs of length k from
// `feature` into a scratch buffer, copies the scratch buffer back, and then
// doubles k until a single run covers the whole array.
//
// feature : array to sort in place.
// Num     : number of elements in `feature`.
// Returns 0 on success, -1 if Num is negative or the scratch buffer could
// not be allocated.
int CBrowseDir::MergesortInString(Feature feature[], int Num)
{
    if(Num < 0)
        return -1;          // a negative count must never reach new[]
    if(Num < 2)
        return 0;           // zero or one element is already sorted

    Feature *swap = new Feature[Num];
    if(swap == 0)           // only reachable with a non-throwing operator new
        return -1;

    for(int k = 1; k < Num; k *= 2)
    {
        MergeInString(feature,swap,k,Num);

        // Copy the merged pass back so the next pass reads from `feature`.
        for(int i = 0; i < Num; i++)
        {
            strcpy(feature[i].str,swap[i].str);
            feature[i].gain = swap[i].gain;
            feature[i].NormalMail = swap[i].NormalMail;
            feature[i].NormalMail_Prior = swap[i].NormalMail_Prior;
            feature[i].UnNormalMail = swap[i].UnNormalMail;
            feature[i].UnNormalMail_Prior = swap[i].UnNormalMail_Prior;
        }
    }

    delete[] swap;          // allocated with new[], so it must be released with delete[]
    return 0;
}
// Loads whitespace-separated stop words from the text file `stoptext` into
// stopwordArray.
//
// Words are stored from index 1 upward; entry 0 is only zero-filled. The
// array holds 320 entries, so at most 319 words can be loaded.
//
// Returns the number of words stored, or -1 if the file cannot be opened or
// contains more words than the array can hold.
//
// NOTE(review): every call allocates a fresh stopwordArray and the previous
// pointer is not released here, because this block cannot tell whether the
// member is initialised before the first call - confirm against the
// constructor/destructor.
int CBrowseDir::ReadStopList(const char *stoptext)
{
    const int kMaxEntries = 320;    // capacity of stopwordArray, including unused slot 0

    char str[40];
    memset(str,0,40);

    stopwordArray = new StopWord[kMaxEntries];
    memset(stopwordArray[0].str,0,40);

    FILE *readstop = fopen(stoptext,"r");
    if(readstop == 0)
        return -1;

    int i = 1;
    // "%39s" caps each token at 39 characters plus the terminator so an
    // over-long word can no longer overflow str[40]; a longer token is read
    // as several consecutive pieces.
    while(fscanf(readstop,"%39s",str) == 1)
    {
        if(i >= kMaxEntries)
        {
            cout << "StopWord array out run !";
            fclose(readstop);       // do not leak the handle on the error path
            return -1;
        }
        memset(stopwordArray[i].str,0,40);
        strcpy(stopwordArray[i].str,str);
        i ++ ;
        memset(str,0,40);
    }

    fclose(readstop);
    return i - 1;
}
// Decides whether `Word` should be discarded.
//
// Returns true when the word is shorter than three characters or when it is
// found in stopwordArray[1..Num]; returns false otherwise.
//
// The lookup is a binary search ordered by strcmp, so it presumably relies on
// the stop list having been loaded in strcmp order - verify against the
// stop-word file.
bool CBrowseDir::Del_StopWord(const char *Word, int Num)
{
    // Very short words are always treated as stop words.
    if(strlen(Word) < 3 )
        return true;

    int first = 1;
    int last = Num;
    while(first <= last)
    {
        const int probe = (first + last) / 2;
        const int order = strcmp(Word,stopwordArray[probe].str);

        if(order == 0)
            return true;            // exact match in the stop list

        if(order < 0)
            last = probe - 1;       // continue in the lower half
        else
            first = probe + 1;      // continue in the upper half
    }
    return false;
}
int CBrowseDir::CalculateGain(const char *str ,Feature feature[], int Num,const char *Output)
{
if(str == 0)
return -1;
ifstream inFile(str,ios::in); //读取信息KeyInfo.dat文件,根据它读取词库
if(!inFile)
{
cerr << "Input File could not be opened." << endl;
exit(1);
}
WordReservoir Info[2],temp; //建立词库信息数组
while(inFile >> temp.ClassFielName >> temp.ClassFlag >> temp.FileSum >> temp.WordsSum)
{
switch(temp.ClassFlag)
{
case 0: //0 表示垃圾邮件构成的词库
strcpy(Info[0].ClassFielName,temp.ClassFielName);
Info[0].ClassFlag = temp.ClassFlag;
Info[0].FileSum = temp.FileSum;
Info[0].WordsSum = temp.WordsSum;
break;
case 1: //1表示正常邮件构成的词库
strcpy(Info[1].ClassFielName,temp.ClassFielName);
Info[1].ClassFlag = temp.ClassFlag;
Info[1].FileSum = temp.FileSum;
Info[1].WordsSum = temp.WordsSum;
default:
break;
}
}
//记录正常邮件和垃圾邮件之和double FileNumber0_1
double FileNumber0_1 = (double)((Info[0].FileSum + Info[1].FileSum) * 1.00);
/************************************************************************/
/* 打开Info[]中的文件,并将词组导到数组中来 */
Item Credit;
int i = 0;
double WordsNumber[2]; //存储每个类别的单词的总数目
Item *M = new Item[70000];
Item *N = new Item[70000];
if( M == 0 )
return -1;
if( N == 0)
return -1;
if(Info[0].ClassFielName == 0) //词库为空,返回(垃圾邮件为标志0)
return -1;
ifstream inInfoFile1(Info[0].ClassFielName,ios::in);
if(!inInfoFile1)
{
cerr << "Input File could not be opened." << endl;
exit(1);
}
//读取垃圾邮件词库
i = 0;
while(inInfoFile1 >> Credit.str >> Credit.freq >> Credit.density
>> Credit.Sequence >>Credit.FileCnt >>Credit.flag)
{
Copy(M[i],Credit);
M[i].flag = Info[0].ClassFlag;
M[i].SetInfo = 0;
i++;
}
int Record = Unique(M,i); //计算熵之前,先排除重复的记录并由Record记录无重复的数量
WordsNumber[0] = (double)Record; //记录垃圾邮件得到的词汇总数
i = 0;
if(Info[1].ClassFielName == 0) //词库为空,返回(正常邮件标志为1)
return -1;
ifstream inInfoFile2(Info[1].ClassFielName,ios::in);
if(!inInfoFile2)
{
cerr << "Input File could not be opened." << endl;
exit(1);
}
//读取正常邮件词库
i = 0;
while(inInfoFile2 >> Credit.str >> Credit.freq >> Credit.density
>> Credit.Sequence >>Credit.FileCnt >>Credit.flag)
{
Copy(N[i],Credit);
N[i].SetInfo = 1;
N[i].flag = Info[1].ClassFlag;
i++;
}
Record = Unique(N,i); //计算熵之前,先排除重复的记录并由Record记录无重复的数量
WordsNumber[1] = (double)Record ; //记录正常邮件得到的词汇总数
int Cnt = WordsNumber[0] + WordsNumber[1]; //由垃圾词库和正常词库得到的总的词汇数量
double Density[2]; //Density[0]垃圾邮件的概率
//Density[1]正常邮件的概率
for (i = 0; i < 2 ;i ++)
{
Density[i] = (double)(Info[i].FileSum /FileNumber0_1);
}
double mGain = 0;
int Max = (int)WordsNumber[0] + WordsNumber[1];
WordGain *wordgainArray = new WordGain[Max];
int m = 0,n = 0;
for(int l = 0;l< Max; l++)
{
memset(wordgainArray[l].str,0,40);
if(m < WordsNumber[0] && n < WordsNumber[1] && l < Max)
{
if(strcmp(M[m].str,N[n].str) < 0)
{
strcpy(wordgainArray[l].str,M[m].str);
wordgainArray[l].density0 = M[m].density;
wordgainArray[l].FileCnt0 = M[m].FileCnt;
wordgainArray[l].freq0 = M[m].freq;
wordgainArray[l].flag0 = 0;
wordgainArray[l].density1 = 0.0;
wordgainArray[l].FileCnt1 = 0;
wordgainArray[l].freq1 = 0;
wordgainArray[l].flag1 = 1;
wordgainArray[l].gain = 0;
m ++;
}
else if(strcmp(M[m].str,N[n].str) == 0)
{
strcpy(wordgainArray[l].str,M[m].str);
wordgainArray[l].density0 = M[m].density;
wordgainArray[l].FileCnt0 = M[m].FileCnt;
wordgainArray[l].freq0 = M[m].freq;
wordgainArray[l].flag0 = 0;
wordgainArray[l].density1 = N[n].density;
wordgainArray[l].FileCnt1 = N[n].FileCnt;
wordgainArray[l].freq1 = N[n].freq;
wordgainArray[l].flag1 = 1;
wordgainArray[l].gain = 0;
m ++;
n ++;
}
else
{
strcpy(wordgainArray[l].str,M[m].str);
wordgainArray[l].density0 = 0.0;
wordgainArray[l].FileCnt0 = 0;
wordgainArray[l].freq0 = 0;
wordgainArray[l].flag0 = 0;
wordgainArray[l].density1 = N[n].density;
wordgainArray[l].FileCnt1 = N[n].FileCnt;
wordgainArray[l].freq1 = N[n].freq;
wordgainArray[l].flag1 = 1;
wordgainArray[l].gain = 0;
n ++;
}
}
else
if( m < WordsNumber[0] && n >= WordsNumber[1])
{
strcpy(wordgainArray[l].str,M[m].str);
wordgainArray[l].density0 = M[m].density;
wordgainArray[l].FileCnt0 = M[m].FileCnt;
wordgainArray[l].freq0 = M[m].freq;
wordgainArray[l].flag0 = 0;
wordgainArray[l].density1 = 0.0;
wordgainArray[l].FileCnt1 = 0;
wordgainArray[l].freq1 = 0;
wordgainArray[l].flag1 = 1;
wordgainArray[l].gain = 0;
m ++;
}
else
if( n < WordsNumber[1] && m >= WordsNumber[0])
{
strcpy(wordgainArray[l].str,N[n].str);
wordgainArray[l].density0 = 0.0;
wordgainArray[l].FileCnt0 = 0;
wordgainArray[l].freq0 = 0;
wordgainArray[l].flag0 = 0;
wordgainArray[l].density1 = N[n].density;
wordgainArray[l].FileCnt1 = N[n].FileCnt;
wordgainArray[l].freq1 = N[n].freq;
wordgainArray[l].flag1 = 1;
wordgainArray[l].gain = 0;
n ++;
}
if(m >= WordsNumber[0] && n >= WordsNumber[1] )
break;
}
int ReturnValue = l + 1; //记录合并后的单词数目
/************************************************************************/
//计算增益
double tempdensity[2] = {0.0,0.0}; //存储某个类别的单词在整个词库中的概率
double TempWord = 0.0;
for(i = 0; i < ReturnValue; i ++)
{
//记录正常邮件和垃圾邮件之和double FileNumber0_1
tempdensity[0] =(double) ((wordgainArray[i].FileCnt0 + 1.00) / (2.00 + FileNumber0_1));
tempdensity[1] =(double) ((wordgainArray[i].FileCnt1 + 1.00) / (2.00 + FileNumber0_1));
TempWord = (wordgainArray[i].FileCnt0 + wordgainArray[i].FileCnt1) * 1.00;
mGain = tempdensity[0] *
log(tempdensity[0] * (1.00 / Density[0]) * ( 1.00 / ( TempWord / FileNumber0_1 )))
/ log(2.000)
+
(1.00 - tempdensity[0]) *
log((1.00 - tempdensity[0]) * (1 / Density[0]) * ( 1.00 / (1.00 - TempWord / FileNumber0_1 ) ))
/ log(2.000)
+
tempdensity[1] *
log(tempdensity[1] * (1.00 / Density[1]) * ( 1.00 / ( TempWord / FileNumber0_1 )))
/ log(2.000)
+
(1.00 - tempdensity[1]) *
log((1.00 - tempdensity[1]) * (1 / Density[1]) * ( 1.00 / (1.00 - TempWord / FileNumber0_1 ) ))
/ log(2.000);
wordgainArray[i].gain = mGain;
}
//计算先验概率
for(i = 0; i < ReturnValue; i ++)
{
strcpy(feature[i].str,wordgainArray[i].str);
feature[i].UnNormalMail = 0;
feature[i].UnNormalMail_Prior = (1.00 + wordgainArray[i].FileCnt0) / (2.00 + Info[0].FileSum);
feature[i].NormalMail = 1;
feature[i].NormalMail_Prior = (1.00 + wordgainArray[i].FileCnt1) / (2.00 + Info[1].FileSum);
feature[i].gain = wordgainArray[i].gain;
}
if(Output == NULL)
return -1;
char tempstr[100];
memset(tempstr,0,100);
strcpy(tempstr,m_szInitDir);
strcat(tempstr,Output);
ofstream outFile(tempstr,ios::out| ios::app);
if(!outFile)
{
cerr << "Output File could not be opened." << endl;
exit(1);
}
for( i = 0; i <= l; i++)
{
outFile << feature[i].str << ' ' << feature[i].gain << ' '
<< feature[i].NormalMail << ' ' << feature[i].NormalMail_Prior << ' '
<< feature[i].UnNormalMail << ' ' << feature[i].UnNormalMail_Prior << '\n';
}
delete wordgainArray;
delete N;
delete M;
return ReturnValue;
}
// Sorts M[0..N-1] in descending order of `freq` (occurrence count) using a
// Shell sort with gaps N/2, N/4, ..., 1.
//
// M : array to sort in place; elements are exchanged through Copy().
// N : number of elements in M.
// Always returns 0.
int CBrowseDir::ShellSortInFreq(Item M[], int N)
{
    for(int gap = N / 2; gap > 0; gap /= 2)
    {
        for(int i = gap; i < N; i++)
        {
            // Move M[i] back along its gap-chain while it has a higher freq
            // than its predecessor.
            for(int j = i - gap; j >= 0; j -= gap)
            {
                // The chain before i is already ordered, so once one pair is
                // in order no earlier pair can need swapping: stop here
                // instead of comparing the rest of the chain.
                if(!(M[j].freq < M[j + gap].freq))
                    break;

                Item x;
                Copy(x,M[j]);
                Copy(M[j],M[j + gap]);
                Copy(M[j + gap],x);
            }
        }
    }
    return 0;
}
// Prompts on standard output for the value of m_switch and reads it from
// standard input (the prompt describes 0 as "start training" and 1 as
// "enable feedback").
//
// Returns 0 and stores the value in m_switch when the input is 0 or 1.
// Returns -1, leaving m_switch untouched, when the input is not a number or
// is any other number.
int CBrowseDir::GetSignal()
{
    int m = -1;
    cout <<"请输入m_switch的值(0 表示开始训练或 1表示启动反馈):";

    const bool parsed = !(cin >> m).fail();
    if(!parsed && !cin.eof())
    {
        // Non-numeric input: reset the stream and drop the rest of the line
        // so that a later prompt does not trip over the same characters.
        cin.clear();
        int ch;
        while((ch = cin.get()) != '\n' && ch != EOF)
        {
        }
    }

    // A failed extraction must never be accepted as a valid choice.
    if(!parsed || (m != 0 && m != 1)){
        cout <<"输入m_switch的值(0 或 1)错误。";
        return -1;
    }
    m_switch = m;
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -