📄 evaluate.cpp
字号:
while( k < N)
{
MergeInKeyString(filenode,swap,k,N);
for( i = 0;i < N ;i ++)
{
strcpy(filenode.keys[i].str,swap[i].str);
}
k = k * 2;
}
delete swap;
return 0;
}
// Performs one bottom-up merge pass over filenode.keys[0..n).
//
// Neighbouring runs of k entries are merged pairwise into swap. When two
// candidates are compared, the one whose string is greater-or-equal under
// strcmp is written first; ties go to the left-hand run.
//
//   filenode  source; only filenode.keys[i].str is read
//   swap      destination; swap[0..n) .str receives the merged strings
//   k         length of each input run
//   n         number of entries to process
//
// Always returns 0.
int CEvaluate::MergeInKeyString(FileNode &filenode, KeyWord swap[], int k, int n)
{
    int out = 0;       // next slot to fill in swap
    int runStart = 0;  // first index of the left run of the current pair

    // A pair is merged only while the left run still has a right-hand partner.
    while (runStart + k < n)
    {
        const int rightStart = runStart + k;
        const int leftEnd = rightStart - 1;
        int rightEnd = rightStart + k - 1;
        if (rightEnd > n - 1)
        {
            rightEnd = n - 1;  // the final run may be shorter than k
        }

        int left = runStart;
        int right = rightStart;
        while (left <= leftEnd && right <= rightEnd)
        {
            if (strcmp(filenode.keys[left].str, filenode.keys[right].str) < 0)
            {
                strcpy(swap[out].str, filenode.keys[right].str);
                ++right;
            }
            else
            {
                strcpy(swap[out].str, filenode.keys[left].str);
                ++left;
            }
            ++out;
        }

        // Flush whichever run still has entries.
        for (; left <= leftEnd; ++left, ++out)
        {
            strcpy(swap[out].str, filenode.keys[left].str);
        }
        for (; right <= rightEnd; ++right, ++out)
        {
            strcpy(swap[out].str, filenode.keys[right].str);
        }

        runStart = rightEnd + 1;
    }

    // A trailing run without a partner is copied through unchanged.
    for (int rest = runStart; rest < n; ++rest, ++out)
    {
        strcpy(swap[out].str, filenode.keys[rest].str);
    }
    return 0;
}
// Copies the fields str, gain, NormalMail, NormalMail_Prior, UnNormalMail and
// UnNormalMail_Prior from X into K.
//
//   K  destination feature, overwritten
//   X  source feature (received by value)
//
// Always returns 0.
int CEvaluate::CopyFeature(Feature &K, const Feature X)
{
    // Numeric members first, then the string payload.
    K.gain = X.gain;
    K.NormalMail = X.NormalMail;
    K.UnNormalMail = X.UnNormalMail;
    K.NormalMail_Prior = X.NormalMail_Prior;
    K.UnNormalMail_Prior = X.UnNormalMail_Prior;

    strcpy(K.str, X.str);
    return 0;
}
// Loads two records from "<Path>KeyInfo.dat" into WordReservoirInfo[0..1]
// (ClassFielName, ClassFlag, FileSum, WordsSum, in that order) and mirrors each
// record's FileSum into m_SetFileNum.
//
// Returns 0 on completion. If the path does not fit the local buffer or the
// file cannot be opened, an error is written to cerr and the process exits
// with status 1.
int CEvaluate::ReadInfo()
{
    static const char kInfoFile[] = "KeyInfo.dat";

    // 100 bytes matches the path buffers used by the other file helpers in this
    // class; the previous 30-byte buffer overflowed once Path exceeded 18 chars.
    char s[100];
    memset(s, 0, sizeof(s));

    // sizeof(kInfoFile) already counts the terminating NUL.
    const int needed = static_cast<int>(strlen(Path) + sizeof(kInfoFile));
    if (needed > static_cast<int>(sizeof(s)))
    {
        cerr << "Input File could not be opened" << endl;
        exit(1);
    }
    strcpy(s, Path);
    strcat(s, kInfoFile);

    ifstream inCredit(s, ios::in);
    if (!inCredit)
    {
        // This is an input file; the message used to say "Output File".
        cerr << "Input File could not be opened" << endl;
        exit(1);
    }

    for (int i = 0; i < 2; i++)
    {
        inCredit >> WordReservoirInfo[i].ClassFielName
                 >> WordReservoirInfo[i].ClassFlag
                 >> WordReservoirInfo[i].FileSum
                 >> WordReservoirInfo[i].WordsSum;
        m_SetFileNum[i] = WordReservoirInfo[i].FileSum;
    }
    return 0;
}
// Loads whitespace-separated stop words from the text file `stoptext` into a
// freshly allocated stopwordArray.
//
// Words are stored starting at index 1; slot 0 is only zero-filled. The array
// holds 320 slots, so at most 319 words are accepted.
//
//   stoptext  path of the stop-word file
//
// Returns the number of words stored, or -1 if the file cannot be opened or
// contains more words than the array can hold.
int CEvaluate::ReadStopList(const char *stoptext)
{
    const int kMaxStopWords = 320;  // slots in stopwordArray, including unused slot 0
    const int kWordBytes = 40;      // bytes cleared per entry, as in the original code

    char str[40];
    memset(str, 0, sizeof(str));

    stopwordArray = new StopWord[kMaxStopWords];
    memset(stopwordArray[0].str, 0, kWordBytes);

    if (stoptext == 0)
        return -1;

    FILE *readstop = fopen(stoptext, "r");
    if (readstop == 0)
        return -1;

    int i = 1;
    // "%39s" keeps every token inside the 40-byte buffer; the old unbounded
    // "%s" (passed &str, a pointer to the whole array) could overrun it.
    while (fscanf(readstop, "%39s", str) == 1)
    {
        if (i >= kMaxStopWords)
        {
            cout << "StopWord array out run !";
            fclose(readstop);  // previously leaked on this path
            return -1;
        }
        memset(stopwordArray[i].str, 0, kWordBytes);
        strcpy(stopwordArray[i].str, str);
        i++;
        memset(str, 0, sizeof(str));
    }

    fclose(readstop);
    return i - 1;
}
// Reports whether Word should be discarded.
//
// A word is discarded when it is shorter than three characters, or when a
// binary search over stopwordArray[1..Num] finds an exact strcmp match.
// NOTE(review): the search is only meaningful if stopwordArray[1..Num] is in
// ascending strcmp order - confirm the stop-word file is sorted.
//
//   Word  NUL-terminated candidate word
//   Num   index of the last populated stop-word slot
//
// Returns true if the word is to be dropped, false otherwise.
bool CEvaluate::Del_StopWord(const char *Word, int Num)
{
    if (strlen(Word) < 3)
        return true;

    int first = 1;
    int last = Num;
    while (first <= last)
    {
        const int probe = (first + last) / 2;
        const int order = strcmp(Word, stopwordArray[probe].str);

        if (order > 0)
            first = probe + 1;   // Word sorts after the probe entry
        else if (order < 0)
            last = probe - 1;    // Word sorts before the probe entry
        else
            return true;         // exact match
    }
    return false;
}
// Scans Sour[0..num) for the first entry whose str equals Des.
//
// Despite the name this is not a pure lookup: when a match is found its freq
// is incremented by one before returning.
//
//   Sour  array of items to scan
//   Des   NUL-terminated word to look for
//   num   number of valid entries in Sour
//
// Returns 1 if a match was found (and counted), 0 otherwise.
int CEvaluate::Seek(Item *Sour, const char *Des, int num)
{
    for (int idx = 0; idx < num; ++idx)
    {
        if (strcmp(Sour[idx].str, Des) != 0)
            continue;

        Sour[idx].freq += 1;
        return 1;
    }
    return 0;
}
// Sorts M[0..N) by occurrence count (freq), largest first, using gapped
// insertion passes with gaps N/2, N/4, ..., 1.
//
//   M  items to sort in place
//   N  number of items
//
// Always returns 0.
int CEvaluate::ShellSortInFreq(Item M[], int N)
{
    for (int gap = N / 2; gap > 0; gap /= 2)
    {
        for (int i = gap; i < N; i++)
        {
            for (int j = i - gap; j >= 0; j -= gap)
            {
                // Entries below i on this gap chain are already ordered, so once
                // the element being inserted no longer needs to move, no later
                // comparison can swap either. Stopping here gives the same result
                // as scanning to the front, without the wasted comparisons.
                if (!(M[j].freq < M[j + gap].freq))
                    break;

                Item x;
                Copy(x, M[j]);
                Copy(M[j], M[j + gap]);
                Copy(M[j + gap], x);
            }
        }
    }
    return 0;
}
// Sorts M[0..N) into ascending strcmp order of str, using gapped insertion
// passes with gaps N/2, N/4, ..., 1.
//
//   M  items to sort in place
//   N  number of items
//
// Always returns 0.
int CEvaluate::ShellSort(Item M[], int N)
{
    for (int gap = N / 2; gap > 0; gap /= 2)
    {
        for (int i = gap; i < N; i++)
        {
            for (int j = i - gap; j >= 0; j -= gap)
            {
                // Entries below i on this gap chain are already ordered, so the
                // first pair that is in order ends the insertion. Continuing to
                // the front (as the code used to) could never swap anything.
                if (!(strcmp(M[j].str, M[j + gap].str) > 0))
                    break;

                Item x;
                Copy(x, M[j]);
                Copy(M[j], M[j + gap]);
                Copy(M[j + gap], x);
            }
        }
    }
    return 0;
}
int CEvaluate::FeedBack(const char *filename,int Flag)
{
int i = 0;
int k = 0;
int Len = 0;
int eof = 0;
char str[100] = "UnInit";
Item *M = new Item[1000];
if(M == 0)
return -1;
FILE *r=fopen("Cal.dat","r"); //输入文件
if( r == 0 )
{
return -1;
}
int tempFileNum = ++ m_FeedSetFileNum[Flag];
for(eof=fscanf(r,"%s",&str);eof!=EOF && eof>0;eof=fscanf(r,"%s",&str))
{
Len = strlen (str);
if(Len >39)
{
break;
}
str[Len] = 0;
//增加停用词处理
if(Len >= 3 && Seek( M,str,i) == 0 && !Del_StopWord(str,m_StopWordNum))
{
memset(M[i].str,0,40);
strcpy(M[i].str,str);
M[i].Sequence = tempFileNum;
M[i].freq = 1;
M[i].FileCnt = 1;
M[i].density = 0.0;
M[i].gain = 0.0;
M[i].flag = flag;
M[i].SetInfo = flag;
i++;
}
memset(str,0,40);
}
fclose(r);
/************************************************************************/
ShellSortInFreq(M,i);
int Count = ((i <= KeyNum)? i: KeyNum);
ShellSort(M,Count);
/************************************************************************/
memset(str,0,100);
if(Flag == 0)
{
strcpy(str,Path);
strcat(str,"Spam_data1.dat");
}
else
{
strcpy(str,Path);
strcat(str,"Normal_data2.dat");
}
ofstream outCredit(str,ios::out | ios::ate);
if(!outCredit)
return -1;
for( i = 0;i < Count;i++ )
{
outCredit << M[i].str <<' ' << M[i].freq << ' ' <<M[i].density << ' ' << M[i].Sequence <<' '<< M[i].FileCnt <<' ' <<M[i].flag << '\n';
}
return 0;
}
// Copies the fields str, freq, density, Sequence, FileCnt, flag, gain and
// SetInfo from M into K.
//
//   K  destination item, overwritten
//   M  source item (received by value)
//
// Always returns 0.
int CEvaluate::Copy(Item &K, const Item M)
{
    // Counters and bookkeeping fields.
    K.freq = M.freq;
    K.FileCnt = M.FileCnt;
    K.Sequence = M.Sequence;
    K.flag = M.flag;
    K.SetInfo = M.SetInfo;

    // Weights.
    K.density = M.density;
    K.gain = M.gain;

    // The word itself.
    strcpy(K.str, M.str);
    return 0;
}
int CEvaluate::mergesort(char *file,int SetFlag)
{
Item *M = new Item[70000];
Item *swap = new Item[70000];
if (M == 0 || swap == 0)
return -1;
int count = 0,countCur = 0; //变量countCur计算当前数组的有效长度
char str[100];
memset(str,0,100);
strcpy(str,Path);
strcat(str,file);
ifstream inCredit(str,ios::in );
if(!inCredit)
{
cerr << "Input File could not be opened" << endl;
exit(1);
}
Item Credit;
while(inCredit >> Credit.str >> Credit.freq >> Credit.density
>> Credit.Sequence >>Credit.FileCnt >>Credit.flag)
{
Copy(M[countCur],Credit);
countCur++;
}
count = countCur;
int i ,k;
k = 1;
while( k < count)
{
merge(M,swap,k,count - 1);
//cout << " k = " << k << endl;
for( i = 0;i < count - 1;i ++)
{
Copy(M[i],swap[i]);
}
cout << endl;
k = k * 2;
}
/************************************************************************/
str[strlen(str) - 4] = 0;
strcat(str,"temp.dat");
ofstream outCredit(str,ios::out);
if(!outCredit)
{
cerr << "Output File could not be opened." << endl;
exit(1);
}
int CN = 0; //计算实际合并后的数据数目
for( countCur = 0;countCur < count;countCur ++)
{
if( atof(M[countCur].str) > 0.0000001)
M[countCur].flag = 1;
else
for(int temp = 1;temp <= 30;temp ++)
{
if(strcmp(M[countCur].str,M[countCur + temp].str) == 0
&& M[countCur].flag == 0 && M[countCur + temp].flag == 0
&& (countCur + temp < i))
{
M[countCur].density += M[countCur + temp].density;
M[countCur ].freq += M[countCur + temp].freq;
if(M[countCur].Sequence != M[countCur + temp].Sequence)
M[countCur ].FileCnt ++;
M[countCur + temp].flag = 1;
}
}
if(M[countCur].flag == 0)
{
CN ++;
outCredit << M[countCur].str <<' ' << M[countCur].freq << ' '
<<M[countCur].density << ' ' << M[countCur].Sequence <<' '
<< M[countCur].FileCnt <<' ' <<M[countCur].flag << '\n';
}
}
delete M;
delete swap;
return 0;
}
// Performs one bottom-up merge pass over x[0..n).
//
// Neighbouring runs of k items are merged pairwise into swap. When two
// candidates are compared, the one whose str is less-or-equal under strcmp is
// copied first; ties go to the left-hand run.
//
//   x     source items
//   swap  destination; swap[0..n) is overwritten through Copy()
//   k     length of each input run
//   n     number of items to process
//
// Always returns 0.
int CEvaluate::merge(Item x[], Item swap[], int k, int n)
{
    int out = 0;       // next slot to fill in swap
    int runStart = 0;  // first index of the left run of the current pair

    // A pair is merged only while the left run still has a right-hand partner.
    while (runStart + k < n)
    {
        const int rightStart = runStart + k;
        const int leftEnd = rightStart - 1;
        int rightEnd = rightStart + k - 1;
        if (rightEnd > n - 1)
        {
            rightEnd = n - 1;  // the final run may be shorter than k
        }

        int left = runStart;
        int right = rightStart;
        while (left <= leftEnd && right <= rightEnd)
        {
            if (strcmp(x[left].str, x[right].str) > 0)
            {
                Copy(swap[out], x[right]);
                ++right;
            }
            else
            {
                Copy(swap[out], x[left]);
                ++left;
            }
            ++out;
        }

        // Flush whichever run still has items.
        for (; left <= leftEnd; ++left, ++out)
        {
            Copy(swap[out], x[left]);
        }
        for (; right <= rightEnd; ++right, ++out)
        {
            Copy(swap[out], x[right]);
        }

        runStart = rightEnd + 1;
    }

    // A trailing run without a partner is copied through unchanged.
    for (int rest = runStart; rest < n; ++rest, ++out)
    {
        Copy(swap[out], x[rest]);
    }
    return 0;
}
// Rewrites "<Path><filename>" with the two WordReservoirInfo records.
//
// Before each record is written its FileSum is refreshed from
// m_FeedSetFileNum. Each output line is:
//   ClassFielName ClassFlag FileSum WordsSum
//
//   filename  file name appended to Path; a null pointer is rejected
//
// Returns 0 on success and -1 for a null filename. If the file cannot be
// opened, an error is written to cerr and the process exits with status 1.
int CEvaluate::WriteInfo(char *filename)
{
    if (!filename)
    {
        return -1;
    }

    // Build the full path: Path followed by the file name.
    char str[100];
    memset(str, 0, sizeof(str));
    strcat(strcpy(str, Path), filename);

    ofstream outCredit(str, ios::out | ios::trunc);
    if (outCredit.fail())
    {
        cerr << "Output File could not be opened." << endl;
        exit(1);
    }

    for (int idx = 0; idx != 2; ++idx)
    {
        WordReservoirInfo[idx].FileSum = m_FeedSetFileNum[idx];
        outCredit << WordReservoirInfo[idx].ClassFielName << ' '
                  << WordReservoirInfo[idx].ClassFlag << ' '
                  << WordReservoirInfo[idx].FileSum << ' '
                  << WordReservoirInfo[idx].WordsSum << '\n';
    }
    return 0;
}
// Accessor: returns the current value of m_nFileCount.
int CEvaluate::GetFileCount()
{
    const int fileCount = m_nFileCount;
    return fileCount;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -