📄 browsedir.cpp
字号:
{
memset(WordReservoirInfo[ClassCnt - 1].ClassFielName,0,100);
strcpy(WordReservoirInfo[ClassCnt - 1].ClassFielName,str);
WordReservoirInfo[ClassCnt - 1].WordsSum = CN;
if (ClassCnt - 1 == 0)
{
WordReservoirInfo[ClassCnt - 1].ClassFlag = 0;
WordReservoirInfo[ClassCnt - 1].FileSum = m_nFileCount;
}
else if (ClassCnt - 1 == 1)
{
WordReservoirInfo[ClassCnt - 1].ClassFlag = 1;
WordReservoirInfo[ClassCnt - 1].FileSum = m_nFileCount;
}
}
delete M;
delete swap;
return 0;
}
/**
 * Performs one bottom-up merge pass over x[0 .. n-1].
 *
 * Adjacent runs of length k (each assumed to be already ordered by str)
 * are merged pairwise into swap in ascending strcmp order. When two keys
 * compare equal the entry from the left run is emitted first. Any entries
 * left over after the last complete pair are copied across unchanged, so
 * swap[0 .. n-1] is fully populated on return.
 *
 * @param x     source array
 * @param swap  destination array; must have room for n entries
 * @param k     length of the runs being merged
 * @param n     number of entries of x to process
 * @return always 0
 */
int CBrowseDir::merge(Item x[], Item swap[], int k, int n)
{
    int out = 0;        // next slot to fill in swap
    int runStart = 0;   // first index of the left run of the current pair

    while (runStart + k < n)
    {
        const int rightStart = runStart + k;
        const int leftEnd = rightStart - 1;

        // The right run may be shorter than k at the end of the array.
        int rightEnd = rightStart + k - 1;
        if (rightEnd > n - 1)
            rightEnd = n - 1;

        int left = runStart;
        int right = rightStart;

        // Interleave the two runs until one of them is used up.
        while (left <= leftEnd && right <= rightEnd)
        {
            if (strcmp(x[left].str, x[right].str) > 0)
            {
                Copy(swap[out], x[right]);
                ++right;
            }
            else
            {
                Copy(swap[out], x[left]);
                ++left;
            }
            ++out;
        }

        // Drain whatever remains of the left run ...
        for (; left <= leftEnd; ++left, ++out)
            Copy(swap[out], x[left]);

        // ... and of the right run.
        for (; right <= rightEnd; ++right, ++out)
            Copy(swap[out], x[right]);

        runStart = rightEnd + 1;
    }

    // Entries that had no partner run are carried over as they are.
    for (int rest = runStart; rest < n; ++rest, ++out)
        Copy(swap[out], x[rest]);

    return 0;
}
/**
 * Copies the fields of one Item into another.
 *
 * Copies str (as a C string), freq, density, Sequence, FileCnt, flag,
 * gain and SetInfo from M into K.
 *
 * @param K  destination item
 * @param M  source item (received by value)
 * @return always 0
 */
int CBrowseDir::Copy(Item &K, const Item M)
{
    // Scalar fields.
    K.SetInfo = M.SetInfo;
    K.gain = M.gain;
    K.flag = M.flag;
    K.FileCnt = M.FileCnt;
    K.Sequence = M.Sequence;
    K.density = M.density;
    K.freq = M.freq;

    // Text key.
    strcpy(K.str, M.str);

    return 0;
}
/**
 * Classifies a single character.
 *
 * @param ch  character to classify
 * @return 1 if ch is alphabetic, 2 if it is a decimal digit,
 *         3 if it is '.', 0 otherwise
 */
int CBrowseDir::CharDeal(char ch)
{
    // The <cctype> classifiers require a value representable as
    // unsigned char (or EOF); a plain char holding a byte >= 0x80 may be
    // negative, which is undefined behaviour if passed directly.
    const unsigned char uch = static_cast<unsigned char>(ch);

    if (isalpha(uch))
        return 1;
    if (isdigit(uch))
        return 2;
    if (ch == '.')
        return 3;
    return 0;
}
/**
 * Appends one summary line for the current class to a text file.
 *
 * The entry WordReservoirInfo[ClassCnt - 1] first has its FileSum
 * refreshed from GetFileCount(); then its ClassFielName, ClassFlag,
 * FileSum and WordsSum are written, separated by single spaces and
 * terminated by a newline.
 *
 * @param filename  path of the file to append to
 * @return 0 on success, -1 if filename is null. If the file cannot be
 *         opened an error is printed to cerr and the process exits
 *         with status 1.
 */
int CBrowseDir::WriteInfo(char *filename)
{
    if (!filename)
        return -1;

    ofstream outCredit(filename, ios::out | ios::app);
    if (!outCredit)
    {
        cerr << "Output File could not be opened." << endl;
        exit(1);
    }

    // Index of the class record being reported.
    const int current = ClassCnt - 1;
    WordReservoirInfo[current].FileSum = GetFileCount();

    outCredit << WordReservoirInfo[current].ClassFielName << ' ';
    outCredit << WordReservoirInfo[current].ClassFlag << ' ';
    outCredit << WordReservoirInfo[current].FileSum << ' ';
    outCredit << WordReservoirInfo[current].WordsSum << '\n';

    return 0;
}
void CBrowseDir::ExtractFeature(const char *filename,const char *output )
{
if (filename == NULL )
return ;
Feature *M = new Feature[70000];
int i;
char tempstr[100];
memset(tempstr,0,100);
strcpy(tempstr,m_szInitDir);
strcat(tempstr,filename);
int Cnt = CalculateGain(tempstr ,M, 70000,output);
Mergesort(M,Cnt);
memset(tempstr,0,100);
strcpy(tempstr,m_szInitDir);
strcat(tempstr,"Sort_accord_Gain.dat");
ofstream outFile(tempstr,ios::out| ios::app);
if(!outFile)
{
cerr << "Output File could not be opened." << endl;
exit(1);
}
for( i = 0; i < Cnt; i++)
{
outFile << M[i].str << ' ' << M[i].gain << ' '
<< M[i].NormalMail << ' ' << M[i].NormalMail_Prior << ' '
<< M[i].UnNormalMail << ' ' << M[i].UnNormalMail_Prior << '\n';
}
delete M;
}
/**
 * Document preprocessing: normalises a text file and collects its words.
 *
 * Pass 1 copies the input to a work file, lower-casing alphabetic
 * characters and replacing every other character with a space.
 * Pass 2 re-reads the work file word by word. A word is appended to M
 * when it is at least 3 characters long, Seek(M, word, count) returns 0
 * and Del_StopWord(word, m_StopWordNum) returns false. Appended entries
 * get freq = 1, FileCnt = 1, density = gain = 0.0, flag = 0 and
 * SetInfo = ClassCnt - 1. Scanning stops at the first word longer than
 * 39 characters.
 *
 * @param filename     input file path
 * @param M            array receiving the collected words
 * @param Num          not used by this function.
 *                     NOTE(review): presumably the capacity of M; writes
 *                     to M are currently unbounded - confirm with callers
 *                     before enforcing a limit.
 * @param Outfilename  work file path; if null, "Cal.dat" in the directory
 *                     part of filename (up to the last '\\') is used
 * @return number of entries written to M, or -1 if filename is null, a
 *         path does not fit the internal buffer, or a file cannot be
 *         opened
 */
int CBrowseDir::PreDeal(const char *filename, Item M[], int Num, char *Outfilename)
{
    if (filename == 0)
        return -1;

    // Work out the name of the intermediate (normalised) file.
    char name[100];
    memset(name, 0, sizeof(name));
    if (Outfilename == 0)
    {
        static const char kDefaultName[] = "Cal.dat";

        if (strlen(filename) >= sizeof(name))
            return -1;
        strcpy(name, filename);

        // Keep everything up to and including the last backslash. If the
        // path contains none, the work file goes to the current directory
        // (the previous backwards scan ran off the front of the buffer).
        char *lastSlash = strrchr(name, '\\');
        char *tail = (lastSlash != 0) ? lastSlash + 1 : name;
        *tail = 0;

        const size_t dirLen = static_cast<size_t>(tail - name);
        if (dirLen + strlen(kDefaultName) >= sizeof(name))
            return -1;
        strcat(name, kDefaultName);
    }
    else
    {
        if (strlen(Outfilename) >= sizeof(name))
            return -1;
        strcpy(name, Outfilename);
    }

    FILE *r = fopen(filename, "r");     // input file
    if (r == 0)
        return -1;

    FILE *w1 = fopen(name, "w+");       // work file, written then re-read
    if (w1 == 0)
    {
        fclose(r);                      // do not leak the input handle
        return -1;
    }

    // Pass 1: letters are lower-cased, everything else becomes a space.
    // fgetc() yields values in the unsigned char range, which is what
    // isalpha()/tolower() require.
    int c;
    while ((c = fgetc(r)) != EOF)
    {
        if (isalpha(c))
            fputc(tolower(c), w1);
        else
            fputc(' ', w1);
    }
    fclose(r);

    // Pass 2: read the normalised text back as whitespace-separated words.
    rewind(w1);
    char str[100] = "UnInit";
    int i = 0;
    // The field width keeps fscanf from writing past the end of str.
    while (fscanf(w1, "%99s", str) == 1)
    {
        const int Len = static_cast<int>(strlen(str));
        if (Len > 39)
        {
            // Abnormally long token: stop processing this file.
            break;
        }

        // Keep words that are long enough, not yet collected and not
        // stop words.
        if (Len >= 3 && Seek(M, str, i) == 0 && !Del_StopWord(str, m_StopWordNum))
        {
            memset(M[i].str, 0, 40);
            strcpy(M[i].str, str);
            M[i].freq = 1;
            M[i].FileCnt = 1;
            M[i].density = 0.0;
            M[i].gain = 0.0;
            M[i].flag = 0;
            M[i].SetInfo = ClassCnt - 1;
            i++;
        }
        memset(str, 0, sizeof(str));
    }

    fclose(w1);
    return i;
}
/**
 * Performs one bottom-up merge pass over x[0 .. n-1], ordering by gain
 * in descending order.
 *
 * Adjacent runs of length k (each assumed to be already ordered) are
 * merged pairwise into swap. The left entry is taken only when its gain
 * is strictly greater; on equal gains the right entry is emitted first.
 * Entries left over after the last complete pair are copied unchanged,
 * so swap[0 .. n-1] is fully populated on return.
 *
 * Records are transferred with CopyFeature(), which copies exactly the
 * fields this function previously copied by hand (str, gain, NormalMail,
 * NormalMail_Prior, UnNormalMail, UnNormalMail_Prior).
 *
 * @param x     source array
 * @param swap  destination array; must have room for n entries
 * @param k     length of the runs being merged
 * @param n     number of entries of x to process
 * @return always 0
 */
int CBrowseDir::MergeInGain(Feature x[], Feature swap[], int k, int n)
{
    int out = 0;        // next slot to fill in swap
    int runStart = 0;   // first index of the left run of the current pair

    while (runStart + k < n)
    {
        const int rightStart = runStart + k;
        const int leftEnd = rightStart - 1;

        // The right run may be shorter than k at the end of the array.
        int rightEnd = rightStart + k - 1;
        if (rightEnd > n - 1)
            rightEnd = n - 1;

        int left = runStart;
        int right = rightStart;

        while (left <= leftEnd && right <= rightEnd)
        {
            // Comparison kept exactly as before (including the scaling)
            // so that ordering decisions are unchanged.
            if (x[left].gain * 1000 > x[right].gain * 1000)
            {
                CopyFeature(swap[out], x[left]);
                ++left;
            }
            else
            {
                CopyFeature(swap[out], x[right]);
                ++right;
            }
            ++out;
        }

        // Drain whichever run still has entries.
        for (; left <= leftEnd; ++left, ++out)
            CopyFeature(swap[out], x[left]);
        for (; right <= rightEnd; ++right, ++out)
            CopyFeature(swap[out], x[right]);

        runStart = rightEnd + 1;
    }

    // Entries that had no partner run are carried over as they are.
    for (int rest = runStart; rest < n; ++rest, ++out)
        CopyFeature(swap[out], x[rest]);

    return 0;
}
/**
 * Sorts feature[0 .. Num-1] by gain in descending order using a
 * bottom-up merge sort built on MergeInGain().
 *
 * Each pass merges runs of length k into a scratch array and copies the
 * result back; k doubles until a single run covers the whole array.
 *
 * @param feature  array to sort in place
 * @param Num      number of entries; values below 2 need no sorting
 * @return 0 on success, -1 if the scratch array could not be allocated
 *         (only reachable with allocators that return null)
 */
int CBrowseDir::Mergesort(Feature feature[], int Num)
{
    // Nothing to do for empty or single-element input; this also keeps a
    // negative count away from new[].
    if (Num < 2)
        return 0;

    Feature *scratch = new Feature[Num];
    if (scratch == 0)
        return -1;

    for (int k = 1; k < Num; k = k * 2)
    {
        MergeInGain(feature, scratch, k, Num);
        for (int i = 0; i < Num; i++)
            CopyFeature(feature[i], scratch[i]);
    }

    // Allocated with new[], so it must be released with delete[].
    delete[] scratch;
    return 0;
}
/**
 * Sorts the items by str, folds duplicates together and compacts the
 * array.
 *
 * Steps:
 *  1. Merge-sort X[0 .. Num-2] by str using merge().
 *  2. For every i, compare X[i] with up to the next four entries; for
 *     each one with an equal str, add its density and freq to X[i],
 *     increment X[i].FileCnt if their Sequence values differ, and mark
 *     the later entry with flag = -1.
 *  3. Move every entry whose flag is not -1 to the front, keeping order.
 *  4. Blank the remaining tail entries (empty str, all other fields -1).
 *
 * @param X    array to process in place
 * @param Num  number of entries in X
 * @return number of entries kept at the front of X; 0 if Num <= 0;
 *         -1 if the scratch array could not be allocated (only reachable
 *         with allocators that return null)
 */
int CBrowseDir::Unique(Item X[], int Num)
{
    if (Num <= 0)
        return 0;

    int i;

    // Step 1: sort. The scratch array is sized from Num rather than a
    // fixed 70000 so larger inputs cannot overrun it.
    // NOTE(review): only the first Num - 1 entries are sorted, exactly as
    // before; the last entry stays where it is. Confirm whether that is
    // intended or an off-by-one.
    const int sortCount = Num - 1;
    Item *scratch = new Item[Num];
    if (scratch == 0)
        return -1;

    for (int k = 1; k < Num; k = k * 2)
    {
        merge(X, scratch, k, sortCount);
        for (i = 0; i < sortCount; i++)
            Copy(X[i], scratch[i]);
        cout << endl;
    }
    delete[] scratch;   // allocated with new[]

    // Step 2: fold duplicates into the earlier entry.
    // NOTE(review): the look-ahead is limited to four entries, so a word
    // occurring more than five times in a row is not fully accumulated
    // into its first entry.
    for (i = 0; i < Num; i++)
    {
        for (int j = 1; j < 5 && i + j < Num; j++)
        {
            if (strcmp(X[i].str, X[i + j].str) != 0)
                continue;

            X[i].density += X[i + j].density;
            X[i].freq += X[i + j].freq;
            if (X[i].Sequence != X[i + j].Sequence)
                X[i].FileCnt++;
            X[i + j].flag = -1;
        }
    }

    // Step 3: compact the survivors to the front, in place. The write
    // position never passes the read position, and Copy() receives its
    // source by value, so no entry is overwritten before it is read.
    int Value = 0;
    for (i = 0; i < Num; i++)
    {
        if (X[i].flag == -1)
            continue;
        if (Value != i)
            Copy(X[Value], X[i]);
        Value++;
    }

    // Step 4: mark the now-unused tail entries as meaningless.
    for (i = Value; i < Num; i++)
    {
        memset(X[i].str, 0, 40);
        X[i].SetInfo = -1;
        X[i].flag = -1;
        X[i].freq = -1;
        X[i].gain = -1;
        X[i].density = -1;
        X[i].FileCnt = -1;
        X[i].Sequence = -1;
    }

    return Value;
}
/**
 * Sorts M[0 .. N-1] by gain in descending order with a gap-halving
 * (Shell) sort.
 *
 * For each gap, every position from gap upwards is visited and the whole
 * chain of gap-spaced predecessors is walked back to the start; any pair
 * in which the earlier entry has the smaller gain is exchanged via
 * CopyFeature().
 *
 * @param M  array to sort in place
 * @param N  number of entries in M
 * @return always 0
 */
int CBrowseDir::ShellSort(Feature M[], int N)
{
    int step = N / 2;
    while (step > 0)
    {
        for (int pos = step; pos < N; pos++)
        {
            int lower = pos - step;
            while (lower >= 0)
            {
                const int upper = lower + step;
                if (M[lower].gain < M[upper].gain)
                {
                    // Exchange the two records through a temporary.
                    Feature held;
                    CopyFeature(held, M[lower]);
                    CopyFeature(M[lower], M[upper]);
                    CopyFeature(M[upper], held);
                }
                lower -= step;
            }
        }
        step /= 2;
    }
    return 0;
}
/**
 * Copies the fields of one Feature into another.
 *
 * Copies str (as a C string), gain, NormalMail, NormalMail_Prior,
 * UnNormalMail and UnNormalMail_Prior from X into K.
 *
 * @param K  destination record
 * @param X  source record (received by value)
 * @return always 0
 */
int CBrowseDir::CopyFeature(Feature &K, const Feature X)
{
    // Per-class statistics.
    K.UnNormalMail_Prior = X.UnNormalMail_Prior;
    K.UnNormalMail = X.UnNormalMail;
    K.NormalMail_Prior = X.NormalMail_Prior;
    K.NormalMail = X.NormalMail;

    // Score and text key.
    K.gain = X.gain;
    strcpy(K.str, X.str);

    return 0;
}
int CBrowseDir::MergeInString(Feature x[], Feature swap[], int k, int n)
{
int i,j,l1,u1,l2,u2,m;
l1 = 0;
m = 0;
while(l1 + k < n)
{
l2 = l1 + k;
u1 = l2 - 1;
u2 = (l2 + k - 1 <= n - 1) ? (l2 + k - 1) :(n - 1);
for( i = l1,j = l2; i <= u1 && j <= u2 ;m++)
{
if(strcmp(x[i].str,x[j].str) >= 0)
{
//Copy(swap[m],x[i]);
strcpy(swap[m].str,x[i].str);
swap[m].gain = x[i].gain;
swap[m].NormalMail = x[i].NormalMail;
swap[m].NormalMail_Prior = x[i].NormalMail_Prior;
swap[m].UnNormalMail = x[i].UnNormalMail;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -