// phasestat.cpp
// (code-viewer page residue: "font size" control)
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
// Path of the dictionary file opened by readDict().
#define DICTFILE "Dict.dic"
// Directory prefix of the ham (normal mail) samples.
#define HAMDIR "samples\\ham\\"
// Directory prefix of the spam samples. The backslash after "samples" must be
// escaped: "\s" is not a valid escape sequence and silently loses the separator.
// (The macro name is misspelled but kept so existing users keep compiling.)
#define SAPMDIR "samples\\spam\\"
// One node of a singly linked word-frequency list.
typedef struct Feature
{
// Word text, stored as a NUL-terminated string.
char word[18];
// Occurrence count: set to 1 when the node is created and incremented each
// time the same word is seen again.
int tf;
// Next node in the same list, or NULL at the end of the list.
Feature *next;
}Feature;
// One dictionary entry, kept in a singly linked bucket list.
typedef struct dict
{
// Entry text, stored as a NUL-terminated string.
char str[18];
// readDict() stores strlen(str)/2 here (byte length divided by two).
int len;
// Next entry in the same bucket, or NULL at the end of the list.
dict *next;
}dict;
// Word-frequency table for one class of mail.
typedef struct Document
{
// Not assigned anywhere in the code visible here.
int ID;
// Class label: statisticHam() stores 1 (normal mail), statisticSpam() stores 0 (spam).
int c;
// Bucket heads, indexed by the value getIndex() returns for a word's first two
// bytes. Each head is a placeholder node; the counted words hang off its next pointer.
Feature *feature[4000];
}Document;
// Word counts gathered from the ham samples; allocated by statisticHam().
Document *hamDoc;
// Word counts gathered from the spam samples; allocated by statisticSpam().
Document *spamDoc;
// Dictionary buckets: the linked list that the first character of a phrase
// maps to. Each element is a placeholder head node allocated by readDict().
dict *dictHead[4000];
// Maps the two bytes of a double-byte character to a bucket number.
//
// wordh is used as the high byte and wordl as the low byte of a 16-bit code.
// Returns 4000 when that code lies outside 0xB0A1..0xF7FE (callers treat this
// as "not a Chinese character"); otherwise returns (code - 0xB0A1) % 4000,
// which is always in 0..3999.
// NOTE(review): the bounds look like the GB2312 hanzi area - confirm.
int getIndex(char wordh,char wordl){
    const int lowest = 0xb0a1;
    const int highest = 0xf7fe;
    /* Go through unsigned char so a negative (signed) char cannot smear
       sign bits into the combined code. */
    const int code = (((int)(unsigned char)wordh) << 8) | (int)(unsigned char)wordl;

    if (code < lowest || code > highest) {
        return 4000;
    }
    return (code - lowest) % 4000;
}
//统计全部ham出现的词频率
void statisticHam(){
int i=0;
int senlen(0),index(0),head(0),findDict(0),j(0),tail(0),findFt(0);
char sen[1700];
char word[19];
char buffer[50];
char filename[5];
dict *dictp;
hamDoc=(Document *)malloc(sizeof(Document));
for(j=0;j<4000;j++){
hamDoc->feature[j]=(Feature *)malloc(sizeof(Feature));
hamDoc->feature[j]->next=NULL;
}
hamDoc->c=1; //是正常邮件
for(i=0;i<5000;i++){
Feature *fhead,*ft;
itoa( i+1, filename, 10 );
strcpy(buffer,"samples\\ham\\");
strcat(buffer,filename);
FILE *hamfile=fopen(buffer,"rb");
if(hamfile==NULL){
printf("没有此ham文件\n");
exit(1);
}
while(!feof(hamfile)){
fscanf(hamfile,"%s\n",sen);
senlen=strlen(sen);
head=0;
tail=senlen>18 ? 18:senlen;
while(head<senlen){
findDict=0; //在词典中找到
findFt=0; //词在feature中已添加
strncpy(word,sen+head,(tail-head));
index=getIndex(word[0],word[1]);
if(index<4000){ //是汉字
if((tail-head)%2==0){
word[tail-head]=0;
}
else{
word[tail-head-1]=0;
}
dictp=dictHead[index]->next;
if((tail-head)>=4){
while(dictp!=NULL){
if(strcmp(word,dictp->str)==0){
findDict=1; //在词典中找到,分词
break;
}
dictp=dictp->next;
}
}
if(findDict){
//添加feature
fhead=hamDoc->feature[index];
while(fhead->next!=NULL&&!findFt){
if(strcmp(fhead->next->word,word)==0){
findFt=1;
fhead->next->tf++;
}
fhead=fhead->next;
}
if(!findFt){
ft=(Feature *)malloc(sizeof(Feature));
ft->tf=1;
strcpy(ft->word,word);
ft->word[tail-head]=0;
ft->next=fhead->next;
fhead->next=ft;
}
head=tail;
tail=senlen>head+18? head+18:senlen;
}
//词组不再字典中,缩短词组长度继续找。
else {
if((tail-head)>=6){
tail=tail-2;
}
else{
head=head+2;
tail=senlen>head+18? head+18:senlen;
}
}
}
else {
head++;
tail=senlen>head+18? head+18:senlen;
}
}
}
fclose(hamfile);
}
}
//统计全部spam出现的词频率
void statisticSpam(){
int i=0;
int senlen(0),index(0),head(0),findDict(0),j(0),tail(0),findFt(0);
char sen[1700];
char word[19];
char buffer[50];
char filename[5];
dict *dictp;
spamDoc=(Document *)malloc(sizeof(Document));
for(j=0;j<4000;j++){
spamDoc->feature[j]=(Feature *)malloc(sizeof(Feature));
spamDoc->feature[j]->next=NULL;
}
spamDoc->c=0; //是垃圾邮件
for(i=0;i<5000;i++){
Feature *fhead,*ft;
itoa( i+1, filename, 10 );
strcpy(buffer,"samples\\spam\\");
strcat(buffer,filename);
FILE *spamfile=fopen(buffer,"rb");
if(spamfile==NULL){
printf("没有此spam文件\n");
exit(1);
}
while(!feof(spamfile)){
fscanf(spamfile,"%s\n",sen);
senlen=strlen(sen);
head=0;
tail=senlen>18 ? 18:senlen;
while(head<senlen){
findDict=0;
findFt=0;
strncpy(word,sen+head,(tail-head));
index=getIndex(word[0],word[1]);
if(index<4000){ //是汉字
if((tail-head)%2==0){
word[tail-head]=0;
}
else{
word[tail-head-1]=0;
}
dictp=dictHead[index]->next;
if((tail-head)>=4){
while(dictp!=NULL){
if(strcmp(word,dictp->str)==0){
findDict=1; //在词典中找到,分词
break;
}
dictp=dictp->next;
}
}
if(findDict){
//添加feature
fhead=spamDoc->feature[index];
while(fhead->next!=NULL&&!findFt){
if(strcmp(fhead->next->word,word)==0){
findFt=1;
fhead->next->tf++;
}
fhead=fhead->next;
}
if(!findFt){
ft=(Feature *)malloc(sizeof(Feature));
ft->tf=1;
strcpy(ft->word,word);
ft->word[tail-head]=0;
ft->next=fhead->next;
fhead->next=ft;
}
head=tail;
tail=senlen>head+18? head+18:senlen;
}
//词组不再字典中,缩短词组长度继续找。
else {
if((tail-head)>=6){
tail=tail-2;
}
else{
head=head+2;
tail=senlen>head+18? head+18:senlen;
}
}
}
else {
head++;
tail=senlen>head+18? head+18:senlen;
}
}
}
fclose(spamfile);
}
}
// Reads the words of the dictionary file DICTFILE into the dictHead bucket lists.
//
// Every bucket first receives an empty placeholder head node. Then each line
// of the file contributes one word: the text before the first TAB (or before
// the end of the line), truncated to 12 bytes. The word is pushed onto the
// front of the bucket selected by getIndex() on its first two bytes; words
// that getIndex() rejects (result 4000) are ignored.
// Exits with status 1 when the file cannot be opened or memory runs out.
void readDict()
{
    enum { DICT_BUCKETS = 4000, DICT_WORD_BYTES = 12 };
    char word[DICT_WORD_BYTES + 1] = {0};
    char sen[100];
    FILE *dictFile;
    int i;

    for (i = 0; i < DICT_BUCKETS; i++) {
        dictHead[i] = (dict *)malloc(sizeof(dict));
        if (dictHead[i] == NULL) {
            fprintf(stderr, "out of memory\n");
            exit(1);
        }
        memset(dictHead[i], 0, sizeof(dict));
    }

    dictFile = fopen(DICTFILE, "rb");
    if (dictFile == NULL) {
        fprintf(stderr, "cannot open dictionary file %s\n", DICTFILE);
        exit(1);
    }

    // Looping on fgets itself also handles a last line without a newline.
    while (fgets(sen, sizeof sen, dictFile) != NULL) {
        dict *p;
        int index;
        int n = 0;

        // Copy the word field; stop at the separator, at the end of the line,
        // or when the buffer is full.
        while (n < DICT_WORD_BYTES && sen[n] != '\t' && sen[n] != '\0'
               && sen[n] != '\r' && sen[n] != '\n') {
            word[n] = sen[n];
            n++;
        }
        word[n] = '\0';

        index = getIndex(word[0], n > 1 ? word[1] : '\0');
        if (index >= DICT_BUCKETS) {
            continue;   // not a word this dictionary indexes
        }

        p = (dict *)malloc(sizeof(dict));
        if (p == NULL) {
            fprintf(stderr, "out of memory\n");
            exit(1);
        }
        memset(p, 0, sizeof(dict));
        memcpy(p->str, word, (size_t)n);   // n <= 12 < sizeof p->str; the memset supplies the NUL
        p->len = n / 2;
        p->next = dictHead[index]->next;
        dictHead[index]->next = p;
    }
    fclose(dictFile);
}
void phaseSta(){
int i=0,j=0,find=0;
float hamsum(0),spamsum(0),ratio,times;
int hamfre,spamfre;
float respamfre,maxfre,minfre;
Feature *hamhead,*spamhead;
FILE *hamfreFile=fopen("hamphase.txt","wb");
FILE *spamfreFile=fopen("spamphase.txt","wb");
FILE *ftFile=fopen("features.txt","wb");
readDict();
statisticHam();
for(j=0;j<4000;j++){
hamhead=hamDoc->feature[j];
while(hamhead->next!=NULL){
//ham中出现的词写到文件中
fprintf(hamfreFile,"%s%d\n",hamhead->next->word,hamhead->next->tf);
hamsum=hamsum+hamhead->next->tf ;
hamhead=hamhead->next;
}
}
fclose(hamfreFile);
statisticSpam();
for(j=0;j<4000;j++){
spamhead=spamDoc->feature[j];
while(spamhead->next!=NULL){
//spam中出现的词写到文件中
fprintf(spamfreFile,"%s%d\n",spamhead->next->word,spamhead->next->tf);
spamsum=spamsum+spamhead->next->tf ;
spamhead=spamhead->next;
}
}
fclose(spamfreFile);
ratio=hamsum/spamsum; //ham中总词数除以spam中总词数
//比较ham特征和spam特征选取特征
for(j=0;j<4000;j++){
// 选取ham和spam中都存在且相差倍数大于等于20的特征,或ham中存在且频度大于250而spam中没出现的特征
hamhead=hamDoc->feature[j];
while(hamhead->next!=NULL){
hamfre=hamhead->next->tf;
spamhead=spamDoc->feature[j];
find=0;
while(spamhead->next!=NULL&&!find){
if(strcmp(hamhead->next->word,spamhead->next->word)==0){
find=1;
spamfre=spamhead->next->tf;
respamfre=spamfre*ratio; //计算相对频度
if(hamfre>=respamfre){
minfre=respamfre;
maxfre=hamfre;
}
else{
maxfre=respamfre;
minfre=hamfre;
}
if(maxfre>=5000){
if(minfre<=maxfre/6){
fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
}
}
else{
if(minfre<=5){
if(maxfre>=250){
fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
}
}
else{
if(maxfre>=400&&minfre<maxfre/8){
fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
}
}
}
/*
if(minfre<10){
minfre=10;
}
times=maxfre/minfre;
if(times>=8&&maxfre>500){
fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
}
*/
}
spamhead=spamhead->next ;
}
if(!find){
if(hamfre>=200){
fprintf(ftFile,"%s %d %d\n",hamhead->next->word,hamfre,0);
}
}
hamhead=hamhead->next;
}
//spam中存在且频度大于250而ham中部出现的特征
spamhead=spamDoc->feature[j];
while(spamhead->next!=NULL){
find=0;
spamfre=spamhead->next->tf;
respamfre=spamfre*ratio;
if(respamfre>=200){
hamhead=hamDoc->feature[j];
while(hamhead->next!=NULL&&!find){
if(strcmp(hamhead->next->word,spamhead->next->word)==0){
find=1;
}
hamhead=hamhead->next;
}
if(!find){
fprintf(ftFile,"%s %d %d\n",spamhead->next->word,0,spamfre);
}
}
spamhead=spamhead->next ;
}
}
fclose(ftFile);
free(hamDoc);
free(spamDoc);
}
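// (Code-viewer page residue, not part of the program: keyboard-shortcut help.
//  Copy code: Ctrl + C; search code: Ctrl + F; full screen: F11;
//  toggle theme: Ctrl + Shift + D; show shortcuts: ?;
//  increase font size: Ctrl + =; decrease font size: Ctrl + -)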