📄 phasestat.cpp

📁 中文邮件过滤。对训练邮件分词训练贝叶斯模型。然后对测试邮件分类
💻 CPP
字号:
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
/* Dictionary file opened by readDict(). */
#define DICTFILE "Dict.dic"
/* Directory holding the ham (legitimate mail) training samples. */
#define HAMDIR  "samples\\ham\\"
/*
 * Directory holding the spam training samples.
 * The backslash after "samples" must be escaped: "\s" is not a valid escape
 * sequence and silently dropped the path separator.
 * (The macro name keeps its historical spelling so existing users still compile.)
 */
#define SAPMDIR "samples\\spam\\"


/* One node of a singly linked word-frequency list. */
struct Feature
{
	char word[18];         /* NUL-terminated word text */
	int tf;                /* term frequency: occurrences counted so far */
	struct Feature *next;  /* following node, NULL at the end of the list */
};
typedef struct Feature Feature;

/* One dictionary entry, chained into the bucket list of its first character. */
struct dict
{
	char str[18];       /* NUL-terminated dictionary word */
	int len;            /* readDict() stores strlen(str)/2 here (two bytes per character) */
	struct dict *next;  /* following entry in the same bucket, NULL at the end */
};
typedef struct dict dict;

/* Word statistics accumulated over one class of training mail. */
struct Document
{
	int ID;                  /* not assigned anywhere in this file */
	int c;                   /* class label: 1 = ham, 0 = spam */
	Feature *feature[4000];  /* per-bucket list heads, indexed by getIndex() */
};
typedef struct Document Document;

/* Word statistics of the ham corpus; allocated by statisticHam(), released in phaseSta(). */
Document *hamDoc;
/* Word statistics of the spam corpus; allocated by statisticSpam(), released in phaseSta(). */
Document *spamDoc;
/* Dictionary buckets: one dummy head node per getIndex() value, filled by readDict(). */
dict *dictHead[4000];


// Maps the first two-byte character of a phrase to the bucket (linked list)
// that holds it.
//
// wordh / wordl are the high and low byte of the character. The pair is
// combined into one 16-bit code; codes outside 0xb0a1..0xf7fe are rejected.
//
// Returns a bucket number in 0..3999, or 4000 when the code is out of range.
int getIndex(char wordh,char wordl){
	const unsigned int hi = (unsigned int)(unsigned char)wordh;
	const unsigned int lo = (unsigned int)(unsigned char)wordl;
	const unsigned int code = (hi << 8) | lo;

	if (code < 0xb0a1u || code > 0xf7feu) {
		return 4000;
	}
	return (int)((code - 0xb0a1u) % 4000u);
}

//统计全部ham出现的词频率
void statisticHam(){
	int i=0;
	int senlen(0),index(0),head(0),findDict(0),j(0),tail(0),findFt(0);
	char sen[1700];
	char word[19];
	char buffer[50];
	char filename[5];
	dict *dictp;
	hamDoc=(Document *)malloc(sizeof(Document));
	for(j=0;j<4000;j++){
		hamDoc->feature[j]=(Feature *)malloc(sizeof(Feature));
		hamDoc->feature[j]->next=NULL;
	}
	hamDoc->c=1;      //是正常邮件
	for(i=0;i<5000;i++){
		Feature *fhead,*ft;
		itoa( i+1, filename, 10 );
		strcpy(buffer,"samples\\ham\\");
		strcat(buffer,filename);
		FILE *hamfile=fopen(buffer,"rb");
		if(hamfile==NULL){
			printf("没有此ham文件\n");
			exit(1);
		}
		while(!feof(hamfile)){
			fscanf(hamfile,"%s\n",sen);
			senlen=strlen(sen);
			head=0;
			tail=senlen>18 ? 18:senlen;
			while(head<senlen){
				findDict=0;    //在词典中找到
				findFt=0;      //词在feature中已添加
				strncpy(word,sen+head,(tail-head));
				index=getIndex(word[0],word[1]);
				if(index<4000){ //是汉字
					if((tail-head)%2==0){
						word[tail-head]=0;
					}
					else{
						word[tail-head-1]=0;
					}
					dictp=dictHead[index]->next;
					if((tail-head)>=4){
						while(dictp!=NULL){
							if(strcmp(word,dictp->str)==0){
								findDict=1; //在词典中找到，分词
								break;
							}
							dictp=dictp->next;
						}
					}
					if(findDict){
						//添加feature
						fhead=hamDoc->feature[index];
						while(fhead->next!=NULL&&!findFt){
							if(strcmp(fhead->next->word,word)==0){
								findFt=1;
								fhead->next->tf++;
							}
							fhead=fhead->next;
						}
						if(!findFt){
							ft=(Feature *)malloc(sizeof(Feature));
							ft->tf=1;
							strcpy(ft->word,word);
							ft->word[tail-head]=0;
							ft->next=fhead->next;
							fhead->next=ft;
						}
						head=tail;
						tail=senlen>head+18? head+18:senlen;
					}
					//词组不再字典中，缩短词组长度继续找。
					else {
						if((tail-head)>=6){
							tail=tail-2;
						}
						else{
							head=head+2;
							tail=senlen>head+18? head+18:senlen;
						}
					}
				}
				else {
					head++;
					tail=senlen>head+18? head+18:senlen;
				}
				
			}
		}
		fclose(hamfile);
	}
}


//统计全部spam出现的词频率
void statisticSpam(){
	int i=0;
	int senlen(0),index(0),head(0),findDict(0),j(0),tail(0),findFt(0);
	char sen[1700];
	char word[19];
	char buffer[50];
	char filename[5];
	dict *dictp;
	spamDoc=(Document *)malloc(sizeof(Document));
	for(j=0;j<4000;j++){
		spamDoc->feature[j]=(Feature *)malloc(sizeof(Feature));
		spamDoc->feature[j]->next=NULL;
	}
	spamDoc->c=0;      //是垃圾邮件
	for(i=0;i<5000;i++){
		Feature *fhead,*ft;
		itoa( i+1, filename, 10 );
		strcpy(buffer,"samples\\spam\\");
		strcat(buffer,filename);
		FILE *spamfile=fopen(buffer,"rb");
		if(spamfile==NULL){
			printf("没有此spam文件\n");
			exit(1);
		}
		while(!feof(spamfile)){
			fscanf(spamfile,"%s\n",sen);
			senlen=strlen(sen);
			head=0;
			tail=senlen>18 ? 18:senlen;
			while(head<senlen){
				findDict=0;
				findFt=0;
				strncpy(word,sen+head,(tail-head));
				index=getIndex(word[0],word[1]);
				if(index<4000){ //是汉字
					if((tail-head)%2==0){
						word[tail-head]=0;
					}
					else{
						word[tail-head-1]=0;
					}
					dictp=dictHead[index]->next;
					if((tail-head)>=4){
						while(dictp!=NULL){
							if(strcmp(word,dictp->str)==0){
								findDict=1; //在词典中找到，分词
								break;
							}
							dictp=dictp->next;
						}
					}
					if(findDict){
						//添加feature
						fhead=spamDoc->feature[index];
						while(fhead->next!=NULL&&!findFt){
							if(strcmp(fhead->next->word,word)==0){
								findFt=1;
								fhead->next->tf++;
							}
							fhead=fhead->next;
						}
						if(!findFt){
							ft=(Feature *)malloc(sizeof(Feature));
							ft->tf=1;
							strcpy(ft->word,word);
							ft->word[tail-head]=0;
							ft->next=fhead->next;
							fhead->next=ft;
						}
						head=tail;
						tail=senlen>head+18? head+18:senlen;
					}
					//词组不再字典中，缩短词组长度继续找。
					else {
						if((tail-head)>=6){
							tail=tail-2;
						}
						else{
							head=head+2;
							tail=senlen>head+18? head+18:senlen;
						}
					}
				}
				else {
					head++;
					tail=senlen>head+18? head+18:senlen;
				}
				
			}
		}
		fclose(spamfile);
	}
}

// Loads the dictionary file DICTFILE into the dictHead bucket lists.
//
// Each line contributes one entry: the text before the first TAB (or before
// the end of the line when there is no TAB), truncated to 12 bytes. Entries
// whose first character getIndex() rejects are ignored. New entries are
// inserted at the front of their bucket.
//
// Exits the program when the dictionary cannot be opened or memory runs out.
void readDict()
{
	enum { MAX_ENTRY_BYTES = 12 };
	char sen[100];
	char word[MAX_ENTRY_BYTES + 1];   // room for the longest entry plus its NUL
	FILE *dictFile;
	int i;

	for (i = 0; i < 4000; i++) {
		dictHead[i] = (dict *)malloc(sizeof(dict));
		if (dictHead[i] == NULL) {
			fprintf(stderr, "out of memory\n");
			exit(1);
		}
		memset(dictHead[i], 0, sizeof(dict));   // dummy head: empty word, next == NULL
	}

	dictFile = fopen(DICTFILE, "rb");
	if (dictFile == NULL) {
		fprintf(stderr, "cannot open dictionary file %s\n", DICTFILE);
		exit(1);
	}

	// Looping on fgets() itself also handles a last line without a newline.
	while (fgets(sen, sizeof sen, dictFile) != NULL) {
		size_t n = 0;
		int index;
		dict *p;

		// Copy up to the TAB separator, never past the end of the line.
		while (n < MAX_ENTRY_BYTES &&
		       sen[n] != '\t' && sen[n] != '\0' &&
		       sen[n] != '\r' && sen[n] != '\n') {
			word[n] = sen[n];
			n++;
		}
		word[n] = '\0';
		if (n == 0) {
			continue;                         // blank line or empty first field
		}

		index = getIndex(word[0], word[1]);
		if (index >= 4000) {
			continue;                         // does not start with a Chinese character
		}

		p = (dict *)malloc(sizeof(dict));
		if (p == NULL) {
			fprintf(stderr, "out of memory\n");
			exit(1);
		}
		memset(p, 0, sizeof(dict));
		memcpy(p->str, word, n + 1);          // n <= 12, so this fits in str[18]
		p->len = (int)(n / 2);
		p->next = dictHead[index]->next;
		dictHead[index]->next = p;
	}
	fclose(dictFile);
}

void phaseSta(){
	int i=0,j=0,find=0;
	float hamsum(0),spamsum(0),ratio,times;
	int  hamfre,spamfre;
	float respamfre,maxfre,minfre;
	Feature *hamhead,*spamhead;
	FILE *hamfreFile=fopen("hamphase.txt","wb");
	FILE *spamfreFile=fopen("spamphase.txt","wb");
	FILE *ftFile=fopen("features.txt","wb");
	readDict();
	statisticHam();
	for(j=0;j<4000;j++){
		hamhead=hamDoc->feature[j];
		while(hamhead->next!=NULL){
			//ham中出现的词写到文件中
			fprintf(hamfreFile,"%s%d\n",hamhead->next->word,hamhead->next->tf);
			hamsum=hamsum+hamhead->next->tf ;
			hamhead=hamhead->next;
		}
	}
	fclose(hamfreFile);
	statisticSpam();
	for(j=0;j<4000;j++){
		spamhead=spamDoc->feature[j];
		while(spamhead->next!=NULL){
			//spam中出现的词写到文件中
			fprintf(spamfreFile,"%s%d\n",spamhead->next->word,spamhead->next->tf);
			spamsum=spamsum+spamhead->next->tf ;
			spamhead=spamhead->next;
		}
	}
	fclose(spamfreFile);
    ratio=hamsum/spamsum;  //ham中总词数除以spam中总词数

	//比较ham特征和spam特征选取特征
	for(j=0;j<4000;j++){
        // 选取ham和spam中都存在且相差倍数大于等于20的特征，或ham中存在且频度大于250而spam中没出现的特征
		hamhead=hamDoc->feature[j];
		while(hamhead->next!=NULL){
			hamfre=hamhead->next->tf;
			spamhead=spamDoc->feature[j];
			find=0;
			while(spamhead->next!=NULL&&!find){
				if(strcmp(hamhead->next->word,spamhead->next->word)==0){
					find=1;
					spamfre=spamhead->next->tf;
					respamfre=spamfre*ratio; //计算相对频度
					if(hamfre>=respamfre){
						minfre=respamfre;
						maxfre=hamfre;
					}
					else{
						maxfre=respamfre;
						minfre=hamfre;
					}
					
					if(maxfre>=5000){
						if(minfre<=maxfre/6){
							fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
						}
					}
					else{
						if(minfre<=5){
							if(maxfre>=250){
								fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
							}
						}
						else{
							if(maxfre>=400&&minfre<maxfre/8){
								fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
							}
						}
					}

					/*
					if(minfre<10){
						minfre=10;
					}
					
					times=maxfre/minfre;
					if(times>=8&&maxfre>500){
						fprintf(ftFile,"%s %d %d\n",spamhead->next->word,hamfre,spamfre);
						
					}
					*/
				}
				spamhead=spamhead->next ;
			}
			if(!find){
				if(hamfre>=200){
					fprintf(ftFile,"%s %d %d\n",hamhead->next->word,hamfre,0);
				}
			}
			hamhead=hamhead->next;
		}
		//spam中存在且频度大于250而ham中部出现的特征
		spamhead=spamDoc->feature[j];
		while(spamhead->next!=NULL){
			find=0;
			spamfre=spamhead->next->tf;
			respamfre=spamfre*ratio;
			if(respamfre>=200){
				hamhead=hamDoc->feature[j];
				while(hamhead->next!=NULL&&!find){
					if(strcmp(hamhead->next->word,spamhead->next->word)==0){
				     	find=1;
					}
					hamhead=hamhead->next;
				}
				if(!find){
					fprintf(ftFile,"%s %d %d\n",spamhead->next->word,0,spamfre);
				}
			}
			spamhead=spamhead->next ;
		}
	}

	fclose(ftFile);
	free(hamDoc);
	free(spamDoc);
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -