bayes.cpp

来自「中文邮件过滤。对训练邮件分词训练贝叶斯模型。然后对测试邮件分类」· C++ 代码 · 共 469 行
CPP
469 行
#include "extractor.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "math.h"
/* Maximum number of features loaded from features.txt. */
#define MAXFEATURE 400
/* Per-document feature frequencies are capped at this value when counted. */
#define MAXTF 10
/* Number of numbered test files BayesClasser tries to open. */
#define MAXTEST 1200

/* Number of training documents per label: [0] = ham, [1] = spam. */
int labelCount[2];
/*
 * featureCount[label][id][tf]: number of training documents with that label
 * in which feature `id` occurred `tf` times.
 * Feature ids are 1-based (readFeature assigns 1..MAXFEATURE), so the table
 * needs MAXFEATURE+1 rows, the same as doc[] below; with only MAXFEATURE rows
 * the last feature was indexed past the end of its label's slice.
 */
int featureCount[2][MAXFEATURE+1][MAXTF+1];
/* Bucket heads for the selected features, indexed by getIndex() of a word's first two bytes. */
Feature *ft[4000];
/* Stream used by readFeature while features.txt is open. */
FILE *ftFile;
/* Directory prefix of the test files; filled in by BayesClasser. */
char testdir[30];
/* doc[id]: occurrences of feature `id` in the document most recently read. */
int doc[MAXFEATURE+1];
/* Number of features loaded so far. */
int num = 0;
/* Forward declaration; the function takes a stream pointer, not a FILE by value. */
void readDocument(FILE *file);
/*
 * Load the selected features from features.txt into the ft[] bucket lists.
 *
 * Each line is expected to start with the feature word followed by a space;
 * whatever follows the first space is ignored. Accepted words get consecutive
 * 1-based ids (stored in Feature::tf) and `num` ends up holding the number of
 * features loaded, at most MAXFEATURE. Lines that have no space, an empty
 * word, a word longer than 8 bytes, or a bucket index outside ft[] are
 * skipped instead of corrupting memory.
 *
 * Terminates the process if features.txt cannot be opened or memory runs out.
 */
void readFeature(){
	char sen[50];
	char word[9];
	int index,i;
	size_t len;
	char *pdest;
	Feature *ftWord;

	ftFile=fopen("features.txt","rb");
	if(ftFile==NULL){
		printf("you must run the function phaseStat which statistic the word frequency");
		printf("and choose the appropriate feature produce the features.txt file\n");
		exit(1);
	}
	/* One zeroed dummy head per bucket; real entries hang off ->next. */
	for(i=0;i<4000;i++){
		ft[i]=(Feature *)malloc(sizeof(Feature));
		if(ft[i]==NULL){
			fprintf(stderr,"readFeature: out of memory\n");
			exit(1);
		}
		memset(ft[i],0,sizeof(Feature));
	}
	/* Loop on fgets itself so that a last line without a trailing newline is
	   still processed (testing feof() before use dropped it). */
	while(num<MAXFEATURE&&fgets(sen,sizeof(sen),ftFile)!=NULL){
		pdest=strchr(sen,' ');
		if(pdest==NULL){
			continue; /* no separator: not a "word ..." line */
		}
		len=(size_t)(pdest-sen);
		if(len==0||len>=sizeof(word)){
			continue; /* empty, or would not fit in word[] */
		}
		memcpy(word,sen,len);
		word[len]=0;
		index=getIndex(word[0],word[1]);
		if(index<0||index>=4000){
			continue; /* outside the ft[] bucket table */
		}
		ftWord=(Feature *)malloc(sizeof(Feature));
		if(ftWord==NULL){
			fprintf(stderr,"readFeature: out of memory\n");
			exit(1);
		}
		memset(ftWord,0,sizeof(Feature));
		/* word is at most 8 bytes plus terminator here.
		   NOTE(review): assumes Feature::word holds at least 9 bytes - confirm in extractor.h. */
		strcpy(ftWord->word,word);
		num++;
		ftWord->tf=num; /* tf doubles as the 1-based feature id */
		/* Push onto the front of the bucket's list. */
		ftWord->next=ft[index]->next;
		ft[index]->next=ftWord;
	}
	fclose(ftFile);
}
/*
void readDocument(FILE *file){
	int j;
	int senlen(0),index(0),head(0),tail(0),findFt(0),find(0);
	Feature *fthead,*docft;
	char sen[1700];
	char word[9];
	for(j=0;j<4000;j++){
		doc->feature[j]=(Feature *)malloc(sizeof(Feature));
		doc->feature[j]->next=NULL;
	}
	fscanf(file,"%s",sen);
	while(!feof(file)){
		senlen=strlen(sen);
		head=0;
		tail=senlen>8 ? 8:senlen;
		while(head<senlen){
			find=0;
			findFt=0;      //词在feature中已添加
			strncpy(word,sen+head,(tail-head));
			index=getIndex(word[0],word[1]);
			if(index<4000){ //是汉字
				if((tail-head)%2==0){
					word[tail-head]=0;
				}
				else{
					word[tail-head-1]=0;
				}
				if((tail-head)>=4){
					fthead=ft[index]->next;
					while(fthead!=NULL){
						if(strcmp(word,fthead->word)==0){
							find=1; //在feature中找到，分词
							break;
						}
						fthead=fthead->next;
					}
				}
				if(find){
					//添加feature
					docft=doc->feature[index];
					while(docft->next!=NULL&&!findFt){
						if(strcmp(docft->next->word,word)==0){
							findFt=1;  //前面出现过这个特征
							docft->next->tf++;
							break;
						}
						docft=docft->next;
					}
					if(!findFt){
						//添加第一次出现feature
						docft=(Feature *)malloc(sizeof(Feature));
						//出现次数为1
						docft->tf=1;
						strcpy(docft->word,word);
						docft->word[tail-head]=0;
						docft->next=doc->feature[index]->next;
						doc->feature[index]->next=docft;
					}
					head=tail;
					tail=senlen>head+8? head+8:senlen;
				}
				//词组不再feature中，缩短词组长度继续找。
				else {
					if((tail-head)>=6){
						tail=tail-2;
					}
					//一个字时head+2
					else{
						head=head+2;
						tail=senlen>head+8? head+8:senlen;
					}
				}
			}
			else { //不是汉字
				head++;
				tail=senlen>head+8? head+8:senlen;
			}
		}
		fscanf(file,"%s",sen);
	}
	
	fclose(file);
}
*/

/*
 * Count how often each selected feature occurs in one document.
 *
 * Reads `file` as whitespace-separated tokens and scans each token with a
 * longest-match-first window (up to 8 bytes, shrinking by 2 bytes at a time)
 * against the ft[] bucket lists. Hits are accumulated per feature id in the
 * global doc[] (entries 0..num are cleared first).
 *
 * The stream is closed before returning, so the caller must not use it again.
 * A NULL stream leaves doc[] cleared and returns.
 */
void readDocument(FILE *file){
	char sen[1700];
	char word[9];
	int i,d;
	int senlen,index,head,tail,find;
	Feature *fthead;
	for(i=0;i<=num;i++){
		doc[i]=0;
	}
	if(file==NULL){
		return;
	}
	/* The field width keeps fscanf inside sen[] (longer tokens arrive in
	   pieces); looping on its result also processes a final token that is
	   not followed by whitespace, which a feof() test skipped. */
	while(fscanf(file,"%1699s",sen)==1){
		senlen=(int)strlen(sen);
		head=0;
		tail=senlen>8 ? 8:senlen;
		while(head<senlen){
			find=0;
			d=0;
			/* Clear first so word[1] is 0 rather than stale when only one byte is left. */
			memset(word,0,sizeof(word));
			strncpy(word,sen+head,(tail-head));
			index=getIndex(word[0],word[1]);
			/* NOTE(review): a negative index is treated like "not Chinese";
			   confirm getIndex never returns one for valid input. */
			if(index>=0&&index<4000){ /* Chinese character */
				/* Keep an even number of bytes in the candidate word. */
				if((tail-head)%2==0){
					word[tail-head]=0;
				}
				else{
					word[tail-head-1]=0;
				}
				/* Only windows of at least 4 bytes are looked up. */
				if((tail-head)>=4){
					fthead=ft[index]->next;
					while(fthead!=NULL){
						if(strcmp(word,fthead->word)==0){
							find=1; /* found among the features: segment here */
							d=fthead->tf;
							break;
						}
						fthead=fthead->next;
					}
				}
				if(find){
					/* Record the hit and continue after the window. */
					doc[d]++;
					head=tail;
					tail=senlen>head+8? head+8:senlen;
				}
				/* Not a feature: shorten the window and try again. */
				else {
					if((tail-head)>=6){
						tail=tail-2;
					}
					/* Window cannot shrink further: skip two bytes. */
					else{
						head=head+2;
						tail=senlen>head+8? head+8:senlen;
					}
				}
			}
			else { /* not a Chinese character */
				head++;
				tail=senlen>head+8? head+8:senlen;
			}
		}
	}
	fclose(file);
}
/*
 * Fold one labelled training corpus into labelCount and featureCount.
 *
 * Tries to open the files <dir>1 .. <dir>5000; files that do not exist are
 * reported and skipped. Each document that opens counts once towards
 * labelCount[label], and for every feature id its (MAXTF-capped) frequency
 * in that document bumps featureCount[label][id][frequency].
 */
static void countTrainingDocs(const char *dir,int label){
	char buffer[50];
	int i,j,written;
	/* Actual number of feature rows in the table. Feature ids are 1-based,
	   so an id equal to the row count cannot be stored and is left out
	   rather than written past the end of the table. */
	const int rows=(int)(sizeof(featureCount[0])/sizeof(featureCount[0][0]));
	FILE *fp;

	for(i=0;i<5000;i++){
		written=snprintf(buffer,sizeof(buffer),"%s%d",dir,i+1);
		if(written<0||written>=(int)sizeof(buffer)){
			fprintf(stderr,"iniModel: path too long: %s%d\n",dir,i+1);
			continue;
		}
		fp=fopen(buffer,"rb");
		if(fp==NULL){
			printf("没有%s文件\n",buffer);
			continue;
		}
		readDocument(fp); /* fills doc[] and closes fp */
		labelCount[label]++;
		for(j=1;j<=num&&j<rows;j++){
			if(doc[j]>MAXTF){
				doc[j]=MAXTF;
			}
			featureCount[label][j][doc[j]]++;
		}
	}
}

/*
 * Build the model: reset labelCount and featureCount, then count the ham
 * corpus (label 0, "samples\ham\<n>") and the spam corpus (label 1,
 * SPAMDIR followed by <n>).
 */
void iniModel(){
	memset(labelCount,0,sizeof(labelCount));
	memset(featureCount,0,sizeof(featureCount));
	countTrainingDocs("samples\\ham\\",0);
	countTrainingDocs(SPAMDIR,1);
}

/*
 * Training entry point: load the selected features, then build the
 * labelCount / featureCount tables from the training mail.
 */
void BayesLearner()
{
	readFeature();
	iniModel();
}

void BayesClasser(){
	int i,j;
	char filename[5];
	char testpath[50];
	double prop[2];
	float ratio,smooth(0.25);
	ratio=labelCount[0]/labelCount[1];
	FILE *testfile;
	FILE *result;
	result=fopen("test_results.out","w");
	strcpy(testdir,"samples\\spam\\");
//	strcpy(testdir,"test\\");
	for(i=0;i<MAXTEST;i++){
		itoa(i+1,filename,10);
		strcpy(testpath,testdir);
		strcat(testpath,filename);
		testfile=fopen(testpath,"rb");
		if(testfile!=NULL){
			prop[0]=0;
        	prop[1]=(num-1)*log(ratio);
			readDocument(testfile);
			/*
			for(j=0;j<4000;j++){
				docft=doc->feature[j]->next;
				while(docft!=NULL){
					strcpy(word,docft->word);
					index=getIndex(word[0],word[1]);
					fthead=ft[index]->next;
					//这个词是第几个feature
					k=0;
					while(fthead!=NULL){
						if(strcmp(word,fthead->word)==0){
							k=fthead->tf;
							break;
						}
						fthead=fthead->next;
					}
					if(docft->tf>MAXTF){
						docft->tf=MAXTF;
					}
                    //是非垃圾邮件的概率
					if(featureCount[0][k][docft->tf]==0){
						prop[0]=prop[0]+log(smooth);
					}
					else{
						prop[0]=prop[0]+log(featureCount[0][k][docft->tf]);
					}
					//是非垃圾邮件的概率
					if(featureCount[1][k][docft->tf]==0){
						prop[1]=prop[1]+log(smooth) ;
					}
					else{
						prop[1]=prop[1]+log(featureCount[1][k][docft->tf]) ;
					}
					docft=docft->next;
				}
			}
			*/
			for(j=0;j<=num;j++){
				if(doc[j]!=0){
					if(featureCount[0][j][doc[j]]==0){
						prop[0]=prop[0]+log(smooth);
					}
					else{
						prop[0]=prop[0]+log(featureCount[0][j][doc[j]]);
					}
					if(featureCount[1][j][doc[j]]==0){
						prop[1]=prop[1]+log(smooth);
					}
					else{
						prop[1]=prop[1]+log(featureCount[1][j][doc[j]]);
					}
				}
			}
			
			if(prop[0]>prop[1]){
				fprintf(result,"%d\n",0);
			}
			else{
				fprintf(result,"%d\n",1);
			}
            /* 
			//free文件的feature链表
			for(j=0;j<4000;j++){
				docft=doc->feature[j];
				Feature *post;
				while(docft!=NULL){
					post=docft->next;
					free(docft);
					docft=post;
				}
			}
			*/
		}
	}
	fclose(result);
}	

/* Run the whole pipeline: train the model, then classify the test mail. */
void partition()
{
	BayesLearner();
	BayesClasser();
}
bayes.cpp - 源码说明

本页面展示了「中文邮件过滤。对训练邮件分词训练贝叶斯模型。然后对测试邮件分类」中的 bayes.cpp 源码文件，采用 C++ 编程语言编写，共 469 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与邮件过滤相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?