📄 bayes.cpp
字号:
#include "extractor.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "math.h"
#define MAXFEATURE 400 // maximum number of features read from features.txt
#define MAXTF 10       // per-document occurrence counts are capped at this value
#define MAXTEST 1200   // number of numbered test files the classifier tries to open
//dict *dictHead[4000];
int labelCount[2]; // number of training documents per label: [0] ham, [1] spam
// featureCount[label][featureId][tf]: number of training documents of `label`
// in which feature `featureId` occurred `tf` times (tf capped at MAXTF).
// Feature ids are 1-based and can reach MAXFEATURE, so the middle dimension
// needs MAXFEATURE+1 slots; with only MAXFEATURE slots id MAXFEATURE is
// out of bounds.
int featureCount[2][MAXFEATURE+1][MAXTF+1];
Feature *ft[4000]; // bucket table (indexed by getIndex) holding the selected features
FILE *ftFile;      // handle used while reading features.txt
char testdir[30];  // directory prefix of the files to classify
//Document *doc;
int doc[MAXFEATURE+1]; // doc[id]: occurrences of feature `id` in the current document
int num(0);            // number of feature lines read so far
// Must match the definition below, which takes a FILE pointer (a FILE cannot
// be passed by value).
void readDocument(FILE *file);
void readFeature(){
char sen[50];
char word[9];
int space=' ';
int index,i(0);
char *pdest;
int result;
Feature *ftWord;
ftFile=fopen("features.txt","rb");
if(ftFile==NULL){
printf("you must run the function phaseStat which statistic the word frequency");
printf("and choose the appropriate feature produce the features.txt file\n");
exit(1);
}
for(i=0;i<4000;i++){
ft[i]=(Feature *)malloc(sizeof(Feature));
memset(ft[i],0,sizeof(Feature));
}
fgets(sen,50,ftFile);
while(!feof(ftFile)&&num<MAXFEATURE){
num++;
pdest = strchr( sen, space );
result = pdest - sen ;
strncpy(word,sen,result);
word[result]=0;
index=getIndex(word[0],word[1]);
ftWord=(Feature *)malloc(sizeof(Feature));
memset(ftWord,0,sizeof(Feature));
strcpy(ftWord->word,word);
ftWord->tf=num;
ftWord->next=ft[index]->next;
ft[index]->next=ftWord;
fgets(sen,50,ftFile);
}
/*
for(i=0;i<4000;i++){
Feature *f;
f=NULL;
if(ft[i]->next!=NULL){
f=ft[i]->next;
}
while(f!=NULL){
printf("%3d %s\n",f->tf,f->word);
f=f->next;
}
}
*/
fclose(ftFile);
}
/*
void readDocument(FILE *file){
int j;
int senlen(0),index(0),head(0),tail(0),findFt(0),find(0);
Feature *fthead,*docft;
char sen[1700];
char word[9];
for(j=0;j<4000;j++){
doc->feature[j]=(Feature *)malloc(sizeof(Feature));
doc->feature[j]->next=NULL;
}
fscanf(file,"%s",sen);
while(!feof(file)){
senlen=strlen(sen);
head=0;
tail=senlen>8 ? 8:senlen;
while(head<senlen){
find=0;
findFt=0; //词在feature中已添加
strncpy(word,sen+head,(tail-head));
index=getIndex(word[0],word[1]);
if(index<4000){ //是汉字
if((tail-head)%2==0){
word[tail-head]=0;
}
else{
word[tail-head-1]=0;
}
if((tail-head)>=4){
fthead=ft[index]->next;
while(fthead!=NULL){
if(strcmp(word,fthead->word)==0){
find=1; //在feature中找到,分词
break;
}
fthead=fthead->next;
}
}
if(find){
//添加feature
docft=doc->feature[index];
while(docft->next!=NULL&&!findFt){
if(strcmp(docft->next->word,word)==0){
findFt=1; //前面出现过这个特征
docft->next->tf++;
break;
}
docft=docft->next;
}
if(!findFt){
//添加第一次出现feature
docft=(Feature *)malloc(sizeof(Feature));
//出现次数为1
docft->tf=1;
strcpy(docft->word,word);
docft->word[tail-head]=0;
docft->next=doc->feature[index]->next;
doc->feature[index]->next=docft;
}
head=tail;
tail=senlen>head+8? head+8:senlen;
}
//词组不再feature中,缩短词组长度继续找。
else {
if((tail-head)>=6){
tail=tail-2;
}
//一个字时head+2
else{
head=head+2;
tail=senlen>head+8? head+8:senlen;
}
}
}
else { //不是汉字
head++;
tail=senlen>head+8? head+8:senlen;
}
}
fscanf(file,"%s",sen);
}
fclose(file);
}
*/
void readDocument(FILE *file){
char sen[1700];
char word[9];
int i,d;
int senlen(0),index(0),head(0),tail(0),find(0);
Feature *fthead;
for(i=0;i<=num;i++){
doc[i]=0;
}
fscanf(file,"%s",sen);
while(!feof(file)){
senlen=strlen(sen);
head=0;
tail=senlen>8 ? 8:senlen;
while(head<senlen){
find=0;
strncpy(word,sen+head,(tail-head));
index=getIndex(word[0],word[1]);
if(index<4000){ //是汉字
if((tail-head)%2==0){
word[tail-head]=0;
}
else{
word[tail-head-1]=0;
}
if((tail-head)>=4){
fthead=ft[index]->next;
while(fthead!=NULL){
if(strcmp(word,fthead->word)==0){
find=1; //在feature中找到,分词
d=fthead->tf;
break;
}
fthead=fthead->next;
}
}
if(find){
//添加feature
doc[d]++;
head=tail;
tail=senlen>head+8? head+8:senlen;
}
//词组不再feature中,缩短词组长度继续找。
else {
if((tail-head)>=6){
tail=tail-2;
}
//一个字时head+2
else{
head=head+2;
tail=senlen>head+8? head+8:senlen;
}
}
}
else { //不是汉字
head++;
tail=senlen>head+8? head+8:senlen;
}
}
fscanf(file,"%s",sen);
}
fclose(file);
}
/*
 * Fold the document most recently parsed by readDocument() into the model:
 * bump labelCount[label] and, for every feature id, the featureCount cell
 * matching the feature's (capped) occurrence count in doc[].
 * label is 0 for ham and 1 for spam.
 */
static void tallyDocument(int label){
    /* Bound the loop by the real size of the table so the last feature id
       can never index past it. */
    const int slots=(int)(sizeof(featureCount[0])/sizeof(featureCount[0][0]));
    int j;

    labelCount[label]++;
    for(j=1;j<=num && j<slots;j++){
        if(doc[j]>MAXTF){
            doc[j]=MAXTF; /* counts above MAXTF share the last bucket */
        }
        featureCount[label][j][doc[j]]++;
    }
}

/*
 * Build the model: compute the labelCount and featureCount tables.
 *
 * Clears both tables, then tries to open the training files numbered 1..5000
 * under "samples\ham\" (label 0) and under SPAMDIR (label 1). Every file that
 * opens is parsed with readDocument(), which also closes it, and tallied.
 * A file that cannot be opened is reported on stdout and skipped.
 */
void iniModel(){
    int i;
    FILE *hamfile;
    FILE *spamfile;
    char buffer[50];

    memset(labelCount,0,sizeof labelCount);
    memset(featureCount,0,sizeof featureCount);

    for(i=0;i<5000;i++){
        snprintf(buffer,sizeof buffer,"samples\\ham\\%d",i+1);
        hamfile=fopen(buffer,"rb");
        if(hamfile==NULL){
            printf("没有%s文件\n",buffer);
            continue;
        }
        readDocument(hamfile);
        tallyDocument(0); /* a non-spam message */
    }
    for(i=0;i<5000;i++){
        /* snprintf replaces the non-standard itoa plus unbounded
           strcpy/strcat and cannot overrun buffer[]. */
        snprintf(buffer,sizeof buffer,"%s%d",SPAMDIR,i+1);
        spamfile=fopen(buffer,"rb");
        if(spamfile==NULL){
            printf("没有%s文件\n",buffer);
            continue;
        }
        readDocument(spamfile);
        tallyDocument(1); /* a spam message */
    }
}
/*
 * Training entry point: loads the feature list with readFeature(), then
 * builds the count tables with iniModel(). The order matters because
 * iniModel() relies on the features already being loaded.
 */
void BayesLearner()
{
    readFeature();
    iniModel();
}
void BayesClasser(){
int i,j;
char filename[5];
char testpath[50];
double prop[2];
float ratio,smooth(0.25);
ratio=labelCount[0]/labelCount[1];
FILE *testfile;
FILE *result;
result=fopen("test_results.out","w");
strcpy(testdir,"samples\\spam\\");
// strcpy(testdir,"test\\");
for(i=0;i<MAXTEST;i++){
itoa(i+1,filename,10);
strcpy(testpath,testdir);
strcat(testpath,filename);
testfile=fopen(testpath,"rb");
if(testfile!=NULL){
prop[0]=0;
prop[1]=(num-1)*log(ratio);
readDocument(testfile);
/*
for(j=0;j<4000;j++){
docft=doc->feature[j]->next;
while(docft!=NULL){
strcpy(word,docft->word);
index=getIndex(word[0],word[1]);
fthead=ft[index]->next;
//这个词是第几个feature
k=0;
while(fthead!=NULL){
if(strcmp(word,fthead->word)==0){
k=fthead->tf;
break;
}
fthead=fthead->next;
}
if(docft->tf>MAXTF){
docft->tf=MAXTF;
}
//是非垃圾邮件的概率
if(featureCount[0][k][docft->tf]==0){
prop[0]=prop[0]+log(smooth);
}
else{
prop[0]=prop[0]+log(featureCount[0][k][docft->tf]);
}
//是非垃圾邮件的概率
if(featureCount[1][k][docft->tf]==0){
prop[1]=prop[1]+log(smooth) ;
}
else{
prop[1]=prop[1]+log(featureCount[1][k][docft->tf]) ;
}
docft=docft->next;
}
}
*/
for(j=0;j<=num;j++){
if(doc[j]!=0){
if(featureCount[0][j][doc[j]]==0){
prop[0]=prop[0]+log(smooth);
}
else{
prop[0]=prop[0]+log(featureCount[0][j][doc[j]]);
}
if(featureCount[1][j][doc[j]]==0){
prop[1]=prop[1]+log(smooth);
}
else{
prop[1]=prop[1]+log(featureCount[1][j][doc[j]]);
}
}
}
if(prop[0]>prop[1]){
fprintf(result,"%d\n",0);
}
else{
fprintf(result,"%d\n",1);
}
/*
//free文件的feature链表
for(j=0;j<4000;j++){
docft=doc->feature[j];
Feature *post;
while(docft!=NULL){
post=docft->next;
free(docft);
docft=post;
}
}
*/
}
}
fclose(result);
}
/*
 * Run the whole pipeline: train with BayesLearner(), then classify the test
 * files with BayesClasser().
 */
void partition()
{
    BayesLearner();
    BayesClasser();
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -