📄 bogotune.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* $Id: bogotune.c,v 1.233 2006/07/04 11:42:38 relson Exp $ *//*****************************************************************************NAME:   bogotune.c -- determine optimal parameters for bogofilterAUTHORS:   Greg Louis - perl version   David Relson - C version   Matthias Andree - options parser usability******************************************************************************//* To allow tuning of large corpora, the memory use must be minimized** for each messages.  Given the wordlist in ram, foreach token of a** test message bogotune only needs to know its spam and ham counts.**** 1. external wordlist ("-d") flag**    a. read wordlist.db**    b. read messages for test set**       1. lookup words in wordlist**       2. discard words**    c. replace wordhashs with wordcnts**    d. de-allocate resident wordlist**** 2. internal wordlist ("-D") flag**    a. read all messages**    b. distribute messages**        1. proper pct to wordlist**        2. proper pct to test set** 	  2a. create wordprops from wordlists**    c. replace wordprops with wordcnts**    d. de-allocate resident wordlist*//* Limitations:****	If all the input messages are in msg-count format,**	bogotune will use default ROBX value since an external**	wordlist is needed to compute a real robx value.*/#include "common.h"#include <math.h>#include <ctype.h>#include <errno.h>#include <stdlib.h>#include <sys/types.h>#include <sys/stat.h>#include "bogotune.h"#include "bogoconfig.h"#include "bogoreader.h"#include "collect.h"#include "datastore.h"#include "longoptions.h"#include "msgcounts.h"#include "mime.h"#include "mxcat.h"#include "paths.h"#include "robx.h"#include "score.h"#include "token.h"#include "tunelist.h"#include "wordhash.h"#include "wordlists.h"#include "xmalloc.h"#include "xstrdup.h"#undef	HAM_CUTOFF	/* ignore value in score.h */#undef	SPAM_CUTOFF	/* ignore value in score.h */#define	TEST_COUNT	500u	/* minimum allowable message count */#define	LIST_COUNT	2000u	/* minimum msg count in tunelist   */#define	PREF_COUNT	4000u	/* preferred message count         */#define	LARGE_COUNT	40000u#define	HAM_CUTOFF	0.10#define MIN_HAM_CUTOFF	0.10	/* minimum final ham_cutoff */#define MAX_HAM_CUTOFF	0.45	/* maximum final ham_cutoff */#define	MIN_CUTOFF	0.55	/* minimum cutoff  for set_thresh() */#define	WARN_MIN	0.50	/* warning minimum for set_thresh() */#define	WARN_MAX	0.99	/* warning maximum for set_thresh() */#define	MAX_CUTOFF	0.99	/* maximum cutoff  for set_thresh() */#define	SPAM_CUTOFF	0.975#define FP_CUTOFF	0.999#define	SCAN_CUTOFF	0.500	/* skip scans if cutoff is less or equal */#define	CHECK_PCT	0.01	/* for checking high scoring non-spam				** and low scoring spam *//* bogotune's default parameters */#define	DEFAULT_ROBS	ROBS	/* 0.0178 */		#define	DEFAULT_ROBX	ROBX	/* 0.5200 */#define	DEFAULT_MIN_DEV	0.02/* coarse scan parms */#define	MD_MIN_C	0.06	/* smallest min_dev to test */#define	MD_MAX_C	0.38	/* largest  min_dev to test */#define	MD_DLT_C	0.08	/* increment		    *//* fine scan parms */#define	MD_MIN_F	0.02#define	MD_MAX_F	MD_MAX_C+MD_DLT_F#define	MD_DLT_F	0.014/* robx limits */#define RX_MIN		0.4#define RX_MAX		0.6enum e_verbosity {    SUMMARY	   = 1,	/* summarize main loop iterations	*/    TIME	   = 2, /* include timing info           	*/    PARMS	   = 3,	/* print parameter sets (rs, md, rx)	*/    SCORE_SUMMARY  = 5,	/* verbosity level for printing scores	*/    SCORE_DETAIL	/* verbosity level for printing scores	*/};typedef enum e_ds_loc {    DS_NONE	   = 0,	/* datastore locn not specified */    DS_ERR	   = 1,	/* error in datastore locn spec */    DS_DSK	   = 2,	/* datastore on disk */    DS_RAM	   = 4 	/* datastore in ram  */} ds_loc;#define	MOD(n,m)	((n) - (floor((n)/(m)))*(m))#define	ROUND(m,n)	floor((m)*(n)+.5)/(n)#define	MIN(n)		((n)/60)#define SECONDS(n)	((n) - MIN(n)*60)#define KEY(r)		((r)->fn + (((r)->co > 0.5) ? (r)->co : 0.99))#define	ESF_SEL(a,b)	(!esf_flag ? (a) : (b))/* Global Variables */const char *progname = "bogotune";static char *ds_path;			/* directory */static ds_loc ds_flag = DS_NONE;static void *env = NULL;static bool    esf_flag = true;		/* test ESF factors if true */static bool    exit_zero = false;	/* non-error exits zero */static char   *bogolex_file = NULL;	/* non-NULL if creating msg-count output */static word_t *w_msg_count;static uint message_count;static wordhash_t *train;static tunelist_t *ns_and_sp;static tunelist_t *ns_msglists, *sp_msglists;static flhead_t *spam_files, *ham_files;typedef struct {    uint cnt;    double *data;} data_t;static data_t *rsval;static data_t *rxval;static data_t *mdval;static data_t *spexp;static data_t *nsexp;static uint target;static uint ns_cnt, sp_cnt;static double spex, nsex;static double check_percent;		/* initial check for excessively high/low scores */static double *ns_scores;static double *sp_scores;static double user_robx = 0.0;		/* from option '-r value' */static uint   coerced_target = 0;	/* user supplied with '-T value' */static uint   ncnt, nsum;		/* neighbor count and sum - for gfn() averaging */#undef	TEST#ifdef	TESTuint test = 0;#endifbool fMakeCheck = false;	/* allows quick & dirty regression testing */uint cMakeCheck =    50;	/* ... for 50 cycles *//* Function Declarations */static void process_bogotune_arg(int option);/* Function Definitions */static void bt_trap(void) {}static int get_cnt(double fst, double lst, double amt){    int cnt = (fabs(lst - fst) + EPS) / (fabs(amt) - EPS) + 1;    return cnt;}static data_t *seq_by_amt(double fst, double lst, double amt){    uint i;    data_t *val = xcalloc(1, sizeof(data_t));    val->cnt = get_cnt(fst, lst, amt);    val->data = xcalloc(val->cnt, sizeof(double));    for (i = 0; i < val->cnt; i += 1)	val->data[i] = fst + i * amt;    return val;}static data_t *seq_canonical(double fst, double amt){    uint c;    float v;    uint i = 0;    data_t *val = xcalloc(1, sizeof(data_t));    val->data = xcalloc(5, sizeof(double));    fst = max(fst, RX_MIN);    fst = min(fst, RX_MAX);    val->data[i++] = fst;    for (c = 1; c <= 2; c += 1) {	v = fst - amt * c; if (v >= RX_MIN) val->data[i++] = v;	v = fst + amt * c; if (v <= RX_MAX) val->data[i++] = v;    }    val->cnt = i;    return val;}static data_t *seq_by_pow(double fst, double lst, double amt){    uint i;    data_t *val = xcalloc(1, sizeof(data_t));    val->cnt = get_cnt(fst, lst, amt);    val->data = xcalloc(val->cnt, sizeof(double));    for (i = 0; i < val->cnt; i += 1)	val->data[i] = pow(10, fst + i * amt);    return val;}static void data_free(data_t *val){    xfree(val->data); xfree(val);}static void init_coarse(double _rx){    rxval = seq_canonical(_rx, 0.05);    rsval = seq_by_pow(0.0, -2.0, -1.0);    mdval = seq_by_amt(MD_MIN_C, MD_MAX_C, MD_DLT_C);    spexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);    nsexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);}static void init_fine(double _rs, double _md, double _rx,		      double _spex, double _nsex){    double s0, s1;    s0 = max(log10(_rs) - 0.5, -2.0);    s1 = min(log10(_rs) + 0.5,  0.0);    rsval = seq_by_pow(s1, s0, -0.25);    s0 = max(_md - 0.042, MD_MIN_F);    s1 = min(_md + 0.042, MD_MAX_F);    mdval = seq_by_amt(s0, s1, MD_DLT_F);    rxval = seq_canonical(_rx, 0.013);    s0 = max(_spex - 1.5,  0.0);    s1 = min(_spex + 1.5, 20.0);    spexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);    s0 = max(_nsex - 1.5,  0.0);    s1 = min(_nsex + 1.5, 20.0);    nsexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);}static void print_parms(const char *label, const char *format, data_t *data){    uint i;    printf("  %s: %2u ", label, data->cnt);    for (i = 0; i < data->cnt; i += 1) {	printf("%s", (i == 0) ? " (" : ", ");	printf(format, data->data[i]); /* RATS: ignore */    }    printf(")\n"); fflush(stdout);}static void print_all_parms(uint r_count){    if (verbose >= PARMS) {	print_parms("rsval", "%6.4f", rsval);	print_parms("rxval", "%5.3f", rxval);	print_parms("mdval", "%5.3f", mdval);	print_parms("spexp", "%5.2f", spexp);	print_parms("nsexp", "%5.2f", nsexp);    }    if (verbose >= TIME)	printf("cnt: %u (rs: %u, rx: %u, md: %u spex: %u, nsex: %u)\n",	       r_count, rsval->cnt, rxval->cnt, mdval->cnt, spexp->cnt, nsexp->cnt);}static int compare_ascending(const void *const ir1, const void *const ir2){    const double d1 = *(const double *)ir1;    const double d2 = *(const double *)ir2;    if (d1 - d2 > 0.0) return  1;    if (d2 - d1 > 0.0) return -1;    return 0;}static int compare_descending(const void *const ir1, const void *const ir2){    const double d1 = *(const double *)ir1;    const double d2 = *(const double *)ir2;    if (d1 - d2 > 0.0) return -1;    if (d2 - d1 > 0.0) return +1;    return 0;}#define CO(r) ((r->co > 0.5) ? r->co : 0.99)static int compare_results(const void *const ir1, const void *const ir2){    result_t const *r1 = (result_t const *)ir1;    result_t const *r2 = (result_t const *)ir2;    if (KEY(r1) > KEY(r2)) return  1;    if (KEY(r2) > KEY(r1)) return -1;    /* Favor cutoffs > 0.5 */    if (CO(r1) > CO(r2) ) return  1;    if (CO(r2) > CO(r1) ) return -1;    if (r1->idx > r2->idx ) return  1;    if (r2->idx > r1->idx ) return -1;    return 0;}/* Score all non-spam */static void score_ns(double *results){    uint i;    uint count = 0;    if (verbose >= SCORE_DETAIL)	printf("ns:\n");    verbose = -verbose;		/* disable bogofilter debug output */    for (i = 0; i < COUNTOF(ns_msglists->u.sets); i += 1) {	mlhead_t *list = ns_msglists->u.sets[i];	mlitem_t *item;	for (item = list->head; item != NULL; item = item->next) {	    wordhash_t *wh = item->wh;	    double score = msg_compute_spamicity(wh, NULL);	    results[count++] = score;	    if ( -verbose == SCORE_DETAIL ||		(-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))		printf("%6u %0.16f\n", count-1, score);	}    }    verbose = -verbose;		/* enable bogofilter debug output */    qsort(results, count, sizeof(double), compare_descending);    return;}static bool check_for_high_ns_scores(void){    uint t = ceil(ns_cnt * check_percent);    score_ns(ns_scores);	/* scores in descending order */    /* want at least 1 high scoring non-spam for FP determination */    if (ns_scores[t-1] < SPAM_CUTOFF)	return false;    if (!quiet) {	fprintf(stderr,		"Warning: test messages include many high scoring nonspam.\n");	fprintf(stderr,		"         You may wish to reclassify them and rerun.\n");    }    return true;}/* Score all spam to determine false negative counts */static void score_sp(double *results){    uint i;    uint count = 0;    if (verbose >= SCORE_DETAIL)	printf("sp:\n");    verbose = -verbose;		/* disable bogofilter debug output */    for (i = 0; i < COUNTOF(sp_msglists->u.sets); i += 1) {	mlhead_t *list = sp_msglists->u.sets[i];	mlitem_t *item;	for (item = list->head; item != NULL; item = item->next) {	    wordhash_t *wh = item->wh;	    double score = msg_compute_spamicity(wh, NULL);	    results[count++] = score;	    if ( -verbose == SCORE_DETAIL ||		(-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS))		printf("%6u %0.16f\n", count-1, score);	}    }    verbose = -verbose;		/* enable bogofilter debug output */    qsort(results, count, sizeof(double), compare_ascending);    return;}static uint get_fn_count(uint count, double *results){    uint i;    uint fn = 0;    for (i = 0; i < count; i += 1) {	double score = results[i];	if (score < spam_cutoff)	    fn += 1;    }    return fn;}static bool check_for_low_sp_scores(void){    uint t = ceil(sp_cnt * check_percent);    score_sp(sp_scores);	/* get scores */    /* low scoring spam may cause problems ... */    if (sp_scores[t-1] > HAM_CUTOFF)	return false;    if (!quiet) {	fprintf(stderr,		"Warning: test messages include many low scoring spam.\n");	fprintf(stderr,		"         You may wish to reclassify them and rerun.\n");    }    return true;}static void scoring_error(void){    int i;    if (quiet)	return;    printf("    high ham scores:\n");    for (i = 0; i < 10 && ns_scores[i] > SPAM_CUTOFF; i += 1)	printf("      %2d %8.6f\n", i+1, ns_scores[i]);    printf("    low spam scores:\n");    for (i = 0; i < 10 && sp_scores[i] < HAM_CUTOFF; i += 1)	printf("      %2d %8.6f\n", i+1, sp_scores[i]);}#ifdef	TESTstatic char flag(uint idx, uint cnt, uint dlt){    if (dlt == 0)	return ' ';    if (idx < cnt)	return '+';    else	return '-';}#endif#ifdef	TESTstatic void print_ns_scores(uint beg, uint cnt, uint dlt){    uint i, m = min(cnt + dlt, ns_cnt);    printf("ns:\n");    for (i = beg; i <= m; i += 1)	printf("    %3d %0.16f %c\n", i+1, ns_scores[i], flag(i, cnt, dlt));}#endif#ifdef	TESTstatic void print_sp_scores(uint beg, uint cnt, uint dlt){    uint i, m = min(cnt + dlt, sp_cnt);    printf("sp:\n");    for (i = beg; i <= m; i += 1)	printf("    %3d %0.16f %c\n", i+1, sp_scores[i], flag(i, cnt, dlt));}#endifstatic double scale(uint cnt, uint lo, uint hi, double beg, double end){    double ans;    if (cnt < lo)	return beg;    if (cnt > hi)	return end;    ans = beg + (end - beg) * (cnt - lo ) / (hi - lo);    return ans;}/* compute scores and set global variables:****	 target**	 spam_cutoff**** As count increases from 500 to 4000 ...**	1) initial target percent drops from 1% to 0.25%**	2) initial minimum target increases from 5 to 10*/static void set_thresh(uint count, double *scores){    uint   ftarget = 0;    double cutoff, lgc;    score_ns(scores);		/* get scores *//***	Use parabolic curve to fit data**	message counts: (70814, 12645, 2118, 550)**	target values:  (   22,    18,    8,   1)*/    lgc = log(count);    ftarget = max(ROUND(((-0.4831 * lgc) + 12.8976) * lgc - 61.5264, 1.0), 1);    while (1) {	cutoff = ns_scores[ftarget-1];	if (verbose >= PARMS)	    printf("m:  cutoff %8.6f, ftarget %u\n", cutoff, ftarget);	if (ftarget == 1 || cutoff >= MIN_CUTOFF)	    break;	ftarget -= 1;    }    /* ensure cutoff is below SPAM_CUTOFF */    if (cutoff > SPAM_CUTOFF) { 	while (cutoff > SPAM_CUTOFF && ++ftarget < count) { 	    cutoff = scores[ftarget-1];	    if (verbose >= PARMS)		printf("s:  cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);	} 	cutoff = SPAM_CUTOFF;	--ftarget;	if (verbose >= PARMS)	    printf("s:  cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget);    }    if (cutoff < WARN_MIN || cutoff > WARN_MAX) {	if (!quiet) {	    fprintf(stderr,		    "%s high-scoring non-spams in this data set.\n",		    (cutoff < WARN_MIN) ? "Too few" : "Too many");	    fprintf(stderr,		    "At target %u, cutoff is %8.6f.\n", ftarget, cutoff);	}    }#ifdef	TEST    if (verbose >= PARMS)	print_ns_scores(ftarget-4, ftarget+4, 0);#endif    target = ftarget;    spam_cutoff = cutoff;    return;}static void init_count(void){    message_count = 0;}static void print_final_count(void){    if (verbose) {	if (!fMakeCheck)	    printf("\r              \r");	printf("%u messages\n", message_count);
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -