📄 bogotune.c
字号:
/* $Id: bogotune.c,v 1.233 2006/07/04 11:42:38 relson Exp $ *//*****************************************************************************NAME: bogotune.c -- determine optimal parameters for bogofilterAUTHORS: Greg Louis - perl version David Relson - C version Matthias Andree - options parser usability******************************************************************************//* To allow tuning of large corpora, the memory use must be minimized** for each messages. Given the wordlist in ram, foreach token of a** test message bogotune only needs to know its spam and ham counts.**** 1. external wordlist ("-d") flag** a. read wordlist.db** b. read messages for test set** 1. lookup words in wordlist** 2. discard words** c. replace wordhashs with wordcnts** d. de-allocate resident wordlist**** 2. internal wordlist ("-D") flag** a. read all messages** b. distribute messages** 1. proper pct to wordlist** 2. proper pct to test set** 2a. create wordprops from wordlists** c. replace wordprops with wordcnts** d. de-allocate resident wordlist*//* Limitations:**** If all the input messages are in msg-count format,** bogotune will use default ROBX value since an external** wordlist is needed to compute a real robx value.*/#include "common.h"#include <math.h>#include <ctype.h>#include <errno.h>#include <stdlib.h>#include <sys/types.h>#include <sys/stat.h>#include "bogotune.h"#include "bogoconfig.h"#include "bogoreader.h"#include "collect.h"#include "datastore.h"#include "longoptions.h"#include "msgcounts.h"#include "mime.h"#include "mxcat.h"#include "paths.h"#include "robx.h"#include "score.h"#include "token.h"#include "tunelist.h"#include "wordhash.h"#include "wordlists.h"#include "xmalloc.h"#include "xstrdup.h"#undef HAM_CUTOFF /* ignore value in score.h */#undef SPAM_CUTOFF /* ignore value in score.h */#define TEST_COUNT 500u /* minimum allowable message count */#define LIST_COUNT 2000u /* minimum msg count in tunelist */#define PREF_COUNT 4000u /* preferred message count */#define LARGE_COUNT 40000u#define HAM_CUTOFF 0.10#define MIN_HAM_CUTOFF 0.10 /* minimum final ham_cutoff */#define MAX_HAM_CUTOFF 0.45 /* maximum final ham_cutoff */#define MIN_CUTOFF 0.55 /* minimum cutoff for set_thresh() */#define WARN_MIN 0.50 /* warning minimum for set_thresh() */#define WARN_MAX 0.99 /* warning maximum for set_thresh() */#define MAX_CUTOFF 0.99 /* maximum cutoff for set_thresh() */#define SPAM_CUTOFF 0.975#define FP_CUTOFF 0.999#define SCAN_CUTOFF 0.500 /* skip scans if cutoff is less or equal */#define CHECK_PCT 0.01 /* for checking high scoring non-spam ** and low scoring spam *//* bogotune's default parameters */#define DEFAULT_ROBS ROBS /* 0.0178 */ #define DEFAULT_ROBX ROBX /* 0.5200 */#define DEFAULT_MIN_DEV 0.02/* coarse scan parms */#define MD_MIN_C 0.06 /* smallest min_dev to test */#define MD_MAX_C 0.38 /* largest min_dev to test */#define MD_DLT_C 0.08 /* increment *//* fine scan parms */#define MD_MIN_F 0.02#define MD_MAX_F MD_MAX_C+MD_DLT_F#define MD_DLT_F 0.014/* robx limits */#define RX_MIN 0.4#define RX_MAX 0.6enum e_verbosity { SUMMARY = 1, /* summarize main loop iterations */ TIME = 2, /* include timing info */ PARMS = 3, /* print parameter sets (rs, md, rx) */ SCORE_SUMMARY = 5, /* verbosity level for printing scores */ SCORE_DETAIL /* verbosity level for printing scores */};typedef enum e_ds_loc { DS_NONE = 0, /* datastore locn not specified */ DS_ERR = 1, /* error in datastore locn spec */ DS_DSK = 2, /* datastore on disk */ DS_RAM = 4 /* datastore in ram */} ds_loc;#define MOD(n,m) ((n) - (floor((n)/(m)))*(m))#define ROUND(m,n) floor((m)*(n)+.5)/(n)#define MIN(n) ((n)/60)#define SECONDS(n) ((n) - MIN(n)*60)#define KEY(r) ((r)->fn + (((r)->co > 0.5) ? (r)->co : 0.99))#define ESF_SEL(a,b) (!esf_flag ? (a) : (b))/* Global Variables */const char *progname = "bogotune";static char *ds_path; /* directory */static ds_loc ds_flag = DS_NONE;static void *env = NULL;static bool esf_flag = true; /* test ESF factors if true */static bool exit_zero = false; /* non-error exits zero */static char *bogolex_file = NULL; /* non-NULL if creating msg-count output */static word_t *w_msg_count;static uint message_count;static wordhash_t *train;static tunelist_t *ns_and_sp;static tunelist_t *ns_msglists, *sp_msglists;static flhead_t *spam_files, *ham_files;typedef struct { uint cnt; double *data;} data_t;static data_t *rsval;static data_t *rxval;static data_t *mdval;static data_t *spexp;static data_t *nsexp;static uint target;static uint ns_cnt, sp_cnt;static double spex, nsex;static double check_percent; /* initial check for excessively high/low scores */static double *ns_scores;static double *sp_scores;static double user_robx = 0.0; /* from option '-r value' */static uint coerced_target = 0; /* user supplied with '-T value' */static uint ncnt, nsum; /* neighbor count and sum - for gfn() averaging */#undef TEST#ifdef TESTuint test = 0;#endifbool fMakeCheck = false; /* allows quick & dirty regression testing */uint cMakeCheck = 50; /* ... for 50 cycles *//* Function Declarations */static void process_bogotune_arg(int option);/* Function Definitions */static void bt_trap(void) {}static int get_cnt(double fst, double lst, double amt){ int cnt = (fabs(lst - fst) + EPS) / (fabs(amt) - EPS) + 1; return cnt;}static data_t *seq_by_amt(double fst, double lst, double amt){ uint i; data_t *val = xcalloc(1, sizeof(data_t)); val->cnt = get_cnt(fst, lst, amt); val->data = xcalloc(val->cnt, sizeof(double)); for (i = 0; i < val->cnt; i += 1) val->data[i] = fst + i * amt; return val;}static data_t *seq_canonical(double fst, double amt){ uint c; float v; uint i = 0; data_t *val = xcalloc(1, sizeof(data_t)); val->data = xcalloc(5, sizeof(double)); fst = max(fst, RX_MIN); fst = min(fst, RX_MAX); val->data[i++] = fst; for (c = 1; c <= 2; c += 1) { v = fst - amt * c; if (v >= RX_MIN) val->data[i++] = v; v = fst + amt * c; if (v <= RX_MAX) val->data[i++] = v; } val->cnt = i; return val;}static data_t *seq_by_pow(double fst, double lst, double amt){ uint i; data_t *val = xcalloc(1, sizeof(data_t)); val->cnt = get_cnt(fst, lst, amt); val->data = xcalloc(val->cnt, sizeof(double)); for (i = 0; i < val->cnt; i += 1) val->data[i] = pow(10, fst + i * amt); return val;}static void data_free(data_t *val){ xfree(val->data); xfree(val);}static void init_coarse(double _rx){ rxval = seq_canonical(_rx, 0.05); rsval = seq_by_pow(0.0, -2.0, -1.0); mdval = seq_by_amt(MD_MIN_C, MD_MAX_C, MD_DLT_C); spexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0); nsexp = seq_by_amt(ESF_SEL(0.0,2.0), ESF_SEL(0.0,20.0), 3.0);}static void init_fine(double _rs, double _md, double _rx, double _spex, double _nsex){ double s0, s1; s0 = max(log10(_rs) - 0.5, -2.0); s1 = min(log10(_rs) + 0.5, 0.0); rsval = seq_by_pow(s1, s0, -0.25); s0 = max(_md - 0.042, MD_MIN_F); s1 = min(_md + 0.042, MD_MAX_F); mdval = seq_by_amt(s0, s1, MD_DLT_F); rxval = seq_canonical(_rx, 0.013); s0 = max(_spex - 1.5, 0.0); s1 = min(_spex + 1.5, 20.0); spexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5); s0 = max(_nsex - 1.5, 0.0); s1 = min(_nsex + 1.5, 20.0); nsexp = seq_by_amt(s0, ESF_SEL(s0,s1), 0.5);}static void print_parms(const char *label, const char *format, data_t *data){ uint i; printf(" %s: %2u ", label, data->cnt); for (i = 0; i < data->cnt; i += 1) { printf("%s", (i == 0) ? " (" : ", "); printf(format, data->data[i]); /* RATS: ignore */ } printf(")\n"); fflush(stdout);}static void print_all_parms(uint r_count){ if (verbose >= PARMS) { print_parms("rsval", "%6.4f", rsval); print_parms("rxval", "%5.3f", rxval); print_parms("mdval", "%5.3f", mdval); print_parms("spexp", "%5.2f", spexp); print_parms("nsexp", "%5.2f", nsexp); } if (verbose >= TIME) printf("cnt: %u (rs: %u, rx: %u, md: %u spex: %u, nsex: %u)\n", r_count, rsval->cnt, rxval->cnt, mdval->cnt, spexp->cnt, nsexp->cnt);}static int compare_ascending(const void *const ir1, const void *const ir2){ const double d1 = *(const double *)ir1; const double d2 = *(const double *)ir2; if (d1 - d2 > 0.0) return 1; if (d2 - d1 > 0.0) return -1; return 0;}static int compare_descending(const void *const ir1, const void *const ir2){ const double d1 = *(const double *)ir1; const double d2 = *(const double *)ir2; if (d1 - d2 > 0.0) return -1; if (d2 - d1 > 0.0) return +1; return 0;}#define CO(r) ((r->co > 0.5) ? r->co : 0.99)static int compare_results(const void *const ir1, const void *const ir2){ result_t const *r1 = (result_t const *)ir1; result_t const *r2 = (result_t const *)ir2; if (KEY(r1) > KEY(r2)) return 1; if (KEY(r2) > KEY(r1)) return -1; /* Favor cutoffs > 0.5 */ if (CO(r1) > CO(r2) ) return 1; if (CO(r2) > CO(r1) ) return -1; if (r1->idx > r2->idx ) return 1; if (r2->idx > r1->idx ) return -1; return 0;}/* Score all non-spam */static void score_ns(double *results){ uint i; uint count = 0; if (verbose >= SCORE_DETAIL) printf("ns:\n"); verbose = -verbose; /* disable bogofilter debug output */ for (i = 0; i < COUNTOF(ns_msglists->u.sets); i += 1) { mlhead_t *list = ns_msglists->u.sets[i]; mlitem_t *item; for (item = list->head; item != NULL; item = item->next) { wordhash_t *wh = item->wh; double score = msg_compute_spamicity(wh, NULL); results[count++] = score; if ( -verbose == SCORE_DETAIL || (-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS)) printf("%6u %0.16f\n", count-1, score); } } verbose = -verbose; /* enable bogofilter debug output */ qsort(results, count, sizeof(double), compare_descending); return;}static bool check_for_high_ns_scores(void){ uint t = ceil(ns_cnt * check_percent); score_ns(ns_scores); /* scores in descending order */ /* want at least 1 high scoring non-spam for FP determination */ if (ns_scores[t-1] < SPAM_CUTOFF) return false; if (!quiet) { fprintf(stderr, "Warning: test messages include many high scoring nonspam.\n"); fprintf(stderr, " You may wish to reclassify them and rerun.\n"); } return true;}/* Score all spam to determine false negative counts */static void score_sp(double *results){ uint i; uint count = 0; if (verbose >= SCORE_DETAIL) printf("sp:\n"); verbose = -verbose; /* disable bogofilter debug output */ for (i = 0; i < COUNTOF(sp_msglists->u.sets); i += 1) { mlhead_t *list = sp_msglists->u.sets[i]; mlitem_t *item; for (item = list->head; item != NULL; item = item->next) { wordhash_t *wh = item->wh; double score = msg_compute_spamicity(wh, NULL); results[count++] = score; if ( -verbose == SCORE_DETAIL || (-verbose >= SCORE_DETAIL && EPS < score && score < 1 - EPS)) printf("%6u %0.16f\n", count-1, score); } } verbose = -verbose; /* enable bogofilter debug output */ qsort(results, count, sizeof(double), compare_ascending); return;}static uint get_fn_count(uint count, double *results){ uint i; uint fn = 0; for (i = 0; i < count; i += 1) { double score = results[i]; if (score < spam_cutoff) fn += 1; } return fn;}static bool check_for_low_sp_scores(void){ uint t = ceil(sp_cnt * check_percent); score_sp(sp_scores); /* get scores */ /* low scoring spam may cause problems ... */ if (sp_scores[t-1] > HAM_CUTOFF) return false; if (!quiet) { fprintf(stderr, "Warning: test messages include many low scoring spam.\n"); fprintf(stderr, " You may wish to reclassify them and rerun.\n"); } return true;}static void scoring_error(void){ int i; if (quiet) return; printf(" high ham scores:\n"); for (i = 0; i < 10 && ns_scores[i] > SPAM_CUTOFF; i += 1) printf(" %2d %8.6f\n", i+1, ns_scores[i]); printf(" low spam scores:\n"); for (i = 0; i < 10 && sp_scores[i] < HAM_CUTOFF; i += 1) printf(" %2d %8.6f\n", i+1, sp_scores[i]);}#ifdef TESTstatic char flag(uint idx, uint cnt, uint dlt){ if (dlt == 0) return ' '; if (idx < cnt) return '+'; else return '-';}#endif#ifdef TESTstatic void print_ns_scores(uint beg, uint cnt, uint dlt){ uint i, m = min(cnt + dlt, ns_cnt); printf("ns:\n"); for (i = beg; i <= m; i += 1) printf(" %3d %0.16f %c\n", i+1, ns_scores[i], flag(i, cnt, dlt));}#endif#ifdef TESTstatic void print_sp_scores(uint beg, uint cnt, uint dlt){ uint i, m = min(cnt + dlt, sp_cnt); printf("sp:\n"); for (i = beg; i <= m; i += 1) printf(" %3d %0.16f %c\n", i+1, sp_scores[i], flag(i, cnt, dlt));}#endifstatic double scale(uint cnt, uint lo, uint hi, double beg, double end){ double ans; if (cnt < lo) return beg; if (cnt > hi) return end; ans = beg + (end - beg) * (cnt - lo ) / (hi - lo); return ans;}/* compute scores and set global variables:**** target** spam_cutoff**** As count increases from 500 to 4000 ...** 1) initial target percent drops from 1% to 0.25%** 2) initial minimum target increases from 5 to 10*/static void set_thresh(uint count, double *scores){ uint ftarget = 0; double cutoff, lgc; score_ns(scores); /* get scores *//*** Use parabolic curve to fit data** message counts: (70814, 12645, 2118, 550)** target values: ( 22, 18, 8, 1)*/ lgc = log(count); ftarget = max(ROUND(((-0.4831 * lgc) + 12.8976) * lgc - 61.5264, 1.0), 1); while (1) { cutoff = ns_scores[ftarget-1]; if (verbose >= PARMS) printf("m: cutoff %8.6f, ftarget %u\n", cutoff, ftarget); if (ftarget == 1 || cutoff >= MIN_CUTOFF) break; ftarget -= 1; } /* ensure cutoff is below SPAM_CUTOFF */ if (cutoff > SPAM_CUTOFF) { while (cutoff > SPAM_CUTOFF && ++ftarget < count) { cutoff = scores[ftarget-1]; if (verbose >= PARMS) printf("s: cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget); } cutoff = SPAM_CUTOFF; --ftarget; if (verbose >= PARMS) printf("s: cutoff %8.6f, ftarget %u%%\n", cutoff, ftarget); } if (cutoff < WARN_MIN || cutoff > WARN_MAX) { if (!quiet) { fprintf(stderr, "%s high-scoring non-spams in this data set.\n", (cutoff < WARN_MIN) ? "Too few" : "Too many"); fprintf(stderr, "At target %u, cutoff is %8.6f.\n", ftarget, cutoff); } }#ifdef TEST if (verbose >= PARMS) print_ns_scores(ftarget-4, ftarget+4, 0);#endif target = ftarget; spam_cutoff = cutoff; return;}static void init_count(void){ message_count = 0;}static void print_final_count(void){ if (verbose) { if (!fMakeCheck) printf("\r \r"); printf("%u messages\n", message_count);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -