📄 bogotune.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
📖 第 1 页 / 共 3 页
字号:
	fflush(stdout);    }}static int update_count(void){    message_count += 1;    if (verbose && (message_count % 100) == 0 && !fMakeCheck) {	if ((message_count % 1000) != 0)	    putchar('.');	else	    printf("\r              \r%u ", message_count/1000 );	fflush(stdout);    }    return message_count;}static unsigned int calc_db_cachesize(void){    struct stat fst;    if (!stat(ds_path, &fst)) {	int dbc = ceil((double)fst.st_size / (3 * 1024 * 1024));        return ((unsigned int)dbc);    } else {        fprintf(stderr, "Unable to stat %s\n", ds_path);        exit (EX_ERROR);    }}static void load_wordlist(ds_foreach_t *hook, void *userdata){    bfpath *bfp = bfpath_create(ds_path);    if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {	fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);	exit(EX_ERROR);    }    if (verbose) {	printf("Reading %s\n", ds_path);	fflush(stdout);    }    ds_oper(env, bfp, DS_READ, hook, userdata);    bfpath_free(bfp);    return;}static int load_hook(word_t *key, dsv_t *data, void *userdata)/* returns 0 if ok, 1 if not ok */{    wordprop_t *tokenprop = wordhash_insert(train, key, sizeof(wordprop_t), &wordprop_init);    (void) userdata;	/* quiet compiler complaint */    tokenprop->cnts.bad = data->spamcount;    tokenprop->cnts.good = data->goodcount;    if (word_cmps(key, ".MSG_COUNT") == 0)	set_msg_counts(data->goodcount, data->spamcount);    if (word_cmps(key, ".ENCODING") == 0) {	if (encoding == E_UNKNOWN)	    encoding = data->spamcount;	if (encoding != data->spamcount) {	    fprintf(stderr, "Can't mix database encodings, i.e. utf-8 and any other.\n");	    exit(EX_ERROR);	}    }    return 0;}static void set_train_msg_counts(wordhash_t *wh){    wordprop_t *count;    count = wordhash_insert(wh, w_msg_count, sizeof(wordprop_t), NULL);    if (msgs_good == 0 && msgs_bad == 0) {	fprintf(stderr, "Can't find '.MSG_COUNT'.\n");	exit(EX_ERROR);    }}/* write_msgcount_file()****	Create a message count file from the original messages*/static void print_msgcount_entry(const char *token, uint bad, uint good){    printf( "\"%s\" %u %u\n", token, bad, good);}static void write_msgcount_file(wordhash_t *wh){    hashnode_t *hn;    print_msgcount_entry(".MSG_COUNT", msgs_bad, msgs_good);    for (hn = wordhash_first(wh); hn != NULL; hn = wordhash_next(wh)) {	word_t *token = hn->key;	wordprop_t *wp = (wordprop_t *) hn->buf;	wordcnts_t *cnts = &wp->cnts;	if (cnts->good == 0 && cnts->bad == 0) {	    wp = wordhash_search(train, token, 0);	    if (wp) {		cnts->good = wp->cnts.good;		cnts->bad  = wp->cnts.bad;	    }	}	print_msgcount_entry((char *)token->text, cnts->bad, cnts->good);    }    return;}static uint read_mailbox(char *arg, mlhead_t *msgs){    if (verbose) {	printf("Reading %s\n", arg);	fflush(stdout);    }    init_count();    mbox_mode = true;    bogoreader_init(1, &arg);    while ((*reader_more)()) {	wordhash_t *whp = NULL;	wordhash_t *whc = wordhash_new();	collect_words(whc);	if (ds_path != NULL && (msgs_good + msgs_bad) == 0)	    set_train_msg_counts(whc);	if (whc->count == 0 && !quiet) {	    printf("msg #%u, count is %u\n", message_count, whc->count);	    bt_trap();	}	if (bogolex_file != NULL) {	    wordhash_sort(whc);	    lookup_words(whc);	    write_msgcount_file(whc);	}	else if (whc->count != 0) {	    if (!msg_count_file)		whp = convert_wordhash_to_propslist(whc, train);	    else		whp = convert_propslist_to_countlist(whc);	    msglist_add(msgs, whp);	}	update_count();		if (whc != whp)	    wordhash_free(whc);    }    print_final_count();    ns_and_sp->count += message_count;    bogoreader_fini();    return message_count;}static uint filelist_read(int mode, flhead_t *list){    uint count = 0;    flitem_t *item;    mlhead_t *msgs = (mode == REG_GOOD) ? ns_msglists->msgs : sp_msglists->msgs;    run_type = mode;    for (item = list->head; item != NULL; item = item->next) {	lexer = NULL;	msg_count_file = false;	count += read_mailbox(item->name, msgs);    }    return count;}/* distribute()****	Proportionally distribute messages between training and scoring sets.****   Method:**	If only 2500 messages, use 2000 for training and 500 for scoring.**	If over 4000 messages, use equal numbers for training and scoring.**	In between 2500 and 4000, do a proportional distribution.*/static void distribute(int mode, tunelist_t *ns_or_sp){    int good = mode == REG_GOOD;    int bad  = 1 - good;    bool divvy = ds_flag == DS_RAM && user_robx < EPS && !msg_count_file;    mlitem_t *item;    mlhead_t *msgs = ns_or_sp->msgs;    int score_count = 0;    int train_count = 0;    static int train_good = 0;    static int train_bad  = 0;    double ratio = scale(msgs->count,			 LIST_COUNT + TEST_COUNT,	/* small count */			 LIST_COUNT + LIST_COUNT,	/* large count */			 LIST_COUNT / TEST_COUNT,	/* small ratio */			 LIST_COUNT / LIST_COUNT);	/* large ratio */    for (item = msgs->head; item != NULL; item = item->next) {	wordhash_t *wh = item->wh;	/* training set */	if (divvy && train_count / ratio < score_count + 1) {	    wordhash_set_counts(wh, good, bad);	    wordhash_add(train, wh, &wordprop_init);	    train_count += 1;	    wordhash_free(wh);	    train_good += good;	    train_bad  += bad;	}	/* scoring set  */	else {	    uint bin = divvy ? MOD(score_count,3) : 0;	    msglist_add(ns_or_sp->u.sets[bin], wh);	    score_count += 1;	}	item->wh = NULL;    }    if (divvy) {	wordhash_insert(train, w_msg_count, sizeof(wordprop_t), &wordprop_init);	set_msg_counts(train_good, train_bad);    }    if (verbose > 1)	printf("%s:  train_count = %d, score_count = %d\n",	       good ? "ns" : "sp",	       train_count, score_count);    return;}static void create_countlists(tunelist_t *ns_or_sp){    uint i;    uint c = COUNTOF(ns_or_sp->u.sets);    for (i = 0; i < c; i += 1) {	mlhead_t *list = ns_or_sp->u.sets[i];	mlitem_t *item;	for (item = list->head; item != NULL; item = item->next) {	    wordhash_t *who = item->wh;	    wordhash_t *whn = convert_propslist_to_countlist(who);	    if (whn != who) {		wordhash_free(who);		item->wh = whn;	    }	}    }    return;}static void print_version(void){    (void)fprintf(stderr,		  "%s version %s\n"		  "    Database: %s\n"		  "Copyright (C) 2002-2006 Greg Louis, David Relson\n\n"		  "%s comes with ABSOLUTELY NO WARRANTY.  "		  "This is free software, and\nyou are welcome to "		  "redistribute it under the General Public License.  "		  "See\nthe COPYING file with the source distribution for "		  "details.\n"		  "\n",		  progtype, version, ds_version_str(), PACKAGE);}static void help(void){    (void)fprintf(stderr,		  "Usage:  %s [options] { -c config } { -d directory } -n non-spam-file -s spam-file\n",		  progname);    (void)fprintf(stderr,		  "\t  -h      - print this help message.\n"		  "\t  -C      - don't read standard config files.\n"		  "\t  -c file - read specified config file.\n"		  "\t  -D      - don't read a wordlist file.\n"		  "\t  -d path - specify directory for wordlists.\n"		  "\t  -E      - disable ESF (effective size factor) tuning.\n"		  "\t  -M file - rewrite input file in message count format.\n"		  "\t  -r num  - specify robx value\n");    (void)fprintf(stderr,		  "\t  -T num  - specify fp target value\n"		  "\t  -s file1 file2 ... - spam files\n"		  "\t  -n file1 file2 ... - non-spam files\n"		  "\t  -v      - increase level of verbose messages\n"		  "\t  -q      - quiet (suppress warnings)\n"	);    (void)fprintf(stderr,		  "\n"		  "%s (version %s) is part of the bogofilter package.\n",		  progname, version);}static struct option longopts_bogotune[] = {    /* longoptions.h - common options */    LONGOPTIONS_COMMON    /* longoptions.h - bogofilter/-lexer options */    LONGOPTIONS_LEX    /* end of list */    { NULL,				0, 0, 0 }};static int process_arglist(int argc, char **argv){    int  count = 1;    bulk_mode = B_CMDLINE;#ifdef __EMX__    _response (&argc, &argv);	/* expand response files (@filename) */    _wildcard (&argc, &argv);	/* expand wildcards (*.*) */#endif#define	OPTIONS	":-:c:Cd:DeEM:n:qr:s:tT:vVx:"    while (1)    {	int option;	int option_index = 0;	int this_option_optind = optind ? optind : 1;	const char *name;	option = getopt_long(argc, argv, OPTIONS,			     longopts_bogotune, &option_index);	if (option == -1) 	    break;	name = (option_index == 0) ? argv[this_option_optind] : longopts_bogotune[option_index].name;	process_bogotune_arg(option);    }    if (ds_flag == DS_NONE)	/* default is "wordlist on disk" */	ds_flag = DS_DSK;    if (ds_flag == DS_ERR) {	fprintf(stderr, "Only one '-d dir' or '-D' option is allowed.\n");	exit(EX_ERROR);    }    if (bogolex_file == NULL &&	(spam_files->count == 0 || ham_files->count == 0)) {	fprintf(stderr,		"Bogotune needs both non-spam and spam message sets for its parameter testing.\n");	exit(EX_ERROR);    }    if (!suppress_config_file)	process_config_files(false, longopts_bogotune);    return count;}static void process_bogotune_arg(int option){    static int lastmode = -1;    if (option == 1) {	/* If getopt's RETURN_IN_ORDER behavior */	switch (lastmode) {	case 'n':	case 's':	    option = lastmode;	    break;	default:	    fprintf(stderr,		    "File names may only be given after -n or -s options.\n");	}    }    switch (option) {    case 'c':	read_config_file(optarg, false, false, PR_CFG_USER, longopts_bogotune);	/* FALLTHROUGH */    case 'C':	suppress_config_file = true;	break;    case 'd':	ds_path = xstrdup(optarg);	ds_flag = (ds_flag == DS_NONE) ? DS_DSK : DS_ERR;	break;    case 'D':	ds_flag = (ds_flag == DS_NONE) ? DS_RAM : DS_ERR;	break;    case 'e':	exit_zero = true;	break;    case 'E':	esf_flag ^= true;	break;    case 'M':	bogolex_file = optarg;	break;    case 'n':	lastmode = 'n';	filelist_add(ham_files, optarg);	break;    case 'q':	quiet = true;	break;    case 'r':	user_robx = atof(optarg);	break;    case 's':	lastmode = 's';	filelist_add(spam_files, optarg);	break;#ifdef	TEST    case 't':	test += 1;	break;#endif    case 'T':	coerced_target = atoi(optarg);	break;    case 'v':	verbose += 1;	break;    case 'V':	print_version();	exit(EX_OK);    case 'x':	if (strcmp(optarg, "MakeCheck") == 0)	    fMakeCheck = true;	else	    set_debug_mask( optarg );	break;    case O_MAX_TOKEN_LEN:	max_token_len = atoi(optarg);	break;    case O_MIN_TOKEN_LEN:	min_token_len = atoi(optarg);	break;    case O_MAX_MULTI_TOKEN_LEN:	max_multi_token_len=atoi(optarg);	break;    case O_MULTI_TOKEN_COUNT:	multi_token_count=atoi(optarg);	break;    default:	help();	exit(EX_ERROR);    }}static double get_robx(void){    double rx;    if (user_robx > 0.0)	rx = user_robx;    else if (ds_flag == DS_DSK)	{	printf("Calculating initial x value...\n");	verbose = -verbose;		/* disable bogofilter debug output */	rx = compute_robinson_x();	verbose = -verbose;		/* enable bogofilter debug output */    }    else	rx = ROBX;    if (rx > RX_MAX) rx = RX_MAX;    if (rx < RX_MIN) rx = RX_MIN;    printf("Initial x value is %8.6f\n", rx);    return rx;}static result_t *results_sort(uint r_count, result_t *results){    result_t *ans = xcalloc(r_count, sizeof(result_t));    memcpy(ans, results, r_count * sizeof(result_t));    qsort(ans, r_count, sizeof(result_t), compare_results);    return ans;}static void top_ten(result_t *sorted, uint n){    uint i, j;    bool f;    printf("Top ten parameter sets from this scan:\n");    printf("        rs     md    rx    spesf    nsesf    co     fp  fn   fppc   fnpc\n");    for (f = false; !f; f = true) {      for (i = j = 0; i < 10 && j < n;) { 	result_t *r = &sorted[j++]; 	if (!f && r->fp != target) continue;	sp_esf = ESF_SEL(sp_esf, pow(0.75, r->sp_exp));	ns_esf = ESF_SEL(ns_esf, pow(0.75, r->ns_exp));	printf("%5u %6.4f %5.3f %5.3f %8.6f %8.6f %6.4f  %3u %3u  %6.4f %6.4f\n",	       r->idx, r->rs, r->md, r->rx, sp_esf, ns_esf, r->co,	       r->fp, r->fn, r->fp*100.0/ns_cnt, r->fn*100.0/sp_cnt);	++i;      }      if (i) break;      printf("Warning: fp target not met, using original results\n");    }    printf("\n");    fflush(stdout);    return;}/* get false negative */static int gfn(result_t *results,	       uint rsi, uint mdi, uint rxi,	       uint spi, uint nsi){    uint i = (((rsi * mdval->cnt + mdi) * rxval->cnt + rxi) * spexp->cnt + spi) * nsexp->cnt + nsi;    result_t *r = &results[i];    int fn = r->fn;    if (r->fp != target) return INT_MAX;    if (verbose > 100)	printf("   %2u, %2u, %2u, %2u, %2u, %2d\n",	       rsi, mdi, rxi, spi, nsi, fn);    ncnt += 1;    nsum += fn;    return fn;}static result_t *count_outliers(uint r_count, result_t *sorted, result_t *unsorted){    bool f = false;    uint i, j = 0, o = 0;    uint fn;    uint rsi, mdi, rxi, spi, nsi;    uint rsc = rsval->cnt - 1;    uint rxc = rxval->cnt - 1;    uint mdc = mdval->cnt - 1;    uint spc = spexp->cnt - 1;    uint nsc = nsexp->cnt - 1;    result_t *r = NULL;					/* quench bogus compiler warning */    uint q33 = sorted[r_count * 33 / 100].fn;		/* 33% quantile */    uint med = sorted[r_count * 50 / 100].fn;		/* median false negative */    if (verbose)	printf("%u%% fn count was %u\n", 50u, med);    for (i = 0; i < r_count; i += 1) {	r = &sorted[i];	if (r->fp != target) continue;	if (j == 0) j = i+1;	if (fMakeCheck && j >= cMakeCheck) break;	rsi = r->rsi; mdi = r->mdi; rxi = r->rxi; spi = r->spi; nsi = r->nsi;	ncnt = nsum = 0;	if (((rsi == 0   ||	      (fn = gfn(unsorted, rsi-1, mdi, rxi, spi, nsi)) < med)) &&	    ((rsi == rsc ||	      (fn = gfn(unsorted, rsi+1, mdi, rxi, spi, nsi)) < med)) &&	    ((mdi == 0   ||	      (fn = gfn(unsorted, rsi, mdi-1, rxi, spi, nsi)) < med)) &&	    ((mdi == mdc ||	      (fn = gfn(unsorted, rsi, mdi+1, rxi, spi, nsi)) < med)) &&	    ((rxi == 0   ||	      (fn = gfn(unsorted, rsi, mdi, rxi-1, spi, nsi)) < med)) &&	    ((rxi == rxc ||	      (fn = gfn(unsorted, rsi, mdi, rxi+1, spi, nsi)) < med)) &&	    ((spi == 0   ||	      (fn = gfn(unsorted, rsi, mdi, rxi, spi-1, nsi)) < med)) &&	    ((spi == spc ||	      (fn = gfn(unsorted, rsi, mdi, rxi, spi+1, nsi)) < med)) &&	    ((nsi == 0   ||	      (fn = gfn(unsorted, rsi, mdi, rxi, spi, nsi-1)) < med)) &&	    ((nsi == nsc ||	      (fn = gfn(unsorted, rsi, mdi, rxi, spi, nsi+1)) < med)) &&	    (nsum / ncnt <  q33))	{	    f = true;	    break;	}	o++;    }    if (o > 0) {	printf("%u outlier%s encountered.                                                   \n",	       o, (o > 1) ? "s" : "");    }    if (!f) {	r = &sorted[j-1];	printf("No smooth minimum encountered, using lowest fn count (an outlier).         \n");    }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -