📄 bogotune.c

📁 一个C语言写的快速贝叶斯垃圾邮件过滤工具
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
    return r;}static void progress(uint cur, uint top){    uint i;    uint ndots = ceil(70.0 * cur / top);    if (quiet)	return;    if (ndots < 1)	ndots = 1;     printf("\r%3u [", cur);     for (i = 0; i < ndots; i += 1)	 printf(".");     for (i = ndots; i < 70; i += 1)	 printf(" ");     printf("]");     fflush(stdout);}static void final_warning(void){    printf(	"The small number and/or relative uniformity of the test messages imply\n"	"that the recommended values (above), though appropriate to the test set,\n"	"may not remain valid for long.  Bogotune should be run again with more\n"	"messages when that becomes possible.\n"	);}static void final_recommendations(bool skip){    uint m;    bool printed = false;    uint minn[] = { 10000, 2000, 1000, 500, 1 };    printf("Performing final scoring:\n");    printf("Spam...  ");    score_sp(sp_scores);	/* get scores (in ascending order) */    printf("Non-Spam...\n");    score_ns(ns_scores);	/* get scores (in descending order) */    for (m=0; m<10; ++m) printf("%8.6f %8.6f\n", sp_scores[m], ns_scores[m]);    if (verbose >= PARMS)	printf("# ns_cnt %u, sp_cnt %u\n", ns_cnt, sp_cnt);    if (skip) {	printf("\n");	printf("### The following recommendations are provisional.\n");	printf("### Run bogotune with more messages when possible.\n");	printf("\n");    }    printf("\nRecommendations:\n\n");    printf("---cut---\n");    printf("db_cachesize=%u\n", db_cachesize);    printf("robs=%6.4f\n", robs);    printf("min_dev=%5.3f\n", min_dev);    printf("robx=%8.6f\n", robx);    printf("sp_esf=%8.6f\n", sp_esf);    printf("ns_esf=%8.6f\n", ns_esf);    for (m=0; m < COUNTOF(minn); m += 1) {	double cutoff;	uint i, fp = 0, fn = 0;	uint mn = minn[m];	double fpp, fnp;	if (ns_cnt < mn)	    continue;	if (mn > 1 ) {	    uint t = (ns_cnt + mn - 1) / mn;	    cutoff = ns_scores[t-1];	    if (cutoff > FP_CUTOFF)		continue;	    fp = ns_cnt / mn;	    fpp = 100.0 / mn;	}	else {	    cutoff = SPAM_CUTOFF;	    if (printed)		break;	    for (i = 0; i < ns_cnt; i += 1) {		if (ns_scores[i] >= cutoff)		    fp += 1;	    }	    cutoff = ns_scores[fp-1];	    fpp = 100.0 * fp / ns_cnt;	}	for (i = 0; i < sp_cnt; i += 1) {	    if (sp_scores[i] >= cutoff) {		fn = i;		break;	    }	}	fnp = 100.0 * fn / sp_cnt;	if (printed)  printf("#");	printf("spam_cutoff=%8.6f\t# for %4.2f%% fp (%u); expect %4.2f%% fn (%u).\n",	       cutoff, fpp, fp, fnp, fn);	printed = true;	if (skip)	    ham_cutoff = cutoff;    }    if (!skip) {	uint s = ceil(sp_cnt * 0.002 - 1);	ham_cutoff = sp_scores[s];	if (ham_cutoff < MIN_HAM_CUTOFF) ham_cutoff = MIN_HAM_CUTOFF;	if (ham_cutoff > MAX_HAM_CUTOFF) ham_cutoff = MAX_HAM_CUTOFF;    }    printf("ham_cutoff=%5.3f\t\n", ham_cutoff);    printf("---cut---\n");    printf("\n");    if (skip)	final_warning();    printf("Tuning completed.\n");}static void bogotune_init(void){    const char *msg_count = MSG_COUNT;    w_msg_count = word_news(msg_count);    train       = wordhash_new();    ns_and_sp   = tunelist_new("tr");		/* training lists */    ns_msglists = tunelist_new("ns");		/* non-spam scoring lists */    sp_msglists = tunelist_new("sp");		/* spam     scoring lists */    return;}static void bogotune_free(void){    xfree(ns_scores);    xfree(sp_scores);    filelist_free(ham_files);    filelist_free(spam_files);    tunelist_free(ns_msglists);    tunelist_free(sp_msglists);    tunelist_free(ns_and_sp);    word_free(w_msg_count);    token_cleanup();    mime_cleanup();    xfree(ds_path);    return;}static bool check_msgcount_parms(void){    bool ok = true;    if (ds_flag == DS_RAM) {	fprintf(stderr, "A wordlist directory must be specified for converting messages to the message count format.\n");	ok = false;    }    if (ham_files->count != 0 && spam_files->count != 0) {	fprintf(stderr, "Message count files may be created from spam or non-spam inputs but not both.\n");	fprintf(stderr, "Run bogotune once for the spam and again for the non-spam.\n");	ok = false;    }    return ok;}static bool check_msg_counts(void){    bool ok = true;    if (msgs_good < LIST_COUNT || msgs_bad < LIST_COUNT) {	if (!quiet)	    fprintf(stderr,		    "The wordlist contains %u non-spam and %u spam messages.\n"		    "Bogotune must be run with at least %u of each.\n",		    msgs_good, msgs_bad, LIST_COUNT);	ok = false;    }    if (msgs_bad * 5.0 < msgs_good ||	msgs_bad > msgs_good * 5.0) {	if (!quiet)	    fprintf(stderr,		    "The wordlist has a ratio of spam to non-spam of %0.1f to 1.0.\n"		    "Bogotune requires the ratio be in the range of 0.2 to 5.\n",		    (double)msgs_bad / msgs_good);	ok = false;    }    if (ns_cnt < TEST_COUNT || sp_cnt < TEST_COUNT) {	if (!quiet)	    fprintf(stderr,		    "The messages sets contain %u non-spam and %u spam.  Bogotune "		    "requires at least %u non-spam and %u spam messages to run.\n",		    ns_cnt, sp_cnt, TEST_COUNT, TEST_COUNT);	exit(EX_ERROR);    }    return ok;}static void show_elapsed_time(int beg, int end, uint cnt, double val,			      const char *lbl1, const char *lbl2){    int tm = end - beg;    if (!fMakeCheck)	printf("    %dm:%02ds for %u %s.  avg: %.1f %s\n",	       MIN(tm), SECONDS(tm), cnt, lbl1, val, lbl2);}static rc_t bogolex(void){    rc_t status = RC_OK;    if (!check_msgcount_parms())	exit(EX_ERROR);    read_mailbox(bogolex_file, NULL);    return status;}static rc_t bogotune(void){    bool skip;    result_t *best;    int beg, end;    uint cnt, scan;    rc_t status = RC_OK;    beg = time(NULL);    ham_cutoff = 0.0;    spam_cutoff = 0.1;    /* Note: memory usage highest while reading messages */    /* usage decreases as distribute() converts to count format */    /* read all messages, merge training sets, look up scoring sets */    ns_cnt = filelist_read(REG_GOOD, ham_files);    sp_cnt = filelist_read(REG_SPAM, spam_files);    cnt = ns_cnt + sp_cnt;    end = time(NULL);    if (verbose >= TIME) {	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");    }    distribute(REG_GOOD, ns_msglists);    distribute(REG_SPAM, sp_msglists);    create_countlists(ns_msglists);    create_countlists(sp_msglists);    if (verbose >= TIME && time(NULL) - end > 2) {	end = time(NULL);	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");    }    if (verbose > PARMS+1) {	tunelist_print(ns_and_sp);	tunelist_print(ns_msglists);	tunelist_print(sp_msglists);    }    ns_cnt = count_messages(ns_msglists);    sp_cnt = count_messages(sp_msglists);    cnt = ns_cnt + sp_cnt;    if (ds_flag == DS_DSK && !check_msg_counts())	exit(exit_zero ? EX_OK : EX_ERROR);    fflush(stdout);    check_percent = CHECK_PCT;	/* for checking low scoring spam				** and high scoring non-spam */    ns_scores = xcalloc(ns_cnt, sizeof(double));    sp_scores = xcalloc(sp_cnt, sizeof(double));    robs = DEFAULT_ROBS;    robx = DEFAULT_ROBX;    min_dev = DEFAULT_MIN_DEV;    if (check_for_high_ns_scores() | check_for_low_sp_scores())	scoring_error();    /*    ** 5.  Calculate x and cache size    ** Calculate x with bogoutil's -r option (a new addition).    ** Bound the calculated value within [0.4, 0.6] and set the range to be    ** investigated to [x-0.1, x+0.1].    */    robx = get_robx();    if (ds_flag == DS_DSK) {	db_cachesize = calc_db_cachesize();	printf("Recommended db cache size is %u MB\n", db_cachesize);    }    /*    ** 6.  Calculate fp target    ** The fp target will be derived thus: score non-spams with s and md as    ** shipped, and determine the count that will result from a spam cutoff    ** of 0.95; if that is < 0.25%, try 0.9375 etc.    */    min_dev = 0.02;    /* set target and spam_cutoff */    if (coerced_target == 0)	set_thresh(ns_cnt, ns_scores);    else {	/* if coerced target ... */	target = coerced_target;	spam_cutoff = ns_scores[target-1];    }    skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF;    printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff);#ifdef	TEST    if (test) {	printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx);	if (verbose < PARMS)	    print_ns_scores(target-2, target+2, 0);    }#endif    if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0))	fprintf(stderr, "Warning:  Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf);    /* No longer needed */    wordhash_free(train);    train = NULL;    for (scan=0; scan <= 1 && !skip; scan ++) {	uint r_count;	uint rsi, rxi, mdi, spi, nsi;	result_t *results, *r, *sorted;	printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine");	switch (scan) {	case 0:		/* COARSE */	    /*	    ** 7.  Coarsely scan s, md and x	    ** The coarse s scan will range from 1 to 0.01 in half decades, and the	    ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05.  The	    ** coarse x scan will use steps of 0.05. The trough must be surrounded on	    ** six sides by values below the 33% quantile (unless bounded on one or	    ** more sides).	    */	    init_coarse(robx);	    break;	case 1:		/* FINE */	    /*	    ** 8.  Finely scan the peak region	    ** The fine s scan will range over the estimated s +/- half a decade in	    ** steps of a quarter decade, and the fine md scan will range over the	    ** estimated md +/- 0.075 in steps of 0.015.  The fine x scan will range	    ** over the estimated x +/- 0.04 in steps of 0.02.  Scans of s and md	    ** are bounded by the limits of the coarse scan.  Again, the trough must	    ** be surrounded on six sides by values below the 33% quantile.  If no	    ** such trough exists, a warning is given.	    */	    init_fine(robs, min_dev, robx, spex, nsex);	    break;	}	r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt;	results = (result_t *) xcalloc(r_count, sizeof(result_t));	print_all_parms(r_count);	if (verbose >= SUMMARY) {	    if (verbose >= SUMMARY+1)		printf("%3s ", "cnt");	    if (verbose >= SUMMARY+2)		printf(" %s %s %s      ", "s", "m", "x");	    printf(" %4s %5s   %4s %8s %8s %7s %3s %3s\n",		   "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn");	}	cnt = 0;	beg = time(NULL);	for (rsi = 0; rsi < rsval->cnt; rsi++) {	  robs = rsval->data[rsi];	  for (mdi = 0; mdi < mdval->cnt; mdi++) {	    min_dev = mdval->data[mdi];	    for (rxi = 0; rxi < rxval->cnt; rxi++) {	      robx = rxval->data[rxi];	      for (spi = 0; spi < spexp->cnt; spi++) {		spex = spexp->data[spi];		sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));		for (nsi = 0; nsi < nsexp->cnt; nsi++) {		    uint fp, fn;		    nsex = nsexp->data[nsi];		    ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));		    /* save parms */		    r = &results[cnt++];		    r->idx = cnt;		    r->rsi = rsi; r->rs = robs;		    r->rxi = rxi; r->rx = robx;		    r->mdi = mdi; r->md = min_dev;		    r->spi = spi; r->sp_exp = spex;		    r->nsi = nsi; r->ns_exp = nsex;		    if (verbose >= SUMMARY) {			if (verbose >= SUMMARY+1)			    printf("%3u ", cnt);			if (verbose >= SUMMARY+2)			    printf(" %u %u %u %u %u  ",				rsi, mdi, rxi, spi, nsi);			printf("%6.4f %5.3f %5.3f %8.6f %8.6f",			    robs, min_dev, robx, sp_esf, ns_esf);			fflush(stdout);		    }		    spam_cutoff = 0.01;		    score_ns(ns_scores);	/* scores in descending order */		    /* Determine spam_cutoff and false_pos */		    for (fp = target; fp < ns_cnt; fp += 1) {			spam_cutoff = ns_scores[fp-1];			if (spam_cutoff < 0.999999)			    break;			if (coerced_target != 0)			    break;		    }		    if (ns_cnt < fp)			fprintf(stderr,				"Too few false positives to determine a valid cutoff\n");		    score_sp(sp_scores);	/* scores in ascending order */		    fn = get_fn_count(sp_cnt, sp_scores);		    /* save results */		    r->co = spam_cutoff;		    r->fp = fp;		    r->fn = fn;		    if (verbose < SUMMARY)			progress(cnt, r_count);		    else {			printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn);			fflush(stdout);		    }#ifdef	TEST		    if (test && spam_cutoff < 0.501) {			printf("co: %0.16f\n", spam_cutoff);			print_ns_scores(0, fp, 2);			print_sp_scores(fn-10, fn, 10);		    }#endif		    if (fMakeCheck && cnt >= cMakeCheck)			break;		}		if (fMakeCheck && cnt >= cMakeCheck)		    break;	      }	      if (fMakeCheck && cnt >= cMakeCheck)		  break;	    }	    if (fMakeCheck && cnt >= cMakeCheck)		break;	  }	  fflush(stdout);	  if (fMakeCheck && cnt >= cMakeCheck)	      break;	}	if (verbose >= TIME) {	    end = time(NULL);	    show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs");	}	printf("\n");	/* Scan complete, now find minima */	sorted = results_sort(r_count, results);	top_ten(sorted, r_count);	best = count_outliers(r_count, sorted, results);	robs = rsval->data[best->rsi];	robx = rxval->data[best->rxi];	min_dev = mdval->data[best->mdi];	spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));	nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));	printf(    "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n",    		robs, min_dev, robx, sp_esf, ns_esf);	printf("        fp %u (%6.4f%%), fn %u (%6.4f%%)\n",		best->fp, best->fp*100.0/ns_cnt,		best->fn, best->fn*100.0/sp_cnt);	printf("\n");	data_free(rsval);	data_free(rxval);	data_free(mdval);	data_free(spexp);	data_free(nsexp);	xfree(results);	xfree(sorted);    }    /*    ** 9.  Suggest possible spam and non-spam cutoff values    ** With the final x, md and s values, score the spams and non-spams and    ** sort the non-spam scores decreasing and the spam scores increasing;    ** then, traverse the non-spam list until the 0.2% point; report cutoffs    ** that give 0.05%, 0.1% and 0.2% fp.    */    final_recommendations(skip);    return status;}int main(int argc, char **argv) /*@globals errno,stderr,stdout@*/{    ex_t exitcode = EX_OK;    fBogotune = true;		/* for rob_compute_spamicity() */    dbgout = stderr;    progtype = build_progtype(progname, DB_TYPE);    ham_files  = filelist_new("ham");    spam_files = filelist_new("spam");    /* process args and read mailboxes */    process_arglist(argc, argv);    /* directories from command line and config file are already handled */    if (ds_flag == DS_DSK) {	bfpath *bfp;	if (ds_path == NULL)	    ds_path = get_directory(PR_ENV_BOGO);	if (ds_path == NULL)	    ds_path = get_directory(PR_ENV_HOME);	set_bogohome(ds_path);	bfp = bfpath_create(ds_path);	if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {	    fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);	    exit(EX_ERROR);	}	if (bfp->exists && bfp->isdir) {	    bfpath_free(bfp);	    ds_path = mxcat(ds_path, DIRSEP_S, WORDLIST, NULL);		    bfp = bfpath_create(ds_path);	    if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) {		fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);		exit(EX_ERROR);	    }	}	env = ds_init(bfp);		init_wordlist("word", ds_path, 0, WL_REGULAR);    }    bogotune_init();    if (ds_flag == DS_DSK)	load_wordlist(load_hook, train);    /* if encoding not yet set, assume old style */    if (encoding == E_UNKNOWN)	encoding = E_RAW;    if (bogolex_file != NULL)	bogolex();    else	bogotune();    bogotune_free();    if (ds_flag == DS_DSK)	ds_cleanup(env);    exit(exitcode);}/* End */
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -