📄 bogotune.c
字号:
fflush(stdout); }}static int update_count(void){ message_count += 1; if (verbose && (message_count % 100) == 0 && !fMakeCheck) { if ((message_count % 1000) != 0) putchar('.'); else printf("\r \r%u ", message_count/1000 ); fflush(stdout); } return message_count;}static unsigned int calc_db_cachesize(void){ struct stat fst; if (!stat(ds_path, &fst)) { int dbc = ceil((double)fst.st_size / (3 * 1024 * 1024)); return ((unsigned int)dbc); } else { fprintf(stderr, "Unable to stat %s\n", ds_path); exit (EX_ERROR); }}static void load_wordlist(ds_foreach_t *hook, void *userdata){ bfpath *bfp = bfpath_create(ds_path); if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) { fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath); exit(EX_ERROR); } if (verbose) { printf("Reading %s\n", ds_path); fflush(stdout); } ds_oper(env, bfp, DS_READ, hook, userdata); bfpath_free(bfp); return;}static int load_hook(word_t *key, dsv_t *data, void *userdata)/* returns 0 if ok, 1 if not ok */{ wordprop_t *tokenprop = wordhash_insert(train, key, sizeof(wordprop_t), &wordprop_init); (void) userdata; /* quiet compiler complaint */ tokenprop->cnts.bad = data->spamcount; tokenprop->cnts.good = data->goodcount; if (word_cmps(key, ".MSG_COUNT") == 0) set_msg_counts(data->goodcount, data->spamcount); if (word_cmps(key, ".ENCODING") == 0) { if (encoding == E_UNKNOWN) encoding = data->spamcount; if (encoding != data->spamcount) { fprintf(stderr, "Can't mix database encodings, i.e. utf-8 and any other.\n"); exit(EX_ERROR); } } return 0;}static void set_train_msg_counts(wordhash_t *wh){ wordprop_t *count; count = wordhash_insert(wh, w_msg_count, sizeof(wordprop_t), NULL); if (msgs_good == 0 && msgs_bad == 0) { fprintf(stderr, "Can't find '.MSG_COUNT'.\n"); exit(EX_ERROR); }}/* write_msgcount_file()**** Create a message count file from the original messages*/static void print_msgcount_entry(const char *token, uint bad, uint good){ printf( "\"%s\" %u %u\n", token, bad, good);}static void write_msgcount_file(wordhash_t *wh){ hashnode_t *hn; print_msgcount_entry(".MSG_COUNT", msgs_bad, msgs_good); for (hn = wordhash_first(wh); hn != NULL; hn = wordhash_next(wh)) { word_t *token = hn->key; wordprop_t *wp = (wordprop_t *) hn->buf; wordcnts_t *cnts = &wp->cnts; if (cnts->good == 0 && cnts->bad == 0) { wp = wordhash_search(train, token, 0); if (wp) { cnts->good = wp->cnts.good; cnts->bad = wp->cnts.bad; } } print_msgcount_entry((char *)token->text, cnts->bad, cnts->good); } return;}static uint read_mailbox(char *arg, mlhead_t *msgs){ if (verbose) { printf("Reading %s\n", arg); fflush(stdout); } init_count(); mbox_mode = true; bogoreader_init(1, &arg); while ((*reader_more)()) { wordhash_t *whp = NULL; wordhash_t *whc = wordhash_new(); collect_words(whc); if (ds_path != NULL && (msgs_good + msgs_bad) == 0) set_train_msg_counts(whc); if (whc->count == 0 && !quiet) { printf("msg #%u, count is %u\n", message_count, whc->count); bt_trap(); } if (bogolex_file != NULL) { wordhash_sort(whc); lookup_words(whc); write_msgcount_file(whc); } else if (whc->count != 0) { if (!msg_count_file) whp = convert_wordhash_to_propslist(whc, train); else whp = convert_propslist_to_countlist(whc); msglist_add(msgs, whp); } update_count(); if (whc != whp) wordhash_free(whc); } print_final_count(); ns_and_sp->count += message_count; bogoreader_fini(); return message_count;}static uint filelist_read(int mode, flhead_t *list){ uint count = 0; flitem_t *item; mlhead_t *msgs = (mode == REG_GOOD) ? ns_msglists->msgs : sp_msglists->msgs; run_type = mode; for (item = list->head; item != NULL; item = item->next) { lexer = NULL; msg_count_file = false; count += read_mailbox(item->name, msgs); } return count;}/* distribute()**** Proportionally distribute messages between training and scoring sets.**** Method:** If only 2500 messages, use 2000 for training and 500 for scoring.** If over 4000 messages, use equal numbers for training and scoring.** In between 2500 and 4000, do a proportional distribution.*/static void distribute(int mode, tunelist_t *ns_or_sp){ int good = mode == REG_GOOD; int bad = 1 - good; bool divvy = ds_flag == DS_RAM && user_robx < EPS && !msg_count_file; mlitem_t *item; mlhead_t *msgs = ns_or_sp->msgs; int score_count = 0; int train_count = 0; static int train_good = 0; static int train_bad = 0; double ratio = scale(msgs->count, LIST_COUNT + TEST_COUNT, /* small count */ LIST_COUNT + LIST_COUNT, /* large count */ LIST_COUNT / TEST_COUNT, /* small ratio */ LIST_COUNT / LIST_COUNT); /* large ratio */ for (item = msgs->head; item != NULL; item = item->next) { wordhash_t *wh = item->wh; /* training set */ if (divvy && train_count / ratio < score_count + 1) { wordhash_set_counts(wh, good, bad); wordhash_add(train, wh, &wordprop_init); train_count += 1; wordhash_free(wh); train_good += good; train_bad += bad; } /* scoring set */ else { uint bin = divvy ? MOD(score_count,3) : 0; msglist_add(ns_or_sp->u.sets[bin], wh); score_count += 1; } item->wh = NULL; } if (divvy) { wordhash_insert(train, w_msg_count, sizeof(wordprop_t), &wordprop_init); set_msg_counts(train_good, train_bad); } if (verbose > 1) printf("%s: train_count = %d, score_count = %d\n", good ? "ns" : "sp", train_count, score_count); return;}static void create_countlists(tunelist_t *ns_or_sp){ uint i; uint c = COUNTOF(ns_or_sp->u.sets); for (i = 0; i < c; i += 1) { mlhead_t *list = ns_or_sp->u.sets[i]; mlitem_t *item; for (item = list->head; item != NULL; item = item->next) { wordhash_t *who = item->wh; wordhash_t *whn = convert_propslist_to_countlist(who); if (whn != who) { wordhash_free(who); item->wh = whn; } } } return;}static void print_version(void){ (void)fprintf(stderr, "%s version %s\n" " Database: %s\n" "Copyright (C) 2002-2006 Greg Louis, David Relson\n\n" "%s comes with ABSOLUTELY NO WARRANTY. " "This is free software, and\nyou are welcome to " "redistribute it under the General Public License. " "See\nthe COPYING file with the source distribution for " "details.\n" "\n", progtype, version, ds_version_str(), PACKAGE);}static void help(void){ (void)fprintf(stderr, "Usage: %s [options] { -c config } { -d directory } -n non-spam-file -s spam-file\n", progname); (void)fprintf(stderr, "\t -h - print this help message.\n" "\t -C - don't read standard config files.\n" "\t -c file - read specified config file.\n" "\t -D - don't read a wordlist file.\n" "\t -d path - specify directory for wordlists.\n" "\t -E - disable ESF (effective size factor) tuning.\n" "\t -M file - rewrite input file in message count format.\n" "\t -r num - specify robx value\n"); (void)fprintf(stderr, "\t -T num - specify fp target value\n" "\t -s file1 file2 ... - spam files\n" "\t -n file1 file2 ... - non-spam files\n" "\t -v - increase level of verbose messages\n" "\t -q - quiet (suppress warnings)\n" ); (void)fprintf(stderr, "\n" "%s (version %s) is part of the bogofilter package.\n", progname, version);}static struct option longopts_bogotune[] = { /* longoptions.h - common options */ LONGOPTIONS_COMMON /* longoptions.h - bogofilter/-lexer options */ LONGOPTIONS_LEX /* end of list */ { NULL, 0, 0, 0 }};static int process_arglist(int argc, char **argv){ int count = 1; bulk_mode = B_CMDLINE;#ifdef __EMX__ _response (&argc, &argv); /* expand response files (@filename) */ _wildcard (&argc, &argv); /* expand wildcards (*.*) */#endif#define OPTIONS ":-:c:Cd:DeEM:n:qr:s:tT:vVx:" while (1) { int option; int option_index = 0; int this_option_optind = optind ? optind : 1; const char *name; option = getopt_long(argc, argv, OPTIONS, longopts_bogotune, &option_index); if (option == -1) break; name = (option_index == 0) ? argv[this_option_optind] : longopts_bogotune[option_index].name; process_bogotune_arg(option); } if (ds_flag == DS_NONE) /* default is "wordlist on disk" */ ds_flag = DS_DSK; if (ds_flag == DS_ERR) { fprintf(stderr, "Only one '-d dir' or '-D' option is allowed.\n"); exit(EX_ERROR); } if (bogolex_file == NULL && (spam_files->count == 0 || ham_files->count == 0)) { fprintf(stderr, "Bogotune needs both non-spam and spam message sets for its parameter testing.\n"); exit(EX_ERROR); } if (!suppress_config_file) process_config_files(false, longopts_bogotune); return count;}static void process_bogotune_arg(int option){ static int lastmode = -1; if (option == 1) { /* If getopt's RETURN_IN_ORDER behavior */ switch (lastmode) { case 'n': case 's': option = lastmode; break; default: fprintf(stderr, "File names may only be given after -n or -s options.\n"); } } switch (option) { case 'c': read_config_file(optarg, false, false, PR_CFG_USER, longopts_bogotune); /* FALLTHROUGH */ case 'C': suppress_config_file = true; break; case 'd': ds_path = xstrdup(optarg); ds_flag = (ds_flag == DS_NONE) ? DS_DSK : DS_ERR; break; case 'D': ds_flag = (ds_flag == DS_NONE) ? DS_RAM : DS_ERR; break; case 'e': exit_zero = true; break; case 'E': esf_flag ^= true; break; case 'M': bogolex_file = optarg; break; case 'n': lastmode = 'n'; filelist_add(ham_files, optarg); break; case 'q': quiet = true; break; case 'r': user_robx = atof(optarg); break; case 's': lastmode = 's'; filelist_add(spam_files, optarg); break;#ifdef TEST case 't': test += 1; break;#endif case 'T': coerced_target = atoi(optarg); break; case 'v': verbose += 1; break; case 'V': print_version(); exit(EX_OK); case 'x': if (strcmp(optarg, "MakeCheck") == 0) fMakeCheck = true; else set_debug_mask( optarg ); break; case O_MAX_TOKEN_LEN: max_token_len = atoi(optarg); break; case O_MIN_TOKEN_LEN: min_token_len = atoi(optarg); break; case O_MAX_MULTI_TOKEN_LEN: max_multi_token_len=atoi(optarg); break; case O_MULTI_TOKEN_COUNT: multi_token_count=atoi(optarg); break; default: help(); exit(EX_ERROR); }}static double get_robx(void){ double rx; if (user_robx > 0.0) rx = user_robx; else if (ds_flag == DS_DSK) { printf("Calculating initial x value...\n"); verbose = -verbose; /* disable bogofilter debug output */ rx = compute_robinson_x(); verbose = -verbose; /* enable bogofilter debug output */ } else rx = ROBX; if (rx > RX_MAX) rx = RX_MAX; if (rx < RX_MIN) rx = RX_MIN; printf("Initial x value is %8.6f\n", rx); return rx;}static result_t *results_sort(uint r_count, result_t *results){ result_t *ans = xcalloc(r_count, sizeof(result_t)); memcpy(ans, results, r_count * sizeof(result_t)); qsort(ans, r_count, sizeof(result_t), compare_results); return ans;}static void top_ten(result_t *sorted, uint n){ uint i, j; bool f; printf("Top ten parameter sets from this scan:\n"); printf(" rs md rx spesf nsesf co fp fn fppc fnpc\n"); for (f = false; !f; f = true) { for (i = j = 0; i < 10 && j < n;) { result_t *r = &sorted[j++]; if (!f && r->fp != target) continue; sp_esf = ESF_SEL(sp_esf, pow(0.75, r->sp_exp)); ns_esf = ESF_SEL(ns_esf, pow(0.75, r->ns_exp)); printf("%5u %6.4f %5.3f %5.3f %8.6f %8.6f %6.4f %3u %3u %6.4f %6.4f\n", r->idx, r->rs, r->md, r->rx, sp_esf, ns_esf, r->co, r->fp, r->fn, r->fp*100.0/ns_cnt, r->fn*100.0/sp_cnt); ++i; } if (i) break; printf("Warning: fp target not met, using original results\n"); } printf("\n"); fflush(stdout); return;}/* get false negative */static int gfn(result_t *results, uint rsi, uint mdi, uint rxi, uint spi, uint nsi){ uint i = (((rsi * mdval->cnt + mdi) * rxval->cnt + rxi) * spexp->cnt + spi) * nsexp->cnt + nsi; result_t *r = &results[i]; int fn = r->fn; if (r->fp != target) return INT_MAX; if (verbose > 100) printf(" %2u, %2u, %2u, %2u, %2u, %2d\n", rsi, mdi, rxi, spi, nsi, fn); ncnt += 1; nsum += fn; return fn;}static result_t *count_outliers(uint r_count, result_t *sorted, result_t *unsorted){ bool f = false; uint i, j = 0, o = 0; uint fn; uint rsi, mdi, rxi, spi, nsi; uint rsc = rsval->cnt - 1; uint rxc = rxval->cnt - 1; uint mdc = mdval->cnt - 1; uint spc = spexp->cnt - 1; uint nsc = nsexp->cnt - 1; result_t *r = NULL; /* quench bogus compiler warning */ uint q33 = sorted[r_count * 33 / 100].fn; /* 33% quantile */ uint med = sorted[r_count * 50 / 100].fn; /* median false negative */ if (verbose) printf("%u%% fn count was %u\n", 50u, med); for (i = 0; i < r_count; i += 1) { r = &sorted[i]; if (r->fp != target) continue; if (j == 0) j = i+1; if (fMakeCheck && j >= cMakeCheck) break; rsi = r->rsi; mdi = r->mdi; rxi = r->rxi; spi = r->spi; nsi = r->nsi; ncnt = nsum = 0; if (((rsi == 0 || (fn = gfn(unsorted, rsi-1, mdi, rxi, spi, nsi)) < med)) && ((rsi == rsc || (fn = gfn(unsorted, rsi+1, mdi, rxi, spi, nsi)) < med)) && ((mdi == 0 || (fn = gfn(unsorted, rsi, mdi-1, rxi, spi, nsi)) < med)) && ((mdi == mdc || (fn = gfn(unsorted, rsi, mdi+1, rxi, spi, nsi)) < med)) && ((rxi == 0 || (fn = gfn(unsorted, rsi, mdi, rxi-1, spi, nsi)) < med)) && ((rxi == rxc || (fn = gfn(unsorted, rsi, mdi, rxi+1, spi, nsi)) < med)) && ((spi == 0 || (fn = gfn(unsorted, rsi, mdi, rxi, spi-1, nsi)) < med)) && ((spi == spc || (fn = gfn(unsorted, rsi, mdi, rxi, spi+1, nsi)) < med)) && ((nsi == 0 || (fn = gfn(unsorted, rsi, mdi, rxi, spi, nsi-1)) < med)) && ((nsi == nsc || (fn = gfn(unsorted, rsi, mdi, rxi, spi, nsi+1)) < med)) && (nsum / ncnt < q33)) { f = true; break; } o++; } if (o > 0) { printf("%u outlier%s encountered. \n", o, (o > 1) ? "s" : ""); } if (!f) { r = &sorted[j-1]; printf("No smooth minimum encountered, using lowest fn count (an outlier). \n"); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -