📄 nway-train-fast.py
字号:
#!/usr/bin/env python
"""Train databases for the nway filter

usage %(prog)s [ -h ] [ -f ] [ -d dir ] [ -i float ] db1:mbox1 db2:mbox2 ...

-h       - print this documentation and exit.
-f       - force full retrain.
-d dir   - directory containing mbox files - write db files relative to
           dir/nway-db
-i float - set maximum imbalance of ham/spam ratio to float
           (default: %(MAX_RATIO)s)
db:mbox  - use mbox as source of messages for db
"""

from __future__ import division
from __future__ import print_function

import getopt
import os
import shelve
import subprocess
import sys

prog = sys.argv[0]

# Default upper bound for the ham:spam (and spam:ham) message-count ratio.
MAX_RATIO = 2.0

# Key of the bookkeeping record in every database.  Its value is a 3-tuple
# (v, spamcount, hamcount); every other key maps a token to a
# (spamcount, hamcount) pair.
# NOTE(review): v is presumably the classifier's storage version - confirm
# against the SpamBayes storage module.
STATE_KEY = 'saved state'


def usage(msg=None):
    """Print *msg* (if given) and the module usage text to stderr."""
    if msg is not None:
        print(msg, file=sys.stderr)
    print(__doc__.strip() % globals(), file=sys.stderr)


def _db_path(db_dir, filename):
    """Return *filename* inside *db_dir*, or unchanged when db_dir is None."""
    if db_dir is None:
        return filename
    return os.path.join(db_dir, filename)


def _load_db(path):
    """Read the shelve database at *path* into a plain dict and close it."""
    db = shelve.open(path, 'r')
    try:
        return dict((key, db[key]) for key in db)
    finally:
        db.close()


def _build_ham_union(hbases, db_dir):
    """Sum the ham counts of every ``<name>-ham.db`` into one dict."""
    bigd = {STATE_KEY: (5, 0, 0)}
    print("building ham union:", end=" ", file=sys.stderr)
    for hbase in hbases:
        print(hbase, end=" ", file=sys.stderr)
        sys.stderr.flush()
        ham = _load_db(_db_path(db_dir, "%s-ham.db" % hbase))
        version, _, total = bigd[STATE_KEY]
        _, _, ham_total = ham[STATE_KEY]
        bigd[STATE_KEY] = (version, 0, total + ham_total)
        for key, counts in ham.items():
            if key == STATE_KEY:
                continue
            _, seen = bigd.get(key) or (0, 0)
            _, ham_count = counts
            bigd[key] = (0, seen + ham_count)
    print("", file=sys.stderr)
    return bigd


def _combine(hbase, bigd, db_dir):
    """Build the unscaled count table for the *hbase* database.

    The ham counts stored in ``<hbase>-ham.db`` become the spam column;
    the ham column is the union *bigd* minus those same counts.
    """
    own = _load_db(_db_path(db_dir, "%s-ham.db" % hbase))
    _, _, own_total = own[STATE_KEY]

    spamd = {}
    for key, counts in own.items():
        if key == STATE_KEY:
            continue
        _, own_ham = counts
        spamd[key] = (own_ham, 0)

    print("merging hamcounts", file=sys.stderr)
    # The state record takes its first field from the union, as before.
    version, _, union_total = bigd[STATE_KEY]
    spamd[STATE_KEY] = (version, own_total, union_total - own_total)
    for key, counts in bigd.items():
        if key == STATE_KEY:
            continue
        spam, ham = spamd.get(key) or (0, 0)
        _, union_ham = counts
        union_ham -= (own.get(key) or (0, 0))[1]
        spamd[key] = (spam, ham + union_ham)
    return spamd


def _rebalance(spamd, max_ratio):
    """Return a copy of *spamd* scaled to respect *max_ratio*.

    Whichever of the ham or spam counts is too large is scaled down so
    the ham:spam ratio lies between 1/max_ratio and max_ratio.  Tokens
    whose scaled counts both truncate to zero are dropped.
    """
    hadjust = sadjust = 1.0
    version, spamcount, hamcount = spamd[STATE_KEY]
    if spamcount and hamcount:
        if hamcount / spamcount > max_ratio:
            hadjust = spamcount * max_ratio / hamcount
        elif spamcount / hamcount > max_ratio:
            sadjust = hamcount * max_ratio / spamcount
    else:
        # A ratio is undefined with an empty side (always the case when
        # only one database is given); the counts are left unscaled
        # instead of dividing by zero.
        print("warning: no ham or no spam messages;"
              " skipping ratio adjustment", file=sys.stderr)
    spamcount *= sadjust
    hamcount *= hadjust
    print("hamcount: ", hamcount, file=sys.stderr)
    print("spamcount: ", spamcount, file=sys.stderr)
    print("ham adjust: ", hadjust, file=sys.stderr)
    print("spam adjust:", sadjust, file=sys.stderr)

    scaled = {STATE_KEY: (version, int(spamcount), int(hamcount))}
    for key, counts in spamd.items():
        if key == STATE_KEY:
            continue
        spam, ham = counts
        val = (int(spam * sadjust), int(ham * hadjust))
        if val != (0, 0):
            scaled[key] = val
    print("final size:", len(scaled) - 1, "tokens", file=sys.stderr)
    return scaled


def _dump(hbase, spamd, db_dir):
    """Write *spamd* to a freshly created ``<hbase>.db`` shelve."""
    dbf = _db_path(db_dir, "%s.db" % hbase)
    # 'n' always starts from an empty database, so tokens dropped in this
    # run do not survive from a previous run's file.
    out = shelve.open(dbf, 'n')
    try:
        out.update(spamd)
    finally:
        out.close()
    try:
        size = os.path.getsize(dbf)
    except OSError:
        # Some dbm backends store the data under a suffixed file name.
        print("final disk size: unknown", file=sys.stderr)
    else:
        print("final disk size:", size, "bytes", file=sys.stderr)


def merge(hbases, max_ratio, db_dir=None):
    """Build one ``<name>.db`` database per entry of *hbases*.

    For each name, the counts from ``<name>-ham.db`` are used as the spam
    counts and the summed counts of all the other ``-ham.db`` files as the
    ham counts; the result is scaled to honour *max_ratio* and written
    to ``<name>.db``.

    hbases    -- iterable of database base names (not modified).
    max_ratio -- maximum allowed ham:spam or spam:ham message ratio.
    db_dir    -- directory holding the database files; None (the default)
                 resolves the names against the current directory.

    Progress is reported on stderr.  Errors from opening or reading a
    database propagate to the caller.
    """
    hbases = sorted(hbases, key=lambda name: name.lower())
    bigd = _build_ham_union(hbases, db_dir)
    for hbase in hbases:
        print("\nbuilding", hbase, "database", file=sys.stderr)
        spamd = _rebalance(_combine(hbase, bigd, db_dir), max_ratio)
        print("dumping", hbase, "database", file=sys.stderr)
        _dump(hbase, spamd, db_dir)


def mtime(f):
    """Return the modification time of *f*, or 0 if it cannot be read."""
    try:
        return os.path.getmtime(f)
    except OSError:
        return 0


def main(args):
    """Parse *args*, train stale ham databases, then merge them.

    Returns the process exit status: 0 on success or after -h, 1 on a
    command-line error.
    """
    try:
        opts, args = getopt.getopt(args, "hfd:i:",
                                   ["help", "force", "directory=",
                                    "imbalance="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    force_arg = ""
    directory = "."
    max_ratio = MAX_RATIO
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-f", "--force"):
            force_arg = "-f"
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ("-i", "--imbalance"):
            try:
                max_ratio = float(arg)
            except ValueError:
                usage("invalid imbalance ratio: %r" % arg)
                return 1
            if not max_ratio > 0:
                usage("imbalance ratio must be positive: %r" % arg)
                return 1

    mapping = {}
    for db_mbox in args:
        # Split on the first colon only so the mbox part may contain one.
        db, sep, mbox = db_mbox.partition(":")
        if not (sep and db and mbox):
            usage("expected db:mbox, got %r" % db_mbox)
            return 1
        mapping[db] = mbox

    db_dir = os.path.join(directory, "nway-db")

    # train on each mbox once as ham
    keys = sorted(mapping, key=lambda name: name.lower())
    for db in keys:
        mboxf = os.path.join(directory, mapping[db])
        dbf = os.path.join(db_dir, "%s-ham.db" % db)
        db_mtime = mtime(dbf)
        if db_mtime < mtime(mboxf):
            # force a full retrain if the db doesn't exist
            force = "-f" if db_mtime == 0 else force_arg
            cmd = ["sb_mboxtrain.py"]
            if force:
                cmd.append(force)
            cmd.extend(["-d", dbf, "-g", mboxf])
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            try:
                status = subprocess.call(cmd)
            except OSError as exc:
                print("cannot run sb_mboxtrain.py:", exc, file=sys.stderr)
            else:
                if status != 0:
                    print("sb_mboxtrain.py exited with status", status,
                          "for", dbf, file=sys.stderr)
        else:
            print("skipping", dbf, "(up-to-date)", file=sys.stderr)

    # merge all the databases together in strange and marvelous ways
    merge(keys, max_ratio, db_dir)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -