📄 nway-train.py
字号:
#!/usr/bin/env python

"""Train databases for the nway filter

usage %(prog)s [ -h ] [ -f ] [ -d dir ] [ -i float ] db1:mbox1 db2:mbox2 ...

-h - print this documentation and exit.

-f - force full retrain.

-d dir - directory containing mbox files - write db files relative to
         dir/nway-db

-i float - set maximum imbalance of ham/spam ratio to float
           (default: %(MAX_RATIO)s)

db:mbox - use mbox as source of messages for db
"""

from __future__ import division
from __future__ import print_function

import contextlib
import getopt
import os
import shelve
import subprocess
import sys

prog = sys.argv[0]

MAX_RATIO = 2.0
STATE_KEY = 'saved state'


def usage(msg=None):
    """Print the module documentation to stderr, preceded by msg if given."""
    if msg is not None:
        print(msg, file=sys.stderr)
    print(__doc__.strip() % globals(), file=sys.stderr)


def _balance(hamcount, spamcount, max_ratio):
    """Return (hadjust, sadjust) factors capping the ham:spam imbalance."""
    hadjust = sadjust = 1.0
    if hamcount <= 0 or spamcount <= 0:
        # No ratio can be formed when one side is empty (for example a
        # single database with no ham peers).  Leave the counts unscaled
        # instead of dividing by zero.
        return hadjust, sadjust
    if hamcount / spamcount > max_ratio:
        hadjust = spamcount * max_ratio / hamcount
    elif spamcount / hamcount > max_ratio:
        sadjust = hamcount * max_ratio / spamcount
    return hadjust, sadjust


def merge(sbase, hbases, max_ratio):
    """Build ``<sbase>.db`` from ``<sbase>-ham.db`` and the other ham dbs.

    sbase     - path prefix of the database whose messages count as spam;
                ``<sbase>-ham.db`` is read and ``<sbase>.db`` is rewritten.
    hbases    - path prefixes of the databases whose messages count as ham;
                ``<hbase>-ham.db`` is read for each of them.
    max_ratio - largest ham:spam (or spam:ham) message-count ratio allowed
                before the larger side is scaled down.

    Every input database is trained as ham, so the counts read from
    ``<sbase>-ham.db`` are flipped: its ham counts become spam counts.  Tokens
    whose scaled spam and ham counts are both 0 or 1 are dropped.
    """
    print("building %(sbase)s database" % locals(), file=sys.stderr)

    # token -> [raw spam count, raw ham count]; scaling is applied once at
    # the very end so every database is scaled by the same factor.
    counts = {}

    # Remember, we want to reverse the usual meaning of the scores in this
    # database: what was stored as ham is treated as spam here.
    with contextlib.closing(shelve.open("%s-ham.db" % sbase, 'r')) as sdb:
        (ver, hamcount, spamcount) = sdb[STATE_KEY]
        for key in sdb:
            if key != STATE_KEY:
                counts[key] = [sdb[key][1], 0]

    for hbase in hbases:
        with contextlib.closing(shelve.open("%s-ham.db" % hbase, 'r')) as hdb:
            hamcount += hdb[STATE_KEY][2]

    # XXX SpamBayes works best when there are roughly the same number of
    # spam as ham messages.  Many people with extreme ham:spam ratios have
    # reported problems.  One spam database merged with several ham
    # databases stresses the system in exactly that way, so the counts on
    # the larger side are scaled by a constant factor to bring the apparent
    # ratio down to at most max_ratio.
    hadjust, sadjust = _balance(hamcount, spamcount, max_ratio)

    print("hamcount: ", hamcount, file=sys.stderr)
    print("spamcount: ", spamcount, file=sys.stderr)
    print("ham adjust: ", hadjust, file=sys.stderr)
    print("spam adjust:", sadjust, file=sys.stderr)

    # Now add the ham databases.  Their spam counts are always zero, so only
    # the ham counts are accumulated.
    print("merging", end=" ", file=sys.stderr)
    for hbase in hbases:
        print("%(hbase)s" % locals(), end=" ", file=sys.stderr)
        sys.stderr.flush()
        with contextlib.closing(shelve.open("%s-ham.db" % hbase, 'r')) as hdb:
            for key in hdb:
                if key != STATE_KEY:
                    counts.setdefault(key, [0, 0])[1] += hdb[key][1]
    print("", file=sys.stderr)

    # We don't truncate to ints until the very end.  Both message totals are
    # scaled so they stay consistent with the scaled per-token counts.
    state = (ver, int(spamcount * sadjust), int(hamcount * hadjust))
    print("new saved state:", state, file=sys.stderr)

    merged = {STATE_KEY: state}
    for key, (raw_s, raw_h) in counts.items():
        s, h = int(raw_s * sadjust), int(raw_h * hadjust)
        # Drop terms which dwindled away to (nearly) nothing.
        if s <= 1 and h <= 1:
            continue
        merged[key] = (s, h)

    with contextlib.closing(shelve.open("%s.db" % sbase)) as db:
        print("saving to %(sbase)s.db" % locals(), file=sys.stderr)
        db.clear()
        db.update(merged)


def mtime(f):
    """Return the modification time of path f, or 0 if it cannot be read."""
    try:
        return os.path.getmtime(f)
    except OSError:
        return 0


def main(args):
    """Parse args, train each db from its mbox, then merge; return exit code."""
    try:
        opts, args = getopt.getopt(args, "hfd:i:",
                                   ["help", "force", "directory=",
                                    "imbalance="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    force_arg = ""
    directory = "."
    max_ratio = MAX_RATIO
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-f", "--force"):
            force_arg = "-f"
        elif opt in ("-d", "--directory"):
            directory = arg
        elif opt in ("-i", "--imbalance"):
            try:
                max_ratio = float(arg)
            except ValueError:
                usage("invalid imbalance value: %r" % arg)
                return 1
            if max_ratio <= 0:
                usage("imbalance must be greater than zero: %r" % arg)
                return 1

    mapping = {}
    for db_mbox in args:
        # Split on the first colon only, so the mbox part may contain one.
        db, sep, mbox = db_mbox.partition(":")
        if not (db and sep and mbox):
            usage("invalid db:mbox argument: %r" % db_mbox)
            return 1
        mapping[db] = mbox

    dbdir = os.path.join(directory, "nway-db")
    keys = sorted(mapping, key=lambda name: name.lower())

    # Train on each mbox once as ham.
    for db in keys:
        mboxf = os.path.join(directory, mapping[db])
        dbf = os.path.join(dbdir, "%(db)s-ham.db" % locals())
        db_mtime = mtime(dbf)
        if db_mtime < mtime(mboxf):
            # Force a full retrain if the db doesn't exist.
            force = "-f" if db_mtime == 0 else force_arg
            if not os.path.isdir(dbdir):
                os.makedirs(dbdir)
            # Pass an argument list rather than a shell string so paths
            # containing spaces or shell metacharacters work.
            cmd = ["sb_mboxtrain.py"]
            if force:
                cmd.append(force)
            cmd.extend(["-d", dbf, "-g", mboxf])
            try:
                status = subprocess.call(cmd)
            except OSError as exc:
                print("cannot run sb_mboxtrain.py:", exc, file=sys.stderr)
                return 1
            if status != 0:
                print("warning: sb_mboxtrain.py exited with status",
                      status, "for", dbf, file=sys.stderr)
        else:
            print("skipping", dbf, "(up-to-date)", file=sys.stderr)

    # For each key, merge its ham database (reversed) with all the other ham
    # databases.  The databases live in dbdir, so pass full path prefixes.
    for db in keys:
        others = [os.path.join(dbdir, name) for name in keys if name != db]
        merge(os.path.join(dbdir, db), others, max_ratio)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -