📄 findbest.py
字号:
#!/usr/bin/env python'''Find the next "best" unsure message to train on. %(prog)s [ -h ] [ -s ] [ -b N ] ham spam unsureGiven a number of unsure messages and a desire to keep your trainingdatabase small, the question naturally arises, "Which message should I addto my database next?". A common approach is to sort the unsures by theirSpamBayes scores and train on the one which scores lowest. This is areasonable approach, but there is no guarantee the lowest scoring unsure isin any way related to the other unsure messages.This script offers a different approach. Given an existing pile of ham andspam, it trains on them to establish a baseline, then for each message inthe unsure pile, it trains on that message, scores the entire unsure pileagainst the resulting training database, then untrains on that message. Foreach such message the following output is generated: * spamprob of the candidate message * number of other unsure messages which would score as spam if it was added to the training database * overall mean of all scored messages after training * standard deviation of all scored messages after training * message-id of the candidate messageWith no options, all candidate unsure messages are trained and scoredagainst. At the end of the run, a file, "best.pck" is written out which isa dictionary keyed by the overall mean rounded to three decimal places. Thevalues are lists of message-ids which generate that mean.Three options affect the behavior of the program. If the -h flag is given,this help message is displayed and the program exits. If the -s flag isgiven, no messages which score as spam are tested as candidates. If the -bN flag is given, only the messages which generated the N highest means inthe last run without the -b flag are tested as candidates. Because theprogram runtime can be very slow (O(n^2) in the number of unsure messages),if you have a fairly large pile of unsure messages, these options can speedthings up dramatically. If the -b flag is used, a new "best.pck" file isnot written. Typically you would run once without the -b flag, then severaltimes with the -b flag, adding one message to the spam pile after each run.After adding several messages to your spam file, you might then redistributethe unsure pile to move spams and hams to their respective folders, thenstart again with a smaller unsure pile.The ham, spam and unsure command line arguments can be anything suitable forfeeding to spambayes.mboxutils.getmbox(). The "best.pck" file is searchedfor and written to these files in this order: * best.pck in the current directory * $HOME/tmp/best.pck * $HOME/best.pck[To do? Someone might consider the reverse operation. Given a pile of hamand spam, which message can be removed with the least impact? What pile ofmail should that removal be tested against?]'''import sysimport osimport cPickle as pickleimport getoptimport mathfrom spambayes.mboxutils import getmboxfrom spambayes.classifier import Classifierfrom spambayes.hammie import Hammiefrom spambayes.tokenizer import tokenizefrom spambayes.Options import optionscls = Classifier()h = Hammie(cls)def counter(tag, i): if tag: sys.stdout.write("\r%s: %4d" % (tag, i)) else: sys.stdout.write("\r%4d" % i) sys.stdout.flush()def learn(mbox, h, is_spam): i = 0 tag = is_spam and "Spam" or "Ham" for msg in getmbox(mbox): counter(tag, i) i += 1 h.train(msg, is_spam) printdef score(unsure, h, cls, scores, msgids=None, skipspam=False): """See what effect on others each msg in unsure has""" ham_cutoff = options["Categorization", "ham_cutoff"] spam_cutoff = options["Categorization", "spam_cutoff"] # compute a base - number of messages in unsure already in the # region of interest n = 0 total = 0.0 okalready = set() add = okalready.add for msg in getmbox(unsure): prob = cls.spamprob(tokenize(msg)) n += 1 if prob >= spam_cutoff: add(msg['message-id']) else: total += prob first_mean = total/n print len(okalready), "out of", n, "messages already score as spam" print "initial mean spam prob: %.3f" % first_mean print "%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid") # one by one, train on each message and see what effect it has on # the other messages in the mailbox for msg in getmbox(unsure): msgid = msg['message-id'] if msgids is not None and msgid not in msgids: continue msgprob = cls.spamprob(tokenize(msg)) if skipspam and msgprob >= spam_cutoff: continue n = j = 0 h.train(msg, True) # see how many other messages in unsure now score as spam total = 0.0 probs = [] for trial in getmbox(unsure): # don't score messages which previously scored as spam if trial['message-id'] in okalready: continue n += 1 if n % 10 == 0: counter("", n) prob = cls.spamprob(tokenize(trial)) probs.append(prob) total += prob if prob >= spam_cutoff: j += 1 counter("", n) h.untrain(msg, True) mean = total/n meankey = round(mean, 3) scores.setdefault(meankey, []).append(msgid) sdev = math.sqrt(sum([(mean-prob)**2 for prob in probs])/n) print "\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid)prog = os.path.basename(sys.argv[0])def usage(msg=None): if msg is not None: print >> sys.stderr, msg print >> sys.stderr, __doc__.strip() % globals()def main(args): try: opts, args = getopt.getopt(args, "b:sh") except getopt.error, msg: usage(msg) return 1 best = 0 skipspam = False for opt, arg in opts: if opt == "-h": usage() return 0 if opt == "-b": best = int(arg) elif opt == "-s": skipspam = True if len(args) != 3: usage("require ham, spam and unsure message piles") return 1 ham, spam, unsure = args choices = ["best.pck"] if "HOME" in os.environ: home = os.environ["HOME"] choices.append(os.path.join(home, "tmp", "best.pck")) choices.append(os.path.join(home, "best.pck")) choices.append(None) for bestfile in choices: if bestfile is None: break if os.path.exists(bestfile): break try: file(bestfile, "w") except IOError: pass else: os.unlink(bestfile) if bestfile is None: usage("can't find a place to write best.pck file") return 1 print "establish base training" learn(ham, h, False) learn(spam, h, True) print "scoring" if best: last_scores = pickle.load(file(bestfile)) last_scores = last_scores.items() last_scores.sort() msgids = set() for (k, v) in last_scores[-best:]: msgids.update(set(v)) else: msgids = None scores = {} try: score(unsure, h, cls, scores, msgids, skipspam) except KeyboardInterrupt: # allow early termination without loss of computed scores pass if not best: pickle.dump(scores, file(bestfile, 'w')) return 0if __name__ == "__main__": sys.exit(main(sys.argv[1:]))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -