testtoolsui.py
来自「用python实现的邮件过滤器」· Python 代码 · 共 556 行 · 第 1/2 页
PY
556 行
d.alldone() # Other end of the lazy 'capture the output' code. sys.stdout.seek(0) sys.stderr.seek(0) out, err = sys.stdout, sys.stderr sys.stdout = cout sys.stderr = cerr return out, err def rates(self, ifile): """This is essentially rates.py from the testtools directory.""" # XXX Stop being lazy and using the remapping cout/cerr cheat # XXX at some point. cout = sys.stdout cerr = sys.stderr sys.stdout = StringIO.StringIO() sys.stderr = StringIO.StringIO() interesting = filter(lambda line: line.startswith('-> '), ifile) ifile.close() ofile = StringIO.StringIO() def dump(*stuff): msg = ' '.join(map(str, stuff)) print msg print >> ofile, msg ntests = nfn = nfp = 0 sumfnrate = sumfprate = 0.0 for line in interesting: dump(line[:-1]) fields = line.split() # 0 1 2 3 4 5 6 -5 -4 -3 -2 -1 #-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams if line.startswith('-> <stat> tested '): ntests += 1 continue # 0 1 2 3 # -> <stat> false positive %: 0.025 # -> <stat> false negative %: 0.327272727273 if line.startswith('-> <stat> false '): kind = fields[3] percent = float(fields[-1]) if kind == 'positive': sumfprate += percent lastval = percent else: sumfnrate += percent dump(' %7.3f %7.3f' % (lastval, percent)) continue # 0 1 2 3 4 5 # -> <stat> 1 new false positives if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false': kind = fields[-1] count = int(fields[2]) if kind == 'positives': nfp += count else: nfn += count dump('total unique false pos', nfp) dump('total unique false neg', nfn) dump('average fp %', sumfprate / ntests) dump('average fn %', sumfnrate / ntests) ofile.seek(0) sys.stdout = cout sys.stderr = cerr return ofile def compare(self, f1, f2): """This is essentially cmp.py from the testtools directory.""" # XXX Stop being lazy and using the remapping cout/cerr cheat # XXX at some point. cout, cerr = sys.stdout, sys.stderr sys.stdout = StringIO.StringIO() sys.stderr = StringIO.StringIO() def suck(f): fns = [] fps = [] hamdev = [] spamdev = [] hamdevall = spamdevall = (0.0, 0.0) get = f.readline while 1: line = get() if line.startswith('-> <stat> tested'): print line, if line.find(' items; mean ') != -1: # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68 # and later "sample " went away vals = line.split(';') mean = float(vals[1].split()[-1]) sdev = float(vals[2].split()[-1]) val = (mean, sdev) typ = vals[0].split()[2] if line.find('for all runs') != -1: if typ == 'Ham': hamdevall = val else: spamdevall = val elif line.find('all in this') != -1: if typ == 'Ham': hamdev.append(val) else: spamdev.append(val) continue if line.startswith('-> '): continue if line.startswith('total'): break if len(line) == 0: continue # A line with an f-p rate and an f-n rate. p, n = map(float, line.split()) fps.append(p) fns.append(n) # "total unique false pos 0" # "total unique false neg 0" # "average fp % 0.0" # "average fn % 0.0" fptot = int(line.split()[-1]) fntot = int(get().split()[-1]) fpmean = float(get().split()[-1]) fnmean = float(get().split()[-1]) return (fps, fns, fptot, fntot, fpmean, fnmean, hamdev, spamdev, hamdevall, spamdevall) def tag(p1, p2): if p1 == p2: t = "tied " else: t = p1 < p2 and "lost " or "won " if p1: p = (p2 - p1) * 100.0 / p1 t += " %+7.2f%%" % p else: t += " +(was 0)" return t def mtag(m1, m2): mean1, dev1 = m1 mean2, dev2 = m2 t = "%7.2f %7.2f " % (mean1, mean2) if mean1: mp = (mean2 - mean1) * 100.0 / mean1 t += "%+7.2f%%" % mp else: t += "+(was 0)" t += " %7.2f %7.2f " % (dev1, dev2) if dev1: dp = (dev2 - dev1) * 100.0 / dev1 t += "%+7.2f%%" % dp else: t += "+(was 0)" return t def dump(p1s, p2s): alltags = "" for p1, p2 in zip(p1s, p2s): t = tag(p1, p2) print " %5.3f %5.3f %s" % (p1, p2, t) alltags += t + " " print for t in "won", "tied", "lost": print "%-4s %2d times" % (t, alltags.count(t)) print def dumpdev(meandev1, meandev2): for m1, m2 in zip(meandev1, meandev2): print mtag(m1, m2) (fp1, fn1, fptot1, fntot1, fpmean1, fnmean1, hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(f1) (fp2, fn2, fptot2, fntot2, fpmean2, fnmean2, hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(f2) print print "false positive percentages" dump(fp1, fp2) print "total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2) print "mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2) print print "false negative percentages" dump(fn1, fn2) print "total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2) print "mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2) print if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2): print "ham mean ham sdev" dumpdev(hamdev1, hamdev2) print print "ham mean and sdev for all runs" dumpdev([hamdevall1], [hamdevall2]) print print "spam mean spam sdev" dumpdev(spamdev1, spamdev2) print print "spam mean and sdev for all runs" dumpdev([spamdevall1], [spamdevall2]) print diff1 = spamdevall1[0] - hamdevall1[0] diff2 = spamdevall2[0] - hamdevall2[0] print "ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1, diff2, diff2 - diff1) else: print "[info about ham & spam means & sdevs not available in both files]" sys.stdout.seek(0) sys.stderr.seek(0) out, err = sys.stdout, sys.stderr sys.stdout = cout sys.stderr = cerr return out, err# The iterator yields a stream of Msg objects from the given# 'directory'. The directory is actually the actual directory# and then an indication of the portion of it that we are after.# (so that a single directory can be used, a la the caches, rather# than a nicely split up into sets directory).class CacheStream(msgs.MsgStream): def produce(self): # We only want some of the msgs. Shuffle each directory list, but # in such a way that we'll get the same result each time this is # called on the same directory list. base_check = None for directory in self.directories: directory, portion = directory.split(os.pathsep) # All the directories in the list *must* be the same, and just # different sections, because this makes the code easier, and is # the desired usage, anyway. if base_check is None: base_check = directory assert directory == base_check set_num, nsets = portion.split('/') all = os.listdir(directory) random.seed(hash(max(all)) ^ msgs.SEED) random.shuffle(all) set_size = len(all) // int(nsets) set_num = int(set_num) set = all[set_num*set_size:((set_num+1)*set_size)-1] set.sort() for fname in set: yield msgs.Msg(directory, fname)class HamCacheStream(CacheStream): def __init__(self, tag, directories, train=0): if train: CacheStream.__init__(self, tag, directories, msgs.HAMTRAIN) else: CacheStream.__init__(self, tag, directories, msgs.HAMTEST)class SpamCacheStream(CacheStream): def __init__(self, tag, directories, train=0): if train: CacheStream.__init__(self, tag, directories, msgs.SPAMTRAIN) else: CacheStream.__init__(self, tag, directories, msgs.SPAMTEST)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?