testtoolsui.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 556 行 · 第 1/2 页

PY
556
字号
        d.alldone()        # Other end of the lazy 'capture the output' code.        sys.stdout.seek(0)        sys.stderr.seek(0)        out, err = sys.stdout, sys.stderr        sys.stdout = cout        sys.stderr = cerr        return out, err    def rates(self, ifile):        """This is essentially rates.py from the testtools directory."""        # XXX Stop being lazy and using the remapping cout/cerr cheat        # XXX at some point.        cout = sys.stdout        cerr = sys.stderr        sys.stdout = StringIO.StringIO()        sys.stderr = StringIO.StringIO()        interesting = filter(lambda line: line.startswith('-> '), ifile)        ifile.close()        ofile = StringIO.StringIO()        def dump(*stuff):            msg = ' '.join(map(str, stuff))            print msg            print >> ofile, msg        ntests = nfn = nfp = 0        sumfnrate = sumfprate = 0.0        for line in interesting:            dump(line[:-1])            fields = line.split()            # 0      1      2    3    4 5    6                 -5  -4 -3   -2    -1            #-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams            if line.startswith('-> <stat> tested '):                ntests += 1                continue            #  0      1     2        3            # -> <stat> false positive %: 0.025            # -> <stat> false negative %: 0.327272727273            if line.startswith('-> <stat> false '):                kind = fields[3]                percent = float(fields[-1])                if kind == 'positive':                    sumfprate += percent                    lastval = percent                else:                    sumfnrate += percent                    dump('    %7.3f %7.3f' % (lastval, percent))                continue            #  0      1 2   3     4         5            # -> <stat> 1 new false positives            if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false':                kind = fields[-1]                count = int(fields[2])                if kind == 'positives':                    nfp += count                else:                    nfn += count        dump('total unique false pos', nfp)        dump('total unique false neg', nfn)        dump('average fp %', sumfprate / ntests)        dump('average fn %', sumfnrate / ntests)        ofile.seek(0)        sys.stdout = cout        sys.stderr = cerr        return ofile    def compare(self, f1, f2):        """This is essentially cmp.py from the testtools directory."""        # XXX Stop being lazy and using the remapping cout/cerr cheat        # XXX at some point.        cout, cerr = sys.stdout, sys.stderr        sys.stdout = StringIO.StringIO()        sys.stderr = StringIO.StringIO()        def suck(f):            fns = []            fps = []            hamdev = []            spamdev = []            hamdevall = spamdevall = (0.0, 0.0)            get = f.readline            while 1:                line = get()                if line.startswith('-> <stat> tested'):                    print line,                if line.find(' items; mean ') != -1:                    # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68                    # and later "sample " went away                    vals = line.split(';')                    mean = float(vals[1].split()[-1])                    sdev = float(vals[2].split()[-1])                    val = (mean, sdev)                    typ = vals[0].split()[2]                    if line.find('for all runs') != -1:                        if typ == 'Ham':                            hamdevall = val                        else:                            spamdevall = val                    elif line.find('all in this') != -1:                        if typ == 'Ham':                            hamdev.append(val)                        else:                            spamdev.append(val)                    continue                if line.startswith('-> '):                    continue                if line.startswith('total'):                    break                if len(line) == 0:                    continue                # A line with an f-p rate and an f-n rate.                p, n = map(float, line.split())                fps.append(p)                fns.append(n)            # "total unique false pos 0"            # "total unique false neg 0"            # "average fp % 0.0"            # "average fn % 0.0"            fptot = int(line.split()[-1])            fntot = int(get().split()[-1])            fpmean = float(get().split()[-1])            fnmean = float(get().split()[-1])            return (fps, fns, fptot, fntot, fpmean, fnmean,                    hamdev, spamdev, hamdevall, spamdevall)        def tag(p1, p2):            if p1 == p2:                t = "tied          "            else:                t = p1 < p2 and "lost " or "won  "                if p1:                    p = (p2 - p1) * 100.0 / p1                    t += " %+7.2f%%" % p                else:                    t += " +(was 0)"            return t        def mtag(m1, m2):            mean1, dev1 = m1            mean2, dev2 = m2            t = "%7.2f %7.2f " % (mean1, mean2)            if mean1:                mp = (mean2 - mean1) * 100.0 / mean1                t += "%+7.2f%%" % mp            else:                t += "+(was 0)"            t += "     %7.2f %7.2f " % (dev1, dev2)            if dev1:                dp = (dev2 - dev1) * 100.0 / dev1                t += "%+7.2f%%" % dp            else:                t += "+(was 0)"            return t        def dump(p1s, p2s):            alltags = ""            for p1, p2 in zip(p1s, p2s):                t = tag(p1, p2)                print "    %5.3f  %5.3f  %s" % (p1, p2, t)                alltags += t + " "            print            for t in "won", "tied", "lost":                print "%-4s %2d times" % (t, alltags.count(t))            print        def dumpdev(meandev1, meandev2):            for m1, m2 in zip(meandev1, meandev2):                print mtag(m1, m2)        (fp1, fn1, fptot1, fntot1, fpmean1, fnmean1,         hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(f1)        (fp2, fn2, fptot2, fntot2, fpmean2, fnmean2,         hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(f2)        print        print "false positive percentages"        dump(fp1, fp2)        print "total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2)        print "mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2)        print        print "false negative percentages"        dump(fn1, fn2)        print "total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2)        print "mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2)        print        if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2):            print "ham mean                     ham sdev"            dumpdev(hamdev1, hamdev2)            print            print "ham mean and sdev for all runs"            dumpdev([hamdevall1], [hamdevall2])            print            print "spam mean                    spam sdev"            dumpdev(spamdev1, spamdev2)            print            print "spam mean and sdev for all runs"            dumpdev([spamdevall1], [spamdevall2])            print            diff1 = spamdevall1[0] - hamdevall1[0]            diff2 = spamdevall2[0] - hamdevall2[0]            print "ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1,                                                                    diff2,                                                                    diff2 - diff1)        else:            print "[info about ham & spam means & sdevs not available in both files]"        sys.stdout.seek(0)        sys.stderr.seek(0)        out, err = sys.stdout, sys.stderr        sys.stdout = cout        sys.stderr = cerr        return out, err# The iterator yields a stream of Msg objects from the given# 'directory'.  The directory is actually the actual directory# and then an indication of the portion of it that we are after.# (so that a single directory can be used, a la the caches, rather# than a nicely split up into sets directory).class CacheStream(msgs.MsgStream):    def produce(self):        # We only want some of the msgs.  Shuffle each directory list, but        # in such a way that we'll get the same result each time this is        # called on the same directory list.        base_check = None        for directory in self.directories:            directory, portion = directory.split(os.pathsep)            # All the directories in the list *must* be the same, and just            # different sections, because this makes the code easier, and is            # the desired usage, anyway.            if base_check is None:                base_check = directory            assert directory == base_check            set_num, nsets = portion.split('/')            all = os.listdir(directory)            random.seed(hash(max(all)) ^ msgs.SEED)            random.shuffle(all)            set_size = len(all) // int(nsets)            set_num = int(set_num)            set = all[set_num*set_size:((set_num+1)*set_size)-1]            set.sort()            for fname in set:                yield msgs.Msg(directory, fname)class HamCacheStream(CacheStream):    def __init__(self, tag, directories, train=0):        if train:            CacheStream.__init__(self, tag, directories, msgs.HAMTRAIN)        else:            CacheStream.__init__(self, tag, directories, msgs.HAMTEST)class SpamCacheStream(CacheStream):    def __init__(self, tag, directories, train=0):        if train:            CacheStream.__init__(self, tag, directories, msgs.SPAMTRAIN)        else:            CacheStream.__init__(self, tag, directories, msgs.SPAMTEST)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?