chi2.py

来自「用python实现的邮件过滤器」· Python 代码 · 共 208 行

208 行

import math as _mathtry:    True, Falseexcept NameError:    # Maintain compatibility with Python 2.2    True, False = 1, 0def chi2Q(x2, v, exp=_math.exp, min=min):    """Return prob(chisq >= x2, with v degrees of freedom).    v must be even.    """    assert v & 1 == 0    # XXX If x2 is very large, exp(-m) will underflow to 0.    m = x2 / 2.0    sum = term = exp(-m)    for i in range(1, v//2):        term *= m / i        sum += term    # With small x2 and large v, accumulated roundoff error, plus error in    # the platform exp(), can cause this to spill a few ULP above 1.0.  For    # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this    # point.  Returning a value even a teensy bit over 1.0 is no good.    return min(sum, 1.0)def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp):    "Return value of the unit Gaussian at z."    return exp(-z*z/2.0) / sqrt2pidef normP(z):    """Return area under the unit Gaussian from -inf to z.    This is the probability that a zscore is <= z.    """    # This is very accurate in a fixed-point sense.  For negative z of    # large magnitude (<= -8.3), it returns 0.0, essentially because    # P(-z) is, to machine precision, indistiguishable from 1.0 then.    # sum <- area from 0 to abs(z).    a = abs(float(z))    if a >= 8.3:        sum = 0.5    else:        sum2 = term = a * normZ(a)        z2 = a*a        sum = 0.0        i = 1.0        while sum != sum2:            sum = sum2            i += 2.0            term *= z2 / i            sum2 += term    if z >= 0:        result = 0.5 + sum    else:        result = 0.5 - sum    return resultdef normIQ(p, sqrt=_math.sqrt, ln=_math.log):    """Return z such that the area under the unit Gaussian from z to +inf is p.    Must have 0.0 <= p <= 1.0.    """    assert 0.0 <= p <= 1.0    # This is a low-accuracy rational approximation from Abramowitz & Stegun.    # The absolute error is bounded by 3e-3.    flipped = False    if p > 0.5:        flipped = True        p = 1.0 - p    if p == 0.0:        z = 8.3    else:        t = sqrt(-2.0 * ln(p))        z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2)    if flipped:        z = -z    return zdef normIP(p):    """Return z such that the area under the unit Gaussian from -inf to z is p.    Must have 0.0 <= p <= 1.0.    """    z = normIQ(1.0 - p)    # One Newton step should double the # of good digits.    return z + (p - normP(z)) / normZ(z)def main():    from spambayes.Histogram import Hist    import sys    class WrappedRandom:        # There's no way W-H is equidistributed in 50 dimensions, so use        # Marsaglia-wrapping to shuffle it more.        def __init__(self, baserandom=random.random, tabsize=513):            self.baserandom = baserandom            self.n = tabsize            self.tab = [baserandom() for i in range(tabsize)]            self.next = baserandom()        def random(self):            result = self.next            i = int(result * self.n)            self.next = self.tab[i]            self.tab[i] = self.baserandom()            return result    random = WrappedRandom().random    #from uni import uni as random    #print random    def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):        H = S = 1.0        Hexp = Sexp = 0        for p in ps:            S *= 1.0 - p            H *= p            if S < 1e-200:                S, e = frexp(S)                Sexp += e            if H < 1e-200:                H, e = frexp(H)                Hexp += e        S = ln(S) + Sexp * ln2        H = ln(H) + Hexp * ln2        n = len(ps)        S = 1.0 - chi2Q(-2.0 * S, 2*n)        H = 1.0 - chi2Q(-2.0 * H, 2*n)        return S, H, (S-H + 1.0) / 2.0    warp = 0    bias = 0.99    if len(sys.argv) > 1:        warp = int(sys.argv[1])    if len(sys.argv) > 2:        bias = float(sys.argv[2])    h = Hist(20, lo=0.0, hi=1.0)    s = Hist(20, lo=0.0, hi=1.0)    score = Hist(20, lo=0.0, hi=1.0)    for i in range(5000):        ps = [random() for j in range(50)]        s1, h1, score1 = judge(ps + [bias] * warp)        s.add(s1)        h.add(h1)        score.add(score1)    print "Result for random vectors of 50 probs, +", warp, "forced to", bias    # Should be uniformly distributed on all-random data.    print    print 'H',    h.display()    # Should be uniformly distributed on all-random data.    print    print 'S',    s.display()    # Distribution doesn't really matter.    print    print '(S-H+1)/2',    score.display()def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):    H = S = 1.0    Hexp = Sexp = 0    for p in ps:        S *= 1.0 - p        H *= p        if S < 1e-200:            S, e = frexp(S)            Sexp += e        if H < 1e-200:            H, e = frexp(H)            Hexp += e    S = ln(S) + Sexp * ln2    H = ln(H) + Hexp * ln2    n = len(ps)    probS = chi2Q(-2*S, 2*n)    probH = chi2Q(-2*H, 2*n)    print "P(chisq >= %10g | v=%3d) = %10g" % (-2*S, 2*n, probS)    print "P(chisq >= %10g | v=%3d) = %10g" % (-2*H, 2*n, probH)    S = 1.0 - probS    H = 1.0 - probH    score = (S-H + 1.0) / 2.0    print "spam prob", S    print " ham prob", H    print "(S-H+1)/2", scoreif __name__ == '__main__':    import random    main()

chi2.py - 源码说明

本页面展示了「用python实现的邮件过滤器」中的 chi2.py 源码文件，采用 Python 编程语言编写，共 208 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与python相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?