classifier.py
#! /usr/bin/env python

from __future__ import generators

# An implementation of a Bayes-like spam classifier.
#
# Paul Graham's original description:
#
#     http://www.paulgraham.com/spam.html
#
# A highly fiddled version of that can be retrieved from our CVS repository,
# via tag Last-Graham.  This made many demonstrated improvements in error
# rates over Paul's original description.
#
# This code implements Gary Robinson's suggestions, the core of which are
# well explained on his webpage:
#
#     http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
#
# This is theoretically cleaner, and in testing has performed at least as
# well as our highly tuned Graham scheme did, often slightly better, and
# sometimes much better.  It also has "a middle ground", which people like:
# the scores under Paul's scheme were almost always very near 0 or very near
# 1, whether or not the classification was correct.  The false positives
# and false negatives under Gary's basic scheme (use_gary_combining)
# generally score in a narrow range around the corpus's best spam_cutoff
# value.  However, it doesn't appear possible to guess the best spam_cutoff
# value in advance, and it's touchy.
#
# The last version of the Gary-combining scheme can be retrieved from our
# CVS repository via tag Last-Gary.
#
# The chi-combining scheme used by default here gets closer to the
# theoretical basis of Gary's combining scheme, and does give extreme
# scores, but also has a very useful middle ground (small # of msgs spread
# across a large range of scores, and good cutoff values aren't touchy).
#
# This implementation is due to Tim Peters et alia.

import math

try:
    # We have three possibilities for Set:
    #  (a) With Python 2.2 and earlier, we use our compatsets class
    #  (b) With Python 2.3, we use the sets.Set class
    #  (c) With Python 2.4 and later, we use the builtin set class
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

# XXX At time of writing, these are only necessary for the
# XXX experimental url retrieving/slurping code.  If that
# XXX gets ripped out, either rip these out, or run
# XXX PyChecker over the code.
import re
import os
import sys
import socket
import pickle
import urllib2
from email import message_from_string

try:
    enumerate
except NameError:
    def enumerate(seq):
        i = 0
        for elt in seq:
            yield (i, elt)
            i += 1

DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
URL_KEY_RE = re.compile(r"[\W]")
# XXX ---- ends ----

from spambayes.Options import options
from spambayes.chi2 import chi2Q

try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


LN2 = math.log(2)       # used frequently by chi-combining

slurp_wordstream = None

PICKLE_VERSION = 5


class WordInfo(object):
    # A WordInfo is created for each distinct word.  spamcount is the
    # number of trained spam msgs in which the word appears, and hamcount
    # the number of trained ham msgs.
    #
    # Invariant:  For use in a classifier database, at least one of
    # spamcount and hamcount must be non-zero.
    #
    # Important:  This is a tiny object.  Use of __slots__ is essential
    # to conserve memory.
    __slots__ = 'spamcount', 'hamcount'

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return "WordInfo" + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return self.spamcount, self.hamcount

    def __setstate__(self, t):
        self.spamcount, self.hamcount = t
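
# An illustrative round trip of the state protocol above (a sketch only;
# exact pickling internals vary by Python version, but the tuple returned
# by __getstate__ is all that needs to travel):
#
#     >>> w = WordInfo()
#     >>> w.spamcount, w.hamcount = 3, 1
#     >>> w
#     WordInfo(3, 1)
#     >>> import pickle
#     >>> pickle.loads(pickle.dumps(w)).__getstate__()
#     (3, 1)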

class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.
    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease":  if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
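        #
        # A worked sketch of the combining math (an illustration restating
        # only what the code below computes):  given clue probabilities
        # p_1 .. p_n,
        #
        #     S = 1 - chi2Q(-2 * sum(ln(1 - p_i)), 2*n)   # spam evidence
        #     H = 1 - chi2Q(-2 * sum(ln(p_i)),     2*n)   # ham evidence
        #     score = (S - H + 1) / 2
        #
        # With uniformly strong spam clues (p_i near 1), sum(ln(1 - p_i))
        # is hugely negative, chi2Q sees a huge statistic and returns ~0,
        # so S -> 1 while H -> 0 and the score approaches 1.  With strong
        # clues on *both* sides, S and H both approach 1 and the score
        # lands near 0.5 -- the useful "middle ground" described above.
        #
        # The frexp trick, by example:  frexp(1e-210) is roughly
        # (0.66, -697), i.e. 1e-210 == 0.66... * 2**-697.  The mantissa is
        # renormalized into [0.5, 1.0), safely away from underflow, while
        # the lost magnitude accumulates in an integer exponent instead.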
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).
            # A systematic problem is that we could end up being
            # near-certain a thing was (for example) spam, even if S was
            # small, provided that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with the tokens from whatever is
        # at the URL's destination.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam
        is True, you're telling the classifier this message is definitely
        spam, else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
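
# A minimal usage sketch of the classifier API (an illustration; it assumes
# the full SpamBayes package, whose tokenizer and training plumbing are not
# shown in this excerpt, and the message strings are made-up placeholders):
#
#     from spambayes.classifier import Classifier
#     from spambayes.tokenizer import tokenize
#
#     c = Classifier()
#     c.learn(tokenize(a_spam_message), True)     # train on one spam
#     c.learn(tokenize(a_ham_message), False)     # train on one ham
#     score = c.spamprob(tokenize(an_unseen_message))
#     # score is a float in [0.0, 1.0]; compare it against the configured
#     # ham_cutoff/spam_cutoff to call the message ham, unsure, or spam.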