classifier.py
#! /usr/bin/env python

from __future__ import generators

# An implementation of a Bayes-like spam classifier.
#
# Paul Graham's original description:
#
#     http://www.paulgraham.com/spam.html
#
# A highly fiddled version of that can be retrieved from our CVS repository,
# via tag Last-Graham.  This made many demonstrated improvements in error
# rates over Paul's original description.
#
# This code implements Gary Robinson's suggestions, the core of which are
# well explained on his webpage:
#
#     http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
#
# This is theoretically cleaner, and in testing has performed at least as
# well as our highly tuned Graham scheme did, often slightly better, and
# sometimes much better.  It also has "a middle ground", which people like:
# the scores under Paul's scheme were almost always very near 0 or very near
# 1, whether or not the classification was correct.  The false positives
# and false negatives under Gary's basic scheme (use_gary_combining)
# generally score in a narrow range around the corpus's best spam_cutoff
# value.  However, it doesn't appear possible to guess the best spam_cutoff
# value in advance, and it's touchy.
#
# The last version of the Gary-combining scheme can be retrieved from our
# CVS repository via tag Last-Gary.
#
# The chi-combining scheme used by default here gets closer to the
# theoretical basis of Gary's combining scheme, and does give extreme
# scores, but also has a very useful middle ground (small # of msgs spread
# across a large range of scores, and good cutoff values aren't touchy).
#
# This implementation is due to Tim Peters et alia.

import math

try:
    # We have three possibilities for Set:
    #  (a) With Python 2.2 and earlier, we use our compatsets class
    #  (b) With Python 2.3, we use the sets.Set class
    #  (c) With Python 2.4 and later, we use the builtin set class
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

# XXX At time of writing, these are only necessary for the
# XXX experimental url retrieving/slurping code.  If that
# XXX gets ripped out, either rip these out, or run
# XXX PyChecker over the code.
import re
import os
import sys
import socket
import pickle
import urllib2
from email import message_from_string

try:
    enumerate
except NameError:
    def enumerate(seq):
        i = 0
        for elt in seq:
            yield (i, elt)
            i += 1

DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
URL_KEY_RE = re.compile(r"[\W]")
# XXX ---- ends ----

from spambayes.Options import options
from spambayes.chi2 import chi2Q

try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


LN2 = math.log(2)       # used frequently by chi-combining

slurp_wordstream = None

PICKLE_VERSION = 5


class WordInfo(object):
    # A WordInfo is created for each distinct word.  spamcount is the
    # number of trained spam msgs in which the word appears, and hamcount
    # the number of trained ham msgs.
    #
    # Invariant:  For use in a classifier database, at least one of
    # spamcount and hamcount must be non-zero.
    #
    # Important:  This is a tiny object.  Use of __slots__ is essential
    # to conserve memory.
    __slots__ = 'spamcount', 'hamcount'

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return "WordInfo" + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return self.spamcount, self.hamcount

    def __setstate__(self, t):
        self.spamcount, self.hamcount = t
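
# An illustrative round trip of the state protocol above (a sketch only;
# exact pickling internals vary by Python version, but the tuple returned
# by __getstate__ is all that needs to travel):
#
#     >>> w = WordInfo()
#     >>> w.spamcount, w.hamcount = 3, 1
#     >>> w
#     WordInfo(3, 1)
#     >>> import pickle
#     >>> pickle.loads(pickle.dumps(w)).__getstate__()
#     (3, 1)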

class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.
    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease":  if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
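        #
        # A worked sketch of the combining math (an illustration restating
        # only what the code below computes):  given clue probabilities
        # p_1 .. p_n,
        #
        #     S = 1 - chi2Q(-2 * sum(ln(1 - p_i)), 2*n)   # spam evidence
        #     H = 1 - chi2Q(-2 * sum(ln(p_i)),     2*n)   # ham evidence
        #     score = (S - H + 1) / 2
        #
        # With uniformly strong spam clues (p_i near 1), sum(ln(1 - p_i))
        # is hugely negative, chi2Q sees a huge statistic and returns ~0,
        # so S -> 1 while H -> 0 and the score approaches 1.  With strong
        # clues on *both* sides, S and H both approach 1 and the score
        # lands near 0.5 -- the useful "middle ground" described above.
        #
        # The frexp trick, by example:  frexp(1e-210) is roughly
        # (0.66, -697), i.e. 1e-210 == 0.66... * 2**-697.  The mantissa is
        # renormalized into [0.5, 1.0), safely away from underflow, while
        # the lost magnitude accumulates in an integer exponent instead.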
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).
            # A systematic problem is that we could end up being
            # near-certain a thing was (for example) spam, even if S was
            # small, provided that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with the tokens from whatever is
        # at the URL's destination.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam
        is True, you're telling the classifier this message is definitely
        spam, else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
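
# A minimal usage sketch of the classifier API (an illustration; it assumes
# the full SpamBayes package, whose tokenizer and training plumbing are not
# shown in this excerpt, and the message strings are made-up placeholders):
#
#     from spambayes.classifier import Classifier
#     from spambayes.tokenizer import tokenize
#
#     c = Classifier()
#     c.learn(tokenize(a_spam_message), True)     # train on one spam
#     c.learn(tokenize(a_ham_message), False)     # train on one ham
#     score = c.spamprob(tokenize(an_unseen_message))
#     # score is a float in [0.0, 1.0]; compare it against the configured
#     # ham_cutoff/spam_cutoff to call the message ham, unsure, or spam.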