tokenizer.py
# square brackets as it should, but every once in awhile it turns up in
# parens.  Yahoo seems to be guilty of this minor infraction:
#   Received: from unknown (66.218.66.218)
#     by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000
received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]')

message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*')

# I'm usually just splitting on whitespace, but for subject lines I want to
# break things like "Python/Perl comparison?" up.  OTOH, I don't want to
# break up the unitized numbers in spammish subject phrases like "Increase
# size 79%" or "Now only $29.95!".  Then again, I do want to break up
# "Python-Dev".  Runs of punctuation are also interesting in subject lines.
subject_word_re = re.compile(r"[\w\x80-\xff$.%]+")
punctuation_run_re = re.compile(r'\W+')

fname_sep_re = re.compile(r'[/\\:]')

def crack_filename(fname):
    yield "fname:" + fname
    components = fname_sep_re.split(fname)
    morethan1 = len(components) > 1
    for component in components:
        if morethan1:
            yield "fname comp:" + component
        pieces = urlsep_re.split(component)
        if len(pieces) > 1:
            for piece in pieces:
                yield "fname piece:" + piece

def tokenize_word(word, _len=len,
                  maxword=options["Tokenizer", "skip_max_word_size"]):
    n = _len(word)
    # Make sure this range matches in tokenize().
    if 3 <= n <= maxword:
        yield word

    elif n >= 3:
        # A long word.

        # Don't want to skip embedded email addresses.
        # An earlier scheme also split up the y in x@y on '.'.  Not splitting
        # improved the f-n rate; the f-p rate didn't care either way.
        if n < 40 and '.' in word and word.count('@') == 1:
            p1, p2 = word.split('@')
            yield 'email name:' + p1
            yield 'email addr:' + p2

        else:
            # There's value in generating a token indicating roughly how
            # many chars were skipped.  This has real benefit for the f-n
            # rate, but is neutral for the f-p rate.  I don't know why!
            # XXX Figure out why, and/or see if some other way of summarizing
            # XXX this info has greater benefit.
            if options["Tokenizer", "generate_long_skips"]:
                yield "skip:%c %d" % (word[0], n // 10 * 10)
            if has_highbit_char(word):
                hicount = 0
                for i in map(ord, word):
                    if i >= 128:
                        hicount += 1
                yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
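
# Illustrative sketch (not part of the original file): roughly what
# tokenize_word() yields for two representative inputs, assuming the usual
# option values -- a skip_max_word_size of 12 (smaller than both example
# words) and generate_long_skips left on.  Wrapped in "if 0:" so it never
# runs as part of the tokenizer itself.
if 0:
    # A long token containing a '.' and exactly one '@' is cracked into
    # email-name / email-addr tokens instead of being skipped.
    assert list(tokenize_word("someone@example.com")) == [
        'email name:someone', 'email addr:example.com']
    # Any other over-long word just produces a coarse "skip" summary token
    # recording its first char and length rounded down to a multiple of 10.
    assert list(tokenize_word("x" * 25)) == ['skip:x 20']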

# Generate tokens for:
#    Content-Type
#        and its type= param
#    Content-Disposition
#        and its filename= param
#    all the charsets
#
# This has huge benefit for the f-n rate, and virtually no effect on the f-p
# rate, although it does reduce the variance of the f-p rate across different
# training sets (really marginal msgs, like a brief HTML msg saying just
# "unsubscribe me", are almost always tagged as spam now; before they were
# right on the edge, and now the multipart/alternative pushes them over it
# more consistently).
#
# XXX I put all of this in as one chunk.  I don't know which parts are
# XXX most effective; it could be that some parts don't help at all.  But
# XXX given the nature of the c.l.py tests, it's not surprising that the
# XXX     'content-type:text/html'
# XXX token is now the single most powerful spam indicator (== makes it
# XXX into the nbest list most often).  What *is* a little surprising is
# XXX that this doesn't push more mixed-type msgs into the f-p camp --
# XXX unlike looking at *all* HTML tags, this is just one spam indicator
# XXX instead of dozens, so relevant msg content can cancel it out.
#
# A bug in this code prevented Content-Transfer-Encoding from getting
# picked up.  Fixing that bug showed that it didn't help, so the corrected
# code is disabled now (left column without Content-Transfer-Encoding,
# right column with it):
#
# false positive percentages
#     0.000  0.000  tied
#     0.000  0.000  tied
#     0.100  0.100  tied
#     0.000  0.000  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.100  0.100  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.050  0.050  tied
#     0.100  0.100  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.000  0.025  lost  +(was 0)
#     0.025  0.025  tied
#     0.100  0.100  tied
#
# won   0 times
# tied 19 times
# lost  1 times
#
# total unique fp went from 9 to 10
#
# false negative percentages
#     0.364  0.400  lost    +9.89%
#     0.400  0.364  won     -9.00%
#     0.400  0.436  lost    +9.00%
#     0.909  0.872  won     -4.07%
#     0.836  0.836  tied
#     0.618  0.618  tied
#     0.291  0.291  tied
#     1.018  0.981  won     -3.63%
#     0.982  0.982  tied
#     0.727  0.727  tied
#     0.800  0.800  tied
#     1.163  1.127  won     -3.10%
#     0.764  0.836  lost    +9.42%
#     0.473  0.473  tied
#     0.473  0.618  lost   +30.66%
#     0.727  0.763  lost    +4.95%
#     0.655  0.618  won     -5.65%
#     0.509  0.473  won     -7.07%
#     0.545  0.582  lost    +6.79%
#     0.509  0.509  tied
#
# won   6 times
# tied  8 times
# lost  6 times
#
# total unique fn went from 168 to 169

# For support of the replace_nonascii_chars option, build a string.translate
# table that maps all high-bit chars and control chars to a '?' character.
non_ascii_translate_tab = ['?'] * 256
# leave blank up to (but not including) DEL alone
for i in range(32, 127):
    non_ascii_translate_tab[i] = chr(i)
# leave "normal" whitespace alone
for ch in ' \t\r\n':
    non_ascii_translate_tab[ord(ch)] = ch
del i, ch

non_ascii_translate_tab = ''.join(non_ascii_translate_tab)

def crack_content_xyz(msg):
    yield 'content-type:' + msg.get_content_type()

    x = msg.get_param('type')
    if x is not None:
        yield 'content-type/type:' + x.lower()

    try:
        for x in msg.get_charsets(None):
            if x is not None:
                yield 'charset:' + x.lower()
    except UnicodeEncodeError:
        # Bad messages can cause an exception here.
        # See [ 1175439 ] UnicodeEncodeError raised for bogus Content-Type
        # header
        yield 'charset:invalid_unicode'

    x = msg.get('content-disposition')
    if x is not None:
        yield 'content-disposition:' + x.lower()

    try:
        fname = msg.get_filename()
        if fname is not None:
            for x in crack_filename(fname):
                yield 'filename:' + x
    except TypeError:
        # bug in email pkg?  see the thread beginning at
        # http://mail.python.org/pipermail/spambayes/2003-September/008006.html
        # and
        # http://mail.python.org/pipermail/spambayes-dev/2003-September/001177.html
        yield "filename:<bogus>"

    if 0:   # disabled; see comment before function
        x = msg.get('content-transfer-encoding')
        if x is not None:
            yield 'content-transfer-encoding:' + x.lower()

# The base64 decoder is actually very forgiving, but flubs one case:
# if no padding is required (no trailing '='), it continues to read
# following lines as if they were still part of the base64 part.  We're
# actually stricter here.  The *point* is that some mailers tack plain
# text on to the end of base64-encoded text sections.

# Match a line of base64, up to & including the trailing newline.
# We allow for optional leading and trailing whitespace, and don't care
# about line length, but other than that are strict.  Group 1 is non-empty
# after a match iff the last significant char on the line is '='; in that
# case, it must be the last line of the base64 section.
base64_re = re.compile(r"""
    [ \t]*
    [a-zA-Z0-9+/]*
    (=*)
    [ \t]*
    \r?
    \n
""", re.VERBOSE)

def try_to_repair_damaged_base64(text):
    i = 0
    while True:
        # text[:i] looks like base64.  Does the line starting at i also?
        m = base64_re.match(text, i)
        if not m:
            break
        i = m.end()
        if m.group(1):
            # This line has a trailing '=' -- the base64 part is done.
            break
    base64text = ''
    if i:
        base64 = text[:i]
        try:
            base64text = binascii.a2b_base64(base64)
        except:
            # There's no point in tokenizing raw base64 gibberish.
            pass
    return base64text + text[i:]
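
# Illustrative sketch (not part of the original file): how
# try_to_repair_damaged_base64() handles plain text tacked onto the end of a
# base64 section.  Note this is Python 2-era code -- binascii.a2b_base64()
# returns a str there, so the decoded text and the trailing plain text
# concatenate directly.  Wrapped in "if 0:" so it never runs as part of the
# tokenizer itself.
if 0:
    # "aGVsbG8=" decodes to "hello"; the trailing '=' marks the end of the
    # base64 part, and the tacked-on line survives for normal tokenization.
    assert try_to_repair_damaged_base64("aGVsbG8=\nBuy now!\n") == \
           "helloBuy now!\n"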

def breakdown_host(host):
    parts = host.split('.')
    for i in range(1, len(parts) + 1):
        yield '.'.join(parts[-i:])

def breakdown_ipaddr(ipaddr):
    parts = ipaddr.split('.')
    for i in range(1, 5):
        yield '.'.join(parts[:i])

def log2(n, log=math.log, c=math.log(2)):
    return log(n)/c

class Stripper(object):

    # The retained portions are catenated together with self.separator.
    # CAUTION:  This used to be blank.  But then I noticed spam putting
    # HTML comments embedded in words, like
    #     FR<!--slkdflskjf-->EE!
    # Breaking this into "FR" and "EE!" wasn't a real help <wink>.
    separator = ''  # a subclass can override if this isn't appropriate

    def __init__(self, find_start, find_end):
        # find_start and find_end have signature
        #     string, int -> match_object
        # where the search starts at string[int:int].  If a match isn't found,
        # they must return None.  The match_object for find_start, if not
        # None, is passed to self.tokenize, which returns a (possibly empty)
        # list of tokens to generate.  Subclasses may override tokenize().
        # Text between find_start and find_end is thrown away, except for
        # whatever tokenize() produces.  A match_object must support method
        #     span() -> int, int    # the slice bounds of what was matched
        self.find_start = find_start
        self.find_end = find_end

    # Efficiency note:  This is cheaper than it looks if there aren't any
    # special sections.  Under the covers, string[0:] is optimized to
    # return string (no new object is built), and likewise ' '.join([string])
    # is optimized to return string.  It would actually slow this code down
    # to special-case these "do nothing" special cases at the Python level!
    def analyze(self, text):
        i = 0
        retained = []
        pushretained = retained.append
        tokens = []
        while True:
            m = self.find_start(text, i)
            if not m:
                pushretained(text[i:])
                break
            start, end = m.span()
            pushretained(text[i : start])
            tokens.extend(self.tokenize(m))
            m = self.find_end(text, end)
            if not m:
                # No matching end - act as if the open
                # tag did not exist.
                pushretained(text[start:])
                break
            dummy, i = m.span()
        return self.separator.join(retained), tokens

    def tokenize(self, match_object):
        # Override this if you want to suck info out of the start pattern.
        return []

# Strip out uuencoded sections and produce tokens.  The return value
# is (new_text, sequence_of_tokens), where new_text no longer contains
# uuencoded stuff.  Note that we're not bothering to decode it!  Maybe
# we should.  One of my persistent false negatives is a spam containing
# nothing but a uuencoded money.txt; OTOH, uuencode seems to be on
# its way out (that's an old spam).

uuencode_begin_re = re.compile(r"""
    ^begin \s+
    (\S+) \s+   # capture mode
    (\S+) \s*   # capture filename
    $
""", re.VERBOSE | re.MULTILINE)

uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)

class UUencodeStripper(Stripper):
    def __init__(self):
        Stripper.__init__(self, uuencode_begin_re.search,
                                uuencode_end_re.search)

    def tokenize(self, m):
        mode, fname = m.groups()
        return (['uuencode mode:%s' % mode] +
                ['uuencode:%s' % x for x in crack_filename(fname)])

crack_uuencode = UUencodeStripper().analyze
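
# Illustrative sketch (not part of the original file): the progressively
# coarser tokens breakdown_host() and breakdown_ipaddr() generate, and the
# (new_text, tokens) pair crack_uuencode() returns.  The later
# "uuencode:..." tokens come from crack_filename() and so depend on
# urlsep_re, defined earlier in the full tokenizer.py; only the first token
# is checked here.  Wrapped in "if 0:" so it never runs as part of the
# tokenizer itself.
if 0:
    assert list(breakdown_host("m19.grp.scd.yahoo.com")) == [
        'com', 'yahoo.com', 'scd.yahoo.com',
        'grp.scd.yahoo.com', 'm19.grp.scd.yahoo.com']
    assert list(breakdown_ipaddr("66.218.66.218")) == [
        '66', '66.218', '66.218.66', '66.218.66.218']
    # The uuencoded body is stripped out of the text; the surrounding prose
    # is retained (catenated with the empty separator) for ordinary
    # tokenization, and the begin line yields mode/filename tokens.
    text, tokens = crack_uuencode("hi\nbegin 644 money.txt\nM9m]O\n`\nend\nbye\n")
    assert text == "hi\nbye\n"
    assert tokens[0] == 'uuencode mode:644'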