tokenizer.py
# square brackets as it should, but every once in awhile it turns up in
# parens.  Yahoo seems to be guilty of this minor infraction:
#   Received: from unknown (66.218.66.218)
#     by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000
received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]')

message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*')

# I'm usually just splitting on whitespace, but for subject lines I want to
# break things like "Python/Perl comparison?" up.  OTOH, I don't want to
# break up the unitized numbers in spammish subject phrases like "Increase
# size 79%" or "Now only $29.95!".  Then again, I do want to break up
# "Python-Dev".  Runs of punctuation are also interesting in subject lines.
subject_word_re = re.compile(r"[\w\x80-\xff$.%]+")
punctuation_run_re = re.compile(r'\W+')

fname_sep_re = re.compile(r'[/\\:]')

def crack_filename(fname):
    yield "fname:" + fname
    components = fname_sep_re.split(fname)
    morethan1 = len(components) > 1
    for component in components:
        if morethan1:
            yield "fname comp:" + component
        pieces = urlsep_re.split(component)
        if len(pieces) > 1:
            for piece in pieces:
                yield "fname piece:" + piece

def tokenize_word(word, _len=len,
                  maxword=options["Tokenizer", "skip_max_word_size"]):
    n = _len(word)
    # Make sure this range matches in tokenize().
    if 3 <= n <= maxword:
        yield word

    elif n >= 3:
        # A long word.

        # Don't want to skip embedded email addresses.
        # An earlier scheme also split up the y in x@y on '.'.  Not splitting
        # improved the f-n rate; the f-p rate didn't care either way.
        if n < 40 and '.' in word and word.count('@') == 1:
            p1, p2 = word.split('@')
            yield 'email name:' + p1
            yield 'email addr:' + p2

        else:
            # There's value in generating a token indicating roughly how
            # many chars were skipped.  This has real benefit for the f-n
            # rate, but is neutral for the f-p rate.  I don't know why!
            # XXX Figure out why, and/or see if some other way of summarizing
            # XXX this info has greater benefit.
            if options["Tokenizer", "generate_long_skips"]:
                yield "skip:%c %d" % (word[0], n // 10 * 10)
            if has_highbit_char(word):
                hicount = 0
                for i in map(ord, word):
                    if i >= 128:
                        hicount += 1
                yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
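
# Illustrative sketch (not part of the original file): roughly what
# tokenize_word() yields for two representative inputs, assuming the usual
# option values -- a skip_max_word_size of 12 (smaller than both example
# words) and generate_long_skips left on.  Wrapped in "if 0:" so it never
# runs as part of the tokenizer itself.
if 0:
    # A long token containing a '.' and exactly one '@' is cracked into
    # email-name / email-addr tokens instead of being skipped.
    assert list(tokenize_word("someone@example.com")) == [
        'email name:someone', 'email addr:example.com']
    # Any other over-long word just produces a coarse "skip" summary token
    # recording its first char and length rounded down to a multiple of 10.
    assert list(tokenize_word("x" * 25)) == ['skip:x 20']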

# Generate tokens for:
#    Content-Type
#        and its type= param
#    Content-Disposition
#        and its filename= param
#    all the charsets
#
# This has huge benefit for the f-n rate, and virtually no effect on the f-p
# rate, although it does reduce the variance of the f-p rate across different
# training sets (really marginal msgs, like a brief HTML msg saying just
# "unsubscribe me", are almost always tagged as spam now; before they were
# right on the edge, and now the multipart/alternative pushes them over it
# more consistently).
#
# XXX I put all of this in as one chunk.  I don't know which parts are
# XXX most effective; it could be that some parts don't help at all.  But
# XXX given the nature of the c.l.py tests, it's not surprising that the
# XXX     'content-type:text/html'
# XXX token is now the single most powerful spam indicator (== makes it
# XXX into the nbest list most often).  What *is* a little surprising is
# XXX that this doesn't push more mixed-type msgs into the f-p camp --
# XXX unlike looking at *all* HTML tags, this is just one spam indicator
# XXX instead of dozens, so relevant msg content can cancel it out.
#
# A bug in this code prevented Content-Transfer-Encoding from getting
# picked up.  Fixing that bug showed that it didn't help, so the corrected
# code is disabled now (left column without Content-Transfer-Encoding,
# right column with it):
#
# false positive percentages
#     0.000  0.000  tied
#     0.000  0.000  tied
#     0.100  0.100  tied
#     0.000  0.000  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.100  0.100  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.050  0.050  tied
#     0.100  0.100  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.025  0.025  tied
#     0.000  0.025  lost  +(was 0)
#     0.025  0.025  tied
#     0.100  0.100  tied
#
# won   0 times
# tied 19 times
# lost  1 times
#
# total unique fp went from 9 to 10
#
# false negative percentages
#     0.364  0.400  lost    +9.89%
#     0.400  0.364  won     -9.00%
#     0.400  0.436  lost    +9.00%
#     0.909  0.872  won     -4.07%
#     0.836  0.836  tied
#     0.618  0.618  tied
#     0.291  0.291  tied
#     1.018  0.981  won     -3.63%
#     0.982  0.982  tied
#     0.727  0.727  tied
#     0.800  0.800  tied
#     1.163  1.127  won     -3.10%
#     0.764  0.836  lost    +9.42%
#     0.473  0.473  tied
#     0.473  0.618  lost   +30.66%
#     0.727  0.763  lost    +4.95%
#     0.655  0.618  won     -5.65%
#     0.509  0.473  won     -7.07%
#     0.545  0.582  lost    +6.79%
#     0.509  0.509  tied
#
# won   6 times
# tied  8 times
# lost  6 times
#
# total unique fn went from 168 to 169

# For support of the replace_nonascii_chars option, build a string.translate
# table that maps all high-bit chars and control chars to a '?' character.
non_ascii_translate_tab = ['?'] * 256
# leave blank up to (but not including) DEL alone
for i in range(32, 127):
    non_ascii_translate_tab[i] = chr(i)
# leave "normal" whitespace alone
for ch in ' \t\r\n':
    non_ascii_translate_tab[ord(ch)] = ch
del i, ch

non_ascii_translate_tab = ''.join(non_ascii_translate_tab)

def crack_content_xyz(msg):
    yield 'content-type:' + msg.get_content_type()

    x = msg.get_param('type')
    if x is not None:
        yield 'content-type/type:' + x.lower()

    try:
        for x in msg.get_charsets(None):
            if x is not None:
                yield 'charset:' + x.lower()
    except UnicodeEncodeError:
        # Bad messages can cause an exception here.
        # See [ 1175439 ] UnicodeEncodeError raised for bogus Content-Type
        # header
        yield 'charset:invalid_unicode'

    x = msg.get('content-disposition')
    if x is not None:
        yield 'content-disposition:' + x.lower()

    try:
        fname = msg.get_filename()
        if fname is not None:
            for x in crack_filename(fname):
                yield 'filename:' + x
    except TypeError:
        # bug in email pkg?  see the thread beginning at
        # http://mail.python.org/pipermail/spambayes/2003-September/008006.html
        # and
        # http://mail.python.org/pipermail/spambayes-dev/2003-September/001177.html
        yield "filename:<bogus>"

    if 0:   # disabled; see comment before function
        x = msg.get('content-transfer-encoding')
        if x is not None:
            yield 'content-transfer-encoding:' + x.lower()

# The base64 decoder is actually very forgiving, but flubs one case:
# if no padding is required (no trailing '='), it continues to read
# following lines as if they were still part of the base64 part.  We're
# actually stricter here.  The *point* is that some mailers tack plain
# text on to the end of base64-encoded text sections.

# Match a line of base64, up to & including the trailing newline.
# We allow for optional leading and trailing whitespace, and don't care
# about line length, but other than that are strict.  Group 1 is non-empty
# after a match iff the last significant char on the line is '='; in that
# case, it must be the last line of the base64 section.
base64_re = re.compile(r"""
    [ \t]*
    [a-zA-Z0-9+/]*
    (=*)
    [ \t]*
    \r?
    \n
""", re.VERBOSE)

def try_to_repair_damaged_base64(text):
    i = 0
    while True:
        # text[:i] looks like base64.  Does the line starting at i also?
        m = base64_re.match(text, i)
        if not m:
            break
        i = m.end()
        if m.group(1):
            # This line has a trailing '=' -- the base64 part is done.
            break
    base64text = ''
    if i:
        base64 = text[:i]
        try:
            base64text = binascii.a2b_base64(base64)
        except:
            # There's no point in tokenizing raw base64 gibberish.
            pass
    return base64text + text[i:]
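
# Illustrative sketch (not part of the original file): how
# try_to_repair_damaged_base64() handles plain text tacked onto the end of a
# base64 section.  Note this is Python 2-era code -- binascii.a2b_base64()
# returns a str there, so the decoded text and the trailing plain text
# concatenate directly.  Wrapped in "if 0:" so it never runs as part of the
# tokenizer itself.
if 0:
    # "aGVsbG8=" decodes to "hello"; the trailing '=' marks the end of the
    # base64 part, and the tacked-on line survives for normal tokenization.
    assert try_to_repair_damaged_base64("aGVsbG8=\nBuy now!\n") == \
           "helloBuy now!\n"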

def breakdown_host(host):
    parts = host.split('.')
    for i in range(1, len(parts) + 1):
        yield '.'.join(parts[-i:])

def breakdown_ipaddr(ipaddr):
    parts = ipaddr.split('.')
    for i in range(1, 5):
        yield '.'.join(parts[:i])

def log2(n, log=math.log, c=math.log(2)):
    return log(n)/c

class Stripper(object):

    # The retained portions are catenated together with self.separator.
    # CAUTION:  This used to be blank.  But then I noticed spam putting
    # HTML comments embedded in words, like
    #     FR<!--slkdflskjf-->EE!
    # Breaking this into "FR" and "EE!" wasn't a real help <wink>.
    separator = ''  # a subclass can override if this isn't appropriate

    def __init__(self, find_start, find_end):
        # find_start and find_end have signature
        #     string, int -> match_object
        # where the search starts at string[int:int].  If a match isn't found,
        # they must return None.  The match_object for find_start, if not
        # None, is passed to self.tokenize, which returns a (possibly empty)
        # list of tokens to generate.  Subclasses may override tokenize().
        # Text between find_start and find_end is thrown away, except for
        # whatever tokenize() produces.  A match_object must support method
        #     span() -> int, int    # the slice bounds of what was matched
        self.find_start = find_start
        self.find_end = find_end

    # Efficiency note:  This is cheaper than it looks if there aren't any
    # special sections.  Under the covers, string[0:] is optimized to
    # return string (no new object is built), and likewise ' '.join([string])
    # is optimized to return string.  It would actually slow this code down
    # to special-case these "do nothing" special cases at the Python level!
    def analyze(self, text):
        i = 0
        retained = []
        pushretained = retained.append
        tokens = []
        while True:
            m = self.find_start(text, i)
            if not m:
                pushretained(text[i:])
                break
            start, end = m.span()
            pushretained(text[i : start])
            tokens.extend(self.tokenize(m))
            m = self.find_end(text, end)
            if not m:
                # No matching end - act as if the open
                # tag did not exist.
                pushretained(text[start:])
                break
            dummy, i = m.span()
        return self.separator.join(retained), tokens

    def tokenize(self, match_object):
        # Override this if you want to suck info out of the start pattern.
        return []

# Strip out uuencoded sections and produce tokens.  The return value
# is (new_text, sequence_of_tokens), where new_text no longer contains
# uuencoded stuff.  Note that we're not bothering to decode it!  Maybe
# we should.  One of my persistent false negatives is a spam containing
# nothing but a uuencoded money.txt; OTOH, uuencode seems to be on
# its way out (that's an old spam).

uuencode_begin_re = re.compile(r"""
    ^begin \s+
    (\S+) \s+   # capture mode
    (\S+) \s*   # capture filename
    $
""", re.VERBOSE | re.MULTILINE)

uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)

class UUencodeStripper(Stripper):
    def __init__(self):
        Stripper.__init__(self, uuencode_begin_re.search,
                                uuencode_end_re.search)

    def tokenize(self, m):
        mode, fname = m.groups()
        return (['uuencode mode:%s' % mode] +
                ['uuencode:%s' % x for x in crack_filename(fname)])

crack_uuencode = UUencodeStripper().analyze
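
# Illustrative sketch (not part of the original file): the progressively
# coarser tokens breakdown_host() and breakdown_ipaddr() generate, and the
# (new_text, tokens) pair crack_uuencode() returns.  The later
# "uuencode:..." tokens come from crack_filename() and so depend on
# urlsep_re, defined earlier in the full tokenizer.py; only the first token
# is checked here.  Wrapped in "if 0:" so it never runs as part of the
# tokenizer itself.
if 0:
    assert list(breakdown_host("m19.grp.scd.yahoo.com")) == [
        'com', 'yahoo.com', 'scd.yahoo.com',
        'grp.scd.yahoo.com', 'm19.grp.scd.yahoo.com']
    assert list(breakdown_ipaddr("66.218.66.218")) == [
        '66', '66.218', '66.218.66', '66.218.66.218']
    # The uuencoded body is stripped out of the text; the surrounding prose
    # is retained (catenated with the empty separator) for ordinary
    # tokenization, and the begin line yields mode/filename tokens.
    text, tokens = crack_uuencode("hi\nbegin 644 money.txt\nM9m]O\n`\nend\nbye\n")
    assert text == "hi\nbye\n"
    assert tokens[0] == 'uuencode mode:644'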