# tokenizer.py
        # If there was any invalid line, we record as invalid.
        # If all nine lines were correct, we record as valid.
        # Otherwise we ignore.
        if invalid_habeas == True:
            yield "x-habeas-swe:invalid"
        elif valid_habeas == 9:
            yield "x-habeas-swe:valid"

        # Subject:
        # Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is
        # especially significant in this context.  Experiment showed a small
        # but real benefit to keeping case intact here.
        x = msg.get('subject', '')
        try:
            subjcharsetlist = email.Header.decode_header(x)
        except (binascii.Error, email.Errors.HeaderParseError):
            subjcharsetlist = [(x, 'invalid')]
        for x, subjcharset in subjcharsetlist:
            if subjcharset is not None:
                yield 'subjectcharset:' + subjcharset
            # This is a workaround for a bug in the csv module in Python
            # <= 2.3.4 and 2.4.0 (fixed in 2.5).
            x = x.replace('\r', ' ')
            for w in subject_word_re.findall(x):
                for t in tokenize_word(w):
                    yield 'subject:' + t
            for w in punctuation_run_re.findall(x):
                yield 'subject:' + w

        # Dang -- I can't use Sender:.  If I do,
        #     'sender:email name:python-list-admin'
        # becomes the most powerful indicator in the whole database.
        #
        # From:      # this helps both rates
        # Reply-To:  # my error rates are too low now to tell about this
        #            # one (small wins & losses across runs, overall
        #            # not significant), so leaving it out
        # To:, Cc:   # These can help, if your ham and spam are sourced
        #            # from the same location.  If not, they'll be horrible.
        for field in options["Tokenizer", "address_headers"]:
            addrlist = msg.get_all(field, [])
            if not addrlist:
                yield field + ":none"
                continue

            noname_count = 0
            for name, addr in email.Utils.getaddresses(addrlist):
                if name:
                    try:
                        subjcharsetlist = email.Header.decode_header(name)
                    except (binascii.Error, email.Errors.HeaderParseError):
                        subjcharsetlist = [(name, 'invalid')]
                    for name, charset in subjcharsetlist:
                        yield "%s:name:%s" % (field, name.lower())
                        if charset is not None:
                            yield "%s:charset:%s" % (field, charset)
                else:
                    noname_count += 1
                if addr:
                    for w in addr.lower().split('@'):
                        yield "%s:addr:%s" % (field, w)
                else:
                    yield field + ":addr:none"

            if noname_count:
                yield "%s:no real name:2**%d" % (field,
                                                 round(log2(noname_count)))

        # Spammers sometimes send out mail alphabetically to fairly large
        # numbers of addresses.  This results in headers like:
        #   To: <itinerart@videotron.ca>
        #   Cc: <itinerant@skyful.com>, <itinerant@netillusions.net>,
        #       <itineraries@musi-cal.com>, <itinerario@rullet.leidenuniv.nl>,
        #       <itinerance@sorengo.com>
        #
        # This token attempts to exploit that property.  The above would
        # give a common prefix of "itinera" for 6 addresses, yielding a
        # gross score of 42.  We group scores into buckets by dividing by 10
        # to yield a final token value of "pfxlen:4".  The length test
        # eliminates the bad case where the message was sent to a single
        # individual.
        if options["Tokenizer", "summarize_email_prefixes"]:
            all_addrs = []
            addresses = msg.get_all('to', []) + msg.get_all('cc', [])
            for name, addr in email.Utils.getaddresses(addresses):
                all_addrs.append(addr.lower())

            if len(all_addrs) > 1:
                # Don't be fooled by "os.path." - commonprefix
                # operates char-by-char!
                pfx = os.path.commonprefix(all_addrs)
                if pfx:
                    score = (len(pfx) * len(all_addrs)) // 10
                    # After staring at pfxlen:* values generated from a large
                    # number of ham & spam I saw that any scores greater
                    # than 3 were always associated with spam.  Collapsing
                    # all such scores into a single token avoids a bunch of
                    # hapaxes like "pfxlen:28".
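                    # Worked example of the arithmetic above (addresses taken
                    # from the header example in the comment; the doctest-style
                    # lines are an editorial sketch, not part of the original
                    # module): six recipients sharing the prefix "itinera"
                    # give len("itinera") * 6 == 42, and 42 // 10 == 4, so
                    # the yield below would produce "pfxlen:4".
                    #
                    #   >>> import os.path
                    #   >>> os.path.commonprefix(['itinerart@videotron.ca',
                    #   ...                        'itinerant@skyful.com'])
                    #   'itinera'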
                    if score > 3:
                        yield "pfxlen:big"
                    else:
                        yield "pfxlen:%d" % score

        # Same idea as above, but works for addresses in the same domain,
        # like:
        #   To: "skip" <bugs@mojam.com>, <chris@mojam.com>,
        #       <concertmaster@mojam.com>, <concerts@mojam.com>,
        #       <design@mojam.com>, <rob@mojam.com>, <skip@mojam.com>
        if options["Tokenizer", "summarize_email_suffixes"]:
            all_addrs = []
            addresses = msg.get_all('to', []) + msg.get_all('cc', [])
            for name, addr in email.Utils.getaddresses(addresses):
                # Flip the address so the following logic is the same as
                # that for prefixes.
                addr = list(addr)
                addr.reverse()
                addr = "".join(addr)
                all_addrs.append(addr.lower())

            if len(all_addrs) > 1:
                # Don't be fooled by "os.path." - commonprefix
                # operates char-by-char!
                sfx = os.path.commonprefix(all_addrs)
                if sfx:
                    score = (len(sfx) * len(all_addrs)) // 10
                    # Similar analysis as above regarding suffix length.
                    # I suspect the best cutoff is probably dependent on
                    # how long the recipient domain is (e.g. "mojam.com" vs.
                    # "montanaro.dyndns.org").
                    if score > 5:
                        yield "sfxlen:big"
                    else:
                        yield "sfxlen:%d" % score

        # To:
        # Cc:
        # Count the number of addresses in each of the recipient headers.
        for field in ('to', 'cc'):
            count = 0
            for addrs in msg.get_all(field, []):
                count += len(addrs.split(','))
            if count > 0:
                yield '%s:2**%d' % (field, round(log2(count)))

        # These headers seem to work best if they're not tokenized: just
        # normalize case and whitespace.
        # X-Mailer:  This is a pure and significant win for the f-n rate;
        #            the f-p rate isn't affected.
        for field in ('x-mailer',):
            prefix = field + ':'
            x = msg.get(field, 'none').lower()
            yield prefix + ' '.join(x.split())

        # Received:
        # Neil Schemenauer reports good results from this.
        if options["Tokenizer", "mine_received_headers"]:
            for header in msg.get_all("received", ()):
                # Everything here should be case-insensitive and not be
                # split across continuation lines, so normalize whitespace
                # and letter case just once per header.
                header = ' '.join(header.split()).lower()

                for clue in received_complaints_re.findall(header):
                    yield 'received:' + clue

                for pat, breakdown in [(received_host_re, breakdown_host),
                                       (received_ip_re, breakdown_ipaddr)]:
                    m = pat.search(header)
                    if m:
                        for tok in breakdown(m.group(1)):
                            yield 'received:' + tok

        # Message-Id:  This seems to be a small win and should not
        # adversely affect a mixed-source corpus, so it's always enabled.
        msgid = msg.get("message-id", "")
        m = message_id_re.match(msgid)
        if m:
            # Looks okay; yield the hostname.
            yield 'message-id:@%s' % m.group(1)
        else:
            # Might be weird instead of invalid, but who cares?
            yield 'message-id:invalid'

        # As suggested by Anthony Baxter, merely counting the number of
        # header lines, and in a case-sensitive way, has real value.
        # For example, all-caps SUBJECT is a strong spam clue, while
        # X-Complaints-To is a strong ham clue.
        x2n = {}
        if options["Tokenizer", "count_all_header_lines"]:
            for x in msg.keys():
                x2n[x] = x2n.get(x, 0) + 1
        else:
            # Do a "safe" approximation to that.  When spam and ham are
            # collected from different sources, the count of some header
            # lines can be too strong a discriminator for accidental
            # reasons.
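            # Worked example (hypothetical message; this doctest-style
            # sketch is editorial, not part of the original module):
            #
            #   >>> x2n = {}
            #   >>> for name in ['Received', 'Received', 'Subject']:
            #   ...     x2n[name] = x2n.get(name, 0) + 1
            #   >>> sorted(x2n.items())
            #   [('Received', 2), ('Subject', 1)]
            #
            # which the loop below would turn into "header:Received:2" and
            # "header:Subject:1", assuming both names appear in the
            # safe_headers option.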
            safe_headers = options["Tokenizer", "safe_headers"]
            for x in msg.keys():
                if x.lower() in safe_headers:
                    x2n[x] = x2n.get(x, 0) + 1
        for x in x2n.items():
            yield "header:%s:%d" % x
        if options["Tokenizer", "record_header_absence"]:
            for k in x2n:
                if k.lower() not in options["Tokenizer", "safe_headers"]:
                    yield "noheader:" + k

    def tokenize_text(self, text, maxword=options["Tokenizer",
                                                  "skip_max_word_size"]):
        """Tokenize everything in the chunk of text we were handed."""
        short_runs = Set()
        short_count = 0
        for w in text.split():
            n = len(w)
            if n < 3:
                # Count how many short words we see in a row - meant to
                # latch onto crap like this:
                #   X j A m N j A d X h
                #   M k E z R d I p D u I m A c
                #   C o I d A t L j I v S j
                short_count += 1
            else:
                if short_count:
                    short_runs.add(short_count)
                    short_count = 0
                # Make sure this range matches in tokenize_word().
                if 3 <= n <= maxword:
                    yield w
                elif n >= 3:
                    for t in tokenize_word(w):
                        yield t
        if short_runs and options["Tokenizer", "x-short_runs"]:
            yield "short:%d" % int(log2(max(short_runs)))

    def tokenize_body(self, msg):
        """Generate a stream of tokens from an email Message.

        If options['Tokenizer', 'check_octets'] is True, the first few
        undecoded characters of application/octet-stream parts of the
        message body become tokens.
        """
        if options["Tokenizer", "check_octets"]:
            # Find and decode application/octet-stream parts of the body,
            # tokenizing the first few characters of each chunk.
            for part in octetparts(msg):
                try:
                    text = part.get_payload(decode=True)
                except:
                    yield "control: couldn't decode octet"
                    text = part.get_payload(decode=False)

                if text is None:
                    yield "control: octet payload is None"
                    continue

                yield "octet:%s" % text[:options["Tokenizer",
                                                 "octet_prefix_size"]]

        parts = imageparts(msg)
        if options["Tokenizer", "x-image_size"]:
            # Find image/* parts of the body, calculating the log(size) of
            # each image.
            total_len = 0
            for part in parts:
                try:
                    text = part.get_payload(decode=True)
                except:
                    yield "control: couldn't decode image"
                    text = part.get_payload(decode=False)
                total_len += len(text or "")
                if text is None:
                    yield "control: image payload is None"
            if total_len:
                yield "image-size:2**%d" % round(log2(total_len))

        if options["Tokenizer", "x-crack_images"]:
            from spambayes.ImageStripper import crack_images
            text, tokens = crack_images(parts)
            for t in tokens:
                yield t
            for t in self.tokenize_text(text):
                yield t

        # Find, decode (base64, qp), and tokenize textual parts of the body.
        for part in textparts(msg):
            # Decode, or take it as-is if decoding fails.
            try:
                text = part.get_payload(decode=True)
            except:
                yield "control: couldn't decode"
                text = part.get_payload(decode=False)
                if text is not None:
                    text = try_to_repair_damaged_base64(text)

            if text is None:
                yield 'control: payload is None'
                continue

            # Replace numeric character entities (like &#97; for the
            # letter 'a').
            text = numeric_entity_re.sub(numeric_entity_replacer, text)

            # Normalize case.
            text = text.lower()

            if options["Tokenizer", "replace_nonascii_chars"]:
                # Replace high-bit chars and control chars with '?'.
                text = text.translate(non_ascii_translate_tab)

            for t in find_html_virus_clues(text):
                yield "virus:%s" % t

            # Get rid of uuencoded sections, embedded URLs, <style gimmicks,
            # and HTML comments.
            for cracker in (crack_uuencode,
                            crack_urls,
                            crack_html_style,
                            crack_html_comment,
                            crack_noframes):
                text, tokens = cracker(text)
                for t in tokens:
                    yield t

            # Remove HTML/XML tags.  Also &nbsp;.  <br> and <p> tags should
            # create a space too.
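            # A minimal sketch of the substitution below.  The real
            # breaking_entity_re is defined elsewhere in this module; the
            # pattern here is an illustrative assumption, not its actual
            # definition:
            #
            #   >>> import re
            #   >>> demo_breaking_re = re.compile(r'&nbsp;|<br>|<p>', re.I)
            #   >>> demo_breaking_re.sub(' ', 'free<br>pills&nbsp;now')
            #   'free pills now'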
            text = breaking_entity_re.sub(' ', text)
            # It's important to eliminate HTML tags rather than, e.g.,
            # replace them with a blank (as this code used to do), else
            # simple tricks like
            #    Wr<!$FS|i|R3$s80sA >inkle  Reduc<!$FS|i|R3$s80sA >tion
            # can be used to disguise words.  <br> and <p> were special-
            # cased to insert a blank.
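            # Illustration of why elimination beats blank-substitution
            # (demo_tag_re is an editorial assumption, not this module's
            # real tag pattern):
            #
            #   >>> import re
            #   >>> demo_tag_re = re.compile(r'<[^>]*>')
            #   >>> demo_tag_re.sub('', 'Wr<!$FS|i|R3$s80sA >inkle')
            #   'Wrinkle'
            #
            # whereas substituting a blank would give 'Wr inkle', hiding
            # the word from the tokenizer.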