# tokenizer.py
        # If there was any invalid line, we record as invalid.
        # If all nine lines were correct, we record as valid.
        # Otherwise we ignore.
        if invalid_habeas == True:
            yield "x-habeas-swe:invalid"
        elif valid_habeas == 9:
            yield "x-habeas-swe:valid"

        # Subject:
        # Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is
        # especially significant in this context.  Experiment showed a small
        # but real benefit to keeping case intact here.
        x = msg.get('subject', '')
        try:
            subjcharsetlist = email.Header.decode_header(x)
        except (binascii.Error, email.Errors.HeaderParseError):
            subjcharsetlist = [(x, 'invalid')]
        for x, subjcharset in subjcharsetlist:
            if subjcharset is not None:
                yield 'subjectcharset:' + subjcharset
            # This is a workaround for a bug in the csv module in Python
            # <= 2.3.4 and 2.4.0 (fixed in 2.5).
            x = x.replace('\r', ' ')
            for w in subject_word_re.findall(x):
                for t in tokenize_word(w):
                    yield 'subject:' + t
            for w in punctuation_run_re.findall(x):
                yield 'subject:' + w

        # Dang -- I can't use Sender:.  If I do,
        #     'sender:email name:python-list-admin'
        # becomes the most powerful indicator in the whole database.
        #
        # From:      # this helps both rates
        # Reply-To:  # my error rates are too low now to tell about this
        #            # one (small wins & losses across runs, overall
        #            # not significant), so leaving it out
        # To:, Cc:   # These can help, if your ham and spam are sourced
        #            # from the same location.  If not, they'll be horrible.
        for field in options["Tokenizer", "address_headers"]:
            addrlist = msg.get_all(field, [])
            if not addrlist:
                yield field + ":none"
                continue

            noname_count = 0
            for name, addr in email.Utils.getaddresses(addrlist):
                if name:
                    try:
                        subjcharsetlist = email.Header.decode_header(name)
                    except (binascii.Error, email.Errors.HeaderParseError):
                        subjcharsetlist = [(name, 'invalid')]
                    for name, charset in subjcharsetlist:
                        yield "%s:name:%s" % (field, name.lower())
                        if charset is not None:
                            yield "%s:charset:%s" % (field, charset)
                else:
                    noname_count += 1
                if addr:
                    for w in addr.lower().split('@'):
                        yield "%s:addr:%s" % (field, w)
                else:
                    yield field + ":addr:none"

            if noname_count:
                yield "%s:no real name:2**%d" % (field,
                                                 round(log2(noname_count)))

        # Spammers sometimes send out mail alphabetically to fairly large
        # numbers of addresses.  This results in headers like:
        #   To: <itinerart@videotron.ca>
        #   Cc: <itinerant@skyful.com>, <itinerant@netillusions.net>,
        #       <itineraries@musi-cal.com>, <itinerario@rullet.leidenuniv.nl>,
        #       <itinerance@sorengo.com>
        #
        # This token attempts to exploit that property.  The above would
        # give a common prefix of "itinera" for 6 addresses, yielding a
        # gross score of 42.  We group scores into buckets by dividing by 10
        # to yield a final token value of "pfxlen:4".  The length test
        # eliminates the bad case where the message was sent to a single
        # individual.
        if options["Tokenizer", "summarize_email_prefixes"]:
            all_addrs = []
            addresses = msg.get_all('to', []) + msg.get_all('cc', [])
            for name, addr in email.Utils.getaddresses(addresses):
                all_addrs.append(addr.lower())

            if len(all_addrs) > 1:
                # Don't be fooled by "os.path." - commonprefix
                # operates char-by-char!
                pfx = os.path.commonprefix(all_addrs)
                if pfx:
                    score = (len(pfx) * len(all_addrs)) // 10
                    # After staring at pfxlen:* values generated from a large
                    # number of ham & spam I saw that any scores greater
                    # than 3 were always associated with spam.  Collapsing
                    # all such scores into a single token avoids a bunch of
                    # hapaxes like "pfxlen:28".
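                    # Worked example of the arithmetic above (addresses taken
                    # from the header example in the comment; the doctest-style
                    # lines are an editorial sketch, not part of the original
                    # module): six recipients sharing the prefix "itinera"
                    # give len("itinera") * 6 == 42, and 42 // 10 == 4, so
                    # the yield below would produce "pfxlen:4".
                    #
                    #   >>> import os.path
                    #   >>> os.path.commonprefix(['itinerart@videotron.ca',
                    #   ...                        'itinerant@skyful.com'])
                    #   'itinera'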
                    if score > 3:
                        yield "pfxlen:big"
                    else:
                        yield "pfxlen:%d" % score

        # Same idea as above, but works for addresses in the same domain,
        # like:
        #   To: "skip" <bugs@mojam.com>, <chris@mojam.com>,
        #       <concertmaster@mojam.com>, <concerts@mojam.com>,
        #       <design@mojam.com>, <rob@mojam.com>, <skip@mojam.com>
        if options["Tokenizer", "summarize_email_suffixes"]:
            all_addrs = []
            addresses = msg.get_all('to', []) + msg.get_all('cc', [])
            for name, addr in email.Utils.getaddresses(addresses):
                # Flip the address so the following logic is the same as
                # that for prefixes.
                addr = list(addr)
                addr.reverse()
                addr = "".join(addr)
                all_addrs.append(addr.lower())

            if len(all_addrs) > 1:
                # Don't be fooled by "os.path." - commonprefix
                # operates char-by-char!
                sfx = os.path.commonprefix(all_addrs)
                if sfx:
                    score = (len(sfx) * len(all_addrs)) // 10
                    # Similar analysis as above regarding suffix length.
                    # I suspect the best cutoff is probably dependent on
                    # how long the recipient domain is (e.g. "mojam.com" vs.
                    # "montanaro.dyndns.org").
                    if score > 5:
                        yield "sfxlen:big"
                    else:
                        yield "sfxlen:%d" % score

        # To:
        # Cc:
        # Count the number of addresses in each of the recipient headers.
        for field in ('to', 'cc'):
            count = 0
            for addrs in msg.get_all(field, []):
                count += len(addrs.split(','))
            if count > 0:
                yield '%s:2**%d' % (field, round(log2(count)))

        # These headers seem to work best if they're not tokenized: just
        # normalize case and whitespace.
        # X-Mailer:  This is a pure and significant win for the f-n rate;
        #            the f-p rate isn't affected.
        for field in ('x-mailer',):
            prefix = field + ':'
            x = msg.get(field, 'none').lower()
            yield prefix + ' '.join(x.split())

        # Received:
        # Neil Schemenauer reports good results from this.
        if options["Tokenizer", "mine_received_headers"]:
            for header in msg.get_all("received", ()):
                # Everything here should be case-insensitive and not be
                # split across continuation lines, so normalize whitespace
                # and letter case just once per header.
                header = ' '.join(header.split()).lower()

                for clue in received_complaints_re.findall(header):
                    yield 'received:' + clue

                for pat, breakdown in [(received_host_re, breakdown_host),
                                       (received_ip_re, breakdown_ipaddr)]:
                    m = pat.search(header)
                    if m:
                        for tok in breakdown(m.group(1)):
                            yield 'received:' + tok

        # Message-Id:  This seems to be a small win and should not
        # adversely affect a mixed-source corpus, so it's always enabled.
        msgid = msg.get("message-id", "")
        m = message_id_re.match(msgid)
        if m:
            # Looks okay; yield the hostname.
            yield 'message-id:@%s' % m.group(1)
        else:
            # Might be weird instead of invalid, but who cares?
            yield 'message-id:invalid'

        # As suggested by Anthony Baxter, merely counting the number of
        # header lines, and in a case-sensitive way, has real value.
        # For example, all-caps SUBJECT is a strong spam clue, while
        # X-Complaints-To is a strong ham clue.
        x2n = {}
        if options["Tokenizer", "count_all_header_lines"]:
            for x in msg.keys():
                x2n[x] = x2n.get(x, 0) + 1
        else:
            # Do a "safe" approximation to that.  When spam and ham are
            # collected from different sources, the count of some header
            # lines can be too strong a discriminator for accidental
            # reasons.
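            # Worked example (hypothetical message; this doctest-style
            # sketch is editorial, not part of the original module):
            #
            #   >>> x2n = {}
            #   >>> for name in ['Received', 'Received', 'Subject']:
            #   ...     x2n[name] = x2n.get(name, 0) + 1
            #   >>> sorted(x2n.items())
            #   [('Received', 2), ('Subject', 1)]
            #
            # which the loop below would turn into "header:Received:2" and
            # "header:Subject:1", assuming both names appear in the
            # safe_headers option.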
            safe_headers = options["Tokenizer", "safe_headers"]
            for x in msg.keys():
                if x.lower() in safe_headers:
                    x2n[x] = x2n.get(x, 0) + 1
        for x in x2n.items():
            yield "header:%s:%d" % x
        if options["Tokenizer", "record_header_absence"]:
            for k in x2n:
                if k.lower() not in options["Tokenizer", "safe_headers"]:
                    yield "noheader:" + k

    def tokenize_text(self, text, maxword=options["Tokenizer",
                                                  "skip_max_word_size"]):
        """Tokenize everything in the chunk of text we were handed."""
        short_runs = Set()
        short_count = 0
        for w in text.split():
            n = len(w)
            if n < 3:
                # Count how many short words we see in a row - meant to
                # latch onto crap like this:
                #   X j A m N j A d X h
                #   M k E z R d I p D u I m A c
                #   C o I d A t L j I v S j
                short_count += 1
            else:
                if short_count:
                    short_runs.add(short_count)
                    short_count = 0
                # Make sure this range matches in tokenize_word().
                if 3 <= n <= maxword:
                    yield w
                elif n >= 3:
                    for t in tokenize_word(w):
                        yield t
        if short_runs and options["Tokenizer", "x-short_runs"]:
            yield "short:%d" % int(log2(max(short_runs)))

    def tokenize_body(self, msg):
        """Generate a stream of tokens from an email Message.

        If options['Tokenizer', 'check_octets'] is True, the first few
        undecoded characters of application/octet-stream parts of the
        message body become tokens.
        """
        if options["Tokenizer", "check_octets"]:
            # Find and decode application/octet-stream parts of the body,
            # tokenizing the first few characters of each chunk.
            for part in octetparts(msg):
                try:
                    text = part.get_payload(decode=True)
                except:
                    yield "control: couldn't decode octet"
                    text = part.get_payload(decode=False)

                if text is None:
                    yield "control: octet payload is None"
                    continue

                yield "octet:%s" % text[:options["Tokenizer",
                                                 "octet_prefix_size"]]

        parts = imageparts(msg)
        if options["Tokenizer", "x-image_size"]:
            # Find image/* parts of the body, calculating the log(size) of
            # each image.
            total_len = 0
            for part in parts:
                try:
                    text = part.get_payload(decode=True)
                except:
                    yield "control: couldn't decode image"
                    text = part.get_payload(decode=False)
                total_len += len(text or "")
                if text is None:
                    yield "control: image payload is None"
            if total_len:
                yield "image-size:2**%d" % round(log2(total_len))

        if options["Tokenizer", "x-crack_images"]:
            from spambayes.ImageStripper import crack_images
            text, tokens = crack_images(parts)
            for t in tokens:
                yield t
            for t in self.tokenize_text(text):
                yield t

        # Find, decode (base64, qp), and tokenize textual parts of the body.
        for part in textparts(msg):
            # Decode, or take it as-is if decoding fails.
            try:
                text = part.get_payload(decode=True)
            except:
                yield "control: couldn't decode"
                text = part.get_payload(decode=False)
                if text is not None:
                    text = try_to_repair_damaged_base64(text)

            if text is None:
                yield 'control: payload is None'
                continue

            # Replace numeric character entities (like &#97; for the
            # letter 'a').
            text = numeric_entity_re.sub(numeric_entity_replacer, text)

            # Normalize case.
            text = text.lower()

            if options["Tokenizer", "replace_nonascii_chars"]:
                # Replace high-bit chars and control chars with '?'.
                text = text.translate(non_ascii_translate_tab)

            for t in find_html_virus_clues(text):
                yield "virus:%s" % t

            # Get rid of uuencoded sections, embedded URLs, <style gimmicks,
            # and HTML comments.
            for cracker in (crack_uuencode,
                            crack_urls,
                            crack_html_style,
                            crack_html_comment,
                            crack_noframes):
                text, tokens = cracker(text)
                for t in tokens:
                    yield t

            # Remove HTML/XML tags.  Also &nbsp;.  <br> and <p> tags should
            # create a space too.
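            # A minimal sketch of the substitution below.  The real
            # breaking_entity_re is defined elsewhere in this module; the
            # pattern here is an illustrative assumption, not its actual
            # definition:
            #
            #   >>> import re
            #   >>> demo_breaking_re = re.compile(r'&nbsp;|<br>|<p>', re.I)
            #   >>> demo_breaking_re.sub(' ', 'free<br>pills&nbsp;now')
            #   'free pills now'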
            text = breaking_entity_re.sub(' ', text)
            # It's important to eliminate HTML tags rather than, e.g.,
            # replace them with a blank (as this code used to do), else
            # simple tricks like
            #    Wr<!$FS|i|R3$s80sA >inkle  Reduc<!$FS|i|R3$s80sA >tion
            # can be used to disguise words.  <br> and <p> were special-
            # cased to insert a blank.
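            # Illustration of why elimination beats blank-substitution
            # (demo_tag_re is an editorial assumption, not this module's
            # real tag pattern):
            #
            #   >>> import re
            #   >>> demo_tag_re = re.compile(r'<[^>]*>')
            #   >>> demo_tag_re.sub('', 'Wr<!$FS|i|R3$s80sA >inkle')
            #   'Wrinkle'
            #
            # whereas substituting a blank would give 'Wr inkle', hiding
            # the word from the tokenizer.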