⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 msgstore.py

📁 用python实现的邮件过滤器
💻 PY
📖 第 1 页 / 共 5 页
字号:
    def getDBKey(self):        # Long lived search key.        return self.searchkey    def __repr__(self):        if self.id is None:            id_str = "(deleted/moved)"        else:            id_str = mapi.HexFromBin(self.id[0]), mapi.HexFromBin(self.id[1])        return "<%s, '%s' id=%s>" % (self.__class__.__name__,                                     self.GetSubject(),                                     id_str)    # as per search-key comments above, we also "enforce" this at the Python    # level.  2 different messages, but one copied from the other, will    # return "==".    # Not being consistent could cause subtle bugs, especially in interactions    # with various test tools.    # Compare the GetID() results if you need to know different messages.    def __hash__(self):        return hash(self.searchkey)    def __eq__(self, other):        ceid = self.msgstore.session.CompareEntryIDs        return ceid(self.searchkey, other.searchkey)    def __ne__(self, other):        return not self.__eq__(other)    def GetID(self):        return mapi.HexFromBin(self.id[0]), mapi.HexFromBin(self.id[1])    def GetSubject(self):        return self.subject    def GetOutlookItem(self):        hex_item_id = mapi.HexFromBin(self.id[1])        hex_store_id = mapi.HexFromBin(self.id[0])        return self.msgstore.outlook.Session.GetItemFromID(hex_item_id, hex_store_id)    def IsFilterCandidate(self):        # We don't attempt to filter:        # * Non-mail items        # * Messages that weren't actually received - this generally means user        #   composed messages yet to be sent, or copies of "sent items".        # It does *not* exclude messages that were user composed, but still        # actually received by the user (ie, when you mail yourself)        # GroupWise generates IPM.Anti-Virus.Report.45 (but I'm not sure how        # it manages given it is an external server, and as far as I can tell,        # this does not appear in the headers.        if test_suite_running:            # While the test suite is running, we *only* filter test msgs.            return self.subject == "SpamBayes addin auto-generated test message"        class_check = self.msgclass.lower()        for check in "ipm.note", "ipm.anti-virus":            if class_check.startswith(check):                break        else:            # Not matching class - no good            return False        # Must match msg class to get here.        return self.was_received    def _GetPotentiallyLargeStringProp(self, prop_id, row):        return GetPotentiallyLargeStringProp(self.mapi_object, prop_id, row)    def _GetMessageText(self):        parts = self._GetMessageTextParts()        # parts is (headers, body, html), but could possibly grow        return "\n".join(parts)    def _GetMessageTextParts(self):        # This is almost reliable :).  The only messages this now fails for        # are for "forwarded" messages, where the forwards are actually        # in an attachment.  Later.        # Note we *dont* look in plain text attachments, which we arguably        # should.        from spambayes import mboxutils        self._EnsureObject()        prop_ids = (PR_BODY_A,                    MYPR_BODY_HTML_A,                    PR_TRANSPORT_MESSAGE_HEADERS_A)        hr, data = self.mapi_object.GetProps(prop_ids,0)        body = self._GetPotentiallyLargeStringProp(prop_ids[0], data[0])        html = self._GetPotentiallyLargeStringProp(prop_ids[1], data[1])        headers = self._GetPotentiallyLargeStringProp(prop_ids[2], data[2])        # xxx - not sure what to do if we have both.        if not html:            html = GetHTMLFromRTFProperty(self.mapi_object)        # Some Outlooks deliver a strange notion of headers, including        # interior MIME armor.  To prevent later errors, try to get rid        # of stuff now that can't possibly be parsed as "real" (SMTP)        # headers.        headers = mboxutils.extract_headers(headers)        # Mail delivered internally via Exchange Server etc may not have        # headers - fake some up.        if not headers:            headers = self._GetFakeHeaders()        # Mail delivered via the Exchange Internet Mail MTA may have        # gibberish at the start of the headers - fix this.        elif headers.startswith("Microsoft Mail"):            headers = "X-MS-Mail-Gibberish: " + headers            # This mail typically doesn't have a Received header, which            # is a real PITA for running the incremental testing setup.            # To make life easier, we add in the fake one that the message            # would have got if it had had no headers at all.            if headers.find("Received:") == -1:                prop_ids = PR_MESSAGE_DELIVERY_TIME                hr, data = self.mapi_object.GetProps(prop_ids, 0)                value = self._format_received(data[0][1])                headers = "Received: %s\n%s" % (value, headers)        if not html and not body:            # Only ever seen this for "multipart/signed" messages, so            # without any better clues, just handle this.            # Find all attachments with            # PR_ATTACH_MIME_TAG_A=multipart/signed            table = self.mapi_object.GetAttachmentTable(0)            restriction = (mapi.RES_PROPERTY,   # a property restriction                           (mapi.RELOP_EQ,      # check for equality                            PR_ATTACH_MIME_TAG_A,   # of the given prop                            (PR_ATTACH_MIME_TAG_A, "multipart/signed")))            try:                rows = mapi.HrQueryAllRows(table,                                           (PR_ATTACH_NUM,), # columns to get                                           restriction,    # only these rows                                           None,    # any sort order is fine                                           0)       # any # of results is fine            except pythoncom.com_error:                # For some reason there are no rows we can get                rows = []            if len(rows) == 0:                pass # Nothing we can fetch :(            else:                if len(rows) > 1:                    print "WARNING: Found %d rows with multipart/signed" \                          "- using first only" % len(rows)                row = rows[0]                (attach_num_tag, attach_num), = row                assert attach_num_tag != PT_ERROR, \                       "Error fetching attach_num prop"                # Open the attachment                attach = self.mapi_object.OpenAttach(attach_num,                                                   None,                                                   mapi.MAPI_DEFERRED_ERRORS)                prop_ids = (PR_ATTACH_DATA_BIN,)                hr, data = attach.GetProps(prop_ids, 0)                attach_body = GetPotentiallyLargeStringProp(attach, prop_ids[0], data[0])                # What we seem to have here now is a *complete* multi-part                # mime message - that Outlook must have re-constituted on                # the fly immediately after pulling it apart! - not unlike                # exactly what we are doing ourselves right here - putting                # it into a message object, so we can extract the text, so                # we can stick it back into another one.  Ahhhhh.                import email                msg = email.message_from_string(attach_body)                assert msg.is_multipart(), "Should be multi-part: %r" % attach_body                # reduce down all sub messages, collecting all text/ subtypes.                # (we could make a distinction between text and html, but                # it is all joined together by this method anyway.)                def collect_text_parts(msg):                    collected = ''                    if msg.is_multipart():                        for sub in msg.get_payload():                            collected += collect_text_parts(sub)                    else:                        if msg.get_content_maintype()=='text':                            collected += msg.get_payload()                        else:                            #print "skipping content type", msg.get_content_type()                            pass                    return collected                body = collect_text_parts(msg)        return headers, body, html    def _GetFakeHeaders(self):        # This is designed to fake up some SMTP headers for messages        # on an exchange server that do not have such headers of their own.        prop_ids = PR_SUBJECT_A, PR_SENDER_NAME_A, PR_DISPLAY_TO_A, \                   PR_DISPLAY_CC_A, PR_MESSAGE_DELIVERY_TIME, \                   MYPR_MESSAGE_ID_A, PR_IMPORTANCE, PR_CLIENT_SUBMIT_TIME,        hr, data = self.mapi_object.GetProps(prop_ids, 0)        headers = ["X-Exchange-Message: true"]        for header, index, potentially_large, format_func in (\            ("Subject", 0, True, None),            ("From", 1, True, self._format_address),            ("To", 2, True, self._format_address),            ("CC", 3, True, self._format_address),            ("Received", 4, False, self._format_received),            ("Message-ID", 5, True, None),            ("Importance", 6, False, self._format_importance),            ("Date", 7, False, self._format_time),            ("X-Mailer", 7, False, self._format_version),            ):            if potentially_large:                value = self._GetPotentiallyLargeStringProp(prop_ids[index],                                                            data[index])            else:                value = data[index][1]            if value:                if format_func:                    value = format_func(value)                headers.append("%s: %s" % (header, value))        return "\n".join(headers) + "\n"    def _format_received(self, raw):        # Fake up a 'received' header.  It's important that the date        # is right, so that sort+group.py will work.  The rest is just more        # clues for the tokenizer to find.        return "(via local Exchange server); %s" % (self._format_time(raw),)    def _format_time(self, raw):        from time import timezone        from email.Utils import formatdate        return formatdate(int(raw)-timezone, True)    def _format_importance(self, raw):        # olImportanceHigh = 2, olImportanceLow = 0, olImportanceNormal = 1        return {0 : "low", 1 : "normal", 2 : "high"}[raw]    def _format_version(self, unused):        return "Microsoft Exchange Client"    _address_re = re.compile(r"[()<>,:@!/=; ]")    def _format_address(self, raw):        # Fudge up something that's in the appropriate form.  We don't        # have enough information available to get an actual working        # email address.        addresses = raw.split(";")        formattedAddresses = []        for address in addresses:            address = address.strip()            if address.find("@") >= 0:                formattedAddress = address            else:                formattedAddress = "\"%s\" <%s>" % \                        (address, self._address_re.sub('.', address))            formattedAddresses.append(formattedAddress)        return "; ".join(formattedAddresses)    def _EnsureObject(self):        if self.mapi_object is None:            try:                help_test_suite("MAPIMsgStoreMsg._EnsureObject")                self.mapi_object = self.msgstore._OpenEntry(self.id)            except pythoncom.com_error, details:                raise MsgStoreExceptionFromCOMException(details)    def GetEmailPackageObject(self, strip_mime_headers=True):        # Return an email.Message object.        #        # strip_mime_headers is a hack, and should be left True unless you're        # trying to display all the headers for diagnostic purposes.  If we        # figure out something better to do, it should go away entirely.        #        # Problem #1:  suppose a msg is multipart/alternative, with        # text/plain and text/html sections.  The latter MIME decorations        # are plain missing in what _GetMessageText() returns.  If we leave        # the multipart/alternative in the headers anyway, the email        # package's "lax parsing" won't complain about not finding any        # sections, but since the type *is* multipart/alternative then        # anyway, the tokenizer finds no text/* parts at all to tokenize.        # As a result, only the headers get tokenized.  By stripping        # Content-Type from the headers (if present), the email pkg        # considers the body to be text/plain (the default), and so it        # does get tokenized.        #

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -