📄 message.py
字号:
#! /usr/bin/env python"""message.py - Core Spambayes classes.Classes: Message - an email.Message.Message, extended with spambayes methods SBHeaderMessage - A Message with spambayes header manipulations MessageInfoDB - persistent state storage for Message, using dbm MessageInfoZODB - persistent state storage for Message, using ZODB MessageInfoPickle - persistent state storage for Message, using pickleAbstract: MessageInfoDB is a simple shelve persistency class for the persistent state of a Message obect. The MessageInfoDB currently does not provide iterators, but should at some point. This would allow us to, for example, see how many messages have been trained differently than their classification, for fp/fn assessment purposes. Message is an extension of the email package Message class, to include persistent message information. The persistent state currently consists of the message id, its current classification, and its current training. The payload is not persisted. SBHeaderMessage extends Message to include spambayes header specific manipulations.Usage: A typical classification usage pattern would be something like: >>> import email >>> # substance comes from somewhere else >>> msg = email.message_from_string(substance, _class=SBHeaderMessage) >>> id = msg.setIdFromPayload() >>> if id is None: >>> msg.setId(time()) # or some unique identifier >>> msg.delSBHeaders() # never include sb headers in a classification >>> # bayes object is your responsibility >>> (prob, clues) = bayes.spamprob(msg.asTokens(), evidence=True) >>> msg.addSBHeaders(prob, clues) A typical usage pattern to train as spam would be something like: >>> import email >>> # substance comes from somewhere else >>> msg = email.message_from_string(substance, _class=SBHeaderMessage) >>> id = msg.setId(msgid) # id is a fname, outlook msg id, something... >>> msg.delSBHeaders() # never include sb headers in a train >>> if msg.getTraining() == False: # could be None, can't do boolean test >>> bayes.unlearn(msg.asTokens(), False) # untrain the ham >>> bayes.learn(msg.asTokens(), True) # train as spam >>> msg.rememberTraining(True)To Do: o Suggestions?"""# This module is part of the spambayes project, which is Copyright 2002-5# The Python Software Foundation and is covered by the Python Software# Foundation license.from __future__ import generators__author__ = "Tim Stone <tim@fourstonesExpressions.com>"__credits__ = "Mark Hammond, Tony Meyer, all the spambayes contributors."try: True, Falseexcept NameError: # Maintain compatibility with Python 2.2 True, False = 1, 0 def bool(val): return not not valimport osimport sysimport typesimport timeimport mathimport reimport errnoimport shelveimport warningstry: import cPickle as pickleexcept ImportError: import pickleimport tracebackimport emailimport email.Messageimport email.Parserimport email.Headerimport email.Generatorfrom spambayes import storagefrom spambayes import dbmstoragefrom spambayes.Options import options, get_pathname_optionfrom spambayes.tokenizer import tokenizetry: import cStringIO as StringIOexcept ImportError: import StringIOCRLF_RE = re.compile(r'\r\n|\r|\n')STATS_START_KEY = "Statistics start date"PERSISTENT_HAM_STRING = 'h'PERSISTENT_SPAM_STRING = 's'PERSISTENT_UNSURE_STRING = 'u'class MessageInfoBase(object): def __init__(self, db_name=None): self.db_name = db_name def __len__(self): return len(self.keys()) def get_statistics_start_date(self): if self.db.has_key(STATS_START_KEY): return self.db[STATS_START_KEY] else: return None def set_statistics_start_date(self, date): self.db[STATS_START_KEY] = date self.store() def __getstate__(self): return self.db def __setstate__(self, state): self.db = state def load_msg(self, msg): if self.db is not None: key = msg.getDBKey() assert key is not None, "None is not a valid key." try: try: attributes = self.db[key] except pickle.UnpicklingError: # The old-style Outlook message info db didn't use # shelve, so get it straight from the dbm. if hasattr(self, "dbm"): attributes = self.dbm[key] else: raise except KeyError: # Set to None, as it's not there. for att in msg.stored_attributes: # Don't overwrite. if not hasattr(msg, att): setattr(msg, att, None) else: if not isinstance(attributes, types.ListType): # Old-style message info db if isinstance(attributes, types.TupleType): # sb_server/sb_imapfilter, which only handled # storing 'c' and 't'. (msg.c, msg.t) = attributes return elif isinstance(attributes, types.StringTypes): # Outlook plug-in, which only handled storing 't', # and did it as a string. msg.t = {"0" : False, "1" : True}[attributes] return else: print >> sys.stderr, "Unknown message info type", \ attributes sys.exit(1) for att, val in attributes: setattr(msg, att, val) def store_msg(self, msg): if self.db is not None: msg.date_modified = time.time() attributes = [] for att in msg.stored_attributes: attributes.append((att, getattr(msg, att))) key = msg.getDBKey() assert key is not None, "None is not a valid key." self.db[key] = attributes self.store() def remove_msg(self, msg): if self.db is not None: del self.db[msg.getDBKey()] self.store() def keys(self): return self.db.keys()class MessageInfoPickle(MessageInfoBase): def __init__(self, db_name, pickle_type=1): MessageInfoBase.__init__(self, db_name) self.mode = pickle_type self.load() def load(self): try: fp = open(self.db_name, 'rb') except IOError, e: if e.errno == errno.ENOENT: # New pickle self.db = {} else: raise else: self.db = pickle.load(fp) fp.close() def close(self): # we keep no resources open - nothing to do pass def store(self): fp = open(self.db_name, 'wb') pickle.dump(self.db, fp, self.mode) fp.close()class MessageInfoDB(MessageInfoBase): def __init__(self, db_name, mode='c'): MessageInfoBase.__init__(self, db_name) self.mode = mode self.load() def load(self): try: self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) except dbmstorage.error: # This probably means that we don't have a dbm module # available. Print out a warning, and continue on # (not persisting any of this data). if options["globals", "verbose"]: print "Warning: no dbm modules available for MessageInfoDB" self.dbm = self.db = None def __del__(self): self.close() def close(self): # Close our underlying database. Better not assume all databases # have close functions! def noop(): pass getattr(self.db, "close", noop)() getattr(self.dbm, "close", noop)() def store(self): if self.db is not None: self.db.sync()# If ZODB isn't available, then this class won't be useable, but we# still need to be able to import this module. So we pretend that all# is ok.try: from persistent import Persistentexcept ImportError: Persistent = objectclass _PersistentMessageInfo(MessageInfoBase, Persistent): def __init__(self): import ZODB from BTrees.OOBTree import OOBTree MessageInfoBase.__init__(self) self.db = OOBTree()class MessageInfoZODB(storage.ZODBClassifier): ClassifierClass = _PersistentMessageInfo def __init__(self, db_name, mode='c'): self.nham = self.nspam = 0 # Only used for debugging prints storage.ZODBClassifier.__init__(self, db_name, mode) self.classifier.store = self.store self.db = self.classifier def __setattr__(self, att, value): # Override ZODBClassifier.__setattr__ object.__setattr__(self, att, value)# values are classifier class, True if it accepts a mode# arg, and True if the argument is a pathname_storage_types = {"dbm" : (MessageInfoDB, True, True), "pickle" : (MessageInfoPickle, False, True),## "pgsql" : (MessageInfoPG, False, False),## "mysql" : (MessageInfoMySQL, False, False),## "cdb" : (MessageInfoCDB, False, True), "zodb" : (MessageInfoZODB, True, True),## "zeo" : (MessageInfoZEO, False, False), }def open_storage(data_source_name, db_type="dbm", mode=None): """Return a storage object appropriate to the given parameters.""" try: klass, supports_mode, unused = _storage_types[db_type] except KeyError: raise storage.NoSuchClassifierError(db_type) if supports_mode and mode is not None: return klass(data_source_name, mode) else: return klass(data_source_name)def database_type(): dn = ("Storage", "messageinfo_storage_file") # The storage options here may lag behind those in storage.py, # so we try and be more robust. If we can't use the same storage # method, then we fall back to pickle. nm, typ = storage.database_type((), default_name=dn) if typ not in _storage_types.keys(): typ = "pickle" return nm, typclass Message(object, email.Message.Message): '''An email.Message.Message extended for SpamBayes''' def __init__(self, id=None): email.Message.Message.__init__(self) # persistent state # (non-persistent state includes all of email.Message.Message state) self.stored_attributes = ['c', 't', 'date_modified', ] self.getDBKey = self.getId self.id = None self.c = None self.t = None self.date_modified = None if id is not None: self.setId(id) # This whole message info database thing is a real mess. It really # ought to be a property of the Message class, not each instance. # So we want to access it via classmethods. However, we have treated # it as a regular attribute, so need to make it a property. To make # a classmethod property, we have to jump through some hoops, which we # deserve for not doing it right in the first place. _message_info_db = None def _get_class_message_info_db(klass): # If, the first time we access the attribute, it hasn't been # set, then we load up the default one. if klass._message_info_db is None: nm, typ = database_type() klass._message_info_db = open_storage(nm, typ) return klass._message_info_db _get_class_message_info_db = classmethod(_get_class_message_info_db) def _set_class_message_info_db(klass, value): klass._message_info_db = value
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -