📄 parser.py
字号:
# Copyright (C) 2001,2002 Python Software Foundation# Author: barry@zope.com (Barry Warsaw)"""A parser of RFC 2822 and MIME email messages."""import refrom cStringIO import StringIOfrom types import ListTypefrom email import Errorsfrom email import MessageEMPTYSTRING = ''NL = '\n'try: True, Falseexcept NameError: True = 1 False = 0NLCRE = re.compile('\r\n|\r|\n')class Parser: def __init__(self, _class=Message.Message, strict=False): """Parser of RFC 2822 and MIME email messages. Creates an in-memory object tree representing the email message, which can then be manipulated and turned over to a Generator to return the textual representation of the message. The string must be formatted as a block of RFC 2822 headers and header continuation lines, optionally preceeded by a `Unix-from' header. The header block is terminated either by the end of the string or by a blank line. _class is the class to instantiate for new message objects when they must be created. This class must have a constructor that can take zero arguments. Default is Message.Message. Optional strict tells the parser to be strictly RFC compliant or to be more forgiving in parsing of ill-formatted MIME documents. When non-strict mode is used, the parser will try to make up for missing or erroneous boundaries and other peculiarities seen in the wild. Default is non-strict parsing. """ self._class = _class self._strict = strict def parse(self, fp, headersonly=False): """Create a message structure from the data in a file. Reads all the data from the file and returns the root of the message structure. Optional headersonly is a flag specifying whether to stop parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ root = self._class() firstbodyline = self._parseheaders(root, fp) if not headersonly: self._parsebody(root, fp, firstbodyline) return root def parsestr(self, text, headersonly=False): """Create a message structure from a string. Returns the root of the message structure. Optional headersonly is a flag specifying whether to stop parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ return self.parse(StringIO(text), headersonly=headersonly) def _parseheaders(self, container, fp): # Parse the headers, returning a list of header/value pairs. None as # the header means the Unix-From header. lastheader = '' lastvalue = [] lineno = 0 firstbodyline = None while True: # Don't strip the line before we test for the end condition, # because whitespace-only header lines are RFC compliant # continuation lines. line = fp.readline() if not line: break line = line.splitlines()[0] if not line: break # Ignore the trailing newline lineno += 1 # Check for initial Unix From_ line if line.startswith('From '): if lineno == 1: container.set_unixfrom(line) continue elif self._strict: raise Errors.HeaderParseError( 'Unix-from in headers after first rfc822 header') else: # ignore the wierdly placed From_ line # XXX: maybe set unixfrom anyway? or only if not already? continue # Header continuation line if line[0] in ' \t': if not lastheader: raise Errors.HeaderParseError( 'Continuation line seen before first header') lastvalue.append(line) continue # Normal, non-continuation header. BAW: this should check to make # sure it's a legal header, e.g. doesn't contain spaces. Also, we # should expose the header matching algorithm in the API, and # allow for a non-strict parsing mode (that ignores the line # instead of raising the exception). i = line.find(':') if i < 0: if self._strict: raise Errors.HeaderParseError( "Not a header, not a continuation: ``%s''" % line) elif lineno == 1 and line.startswith('--'): # allow through duplicate boundary tags. continue else: # There was no separating blank line as mandated by RFC # 2822, but we're in non-strict mode. So just offer up # this current line as the first body line. firstbodyline = line break if lastheader: container[lastheader] = NL.join(lastvalue) lastheader = line[:i] lastvalue = [line[i+1:].lstrip()] # Make sure we retain the last header if lastheader: container[lastheader] = NL.join(lastvalue) return firstbodyline def _parsebody(self, container, fp, firstbodyline=None): # Parse the body, but first split the payload on the content-type # boundary if present. boundary = container.get_boundary() isdigest = (container.get_content_type() == 'multipart/digest') # If there's a boundary, split the payload text into its constituent # parts and parse each separately. Otherwise, just parse the rest of # the body as a single message. Note: any exceptions raised in the # recursive parse need to have their line numbers coerced. if boundary: preamble = epilogue = None # Split into subparts. The first boundary we're looking for won't # always have a leading newline since we're at the start of the # body text, and there's not always a preamble before the first # boundary. separator = '--' + boundary payload = fp.read() if firstbodyline is not None: payload = firstbodyline + '\n' + payload # We use an RE here because boundaries can have trailing # whitespace. mo = re.search( r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)', payload) if not mo: if self._strict: raise Errors.BoundaryError( "Couldn't find starting boundary: %s" % boundary) container.set_payload(payload) return start = mo.start() if start > 0: # there's some pre-MIME boundary preamble preamble = payload[0:start] # Find out what kind of line endings we're using start += len(mo.group('sep')) + len(mo.group('ws')) mo = NLCRE.search(payload, start) if mo: start += len(mo.group(0)) # We create a compiled regexp first because we need to be able to # specify the start position, and the module function doesn't # support this signature. :( cre = re.compile('(?P<sep>\r\n|\r|\n)' + re.escape(separator) + '--') mo = cre.search(payload, start) if mo: terminator = mo.start() linesep = mo.group('sep') if mo.end() < len(payload): # There's some post-MIME boundary epilogue epilogue = payload[mo.end():] elif self._strict: raise Errors.BoundaryError( "Couldn't find terminating boundary: %s" % boundary) else: # Handle the case of no trailing boundary. Check that it ends # in a blank line. Some cases (spamspamspam) don't even have # that! mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload) if not mo: mo = re.search('(?P<sep>\r\n|\r|\n)$', payload) if not mo: raise Errors.BoundaryError( 'No terminating boundary and no trailing empty line') linesep = mo.group('sep') terminator = len(payload) # We split the textual payload on the boundary separator, which # includes the trailing newline. If the container is a # multipart/digest then the subparts are by default message/rfc822 # instead of text/plain. In that case, they'll have a optional # block of MIME headers, then an empty line followed by the # message headers. parts = re.split( linesep + re.escape(separator) + r'[ \t]*' + linesep, payload[start:terminator]) for part in parts: if isdigest: if part.startswith(linesep): # There's no header block so create an empty message # object as the container, and lop off the newline so # we can parse the sub-subobject msgobj = self._class() part = part[len(linesep):] else: parthdrs, part = part.split(linesep+linesep, 1) # msgobj in this case is the "message/rfc822" container msgobj = self.parsestr(parthdrs, headersonly=1) # while submsgobj is the message itself msgobj.set_default_type('message/rfc822') maintype = msgobj.get_content_maintype() if maintype in ('message', 'multipart'): submsgobj = self.parsestr(part) msgobj.attach(submsgobj) else: msgobj.set_payload(part) else: msgobj = self.parsestr(part) container.preamble = preamble container.epilogue = epilogue container.attach(msgobj) elif container.get_main_type() == 'multipart': # Very bad. A message is a multipart with no boundary! raise Errors.BoundaryError( 'multipart message with no defined boundary') elif container.get_type() == 'message/delivery-status': # This special kind of type contains blocks of headers separated # by a blank line. We'll represent each header block as a # separate Message object blocks = [] while True: blockmsg = self._class() self._parseheaders(blockmsg, fp) if not len(blockmsg): # No more header blocks left break blocks.append(blockmsg) container.set_payload(blocks) elif container.get_main_type() == 'message': # Create a container for the payload, but watch out for there not # being any headers left try: msg = self.parse(fp) except Errors.HeaderParseError: msg = self._class() self._parsebody(msg, fp) container.attach(msg) else: text = fp.read() if firstbodyline is not None: text = firstbodyline + '\n' + text container.set_payload(text)class HeaderParser(Parser): """A subclass of Parser, this one only meaningfully parses message headers. This class can be used if all you're interested in is the headers of a message. While it consumes the message body, it does not parse it, but simply makes it available as a string payload. Parsing with this subclass can be considerably faster if all you're interested in is the message headers. """ def _parsebody(self, container, fp, firstbodyline=None): # Consume but do not parse, the body text = fp.read() if firstbodyline is not None: text = firstbodyline + '\n' + text container.set_payload(text)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -