📄 parser.py

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 PY
字号:
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages."""

import re

# The legacy module names below only exist on Python 2.  Fall back to their
# modern spellings so that the module stays importable on newer interpreters.
try:
    from cStringIO import StringIO
except ImportError:
    from io import StringIO

try:
    from types import ListType
except ImportError:
    ListType = list

try:
    from email import Errors
    from email import Message
except ImportError:
    from email import errors as Errors
    from email import message as Message

EMPTYSTRING = ''
NL = '\n'

# NOTE: the historical "True = 1 / False = 0" fallback was removed.  The
# names are builtins on Python 2.2.1 and later, and assigning to them is a
# syntax error on Python 3.

# Matches any one of the three line-ending conventions.
NLCRE = re.compile('\r\n|\r|\n')


class Parser:
    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp, firstbodyline)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the string.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store the headers on container.

        Reading stops at end of file, at the first empty line, or (in
        non-strict mode only) at the first line that is neither a header nor
        a continuation line.  In that last case the offending line has
        already been consumed from fp, so it is returned for the caller to
        treat as the first line of the body; otherwise None is returned.

        Raises Errors.HeaderParseError for a continuation line that precedes
        any header and, in strict mode, for a misplaced Unix-from line or a
        line that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Drop the trailing line ending; an empty result is the blank
            # line that terminates the header block.
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # Ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # Allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            # A new header starts here, so the previous one is complete.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline

    def _parsebody(self, container, fp, firstbodyline=None):
        """Parse the rest of fp as the body of container.

        firstbodyline, when not None, is a body line that _parseheaders()
        already consumed from fp; it is put back in front of the payload for
        multipart and plain bodies.

        Raises Errors.BoundaryError for a multipart message without a
        boundary parameter, and for the boundary problems described in
        _parsemultipart().
        """
        boundary = container.get_boundary()
        if boundary:
            # Split the payload on the boundary and parse each part
            # separately.
            payload = fp.read()
            if firstbodyline is not None:
                payload = firstbodyline + '\n' + payload
            self._parsemultipart(container, payload, boundary)
            return
        # get_content_maintype()/get_content_type() are used instead of the
        # deprecated get_main_type()/get_type(), which no longer exist on the
        # standard Message class from Python 2.5 onwards.
        maintype = container.get_content_maintype()
        if maintype == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_content_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object.
            # NOTE(review): firstbodyline is not consumed in this branch.
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif maintype == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left.
            # NOTE(review): firstbodyline is not consumed in this branch.
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            text = fp.read()
            if firstbodyline is not None:
                text = firstbodyline + '\n' + text
            container.set_payload(text)

    def _parsemultipart(self, container, payload, boundary):
        """Split payload on boundary and attach each subpart to container.

        Text before the first boundary becomes container.preamble and text
        after the terminating boundary becomes container.epilogue; each is
        None when absent.  If no starting boundary is found in non-strict
        mode, the payload is stored on container unparsed.

        Raises Errors.BoundaryError in strict mode when the starting or the
        terminating boundary is missing, and in non-strict mode when the
        terminating boundary is missing and the payload does not end with a
        line break.
        """
        isdigest = (container.get_content_type() == 'multipart/digest')
        preamble = epilogue = None
        # The first boundary we're looking for won't always have a leading
        # newline since we're at the start of the body text, and there's not
        # always a preamble before the first boundary.
        separator = '--' + boundary
        # We use an RE here because boundaries can have trailing whitespace.
        mo = re.search(
            r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
            payload)
        if not mo:
            if self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            container.set_payload(payload)
            return
        start = mo.start()
        if start > 0:
            # There's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Step over the boundary, its trailing whitespace, and the length of
        # the next line ending found in the payload.
        start += len(mo.group('sep')) + len(mo.group('ws'))
        mo = NLCRE.search(payload, start)
        if mo:
            start += len(mo.group(0))
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature.
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if mo:
            terminator = mo.start()
            linesep = mo.group('sep')
            if mo.end() < len(payload):
                # There's some post-MIME boundary epilogue
                epilogue = payload[mo.end():]
        elif self._strict:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        else:
            # Handle the case of no trailing boundary.  Check that it ends
            # in a blank line.  Some cases (spamspamspam) don't even have
            # that!
            mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
            if not mo:
                mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                if not mo:
                    raise Errors.BoundaryError(
                        'No terminating boundary and no trailing empty line')
            linesep = mo.group('sep')
            terminator = len(payload)
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.
        parts = re.split(
            linesep + re.escape(separator) + r'[ \t]*' + linesep,
            payload[start:terminator])
        # These do not depend on the individual parts, so set them once.
        container.preamble = preamble
        container.epilogue = epilogue
        for part in parts:
            if isdigest:
                msgobj = self._parsedigestpart(part, linesep)
            else:
                msgobj = self.parsestr(part)
            container.attach(msgobj)

    def _parsedigestpart(self, part, linesep):
        """Parse one subpart of a multipart/digest and return its message.

        Subparts of a digest default to message/rfc822 instead of text/plain,
        so each one has an optional block of MIME headers, then an empty
        line followed by the enclosed message.
        """
        if part.startswith(linesep):
            # There's no header block so create an empty message object as
            # the container, and lop off the newline so we can parse the
            # sub-subobject.
            msgobj = self._class()
            part = part[len(linesep):]
        else:
            # A part holding only headers has no blank line to split on;
            # treat it as having an empty body rather than failing on the
            # unpacking of a one-element list.
            pieces = part.split(linesep + linesep, 1)
            parthdrs = pieces[0]
            if len(pieces) == 2:
                part = pieces[1]
            else:
                part = ''
            # msgobj in this case is the "message/rfc822" container
            msgobj = self.parsestr(parthdrs, headersonly=1)
        msgobj.set_default_type('message/rfc822')
        maintype = msgobj.get_content_maintype()
        if maintype in ('message', 'multipart'):
            # submsgobj is the enclosed message itself
            submsgobj = self.parsestr(part)
            msgobj.attach(submsgobj)
        else:
            msgobj.set_payload(part)
        return msgobj


class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """

    def _parsebody(self, container, fp, firstbodyline=None):
        """Store the unparsed remainder of fp as container's payload."""
        # Consume but do not parse, the body
        text = fp.read()
        if firstbodyline is not None:
            # Put back the body line that _parseheaders() already consumed.
            text = firstbodyline + '\n' + text
        container.set_payload(text)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -