feedparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,630 行 · 第 1/5 页

PY
1,630
字号
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',                  'http://purl.org/rss/1.0/modules/reference/':           'ref',                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',                  'http://purl.org/rss/1.0/modules/search/':              'search',                  'http://purl.org/rss/1.0/modules/slash/':               'slash',                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',                  'http://hacks.benhammersley.com/rss/streaming/':        'str',                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',                  'http://purl.org/rss/1.0/modules/threading/':           'thr',                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',                  'http://wellformedweb.org/commentAPI/':                 'wfw',                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',                  'http://www.w3.org/1999/xhtml':                         'xhtml',                  'http://www.w3.org/XML/1998/namespace':                 'xml',                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'}    _matchnamespaces = {}    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']    html_types = ['text/html', 'application/xhtml+xml']        def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):        if _debug: sys.stderr.write('initializing FeedParser\n')        if not self._matchnamespaces:            for k, v in self.namespaces.items():                self._matchnamespaces[k.lower()] = v        self.feeddata = FeedParserDict() # feed-level data        self.encoding = encoding # character encoding        self.entries = [] # list of entry-level data        self.version = '' # feed type/version, see SUPPORTED_VERSIONS        self.namespacesInUse = {} # dictionary of namespaces defined by the feed        # the following are used internally to track state;        # this is really out of control and should be refactored        self.infeed = 0        self.inentry = 0        self.incontent = 0        self.intextinput = 0        self.inimage = 0        self.inauthor = 0        self.incontributor = 0        self.inpublisher = 0        self.insource = 0        self.sourcedata = FeedParserDict()        self.contentparams = FeedParserDict()        self._summaryKey = None        self.namespacemap = {}        self.elementstack = []        self.basestack = []        self.langstack = []        self.baseuri = baseuri or ''        self.lang = baselang or None        if baselang:            self.feeddata['language'] = baselang    def unknown_starttag(self, tag, attrs):        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))        # normalize attrs        attrs = [(k.lower(), v) for k, v in attrs]        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]                # track xml:base and xml:lang        attrsD = dict(attrs)        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri        self.baseuri = _urljoin(self.baseuri, baseuri)        lang = attrsD.get('xml:lang', attrsD.get('lang'))        if lang == '':            # xml:lang could be explicitly set to '', we need to capture that            lang = None        elif lang is None:            # if no xml:lang is specified, use parent lang            lang = self.lang        if lang:            if tag in ('feed', 'rss', 'rdf:RDF'):                self.feeddata['language'] = lang        self.lang = lang        self.basestack.append(self.baseuri)        self.langstack.append(lang)                # track namespaces        for prefix, uri in attrs:            if prefix.startswith('xmlns:'):                self.trackNamespace(prefix[6:], uri)            elif prefix == 'xmlns':                self.trackNamespace(None, uri)        # track inline content        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):            # element declared itself as escaped markup, but it isn't really            self.contentparams['type'] = 'application/xhtml+xml'        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':            # Note: probably shouldn't simply recreate localname here, but            # our namespace handling isn't actually 100% correct in cases where            # the feed redefines the default namespace (which is actually            # the usual case for inline content, thanks Sam), so here we            # cheat and just reconstruct the element based on localname            # because that compensates for the bugs in our namespace handling.            # This will horribly munge inline content with non-empty qnames,            # but nobody actually does that, so I'm not fixing it.            tag = tag.split(':')[-1]            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)        # match namespaces        if tag.find(':') <> -1:            prefix, suffix = tag.split(':', 1)        else:            prefix, suffix = '', tag        prefix = self.namespacemap.get(prefix, prefix)        if prefix:            prefix = prefix + '_'        # special hack for better tracking of empty textinput/image elements in illformed feeds        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):            self.intextinput = 0        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):            self.inimage = 0                # call special handler (if defined) or default handler        methodname = '_start_' + prefix + suffix        try:            method = getattr(self, methodname)            return method(attrsD)        except AttributeError:            return self.push(prefix + suffix, 1)    def unknown_endtag(self, tag):        if _debug: sys.stderr.write('end %s\n' % tag)        # match namespaces        if tag.find(':') <> -1:            prefix, suffix = tag.split(':', 1)        else:            prefix, suffix = '', tag        prefix = self.namespacemap.get(prefix, prefix)        if prefix:            prefix = prefix + '_'        # call special handler (if defined) or default handler        methodname = '_end_' + prefix + suffix        try:            method = getattr(self, methodname)            method()        except AttributeError:            self.pop(prefix + suffix)        # track inline content        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):            # element declared itself as escaped markup, but it isn't really            self.contentparams['type'] = 'application/xhtml+xml'        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':            tag = tag.split(':')[-1]            self.handle_data('</%s>' % tag, escape=0)        # track xml:base and xml:lang going out of scope        if self.basestack:            self.basestack.pop()            if self.basestack and self.basestack[-1]:                self.baseuri = self.basestack[-1]        if self.langstack:            self.langstack.pop()            if self.langstack: # and (self.langstack[-1] is not None):                self.lang = self.langstack[-1]    def handle_charref(self, ref):        # called for each character reference, e.g. for '&#160;', ref will be '160'        if not self.elementstack: return        ref = ref.lower()        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):            text = '&#%s;' % ref        else:            if ref[0] == 'x':                c = int(ref[1:], 16)            else:                c = int(ref)            text = unichr(c).encode('utf-8')        self.elementstack[-1][2].append(text)    def handle_entityref(self, ref):        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'        if not self.elementstack: return        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):            text = '&%s;' % ref        else:            # entity resolution graciously donated by Aaron Swartz            def name2cp(k):                import htmlentitydefs                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3                    return htmlentitydefs.name2codepoint[k]                k = htmlentitydefs.entitydefs[k]                if k.startswith('&#') and k.endswith(';'):                    return int(k[2:-1]) # not in latin-1                return ord(k)            try: name2cp(ref)            except KeyError: text = '&%s;' % ref            else: text = unichr(name2cp(ref)).encode('utf-8')        self.elementstack[-1][2].append(text)    def handle_data(self, text, escape=1):        # called for each block of plain text, i.e. outside of any tag and        # not containing any character or entity references        if not self.elementstack: return        if escape and self.contentparams.get('type') == 'application/xhtml+xml':            text = _xmlescape(text)        self.elementstack[-1][2].append(text)    def handle_comment(self, text):        # called for each comment, e.g. <!-- insert message here -->        pass    def handle_pi(self, text):        # called for each processing instruction, e.g. <?instruction>        pass    def handle_decl(self, text):        pass    def parse_declaration(self, i):        # override internal declaration handler to handle CDATA blocks        if _debug: sys.stderr.write('entering parse_declaration\n')        if self.rawdata[i:i+9] == '<![CDATA[':            k = self.rawdata.find(']]>', i)            if k == -1: k = len(self.rawdata)            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)            return k+3        else:            k = self.rawdata.find('>', i)            return k+1    def mapContentType(self, contentType):        contentType = contentType.lower()        if contentType == 'text':            contentType = 'text/plain'        elif contentType == 'html':            contentType = 'text/html'        elif contentType == 'xhtml':            contentType = 'application/xhtml+xml'        return contentType        def trackNamespace(self, prefix, uri):        loweruri = uri.lower()        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:            self.version = 'rss090'        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:            self.version = 'rss10'        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:            self.version = 'atom10'        if loweruri.find('backend.userland.com/rss') <> -1:            # match any backend.userland.com namespace            uri = 'http://backend.userland.com/rss'            loweruri = uri        if self._matchnamespaces.has_key(loweruri):            self.namespacemap[prefix] = self._matchnamespaces[loweruri]            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri        else:            self.namespacesInUse[prefix or ''] = uri    def resolveURI(self, uri):        return _urljoin(self.baseuri or '', uri)        def decodeEntities(self, element, data):        return data    def push(self, element, expectingText):        self.elementstack.append([element, expectingText, []])    def pop(self, element, stripWhitespace=1):        if not self.elementstack: return        if self.elementstack[-1][0] != element: return                element, expectingText, pieces = self.elementstack.pop()        output = ''.join(pieces)        if stripWhitespace:            output = output.strip()        if not expectingText: return output        # decode base64 content        if base64 and self.contentparams.get('base64', 0):            try:                output = base64.decodestring(output)            except binascii.Error:                pass            except binascii.Incomplete:                pass                        # resolve relative URIs        if (element in self.can_be_relative_uri) and output:            output = self.resolveURI(output)                # decode entities within embedded markup        if not self.contentparams.get('base64', 0):            output = self.decodeEntities(element, output)        # remove temporary cruft from contentparams        try:            del self.contentparams['mode']        except KeyError:            pass        try:            del self.contentparams['base64']        except KeyError:            pass        # resolve relative URIs within embedded markup        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:            if element in self.can_contain_relative_uris:                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)                # sanitize embedded markup        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:            if element in self.can_contain_dangerous_markup:                output = _sanitizeHTML(output, self.encoding)        if self.encoding and type(output) != type(u''):            try:                output = unicode(output, self.encoding)            except:                pass

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?