feedparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,630 行 · 第 1/5 页

PY
1,630
字号
    def _start_prodlink(self, attrsD):        self.pushContent('content', attrsD, 'text/html', 1)    def _start_body(self, attrsD):        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)    _start_xhtml_body = _start_body    def _start_content_encoded(self, attrsD):        self.pushContent('content', attrsD, 'text/html', 1)    _start_fullitem = _start_content_encoded    def _end_content(self):        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)        value = self.popContent('content')        if copyToDescription:            self._save('description', value)    _end_body = _end_content    _end_xhtml_body = _end_content    _end_content_encoded = _end_content    _end_fullitem = _end_content    _end_prodlink = _end_content    def _start_itunes_image(self, attrsD):        self.push('itunes_image', 0)        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})    _start_itunes_link = _start_itunes_image            def _end_itunes_block(self):        value = self.pop('itunes_block', 0)        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0    def _end_itunes_explicit(self):        value = self.pop('itunes_explicit', 0)        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0if _XML_AVAILABLE:    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):        def __init__(self, baseuri, baselang, encoding):            if _debug: sys.stderr.write('trying StrictFeedParser\n')            xml.sax.handler.ContentHandler.__init__(self)            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)            self.bozo = 0            self.exc = None                def startPrefixMapping(self, prefix, uri):            self.trackNamespace(prefix, uri)                def startElementNS(self, name, qname, attrs):            namespace, localname = name            lowernamespace = str(namespace or '').lower()            if lowernamespace.find('backend.userland.com/rss') <> -1:                # match any backend.userland.com namespace                namespace = 'http://backend.userland.com/rss'                lowernamespace = namespace            if qname and qname.find(':') > 0:                givenprefix = qname.split(':')[0]            else:                givenprefix = None            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):                    raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix            if prefix:                localname = prefix + ':' + localname            localname = str(localname).lower()            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))            # qname implementation is horribly broken in Python 2.1 (it            # doesn't report any), and slightly broken in Python 2.2 (it            # doesn't report the xml: namespace). So we match up namespaces            # with a known list first, and then possibly override them with            # the qnames the SAX parser gives us (if indeed it gives us any            # at all).  Thanks to MatejC for helping me test this and            # tirelessly telling me that it didn't work yet.            attrsD = {}            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():                lowernamespace = (namespace or '').lower()                prefix = self._matchnamespaces.get(lowernamespace, '')                if prefix:                    attrlocalname = prefix + ':' + attrlocalname                attrsD[str(attrlocalname).lower()] = attrvalue            for qname in attrs.getQNames():                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)            self.unknown_starttag(localname, attrsD.items())        def characters(self, text):            self.handle_data(text)        def endElementNS(self, name, qname):            namespace, localname = name            lowernamespace = str(namespace or '').lower()            if qname and qname.find(':') > 0:                givenprefix = qname.split(':')[0]            else:                givenprefix = ''            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)            if prefix:                localname = prefix + ':' + localname            localname = str(localname).lower()            self.unknown_endtag(localname)        def error(self, exc):            self.bozo = 1            self.exc = exc                    def fatalError(self, exc):            self.error(exc)            raise excclass _BaseHTMLProcessor(sgmllib.SGMLParser):    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',      'img', 'input', 'isindex', 'link', 'meta', 'param']        def __init__(self, encoding):        self.encoding = encoding        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)        sgmllib.SGMLParser.__init__(self)            def reset(self):        self.pieces = []        sgmllib.SGMLParser.reset(self)    def _shorttag_replace(self, match):        tag = match.group(1)        if tag in self.elements_no_end_tag:            return '<' + tag + ' />'        else:            return '<' + tag + '></' + tag + '>'            def feed(self, data):        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)         data = data.replace('&#39;', "'")        data = data.replace('&#34;', '"')        if self.encoding and type(data) == type(u''):            data = data.encode(self.encoding)        sgmllib.SGMLParser.feed(self, data)    def normalize_attrs(self, attrs):        # utility method to be called by descendants        attrs = [(k.lower(), v) for k, v in attrs]        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]        return attrs    def unknown_starttag(self, tag, attrs):        # called for each start tag        # attrs is a list of (attr, value) tuples        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)        uattrs = []        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds        for key, value in attrs:            if type(value) != type(u''):                value = unicode(value, self.encoding)            uattrs.append((unicode(key, self.encoding), value))        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)        if tag in self.elements_no_end_tag:            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())        else:            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())    def unknown_endtag(self, tag):        # called for each end tag, e.g. for </pre>, tag will be 'pre'        # Reconstruct the original end tag.        if tag not in self.elements_no_end_tag:            self.pieces.append("</%(tag)s>" % locals())    def handle_charref(self, ref):        # called for each character reference, e.g. for '&#160;', ref will be '160'        # Reconstruct the original character reference.        self.pieces.append('&#%(ref)s;' % locals())            def handle_entityref(self, ref):        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'        # Reconstruct the original entity reference.        self.pieces.append('&%(ref)s;' % locals())    def handle_data(self, text):        # called for each block of plain text, i.e. outside of any tag and        # not containing any character or entity references        # Store the original text verbatim.        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)        self.pieces.append(text)            def handle_comment(self, text):        # called for each HTML comment, e.g. <!-- insert Javascript code here -->        # Reconstruct the original comment.        self.pieces.append('<!--%(text)s-->' % locals())            def handle_pi(self, text):        # called for each processing instruction, e.g. <?instruction>        # Reconstruct original processing instruction.        self.pieces.append('<?%(text)s>' % locals())    def handle_decl(self, text):        # called for the DOCTYPE, if present, e.g.        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"        #     "http://www.w3.org/TR/html4/loose.dtd">        # Reconstruct original DOCTYPE        self.pieces.append('<!%(text)s>' % locals())            _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match    def _scan_name(self, i, declstartpos):        rawdata = self.rawdata        n = len(rawdata)        if i == n:            return None, -1        m = self._new_declname_match(rawdata, i)        if m:            s = m.group()            name = s.strip()            if (i + len(s)) == n:                return None, -1  # end of buffer            return name.lower(), m.end()        else:            self.handle_data(rawdata)#            self.updatepos(declstartpos, i)            return None, -1    def output(self):        '''Return processed HTML as a single string'''        return ''.join([str(p) for p in self.pieces])class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):    def __init__(self, baseuri, baselang, encoding):        sgmllib.SGMLParser.__init__(self)        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)    def decodeEntities(self, element, data):        data = data.replace('&#60;', '&lt;')        data = data.replace('&#x3c;', '&lt;')        data = data.replace('&#62;', '&gt;')        data = data.replace('&#x3e;', '&gt;')        data = data.replace('&#38;', '&amp;')        data = data.replace('&#x26;', '&amp;')        data = data.replace('&#34;', '&quot;')        data = data.replace('&#x22;', '&quot;')        data = data.replace('&#39;', '&apos;')        data = data.replace('&#x27;', '&apos;')        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):            data = data.replace('&lt;', '<')            data = data.replace('&gt;', '>')            data = data.replace('&amp;', '&')            data = data.replace('&quot;', '"')            data = data.replace('&apos;', "'")        return data        class _RelativeURIResolver(_BaseHTMLProcessor):    relative_uris = [('a', 'href'),                     ('applet', 'codebase'),                     ('area', 'href'),                     ('blockquote', 'cite'),                     ('body', 'background'),                     ('del', 'cite'),                     ('form', 'action'),                     ('frame', 'longdesc'),                     ('frame', 'src'),                     ('iframe', 'longdesc'),                     ('iframe', 'src'),                     ('head', 'profile'),                     ('img', 'longdesc'),                     ('img', 'src'),                     ('img', 'usemap'),                     ('input', 'src'),                     ('input', 'usemap'),                     ('ins', 'cite'),                     ('link', 'href'),                     ('object', 'classid'),                     ('object', 'codebase'),                     ('object', 'data'),                     ('object', 'usemap'),                     ('q', 'cite'),                     ('script', 'src')]    def __init__(self, baseuri, encoding):        _BaseHTMLProcessor.__init__(self, encoding)        self.baseuri = baseuri    def resolveURI(self, uri):        return _urljoin(self.baseuri, uri)        def unknown_starttag(self, tag, attrs):        attrs = self.normalize_attrs(attrs)        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)        def _resolveRelativeURIs(htmlSource, baseURI, encoding):    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')    p = _RelativeURIResolver(baseURI, encoding)    p.feed(htmlSource)    return p.output()class _HTMLSanitizer(_BaseHTMLProcessor):    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',      'thead', 'tr', 'tt', 'u', 'ul', 'var']    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',      'usemap', 'valign', 'value', 'vspace', 'width']    unacceptable_elements_with_end_tag = ['script', 'applet']    def reset(self):        _BaseHTMLProcessor.reset(self)        self.unacceptablestack = 0            def unknown_starttag(self, tag, attrs):        if not tag in self.acceptable_elements:            if tag in self.unacceptable_elements_with_end_tag:                self.unacceptablestack += 1            return        attrs = self.normalize_attrs(attrs)        attr

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?