feedparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,630 行 · 第 1/5 页

PY
1,630
字号
            # NOTE(review): the statements down to the first blank line are the
            # tail of a method whose beginning lies before this excerpt; they
            # are reproduced unchanged.  `author`, `emailmatch`, `context` and
            # `key` are bound in the part that is not visible here.
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            # Remove the matched e-mail text from the author string, then drop
            # an emptied '()' pair and any single leading/trailing parenthesis.
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            # Store the separated name and e-mail under '<key>_detail'.
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

    # NOTE(review): the `_start_*` / `_end_*` methods below, and the aliases
    # bound right after them, are presumably looked up by element name by a
    # dispatcher that is outside this excerpt -- confirm before renaming any.
    # In the push()/pushContent() calls, the trailing argument is the same
    # value that _start_link names `expectingText`; the 'text/plain' /
    # 'text/html' argument is presumably a default content type (confirm
    # against pushContent, which is not visible here).

    def _start_subtitle(self, attrsD):
        """Open a 'subtitle' content element via pushContent."""
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    # Same handler bound under additional names.
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        """Close the 'subtitle' content element via popContent."""
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        """Open a 'rights' content element via pushContent."""
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        """Close the 'rights' content element via popContent."""
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        """Begin a new entry.

        Appends a fresh FeedParserDict to self.entries, pushes 'item', sets
        self.inentry to 1 and resets self.guidislink to 0.  If the element
        has an 'rdf:about' attribute, its value is stored as the 'id' of the
        current context.  Finally hands attrsD to _cdf_common.
        """
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        # NOTE(review): local name `id` shadows the builtin inside this method.
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        """Pop 'item' and clear the self.inentry flag."""
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        """Push 'language'; attrsD is not used."""
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        """Pop 'language' and keep the popped value in self.lang."""
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        """Push 'publisher'; attrsD is not used."""
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        """Pop 'publisher', then call _sync_author_detail('publisher')."""
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        """Push 'published'; attrsD is not used."""
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        """Pop 'published' and save _parse_date(value) as 'published_parsed'."""
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        """Push 'updated'; attrsD is not used."""
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        """Pop 'updated' and save _parse_date(value) as 'updated_parsed'."""
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        """Push 'created'; attrsD is not used."""
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        """Pop 'created' and save _parse_date(value) as 'created_parsed'."""
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        """Push 'expired' (note: the stored key differs from the method name)."""
        self.push('expired', 1)

    def _end_expirationdate(self):
        """Pop 'expired' and save _parse_date of it as 'expired_parsed'."""
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        """Record a 'license' value taken from the 'rdf:resource' attribute.

        The element is pushed and popped inside this start handler; no
        matching end handler for it appears in this excerpt.
        """
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            # Append the attribute value to the third slot of the top
            # element-stack entry (presumably its collected text pieces --
            # confirm against push/pop).
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        """Push 'license'; attrsD is not used."""
        self.push('license', 1)

    def _end_creativecommons_license(self):
        """Pop 'license'."""
        self.pop('license')

    def _addTag(self, term, scheme, label):
        """Add a {'term', 'scheme', 'label'} tag to the current context.

        The 'tags' list is created on the context even when nothing is
        added.  Nothing is added when all three values are falsy, or when an
        equal tag is already in the list.
        """
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            # A second, equal FeedParserDict is built for the append; `value`
            # itself is only used for the membership test.
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        """Add a tag built from the element's attributes, then push 'category'.

        term comes from 'term', scheme from 'scheme' (falling back to
        'domain'), label from 'label'.
        """
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        """Pop 'itunes_keywords' and add one tag per whitespace-separated word."""
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        """Add a tag whose term is the 'text' attribute, then push 'category'."""
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        """Pop 'category' and attach the popped value to the tag list.

        Does nothing when the popped value is falsy.  If the most recent tag
        has a falsy 'term', the value fills it in; otherwise the value is
        added as a new tag with no scheme or label.
        """
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        # Indexing (not .get): relies on 'tags' already existing.  The start
        # handlers visible here call _addTag, which creates it.
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        """Store the element's attributes as context['cloud']."""
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        """Record a link element in the current context.

        Defaults 'rel' to 'alternate' and 'type' to 'text/html' (setdefault
        mutates the dict that was passed in), passes the attributes through
        _itsAnHrefDamnIt, resolves 'href' with resolveURI, and appends the
        result to context['links'].  A rel of 'enclosure' is additionally
        handed to _start_enclosure.
        """
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            # This assignment has no effect within this method: push() is
            # only called in the else branch below.
            expectingText = 0
            # context['link'] is set only for rel == 'alternate' whose mapped
            # content type is one of self.html_types.
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            # No href attribute: push 'link' so that _end_link can pop it.
            # NOTE(review): when href IS present nothing is pushed, yet
            # _end_link still pops 'link' -- confirm how pop() treats that.
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        """Pop 'link' and copy the value into textinput and/or image.

        The two checks are independent `if`s (not if/elif as in _end_title).
        """
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        """Set self.guidislink from the 'ispermalink' attribute and push 'id'.

        guidislink is true when the attribute is absent or equals the exact
        string 'true'.
        """
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        """Pop 'id'; save 'guidislink' and, when it is set, save 'link'."""
        value = self.pop('id')
        # Saved flag is true only when guidislink is set AND the context has
        # no 'link' key yet.
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            # NOTE(review): the "doesn't already have a link" part is not
            # checked here; presumably _save does not overwrite -- confirm.
            self._save('link', value)

    def _start_title(self, attrsD):
        """Open a 'title' content element via pushContent."""
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        """Close 'title'; copy the value into textinput or else image."""
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title

    def _start_description(self, attrsD):
        """Open a description element.

        If the context already has a 'summary', the element is treated as
        content: self._summaryKey is set to 'content' and _start_content is
        called.  Otherwise 'description' is opened via pushContent with
        'text/html'.
        """
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        """Open 'description' via pushContent, passing 'text/plain'."""
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        """Close a description element and reset self._summaryKey to None.

        When self._summaryKey is 'content' the close is delegated to
        _end_content; otherwise 'description' is popped and the value is
        copied into textinput or else image.
        """
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        """Open an 'info' content element via pushContent."""
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        """Close the 'info' content element via popContent."""
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        """Store the element's attributes as 'generator_detail'; push 'generator'.

        When attrsD is non-empty it is first passed through _itsAnHrefDamnIt
        and its 'href', if any, is resolved with resolveURI.
        """
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        """Pop 'generator'; store the value as generator_detail['name'] if present."""
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        """Record 'generator' from the 'rdf:resource' attribute.

        Pushes and pops within the start handler, then sets
        'generator_detail' to {'href': value}.  The detail is stored even
        when the attribute lookup returned a falsy value.
        """
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        """Record 'errorreportsto' from the 'rdf:resource' attribute.

        Pushes and pops within the start handler.
        """
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        """Open a summary element.

        If the context already has a 'summary', the element is treated as
        content (self._summaryKey = 'content', then _start_content).
        Otherwise self._summaryKey is set to 'summary' and that key is
        opened via pushContent.
        """
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        """Close a summary element and reset self._summaryKey to None."""
        if self._summaryKey == 'content':
            self._end_content()
        else:
            # Falls back to 'summary' when _summaryKey is None or otherwise falsy.
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        """Append the enclosure's attributes to context['enclosures'].

        If the enclosure has a truthy 'href' and the context has no truthy
        'id', the href is also stored as the context's 'id'.
        """
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        """Set the self.insource flag; attrsD is not used."""
        self.insource = 1

    def _end_source(self):
        """Clear self.insource and move self.sourcedata into context['source'].

        A deep copy is stored, so the clear() that follows does not empty
        the saved value.
        """
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        """Open a 'content' element; remember its 'src' attribute if present.

        NOTE(review): the page excerpt ends with this method, so it may
        continue past what is visible here.
        """
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        # NOTE(review): push('content') follows pushContent('content') above;
        # whether the second push is required cannot be told from this excerpt.
        self.push('content', 1)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?