pageparser.py
来自「Harvestman-最新版本」· Python 代码 · 共 585 行 · 第 1/2 页
PY
585 行
# -- coding: utf-8""" pageparser.py - Module to parse an html page and extract its links. This module is part of the HarvestMan program. Author: Anand B Pillai <abpillai at gmail dot com> For licensing information see the file LICENSE.txt that is included in this distribution. Modification History ==================== Jan 2007 Anand Complete support for META robot tags implemented. Requested by jim sloan of MCHS. Mar 06 2007 Anand Added support for HTML EMBED & OBJECT tags. Apr 18 2007 Anand Made to use the urltypes module. Apr 19 2007 Anand Created class HarvestManCSSParser to take care of parsing stylesheet content to extract URLs. Aug 28 2007 Anand Added a parser baed on Effbot's sgmlop to parse pages with errors - as a part of fixes for #491. Sep 05 2007 Anand Added a basic javascript parser to parse Javascript statements - currently this can perform Javascript based site redirection. Sep 10 2007 Anand Added logic to filter junk links produced by web-directory pages. Oct 3 2007 Anand Removed class HarvestManJSParser since its functionality and additional DOM processing is done by the new JSParser class. Apr 4 2008 Anand Fix for EIAO bug #812. Apr 6 2008 Anand Added ParseTag class and features for EIAO bug #808. Copyright (C) 2004 Anand B Pillai. """__version__ = '2.0 b1'__author__ = 'Anand B Pillai'import refrom sgmllib import SGMLParserfrom harvestman.lib.urltypes import *from harvestman.lib.common.common import *from harvestman.lib.common.macros import *class ParseTag(object): """ Class representing a tag which is parsed by the HTML parser(s) """ def __init__(self, tag, tagdict, pattern=None, enabled=True): # Tag is the name of the tag (element) which will be parsed. # Tagdict is a dictionary which contains the attributes # of the tag which we are interested as keys and the type # of URL the value of the attribute will be saved as, as # the value. If there are more than one type of URL for this # attribute key, then the value is a list. 
# For example valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR] }, # {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET'}. self.tag = tag self.tagdict = tagdict self.enabled = enabled self.pattern = pattern def disable(self): """ Disable parsing of this tag """ self.enabled = False def enable(self): """ Enable parsing of this tag """ self.enabled = True def isEnabled(self): """ Is this tag enabled ? """ return self.enabled def setPattern(self, pattern): self.pattern = pattern def __eq__(self, item): return self.tag.lower() == item.lower() class HarvestManSimpleParser(SGMLParser): """ An HTML/XHTML parser derived from SGMLParser """ # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.a:_-zA-Z0-9]*', re.UNICODE) # A more lenient form of query regular expression query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE) skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)') # Junk URLs obtained by parsing HTML of web-directory pages # i.e pages with title "Index of...". The filtering is done after # looking at the title of the page. 
    # Pattern matching junk query-style links produced by
    # web-directory ("Index of ...") pages
    index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])')

    # Tags parsed for URLs: each ParseTag names the element, the
    # attribute(s) carrying a URL, and the URL type the value is
    # saved as. Note this is a class-level list shared by all
    # parser instances.
    features = [ ParseTag('a', {'href': URL_TYPE_ANY}),
                 ParseTag('base', {'href' : URL_TYPE_BASE}),
                 ParseTag('frame', {'src' : URL_TYPE_FRAME}),
                 ParseTag('img', {'src': URL_TYPE_IMAGE}),
                 ParseTag('form', {'action': URL_TYPE_FORM}),
                 ParseTag('link', {'href': URL_TYPE_ANY}),
                 ParseTag('body', {'background' : URL_TYPE_IMAGE}),
                 ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}),
                 ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code' : URL_TYPE_JAPPLET}),
                 ParseTag('area', {'href': URL_TYPE_ANY}),
                 ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}),
                 ParseTag('embed', {'src': URL_TYPE_ANY}),
                 ParseTag('object', {'data': URL_TYPE_ANY}),
                 ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ]

    # 'rel' attribute values of <link> tags which are mapped to a
    # specific URL type in unknown_starttag
    handled_rel_types = ( URL_TYPE_STYLESHEET, )

    def __init__(self):
        # URL of the page being parsed
        self.url = None
        # Extracted links and their positions
        self.links = []
        self.linkpos = {}
        self.images = []
        # Keywords
        self.keywords = []
        # Description of page
        self.description = ''
        # Title of page
        self.title = ''
        self.title_flag = True
        # Fix for <base href="..."> links
        self.base_href = False
        # Base url for above
        self.base = None
        # anchor links flag
        self._anchors = True
        # For META robots tag
        self.can_index = True
        self.can_follow = True
        # Current tag
        self._tag = ''
        SGMLParser.__init__(self)
        # Type
        self.typ = 0

    def save_anchors(self, value):
        """ Set the save anchor links flag """

        # Warning: If you set this to true, anchor links on
        # webpages will be saved as separate files.
        self._anchors = value

    def enable_feature(self, tag):
        """ Enable the given tag feature if it is disabled """

        # Membership/index rely on ParseTag.__eq__, which compares the
        # stored tag name case-insensitively against the string 'tag'.
        # NOTE(review): this mutates the shared class-level features
        # list, so it affects every parser instance.
        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.enable()

    def disable_feature(self, tag):
        """ Disable the given tag feature if it is enabled """

        # Same shared-list caveat as enable_feature.
        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.disable()

    def filter_link(self, link):
        """ Function to filter links, we decide here whether to
        handle certain kinds of links """

        if not link:
            return LINK_EMPTY

        # ignore javascript links (From 1.2 version javascript
        # links of the form .js are fetched, but we still ignore
        # the actual javascript actions since there is no
        # javascript engine.)
        llink = link.lower()

        # Skip javascript, mailto, news and directory special tags.
        if self.skip_re.match(llink):
            return LINK_FILTERED

        # If this is a web-directory Index page, then check for
        # match with junk URLs of such index pages
        if self.title.lower().startswith('index of'):
            if self.index_page_re.match(llink):
                # print 'Filtering link',llink
                return LINK_FILTERED

        # Check if we're accepting query style URLs
        if not objects.config.getquerylinks and self.query_re.search(llink):
            debug('Query filtering link',link)
            return LINK_FILTERED

        return LINK_NOT_FILTERED

    def handle_anchor_links(self, link):
        """ Handle links of the form html#..."""

        # if anchor tag, then get rid of anchor #...
        # and only add the webpage link
        if not link:
            return LINK_EMPTY

        # Need to do this here also
        self.check_add_link(URL_TYPE_ANCHOR, link)

        # No point in getting #anchor sort of links
        # since typically they point to anchors in the
        # same page
        index = link.rfind('.html#')
        if index != -1:
            # Keep everything up to and including '.html'
            newhref = link[:(index + 5)]
            self.check_add_link(URL_TYPE_WEBPAGE, newhref)
            return ANCHOR_LINK_FOUND
        else:
            index = link.rfind('.htm#')
            if index != -1:
                # Keep everything up to and including '.htm'
                newhref = link[:(index + 4)]
                self.check_add_link(URL_TYPE_WEBPAGE, newhref)
                return ANCHOR_LINK_FOUND

        return ANCHOR_LINK_NOT_FOUND

    def unknown_starttag(self, tag, attrs):
        """ This method gives you the tag in the html page
        along with its attributes as a list of tuples """

        # Raise event for anybody interested in catching a tagparse event...
        if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False:
            # Don't parse this tag..
            return

        # Set as current tag
        self._tag = tag
        # print self._tag, attrs

        if not attrs:
            return

        isBaseTag = not self.base and tag == 'base'
        # print 'Base=>',isBaseTag

        if tag in self.features:
            d = CaselessDict(attrs)
            parsetag = self.features[self.features.index(tag)]
            # Don't do anything if the feature is disabled
            if not parsetag.isEnabled():
                return

            tagdict = parsetag.tagdict

            link = ''
            for key, typ in tagdict.items():
                # If there is a <base href="..."> tag
                # set self.base_href
                if isBaseTag and key=='href':
                    self.base_href = True
                    try:
                        self.base = d[key]
                    # NOTE(review): bare except swallows every error
                    # (likely intended to be 'except KeyError')
                    except:
                        self.base_href = False
                        continue

                # if the link already has a value, skip
                # (except for applet tags)
                if tag != 'applet':
                    if link:
                        continue

                if tag == 'link':
                    try:
                        # Fix - only reset typ if it is one
                        # of the valid handled rel types.
                        foundtyp = d['rel'].lower()
                        if foundtyp in self.handled_rel_types:
                            typ = getTypeClass(foundtyp)
                    except KeyError:
                        pass

                try:
                    if tag == 'meta':
                        # Handle meta tag for refresh
                        foundtyp = d.get('http-equiv','').lower()
                        if foundtyp.lower() == 'refresh':
                            link = d.get(key,'')
                            if not link:
                                continue
                            # This will be of the form of either
                            # a time-gap (CONTENT="600") or a time-gap
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?