pageparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 585 行 · 第 1/2 页

PY
585
字号
# -*- coding: utf-8 -*-
""" pageparser.py - Module to parse an html page and
    extract its links. This module is part of the
    HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Modification History
    ====================

   Jan 2007       Anand              Complete support for META robot tags implemented.
                                     Requested by jim sloan of MCHS.
   Mar 06 2007    Anand              Added support for HTML EMBED & OBJECT tags.
   Apr 18 2007    Anand              Made to use the urltypes module.
   Apr 19 2007    Anand              Created class HarvestManCSSParser to take
                                     care of parsing stylesheet content to extract
                                     URLs.
   Aug 28 2007    Anand              Added a parser based on Effbot's sgmlop
                                     to parse pages with errors - as a part of
                                     fixes for #491.
   Sep 05 2007    Anand              Added a basic javascript parser to parse
                                     Javascript statements - currently this can
                                     perform Javascript based site redirection.
   Sep 10 2007    Anand              Added logic to filter junk links produced
                                     by web-directory pages.
   Oct 3  2007    Anand              Removed class HarvestManJSParser since its
                                     functionality and additional DOM processing
                                     is done by the new JSParser class.
   Apr 4 2008     Anand              Fix for EIAO bug #812.
   Apr 6 2008     Anand              Added ParseTag class and features for EIAO bug
                                     #808.

   Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import re

# NOTE(review): sgmllib is Python 2 stdlib only (removed in Python 3);
# this module targets the Python 2 HarvestMan codebase.
from sgmllib import SGMLParser

from harvestman.lib.urltypes import *
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *


class ParseTag(object):
    """ Class representing a tag which is parsed by the HTML parser(s) """

    def __init__(self, tag, tagdict, pattern=None, enabled=True):
        # Tag is the name of the tag (element) which will be parsed.
        # Tagdict is a dictionary which contains the attributes
        # of the tag which we are interested as keys and the type
        # of URL the value of the attribute will be saved as, as
        # the value. If there are more than one type of URL for this
        # attribute key, then the value is a list.
        #
        # For example valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR] },
        # {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET'}.
        self.tag = tag
        self.tagdict = tagdict
        self.enabled = enabled
        self.pattern = pattern

    def disable(self):
        """ Disable parsing of this tag """

        self.enabled = False

    def enable(self):
        """ Enable parsing of this tag """

        self.enabled = True

    def isEnabled(self):
        """ Is this tag enabled ? """

        return self.enabled

    def setPattern(self, pattern):
        # Optional regex/pattern associated with this tag.
        self.pattern = pattern

    def __eq__(self, item):
        # Case-insensitive comparison against a plain tag-name string;
        # this is what makes `tag in self.features` and
        # `self.features.index(tag)` work on the parser's feature list.
        return self.tag.lower() == item.lower()


class HarvestManSimpleParser(SGMLParser):
    """ An HTML/XHTML parser derived from SGMLParser """

    # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.a:_-zA-Z0-9]*', re.UNICODE)
    # A more lenient form of query regular expression
    query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE)
    skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)')
    # Junk URLs obtained by parsing HTML of web-directory pages
    # i.e pages with title "Index of...".
The filtering is done after    # looking at the title of the page.    index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])')    features = [ ParseTag('a', {'href': URL_TYPE_ANY}),                 ParseTag('base', {'href' : URL_TYPE_BASE}),                 ParseTag('frame', {'src' : URL_TYPE_FRAME}),                 ParseTag('img', {'src': URL_TYPE_IMAGE}),                 ParseTag('form', {'action': URL_TYPE_FORM}),                 ParseTag('link', {'href': URL_TYPE_ANY}),                 ParseTag('body', {'background' : URL_TYPE_IMAGE}),                 ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}),                 ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code' : URL_TYPE_JAPPLET}),                 ParseTag('area', {'href': URL_TYPE_ANY}),                 ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}),                 ParseTag('embed', {'src': URL_TYPE_ANY}),                 ParseTag('object', {'data': URL_TYPE_ANY}),                 ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ]                     handled_rel_types = ( URL_TYPE_STYLESHEET, )        def __init__(self):        self.url = None        self.links = []        self.linkpos = {}        self.images = []        # Keywords        self.keywords = []        # Description of page        self.description = ''        # Title of page        self.title = ''        self.title_flag = True        # Fix for <base href="..."> links        self.base_href = False        # Base url for above        self.base = None        # anchor links flag        self._anchors = True        # For META robots tag        self.can_index = True        self.can_follow = True        # Current tag        self._tag = ''        SGMLParser.__init__(self)        # Type        self.typ = 0            def save_anchors(self, value):        """ Set the save anchor links flag """        # Warning: If you set this to true, anchor links on        # webpages will be saved as separate files.    
    self._anchors = value    def enable_feature(self, tag):        """ Enable the given tag feature if it is disabled """        if tag in self.features:            parsetag = self.features[self.features.index(tag)]            parsetag.enable()    def disable_feature(self, tag):        """ Disable the given tag feature if it is enabled """        if tag in self.features:            parsetag = self.features[self.features.index(tag)]            parsetag.disable()                    def filter_link(self, link):        """ Function to filter links, we decide here whether        to handle certain kinds of links """        if not link:            return LINK_EMPTY        # ignore javascript links (From 1.2 version javascript        # links of the form .js are fetched, but we still ignore        # the actual javascript actions since there is no        # javascript engine.)        llink = link.lower()        # Skip javascript, mailto, news and directory special tags.        if self.skip_re.match(llink):            return LINK_FILTERED        # If this is a web-directory Index page, then check for        # match with junk URLs of such index pages        if self.title.lower().startswith('index of'):            if self.index_page_re.match(llink):                # print 'Filtering link',llink                return LINK_FILTERED                    # Check if we're accepting query style URLs        if not objects.config.getquerylinks and self.query_re.search(llink):            debug('Query filtering link',link)            return LINK_FILTERED        return LINK_NOT_FILTERED    def handle_anchor_links(self, link):        """ Handle links of the form html#..."""        # if anchor tag, then get rid of anchor #...        
# and only add the webpage link        if not link:            return LINK_EMPTY        # Need to do this here also        self.check_add_link(URL_TYPE_ANCHOR, link)        # No point in getting #anchor sort of links        # since typically they point to anchors in the        # same page        index = link.rfind('.html#')        if index != -1:            newhref = link[:(index + 5)]            self.check_add_link(URL_TYPE_WEBPAGE, newhref)            return ANCHOR_LINK_FOUND        else:            index = link.rfind('.htm#')            if index != -1:                newhref = link[:(index + 4)]                self.check_add_link(URL_TYPE_WEBPAGE, newhref)                return ANCHOR_LINK_FOUND        return ANCHOR_LINK_NOT_FOUND    def unknown_starttag(self, tag, attrs):        """ This method gives you the tag in the html        page along with its attributes as a list of        tuples """        # Raise event for anybody interested in catching a tagparse event...        if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False:            # Don't parse this tag..            
return                                             # Set as current tag        self._tag = tag        # print self._tag, attrs                if not attrs: return        isBaseTag = not self.base and tag == 'base'        # print 'Base=>',isBaseTag                if tag in self.features:            d = CaselessDict(attrs)            parsetag = self.features[self.features.index(tag)]            # Don't do anything if the feature is disabled            if not parsetag.isEnabled():                return                        tagdict = parsetag.tagdict                        link = ''            for key, typ in tagdict.items():                # If there is a <base href="..."> tag                # set self.base_href                if isBaseTag and key=='href':                    self.base_href = True                    try:                        self.base = d[key]                    except:                        self.base_href = False                        continue                                # if the link already has a value, skip                # (except for applet tags)                if tag != 'applet':                    if link: continue                if tag == 'link':                    try:                        # Fix - only reset typ if it is one                        # of the valid handled rel types.                        
foundtyp = d['rel'].lower()                        if foundtyp in self.handled_rel_types:                            typ = getTypeClass(foundtyp)                    except KeyError:                        pass                try:                    if tag == 'meta':                        # Handle meta tag for refresh                        foundtyp = d.get('http-equiv','').lower()                        if foundtyp.lower() == 'refresh':                            link = d.get(key,'')                            if not link: continue                            # This will be of the form of either                            # a time-gap (CONTENT="600") or a time-gap

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?