pageparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 585 行 · 第 1/2 页

PY
585
字号
                            # with a URL (CONTENT="0; URL=<url>")                            items = link.split(';')                            if len(items)==1:                                # Only a time-gap, skip it                                continue                            elif len(items)==2:                                # Second one should be a URL                                reqd = items[1]                                # print 'Reqd=>',reqd                                if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1:                                    link = reqd.split('=')[1].strip()                                    # print 'Link=>',link                                else:                                    continue                        else:                            # Handle robots meta tag                            name = d.get('name','').lower()                            if name=='robots':                                robots = d.get('content','').lower()                                # Split to ','                                contents = [item.strip() for item in robots.split(',')]                                # Check for nofollow                                self.can_follow = not ('nofollow' in contents)                                # Check for noindex                                self.can_index = not ('noindex' in contents)                            elif name=='keywords':                                self.keywords = d.get('content','').split(',')                                # Trim the keywords list                                self.keywords = [word.lower().strip() for word in self.keywords]                            elif name=='description':                                self.description = d.get('content','').strip()                            else:                                continue                    elif tag != 'applet':                        link = d[key]                    else:                        link += d[key]                        if key == 'codebase':                            if link:                                if link[-1] != '/':                                    link += '/'                            continue                                                except KeyError:                    continue                # see if this link is to be filtered                if self.filter_link(link) != LINK_NOT_FILTERED:                    # print 'Filtered link',link                    continue                # anchor links in a page should not be saved                        # index = link.find('#')                # Make sure not to wrongly categorize '#' in query strings                # as anchor URLs.                if link.find('#') != -1 and not self.query_re.search(link):                    # print 'Is an anchor link',link                    self.handle_anchor_links(link)                else:                    # append to private list of links                    self.check_add_link(typ, link)    def unknown_endtag(self, tag):                    self._tag = ''        if tag=='title':            self.title_flag = False            self.title = self.title.strip()                def handle_data(self, data):        if self._tag.lower()=='title' and self.title_flag:            self.title += data    def check_add_link(self, typ, link):        """ To avoid adding duplicate links """        f = False        if typ == 'image':            if not (typ, link) in self.images:                self.images.append((typ, link))        elif not (typ, link) in self.links:                # print 'Adding link ', link, typ                pos = self.getpos()                self.links.append((typ, link))                self.linkpos[(typ,link)] = (pos[0],pos[1])                    def add_tag_info(self, taginfo):        """ Add new tag information to this object.        This can be used to change the behavior of this class        at runtime by adding new tags """        # The taginfo object should be a dictionary        # of the form { tagtype : (elementname, elementype) }        # egs: { 'body' : ('background', 'img) }        if type(taginfo) != dict:            raise AttributeError, "Attribute type mismatch, taginfo should be a dictionary!"        # get the key of the dictionary        key = (taginfo.keys())[0]        if len(taginfo[key]) != 2:            raise ValueError, 'Value mismatch, size of tag tuple should be 2'        # get the value tuple        tagelname, tageltype = taginfo[key]        # see if this is an already existing tagtype        if key in self.handled.keys:            _values = self.handled[key]            f=0            for index in xrange(len(_values)):                # if the elementname is also                # the same, just replace it.                v = _values[index]                elname, eltype = v                if elname == tagelname:                    f=1                    _values[index] = (tagelname, tageltype)                    break            # new element, add it to list            if f==0: _values.append((tagelname, tageltype))            return         else:            # new key, directly modify dictionary            elements = []            elements.append((tagelname, tageltype))            self.handled[key] = elements     def reset(self):        SGMLParser.reset(self)        self.url = None        self.base = None        self.links = []        self.images = []        self.base_href = False        self.base_url = ''        self.can_index = True        self.can_follow = True        self.title = ''        self.title_flag = True        self.description = ''        self.keywords = []            def base_url_defined(self):        """ Return whether this url had a        base url of the form <base href='...'>        defined """        return self.base_href    def get_base_url(self):        return self.base    def set_url(self, url):        """ Set the URL whose data is about to be parsed """        self.url = urlclass HarvestManSGMLOpParser(HarvestManSimpleParser):    """ A parser based on effbot's sgmlop """    def __init__(self):        # This module should be built already!        import sgmlop                self.parser = sgmlop.SGMLParser()        self.parser.register(self)        HarvestManSimpleParser.__init__(self)        # Type        self.typ = 1            def finish_starttag(self, tag, attrs):        self.unknown_starttag(tag, attrs)    def finish_endtag(self, tag):        self.unknown_endtag(tag)            def feed(self, data):        self.parser.feed(data)        class HarvestManCSSParser(object):    """ Class to parse stylesheets and extract URLs """    # Regexp to parse stylesheet imports    importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.-:/]+)(\"?)', re.MULTILINE|re.LOCALE|re.UNICODE)    importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.-:/]+)(\"?\))', re.MULTILINE|re.LOCALE|re.UNICODE)    # Regexp to parse URLs inside CSS files    cssurl = re.compile(r'(url\()([^\)]+)(\))', re.LOCALE|re.UNICODE)    def __init__(self):        # Any imported stylesheet URLs        self.csslinks = []        # All URLs including above        self.links = []    def feed(self, data):        self._parse(data)            def _parse(self, data):        """ Parse stylesheet data and extract imported css links, if any """        # Return is a list of imported css links.        # This subroutine uses the specification mentioned at        # http://www.w3.org/TR/REC-CSS2/cascade.html#at-import        # for doing stylesheet imports.        # This takes care of @import "style.css" and        # @import url("style.css") and url(...) syntax.        # Media types specified if any, are ignored.                # Matches for @import "style.css"        l1 = self.importcss1.findall(data)        # Matches for @import url("style.css")        l2 = self.importcss2.findall(data)        # Matches for url(...)        l3 = self.cssurl.findall(data)                for item in (l1+l2):            if not item: continue            url = item[1].replace("'",'').replace('"','')            self.csslinks.append(url)            self.links.append(url)                    for item in l3:            if not item: continue            url = item[1].replace("'",'').replace('"','')            if url not in self.links:                self.links.append(url)if __name__=="__main__":    import os    import config    import logger        SetAlias(config.HarvestManStateObject())    SetAlias(logger.HarvestManLogger())        cfg = objects.config    cfg.verbosity = 5    SetLogSeverity()        cfg.getquerylinks = True        p = HarvestManSimpleParser()    #p.enable_feature('option')    #p = HarvestManSGMLOpParser()        urls = ['http://projecteuler.net/index.php?section=problems']    urls = ['http://www.evvs.dk/index.php?cPath=30&osCsid=3b110c689f01d722dbbe53c5cee0bf2d']    urls = ['http://nltk.sourceforge.net/lite/doc/api/nltk_lite.contrib.fst.draw_graph.GraphEdgeWidget-class.html']    urls = ['http://wiki.java.net/bin/view/Javawsxml/Rome05Tutorials']    urls = ['http://bits.blogs.nytimes.com/2008/02/27/google-goes-after-another-microsoft-cash-cow/?ref=technology']    urls = ['http://mail.python.org/pipermail/bangpypers/2008-March/000410.html']    urls = ['http://www.bad-ischl.ooe.gv.at/system/web/default.aspx']    urls = ['http://europa.eu/languages/']        urls = ['http://www.web2.cz/rs-reference/']    urls = ['http://harvestmanontheweb.com/']    urls = ['http://www.web2.cz/rs-uvod/']    urls = ['http://digitallife.co.in/indian-cheerleaders-for-ipl/']    urls = ['http://www.brodingberg.gv.at']    urls = ["www.malvik.kommune.no"]    urls = ["http://www.gr.ch/Deutsch/index.cfm"]        for url in urls:        if os.system('wget %s -O index.html' % url ) == 0:            p.feed(open('index.html').read())            print p.links, len(p.links)            for link in p.links:                print link[1]                            print p.keywords            print p.description            print p.title            print p.base_href            print p.base                        p.reset()                                   

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?