📄 crawler.py
                    # DOM modification parsing logic is rudimentary and will
                    # screw up original page data most of the time!
                    #elif parser.domchanged:
                    #    extrainfo("Javascript modified page DOM, using modified data to construct URLs...")
                    #    # Get new content
                    #    datatemp = repr(parser.getDocument())
                    #    # Somehow if data is NULL, don't use it
                    #    if len(datatemp) != 0:
                    #        data = datatemp
                    #    # print data

                except JSParserException, e:
                    # No point printing this as error, since the parser is very baaaasic!
                    # debug("Javascript parsing error =>", e)
                    pass

            # Raise "afterjsparse" event
            objects.eventmgr.raise_event('afterjsparse', self.url, document, links=links)

            parsecount = 0
            while True:
                try:
                    parsecount += 1

                    self.wp.reset()
                    self.wp.set_url(self.url)
                    self.wp.feed(data)

                    # Bug Fix: If the <base href="..."> tag was defined in the
                    # web page, relative urls must be constructed against
                    # the url provided in <base href="...">
                    if self.wp.base_url_defined():
                        url = self.wp.get_base_url()
                        if not self.url.is_equal(url):
                            debug("Base url defined, replacing", self.url)
                            # Construct a url object
                            url_obj = urlparser.HarvestManUrl(url,
                                                              URL_TYPE_BASE,
                                                              0, self.url,
                                                              self._configobj.projdir)

                            # Change document
                            objects.datamgr.add_url(url_obj)
                            document.set_url(url_obj)

                    self.wp.close()

                    # Related to issue #25 - Print a message if parsing went through
                    # in a 2nd attempt
                    if parsecount > 1:
                        extrainfo('Parsed web page successfully in second attempt', self.url)
                    break
                except (SGMLParseError, IOError), e:
                    error('SGML parse error:', str(e))
                    error('Error in parsing web-page %s' % self.url)

                    if self.wp.typ == 0:
                        # Parse error occurred with Python parser
                        debug('Trying to reparse using the HarvestManSGMLOpParser...')
                        self.make_html_parser(choice=1)
                    else:
                        break
                #except ValueError, e:
                #    break
                #except Exception, e:
                #    break

            if self._configobj.robots:
                # Check for NOFOLLOW tag
                if not self.wp.can_follow:
                    extrainfo('URL %s defines META Robots NOFOLLOW flag, not following its children...' % self.url)
                    return data

            links.extend(self.wp.links)
            # print 'LINKS=>',self.wp.links
            #for typ, link in links:
            #    print 'Link=>',link

            # Let us update some stuff on the document...
            document.keywords = self.wp.keywords[:]
            document.description = self.wp.description
            document.title = self.wp.title

            # Raise "afterparse" event...
            objects.eventmgr.raise_event('afterparse', self.url, document, links=links)

            # Sometimes image links are provided in webpages as regular <a href=".."> links.
            # So in order to filter images fully, we need to check the wp.links list also.
            # Sample site: http://www.sheppeyseacadets.co.uk/gallery_2.htm
            if self._configobj.images:
                links += self.wp.images
            else:
                # Filter any links with image extensions out from links
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.image_extns]

            #for typ, link in links:
            #    print 'Link=>',link

            self.wp.reset()

            # Filter like that for video, flash & audio
            if not self._configobj.movies:
                # Filter any links with video extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.movie_extns]

            if not self._configobj.flash:
                # Filter any links with flash extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.flash_extns]
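            # --- Illustrative aside (not part of the original crawler code) ---
            # The filter blocks above and below all apply the same technique:
            # drop a (type, link) pair whenever the link's extension belongs to a
            # configured extension set (netinfo.image_extns, movie_extns, etc.).
            # A minimal stand-alone sketch of that idea, using hypothetical
            # stand-in extension sets instead of the netinfo module:
            #
            #     IMAGE_EXTNS = frozenset(['.jpg', '.jpeg', '.png', '.gif'])
            #
            #     def drop_by_extension(pairs, extns):
            #         # Keep (typ, link) pairs whose extension is not in extns.
            #         # rfind('.') returns -1 when there is no dot, so the slice
            #         # degrades to the last character, which never matches a
            #         # '.xxx' entry and therefore keeps extension-less links.
            #         return [(typ, link) for typ, link in pairs
            #                 if link[link.rfind('.'):].lower() not in extns]
            #
            #     drop_by_extension([('normal', 'page.html'), ('image', 'logo.PNG')],
            #                       IMAGE_EXTNS)
            #     # -> [('normal', 'page.html')]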
            if not self._configobj.sounds:
                # Filter any links with audio extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.sound_extns]

            if not self._configobj.documents:
                # Filter any links with popular document extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.document_extns]

            links = self.offset_links(links)
            # print "Filtered links",links

            # Create collection object
            coll = HarvestManAutoUrlCollection(url_obj)

            children = []

            for typ, url in links:

                if not url or len(url) == 0:
                    continue

                is_cgi, is_php = False, False

                # Not sure of the logical validity of the following 2 lines anymore...!
                # This is old code...
                if url.find('php?') != -1:
                    is_php = True
                if typ == 'form' or is_php:
                    is_cgi = True

                # print 'URL=>',url,url_obj.get_full_url()

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           typ,
                                                           is_cgi,
                                                           url_obj)
                    # print url, child_urlobj.get_full_url()

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError, e:
                    error('URL Error:', e)
                    continue

            # objects.queuemgr.endloop(True)

            # Update the document again...
            for child in children:
                document.add_child(child)

            if not objects.queuemgr.push((url_obj.priority, coll, document), 'fetcher'):
                if self._pushflag:
                    self.buffer.append((url_obj.priority, coll, document))

            # Update links called here
            objects.datamgr.update_links(url_obj, coll)

            return data

        elif self.url.is_stylesheet() and data:

            # Parse stylesheet to find all contained URLs,
            # including imported stylesheets, if any.

            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforecssparse" event...
            if objects.eventmgr.raise_event('beforecssparse', self.url, document) == False:
                # Don't do anything with this URL...
                return

            sp = pageparser.HarvestManCSSParser()
            sp.feed(data)

            objects.eventmgr.raise_event('aftercssparse', self.url, links=sp.links)

            links = self.offset_links(sp.links)

            # Filter the CSS URLs also w.r.t rules
            # Filter any links with image extensions out from links
            if not self._configobj.images:
                links = [link for link in links if link[link.rfind('.'):].lower() not in netinfo.image_extns]

            children = []

            # Create collection object
            coll = HarvestManAutoUrlCollection(self.url)

            # Add these links to the queue
            for url in links:
                if not url:
                    continue

                # There is no type information - so look at the
                # extension of the URL. If ending with .css then
                # add as stylesheet type, else as generic type.
                if url.lower().endswith('.css'):
                    urltyp = URL_TYPE_STYLESHEET
                else:
                    urltyp = URL_TYPE_ANY

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           urltyp,
                                                           False,
                                                           self.url)

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError:
                    continue
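            # --- Illustrative aside (not part of the original crawler code) ---
            # Both the HTML branch above and this CSS branch register child URLs
            # with the same pattern: skip a URL the data manager already knows,
            # otherwise record it, add it to the page's collection and remember
            # it as a child of the current document. A minimal sketch of that
            # de-duplicate-and-collect idea, with a plain set and list standing
            # in for objects.datamgr and HarvestManAutoUrlCollection:
            #
            #     def collect_children(urls, seen, collection):
            #         children = []
            #         for u in urls:
            #             if u in seen:            # already queued or crawled
            #                 continue
            #             seen.add(u)              # register with the "data manager"
            #             collection.append(u)     # add to the page's URL collection
            #             children.append(u)
            #         return children
            #
            #     seen = set(['http://example.com/a.css'])
            #     collect_children(['http://example.com/a.css',
            #                       'http://example.com/b.png'], seen, [])
            #     # -> ['http://example.com/b.png']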
            # Update the document...
            for child in children:
                document.add_child(child)

            if not objects.queuemgr.push((self.url.priority, coll, document), 'fetcher'):
                if self._pushflag:
                    self.buffer.append((self.url.priority, coll, document))

            # Update links called here
            objects.datamgr.update_links(self.url, coll)

            # Successful case - return the data
            return data
        else:
            # Don't do anything
            return None


class HarvestManUrlDownloader(HarvestManUrlFetcher, HarvestManUrlCrawler):
    """ This is a mixin class which does both the jobs of crawling
    webpages and downloading URLs """

    def __init__(self, index, url_obj=None, isThread=True):
        HarvestManUrlFetcher.__init__(self, index, url_obj, isThread)
        self.set_url_object(url_obj)

    def _initialize(self):
        HarvestManUrlFetcher._initialize(self)
        HarvestManUrlCrawler._initialize(self)
        self._role = 'downloader'

    def set_url_object(self, obj):
        HarvestManUrlFetcher.set_url_object(self, obj)

    def set_url_object2(self, obj):
        HarvestManUrlCrawler.set_url_object(self, obj)

    def exit_condition(self, caller):
        # Exit condition for single thread case
        if caller == 'crawler':
            return (objects.queuemgr.data_q.qsize() == 0)
        elif caller == 'fetcher':
            return (objects.queuemgr.url_q.qsize() == 0)

        return False

    def is_exit_condition(self):
        return (self.exit_condition('crawler') and self.exit_condition('fetcher'))

    def action(self):
        if self._isThread:
            self._loops = 0

            while not self._endflag:
                obj = objects.queuemgr.get_url_data("downloader")
                if not obj:
                    continue

                self.set_url_object(obj)

                self.process_url()
                self.crawl_url()

                self._loops += 1
                self.sleep()
        else:
            while True:
                self.process_url()

                obj = objects.queuemgr.get_url_data("crawler")
                if obj:
                    self.set_url_object2(obj)

                    if self.url.is_webpage():
                        self.crawl_url()

                obj = objects.queuemgr.get_url_data("fetcher")
                self.set_url_object(obj)

                if self.is_exit_condition():
                    break

    def process_url(self):
        # First process urls using fetcher's algorithm
        HarvestManUrlFetcher.process_url(self)

    def crawl_url(self):
        HarvestManUrlCrawler.crawl_url(self)
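# --- Illustrative aside (not part of the original HarvestMan source) ---
# In the non-threaded branch of HarvestManUrlDownloader.action() above, a single
# object alternates between the fetcher and crawler roles and stops only when
# both work queues are empty (see exit_condition/is_exit_condition). A minimal
# sketch of that alternate-until-both-queues-drain control flow, with plain
# Queue.Queue objects standing in for objects.queuemgr's url_q and data_q and
# with hypothetical process_url/crawl_url callables:
#
#     import Queue
#
#     def run_single_threaded(url_q, data_q, process_url, crawl_url):
#         while True:
#             try:
#                 process_url(url_q.get_nowait())    # fetcher role
#             except Queue.Empty:
#                 pass
#             try:
#                 crawl_url(data_q.get_nowait())     # crawler role
#             except Queue.Empty:
#                 pass
#             if url_q.empty() and data_q.empty():   # both roles out of work
#                 break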