
📄 crawler.py

📁 Harvestman (latest version)
💻 PY
📖 Page 1 of 3
        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:
                if not self.resuming:
                    if self.buffer and self._pushflag:
                        self.push_buffer()

                    self.stateobj.set(self, CRAWLER_WAITING)
                    obj = objects.queuemgr.get_url_data( "crawler" )

                    if not obj:
                        if self._endflag: break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()

                        debug('OBJECT IS NONE,CONTINUING...',self)
                        continue

                    self.set_url_object(obj)

                    if self.url==None:
                        debug('NULL URLOBJECT',self)
                        continue

                    # We need to do the violates check here also
                    if self.url.violates_rules():
                        extrainfo("Filtered URL",self.url)
                        continue

                # Do a crawl to generate new objects
                # only after trying to push buffer
                # objects.
                self.crawl_url()
                self._loops += 1

                # Sleep for some time
                self.sleep()

                # If I had resumed from a saved state, set resuming flag
                # to false
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def apply_url_priority(self, url_obj):
        """ Apply priority to url objects """

        cfg = objects.config

        # Set initial priority to previous url's generation
        url_obj.priority = self.url.generation

        # Get priority
        curr_priority = url_obj.priority

        # html files (webpages) get higher priority
        if url_obj.is_webpage():
            curr_priority -= 1

        # Apply any priorities specified based on file extensions in
        # the config file.
        pr_dict1, pr_dict2 = cfg.urlprioritydict, cfg.serverprioritydict

        # Get file extension
        extn = ((os.path.splitext(url_obj.get_filename()))[1]).lower()
        # Skip the '.'
        extn = extn[1:]

        # Get domain (server)
        domain = url_obj.get_domain()

        # Apply url priority
        if extn in pr_dict1:
            curr_priority -= int(pr_dict1[extn])

        # Apply server priority, this allows a partial
        # key match
        for key in pr_dict2:
            # Apply the first match
            if domain.find(key) != -1:
                curr_priority -= int(pr_dict2[key])
                break

        # Set priority again
        url_obj.priority = curr_priority

        return HARVESTMAN_OK

    def crawl_url(self):
        """ Crawl a web page, recursively downloading its links """

        # Raise before crawl event...
        if objects.eventmgr.raise_event('beforecrawl', self.url, self.document)==False:
            extrainfo('Not crawling this url',self.url)
            return

        if not self._download:
            debug('DOWNLOAD FLAG UNSET!',self)
            return None

        self.stateobj.set(self, CRAWLER_CRAWLING)

        info('Fetching links', self.url)

        priority_indx = 0
        # print self.links

        for url_obj in self.links:
            # Check for status flag to end loop
            if self._endflag: break

            if not url_obj: continue

            url_obj.generation = self.url.generation + 1
            typ = url_obj.get_type()

            # Type based filtering
            if typ == 'javascript':
                if not self._configobj.javascript:
                    continue
            elif typ == 'javaapplet':
                if not self._configobj.javaapplet:
                    continue

            # Check for basic rules of download
            if url_obj.violates_rules():
                extrainfo("Filtered URL",url_obj.get_full_url())
                continue

            priority_indx += 1
            self.apply_url_priority( url_obj )

            if not objects.queuemgr.push( url_obj, "crawler" ):
                if self._pushflag: self.buffer.append(url_obj)

        objects.eventmgr.raise_event('aftercrawl', self.url, self.document)


class HarvestManUrlFetcher(HarvestManBaseUrlCrawler):
    """ This is the fetcher class, which downloads data for a url
    and writes its files. It also posts the data for web pages
    to a data queue """

    def __init__(self, index, url_obj = None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        self._fetchtime = 0
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "fetcher"
        self.make_html_parser()

    def make_html_parser(self, choice=0):

        if choice==0:
            self.wp = pageparser.HarvestManSimpleParser()
        elif choice==1:
            try:
                self.wp = pageparser.HarvestManSGMLOpParser()
            except ImportError:
                self.wp = pageparser.HarvestManSimpleParser()

        # Enable/disable features
        if self.wp != None:
            for feat, val in self._configobj.htmlfeatures:
                # print feat,'=>',val
                if val: self.wp.enable_feature(feat)
                else: self.wp.disable_feature(feat)

    def get_fetch_timestamp(self):
        """ Return the time stamp before fetching """
        return self._fetchtime

    def set_url_object(self, obj):

        if not obj: return False

        try:
            prior, url_obj = obj
            # url_obj = GetUrlObject(indx)
        except TypeError:
            url_obj = obj

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)

    def action(self):

        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:

                if not self.resuming:
                    if self.buffer and self._pushflag:
                        debug('Trying to push buffer...')
                        self.push_buffer()

                    self.stateobj.set(self, FETCHER_WAITING)

                    obj = objects.queuemgr.get_url_data("fetcher" )

                    if not obj:
                        if self._endflag: break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()

                        continue

                    if not self.set_url_object(obj):
                        debug('NULL URLOBJECT',self)
                        if self._endflag: break
                        continue

                # Process to generate new objects
                # only after trying to push buffer
                # objects.
                self.process_url()

                # Raise "afterfetch" event
                objects.eventmgr.raise_event('afterfetch', self.url)

                self._loops += 1

                # Sleep for some random time
                self.sleep()

                # Set resuming flag to False
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def offset_links(self, links):
        """ Calculate a new list by applying any offset params
        on the list of links """

        n = len(links)

        # Check for any links offset params - if so trim
        # the list of links to the supplied offset values
        offset_start = self._configobj.linksoffsetstart
        offset_end = self._configobj.linksoffsetend

        # Check for negative values for end offset
        # This is considered as follows.
        # -1 => Till and including end of list
        # -2 => Till and including (n-1) element
        # -3 => Till and including (n-2) element
        # like that... upto -(n-1)...
        if offset_end < 0:
            offset_end = n + (offset_end + 1)

        # If we still get negative value for offset end
        # discard it and use list till end
        if offset_end < 0:
            offset_end = n

        # Start offset should not have negative values
        if offset_start >= 0:
            return links[offset_start:offset_end]
        else:
            return links[:offset_end]

    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        data = ''

        # Raise "beforefetch" event...
        if objects.eventmgr.raise_event('beforefetch', self.url)==False:
            return

        if self.url.qstatus==urlparser.URL_NOT_QUEUED:
            info('Downloading', self.url.get_full_url())
            # About to fetch
            self._fetchtime = time.time()
            self.stateobj.set(self, FETCHER_DOWNLOADING)
            data = objects.datamgr.download_url(self, self.url)

        # Add webpage links in datamgr, if we managed to
        # download the url
        url_obj = self.url

        if self.url.is_webpage() and data:
            # Create a HarvestMan document with all data we have...
            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforeparse" event...
            if objects.eventmgr.raise_event('beforeparse', self.url, document)==False:
                return

            # Check if this page was already crawled
            url = self.url.get_full_url()
            sh = sha.new(data)
            # Set this hash on the URL object itself
            self.url.pagehash = str(sh.hexdigest())

            extrainfo("Parsing web page", self.url)
            self.stateobj.set(self, FETCHER_PARSING)

            links = []

            # Perform any Javascript based redirection etc
            if self._configobj.javascript:
                skipjsparse = False

                # Raise "beforejsparse" event...
                if objects.eventmgr.raise_event('beforejsparse', self.url, document)==False:
                    # Don't return, skip this...
                    skipjsparse = True

                if not skipjsparse:
                    try:
                        parser = JSParser()
                        parser.parse(data)

                        if parser.locnchanged:
                            redirect_url = parser.getLocation().href
                            extrainfo("Javascript redirection to", redirect_url)
                            links.append((urlparser.URL_TYPE_ANY, redirect_url))
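
The apply_url_priority() method in the listing lowers a URL's numeric priority for web pages and for any extensions or servers listed in the configuration; judging from the (prior, url_obj) tuples unpacked in set_url_object(), a lower number appears to mean the URL is taken off the queue earlier, though the queue itself is not shown on this page. The standalone sketch below illustrates only that scoring idea; it is not HarvestMan code, and the dictionaries, values, and score_url() name are invented for the example.

import os

# Hypothetical configuration: adjustments keyed by file extension and by
# (partial) server name, standing in for cfg.urlprioritydict / cfg.serverprioritydict.
url_priority_by_extn = {'html': 2, 'pdf': -1}
server_priority_by_key = {'example.org': 1}

def score_url(filename, domain, parent_generation, is_webpage):
    """Return a queue priority for a URL; smaller numbers sort earlier."""
    priority = parent_generation
    if is_webpage:
        # Web pages are favoured so their links are discovered early
        priority -= 1
    # Extension without the leading dot, as in apply_url_priority()
    extn = os.path.splitext(filename)[1].lower()[1:]
    if extn in url_priority_by_extn:
        priority -= int(url_priority_by_extn[extn])
    for key in server_priority_by_key:
        # First partial domain match wins
        if domain.find(key) != -1:
            priority -= int(server_priority_by_key[key])
            break
    return priority

print(score_url('index.html', 'www.example.org', 2, True))    # 2 - 1 - 2 - 1 = -2
print(score_url('paper.pdf', 'other.net', 2, False))          # 2 - (-1) = 3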
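Both crawl_url() and process_url() consult an event manager before doing work: when raise_event('beforecrawl', ...) or raise_event('beforefetch', ...) compares equal to False, the step is skipped. The sketch below shows that veto pattern in isolation under the assumption of a simple callback registry; it is not the actual HarvestMan eventmgr API, and SimpleEventManager, bind(), and the lambda handler are illustrative only.

class SimpleEventManager(object):
    """Toy callback registry using the veto convention seen in the listing."""

    def __init__(self):
        self.handlers = {}                      # event name -> list of callables

    def bind(self, name, handler):
        self.handlers.setdefault(name, []).append(handler)

    def raise_event(self, name, *args):
        result = None
        for handler in self.handlers.get(name, []):
            result = handler(*args)
            if result == False:                 # first veto stops the chain
                return False
        return result

eventmgr = SimpleEventManager()
# Veto crawling of .gif URLs; returning False means "skip this step"
eventmgr.bind('beforecrawl', lambda url, doc: not url.endswith('.gif'))

if eventmgr.raise_event('beforecrawl', 'http://example.com/logo.gif', None) == False:
    print('Not crawling this url')              # mirrors the check in crawl_url()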
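offset_links() trims the fetched link list using linksoffsetstart/linksoffsetend from the configuration, where a negative end offset counts back from the end of the list: -1 keeps everything, -2 drops the last element, and so on. Below is a minimal standalone sketch of just that slicing convention, with plain arguments in place of the config object; the function name and sample data are made up for the example.

def apply_link_offsets(links, offset_start=0, offset_end=-1):
    """Trim a list of links; a negative end offset counts back from the
    end of the list (-1 keeps everything, -2 drops the last element, ...)."""
    n = len(links)
    if offset_end < 0:
        # -1 -> n, -2 -> n - 1, -3 -> n - 2, ...
        offset_end = n + (offset_end + 1)
    if offset_end < 0:
        # Offset pointed before the start of the list; keep the full list
        offset_end = n
    if offset_start >= 0:
        return links[offset_start:offset_end]
    return links[:offset_end]

links = ['a', 'b', 'c', 'd', 'e']
print(apply_link_offsets(links))               # ['a', 'b', 'c', 'd', 'e']
print(apply_link_offsets(links, 1, -2))        # ['b', 'c', 'd']
print(apply_link_offsets(links, 0, 3))         # ['a', 'b', 'c']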
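process_url() hashes the downloaded page data (sha.new(data)) and stores the digest on the URL object, which is what allows checking whether a page with identical content was already crawled. The snippet below sketches that duplicate check on its own, using hashlib.sha1 in place of the old sha module; the seen_hashes set and is_duplicate_page() are illustrative, not HarvestMan's actual bookkeeping.

import hashlib

seen_hashes = set()                             # hypothetical bookkeeping

def is_duplicate_page(data):
    """Return True if a page with byte-identical content was seen before."""
    pagehash = hashlib.sha1(data).hexdigest()
    if pagehash in seen_hashes:
        return True
    seen_hashes.add(pagehash)
    return False

print(is_duplicate_page(b'<html>hello</html>'))   # False, first time seen
print(is_duplicate_page(b'<html>hello</html>'))   # True, same content again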
