📄 crawler.py
    def action(self):

        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:

                if not self.resuming:

                    if self.buffer and self._pushflag:
                        self.push_buffer()

                    self.stateobj.set(self, CRAWLER_WAITING)
                    obj = objects.queuemgr.get_url_data("crawler")

                    if not obj:
                        if self._endflag:
                            break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()

                        debug('OBJECT IS NONE, CONTINUING...', self)
                        continue

                    self.set_url_object(obj)

                    if self.url is None:
                        debug('NULL URLOBJECT', self)
                        continue

                    # We need to do the rules (violation) check here also
                    if self.url.violates_rules():
                        extrainfo("Filtered URL", self.url)
                        continue

                    # Do a crawl to generate new objects
                    # only after trying to push buffer
                    # objects.
                    self.crawl_url()
                    self._loops += 1

                    # Sleep for some time
                    self.sleep()

                # If I had resumed from a saved state, set resuming flag
                # to false
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def apply_url_priority(self, url_obj):
        """ Apply priority to url objects """

        cfg = objects.config

        # Set initial priority to previous url's generation
        url_obj.priority = self.url.generation

        # Get priority
        curr_priority = url_obj.priority

        # html files (webpages) get higher priority
        if url_obj.is_webpage():
            curr_priority -= 1

        # Apply any priorities specified based on file extensions in
        # the config file.
        pr_dict1, pr_dict2 = cfg.urlprioritydict, cfg.serverprioritydict

        # Get file extension
        extn = ((os.path.splitext(url_obj.get_filename()))[1]).lower()
        # Skip the '.'
        extn = extn[1:]

        # Get domain (server)
        domain = url_obj.get_domain()

        # Apply url priority
        if extn in pr_dict1:
            curr_priority -= int(pr_dict1[extn])

        # Apply server priority; this allows a partial
        # key match
        for key in pr_dict2:
            # Apply the first match, using the matching key
            # to look up the priority value
            if domain.find(key) != -1:
                curr_priority -= int(pr_dict2[key])
                break

        # Set priority again
        url_obj.priority = curr_priority

        return HARVESTMAN_OK

    def crawl_url(self):
        """ Crawl a web page, recursively downloading its links """

        # Raise before crawl event...
        if objects.eventmgr.raise_event('beforecrawl', self.url, self.document) == False:
            extrainfo('Not crawling this url', self.url)
            return

        if not self._download:
            debug('DOWNLOAD FLAG UNSET!', self)
            return None

        self.stateobj.set(self, CRAWLER_CRAWLING)
        info('Fetching links', self.url)

        priority_indx = 0

        # print self.links
        for url_obj in self.links:
            # Check for status flag to end loop
            if self._endflag:
                break

            if not url_obj:
                continue

            url_obj.generation = self.url.generation + 1
            typ = url_obj.get_type()

            # Type based filtering
            if typ == 'javascript':
                if not self._configobj.javascript:
                    continue
            elif typ == 'javaapplet':
                if not self._configobj.javaapplet:
                    continue

            # Check for basic rules of download
            if url_obj.violates_rules():
                extrainfo("Filtered URL", url_obj.get_full_url())
                continue

            priority_indx += 1
            self.apply_url_priority(url_obj)

            if not objects.queuemgr.push(url_obj, "crawler"):
                if self._pushflag:
                    self.buffer.append(url_obj)

        objects.eventmgr.raise_event('aftercrawl', self.url, self.document)
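
# --- Illustrative sketch, not part of the original crawler.py ---
# The priority arithmetic in apply_url_priority() above is easier to see in
# isolation. The helper below is a hypothetical, simplified rewrite: it takes
# plain values instead of a url object, plus two dicts mirroring
# cfg.urlprioritydict (extension -> boost) and cfg.serverprioritydict
# (server substring -> boost). Lower numbers mean higher priority.

def _sketch_compute_priority(generation, is_webpage, extn, domain,
                             ext_priorities, server_priorities):
    """Hypothetical standalone version of the apply_url_priority() logic."""
    priority = generation
    if is_webpage:
        # Web pages get a small boost so new links keep getting discovered
        priority -= 1
    if extn in ext_priorities:
        priority -= int(ext_priorities[extn])
    for key, boost in server_priorities.items():
        # First partial match on the domain wins, as in the loop above
        if key in domain:
            priority -= int(boost)
            break
    return priority

# Example: an .html page, two generations deep, on a prioritized server:
# _sketch_compute_priority(2, True, 'html', 'docs.example.com',
#                          {'html': 1}, {'example.com': 2})   # -> -2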

class HarvestManUrlFetcher(HarvestManBaseUrlCrawler):
    """ This is the fetcher class, which downloads data for a url
    and writes its files. It also posts the data for web pages
    to a data queue """

    def __init__(self, index, url_obj=None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        self._fetchtime = 0
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "fetcher"
        self.make_html_parser()

    def make_html_parser(self, choice=0):

        if choice == 0:
            self.wp = pageparser.HarvestManSimpleParser()
        elif choice == 1:
            try:
                self.wp = pageparser.HarvestManSGMLOpParser()
            except ImportError:
                self.wp = pageparser.HarvestManSimpleParser()

        # Enable/disable features
        if self.wp is not None:
            for feat, val in self._configobj.htmlfeatures:
                # print feat, '=>', val
                if val:
                    self.wp.enable_feature(feat)
                else:
                    self.wp.disable_feature(feat)

    def get_fetch_timestamp(self):
        """ Return the time stamp before fetching """
        return self._fetchtime

    def set_url_object(self, obj):

        if not obj:
            return False

        try:
            prior, url_obj = obj
            # url_obj = GetUrlObject(indx)
        except TypeError:
            url_obj = obj

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)

    def action(self):

        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:

                if not self.resuming:

                    if self.buffer and self._pushflag:
                        debug('Trying to push buffer...')
                        self.push_buffer()

                    self.stateobj.set(self, FETCHER_WAITING)
                    obj = objects.queuemgr.get_url_data("fetcher")

                    if not obj:
                        if self._endflag:
                            break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()

                        continue

                    if not self.set_url_object(obj):
                        debug('NULL URLOBJECT', self)
                        if self._endflag:
                            break
                        continue

                    # Process to generate new objects
                    # only after trying to push buffer
                    # objects.
                    self.process_url()

                    # Raise "afterfetch" event
                    objects.eventmgr.raise_event('afterfetch', self.url)

                    self._loops += 1

                    # Sleep for some random time
                    self.sleep()

                # Set resuming flag to False
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def offset_links(self, links):
        """ Calculate a new list by applying any offset params
        on the list of links """

        n = len(links)

        # Check for any links offset params - if so trim
        # the list of links to the supplied offset values
        offset_start = self._configobj.linksoffsetstart
        offset_end = self._configobj.linksoffsetend

        # Check for negative values for end offset
        # This is considered as follows.
        # -1 => Till and including end of list
        # -2 => Till and including (n-1) element
        # -3 => Till and including (n-2) element
        # like that... upto -(n-1)...
        if offset_end < 0:
            # Map -1 -> n, -2 -> n-1, -3 -> n-2, ...
            offset_end = n + offset_end + 1

        # If we still get a negative value for offset end,
        # discard it and use the list till the end
        if offset_end < 0:
            offset_end = n

        # Start offset should not have negative values
        if offset_start >= 0:
            return links[offset_start:offset_end]
        else:
            return links[:offset_end]
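
    # --- Illustrative note, not part of the original crawler.py ---
    # A worked example of the negative end-offset mapping used by
    # offset_links() above, assuming 5 links ['a', 'b', 'c', 'd', 'e'],
    # offset_start = 0 and offset_end as shown:
    #
    #   offset_end = -1  ->  end = 5  ->  ['a', 'b', 'c', 'd', 'e']
    #   offset_end = -2  ->  end = 4  ->  ['a', 'b', 'c', 'd']
    #   offset_end = -3  ->  end = 3  ->  ['a', 'b', 'c']
    #
    # i.e. links[offset_start:n + offset_end + 1], falling back to the
    # whole list when the computed end is still negative.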
    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        data = ''

        # Raise "beforefetch" event...
        if objects.eventmgr.raise_event('beforefetch', self.url) == False:
            return

        if self.url.qstatus == urlparser.URL_NOT_QUEUED:
            info('Downloading', self.url.get_full_url())
            # About to fetch
            self._fetchtime = time.time()
            self.stateobj.set(self, FETCHER_DOWNLOADING)
            data = objects.datamgr.download_url(self, self.url)

        # Add webpage links in datamgr, if we managed to
        # download the url
        url_obj = self.url

        if self.url.is_webpage() and data:
            # Create a HarvestMan document with all data we have...
            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforeparse" event...
            if objects.eventmgr.raise_event('beforeparse', self.url, document) == False:
                return

            # Check if this page was already crawled
            url = self.url.get_full_url()
            sh = sha.new(data)
            # Set this hash on the URL object itself
            self.url.pagehash = str(sh.hexdigest())

            extrainfo("Parsing web page", self.url)
            self.stateobj.set(self, FETCHER_PARSING)

            links = []

            # Perform any Javascript based redirection etc
            if self._configobj.javascript:
                skipjsparse = False

                # Raise "beforejsparse" event...
                if objects.eventmgr.raise_event('beforejsparse', self.url, document) == False:
                    # Don't return, skip this...
                    skipjsparse = True

                if not skipjsparse:
                    try:
                        parser = JSParser()
                        parser.parse(data)
                        if parser.locnchanged:
                            redirect_url = parser.getLocation().href
                            extrainfo("Javascript redirection to", redirect_url)
                            links.append((urlparser.URL_TYPE_ANY, redirect_url))
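
# --- Illustrative sketch, not part of the original crawler.py ---
# process_url() above fingerprints each downloaded page with the (Python 2)
# `sha` module so that already-crawled content can be recognised by its hash.
# Below is a minimal standalone version of the same idea using hashlib; the
# function and variable names are assumptions for illustration, not
# HarvestMan APIs.

import hashlib

def sketch_page_hash(data):
    """Return a hex digest usable as a duplicate-content fingerprint."""
    if isinstance(data, str):
        data = data.encode('utf-8', 'replace')
    return hashlib.sha1(data).hexdigest()

def sketch_seen_before(data, seen_hashes):
    """Record the page hash in seen_hashes; return True if already present."""
    digest = sketch_page_hash(data)
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False

# Example usage:
#   seen = set()
#   sketch_seen_before('<html>hello</html>', seen)   # -> False
#   sketch_seen_before('<html>hello</html>', seen)   # -> True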