📄 crawler.py
                    # DOM modification parsing logic is rudimentary and will
                    # screw up original page data most of the time!
                    #elif parser.domchanged:
                    #    extrainfo("Javascript modified page DOM, using modified data to construct URLs...")
                    #    # Get new content
                    #    datatemp = repr(parser.getDocument())
                    #    # Somehow if data is NULL, don't use it
                    #    if len(datatemp) != 0:
                    #        data = datatemp
                    #    # print data

                except JSParserException, e:
                    # No point printing this as error, since the parser is very baaaasic!
                    # debug("Javascript parsing error =>", e)
                    pass

            # Raise "afterjsparse" event
            objects.eventmgr.raise_event('afterjsparse', self.url, document, links=links)

            parsecount = 0
            while True:
                try:
                    parsecount += 1

                    self.wp.reset()
                    self.wp.set_url(self.url)
                    self.wp.feed(data)

                    # Bug Fix: If the <base href="..."> tag was defined in the
                    # web page, relative urls must be constructed against
                    # the url provided in <base href="...">
                    if self.wp.base_url_defined():
                        url = self.wp.get_base_url()
                        if not self.url.is_equal(url):
                            debug("Base url defined, replacing", self.url)
                            # Construct a url object
                            url_obj = urlparser.HarvestManUrl(url,
                                                              URL_TYPE_BASE,
                                                              0, self.url,
                                                              self._configobj.projdir)

                            # Change document
                            objects.datamgr.add_url(url_obj)
                            document.set_url(url_obj)

                    self.wp.close()

                    # Related to issue #25 - Print a message if parsing went through
                    # in a 2nd attempt
                    if parsecount > 1:
                        extrainfo('Parsed web page successfully in second attempt', self.url)
                    break
                except (SGMLParseError, IOError), e:
                    error('SGML parse error:', str(e))
                    error('Error in parsing web-page %s' % self.url)

                    if self.wp.typ == 0:
                        # Parse error occurred with Python parser
                        debug('Trying to reparse using the HarvestManSGMLOpParser...')
                        self.make_html_parser(choice=1)
                    else:
                        break
                #except ValueError, e:
                #    break
                #except Exception, e:
                #    break

            if self._configobj.robots:
                # Check for NOFOLLOW tag
                if not self.wp.can_follow:
                    extrainfo('URL %s defines META Robots NOFOLLOW flag, not following its children...' % self.url)
                    return data

            links.extend(self.wp.links)
            # print 'LINKS=>',self.wp.links
            #for typ, link in links:
            #    print 'Link=>',link

            # Let us update some stuff on the document...
            document.keywords = self.wp.keywords[:]
            document.description = self.wp.description
            document.title = self.wp.title

            # Raise "afterparse" event...
            objects.eventmgr.raise_event('afterparse', self.url, document, links=links)

            # Sometimes image links are provided in webpages as regular <a href=".."> links.
            # So in order to filter images fully, we need to check the wp.links list also.
            # Sample site: http://www.sheppeyseacadets.co.uk/gallery_2.htm
            if self._configobj.images:
                links += self.wp.images
            else:
                # Filter any links with image extensions out from links
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.image_extns]

            #for typ, link in links:
            #    print 'Link=>',link

            self.wp.reset()

            # Filter like that for video, flash & audio
            if not self._configobj.movies:
                # Filter any links with video extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.movie_extns]

            if not self._configobj.flash:
                # Filter any links with flash extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.flash_extns]
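            # --- Illustrative aside (not part of the original crawler code) ---
            # The filter blocks above and below all apply the same technique:
            # drop a (type, link) pair whenever the link's extension belongs to a
            # configured extension set (netinfo.image_extns, movie_extns, etc.).
            # A minimal stand-alone sketch of that idea, using hypothetical
            # stand-in extension sets instead of the netinfo module:
            #
            #     IMAGE_EXTNS = frozenset(['.jpg', '.jpeg', '.png', '.gif'])
            #
            #     def drop_by_extension(pairs, extns):
            #         # Keep (typ, link) pairs whose extension is not in extns.
            #         # rfind('.') returns -1 when there is no dot, so the slice
            #         # degrades to the last character, which never matches a
            #         # '.xxx' entry and therefore keeps extension-less links.
            #         return [(typ, link) for typ, link in pairs
            #                 if link[link.rfind('.'):].lower() not in extns]
            #
            #     drop_by_extension([('normal', 'page.html'), ('image', 'logo.PNG')],
            #                       IMAGE_EXTNS)
            #     # -> [('normal', 'page.html')]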
            if not self._configobj.sounds:
                # Filter any links with audio extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.sound_extns]

            if not self._configobj.documents:
                # Filter any links with popular document extensions out from links...
                links = [(type, link) for type, link in links if link[link.rfind('.'):].lower() not in \
                         netinfo.document_extns]

            links = self.offset_links(links)
            # print "Filtered links",links

            # Create collection object
            coll = HarvestManAutoUrlCollection(url_obj)

            children = []

            for typ, url in links:

                if not url or len(url) == 0:
                    continue

                is_cgi, is_php = False, False

                # Not sure of the logical validity of the following 2 lines anymore...!
                # This is old code...
                if url.find('php?') != -1:
                    is_php = True
                if typ == 'form' or is_php:
                    is_cgi = True

                # print 'URL=>',url,url_obj.get_full_url()

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           typ,
                                                           is_cgi,
                                                           url_obj)
                    # print url, child_urlobj.get_full_url()

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError, e:
                    error('URL Error:', e)
                    continue

            # objects.queuemgr.endloop(True)

            # Update the document again...
            for child in children:
                document.add_child(child)

            if not objects.queuemgr.push((url_obj.priority, coll, document), 'fetcher'):
                if self._pushflag:
                    self.buffer.append((url_obj.priority, coll, document))

            # Update links called here
            objects.datamgr.update_links(url_obj, coll)

            return data

        elif self.url.is_stylesheet() and data:

            # Parse stylesheet to find all contained URLs,
            # including imported stylesheets, if any.

            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforecssparse" event...
            if objects.eventmgr.raise_event('beforecssparse', self.url, document) == False:
                # Don't do anything with this URL...
                return

            sp = pageparser.HarvestManCSSParser()
            sp.feed(data)

            objects.eventmgr.raise_event('aftercssparse', self.url, links=sp.links)

            links = self.offset_links(sp.links)

            # Filter the CSS URLs also w.r.t rules
            # Filter any links with image extensions out from links
            if not self._configobj.images:
                links = [link for link in links if link[link.rfind('.'):].lower() not in netinfo.image_extns]

            children = []

            # Create collection object
            coll = HarvestManAutoUrlCollection(self.url)

            # Add these links to the queue
            for url in links:
                if not url:
                    continue

                # There is no type information - so look at the
                # extension of the URL. If ending with .css then
                # add as stylesheet type, else as generic type.
                if url.lower().endswith('.css'):
                    urltyp = URL_TYPE_STYLESHEET
                else:
                    urltyp = URL_TYPE_ANY

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           urltyp,
                                                           False,
                                                           self.url)

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError:
                    continue
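            # --- Illustrative aside (not part of the original crawler code) ---
            # Both the HTML branch above and this CSS branch register child URLs
            # with the same pattern: skip a URL the data manager already knows,
            # otherwise record it, add it to the page's collection and remember
            # it as a child of the current document. A minimal sketch of that
            # de-duplicate-and-collect idea, with a plain set and list standing
            # in for objects.datamgr and HarvestManAutoUrlCollection:
            #
            #     def collect_children(urls, seen, collection):
            #         children = []
            #         for u in urls:
            #             if u in seen:            # already queued or crawled
            #                 continue
            #             seen.add(u)              # register with the "data manager"
            #             collection.append(u)     # add to the page's URL collection
            #             children.append(u)
            #         return children
            #
            #     seen = set(['http://example.com/a.css'])
            #     collect_children(['http://example.com/a.css',
            #                       'http://example.com/b.png'], seen, [])
            #     # -> ['http://example.com/b.png']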
            # Update the document...
            for child in children:
                document.add_child(child)

            if not objects.queuemgr.push((self.url.priority, coll, document), 'fetcher'):
                if self._pushflag:
                    self.buffer.append((self.url.priority, coll, document))

            # Update links called here
            objects.datamgr.update_links(self.url, coll)

            # Successful case - return the data
            return data
        else:
            # Don't do anything
            return None


class HarvestManUrlDownloader(HarvestManUrlFetcher, HarvestManUrlCrawler):
    """ This is a mixin class which does both the jobs of crawling
    webpages and downloading URLs """

    def __init__(self, index, url_obj=None, isThread=True):
        HarvestManUrlFetcher.__init__(self, index, url_obj, isThread)
        self.set_url_object(url_obj)

    def _initialize(self):
        HarvestManUrlFetcher._initialize(self)
        HarvestManUrlCrawler._initialize(self)
        self._role = 'downloader'

    def set_url_object(self, obj):
        HarvestManUrlFetcher.set_url_object(self, obj)

    def set_url_object2(self, obj):
        HarvestManUrlCrawler.set_url_object(self, obj)

    def exit_condition(self, caller):
        # Exit condition for single thread case
        if caller == 'crawler':
            return (objects.queuemgr.data_q.qsize() == 0)
        elif caller == 'fetcher':
            return (objects.queuemgr.url_q.qsize() == 0)

        return False

    def is_exit_condition(self):
        return (self.exit_condition('crawler') and self.exit_condition('fetcher'))

    def action(self):
        if self._isThread:
            self._loops = 0

            while not self._endflag:
                obj = objects.queuemgr.get_url_data("downloader")
                if not obj:
                    continue

                self.set_url_object(obj)

                self.process_url()
                self.crawl_url()

                self._loops += 1
                self.sleep()
        else:
            while True:
                self.process_url()

                obj = objects.queuemgr.get_url_data("crawler")
                if obj:
                    self.set_url_object2(obj)

                    if self.url.is_webpage():
                        self.crawl_url()

                obj = objects.queuemgr.get_url_data("fetcher")
                self.set_url_object(obj)

                if self.is_exit_condition():
                    break

    def process_url(self):
        # First process urls using fetcher's algorithm
        HarvestManUrlFetcher.process_url(self)

    def crawl_url(self):
        HarvestManUrlCrawler.crawl_url(self)
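# --- Illustrative aside (not part of the original HarvestMan source) ---
# In the non-threaded branch of HarvestManUrlDownloader.action() above, a single
# object alternates between the fetcher and crawler roles and stops only when
# both work queues are empty (see exit_condition/is_exit_condition). A minimal
# sketch of that alternate-until-both-queues-drain control flow, with plain
# Queue.Queue objects standing in for objects.queuemgr's url_q and data_q and
# with hypothetical process_url/crawl_url callables:
#
#     import Queue
#
#     def run_single_threaded(url_q, data_q, process_url, crawl_url):
#         while True:
#             try:
#                 process_url(url_q.get_nowait())    # fetcher role
#             except Queue.Empty:
#                 pass
#             try:
#                 crawl_url(data_q.get_nowait())     # crawler role
#             except Queue.Empty:
#                 pass
#             if url_q.empty() and data_q.empty():   # both roles out of work
#                 break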