📄 crawler.py
# -*- coding: utf-8 -*-
""" crawler.py - Module which does crawling and downloading
    of urls from the web. This module is part of the HarvestMan
    program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Modification history (Trimmed on Dec 14 2004)

    Aug 22 2006  Anand   Changes for fixing single-thread mode.
    Nov 9 2006   Anand   Added support to download imported stylesheets.
    Jan 2007     Anand   Support for META robot tags.
    Feb 17 2007  Anand   Modified return type of process_url in
                         HarvestManUrlFetcher class to return the data.
                         This is required for the modified swish-e plugin.
    Feb 26 2007  Anand   Figured out the problem with 'disappearing' URLs.
                         The error is in the crawl_url method which was
                         checking whether a source URL was crawled. This
                         happens when a page redefines its base URL as
                         something else and when that URL is already
                         crawled. We need to modify our logic of applying
                         base URLs.
    Mar 06 2007  Anand   Reset the logic of url-server to old one (only
                         crawlers send data to url server). This is because
                         sending both data to the server causes it to fail
                         in a number of ways.

                         NOTE: Decided not to use url server anymore since
                         it is not yet stable. I think I need to go the
                         Twisted way if this has to be done right.
    Apr 06 2007  Anand   Added check to make sure that threads are not
                         re-started for the same recurring problem.
    Oct 21 2007  Anand   Added states for the crawler state machine.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import os, sys
import socket
import time
import threading
import random
import exceptions
import sha

from sgmllib import SGMLParseError

from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *

from harvestman.lib.urltypes import *
from harvestman.lib.urlcollections import *

from harvestman.lib.methodwrapper import MethodWrapperMetaClass

from harvestman.lib.js.jsparser import JSParser, JSParserException

from harvestman.lib import urlparser
from harvestman.lib import pageparser
from harvestman.lib.common import netinfo

# Defining pluggable functions
# Plugin name is the key and value is <class>:<function>

__plugins__ = { 'fetcher_process_url_plugin': 'HarvestManUrlFetcher:process_url',
                'crawler_crawl_url_plugin': 'HarvestManUrlCrawler:crawl_url' }

# Defining functions with pre & post callbacks
# Callback name is the key and value is <class>:<function>

__callbacks__ = { 'fetcher_process_url_callback' : 'HarvestManUrlFetcher:process_url',
                  'crawler_crawl_url_callback' : 'HarvestManUrlCrawler:crawl_url',
                  'fetcher_push_buffer_callback' : 'HarvestManUrlFetcher:push_buffer',
                  'crawler_push_buffer_callback' : 'HarvestManUrlCrawler:push_buffer',
                  'fetcher_terminate_callback' : 'HarvestManUrlFetcher:terminate',
                  'crawler_terminate_callback' : 'HarvestManUrlCrawler:terminate' }
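
# The '<class>:<method>' strings above are resolved by HarvestMan's plugin
# machinery elsewhere (via MethodWrapperMetaClass); as a rough illustration
# only, a resolver over this module's namespace could look like the
# hypothetical helper below. The name _resolve_plugin_spec is ours and is
# not part of the original module.

def _resolve_plugin_spec(spec):
    """ Hypothetical sketch: map a '<class>:<method>' plugin string to
    the method it names, looked up in this module's global namespace """

    cls_name, meth_name = spec.split(':')
    # e.g. 'HarvestManUrlFetcher:process_url' -> HarvestManUrlFetcher.process_url
    return getattr(globals()[cls_name], meth_name)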

class HarvestManThreadState(type):
    """ A metaclass for HarvestMan thread states """

    IDX = -1

    def __new__(cls, name, bases=(), dct={}):
        """ Overloaded constructor """

        # Automatically increment the index, without us bothering
        # to assign a number to the state class...
        cls.IDX += 1
        dct['index'] = cls.IDX

        return type.__new__(cls, name, bases, dct)

    def __init__(self, name, bases=(), dct={}):
        type.__init__(self, name, bases, dct)

    def __repr__(self):
        return '%d: %s' % (self.index, self.about)

    def __str__(self):
        return self.__name__

    def __eq__(self, number):
        """ Overloaded __eq__ method to allow comparisons with numbers """

        # Makes it easy to do things like
        # THREAD_IDLE == 0 in code.
        return self.index == number

def DEFINE_STATE(name, description):
    """ A factory function for defining thread state classes """

    # State classes are created and automatically injected into the
    # module's global namespace using the class name.
    globals()[name] = HarvestManThreadState(name, dct={'about': description})

# Thread states
DEFINE_STATE('THREAD_IDLE', "Idle thread, not running")
DEFINE_STATE('THREAD_STARTED', "Thread started to run")
DEFINE_STATE('CRAWLER_WAITING', "Crawler thread waiting for data")
DEFINE_STATE('FETCHER_WAITING', "Fetcher thread waiting for data")
DEFINE_STATE('CRAWLER_GOT_DATA', "Crawler thread got new list of URLs to crawl from the queue")
DEFINE_STATE('FETCHER_GOT_DATA', "Fetcher thread got new URL information from the queue")
DEFINE_STATE('FETCHER_DOWNLOADING', "Fetcher thread downloading data")
DEFINE_STATE('FETCHER_PARSING', "Fetcher thread parsing webpage to extract new URLs")
DEFINE_STATE('CRAWLER_CRAWLING', "Crawler thread crawling a page")
DEFINE_STATE('FETCHER_PUSH_URL', "Fetcher thread pushing URL to queue")
DEFINE_STATE('CRAWLER_PUSH_URL', "Crawler thread pushing URL to queue")
DEFINE_STATE('FETCHER_PUSHED_URL', "Fetcher thread pushed URL to queue")
DEFINE_STATE('CRAWLER_PUSHED_URL', "Crawler thread pushed URL to queue")
DEFINE_STATE('THREAD_SLEEPING', "Thread sleeping")
DEFINE_STATE('THREAD_SUSPENDED', "Thread is suspended on the state machine")
DEFINE_STATE('THREAD_DIED', "Thread died due to an error")
DEFINE_STATE('THREAD_STOPPED', "Thread stopped")
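
# A minimal sanity sketch (ours, not part of the original module) of how
# the auto-indexed states behave: DEFINE_STATE assigns indices in
# definition order, the metaclass __eq__ lets a state compare equal to a
# plain integer, and str()/repr() keep the name and description readable.
# The function is only a demo and is never called by this module.

def _demo_thread_states():
    """ Hypothetical demo of the state classes defined above """

    assert THREAD_IDLE == 0                  # first state defined
    assert FETCHER_WAITING == 3              # fourth state defined
    assert str(THREAD_STOPPED) == 'THREAD_STOPPED'
    assert repr(THREAD_IDLE) == '0: Idle thread, not running'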

class HarvestManUrlCrawlerException(Exception):
    """ An exception class for HarvestManBaseUrlCrawler and its
    derived classes """

    def __init__(self, value):
        """ Overloaded __init__ method """
        self.value = value

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return str(self.value)

class HarvestManBaseUrlCrawler( threading.Thread ):
    """ Base class to do the crawling and fetching of internet/intranet
    urls. This is the base class with no actual code apart from the
    threading or termination functions. """

    __metaclass__ = MethodWrapperMetaClass

    # Last error which caused the thread to die
    _lasterror = None

    def __init__(self, index, url_obj = None, isThread = True):
        # Index of the crawler
        self._index = index
        # Initialize my variables
        self._initialize()
        # Am I a thread?
        self._isThread = isThread
        if isThread:
            threading.Thread.__init__(self, None, None, self._role + str(self._index))

    def _initialize(self):
        """ Initialise my state after construction """

        # End flag
        self._endflag = False
        # Download flag
        self._download = True
        self.url = None
        self.document = None
        # Number of loops
        self._loops = 0
        # Role string
        self._role = "undefined"
        # State of the crawler
        self.stateobj = objects.queuemgr.stateobj
        # Configuration
        self._configobj = objects.config
        # Local buffer for objects to be put in the queue.
        # Maximum size is 100.
        self.buffer = Ldeque(100)
        # Flag for pushing to buffer
        self._pushflag = self._configobj.fastmode and (not self._configobj.blocking)
        # Resume flag - for resuming from a saved state
        self.resuming = False
        # Last exception
        self.exception = None
        # Sleep event
        if self._configobj.randomsleep:
            self.evnt = RandomSleepEvent(self._configobj.sleeptime)
        else:
            self.evnt = SleepEvent(self._configobj.sleeptime)

    def __str__(self):
        return self.getName()

    def get_url(self):
        """ Return my url """
        return self.url

    def set_download_flag(self, val = True):
        """ Set the download flag """
        self._download = val

    def set_url_object(self, obj):
        """ Set the url object of this crawler """
        self.url = obj
        return True

    def set_index(self, index):
        self._index = index

    def get_index(self):
        return self._index

    def get_url_object(self):
        """ Return the url object of this crawler """
        return self.url

    def get_current_url(self):
        """ Return the current url """
        return self.url.get_full_url()

    def action(self):
        """ The action method, to be overridden by subclasses
        to provide the action """
        pass

    def run(self):
        """ The overloaded run method of the threading.Thread class """

        try:
            self.stateobj.set(self, THREAD_STARTED)
            self.action()
        except Exception, e:
            # print 'Exception', e, self
            self.exception = e
            self.stateobj.set(self, THREAD_DIED)

    def stop(self):
        self.join()

    def join(self):
        """ Stop this crawler thread """

        self._endflag = True
        self.set_download_flag(False)
        threading.Thread.join(self, 1.0)
        self.stateobj.set(self, THREAD_STOPPED)
        # raise HarvestManUrlCrawlerException, "%s: Stopped" % self.getName()

    def sleep(self):
        self.stateobj.set(self, THREAD_SLEEPING)
        self.evnt.sleep()

    def crawl_url(self):
        """ Crawl a web page, recursively downloading its links """
        pass

    def process_url(self):
        """ Download the data for a web page or a link and manage its data """
        pass

    def push_buffer(self):
        """ Try to push items in the local buffer to the queue """

        # Try to push the last item
        stuff = self.buffer[-1]
        if objects.queuemgr.push(stuff, self._role):
            # Remove the item
            self.buffer.remove(stuff)

class HarvestManUrlCrawler(HarvestManBaseUrlCrawler):
    """ The crawler class which crawls urls and fetches their links.
    These links are posted to the url queue """

    def __init__(self, index, url_obj = None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        # Not running yet
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "crawler"
        self.links = []

    def set_url_object(self, obj):
        # Reset
        self.links = []

        if not obj:
            return False

        prior, coll, document = obj
        url_index = coll.getSourceURL()
        url_obj = objects.datamgr.get_url(url_index)

        if not url_obj:
            return False

        self.links = [objects.datamgr.get_url(index) for index in coll.getAllURLs()]
        self.document = document

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)

    def action(self):
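
# A rough usage sketch (hypothetical; in HarvestMan proper the queue manager
# owns and drives these threads): a crawler is constructed with an index,
# started as a thread, and fed (priority, collection, document) tuples
# through set_url_object() before crawling.
#
#     crawler = HarvestManUrlCrawler(0, isThread=True)
#     crawler.start()       # threading.Thread.start() -> run() -> action()
#     crawler.set_url_object((prior, coll, document))
#     ...
#     crawler.stop()        # sets the end flag and joins the thread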