📄 crawler.py
# -*- coding: utf-8 -*-
""" crawler.py - Module which does crawling and downloading
    of urls from the web. This module is part of the HarvestMan
    program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Modification history (Trimmed on Dec 14 2004)

    Aug 22 2006  Anand   Changes for fixing single-thread mode.
    Nov 9 2006   Anand   Added support to download imported stylesheets.
    Jan 2007     Anand   Support for META robot tags.
    Feb 17 2007  Anand   Modified return type of process_url in
                         HarvestManUrlFetcher class to return the data.
                         This is required for the modified swish-e plugin.
    Feb 26 2007  Anand   Figured out the problem with 'disappearing' URLs.
                         The error is in the crawl_url method which was
                         checking whether a source URL was crawled. This
                         happens when a page redefines its base URL as
                         something else and when that URL is already
                         crawled. We need to modify our logic of applying
                         base URLs.
    Mar 06 2007  Anand   Reset the logic of url-server to old one (only
                         crawlers send data to url server). This is because
                         sending both data to the server causes it to fail
                         in a number of ways.

                         NOTE: Decided not to use url server anymore since
                         it is not yet stable. I think I need to go the
                         Twisted way if this has to be done right.
    Apr 06 2007  Anand   Added check to make sure that threads are not
                         re-started for the same recurring problem.
    Oct 21 2007  Anand   Added states for the crawler state machine.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import os, sys
import socket
import time
import threading
import random
import exceptions
import sha

from sgmllib import SGMLParseError

from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *

from harvestman.lib.urltypes import *
from harvestman.lib.urlcollections import *

from harvestman.lib.methodwrapper import MethodWrapperMetaClass

from harvestman.lib.js.jsparser import JSParser, JSParserException

from harvestman.lib import urlparser
from harvestman.lib import pageparser
from harvestman.lib.common import netinfo

# Defining pluggable functions
# Plugin name is the key and value is <class>:<function>

__plugins__ = { 'fetcher_process_url_plugin': 'HarvestManUrlFetcher:process_url',
                'crawler_crawl_url_plugin': 'HarvestManUrlCrawler:crawl_url' }

# Defining functions with pre & post callbacks
# Callback name is the key and value is <class>:<function>

__callbacks__ = { 'fetcher_process_url_callback' : 'HarvestManUrlFetcher:process_url',
                  'crawler_crawl_url_callback' : 'HarvestManUrlCrawler:crawl_url',
                  'fetcher_push_buffer_callback' : 'HarvestManUrlFetcher:push_buffer',
                  'crawler_push_buffer_callback' : 'HarvestManUrlCrawler:push_buffer',
                  'fetcher_terminate_callback' : 'HarvestManUrlFetcher:terminate',
                  'crawler_terminate_callback' : 'HarvestManUrlCrawler:terminate' }
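
# The '<class>:<method>' strings above are resolved by HarvestMan's plugin
# machinery elsewhere (via MethodWrapperMetaClass); as a rough illustration
# only, a resolver over this module's namespace could look like the
# hypothetical helper below. The name _resolve_plugin_spec is ours and is
# not part of the original module.

def _resolve_plugin_spec(spec):
    """ Hypothetical sketch: map a '<class>:<method>' plugin string to
    the method it names, looked up in this module's global namespace """

    cls_name, meth_name = spec.split(':')
    # e.g. 'HarvestManUrlFetcher:process_url' -> HarvestManUrlFetcher.process_url
    return getattr(globals()[cls_name], meth_name)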

class HarvestManThreadState(type):
    """ A metaclass for HarvestMan thread states """

    IDX = -1

    def __new__(cls, name, bases=(), dct={}):
        """ Overloaded constructor """

        # Automatically increment the index, without us bothering
        # to assign a number to the state class...
        cls.IDX += 1
        dct['index'] = cls.IDX

        return type.__new__(cls, name, bases, dct)

    def __init__(self, name, bases=(), dct={}):
        type.__init__(self, name, bases, dct)

    def __repr__(self):
        return '%d: %s' % (self.index, self.about)

    def __str__(self):
        return self.__name__

    def __eq__(self, number):
        """ Overloaded __eq__ method to allow comparisons with numbers """

        # Makes it easy to do things like
        # THREAD_IDLE == 0 in code.
        return self.index == number

def DEFINE_STATE(name, description):
    """ A factory function for defining thread state classes """

    # State classes are created and automatically injected into the
    # module's global namespace using the class name.
    globals()[name] = HarvestManThreadState(name, dct={'about': description})

# Thread states
DEFINE_STATE('THREAD_IDLE', "Idle thread, not running")
DEFINE_STATE('THREAD_STARTED', "Thread started to run")
DEFINE_STATE('CRAWLER_WAITING', "Crawler thread waiting for data")
DEFINE_STATE('FETCHER_WAITING', "Fetcher thread waiting for data")
DEFINE_STATE('CRAWLER_GOT_DATA', "Crawler thread got new list of URLs to crawl from the queue")
DEFINE_STATE('FETCHER_GOT_DATA', "Fetcher thread got new URL information from the queue")
DEFINE_STATE('FETCHER_DOWNLOADING', "Fetcher thread downloading data")
DEFINE_STATE('FETCHER_PARSING', "Fetcher thread parsing webpage to extract new URLs")
DEFINE_STATE('CRAWLER_CRAWLING', "Crawler thread crawling a page")
DEFINE_STATE('FETCHER_PUSH_URL', "Fetcher thread pushing URL to queue")
DEFINE_STATE('CRAWLER_PUSH_URL', "Crawler thread pushing URL to queue")
DEFINE_STATE('FETCHER_PUSHED_URL', "Fetcher thread pushed URL to queue")
DEFINE_STATE('CRAWLER_PUSHED_URL', "Crawler thread pushed URL to queue")
DEFINE_STATE('THREAD_SLEEPING', "Thread sleeping")
DEFINE_STATE('THREAD_SUSPENDED', "Thread is suspended on the state machine")
DEFINE_STATE('THREAD_DIED', "Thread died due to an error")
DEFINE_STATE('THREAD_STOPPED', "Thread stopped")
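
# A minimal sanity sketch (ours, not part of the original module) of how
# the auto-indexed states behave: DEFINE_STATE assigns indices in
# definition order, the metaclass __eq__ lets a state compare equal to a
# plain integer, and str()/repr() keep the name and description readable.
# The function is only a demo and is never called by this module.

def _demo_thread_states():
    """ Hypothetical demo of the state classes defined above """

    assert THREAD_IDLE == 0                  # first state defined
    assert FETCHER_WAITING == 3              # fourth state defined
    assert str(THREAD_STOPPED) == 'THREAD_STOPPED'
    assert repr(THREAD_IDLE) == '0: Idle thread, not running'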

class HarvestManUrlCrawlerException(Exception):
    """ An exception class for HarvestManBaseUrlCrawler and its
    derived classes """

    def __init__(self, value):
        """ Overloaded __init__ method """
        self.value = value

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return str(self.value)

class HarvestManBaseUrlCrawler( threading.Thread ):
    """ Base class to do the crawling and fetching of internet/intranet
    urls. This is the base class with no actual code apart from the
    threading or termination functions. """

    __metaclass__ = MethodWrapperMetaClass

    # Last error which caused the thread to die
    _lasterror = None

    def __init__(self, index, url_obj = None, isThread = True):
        # Index of the crawler
        self._index = index
        # Initialize my variables
        self._initialize()
        # Am I a thread?
        self._isThread = isThread
        if isThread:
            threading.Thread.__init__(self, None, None, self._role + str(self._index))

    def _initialize(self):
        """ Initialise my state after construction """

        # End flag
        self._endflag = False
        # Download flag
        self._download = True
        self.url = None
        self.document = None
        # Number of loops
        self._loops = 0
        # Role string
        self._role = "undefined"
        # State of the crawler
        self.stateobj = objects.queuemgr.stateobj
        # Configuration
        self._configobj = objects.config
        # Local buffer for objects to be put in the queue.
        # Maximum size is 100.
        self.buffer = Ldeque(100)
        # Flag for pushing to buffer
        self._pushflag = self._configobj.fastmode and (not self._configobj.blocking)
        # Resume flag - for resuming from a saved state
        self.resuming = False
        # Last exception
        self.exception = None
        # Sleep event
        if self._configobj.randomsleep:
            self.evnt = RandomSleepEvent(self._configobj.sleeptime)
        else:
            self.evnt = SleepEvent(self._configobj.sleeptime)

    def __str__(self):
        return self.getName()

    def get_url(self):
        """ Return my url """
        return self.url

    def set_download_flag(self, val = True):
        """ Set the download flag """
        self._download = val

    def set_url_object(self, obj):
        """ Set the url object of this crawler """
        self.url = obj
        return True

    def set_index(self, index):
        self._index = index

    def get_index(self):
        return self._index

    def get_url_object(self):
        """ Return the url object of this crawler """
        return self.url

    def get_current_url(self):
        """ Return the current url """
        return self.url.get_full_url()

    def action(self):
        """ The action method, to be overridden by subclasses
        to provide the action """
        pass

    def run(self):
        """ The overloaded run method of the threading.Thread class """

        try:
            self.stateobj.set(self, THREAD_STARTED)
            self.action()
        except Exception, e:
            # print 'Exception', e, self
            self.exception = e
            self.stateobj.set(self, THREAD_DIED)

    def stop(self):
        self.join()

    def join(self):
        """ Stop this crawler thread """

        self._endflag = True
        self.set_download_flag(False)
        threading.Thread.join(self, 1.0)
        self.stateobj.set(self, THREAD_STOPPED)
        # raise HarvestManUrlCrawlerException, "%s: Stopped" % self.getName()

    def sleep(self):
        self.stateobj.set(self, THREAD_SLEEPING)
        self.evnt.sleep()

    def crawl_url(self):
        """ Crawl a web page, recursively downloading its links """
        pass

    def process_url(self):
        """ Download the data for a web page or a link and manage its data """
        pass

    def push_buffer(self):
        """ Try to push items in the local buffer to the queue """

        # Try to push the last item
        stuff = self.buffer[-1]
        if objects.queuemgr.push(stuff, self._role):
            # Remove the item
            self.buffer.remove(stuff)

class HarvestManUrlCrawler(HarvestManBaseUrlCrawler):
    """ The crawler class which crawls urls and fetches their links.
    These links are posted to the url queue """

    def __init__(self, index, url_obj = None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        # Not running yet
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "crawler"
        self.links = []

    def set_url_object(self, obj):
        # Reset
        self.links = []

        if not obj:
            return False

        prior, coll, document = obj
        url_index = coll.getSourceURL()
        url_obj = objects.datamgr.get_url(url_index)

        if not url_obj:
            return False

        self.links = [objects.datamgr.get_url(index) for index in coll.getAllURLs()]
        self.document = document

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)

    def action(self):
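
# A rough usage sketch (hypothetical; in HarvestMan proper the queue manager
# owns and drives these threads): a crawler is constructed with an index,
# started as a thread, and fed (priority, collection, document) tuples
# through set_url_object() before crawling.
#
#     crawler = HarvestManUrlCrawler(0, isThread=True)
#     crawler.start()       # threading.Thread.start() -> run() -> action()
#     crawler.set_url_object((prior, coll, document))
#     ...
#     crawler.stop()        # sets the end flag and joins the thread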