⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 harvestman.py

📁 网络蜘蛛
💻 PY
字号:
""" HARVESTMAN - Multithreaded internet spider program
    using urllib2 and other python modules.
    
    Version      - 1.3.9 final.

   Author: Anand B Pillai(anandpillai at letterboxes dot org).

 HARVESTMAN is totally free software. See the file LICENSE.txt for
 information on the terms and conditions of usage, and a DISCLAIMER
 of ALL WARRANTIES. The same license agreement applies to all other
 python software modules used in this program.

 Modification History
 ====================

  Oct 10 2003         Anand          1.3 a1 release.
  Jan 2 2004          Anand          1.3.1 bug fix version.
  Feb 24 2004         Anand          1.3.3 version release.
  Apr 20 2004         Anand          1.3.4 version release.
  Jun 14 2004         Anand          1.3.9 release.
  
"""

import os, sys
from sgmllib import SGMLParseError
from shutil import copy

# Our modules

# Tracker modules
import urltracker
# Connector module
import connector
# Rules module
import rules
# Data manager module
import datamgr
# Cookie module
import cookiemgr
# Utils module
import utils

# Globals/lookup module
from common import *

class harvestMan:
    """ Top level application class """

    def __init__(self):
        """ Constructor """

        # project start page (on disk)
        self._projectstartpage='file://'
        # error file descriptor
        self._ofs_=None
	self.USER_AGENT="HarvestMan 1.4"
	self.VERSION = "1.3.9"
        
    def set_error_log(self, errorfile):
        """ Function to call to set this class
        as sys.stderr """

        # error log file
        errorlogpath = os.path.join(self._cfg.projdir, errorfile)
        
        if os.path.exists(errorlogpath):
            try:
                os.remove(errorlogpath)
            except OSError, e:
                print e
                
        self._ofs_ = open(errorlogpath, 'w')

    def finish(self):
        """ Wrap-up work once the download is over.

        Runs the data manager's post download setup, builds the
        project browse page unless the config is in testing mode,
        signals the globals module through Finish(), closes the
        error log if one is open and prints a goodbye line. """

        # NOTE(review): the cookie manager is looked up here but
        # close_session() is never called on it in this method
        # (clean_up() does call it) -- confirm whether cookies
        # should also be saved on a normal finish.
        cookie_manager = GetObject('cookiestore')

        # Link localisation lives in the data manager class
        GetObject('datamanager').post_download_setup()

        if not self._cfg.testing:
            utils.HarvestManBrowser().make_project_browse_page()

        # FIXME: Better way to signal global module that
        # we are done.
        Finish()

        errlog = self._ofs_
        if errlog:
            try:
                errlog.close()
            except Exception as e:
                print(e)

        print('Thank you for using the HarvestMan Program.')
            
    def write(self, msg):
        """ Overloaded function when this class behaves
        as sys.stdout """
        
        try:
            if self._ofs_:
                self._ofs_.write(msg)
                self._ofs_.flush()
        except Exception, e:
            # dont recursively crash on errors
            pass

    def welcome_message(self):
        """ Emit the start-up banner through info(): the version
        line, the copyright line, the web address and a blank
        spacer line. """

        banner = (
            ('Starting HarvestMan version', self._cfg.version),
            ('Copyright (C) 2004-2005, Anand B Pillai',),
            ('WWW: http://harvestman.freezope.org',),
            (' ',),
        )
        for parts in banner:
            info(*parts)

    def set_proj_dir(self):
        """ Create the project directory and set it in the config object """

        # Set project directory
        self._cfg.projdir = os.path.join( self._cfg.basedir, self._cfg.project )
        if not os.path.exists( self._cfg.projdir ):
            extrainfo('Creating directory => ', self._cfg.projdir)
            os.makedirs(self._cfg.projdir)
            return 0

        return -1

    def register_objects(self):
        """ Build the worker objects of harvestman and hand each
        one to SetObject(), in this order: data manager, rules
        checker, network connector, connector factory, cookie
        manager (only when cookies are enabled in the config),
        crawler queue and finally this object itself. """

        # Data manager
        data_manager = datamgr.harvestManDataManager()
        SetObject(data_manager)

        # Rules checker; its filters are built before it is registered
        checker = rules.harvestManRulesChecker()
        checker.make_filters()
        SetObject(checker)

        # Network connector
        net_connector = connector.HarvestManNetworkConnector()
        SetObject(net_connector)

        # Connector factory, created with the configured
        # connections value
        factory = connector.HarvestManUrlConnectorFactory(self._cfg.connections)
        SetObject(factory)

        # Cookie manager. Cookies are kept per project, in a
        # 'cookies.dat' file inside the project directory.
        if self._cfg.cookies:
            cookie_path = os.path.join(self._cfg.projdir, 'cookies.dat')
            store = cookiemgr.DBMCookieStore(cookie_path)
            cookie_mgr = cookiemgr.CookieManager(store)
            SetObject(cookie_mgr)

        # Crawler queue; it is configured only after registration
        queue = urltracker.HarvestManCrawlerQueue()
        SetObject(queue)
        queue.configure()

        # Lastly register this application object
        SetObject(self)

    def start_project(self):
        """ Kick off the current project: print the banner, write
        the project file, load the project cache when page caching
        is enabled in the config, and start the crawl on the
        tracker queue. """

        self.welcome_message()

        # crawls through a site using http/ftp/https protocols
        info('Starting project ', self._cfg.project, '...')

        # Persist the project file before any downloading starts
        utils.HarvestManProjectManager().write_project()

        info('Starting download of url ', self._cfg.url, '...')

        # Pick up the project cache file, if caching is on
        if self._cfg.pagecache:
            dmgr = GetObject('datamanager')
            dmgr.read_project_cache()

        # Hand over to the tracker queue
        GetObject('trackerqueue').crawl()
        
    def clean_up(self):
        """ Clean up actions to do, say after an interrupt.

        Asks the tracker queue to kill its trackers, then closes
        the cookie session so that cookies are saved. The cookie
        step is skipped when no cookie manager is available. """

        tq = GetObject('trackerqueue')
        tq.kill_trackers()

        # Close the cookie session so that
        # cookies are saved.
        cookie_manager = GetObject('cookiestore')
        # register_objects() only creates a cookie manager when
        # cookies are enabled in the config, so there may be none.
        # NOTE(review): presumably GetObject() gives back None in
        # that case -- verify against the common module.
        if cookie_manager is not None:
            cookie_manager.close_session()
        
    def __prepare(self):
        """ Do the basic things and get ready.

        Initializes the globals module, sets the user agent,
        fetches the config object into self._cfg and stamps the
        version and program name on it. """

        # Initialize globals module. This initializes
        # the config and connector objects.
        Initialize()

        SetUserAgent(self.USER_AGENT)

        self._cfg = GetObject('config')
        # set version on config object
        self._cfg.version = self.VERSION
        # set program name on config object
        self._cfg.progname = 'HarvestMan ' + self.VERSION

    def set_project(self):
        """ Get this object and the other harvestman objects
        ready for a run.

        Runs the private preparation step, asks the config object
        to read the program options and checks that a url, a
        project name and a base directory are all set. If any of
        them is missing, two lines of help are printed and the
        program exits with status 1. Otherwise the project
        directory is set up and the worker objects are
        registered. """

        self.__prepare()

        # Let the config object read the program options; the
        # value it returns is not used here.
        self._cfg.get_program_options()

        cfg = self._cfg
        url, project, basedir = cfg.url, cfg.project, cfg.basedir

        if not (url and project and basedir):
            print('Invalid config options')
            print('Give a valid url, project, base directory in config file')
            sys.exit(1)

        self.set_proj_dir()
        self.register_objects()

        if self._cfg.errorfile and not self._cfg.testing:
            # Redirecting sys.stderr to this object (sys.stderr = self,
            # followed by self.set_error_log(self._cfg.errorfile)) is
            # switched off for the time being: according to the
            # original author, DBM (shelve) fails, at least on
            # Windows 98, when stderr is redirected that way.
            pass

    def run_project(self):
        """ Run a harvestman project """
        
        # Start crawling
        if not self._cfg.testnocrawl:
            try:
                self.start_project()
            except (KeyboardInterrupt, EOFError):
                # Localise links
                if not self._cfg.ignorekbinterrupt:
                    # dont allow to write cache, since it
                    # screws up existing cache.
                    GetObject('datamanager').conditional_cache_set()
                    self.clean_up()
   
        # Clean up actions    
        self.finish()       

    def report_garbage_collection(self):
        """ Diagnosis report on garbage collection """

        # TODO
        pass
        
if __name__=="__main__":

    # Script entry point: build the application object, let it
    # set itself up, then run the project.
    spider = harvestMan()
    # Reads the program options and registers the worker objects;
    # exits the process when url, project or base directory is missing.
    spider.set_project()
    # Crawls (unless disabled in the config) and then performs the
    # end-of-download actions.
    spider.run_project()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -