📄 spider.py
字号:
#! /usr/bin/python# -- coding: utf-8""" HarvestMan - Extensible, modular, flexible, multithreaded Internet spider program using urllib2 and other python modules. This is the main module of HarvestMan. Version - 2.0 alpha 1. Author: Anand B Pillai <abpillai at gmail dot com> HARVESTMAN is free software. See the file LICENSE.txt for information on the terms and conditions of usage, and a DISCLAIMER of ALL WARRANTIES. Modification History Created: Aug 2003 Jan 23 2007 Anand Changes to copy config file to ~/.harvestman/conf folder on POSIX systems. This file is also looked for if config.xml not found in curdir. Jan 25 2007 Anand Simulation feature added. Also modified config.py to allow reading cmd line arguments when passing a config file using -C option. Feb 7 2007 Anand Finished implementation of plugin feature. Crawl simulator is now a plugin. Feb 8 2007 Anand Added swish-e integration as a plugin. Feb 11 2007 Anand Changes in the swish-e plugin implementation, by using callbacks. Mar 2 2007 Anand Renamed finish to finish_project. Moved Finish method from common.py to here and renamed it as finish(...). finish is never called at project end, but by default at program end. Mar 7 2007 Anand Disabled urlserver option. Mar 15 2007 Anand Added bandwidth calculation for determining max filesize before crawl. Need to add code to redetermine bandwidth when network interface changes. Apr 18 2007 Anand Added the urltypes module for URL type definitions and replaced entries with it. Upped version number to 2.0 since this is almost a new program now! Apr 19 2007 Anand Disabled urlserver option completely. Removed all referring code from this module, crawler and urlqueue modules. Moved code for grabbing URL to new hget module. Apr 24 2007 Anand Made to work on Windows (XP SP2 Professional, Python 2.5) Apr 24 2007 Anand Made the config directory creation/session saver features to work on Windows also. Apr 24 2007 Anand Modified connector algorithm to flush data to temp files for hget. This makes sure that hget can download huge files as multipart. May 7 2007 Anand Added plugin as option in configuration file. Added ability to process more than one plugin at once. Modified loading logic of plugins. May 10 2007 Anand Replaced a number of private attributes in classes (with double underscores), to semi-private (one underscore). This helps in inheriting from these classes. Dec 12 2007 Anand Re-merged code from harvestmanklass module to this and moved common initialization code to appbase.py under HarvestManAppBase class. Feb 12-14 08 Anand Major datastructure enhancements/revisions, fixes etc in datamgr, rules, urlparser, connector, crawler, ,urlqueue, urlthread modules. Copyright (C) 2004 Anand B Pillai. """ __version__ = '2.0 a1'__author__ = 'Anand B Pillai'import __init__import os, sysimport cPickleimport pickleimport timeimport threadingimport shutilimport globimport reimport copyimport signalimport localefrom shutil import copyfrom harvestman.lib.event import HarvestManEventfrom harvestman.lib.common.common import *from harvestman.lib.common.macros import *from harvestman.lib import urlqueuefrom harvestman.lib import connectorfrom harvestman.lib import rulesfrom harvestman.lib import datamgrfrom harvestman.lib import utilsfrom harvestman.lib import urlparserfrom harvestman.lib.db import HarvestManDbManagerfrom harvestman.lib.methodwrapper import MethodWrapperMetaClass# Current folder - okayfrom appbase import HarvestManAppBase# Defining callback points__callbacks__ = { 'run_saved_state_callback':'HarvestMan:run_saved_state', 'restore_state_callback':'HarvestMan:restore_state', 'run_projects_callback':'HarvestMan:run_projects', 'start_project_callback':'HarvestMan:start_project', 'finish_project_callback':'HarvestMan:finish_project', 'finalize_callback':'HarvestMan:finalize', 'init_callback' : 'HarvestMan:init'}# Defining pluggable functions__plugins__ = { 'clean_up_plugin':'HarvestMan:clean_up', 'save_current_state_plugin': 'HarvestMan:save_current_state', 'restore_state_plugin': 'HarvestMan:restore_state', 'reset_state_plugin': 'HarvestMan:reset_state' }class HarvestMan(HarvestManAppBase): """ The main crawler application class for HarvestMan """ klassmap = {} __metaclass__ = MethodWrapperMetaClass alias = 'spider' USER_AGENT = '%s/%s (+http://code.google.com/p/harvestman-crawler/wiki/bot)' %('Harvestman',__version__) def __init__(self): """ Initializing method """ self._projectstartpage = 'file://' super(HarvestMan, self).__init__() def finish_project(self): """ Actions to take after download is over for the current project """ if objects.eventmgr.raise_event('beforefinish', objects.queuemgr.baseurl, None)==False: return # Localise file links # This code sits in the data manager class objects.datamgr.post_download_setup() # if not objects.config.testing: if objects.config.browsepage: logconsole("Creating browser index page for the project...") browser = utils.HarvestManBrowser() browser.make_project_browse_page() logconsole("Done.") objects.eventmgr.raise_event('afterfinish', objects.queuemgr.baseurl, None) def finalize(self): """ This method is called at program exit or when handling signals to clean up """ # If this was started from a runfile, # remove it. if objects.config.runfile: try: os.remove(objects.config.runfile) except OSError, e: error('Error removing runfile %s.' % objects.config.runfile) # inform user of config file errors if globaldata.userdebug: logconsole("Some errors were found in your configuration, please correct them!") for x in range(len(globaldata.userdebug)): logconsole(str(x+1),':', globaldata.userdebug[x]) globaldata.userdebug = [] logconsole('HarvestMan session finished.') objects.datamgr.clean_up() objects.rulesmgr.clean_up() objects.logger.shutdown() def save_current_state(self): """ Save state of objects to disk so program can be restarted from saved state """ # If savesession is disabled, return if not objects.config.savesessions: extrainfo('Session save feature is disabled.') return # Top-level state dictionary state = {} # All state objects are dictionaries # Get state of queue & tracker threads state['trackerqueue'] = objects.queuemgr.get_state() # Get state of datamgr state['datamanager'] = objects.datamgr.get_state() # Get state of urlthreads #if p: state['threadpool'] = p.get_state() #state['ruleschecker'] = objects.rulesmgr.get_state() # Get config object #state['configobj'] = objects.config.copy() # Dump with time-stamp fname = os.path.join(objects.config.usersessiondir, '.harvestman_saves#' + str(int(time.time()))) extrainfo('Saving run-state to file %s...' % fname) try: cPickle.dump(state, open(fname, 'wb'), pickle.HIGHEST_PROTOCOL) extrainfo('Saved run-state to file %s.' % fname) except (pickle.PicklingError, RuntimeError), e: logconsole(e) error('Could not save run-state !') def welcome_message(self): """ Prints a welcome message before start of the program """ logconsole('Starting %s...' % objects.config.progname) logconsole('Copyright (C) 2004, Anand B Pillai') logconsole(' ') def register_common_objects(self): """ Create and register aliases for the common objects required by all program modules """ # Set myself SetAlias(self) objects.logger.make_logger() # Set verbosity in logger object objects.logger.setLogSeverity(objects.config.verbosity) # Data manager object dmgr = datamgr.HarvestManDataManager() SetAlias(dmgr) # Rules checker object ruleschecker = rules.HarvestManRulesChecker() SetAlias(ruleschecker) # Connector manager object connmgr = connector.HarvestManNetworkConnector() SetAlias(connmgr) # Connector factory conn_factory = connector.HarvestManUrlConnectorFactory(objects.config.connections) SetAlias(conn_factory) queuemgr = urlqueue.HarvestManCrawlerQueue() SetAlias(queuemgr) SetAlias(HarvestManEvent()) def start_project(self): """ Starts crawl for the current project, crawling its URL """ if objects.eventmgr.raise_event('beforestart', objects.queuemgr.baseurl, None)==False: return # crawls through a site using http/ftp/https protocols if objects.config.project: info('*** Log Started ***\n') if not objects.config.resuming: info('Starting project',objects.config.project,'...') else: info('Re-starting project',objects.config.project,'...') # Write the project file if not objects.config.fromprojfile: projector = utils.HarvestManProjectManager() projector.write_project() # Write the project database record HarvestManDbManager.add_project_record() if not objects.config.resuming: info('Starting download of url',objects.config.url,'...') else: pass # Reset objects keeping project-specific states now # Reset and re-initialize datamgr objects.datamgr.clean_up() objects.datamgr.initialize() objects.rulesmgr.reset() # Read the project cache file, if any if objects.config.pagecache: objects.datamgr.read_project_cache() if not objects.config.resuming: # Configure tracker manager for this project if objects.queuemgr.configure(): # start the project objects.queuemgr.crawl() else: objects.queuemgr.restart() objects.eventmgr.raise_event('afterstart', objects.queuemgr.baseurl, None) def clean_up(self): """ Performs clean up actions as part of the interrupt handling """ # Shut down logging on file extrainfo('Shutting down logging...') objects.logger.disableFileLogging() objects.queuemgr.endloop() def calculate_bandwidth(self): """ Calculate bandwidth of the user by downloading a specific URL and timing it, setting a limit on maximum file size """ # Calculate bandwidth bw = 0 # Look for harvestman.conf in user conf dir conf = os.path.join(objects.config.userconfdir, 'harvestman.conf') if not os.path.isfile(conf): conn = connector.HarvestManUrlConnector() urlobj = urlparser.HarvestManUrl('http://harvestmanontheweb.com/schemas/HarvestMan.xsd') bw = conn.calc_bandwidth(urlobj) bwstr='bandwidth=%f\n' % bw if bw: try: open(conf,'w').write(bwstr) except IOError, e: pass else: r = re.compile(r'(bandwidth=)(.*)') try: data = open(conf).read() m = r.findall(data) if m: bw = float(m[0][-1]) except IOError, e: pass return bw def create_user_directories(self): """ Creates the user folders for HarvestMan. Creates folders for storing user specific configuration, session and crawl database information """ # Create user's HarvestMan directory on POSIX at $HOME/.harvestman and # on Windows at $USERPROFILE/Local Settings/Application Data/HarvestMan harvestman_dir = objects.config.userdir harvestman_conf_dir = objects.config.userconfdir harvestman_sessions_dir = objects.config.usersessiondir harvestman_db_dir = objects.config.userdbdir
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -