# harvestman.py
# (code-viewer "font size" control label - page residue, not part of the program)
""" HARVESTMAN - Multithreaded internet spider program
using urllib2 and other python modules.
Version - 1.3.9 final.
Author: Anand B Pillai(anandpillai at letterboxes dot org).
HARVESTMAN is totally free software. See the file LICENSE.txt for
information on the terms and conditions of usage, and a DISCLAIMER
of ALL WARRANTIES. The same license agreement applies to all other
python software modules used in this program.
Modification History
====================
Oct 10 2003 Anand 1.3 a1 release.
Jan 2 2004 Anand 1.3.1 bug fix version.
Feb 24 2004 Anand 1.3.3 version release.
Apr 20 2004 Anand 1.3.4 version release.
Jun 14 2004 Anand 1.3.9 release.
"""
import os, sys
from sgmllib import SGMLParseError
from shutil import copy
# Our modules
# Tracker modules
import urltracker
# Connector module
import connector
# Rules module
import rules
# Data manager module
import datamgr
# Cookie module
import cookiemgr
# Utils module
import utils
# Globals/lookup module
from common import *
class harvestMan:
    """Top level application class.

    Drives one HarvestMan project: set_project() loads the configuration
    and registers the helper objects, run_project() starts the crawl and
    then runs the post-download steps.  The object also provides a
    file-like write() so it can stand in for an output stream.
    """

    def __init__(self):
        """Constructor: set defaults; the config is loaded in __prepare()."""
        # project start page (on disk)
        self._projectstartpage = 'file://'
        # error log file object; None until set_error_log() opens one
        self._ofs_ = None
        # config object; fetched from the global lookup in __prepare()
        self._cfg = None
        # NOTE(review): the user agent says 1.4 while VERSION says 1.3.9;
        # left unchanged - confirm which one is intended.
        self.USER_AGENT = "HarvestMan 1.4"
        self.VERSION = "1.3.9"

    def set_error_log(self, errorfile):
        """Open `errorfile` in the project directory as the error log.

        An existing file with that name is removed first.  The opened
        file is the one write() sends messages to.  This method does not
        itself replace sys.stderr.

        Raises IOError/OSError if the log file cannot be opened.
        """
        errorlogpath = os.path.join(self._cfg.projdir, errorfile)

        # Close a log opened by an earlier call so its handle is not
        # leaked (and so the remove below is not attempted on a file we
        # still hold open).
        if self._ofs_:
            try:
                self._ofs_.close()
            except Exception as e:
                print(e)
            self._ofs_ = None

        if os.path.exists(errorlogpath):
            try:
                os.remove(errorlogpath)
            except OSError as e:
                # Best effort: opening with 'w' below truncates anyway.
                print(e)

        self._ofs_ = open(errorlogpath, 'w')

    def finish(self):
        """Actions to take after download is over."""
        # NOTE(review): the original looked up the 'cookiestore' object
        # here but never used it; the cookie session is only closed in
        # clean_up().  Confirm whether cookies must also be saved on the
        # normal (non-interrupted) path.

        # Localise file links
        # This code sits in the data manager class
        dmgr = GetObject('datamanager')
        dmgr.post_download_setup()

        if not self._cfg.testing:
            browser = utils.HarvestManBrowser()
            browser.make_project_browse_page()

        # FIXME: Better way to signal global module that
        # we are done.
        Finish()

        if self._ofs_:
            try:
                self._ofs_.close()
            except Exception as e:
                print(e)
            # Drop the closed handle so later write() calls are no-ops.
            self._ofs_ = None

        print('Thank you for using the HarvestMan Program.')

    def write(self, msg):
        """File-like write: append `msg` to the error log, if one is open.

        This is what lets the object be used in place of an output
        stream.  Nothing is written when no log file is open.
        """
        try:
            if self._ofs_:
                self._ofs_.write(msg)
                self._ofs_.flush()
        except Exception:
            # Deliberately swallowed: dont recursively crash on errors
            # while reporting an error.
            pass

    def welcome_message(self):
        """Print a welcome message."""
        info('Starting HarvestMan version', self._cfg.version)
        info('Copyright (C) 2004-2005, Anand B Pillai')
        info('WWW: http://harvestman.freezope.org')
        info(' ')

    def set_proj_dir(self):
        """Create the project directory and set it in the config object.

        The directory is <basedir>/<project>.

        Returns 0 if the directory was created by this call, -1 if it
        already existed.  Raises OSError if it cannot be created.
        """
        # NOTE(review): indentation was lost in the source; `return 0`
        # is taken to belong to the "directory created" branch.
        self._cfg.projdir = os.path.join(self._cfg.basedir, self._cfg.project)
        if not os.path.exists(self._cfg.projdir):
            extrainfo('Creating directory => ', self._cfg.projdir)
            os.makedirs(self._cfg.projdir)
            return 0

        return -1

    def register_objects(self):
        """Create the helper objects and register them with SetObject()."""
        # Data manager object
        dmgr = datamgr.harvestManDataManager()
        SetObject(dmgr)

        # Rules checker object
        ruleschecker = rules.harvestManRulesChecker()
        # Create rules for filters
        ruleschecker.make_filters()
        SetObject(ruleschecker)

        # Connector object
        conn = connector.HarvestManNetworkConnector()
        SetObject(conn)

        # Connector factory
        conn_factory = connector.HarvestManUrlConnectorFactory(self._cfg.connections)
        SetObject(conn_factory)

        # Cookie manager object - only when cookies are enabled.
        # We are saving the cookies in each projects project directory
        if self._cfg.cookies:
            cookie_file = os.path.join(self._cfg.projdir, 'cookies.dat')
            hcookiestore = cookiemgr.DBMCookieStore(cookie_file)
            cmgr = cookiemgr.CookieManager(hcookiestore)
            SetObject(cmgr)

        # create tracker monitor
        tracker_queue = urltracker.HarvestManCrawlerQueue()
        SetObject(tracker_queue)
        tracker_queue.configure()

        # Set myself
        SetObject(self)

    def start_project(self):
        """Start the current project: write the project file and crawl."""
        # Welcome messages
        self.welcome_message()

        # crawls through a site using http/ftp/https protocols
        info('Starting project ', self._cfg.project, '...')

        # Write the project file
        projector = utils.HarvestManProjectManager()
        projector.write_project()

        info('Starting download of url ', self._cfg.url, '...')

        # Read the project cache file, if any
        if self._cfg.pagecache:
            GetObject('datamanager').read_project_cache()

        # start the project
        tracker_queue = GetObject('trackerqueue')
        tracker_queue.crawl()

    def clean_up(self):
        """Clean up actions to do, say after an interrupt."""
        tq = GetObject('trackerqueue')
        tq.kill_trackers()

        # Close the cookie session so that cookies are saved.
        # register_objects() only registers a cookie manager when cookies
        # are enabled, so the lookup may come back empty.
        # NOTE(review): assumes GetObject returns None for an
        # unregistered name - verify against the common module.
        cookie_manager = GetObject('cookiestore')
        if cookie_manager is not None:
            cookie_manager.close_session()

    def __prepare(self):
        """Do the basic things and get ready."""
        # Initialize globals module. This initializes
        # the config and connector objects.
        Initialize()
        SetUserAgent(self.USER_AGENT)

        self._cfg = GetObject('config')
        # set version on config object
        self._cfg.version = self.VERSION
        # set program name on config object
        self._cfg.progname = 'HarvestMan ' + self.VERSION

    def set_project(self):
        """Set the variables and initialize this object and the
        other harvestman objects.

        Exits the process with status 1 when the url, project or base
        directory is missing from the configuration.
        """
        # Prepare myself
        self.__prepare()

        # Get program options
        self._cfg.get_program_options()

        # The url, project and basedir variables must all be set
        url = self._cfg.url
        project = self._cfg.project
        basedir = self._cfg.basedir

        if not url or not project or not basedir:
            print('Invalid config options')
            print('Give a valid url, project, base directory in config file')
            sys.exit(1)

        self.set_proj_dir()
        self.register_objects()

        # Set error log file
        if self._cfg.errorfile:
            if not self._cfg.testing:
                # Somehow DBM (shelve) fails (at least on Windows 98)
                # when we redirect stderr like this. So this is
                # disabled for sometime, till I fix it.  To re-enable:
                #   sys.stderr = self
                #   self.set_error_log(self._cfg.errorfile)
                pass

    def run_project(self):
        """Run a harvestman project, then perform the finishing steps."""
        # NOTE(review): indentation was lost in the source; clean_up() is
        # taken to sit inside the ignorekbinterrupt check and finish() to
        # run unconditionally - confirm against the original file.

        # Start crawling
        if not self._cfg.testnocrawl:
            try:
                self.start_project()
            except (KeyboardInterrupt, EOFError):
                if not self._cfg.ignorekbinterrupt:
                    # dont allow to write cache, since it
                    # screws up existing cache.
                    GetObject('datamanager').conditional_cache_set()
                    self.clean_up()

        # Clean up actions
        self.finish()

    def report_garbage_collection(self):
        """Diagnosis report on garbage collection (not implemented)."""
        # TODO
        pass
if __name__ == "__main__":
    # Script entry point: build the application object, load the
    # configuration and register helpers, then crawl and finish up.
    app = harvestMan()
    app.set_project()
    app.run_project()
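# Code-viewer keyboard shortcuts (page residue, not part of the program):
#   Copy code:           Ctrl + C
#   Search code:         Ctrl + F
#   Full-screen mode:    F11
#   Switch theme:        Ctrl + Shift + D
#   Show shortcuts:      ?
#   Increase font size:  Ctrl + =
#   Decrease font size:  Ctrl + -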