📄 spider.py
字号:
if not os.path.isdir(harvestman_dir): try: logconsole('Looks like you are running HarvestMan for the first time in this machine') logconsole('Doing initial setup...') logconsole('Creating folder %s for storing HarvestMan application data...' % harvestman_dir) os.makedirs(harvestman_dir) except (OSError, IOError), e: logconsole(e) if not os.path.isdir(harvestman_conf_dir): try: logconsole('Creating "conf" sub-directory in %s...' % harvestman_dir) os.makedirs(harvestman_conf_dir) # Create user configuration in .harvestman/conf conf_data = objects.config.generate_user_configuration() logconsole("Generating user configuration in %s..." % harvestman_conf_dir) try: user_conf_file = os.path.join(harvestman_conf_dir, 'config.xml') open(user_conf_file, 'w').write(conf_data) logconsole("Done.") except IOError, e: print e except (OSError, IOError), e: logconsole(e) if not os.path.isdir(harvestman_sessions_dir): try: logconsole('Creating "sessions" sub-directory in %s...' % harvestman_dir) os.makedirs(harvestman_sessions_dir) logconsole('Done.') except (OSError, IOError), e: logconsole(e) if not os.path.isdir(harvestman_db_dir): try: logconsole('Creating "db" sub-directory in %s...' % harvestman_dir) os.makedirs(harvestman_db_dir) logconsole('Done.') except (OSError, IOError), e: logconsole(e) try: HarvestManDbManager.create_user_database() except Exception, e: logconsole(e) def init(self): """ Initialize the crawler by creating, register common objects and creating the user folders """ if objects.config.USER_AGENT=='': objects.config.USER_AGENT = self.__class__.USER_AGENT self.register_common_objects() self.create_user_directories() # Calculate bandwidth and set max file size # bw = self.calculate_bandwidth() # Max file size is calculated as bw*timeout # where timeout => max time for a worker thread # if bw: objects.config.maxfilesize = bw*objects.config.timeout def init_config(self): """ Initialize the configuration of the crawler """ # Following 2 methods are inherited from the parent class self.get_options() self.process_plugins() # For the time being, save session set to false objects.config.savesessions = 0 def get_config(self): """ Return the configuration object """ return objects.config def initialize(self): """ Umbrella method to initialize the crawler configuration and the crawler object """ self.init_config() self.init() def reset(self): """ Resets the state of the crawler, except the state of the configuration object """ self.init() def run_projects(self): """ Run all the HarvestMan projects specified for the current session """ # Set locale - To fix errors with # regular expressions on non-english web # sites. locale.setlocale(locale.LC_ALL, '') objects.rulesmgr.make_filters() if objects.config.verbosity: self.welcome_message() for x in range(len(objects.config.projects)): # Get all project related vars entry = objects.config.projects[x] url = entry.get('url') project = entry.get('project') basedir = entry.get('basedir') verb = entry.get('verbosity') if not url or not project or not basedir: info('Invalid config options found!') if not url: info('Provide a valid url') if not project: info('Provide a valid project name') if not basedir: info('Provide a valid base directory') continue # Set the current project vars objects.config.url = url objects.config.project = project objects.config.verbosity = verb objects.config.basedir = basedir try: self.run_project() except Exception, e: # Note: This design means that when we are having more than # one project configured, HarvestMan exits only the current # project if an interrupt (Ctrl-C) is received. The next # project will continue when control comes back...This was # not intentional, but is a good side-effect. # However if a Python exception is received, we exit the # program after calling this function... self.handle_interrupts(-1, None, e) def run_project(self): """ Run the current HarvestMan project by creating any project directories , initializing state and starting the project """ # Set project directory # Expand any shell variables used in the base directory. objects.config.basedir = os.path.expandvars(os.path.expanduser(objects.config.basedir)) if objects.config.basedir: objects.config.projdir = os.path.join( objects.config.basedir, objects.config.project ) if objects.config.projdir and not os.path.exists( objects.config.projdir ): os.makedirs(objects.config.projdir) if objects.config.datamode == CONNECTOR_DATA_MODE_FLUSH: objects.config.projtmpdir = os.path.join(objects.config.projdir, '.tmp') if objects.config.projtmpdir and not os.path.exists( objects.config.projtmpdir ): os.makedirs(objects.config.projtmpdir) # Set message log file if objects.config.projdir and objects.config.project: objects.config.logfile = os.path.join( objects.config.projdir, "".join((objects.config.project, '.log'))) SetLogFile() if not objects.config.testnocrawl: self.start_project() self.finish_project() def restore_state(self, state_file): """ Restore state of some objects from the most recent run of the program. This helps to re-run the program from where it left off """ try: state = cPickle.load(open(state_file, 'rb')) # This has six keys - configobj, threadpool, ruleschecker, # datamanager, common and trackerqueue. # First update config object localcfg = {} cfg = state.get('configobj') if cfg: for key,val in cfg.items(): localcfg[key] = val else: print 'Config corrupted' return RESTORE_STATE_NOT_OK # Now update trackerqueue ret = objects.queuemgr.set_state(state.get('trackerqueue')) if ret == -1: logconsole("Error restoring state in 'urlqueue' module - cannot proceed further!") return RESTORE_STATE_NOT_OK else: logconsole("Restored state in urlqueue module.") # Now update datamgr ret = objects.datamgr.set_state(state.get('datamanager')) if ret == -1: logconsole("Error restoring state in 'datamgr' module - cannot proceed further!") return RESTORE_STATE_NOT_OK else: dm.initialize() logconsole("Restored state in datamgr module.") # Update threadpool if any pool = None if state.has_key('threadpool'): pool = dm._urlThreadPool ret = pool.set_state(state.get('threadpool')) logconsole('Restored state in urlthread module.') # Update ruleschecker ret = objects.rulesmgr.set_state(state.get('ruleschecker')) logconsole('Restored state in rules module.') # Everything is fine, copy localcfg to config object for key,val in localcfg.items(): objects.config[key] = val # Open stream to log file SetLogFile() return RESTORE_STATE_OK except (pickle.UnpicklingError, AttributeError, IndexError, EOFError), e: return RESTORE_STATE_NOT_OK def run_saved_state(self): """ Restart the program from a previous state, from state saved during the most recent run of the program, if any """ # If savesession is disabled, return #if not objects.config.savesessions: extrainfo('Session save feature is disabled, ignoring save files') return SAVE_STATE_NOT_OK # Set locale - To fix errors with # regular expressions on non-english web # sites. # See if there is a file named .harvestman_saves#... ## sessions_dir = objects.config.usersessiondir## files = glob.glob(os.path.join(sessions_dir, '.harvestman_saves#*'))## # Get the last dumped file## if files:## runfile = max(files)## res = raw_input('Found HarvestMan save file %s. Do you want to re-run it ? [y/n]' % runfile)## if res.lower()=='y':## if self.restore_state(runfile) == RESTORE_STATE_OK:## objects.config.resuming = True## objects.config.runfile = runfile## if objects.config.verbosity:## self.welcome_message() ## if not objects.config.testnocrawl:## try:## self.start_project()## except Exception, e:## self.handle_interrupts(-1, None, e)## try:## self.finish_project()## return SAVE_STATE_OK ## except Exception, e:## # To catch errors at interpreter shutdown## pass## else:## logconsole('Could not re-run saved state, defaulting to standard configuration...')## objects.config.resuming = False## return SAVE_STATE_NOT_OK## else:## logconsole('OK, falling back to default configuration...')## return SAVE_STATE_NOT_OK## else:## return SAVE_STATE_NOT_OK## pass def handle_interrupts(self, signum, frame, e=None): """ Method which is called to handle program interrupts such as a Ctrl-C (interrupt) """ # print 'Signal handler called with',signum if signum == signal.SIGINT: objects.config.keyboardinterrupt = True if e != None: logconsole('Exception received=>',e) print_traceback() logtraceback() # dont allow to write cache, since it # screws up existing cache. objects.datamgr.conditional_cache_set() # self.save_current_state() self.clean_up() def bind_event(self, event, funktion, *args): """ Binds a function to a specific event in HarvestMan """ objects.eventmgr.bind(event, funktion, args) def main(self): """ The main sub-routine of the HarvestMan class """ # Set stderr to dummy to prevent all the thread # errors being printed by interpreter at shutdown # sys.stderr = DummyStderr() signal.signal(signal.SIGINT, self.handle_interrupts) # See if a crash file is there, then try to load it and run # program from crashed state. if self.run_saved_state() == SAVE_STATE_NOT_OK: # No such crashed state or user refused to run # from crashed state. So do the usual run. self.run_projects() # Final cleanup self.finalize()def callgraph_filter(call_stack, module_name, class_name, func_name, full_name): """ Function which is used to filter the call graphs when HarvestMan is run with 'pycallgraph' to generate call graph trees """ if class_name.lower().find('harvestman') != -1 or \ full_name.lower().find('harvestman') != -1: return True else: return Falsedef main(): """ Main routine """ spider = HarvestMan() spider.initialize() #import pycallgraph #pycallgraph.start_trace(filter_func=callgraph_filter) spider.main() #pycallgraph.make_dot_graph('harvestman.png') if __name__=="__main__": main()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -