⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 2 页
字号:
        if not os.path.isdir(harvestman_dir):            try:                logconsole('Looks like you are running HarvestMan for the first time in this machine')                logconsole('Doing initial setup...')                logconsole('Creating folder %s for storing HarvestMan application data...' % harvestman_dir)                os.makedirs(harvestman_dir)            except (OSError, IOError), e:                logconsole(e)        if not os.path.isdir(harvestman_conf_dir):            try:                logconsole('Creating "conf" sub-directory in %s...' % harvestman_dir)                os.makedirs(harvestman_conf_dir)                # Create user configuration in .harvestman/conf                conf_data = objects.config.generate_user_configuration()                logconsole("Generating user configuration in %s..." % harvestman_conf_dir)                try:                    user_conf_file = os.path.join(harvestman_conf_dir, 'config.xml')                    open(user_conf_file, 'w').write(conf_data)                    logconsole("Done.")                                    except IOError, e:                    print e            except (OSError, IOError), e:                logconsole(e)        if not os.path.isdir(harvestman_sessions_dir):            try:                logconsole('Creating "sessions" sub-directory in %s...' % harvestman_dir)                os.makedirs(harvestman_sessions_dir)                                        logconsole('Done.')            except (OSError, IOError), e:                logconsole(e)        if not os.path.isdir(harvestman_db_dir):            try:                logconsole('Creating "db" sub-directory in %s...' % harvestman_dir)                os.makedirs(harvestman_db_dir)                                        logconsole('Done.')            except (OSError, IOError), e:                logconsole(e)            try:                HarvestManDbManager.create_user_database()            except Exception, e:                logconsole(e)                            def init(self):        """ Initialize the crawler by creating, register common objects and creating the        user folders """        if objects.config.USER_AGENT=='':            objects.config.USER_AGENT = self.__class__.USER_AGENT                    self.register_common_objects()        self.create_user_directories()        # Calculate bandwidth and set max file size        # bw = self.calculate_bandwidth()        # Max file size is calculated as bw*timeout        # where timeout => max time for a worker thread        # if bw: objects.config.maxfilesize = bw*objects.config.timeout            def init_config(self):        """ Initialize the configuration of the crawler """                # Following 2 methods are inherited from the parent class        self.get_options()        self.process_plugins()        # For the time being, save session set to false        objects.config.savesessions = 0    def get_config(self):        """ Return the configuration object """        return objects.config            def initialize(self):        """ Umbrella method to initialize the crawler configuration        and the crawler object """                self.init_config()        self.init()    def reset(self):        """ Resets the state of the crawler, except the state of the        configuration object """                self.init()            def run_projects(self):        """ Run all the HarvestMan projects specified for the current session """        # Set locale - To fix errors with        # regular expressions on non-english web        # sites.        locale.setlocale(locale.LC_ALL, '')        objects.rulesmgr.make_filters()                if objects.config.verbosity:            self.welcome_message()        for x in range(len(objects.config.projects)):            # Get all project related vars            entry = objects.config.projects[x]            url = entry.get('url')            project = entry.get('project')            basedir = entry.get('basedir')            verb = entry.get('verbosity')            if not url or not project or not basedir:                info('Invalid config options found!')                if not url: info('Provide a valid url')                if not project: info('Provide a valid project name')                if not basedir: info('Provide a valid base directory')                continue                        # Set the current project vars            objects.config.url = url            objects.config.project = project            objects.config.verbosity = verb            objects.config.basedir = basedir            try:                self.run_project()            except Exception, e:                # Note: This design means that when we are having more than                # one project configured, HarvestMan exits only the current                # project if an interrupt (Ctrl-C) is received. The next                # project will continue when control comes back...This was                # not intentional, but is a good side-effect.                                # However if a Python exception is received, we exit the                # program after calling this function...                self.handle_interrupts(-1, None, e)                        def run_project(self):        """ Run the current HarvestMan project by creating any project directories        , initializing state and starting the project """        # Set project directory        # Expand any shell variables used in the base directory.        objects.config.basedir = os.path.expandvars(os.path.expanduser(objects.config.basedir))                if objects.config.basedir:            objects.config.projdir = os.path.join( objects.config.basedir, objects.config.project )            if objects.config.projdir and not os.path.exists( objects.config.projdir ):                os.makedirs(objects.config.projdir)                            if objects.config.datamode == CONNECTOR_DATA_MODE_FLUSH:                    objects.config.projtmpdir = os.path.join(objects.config.projdir, '.tmp')                if objects.config.projtmpdir and not os.path.exists( objects.config.projtmpdir ):                    os.makedirs(objects.config.projtmpdir)                    # Set message log file        if objects.config.projdir and objects.config.project:            objects.config.logfile = os.path.join( objects.config.projdir, "".join((objects.config.project,                                                                          '.log')))        SetLogFile()        if not objects.config.testnocrawl:            self.start_project()        self.finish_project()                def restore_state(self, state_file):        """ Restore state of some objects from the most recent run of the program.        This helps to re-run the program from where it left off """        try:            state = cPickle.load(open(state_file, 'rb'))            # This has six keys - configobj, threadpool, ruleschecker,            # datamanager, common and trackerqueue.            # First update config object            localcfg = {}            cfg = state.get('configobj')            if cfg:                for key,val in cfg.items():                    localcfg[key] = val            else:                print 'Config corrupted'                return RESTORE_STATE_NOT_OK            # Now update trackerqueue            ret = objects.queuemgr.set_state(state.get('trackerqueue'))            if ret == -1:                logconsole("Error restoring state in 'urlqueue' module - cannot proceed further!")                return RESTORE_STATE_NOT_OK                            else:                logconsole("Restored state in urlqueue module.")                        # Now update datamgr            ret = objects.datamgr.set_state(state.get('datamanager'))            if ret == -1:                logconsole("Error restoring state in 'datamgr' module - cannot proceed further!")                return RESTORE_STATE_NOT_OK                            else:                dm.initialize()                logconsole("Restored state in datamgr module.")                                        # Update threadpool if any            pool = None            if state.has_key('threadpool'):                pool = dm._urlThreadPool                ret = pool.set_state(state.get('threadpool'))                logconsole('Restored state in urlthread module.')                        # Update ruleschecker            ret = objects.rulesmgr.set_state(state.get('ruleschecker'))            logconsole('Restored state in rules module.')            # Everything is fine, copy localcfg to config object            for key,val in localcfg.items():                objects.config[key] = val            # Open stream to log file            SetLogFile()                            return RESTORE_STATE_OK        except (pickle.UnpicklingError, AttributeError, IndexError, EOFError), e:            return RESTORE_STATE_NOT_OK                def run_saved_state(self):        """ Restart the program from a previous state, from state saved during        the most recent run of the program, if any """                # If savesession is disabled, return        #if not objects.config.savesessions:        extrainfo('Session save feature is disabled, ignoring save files')        return SAVE_STATE_NOT_OK                # Set locale - To fix errors with        # regular expressions on non-english web        # sites.        # See if there is a file named .harvestman_saves#...  ##       sessions_dir = objects.config.usersessiondir##         files = glob.glob(os.path.join(sessions_dir, '.harvestman_saves#*'))##         # Get the last dumped file##         if files:##             runfile = max(files)##             res = raw_input('Found HarvestMan save file %s. Do you want to re-run it ? [y/n]' % runfile)##             if res.lower()=='y':##                 if self.restore_state(runfile) == RESTORE_STATE_OK:##                     objects.config.resuming = True##                     objects.config.runfile = runfile##                     if objects.config.verbosity:##                         self.welcome_message()        ##                     if not objects.config.testnocrawl:##                         try:##                             self.start_project()##                         except Exception, e:##                             self.handle_interrupts(-1, None, e)##                     try:##                         self.finish_project()##                         return SAVE_STATE_OK                    ##                     except Exception, e:##                         # To catch errors at interpreter shutdown##                         pass##                 else:##                     logconsole('Could not re-run saved state, defaulting to standard configuration...')##                     objects.config.resuming = False##                     return SAVE_STATE_NOT_OK##             else:##                 logconsole('OK, falling back to default configuration...')##                 return SAVE_STATE_NOT_OK##         else:##             return SAVE_STATE_NOT_OK##         pass    def handle_interrupts(self, signum, frame, e=None):        """ Method which is called to handle program interrupts such as a Ctrl-C (interrupt) """        # print 'Signal handler called with',signum        if signum == signal.SIGINT:            objects.config.keyboardinterrupt = True        if e != None:            logconsole('Exception received=>',e)            print_traceback()        logtraceback()        # dont allow to write cache, since it        # screws up existing cache.        objects.datamgr.conditional_cache_set()        # self.save_current_state()        self.clean_up()    def bind_event(self, event, funktion, *args):        """ Binds a function to a specific event in HarvestMan """                objects.eventmgr.bind(event, funktion, args)            def main(self):        """ The main sub-routine of the HarvestMan class """        # Set stderr to dummy to prevent all the thread        # errors being printed by interpreter at shutdown        # sys.stderr = DummyStderr()        signal.signal(signal.SIGINT, self.handle_interrupts)                # See if a crash file is there, then try to load it and run        # program from crashed state.        if self.run_saved_state() == SAVE_STATE_NOT_OK:            # No such crashed state or user refused to run            # from crashed state. So do the usual run.            self.run_projects()                    # Final cleanup        self.finalize()def callgraph_filter(call_stack, module_name, class_name, func_name, full_name):    """ Function which is used to filter the call graphs when HarvestMan is    run with 'pycallgraph' to generate call graph trees """        if class_name.lower().find('harvestman') != -1 or \       full_name.lower().find('harvestman') != -1:        return True    else:        return Falsedef main():    """ Main routine """    spider = HarvestMan()    spider.initialize()    #import pycallgraph    #pycallgraph.start_trace(filter_func=callgraph_filter)    spider.main()    #pycallgraph.make_dot_graph('harvestman.png')    if __name__=="__main__":    main()                           

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -