config.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,583 行 · 第 1/5 页

PY
1,583
字号
        self.verbosity_default=logger.INFO        # Override project verbosity - done        # if a global verbosity flag is defined        # say using the command line        self.verbosity_override = False        # timeout for worker threads is a rather        # large 5 minutes.        self.timeout=300.00        # timeout for sockets is a rather high 1.0 minute        self.socktimeout = 60.0        # Time out for fetchers is a rather small 4 minutes        self.fetchertimeout = 240.0        self.getimagelinks=1        self.getstylesheets=1        # Load images from anywhere irrepsective of rules        self.anyimages=1        self.threadpoolsize=10        self.renamefiles=0        self.fetchlevel=0        self.browsepage=0        self.checkfiles=1        self.pagecache=1        # Internal variable telling whether to write cache        self.writecache=True        self.cachefound=0        self._error=''        self.starttime=0        self.endtime=0        self.javascript = 1        self.javaapplet = 1        self.connections=5        # Bandwidth limit, 0 means no limit        self.bandwidthlimit = 0        self.throttlefactor = 1.5        self.cachefileformat='pickled'         self.testing = 0        self.testnocrawl = 0        self.ignoreinterrupts = 0        # Set to true when a kb interrupt is caught        self.keyboardinterrupt = 0        # Differentiate between sub-domains of a domain ?        # When set to True, subdomains act like different        # domains, so they are filtered out for fetchlevel<=1        self.subdomain = 1        # Flag to ignore tlds, if set to True, domains        # such as www.foo.com, www.foo.co.uk, www.foo.org        # will all evaluate to the same server.        # Use this carefully!        self.ignoretlds = 0        self.getquerylinks = 0        self.bytes = 20.00 # Not used!        self.projtimeout = 1800.00        self.downloadtime = 0.0        self.timelimit = -1        self.terminate = 0        self.datacache = 0        self.blocking = 0        self.junkfilter = 1        self.junkfilterdomains = 1        self.junkfilterpatterns = 1        self.urltreefile = 0        self.urlfile = ''        self.maxfilesize=5242880        self.minfilesize=0        self.format = 'xml'        self.rawsave = 0        self.fromprojfile = 0        # HTML features (optional)        self.htmlfeatures = []        # For running from previous states.        self.resuming = 0        self.runfile = None        # Control var for session-saver feature.        self.savesessions = 0        # List of enabled plugins        self.plugins = []        # Control var for simulation feature        self.simulate = 0        # Time to sleep between requests        self.sleeptime = 1.0        # Time to sleep on the request queue        self.queuetime = 1.0        # Queue size - fixed...        self.queuesize = 5000        self.randomsleep = 1        # For http compression        self.httpcompress = 1        # Type of URLs which can be        # set to skip any rules we define        # This is not a user configurable        # option, but can be configured in        # plugins, of course.        self.skipruletypes = []        # Number of parts to split a file        # to, for multipart http downloads        self.numparts = 4        # Flag to force multipart downloads off        self.nomultipart = 0        # Flag to indicate that a multipart        # download is in progress        self.multipart = 0        # Links offset start - this will        # skip the list of child links of a URL        # to the given value        self.linksoffsetstart = 0        # Links offset value - this will skip        # the list of child links of a URL        # after the given value        self.linksoffsetend = -1        # Cache size for         # Current progress object        self.progressobj = TextProgress()        # Internal flag - show progress obj ?        self.showprogress = True        # Flag for forcing multipart downloads        self.forcesplit = 0        # Data save mode for connectors        # Is flush by default        self.datamode = CONNECTOR_DATA_MODE_FLUSH        # Name for data mode        self.datamodename = "flush"        # Hget outfile - default empty string        self.hgetoutfile = ''        # Hget output directory - default current directory        self.hgetoutdir = '.'        # Hget verbosity flag - default False        self.hgetverbose = 0        # Hget temp flag - default False        self.hgetnotemp = 0        # Hget mirror file        self.mirrorfile = ''        # Hget mirror search flag        self.mirrorsearch = False        # Hget mirror relpath index        self.mirrorpathindex = 0        # Hget relpath use flag        self.mirroruserelpath = 1        # Hget resume mode        self.canresume = 1        # Internal state param        self._badrequests = 0        # Internal config param        self._connaddua = True            def _init2(self):        """ Second level initialization method. Initializes the dictionary which maps        configuration XML file entries to state variables """                # For mapping xml entities to config entities                self.xml_map = { 'project_ignore' : ('project_ignore', 'int'),                         'url' : ('url', 'func:set_project'),                         'name': ('project', 'func:set_project'),                         'basedir' : ('basedir', 'func:set_project'),                         'verbosity_level' : ('verbosity', 'func:set_project'),                         'proxyserver': ('proxy','str'),                         'proxyuser': ('puser','str'),                         'proxypasswd' : ('ppasswd','str'),                         'proxyport_value' : ('proxyport','int'),                         'username': ('username','str'),                         'passwd' : ('passwd','str'),                                                  'html_value' : ('html','int'),                         'images_value' : ('images','int'),                         'movies_value' : ('movies','int'),                         'flash_value' : ('flash','int'),                                                  'sounds_value' : ('sounds','int'),                         'documents_value' : ('documents','int'),                                                                           'javascript_value' : ('javascript','int'),                         'javaapplet_value' : ('javaapplet','int'),                         'querylinks_value' : ('getquerylinks','int'),                         'cache_status' : ('pagecache','int'),                         'datacache_value' : ('datacache','int'),                         'urllist': ('urlfile', 'str'),                         'urltreefile_status' : ('urltreefile', 'int'),                         'archive_status' : ('archive', 'int'),                         'archive_format' : ('archformat', 'str'),                         'urlheaders_status' : ('urlheaders', 'int'),                         'retries_value': ('retryfailed','int'),                         'imagelinks_value' : ('getimagelinks','int'),                         'stylesheetlinks_value' : ('getstylesheets','int'),                         'offset_start' : ('linksoffsetstart','int'),                         'offset_end' : ('linksoffsetend','int'),                         'fetchlevel_value' : ('fetchlevel','int'),                         'extserverlinks_value' : ('eserverlinks','int'),                         'extpagelinks_value' : ('epagelinks','int'),                         'depth_value' : ('depth','int'),                         'extdepth_value' : ('extdepth','int'),                         'subdomain_value' : ('subdomain','int'),                         'ignoretlds_value': ('ignoretlds','int'),                         'maxextservers_value' : ('maxextservers','int'),                         'maxextdirs_value' : ('maxextdirs','int'),                         'maxfiles_value' : ('maxfiles','int'),                         'maxfilesize_value' : ('maxfilesize','int'),                         'maxbytes_value' : ('maxbytes', 'func:set_maxbytes'),                         'maxconnections_value' : ('connections','int'),                         'maxbandwidth_value' : ('bandwidthlimit','func:set_maxbandwidth'),                         'maxbandwidth_factor': ('throttlefactor','float'),                         'robots_value' : ('robots','int'),                         'timelimit_value' : ('timelimit','float'),                         'urlpriority' : ('urlpriority','str'),                         'serverpriority' : ('serverpriority','str'),                         'serverfilter' : ('serverfilter','str'),                         'wordfilter' : ('wordfilter','str'),                         'junkfilter_value' : ('junkfilter','int'),                         'useragent_value': ('USER_AGENT','str'),                         'workers_status' : ('usethreads','int'),                         'workers_size' : ('threadpoolsize','int'),                         'workers_timeout' : ('timeout','float'),                         'trackers_value' : ('maxtrackers','int'),                         'trackers_timeout' : ('fetchertimeout','float'),                                                  'fastmode_value': ('fastmode','int'),                         'savesessions_value': ('savesessions','int'),                         'timegap_value': ('sleeptime', 'float'),                         'timegap_random': ('randomsleep', 'int'),                         'connections_type' : ('datamode', 'func:set_datamode'),                         'feature_name' : ('htmlfeatures', 'func:set_parse_features'),                         'simulate_value': ('simulate', 'int'),                         'localise_value' : ('localise','int'),                         'browsepage_value' : ('browsepage','int'),                         'configfile_value': ('configfile', 'str'),                         'projectfile_value': ('projectfile', 'str'),                         'regexp_value' : ('regexp', 'func:set_urlfilter'),                         'path_value': ('path', 'func:set_urlfilter'),                         'extension_value': ('extension', 'func:set_urlfilter'),                         'content_value': ('content', 'func:set_textfilter'),                         'meta_value': ('meta', 'func:set_textfilter'),                         'urlfilterre_value': (('inclfilter', 'list'),                                               ('exclfilter', 'list'),                                               ('allfilters', 'list')),                         'urlprioritydict_value': ('urlprioritydict', 'dict'),                         'serverprioritydict_value': ('serverprioritydict', 'dict'),                         'http_compress' : ('httpcompress', 'int'),                         'plugin_name': ('plugins','func:set_plugin')                         }    def copy(self):        """ Return a serializable copy of this instance """                # Set non-picklable objects to None type        self.progressobj = None        return self    def __getstate__(self):        """ Overloaded __getstate__ method """        return self    def __setstate__(self, state):        """ Overloaded __setstate__ method """        pass        def assign_option(self, option_val, value, kwargs={}):        """ Assigns values to internal variables using the option specified """        try:            if len(option_val) == 2:                key, typ = option_val                                # If type is not a list, the                # action is simple assignment                # Bug fix: If someone has set the                # value to 'True'/'False' instead of                # 1/0, convert to bool type first.                if type(value) in (str, unicode):                    if value.lower() == 'true':                        value = 1                    elif value.lower() == 'false':                        value = 0                if typ.find(':') == -1:                    # do any type casting of the option                    fval = (eval(typ))(value)                    self[key] = fval                                        # If type is list, the action is                    # appending, after doing any type                    # casting of the actual value                else:                    # Type is of the form <type>:<actual type>                    typname, typ = typ.split(':')                    if typname == 'list':                        if typ:                            fval = (eval(typ))(value)                        else:                            fval = value                        var = self[key]                        var.append(fval)                    elif typname == 'func':                        funktion = getattr(self, typ)                        if funktion:                            funktion(key, value, kwargs)            else:                error('Error in option value %s!' % option_val)        except Exception, e:            raise HarvestManConfigError, "Error: " + str(e)    def set_option(self, option, value, negate=0):        """ Sets the passed option in with its value as the passed value """                # find out if the option exists in the dictionary        if option in self.xml_map.keys():            # if the option is a string or int or any            # non-seq type            # if value is an emptry string, return error

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?