config.py
来自「Harvestman-最新版本」· Python 代码 · 共 1,583 行 · 第 1/5 页
PY
1,583 行
self.verbosity_default=logger.INFO # Override project verbosity - done # if a global verbosity flag is defined # say using the command line self.verbosity_override = False # timeout for worker threads is a rather # large 5 minutes. self.timeout=300.00 # timeout for sockets is a rather high 1.0 minute self.socktimeout = 60.0 # Time out for fetchers is a rather small 4 minutes self.fetchertimeout = 240.0 self.getimagelinks=1 self.getstylesheets=1 # Load images from anywhere irrepsective of rules self.anyimages=1 self.threadpoolsize=10 self.renamefiles=0 self.fetchlevel=0 self.browsepage=0 self.checkfiles=1 self.pagecache=1 # Internal variable telling whether to write cache self.writecache=True self.cachefound=0 self._error='' self.starttime=0 self.endtime=0 self.javascript = 1 self.javaapplet = 1 self.connections=5 # Bandwidth limit, 0 means no limit self.bandwidthlimit = 0 self.throttlefactor = 1.5 self.cachefileformat='pickled' self.testing = 0 self.testnocrawl = 0 self.ignoreinterrupts = 0 # Set to true when a kb interrupt is caught self.keyboardinterrupt = 0 # Differentiate between sub-domains of a domain ? # When set to True, subdomains act like different # domains, so they are filtered out for fetchlevel<=1 self.subdomain = 1 # Flag to ignore tlds, if set to True, domains # such as www.foo.com, www.foo.co.uk, www.foo.org # will all evaluate to the same server. # Use this carefully! self.ignoretlds = 0 self.getquerylinks = 0 self.bytes = 20.00 # Not used! self.projtimeout = 1800.00 self.downloadtime = 0.0 self.timelimit = -1 self.terminate = 0 self.datacache = 0 self.blocking = 0 self.junkfilter = 1 self.junkfilterdomains = 1 self.junkfilterpatterns = 1 self.urltreefile = 0 self.urlfile = '' self.maxfilesize=5242880 self.minfilesize=0 self.format = 'xml' self.rawsave = 0 self.fromprojfile = 0 # HTML features (optional) self.htmlfeatures = [] # For running from previous states. self.resuming = 0 self.runfile = None # Control var for session-saver feature. 
self.savesessions = 0 # List of enabled plugins self.plugins = [] # Control var for simulation feature self.simulate = 0 # Time to sleep between requests self.sleeptime = 1.0 # Time to sleep on the request queue self.queuetime = 1.0 # Queue size - fixed... self.queuesize = 5000 self.randomsleep = 1 # For http compression self.httpcompress = 1 # Type of URLs which can be # set to skip any rules we define # This is not a user configurable # option, but can be configured in # plugins, of course. self.skipruletypes = [] # Number of parts to split a file # to, for multipart http downloads self.numparts = 4 # Flag to force multipart downloads off self.nomultipart = 0 # Flag to indicate that a multipart # download is in progress self.multipart = 0 # Links offset start - this will # skip the list of child links of a URL # to the given value self.linksoffsetstart = 0 # Links offset value - this will skip # the list of child links of a URL # after the given value self.linksoffsetend = -1 # Cache size for # Current progress object self.progressobj = TextProgress() # Internal flag - show progress obj ? self.showprogress = True # Flag for forcing multipart downloads self.forcesplit = 0 # Data save mode for connectors # Is flush by default self.datamode = CONNECTOR_DATA_MODE_FLUSH # Name for data mode self.datamodename = "flush" # Hget outfile - default empty string self.hgetoutfile = '' # Hget output directory - default current directory self.hgetoutdir = '.' # Hget verbosity flag - default False self.hgetverbose = 0 # Hget temp flag - default False self.hgetnotemp = 0 # Hget mirror file self.mirrorfile = '' # Hget mirror search flag self.mirrorsearch = False # Hget mirror relpath index self.mirrorpathindex = 0 # Hget relpath use flag self.mirroruserelpath = 1 # Hget resume mode self.canresume = 1 # Internal state param self._badrequests = 0 # Internal config param self._connaddua = True

def _init2(self):
    """ Second level initialization method.

    Builds self.xml_map, the table which maps configuration XML
    file entries to state variables. Each key is an XML entry name
    and each value is a (variable name, type spec) pair. The type
    specs used in this table are plain type names ('int', 'str',
    'float', 'list', 'dict') and 'func:<method name>'. The
    'urlfilterre_value' entry maps to a tuple of three such pairs. """

    # For mapping xml entities to config entities
    entity_map = {
        'project_ignore': ('project_ignore', 'int'),
        'url': ('url', 'func:set_project'),
        'name': ('project', 'func:set_project'),
        'basedir': ('basedir', 'func:set_project'),
        'verbosity_level': ('verbosity', 'func:set_project'),
        'proxyserver': ('proxy', 'str'),
        'proxyuser': ('puser', 'str'),
        'proxypasswd': ('ppasswd', 'str'),
        'proxyport_value': ('proxyport', 'int'),
        'username': ('username', 'str'),
        'passwd': ('passwd', 'str'),
        'html_value': ('html', 'int'),
        'images_value': ('images', 'int'),
        'movies_value': ('movies', 'int'),
        'flash_value': ('flash', 'int'),
        'sounds_value': ('sounds', 'int'),
        'documents_value': ('documents', 'int'),
        'javascript_value': ('javascript', 'int'),
        'javaapplet_value': ('javaapplet', 'int'),
        'querylinks_value': ('getquerylinks', 'int'),
        'cache_status': ('pagecache', 'int'),
        'datacache_value': ('datacache', 'int'),
        'urllist': ('urlfile', 'str'),
        'urltreefile_status': ('urltreefile', 'int'),
        'archive_status': ('archive', 'int'),
        'archive_format': ('archformat', 'str'),
        'urlheaders_status': ('urlheaders', 'int'),
        'retries_value': ('retryfailed', 'int'),
        'imagelinks_value': ('getimagelinks', 'int'),
        'stylesheetlinks_value': ('getstylesheets', 'int'),
        'offset_start': ('linksoffsetstart', 'int'),
        'offset_end': ('linksoffsetend', 'int'),
        'fetchlevel_value': ('fetchlevel', 'int'),
        'extserverlinks_value': ('eserverlinks', 'int'),
        'extpagelinks_value': ('epagelinks', 'int'),
        'depth_value': ('depth', 'int'),
        'extdepth_value': ('extdepth', 'int'),
        'subdomain_value': ('subdomain', 'int'),
        'ignoretlds_value': ('ignoretlds', 'int'),
        'maxextservers_value': ('maxextservers', 'int'),
        'maxextdirs_value': ('maxextdirs', 'int'),
        'maxfiles_value': ('maxfiles', 'int'),
        'maxfilesize_value': ('maxfilesize', 'int'),
        'maxbytes_value': ('maxbytes', 'func:set_maxbytes'),
        'maxconnections_value': ('connections', 'int'),
        'maxbandwidth_value': ('bandwidthlimit', 'func:set_maxbandwidth'),
        'maxbandwidth_factor': ('throttlefactor', 'float'),
        'robots_value': ('robots', 'int'),
        'timelimit_value': ('timelimit', 'float'),
        'urlpriority': ('urlpriority', 'str'),
        'serverpriority': ('serverpriority', 'str'),
        'serverfilter': ('serverfilter', 'str'),
        'wordfilter': ('wordfilter', 'str'),
        'junkfilter_value': ('junkfilter', 'int'),
        'useragent_value': ('USER_AGENT', 'str'),
        'workers_status': ('usethreads', 'int'),
        'workers_size': ('threadpoolsize', 'int'),
        'workers_timeout': ('timeout', 'float'),
        'trackers_value': ('maxtrackers', 'int'),
        'trackers_timeout': ('fetchertimeout', 'float'),
        'fastmode_value': ('fastmode', 'int'),
        'savesessions_value': ('savesessions', 'int'),
        'timegap_value': ('sleeptime', 'float'),
        'timegap_random': ('randomsleep', 'int'),
        'connections_type': ('datamode', 'func:set_datamode'),
        'feature_name': ('htmlfeatures', 'func:set_parse_features'),
        'simulate_value': ('simulate', 'int'),
        'localise_value': ('localise', 'int'),
        'browsepage_value': ('browsepage', 'int'),
        'configfile_value': ('configfile', 'str'),
        'projectfile_value': ('projectfile', 'str'),
        'regexp_value': ('regexp', 'func:set_urlfilter'),
        'path_value': ('path', 'func:set_urlfilter'),
        'extension_value': ('extension', 'func:set_urlfilter'),
        'content_value': ('content', 'func:set_textfilter'),
        'meta_value': ('meta', 'func:set_textfilter'),
        'urlfilterre_value': (('inclfilter', 'list'),
                              ('exclfilter', 'list'),
                              ('allfilters', 'list')),
        'urlprioritydict_value': ('urlprioritydict', 'dict'),
        'serverprioritydict_value': ('serverprioritydict', 'dict'),
        'http_compress': ('httpcompress', 'int'),
        'plugin_name': ('plugins', 'func:set_plugin'),
    }
    self.xml_map = entity_map

def copy(self):
    """ Return this instance in a serializable form.

    Note that no new object is created: the progress object is
    reset to None on this very instance, and the instance itself
    is what gets returned. """

    # The progress object is treated as non-picklable, so drop it
    self.progressobj = None
    return self

def __getstate__(self):
    """ Overloaded __getstate__ method - hands back the instance
    itself as its state """

    return self

def __setstate__(self, state):
    """ Overloaded __setstate__ method - the passed state is
    deliberately ignored """
def assign_option(self, option_val, value, kwargs=None):
    """ Assigns values to internal variables using the option specified.

    option_val -- a (key, type spec) pair, as stored in self.xml_map.
                  The type spec is one of:
                    '<type>'        cast value with <type>, then store
                                    it as self[key]
                    'list:<type>'   cast value with <type> (no cast if
                                    <type> is empty) and append it to
                                    the list held in self[key]
                    'func:<name>'   call self.<name>(key, value, kwargs)
    value      -- the raw value. The strings 'true' / 'false' (in any
                  letter case) are turned into 1 / 0 before casting.
    kwargs     -- optional dictionary passed through to 'func:'
                  handlers. A fresh empty dictionary is used if omitted.

    Any exception raised while assigning is re-raised as
    HarvestManConfigError carrying the original message. """

    # Local import keeps this method self-contained
    import sys

    # Never share one default dictionary between calls: a handler
    # that modifies it would leak state into every later call.
    if kwargs is None:
        kwargs = {}

    # 'unicode' only exists on Python 2
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    try:
        if len(option_val) == 2:
            key, typ = option_val

            # Bug fix: If someone has set the
            # value to 'True'/'False' instead of
            # 1/0, convert to bool type first.
            if isinstance(value, text_types):
                lowered = value.lower()
                if lowered == 'true':
                    value = 1
                elif lowered == 'false':
                    value = 0

            if typ.find(':') == -1:
                # If type is not a list, the action is simple
                # assignment after type casting of the option.
                # NOTE(review): eval() is applied to the type name;
                # this presumes the spec comes from the internal
                # xml_map table and never from user input.
                fval = (eval(typ))(value)
                self[key] = fval
            else:
                # Type is of the form <type>:<actual type>
                typname, typ = typ.split(':')

                if typname == 'list':
                    # If type is list, the action is appending,
                    # after doing any type casting of the actual value
                    if typ:
                        fval = (eval(typ))(value)
                    else:
                        fval = value

                    var = self[key]
                    var.append(fval)
                elif typname == 'func':
                    # A missing handler makes getattr raise, which
                    # surfaces below as HarvestManConfigError
                    funktion = getattr(self, typ)
                    if funktion:
                        funktion(key, value, kwargs)
        else:
            # NOTE(review): the flattened source does not show which
            # 'if' this branch belongs to; it is taken to pair with
            # the length check above - confirm against upstream.
            # option_val is wrapped in a 1-tuple so that a tuple
            # valued option is formatted instead of being unpacked
            # as format arguments (which raised TypeError).
            error('Error in option value %s!' % (option_val,))
    except Exception:
        # sys.exc_info() is used instead of 'except ..., e' so that
        # the statement parses on every Python version
        e = sys.exc_info()[1]
        raise HarvestManConfigError("Error: " + str(e))

# NOTE: set_option is cut off at this point of the excerpt; its
# flattened text is kept exactly as found.
def set_option(self, option, value, negate=0): """ Sets the passed option in with its value as the passed value """ # find out if the option exists in the dictionary if option in self.xml_map.keys(): # if the option is a string or int or any # non-seq type # if value is an emptry string, return error
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?