config.py
来自「Harvestman-最新版本」· Python 代码 · 共 1,583 行 · 第 1/5 页
PY
1,583 行
# -*- coding: utf-8 -*-
""" config.py - Module to keep configuration options
    for HarvestMan program and its related modules.
    This module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Jan 23 2007   Anand   Added code to check config in $HOME/.harvestman.
                          Added control-var for session saving feature.
    Feb 8 2007    Anand   Added config support for loading plugins.
                          Added code for swish-e plugin.
    Feb 11 2007   Anand   Re-wrote configuration parsing using generic
                          option parser.
    Mar 03 2007   Anand   Removed old option parsing dictionary and some
                          obsolete code. Added option for changing time gap
                          between downloads in config file. Removed command
                          line option for urllistfile/urltree file. Added
                          option to read multiple start URLs from a file.
                          Modified behaviour so that if a source of URL is
                          specified (command-line, URL file etc), any URLs
                          in config file is skipped. Set urlserver option
                          as default.
    Mar 06 2007   Anand   Reset default option to queue.
    April 11 2007 Anand   Renamed xmlparser module to configparser.
    April 20 2007 Anand   Added options for hget.
    May 7 2007    Anand   Modified option parsing for plugin option.
    Jun 2 2008    Anand   Fixed kludgy processing of <project> options by
                          using a function set_project. Added method 'add'
                          for easy adding of project URLs in
                          interactive/programmatic crawling.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

# NOTE(review): the line breaks inside the string literals below (USAGE1,
# USAGE2, CONFIG_XML_TEMPLATE) were reconstructed from a whitespace-collapsed
# copy of this file -- verify the exact layout against the upstream source.

# Usage banner for the crawler; filled in with %(program)s, %(appname)s,
# %(version)s and %(maturity)s.
USAGE1 = """\
 %(program)s [options] [optional URL]

%(appname)s %(version)s %(maturity)s: An extensible, multithreaded web crawler.
Author: Anand B Pillai

Mail bug reports and suggestions to <abpillai at gmail dot com>."""

# Usage banner for the downloader variant; same substitution keys as USAGE1.
USAGE2 = """\
 %(program)s [options] URL(s) | file(s)

%(appname)s %(version)s %(maturity)s: A multithreaded web downloader based on HarvestMan.
Author: Anand B Pillai

The program accepts URL(s) or an input file(s) containing a number of URLs,
one per line. If a file is passed as input, any other program option
passed is applied for every URL downloaded using the file.

Mail bug reports and suggestions to <abpillai at gmail dot com>."""

import os
import sys
import re

from harvestman.lib import configparser
from harvestman.lib import options
from harvestman.lib import urlparser
from harvestman.lib import logger
from harvestman.lib import utils

from harvestman.lib.common.optionparser import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.common import hexit, test_sgmlop, logconsole, objects
from harvestman.lib.common.singleton import Singleton
from harvestman.lib.common.progress import TextProgress

# Template for a HarvestMan XML configuration file. Each %(name)s placeholder
# is a %-format mapping key to be supplied when the template is rendered.
CONFIG_XML_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<HarvestMan xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://harvestmanontheweb.com/schemas/HarvestMan.xsd">
  <config version="3.0" xmlversion="1.0">
    %(@PROJECTS_ELEMENT)s
    <network>
      <proxy>
        <proxyserver>%(proxy)s</proxyserver>
        <proxyuser>%(puser)s</proxyuser>
        <proxypasswd>%(ppasswd)s</proxypasswd>
        <proxyport value="%(proxyport)s" />
      </proxy>
    </network>
    <download>
      <types>
        <html value="%(html)s" />
        <images value="%(images)s" />
        <movies value="%(movies)s" />
        <flash value="%(flash)s" />
        <sounds value="%(sounds)s" />
        <documents value="%(documents)s" />
        <javascript value="%(javascript)s" />
        <javaapplet value="%(javaapplet)s" />
        <querylinks value="%(getquerylinks)s" />
      </types>
      <cache status="%(pagecache)s">
        <datacache value="%(datacache)s" />
      </cache>
      <protocol>
        <http compress="%(httpcompress)s" />
      </protocol>
      <misc>
        <retries value="%(retryfailed)s" />
      </misc>
    </download>
    <control>
      <links>
        <imagelinks value="%(getimagelinks)s" />
        <stylesheetlinks value="%(getstylesheets)s" />
        <offset start="%(linksoffsetstart)s" end="%(linksoffsetend)s" />
      </links>
      <extent>
        <fetchlevel value="%(fetchlevel)s" />
        <depth value="%(depth)s" />
        <extdepth value="%(extdepth)s" />
        <subdomain value="%(subdomain)s" />
        <ignoretlds value="%(ignoretlds)s" />
      </extent>
      <limits>
        <maxfiles value="%(maxfiles)s" />
        <maxfilesize value="%(maxfilesize)s" />
        <maxbytes value="%(maxbytes)s" />
        <maxconnections value="%(connections)s" />
        <maxbandwidth value="%(bandwidthlimit)s" factor="%(throttlefactor)s" />
        <timelimit value="%(timelimit)s" />
      </limits>
      <rules>
        <robots value="%(robots)s" />
        <urlpriority>%(urlpriority)s</urlpriority>
        <serverpriority>%(serverpriority)s</serverpriority>
      </rules>
      <filters>
        <urlfilter>%(urlfilter)s</urlfilter>
        <serverfilter>%(serverfilter)s</serverfilter>
        <wordfilter>%(wordfilter)s</wordfilter>
        <junkfilter value="%(junkfilter)s" />
      </filters>
      <plugins>
        <plugin name="swish-e" enable="0" />
        <plugin name="simulator" enable="0" />
        <plugin name="lucene" enable="0" />
        <plugin name="userbrowse" enable="0" />
        <plugin name="spam" enable="0" />
        <plugin name="datafilter" enable="0" />
      </plugins>
    </control>
    <parser>
      <feature name='a' enable='1' />
      <feature name='base' enable='1' />
      <feature name='frame' enable='1' />
      <feature name='img' enable='1' />
      <feature name='form' enable='1' />
      <feature name='link' enable='1' />
      <feature name='body' enable='1' />
      <feature name='script' enable='1' />
      <feature name='applet' enable='1' />
      <feature name='area' enable='1' />
      <feature name='meta' enable='1' />
      <feature name='embed' enable='1' />
      <feature name='object' enable='1' />
      <feature name='option' enable='0' />
    </parser>
    <system>
      <useragent value="%(USER_AGENT)s" />
      <workers status="%(usethreads)s" size="%(threadpoolsize)s" timeout="%(timeout)s" />
      <trackers value="%(maxtrackers)s" timeout="%(fetchertimeout)s" />
      <timegap value="%(sleeptime)s" random="%(randomsleep)s" />
      <connections type="%(datamodename)s" />
    </system>
    <files>
      <urltreefile status="%(urltreefile)s" />
      <archive status="%(archive)s" format="%(archformat)s" />
      <urlheaders status="%(urlheaders)s" />
      <localise value="%(localise)s" />
    </files>
    <display>
      <browsepage value="%(browsepage)s"/>
    </display>
  </config>
</HarvestMan>"""

# Pre-compiled patterns used when parsing option values.
# NOTE(review): combining re.LOCALE with re.UNICODE on a str pattern is
# rejected by Python 3's re module; the flags are kept as found because this
# module targets the Python 2 runtime -- revisit when porting.
param_re = re.compile(r'\S+=\S+', re.LOCALE|re.UNICODE)
int_re = re.compile(r'\d+')
float_re = re.compile(r'\d+\.\d*')
# A number optionally followed by a k/kb, m/mb or g/gb suffix.
maxbytes_re = re.compile(r'(\d+\s*)(kb?|mb?|gb?)?$', re.IGNORECASE)
# As above, additionally accepting kbps/mbps/gbps suffixes.
maxbw_re = re.compile(r'(\d+\s*)(k(b|bps)?|m(b|bps)?|g(b|bps)?)?$', re.IGNORECASE)
projectname_re = re.compile(r'^[a-zA-Z0-9-_\.]+$', re.IGNORECASE|re.UNICODE|re.LOCALE)

# This will contain the absolute path of parent-folder of
# harvestman installation...
module_path = ''


class HarvestManConfigError(Exception):
    """ Exception class for HarvestManStateObject """

    pass


class HarvestManStateObject(dict, Singleton):
    """ Configuration class for HarvestMan framework and applications
    derived from it. A single instance of this class keeps most of the
    shared state and configuration params of HarvestMan """

    klassmap = {}
    alias = 'config'

    def __init__(self):
        """ Initialize dictionary with the most common
        settings and their values.

        Also records, in the module-level ``module_path``, the parent of
        the directory that contains this file. """

        global module_path

        # Calculate the module path
        mydir = os.path.dirname(__file__)
        module_path = os.path.dirname(mydir)

        self._init1()
        self._init2()
        self.set_system_params()
        self.set_user_params()

        # dict.__init__ without arguments leaves existing entries alone,
        # so running it after the settings are populated is harmless.
        super(HarvestManStateObject, self).__init__()

    def _init1(self):
        """ First level initialization method. Initializes
        most of the state variables """

        self.items_to_skip=[]
        # USER-AGENT string
        # Version for harvestman
        self.version='2.0'
        # Maturity for harvestman
        self.maturity="alpha 1"
        # Single appname property for hget/harvestman
        self.appname='HarvestMan'
        #self.USER_AGENT = 'v'.join((self.appname + ' ', self.version))
        self.USER_AGENT = '%s/%s (+http://code.google.com/p/harvestman-crawler/wiki/bot)' %(self.appname,self.version)
        self.progname="".join((self.appname," ",self.version," ",self.maturity))
        self.program = sys.argv[0]
        self.url=''
        self.project=''
        self.project_ignore = 0
        self.basedir=''
        # A list which will hold dicts of (url, name, basedir, verbosity) for all projects
        self.projects = []
        self.urlmap = {}
        self.archive = 0
        self.archformat = 'bzip'
        self.urlheaders = 1
        self.configfile = 'config.xml'
        self.projectfile = ''
        self.proxy=''
        self.puser=''
        self.ppasswd=''
        self.proxyenc=1
        self.username=''
        self.passwd=''
        self.proxyport=80
        self.errorfile='errors.log'
        self.localise=2
        self.images=1
        self.movies=0
        self.flash=0
        self.sounds=0
        self.documents=1
        self.depth=10
        self.html=1
        self.robots=1
        # self.eserverlinks=0
        # self.epagelinks=1
        self.fastmode=1
        self.usethreads=1
        self.maxfiles=5000
        self.maxbytes=0
        # self.maxextservers=0
        # self.maxextdirs=0
        self.retryfailed=1
        self.extdepth=0
        self.maxtrackers=4
        # Url filter object
        self.urlfilter = None
        # To prevent config from breaking...
        self.serverfilter=''
        self.wordfilter=''
        self.regexurlfilters = []
        self.pathurlfilters = []
        self.extnurlfilters = []
        # Text filter object
        self.textfilter = None
        self.contentfilters = []
        self.metafilters = []
        self.inclfilter=[]
        self.exclfilter=[]
        self.allfilters=[]
        self.urlpriority = ''
        self.serverpriority = ''
        self.urlprioritydict = {}
        self.serverprioritydict = {}
        self.verbosity=logger.INFO
        # NOTE(review): this excerpt ends here; the method may continue
        # in the full file -- do not treat the list above as complete.
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?