config.py

来自「Harvestman-最新版本」· Python 代码 · 共 1,583 行 · 第 1/5 页

PY
1,583
字号
# -- coding: utf-8
""" config.py - Module to keep configuration options
    for HarvestMan program and its related modules. This
    module is part of the HarvestMan program.

    Author: Anand B Pillai <abpillai at gmail dot com>

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Jan 23 2007      Anand    Added code to check config in $HOME/.harvestman.
                              Added control-var for session saving feature.
    Feb 8 2007       Anand    Added config support for loading plugins. Added
                              code for swish-e plugin.
    Feb 11 2007      Anand    Re-wrote configuration parsing using generic option
                              parser.
    Mar 03 2007      Anand    Removed old option parsing dictionary and some
                              obsolete code. Added option for changing time gap
                              between downloads in config file. Removed command
                              line option for urllistfile/urltree file. Added
                              option to read multiple start URLs from a file.
                              Modified behaviour so that if a source of URL is
                              specified (command-line, URL file etc), any URLs
                              in config file is skipped. Set urlserver option
                              as default.
    Mar 06 2007      Anand    Reset default option to queue.
    April 11 2007    Anand    Renamed xmlparser module to configparser.
    April 20 2007    Anand    Added options for hget.
    May 7 2007       Anand    Modified option parsing for plugin option.
    Jun 2 2008       Anand    Fixed kludgy processing of <project> options
                              by using a function set_project. Added method
                              'add' for easy adding of project URLs in
                              interactive/programmatic crawling.

    Copyright (C) 2004 Anand B Pillai.
"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

# Usage banners, filled in with %-formatting using the keys
# 'program', 'appname', 'version' and 'maturity'.
# NOTE(review): any blank lines that originally separated the paragraphs
# inside these two literals could not be recovered - confirm against the
# upstream copy of this file.
USAGE1 = """\
 %(program)s [options] [optional URL]
 %(appname)s %(version)s %(maturity)s: An extensible, multithreaded web crawler.
Author: Anand B Pillai
Mail bug reports and suggestions to <abpillai at gmail dot com>."""

USAGE2 = """\
 %(program)s [options] URL(s) | file(s)
 %(appname)s %(version)s %(maturity)s: A multithreaded web downloader based on HarvestMan.
Author: Anand B Pillai
The program accepts URL(s) or an input file(s) containing a number of URLs,
one per line. If a file is passed as input, any other program option
passed is applied for every URL downloaded using the file.
Mail bug reports and suggestions to <abpillai at gmail dot com>."""

import os, sys
import re

from harvestman.lib import configparser
from harvestman.lib import options
from harvestman.lib import urlparser
from harvestman.lib import logger
from harvestman.lib import utils

from harvestman.lib.common.optionparser import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.common import hexit, test_sgmlop, logconsole, objects
from harvestman.lib.common.singleton import Singleton
from harvestman.lib.common.progress import TextProgress

# Template for a HarvestMan XML configuration file. Every %(name)s
# placeholder is a %-format key to be substituted when the template
# is rendered.
CONFIG_XML_TEMPLATE="""\
<?xml version="1.0" encoding="utf-8"?>
<HarvestMan xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://harvestmanontheweb.com/schemas/HarvestMan.xsd">

  <config version="3.0" xmlversion="1.0">

    %(@PROJECTS_ELEMENT)s

    <network>
      <proxy>
        <proxyserver>%(proxy)s</proxyserver>
        <proxyuser>%(puser)s</proxyuser>
        <proxypasswd>%(ppasswd)s</proxypasswd>
        <proxyport value="%(proxyport)s" />
      </proxy>
    </network>

    <download>
      <types>
        <html value="%(html)s" />
        <images value="%(images)s" />
        <movies value="%(movies)s" />
        <flash value="%(flash)s" />
        <sounds value="%(sounds)s" />
        <documents value="%(documents)s" />
        <javascript value="%(javascript)s" />
        <javaapplet value="%(javaapplet)s" />
        <querylinks value="%(getquerylinks)s" />
      </types>
      <cache status="%(pagecache)s">
        <datacache value="%(datacache)s" />
      </cache>
      <protocol>
        <http compress="%(httpcompress)s" />
      </protocol>
      <misc>
        <retries value="%(retryfailed)s" />
      </misc>
    </download>

    <control>
      <links>
        <imagelinks value="%(getimagelinks)s" />
        <stylesheetlinks value="%(getstylesheets)s" />
        <offset start="%(linksoffsetstart)s" end="%(linksoffsetend)s" />
      </links>
      <extent>
        <fetchlevel value="%(fetchlevel)s" />
        <depth value="%(depth)s" />
        <extdepth value="%(extdepth)s" />
        <subdomain value="%(subdomain)s" />
        <ignoretlds value="%(ignoretlds)s" />
      </extent>
      <limits>
        <maxfiles value="%(maxfiles)s" />
        <maxfilesize value="%(maxfilesize)s" />
        <maxbytes value="%(maxbytes)s" />
        <maxconnections value="%(connections)s" />
        <maxbandwidth value="%(bandwidthlimit)s" factor="%(throttlefactor)s" />
        <timelimit value="%(timelimit)s" />
      </limits>
      <rules>
        <robots value="%(robots)s" />
        <urlpriority>%(urlpriority)s</urlpriority>
        <serverpriority>%(serverpriority)s</serverpriority>
      </rules>
      <filters>
        <urlfilter>%(urlfilter)s</urlfilter>
        <serverfilter>%(serverfilter)s</serverfilter>
        <wordfilter>%(wordfilter)s</wordfilter>
        <junkfilter value="%(junkfilter)s" />
      </filters>
      <plugins>
        <plugin name="swish-e" enable="0" />
        <plugin name="simulator" enable="0" />
        <plugin name="lucene" enable="0" />
        <plugin name="userbrowse" enable="0" />
        <plugin name="spam" enable="0" />
        <plugin name="datafilter" enable="0" />
      </plugins>
    </control>
    <parser>
      <feature name='a' enable='1' />
      <feature name='base' enable='1' />
      <feature name='frame' enable='1' />
      <feature name='img' enable='1' />
      <feature name='form' enable='1' />
      <feature name='link' enable='1' />
      <feature name='body' enable='1' />
      <feature name='script' enable='1' />
      <feature name='applet' enable='1' />
      <feature name='area' enable='1' />
      <feature name='meta' enable='1' />
      <feature name='embed' enable='1' />
      <feature name='object' enable='1' />
      <feature name='option' enable='0' />
    </parser>

    <system>
      <useragent value="%(USER_AGENT)s" />
      <workers status="%(usethreads)s" size="%(threadpoolsize)s" timeout="%(timeout)s" />
      <trackers value="%(maxtrackers)s" timeout="%(fetchertimeout)s" />
      <timegap value="%(sleeptime)s" random="%(randomsleep)s" />
      <connections type="%(datamodename)s" />
    </system>

    <files>
      <urltreefile status="%(urltreefile)s" />
      <archive status="%(archive)s" format="%(archformat)s" />
      <urlheaders status="%(urlheaders)s" />
      <localise value="%(localise)s" />
    </files>

    <display>
      <browsepage value="%(browsepage)s"/>
    </display>

  </config>

</HarvestMan>
"""

# Matches a 'name=value' token with no embedded whitespace.
# re.LOCALE used to be OR-ed in here together with re.UNICODE; the two
# flags contradict each other and Python 3.6+ rejects re.LOCALE on str
# patterns with ValueError, so only re.UNICODE is kept.
param_re = re.compile(r'\S+=\S+', re.UNICODE)
# One or more decimal digits.
int_re = re.compile(r'\d+')
# Digits, a dot, then optional further digits.
float_re = re.compile(r'\d+\.\d*')
# A number optionally followed by a k/kb, m/mb or g/gb suffix (any case).
maxbytes_re = re.compile(r'(\d+\s*)(kb?|mb?|gb?)?$', re.IGNORECASE)
# A number optionally followed by k, m or g, each optionally followed by
# 'b' or 'bps' (any case).
maxbw_re = re.compile(r'(\d+\s*)(k(b|bps)?|m(b|bps)?|g(b|bps)?)?$', re.IGNORECASE)
# Letters, digits, '-', '_' and '.' only. re.LOCALE dropped for the same
# reason as in param_re above.
projectname_re = re.compile(r'^[a-zA-Z0-9-_\.]+$', re.IGNORECASE|re.UNICODE)

# This will contain the absolute path of parent-folder of
# harvestman installation...
module_path = ''


class HarvestManConfigError(Exception):
    """ Exception class for HarvestManStateObject """
    pass


# NOTE(review): this binding is replaced immediately by the class statement
# below, so it has no lasting effect. It may have been meant as an alias
# under a different name - confirm before removing.
HarvestManStateObject = HarvestManConfigError


class HarvestManStateObject(dict, Singleton):
    """ Configuration class for HarvestMan framework and applications
    derived from it. A single instance of this class keeps most of the
    shared state and configuration params of HarvestMan """

    klassmap = {}
    alias = 'config'

    def __init__(self):
        """ Initialize dictionary with the most common settings and their values.

        Stores the parent of the directory containing this module in the
        module-level 'module_path', runs the initialization steps
        (_init1, _init2, set_system_params, set_user_params) and finally
        initializes the underlying dict. """

        # Calculate the module path
        mydir = os.path.dirname(globals()["__file__"])
        global module_path
        module_path = os.path.dirname(mydir)

        self._init1()
        self._init2()
        self.set_system_params()
        self.set_user_params()

        super(HarvestManStateObject, self).__init__()

    def _init1(self):
        """ First level initialization method. Initializes most of the state variables """

        self.items_to_skip = []
        # USER-AGENT string
        # Version for harvestman
        # NOTE(review): version/maturity here ('2.0', "alpha 1") differ from
        # the module-level __version__ ('2.0 b1') - confirm which is current.
        self.version = '2.0'
        # Maturity for harvestman
        self.maturity = "alpha 1"
        # Single appname property for hget/harvestman
        self.appname = 'HarvestMan'
        #self.USER_AGENT = 'v'.join((self.appname + ' ', self.version))
        self.USER_AGENT = '%s/%s (+http://code.google.com/p/harvestman-crawler/wiki/bot)' % (self.appname, self.version)
        self.progname = "".join((self.appname, " ", self.version, " ", self.maturity))
        self.program = sys.argv[0]
        self.url = ''
        self.project = ''
        self.project_ignore = 0
        self.basedir = ''
        # A list which will hold dicts of (url, name, basedir, verbosity) for all projects
        self.projects = []
        self.urlmap = {}
        self.archive = 0
        self.archformat = 'bzip'
        self.urlheaders = 1
        self.configfile = 'config.xml'
        self.projectfile = ''
        self.proxy = ''
        self.puser = ''
        self.ppasswd = ''
        self.proxyenc = 1
        self.username = ''
        self.passwd = ''
        self.proxyport = 80
        self.errorfile = 'errors.log'
        self.localise = 2
        self.images = 1
        self.movies = 0
        self.flash = 0
        self.sounds = 0
        self.documents = 1
        self.depth = 10
        self.html = 1
        self.robots = 1
        # self.eserverlinks=0
        # self.epagelinks=1
        self.fastmode = 1
        self.usethreads = 1
        self.maxfiles = 5000
        self.maxbytes = 0
        # self.maxextservers=0
        # self.maxextdirs=0
        self.retryfailed = 1
        self.extdepth = 0
        self.maxtrackers = 4
        # Url filter object
        self.urlfilter = None
        # To prevent config from breaking...
        self.serverfilter = ''
        self.wordfilter = ''
        self.regexurlfilters = []
        self.pathurlfilters = []
        self.extnurlfilters = []
        # Text filter object
        self.textfilter = None
        self.contentfilters = []
        self.metafilters = []
        self.inclfilter = []
        self.exclfilter = []
        self.allfilters = []
        self.urlpriority = ''
        self.serverpriority = ''
        self.urlprioritydict = {}
        self.serverprioritydict = {}
        self.verbosity = logger.INFO

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?