⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hget.py

📁 Harvestman-最新版本
💻 PY
字号:
#! /usr/bin/env python
# -- coding: utf-8

""" Hget - Extensible, modular, multithreaded Internet
    downloader program in the spirit of wget, using
    HarvestMan codebase, with HTTP multipart support.

    Version      - 1.0 beta 1.

    Author: Anand B Pillai <abpillai at gmail dot com>

    HGET is free software. See the file LICENSE.txt for information
    on the terms and conditions of usage, and a DISCLAIMER of ALL WARRANTIES.

 Modification History

    Created: April 19 2007 Anand B Pillai

     April 20 2007 Added more command-line options   Anand
     April 24 2007 Made connector module to flush data  Anand
                   to tempfiles when run with hget.
     April 25 2007 Implementation of hget features is  Anand
                   completed!
     April 30 2007 Many fixes/enhancements to hget.
                   1. Reconnection of a lost connection does not
                   lose already downloaded data.
                   2. Closing of files of threads when download is
                   aborted.
                   3. Thread count shows current number of threads
                   which are actually doing downloads, reflecting
                   the activity.
                   4. Final printing of time taken, average bandwidth
                   and file size.
     May 10 2007   Added support for sf.net mirrors in multipart.
     Aug    2007   Fixed bugs in resetting state of various objects
                   when doing many multipart downloads one after other.

Copyright(C) 2007, Anand B Pillai

"""

import __init__
import sys
import os
import re
import shutil

from harvestman.lib import connector
from harvestman.lib import urlparser
from harvestman.lib import config
from harvestman.lib import logger
from harvestman.lib import datamgr
from harvestman.lib import urlthread
from harvestman.lib import mirrors

from harvestman.lib.methodwrapper import MethodWrapperMetaClass
# The star imports supply (at least) `objects`, the HGET_* status codes,
# CONNECTOR_DATA_MODE_FLUSH, GetMyTempDir and print_traceback used below.
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *

# Current dir - okay
from spider import HarvestMan

# NOTE(review): the module docstring says "1.0 beta 1" while VERSION is
# '0.1' - confirm which one is intended.
VERSION = '0.1'
MATURITY = 'beta 1'


class Hget(HarvestMan):
    """ Web getter class for HarvestMan which defines a wget like interface
    for downloading files on the command line with HTTP/1.0 Multipart support, mirror
    search and failover """

    __metaclass__ = MethodWrapperMetaClass

    USER_AGENT = "Python-urllib/1.16"

    def grab_url(self, url, filename=None):
        """ Download the given URL and save it to the (optional) filename.

        Returns one of the status codes HGET_DOWNLOAD_OK,
        HGET_DOWNLOAD_ERROR (the URL could not be parsed),
        HGET_KEYBOARD_INTERRUPT or HGET_FATAL_ERROR. Exceptions raised
        while downloading are reported on stdout, not propagated.
        """

        # If a filename is given, set outfile to it
        if filename:
            objects.config.hgetoutfile = filename

        # State left over from a previous download has to be reset:
        # progress object, thread pool multipart data, monitor and
        # mirror manager.
        objects.config.reset_progress()
        self._pool.reset_multipart_data()
        self._monitor.reset()
        mirrormgr = mirrors.HarvestManMirrorManager.getInstance()
        mirrormgr.reset()

        # Bind both names before entering the try block: the handlers
        # below test `urlobj`, and it would be unbound if creating the
        # connector itself failed or was interrupted.
        conn = None
        urlobj = None

        try:
            conn = connector.HarvestManUrlConnector()

            try:
                print('\nDownloading URL %s ...' % (url,))
                urlobj = urlparser.HarvestManUrl(url)
                # NOTE(review): the return value of url_to_file() is not
                # inspected; success is reported whenever no exception
                # is raised.
                conn.url_to_file(urlobj)

                if urlobj.trymultipart and mirrormgr.used:
                    # Print stats if mirrors were used...
                    mirrormgr.print_stats()

                return HGET_DOWNLOAD_OK
            except urlparser.HarvestManUrlError as e:
                print(str(e))
                print('Error: Invalid URL "%s"' % url)
                return HGET_DOWNLOAD_ERROR

        except KeyboardInterrupt:
            print('Caught keyboard interrupt...')
            if urlobj:
                self.clean_up(conn, urlobj)
            return HGET_KEYBOARD_INTERRUPT
        except Exception as e:
            print('Caught fatal error (%s): %s' % (e.__class__.__name__, str(e)))
            if urlobj:
                self.clean_up(conn, urlobj, e)
            print_traceback()
            return HGET_FATAL_ERROR

    def clean_up(self, conn, urlobj, exception=None):
        """ Perform clean up after any exception.

        conn      - the connector that was performing the download.
        urlobj    - the URL object that was being downloaded.
        exception - the exception that aborted the download, or None
                    when the download was interrupted by the user.
        """

        reader = conn.get_fileobj()
        if reader:
            reader.stop()

        if exception is None:
            print('\n\nDownload aborted by user interrupt.')

        # If flushdata mode, delete temporary files
        if objects.config.datamode == CONNECTOR_DATA_MODE_FLUSH:
            print('Cleaning up temporary files...')
            fname1 = conn.get_tmpfname()

            fullurl = urlobj.get_full_url()
            range_request = conn._headers.get('accept-ranges', '').lower()

            # If server supports range requests, then do not
            # clean up temp file, since we can start from where
            # we left off, if this file is requested again.
            if range_request != 'bytes':
                if fname1:
                    try:
                        os.remove(fname1)
                    except OSError as e:
                        print(e)
            elif fname1:
                # Dump an info file consisting of the header
                # information to a file, so that we can use it
                # to resume downloading from where we left off
                conn.write_url_info_file(fullurl)

            # Collect the temp file of every pool thread and close it
            # before the threads are ended.
            lfiles = []
            for t in self._pool.get_threads():
                fname = t.get_tmpfname()
                if fname:
                    lfiles.append(fname)
                t.close_file()

            print('Waiting for threads to finish...')
            self._pool.end_all_threads()

            # For currently running multipart download, clean
            # up all pieces since there is no guarantee that
            # the next request will be for the same number of
            # pieces of files, though the server supports
            # multipart downloads.
            for f in lfiles:
                if os.path.isfile(f):
                    try:
                        os.remove(f)
                    except (IOError, OSError) as e:
                        print('Error:  %s' % (e,))

            # The directory holding the pieces is deliberately left in
            # place: removing it with shutil.rmtree was disabled because
            # it raised a strange exception on Windows.
            print('Done')

        print('')

    def create_user_directories(self):
        """ Create the initial directories for Hget application.

        Unless objects.config.hgetnotemp is already set, the directory
        returned by GetMyTempDir() is created. If it cannot be created,
        hgetnotemp is set to True.
        """

        super(Hget, self).create_user_directories()

        if objects.config.hgetnotemp:
            return

        # Create temporary directory for saving files
        try:
            tmp = GetMyTempDir()
            if not os.path.isdir(tmp):
                os.makedirs(tmp)
        except Exception as e:
            # Best effort: a failure here must not abort start-up, but
            # it must also not leave hgetnotemp False without a usable
            # temporary directory.
            print('Warning: Could not create temporary directory: %s' % (e,))
            objects.config.hgetnotemp = True
            return

        # Could not make tempdir, set hgetnotemp to True
        if not os.path.isdir(tmp):
            objects.config.hgetnotemp = True

    def init(self):
        """ Initialize the Hget object's state: fixed configuration
        values, command-line options, logger, plugins and directories """

        # Some sites do not accept this user-agent; a Firefox string
        # such as "Firefox/2.0.0.8" can be substituted here if needed.
        objects.config.USER_AGENT = self.__class__.USER_AGENT

        objects.config.appname = 'Hget'
        objects.config.version = VERSION
        objects.config.maturity = MATURITY
        objects.config.nocrawl = True

        # Both are created in hget(), once the data manager is set up
        self._pool = None
        self._monitor = None

        # Get program options
        objects.config.parse_arguments()

        objects.config.threadpoolsize = 20
        # Set number of connections to twice the number of parts
        objects.config.connections = 2*objects.config.numparts
        # Socket timeout (presumably in seconds - confirm against the
        # connector module)
        objects.config.socktimeout = 30.0

        if objects.config.hgetverbose:
            objects.config.verbosity = logger.EXTRAINFO

        objects.logger.make_logger()
        objects.logger.setLogSeverity(objects.config.verbosity)

        self.process_plugins()
        self.register_common_objects()
        self.create_user_directories()

        # Set logging format to plain
        objects.logger.setPlainFormat()

    def _grab_urls_from_file(self, path):
        """ Download every URL listed in the text file `path`.

        Each non-blank line holds either a URL, or a URL and an output
        file name separated by a comma. Reading stops as soon as a
        download reports HGET_KEYBOARD_INTERRUPT.
        """

        # The context manager guarantees the file is closed, also when
        # the loop is left early.
        with open(path) as urlfile:
            for line in urlfile:
                line = line.strip()
                if not line:
                    # A blank line is not a URL; do not try to fetch it
                    continue

                # The line can optionally contain a different output
                # file name, in which case it should be separated by
                # commas...
                items = [item.strip() for item in line.split(',')]

                if len(items) == 2:
                    status = self.grab_url(items[0], items[1])
                elif len(items) == 1:
                    status = self.grab_url(items[0])
                else:
                    # More than one comma: cannot tell URL from file
                    # name, so report the line instead of dropping it
                    # silently.
                    print('Error: Ignoring malformed line "%s"' % (line,))
                    status = None

                if status == HGET_KEYBOARD_INTERRUPT:
                    break

                print('')

    def hget(self):
        """ Main method of Hget class. Downloads all URL(s) passed on the command
        line and saves them.

        Returns -1 if no URL or input file was given, 0 otherwise.
        """

        if not objects.config.projects:
            print('Error: No input URL/file given. Run with -h or no arguments to see usage.\n')
            return -1

        objects.datamgr.initialize()
        self._pool = objects.datamgr.get_url_threadpool()

        self._monitor = urlthread.HarvestManUrlThreadPoolMonitor(self._pool)
        self._monitor.start()

        try:
            for arg in objects.config.projects:
                url = arg['url']

                # Check if the argument is a file, if so
                # download URLs specified in the file.
                if os.path.isfile(url):
                    # Open it, read URL per line and schedule download
                    print('Input file %s found, scheduling download of URLs...' % url)
                    self._grab_urls_from_file(url)
                else:
                    self.grab_url(url)
        finally:
            # Stop the monitor even when a download raised, so that it
            # is never left running behind a propagating exception.
            self._monitor.stop()

        return 0

    def main(self):
        """ Main sub-routine. Returns the status of hget(): 0 on
        success, -1 if there was nothing to download """

        # Add help option if no arguments are given
        if len(sys.argv) < 2:
            sys.argv.append('-h')

        self.init()
        # `or 0` keeps the result an integer status even if hget()
        # yields None.
        return self.hget() or 0


def main():
    """ Main routine. Returns the status of Hget.main() """

    return Hget().main()


def run():
    """ Alternative entry point; creates an Hget object, runs it and
    returns its status """

    h = Hget()
    return h.main()


if __name__ == "__main__":
    sys.exit(main())

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -