# -*- coding: utf-8 -*-
""" mirrors.py - Module which provides support for managing
mirrors for domains, for hget.
Author - Anand B Pillai <abpillai at gmail dot com>
Created, Anand B Pillai 14/08/07.
Modified Anand B Pillai 10/10/07 Added file mirror support
Modified Anand B Pillai 12/11/07 Added logic to retry mirrors which
did not fail with fatal error.
Replaced duplicate mirroring code with
HarvestManMirror class.
Modified Anand B Pillai 6/02/08 Added mirror search logic (Successfully
download tested apache ant binary using
findfiles.com mirrors).
Copyright (C) 2007 Anand B Pillai.
"""
import random
import re
from pyparsing import *
from harvestman.lib import urlparser
from harvestman.lib import connector
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.singleton import Singleton
def test_parse():
    """Ad-hoc manual test hook.

    NOTE(review): the original body printed an undefined name
    ``urls`` and raised NameError whenever called.  Left as a no-op
    stub until real test data is wired in.
    """
    pass
class HTMLTableParser(object):
    """Very loose pyparsing-based scanner for HTML <table> elements."""

    def __init__(self):
        # <table attr=value ...> opening tag, any number of
        # <tr>...</tr> rows, then everything up to </table>.
        attribute = Word(alphas) + Literal("=") + Word(alphanums + "%" + '"')
        open_tag = Literal("<table") + ZeroOrMore(attribute) + Literal(">")
        row = Literal("<tr>") + SkipTo(Literal("</tr>"))
        self.grammar = open_tag + ZeroOrMore(row) + SkipTo(Literal("</table>"))

    def parse(self, data):
        """Print every (tokens, start, end) match found in *data*."""
        for match in self.grammar.scanString(data):
            print(match)
class HarvestManMirror(object):
    """ Class representing a URL mirror """

    def __init__(self, url, absolute=False):
        # url      - base URL of the mirror (string)
        # absolute - if True, *url* points directly at the mirrored
        #            file; otherwise it is treated as a directory URL
        #            under which relative paths are resolved.
        self.url = url
        self.absolute = absolute
        # Url object
        self.urlobj = urlparser.HarvestManUrl(self.url)
        # By default mirror URLs are assumed to be directory URLs.
        # if this is an absolute file URL, then don't do anything
        if not absolute:
            self.urlobj.set_directory_url()
        # Reliability factor - FUTURE
        self.reliability = 1.0
        # Geo location - FUTURE
        self.geoloc = 0
        # Count of number of times this mirror was used
        self.usecnt = 0

    def __str__(self):
        return self.url

    def __repr__(self):
        return str(self)

    def calc_relative_path(self, urlobj):
        """ Return the path of *urlobj* relative to this mirror's root """

        relpath = urlobj.get_relative_url()
        # Global configuration object
        if objects.config.mirroruserelpath:
            if objects.config.mirrorpathindex:
                items = relpath.split('/')
                # Trim again....
                items = [item for item in items if item != '']
                # BUGFIX: was 'cfg.mirrorpathindex' - 'cfg' is not
                # defined in this scope; the configuration object is
                # 'objects.config', as used in the test just above.
                relpath = '/'.join(items[objects.config.mirrorpathindex:])
        else:
            # Do not use relative paths, return the filename
            # of the URL...
            relpath = urlobj.get_filename()

        if is_sourceforge_url(urlobj):
            # NOTE(review): concatenated without a separator - appears
            # to rely on relpath carrying a leading '/'; confirm
            # against the sourceforge mirror layout.
            relpath = 'sourceforge' + relpath
        else:
            # ROBUSTNESS: startswith instead of relpath[0] so an empty
            # relpath does not raise IndexError.
            if relpath.startswith('/'):
                relpath = relpath[1:]

        return relpath

    def mirror_url(self, urlobj):
        """ Return mirror URL for the given URL """

        if not self.absolute:
            relpath = self.calc_relative_path(urlobj)
            newurlobj = urlparser.HarvestManUrl(relpath, baseurl=self.urlobj)
        else:
            newurlobj = self.urlobj

        # Remember the original URL object on the mirror URL
        newurlobj.mirror_url = urlobj
        # Set another attribute indicating the mirror is different
        newurlobj.mirrored = True
        newurlobj.trymultipart = True
        self.usecnt += 1

        return newurlobj

    def new_mirror_url(self, urlobj):
        """ Return new mirror URL for an already mirrored URL """

        # Typically called when errors are seen with mirrors
        orig_urlobj = urlobj.mirror_url
        newurlobj = self.mirror_url(orig_urlobj)
        # Carry over download state from the failed mirror URL
        newurlobj.clength = urlobj.clength
        newurlobj.range = urlobj.range
        newurlobj.mindex = urlobj.mindex
        # BUGFIX: removed the extra 'self.usecnt += 1' here -
        # mirror_url() above already counts the use, so the counter
        # was incremented twice per retry.
        return newurlobj
class HarvestManMirrorSearch(object):
    """ Search mirror sites for files """

    # Mirror sites and their search URL templates (the filename is
    # substituted into the %s slot).
    sites = {'filewatcher': ('http://www.filewatcher.com/_/?q=%s',),
             'freewareweb': ('http://www.freewareweb.com/cgi-bin/ftpsearch.pl?q=%s',),
             'filesearching': ('http://www.filesearching.com/cgi-bin/s?q=%s&l=en',),
             'findfiles' : ('http://www.findfiles.com/list.php?string=%s&db=Mirrors&match=Exact&search=',) }

    # Strips single and double quotes from scraped attribute values
    quotes_re = re.compile(r'[\'\"]')
    # Template matching a filename optionally followed by trailing URL
    # characters (query strings etc.)
    filename_re = '%s[\?a-zA-Z0-9-_]*'

    def __init__(self):
        # Search sites already tried in this session
        self.tried = []
        # Search sites whose scraping grammars currently work
        self.valid = ('findfiles',)
        # Last set of mirrors found
        self.cache = []

    def make_urls(self, grammar, data, filename):
        """ Scan *data* with the pyparsing *grammar* and return a list
        of unique HTTP/HTTPS mirror URLs pointing at *filename* """

        urls = []
        # BUGFIX: escape regex metacharacters in the filename (e.g. the
        # '.' in 'ant.zip') before embedding it in the pattern.
        rc = re.compile(self.filename_re % re.escape(filename))

        for match in grammar.scanString(data):
            # scanString yields (tokens, start, end) tuples
            if not match: continue
            if len(match) != 3: continue
            if len(match[0])==0: continue
            if len(match[0][-1])==0: continue

            url = self.quotes_re.sub('', match[0][-1])
            # Currently we cannot support FTP mirror URLs
            if not (url.startswith('http://') or url.startswith('https://')):
                continue

            if url.endswith(filename):
                candidate = url
            elif rc.search(url):
                # Prune any characters after the filename
                idx = url.find(filename)
                candidate = url[:idx + len(filename)] if idx != -1 else None
            else:
                candidate = None

            # BUGFIX: deduplicate on the final (pruned) URL rather
            # than the raw scraped one, so pruned variants of the same
            # mirror URL are not appended twice.
            if candidate and candidate not in urls:
                urls.append(candidate)

        return urls

    def search_filewatcher(self, filename):
        """ Search www.filewatcher.com for mirrors of *filename* """

        # Note: this grammar could change if the site changes its templates
        grammar = Literal("<p>") + Literal("<big>") + Literal("<a") + \
                  Literal("href") + Literal("=") + SkipTo(Literal(">"))

        search_url = self.sites['filewatcher'][0] % filename
        conn = connector.HarvestManUrlConnector()
        data = conn.get_url_data(search_url)

        return self.make_urls(grammar, data, filename)

    def search_findfiles(self, filename):
        """ Search www.findfiles.com for mirrors of *filename* """

        print('Searching http://www.findfiles.com for mirrors of file %s...' % filename)
        # Note: this grammar could change if the site changes its templates
        content1 = Literal("<h1") + SkipTo(Literal("Advanced Search"))
        content2 = Literal("<a") + Literal("href") + Literal("=") + SkipTo(Literal(">"))

        search_url = self.sites['findfiles'][0] % filename
        conn = connector.HarvestManUrlConnector()
        data = conn.get_url_data(search_url)

        matches = []
        for match in content1.scanString(data):
            matches.append(match)

        # There will be only one match
        if matches:
            data = matches[0][0][-1]
            # Narrow down to the first <table>...</table> element
            idx1 = data.find('<table')
            if idx1 != -1:
                idx2 = data.find('</table>', idx1)
                if idx2 != -1:
                    data = data[idx1:idx2+8]
                    return self.make_urls(content2, data, filename)

        return []

    def search_freewareweb(self, filename):
        # TODO
        pass

    def search_filesearching(self, filename):
        # TODO
        pass

    def can_search(self):
        """ Return whether we can search for new mirrors """

        # This queries whether we have used up all the mirror search
        # sites: True until every site in self.valid has been tried.
        self.tried.sort()
        l = list(self.valid)
        l.sort()
        return not (self.tried == l)

    def search(self, urlobj):
        """ Search all untried mirror sites for mirrors of the file
        pointed to by *urlobj*.  Returns a list of HarvestManMirror
        objects, or None if nothing was found """

        filename = urlobj.get_filename()
        print('Searching mirrors for %s...' % filename)

        # Searching in other mirror search sites returns mostly
        # FTP urls. We can currently do mirror downloads only
        # for HTTP URLs.
        for item in self.valid:
            if item not in self.tried:
                func = getattr(self, 'search_' + item)
                self.tried.append(item)
                mirror_urls = func(filename)
                if mirror_urls:
                    mirrors = [HarvestManMirror(url, True) for url in mirror_urls]
                    self.cache = mirrors
                    return mirrors

        # No untried site produced any mirrors
        return None
class HarvestManMirrorManager(Singleton):
""" Mirror manager class for HarvestMan/Hget """
# Sourceforge mirror information in the form
# of (servername, Place, Country) tuples.
sf_mirror_info = (('easynews', 'Arizona, USA'),
('internap', 'CA, USA'),
('superb-east','Virginia, USA'),
('superb-west','Washington, USA'),
('ufpr', 'Curitiba, Brazil'),
('belnet', 'Brussels, Belgium'),
('switch', 'Laussane, Switzerland'),
('mesh', 'Deusseldorf, Germany'),
('ovh', 'Paris, France'),
('dfn', 'Berlin, Germany'),
('heanet', 'Dublin, Ireland'),
('garr', 'Bologna, Italy'),
    )
    # NOTE(review): the original file is truncated at this point in
    # this copy (trailing site chrome removed).  The remaining
    # sf_mirror_info entries and the HarvestManMirrorManager methods
    # are missing and must be restored from the upstream source.