# -*- coding: utf-8 -*-
""" mirrors.py - Module which provides support for managing
mirrors for domains, for hget.
Author - Anand B Pillai <abpillai at gmail dot com>
Created, Anand B Pillai 14/08/07.
Modified Anand B Pillai 10/10/07 Added file mirror support
Modified Anand B Pillai 12/11/07 Added logic to retry mirrors which
did not fail with fatal error.
Replaced duplicate mirroring code with
HarvestManMirror class.
Modified Anand B Pillai 6/02/08 Added mirror search logic (Successfully
download tested apache ant binary using
findfiles.com mirrors).
Copyright (C) 2007 Anand B Pillai.
"""
import random
import re
from pyparsing import *
from harvestman.lib import urlparser
from harvestman.lib import connector
from harvestman.lib.common.common import *
from harvestman.lib.common.macros import *
from harvestman.lib.common.singleton import Singleton
def test_parse():
    """Ad-hoc manual test hook.

    NOTE(review): the original body printed an undefined name
    ``urls`` and raised NameError whenever called.  Left as a no-op
    stub until real test data is wired in.
    """
    pass
class HTMLTableParser(object):
    """Very loose pyparsing-based scanner for HTML <table> elements."""

    def __init__(self):
        # <table attr=value ...> opening tag, any number of
        # <tr>...</tr> rows, then everything up to </table>.
        attribute = Word(alphas) + Literal("=") + Word(alphanums + "%" + '"')
        open_tag = Literal("<table") + ZeroOrMore(attribute) + Literal(">")
        row = Literal("<tr>") + SkipTo(Literal("</tr>"))
        self.grammar = open_tag + ZeroOrMore(row) + SkipTo(Literal("</table>"))

    def parse(self, data):
        """Print every (tokens, start, end) match found in *data*."""
        for match in self.grammar.scanString(data):
            print(match)
class HarvestManMirror(object):
    """ Class representing a URL mirror """

    def __init__(self, url, absolute=False):
        # url      - base URL of the mirror (string)
        # absolute - if True, *url* points directly at the mirrored
        #            file; otherwise it is treated as a directory URL
        #            under which relative paths are resolved.
        self.url = url
        self.absolute = absolute
        # Url object
        self.urlobj = urlparser.HarvestManUrl(self.url)
        # By default mirror URLs are assumed to be directory URLs.
        # if this is an absolute file URL, then don't do anything
        if not absolute:
            self.urlobj.set_directory_url()
        # Reliability factor - FUTURE
        self.reliability = 1.0
        # Geo location - FUTURE
        self.geoloc = 0
        # Count of number of times this mirror was used
        self.usecnt = 0

    def __str__(self):
        return self.url

    def __repr__(self):
        return str(self)

    def calc_relative_path(self, urlobj):
        """ Return the path of *urlobj* relative to this mirror's root """

        relpath = urlobj.get_relative_url()
        # Global configuration object
        if objects.config.mirroruserelpath:
            if objects.config.mirrorpathindex:
                items = relpath.split('/')
                # Trim again....
                items = [item for item in items if item != '']
                # BUGFIX: was 'cfg.mirrorpathindex' - 'cfg' is not
                # defined in this scope; the configuration object is
                # 'objects.config', as used in the test just above.
                relpath = '/'.join(items[objects.config.mirrorpathindex:])
        else:
            # Do not use relative paths, return the filename
            # of the URL...
            relpath = urlobj.get_filename()

        if is_sourceforge_url(urlobj):
            # NOTE(review): concatenated without a separator - appears
            # to rely on relpath carrying a leading '/'; confirm
            # against the sourceforge mirror layout.
            relpath = 'sourceforge' + relpath
        else:
            # ROBUSTNESS: startswith instead of relpath[0] so an empty
            # relpath does not raise IndexError.
            if relpath.startswith('/'):
                relpath = relpath[1:]

        return relpath

    def mirror_url(self, urlobj):
        """ Return mirror URL for the given URL """

        if not self.absolute:
            relpath = self.calc_relative_path(urlobj)
            newurlobj = urlparser.HarvestManUrl(relpath, baseurl=self.urlobj)
        else:
            newurlobj = self.urlobj

        # Remember the original URL object on the mirror URL
        newurlobj.mirror_url = urlobj
        # Set another attribute indicating the mirror is different
        newurlobj.mirrored = True
        newurlobj.trymultipart = True
        self.usecnt += 1

        return newurlobj

    def new_mirror_url(self, urlobj):
        """ Return new mirror URL for an already mirrored URL """

        # Typically called when errors are seen with mirrors
        orig_urlobj = urlobj.mirror_url
        newurlobj = self.mirror_url(orig_urlobj)
        # Carry over download state from the failed mirror URL
        newurlobj.clength = urlobj.clength
        newurlobj.range = urlobj.range
        newurlobj.mindex = urlobj.mindex
        # BUGFIX: removed the extra 'self.usecnt += 1' here -
        # mirror_url() above already counts the use, so the counter
        # was incremented twice per retry.
        return newurlobj
class HarvestManMirrorSearch(object):
    """ Search mirror sites for files """

    # Mirror sites and their search URL templates (the filename is
    # substituted into the %s slot).
    sites = {'filewatcher': ('http://www.filewatcher.com/_/?q=%s',),
             'freewareweb': ('http://www.freewareweb.com/cgi-bin/ftpsearch.pl?q=%s',),
             'filesearching': ('http://www.filesearching.com/cgi-bin/s?q=%s&l=en',),
             'findfiles' : ('http://www.findfiles.com/list.php?string=%s&db=Mirrors&match=Exact&search=',) }

    # Strips single and double quotes from scraped attribute values
    quotes_re = re.compile(r'[\'\"]')
    # Template matching a filename optionally followed by trailing URL
    # characters (query strings etc.)
    filename_re = '%s[\?a-zA-Z0-9-_]*'

    def __init__(self):
        # Search sites already tried in this session
        self.tried = []
        # Search sites whose scraping grammars currently work
        self.valid = ('findfiles',)
        # Last set of mirrors found
        self.cache = []

    def make_urls(self, grammar, data, filename):
        """ Scan *data* with the pyparsing *grammar* and return a list
        of unique HTTP/HTTPS mirror URLs pointing at *filename* """

        urls = []
        # BUGFIX: escape regex metacharacters in the filename (e.g. the
        # '.' in 'ant.zip') before embedding it in the pattern.
        rc = re.compile(self.filename_re % re.escape(filename))

        for match in grammar.scanString(data):
            # scanString yields (tokens, start, end) tuples
            if not match: continue
            if len(match) != 3: continue
            if len(match[0])==0: continue
            if len(match[0][-1])==0: continue

            url = self.quotes_re.sub('', match[0][-1])
            # Currently we cannot support FTP mirror URLs
            if not (url.startswith('http://') or url.startswith('https://')):
                continue

            if url.endswith(filename):
                candidate = url
            elif rc.search(url):
                # Prune any characters after the filename
                idx = url.find(filename)
                candidate = url[:idx + len(filename)] if idx != -1 else None
            else:
                candidate = None

            # BUGFIX: deduplicate on the final (pruned) URL rather
            # than the raw scraped one, so pruned variants of the same
            # mirror URL are not appended twice.
            if candidate and candidate not in urls:
                urls.append(candidate)

        return urls

    def search_filewatcher(self, filename):
        """ Search www.filewatcher.com for mirrors of *filename* """

        # Note: this grammar could change if the site changes its templates
        grammar = Literal("<p>") + Literal("<big>") + Literal("<a") + \
                  Literal("href") + Literal("=") + SkipTo(Literal(">"))

        search_url = self.sites['filewatcher'][0] % filename
        conn = connector.HarvestManUrlConnector()
        data = conn.get_url_data(search_url)

        return self.make_urls(grammar, data, filename)

    def search_findfiles(self, filename):
        """ Search www.findfiles.com for mirrors of *filename* """

        print('Searching http://www.findfiles.com for mirrors of file %s...' % filename)
        # Note: this grammar could change if the site changes its templates
        content1 = Literal("<h1") + SkipTo(Literal("Advanced Search"))
        content2 = Literal("<a") + Literal("href") + Literal("=") + SkipTo(Literal(">"))

        search_url = self.sites['findfiles'][0] % filename
        conn = connector.HarvestManUrlConnector()
        data = conn.get_url_data(search_url)

        matches = []
        for match in content1.scanString(data):
            matches.append(match)

        # There will be only one match
        if matches:
            data = matches[0][0][-1]
            # Narrow down to the first <table>...</table> element
            idx1 = data.find('<table')
            if idx1 != -1:
                idx2 = data.find('</table>', idx1)
                if idx2 != -1:
                    data = data[idx1:idx2+8]
                    return self.make_urls(content2, data, filename)

        return []

    def search_freewareweb(self, filename):
        # TODO
        pass

    def search_filesearching(self, filename):
        # TODO
        pass

    def can_search(self):
        """ Return whether we can search for new mirrors """

        # This queries whether we have used up all the mirror search
        # sites: True until every site in self.valid has been tried.
        self.tried.sort()
        l = list(self.valid)
        l.sort()
        return not (self.tried == l)

    def search(self, urlobj):
        """ Search all untried mirror sites for mirrors of the file
        pointed to by *urlobj*.  Returns a list of HarvestManMirror
        objects, or None if nothing was found """

        filename = urlobj.get_filename()
        print('Searching mirrors for %s...' % filename)

        # Searching in other mirror search sites returns mostly
        # FTP urls. We can currently do mirror downloads only
        # for HTTP URLs.
        for item in self.valid:
            if item not in self.tried:
                func = getattr(self, 'search_' + item)
                self.tried.append(item)
                mirror_urls = func(filename)
                if mirror_urls:
                    mirrors = [HarvestManMirror(url, True) for url in mirror_urls]
                    self.cache = mirrors
                    return mirrors

        # No untried site produced any mirrors
        return None
class HarvestManMirrorManager(Singleton):
""" Mirror manager class for HarvestMan/Hget """
# Sourceforge mirror information in the form
# of (servername, Place, Country) tuples.
sf_mirror_info = (('easynews', 'Arizona, USA'),
('internap', 'CA, USA'),
('superb-east','Virginia, USA'),
('superb-west','Washington, USA'),
('ufpr', 'Curitiba, Brazil'),
('belnet', 'Brussels, Belgium'),
('switch', 'Laussane, Switzerland'),
('mesh', 'Deusseldorf, Germany'),
('ovh', 'Paris, France'),
('dfn', 'Berlin, Germany'),
('heanet', 'Dublin, Ireland'),
('garr', 'Bologna, Italy'),
    )
    # NOTE(review): the original file is truncated at this point in
    # this copy (trailing site chrome removed).  The remaining
    # sf_mirror_info entries and the HarvestManMirrorManager methods
    # are missing and must be restored from the upstream source.