urlparser.py
来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页
PY
934 行
# -- coding: utf-8
"""urlparser.py - Module containing class HarvestManUrl,
representing a URL and its relation to disk files in
HarvestMan.

Creation Date: Nov 2 2004
Author: Anand B Pillai <abpillai at gmail dot com>

  Jan 01 2006 jkleven  Change is_webpage to return 'true'
                       if the URL looks like a form query.
  Jan 10 2006 Anand    Converted from dos to unix format
                       (removed Ctrl-Ms).
  Oct 1 2006  Anand    Fixes for EIAO ticket #193 - added
                       reduce_url method to take care of .. chars
                       inside URLs.
  Feb 25 2007 Anand    Added .ars as a web-page extension to support
                       the popular ars-technica website.
  Mar 12 2007 Anand    Added more fields for multipart. Fixed a bug in
                       is_webpage - anchor links should be returned
                       as web-page links.
  Apr 12 2007 Anand    Fixed a bug in anchor link parsing. The current
                       logic was not taking care of multiple anchor
                       links (#anchor1#anchor2). Fixed it by using
                       a regular expression.
                       Test page is
                       http://nltk.sourceforge.net/lite/doc/api/term-index.html
  Mar 05 2008 Anand    Many changes integrated. Method to get canonical
                       form of URL added. Generating index as hash of
                       canonical URL now. Added queue macros.
  Apr 24 2008 Anand    Fix for #829.

Copyright (C) 2004 Anand B Pillai.

"""

__version__ = '2.0 b1'
__author__ = 'Anand B Pillai'

import os, sys
import re
import mimetypes
import copy
import urlproc
import md5
import itertools
import random

from types import StringTypes

from harvestman.lib import document
from harvestman.lib.common.common import *
from harvestman.lib.common.netinfo import *
from harvestman.lib.urltypes import *

# URL queueing status macros
URL_NOT_QUEUED = 0      # Fresh URL, not queued yet
URL_QUEUED = 1          # Fresh URL sent to queue, but not yet in queue
URL_IN_QUEUE = 2        # URL is in queue
URL_IN_DOWNLOAD = 3     # URL is out of queue and in download
URL_DONE_DOWNLOAD = 4   # URL has completed download, though this may not mean
                        # that the download was successful.


class HarvestManUrlError(Exception):
    """ Error class for HarvestManUrl """

    def __init__(self, value):
        # The payload is kept as-is; it is only stringified on display.
        self.value = value

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return str(self.value)


class HarvestManUrl(object):
    """ A class representing a URL in HarvestMan """

    TEST = False
    # Class-level mapping shared by all instances. __init__ resets the
    # index of any URL whose hash is a key in this mapping to zero.
    hashes = {}

    def __init__(self, url, urltype = URL_TYPE_ANY, cgi = False, baseurl = None, rootdir = ''):
        """ Initialize all attributes and resolve the URL.

        url      - The URL string. Trailing whitespace is stripped.
        urltype  - Stored as self.typ. The value 'anchor' triggers
                   anchor processing in anchorcheck().
        cgi      - Stored as self.cgi.
        baseurl  - Either a HarvestManUrl instance or a string. A string
                   is wrapped in a new HarvestManUrl object.
        rootdir  - Root directory. If empty, the root directory of the
                   base URL is used when available, otherwise the current
                   working directory.

        Raises HarvestManUrlError (from anchorcheck) if urltype is
        'anchor' and no base URL is available.
        """

        # Remove trailing wspace chars.
        url = url.rstrip()

        # A byte string holding non-ASCII data raises UnicodeDecodeError
        # here on Python 2; fall back to ISO-8859-1 in that case. That
        # decode cannot fail (every byte maps to a code point), so no
        # further fallback handlers are needed.
        try:
            url.encode("utf-8")
        except UnicodeDecodeError:
            url = url.decode("iso-8859-1")

        # For saving original url
        # since self.url can get
        # modified
        self.origurl = url

        self.url = urlproc.modify_url(url)
        self.typ = urltype
        self.cgi = cgi
        self.anchor = ''
        self.index = 0
        self.filename = 'index.html'
        self.validfilename = 'index.html'
        self.lastpath = ''
        self.protocol = ''
        self.defproto = False
        # If the url is a file like url
        # this value will be true, if it is
        # a directory like url, this value will
        # be false.
        self.filelike = False
        # download status, a number indicating
        # whether this url was downloaded successfully
        # or not. 0 indicates a successful download, and
        # any number >0 indicates a failed download
        self.status = -1
        # Url scheduled status, a number indicating
        # how the URL is queued for download.
        # It has the following values
        # URL_NOT_QUEUED
        # URL_QUEUED
        # URL_IN_DOWNLOAD
        # URL_DONE_DOWNLOAD
        # The fact that the URL has URL_DONE_DOWNLOAD
        # need not mean that the download was successful!
        self.qstatus = URL_NOT_QUEUED
        # Fatal status
        self.fatal = False
        # is starting url?
        self.starturl = False
        # Flag for files having extension
        self.hasextn = False
        # Relative path flags
        self.isrel = False
        # Relative to server?
        self.isrels = False
        self.port = 80
        self.domain = ''
        self.rpath = []
        # Recursion depth
        self.rdepth = 0
        # Url headers
        self.contentdict = {}
        # Url generation
        self.generation = 0
        # Url priority
        self.priority = 0
        # rules violation cache flags
        self.violatesrules = False
        self.rulescheckdone = False
        # Bytes range - used for HTTP/1.1
        # multipart downloads. This has to
        # be set to an xrange object
        self.range = None
        # Flag to try multipart
        self.trymultipart = False
        # Multipart index
        self.mindex = 0
        # Original url for mirrored URLs
        self.mirror_url = None
        # Flag set for URLs which are mirrored from
        # a different server than the original URL
        self.mirrored = False
        # Content-length for multi-part
        # This is the content length of the original
        # content.
        self.clength = 0
        self.dirpath = []
        # Re-computation flag
        self.reresolved = False
        # URL redirected flag
        self.redirected = False
        # Flag indicating we are using an old URL
        # which was redirected, again for producing
        # further redirections. This is used in Hget
        # for automatic split-mirror downloading
        # for URLs that auto-forward to mirrors.
        self.redirected_old = False
        self.baseurl = None
        # Hash of page data
        self.pagehash = ''
        # Flag to decide whether to recalculate get_full_url(...)
        # if flag is False, recalculate...
        self.urlflag = False
        # Cached full URL string
        self.absurl = ''

        # Base Url Dictionary
        if baseurl:
            if isinstance(baseurl, HarvestManUrl):
                self.baseurl = baseurl
            elif isinstance(baseurl, StringTypes):
                self.baseurl = HarvestManUrl(baseurl, 'generic', cgi, None, rootdir)

        # Root directory
        if rootdir == '':
            if self.baseurl and self.baseurl.rootdir:
                self.rootdir = self.baseurl.rootdir
            else:
                self.rootdir = os.getcwd()
        else:
            self.rootdir = rootdir

        self.anchorcheck()
        # resolveurl() and get_canonical_url() are defined outside the
        # visible part of this file.
        self.resolveurl()

        # For starting URL, the index is 0, for the rest
        # it is as hash of the canonical URL string...
        self.index = hash(self.get_canonical_url())
        # If this is a URL similar to start URL,
        # reset its index to zero. The trick is
        # to store only the hash of the start URL
        # as key in the attribute 'hashes'.
        if self.index in self.hashes:
            self.index = 0

        # Copy of myself, this will be saved if
        # a re-resolving is requested so that old
        # parameters can be requested if needed
        self.orig_state = None

    def reset(self):
        """ Reset all the key attributes.

        A shallow copy of the current object is first archived in
        self.orig_state, then the URL is re-processed with
        urlproc.modify_url and the resolution-related attributes are
        set back to their initial values. """

        # Archive previous state
        self.orig_state = copy.copy(self)

        self.url = urlproc.modify_url(self.url)
        self.lastpath = ''
        self.protocol = ''
        self.defproto = False
        self.hasextn = False
        self.isrel = False
        self.isrels = False
        self.port = 80
        self.domain = ''
        self.rpath = []
        # Recursion depth
        self.rdepth = 0
        self.dirpath = []
        self.filename = 'index.html'
        self.validfilename = 'index.html'
        # Set urlflag to False
        self.urlflag = False
        self.absurl = ''

    def __str__(self):
        # The cached full URL string; empty until it has been computed.
        return self.absurl

    def wrapper_resolveurl(self):
        """ Called forcefully to re-resolve a URL, typically after
        a re-direction or change in URL has been detected """

        self.reset()
        self.anchorcheck()
        self.resolveurl()
        self.reresolved = True

    def anchorcheck(self):
        """ Checking for anchor tags and processing accordingly.

        Only acts on URLs of type 'anchor'. Raises HarvestManUrlError
        if such a URL has no base URL. """

        # NOTE(review): the nesting below was reconstructed from a source
        # whose indentation had been lost - confirm against the original.
        if self.typ == 'anchor':
            if not self.baseurl:
                raise HarvestManUrlError('Base url should not be empty for anchor type url')

            if '#' in self.url:
                # Split with re ('anchore' is defined outside the
                # visible part of this file).
                items = anchore.split(self.url)
                # First item is the original url
                if items:
                    if items[0]:
                        self.url = items[0]
                    else:
                        # Nothing before the first '#': the anchor
                        # refers to the base URL itself.
                        self.url = self.baseurl.get_full_url()
                    # Rest forms the anchor tag
                    self.anchor = '#' + '#'.join(items[1:])

    def resolve_protocol(self):
        """ Resolve the protocol of the url """

        # NOTE(review): this method is only partially visible; the
        # statements below are reproduced unchanged. The 'else' is
        # assumed to belong to the 'for' loop - confirm against the
        # original file.
        url2 = self.url.lower()
        for proto in protocol_map.keys():
            if url2.find(proto) != -1:
                self.protocol = proto
                self.port = protocol_map.get(proto)
                return True
        else:
            # Fix: Use regex for detecting WWW urls.
            # Check for WWW urls. These can begin
            # with a 'www.' or 'www' followed by
            # a single number (www1, www3 etc).
            if www_re.match(url2):
                self.protocol = 'http://'
                self.url = "".join((self.protocol, self.url))
                return True

            # We accept FTP urls beginning with just
            # ftp.<server>, and consider it as FTP over HTTP
            if url2.startswith('ftp.'):
                # NOTE(review): the visible source is cut off right after
                # this condition (page 1 of 3). The real body of this
                # branch and the rest of the method are not shown here;
                # 'pass' is only a placeholder so that the fragment parses.
                pass
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?