urlparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页

PY
934
字号
# -- coding: utf-8"""urlparser.py - Module containing class HarvestManUrl,representing a URL and its relation to disk files inHarvestMan.Creation Date: Nov 2 2004Author: Anand B Pillai <abpillai at gmail dot com>   Jan 01 2006      jkleven  Change is_webpage to return 'true'                             if the URL looks like a form query.   Jan 10 2006      Anand    Converted from dos to unix format (removed Ctrl-Ms).   Oct 1 2006       Anand    Fixes for EIAO ticket #193 - added reduce_url                             method to take care of .. chars inside URLs.   Feb 25 2007      Anand    Added .ars as a web-page extension to support                             the popular ars-technica website.   Mar 12 2007      Anand    Added more fields for multipart. Fixed a bug in                             is_webpage - anchor links should be returned                             as web-page links.   Apr 12 2007      Anand    Fixed a bug in anchor link parsing. The current                             logic was not taking care of multiple anchor                             links (#anchor1#anchor2). Fixed it by using                             a regular expression.                             Test page is                             http://nltk.sourceforge.net/lite/doc/api/term-index.html   Mar 05 2008     Anand    Many changes integrated. Method to get canonical form                             of URL added .Generating index as hash of canonical URL                             now. Added queue macros.   Apr 24 2008     Anand    Fix for #829.Copyright (C) 2004 Anand B Pillai.   """__version__ = '2.0 b1'__author__ = 'Anand B Pillai'import os, sysimport reimport mimetypesimport copyimport urlprocimport md5import itertoolsimport randomfrom types import StringTypesfrom harvestman.lib import documentfrom harvestman.lib.common.common import *from harvestman.lib.common.netinfo import *from harvestman.lib.urltypes import *# URL queueing status macrosURL_NOT_QUEUED=0       # Fresh URL, not queued yetURL_QUEUED=1           # Fresh URL sent to queue, but not yet in queueURL_IN_QUEUE=2         # URL is in queueURL_IN_DOWNLOAD=3      # URL is out of queue and in downloadURL_DONE_DOWNLOAD=4    # URL has completed download, though this may not mean                       # that the download was successful.class HarvestManUrlError(Exception):    """ Error class for HarvestManUrl """        def __init__(self, value):        self.value = value    def __repr__(self):        return self.__str__()        def __str__(self):        return str(self.value)    class HarvestManUrl(object):    """ A class representing a URL in HarvestMan """    TEST = False    hashes = {}        def __init__(self, url, urltype = URL_TYPE_ANY, cgi = False, baseurl  = None, rootdir = ''):        # Remove trailing wspace chars.        url = url.rstrip()        try:            try:                try:                    url.encode("utf-8")                except UnicodeDecodeError:                    url = url.decode("iso-8859-1")            except UnicodeDecodeError, e:                url = url.decode("latin-1")        except UnicodeDecodeError, e:            pass                        # For saving original url        # since self.url can get        # modified        self.origurl = url                self.url = url        self.url = urlproc.modify_url(self.url)                self.typ = urltype        self.cgi = cgi        self.anchor = ''        self.index = 0        self.filename = 'index.html'        self.validfilename = 'index.html'        self.lastpath = ''        self.protocol = ''        self.defproto = False        # If the url is a file like url        # this value will be true, if it is        # a directory like url, this value will        # be false.        self.filelike = False        # download status, a number indicating        # whether this url was downloaded successfully        # or not. 0 indicates a successful download, and        # any number >0 indicates a failed download        self.status = -1        # Url scheduled status, a number indicating        # how the URL is queued for download.        # It has the following values        # URL_NOT_QUEUED        # URL_QUEUED        # URL_IN_DOWNLOAD        # URL_DONE_DOWNLOAD        # The fact that the URL has URL_DONE_DOWNLOAD        # need not mean that the download was successful!        self.qstatus = URL_NOT_QUEUED        # Fatal status        self.fatal = False        # is starting url?        self.starturl = False        # Flag for files having extension        self.hasextn = False        # Relative path flags        self.isrel = False        # Relative to server?        self.isrels = False        self.port = 80        self.domain = ''        self.rpath = []        # Recursion depth        self.rdepth = 0        # Url headers        self.contentdict = {}        # Url generation        self.generation = 0        # Url priority        self.priority = 0        # rules violation cache flags        self.violatesrules = False        self.rulescheckdone = False        # Bytes range - used for HTTP/1.1        # multipart downloads. This has to        # be set to an xrange object         self.range = None        # Flag to try multipart        self.trymultipart = False        # Multipart index        self.mindex = 0        # Original url for mirrored URLs        self.mirror_url = None        # Flag set for URLs which are mirrored from        # a different server than the original URL        self.mirrored = False        # Content-length for multi-part        # This is the content length of the original        # content.        self.clength = 0        self.dirpath = []        # Re-computation flag        self.reresolved = False        # URL redirected flag        self.redirected = False        # Flag indicating we are using an old URL        # which was redirected, again for producing        # further redirections. This is used in Hget        # for automatic split-mirror downloading        # for URLs that auto-forward to mirrors.        self.redirected_old = False        self.baseurl = None        # Hash of page data        self.pagehash = ''        # Flag to decide whether to recalculate get_full_url(...)        # if flag is False, recalculate...        self.urlflag = False        # Cached full URL string        self.absurl = ''        # Base Url Dictionary        if baseurl:            if isinstance(baseurl, HarvestManUrl):                self.baseurl = baseurl            elif type(baseurl) in StringTypes:                self.baseurl = HarvestManUrl(baseurl, 'generic', cgi, None, rootdir)                              # Root directory        if rootdir == '':            if self.baseurl and self.baseurl.rootdir:                self.rootdir = self.baseurl.rootdir            else:                self.rootdir = os.getcwd()        else:            self.rootdir = rootdir                    self.anchorcheck()        self.resolveurl()        # For starting URL, the index is 0, for the rest        # it is as hash of the canonical URL string...        self.index = hash(self.get_canonical_url())        # If this is a URL similar to start URL,        # reset its index to zero. The trick is        # to store only the hash of the start URL        # as key in the attribute 'hashes'.                try:            val = self.hashes[self.index]            self.index = 0        except KeyError:            pass        # Copy of myself, this will be saved if        # a re-resolving is requested so that old        # parameters can be requested if needed        self.orig_state = None            def reset(self):        """ Reset all the key attributes """        # Archive previous state        self.orig_state = copy.copy(self)        self.url = urlproc.modify_url(self.url)        self.lastpath = ''        self.protocol = ''        self.defproto = False        self.hasextn = False        self.isrel = False        self.isrels = False        self.port = 80        self.domain = ''        self.rpath = []        # Recursion depth        self.rdepth = 0        self.dirpath = []        self.filename = 'index.html'        self.validfilename = 'index.html'        # Set urlflag to False        self.urlflag = False        self.absurl = ''    def __str__(self):        return self.absurl        def wrapper_resolveurl(self):        """ Called forcefully to re-resolve a URL, typically after a re-direction        or change in URL has been detected """        self.reset()        self.anchorcheck()        self.resolveurl()        self.reresolved = True            def anchorcheck(self):        """ Checking for anchor tags and processing accordingly """        if self.typ == 'anchor':            if not self.baseurl:                raise HarvestManUrlError, 'Base url should not be empty for anchor type url'            if '#' in self.url:                # Split with re                items = anchore.split(self.url)                # First item is the original url                if len(items):                    if items[0]:                        self.url = items[0]                    else:                        self.url = self.baseurl.get_full_url()                    # Rest forms the anchor tag                    self.anchor = '#' + '#'.join(items[1:])                        def resolve_protocol(self):        """ Resolve the protocol of the url """        url2 = self.url.lower()        for proto in protocol_map.keys():            if url2.find(proto) != -1:                self.protocol = proto                self.port = protocol_map.get(proto)                return True        else:            # Fix: Use regex for detecting WWW urls.            # Check for WWW urls. These can begin            # with a 'www.' or 'www' followed by            # a single number (www1, www3 etc).            if www_re.match(url2):                self.protocol = 'http://'                self.url =  "".join((self.protocol, self.url))                return True            # We accept FTP urls beginning with just            # ftp.<server>, and consider it as FTP over HTTP            if url2.startswith('ftp.'):

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?