urlparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页

PY
934
字号
                # FTP over HTTP                self.protocol = 'http://'                self.url = ''.join((self.protocol, self.url))                return True                        # Urls relative to server might            # begin with a //. Then prefix the protocol            # string to them.            if self.url.find('//') == 0:                # Pick protocol from base url                if self.baseurl and self.baseurl.protocol:                    self.protocol = self.baseurl.protocol                else:                    self.protocol = "http://"                   self.url = "".join((self.protocol, self.url[2:]))                return True            # None of these            # Protocol not resolved, so check            # base url first, if not found, set            # default protocol...            if self.baseurl and self.baseurl.protocol:                self.protocol = self.baseurl.protocol            else:                self.protocol = 'http://'            self.defproto = True                    return False            def resolveurl(self):        """ Resolves the url finding out protocol, port, domain etc        . Also resolves relative paths and builds a local file name        for the url based on the root directory path """        if len(self.url)==0:            raise HarvestManUrlError, 'Error: Zero Length Url'        proto = self.resolve_protocol()        paths = ''                if not proto:            # Could not resolve protocol, must be a relative url            if not self.baseurl:                raise HarvestManUrlError, 'Base url should not be empty for relative urls'            # Set url-relative flag            self.isrel = True            # Is relative to server?            if self.url[0] == '/':                self.isrels = True                        # Split paths            relpaths = self.url.split(URLSEP)            try:                idx = relpaths.index(DOTDOT)            except ValueError:                idx = -1            # Only reduce if the URL itself does not start with            # .. - if it does our rpath algorithm takes            # care of it.            # Mod: This is commented out now, since it looks            # like there is no harm in allowing to reduce, even            # if the path starts with '..'            #if idx > 0:                        relpaths = self.reduce_url(relpaths)            # Build relative path by checking for "." and ".." strings            self.rindex = 0            for ritem in relpaths:                # If path item is ., .. or empty, increment                # relpath index.                if ritem in (DOT, DOTDOT, ""):                    self.rindex += 1                    # If path item is not empty, insert                    # to relpaths list.                    if ritem:                        self.rpath.append(ritem)                else:                    # Otherwise, add the rest to paths                    # with the separator                    for entry in relpaths[self.rindex:]:                        paths = "".join((paths, entry, URLSEP))                    # Remove the last entry                    paths = paths[:-1]                                        # Again Trim if the relative path ends with /                    # like href = /img/abc.gif/                     #if paths[-1] == '/':                    #    paths = paths[:-1]                    break                    else:            # Absolute path, so 'paths' is the part of it            # minus the protocol part.            paths = self.url.replace(self.protocol, '')                        if paths=='':                # Error: URL consists only of protocol                raise HarvestManUrlError, 'Error: Invalid URL containing only protocol'                                            # Split URL            items = paths.split(URLSEP)                        # If there are nonsense .. and . chars in the paths, remove            # them to construct a sane path.            #try:            #    idx = items.index(DOTDOT)            #except ValueError:            #    idx = -1                        flag = (DOT in items) or (DOTDOT in items)                        if flag:                # Bugfix: Do not allow a URL like http://www.foo.com/../bar                # to become http://bar. Basically if the index of .. is                # 1, then remove the '..' entirely. This bug was encountered                # in EIAO testing of http://www.fylkesmannen.no/ for the URL                # http://www.fylkesmannen.no/osloogakershu                                items = self.reduce_url(items)                # Re-construct URL                paths = URLSEP.join(items)                        # Now compute local directory/file paths        self.compute_dirpaths(paths)        if not self.protocol.startswith('file:'):            self.compute_domain_and_port()        # For some file extensions, automatically set as directory URL.        if self.validfilename:            extn = ((os.path.splitext(self.validfilename))[1]).lower()            if extn in default_directory_extns:                self.set_directory_url()        # print self.dirpath, self.domain            def reduce_url(self, paths):        """ Remove nonsense .. and . chars from URL paths """        for x in range(len(paths)):            path = paths[x]            try:                nextpath = paths[x+1]                if nextpath in (DOT, DOTDOT):                    # Check if a ? occurs anywhere earlier in path.                    # If a ? occurs in the path, don't reduce                    # any paths coming after it.                    try:                        qindex = paths.index('?')                        if qindex < x+1:                            continue                    except ValueError:                        pass                                    if nextpath == DOTDOT:                    paths.pop(x+1)                    # Do not allow to remove the domain for                    # stupid URLs like 'http://www.foo.com/../bar' or                    # 'http://www.foo.com/camp/../../bar'. If allowed                    # they become nonsense URLs like http://bar.                    # This bug was encountered in EIAO testing of                    # http://www.fylkesmannen.no/ for the URL                    # http://www.fylkesmannen.no/osloogakershu                                        if self.isrel or x>0:                        paths.remove(path)                    return self.reduce_url(paths)                elif nextpath==DOT:                    paths.pop(x+1)                    return self.reduce_url(paths)                                except IndexError:                return paths                    def compute_file_and_dir_paths(self):        """ Compute file and directory paths """        if self.lastpath:            dotindex = self.lastpath.find(DOT)            if dotindex != -1:                self.hasextn = True            # If there is no extension or if there is            # an extension which is occuring in the middle            # of last path...            if (dotindex == -1) or \                ((dotindex >0) and (dotindex < (len(self.lastpath)-1))):                self.filelike = True                # Bug fix - Strip leading spaces & newlines                self.validfilename =  self.make_valid_filename(self.lastpath.strip())                self.filename = self.lastpath.strip()                self.dirpath  = self.dirpath [:-1]        else:            if not self.isrel:                self.dirpath  = self.dirpath [:-1]        # Remove leading spaces & newlines from dirpath        dirpath2 = []        for item in self.dirpath:            dirpath2.append(item.strip())        # Copy        self.dirpath = dirpath2[:]                def compute_dirpaths(self, path):        """ Computer local file & directory paths for the url """        self.dirpath = path.split(URLSEP)        self.lastpath = self.dirpath[-1]        # print self.dirpath, self.lastpath                if self.isrel:            # Construct file/dir names - This is valid only if the path            # has more than one component - like www.python.org/doc .            # Otherwise, the url is a plain domain            # path like www.python.org .            self.compute_file_and_dir_paths()            # print 'Rpath=>',self.rpath                        # Interprets relative path            # ../../. Nonsense relative paths are graciously ignored,            self.rpath.reverse()            # print 'Base url dirpath=>',self.baseurl.dirpath            # print 'Rindex=>',self.rindex            # This simple logic is fine for most paths except            # when a base URL has a "?" as part of its dirpath.            # Example: http://razor.occams.info/code/repo/?/govtrack/sec .            # In that case, any pieces of the base URL after the            # ? is to be omitted.            if '?' in self.baseurl.dirpath:                # Trim base url to the part before ?                qindex = self.baseurl.dirpath.index('?')                self.baseurl.dirpath = self.baseurl.dirpath[:qindex]                        if len(self.rpath) == 0 :                if not self.rindex:                    self.dirpath = self.baseurl.dirpath + self.dirpath            else:                pathstack = self.baseurl.dirpath[0:]                for ritem in self.rpath:                    if ritem == DOT:                        pathstack = self.baseurl.dirpath[0:]                    elif ritem == DOTDOT:                        if len(pathstack) !=0:                                pathstack.pop()                            self.dirpath  = pathstack + self.dirpath             # print 'Dirpath2=>',self.dirpath            #if self.noreduce:            #    return                        # Support for NONSENSE relative paths such as            # g/../foo and g/./foo             # consider base = http:\\bar.com\bar1            # then g/../foo => http:\\bar.com\bar1\..\foo => http:\\bar.com\foo            # g/./foo  is utter nonsense and we feel free to ignore that.            #index = 0            #for item in self.dirpath:            #    if item in (DOT, DOTDOT):            #        self.dirpath.remove(item)            #    if item == DOTDOT:            #        self.dirpath.remove(self.dirpath[index - 1])            #    index += 1        else:            if len(self.dirpath) > 1:                self.compute_file_and_dir_paths()                def compute_domain_and_port(self):        """ Computes url domain and port &        re-computes if necessary """        # Resolving the domain...                # Domain is parent domain, if        # url is relative :-)        if self.isrel:            self.domain = self.baseurl.domain        else:            # If not relative, then domain            # if the first item of dirpath.            self.domain = self.dirpath[0]            self.dirpath = self.dirpath[1:]        # Find out if the domain contains a port number        # for example, server:8080        dom = self.domain        index = dom.find(PORTSEP)        if index != -1:            self.domain = dom[:index]            # A bug here => needs to be fixed            try:                self.port   = int(dom[index+1:])            except:                pass        # Now check if the base domain had a port specification (other than 80)        # Then we need to use that port for all its children, otherwise        # we can use default value.        if self.isrel and \               self.baseurl and \

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?