urlparser.py
来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页
PY
934 行
# FTP over HTTP self.protocol = 'http://' self.url = ''.join((self.protocol, self.url)) return True # Urls relative to server might # begin with a //. Then prefix the protocol # string to them. if self.url.find('//') == 0: # Pick protocol from base url if self.baseurl and self.baseurl.protocol: self.protocol = self.baseurl.protocol else: self.protocol = "http://" self.url = "".join((self.protocol, self.url[2:])) return True # None of these # Protocol not resolved, so check # base url first, if not found, set # default protocol... if self.baseurl and self.baseurl.protocol: self.protocol = self.baseurl.protocol else: self.protocol = 'http://' self.defproto = True return False def resolveurl(self): """ Resolves the url finding out protocol, port, domain etc . Also resolves relative paths and builds a local file name for the url based on the root directory path """ if len(self.url)==0: raise HarvestManUrlError, 'Error: Zero Length Url' proto = self.resolve_protocol() paths = '' if not proto: # Could not resolve protocol, must be a relative url if not self.baseurl: raise HarvestManUrlError, 'Base url should not be empty for relative urls' # Set url-relative flag self.isrel = True # Is relative to server? if self.url[0] == '/': self.isrels = True # Split paths relpaths = self.url.split(URLSEP) try: idx = relpaths.index(DOTDOT) except ValueError: idx = -1 # Only reduce if the URL itself does not start with # .. - if it does our rpath algorithm takes # care of it. # Mod: This is commented out now, since it looks # like there is no harm in allowing to reduce, even # if the path starts with '..' #if idx > 0: relpaths = self.reduce_url(relpaths) # Build relative path by checking for "." and ".." strings self.rindex = 0 for ritem in relpaths: # If path item is ., .. or empty, increment # relpath index. if ritem in (DOT, DOTDOT, ""): self.rindex += 1 # If path item is not empty, insert # to relpaths list. if ritem: self.rpath.append(ritem) else: # Otherwise, add the rest to paths # with the separator for entry in relpaths[self.rindex:]: paths = "".join((paths, entry, URLSEP)) # Remove the last entry paths = paths[:-1] # Again Trim if the relative path ends with / # like href = /img/abc.gif/ #if paths[-1] == '/': # paths = paths[:-1] break else: # Absolute path, so 'paths' is the part of it # minus the protocol part. paths = self.url.replace(self.protocol, '') if paths=='': # Error: URL consists only of protocol raise HarvestManUrlError, 'Error: Invalid URL containing only protocol' # Split URL items = paths.split(URLSEP) # If there are nonsense .. and . chars in the paths, remove # them to construct a sane path. #try: # idx = items.index(DOTDOT) #except ValueError: # idx = -1 flag = (DOT in items) or (DOTDOT in items) if flag: # Bugfix: Do not allow a URL like http://www.foo.com/../bar # to become http://bar. Basically if the index of .. is # 1, then remove the '..' entirely. This bug was encountered # in EIAO testing of http://www.fylkesmannen.no/ for the URL # http://www.fylkesmannen.no/osloogakershu items = self.reduce_url(items) # Re-construct URL paths = URLSEP.join(items) # Now compute local directory/file paths self.compute_dirpaths(paths) if not self.protocol.startswith('file:'): self.compute_domain_and_port() # For some file extensions, automatically set as directory URL. if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() if extn in default_directory_extns: self.set_directory_url() # print self.dirpath, self.domain def reduce_url(self, paths): """ Remove nonsense .. and . chars from URL paths """ for x in range(len(paths)): path = paths[x] try: nextpath = paths[x+1] if nextpath in (DOT, DOTDOT): # Check if a ? occurs anywhere earlier in path. # If a ? occurs in the path, don't reduce # any paths coming after it. try: qindex = paths.index('?') if qindex < x+1: continue except ValueError: pass if nextpath == DOTDOT: paths.pop(x+1) # Do not allow to remove the domain for # stupid URLs like 'http://www.foo.com/../bar' or # 'http://www.foo.com/camp/../../bar'. If allowed # they become nonsense URLs like http://bar. # This bug was encountered in EIAO testing of # http://www.fylkesmannen.no/ for the URL # http://www.fylkesmannen.no/osloogakershu if self.isrel or x>0: paths.remove(path) return self.reduce_url(paths) elif nextpath==DOT: paths.pop(x+1) return self.reduce_url(paths) except IndexError: return paths def compute_file_and_dir_paths(self): """ Compute file and directory paths """ if self.lastpath: dotindex = self.lastpath.find(DOT) if dotindex != -1: self.hasextn = True # If there is no extension or if there is # an extension which is occuring in the middle # of last path... if (dotindex == -1) or \ ((dotindex >0) and (dotindex < (len(self.lastpath)-1))): self.filelike = True # Bug fix - Strip leading spaces & newlines self.validfilename = self.make_valid_filename(self.lastpath.strip()) self.filename = self.lastpath.strip() self.dirpath = self.dirpath [:-1] else: if not self.isrel: self.dirpath = self.dirpath [:-1] # Remove leading spaces & newlines from dirpath dirpath2 = [] for item in self.dirpath: dirpath2.append(item.strip()) # Copy self.dirpath = dirpath2[:] def compute_dirpaths(self, path): """ Computer local file & directory paths for the url """ self.dirpath = path.split(URLSEP) self.lastpath = self.dirpath[-1] # print self.dirpath, self.lastpath if self.isrel: # Construct file/dir names - This is valid only if the path # has more than one component - like www.python.org/doc . # Otherwise, the url is a plain domain # path like www.python.org . self.compute_file_and_dir_paths() # print 'Rpath=>',self.rpath # Interprets relative path # ../../. Nonsense relative paths are graciously ignored, self.rpath.reverse() # print 'Base url dirpath=>',self.baseurl.dirpath # print 'Rindex=>',self.rindex # This simple logic is fine for most paths except # when a base URL has a "?" as part of its dirpath. # Example: http://razor.occams.info/code/repo/?/govtrack/sec . # In that case, any pieces of the base URL after the # ? is to be omitted. if '?' in self.baseurl.dirpath: # Trim base url to the part before ? qindex = self.baseurl.dirpath.index('?') self.baseurl.dirpath = self.baseurl.dirpath[:qindex] if len(self.rpath) == 0 : if not self.rindex: self.dirpath = self.baseurl.dirpath + self.dirpath else: pathstack = self.baseurl.dirpath[0:] for ritem in self.rpath: if ritem == DOT: pathstack = self.baseurl.dirpath[0:] elif ritem == DOTDOT: if len(pathstack) !=0: pathstack.pop() self.dirpath = pathstack + self.dirpath # print 'Dirpath2=>',self.dirpath #if self.noreduce: # return # Support for NONSENSE relative paths such as # g/../foo and g/./foo # consider base = http:\\bar.com\bar1 # then g/../foo => http:\\bar.com\bar1\..\foo => http:\\bar.com\foo # g/./foo is utter nonsense and we feel free to ignore that. #index = 0 #for item in self.dirpath: # if item in (DOT, DOTDOT): # self.dirpath.remove(item) # if item == DOTDOT: # self.dirpath.remove(self.dirpath[index - 1]) # index += 1 else: if len(self.dirpath) > 1: self.compute_file_and_dir_paths() def compute_domain_and_port(self): """ Computes url domain and port & re-computes if necessary """ # Resolving the domain... # Domain is parent domain, if # url is relative :-) if self.isrel: self.domain = self.baseurl.domain else: # If not relative, then domain # if the first item of dirpath. self.domain = self.dirpath[0] self.dirpath = self.dirpath[1:] # Find out if the domain contains a port number # for example, server:8080 dom = self.domain index = dom.find(PORTSEP) if index != -1: self.domain = dom[:index] # A bug here => needs to be fixed try: self.port = int(dom[index+1:]) except: pass # Now check if the base domain had a port specification (other than 80) # Then we need to use that port for all its children, otherwise # we can use default value. if self.isrel and \ self.baseurl and \
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?