📄 urlparser.py
字号:
self.port=80
self.url = self.protocol + self.url
else:
# Bug: urls beginning with a double '/' ('//') should not be
# treated as relative paths. Instead prepend the protocol string
# to it.
if self.url.find('//') == 0:
protocolindex = 0
self.url = self.protocol + self.url[2:]
if not self.protocol:
self.protocol = 'http://'
self.defaultproto = True
if protocolindex==-1:
# Raise Exception here, if base url is not present
if self._baseUrlObj is None:
raise HarvestManUrlParserError, 'Base url should not be empty for relative urls'
paths = ''
self.is_rel = True
# Is relative to server ?
if self.url[0] == '/': self.is_relto_server = True
relative_file=0
if self.url.find('/') == -1: relative_file = 1
relpaths = self.url.split('/')
if self.url.find('/') == -1:
locally_relative = 1
rindex = 0
for ritem in relpaths:
if (ritem.count('.') == len(ritem)) or ritem == '' :
rindex = rindex+1
if (ritem.count('.') == len(ritem)) and ritem != '':
self.rpath.insert(len(self.rpath),ritem)
else:
for ent in relpaths[rindex:]:
paths = paths + ent + '/'
# Trim off that extra
paths = paths[:len(paths)-1]
# Again Trim if the relative path ends with /
# like href = /img/abc.gif/
if paths[len(paths)-1] == '/':
paths = paths[:len(paths)-1]
break
else:
paths = self.url[(protocolindex + len(self.protocol)):]
# Compute dirpath , if it is Absolute , relative path will be the final dir path
# to fetch , if else it will be relative path evaluation + dirpath
if self._iscgi: paths += '/'
self.dirpath = paths.split('/')
self.lastpath = self.dirpath[-1]
if protocolindex != -1:
if len(self.dirpath ) > 1:
if self.lastpath:
dotindex = self.lastpath.find('.')
if dotindex != -1: self.has_extension = True
if (dotindex == -1) or \
((dotindex >0) and (dotindex < (len(self.lastpath)-1))):
self.filename_url = True
self.validfilename = self.__make_valid_filename(self.lastpath)
self.filename = self.lastpath
self.dirpath = self.dirpath [: len(self.dirpath )-1]
else:
self.dirpath = self.dirpath [: len(self.dirpath )-1]
else:
if self.lastpath:
dotindex = self.lastpath.find('.')
if dotindex != -1: self.has_extension = True
if (dotindex == -1) or \
((dotindex >0) and (dotindex < (len(self.lastpath)-1))):
self.filename_url = True
self.validfilename = self.__make_valid_filename(self.lastpath)
self.filename = self.lastpath
self.dirpath = self.dirpath [: len(self.dirpath )-1]
# Interprets relative path
# ../../. Nonsense relative paths are graciously ignored,just basic implementation
# might be modified later
if protocolindex == -1:
#print 'URL -> ', self.url
#print 'MY DIRPATH ->', self.dirpath
#print 'BASE DIRPATH ->', self._baseUrlObj().dirpath, self._baseUrlObj()
# Bug fixed on abs relpath / (Test Case #1)
self.rpath.reverse()
if len(self.rpath) == 0 :
if not rindex:
self.dirpath = self._baseUrlObj().dirpath + self.dirpath
else:
self.dirpath = self.dirpath
else:
pathstack = copy.deepcopy(self._baseUrlObj().dirpath)
for ritem in self.rpath:
if ritem == '.':
pathstack = copy.deepcopy(self._baseUrlObj().dirpath)
if ritem == '..':
if len(pathstack) !=0:
pathstack.pop()
self.dirpath = pathstack + self.dirpath
# Support for NONSENSE relative paths such as
# g/../foo and g/./foo
# consider base = http:\\bar.com\bar1
# then g/../foo => http:\\bar.com\bar1\..\foo => http:\\bar.com\foo
# g/./foo is utter nonsense and we feel free to ignore that
# More nonsense will be directly delegated to webserver , which is free to respond with 404
# and We are handling that exception 404 ;-)
if protocolindex == -1:
index = 0
for item in self.dirpath:
if item.count('.') == len(item):
if item == '.':
self.dirpath.remove('.')
if item == '..':
self.dirpath.remove('..')
self.dirpath.remove(self.dirpath[index - 1])
index = index + 1
# Resolving the domain
# if relative path parent domain :-)
if protocolindex == -1:
self.domain = self._baseUrlObj().domain
else:
# find out if the domain contains a port number
# for example, heino2e:8080
self.domain=self.dirpath[0]
self.dirpath = self.dirpath[1:]
dom=self.domain
index = dom.find(':')
if index != -1:
self.domain = dom[:index]
# A bug here => needs to be fixed
try:
self.port = int(dom[index+1:])
except:
pass
# Now check if the base domain had a port specification (other than 80)
# Then we need to use that port for all its children, otherwise
# we can use default value.
if self._baseUrlObj:
if self._baseUrlObj().port != 80:
self.port = self._baseUrlObj().port
# bug-fix: protocol also need to be
# derived from base. Fix for bug # B1077613467.85.
if self.defaultproto:
self.protocol = self._baseUrlObj().protocol
def violates_rules(self):
    """ Tell whether this url violates the basic download rules.

    The answer is computed once by the 'ruleschecker' object and
    cached on the instance; later calls return the cached value. """

    # Fast path: the check was already performed for this url
    if self.__rulescheckdone:
        return self.__violatesrules

    checker = GetObject('ruleschecker')
    self.__violatesrules = checker.violates_basic_rules(self)
    # Mark the result as cached only after the check has completed
    self.__rulescheckdone = True
    return self.__violatesrules
def get_type(self):
    """ Give back the type string stored for this url
    (the value last stored in self._urltyp) """

    urltype = self._urltyp
    return urltype
def set_type(self, type):
    """ Store the given value as the type of this url; it is what
    get_type() returns afterwards """

    # The parameter name shadows the builtin 'type', but it is part of
    # the public signature and is therefore kept as is.
    setattr(self, '_urltyp', type)
def get_base_urlobject(self):
    """ Return the base url object of this url, or None if this url
    has no base url.

    self._baseUrlObj is a callable that yields the base object when
    invoked (presumably a weak reference -- confirm where it is set). """

    # The url resolution code tests self._baseUrlObj against None, so a
    # url created without a base stores None here. Calling None would
    # raise a TypeError; report "no base" instead.
    if self._baseUrlObj is None:
        return None
    return self._baseUrlObj()
def is_cgi(self):
    """ Report the cgi flag of this url, i.e. whether it was marked
    as a cgi (dynamic/form) link """

    cgi_flag = self._iscgi
    return cgi_flag
def is_relative_path(self):
    """ Report whether the url this object was created from was a
    relative one (the is_rel flag) """

    relative = self.is_rel
    return relative
def is_relative_to_server(self):
    """ Report whether the url this object was created from was
    relative to the server (the is_relto_server flag) """

    server_relative = self.is_relto_server
    return server_relative
def get_url_directory(self):
    """ Return the directory path of the url (url minus its filename,
    if any).

    The result is protocol + domain followed by the components of
    self.dirpath separated by '/'. No trailing '/' is added. """

    rval = self.protocol + self.domain
    if self.dirpath:
        # str.join gives the same string as the former
        # reduce(lambda x, y: x + '/' + y, ...) and does not rely on
        # reduce being a builtin, which it is not on Python 3.
        rval += '/' + '/'.join(self.dirpath)
    return rval
def get_url(self):
    """ Give back the url string held in self.url for this object """

    stored_url = self.url
    return stored_url
def get_full_url(self, intranet=0):
    """ Return the full url path of this url object after
    resolving relative paths, filenames etc.

    The url is assembled from get_full_domain_with_port(intranet),
    the '/'-separated components of self.dirpath and, when
    self.filename_url is set, self.filename. The assembled string is
    passed through __make_valid_url before being returned.

    intranet -- forwarded unchanged to get_full_domain_with_port() """

    rval = self.get_full_domain_with_port(intranet)
    if self.dirpath:
        # str.join replaces the hand-rolled reduce() concatenation;
        # reduce is not a builtin on Python 3.
        rval = "".join((rval, '/', '/'.join(self.dirpath)))

    # The filename (if any) is appended right after a single '/'.
    # endswith() also copes with an empty prefix, where indexing
    # rval[-1] would raise an IndexError.
    if not rval.endswith('/'):
        rval = "".join((rval, '/'))

    if self.filename_url:
        rval = "".join((rval, self.filename))

    return self.__make_valid_url(rval)
def get_full_url_sans_port(self):
    """ Return absolute url without the port number.

    The url is assembled from get_full_domain(), the '/'-separated
    components of self.dirpath and, when self.filename_url is set,
    self.filename. The assembled string is passed through
    __make_valid_url before being returned. """

    rval = self.get_full_domain()
    if self.dirpath:
        # str.join replaces the hand-rolled reduce() concatenation;
        # reduce is not a builtin on Python 3.
        rval = "".join((rval, '/', '/'.join(self.dirpath)))

    # The filename (if any) is appended right after a single '/'.
    # endswith() also copes with an empty prefix, where indexing
    # rval[-1] would raise an IndexError.
    if not rval.endswith('/'):
        rval = "".join((rval, '/'))

    if self.filename_url:
        rval = "".join((rval, self.filename))

    return self.__make_valid_url(rval)
def get_port_number(self):
    """ Give back the port number stored for this url (self.port) """

    # For http urls this is 80
    port_number = self.port
    return port_number
def get_relative_url(self):
    """ Return relative path of url w.r.t the domain.

    The path is '/' followed by the '/'-separated components of
    self.dirpath (nothing if dirpath is empty), plus '/' and
    self.filename when self.filename_url is set. The assembled string
    is passed through __make_valid_url before being returned. """

    newpath = ""
    if self.dirpath:
        # str.join replaces the hand-rolled reduce() concatenation;
        # reduce is not a builtin on Python 3.
        newpath = "".join(("/", "/".join(self.dirpath)))

    if self.filename_url:
        newpath = "".join((newpath, '/', self.filename))

    return self.__make_valid_url(newpath)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -