# urlparser.py -- url parsing utilities (HarvestMan)
def get_domain(self):
    """Return the server (domain) portion of this url object."""

    return self.domain
def get_full_domain(self):
    """Return the full domain, i.e. protocol followed by domain."""

    return '%s%s' % (self.protocol, self.domain)
def get_full_domain_with_port(self, intranet=0):
    """Return the full domain (protocol + server), appending
    ':<port>' when the port differs from the protocol's default
    or when 'intranet' is true."""

    # Default ports per supported protocol; any other protocol
    # never gets a port suffix (unless intranet is set).
    default_ports = {'http://': 80, 'https://': 443, 'ftp://': 21}
    default = default_ports.get(self.protocol)
    nonstandard = default is not None and int(self.port) != default
    if intranet or nonstandard:
        return '%s:%s' % (self.get_full_domain(), self.port)
    return self.get_full_domain()
def get_domain_with_port(self, intranet=0):
    """ Return the domain (server) with port number
    appended to it, if the port number is not the
    default for the current protocol """
    # Fix: cast port to int before comparing. self.port may be held
    # as a string (e.g. '80'), and '80' != 80 would wrongly append
    # the default port. This also makes the check consistent with
    # get_full_domain_with_port, which already uses int(self.port).
    if intranet or ((self.protocol == 'http://' and int(self.port) != 80) \
       or (self.protocol == 'https://' and int(self.port) != 443) \
       or (self.protocol == 'ftp://' and int(self.port) != 21)):
        return self.domain + ':' + str(self.port)
    else:
        return self.domain
def get_full_filename(self):
    """Return the full on-disk filename for this url: the local
    save directory joined with the local filename."""

    return os.path.join(self.get_local_directory(), self.get_filename())
def get_filename(self):
    """Return the filename of this url on disk (name only, no
    directory path)."""

    # CGI urls and urls without a parsed filename fall back to the
    # default page name.
    needs_default = self._iscgi or not self.filename_url
    if needs_default:
        self.validfilename = 'index.html'
    return self.validfilename
def get_relative_filename(self):
    """ Get the relative filename path of this url object w.r.t
    its base url.

    Returns this url's saved-file path relative to the base url's
    local directory, prefixing '../' for every extra directory
    level in the base path. Falls back to the bare filename when
    there is no base url object. """
    if not self._baseUrlObj:
        return self.get_filename()
    # NOTE(review): _baseUrlObj appears to be a weak reference --
    # it is called to obtain the object.
    basedir = self._baseUrlObj().get_local_directory()
    filename = self.get_full_filename()
    # bug: commonprefix returns invalid path names
    # we need to convert it to a valid pathname
    # this can be done by taking the directory name
    # of the prefix.
    prefix = os.path.commonprefix([basedir, filename])
    if not os.path.exists(prefix):
        prefix = os.path.dirname(prefix)
    if prefix:
        # Fix: strip the leading prefix by slicing rather than
        # str.replace -- replace() would also remove any *later*
        # occurrence of the prefix inside the path. commonprefix
        # guarantees both strings start with the prefix, so slicing
        # is safe.
        relbasedir = basedir[len(prefix):]
        relfilename = filename[len(prefix):]
    else:
        relbasedir = basedir
        relfilename = filename
    # Fix: guard against an empty relfilename -- the original
    # indexed relfilename[0] unconditionally and raised IndexError
    # when the filename equalled the prefix.
    if relfilename and relfilename[0] in ('\\', '/'):
        relfilename = relfilename[1:]
    if relbasedir == "":
        return relfilename
    else:
        # for each path in relbasedir add a '..' to the
        # relative filename
        # first replace all '\\' with '/'
        relbasedir = relbasedir.replace('\\', '/')
        dotdots = len(relbasedir.split('/'))
        for x in range(dotdots - 1):
            relfilename = '../' + relfilename
        return relfilename
def get_relative_depth(self, hu, mode=0):
    """ Get relative depth of current url object vs passed url object.
    Return a postive integer if successful and -1 on failure.

    mode 0 -> depth based on directory path; valid only when hu's
              directory contains (is a parent of) ours.
    mode 1 -> depth based on the base server (domain); valid only
              when both urls are on the same server.
    mode 2 -> depth of this url's directory below its own domain
              (the 'hu' argument is not used in this mode). """
    # Fixed 2 bugs on 22/7/2003
    # 1 => passing arguments to find function in wrong order
    # 2 => Since we allow the notion of zero depth, even zero
    # value of depth should be returned.
    dir1 = self.get_url_directory()
    dir2 = hu.get_url_directory()
    # split off the protocol from directories
    dir1 = dir1.replace(self.protocol, '')
    dir2 = dir2.replace(self.protocol, '')
    # Append a '/' to the dirpath if not already present
    if dir1[-1] != '/': dir1 += '/'
    if dir2[-1] != '/': dir2 += '/'
    if mode == 0:
        # check if dir2 is present in dir1
        # bug: we were passing arguments to the find function
        # in the wrong order.
        if dir1.find(dir2) != -1:
            # we need to check for depth only if the above condition is true.
            l1 = dir1.split('/')
            l2 = dir2.split('/')
            if l1 and l2:
                diff = len(l1) - len(l2)
                if diff >= 0: return diff
        return -1
    # This mode checks for depth based on the base server(domain).
    elif mode == 1:
        if self.domain == hu.domain:
            # we need to check for depth only if the above condition is true.
            l1 = dir1.split('/')
            l2 = dir2.split('/')
            if l1 and l2:
                diff = len(l1) - len(l2)
                if diff >= 0: return diff
        return -1
    # This check is done for the current url against current base server
    # (domain), i.e. this mode does not use the argument 'hu'.
    elif mode == 2:
        dir2 = self.domain
        # Fix: the original tested dir1[len(dir2)-1] here, indexing
        # the wrong string when deciding whether dir2 needs a
        # trailing slash.
        if dir2[-1] != '/':
            dir2 += '/'
        l1 = dir1.split('/')
        l2 = dir2.split('/')
        if l1 and l2:
            diff = len(l1) - len(l2)
            if diff >= 0: return diff
        return -1
    return -1
def get_root_dir(self):
    """Return the root directory (self.rootdir) of this url object."""

    return self.rootdir
def get_local_directory(self):
    """ Return the local directory path of this url w.r.t
    the directory on the disk where we save the files of this url """
    # Gives the local directory path equivalent to the URL path on
    # the server; could be used to cache HTML pages to disk.
    # Start at <rootdir>/<domain> and descend through each non-empty
    # component of the url's directory path, sanitising each one.
    path = os.path.join(self.rootdir, self.domain)
    for component in self.dirpath:
        if not component:
            continue
        valid = self.__make_valid_filename(component)
        path = os.path.abspath(os.path.join(path, valid))
    return path
def is_image(self):
    """ Find out by filename extension if the file is an image """
    # Covers almost all popular web image formats.
    img_extns = ('.bmp', '.dib', '.dcx', '.emf', '.fpx', '.gif', '.img',
                 '.jp2', '.jpc', '.j2k', '.jpf', '.jpg', '.jpeg', '.jpe',
                 '.mng', '.pbm', '.pcd', '.pcx', '.pgm', '.png', '.ppm',
                 '.psd', '.ras', '.rgb', '.tga', '.tif', '.tiff', '.wbmp',
                 '.xbm', '.xpm')
    extn = os.path.splitext(self.validfilename)[1].lower()
    return extn in img_extns
def is_webpage(self):
    """ Find out by filename extension if the file <filename>
    is an html or html-like (server-side dynamic html files)
    file, or a candidate for one """
    # Note: dynamic server-side scripts (php, psp, asp, pl, jsp, cgi)
    # are treated as possible html candidates even though they might
    # actually generate non-html content (e.g. dynamic images).
    webpage_extns = ('.htm', '.html', '.php', '.asp', '.jsp',
                     '.psp', '.pl', '.cgi')
    if not self.has_extension:
        return 1
    if self.validfilename:
        extn = os.path.splitext(self.validfilename)[1].lower()
        if extn in webpage_extns:
            return 1
    return 0
def set_url_content_info(self, contentlen, contenttype):
    """ This function sets the url content information of this
    url. It is a convenient function which can be used by connectors
    to store url content information """
    self.contentdict.update({'Content-length': contentlen,
                             'Content-type': contenttype})
def get_url_content_info(self):
    """Return the dictionary holding this url's content information."""

    return self.contentdict
if __name__=="__main__":
    # Test code
    # NOTE(review): Python 2 driver (uses print statements). Builds a
    # list of HarvestManUrlParser objects covering absolute, relative,
    # protocol-less, ftp, image and anchor urls, then prints the result
    # of each accessor for every one of them.
    hulist = [HarvestManUrlParser('http://www.yahoo.com/photos/my photo.gif'),
    HarvestManUrlParser('http://www.rediff.com:80/r/r/tn2/2003/jun/25usfed.htm'),
    HarvestManUrlParser('http://cwc2003.rediffblogs.com'),
    HarvestManUrlParser('/sports/2003/jun/25beck1.htm', 'normal', 0, 'http://www.rediff.com'),
    HarvestManUrlParser('ftp://ftp.gnu.org/pub/lpf.README'),
    HarvestManUrlParser('http://www.python.org/doc/2.3b2/'),
    HarvestManUrlParser('//images.sourceforge.net/div.png', 'image', 0, 'http://sourceforge.net'),
    HarvestManUrlParser('http://pyro.sourceforge.net/manual/LICENSE'),
    HarvestManUrlParser('python/test.htm', 'normal', 0, 'http://www.foo.com/bar'),
    HarvestManUrlParser('/python/test.css', 'normal', 0, 'http://www.foo.com/bar/vodka/test.htm'),
    HarvestManUrlParser('/visuals/standard.css', 'normal', 0, 'http://www.garshol.priv.no/download/text/perl.html', 'd:/websites'),
    HarvestManUrlParser('www.fnorb.org/index.html', 'normal', 0, 'http://pyro.sourceforge.net', 'd:/websites'),
    HarvestManUrlParser('http://profigure.sourceforge.net/index.html', 'normal', 0, 'http://pyro.sourceforge.net', 'd:/websites'),
    HarvestManUrlParser('#anchor', 'anchor', 0, 'http://www.foo.com/bar/index.html')]
    # Dump every accessor's value for each parsed url.
    for hu in hulist:
        print '------------------------------------------------------------------\n'
        print 'Full filename = ', hu.get_full_filename()
        print 'Valid filename = ', hu.validfilename
        print 'Local Filename = ', hu.get_filename()
        print 'Is relative path = ', hu.is_relative_path()
        print 'Full domain = ', hu.get_full_domain()
        print 'Domain = ', hu.domain
        print 'Local directory = ', hu.get_url_directory()
        print 'Absolute Url = ', hu.get_full_url()
        print 'Absolute Url Without Port = ', hu.get_full_url_sans_port()
        print 'Local Directory = ', hu.get_local_directory()
        print 'Is filename parsed = ', hu.filename_url
        print 'Path rel to domain = ', hu.get_relative_url()
        print 'Connection Port = ', hu.get_port_number()
        print 'Domain with port = ', hu.get_full_domain_with_port()
        print 'Relative filename = ', hu.get_relative_filename()
        print 'Anchor url = ', hu.get_anchor_url()
        print 'Anchor tag = ', hu.get_anchor()
# End of urlparser.py