# urlparser.py
""" HarvestManUrlPathParser.py - Module to parse a url string based
on a directory and extract useful information like
directory, filename path etc. This software is part of
the HarvestMan(R) program.
Authors: Nirmal Kannan Chidambaram(nkchidambaram at yahoo dot com),
Anand B Pillai (anandpillai at letterboxes dot org).
For licensing information see the file LICENSE.txt that
is included in this distribution.
Dependency
==========
os, string, copy
Modification History
====================
Anand 1.3 alpha release Cleaning up, renaming, many changes.
Jan 2 2004 Anand 1.4 bug fix version development started.
Feb 10 2004 Anand 1.3.1 bug fix release.
Feb 24 2004 Anand 1.3.3 devel started. Fixed a bug with
ftp links.
Jun 5-9 2004 Anand 1.4 development.
*Modified __init__ method. It now takes
an existing url object or a url string for
the base url option. Root directory obtained
from base url object instead of explicitly passing
everytime.
*Added url generation and priority properties.
*Added a download status proprety
*Most visible change is the way we distinguish
between directory like and file like urls. Replaced
previous code which assumed it is a directory url
then tried to download it, and corrected to a file-like
url if download failed. This tied this class closely
to connector methods and necessitated caching of
connections. The new algorithm assumes all urls with
no extensions to be file like urls. The actual decision
is taken when we request for the file in the connector,
which then sets a flag in the url object, if it is a
directory url. Base url object is made a weakref reference
since we need to propagate such changes back by one
generation.
*Fix for urls with spaces.
Jun 14 2004 Anand 1.3.9 release.
"""
import copy
import weakref
import os, sys
from common import *
class HarvestManUrlParserError(Exception):

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str(self.value)
class HarvestManUrlParser:

    def __init__(self, tarURL, urltype='normal', iscgi=0, baseUrlObj=None, rootDir=''):
        self.url = tarURL
        self._urltyp = urltype
        self._anchor = ''
        # if anchor type link, resolve the anchor
        if self._urltyp == 'anchor':
            if baseUrlObj is None:
                raise HarvestManUrlParserError, 'Base url should not be empty for anchor type urls'
            index = self.url.rfind('#')
            if index != -1:
                newhref = self.url[:index]
                self._anchor = self.url[index:]
                self.url = newhref

        self._iscgi = iscgi
        self.filename = 'index.html'
        self.validfilename = 'index.html'
        self.lastpath = ''
        # Fix for bug #B1077613467.85
        self.protocol = ''
        self.defaultproto = False
        # If the url is a file like url this value will be true;
        # if it is a directory like url, this value will be false.
        self.filename_url = False
        # Download status, a number indicating whether this url was
        # downloaded successfully or not. 0 indicates a successful
        # download, and any number > 0 indicates a failed download.
        self._dstatus = 0
        # is starting url?
        self.is_starting_url = False
        # Flag for files having an extension
        self.has_extension = False
        # Relative path flags
        self.is_rel = False
        self.is_relto_server = False
        self.port = 80
        self.domain = ''
        self.rpath = []
        # Recursion depth
        self.recursion_depth = 0
        # Content information for updating urls
        self.contentdict = {}
        # Url generation
        self._generation = 0
        # Url priority
        self._priority = 0
        # Rules violation cache flags
        self.__violatesrules = False
        self.__rulescheckdone = False

        self._baseUrlObj = None
        if baseUrlObj:
            if isinstance(baseUrlObj, HarvestManUrlParser):
                self._baseUrlObj = weakref.ref(baseUrlObj)
            elif type(baseUrlObj) is str:
                baseUrlObj = HarvestManUrlParser(baseUrlObj, 'normal', self._iscgi, None, rootDir)
                self._baseUrlObj = weakref.ref(baseUrlObj)

        if rootDir == '':
            if self._baseUrlObj:
                self.rootdir = self._baseUrlObj().get_root_dir()
            else:
                self.rootdir = os.getcwd()
        else:
            self.rootdir = rootDir

        self.__resolve_url()
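    # A minimal usage sketch (comments only, not executed), assuming the rest
    # of the class behaves as the module docstring describes; the urls below
    # are illustrative:
    #
    #   base = HarvestManUrlParser('http://www.python.org/doc/current/')
    #   # A child url may be built from either the base object or its url
    #   # string; the base is held internally only as a weak reference.
    #   child = HarvestManUrlParser('tut/tut.html#start', 'anchor', 0, base)
    #   child.get_anchor()   # -> '#start' (stripped from the url before it
    #                        #    is resolved against the base)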
    def __make_valid_filename(self, s):
        """ Replace junk characters to create a valid
        filename """

        junks = list('?*"<>!:/\\')
        for x in junks:
            if s.find(x) != -1:
                s = s.replace(x, '')

        # replace '%20' with the space
        # character (generated by POST requests)
        s = s.replace('%20', ' ')
        # replace %7E with ~
        s = s.replace('%7E', '~')

        return s
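    # A short illustration (comments only) of the sanitization above; the
    # sample strings are made up for this example:
    #
    #   'doc/tut?page=1 <copy>.html'  ->  'doctutpage=1 copy.html'
    #       (junk characters ?, <, >, / are dropped)
    #   'my%20page%7Ev1.html'         ->  'my page~v1.html'
    #       ('%20' becomes a space and '%7E' becomes '~')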
    def __make_valid_url(self, url):
        """ Make a valid url """

        # Replace spaces with "%20"
        url = url.replace(' ', '%20')
        return url
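    # The inverse direction for urls (comments only): spaces are escaped as
    # '%20', e.g. 'http://example.com/my page.html' ->
    # 'http://example.com/my%20page.html'. The example url is made up.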
    def has_filename_extension(self):
        """ Return whether the url of this object has a filename extension """

        return self.has_extension

    def is_filename_url(self):
        """ Return whether this is a file name url """

        # A directory url is something like http://www.python.org
        # which points to the <index.html> file inside the www.python.org
        # directory. A file name url is a url that points to an actual
        # file like http://www.python.org/doc/current/tut/tut.html
        return self.filename_url
    def set_directory_url(self, directory_url=True):
        """ Set this as a directory url """

        self.filename_url = False
        self.dirpath.append(self.lastpath)
        print "MY DIRPATH CHANGED=> ", self.dirpath
        self.validfilename = 'index.html'
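    # Hedged sketch (comments only) of how the connector mentioned in the
    # module docstring might use this call; the connector code and the
    # server_says_directory() check are assumptions, not part of this module:
    #
    #   if server_says_directory(urlobj):   # hypothetical check in the connector
    #       urlobj.set_directory_url()      # url now maps to <dir>/index.html
    #       # urlobj.is_filename_url() returns False from this point on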
    def get_anchor(self):
        """ Return the anchor tag of this url """
        return self._anchor

    def get_anchor_url(self):
        """ Get the anchor url, if this url is an anchor type """
        return self.get_full_url() + self._anchor

    def get_generation(self):
        """ Return the generation of this url """
        return self._generation

    def set_generation(self, gen):
        """ Set the generation of this url """
        self._generation = gen

    def get_priority(self):
        """ Get the priority for this url """
        return self._priority

    def set_priority(self, priority):
        """ Set the priority for this url """
        self._priority = priority

    def get_download_status(self):
        """ Return the download status for this url """
        return self._dstatus

    def set_download_status(self, dstatus):
        """ Set the download status of this url """
        self._dstatus = int(dstatus)

    def is_start_url(self):
        """ Find out if this is the starting url """
        return self.is_starting_url
    def __resolve_url(self):
        """ Local method for resolving urls """

        self.recursion_depth += 1
        if len(self.url) == 0:
            raise HarvestManUrlParserError, 'HarvestManUrlParserError: Zero length Url'

        protocolindex = self.url.find("http://")
        if protocolindex != -1:
            self.protocol = 'http://'
            self.port = 80
        else:
            protocolindex = self.url.find("ftp://")
            if protocolindex != -1:
                self.protocol = 'ftp://'
                self.port = 21
            else:
                protocolindex = self.url.find("https://")
                if protocolindex != -1:
                    self.protocol = 'https://'
                    self.port = 443
                else:
                    # for www urls, we assume the protocol
                    # is http://
                    protocolindex = self.url.find("www.")
                    if protocolindex != -1:
                        self.protocol = 'http://'
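    # Expected protocol/port classification from the checks above (comments
    # only); the example urls are illustrative, and the remaining fields are
    # filled in by the rest of __resolve_url:
    #
    #   'http://www.python.org/'      -> protocol 'http://',  port 80
    #   'ftp://ftp.example.com/pub/'  -> protocol 'ftp://',   port 21
    #   'https://www.example.com/'    -> protocol 'https://', port 443
    #   'www.python.org'              -> protocol 'http://' (assumed www url)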