urlparser.py
来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页
PY
934 行
self.baseurl.port != self.port and\ self.baseurl.protocol != 'file://': self.port = self.baseurl.port # Convert domain to lower case if self.domain != '': self.domain = self.domain.lower() def make_valid_filename(self, s): """ Replace junk characters to create a valid filename """ # Replace any %xx strings percent_chars = percent_repl.findall(s) for pchar in percent_chars: try: s = s.replace(pchar, chr(int(pchar.replace('%','0x'), 16))) except UnicodeDecodeError: try: s = s.decode('iso-8859-1') s = s.replace(pchar, chr(int(pchar.replace('%','0x'), 16))) except UnicodeDecodeError, e: pass for x,y in itertools.izip(junk_chars, junk_chars_repl): s = s.replace(x, y) return s def make_valid_url(self, url): """ Make a valid url """ for x,y in itertools.izip(dirty_chars, dirty_chars_repl): if x in url: url = url.replace(x, y) # Replace spaces between words # with '%20'. # For example http://www.foo.com/bar/this file.html # Fix: Use regex instead of blind # replacement. if wspacere.search(url): url = re.sub(r'\s', '%20', url) # Replace all % chars with their capital counterparts # i.e %3a => %3A, %5b => %5B etc. This helps in # canonicalization. percent_chars = percent_repl.findall(url) for pchar in percent_chars: url = url.replace(pchar, pchar.upper()) return url def is_filename_url(self): """ Return whether this is file name url """ # A directory url is something like http://www.python.org # which points to the <index.html> file inside the www.python.org # directory.A file name url is a url that points to an actual # file like http://www.python.org/doc/current/tut/tut.html return self.filelike def is_cgi(self): """ Check whether this url is a cgi (dynamic/form) link """ return self.cgi def is_relative_path(self): """ Return whether the original url was a relative one """ return self.isrel def is_relative_to_server(self): """ Return whether the original url was relative to the server """ return self.isrels def is_image(self): """ Find out if the file is an image """ if self.typ == 'image': return True elif self.typ == 'generic': if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in image_extns) return False def is_multimedia(self): """ Found out if the file is a multimedia (vide or audio) type """ return (self.is_video() or self.is_audio()) def is_audio(self): """ Find out if the file is a multimedia audio type """ if self.typ == 'audio': return True elif self.typ == 'generic': if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in sound_extns) return False def is_video(self): """ Find out if the file is a multimedia video type """ if self.typ == 'video': return True elif self.typ == 'generic': if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in movie_extns) return False def is_webpage(self): """ Find out by if the file is a webpage type """ # Note: right now we treat dynamic server-side scripts namely # php, psp, asp, pl, jsp, and cgi as possible html candidates, though # actually they might be generating non-html content (like dynamic # images.) if self.typ.isA(URL_TYPE_WEBPAGE): return True elif self.typ==URL_TYPE_ANY: if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() if extn in webpage_extns: return True elif extn not in document_extns and extn not in image_extns: return True else: # jkleven: 10/1/06. Forms were never being parsed for links. # If we are allowing download of query forms (i.e., bin?asdf=3 style URLs) # then run the URL through a regex if we're still not sure if its ok. # if it matches the from_re precompiled regex then we'll assume its # a query style URL and we'll return true. if objects.config and objects.config.getquerylinks and form_re.search(self.get_full_url()): return True return False def is_stylesheet(self): """ Find out whether the url is a style sheet type """ if self.typ == 'stylesheet': return True elif self.typ == 'generic': if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in stylesheet_extns) return False def is_document(self): """ Return whether the url is a document """ # This method is useful for Indexers which use HarvestMan. # We define any URL which is not an image, is either a web-page # or any of the following types as a document. # Microsoft word documents # Openoffice documents # Adobe PDF documents # Postscript documents if self.is_image(): return False if self.is_webpage(): return True # Check extension if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in document_extns) return False def is_flash(self): """ Return whether the url is flash, flash source code or flash action script """ # Check extension if self.validfilename: extn = ((os.path.splitext(self.validfilename))[1]).lower() return (extn in flash_extns) return False def is_equal(self, url): """ Find whether the passed url matches my url """ # Try 2 tests, one straightforward # other with a "/" appended at the end myurl = self.get_full_url() if url==myurl: return True #else: # myurl += URLSEP # if url==myurl: # return True return False # ============ End - Is (Boolean Get) Methods =========== # # ============ Begin - General Get Methods ============== # def get_url_content_info(self): """ Get the url content information """ return self.contentdict def get_anchor(self): """ Return the anchor tag of this url """ return self.anchor def get_anchor_url(self): """ Get the anchor url, if this url is an anchor type """ return "".join((self.get_full_url(), self.anchor)) def get_generation(self): """ Return the generation of this url """ return self.generation def get_priority(self): """ Get the priority for this url """ return self.priority def get_download_status(self): """ Return the download status for this url """ return self.status def get_type(self): """ Return the type of this url as a string """ return self.typ def get_parent_url(self): """ Return the parent url of this url """ return self.baseurl def get_url_directory(self): """ Return the directory path (url minus its filename if any) of the url """ # get the directory path of the url fulldom = self.get_full_domain() urldir = fulldom if self.dirpath: newpath = "".join((URLSEP, "".join([ x+'/' for x in self.dirpath]))) urldir = "".join((fulldom, newpath)) return urldir def get_url_directory_sans_domain(self): """ Return url directory minus the domain """ # New function in 1.4.1 urldir = '' if self.dirpath: urldir = "".join((URLSEP, "".join([ x+'/' for x in self.dirpath]))) return urldir def get_url(self): """ Return the url of this object """ return self.url def get_original_url(self): """ Return the original url of this object """ return self.origurl def get_canonical_url(self): """ Return the canonicalized form of this URL """ # A canonical URL or 'normalized' URL is a URL modified # to a standardized form so that similar URLs can be # found out by comparing their canonical forms. HarvestMan # uses canonical URLs to remove DUST (Duplicate URLs with # similar text) to some extent. # Wikipedia describes canonicalization of a URL # {http://en.wikipedia.org/wiki/URL_normalization} # # 1. Converting the scheme and host to lower case... # 2. Adding trailing to directory URLs... # 3. Removing directory index, i.e # http://www.example.com/default.asp => http://www.example.com/ # http://www.example.com/index.html => http://www.example.com/ # 4. Case insensitive files => If the URL is running on a case insensitive # file system (Windows, example: FAT*, NTFS etc), then the canonical # form should use lower case. # 5. Capitalizing letters in escape sequences - All letters within a # percent-encoding triplet (e.g., "%3A") are case-insensitive, and should # be capitalized. # Egs: http://www.example.com/a%c2%b1b 鈫
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?