urlparser.py

来自「Harvestman-最新版本」· Python 代码 · 共 934 行 · 第 1/3 页

PY
934
字号
               self.baseurl.port != self.port and\               self.baseurl.protocol != 'file://':                        self.port = self.baseurl.port        # Convert domain to lower case        if self.domain != '':            self.domain = self.domain.lower()            def make_valid_filename(self, s):        """ Replace junk characters to create a valid filename """        # Replace any %xx strings        percent_chars = percent_repl.findall(s)        for pchar in percent_chars:            try:                s = s.replace(pchar, chr(int(pchar.replace('%','0x'), 16)))            except UnicodeDecodeError:                try:                    s = s.decode('iso-8859-1')                    s = s.replace(pchar, chr(int(pchar.replace('%','0x'), 16)))                except UnicodeDecodeError, e:                    pass                        for x,y in itertools.izip(junk_chars, junk_chars_repl):            s = s.replace(x, y)        return s    def make_valid_url(self, url):        """ Make a valid url """        for x,y in itertools.izip(dirty_chars, dirty_chars_repl):            if x in url:                url = url.replace(x, y)        # Replace spaces between words        # with '%20'.        # For example http://www.foo.com/bar/this file.html        # Fix: Use regex instead of blind        # replacement.        if wspacere.search(url):            url = re.sub(r'\s', '%20', url)                # Replace all % chars with their capital counterparts        # i.e %3a => %3A, %5b => %5B etc. This helps in        # canonicalization.        
percent_chars = percent_repl.findall(url)        for pchar in percent_chars:            url = url.replace(pchar, pchar.upper())                    return url    def is_filename_url(self):        """ Return whether this is file name url """        # A directory url is something like http://www.python.org        # which points to the <index.html> file inside the www.python.org        # directory.A file name url is a url that points to an actual        # file like http://www.python.org/doc/current/tut/tut.html        return self.filelike    def is_cgi(self):        """ Check whether this url is a cgi (dynamic/form) link """        return self.cgi    def is_relative_path(self):        """ Return whether the original url was a relative one """        return self.isrel    def is_relative_to_server(self):        """ Return whether the original url was relative to the server """                return self.isrels    def is_image(self):        """ Find out if the file is an image """        if self.typ == 'image':            return True        elif self.typ == 'generic':            if self.validfilename:                extn = ((os.path.splitext(self.validfilename))[1]).lower()                return (extn in image_extns)                     return False    def is_multimedia(self):        """ Found out if the file is a multimedia (vide or audio) type """        return (self.is_video() or self.is_audio())            def is_audio(self):        """ Find out if the file is a multimedia audio type """        if self.typ == 'audio':            return True        elif self.typ == 'generic':            if self.validfilename:                extn = ((os.path.splitext(self.validfilename))[1]).lower()                return (extn in sound_extns)                     return False    def is_video(self):        """ Find out if the file is a multimedia video type """        if self.typ == 'video':            return True        elif self.typ == 'generic':            if self.validfilename:        
        extn = ((os.path.splitext(self.validfilename))[1]).lower()                return (extn in movie_extns)                     return False                def is_webpage(self):        """ Find out by if the file is a webpage type """        # Note: right now we treat dynamic server-side scripts namely        # php, psp, asp, pl, jsp, and cgi as possible html candidates, though        # actually they might be generating non-html content (like dynamic        # images.)                if self.typ.isA(URL_TYPE_WEBPAGE):            return True        elif self.typ==URL_TYPE_ANY:            if self.validfilename:                extn = ((os.path.splitext(self.validfilename))[1]).lower()                                if extn in webpage_extns:                    return True                elif extn not in document_extns and extn not in image_extns:                    return True                else:                    # jkleven: 10/1/06.  Forms were never being parsed for links.                    # If we are allowing download of query forms (i.e., bin?asdf=3 style URLs)                    # then run the URL through a regex if we're still not sure if its ok.                    # if it matches the from_re precompiled regex then we'll assume its                    # a query style URL and we'll return true.                    
if objects.config and objects.config.getquerylinks and form_re.search(self.get_full_url()):                        return True        return False    def is_stylesheet(self):        """ Find out whether the url is a style sheet type """        if self.typ == 'stylesheet':            return True        elif self.typ == 'generic':            if self.validfilename:                extn = ((os.path.splitext(self.validfilename))[1]).lower()                return (extn in stylesheet_extns)                     return False    def is_document(self):        """ Return whether the url is a document """        # This method is useful for Indexers which use HarvestMan.        # We define any URL which is not an image, is either a web-page        # or any of the following types as a document.        # Microsoft word documents        # Openoffice documents        # Adobe PDF documents        # Postscript documents        if self.is_image(): return False        if self.is_webpage(): return True        # Check extension        if self.validfilename:            extn = ((os.path.splitext(self.validfilename))[1]).lower()            return (extn in document_extns)        return False    def is_flash(self):        """ Return whether the url is flash, flash source code        or flash action script """        # Check extension        if self.validfilename:            extn = ((os.path.splitext(self.validfilename))[1]).lower()            return (extn in flash_extns)        return False                    def is_equal(self, url):        """ Find whether the passed url matches        my url """        # Try 2 tests, one straightforward        # other with a "/" appended at the end        myurl = self.get_full_url()        if url==myurl:            return True        #else:        #    myurl += URLSEP        #    if url==myurl:        #        return True        return False            # ============ End - Is (Boolean Get) Methods =========== #      # ============ Begin - General Get Methods 
============== #    def get_url_content_info(self):        """ Get the url content information """                return self.contentdict        def get_anchor(self):        """ Return the anchor tag of this url """        return self.anchor    def get_anchor_url(self):        """ Get the anchor url, if this url is an anchor type """        return "".join((self.get_full_url(), self.anchor))    def get_generation(self):        """ Return the generation of this url """                return self.generation        def get_priority(self):        """ Get the priority for this url """        return self.priority    def get_download_status(self):        """ Return the download status for this url """        return self.status    def get_type(self):        """ Return the type of this url as a string """                return self.typ    def get_parent_url(self):        """ Return the parent url of this url """                return self.baseurl    def get_url_directory(self):        """ Return the directory path (url minus its filename if any) of the url """                # get the directory path of the url        fulldom = self.get_full_domain()        urldir = fulldom        if self.dirpath:            newpath = "".join((URLSEP, "".join([ x+'/' for x in self.dirpath])))            urldir = "".join((fulldom, newpath))        return urldir    def get_url_directory_sans_domain(self):        """ Return url directory minus the domain """        # New function in 1.4.1        urldir = ''                if self.dirpath:            urldir = "".join((URLSEP, "".join([ x+'/' for x in self.dirpath])))        return urldir                    def get_url(self):        """ Return the url of this object """                return self.url    def get_original_url(self):        """ Return the original url of this object """                return self.origurl    def get_canonical_url(self):        """ Return the canonicalized form of this URL """        # A canonical URL or 'normalized' 
        # URL is a URL modified
        # to a standardized form so that similar URLs can be
        # found out by comparing their canonical forms. HarvestMan
        # uses canonical URLs to remove DUST (Different URLs with
        # Similar Text) to some extent.
        # Wikipedia describes canonicalization of a URL
        # {http://en.wikipedia.org/wiki/URL_normalization}
        #
        # 1. Converting the scheme and host to lower case...
        # 2. Adding a trailing slash to directory URLs...
        # 3. Removing directory index, i.e
        #    http://www.example.com/default.asp => http://www.example.com/
        #    http://www.example.com/index.html => http://www.example.com/
        # 4. Case insensitive files => If the URL is running on a case insensitive
        #    file system (Windows, example: FAT*, NTFS etc), then the canonical
        #    form should use lower case.
        # 5. Capitalizing letters in escape sequences - All letters within a
        #    percent-encoding triplet (e.g., "%3A") are case-insensitive, and should
        #    be capitalized.
        #    Egs: http://www.example.com/a%c2%b1b => http://www.example.com/a%C2%B1b

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?