
urlparser.py (web spider module, Python), part 1 of 3
    def get_domain(self):
        """ Return the domain (server) for this url object """
        
        return self.domain

    def get_full_domain(self):
        """ Return the full domain (protocol + domain) for this url object """
        
        return self.protocol + self.domain

    def get_full_domain_with_port(self, intranet=0):
        """ Return the domain (server) with port number
        appended to it, if the port number is not the
        default for the current protocol """

        if intranet or ((self.protocol == 'http://' and int(self.port) != 80) \
                        or (self.protocol == 'https://' and int(self.port) != 443) \
                        or (self.protocol == 'ftp://' and int(self.port) != 21)):
            return self.get_full_domain() + ':' + str(self.port)
        else:
            return self.get_full_domain()
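
    # Illustration (hypothetical urls, not from the test data below): for a
    # url object parsed from 'http://www.foo.com:8080/index.html',
    # get_full_domain_with_port() would return 'http://www.foo.com:8080',
    # since 8080 is not the default HTTP port, whereas for
    # 'http://www.foo.com/index.html' it would return just 'http://www.foo.com'.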

    def get_domain_with_port(self, intranet=0):
        """ Return the domain (server) with port number
        appended to it, if the port number is not the
        default for the current protocol """

        if intranet or ((self.protocol == 'http://' and int(self.port) != 80) \
                        or (self.protocol == 'https://' and int(self.port) != 443) \
                        or (self.protocol == 'ftp://' and int(self.port) != 21)):
            return self.domain + ':' + str(self.port)
        else:
            return self.domain

    def get_full_filename(self):
        """ Return the full filename of this url on the disk.
        This is created w.r.t the local directory where we save
        the url data """

        filename=os.path.join(self.get_local_directory(), self.get_filename())
        return filename

    def get_filename(self):
        """ Return the filenam of this url on the disk. """

        # NOTE: This is just the filename, not the absolute filename path
        if self._iscgi or not self.filename_url:
            self.validfilename = 'index.html'
            
        return self.validfilename
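
    # Illustration (hypothetical urls): a CGI-style url such as
    # 'http://www.foo.com/cgi-bin/search?q=x', or a url with no filename
    # part such as 'http://www.foo.com/', is saved to disk as 'index.html'.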

    def get_relative_filename(self):
        """ Get the relative filename path of this url object w.r.t
        its base url """

        if not self._baseUrlObj:
            return self.get_filename()
        else:
            basedir = self._baseUrlObj().get_local_directory()
            filename = self.get_full_filename()

            l = [basedir, filename]

            # Note: os.path.commonprefix compares paths character by character,
            # so it can return an invalid path name; convert it to a valid
            # path by taking the directory name of the prefix.
            prefix = os.path.commonprefix(l)
            if not os.path.exists(prefix):
                prefix = os.path.dirname(prefix)

            if prefix:
                relbasedir = basedir.replace(prefix, '')
                relfilename = filename.replace(prefix, '')
            else:
                relbasedir = basedir
                relfilename = filename
                
            if relfilename[:1] in ('\\', '/'):
                relfilename = relfilename[1:]
                
            if relbasedir=="":
                return relfilename
            else:
                # for each path in relbasedir add a '..' to the
                # relative filename
                # first replace all '\\' with '/'
                relbasedir = relbasedir.replace('\\', '/')
                dotdots = len(relbasedir.split('/'))

                for x in range(dotdots - 1):
                    relfilename = '../' + relfilename

                return relfilename
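
    # Illustration (hypothetical paths): if the base url's local directory is
    # 'd:/websites/www.foo.com' and this url's full filename is
    # 'd:/websites/www.foo.com/images/logo.gif', the common prefix is the
    # base directory itself, so the relative filename becomes
    # 'images/logo.gif'; if the file lies outside the base directory, one
    # '../' is prepended for every remaining path component of the base
    # directory.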

    def get_relative_depth(self, hu, mode=0):
        """ Get relative depth of current url object vs passed url object.
        Return a positive integer if successful and -1 on failure """

        # Fixed 2 bugs on 22/7/2003
        # 1 => passing arguments to find function in wrong order
        # 2 => Since we allow the notion of zero depth, even zero
        # value of depth should be returned.

        # This mode checks for depth based on a directory path
        # This check is valid only if dir2 is a sub-directory of dir1
        dir1=self.get_url_directory()
        dir2=hu.get_url_directory()

        # strip off the protocol from the directories
        dir1 = dir1.replace(self.protocol, '')
        dir2 = dir2.replace(self.protocol, '')      

        # Append a '/' to the dirpath if not already present
        if dir1[len(dir1)-1] != '/': dir1 += '/'
        if dir2[len(dir2)-1] != '/': dir2 += '/'

        # print dir1, dir2
        if mode==0:
            # check if dir2 is present in dir1
            # bug: we were passing arguments to the find function
            # in the wrong order.
            if dir1.find(dir2) != -1:
                # we need to check for depth only if the above condition is true.
                l1=dir1.split('/')
                l2=dir2.split('/')
                # print l1, l2
                if l1 and l2:
                    diff=len(l1) - len(l2)
                    if diff>=0: return diff

            return -1
        # This mode checks for depth based on the base server(domain).
        # This check is valid only if dir1 and dir2 belong to the same
        # base server (checked by name)
        elif mode==1:
            if self.domain == hu.domain:
                # we need to check for depth only if the above condition is true.
                l1=dir1.split('/')
                l2=dir2.split('/')
                if l1 and l2:
                    diff=len(l1) - len(l2)
                    if diff>=0: return diff
            return -1

        # This check is done for the current url against current base server (domain)
        # i.e, this mode does not use the argument 'hu'
        elif mode==2:
            dir2 = self.domain
            if dir2[len(dir2)-1] != '/':
                dir2 += '/'
            # no condition to verify in this mode; just compute the depth
            # of this url's directory below its own domain root
            l1=dir1.split('/')
            l2=dir2.split('/')
            if l1 and l2:
                diff=len(l1) - len(l2)
                if diff>=0: return diff
            return -1

        return -1
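
    # Illustration (hypothetical urls): with this object parsed from
    # 'http://www.foo.com/a/b/index.html' and hu parsed from
    # 'http://www.foo.com/a/index.html', mode 0 would return 1, since this
    # url's directory is one level below hu's directory; mode 1 returns the
    # same difference whenever both urls share the same domain; mode 2
    # ignores hu and would return 2, the depth of this url's directory
    # below its own domain root.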

    def get_root_dir(self):
        """ Return root directory """
        
        return self.rootdir
    
    def get_local_directory(self):
        """ Return the local directory path of this url w.r.t
        the directory on the disk where we save the files of this url """
        
        # Gives the local directory path equivalent to the URL path on the server
        # Could be used to cache HTML pages to disk

        rval = os.path.join(self.rootdir, self.domain)

        for d in self.dirpath:
            if d:
                rval = os.path.abspath( os.path.join(rval, self.__make_valid_filename(d)))

        return rval
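
    # Illustration (hypothetical values): with rootdir 'd:/websites', domain
    # 'www.foo.com' and dirpath ['bar', 'images'], this would return a path
    # like 'd:/websites/www.foo.com/bar/images', with each dirpath component
    # passed through __make_valid_filename first.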

    def is_image(self):
        """ Find out by filename extension if the file is an image """

        # Note: This function takes into account almost all popular
        # web image formats.
        img_extns = ['.bmp', '.dib', '.dcx', '.emf', '.fpx', '.gif', '.img',
                     '.jp2', '.jpc', '.j2k', '.jpf', '.jpg', '.jpeg', '.jpe',
                     '.mng', '.pbm', '.pcd', '.pcx', '.pgm', '.png', '.ppm',
                     '.psd', '.ras', '.rgb', '.tga', '.tif', '.tiff', '.wbmp',
                     '.xbm', '.xpm']

        extn = ((os.path.splitext(self.validfilename))[1]).lower()
        return extn in img_extns
            
    def is_webpage(self):
        """ Find out by filename extension if the file <filename>
        is an html or html-like (server-side dynamic html files)
        file, or a candidate for one """

        # Note: right now we treat dynamic server-side scripts namely
        # php, psp, asp, pl, jsp, and cgi as possible html candidates, though
        # actually they might be generating non-html content (like dynamic
        # images.)
        if not self.has_extension:
            return 1
        
        if self.validfilename:
            extn = ((os.path.splitext(self.validfilename))[1]).lower()
            if extn in ('.htm', '.html', '.php', '.asp', '.jsp', '.psp', '.pl', '.cgi'):
                return 1

        return 0
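
    # Illustration: 'index.php' and 'report.cgi' count as possible web pages,
    # 'logo.png' does not, and a url with no extension at all (a directory
    # url, for instance) is also treated as a page candidate.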

    def set_url_content_info(self, contentlen, contenttype):
        """ This function sets the url content information of this
        url. It is a convenient function which can be used by connectors
        to store url content information """

        self.contentdict['Content-length'] = contentlen
        self.contentdict['Content-type'] = contenttype

    def get_url_content_info(self):
        """ Get the url content information """
        
        return self.contentdict
        
if __name__=="__main__":

    # Test code

    hulist = [HarvestManUrlParser('http://www.yahoo.com/photos/my photo.gif'),
              HarvestManUrlParser('http://www.rediff.com:80/r/r/tn2/2003/jun/25usfed.htm'),
              HarvestManUrlParser('http://cwc2003.rediffblogs.com'),
              HarvestManUrlParser('/sports/2003/jun/25beck1.htm', 'normal', 0, 'http://www.rediff.com'),
              HarvestManUrlParser('ftp://ftp.gnu.org/pub/lpf.README'),
              HarvestManUrlParser('http://www.python.org/doc/2.3b2/'),
              HarvestManUrlParser('//images.sourceforge.net/div.png', 'image', 0, 'http://sourceforge.net'),
              HarvestManUrlParser('http://pyro.sourceforge.net/manual/LICENSE'),
              HarvestManUrlParser('python/test.htm', 'normal', 0, 'http://www.foo.com/bar'),
              HarvestManUrlParser('/python/test.css', 'normal', 0, 'http://www.foo.com/bar/vodka/test.htm'),
              HarvestManUrlParser('/visuals/standard.css', 'normal', 0, 'http://www.garshol.priv.no/download/text/perl.html', 'd:/websites'),
              HarvestManUrlParser('www.fnorb.org/index.html', 'normal', 0, 'http://pyro.sourceforge.net', 'd:/websites'),
              HarvestManUrlParser('http://profigure.sourceforge.net/index.html', 'normal', 0, 'http://pyro.sourceforge.net', 'd:/websites'),
              HarvestManUrlParser('#anchor', 'anchor', 0, 'http://www.foo.com/bar/index.html')]              

    for hu in hulist:
        print '------------------------------------------------------------------\n'
        print 'Full filename = ', hu.get_full_filename()
        print 'Valid filename = ', hu.validfilename
        print 'Local Filename  = ', hu.get_filename()
        print 'Is relative path = ', hu.is_relative_path()
        print 'Full domain = ', hu.get_full_domain()
        print 'Domain      = ', hu.domain
        print 'Local directory = ', hu.get_url_directory()
        print 'Absolute Url = ', hu.get_full_url()
        print 'Absolute Url Without Port = ', hu.get_full_url_sans_port()
        print 'Local Directory = ', hu.get_local_directory()
        print 'Is filename parsed = ', hu.filename_url
        print 'Path rel to domain = ', hu.get_relative_url()
        print 'Connection Port = ', hu.get_port_number()
        print 'Domain with port = ', hu.get_full_domain_with_port()
        print 'Relative filename = ', hu.get_relative_filename()
        print 'Anchor url     = ', hu.get_anchor_url()
        print 'Anchor tag     = ', hu.get_anchor()
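
    # Illustrative addition (not part of the original test code): exercise
    # get_relative_depth() in its three modes on a pair of the url objects
    # created above.
    hu1 = hulist[1]     # http://www.rediff.com:80/r/r/tn2/2003/jun/25usfed.htm
    hu2 = hulist[3]     # /sports/2003/jun/25beck1.htm relative to http://www.rediff.com

    print '------------------------------------------------------------------\n'
    print 'Depth (mode 0) = ', hu1.get_relative_depth(hu2, 0)
    print 'Depth (mode 1) = ', hu1.get_relative_depth(hu2, 1)
    print 'Depth (mode 2) = ', hu1.get_relative_depth(hu2, 2)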
