⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlparser.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 3 页
字号:
                        self.port=80
                        self.url = self.protocol + self.url
                    else:
                        # Bug: urls beginning with a double '/' ('//') should not be
                        # treated as relative paths. Instead prepend the protocol string
                        # to it.
                        if self.url.find('//') == 0:
                            protocolindex = 0
                            self.url = self.protocol + self.url[2:]
                            
        if not self.protocol:
            self.protocol = 'http://'
            self.defaultproto = True
        
        if protocolindex==-1:
            # Raise Exception here, if base url is not present
            if self._baseUrlObj is None:
                raise HarvestManUrlParserError, 'Base url should not be empty for relative urls'

            paths = ''

            self.is_rel = True

            # Is relative to server ?
            if self.url[0] == '/': self.is_relto_server = True

            relative_file=0
            if self.url.find('/') == -1: relative_file = 1

            relpaths = self.url.split('/')
            if self.url.find('/') == -1:
                locally_relative = 1

            rindex = 0
            for ritem in relpaths:
                if (ritem.count('.') == len(ritem)) or ritem == '' :
                    rindex = rindex+1
                    if (ritem.count('.') == len(ritem)) and ritem != '':
                        self.rpath.insert(len(self.rpath),ritem)

                else:
                    for ent in relpaths[rindex:]:
                        paths = paths + ent + '/'

                    # Trim off that extra 
                    paths = paths[:len(paths)-1]

                    # Again Trim if the relative path ends with /
                    # like href = /img/abc.gif/ 
                    if paths[len(paths)-1] == '/':
                        paths = paths[:len(paths)-1]
                    break
        else:
            paths = self.url[(protocolindex + len(self.protocol)):]

        # Compute dirpath , if it is Absolute , relative path will be the final dir path
        # to fetch , if else it will be relative path evaluation + dirpath
        if self._iscgi: paths += '/'

        self.dirpath = paths.split('/')
        self.lastpath = self.dirpath[-1]

        if protocolindex != -1:
            if len(self.dirpath ) > 1:
                if self.lastpath:
                    dotindex = self.lastpath.find('.')
                    if dotindex != -1: self.has_extension = True

                    if (dotindex == -1) or \
                       ((dotindex >0) and (dotindex < (len(self.lastpath)-1))):
                        self.filename_url = True
                        self.validfilename =  self.__make_valid_filename(self.lastpath)
                        self.filename = self.lastpath
                        self.dirpath  = self.dirpath [: len(self.dirpath )-1]
                else:                
                    self.dirpath  = self.dirpath [: len(self.dirpath )-1]

        else:
            if self.lastpath:              
                dotindex = self.lastpath.find('.')
                if dotindex != -1: self.has_extension = True

                if (dotindex == -1) or \
                   ((dotindex >0) and (dotindex < (len(self.lastpath)-1))):
                    self.filename_url = True
                    self.validfilename =  self.__make_valid_filename(self.lastpath)
                    self.filename = self.lastpath
                    self.dirpath  = self.dirpath [: len(self.dirpath )-1]                

        # Interprets relative path
        # ../../. Nonsense relative paths are graciously ignored,just basic implementation
        # might be modified later

        if protocolindex == -1:
            #print 'URL -> ', self.url
            #print 'MY DIRPATH ->', self.dirpath
            #print 'BASE DIRPATH ->', self._baseUrlObj().dirpath, self._baseUrlObj()
            
            # Bug fixed on abs relpath / (Test Case #1)
            self.rpath.reverse()
            if len(self.rpath) == 0 :
                if not rindex:
                    self.dirpath = self._baseUrlObj().dirpath + self.dirpath
                else:
                    self.dirpath = self.dirpath
            else:
                pathstack = copy.deepcopy(self._baseUrlObj().dirpath)
                for ritem in self.rpath:
                    if ritem == '.':
                        pathstack = copy.deepcopy(self._baseUrlObj().dirpath)
                    if ritem == '..':
                        if len(pathstack) !=0:
                            pathstack.pop()

                self.dirpath  = pathstack + self.dirpath

        # Support for NONSENSE relative paths such as
        # g/../foo and g/./foo 
        # consider base = http:\\bar.com\bar1
        # then g/../foo => http:\\bar.com\bar1\..\foo => http:\\bar.com\foo
        # g/./foo  is utter nonsense and we feel free to ignore that
        # More nonsense will be directly delegated to webserver , which is free to respond with 404
        # and We are handling that exception 404 ;-)

        if protocolindex == -1:
            index = 0
            for item in self.dirpath:
                if item.count('.') == len(item):
                    if item == '.':
                        self.dirpath.remove('.')
                    if item == '..':
                        self.dirpath.remove('..')
                        self.dirpath.remove(self.dirpath[index - 1])
                index = index + 1

        # Resolving the domain
        # if relative path parent domain :-)
        if protocolindex == -1:
            self.domain = self._baseUrlObj().domain
        else:
            # find out if the domain contains a port number
            # for example, heino2e:8080
            self.domain=self.dirpath[0]
            self.dirpath = self.dirpath[1:]

        dom=self.domain
        index = dom.find(':')
        if index != -1:
            self.domain = dom[:index]
            # A bug here => needs to be fixed
            try:
                self.port   = int(dom[index+1:])
            except:
                pass

        # Now check if the base domain had a port specification (other than 80)
        # Then we need to use that port for all its children, otherwise
        # we can use default value.
        if self._baseUrlObj:
            if self._baseUrlObj().port != 80:
                self.port = self._baseUrlObj().port
                # bug-fix: protocol also need to be
                # derived from base. Fix for bug # B1077613467.85.
                if self.defaultproto:
                    self.protocol = self._baseUrlObj().protocol

    def violates_rules(self):
        """ Tell whether this url breaks the basic download rules.

        The verdict is obtained from the 'ruleschecker' object's
        violates_basic_rules() on the first call and remembered on the
        instance; later calls return the remembered value. """

        # Fast path: the check was already performed for this url
        if self.__rulescheckdone:
            return self.__violatesrules

        checker = GetObject('ruleschecker')
        self.__violatesrules = checker.violates_basic_rules(self)
        self.__rulescheckdone = True

        return self.__violatesrules
        
    def get_type(self):
        """ Give back the type string stored for this url
        (the value last assigned to self._urltyp) """

        urltype = self._urltyp
        return urltype

    def set_type(self, type):
        """ Store 'type' as the type of this url.

        The value is kept in self._urltyp and is what get_type()
        returns afterwards. Nothing is returned. """

        # NOTE: the parameter name shadows the builtin 'type'; it is kept
        # as it is because callers may pass it by keyword.
        self._urltyp = type

    def get_base_urlobject(self):
        """ Return the base url object of this url, or None if this
        url has no base.

        self._baseUrlObj holds a callable reference which has to be
        invoked to obtain the actual base object. It may also be None
        (the parsing code checks for that case), so the reference is
        only called when it is present. """

        base_ref = self._baseUrlObj
        if base_ref is None:
            # No base url was given; calling None would raise TypeError
            return None

        return base_ref()

    def is_cgi(self):
        """ Report the cgi (dynamic/form link) flag of this url,
        as stored in self._iscgi """

        cgi_flag = self._iscgi
        return cgi_flag

    def is_relative_path(self):
        """ Report the is_rel flag, which the parser sets to True
        when the url it was given turned out to be a relative one """

        relative = self.is_rel
        return relative

    def is_relative_to_server(self):
        """ Report the is_relto_server flag, which the parser sets to
        True for a relative url that starts with '/' """

        server_relative = self.is_relto_server
        return server_relative

    def get_url_directory(self):
        """ Return the directory path (url minus its filename if any) of the
        url.

        The result is the protocol string followed by the domain and, when
        the url has directory components, a '/' plus those components
        joined by '/'. No port number and no trailing '/' are added. """

        rval = self.protocol + self.domain
        if self.dirpath:
            # str.join builds the path in one pass and needs no 'reduce'
            rval += '/' + '/'.join(self.dirpath)

        return rval

    def get_url(self):
        """ Give back the url string held in self.url """

        stored_url = self.url
        return stored_url
    
    def get_full_url(self, intranet=0):
        """ Return the full url path of this url object after
        resolving relative paths, filenames etc.

        The url is assembled from the value of
        get_full_domain_with_port(intranet), the directory components
        joined by '/', and the filename if this is a filename url. The
        assembled string is passed through self.__make_valid_url before
        being returned.

        intranet -- forwarded unchanged to get_full_domain_with_port. """

        rval = self.get_full_domain_with_port(intranet)
        if self.dirpath:
            rval = "".join((rval, '/', '/'.join(self.dirpath)))

        # The part before the filename always ends with a single '/'.
        # endswith() also copes with an empty string, where indexing
        # the last character would raise IndexError.
        if not rval.endswith('/'):
            rval = "".join((rval, '/'))

        if self.filename_url:
            rval = "".join((rval, self.filename))

        return self.__make_valid_url(rval)

    def get_full_url_sans_port(self):
        """ Return absolute url without the port number.

        The url is assembled from the value of get_full_domain(), the
        directory components joined by '/', and the filename if this is
        a filename url. The assembled string is passed through
        self.__make_valid_url before being returned. """

        rval = self.get_full_domain()
        if self.dirpath:
            rval = "".join((rval, '/', '/'.join(self.dirpath)))

        # The part before the filename always ends with a single '/'.
        # endswith() also copes with an empty string, where indexing
        # the last character would raise IndexError.
        if not rval.endswith('/'):
            rval = "".join((rval, '/'))

        if self.filename_url:
            rval = "".join((rval, self.filename))

        return self.__make_valid_url(rval)

    def get_port_number(self):
        """ Give back the port number held in self.port
        (80 is the value used for http urls) """

        port_number = self.port
        return port_number

    def get_relative_url(self):
        """ Return relative path of url w.r.t the domain.

        The path starts with '/' followed by the directory components
        joined by '/'; if this is a filename url, '/' and the filename
        are appended. With no directory components and no filename the
        path is empty. The path is passed through self.__make_valid_url
        before being returned. """

        newpath = ""
        if self.dirpath:
            # str.join builds the path in one pass and needs no 'reduce'
            newpath = "/" + "/".join(self.dirpath)

        if self.filename_url:
            newpath = "".join((newpath, '/', self.filename))

        return self.__make_valid_url(newpath)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -