⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlparser.py

📁 网络蜘蛛
💻 PY
📖 第 1 页 / 共 3 页
字号:
""" HarvestManUrlPathParser.py - Module to parse a url string based
    on a directory and extract useful information like
    directory, filename path etc. This software is part of
    the HarvestMan(R) program.

    Authors: Nirmal Kannan Chidambaram(nkchidambaram at yahoo dot com),
             Anand B Pillai (anandpillai at letterboxes dot org).    

    For licensing information see the file LICENSE.txt that
    is included in this distribution.

    Dependency
    ==========
    os, string, copy

    Modification History
    ====================

    Anand    1.3 alpha release Cleaning up, renaming, many changes.
    Jan 2 2004    Anand   1.4 bug fix version development started.
    Feb 10 2004   Anand   1.3.1 bug fix release.
    Feb 24 2004   Anand   1.3.3 devel started. Fixed a bug with
                          ftp links.

    Jun 5-9 2004 Anand    1.4 development.

                          *Modified __init__ method. It now takes
                          an existing url object or a url string for
                          the base url option. Root directory obtained
                          from base url object instead of explicitly passing
                          every time.

                          *Added url generation and priority properties.

                          *Added a download status property

                          *Most visible change is the way we distinguish
                          between directory like and file like urls. Replaced
                          previous code which assumed it is a directory url 
                          then tried to download it, and corrected to a file-like
                          url if download failed. This tied this class closely
                          to connector methods and necessitated caching of
                          connections. The new algorithm assumes all urls with
                          no extensions to be file like urls. The actual decision
                          is taken when we request for the file in the connector,
                          which then sets a flag in the url object, if it is a
                          directory url. Base url object is made a weakref reference
                          since we need to propagate such changes back by one
                          generation.

                          *Fix for urls with spaces.

    Jun 14 2004 Anand     1.3.9 release.
    
"""

import copy
import weakref
import os, sys
from common import *

class HarvestManUrlParserError(Exception):
    """ Exception raised for errors while parsing/handling urls
    in HarvestMan """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Render whatever was passed in, string or not.
        return '%s' % (self.value,)

class HarvestManUrlParser:

    def __init__(self, tarURL, urltype='normal', iscgi = 0, baseUrlObj = None, rootDir = ''):
        """ Initialize the object with the target url <tarURL>.

        urltype    - 'normal' or 'anchor'. Anchor urls are split into a
                     base href and a '#fragment' part.
        iscgi      - flag, true for cgi (query) urls.
        baseUrlObj - the parent url, either as an existing
                     HarvestManUrlParser object or as a url string.
        rootDir    - root directory for local paths; if empty it is
                     inherited from the base url object, or defaults to
                     the current working directory.

        Raises HarvestManUrlParserError for anchor type urls without a
        base url (and, via __resolve_url, for zero length urls).
        """

        self.url = tarURL
        self._urltyp = urltype
        self._anchor = ''

        # if anchor type link, resolve the anchor
        if self._urltyp == 'anchor':
            if baseUrlObj is None:
                # Fixed: was the py2-only "raise E, msg" comma syntax;
                # the call form works on both python 2 and 3.
                raise HarvestManUrlParserError('Base url should not be empty for anchor type urls')

            index = self.url.rfind('#')
            if index != -1:
                newhref = self.url[:index]
                self._anchor = self.url[index:]
                # Fixed: this line was indented with tab characters,
                # breaking the file's 4-space indentation convention.
                self.url = newhref

        self._iscgi = iscgi
        self.filename = 'index.html'
        self.validfilename = 'index.html'
        self.lastpath = ''

        # Fix for bug #B1077613467.85
        self.protocol = ''
        self.defaultproto = False

        # If the url is a file like url
        # this value will be true, if it is
        # a directory like url, this value will
        # be false.
        self.filename_url = False

        # download status, a number indicating
        # whether this url was downloaded successfully
        # or not. 0 indicates a successful download, and
        # any number >0 indicates a failed download
        self._dstatus = 0

        # is starting url?
        self.is_starting_url = False

        # Flag for files having extension
        self.has_extension = False

        # Relative path flags
        self.is_rel = False
        self.is_relto_server = False

        self.port = 80
        self.domain = ''
        self.rpath = []

        # Recursion depth
        self.recursion_depth = 0
        # Content information for updating urls
        self.contentdict = {}

        # Url generation
        self._generation = 0
        # Url priority
        self._priority = 0
        # rules violation cache flags
        self.__violatesrules = False
        self.__rulescheckdone = False

        self._baseUrlObj = None

        if baseUrlObj:
            if isinstance(baseUrlObj, HarvestManUrlParser):
                # Base url object is kept as a weak reference: changes
                # propagate back one generation without creating
                # reference cycles.
                self._baseUrlObj = weakref.ref(baseUrlObj)
            elif type(baseUrlObj) is str:
                baseUrlObj = HarvestManUrlParser(baseUrlObj, 'normal', self._iscgi, None, rootDir)
                # Fixed: keep a strong reference to the object created
                # above - it has no other owner, so without this the
                # weakref would go dead as soon as this constructor
                # returns.
                self.__baseobj_strongref = baseUrlObj
                self._baseUrlObj = weakref.ref(baseUrlObj)

        if rootDir == '':
            if self._baseUrlObj:
                self.rootdir = self._baseUrlObj().get_root_dir()
            else:
                self.rootdir = os.getcwd()
        else:
            self.rootdir = rootDir

        self.__resolve_url()

    def __make_valid_filename(self, s):
        """ Replace junk characters to create a valid
        filename """

        junks=list('?*"<>!:/\\')
        for x in junks:
            if s.find(x) != -1:
                s = s.replace(x, '')

        # replace '%20' with the space
        # character (generated by POST requests)
        s = s.replace('%20', ' ')
        # replace %7E with ~
        s = s.replace('%7E', '~')

        return s

    def __make_valid_url(self, url):
        """ Make a valid url """

        # Replace spaces with "%20"
        url = url.replace(' ', '%20')
        return url

        
    def has_filename_extension(self):
        """ Return whether the url of this object has a filename extension """

        return self.has_extension

    def is_filename_url(self):
        """ Return whether this is file name url """

        # A directory url is something like http://www.python.org
        # which points to the <index.html> file inside the www.python.org
        # directory.A file name url is a url that points to an actual
        # file like http://www.python.org/doc/current/tut/tut.html

        return self.filename_url

    def set_directory_url(self, directory_url=True):
        """ Set this as a directory url """

        self.filename_url = False
        self.dirpath.append(self.lastpath)
        print "MY DIRPATH CHANGED=> ", self.dirpath
        
        self.validfilename = 'index.html'

    def get_anchor(self):
        """ Return the anchor tag of this url """

        return self._anchor

    def get_anchor_url(self):
        """ Return the full url with the anchor fragment re-appended,
        for anchor type urls """

        full = self.get_full_url()
        return full + self._anchor

    def get_generation(self):
        """ Return the generation of this url """
        return self._generation
        
    def set_generation(self, gen):
        """ Set the generation of this url """
        self._generation = gen

    def get_priority(self):
        """ Get the priority for this url """
        return self._priority
        
    def set_priority(self, priority):
        """ Set the priority for this url """
        self._priority = priority

    def get_download_status(self):
        """ Return the download status for this url """
        return self._dstatus

    def set_download_status(self, dstatus):
        """ Set the download status of this url """

        self._dstatus = int(dstatus)
        
    def is_start_url(self):
        """ Find out if this is the starting url """
        return self.is_starting_url
        
    def __resolve_url(self):
        """ Local method for resolving urls """

        self.recursion_depth += 1

        if len(self.url)==0:
            raise HarvestManUrlParserError, 'HarvestManUrlParserError: Zero length Url'

        protocolindex = self.url.find("http://")
        if protocolindex != -1:
            self.protocol='http://'
            self.port=80
        else:
            protocolindex = self.url.find("ftp://")
            if protocolindex != -1:
                self.protocol = 'ftp://'
                self.port=21                
            else:
                protocolindex = self.url.find("https://")
                if protocolindex != -1:
                    self.protocol = 'https://'
                    self.port=443                   
                else:
                    # for www urls, we assume the protocol
                    # is http://
                    protocolindex = self.url.find("www.")
                    if protocolindex != -1:
                        self.protocol = 'http://'

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -