# spider.py -- simple web-site link checker (crawls a site breadth-first
# and reports unreachable and off-site URLs).
import htmllib
from urllib import basejoin
MAXDIST = 2
class myHTMLParser(htmllib.HTMLParser):
    """HTML parser that accumulates anchor references while parsing.

    After feeding a document, close() returns two lists: the unique
    on-site URLs seen so far and the unique off-site (ignored) URLs.
    """

    def __init__(self, formatter, URL, site):
        htmllib.HTMLParser.__init__(self, formatter)
        # Absolute form of the page's own URL; it seeds the stored list.
        self.rootURL = basejoin("http://", URL)
        self.URLstore = [self.rootURL]
        self.ignored = []
        self.site = site

    def build_hrefs(self):
        """File each collected anchor into URLstore or ignored, uniquely."""
        # XXX need to treat http://hostname and http://hostname/ as
        # equivalent; should also process "." and ".."
        base = self.rootURL
        if self.base is not None:
            base = basejoin(self.rootURL, self.base)
        for anchor in self.anchorlist:
            target = basejoin(base, anchor)
            if not target.startswith(self.site):
                # Off-site reference: remember it once for reporting.
                if target not in self.ignored:
                    self.ignored.append(target)
            elif target not in self.URLstore:
                self.URLstore.append(target)

    def close(self):
        """Finish the parse and return (on-site URLs, ignored URLs)."""
        htmllib.HTMLParser.close(self)
        self.build_hrefs()
        return self.URLstore, self.ignored
class Link:
    """Record for one URL discovered while crawling.

    distance -- number of hops from the root page
    good     -- truthy once the page is known to be fetchable
                (None until determined)
    refs     -- URLs of the pages that refer to this one
    """

    def __init__(self, distance):
        self.distance = distance
        self.good = None
        self.refs = []

    def addref(self, ref):
        """Remember one referring page."""
        self.refs.append(ref)

    def setgood(self):
        """Mark this page as successfully fetched."""
        self.good = 1
def CheckPresent(URL):
    """Return true if the page at URL can be opened, false otherwise.

    Only used for pages at the distance limit, where existence is
    verified without parsing the content for further links.
    """
    # Bug fix: the original called urllib.open (no such function) with
    # urllib never imported at module scope, and a bare 'except' turned
    # the resulting NameError into a permanent "not present" answer --
    # every distance-limit page was reported bad.  Import urllib locally
    # (matching check()'s style), use urllib.urlopen, and catch only the
    # fetch failure we expect.
    import urllib
    try:
        f = urllib.urlopen(URL)
        f.close()
        return 1
    except IOError:
        return 0
def check(site):
    """Checks a whole site from a base URL.

    Breadth-first crawl: fetch and parse every page whose URL starts
    with 'site', up to MAXDIST hops from the root, then print the
    ignored (off-site) URLs and every known URL with its distance.
    """
    import sys, formatter, urllib
    # initialize links and visited lists
    # known maps URL -> Link; pages[d] lists the URLs first seen at hop d.
    known={site: Link(0)}
    pages=[[site]]
    # NullFormatter: we only want the parser's anchor list, no output.
    fmt = formatter.NullFormatter()
    ignored = []
    distance = 1
    while distance < MAXDIST:
        pages.append([])
        # Fetch and parse every page discovered in the previous round.
        for URL in pages[distance-1]:
            try:
                f = urllib.urlopen(URL)
                data = f.read()
                f.close()
                p = myHTMLParser(fmt, URL, site)
                p.feed(data)
                links, rubbish = p.close()
                # Reaching this point means the fetch succeeded.
                known[URL].setgood()
                for r in links:
                    if not known.has_key(r):
                        # First sighting: schedule it for the next round.
                        pages[distance].append(r)
                        known[r] = Link(distance)
                    known[r].addref(URL)
                for r in rubbish:
                    if not r in ignored:
                        ignored.append(r)
            except IOError, msg:
                print "Error:", URL, ":", msg
        print "Round", distance, len(pages[distance]), "outstanding"
        distance = distance+1
    # Pages at the distance limit are only probed for existence,
    # never parsed for further links.
    for URL in pages[distance-1]:
        if CheckPresent(URL):
            known[URL].setgood()
    # XXX Need more intelligent sorting and output options here
    ignored.sort()
    for r in ignored:
        print "Ignored:", r
    print "\n\n\n"
    reflist = known.keys()
    reflist.sort()
    for k in reflist:
        kk = known[k]
        print "%3d : %s" % (kk.distance, k)
#        for l in kk.refs:
#            print "    <== %s" % l
import sys

# Crawl the site named on the command line; with no argument,
# default to checking python.org.
if len(sys.argv) != 1:
    check(sys.argv[1])
else:
    check("http://www.python.org/")