# spider.py -- simple web-site link checker (crawls a site breadth-first
# and reports unreachable and off-site URLs).
import htmllib
from urllib import basejoin
MAXDIST = 2
class myHTMLParser(htmllib.HTMLParser):
    """HTML parser that accumulates anchor references while parsing.

    After feeding a document, close() returns two lists: the unique
    on-site URLs seen so far and the unique off-site (ignored) URLs.
    """

    def __init__(self, formatter, URL, site):
        htmllib.HTMLParser.__init__(self, formatter)
        # Absolute form of the page's own URL; it seeds the stored list.
        self.rootURL = basejoin("http://", URL)
        self.URLstore = [self.rootURL]
        self.ignored = []
        self.site = site

    def build_hrefs(self):
        """File each collected anchor into URLstore or ignored, uniquely."""
        # XXX need to treat http://hostname and http://hostname/ as
        # equivalent; should also process "." and ".."
        base = self.rootURL
        if self.base is not None:
            base = basejoin(self.rootURL, self.base)
        for anchor in self.anchorlist:
            target = basejoin(base, anchor)
            if not target.startswith(self.site):
                # Off-site reference: remember it once for reporting.
                if target not in self.ignored:
                    self.ignored.append(target)
            elif target not in self.URLstore:
                self.URLstore.append(target)

    def close(self):
        """Finish the parse and return (on-site URLs, ignored URLs)."""
        htmllib.HTMLParser.close(self)
        self.build_hrefs()
        return self.URLstore, self.ignored
class Link:
    """Record for one URL discovered while crawling.

    distance -- number of hops from the root page
    good     -- truthy once the page is known to be fetchable
                (None until determined)
    refs     -- URLs of the pages that refer to this one
    """

    def __init__(self, distance):
        self.distance = distance
        self.good = None
        self.refs = []

    def addref(self, ref):
        """Remember one referring page."""
        self.refs.append(ref)

    def setgood(self):
        """Mark this page as successfully fetched."""
        self.good = 1
def CheckPresent(URL):
    """Return true if the page at URL can be opened, false otherwise.

    Only used for pages at the distance limit, where existence is
    verified without parsing the content for further links.
    """
    # Bug fix: the original called urllib.open (no such function) with
    # urllib never imported at module scope, and a bare 'except' turned
    # the resulting NameError into a permanent "not present" answer --
    # every distance-limit page was reported bad.  Import urllib locally
    # (matching check()'s style), use urllib.urlopen, and catch only the
    # fetch failure we expect.
    import urllib
    try:
        f = urllib.urlopen(URL)
        f.close()
        return 1
    except IOError:
        return 0
def check(site):
    """Checks a whole site from a base URL.

    Breadth-first crawl: fetch and parse every page whose URL starts
    with 'site', up to MAXDIST hops from the root, then print the
    ignored (off-site) URLs and every known URL with its distance.
    """
    import sys, formatter, urllib
    # initialize links and visited lists
    # known maps URL -> Link; pages[d] lists the URLs first seen at hop d.
    known={site: Link(0)}
    pages=[[site]]
    # NullFormatter: we only want the parser's anchor list, no output.
    fmt = formatter.NullFormatter()
    ignored = []
    distance = 1
    while distance < MAXDIST:
        pages.append([])
        # Fetch and parse every page discovered in the previous round.
        for URL in pages[distance-1]:
            try:
                f = urllib.urlopen(URL)
                data = f.read()
                f.close()
                p = myHTMLParser(fmt, URL, site)
                p.feed(data)
                links, rubbish = p.close()
                # Reaching this point means the fetch succeeded.
                known[URL].setgood()
                for r in links:
                    if not known.has_key(r):
                        # First sighting: schedule it for the next round.
                        pages[distance].append(r)
                        known[r] = Link(distance)
                    known[r].addref(URL)
                for r in rubbish:
                    if not r in ignored:
                        ignored.append(r)
            except IOError, msg:
                print "Error:", URL, ":", msg
        print "Round", distance, len(pages[distance]), "outstanding"
        distance = distance+1
    # Pages at the distance limit are only probed for existence,
    # never parsed for further links.
    for URL in pages[distance-1]:
        if CheckPresent(URL):
            known[URL].setgood()
    # XXX Need more intelligent sorting and output options here
    ignored.sort()
    for r in ignored:
        print "Ignored:", r
    print "\n\n\n"
    reflist = known.keys()
    reflist.sort()
    for k in reflist:
        kk = known[k]
        print "%3d : %s" % (kk.distance, k)
#        for l in kk.refs:
#            print "    <== %s" % l
import sys

# Crawl the site named on the command line; with no argument,
# default to checking python.org.
if len(sys.argv) != 1:
    check(sys.argv[1])
else:
    check("http://www.python.org/")