📄 pageparser.py
字号:
""" HarvestManPageParser.py - Module to parse an html page and
extract its links. This software is part of the
HarvestMan(R) program.
Author: Anand B Pillai (anandpillai at letterboxes dot org).
For licensing information see the file LICENSE.txt that
is included in this distribution.
Dependency
==========
WebHTMLParser, string
Jan 2 2004 1.4 bug fix version development started.
Fixed Bug #B1073028403.71.
Feb 10 2004 Added a handler to take care of frame
redirects. Fix for bug ID #B1076402199.0.
Fixed bug #B1073291938.97.
1.3.1 bug fix version.
Feb 11 2004 Fetch all links in <frame src='...'> tag
instead of only frame redirect links.
Jun 14 2004 Anand 1.3.9 release.
"""
from common import *
from htmlparser import HTMLParser, HTMLParseError
import re
import htmllib
class harvestManPageHandler:
    """ This class contains our handler methods for HTML elements
    which can be used by various parsers.

    Links found while handling start tags are collected in two lists of
    (type, link) tuples: 'images' for entries whose type is 'image' and
    'links' for everything else. """

    # Compiled once for the class instead of on every filter_link() call.
    # Bug fix for B1073028403.71
    _skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)|(\?m=a)|(\?n=d)|(\?s=a)|(\?d=a)')
    # Bug fix for B1073291938.97 (skip query forms)
    _query_re = re.compile(r'[-.a-zA-Z0-9]*\?[-.a-zA-Z0-9]*=[-.a-zA-Z0-9]*')

    def __init__(self):
        # Maps a tag name to a list of (attribute name, link type) pairs
        # describing which attributes of that tag carry links.
        self._handled = { 'a' : [('href', 'normal'), ('href', 'anchor')],
                          'frame': [('src', 'normal')],
                          'img' : [('src', 'image')],
                          'form' : [('action', 'form')],
                          'link' : [('href', '')],
                          'body' : [('background', 'image')],
                          'script' : [('src', 'javascript')],
                          'applet' : [('codebase', 'appletcodebase'), ('code', 'javaapplet')]
                          }
        # NOTE(review): not referenced anywhere else in this class.
        self._pageextns = ['.htm', '.html', '.asp', '.php', '.jsp', '.psp', '.cgi', '.pl']
        self.links = []
        self.images = []
        # anchor links flag
        # NOTE(review): stored here and by save_anchors(), but no method of
        # this class reads it -- confirm whether a caller does.
        self._anchors = True

    def save_anchors(self, value):
        """ Set the save anchor links flag """

        # Warning: If you set this to true, anchor links on
        # webpages will be saved as separate files.
        self._anchors = value

    def filter_link(self, link):
        """ Function to filter links, we decide here whether
        to handle certain kinds of links.

        Returns 1 if the link must be skipped and 0 if it should be kept.
        An empty link returns None (which callers treat as 'keep'). """

        if not link: return

        # ignore javascript links (From 1.2 version javascript
        # links of the form .js are fetched, but we still ignore
        # the actual javascript actions since there is no
        # javascript engine.)
        llink = link.lower()

        # match() anchors at the start of the link, so only links that
        # begin with one of the skip patterns are dropped here.
        if self._skip_re.match(llink):
            return 1

        cfg = GetObject('config')
        if cfg.skipqueryforms and self._query_re.search(llink):
            return 1

        return 0

    def _has_link(self, bucket, link):
        """ Return True if 'link' is already present in 'bucket', a list
        of (type, link) tuples. The type is not compared. """

        for _typ, known in bucket:
            if known == link:
                return True
        return False

    def handle_anchor_links(self, link):
        """ Handle links of the form html#...

        The full link is recorded with type 'anchor'. If it contains
        '.html#' or '.htm#', the page part before the '#' is also recorded
        with type 'normal'. Links already present in self.links are not
        added again.

        Returns 0 if a page part was found, 1 if not, and None for an
        empty link. """

        # if anchor tag, then get rid of anchor #...
        # and only add the webpage link
        if not link: return

        # moreinfo('Appending anchor link', link)
        if not self._has_link(self.links, link):
            self.links.append(('anchor', link))

        # No point in getting #anchor sort of links
        # since they point to anchors in the same page
        for marker in ('.html#', '.htm#'):
            index = link.rfind(marker)
            if index != -1:
                # keep everything up to, but not including, the '#'
                newhref = link[:index + len(marker) - 1]
                # Several anchors into the same page all yield the same
                # page link; record it only once.
                if not self._has_link(self.links, newhref):
                    self.links.append(('normal', newhref))
                return 0

        return 1

    def anchor_bgn(self, href, name, type):
        """ Handler for htmllib.HTMLParser """

        # Debug output; same text as 'print href, name, type'.
        print('%s %s %s' % (href, name, type))

    def anchor_end(self):
        """ Handler for htmllib.HTMLParser """
        pass

    def handle_image(self, source, alt, ismap, align, width, height):
        """ Image handler; only prints the image source. """

        print(source)

    def handle_starttag(self, tag, attrs):
        """ This method gives you the tag in the html
        page along with its attributes as a list of
        tuples.

        For a tag listed in self._handled, the configured attributes are
        looked up in 'attrs' and the resulting link is filtered and then
        stored through handle_anchor_links() (if it contains '#') or
        check_add_link(). Other tags are ignored. """

        # We handle the following tags
        # a => hypertext links
        # img => image links
        # link => css/icon etc
        # form => cgi forms
        # body => for background images
        # frame => for redirects
        if tag not in self._handled:
            return

        d = dict(attrs)
        link = ''

        for key, typ in self._handled[tag]:
            # if the link already has a value, skip
            # (except for applet tags)
            if tag != 'applet' and link:
                continue

            # for <link> tags the 'rel' attribute, when present,
            # is used as the link type
            if tag == 'link':
                typ = d.get('rel', typ)

            try:
                if tag != 'applet':
                    link = d[key]
                else:
                    # applet: the link is codebase + code, joined by '/'
                    link += d[key]
                    if key == 'codebase':
                        if link and link[-1] != '/':
                            link += '/'
                        # the codebase alone is not a link; wait for 'code'
                        continue
            except KeyError:
                continue

            # see if this link is to be filtered
            if self.filter_link(link):
                debug('Filtering link ', link)
                continue

            # anchor links in a page should not be saved
            if link.find('#') != -1:
                self.handle_anchor_links(link)
            else:
                # append to private list of links
                self.check_add_link(typ, link)

    def check_add_link(self, typ, link):
        """ To avoid adding duplicate links.

        Entries of type 'image' go to self.images, all others to
        self.links. A link whose value is already in the target list is
        not added again, whatever its type. """

        if typ == 'image':
            if not self._has_link(self.images, link):
                moredebug('Adding image ', link, typ)
                self.images.append((typ, link))
        else:
            if not self._has_link(self.links, link):
                moredebug('Adding link ', link, typ)
                self.links.append((typ, link))

    def add_tag_info(self, taginfo):
        """ Add new tag information to this object.
        This can be used to change the behavior of this class
        at runtime by adding new tags.

        Every entry of 'taginfo' is applied. Raises AttributeError if
        'taginfo' is not a dictionary and ValueError if a value does not
        have exactly two items; nothing is modified in either case.

        Returns 0 if at least one tag was already handled, else None. """

        # The taginfo object should be a dictionary
        # of the form { tagtype : (elementname, elementype) }
        # egs: { 'body' : ('background', 'img) }
        if not isinstance(taginfo, dict):
            raise AttributeError("Attribute type mismatch, taginfo should be a dictionary!")

        # Validate everything first so that a bad entry cannot leave
        # self._handled half updated.
        for value in taginfo.values():
            if len(value) != 2:
                raise ValueError('Value mismatch, size of tag tuple should be 2')

        result = None
        for key, (tagelname, tageltype) in taginfo.items():
            # see if this is an already existing tagtype
            if key in self._handled:
                _values = self._handled[key]
                for index, (elname, _eltype) in enumerate(_values):
                    # if the elementname is also
                    # the same, just replace it.
                    if elname == tagelname:
                        _values[index] = (tagelname, tageltype)
                        break
                else:
                    # new element, add it to list
                    _values.append((tagelname, tageltype))
                result = 0
            else:
                # new key, directly modify dictionary
                self._handled[key] = [(tagelname, tageltype)]

        return result

    # The methods below are presumably the callbacks invoked by the sgmlop
    # parser that harvestManFastParser registers this object with -- confirm
    # against the sgmlop documentation. Only finish_starttag does any work.

    def finish_starttag(self, tag, attrs):
        """ Forward a start tag to handle_starttag """
        self.handle_starttag(tag, attrs)

    def finish_endtag(self, tag):
        """ End tags are ignored """
        pass

    def handle_proc(self, instruction, content):
        """ Ignored """
        pass

    def handle_special(self, content):
        """ Ignored """
        pass

    def handle_charref(self, charref):
        """ Ignored """
        pass

    def handle_entityref(self, entityref):
        """ Ignored """
        pass

    def resolve_entityref(self, entityref):
        """ Ignored """
        pass

    def handle_data(self, data):
        """ Ignored """
        pass

    def handle_cdata(self, cdata):
        """ Ignored """
        pass

    def handle_comment(self, comment):
        """ Ignored """
        pass
class harvestManSimpleParser(HTMLParser):
    """ Simple python parser using the HTMLParser module.

    The actual tag handling is delegated to a harvestManPageHandler
    object; 'links' and 'images' are copies of that handler's lists,
    refreshed after each forwarded call that can add to them. """

    def __init__(self):
        self._handler = harvestManPageHandler()
        self.links = []
        self.images = []
        HTMLParser.__init__(self)

    def _sync_results(self):
        """ Copy the handler's current links and images to this object """

        self.links = self._handler.links[0:]
        self.images = self._handler.images[0:]

    def handle_starttag(self, tag, attrs):
        """ Forward a start tag to the page handler """

        self._handler.handle_starttag(tag, attrs)
        # Copy links
        self._sync_results()

    def handle_javascript(self, tag, attrs):
        """ Forward a javascript tag to the page handler """

        self._handler.handle_starttag(tag, attrs)
        # Refresh the copies here too; otherwise a link found by this
        # call stays invisible until some later start tag is handled.
        self._sync_results()

    def handle_java_applet_tag(self, tag, attrs):
        """ Forward an applet tag to the page handler """

        # print 'APPLET => ', tag, attrs
        self._handler.handle_starttag(tag, attrs)
        # Same reason as in handle_javascript.
        self._sync_results()

    def anchor_bgn(self, href, name, type):
        """ Forward to the page handler's anchor_bgn """

        self._handler.anchor_bgn(href, name, type)
        # Copy links
        self.links = self._handler.links[0:]

    def anchor_end(self):
        """ Forward to the page handler's anchor_end """

        self._handler.anchor_end()

    def handle_image(self, source, alt, ismap, align, width, height):
        """ Forward to the page handler's handle_image """

        self._handler.handle_image(source, alt, ismap, align, width, height)
        self.images = self._handler.images[0:]

    # Wrapper functions
    def save_anchors(self, anchor):
        """ Set the page handler's save anchor links flag """

        self._handler.save_anchors(anchor)

    def add_tag_info(self, taginfo):
        """ Add new tag information to the page handler """

        self._handler.add_tag_info(taginfo)
class harvestManFastParser(harvestManPageHandler):
    """ Page handler that is driven by an sgmlop parser (the fast variant).

    The object registers itself with the sgmlop parser, so the handler
    methods inherited from harvestManPageHandler receive the events. """

    def __init__(self):
        harvestManPageHandler.__init__(self)

        # sgmlop is imported locally, so it is only needed when this
        # class is actually instantiated
        import sgmlop

        self._parser = engine = sgmlop.XMLParser()
        engine.register(self)

    def feed(self, data):
        """ Pass 'data' on to the underlying sgmlop parser """

        self._parser.feed(data)

    def close(self):
        """ Close the underlying sgmlop parser """

        self._parser.close()
if __name__=="__main__":
    # Manual timing test: parse a tidied copy of '1.0.html' with the
    # simple parser and print how long it took.
    import math
    import time

    # test simple parser
    Initialize()
    cfg = GetObject('config')
    cfg.verbosity=5

    t1 = time.time()
    wp = harvestManSimpleParser()
    # wp.feed(open('test/anand-index.html').read())
    # wp.feed(open('test/applet-example.html').read())
    import tidy
    options=dict(indent=1, tidy_mark=1, fix_uri=1)

    # Close the input file explicitly instead of leaving the handle
    # open until it is garbage collected.
    page = open('1.0.html')
    try:
        data = page.read()
    finally:
        page.close()
    wp.feed(str(tidy.parseString(data, **options)))

    ##wp.feed(open('test/python.org-index.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/python.org-tut.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/pyro-index.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/pyro-manual.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/rediff.com-index.html').read())
    wp.close()
    t2 = time.time()

    # elapsed seconds, truncated to two decimal places
    fetchtime = float((math.modf((t2-t1)*100.0)[1])/100.0)
    # Same output as: print 'Time taken for parsing => ', fetchtime
    print('Time taken for parsing =>  %s' % fetchtime)
    del wp

    # test fast parser
    ## wp= harvestManFastParser()
    ## t1 = time.time()
    ## wp.feed(open('test/python.org-index.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/python.org-tut.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/pyro-index.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/pyro-manual.html').read())
    ## moredebug('------------------------------------------\n')
    ## wp.feed(open('test/rediff.com-index.html').read())
    ## wp.close()
    ## t2 = time.time()
    ## fetchtime = float((math.modf((t2-t1)*100.0)[1])/100.0)
    ## print 'Time taken for parsing => ', fetchtime
    ## del wp
    ## del cfg
    Finish()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -