pageparser.py
来自「Harvestman-最新版本」· Python 代码 · 共 585 行 · 第 1/2 页
PY
585 行
# with a URL (CONTENT="0; URL=<url>") items = link.split(';') if len(items)==1: # Only a time-gap, skip it continue elif len(items)==2: # Second one should be a URL reqd = items[1] # print 'Reqd=>',reqd if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1: link = reqd.split('=')[1].strip() # print 'Link=>',link else: continue else: # Handle robots meta tag name = d.get('name','').lower() if name=='robots': robots = d.get('content','').lower() # Split to ',' contents = [item.strip() for item in robots.split(',')] # Check for nofollow self.can_follow = not ('nofollow' in contents) # Check for noindex self.can_index = not ('noindex' in contents) elif name=='keywords': self.keywords = d.get('content','').split(',') # Trim the keywords list self.keywords = [word.lower().strip() for word in self.keywords] elif name=='description': self.description = d.get('content','').strip() else: continue elif tag != 'applet': link = d[key] else: link += d[key] if key == 'codebase': if link: if link[-1] != '/': link += '/' continue except KeyError: continue # see if this link is to be filtered if self.filter_link(link) != LINK_NOT_FILTERED: # print 'Filtered link',link continue # anchor links in a page should not be saved # index = link.find('#') # Make sure not to wrongly categorize '#' in query strings # as anchor URLs. 
if link.find('#') != -1 and not self.query_re.search(link): # print 'Is an anchor link',link self.handle_anchor_links(link) else: # append to private list of links self.check_add_link(typ, link) def unknown_endtag(self, tag): self._tag = '' if tag=='title': self.title_flag = False self.title = self.title.strip() def handle_data(self, data): if self._tag.lower()=='title' and self.title_flag: self.title += data def check_add_link(self, typ, link): """ To avoid adding duplicate links """ f = False if typ == 'image': if not (typ, link) in self.images: self.images.append((typ, link)) elif not (typ, link) in self.links: # print 'Adding link ', link, typ pos = self.getpos() self.links.append((typ, link)) self.linkpos[(typ,link)] = (pos[0],pos[1]) def add_tag_info(self, taginfo): """ Add new tag information to this object. This can be used to change the behavior of this class at runtime by adding new tags """ # The taginfo object should be a dictionary # of the form { tagtype : (elementname, elementype) } # egs: { 'body' : ('background', 'img) } if type(taginfo) != dict: raise AttributeError, "Attribute type mismatch, taginfo should be a dictionary!" # get the key of the dictionary key = (taginfo.keys())[0] if len(taginfo[key]) != 2: raise ValueError, 'Value mismatch, size of tag tuple should be 2' # get the value tuple tagelname, tageltype = taginfo[key] # see if this is an already existing tagtype if key in self.handled.keys: _values = self.handled[key] f=0 for index in xrange(len(_values)): # if the elementname is also # the same, just replace it. 
v = _values[index] elname, eltype = v if elname == tagelname: f=1 _values[index] = (tagelname, tageltype) break # new element, add it to list if f==0: _values.append((tagelname, tageltype)) return else: # new key, directly modify dictionary elements = [] elements.append((tagelname, tageltype)) self.handled[key] = elements def reset(self): SGMLParser.reset(self) self.url = None self.base = None self.links = [] self.images = [] self.base_href = False self.base_url = '' self.can_index = True self.can_follow = True self.title = '' self.title_flag = True self.description = '' self.keywords = [] def base_url_defined(self): """ Return whether this url had a base url of the form <base href='...'> defined """ return self.base_href def get_base_url(self): return self.base def set_url(self, url): """ Set the URL whose data is about to be parsed """ self.url = urlclass HarvestManSGMLOpParser(HarvestManSimpleParser): """ A parser based on effbot's sgmlop """ def __init__(self): # This module should be built already! 
class HarvestManCSSParser(object):
    """Class to parse stylesheets and extract URLs."""

    # Regexps to parse stylesheet imports.
    # BUGFIX: the original character class [\w.-:/] contained the range
    # '.-:' (codepoints 0x2E-0x3A), which does NOT match a literal '-',
    # so hyphenated file names (e.g. "style-one.css") were truncated at
    # the hyphen; the '-' is now escaped so it matches literally.
    # NOTE: re.LOCALE was dropped from the original flag set -- it is
    # meaningless combined with re.UNICODE and is rejected for str
    # patterns on Python 3.
    importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.\-:/]+)(\"?)',
                            re.MULTILINE | re.UNICODE)
    importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.\-:/]+)(\"?\))',
                            re.MULTILINE | re.UNICODE)
    # Regexp to parse URLs inside CSS files
    cssurl = re.compile(r'(url\()([^\)]+)(\))', re.UNICODE)

    def __init__(self):
        # Any imported stylesheet URLs
        self.csslinks = []
        # All URLs including above
        self.links = []

    def feed(self, data):
        """Parse a chunk of stylesheet text."""
        self._parse(data)

    def _parse(self, data):
        """Parse stylesheet data and extract imported css links, if any.

        Follows the specification at
        http://www.w3.org/TR/REC-CSS2/cascade.html#at-import for
        stylesheet imports.  Handles the @import "style.css",
        @import url("style.css") and plain url(...) syntaxes; media
        types, if specified, are ignored.
        """
        # Matches for @import "style.css"
        l1 = self.importcss1.findall(data)
        # Matches for @import url("style.css")
        l2 = self.importcss2.findall(data)
        # Matches for url(...)
        l3 = self.cssurl.findall(data)
        for item in (l1 + l2):
            if not item:
                continue
            url = item[1].replace("'", '').replace('"', '')
            self.csslinks.append(url)
            self.links.append(url)
        for item in l3:
            if not item:
                continue
            url = item[1].replace("'", '').replace('"', '')
            # only plain url(...) matches are de-duplicated (as in the
            # original); @import matches are appended unconditionally
            if url not in self.links:
                self.links.append(url)


if __name__ == "__main__":
    # Ad-hoc manual test driver (requires the HarvestMan project modules
    # plus wget; it is only reachable when this file is run as a script).
    # print statements were converted to call form so the module also
    # parses under Python 3.
    import os
    import config
    import logger

    SetAlias(config.HarvestManStateObject())
    SetAlias(logger.HarvestManLogger())

    cfg = objects.config
    cfg.verbosity = 5
    SetLogSeverity()
    cfg.getquerylinks = True

    p = HarvestManSimpleParser()
    # The original carried a long chain of mutually-overwriting
    # `urls = [...]` debug candidates; only the final assignment was ever
    # effective, so the dead ones were removed.
    urls = ["http://www.gr.ch/Deutsch/index.cfm"]
    for url in urls:
        if os.system('wget %s -O index.html' % url) == 0:
            p.feed(open('index.html').read())
            print(p.links, len(p.links))
            for link in p.links:
                print(link[1])
            print(p.keywords)
            print(p.description)
            print(p.title)
            print(p.base_href)
            print(p.base)
            p.reset()
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?