📄 jsparser.py
字号:
# -- coding: utf-8""" jsparser - This module provides classes which performJavascript extraction from HTML and Javascript parsing toprocess DOM objects.The module consists of two classes. HTMLJSParseris an HTML Javascript extractor which can extract javascriptcode present in HTML pages. JSParser builds upon HTMLJSParserto provide a Javascript parser which can parse HTML pagesand process Javascript which performs DOM modifications.Currently this class can process document.write* functionsand Javascript based redirection which changes the locationof a page.Both classes are written trying to mimic the behaviourof Firefox (2.0) as closely as possible.This module is part of the HarvestMan program. For licensinginformation see the file LICENSE.txt that is included in thisdistribution.Created Anand B Pillai <abpillai at gmail dot com> Aug 31 2007Modified Anand B Pillai Oct 2 2007 Added JSParser class and renamed old JSParser to HTMLJSParser.Modified Anand B Pillai Jan 18 2008 Rewrote regular expressions in HTMLJSParser using pyparsing.Copyright (C) 2007 Anand B Pillai."""import sys, osimport reimport urllib2from pyparsing import *from jsdom import *class HTMLJSParser(object): """ Javascript parser which extracts javascript statements embedded in HTML. The parser only performs extraction, and no Javascript tokenizing """ script_content = Literal("<") + Literal("script") + ZeroOrMore(Word(alphas) + Literal("=") + Word(alphanums + "."+ "/" + '"' + "'")) + Literal(">") + SkipTo(Literal("</") + Literal("script") + Literal(">"), True) comment_open = Literal("<!--") + SkipTo("\n", True) comment_close = Literal("//") + ZeroOrMore(Word(alphanums)) + Literal("-->") brace_open = Literal("{") brace_close = Literal("}") syntaxendre = re.compile(r';$') def __init__(self): self.comment_open.setParseAction(replaceWith('')) self.comment_close.setParseAction(replaceWith('')) self.brace_open.setParseAction(replaceWith('')) self.brace_close.setParseAction(replaceWith('')) self.reset() def reset(self): self.rawdata = '' self.buffer = '' self.statements = [] self.positions = [] def feed(self, data): self.rawdata = self.rawdata + data # Extract javascript content self.extract() # Internal - parse the HTML to extract Javascript def extract(self): rawdata = self.rawdata for match in self.script_content.scanString(rawdata): if not match: continue if len(match) != 3: continue if len(match[0])==0: continue if len(match[0][-1])==0: continue statement = match[0][-1][0] # print 'Statement=>',statement self.statements.append(statement.strip()) self.positions.append((match[-2], match[-1])) # print 'Length=>',len(self.statements) # If the JS is embedded in HTML comments <!--- js //--> # remove the comments. This logic takes care of trimming # any junk before/after the comments modeling the # behaviour of a browser (Firefox) as closely as possible. flag = True for x in range(len(self.statements)): s = self.statements[x] # Remove any braces s = self.brace_close.transformString(self.brace_open.transformString(s)) s = self.comment_open.transformString(s) s = self.comment_close.transformString(s) # Clean up any syntax end chars s = self.syntaxendre.sub('', s).strip() if s:self.statements[x] = s # Trim any empty statements # print 'Length=>',len(self.statements) # print self.statements class JSParserException(Exception): """ An exception class for JSParser """ def __init__(self, error, context=None): self._error = error # Context: line number, js statement etc. self._context =context def __str__(self): return str(self._error) def __repr__(self): return '@'.join((str(self._error), str(self._context))) class JSParser(object): """ Parser for Javascript DOM. This class can be used to parse javascript which contains DOM binding statements. It returns a DOM object. Calling a repr() on this object will produce the modified DOM text """ # TODO: Rewrite this using pyparsing # Start signature of document.write* methods re1 = re.compile(r"(document\.write\s*\()|(document\.writeln\s*\()") re3 = re.compile(r'(?<![document\.write\s*|document\.writeln\s*])\(.*\)', re.MULTILINE) # End signature of document.write* methods re4 = re.compile(r'[\'\"]\s*\)|[\'\"]\s*\);', re.MULTILINE) # Pattern for contents inside document.write*(...) methods # This can be either a single string enclosed in quotes, # a set of strings concatenated using "+" or a set of # string arguments (individual or concatenated) separated # using commas. Text can be enclosed either in single or # double quotes. # Valid Examples... # 1. document.write("<H1>This is a heading</H1>\n"); # 2. document.write("Hello World! ","Hello You! ","<p style='color:blue;'>Hello World!</p>"); # 3. document.write("Hi, this is " + "<p>A paragraph</p>" + "and this is " + "<p>Another one</p>"); # 4. document.write("Hi, this is " + "<p>A paragraph</p>", "and this is " + "<p>Another one</p>"); # Pattern for content re5 = re.compile(r'(\".*\")|(\'.*\')', re.MULTILINE) re6 = re.compile(r'(?<=[\"\'\s])[\+\,]+') re7 = re.compile(r'(?<=[\"\'])(\s*[\+\,]+)') re8 = re.compile(r'^[\'\"]|[\'\"]$') # JS redirect regular expressions # Form => window.location.replace("<url>") or window.location.assign("<url>") # or location.replace("<url>") or location.assign("<url>") jsredirect1 = re.compile(r'((window\.|this\.)?location\.(replace|assign))(\(.*\))', re.IGNORECASE) # Form => window.location.href="<url>" or location.href="<url>" jsredirect2 = re.compile(r'((window\.|this\.)?location(\.href)?\s*\=\s*)(.*)', re.IGNORECASE) quotechars = re.compile(r'[\'\"]*') newlineplusre = re.compile(r'\n\s*\+') def __init__(self): self._nflag = False self.parser = HTMLJSParser() self.resetDOM() self.statements = [] self.js = [] pass def resetDOM(self): self.page = None self.page = Window() self.page.document = Document() self.page.location = Location() self.locnchanged = False self.domchanged = False def _find(self, data): # Search for document.write* statements and return the # match group if found. Also sets the internal newline # flag, depending on whether a document.write or # document.writeln was found. self._nflag = False m = self.re1.search(data) if m: grp = m.group() if grp.startswith('document.writeln'): self._nflag = True return m def parse_url(self, url): """ Parse data from the given URL """ try: data = urllib2.urlopen(url).read() # print 'fetched data' return self.parse(data) except Exception, e: print e def parse(self, data): """ Parse HTML, extract javascript and process it """ self.js = [] self.resetDOM() self.parser.reset() self.page.document.content = data # Create a jsparser to extract content inside <script>...</script> # print 'Extracting js content...' self.parser.feed(data) self.js = self.parser.statements[:] # print 'Extracted js content.' # print 'Found %d JS statements.' % len(self.js) # print 'Processing JS' for x in range(len(self.js)): statement = self.js[x] # First check for JS redirects and # then for JS document changes. jsredirect = self.processLocation(statement) if jsredirect: # No need to process further since we are redirecting # the location break else: # Further process the URL for document changes position = self.parser.positions[x] rawdata = statement.strip() self._feed(rawdata) if len(self.statements): self.processDocument(position) # Set flags for DOM/Location change self.locnchanged = self.page.location.hrefchanged self.domchanged = self.page.document.contentchanged # print 'Processed JS.' def processDocument(self, position): """ Process DOM document javascript """ # The argument 'position' is a two tuple # containing the start and end positions of # the javascript tags inside the document. dom = self.page.document start, end = position # Reset positions on DOM content to empty string dom.chomp(start, end) for text, newline in self.statements: if newline: dom.writeln(text) else: dom.write(text) # Re-create content dom.construct() # Internal - validate URL strings for Javascript def validate_url(self, urlstring): """ Perform validation of URL strings """ # Validate the URL - This follows Firefox behaviour # In firefox, the URL might be or might not be enclosed # in quotes. However if it is enclosed in quotes the quote # character at start and begin should match. For example # 'http://www.struer.dk/webtop/site.asp?site=5', # "http://www.struer.dk/webtop/site.asp?site=5" and # http://www.struer.dk/webtop/site.asp?site=5 are valid, but # "http://www.struer.dk/webtop/site.asp?site=5' and # 'http://www.struer.dk/webtop/site.asp?site=5" are not. if urlstring.startswith("'") or urlstring.startswith('"'):
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -