📄 jsparser.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 2 页
字号:
12 下一页
# -- coding: utf-8""" jsparser - This module provides classes which performJavascript extraction from HTML and Javascript parsing toprocess DOM objects.The module consists of two classes. HTMLJSParseris an HTML Javascript extractor which can extract javascriptcode present in HTML pages. JSParser builds upon HTMLJSParserto provide a Javascript parser which can parse HTML pagesand process Javascript which performs DOM modifications.Currently this class can process document.write* functionsand Javascript based redirection which changes the locationof a page.Both classes are written trying to mimic the behaviourof Firefox (2.0) as closely as possible.This module is part of the HarvestMan program. For licensinginformation see the file LICENSE.txt that is included in thisdistribution.Created Anand B Pillai <abpillai at gmail dot com> Aug 31 2007Modified Anand B Pillai Oct 2 2007 Added JSParser class and renamed                                   old JSParser to HTMLJSParser.Modified Anand B Pillai  Jan 18 2008 Rewrote regular expressions in                                     HTMLJSParser using pyparsing.Copyright (C) 2007 Anand B Pillai."""import sys, osimport reimport urllib2from pyparsing import *from jsdom import *class HTMLJSParser(object):   """ Javascript parser which extracts javascript statements   embedded in HTML. The parser only performs extraction, and no   Javascript tokenizing """   script_content = Literal("<") + Literal("script") + ZeroOrMore(Word(alphas) + Literal("=") + Word(alphanums + "."+ "/"  + '"' + "'")) + Literal(">") + SkipTo(Literal("</") + Literal("script") + Literal(">"), True)   comment_open = Literal("<!--") + SkipTo("\n", True)   comment_close = Literal("//") + ZeroOrMore(Word(alphanums)) + Literal("-->")   brace_open = Literal("{")   brace_close = Literal("}")      syntaxendre = re.compile(r';$')   def __init__(self):      self.comment_open.setParseAction(replaceWith(''))      self.comment_close.setParseAction(replaceWith(''))      self.brace_open.setParseAction(replaceWith(''))      self.brace_close.setParseAction(replaceWith(''))                  self.reset()          def reset(self):       self.rawdata = ''       self.buffer = ''       self.statements = []       self.positions = []          def feed(self, data):       self.rawdata = self.rawdata + data       # Extract javascript content       self.extract()       # Internal - parse the HTML to extract Javascript   def extract(self):      rawdata = self.rawdata      for match in self.script_content.scanString(rawdata):         if not match: continue         if len(match) != 3: continue         if len(match[0])==0: continue         if len(match[0][-1])==0: continue                  statement = match[0][-1][0]         # print 'Statement=>',statement         self.statements.append(statement.strip())         self.positions.append((match[-2], match[-1]))      # print 'Length=>',len(self.statements)      # If the JS is embedded in HTML comments <!--- js //-->      # remove the comments. This logic takes care of trimming      # any junk before/after the comments modeling the      # behaviour of a browser (Firefox) as closely as possible.            flag  = True      for x in range(len(self.statements)):         s = self.statements[x]         # Remove any braces         s = self.brace_close.transformString(self.brace_open.transformString(s))         s = self.comment_open.transformString(s)         s = self.comment_close.transformString(s)                  # Clean up any syntax end chars         s = self.syntaxendre.sub('', s).strip()                  if s:self.statements[x] = s      # Trim any empty statements      # print 'Length=>',len(self.statements)      # print self.statements      class JSParserException(Exception):   """ An exception class for JSParser """      def __init__(self, error, context=None):      self._error = error      # Context: line number, js statement etc.      self._context =context   def __str__(self):      return str(self._error)   def __repr__(self):      return '@'.join((str(self._error), str(self._context)))  class JSParser(object):   """ Parser for Javascript DOM. This class can be used to parse   javascript which contains DOM binding statements. It returns   a DOM object. Calling a repr() on this object will produce   the modified DOM text """   # TODO: Rewrite this using pyparsing      # Start signature of document.write* methods   re1 = re.compile(r"(document\.write\s*\()|(document\.writeln\s*\()")      re3 = re.compile(r'(?<![document\.write\s*|document\.writeln\s*])\(.*\)', re.MULTILINE)   # End signature of document.write* methods   re4 = re.compile(r'[\'\"]\s*\)|[\'\"]\s*\);', re.MULTILINE)   # Pattern for contents inside document.write*(...) methods   # This can be either a single string enclosed in quotes,   # a set of strings concatenated using "+" or a set of   # string arguments (individual or concatenated) separated   # using commas. Text can be enclosed either in single or   # double quotes.   # Valid Examples...   # 1. document.write("<H1>This is a heading</H1>\n");   # 2. document.write("Hello World! ","Hello You! ","<p style='color:blue;'>Hello World!</p>");   # 3. document.write("Hi, this is " + "<p>A paragraph</p>" + "and this is "  + "<p>Another one</p>");   # 4. document.write("Hi, this is " + "<p>A paragraph</p>", "and this is "  + "<p>Another one</p>");   # Pattern for content   re5 = re.compile(r'(\".*\")|(\'.*\')', re.MULTILINE)   re6 = re.compile(r'(?<=[\"\'\s])[\+\,]+')   re7 = re.compile(r'(?<=[\"\'])(\s*[\+\,]+)')      re8 = re.compile(r'^[\'\"]|[\'\"]$')      # JS redirect regular expressions   # Form => window.location.replace("<url>") or window.location.assign("<url>")   # or location.replace("<url>") or location.assign("<url>")   jsredirect1 = re.compile(r'((window\.|this\.)?location\.(replace|assign))(\(.*\))', re.IGNORECASE)   # Form => window.location.href="<url>" or location.href="<url>"   jsredirect2 = re.compile(r'((window\.|this\.)?location(\.href)?\s*\=\s*)(.*)', re.IGNORECASE)      quotechars = re.compile(r'[\'\"]*')   newlineplusre = re.compile(r'\n\s*\+')             def __init__(self):      self._nflag = False      self.parser = HTMLJSParser()      self.resetDOM()      self.statements = []      self.js = []      pass   def resetDOM(self):      self.page = None      self.page = Window()      self.page.document = Document()      self.page.location = Location()      self.locnchanged = False      self.domchanged = False         def _find(self, data):      # Search for document.write* statements and return the      # match group if found. Also sets the internal newline      # flag, depending on whether a document.write or      # document.writeln was found.      self._nflag = False      m = self.re1.search(data)      if m:         grp = m.group()         if grp.startswith('document.writeln'):            self._nflag = True         return m   def parse_url(self, url):      """ Parse data from the given URL """            try:         data = urllib2.urlopen(url).read()         # print 'fetched data'         return self.parse(data)      except Exception, e:         print e                     def parse(self, data):      """ Parse HTML, extract javascript and process it """      self.js = []      self.resetDOM()      self.parser.reset()            self.page.document.content = data            # Create a jsparser to extract content inside <script>...</script>      # print 'Extracting js content...'      self.parser.feed(data)      self.js = self.parser.statements[:]            # print 'Extracted js content.'      # print 'Found %d JS statements.' % len(self.js)            # print 'Processing JS'      for x in range(len(self.js)):         statement = self.js[x]         # First check for JS redirects and         # then for JS document changes.         jsredirect = self.processLocation(statement)         if jsredirect:            # No need to process further since we are redirecting            # the location            break         else:            # Further process the URL for document changes            position = self.parser.positions[x]                        rawdata = statement.strip()            self._feed(rawdata)                     if len(self.statements):               self.processDocument(position)      # Set flags for DOM/Location change      self.locnchanged = self.page.location.hrefchanged      self.domchanged = self.page.document.contentchanged      # print 'Processed JS.'         def processDocument(self, position):      """ Process DOM document javascript """      # The argument 'position' is a two tuple      # containing the start and end positions of      # the javascript tags inside the document.      dom = self.page.document      start, end = position            # Reset positions on DOM content to empty string      dom.chomp(start, end)            for text, newline in self.statements:         if newline:            dom.writeln(text)         else:            dom.write(text)      # Re-create content      dom.construct()   # Internal - validate URL strings for Javascript   def validate_url(self, urlstring):      """ Perform validation of URL strings """            # Validate the URL - This follows Firefox behaviour      # In firefox, the URL might be or might not be enclosed      # in quotes. However if it is enclosed in quotes the quote      # character at start and begin should match. For example      # 'http://www.struer.dk/webtop/site.asp?site=5',      # "http://www.struer.dk/webtop/site.asp?site=5" and      # http://www.struer.dk/webtop/site.asp?site=5 are valid, but      # "http://www.struer.dk/webtop/site.asp?site=5' and      # 'http://www.struer.dk/webtop/site.asp?site=5" are not.      if urlstring.startswith("'") or urlstring.startswith('"'):
12 下一页
💿 文件大小 419 K
👤 上传用户 ccdn2615
📂 所属分类 Java编程
🏷️ 相关标签

#Harvestman #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -