📄 jsparser.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 2 页
字号:
上一页 12
         if urlstring[0] != urlstring[-1]:            # Invalid URL            return False               return True   def make_valid_url(self, urlstring):      """ Create a valid URL string from the passed urlstring """            # Strip off any leading/trailing quote chars      urlstring = self.quotechars.sub('',urlstring)      return urlstring.strip()        def processLocation(self, statement):      """ Process any changes in document location """      locnchanged = False            for line in statement.split('\n'):                  # print 'Expression=>',statement         m1 = self.jsredirect1.search(line)         if m1:            tokens = self.jsredirect1.findall(line)            if tokens:                urltoken = tokens[0][-1]                # Strip of trailing and leading parents                url = urltoken.replace('(','').replace(')','').strip()                # Validate URL                if self.validate_url(url):                   url = self.make_valid_url(url)                   locnchanged = True                   self.page.location.replace(url)         else:            m2 = self.jsredirect2.search(line)            if m2:               tokens = self.jsredirect2.findall(line)               urltoken = tokens[0][-1]               # Strip of trailing and leading parents               url = urltoken.replace('(','').replace(')','').strip()               if tokens and self.validate_url(url):                  url = self.make_valid_url(url)                  locnchanged = True                  self.page.location.replace(url)                                    locnchanged = True      return locnchanged                   def _feed(self, data):      """ Internal method to feed data to process DOM document """            self.statements = []      self.rawdata = data      self.goahead()      self.process()         def tryQuoteException(self, line):      """ Check line for mismatching quotes """            ret = 0      # Check line for mismatching quotes      if line[0] in ("'",'"') and line[-1] in ("'",'"'):         ret = 1         if line[0] != line[-1]:            raise JSParserException("Mismatching quote characters", line)      return ret      def process(self):      """ Process DOM document related javascript """      # Process extracted statements      statements2 = []      for s, nflag in self.statements:         m = self.re5.match(s)         if m:            # Well behaved string            if self.re6.search(s):               m = self.re7.search(s)               newline = self.newlineplusre.match(m.groups(1)[0])               items = self.re6.split(s)                              # See if any entry in the list has mismatching quotes, then               # raise an error...               for item in items:                  # print 'Item=>',item                  self.tryQuoteException(item)                                 # Remove any trailing or beginning quotes from the items               items = [self.re8.sub('',item.strip()) for item in items]               # Replace any \" with "               items = [item.replace("\\\"", '"') for item in items]               # If the javascript consists of many lines with a +               # connecting them, there is a very good chance that it               # breaks spaces across multiple lines. In such case we               # need to join the pieces with at least a space.               if newline:                  s = ' '.join(items)               else:                  # If it does not consist of newline and a +, we don't                  # need any spaces between the pieces.                  s = ''.join(items)                              # Remove any trailing or beginning quotes from the statement            s = self.re8.sub('', s)            statements2.append((s, nflag))         else:            # Ill-behaved string, has some strange char either beginning            # or end of line which was passed up to here.            # print 'no match',s            # Split and check for mismatched quotes            if self.re6.search(s):               items = self.re6.split(s)               # See if any entry in the list has mismatching quotes, then               # raise an error...               for item in items:                  self.tryQuoteException(item)                              else:               # Ignore it               pass                  self.statements = statements2[:]      pass      def goahead(self):      rawdata = self.rawdata      self._nflag = False            # Scans the document for document.write* statements      # At the end of parsing, an internal DOM object      # will contain the modified DOM if any.      while rawdata:         m = self._find(rawdata)         if m:            # Get start of contents            start = m.end()            rawdata = rawdata[start:]            # Find the next occurence of a ')'            # First exclude any occurences of pairs of parens            # in the content            contentdata, pos = rawdata, 0            m1 = self.re3.search(contentdata)            while m1:               contentdata = contentdata[m1.end():]               pos = m1.end()               # print 'Pos=>',pos               # print contentdata               m1 = self.re3.search(contentdata)            m2 = self.re4.search(rawdata, pos)            if not m2:               raise JSParserException('Missing end paren!')            else:               start = m2.start()               statement = rawdata[:start+1].strip()               # print 'Statement=>',statement               # If statement contains a document.write*, then it is a               # botched up javascript, so raise error               if self.re1.search(statement):                  raise JSParserException('Invalid javascript', statement)                              # Look for errors like mismatching start and end quote chars               if self.tryQuoteException(statement) == 1:                  pass               elif statement[0] in ('+','-') and statement[-1] in ("'", '"'):                  # Firefox seems to accept this case                  print 'warning: garbage char "%s" in beginning of statement!' % statement[0]               else:                  raise JSParserException("Garbage in beginning/end of statement!")                                 # Add statement to list               self.statements.append((statement, self._nflag))               rawdata = rawdata[m2.end():]         else:            # No document.write* content found            # print 'no content'            break   def getDocument(self):      """ Return the DOM document object, this can be used to get      the modified page if any """      return self.page.document   def getLocation(self):      """ Return the DOM Location object, this can be used to      get modified URL if any """      return self.page.location   def getStatements(self):      """ Return the javascript statements in a list """      return self.parser.statements   def localtests():    print 'Doing local tests...'        P = JSParser()    P.parse(open('samples/bportugal.html').read())    assert(repr(P.getDocument())==open('samples/bportugal_dom.html').read())    assert(P.domchanged==True)    assert(P.locnchanged==False)        P.parse(open('samples/jstest.html').read())    assert(repr(P.getDocument())==open('samples/jstest_dom.html').read())    assert(P.domchanged==True)    assert(P.locnchanged==False)    P.parse(open('samples/jsnodom.html').read())    assert(repr(P.getDocument())==open('samples/jsnodom.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==False)            P.parse(open('samples/jstest2.html').read())    assert(repr(P.getDocument())==open('samples/jstest2_dom.html').read())    assert(P.domchanged==True)    assert(P.locnchanged==False)        P.parse(open('samples/jstest3.html').read())    assert(repr(P.getDocument())==open('samples/jstest3_dom.html').read())    assert(P.domchanged==True)    assert(P.locnchanged==False)    P.parse(open('samples/jsredirect.html').read())    assert(repr(P.getDocument())==open('samples/jsredirect.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==True)    assert(P.getLocation().href=="http://www.struer.dk/webtop/site.asp?site=5")    P.parse(open('samples/jsredirect2.html').read())    assert(repr(P.getDocument())==open('samples/jsredirect2.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==True)    assert(P.getLocation().href=="http://www.struer.dk/webtop/site.asp?site=5")        P.parse(open('samples/jsredirect3.html').read())    assert(repr(P.getDocument())==open('samples/jsredirect3.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==True)    assert(P.getLocation().href=="fra/index.php")    P.parse(open('samples/jsredirect4.html').read())    assert(repr(P.getDocument())==open('samples/jsredirect4.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==True)    assert(P.getLocation().href=="http://www.szszm.hu/szigetszentmiklos.hu")        P.parse(open('samples/jsredirect5.html').read())    assert(repr(P.getDocument())==open('samples/jsredirect5.html').read())    assert(P.domchanged==False)    assert(P.locnchanged==True)    assert(P.getLocation().href=="sopron/main.php")        print 'All local tests passed.'def webtests():    print 'Starting web tests...'    P = JSParser()    urls = [("http://www.skien.kommune.no/", 0), ("http://www.bayern.de/", 7),            ("http://www.agsbs.ch/", 2), ("http://www.froideville.ch/", 1)]    for url, number in urls:       print 'Parsing URL %s...' % url       P.parse_url(url)       print 'Found %d statements.' % len(P.getStatements())       assert(number==len(P.getStatements()))       def experiments():   P = JSParser()   P.parse(open('samples/test.html').read())   if __name__ == "__main__":   localtests()   #webtests()   #experiments()
上一页 12
💿 文件大小 419 K
👤 上传用户 ccdn2615
📂 所属分类 Java编程
🏷️ 相关标签

#Harvestman #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -