📄 jsparser.py
字号:
if urlstring[0] != urlstring[-1]: # Invalid URL return False return True def make_valid_url(self, urlstring): """ Create a valid URL string from the passed urlstring """ # Strip off any leading/trailing quote chars urlstring = self.quotechars.sub('',urlstring) return urlstring.strip() def processLocation(self, statement): """ Process any changes in document location """ locnchanged = False for line in statement.split('\n'): # print 'Expression=>',statement m1 = self.jsredirect1.search(line) if m1: tokens = self.jsredirect1.findall(line) if tokens: urltoken = tokens[0][-1] # Strip of trailing and leading parents url = urltoken.replace('(','').replace(')','').strip() # Validate URL if self.validate_url(url): url = self.make_valid_url(url) locnchanged = True self.page.location.replace(url) else: m2 = self.jsredirect2.search(line) if m2: tokens = self.jsredirect2.findall(line) urltoken = tokens[0][-1] # Strip of trailing and leading parents url = urltoken.replace('(','').replace(')','').strip() if tokens and self.validate_url(url): url = self.make_valid_url(url) locnchanged = True self.page.location.replace(url) locnchanged = True return locnchanged def _feed(self, data): """ Internal method to feed data to process DOM document """ self.statements = [] self.rawdata = data self.goahead() self.process() def tryQuoteException(self, line): """ Check line for mismatching quotes """ ret = 0 # Check line for mismatching quotes if line[0] in ("'",'"') and line[-1] in ("'",'"'): ret = 1 if line[0] != line[-1]: raise JSParserException("Mismatching quote characters", line) return ret def process(self): """ Process DOM document related javascript """ # Process extracted statements statements2 = [] for s, nflag in self.statements: m = self.re5.match(s) if m: # Well behaved string if self.re6.search(s): m = self.re7.search(s) newline = self.newlineplusre.match(m.groups(1)[0]) items = self.re6.split(s) # See if any entry in the list has mismatching quotes, then # raise an error... for item in items: # print 'Item=>',item self.tryQuoteException(item) # Remove any trailing or beginning quotes from the items items = [self.re8.sub('',item.strip()) for item in items] # Replace any \" with " items = [item.replace("\\\"", '"') for item in items] # If the javascript consists of many lines with a + # connecting them, there is a very good chance that it # breaks spaces across multiple lines. In such case we # need to join the pieces with at least a space. if newline: s = ' '.join(items) else: # If it does not consist of newline and a +, we don't # need any spaces between the pieces. s = ''.join(items) # Remove any trailing or beginning quotes from the statement s = self.re8.sub('', s) statements2.append((s, nflag)) else: # Ill-behaved string, has some strange char either beginning # or end of line which was passed up to here. # print 'no match',s # Split and check for mismatched quotes if self.re6.search(s): items = self.re6.split(s) # See if any entry in the list has mismatching quotes, then # raise an error... for item in items: self.tryQuoteException(item) else: # Ignore it pass self.statements = statements2[:] pass def goahead(self): rawdata = self.rawdata self._nflag = False # Scans the document for document.write* statements # At the end of parsing, an internal DOM object # will contain the modified DOM if any. while rawdata: m = self._find(rawdata) if m: # Get start of contents start = m.end() rawdata = rawdata[start:] # Find the next occurence of a ')' # First exclude any occurences of pairs of parens # in the content contentdata, pos = rawdata, 0 m1 = self.re3.search(contentdata) while m1: contentdata = contentdata[m1.end():] pos = m1.end() # print 'Pos=>',pos # print contentdata m1 = self.re3.search(contentdata) m2 = self.re4.search(rawdata, pos) if not m2: raise JSParserException('Missing end paren!') else: start = m2.start() statement = rawdata[:start+1].strip() # print 'Statement=>',statement # If statement contains a document.write*, then it is a # botched up javascript, so raise error if self.re1.search(statement): raise JSParserException('Invalid javascript', statement) # Look for errors like mismatching start and end quote chars if self.tryQuoteException(statement) == 1: pass elif statement[0] in ('+','-') and statement[-1] in ("'", '"'): # Firefox seems to accept this case print 'warning: garbage char "%s" in beginning of statement!' % statement[0] else: raise JSParserException("Garbage in beginning/end of statement!") # Add statement to list self.statements.append((statement, self._nflag)) rawdata = rawdata[m2.end():] else: # No document.write* content found # print 'no content' break def getDocument(self): """ Return the DOM document object, this can be used to get the modified page if any """ return self.page.document def getLocation(self): """ Return the DOM Location object, this can be used to get modified URL if any """ return self.page.location def getStatements(self): """ Return the javascript statements in a list """ return self.parser.statements def localtests(): print 'Doing local tests...' P = JSParser() P.parse(open('samples/bportugal.html').read()) assert(repr(P.getDocument())==open('samples/bportugal_dom.html').read()) assert(P.domchanged==True) assert(P.locnchanged==False) P.parse(open('samples/jstest.html').read()) assert(repr(P.getDocument())==open('samples/jstest_dom.html').read()) assert(P.domchanged==True) assert(P.locnchanged==False) P.parse(open('samples/jsnodom.html').read()) assert(repr(P.getDocument())==open('samples/jsnodom.html').read()) assert(P.domchanged==False) assert(P.locnchanged==False) P.parse(open('samples/jstest2.html').read()) assert(repr(P.getDocument())==open('samples/jstest2_dom.html').read()) assert(P.domchanged==True) assert(P.locnchanged==False) P.parse(open('samples/jstest3.html').read()) assert(repr(P.getDocument())==open('samples/jstest3_dom.html').read()) assert(P.domchanged==True) assert(P.locnchanged==False) P.parse(open('samples/jsredirect.html').read()) assert(repr(P.getDocument())==open('samples/jsredirect.html').read()) assert(P.domchanged==False) assert(P.locnchanged==True) assert(P.getLocation().href=="http://www.struer.dk/webtop/site.asp?site=5") P.parse(open('samples/jsredirect2.html').read()) assert(repr(P.getDocument())==open('samples/jsredirect2.html').read()) assert(P.domchanged==False) assert(P.locnchanged==True) assert(P.getLocation().href=="http://www.struer.dk/webtop/site.asp?site=5") P.parse(open('samples/jsredirect3.html').read()) assert(repr(P.getDocument())==open('samples/jsredirect3.html').read()) assert(P.domchanged==False) assert(P.locnchanged==True) assert(P.getLocation().href=="fra/index.php") P.parse(open('samples/jsredirect4.html').read()) assert(repr(P.getDocument())==open('samples/jsredirect4.html').read()) assert(P.domchanged==False) assert(P.locnchanged==True) assert(P.getLocation().href=="http://www.szszm.hu/szigetszentmiklos.hu") P.parse(open('samples/jsredirect5.html').read()) assert(repr(P.getDocument())==open('samples/jsredirect5.html').read()) assert(P.domchanged==False) assert(P.locnchanged==True) assert(P.getLocation().href=="sopron/main.php") print 'All local tests passed.'def webtests(): print 'Starting web tests...' P = JSParser() urls = [("http://www.skien.kommune.no/", 0), ("http://www.bayern.de/", 7), ("http://www.agsbs.ch/", 2), ("http://www.froideville.ch/", 1)] for url, number in urls: print 'Parsing URL %s...' % url P.parse_url(url) print 'Found %d statements.' % len(P.getStatements()) assert(number==len(P.getStatements())) def experiments(): P = JSParser() P.parse(open('samples/test.html').read()) if __name__ == "__main__": localtests() #webtests() #experiments()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -