# narcissus.py
# -*- coding: utf-8 -*-
"""
Python port of Rbnarcissus, a pure Javascript parser written in Ruby.
This code has been ported from the free Rbnarcissus port available
at http://idontsmoke.co.uk/2005/rbnarcissus/Parser.rb.
For a status of the code and the test cases which pass, read
the README.
This code is licensed under GNU GPL version 2.0.
Author : Anand B Pillai <abpillai at gmail dot com>
Copyright (C) 2007 Anand B Pillai <abpillai at gmail dot com>
"""
__version__ = "0.1 (alpha)"
__author__ = "Anand B Pillai"
import re
def dump(d):
    """Debug helper: print a mapping's items, one per line, in sorted key order.

    Uses %-formatting so the statement is valid under both Python 2 and
    Python 3 (the original used a Python-2-only ``print`` statement).
    """
    for key in sorted(d):
        print("%s => %s" % (key, d[key]))
class NarcissusError(Exception):
    """Tokenizer/parser error.

    Carries the human-readable message and the Tokenizer instance that was
    active when the error occurred, so callers can report position context.
    """

    def __init__(self, msg, tokenizer):
        # Deliberately does not call Exception.__init__, matching the
        # original: the message lives on .msg, not in .args.
        self.msg = msg       # human-readable description of the failure
        self.t = tokenizer   # the Tokenizer at the point of failure

    def __str__(self):
        return self.msg
# Token list: operator/punctuator spellings, nonterminal node type names and
# keywords.  The *index* of each entry is its numeric token-type code.
tokens = [
    # End of source.
    "END",

    # Operators and punctuators. Some pair-wise order matters, e.g. (+, -)
    # and (UNARY_PLUS, UNARY_MINUS).
    "\n", ";",
    ",",
    "=",
    "?", ":", "CONDITIONAL",
    "||",
    "&&",
    "|",
    "^",
    "&",
    "==", "!=", "===", "!==",
    "<", "<=", ">=", ">",
    "<<", ">>", ">>>",
    "+", "-",
    "*", "/", "%",
    "!", "~", "UNARY_PLUS", "UNARY_MINUS",
    "++", "--",
    ".",
    "[", "]",
    "{", "}",
    "(", ")",

    # Nonterminal tree node type codes.
    "SCRIPT", "BLOCK", "LABEL", "FOR_IN", "CALL", "NEW_WITH_ARGS", "INDEX",
    "ARRAY_INIT", "OBJECT_INIT", "PROPERTY_INIT", "GETTER", "SETTER",
    "GROUP", "LIST",

    # Terminals.
    "IDENTIFIER", "NUMBER", "STRING", "REGEXP",

    # Keywords.
    "break",
    "case", "catch", "const", "continue",
    "debugger", "default", "delete", "do",
    "else", "enum",
    "false", "finally", "for", "function",
    "if", "in", "instanceof",
    "new", "null",
    "return",
    "switch",
    "this", "throw", "true", "try", "typeof",
    "var", "void",
    "while", "with",
]

# Operator and punctuator mapping from token to tree node type name.
opTypeNames = {
    "\n"  : "NEWLINE",
    ';'   : "SEMICOLON",
    ','   : "COMMA",
    '?'   : "HOOK",
    ':'   : "COLON",
    '||'  : "OR",
    '&&'  : "AND",
    '|'   : "BITWISE_OR",
    '^'   : "BITWISE_XOR",
    '&'   : "BITWISE_AND",
    '===' : "STRICT_EQ",
    '=='  : "EQ",
    '='   : "ASSIGN",
    '!==' : "STRICT_NE",
    '!='  : "NE",
    '<<'  : "LSH",
    '<='  : "LE",
    '<'   : "LT",
    '>>>' : "URSH",
    '>>'  : "RSH",
    '>='  : "GE",
    '>'   : "GT",
    '++'  : "INCREMENT",
    '--'  : "DECREMENT",
    '+'   : "PLUS",
    '-'   : "MINUS",
    '*'   : "MUL",
    '/'   : "DIV",
    '%'   : "MOD",
    '!'   : "NOT",
    '~'   : "BITWISE_NOT",
    '.'   : "DOT",
    '['   : "LEFT_BRACKET",
    ']'   : "RIGHT_BRACKET",
    '{'   : "LEFT_CURLY",
    '}'   : "RIGHT_CURLY",
    '('   : "LEFT_PAREN",
    ')'   : "RIGHT_PAREN",
}

# Hash of keyword identifier to tokens index.
keywords = {}

# Define const END, etc., based on the token names. Also map name to index.
consts = {}

r1 = re.compile(r'\A[a-z]')  # keywords start with a lowercase letter
r2 = re.compile(r'\A\W')     # operators/punctuators start with a non-word char

for i, t in enumerate(tokens):
    if r1.match(t):
        # Keyword: upper-cased name becomes the const, the spelling itself
        # goes into the keyword lookup table.
        consts[t.upper()] = i
        keywords[t] = i
    elif r2.match(t):
        # Operator/punctuator: const under its symbolic node-type name.
        consts[opTypeNames[t]] = i
    else:
        # Already an upper-case nonterminal/terminal name.
        consts[t] = i

# Map assignment operators to their indexes in the tokens array.
assignOps = ['|', '^', '&', '<<', '>>', '>>>', '+', '-', '*', '/', '%']
assignOpsHash = {}
for t in assignOps:
    assignOpsHash[t] = consts[opTypeNames[t]]

opPrecedence = {
    "SEMICOLON" : 0,
    "COMMA" : 1,
    "ASSIGN" : 2,
    "HOOK" : 3, "COLON" : 3, "CONDITIONAL" : 3,
    "OR" : 4,
    "AND" : 5,
    "BITWISE_OR" : 6,
    "BITWISE_XOR" : 7,
    "BITWISE_AND" : 8,
    "EQ" : 9, "NE" : 9, "STRICT_EQ" : 9, "STRICT_NE" : 9,
    "LT" : 10, "LE" : 10, "GE" : 10, "GT" : 10, "IN" : 10, "INSTANCEOF" : 10,
    "LSH" : 11, "RSH" : 11, "URSH" : 11,
    "PLUS" : 12, "MINUS" : 12,
    "MUL" : 13, "DIV" : 13, "MOD" : 13,
    "DELETE" : 14, "VOID" : 14, "TYPEOF" : 14, # PRE_INCREMENT: 14, PRE_DECREMENT: 14,
    "NOT" : 14, "BITWISE_NOT" : 14, "UNARY_PLUS" : 14, "UNARY_MINUS" : 14,
    "INCREMENT" : 15, "DECREMENT" : 15, # postfix
    "NEW" : 16,
    "DOT" : 17,
}

# Map operator type code to precedence.  Iterate over a snapshot of the keys
# (list(...)): we add new integer keys while looping, and mutating a dict
# during iteration raises RuntimeError in Python 3.
for key in list(opPrecedence):
    opPrecedence[consts[key]] = opPrecedence[key]

opArity = {
    "COMMA" : -2,
    "ASSIGN" : 2,
    "CONDITIONAL" : 3,
    "OR" : 2,
    "AND" : 2,
    "BITWISE_OR" : 2,
    "BITWISE_XOR" : 2,
    "BITWISE_AND" : 2,
    "EQ" : 2, "NE" : 2, "STRICT_EQ" : 2, "STRICT_NE" : 2,
    "LT" : 2, "LE" : 2, "GE" : 2, "GT" : 2, "IN" : 2, "INSTANCEOF" : 2,
    "LSH" : 2, "RSH" : 2, "URSH" : 2,
    "PLUS" : 2, "MINUS" : 2,
    "MUL" : 2, "DIV" : 2, "MOD" : 2,
    "DELETE" : 1, "VOID" : 1, "TYPEOF" : 1, # PRE_INCREMENT: 1, PRE_DECREMENT: 1,
    "NOT" : 1, "BITWISE_NOT" : 1, "UNARY_PLUS" : 1, "UNARY_MINUS" : 1,
    "INCREMENT" : 1, "DECREMENT" : 1, # postfix
    "NEW" : 1, "NEW_WITH_ARGS" : 2, "DOT" : 2, "INDEX" : 2, "CALL" : 2,
    "ARRAY_INIT" : 1, "OBJECT_INIT" : 1, "GROUP" : 1,
}

# Map operator type code to arity (same snapshot trick as above).
for key in list(opArity):
    opArity[consts[key]] = opArity[key]

# NB: superstring tokens (e.g., ++) must come before their substring token
# counterparts (+ in the example), so that the opRegExp regular expression
# synthesized from this list makes the longest possible match.
ops = [';', ',', '?', ':', '||', '&&', '|', '^', '&', '===', '==',
       '=', '!==', '!=', '<<', '<=', '<', '>>>', '>>', '>=', '>', '++', '--',
       '+', '-', '*', '/', '%', '!', '~', '.', '[', ']', '{', '}', '(', ')']

# Build a regexp that recognizes operators and punctuators (except newline).
# Each operator becomes a "\A<escaped-op>" alternative; r3 backslash-escapes
# the characters that are special inside a regular expression.
opRegExpSrc = "\\A"
r3 = re.compile(r'([?|^&(){}\[\]+\-*\/\.])')
for s in ops:
    if opRegExpSrc != "\\A":
        opRegExpSrc += "|\\A"
    for ch in s:
        opRegExpSrc += r3.sub("\\" + ch, ch)
opRegExp = re.compile(opRegExpSrc, re.MULTILINE)

# A regexp to match floating point literals (but not integer literals).
fpRegExp = re.compile("\\A\\d+\\.\\d*(?:[eE][-+]?\\d+)?|\\A\\d+(?:\\.\\d*)?[eE][-+]?\\d+|\\A\\.\\d+(?:[eE][-+]?\\d+)?", re.MULTILINE)
class List(list):
    """A list that auto-extends on out-of-range writes and returns None
    (instead of raising IndexError) on out-of-range reads.

    Writing to index i beyond the end pads the gap with None and places the
    item at exactly index i.  Used by the tokenizer as its token ring buffer.

    Fix: the original used the Python-2-only ``except IndexError, e`` syntax
    (a SyntaxError under Python 3); the bound exception was unused anyway.
    """

    def __setitem__(self, index, item):
        size = len(self)
        if size <= index:
            # Pad with None up to `index`, then append the item there.
            for _ in range(index - size):
                self.append(None)
            self.append(item)
        else:
            super(List, self).__setitem__(index, item)

    def __getitem__(self, index):
        # Out-of-range reads yield None rather than raising.
        try:
            return super(List, self).__getitem__(index)
        except IndexError:
            return None
class List2(list):
    """A list with a Ruby-style last() accessor.

    Fix: the original used the Python-2-only ``except IndexError, e`` syntax
    (a SyntaxError under Python 3); the bound exception was unused anyway.
    """

    def last(self):
        """Return the final element, or None when the list is empty."""
        try:
            return self[-1]
        except IndexError:
            return None
class Tokenizer(object):
def __init__(self, source, filename='', line=1):
    """Prepare to tokenize `source`.

    source   -- the program text (coerced to str)
    filename -- name used for error reporting
    line     -- starting line number (1-based)
    """
    self.cursor = 0              # offset into self.source of the next scan
    self.source = str(source)
    self.tokens = List()         # ring buffer of Token objects (4 slots -- see the & 3 masks in peek/get)
    self.tokenIndex = 0          # ring-buffer index of the current token
    self.lookahead = 0           # number of scanned-but-ungotten tokens
    self.scanNewlines = False    # when True, newlines are returned as tokens instead of skipped
    self.scanOperand = True      # when True, a '/' may start a regexp literal (see get)
    self.filename = filename
    self.lineno = line
def input(self):
    # The not-yet-consumed remainder of the source text.
    return self.source[self.cursor:]
def done(self):
    # True once the next token would be END (end of source reached).
    return (self.peek() == consts["END"])
def token(self):
    # The current token: slot tokenIndex of the ring buffer.
    return self.tokens[self.tokenIndex]
def match(self, tt):
    """Consume the next token if its type is `tt`, else push it back.

    Returns True on a match; otherwise returns whatever unget() returns
    (falsy), leaving the token available for the next get().

    Fix: removed leftover Python-2 debug ``print`` statements.
    """
    got = self.get()
    if got == tt:
        return True
    else:
        return self.unget()
def mustMatch(self, tt):
    """Consume a token of type `tt` and return it, or raise NarcissusError.

    Fixes: removed a leftover Python-2 debug ``print``; the error message
    now indexes the module-level `tokens` name list by token type.  The
    original indexed `self.tokens` -- the instance's 4-slot Token ring
    buffer -- by type code, which holds Token objects (or None), not the
    token-name strings, so building the message would itself crash.
    (The upstream Narcissus source reads ``tokens[tt].toLowerCase()``.)
    """
    if not self.match(tt):
        raise NarcissusError("Missing " + tokens[tt].lower(), self)
    return self.token()
def peek(self):
    """Return the type of the next token without consuming it."""
    if self.lookahead > 0:
        # A token was already scanned and pushed back: read it straight
        # out of the ring buffer (4 slots, hence the & 3 mask).
        token = self.tokens[(self.tokenIndex + self.lookahead) & 3]
        tt = token.type
    else:
        # Scan one fresh token, then immediately push it back.
        tt = self.get()
        self.unget()
    return tt
def peekOnSameLine(self):
    # Peek with scanNewlines temporarily enabled, so a line break shows up
    # as a NEWLINE token instead of being skipped as whitespace --
    # presumably used by the parser where a line break is significant
    # (confirm against callers outside this view).
    self.scanNewlines = True
    tt = self.peek()
    self.scanNewlines = False
    return tt
def get(self):
pattern1 = re.compile(r'\A[ \t]+')
pattern2 = re.compile(r'\s+')
pattern3 = re.compile("\/(?:\*(?:.)*?\*\/|\/[^\n]*)", re.DOTALL)
# pattern3 = re.compile(r'(\?\:\*(\?\:\.)*?\*\/|\/[^\n]*)', re.MULTILINE)
pattern4 = re.compile(r'\A0[xX][\da-fA-F]+|0[0-7]*|\d+')
pattern5 = re.compile(r'\A(\w|\$)+')
pattern6 = re.compile(r"\A\"(?:\\.|[^\"])*\"|\A'(?:[^']|\\.)*'")
pattern7 = re.compile(r'\A\/((?:\\.|[^\/])+)\/([gi]*)')
while self.lookahead > 0:
self.lookahead -= 1
self.tokenIndex = (self.tokenIndex + 1) & 3
token = self.tokens[self.tokenIndex]
if token.type != consts["NEWLINE"] or self.scanNewlines:
return token.type
while True:
input_s = self.input()
print 'Input => ', input_s
if self.scanNewlines:
print 'Scannewlines is true'
match = pattern1.match(input_s)
else:
match = pattern2.match(input_s)
if match:
print 'A MATCH FOUND!'
spaces = match.group(0)
print 'Spaces =>',len(spaces)
self.cursor += len(spaces)
print 'Newline count =>',spaces.count('\n')
self.lineno += spaces.count('\n')
input_s = self.input()
print 'Input=>',input_s, len(input_s)
match = pattern3.match(input_s)
if not match:
print 'BREAKING'
break
print 'Cursor',self.cursor
comment = match.group(0)
print 'Comment =>',comment
self.cursor += len(comment)
print 'Cursor',self.cursor
print 'Comment length, comment newline count=>',len(comment),comment.count('\n')
self.lineno += comment.count('\n')
self.tokenIndex = (self.tokenIndex + 1) & 3
# # print self.tokenIndex
token = self.tokens[self.tokenIndex]
if token==None:
# # print self.tokens, self.tokenIndex
self.tokens[self.tokenIndex] = token = Token()
if len(input_s)==0:
token.type = consts["END"]
return token.type
matchflag = False
cursor_advance = 0
if fpRegExp.match(input_s):
print "Matched here1"
match = fpRegExp.match(input_s)
token.type = consts["NUMBER"]
# Not sure if this works or if we need .findall()[0]
token.value = float(match.group(0))
elif pattern4.match(input_s):
print "Matched here2"
match = pattern4.match(input_s)
token.type = consts["NUMBER"]
token.value = int(match.group(0))
elif pattern5.match(input_s):
print "MATCH: Matched here3",input_s
match = pattern5.match(input_s)
id = match.group(0)
token.type = keywords.get(id) or consts["IDENTIFIER"]
token.value = id
elif pattern6.match(input_s):
print "Matched here4"
match = pattern6.match(input_s)
token.type = consts["STRING"]
token.value = str(match.group(0))
elif self.scanOperand and pattern7.match(input_s):
print "Matched here5"
match = pattern7.match(input_s)
# # print match.group(2)
token.type = consts["REGEXP"]
# print match.group(1), match.group(2)
token.value = re.compile(match.group(1)) # , match.group(2))
elif opRegExp.match(input_s):
print "Matched here6",input_s
match = opRegExp.match(input_s)
op = match.group(0)
if assignOpsHash.get(op) and (input_s[len(op):2] == '='):
print 'Token type is ASSIGN'
token.type = consts["ASSIGN"]
token.assignOp = consts[opTypeNames[op]]
cursor_advance = 1 # length of '='
else:
token.type = consts[opTypeNames[op]]
print token.type, self.scanOperand, consts["MINUS"]
print 'TOKEN TYPE NOT ASSIGN!'
if self.scanOperand and (token.type==consts["PLUS"] or token.type==consts["MINUS"]):
print 'Adding to token type!'
token.type += consts["UNARY_PLUS"] - consts["PLUS"]
token.assignOp = None
token.value = op
else:
raise NarcissusError("Illegal token", self)
token.start = self.cursor
# # print token.start
print 'Group0 =>',match.group(0)
# NOTE: this copy of the file is truncated here -- the remainder of
# Tokenizer.get() and everything after it are missing.