⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 narcissus.py

📁 Harvestman-最新版本
💻 PY
📖 第 1 页 / 共 3 页
字号:
# -*- coding: utf-8 -*-
"""
Python port of Rbnarcissus, which is itself a Ruby port of Narcissus,
the JavaScript parser written in pure JavaScript.  This code has been
ported from the free Rbnarcissus port available
at http://idontsmoke.co.uk/2005/rbnarcissus/Parser.rb.

For a status of the code and the test cases which pass, read
the README.

This code is licensed under GNU GPL version 2.0.

Author : Anand B Pillai <abpillai at gmail dot com>
Copyright (C) 2007 Anand B Pillai <abpillai at gmail dot com>
"""

__version__ = "0.1 (alpha)"
__author__  = "Anand B Pillai"

import re

def dump(d):
    for key in sorted(d):
        print key,'=>',d[key]

class NarcissusError(Exception):
    """Tokenizer/parser error.

    Carries the human-readable message plus a reference to the tokenizer
    whose state produced the error (available as ``self.t``).
    """

    def __init__(self, msg, tokenizer):
        self.msg, self.t = msg, tokenizer

    def __str__(self):
        return self.msg
    
# Master token table.  The INDEX of each entry is its numeric token-type
# code (the consts/keywords maps below are built from these positions),
# so the ORDER of this list is significant and must not be changed.
tokens = [
        # End of source.
        "END",
        
        # Operators and punctuators.  Some pair-wise order matters, e.g. (+, -)
        # and (UNARY_PLUS, UNARY_MINUS).
        "\n", ";",
        ",",
        "=",
        "?", ":", "CONDITIONAL",
        "||",
        "&&",
        "|",
        "^",
        "&",
        "==", "!=", "===", "!==",
        "<", "<=", ">=", ">",
        "<<", ">>", ">>>",
        "+", "-",
        "*", "/", "%",
        "!", "~", "UNARY_PLUS", "UNARY_MINUS",
        "++", "--",
        ".",
        "[", "]",
        "{", "}",
        "(", ")",
        
        # Nonterminal tree node type codes.
        "SCRIPT", "BLOCK", "LABEL", "FOR_IN", "CALL", "NEW_WITH_ARGS", "INDEX",
        "ARRAY_INIT", "OBJECT_INIT", "PROPERTY_INIT", "GETTER", "SETTER",
        "GROUP", "LIST",
        
        # Terminals.
        "IDENTIFIER", "NUMBER", "STRING", "REGEXP",
        
        # Keywords (lowercase entries; the consts builder below detects
        # them via their leading lowercase letter).
        "break",
        "case", "catch", "const", "continue",
        "debugger", "default", "delete", "do",
        "else", "enum",
        "false", "finally", "for", "function",
        "if", "in", "instanceof",
        "new", "null",
        "return",
        "switch",
        "this", "throw", "true", "try", "typeof",
        "var", "void",
        "while", "with",
]

# Operator and punctuator mapping from token to tree node type name.
# Every non-alphanumeric entry of `tokens` must appear here: the consts
# builder below uses these symbolic names as keys for operator tokens.
opTypeNames = {
    "\n"  : "NEWLINE",
    ';'   : "SEMICOLON",
    ','   : "COMMA",
    '?'   : "HOOK",
    ':'   : "COLON",
    '||'  : "OR",
    '&&'  : "AND",
    '|'   : "BITWISE_OR",
    '^'   : "BITWISE_XOR",
    '&'   : "BITWISE_AND",
    '===' : "STRICT_EQ",
    '=='  : "EQ",
    '='   : "ASSIGN",
    '!==' : "STRICT_NE",
    '!='  : "NE",
    '<<'  : "LSH",
    '<='  : "LE",
    '<'   : "LT",
    '>>>' : "URSH",
    '>>'  : "RSH",
    '>='  : "GE",
    '>'   : "GT",
    '++'  : "INCREMENT",
    '--'  : "DECREMENT",
    '+'   : "PLUS",
    '-'   : "MINUS",
    '*'   : "MUL",
    '/'   : "DIV",
    '%'   : "MOD",
    '!'   : "NOT",
    '~'   : "BITWISE_NOT",
    '.'   : "DOT",
    '['   : "LEFT_BRACKET",
    ']'   : "RIGHT_BRACKET",
    '{'   : "LEFT_CURLY",
    '}'   : "RIGHT_CURLY",
    '('   : "LEFT_PAREN",
    ')'   : "RIGHT_PAREN"
}

# Hash of keyword identifier to tokens index.
keywords = {}

# Define const END, etc., based on the token names.  Also map name to index.
consts = {}

# Token names starting with a lowercase letter are language keywords.
r1 = re.compile(r'\A[a-z]')
# Token names starting with a non-word character are operators/punctuators.
r2 = re.compile(r'\A\W')

for i, t in enumerate(tokens):
    if r1.match(t):
        # Keyword: expose both the BREAK-style const and the keyword lookup.
        consts[t.upper()] = i
        keywords[t] = i
    elif r2.match(t):
        # Operator/punctuator: keyed by its symbolic node-type name.
        consts[opTypeNames[t]] = i
    else:
        # Already a symbolic name (nonterminal or terminal code).
        consts[t] = i

# Map assignment operators to their indexes in the tokens array.
assignOps = ['|', '^', '&', '<<', '>>', '>>>', '+', '-', '*', '/', '%']
assignOpsHash = {}

for op in assignOps:
    # E.g. '+' -> consts["PLUS"], the token-type code of the bare operator.
    assignOpsHash[op] = consts[opTypeNames[op]]
    
# Binding power of each operator, keyed by symbolic name; higher binds tighter.
opPrecedence = {
    "SEMICOLON" : 0,
    "COMMA" : 1,
    "ASSIGN" : 2,
    "HOOK" : 3, "COLON" : 3, "CONDITIONAL" : 3,
    "OR" : 4,
    "AND" : 5,
    "BITWISE_OR" : 6,
    "BITWISE_XOR" : 7,
    "BITWISE_AND" : 8,
    "EQ" : 9, "NE" : 9, "STRICT_EQ" : 9, "STRICT_NE" : 9,
    "LT" : 10, "LE" : 10, "GE" : 10, "GT" : 10, "IN" : 10, "INSTANCEOF" : 10,
    "LSH" : 11, "RSH" : 11, "URSH" : 11,
    "PLUS" : 12, "MINUS" : 12,
    "MUL" : 13, "DIV" : 13, "MOD" : 13,
    "DELETE" : 14, "VOID" : 14, "TYPEOF" : 14, # PRE_INCREMENT: 14, PRE_DECREMENT: 14,
    "NOT" : 14, "BITWISE_NOT" : 14, "UNARY_PLUS" : 14, "UNARY_MINUS" : 14,
    "INCREMENT" : 15, "DECREMENT" : 15, # postfix
    "NEW" : 16,
    "DOT" : 17
}

# Also key the table by operator type code.  Snapshot the keys first:
# inserting while iterating a live key view is unsafe (Python 3) and
# fragile even where it happens to work.
for key in list(opPrecedence.keys()):
    opPrecedence[consts[key]] = opPrecedence[key]

# Operand count of each operator, keyed by symbolic name (-2 marks the
# variadic COMMA/list case).
opArity = {
        "COMMA" : -2,
        "ASSIGN" : 2,
        "CONDITIONAL" : 3,
        "OR" : 2,
        "AND" : 2,
        "BITWISE_OR" : 2,
        "BITWISE_XOR" : 2,
        "BITWISE_AND" : 2,
        "EQ" : 2, "NE" : 2, "STRICT_EQ" : 2, "STRICT_NE" : 2,
        "LT" : 2, "LE" : 2, "GE" : 2, "GT" : 2, "IN" : 2, "INSTANCEOF" : 2,
        "LSH" : 2, "RSH" : 2, "URSH" : 2,
        "PLUS" : 2, "MINUS" : 2,
        "MUL" : 2, "DIV" : 2, "MOD" : 2,
        "DELETE" : 1, "VOID" : 1, "TYPEOF" : 1, # PRE_INCREMENT: 1, PRE_DECREMENT: 1,
        "NOT" : 1, "BITWISE_NOT" : 1, "UNARY_PLUS" : 1, "UNARY_MINUS" : 1,
        "INCREMENT" : 1, "DECREMENT" : 1,   # postfix
        "NEW" : 1, "NEW_WITH_ARGS" : 2, "DOT" : 2, "INDEX" : 2, "CALL" : 2,
        "ARRAY_INIT" : 1, "OBJECT_INIT" : 1, "GROUP" : 1
}

# Also key the table by operator type code.  Snapshot the keys first:
# inserting while iterating a live key view is unsafe (Python 3) and
# fragile even where it happens to work.
for key in list(opArity.keys()):
    opArity[consts[key]] = opArity[key]

# NB: superstring tokens (e.g., ++) must come before their substring token
# counterparts (+ in the example), so that the opRegExp regular expression
# synthesized from this list makes the longest possible match.
ops = [';', ',', '?', ':', '||', '&&', '|', '^', '&', '===', '==', 
       '=', '!==', '!=', '<<', '<=', '<', '>>>', '>>', '>=', '>', '++', '--',
       '+', '-', '*', '/', '%', '!', '~', '.', '[', ']', '{', '}', '(', ')']


# Build a regexp that recognizes operators and punctuators (except newline).
# re.escape replaces the original hand-rolled character-by-character
# escaping loop; each alternative is anchored at the start with \A, and
# the alternatives keep the longest-first order of `ops`.

# Kept for backward compatibility: later pages of this file may still
# reference r3 (the old escape-class pattern).
r3 = re.compile(r'([?|^&(){}\[\]+\-*\/\.])')

opRegExpSrc = "|".join("\\A" + re.escape(op) for op in ops)
opRegExp = re.compile(opRegExpSrc, re.MULTILINE)

# A regexp to match floating point literals (but not integer literals):
# digits-dot form, digits-with-exponent form, or leading-dot form.
fpRegExp = re.compile(
    r'\A\d+\.\d*(?:[eE][-+]?\d+)?'
    r'|\A\d+(?:\.\d*)?[eE][-+]?\d+'
    r'|\A\.\d+(?:[eE][-+]?\d+)?',
    re.MULTILINE)

# dump(consts)
# import sys
# sys.exit(0)

class List(list):
    """A list that auto-extends on out-of-range writes and returns None
    (instead of raising) on out-of-range reads.

    This mimics the array semantics the original Rbnarcissus/Narcissus
    code relies on, e.g. for the tokenizer's token ring buffer.
    """

    def __setitem__(self, index, item):
        """Store *item* at *index*, padding any gap with None."""
        length = len(self)
        if length <= index:
            # Pad up to (but excluding) index, then place the item there.
            for _ in range(index - length):
                self.append(None)
            self.append(item)
        else:
            super(List, self).__setitem__(index, item)

    def __getitem__(self, index):
        """Return the element at *index*, or None when out of range."""
        try:
            return super(List, self).__getitem__(index)
        except IndexError:
            return None

class List2(list):
    """A list with a Ruby-style last() accessor."""

    def last(self):
        """Return the final element, or None if the list is empty."""
        if self:
            return self[-1]
        return None

class Tokenizer(object):

    def __init__(self, source, filename='', line=1):
        """Prepare to scan *source*, reporting positions against
        *filename* starting at line number *line*."""
        self.filename = filename
        self.lineno = line
        self.source = str(source)
        self.cursor = 0
        # Ring buffer of scanned tokens (indexed modulo 4 by get/peek).
        self.tokens = List()
        self.tokenIndex = 0
        self.lookahead = 0
        # When True, newline tokens are reported instead of skipped.
        self.scanNewlines = False
        # When True, a '/' may start a regexp literal rather than division.
        self.scanOperand = True

    def input(self):
        """Return the unscanned remainder of the source text."""
        return self.source[self.cursor:]

    def done(self):
        """Return True once the next token is the END sentinel."""
        return (self.peek() == consts["END"])

    def token(self):
        """Return the current (most recently consumed) token object."""
        return self.tokens[self.tokenIndex]

    def match(self, tt):
        """Consume the next token and return True when its type is *tt*;
        otherwise push it back and return unget()'s result (presumably
        falsy -- TODO confirm against unget(), defined on a later page).
        """
        # Stray debug prints removed: library code should not write to stdout.
        got = self.get()
        if got == tt:
            return True
        return self.unget()
    
    def mustMatch(self, tt):
        """Consume the next token, raising NarcissusError unless its type
        is *tt*; return the matched token object."""
        if not self.match(tt):
            # Bug fix: the message needs the token NAME from the
            # module-level `tokens` list; self.tokens is the ring buffer
            # of Token objects, which have no lower() method.
            raise NarcissusError("Missing " + tokens[tt].lower(), self)
        return self.token()
    
    def peek(self):
        """Return the type of the next token without consuming it."""
        if self.lookahead > 0:
            # Tokens were ungotten: read the next one straight from the
            # 4-slot ring buffer.
            return self.tokens[(self.tokenIndex + self.lookahead) & 3].type
        # Nothing buffered: scan one token forward, then push it back.
        tt = self.get()
        self.unget()
        return tt

    def peekOnSameLine(self):
        """Like peek(), but with newline tokens visible during the scan,
        so a token on a following line is reported as NEWLINE."""
        self.scanNewlines = True
        token_type = self.peek()
        self.scanNewlines = False
        return token_type

    def get(self):

        pattern1 = re.compile(r'\A[ \t]+')
        pattern2 = re.compile(r'\s+')
        pattern3 = re.compile("\/(?:\*(?:.)*?\*\/|\/[^\n]*)", re.DOTALL)
        
        # pattern3 = re.compile(r'(\?\:\*(\?\:\.)*?\*\/|\/[^\n]*)', re.MULTILINE)
        pattern4 = re.compile(r'\A0[xX][\da-fA-F]+|0[0-7]*|\d+')
        pattern5 = re.compile(r'\A(\w|\$)+')
        pattern6 = re.compile(r"\A\"(?:\\.|[^\"])*\"|\A'(?:[^']|\\.)*'")
        pattern7 = re.compile(r'\A\/((?:\\.|[^\/])+)\/([gi]*)')
            
        while self.lookahead > 0:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != consts["NEWLINE"] or self.scanNewlines:
                return token.type

        while True:

            input_s = self.input()
            print 'Input => ', input_s
            
            if self.scanNewlines:
                print 'Scannewlines is true'
                match = pattern1.match(input_s)
            else:
                match = pattern2.match(input_s)

            if match:
                print 'A MATCH FOUND!'
                spaces = match.group(0)
                print 'Spaces =>',len(spaces)
                self.cursor += len(spaces)
                print 'Newline count =>',spaces.count('\n')
                self.lineno += spaces.count('\n')
                input_s = self.input()

            print 'Input=>',input_s, len(input_s)
            match = pattern3.match(input_s)
            if not match:
                print 'BREAKING'
                break

            print 'Cursor',self.cursor
            comment = match.group(0)
            print 'Comment =>',comment
            self.cursor += len(comment)
            print 'Cursor',self.cursor            
            print 'Comment length, comment newline count=>',len(comment),comment.count('\n')
            self.lineno += comment.count('\n')

        self.tokenIndex = (self.tokenIndex + 1) & 3
#         # print self.tokenIndex
        token = self.tokens[self.tokenIndex]

        if token==None:
#             # print self.tokens, self.tokenIndex
            self.tokens[self.tokenIndex] = token = Token()

        if len(input_s)==0:
            token.type = consts["END"]
            return token.type
        
        matchflag = False
        cursor_advance = 0

        if fpRegExp.match(input_s):
            print "Matched here1"            
            match = fpRegExp.match(input_s)
            
            token.type = consts["NUMBER"]
            # Not sure if this works or if we need .findall()[0]
            token.value = float(match.group(0))
        elif pattern4.match(input_s):
            print "Matched here2"                        
            match = pattern4.match(input_s)
            token.type = consts["NUMBER"]
            token.value = int(match.group(0))
        elif pattern5.match(input_s):
            print "MATCH: Matched here3",input_s                        
            match = pattern5.match(input_s)
            id = match.group(0)
            token.type = keywords.get(id) or consts["IDENTIFIER"]
            token.value = id
        elif pattern6.match(input_s):
            print "Matched here4"                        
            match = pattern6.match(input_s)
            token.type = consts["STRING"]
            token.value = str(match.group(0))
        elif self.scanOperand and pattern7.match(input_s):
            print "Matched here5"                                    
            match = pattern7.match(input_s)
#             # print match.group(2)
            token.type = consts["REGEXP"]
            # print match.group(1), match.group(2)
            token.value = re.compile(match.group(1)) # , match.group(2))
        elif opRegExp.match(input_s):
            print "Matched here6",input_s                               
            match = opRegExp.match(input_s)

            op = match.group(0)
            if assignOpsHash.get(op) and (input_s[len(op):2] == '='):
                print 'Token type is ASSIGN'
                token.type = consts["ASSIGN"]
                token.assignOp = consts[opTypeNames[op]]
                cursor_advance = 1 # length of '='
            else:
                token.type = consts[opTypeNames[op]]
                print token.type, self.scanOperand, consts["MINUS"]
                print 'TOKEN TYPE NOT ASSIGN!'

                if self.scanOperand and (token.type==consts["PLUS"] or token.type==consts["MINUS"]):
                    print 'Adding to token type!'
                    token.type += consts["UNARY_PLUS"] - consts["PLUS"]

                token.assignOp = None
            token.value = op
        else:
            raise NarcissusError("Illegal token", self)

        token.start = self.cursor
#         # print token.start
        print 'Group0 =>',match.group(0)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -