📄 lexer.py
字号:
import os
import re
import string
import sys

import wstring
"""
C to C++ Library
Functions for scanning C source and header files.
(c) 2001-2005 by D.G. Sureau
Modified: August 30 2005
by Georg Wittenburg
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
webmaster@scriptol.org
http://www.scriptol.org
Some language conventions....
int a = 0; in source, the definition of a variable
extern int a; in header, the declaration of a variable
int a() { ... } in source, int a() is the interface of a function
int a(); in header, int a() is the prototype of a function
Inside a class, members are attributes (variables) or methods (functions).
Statements of a program stay in a "source" file
Declarations and prototypes in a "header" file,
also named file to include.
"""
# Integer truth values returned by the predicates of this module.
TRUE, FALSE = 1, 0

# File extensions for C++ header and source files.
hpp, cpp = ".hpp", ".cpp"

# Keywords (and '*') that may appear in the type part of a declaration.
# NOTE(review): "struct" is listed twice; harmless for membership tests.
types = [
    "extern", "*", "void", "unsigned", "signed", "char", "short",
    "int", "integer", "long", "double", "float", "far", "near",
    "huge", "interrupt", "static", "struct", "register",
    "volatile", "const", "struct",
]

# Storage-class and qualifier keywords.
modifiers = [
    "extern", "static", "const", "volatile", "auto", "inline",
    "near", "far",
]

# Type names treated as already known.
predefined = [
    "FILE", "ffblk", "ftime", "_Cdecl", "size_t", "div_t",
    "ldiv_t", "wchar_t",
]

# Every word accepted in the type part of a definition.
declarator = types + modifiers + predefined

# New struct types; starts empty.
typstruct = []

# Tokens that open a control construct or a block.
constructs = ["if", "switch", "while", "for", "case", "{", "else"]
# Tokens that identify declarations and constructs; they can also be used
# on partially converted headers.  A word found here is never accepted as
# an identifier.
# Every entry ends with a comma: a missing one makes Python concatenate two
# adjacent literals into a single bogus keyword ("until" "case").
reserved = [
    "extern", "*", "void", "unsigned", "signed", "char", "short",
    "int", "integer", "long", "double", "float", "register",
    "far", "near", "huge", "interrupt", "static", "volatile",
    "const", "auto",
    "struct", "typedef", "enum", "union",
    "if", "else", "switch", "while", "for", "do", "until",
    "case", "default",
    "goto", "break", "continue", "inline", "asm",
    "private", "protected", "public",
]
# Delimiter characters, as one string.
symstr = "+-*^/<>=|&~[](){}?:,;% \'\"\t\r\n"

# The same delimiters as a list of single characters.  Derived from symstr
# so the two can never disagree (a hand-written list once lost the comma
# between '"' and '\t', merging them into one two-character entry).
symbols = list(symstr)
#---------------------------------- Utilities
def iswindows():
    """Return TRUE when os.name is "msdos" or "nt", FALSE otherwise."""
    # NOTE(review): "msdos" is not among the os.name values listed in the
    # Python documentation -- confirm whether that case can ever match.
    if os.name in ("msdos", "nt"):
        return TRUE
    return FALSE
def oscase(str):
    """Return *str* lower-cased under Windows, unchanged elsewhere.

    Uses the string method instead of string.lower(), which is deprecated
    and no longer exists in Python 3.
    """
    # The parameter name shadows the builtin; kept for caller compatibility.
    if iswindows():
        return str.lower()
    return str
def nextalpha(line, i):
    """Return the index of the first non-space character at or after *i*.

    Only the blank ' ' is skipped (tabs are not).  When no such character
    exists the result is len(line), or *i* itself if it is already past
    the end.
    """
    end = len(line)
    while i < end and line[i] == ' ':
        i += 1
    return i
def nextspace(line, i):
    """Return the index of the first blank ' ' at or after *i*.

    Tabs do not count as blanks.  When there is no blank the result is
    len(line), or *i* itself if it is already past the end.
    """
    end = len(line)
    while i < end and line[i] != ' ':
        i += 1
    return i
def isident(word):
    """Return TRUE when *word* is an identifier, FALSE otherwise.

    An identifier is a word that is not in `reserved`, starts with an ASCII
    letter and continues with ASCII letters, digits or underscores.
    An empty (or None) word is not an identifier.
    """
    # NOTE(review): a leading underscore is rejected although C allows it;
    # left unchanged because callers may rely on it -- confirm.
    if not word:
        return FALSE                # word[0] below would raise on ""
    if word in reserved:
        return FALSE
    # ascii_letters is locale-independent and, unlike string.letters,
    # still exists in Python 3.
    letters = string.ascii_letters
    if word[0] not in letters:
        return FALSE
    for c in word[1:]:
        if c in letters or c in string.digits or c == '_':
            continue
        return FALSE                # character outside the allowed sets
    return TRUE
def getident(line):
    """Return the first identifier following a list of types, or None.

    Comments and compiler directives are removed first.  The words are then
    scanned in order: "typedef" (any case) and type keywords are skipped,
    the first identifier is returned, and any other word ends the scan
    with None.
    """
    line = removecomment(line)
    line = removedirective(line)
    # '*' is both a delimiter and a keyword in C, hence the third argument.
    for w in wstring.split(line, symstr, '*'):
        # String method instead of the deprecated string.lower().
        if w.lower() == "typedef":
            continue
        if w in types:
            continue                # skipping type keywords
        if isident(w):
            return w                # identifier found
        return None                 # anything else is not of concern
    return None
def getvarnames(line):
    """Return the identifiers of a multiple variable declaration.

    The declaration is expected to carry no assignment.
    Example: "int i,j,k; char *aaa[], b[][]".
    Array brackets are cut off each word, and type keywords are skipped.
    """
    vlist = []
    for w in wstring.split(line, " *;,\t\r\n", '*'):
        bracket = w.find('[')       # str.find replaces deprecated string.find
        if bracket != -1:
            w = w[:bracket]
        w = wstring.strip(w)
        if not w:
            continue                # e.g. "[]" alone; isident indexes word[0]
        if w in types:
            continue
        if isident(w):
            vlist.append(w)
    return vlist
def initialized(vardef):
    """Return True when *vardef* has an initializer.

    That is the case when an '=' occurs before the first ';', or when an
    '=' occurs and there is no ';' at all.
    """
    assign = vardef.find('=')
    if assign == -1:
        # Comparing the raw find() results would give -1 < k and report
        # "int a;" as initialized.
        return False
    semicolon = vardef.find(';')
    if semicolon == -1:
        return True                 # "int a = 0" without terminator
    return assign < semicolon
def isglobal(line):
    """Return whether *line* declares a global variable.

    This is an alias: the answer is whatever isstatic() gives for the line.
    """
    answer = isstatic(line)
    return answer
def isstatic(line):
    """Return a true value when *line* is a static declaration.

    The first word, or the second one when the first is "extern", must be
    "static" (case-insensitive).  None and lines shorter than 10 characters
    (the length of "static int") give FALSE.
    """
    if line is None:
        return FALSE
    if len(line) < 10:
        return FALSE                # min is "static int"
    words = wstring.split(line, " \t\r\n*;[](){}?:,", "*")
    # String methods replace the deprecated string.lower().
    if words and words[0].lower() == "extern":
        words = words[1:]
    if not words:
        return FALSE                # nothing left: indexing would raise
    return words[0].lower() == "static"
def isextern(line):
    """Return a true value when the first word of *line* is "extern".

    The comparison ignores case.  None and lines shorter than 6 characters
    (the length of "int a;") give FALSE.
    """
    if line is None:
        return FALSE
    if len(line) < 6:
        return FALSE                # min is "int a;"
    words = wstring.split(line, " \t\r\n*;[](){}?:,", "*")
    if not words:
        return FALSE                # e.g. a line of blanks only
    # String method instead of the deprecated string.lower().
    return words[0].lower() == "extern"
def isdeclaration(line):
    """Return TRUE when *line* is a typed declaration.

    A typed declaration is a list of type keywords followed by an
    identifier.  A leading "extern" is ignored.  The first remaining word
    must be a type keyword other than '*'.  A line made of type keywords
    only also gives TRUE.
    """
    if line is None:
        return FALSE
    if len(line) < 6:
        return FALSE                # min is "int a;"
    words = wstring.split(line, " \t\r\n*;[](){}?:,", "*")
    if words and words[0] == "extern":
        del words[0]
    if not words:
        return FALSE                # empty, or "extern" alone
    first = words[0]
    if first == "*":
        return FALSE                # not valid as first keyword
    if first not in types:
        return FALSE                # not typed
    # The first word is a type keyword; scan the rest of the line.
    for w in words:
        if w in types:
            continue
        if isident(w):
            return TRUE
        return FALSE
    return TRUE
def isdefinition(line):
    """Tell whether *line* starts the definition of a variable, array or function.

    A function returning an integer may be declared without a type; it is
    then implicitly typed.  Consequently a line qualifies when it is
      - a single identifier followed by '(' and not terminated by ';', or
      - a type (one or several keywords), optionally followed by an
        identifier, and then one of ( [ = , ;
    External declarations give FALSE.  A prototype in a C source is ignored.
    """
    if line is None:
        return FALSE
    text = wstring.strip(line)
    if len(text) < 3:
        return FALSE                # minimum length: f()

    # Locate the first delimiter; the text before it is the left part.
    # Without a delimiter the whole text is kept, and the last character
    # stands in for the delimiter.
    delimiters = ('(', '[', ';', '=', ',')
    cut = len(text)
    stopper = text[-1]
    for pos in range(len(text)):
        if text[pos] in delimiters:
            cut = pos
            stopper = text[pos]
            break

    # A function name without type, which is not a call (a call would be
    # terminated by a semicolon).
    soleident = (stopper == '(') and (text[-1:] != ';')

    # Analysing the left part of the line.
    left = text[:cut]
    if left == "":
        return FALSE
    words = wstring.split(left, " *", '*')
    if len(words) == 0:
        return FALSE
    if len(words) == 1 and isident(words[0]):
        return soleident            # true for an identifier followed by (

    # Case of the "*ident = ..." assignment inside a block.
    if words[0] == '*':
        return FALSE

    seen_type = FALSE
    seen_ident = FALSE
    for w in words:
        if w == "extern":
            return FALSE            # external declarations are ignored
        if w in declarator:
            if seen_ident:
                return FALSE        # a type following an identifier
            seen_type = TRUE
        elif isident(w):
            seen_ident = TRUE       # must be the last word
        else:
            return FALSE            # neither type nor identifier
    # A type, followed or not by an identifier, then one of ( = ; , [
    return seen_type
def istypstruct(line):
    """Return TRUE when *line* is an instance of a typedef'd struct.

    The line is assumed to be a variable declaration.  Leading type
    keywords are skipped; the result is TRUE if a name from `typstruct`
    is met before any other word.
    """
    for w in wstring.split(line, vardelimiters, "*"):
        if w in typstruct:
            return TRUE
        if w not in types:
            # An identifier, or anything else, ends the scan.
            if isident(w):
                return FALSE
            break
    return FALSE
def addsemicolon(line):
    """Return *line*, passed through wstring.strip, ending with a semicolon.

    A ';' is appended only when the stripped line does not already end
    with one.
    """
    stripped = wstring.strip(line)
    if stripped[-1:] != ';':
        stripped = stripped + ';'
    return stripped
def addextern(str):
    """Return *str* prefixed with "extern " unless it already starts with it.

    An empty or all-whitespace string is returned unchanged.
    """
    # str.split() replaces the deprecated string.split(); the parameter
    # name shadows the builtin but is kept for caller compatibility.
    words = str.split()
    if words and words[0] != "extern":
        return "extern " + str
    return str
def removextern(line):
    """Return *line* without its first "extern" keyword (any case).

    Only a whole word is removed, so identifiers that merely contain the
    letters (for instance "external") are left intact.  The surrounding
    whitespace is not touched.  A line without the keyword is returned
    unchanged.
    """
    match = re.search(r"\bextern\b", line, re.IGNORECASE)
    if match is None:
        return line
    return line[:match.start()] + line[match.end():]
def addstatic(str):
    """Return *str* prefixed with "static " unless it already starts with it.

    An empty or all-whitespace string is returned unchanged.
    """
    # str.split() replaces the deprecated string.split(); the parameter
    # name shadows the builtin but is kept for caller compatibility.
    words = str.split()
    if words and words[0] != "static":
        return "static " + str
    return str
def removestatic(line):
    """Return *line* without its first "static" keyword (any case).

    Only a whole word is removed, so identifiers that merely contain the
    letters (for instance "statics") are left intact.  The surrounding
    whitespace is not touched.  A line without the keyword is returned
    unchanged.
    """
    match = re.search(r"\bstatic\b", line, re.IGNORECASE)
    if match is None:
        return line
    return line[:match.start()] + line[match.end():]
def removeglobal(line):
    """Return *line* as processed by removestatic(); this is an alias."""
    cleaned = removestatic(line)
    return cleaned
#---------------------------------- Comments
def opencomment(line):
    """Return TRUE when *line* opens a comment block left open at its end.

    A comment block is supposed to start on a line with no statement on
    its left and to finish on a line with no statement on its right.
    Trailing and embedded comments are processed elsewhere.
    """
    if line[0:2] != "/*":
        return FALSE
    # Look at the END of the line for the terminator (the slice [:-2]
    # selects everything but the end).  Trailing blanks and the newline
    # are ignored.  At least four characters are needed so that "/*/" is
    # not taken for an opener plus a terminator.
    text = line.rstrip()
    if len(text) >= 4 and text[-2:] == "*/":
        return FALSE
    return TRUE
def closecomment(line):
    """Return TRUE while still inside a comment, FALSE once *line* closes it.

    Called only when a comment line or block is opened.  The comment is
    closed when the line contains the two adjacent characters "*/".
    """
    # Search for the adjacent pair: remembering "a '*' was seen somewhere
    # earlier" would also close on lines such as " * see a/b".
    if line.find("*/") != -1:
        return FALSE
    return TRUE
def embeddedcomment(line):
    """Return TRUE when a comment starts and ends in the same line.

    The terminator is searched after the opening "/*", the same way
    nocomment() does, so "/*/" is not taken for a complete comment and a
    stray "*/" before the opener does not hide a complete one.
    """
    if line is None:
        return FALSE
    start = line.find("/*")
    if start == -1:
        return FALSE
    if line.find("*/", start + 2) == -1:
        return FALSE
    return TRUE
def unterminated(line):
    """Return TRUE when *line* opens a comment that it does not close.

    Every complete "/* ... */" pair is skipped in turn; the result is TRUE
    when a "/*" is left without a following "*/".
    """
    if line is None:
        return FALSE
    # str.find with a start position replaces the deprecated string.find()
    # and avoids re-slicing the line on every pass.
    start = line.find("/*")
    while start != -1:
        end = line.find("*/", start + 2)
        if end == -1:
            return TRUE             # opener without terminator
        start = line.find("/*", end + 2)
    return FALSE
def commentfound(line):
    """Return TRUE when *line* contains a comment opener "/*".

    None gives FALSE, like the other comment predicates of this module.
    """
    if line is None:
        return FALSE
    # str.find replaces the deprecated string.find(); it already answers
    # -1 for lines shorter than the opener.
    if line.find("/*") == -1:
        return FALSE
    return TRUE
def nocomment(line):
    """Return *line* with its first comment removed.

    A complete "/* ... */" is cut out and both sides are joined.  When the
    comment is not terminated, the line is truncated at the opener.  A line
    without "/*" is returned unchanged.
    """
    # str.find replaces the deprecated string.find().
    start = line.find("/*")
    if start == -1:
        return line
    end = line.find("*/", start + 2)
    if end == -1:
        return line[:start]         # unterminated: drop the rest
    return line[:start] + line[end + 2:]
def removecomment(line):
    """Return *line* once nocomment() has removed every comment from it.

    nocomment() is applied as long as commentfound() reports an opener.
    """
    remaining = line
    while commentfound(remaining):
        remaining = nocomment(remaining)
    return remaining
#----------------------------------- Compiler directives
def removedirective(line):
    """Skip a compiler directive and return the text that follows it.

    A directive starts and ends with "__" and is possibly followed by a
    block of "(...)", which may nest.  A line without such a directive is
    returned unchanged.  A directive at the very end of the line, or one
    whose parentheses are not balanced, gives "".
    """
    # NOTE(review): text BEFORE the directive is dropped as well, exactly
    # as before this rewrite -- confirm that this is what callers expect.
    first = line.find("__")
    if first == -1:
        return line
    second = line.find("__", first + 2)
    if second == -1:
        return line
    # offset points to the character after the directive name.
    offset = second + 2
    end = len(line)
    if offset >= end or line[offset] != '(':
        return line[offset:]
    # Skip the "(...)" block, never reading past the end of the line.
    openbrackets = 1
    offset += 1
    while openbrackets > 0 and offset < end:
        c = line[offset]
        if c == '(':
            openbrackets += 1
        elif c == ')':
            openbrackets -= 1
        offset += 1
    return line[offset:]
#----------------------------------- Define statement
def isdefine(line):
    """Return TRUE when *line* is a define statement.

    The line must start, in its first column, with '#' followed by
    "define " (any case, including the blank) and be at least 9
    characters long.
    """
    if line is None or len(line) < 9:
        return FALSE
    if line[0] != '#':
        return FALSE
    # String method instead of the deprecated string.lower().
    if line[1:8].lower() != "define ":
        return FALSE
    return TRUE
def opendef(line):
    """Return TRUE when *line* starts a macro or another '#' directive.

    Every line starting with '#' qualifies, except an alias: a "#define"
    whose name, as given by splitdefine(), holds no '('.
    """
    if len(line) == 0 or line[0] != "#":
        return FALSE
    if line[1:7] == "define":
        key, value = splitdefine(line)
        # str.find replaces the deprecated string.find().
        if key.find('(') == -1:
            return FALSE            # plain alias, not a macro
    return TRUE
def closedef(line):
    """Return TRUE while inside a multiline macro, FALSE once it ends.

    Called only when a define block is opened.  The macro goes on when the
    line, after wstring.chop and wstring.strip, ends with a backslash.
    """
    line = wstring.strip(wstring.chop(line))
    # The slice [-1:] gives "" for an empty line, where the index [-1]
    # would raise IndexError.
    if line[-1:] == "\\":
        return TRUE
    return FALSE
def splitdefine(line):
    """Return the (name, definition) pair of a define statement.

    The first 8 characters (the keyword and one whitespace) are skipped,
    then further blanks.  The name runs up to the next blank; the
    definition is whatever follows, passed through wstring.strip.
    """
    rest = line[8:]
    start = nextalpha(rest, 0)      # skipping other whitespaces
    stop = nextspace(rest, start)
    return rest[start:stop], wstring.strip(rest[stop + 1:])
#---------------------------------- Curly brace enclosed block
# Processing a block to be skipped.
# Nesting depth of curly braces; updated by closeblock().
blocklevel = 0


def openblock(line):
    """Return TRUE when *line* opens a block.

    That is the case when openrecord() accepts the line or when the line
    contains a '{'.  None gives FALSE.
    """
    if line is None:
        return FALSE
    if openrecord(line) or '{' in line:
        return TRUE
    return FALSE
def closeblock(line):
    """Return TRUE while inside a multiline block, FALSE once it is closed.

    Called only when a block is opened.  The global `blocklevel` is raised
    on each '{' and lowered on each '}', never below zero.  Scanning stops
    at the '}' that brings the level to zero.
    """
    global blocklevel
    for ch in line:
        if ch == '{':
            blocklevel += 1
        elif ch == '}':
            blocklevel = max(blocklevel - 1, 0)
            if blocklevel == 0:
                return FALSE
    return TRUE
# Process any kind of record to be skipped
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -