recognizers.py
来自「antlr最新版本V3源代码」· Python 代码 · 共 1,189 行 · 第 1/3 页
PY
1,189 行
class TokenSource(object):
    """
    @brief Abstract baseclass for token producers.

    A source of tokens must provide a sequence of tokens via nextToken()
    and also must reveal its source of characters; CommonToken's text is
    computed from a CharStream; it only stores indices into the char stream.

    Errors from the lexer are never passed to the parser. Either you want
    to keep going or you do not upon token recognition error. If you do not
    want to continue lexing then you do not want to continue parsing. Just
    raise an exception that is not derived from RecognitionException and it
    will propagate all the way out of the recognizers. If you want to
    continue lexing then you should not raise an exception to the parser--it
    has already requested a token. Keep lexing until you get a valid one.
    Just report errors and keep going, looking for a valid token.
    """

    def nextToken(self):
        """Return a Token object from your input stream (usually a CharStream).

        Do not fail/return upon lexing error; keep chewing on the characters
        until you get a good one; errors are not passed through to the parser.

        Abstract: subclasses must override this method.
        """

        raise NotImplementedError


class Lexer(BaseRecognizer, TokenSource):
    """
    @brief Baseclass for generated lexer classes.

    A lexer is a recognizer that draws input symbols from a character
    stream. Lexer grammars result in a subclass of this object. A Lexer
    object uses simplified match() and error recovery mechanisms in the
    interest of speed.
    """

    def __init__(self, input):
        """Create a lexer that reads characters from the stream `input`."""

        BaseRecognizer.__init__(self)
        TokenSource.__init__(self)

        # Where is the lexer drawing characters from?
        self.input = input

        # The goal of all lexer rules/methods is to create a token object.
        # This is an instance variable as multiple rules may collaborate to
        # create a single token. nextToken will return this object after
        # matching lexer rule(s). If you subclass to allow multiple token
        # emissions, then set this to the last token to be matched or
        # something non-None so that the auto token emit mechanism will not
        # emit another token.
        self.token = None

        # What character index in the stream did the current token start at?
        # Needed, for example, to get the text for current token. Set at
        # the start of nextToken.
        self.tokenStartCharIndex = -1

        # The line on which the first character of the token resides
        self.tokenStartLine = -1

        # The character position of first character within the line
        self.tokenStartCharPositionInLine = -1

        # The channel number for the current token
        self.channel = DEFAULT_CHANNEL

        # The token type for the current token
        self.type = INVALID_TOKEN_TYPE

        # You can set the text for the current token to override what is in
        # the input char buffer. Use setText() or can set this instance var.
        self._text = None

    def reset(self):
        """Reset the recognizer and lexer state and rewind the input, if any."""

        BaseRecognizer.reset(self)  # reset all recognizer state variables

        # wack Lexer state variables
        self.token = None
        self.type = INVALID_TOKEN_TYPE
        self.channel = DEFAULT_CHANNEL
        self.tokenStartCharIndex = -1
        self.tokenStartLine = -1
        self.tokenStartCharPositionInLine = -1
        self._text = None

        if self.input is not None:
            self.input.seek(0)  # rewind the input

    def nextToken(self):
        """
        Return a token from this source; i.e., match a token on the char
        stream.

        Loops until a token can be returned: the per-token state is reset,
        EOF_TOKEN is returned when the next character is EOF, otherwise
        mTokens() is invoked. If the rule left self.token unset a token is
        emitted automatically; if it was set to SKIP_TOKEN the loop starts
        over. A RecognitionException is reported and lexing continues.
        """

        while 1:
            self.token = None
            self.channel = DEFAULT_CHANNEL
            self.tokenStartCharIndex = self.input.index()
            self.tokenStartCharPositionInLine = self.input.charPositionInLine
            self.tokenStartLine = self.input.line
            self._text = None

            if self.input.LA(1) == EOF:
                return EOF_TOKEN

            try:
                self.mTokens()

                if self.token is None:
                    self.emit()
                elif self.token == SKIP_TOKEN:
                    continue

                return self.token

            except RecognitionException as exc:
                self.reportError(exc)

                # match() and matchRange() already call recover() before
                # raising, so recovering unconditionally here would throw
                # away a second character for a single error. Only recover
                # when nothing viable was found (NoViableAltException), or
                # when no input has been consumed since the token start, so
                # that the loop is guaranteed to make progress.
                madeProgress = (
                    self.input.index() != self.tokenStartCharIndex
                    )
                if isinstance(exc, NoViableAltException) or not madeProgress:
                    self.recover(exc)

    def skip(self):
        """
        Instruct the lexer to skip creating a token for current lexer rule
        and look for another token. nextToken() knows to keep looking when
        a lexer rule finishes with token set to SKIP_TOKEN. Recall that
        if token is None at end of any token rule, it creates one for you
        and emits it.
        """

        self.token = SKIP_TOKEN

    def mTokens(self):
        """This is the lexer entry point that sets instance var 'token'"""

        # abstract method
        raise NotImplementedError

    def setCharStream(self, input):
        """Set the char stream and reset the lexer"""

        # Clear the input first so that reset() does not try to rewind the
        # previous stream.
        self.input = None
        self.reset()
        self.input = input

    def emit(self, token=None):
        """
        The standard method called to automatically emit a token at the
        outermost lexical rule. The token object should point into the
        char buffer start..stop. If there is a text override in 'text',
        use that to set the token's text.

        If `token` is None a CommonToken is built from the current lexer
        state; otherwise the given token is used unchanged. The token is
        stored in self.token and returned.
        """

        if token is None:
            token = CommonToken(
                input=self.input,
                type=self.type,
                channel=self.channel,
                start=self.tokenStartCharIndex,
                stop=self.getCharIndex()-1
                )
            token.line = self.tokenStartLine
            token.text = self.text
            token.charPositionInLine = self.tokenStartCharPositionInLine

        self.token = token

        return token

    def match(self, s):
        """
        Match `s` against the upcoming input and consume it.

        `s` is either a string, which is matched character by character,
        or a single input symbol. On a mismatch while self.backtracking > 0
        only self.failed is set to True and the method returns; otherwise
        recover() is called and a MismatchedTokenException is raised.
        On success self.failed is set to False.
        """

        if isinstance(s, basestring):
            i = 0
            while i < len(s):
                if self.input.LA(1) != s[i]:
                    if self.backtracking > 0:
                        self.failed = True
                        return

                    mte = MismatchedTokenException(s[i], self.input)
                    self.recover(mte)
                    raise mte

                i += 1
                self.input.consume()
                self.failed = False

        else:
            if self.input.LA(1) != s:
                if self.backtracking > 0:
                    self.failed = True
                    return

                mte = MismatchedTokenException(s, self.input)
                self.recover(mte)
                raise mte

            self.input.consume()
            self.failed = False

    def matchAny(self):
        """Consume the next input symbol, whatever it is."""

        self.input.consume()

    def matchRange(self, a, b):
        """
        Match the next input symbol if it lies in the inclusive range a..b.

        On a mismatch while self.backtracking > 0 only self.failed is set
        to True and the method returns; otherwise recover() is called and
        a MismatchedRangeException is raised. On success the symbol is
        consumed and self.failed is set to False.
        """

        c = self.input.LA(1)
        if c < a or c > b:
            if self.backtracking > 0:
                self.failed = True
                return

            mre = MismatchedRangeException(a, b, self.input)
            self.recover(mre)
            raise mre

        self.input.consume()
        self.failed = False

    def getLine(self):
        """Return the current line of the input stream."""

        return self.input.line

    def getCharPositionInLine(self):
        """Return the current character position within the line."""

        return self.input.charPositionInLine

    def getCharIndex(self):
        """What is the index of the current character of lookahead?"""

        return self.input.index()

    def getText(self):
        """
        Return the text matched so far for the current token or any
        text override.
        """

        if self._text is not None:
            return self._text

        return self.input.substring(
            self.tokenStartCharIndex,
            self.getCharIndex()-1
            )

    def setText(self, text):
        """
        Set the complete text of this token; it wipes any previous
        changes to the text.
        """

        self._text = text

    text = property(getText, setText)

    def reportError(self, e):
        """Report the recognition error `e` via displayRecognitionError()."""

        ## TODO: not thought about recovery in lexer yet.

        ## # if we've already reported an error and have not matched a token
        ## # yet successfully, don't report any errors.
        ## if self.errorRecovery:
        ##     #System.err.print("[SPURIOUS] ");
        ##     return;
        ##
        ## self.errorRecovery = True

        self.displayRecognitionError(self.tokenNames, e)

    def getErrorMessage(self, e, tokenNames):
        """
        Return a human readable message for the recognition error `e`.

        Character level exceptions are described here; any other exception
        type is delegated to BaseRecognizer.getErrorMessage().
        """

        msg = None

        if isinstance(e, MismatchedTokenException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting " \
                  + self.getCharErrorDisplay(e.expecting)

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at character " \
                  + self.getCharErrorDisplay(e.c)

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at character " \
                  + self.getCharErrorDisplay(e.c)

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, MismatchedRangeException):
            msg = "mismatched character " \
                  + self.getCharErrorDisplay(e.c) \
                  + " expecting set " \
                  + self.getCharErrorDisplay(e.a) \
                  + ".." \
                  + self.getCharErrorDisplay(e.b)

        else:
            msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)

        return msg

    def getCharErrorDisplay(self, c):
        """Return repr() of `c` for error messages; EOF is shown as '<EOF>'."""

        if c == EOF:
            c = '<EOF>'
        return repr(c)

    def recover(self, re):
        """
        Lexers can normally match any char in its vocabulary after matching
        a token, so do the easy thing and just kill a character and hope
        it all works out. You can instead use the rule invocation stack
        to do sophisticated error recovery if you are in a fragment rule.
        """

        self.input.consume()

    def traceIn(self, ruleName, ruleIndex):
        """Trace rule entry, tagging it with the next symbol and position."""

        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
                                         self.getLine(),
                                         self.getCharPositionInLine()
                                         )

        BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)

    def traceOut(self, ruleName, ruleIndex):
        """Trace rule exit, tagging it with the next symbol and position."""

        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
                                         self.getLine(),
                                         self.getCharPositionInLine()
                                         )

        BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)


class Parser(BaseRecognizer):
    """
    @brief Baseclass for generated parser classes.
    """

    def __init__(self, lexer):
        """Create a parser reading from `lexer`, which is installed as the
        token stream via setTokenStream()."""

        BaseRecognizer.__init__(self)

        self.setTokenStream(lexer)

    def reset(self):
        """Reset the recognizer state and rewind the input, if any."""

        BaseRecognizer.reset(self)  # reset all recognizer state variables
        if self.input is not None:
            self.input.seek(0)  # rewind the input

    def setTokenStream(self, input):
        """Set the token stream and reset the parser"""

        # Clear the input first so that reset() does not try to rewind the
        # previous stream.
        self.input = None
        self.reset()
        self.input = input

    def getTokenStream(self):
        """Return the token stream this parser reads from."""

        return self.input

    def traceIn(self, ruleName, ruleIndex):
        """Trace rule entry, tagging it with the next lookahead token."""

        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))

    def traceOut(self, ruleName, ruleIndex):
        """Trace rule exit, tagging it with the next lookahead token."""

        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?