recognizers.py
来自「antlr最新版本V3源代码」· Python 代码 · 共 1,189 行 · 第 1/3 页
PY
1,189 行
"""ANTLR3 runtime package"""# begin[licence]## [The "BSD licence"]# Copyright (c) 2005-2006 Terence Parr# All rights reserved.## Redistribution and use in source and binary forms, with or without# modification, are permitted provided that the following conditions# are met:# 1. Redistributions of source code must retain the above copyright# notice, this list of conditions and the following disclaimer.# 2. Redistributions in binary form must reproduce the above copyright# notice, this list of conditions and the following disclaimer in the# documentation and/or other materials provided with the distribution.# 3. The name of the author may not be used to endorse or promote products# derived from this software without specific prior written permission.## THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.## end[licence]import sysimport inspectfrom antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \ EOR_TOKEN_TYPE, INVALID_TOKEN_TYPEfrom antlr3.exceptions import RecognitionException, MismatchedTokenException, \ MismatchedRangeException, MismatchedTreeNodeException, \ NoViableAltException, EarlyExitException, MismatchedSetException, \ MismatchedNotSetException, FailedPredicateExceptionfrom antlr3.tokens import CommonToken, EOF_TOKEN, SKIP_TOKENfrom antlr3.compat import set, frozenset, reversedclass BaseRecognizer(object): """ @brief Common recognizer functionality. A generic recognizer that can handle recognizers generated from lexer, parser, and tree grammars. This is all the parsing support code essentially; most of it is error recovery stuff and backtracking. """ MEMO_RULE_FAILED = -2 MEMO_RULE_UNKNOWN = -1 # copies from Token object for convenience in actions DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL # for convenience in actions HIDDEN = HIDDEN_CHANNEL # overridden by generated subclasses tokenNames = None def __init__(self): # Input stream of the recognizer. Must be initialized by a subclass. self.input = None # Track the set of token types that can follow any rule invocation. # Stack grows upwards. When it hits the max, it grows 2x in size # and keeps going. self.following = [] # This is true when we see an error and before having successfully # matched a token. Prevents generation of more than one error message # per error. self.errorRecovery = False # The index into the input stream where the last error occurred. # This is used to prevent infinite loops where an error is found # but no token is consumed during recovery...another error is found, # ad naseum. This is a failsafe mechanism to guarantee that at least # one token/tree node is consumed for two errors. self.lastErrorIndex = -1 # In lieu of a return value, this indicates that a rule or token # has failed to match. Reset to false upon valid token match. self.failed = False # If 0, no backtracking is going on. Safe to exec actions etc... # If >0 then it's the level of backtracking. self.backtracking = 0 # An array[size num rules] of Map<Integer,Integer> that tracks # the stop token index for each rule. ruleMemo[ruleIndex] is # the memoization table for ruleIndex. For key ruleStartIndex, you # get back the stop token for associated rule or MEMO_RULE_FAILED. # # This is only used if rule memoization is on (which it is by default). self.ruleMemo = None # this one only exists to shut up pylint :( def setInput(self, input): self.input = input def reset(self): """ reset the parser's state; subclasses must rewinds the input stream """ # wack everything related to error recovery self.errorRecovery = False self.lastErrorIndex = -1 self.failed = False # wack everything related to backtracking and memoization self.backtracking = 0 if self.ruleMemo is not None: self.ruleMemo = {} def match(self, input, ttype, follow): """ Match current input symbol against ttype. Upon error, do one token insertion or deletion if possible. You can override to not recover here and bail out of the current production to the normal error exception catch (at the end of the method) by just throwing MismatchedTokenException upon input.LA(1)!=ttype. """ if self.input.LA(1) == ttype: self.input.consume() self.errorRecovery = False self.failed = False return if self.backtracking > 0: self.failed = True return self.mismatch(input, ttype, follow) return def matchAny(self, input): self.errorRecovery = False self.failed = False self.input.consume() def mismatch(self, input, ttype, follow): """ factor out what to do upon token mismatch so tree parsers can behave differently. Override this method in your parser to do things like bailing out after the first error; just throw the mte object instead of calling the recovery method. """ mte = MismatchedTokenException(ttype, input) self.recoverFromMismatchedToken(input, mte, ttype, follow) def reportError(self, e): """Report a recognition problem. This method sets errorRecovery to indicate the parser is recovering not parsing. Once in recovery mode, no errors are generated. To get out of recovery mode, the parser must successfully match a token (after a resync). So it will go: -# error occurs -# enter recovery mode, report error -# consume until token found in resynch set -# try to resume parsing -# next match() will reset errorRecovery mode . """ # if we've already reported an error and have not matched a token # yet successfully, don't report any errors. if self.errorRecovery: return self.errorRecovery = True self.displayRecognitionError(self.tokenNames, e) def displayRecognitionError(self, tokenNames, e): hdr = self.getErrorHeader(e) msg = self.getErrorMessage(e, tokenNames) self.emitErrorMessage(hdr+" "+msg) def getErrorMessage(self, e, tokenNames): """ What error message should be generated for the various exception types? Not very object-oriented code, but I like having all error message generation within one method rather than spread among all of the exception classes. This also makes it much easier for the exception handling because the exception classes do not have to have pointers back to this object to access utility routines and so on. Also, changing the message for an exception type would be difficult because you would have to subclassing exception, but then somehow get ANTLR to make those kinds of exception objects instead of the default. This looks weird, but trust me--it makes the most sense in terms of flexibility. For grammar debugging, you will want to override this to add more information such as the stack frame with getRuleInvocationStack(e, this.getClass().getName()) and, for no viable alts, the decision description and state etc... Override this to change the message generated for one or more exception types. """ msg = None if isinstance(e, MismatchedTokenException): tokenName = "<unknown>" if e.expecting == EOF: tokenName = "EOF" else: tokenName = self.tokenNames[e.expecting] msg = "mismatched input " \ + self.getTokenErrorDisplay(e.token) \ + " expecting " \ + tokenName elif isinstance(e, MismatchedTreeNodeException): tokenName = "<unknown>" if e.expecting == EOF: tokenName = "EOF" else: tokenName = self.tokenNames[e.expecting] msg = "mismatched tree node: " \ + e.node \ + " expecting " \ + tokenName elif isinstance(e, NoViableAltException): msg = "no viable alternative at input " \ + self.getTokenErrorDisplay(e.token) elif isinstance(e, EarlyExitException): msg = "required (...)+ loop did not match anything at input " \ + self.getTokenErrorDisplay(e.token) elif isinstance(e, MismatchedSetException): msg = "mismatched input " \ + self.getTokenErrorDisplay(e.token) \ + " expecting set " \ + repr(e.expecting) elif isinstance(e, MismatchedNotSetException): msg = "mismatched input " \ + self.getTokenErrorDisplay(e.token) \ + " expecting set " \ + repr(e.expecting) elif isinstance(e, FailedPredicateException): msg = "rule " \ + e.ruleName \ + " failed predicate: {" \ + e.predicateText \ + "}?" return msg def getErrorHeader(self, e): """ What is the error header, normally line/character position information? """ return "line %d:%d" % (e.line, e.charPositionInLine) def getTokenErrorDisplay(self, t): """ How should a token be displayed in an error message? The default is to display just the text, but during development you might want to have a lot of information spit out. Override in that case to use t.toString() (which, for CommonToken, dumps everything about the token). This is better than forcing you to override a method in your token objects because you don't have to go modify your lexer so that it creates a new Java type. """ s = t.text if s is None: if t.type == EOF: s = "<EOF>" else: s = "<"+t.type+">" return repr(s) def emitErrorMessage(self, msg): """Override this method to change where error messages go""" sys.stderr.write(msg + '\n') def recover(self, input, re): """ Recover from an error found on the input stream. Mostly this is NoViableAlt exceptions, but could be a mismatched token that the match() routine could not recover from. """ # PROBLEM? what if input stream is not the same as last time # perhaps make lastErrorIndex a member of input if self.lastErrorIndex == input.index(): # uh oh, another error at same token index; must be a case # where LT(1) is in the recovery token set so nothing is # consumed; consume a single token so at least to prevent # an infinite loop; this is a failsafe. input.consume() self.lastErrorIndex = input.index() followSet = self.computeErrorRecoverySet() self.beginResync() self.consumeUntil(input, followSet) self.endResync() def beginResync(self): """ A hook to listen in on the token consumption during error recovery. The DebugParser subclasses this to fire events to the listenter. """ pass def endResync(self): """ A hook to listen in on the token consumption during error recovery. The DebugParser subclasses this to fire events to the listenter. """ pass def computeErrorRecoverySet(self): """ Compute the error recovery set for the current rule. During rule invocation, the parser pushes the set of tokens that can follow that rule reference on the stack; this amounts to computing FIRST of what follows the rule reference in the enclosing rule. This local follow set only includes tokens from within the rule; i.e., the FIRST computation done by ANTLR stops at the end of a rule. EXAMPLE When you find a "no viable alt exception", the input is not consistent with any of the alternatives for rule r. The best thing to do is to consume tokens until you see something that can legally follow a call to r *or* any rule that called r. You don't want the exact set of viable next tokens because the input might just be missing a token--you might consume the rest of the input looking for one of the missing tokens. Consider grammar: a : '[' b ']' | '(' b ')' ; b : c '^' INT ; c : ID | INT ; At each rule invocation, the set of tokens that could follow that rule is pushed on a stack. Here are the various "local" follow sets: FOLLOW(b1_in_a) = FIRST(']') = ']' FOLLOW(b2_in_a) = FIRST(')') = ')' FOLLOW(c_in_b) = FIRST('^') = '^' Upon erroneous input "[]", the call chain is a -> b -> c
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?