📄 BaseRecognizer.java
package org.antlr.runtime;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** A generic recognizer that can handle recognizers generated from
 *  lexer, parser, and tree grammars.  This is essentially all of the
 *  parsing support code; most of it is error recovery and
 *  backtracking.
 */
public abstract class BaseRecognizer {
    public static final int MEMO_RULE_FAILED = -2;
    public static final int MEMO_RULE_UNKNOWN = -1;
    public static final int INITIAL_FOLLOW_STACK_SIZE = 100;

    public static final Integer MEMO_RULE_FAILED_I = new Integer(MEMO_RULE_FAILED);

    // copies from Token object for convenience in actions
    public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
    public static final int HIDDEN = Token.HIDDEN_CHANNEL;

    public static final String NEXT_TOKEN_RULE_NAME = "nextToken";

    /** Track the set of token types that can follow any rule invocation.
     *  Stack grows upwards.  When it hits the max, it grows 2x in size
     *  and keeps going.
     */
    protected BitSet[] following = new BitSet[INITIAL_FOLLOW_STACK_SIZE];
    protected int _fsp = -1;

    /** This is true when we see an error and before having successfully
     *  matched a token.  Prevents generation of more than one error message
     *  per error.
     */
    protected boolean errorRecovery = false;

    /** The index into the input stream where the last error occurred.
     *  This is used to prevent infinite loops where an error is found
     *  but no token is consumed during recovery...another error is found,
     *  ad nauseam.  This is a failsafe mechanism to guarantee that at least
     *  one token/tree node is consumed for two errors.
     */
    protected int lastErrorIndex = -1;

    /** In lieu of a return value, this indicates that a rule or token
     *  has failed to match.  Reset to false upon valid token match.
     */
    protected boolean failed = false;

    /** If 0, no backtracking is going on.  Safe to exec actions etc...
     *  If >0 then it's the level of backtracking.
     */
    protected int backtracking = 0;

    /** An array[size num rules] of Map<Integer,Integer> that tracks
     *  the stop token index for each rule.  ruleMemo[ruleIndex] is
     *  the memoization table for ruleIndex.  For key ruleStartIndex, you
     *  get back the stop token for the associated rule or MEMO_RULE_FAILED.
     *
     *  This is only used if rule memoization is on (which it is by default).
     */
    protected Map[] ruleMemo;

    /** Reset the parser's state; subclasses must rewind the input stream. */
    public void reset() {
        // wack everything related to error recovery
        _fsp = -1;
        errorRecovery = false;
        lastErrorIndex = -1;
        failed = false;
        // wack everything related to backtracking and memoization
        backtracking = 0;
        for (int i = 0; ruleMemo != null && i < ruleMemo.length; i++) {
            ruleMemo[i] = null; // wipe cache
        }
    }

    /** Match current input symbol against ttype.  Upon error, do one token
     *  insertion or deletion if possible.  You can override to not recover
     *  here and bail out of the current production to the normal error
     *  exception catch (at the end of the method) by just throwing
     *  MismatchedTokenException upon input.LA(1)!=ttype.
     */
    public void match(IntStream input, int ttype, BitSet follow)
        throws RecognitionException
    {
        if ( input.LA(1)==ttype ) {
            input.consume();
            errorRecovery = false;
            failed = false;
            return;
        }
        if ( backtracking>0 ) {
            failed = true;
            return;
        }
        mismatch(input, ttype, follow);
        return;
    }

    public void matchAny(IntStream input) {
        errorRecovery = false;
        failed = false;
        input.consume();
    }

    /** Factor out what to do upon token mismatch so tree parsers can behave
     *  differently.  Override this method in your parser to do things
     *  like bailing out after the first error; just throw the mte object
     *  instead of calling the recovery method.
     */
    protected void mismatch(IntStream input, int ttype, BitSet follow)
        throws RecognitionException
    {
        MismatchedTokenException mte =
            new MismatchedTokenException(ttype, input);
        recoverFromMismatchedToken(input, mte, ttype, follow);
    }

    /** Report a recognition problem.
     *
     *  This method sets errorRecovery to indicate the parser is recovering,
     *  not parsing.  Once in recovery mode, no errors are generated.
     *  To get out of recovery mode, the parser must successfully match
     *  a token (after a resync).  So it will go:
     *
     *      1. error occurs
     *      2. enter recovery mode, report error
     *      3. consume until token found in resync set
     *      4. try to resume parsing
     *      5. next match() will reset errorRecovery mode
     */
    public void reportError(RecognitionException e) {
        // if we've already reported an error and have not matched a token
        // yet successfully, don't report any errors.
        if ( errorRecovery ) {
            //System.err.print("[SPURIOUS] ");
            return;
        }
        errorRecovery = true;
        displayRecognitionError(this.getTokenNames(), e);
    }

    public void displayRecognitionError(String[] tokenNames,
                                        RecognitionException e)
    {
        String hdr = getErrorHeader(e);
        String msg = getErrorMessage(e, tokenNames);
        emitErrorMessage(hdr+" "+msg);
    }

    /** What error message should be generated for the various
     *  exception types?
     *
     *  Not very object-oriented code, but I like having all error message
     *  generation within one method rather than spread among all of the
     *  exception classes.  This also makes the exception handling much
     *  easier because the exception classes do not have to have pointers back
     *  to this object to access utility routines and so on.  Also, changing
     *  the message for an exception type would be difficult because you
     *  would have to subclass the exception, but then somehow get ANTLR
     *  to make those kinds of exception objects instead of the default.
     *  This looks weird, but trust me--it makes the most sense in terms
     *  of flexibility.
     *
     *  For grammar debugging, you will want to override this to add
     *  more information such as the stack frame with
     *  getRuleInvocationStack(e, this.getClass().getName()) and,
     *  for no viable alts, the decision description and state etc...
     *
     *  Override this to change the message generated for one or more
     *  exception types.
     */
    public String getErrorMessage(RecognitionException e, String[] tokenNames) {
        String msg = null;
        if ( e instanceof MismatchedTokenException ) {
            MismatchedTokenException mte = (MismatchedTokenException)e;
            String tokenName="<unknown>";
            if ( mte.expecting==Token.EOF ) {
                tokenName = "EOF";
            }
            else {
                tokenName = tokenNames[mte.expecting];
            }
            msg = "mismatched input "+getTokenErrorDisplay(e.token)+
                " expecting "+tokenName;
        }
        else if ( e instanceof MismatchedTreeNodeException ) {
            MismatchedTreeNodeException mtne = (MismatchedTreeNodeException)e;
            String tokenName="<unknown>";
            if ( mtne.expecting==Token.EOF ) {
                tokenName = "EOF";
            }
            else {
                tokenName = tokenNames[mtne.expecting];
            }
            msg = "mismatched tree node: "+mtne.node+
                " expecting "+tokenName;
        }
        else if ( e instanceof NoViableAltException ) {
            NoViableAltException nvae = (NoViableAltException)e;
            // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
            // and "(decision="+nvae.decisionNumber+") and
            // "state "+nvae.stateNumber
            msg = "no viable alternative at input "+getTokenErrorDisplay(e.token);
        }
        else if ( e instanceof EarlyExitException ) {
            EarlyExitException eee = (EarlyExitException)e;
            // for development, can add "(decision="+eee.decisionNumber+")"
            msg = "required (...)+ loop did not match anything at input "+
                getTokenErrorDisplay(e.token);
        }
        else if ( e instanceof MismatchedSetException ) {
            MismatchedSetException mse = (MismatchedSetException)e;
            msg = "mismatched input "+getTokenErrorDisplay(e.token)+
                " expecting set "+mse.expecting;
        }
        else if ( e instanceof MismatchedNotSetException ) {
            MismatchedNotSetException mse = (MismatchedNotSetException)e;
            msg = "mismatched input "+getTokenErrorDisplay(e.token)+
                " expecting set "+mse.expecting;
        }
        else if ( e instanceof FailedPredicateException ) {
            FailedPredicateException fpe = (FailedPredicateException)e;
            msg = "rule "+fpe.ruleName+" failed predicate: {"+
                fpe.predicateText+"}?";
        }
        return msg;
    }

    /** What is the error header, normally line/character position information? */
    public String getErrorHeader(RecognitionException e) {
        return "line "+e.line+":"+e.charPositionInLine;
    }

    /** How should a token be displayed in an error message?  The default
     *  is to display just the text, but during development you might
     *  want to have a lot of information spit out.  Override in that case
     *  to use t.toString() (which, for CommonToken, dumps everything about
     *  the token).  This is better than forcing you to override a method in
     *  your token objects because you don't have to go modify your lexer
     *  so that it creates a new Java type.
     */
    public String getTokenErrorDisplay(Token t) {
        String s = t.getText();
        if ( s==null ) {
            if ( t.getType()==Token.EOF ) {
                s = "<EOF>";
            }
            else {
                s = "<"+t.getType()+">";
            }
        }
        s = s.replaceAll("\n","\\\\n");
        s = s.replaceAll("\r","\\\\r");
        s = s.replaceAll("\t","\\\\t");
        return "'"+s+"'";
    }

    /** Override this method to change where error messages go. */
    public void emitErrorMessage(String msg) {
        System.err.println(msg);
    }

    /** Recover from an error found on the input stream.  Mostly this is
     *  NoViableAlt exceptions, but could be a mismatched token that
     *  the match() routine could not recover from.
     */
    public void recover(IntStream input, RecognitionException re) {
        if ( lastErrorIndex==input.index() ) {
            // uh oh, another error at same token index; must be a case
            // where LT(1) is in the recovery token set so nothing is
            // consumed; consume a single token at least to prevent
            // an infinite loop; this is a failsafe.
            input.consume();
        }
        lastErrorIndex = input.index();
        BitSet followSet = computeErrorRecoverySet();
        beginResync();
        consumeUntil(input, followSet);
        endResync();
    }

    /** A hook to listen in on the token consumption during error recovery.
     *  The DebugParser subclasses this to fire events to the listener.
     */
    public void beginResync() {
    }

    public void endResync() {
    }

    /** Compute the error recovery set for the current rule.  During
     *  rule invocation, the parser pushes the set of tokens that can
     *  follow that rule reference on the stack; this amounts to
     *  computing FIRST of what follows the rule reference in the
     *  enclosing rule.  This local follow set only includes tokens
     *  from within the rule; i.e., the FIRST computation done by
     *  ANTLR stops at the end of a rule.
     *
     *  EXAMPLE
     *
     *  When you find a "no viable alt exception", the input is not
     *  consistent with any of the alternatives for rule r.  The best
     *  thing to do is to consume tokens until you see something that
     *  can legally follow a call to r *or* any rule that called r.
     *  You don't want the exact set of viable next tokens because the
     *  input might just be missing a token--you might consume the
     *  rest of the input looking for one of the missing tokens.
     *
     *  Consider grammar:
     *
     *  a : '[' b ']'
     *    | '(' b ')'
     *    ;
     *  b : c '^' INT ;
     *  c : ID
     *    | INT
     *    ;
     *
     *  At each rule invocation, the set of tokens that could follow
     *  that rule is pushed on a stack.  Here are the various "local"
     *  follow sets:
     *
     *  FOLLOW(b1_in_a) = FIRST(']') = ']'
     *  FOLLOW(b2_in_a) = FIRST(')') = ')'
     *  FOLLOW(c_in_b)  = FIRST('^') = '^'
     *
     *  Upon erroneous input "[]", the call chain is
     *
     *    a -> b -> c
     *
     *  and, hence, the follow context stack is:
     *
     *  depth   local follow set   after call to rule
     *    0         <EOF>              a (from main())
     *    1          ']'               b
     *    2          '^'               c
     *
     *  Notice that ')' is not included, because b would have to have
     *  been called from a different context in rule a for ')' to be
     *  included.
     *
     *  For error recovery, we cannot consider FOLLOW(c)
     *  (context-sensitive or otherwise).  We need the combined set of
     *  all context-sensitive FOLLOW sets--the set of all tokens that
     *  could follow any reference in the call chain.  We need to
     *  resync to one of those tokens.  Note that FOLLOW(c)='^' and if
     *  we resync'd to that token, we'd consume until EOF.  We need to
     *  sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
     *  In this case, for input "[]", LA(1) is in this set so we would
     *  not consume anything and after printing an error rule c would
     *  return normally.  It would not find the required '^' though.
     *  At this point, it gets a mismatched token error and throws an
     *  exception (since LA(1) is not in the viable following token
     *  set).  The rule exception handler tries to recover, but finds
     *  the same recovery set and doesn't consume anything.  Rule b
     *  exits normally returning to rule a.  Now it finds the ']' (and
     *  with the successful match exits errorRecovery mode).
     *
     *  So, you can see that the parser walks up the call chain looking
     *  for the token that was a member of the recovery set.
     *
     *  Errors are not generated in errorRecovery mode.
     *
     *  ANTLR's error recovery mechanism is based upon original ideas:
     *
     *  "Algorithms + Data Structures = Programs" by Niklaus Wirth
     *
     *  and
     *
     *  "A note on error recovery in recursive descent parsers":
     *  http://portal.acm.org/citation.cfm?id=947902.947905
     *
     *  Later, Josef Grosch had some good ideas:
     *
     *  "Efficient and Comfortable Error Recovery in Recursive Descent
     *  Parsers":
     *  ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
     *
     *  Like Grosch I implemented local FOLLOW sets that are combined
     *  at run-time upon error to avoid overhead during parsing.
     */
    protected BitSet computeErrorRecoverySet() {
        return combineFollows(false);
    }

    /** Compute the context-sensitive FOLLOW set for the current rule.
     *  This is the set of token types that can follow a specific rule
     *  reference given a specific call chain.  You get the set of
     *  viable tokens that can possibly come next (lookahead depth 1)
     *  given the current call chain.  Contrast this with the
     *  definition of plain FOLLOW for rule r:
     *
     *   FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
     *
     *  where x in T* and alpha, beta in V*; T is the set of terminals and
     *  V is the set of terminals and nonterminals.  In other words,
     *  FOLLOW(r) is the set of all tokens that can possibly follow
     *  references to r in *any* sentential form (context).  At
     *  runtime, however, we know precisely which context applies as
     *  we have the call chain.  We may compute the exact (rather
     *  than covering superset) set of following tokens.
     *
     *  For example, consider grammar:
     *
     *  stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
     *       | "return" expr '.'
     *       ;
     *  expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
     *  atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
     *       | '(' expr ')'
     *       ;
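
The javadoc for computeErrorRecoverySet above describes unioning every local FOLLOW set on the follow stack into one resync set (for the "[]" example: {<EOF>, ']', '^'}). The standalone sketch below mirrors that union with plain java.util collections so it can be run without a generated parser; it is an illustration of the idea only, not ANTLR's actual combineFollows, which works on org.antlr.runtime.BitSet and is not shown in this excerpt of the listing. The class and method names here are invented for the demo.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class FollowCombineDemo {
    // Union every local follow set on the stack, mirroring the combination
    // that computeErrorRecoverySet() performs over the `following` array.
    static Set<String> combineFollows(List<Set<String>> followStack) {
        Set<String> combined = new HashSet<String>();
        for (Set<String> local : followStack) {
            combined.addAll(local);
        }
        return combined;
    }

    public static void main(String[] args) {
        // Call chain a -> b -> c for erroneous input "[]" (see the grammar in
        // the javadoc above): one local follow set is pushed per invocation.
        List<Set<String>> stack = new ArrayList<Set<String>>();
        stack.add(new HashSet<String>(Arrays.asList("<EOF>"))); // depth 0, after call to a
        stack.add(new HashSet<String>(Arrays.asList("]")));     // depth 1, FOLLOW(b1_in_a)
        stack.add(new HashSet<String>(Arrays.asList("^")));     // depth 2, FOLLOW(c_in_b)
        // Prints a set containing <EOF>, ']' and '^': the parser consumes
        // tokens until LA(1) is a member of this combined resync set.
        System.out.println(combineFollows(stack));
    }
}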
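
Several methods above are documented as override points: emitErrorMessage ("change where error messages go"), getErrorMessage ("add more information such as the stack frame"), and mismatch ("bailing out after the first error"). The sketch below shows one way a client might combine them. It is a minimal sketch, assuming a hypothetical ANTLR 3 generated parser named ExprParser with a TokenStream constructor; StrictExprParser and ExprParser are not part of the runtime or of this listing.

import org.antlr.runtime.*;
import java.util.ArrayList;
import java.util.List;

// Hypothetical subclass of a generated parser that collects errors instead of
// printing them, adds the rule invocation stack to each message, and bails out
// on the first token mismatch rather than attempting insertion/deletion repair.
public class StrictExprParser extends ExprParser {
    private final List<String> errors = new ArrayList<String>();

    public StrictExprParser(TokenStream input) {
        super(input);
    }

    // Redirect messages away from System.err (the default emitErrorMessage above).
    public void emitErrorMessage(String msg) {
        errors.add(msg);
    }

    // Append the rule invocation stack, as the getErrorMessage javadoc suggests
    // for grammar debugging.
    public String getErrorMessage(RecognitionException e, String[] tokenNames) {
        String msg = super.getErrorMessage(e, tokenNames);
        if ( msg==null ) {
            return null;
        }
        return msg+" (rule stack: "+
            getRuleInvocationStack(e, this.getClass().getName())+")";
    }

    // Throw instead of calling recoverFromMismatchedToken, as the mismatch()
    // javadoc describes, so the first error aborts the current production.
    protected void mismatch(IntStream input, int ttype, BitSet follow)
        throws RecognitionException
    {
        throw new MismatchedTokenException(ttype, input);
    }

    public List<String> getErrors() {
        return errors;
    }
}

A caller would construct it like any generated parser (for example over a CommonTokenStream fed by the matching lexer), invoke a start rule, and then inspect getErrors() instead of watching stderr.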