// grammar.java (org.antlr.tool.Grammar)
/* [The "BSD licence"] Copyright (c) 2005-2006 Terence Parr All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/package org.antlr.tool;import antlr.RecognitionException;import antlr.Token;import antlr.TokenStreamRewriteEngine;import antlr.TokenWithIndex;import antlr.collections.AST;import org.antlr.Tool;import org.antlr.analysis.*;import org.antlr.codegen.CodeGenerator;import org.antlr.misc.Barrier;import org.antlr.misc.IntSet;import org.antlr.misc.IntervalSet;import org.antlr.misc.Utils;import org.antlr.stringtemplate.StringTemplate;import org.antlr.stringtemplate.language.AngleBracketTemplateLexer;import java.io.*;import java.util.*;/** Represents a grammar in memory. 
 */
public class Grammar {
    public static final String SYNPRED_RULE_PREFIX = "synpred";

    public static final String GRAMMAR_FILE_EXTENSION = ".g";

    /** used for generating lexer temp files */
    public static final String LEXER_GRAMMAR_FILE_EXTENSION = ".g";

    public static final int INITIAL_DECISION_LIST_SIZE = 300;
    public static final int INVALID_RULE_INDEX = -1;

    // the various kinds of labels. t=type, id=ID, types+=type ids+=ID
    public static final int RULE_LABEL = 1;
    public static final int TOKEN_LABEL = 2;
    public static final int RULE_LIST_LABEL = 3;
    public static final int TOKEN_LIST_LABEL = 4;
    public static final int CHAR_LABEL = 5; // used in lexer for x='a'

    // Human-readable names for the label-type constants above;
    // indexed by RULE_LABEL..TOKEN_LIST_LABEL (index 0 is unused).
    public static String[] LabelTypeToString =
        {"<invalid>", "rule", "token", "rule-list", "token-list"};

    public static final String ARTIFICIAL_TOKENS_RULENAME = "Tokens";
    public static final String FRAGMENT_RULE_MODIFIER = "fragment";

    public static final String SYNPREDGATE_ACTION_NAME = "synpredgate";

    /** When converting ANTLR char and string literals, here is the
     *  value set of escape chars.
     */
    public static int ANTLRLiteralEscapedCharValue[] = new int[255];

    /** Given a char, we need to be able to show as an ANTLR literal.
     */
    public static String ANTLRLiteralCharValueEscape[] = new String[255];

    // Populate the two escape tables above (mappings are each other's
    // inverse, except '"' which needs no escape when shown as a literal).
    static {
        ANTLRLiteralEscapedCharValue['n'] = '\n';
        ANTLRLiteralEscapedCharValue['r'] = '\r';
        ANTLRLiteralEscapedCharValue['t'] = '\t';
        ANTLRLiteralEscapedCharValue['b'] = '\b';
        ANTLRLiteralEscapedCharValue['f'] = '\f';
        ANTLRLiteralEscapedCharValue['\\'] = '\\';
        ANTLRLiteralEscapedCharValue['\''] = '\'';
        ANTLRLiteralEscapedCharValue['"'] = '"';
        ANTLRLiteralCharValueEscape['\n'] = "\\n";
        ANTLRLiteralCharValueEscape['\r'] = "\\r";
        ANTLRLiteralCharValueEscape['\t'] = "\\t";
        ANTLRLiteralCharValueEscape['\b'] = "\\b";
        ANTLRLiteralCharValueEscape['\f'] = "\\f";
        ANTLRLiteralCharValueEscape['\\'] = "\\\\";
        ANTLRLiteralCharValueEscape['\''] = "\\'";
    }

    // Grammar-type codes; used to index the two arrays below.
    public static final int LEXER = 1;
    public static final int PARSER = 2;
    public static final int TREE_PARSER = 3;
    public static final int COMBINED = 4;

    public static final String[] grammarTypeToString = new String[] {
        "<invalid>",
        "lexer",
        "parser",
        "tree",
        "combined"
    };

    public static final String[] grammarTypeToFileNameSuffix = new String[] {
        "<invalid>",
        "Lexer",
        "Parser",
        "",      // no suffix for tree grammars
        "Parser" // if combined grammar, gen Parser and Lexer will be done later
    };

    /** This is the buffer of *all* tokens found in the grammar file
     *  including whitespace tokens etc...  I use this to extract
     *  lexer rules from combined grammars.
     */
    protected TokenStreamRewriteEngine tokenBuffer;

    public static final String IGNORE_STRING_IN_GRAMMAR_FILE_NAME = "__";

    /** Record of one decision point: its number, the NFA state where the
     *  decision starts, the BLOCK AST node it came from, and the DFA built
     *  to predict alternatives (filled in during analysis).
     */
    public static class Decision {
        public int decision;
        public NFAState startState;
        public GrammarAST blockAST;
        public DFA dfa;
    }

    /** Pairs a label token (e.g. the x in x=ID) with the grammar element
     *  it labels; tracks the referenced rule name for rule labels.
     */
    public class LabelElementPair {
        public antlr.Token label;
        public GrammarAST elementRef;
        public String referencedRuleName;
        /** Has an action referenced the label?  Set by ActionAnalysis.g
         *  Currently only set for rule labels.
         */
        public boolean actionReferencesLabel;
        public int type; // in {RULE_LABEL,TOKEN_LABEL,RULE_LIST_LABEL,TOKEN_LIST_LABEL}
        public LabelElementPair(antlr.Token label, GrammarAST elementRef) {
            this.label = label;
            this.elementRef = elementRef;
            this.referencedRuleName = elementRef.getText();
        }
        public Rule getReferencedRule() {
            return getRule(referencedRuleName);
        }
        public String toString() {
            return elementRef.toString();
        }
    }

    /** What name did the user provide for this grammar? */
    public String name;

    /** What type of grammar is this: lexer, parser, tree walker */
    public int type;

    /** A list of options specified at the grammar level such as language=Java.
     *  The value can be an AST for complicated values such as character sets.
     *  There may be code generator specific options in here.  I do no
     *  interpretation of the key/value pairs...they are simply available for
     *  who wants them.
     */
    protected Map options;

    public static final Set legalOptions =
        new HashSet() {
            {
                add("language"); add("tokenVocab");
                add("output"); add("rewrite"); add("ASTLabelType");
                add("TokenLabelType");
                add("superClass");
                add("filter");
                add("k");
                add("backtrack");
                add("memoize");
            }
        };

    // Options that make no sense in the lexer derived from a combined grammar.
    public static final Set doNotCopyOptionsToLexer =
        new HashSet() {
            {
                add("output"); add("ASTLabelType"); add("superClass");
                add("k"); add("backtrack"); add("memoize"); add("rewrite");
            }
        };

    public static final Map defaultOptions =
        new HashMap() {
            {
                put("language","Java");
            }
        };

    /** Is there a global fixed lookahead set for this grammar?
     *  If 0, nothing specified.  -1 implies we have not looked at
     *  the options table yet to set k.
     */
    protected int global_k = -1;

    /** Map a scope to a map of name:action pairs.
     *  Map&lt;String, Map&lt;String,GrammarAST&gt;&gt;
     *  The code generator will use this to fill holes in the output files.
     *  I track the AST node for the action in case I need the line number
     *  for errors.
     */
    protected Map actions = new HashMap();

    /** The NFA that represents the grammar with edges labelled with tokens
     *  or epsilon.  It is more suitable to analysis than an AST representation.
     */
    protected NFA nfa;

    /** Token names and literal tokens like "void" are uniquely indexed.
     *  with -1 implying EOF.  Characters are different; they go from
     *  -1 (EOF) to \uFFFE.  For example, 0 could be a binary byte you
     *  want to lexer.  Labels of DFA/NFA transitions can be both tokens
     *  and characters.  I use negative numbers for bookkeeping labels
     *  like EPSILON. Char/String literals and token types overlap in the same
     *  space, however.
     */
    protected int maxTokenType = Label.MIN_TOKEN_TYPE-1;

    /** TODO: hook this to the charVocabulary option */
    protected IntSet charVocabulary = null;

    /** Map token like ID (but not literals like "while") to its token type */
    protected Map tokenIDToTypeMap = new HashMap();

    /** Map token literals like "while" to its token type.  It may be that
     *  WHILE="while"=35, in which case both tokenNameToTypeMap and this
     *  field will have entries both mapped to 35.
     */
    protected Map stringLiteralToTypeMap = new HashMap();

    /** Map a token type to its token name.
     *  Must subtract MIN_TOKEN_TYPE from index.
     */
    protected Vector typeToTokenList = new Vector();

    /** For interpreting and testing, you sometimes want to import token
     *  definitions from another grammar (instead of reading token defs from
     *  a file).
     */
    protected Grammar importTokenVocabularyFromGrammar;

    /** For ANTLRWorks, we want to be able to map a line:col to a specific
     *  decision DFA so it can display DFA.
     */
    Map lineColumnToLookaheadDFAMap = new HashMap();

    public Tool tool;

    /** The unique set of all rule references in any rule; set of Token
     *  objects so two refs to same rule can exist but at different line/position.
     */
    protected Set<antlr.Token> ruleRefs = new HashSet<antlr.Token>();

    /** The unique set of all token ID references in any rule */
    protected Set<antlr.Token> tokenIDRefs = new HashSet<antlr.Token>();

    /** If combined or lexer grammar, track the rules; Set&lt;String&gt;.
     *  Track lexer rules so we can warn about undefined tokens.
     */
    protected Set<String> lexerRules = new HashSet<String>();

    /** Be able to assign a number to every decision in grammar;
     *  decisions in 1..n
     */
    protected int decisionNumber = 0;

    /** Rules are uniquely labeled from 1..n */
    protected int ruleIndex = 1;

    /** A list of all rules that are in any left-recursive cycle.  There
     *  could be multiple cycles, but this is a flat list of all problematic
     *  rules.
     */
    protected Set leftRecursiveRules;

    /** An external tool requests that DFA analysis abort prematurely.  Stops
     *  at DFA granularity, which are limited to a DFA size and time computation
     *  as failsafe.
     */
    protected boolean externalAnalysisAbort;

    /** When we read in a grammar, we track the list of syntactic predicates
     *  and build faux rules for them later.  See my blog entry Dec 2, 2005:
     *  http://www.antlr.org/blog/antlr3/lookahead.tml
     *  This maps the name (we make up) for a pred to the AST grammar fragment.
     */
    protected LinkedHashMap nameToSynpredASTMap;

    /** Map a rule to it's Rule object */
    protected LinkedHashMap nameToRuleMap = new LinkedHashMap();

    /** Track the scopes defined outside of rules and the scopes associated
     *  with all rules (even if empty).
     */
    protected Map scopes = new HashMap();

    /** Map a rule index to its name; use a Vector on purpose as new
     *  collections stuff won't let me setSize and make it grow.  :(
     *  I need a specific guaranteed index, which the Collections stuff
     *  won't let me have.
     */
    protected Vector ruleIndexToRuleList = new Vector();

    /** An AST that records entire input grammar with all rules.  A simple
     *  grammar with one rule, "grammar t; a : A | B ;", looks like:
     * ( grammar t ( rule a ( BLOCK ( ALT A ) ( ALT B ) ) &lt;end-of-rule&gt; ) )
     */
    protected GrammarAST grammarTree = null;

    /** Each subrule/rule is a decision point and we must track them so we
     *  can go back later and build DFA predictors for them.  This includes
     *  all the rules, subrules, optional blocks, ()+, ()* etc...  The
     *  elements in this list are NFAState objects.
     */
    protected Vector indexToDecision = new Vector(INITIAL_DECISION_LIST_SIZE);

    /** If non-null, this is the code generator we will use to generate
     *  recognizers in the target language.
     */
    protected CodeGenerator generator;

    NameSpaceChecker nameSpaceChecker = new NameSpaceChecker(this);

    /** Used during LOOK to detect computation cycles */
    protected Set lookBusy = new HashSet();

    /** The checkForLeftRecursion method needs to track what rules it has
     *  visited to track infinite recursion.
     */
    protected Set visitedDuringRecursionCheck = null;

    protected boolean watchNFAConversion = false;

    /** For merged lexer/parsers, we must construct a separate lexer spec.
     *  This is the template for lexer; put the literals first then the
     *  regular rules.  We don't need to specify a token vocab import as
     *  I make the new grammar import from the old all in memory; don't want
     *  to force it to read from the disk.  Lexer grammar will have same
     *  name as original grammar but will be in different filename.  Foo.g
     *  with combined grammar will have FooParser.java generated and
     *  Foo__.g with again Foo inside.  It will however generate FooLexer.java
     *  as it's a lexer grammar.  A bit odd, but autogenerated.  Can tweak
     *  later if we want.
     */
    protected StringTemplate lexerGrammarST =
        new StringTemplate(
            "lexer grammar <name>;\n" +
            "<if(options)>" +
            "options {\n" +
            " <options:{<it.name>=<it.value>;<\\n>}>\n" +
            "}<\\n>\n" +
            "<endif>\n" +
            "<actionNames,actions:{n,a|@<n> {<a>}\n}>\n" +
            "<literals:{<it.ruleName> : <it.literal> ;\n}>\n" +
            "<rules>",
            AngleBracketTemplateLexer.class
        );

    /** What file name holds this grammar? */
    protected String fileName;

    /** How long in ms did it take to build DFAs for this grammar?
     *  If this grammar is a combined grammar, it only records time for
     *  the parser grammar component.  This only records the time to
     *  do the LL(*) work; NFA->DFA conversion.
     */
    public long DFACreationWallClockTimeInMS;

    public int numberOfSemanticPredicates = 0;
    public int numberOfManualLookaheadOptions = 0;
    public Set setOfNondeterministicDecisionNumbers = new HashSet();
    public Set setOfNondeterministicDecisionNumbersResolvedWithPredicates =
        new HashSet();
    public Set setOfDFAWhoseConversionTerminatedEarly = new HashSet();

    /** Track decisions with syn preds specified for reporting.
     *  This is a set of BLOCK type AST nodes.
     */
    public Set<GrammarAST> blocksWithSynPreds = new HashSet();

    /** Track decisions that actually use the syn preds in the DFA.
     *  Computed during NFA to DFA conversion.
     */
    public Set<DFA> decisionsWhoseDFAsUsesSynPreds = new HashSet();
// NOTE(review): the text that followed here (keyboard-shortcut help such as
// "复制代码 Ctrl + C", "搜索代码 Ctrl + F", etc.) was UI chrome from the web
// code viewer this file was copied from, not part of the source. It has been
// commented out so the file remains valid Java; the class body above is
// truncated and continues in the original Grammar.java.