📄 antlr3baserecognizer.c
字号:
* sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. * In this case, for input "[]", LA(1) is in this set so we would * not consume anything and after printing an error rule c would * return normally. It would not find the required '^' though. * At this point, it gets a mismatched token error and throws an * exception (since LA(1) is not in the viable following token * set). The rule exception handler tries to recover, but finds * the same recovery set and doesn't consume anything. Rule b * exits normally returning to rule a. Now it finds the ']' (and * with the successful match exits errorRecovery mode). * * So, you cna see that the parser walks up call chain looking * for the token that was a member of the recovery set. * * Errors are not generated in errorRecovery mode. * * ANTLR's error recovery mechanism is based upon original ideas: * * "Algorithms + Data Structures = Programs" by Niklaus Wirth * * and * * "A note on error recovery in recursive descent parsers": * http://portal.acm.org/citation.cfm?id=947902.947905 * * Later, Josef Grosch had some good ideas: * * "Efficient and Comfortable Error Recovery in Recursive Descent * Parsers": * ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip * * Like Grosch I implemented local FOLLOW sets that are combined * at run-time upon error to avoid overhead during parsing. */static pANTLR3_BITSET computeErrorRecoverySet (pANTLR3_BASE_RECOGNIZER recognizer){ return recognizer->combineFollows(recognizer, ANTLR3_FALSE);}/** Compute the context-sensitive FOLLOW set for current rule. * This is set of token types that can follow a specific rule * reference given a specific call chain. You get the set of * viable tokens that can possibly come next (lookahead depth 1) * given the current call chain. 
Contrast this with the
 * definition of plain FOLLOW for rule r:
 *
 *  FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
 *
 * where x in T* and alpha, beta in V*; T is set of terminals and
 * V is the set of terminals and nonterminals. In other words,
 * FOLLOW(r) is the set of all tokens that can possibly follow
 * references to r in *any* sentential form (context). At
 * runtime, however, we know precisely which context applies as
 * we have the call chain. We may compute the exact (rather
 * than covering superset) set of following tokens.
 *
 * For example, consider grammar:
 *
 *  stat : ID '=' expr ';'       // FOLLOW(stat)=={EOF}
 *       | "return" expr '.'
 *       ;
 *  expr : atom ('+' atom)* ;    // FOLLOW(expr)=={';','.',')'}
 *  atom : INT                   // FOLLOW(atom)=={'+',')',';','.'}
 *       | '(' expr ')'
 *       ;
 *
 * The FOLLOW sets are all inclusive whereas context-sensitive
 * FOLLOW sets are precisely what could follow a rule reference.
 * For input "i=(3);", here is the derivation:
 *
 *  stat => ID '=' expr ';'
 *       => ID '=' atom ('+' atom)* ';'
 *       => ID '=' '(' expr ')' ('+' atom)* ';'
 *       => ID '=' '(' atom ')' ('+' atom)* ';'
 *       => ID '=' '(' INT ')' ('+' atom)* ';'
 *       => ID '=' '(' INT ')' ';'
 *
 * At the "3" token, you'd have a call chain of
 *
 *   stat -> expr -> atom -> expr -> atom
 *
 * What can follow that specific nested ref to atom? Exactly ')'
 * as you can see by looking at the derivation of this specific
 * input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
 *
 * You want the exact viable token set when recovering from a
 * token mismatch. Upon token mismatch, if LA(1) is member of
 * the viable next token set, then you know there is most likely
 * a missing token in the input stream. "Insert" one by just not
 * throwing an exception.
*/static pANTLR3_BITSET computeCSRuleFollow (pANTLR3_BASE_RECOGNIZER recognizer){ return recognizer->combineFollows(recognizer, ANTLR3_FALSE);}static pANTLR3_BITSET combineFollows (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_BOOLEAN exact){ pANTLR3_BITSET followSet; pANTLR3_BITSET localFollowSet; ANTLR3_UINT64 top; ANTLR3_UINT64 i; top = recognizer->following->size(recognizer->following); followSet = antlr3BitsetNew(0); for (i = top; i>0; i--) { localFollowSet = (pANTLR3_BITSET) recognizer->following->get(recognizer->following, i); if (localFollowSet != NULL) { followSet->orInPlace(followSet, localFollowSet); } if ( exact == ANTLR3_TRUE && localFollowSet->isMember(localFollowSet, ANTLR3_EOR_TOKEN_TYPE) == ANTLR3_FALSE ) { break; } } followSet->remove(followSet, ANTLR3_EOR_TOKEN_TYPE); return followSet;}#ifdef WIN32#pragma warning( disable : 4100 )#endifstatic void displayRecognitionError (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 * tokenNames){ pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; pANTLR3_COMMON_TOKEN theToken; pANTLR3_BASE_TREE theBaseTree; pANTLR3_COMMON_TREE theCommonTree; /* Indicate this recognizer had an error while processing. */ recognizer->errorCount++; theToken = NULL; /* Assume there is no token to use */ fprintf(stderr, "%s(", (char *)(recognizer->exception->streamName));#ifdef WIN32 /* shanzzle fraazzle Dick Dastardly */ fprintf(stderr, "%I64d) ", recognizer->exception->line);#else fprintf(stderr, "%lld) ", recognizer->exception->type);#endif fprintf(stderr, ": error %d : %s", recognizer->exception->type, (pANTLR3_UINT8) (recognizer->exception->message)); /* How we determine the next piece is dependent on which thign raised the * error. 
*/ switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; theToken = (pANTLR3_COMMON_TOKEN)(recognizer->exception->token); fprintf(stderr, ", at offset %d", recognizer->exception->charPositionInLine); if (theToken != NULL) { if (theToken->type == ANTLR3_TOKEN_EOF) { fprintf(stderr, ", at <EOF>"); } else { fprintf(stderr, ", near %s", theToken->toString(theToken)->chars); } } break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; theBaseTree = (pANTLR3_BASE_TREE)(recognizer->exception->token); if (theBaseTree != NULL) { theCommonTree = (pANTLR3_COMMON_TREE) theBaseTree->super; if (theCommonTree != NULL) { theToken = (pANTLR3_COMMON_TOKEN) theCommonTree->getToken(theBaseTree); } fprintf(stderr, ", at offset %d", theBaseTree->getCharPositionInLine(theBaseTree)); } break; default: fprintf(stderr, "Base recognizerfunction displayRecognitionError called by unknown parser type - provide override for this function\n"); return; break; } fprintf(stderr, "\n"); /* TODO: Improve error output acccording to the exception type, though generally * the implementor will want their own function to replace this. */}/** Recover from an error found on the input stream. Mostly this is * NoViableAlt exceptions, but could be a mismatched token that * the match() routine could not recover from. 
*/
static void
recover(pANTLR3_BASE_RECOGNIZER recognizer)
{
    pANTLR3_INT_STREAM  input;      /* Stream the recognizer is reading from    */
    pANTLR3_BITSET      resyncSet;  /* Tokens we are allowed to resync on       */

    /* Locate the underlying integer stream; where it lives depends on the
     * kind of recognizer we were handed.
     */
    if (recognizer->type == ANTLR3_TYPE_PARSER)
    {
        input = ((pANTLR3_PARSER)(recognizer->super))->tstream->istream;
    }
    else if (recognizer->type == ANTLR3_TYPE_TREE_PARSER)
    {
        input = ((pANTLR3_TREE_PARSER)(recognizer->super))->ctnstream->tnstream->istream;
    }
    else
    {
        fprintf(stderr, "Base recognizerfunction recover called by unknown paresr type - provide override for this function\n");
        return;
    }

    /* If the previous error was reported at this very index, nothing was
     * consumed in between (LT(1) must be in the recovery set). Eat one
     * token as a failsafe so that we cannot loop forever.
     */
    if (recognizer->lastErrorIndex == input->index(input))
    {
        input->consume(input);
    }

    /* Remember where this error happened */
    recognizer->lastErrorIndex = input->index(input);

    /* Build the set of tokens we can resynchronize on */
    resyncSet = recognizer->computeErrorRecoverySet(recognizer);

    /* Skip input until a member of that set is seen, bracketed by the
     * resync hooks (for debuggers and so on).
     */
    recognizer->beginResync(recognizer);
    recognizer->consumeUntilSet(recognizer, resyncSet);
    recognizer->endResync(recognizer);

    /* The set was built just for this call; release it */
    resyncSet->free(resyncSet);

    /* Clear the error flag so the exception is not reported again */
    recognizer->error = ANTLR3_FALSE;
}

/** Attempt to recover from a single missing or extra token.
*
 * EXTRA TOKEN
 *
 * LA(1) is not what we are looking for. If LA(2) has the right token,
 * however, then assume LA(1) is some extra spurious token. Delete it
 * and LA(2) as if we were doing a normal match(), which advances the
 * input.
 *
 * MISSING TOKEN
 *
 * If current token is consistent with what could come after
 * ttype then it is ok to "insert" the missing token, else throw
 * exception. For example, input "i=(3;" is clearly missing the
 * ')'. When the parser returns from the nested call to expr, it
 * will have call chain:
 *
 *    stat -> expr -> atom
 *
 * and it will be trying to match the ')' at this point in the
 * derivation:
 *
 *     => ID '=' '(' INT ')' ('+' atom)* ';'
 *                        ^
 * match() will see that ';' doesn't match ')' and report a
 * mismatched token error. To recover, it sees that LA(1)==';'
 * is in the set of tokens that can follow the ')' token
 * reference in rule atom. It can assume that you forgot the ')'.
 *
 * May need to come back and look at the exception stuff here, I am assuming
 * that the exception that was passed in, in the Java implementation, is
 * stored in the recognizer exception stack. To 'throw' it we set the
 * error flag and rules can cascade back when this is set.
*/static void recoverFromMismatchedToken (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET follow){ pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: fprintf(stderr, "Base recognizerfunction recoverFromMismatchedToken called by unknown paresr type - provide override for this function\n"); return; break; } /* If the next token after the one we are looking at in the input stream * is what we are looking for then we remove the one we have discovered * from the stream by consuming it, then consume this next one along too as * if nothing had happened. */ if ( is->_LA(is, 2) == ttype) { /* Print out the error */ recognizer->reportError(recognizer); /* Call resync hook (for debuggeres and so on) */ recognizer->beginResync(recognizer); /* "delete" the extra token */ is->consume(is); /* End resync hook */ recognizer->endResync(recognizer); /* consume the token that the rule actually expected to get */ is->consume(is); recognizer->error = ANTLR3_FALSE; /* Exception is not outstanding any more */ } /* The next token (after the one that is current, is not the one * that we were expecting, so the input is in more of an error state * than we hoped. * If we are able to recover from the error using the follow set, then * we are hunky dory again and can move on, if we cannot, then we resort * to throwing the exception. 
*/ if (recognizer->recoverFromMismatchedElement(recognizer, follow) == ANTLR3_FALSE) { recognizer->error = ANTLR3_TRUE; recognizer->failed = ANTLR3_TRUE; return; }}static void recoverFromMismatchedSet (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET follow){ pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: fprintf(stderr, "Base recognizerfunction recoverFromMismatchedSet called by unknown paresr type - provide override for this function\n"); return; break; } /* TODO - Single token deletion like in recoverFromMismatchedToken() */ if (recognizer->recoverFromMismatchedElement(recognizer, follow) == ANTLR3_FALSE) { recognizer->error = ANTLR3_TRUE; recognizer->failed = ANTLR3_TRUE; return; }}/** This code is factored out from mismatched token and mismatched set * recovery. It handles "single token insertion" error recovery for * both. No tokens are consumed to recover from insertions. Return * true if recovery was possible else return false. */static ANTLR3_BOOLEAN recoverFromMismatchedElement (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET follow){ pANTLR3_BITSET viableToksFollowingRule; pANTLR3_BITSET newFollow; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: fprintf(stderr, "Base recognizerfunction recover called by unknown paresr type - provide override for this function\n"); return ANTLR3_FALSE;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -