📄 psqlscan.l
字号:
%{
/*-------------------------------------------------------------------------
 *
 * psqlscan.l
 *	  lexical scanner for psql
 *
 * This code is mainly needed to determine where the end of a SQL statement
 * is: we are looking for semicolons that are not within quotes, comments,
 * or parentheses.  The most reliable way to handle this is to borrow the
 * backend's flex lexer rules, lock, stock, and barrel.  The rules below
 * are (except for a few) the same as the backend's, but their actions are
 * just ECHO whereas the backend's actions generally do other things.
 *
 * XXX The rules in this file must be kept in sync with the backend lexer!!!
 *
 * XXX Avoid creating backtracking cases --- see the backend lexer for info.
 *
 * The most difficult aspect of this code is that we need to work in multibyte
 * encodings that are not ASCII-safe.  A "safe" encoding is one in which each
 * byte of a multibyte character has the high bit set (it's >= 0x80).  Since
 * all our lexing rules treat all high-bit-set characters alike, we don't
 * really need to care whether such a byte is part of a sequence or not.
 * In an "unsafe" encoding, we still expect the first byte of a multibyte
 * sequence to be >= 0x80, but later bytes might not be.  If we scan such
 * a sequence as-is, the lexing rules could easily be fooled into matching
 * such bytes to ordinary ASCII characters.  Our solution for this is to
 * substitute 0xFF for each non-first byte within the data presented to flex.
 * The flex rules will then pass the FF's through unmolested.  The emit()
 * subroutine is responsible for looking back to the original string and
 * replacing FF's with the corresponding original bytes.
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.15 2005/06/26 19:16:06 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "psqlscan.h"

#include <ctype.h>

#include "mb/pg_wchar.h"

#include "common.h"
#include "settings.h"
#include "variables.h"


/*
 * We use a stack of flex buffers to handle substitution of psql variables.
 * Each stacked buffer contains the as-yet-unread text from one psql variable.
 * When we pop the stack all the way, we resume reading from the outer buffer
 * identified by scanbufhandle.
 */
typedef struct StackElem
{
	YY_BUFFER_STATE buf;		/* flex input control structure */
	char	   *bufstring;		/* data actually being scanned by flex */
	char	   *origstring;		/* copy of original data, if needed */
	struct StackElem *next;		/* link to the next element of the stack */
} StackElem;

/*
 * All working state of the lexer must be stored in PsqlScanStateData
 * between calls.  This allows us to have multiple open lexer operations,
 * which is needed for nested include files.  The lexer itself is not
 * recursive, but it must be re-entrant.
 */
typedef struct PsqlScanStateData
{
	StackElem  *buffer_stack;	/* stack of variable expansion buffers */
	/*
	 * These variables always refer to the outer buffer, never to any
	 * stacked variable-expansion buffer.
	 */
	YY_BUFFER_STATE scanbufhandle;	/* flex handle of the outer buffer */
	char	   *scanbuf;		/* start of outer-level input buffer */
	const char *scanline;		/* current input line at outer level */

	/* safe_encoding, curline, refline are used by emit() to replace FFs */
	int			encoding;		/* encoding being used now */
	bool		safe_encoding;	/* is current encoding "safe"? */
	const char *curline;		/* actual flex input string for cur buf */
	const char *refline;		/* original data for cur buffer */

	/*
	 * All this state lives across successive input lines, until explicitly
	 * reset by psql_scan_reset.
	 */
	int			start_state;	/* saved YY_START */
	int			paren_depth;	/* depth of nesting in parentheses */
	int			xcdepth;		/* depth of nesting in slash-star comments */
	/*
	 * dolqstart is set with pg_strdup() by the {dolqdelim} rule when a
	 * dollar-quoted string opens, and is freed and reset to NULL by the
	 * <xdolq>{dolqdelim} rule once the identical closing delimiter is seen.
	 */
	char	   *dolqstart;		/* current $foo$ quote start string */
} PsqlScanStateData;

static PsqlScanState cur_state;	/* current state while active */

static PQExpBuffer output_buf;	/* current output buffer */

/* these variables do not need to be saved across calls */
static enum slash_option_type option_type;
static char *option_quote;


/* Return values from yylex() */
#define LEXRES_EOL			0	/* end of input */
#define LEXRES_SEMI			1	/* command-terminating semicolon found */
#define LEXRES_BACKSLASH	2	/* backslash command start */
#define LEXRES_OK			3	/* OK completion of backslash argument */


/* The scanner entry point; its result is one of the LEXRES_* codes above */
int	yylex(void);

/* Static helpers; their definitions are not part of this excerpt */
static void push_new_buffer(const char *newstr);
static YY_BUFFER_STATE prepare_buffer(const char *txt, int len,
									  char **txtcopy);
static void emit(const char *txt, int len);

/*
 * Redefine ECHO so that every rule action that says ECHO hands the matched
 * text (yytext, yyleng) to emit() rather than using flex's built-in ECHO.
 * emit() is what restores the original bytes in place of the 0xFF
 * placeholders described in the file header.
 */
#define ECHO emit(yytext, yyleng)

%}

/*
 * Scanner generation options (see the flex manual for the full semantics):
 *	8bit				generate a scanner that accepts 8-bit input bytes
 *	never-interactive	never treat the input as interactive
 *	nodefault			no default rule: the rules must match all input
 *	nounput				do not generate the unput() routine
 *	noyywrap			do not call yywrap() at end of input
 */
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap

/*
 * All of the following definitions and rules should exactly match
 * src/backend/parser/scan.l so far as the flex patterns are concerned.
 * The rule bodies are just ECHO as opposed to what the backend does,
 * however.  (But be sure to duplicate code that affects the lexing process,
 * such as BEGIN().)  Also, psqlscan uses a single <<EOF>> rule whereas
 * scan.l has a separate one for each exclusive state.
 */

/*
 * OK, here is a short description of lex/flex rules behavior.
 * The longest pattern which matches an input string is always chosen.
 * For equal-length patterns, the first occurring in the rules list is chosen.
 * INITIAL is the starting state, to which all non-conditional rules apply.
 * Exclusive states change parsing rules while the state is active.  When in
 * an exclusive state, only those rules defined for that state apply.
 *
 * We use exclusive states for quoted strings, extended comments,
 * and to eliminate parsing troubles for numeric strings.
 * Exclusive states:
 *  <xb> bit string literal
 *  <xc> extended C-style comments
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xh> hexadecimal numeric string
 *  <xq> quoted strings
 *  <xdolq> $foo$ quoted strings
 */

%x xb
%x xc
%x xd
%x xh
%x xq
%x xdolq
/* Additional exclusive states for psql only: lex backslash commands */
%x xslashcmd
%x xslasharg
%x xslashquote
%x xslashbackquote
%x xslashdefaultarg
%x xslashquotedarg
%x xslashwholeline
%x xslashend

/*
 * In order to make the world safe for Windows and Mac clients as well as
 * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 * sequence will be seen as two successive newlines, but that doesn't cause
 * any problems.  Comments that start with -- and extend to the next
 * newline are treated as equivalent to a single whitespace character.
 *
 * NOTE a fine point: if there is no newline following --, we will absorb
 * everything to the end of the input as a comment.  This is correct.  Older
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
 * XXX perhaps \f (formfeed) should be treated as a newline as well?
 */

space			[ \t\n\r\f]
horiz_space		[ \t\f]
newline			[\n\r]
non_newline		[^\n\r]

comment			("--"{non_newline}*)

whitespace		({space}+|{comment})

/*
 * SQL requires at least one newline in the whitespace separating
 * string literals that are to be concatenated.  Silly, but who are we
 * to argue?  Note that {whitespace_with_newline} should not have * after
 * it, whereas {whitespace} should generally have a * after it...
 */

special_whitespace		({space}+|{comment}{newline})
horiz_whitespace		({horiz_space}|{comment})
whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

/*
 * To ensure that {quotecontinue} can be scanned without having to back up
 * if the full pattern isn't matched, we include trailing whitespace in
 * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 * except for {quote} followed by whitespace and just one "-" (not two,
 * which would start a {comment}).  To cover that we have {quotefail}.
 * The actions for {quotestop} and {quotefail} must throw back characters
 * beyond the quote proper.
 */
quote			'
quotestop		{quote}{whitespace}*
quotecontinue	{quote}{whitespace_with_newline}{quote}
quotefail		{quote}{whitespace}*"-"

/* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed.  However, this leads to silently swallowed
 * characters if illegal characters are included in the string.
 * For example, if xbinside is [01] then B'ABCD' is interpreted
 * as a zero-length string, and the ABCD' is lost!
 * Better to pass the string forward and let the input routines
 * validate the contents.
 */
xbstart			[bB]{quote}
xbinside		[^']*

/* Hexadecimal number */
xhstart			[xX]{quote}
xhinside		[^']*

/* National character */
xnstart			[nN]{quote}

/* Quoted string that allows backslash escapes */
xestart			[eE]{quote}

/* Extended quote
 * xqdouble implements embedded quote, ''''
 */
xqstart			{quote}
xqdouble		{quote}{quote}
xqinside		[^\\']+
xqescape		[\\][^0-7]
xqoctesc		[\\][0-7]{1,3}
xqhexesc		[\\]x[0-9A-Fa-f]{1,2}

/* $foo$ style quotes ("dollar quoting")
 * The quoted string starts with $foo$ where "foo" is an optional string
 * in the form of an identifier, except that it may not contain "$",
 * and extends to the first occurrence of an identical string.
 * There is *no* processing of the quoted text.
 *
 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 * fails to match its trailing "$".
 */
dolq_start		[A-Za-z\200-\377_]
dolq_cont		[A-Za-z\200-\377_0-9]
dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
dolqfailed		\${dolq_start}{dolq_cont}*
dolqinside		[^$]+

/* Double quote
 * Allows embedded spaces and other special characters into identifiers.
 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+

/* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
 * The tricky part here is to get lex to recognize a string starting with
 * slash-star as a comment, when interpreting it as an operator would produce
 * a longer match --- remember lex will prefer a longer match!  Also, if we
 * have something like plus-slash-star, lex will think this is a 3-character
 * operator whereas we want to see it as a + operator and a comment start.
 * The solution is two-fold:
 * 1. append {op_chars}* to xcstart so that it matches as much text as
 *    {operator} would. Then the tie-breaker (first matching rule of same
 *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 *    in case it contains a star-slash that should terminate the comment.
 * 2. In the operator rule, check for slash-star within the operator, and
 *    if found throw it back with yyless().  This handles the plus-slash-star
 *    problem.
 * Dash-dash comments have similar interactions with the operator rule.
 */
xcstart			\/\*{op_chars}*
xcstop			\*+\/
xcinside		[^*/]+

digit			[0-9]
ident_start		[A-Za-z\200-\377_]
ident_cont		[A-Za-z\200-\377_0-9\$]

identifier		{ident_start}{ident_cont}*

typecast		"::"

/*
 * "self" is the set of chars that should be returned as single-character
 * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 * which can be one or more characters long (but if a single-char token
 * appears in the "self" set, it is not to be returned as an Op).  Note
 * that the sets overlap, but each has some chars that are not in the other.
 *
 * If you change either set, adjust the character lists appearing in the
 * rule for "operator"!
 */
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+

/* we no longer allow unary minus in numbers.
 * instead we pass it separately to parser. there it gets
 * coerced via doNegate() -- Leon aug 20 1999
 *
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */

integer			{digit}+
decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
real			({integer}|{decimal})[Ee][-+]?{digit}+
realfail1		({integer}|{decimal})[Ee]
realfail2		({integer}|{decimal})[Ee][-+]

param			\${integer}

other			.

/*
 * Dollar quoted strings are totally opaque, and no escaping is done on them.
 * Other quoted strings must allow some special characters such as single-quote
 *  and newline.
 * Embedded single-quotes are implemented both in the SQL standard
 *  style of two adjacent single quotes "''" and in the Postgres/Java style
 *  of escaped-quote "\'".
 * Other embedded escaped characters are matched explicitly and the leading
 *  backslash is dropped from the string.
 * Note that xcstart must appear before operator, as explained above!
 *  Also whitespace (comment) must appear before operator.
 */

%%

{whitespace}	{
					/*
					 * Note that the whitespace rule includes both true
					 * whitespace and single-line ("--" style) comments.
					 * We suppress whitespace at the start of the query
					 * buffer.  We also suppress all single-line comments,
					 * which is pretty dubious but is the historical
					 * behavior.
					 */
					if (!(output_buf->len == 0 || yytext[0] == '-'))
						ECHO;
				}

{xcstart}		{
					/* Outermost comment start: reset the nesting depth */
					cur_state->xcdepth = 0;
					BEGIN(xc);
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstart}	{
					/* Comment start inside a comment: one level deeper */
					cur_state->xcdepth++;
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstop}	{
					/*
					 * Leave the comment state only when this star-slash
					 * closes the outermost level; otherwise just pop one
					 * nesting level and stay in <xc>.
					 */
					if (cur_state->xcdepth <= 0)
					{
						BEGIN(INITIAL);
					}
					else
						cur_state->xcdepth--;
					ECHO;
				}

<xc>{xcinside}	{
					ECHO;
				}

<xc>{op_chars}	{
					ECHO;
				}

<xc>\*+			{
					ECHO;
				}

{xbstart}		{
					BEGIN(xb);
					ECHO;
				}
<xb>{quotestop}	|
<xb>{quotefail} {
					/* keep only the closing quote; rescan what follows it */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					ECHO;
				}
<xh>{quotecontinue}	|
<xb>{quotecontinue}	{
					ECHO;
				}

{xhstart}		{
					/* Hexadecimal bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "x" on the string
					 * to mark it for the input routine as a hex string.
					 */
					BEGIN(xh);
					ECHO;
				}
<xh>{quotestop}	|
<xh>{quotefail} {
					/* keep only the closing quote; rescan what follows it */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}

{xnstart}		{
					yyless(1);				/* eat only 'n' this time */
					ECHO;
				}

{xqstart}		{
					BEGIN(xq);
					ECHO;
				}
{xestart}		{
					/* E'...' is scanned in the same <xq> state as '...' */
					BEGIN(xq);
					ECHO;
				}
<xq>{quotestop}	|
<xq>{quotefail} {
					/* keep only the closing quote; rescan what follows it */
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xq>{xqdouble}  {
					ECHO;
				}
<xq>{xqinside}  {
					ECHO;
				}
<xq>{xqescape}  {
					ECHO;
				}
<xq>{xqoctesc}  {
					ECHO;
				}
<xq>{xqhexesc}  {
					ECHO;
				}
<xq>{quotecontinue} {
					ECHO;
				}
<xq>.			{
					/* This is only needed for \ just before EOF */
					ECHO;
				}

{dolqdelim}		{
					/*
					 * Remember the opening delimiter so that the
					 * <xdolq>{dolqdelim} rule can compare against it.
					 */
					cur_state->dolqstart = pg_strdup(yytext);
					BEGIN(xdolq);
					ECHO;
				}
{dolqfailed}	{
					/* throw back all but the initial "$" */
					yyless(1);
					ECHO;
				}
<xdolq>{dolqdelim} {
					if (strcmp(yytext, cur_state->dolqstart) == 0)
					{
						/* identical delimiter: the dollar quote ends here */
						free(cur_state->dolqstart);
						cur_state->dolqstart = NULL;
						BEGIN(INITIAL);
					}
					else
					{
						/*
						 * When we fail to match $...$ to dolqstart, transfer
						 * the $... part to the output, but put back the final
						 * $ for rescanning.  Consider $delim$...$junk$delim$
						 */
						yyless(yyleng-1);
					}
					ECHO;
				}
<xdolq>{dolqinside} {
					ECHO;
				}
<xdolq>{dolqfailed} {
					ECHO;
				}
<xdolq>.		{
					/* This is only needed for $ inside the quoted text */
					ECHO;
				}

{xdstart}		{
					BEGIN(xd);
					ECHO;
				}
<xd>{xdstop}	{
					BEGIN(INITIAL);
					ECHO;
				}
<xd>{xddouble}	{
					ECHO;
				}
<xd>{xdinside}	{
					ECHO;
				}

{typecast}		{
					ECHO;
				}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -