📄 scan.l
字号:
%{
/*-------------------------------------------------------------------------
 *
 * scan.l
 *	  lexical scanner for PostgreSQL
 *
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/parser/scan.l,v 1.111.2.1 2004/02/21 00:35:13 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>
#include <unistd.h>
#include <errno.h>

#include "miscadmin.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "parser/gramparse.h"
#include "parser/keywords.h"
/* Not needed now that this file is compiled as part of gram.y */
/* #include "parser/parse.h" */
#include "parser/scansup.h"
#include "utils/builtins.h"
#include "mb/pg_wchar.h"

/* No reason to constrain amount of data slurped */
#define YY_READ_BUF_SIZE 16777216

/*
 * Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error).
 *
 * This macro replaces every three-argument fprintf() call in the generated
 * scanner: the file and fmt arguments are discarded and msg is reported
 * through ereport(ERROR) instead.
 */
#define fprintf(file, fmt, msg)  ereport(ERROR, (errmsg_internal("%s", msg)))

extern YYSTYPE yylval;

static int		xcdepth = 0;	/* depth of nesting in slash-star comments */

/*
 * literalbuf is used to accumulate literal values when multiple rules
 * are needed to parse a single literal.  Call startlit to reset buffer
 * to empty, addlit to add text.  Note that the buffer is palloc'd and
 * starts life afresh on every parse cycle.
 *
 * startlit() only stores a terminating NUL at literalbuf[0] and zeroes
 * literallen; it does not touch literalalloc.
 */
static char	   *literalbuf;		/* expandable buffer */
static int		literallen;		/* actual current length */
static int		literalalloc;	/* current allocated buffer size */

#define startlit()  (literalbuf[0] = '\0', literallen = 0)
static void addlit(char *ytext, int yleng);
static void addlitchar(unsigned char ychar);
static char *litbufdup(void);

/*
 * When we parse a token that requires multiple lexer rules to process,
 * we set token_start to point at the true start of the token, for use
 * by yyerror().  yytext will point at just the text consumed by the last
 * rule, so it's not very helpful (e.g., it might contain just the last
 * quote mark of a quoted identifier).  But to avoid cluttering every rule
 * with setting token_start, we allow token_start = NULL to denote that
 * it's okay to use yytext.
 */
static char	   *token_start;

/* Handles to the buffer that the lexer uses internally */
static YY_BUFFER_STATE scanbufhandle;
static char *scanbuf;

/* Forward declaration; note that, unlike the helpers above, it is not static */
unsigned char unescape_single_char(unsigned char c);

%}

%option 8bit
%option never-interactive
%option nounput
%option noyywrap
%option prefix="base_yy"

/*
 * OK, here is a short description of lex/flex rules behavior.
 * The longest pattern which matches an input string is always chosen.
 * For equal-length patterns, the first occurring in the rules list is chosen.
 * INITIAL is the starting state, to which all non-conditional rules apply.
 * Exclusive states change parsing rules while the state is active.  When in
 * an exclusive state, only those rules defined for that state apply.
 *
 * We use exclusive states for quoted strings, extended comments,
 * and to eliminate parsing troubles for numeric strings.
 * Exclusive states:
 *  <xb> bit string literal
 *  <xc> extended C-style comments
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xh> hexadecimal numeric string
 *  <xq> quoted strings
 */

%x xb
%x xc
%x xd
%x xh
%x xq

/* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed. However, this leads to silently swallowed
 * characters if illegal characters are included in the string.
 * For example, if xbinside is [01] then B'ABCD' is interpreted
 * as a zero-length string, and the ABCD' is lost!
 * Better to pass the string forward and let the input routines
 * validate the contents.
 */
xbstart			[bB]{quote}
xbstop			{quote}
xbinside		[^']*
xbcat			{quote}{whitespace_with_newline}{quote}

/* Hexadecimal number */
xhstart			[xX]{quote}
xhstop			{quote}
xhinside		[^']*
xhcat			{quote}{whitespace_with_newline}{quote}

/* National character */
xnstart			[nN]{quote}

/* Extended quote
 * xqdouble implements embedded quote
 * xqcat allows strings to cross input lines
 */
quote			'
xqstart			{quote}
xqstop			{quote}
xqdouble		{quote}{quote}
xqinside		[^\\']+
xqescape		[\\][^0-7]
xqoctesc		[\\][0-7]{1,3}
xqcat			{quote}{whitespace_with_newline}{quote}

/* Double quote
 * Allows embedded spaces and other special characters into identifiers.
 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+

/* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
 * The tricky part here is to get lex to recognize a string starting with
 * slash-star as a comment, when interpreting it as an operator would produce
 * a longer match --- remember lex will prefer a longer match!  Also, if we
 * have something like plus-slash-star, lex will think this is a 3-character
 * operator whereas we want to see it as a + operator and a comment start.
 * The solution is two-fold:
 * 1. append {op_chars}* to xcstart so that it matches as much text as
 *    {operator} would. Then the tie-breaker (first matching rule of same
 *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 *    in case it contains a star-slash that should terminate the comment.
 * 2. In the operator rule, check for slash-star within the operator, and
 *    if found throw it back with yyless().  This handles the plus-slash-star
 *    problem.
 * Dash-dash comments have similar interactions with the operator rule.
 */
xcstart			\/\*{op_chars}*
xcstop			\*+\/
xcinside		[^*/]+

digit			[0-9]
ident_start		[A-Za-z\200-\377_]
ident_cont		[A-Za-z\200-\377_0-9\$]

identifier		{ident_start}{ident_cont}*

typecast		"::"

/*
 * "self" is the set of chars that should be returned as single-character
 * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 * which can be one or more characters long (but if a single-char token
 * appears in the "self" set, it is not to be returned as an Op).  Note
 * that the sets overlap, but each has some chars that are not in the other.
 *
 * If you change either set, adjust the character lists appearing in the
 * rule for "operator"!
 */
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+

/* we no longer allow unary minus in numbers.
 * instead we pass it separately to parser. there it gets
 * coerced via doNegate() -- Leon aug 20 1999
 */

integer			{digit}+
decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
real			((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))

param			\${integer}

/*
 * In order to make the world safe for Windows and Mac clients as well as
 * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 * sequence will be seen as two successive newlines, but that doesn't cause
 * any problems.  Comments that start with -- and extend to the next
 * newline are treated as equivalent to a single whitespace character.
 *
 * NOTE a fine point: if there is no newline following --, we will absorb
 * everything to the end of the input as a comment.  This is correct.  Older
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
 * XXX perhaps \f (formfeed) should be treated as a newline as well?
 */

space			[ \t\n\r\f]
horiz_space		[ \t\f]
newline			[\n\r]
non_newline		[^\n\r]

comment			("--"{non_newline}*)

whitespace		({space}+|{comment})

/*
 * SQL requires at least one newline in the whitespace separating
 * string literals that are to be concatenated.  Silly, but who are we
 * to argue?  Note that {whitespace_with_newline} should not have * after
 * it, whereas {whitespace} should generally have a * after it...
 */

special_whitespace		({space}+|{comment}{newline})
horiz_whitespace		({horiz_space}|{comment})
whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

other			.

/*
 * Quoted strings must allow some special characters such as single-quote
 *  and newline.
 * Embedded single-quotes are implemented both in the SQL standard
 *  style of two adjacent single quotes "''" and in the Postgres/Java style
 *  of escaped-quote "\'".
 * Other embedded escaped characters are matched explicitly and the leading
 *  backslash is dropped from the string.
 * Note that xcstart must appear before operator, as explained above!
 *  Also whitespace (comment) must appear before operator.
 */

%%

%{
		/* code to execute during start of each call of yylex() */
		token_start = NULL;
%}

{whitespace}	{
					/* ignore */
				}

{xcstart}		{
					/* Outermost comment opener: remember where it began and start at depth 0 */
					token_start = yytext;
					xcdepth = 0;
					BEGIN(xc);
					/* Put back any characters past slash-star; see above */
					yyless(2);
				}

<xc>{xcstart}	{
					/* Comment opener seen inside a comment: go one nesting level deeper */
					xcdepth++;
					/* Put back any characters past slash-star; see above */
					yyless(2);
				}

<xc>{xcstop}	{
					/*
					 * xcdepth counts only the openers nested inside the
					 * outermost comment, so a terminator seen at depth 0
					 * closes the whole comment; otherwise it closes one
					 * nested level.
					 */
					if (xcdepth <= 0)
					{
						BEGIN(INITIAL);
						/* reset token_start for next token */
						token_start = NULL;
					}
					else
						xcdepth--;
				}

<xc>{xcinside}	{
					/* ignore */
				}

<xc>{op_chars}	{
					/*
					 * ignore: xcinside excludes star and slash, so any of
					 * those that did not form a comment opener or terminator
					 * are consumed here one character at a time
					 */
				}

<xc><<EOF>>		{ yyerror("unterminated /* comment"); }

{xbstart}		{
					/* Binary bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "b" on the string
					 * to mark it for the input routine as a binary string.
					 */
					token_start = yytext;
					BEGIN(xb);
					startlit();
					addlitchar('b');
				}
<xb>{xbstop}	{
					/* Closing quote: hand back the accumulated text as a BCONST token */
					BEGIN(INITIAL);
					yylval.str = litbufdup();
					return BCONST;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					/* Body text of a hex or bit string (shared action): pass it to addlit() */
					addlit(yytext, yyleng);
				}
<xh>{xhcat}		|
<xb>{xbcat}		{
					/* ignore: quote, whitespace with a newline, quote continues the same literal */
				}
<xb><<EOF>>		{ yyerror("unterminated bit string literal"); }

{xhstart}		{
					/* Hexadecimal bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "x" on the string
					 * to mark it for the input routine as a hex string.
					 */
					token_start = yytext;
					BEGIN(xh);
					startlit();
					addlitchar('x');
				}
<xh>{xhstop}	{
					/* Closing quote: hand back the accumulated text as an XCONST token */
					BEGIN(INITIAL);
					yylval.str = litbufdup();
					return XCONST;
				}
<xh><<EOF>>		{ yyerror("unterminated hexadecimal string literal"); }

{xnstart}		{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -