psqlscan.l

来自「PostgreSQL 8.1.4的源码适用于Linux下的开源数据库系统」· L 代码 · 共 1,631 行 · 第 1/3 页
1,631 行
%{
/*-------------------------------------------------------------------------
 *
 * psqlscan.l
 *	  lexical scanner for psql
 *
 * This code is mainly needed to determine where the end of a SQL statement
 * is: we are looking for semicolons that are not within quotes, comments,
 * or parentheses.  The most reliable way to handle this is to borrow the
 * backend's flex lexer rules, lock, stock, and barrel.  The rules below
 * are (except for a few) the same as the backend's, but their actions are
 * just ECHO whereas the backend's actions generally do other things.
 *
 * XXX The rules in this file must be kept in sync with the backend lexer!!!
 *
 * XXX Avoid creating backtracking cases --- see the backend lexer for info.
 *
 * The most difficult aspect of this code is that we need to work in multibyte
 * encodings that are not ASCII-safe.  A "safe" encoding is one in which each
 * byte of a multibyte character has the high bit set (it's >= 0x80).  Since
 * all our lexing rules treat all high-bit-set characters alike, we don't
 * really need to care whether such a byte is part of a sequence or not.
 * In an "unsafe" encoding, we still expect the first byte of a multibyte
 * sequence to be >= 0x80, but later bytes might not be.  If we scan such
 * a sequence as-is, the lexing rules could easily be fooled into matching
 * such bytes to ordinary ASCII characters.  Our solution for this is to
 * substitute 0xFF for each non-first byte within the data presented to flex.
 * The flex rules will then pass the FF's through unmolested.  The emit()
 * subroutine is responsible for looking back to the original string and
 * replacing FF's with the corresponding original bytes.
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.15 2005/06/26 19:16:06 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "psqlscan.h"

#include <ctype.h>

#include "mb/pg_wchar.h"

#include "common.h"
#include "settings.h"
#include "variables.h"


/*
 * We use a stack of flex buffers to handle substitution of psql variables.
 * Each stacked buffer contains the as-yet-unread text from one psql variable.
 * When we pop the stack all the way, we resume reading from the outer buffer
 * identified by scanbufhandle.
 */
typedef struct StackElem
{
	YY_BUFFER_STATE buf;		/* flex input control structure */
	char	   *bufstring;		/* data actually being scanned by flex */
	char	   *origstring;		/* copy of original data, if needed */
	struct StackElem *next;		/* link to the next element of the stack */
} StackElem;

/*
 * All working state of the lexer must be stored in PsqlScanStateData
 * between calls.  This allows us to have multiple open lexer operations,
 * which is needed for nested include files.  The lexer itself is not
 * recursive, but it must be re-entrant.
 */
typedef struct PsqlScanStateData
{
	StackElem  *buffer_stack;	/* stack of variable expansion buffers */
	/*
	 * These variables always refer to the outer buffer, never to any
	 * stacked variable-expansion buffer.
	 */
	/* flex buffer for the outer-level input (see the StackElem comment) */
	YY_BUFFER_STATE scanbufhandle;
	char	   *scanbuf;		/* start of outer-level input buffer */
	const char *scanline;		/* current input line at outer level */

	/* safe_encoding, curline, refline are used by emit() to replace FFs */
	int			encoding;		/* encoding being used now */
	bool		safe_encoding;	/* is current encoding "safe"? */
	const char *curline;		/* actual flex input string for cur buf */
	const char *refline;		/* original data for cur buffer */

	/*
	 * All this state lives across successive input lines, until explicitly
	 * reset by psql_scan_reset.
	 */
	int			start_state;	/* saved YY_START */
	int			paren_depth;	/* depth of nesting in parentheses */
	int			xcdepth;		/* depth of nesting in slash-star comments */
	char	   *dolqstart;		/* current $foo$ quote start string */
} PsqlScanStateData;

static PsqlScanState cur_state;	/* current state while active */

static PQExpBuffer output_buf;	/* current output buffer */

/* these variables do not need to be saved across calls */
static enum slash_option_type option_type;
static char *option_quote;


/* Return values from yylex() */
#define LEXRES_EOL			0	/* end of input */
#define LEXRES_SEMI			1	/* command-terminating semicolon found */
#define LEXRES_BACKSLASH	2	/* backslash command start */
#define LEXRES_OK			3	/* OK completion of backslash argument */


/* the scanner function itself; its results are the LEXRES_* codes above */
int	yylex(void);

/* forward declarations of file-local helper routines */
static void push_new_buffer(const char *newstr);
static YY_BUFFER_STATE prepare_buffer(const char *txt, int len,
									  char **txtcopy);
static void emit(const char *txt, int len);

/* ECHO in the rule actions hands the matched text (yytext, yyleng) to emit() */
#define ECHO emit(yytext, yyleng)

%}

%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap

/*
 * All of the following definitions and rules should exactly match
 * src/backend/parser/scan.l so far as the flex patterns are concerned.
 * The rule bodies are just ECHO as opposed to what the backend does,
 * however.  (But be sure to duplicate code that affects the lexing process,
 * such as BEGIN().)  Also, psqlscan uses a single <<EOF>> rule whereas
 * scan.l has a separate one for each exclusive state.
 */

/*
 * OK, here is a short description of lex/flex rules behavior.
 * The longest pattern which matches an input string is always chosen.
 * For equal-length patterns, the first occurring in the rules list is chosen.
 * INITIAL is the starting state, to which all non-conditional rules apply.
 * Exclusive states change parsing rules while the state is active.  When in
 * an exclusive state, only those rules defined for that state apply.
 *
 * We use exclusive states for quoted strings, extended comments,
 * and to eliminate parsing troubles for numeric strings.
 * Exclusive states:
 *  <xb> bit string literal
 *  <xc> extended C-style comments
 *  <xd> delimited identifiers (double-quoted identifiers)
 *  <xh> hexadecimal numeric string
 *  <xq> quoted strings
 *  <xdolq> $foo$ quoted strings
 */

%x xb
%x xc
%x xd
%x xh
%x xq
%x xdolq
/* Additional exclusive states for psql only: lex backslash commands */
%x xslashcmd
%x xslasharg
%x xslashquote
%x xslashbackquote
%x xslashdefaultarg
%x xslashquotedarg
%x xslashwholeline
%x xslashend

/*
 * In order to make the world safe for Windows and Mac clients as well as
 * Unix ones, we accept either \n or \r as a newline.  A DOS-style \r\n
 * sequence will be seen as two successive newlines, but that doesn't cause
 * any problems.  Comments that start with -- and extend to the next
 * newline are treated as equivalent to a single whitespace character.
 *
 * NOTE a fine point: if there is no newline following --, we will absorb
 * everything to the end of the input as a comment.  This is correct.  Older
 * versions of Postgres failed to recognize -- as a comment if the input
 * did not end with a newline.
 *
 * XXX perhaps \f (formfeed) should be treated as a newline as well?
 */

space			[ \t\n\r\f]
horiz_space		[ \t\f]
newline			[\n\r]
non_newline		[^\n\r]

comment			("--"{non_newline}*)

whitespace		({space}+|{comment})

/*
 * SQL requires at least one newline in the whitespace separating
 * string literals that are to be concatenated.  Silly, but who are we
 * to argue?  Note that {whitespace_with_newline} should not have * after
 * it, whereas {whitespace} should generally have a * after it...
 */

special_whitespace		({space}+|{comment}{newline})
horiz_whitespace		({horiz_space}|{comment})
whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

/*
 * To ensure that {quotecontinue} can be scanned without having to back up
 * if the full pattern isn't matched, we include trailing whitespace in
 * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
 * except for {quote} followed by whitespace and just one "-" (not two,
 * which would start a {comment}).  To cover that we have {quotefail}.
 * The actions for {quotestop} and {quotefail} must throw back characters
 * beyond the quote proper.
 */
quote			'
quotestop		{quote}{whitespace}*
quotecontinue	{quote}{whitespace_with_newline}{quote}
quotefail		{quote}{whitespace}*"-"

/* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed. However, this leads to silently swallowed
 * characters if illegal characters are included in the string.
 * For example, if xbinside is [01] then B'ABCD' is interpreted
 * as a zero-length string, and the ABCD' is lost!
 * Better to pass the string forward and let the input routines
 * validate the contents.
 */
xbstart			[bB]{quote}
xbinside		[^']*

/* Hexadecimal number */
xhstart			[xX]{quote}
xhinside		[^']*

/* National character */
xnstart			[nN]{quote}

/* Quoted string that allows backslash escapes */
xestart			[eE]{quote}

/* Extended quote
 * xqdouble implements embedded quote, ''''
 */
xqstart			{quote}
xqdouble		{quote}{quote}
xqinside		[^\\']+
xqescape		[\\][^0-7]
xqoctesc		[\\][0-7]{1,3}
xqhexesc		[\\]x[0-9A-Fa-f]{1,2}

/* $foo$ style quotes ("dollar quoting")
 * The quoted string starts with $foo$ where "foo" is an optional string
 * in the form of an identifier, except that it may not contain "$",
 * and extends to the first occurrence of an identical string.
 * There is *no* processing of the quoted text.
 *
 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
 * fails to match its trailing "$".
 */
dolq_start		[A-Za-z\200-\377_]
dolq_cont		[A-Za-z\200-\377_0-9]
dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
dolqfailed		\${dolq_start}{dolq_cont}*
dolqinside		[^$]+

/* Double quote
 * Allows embedded spaces and other special characters into identifiers.
 */
dquote			\"
xdstart			{dquote}
xdstop			{dquote}
xddouble		{dquote}{dquote}
xdinside		[^"]+

/* C-style comments
 *
 * The "extended comment" syntax closely resembles allowable operator syntax.
 * The tricky part here is to get lex to recognize a string starting with
 * slash-star as a comment, when interpreting it as an operator would produce
 * a longer match --- remember lex will prefer a longer match!  Also, if we
 * have something like plus-slash-star, lex will think this is a 3-character
 * operator whereas we want to see it as a + operator and a comment start.
 * The solution is two-fold:
 * 1. append {op_chars}* to xcstart so that it matches as much text as
 *    {operator} would. Then the tie-breaker (first matching rule of same
 *    length) ensures xcstart wins.  We put back the extra stuff with yyless()
 *    in case it contains a star-slash that should terminate the comment.
 * 2. In the operator rule, check for slash-star within the operator, and
 *    if found throw it back with yyless().  This handles the plus-slash-star
 *    problem.
 * Dash-dash comments have similar interactions with the operator rule.
 */
xcstart			\/\*{op_chars}*
xcstop			\*+\/
xcinside		[^*/]+

digit			[0-9]
ident_start		[A-Za-z\200-\377_]
ident_cont		[A-Za-z\200-\377_0-9\$]

identifier		{ident_start}{ident_cont}*

typecast		"::"

/*
 * "self" is the set of chars that should be returned as single-character
 * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 * which can be one or more characters long (but if a single-char token
 * appears in the "self" set, it is not to be returned as an Op).  Note
 * that the sets overlap, but each has some chars that are not in the other.
 *
 * If you change either set, adjust the character lists appearing in the
 * rule for "operator"!
 */
self			[,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars		[\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator		{op_chars}+

/* we no longer allow unary minus in numbers.
 * instead we pass it separately to parser. there it gets
 * coerced via doNegate() -- Leon aug 20 1999
 *
 * {realfail1} and {realfail2} are added to prevent the need for scanner
 * backup when the {real} rule fails to match completely.
 */

integer			{digit}+
decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
real			({integer}|{decimal})[Ee][-+]?{digit}+
realfail1		({integer}|{decimal})[Ee]
realfail2		({integer}|{decimal})[Ee][-+]

param			\${integer}

other			.

/*
 * Dollar quoted strings are totally opaque, and no escaping is done on them.
 * Other quoted strings must allow some special characters such as single-quote
 *  and newline.
 * Embedded single-quotes are implemented both in the SQL standard
 *  style of two adjacent single quotes "''" and in the Postgres/Java style
 *  of escaped-quote "\'".
 * Other embedded escaped characters are matched explicitly and the leading
 *  backslash is dropped from the string.
 * Note that xcstart must appear before operator, as explained above!
 *  Also whitespace (comment) must appear before operator.
 */

%%

{whitespace}	{
					/*
					 * Note that the whitespace rule includes both true
					 * whitespace and single-line ("--" style) comments.
					 * We suppress whitespace at the start of the query
					 * buffer.  We also suppress all single-line comments,
					 * which is pretty dubious but is the historical
					 * behavior.
					 */
					if (!(output_buf->len == 0 || yytext[0] == '-'))
						ECHO;
				}

{xcstart}		{
					cur_state->xcdepth = 0;
					BEGIN(xc);
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstart}	{
					cur_state->xcdepth++;
					/* Put back any characters past slash-star; see above */
					yyless(2);
					ECHO;
				}

<xc>{xcstop}	{
					if (cur_state->xcdepth <= 0)
					{
						BEGIN(INITIAL);
					}
					else
						cur_state->xcdepth--;
					ECHO;
				}

<xc>{xcinside}	{
					ECHO;
				}

<xc>{op_chars}	{
					ECHO;
				}

<xc>\*+			{
					ECHO;
				}

{xbstart}		{
					BEGIN(xb);
					ECHO;
				}
<xb>{quotestop}	|
<xb>{quotefail} {
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xh>{xhinside}	|
<xb>{xbinside}	{
					ECHO;
				}
<xh>{quotecontinue}	|
<xb>{quotecontinue}	{
					ECHO;
				}

{xhstart}		{
					/* Hexadecimal bit type.
					 * At some point we should simply pass the string
					 * forward to the parser and label it there.
					 * In the meantime, place a leading "x" on the string
					 * to mark it for the input routine as a hex string.
					 */
					BEGIN(xh);
					ECHO;
				}
<xh>{quotestop}	|
<xh>{quotefail} {
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}

{xnstart}		{
					yyless(1);				/* eat only 'n' this time */
					ECHO;
				}

{xqstart}		{
					BEGIN(xq);
					ECHO;
				}
{xestart}		{
					BEGIN(xq);
					ECHO;
				}
<xq>{quotestop}	|
<xq>{quotefail} {
					yyless(1);
					BEGIN(INITIAL);
					ECHO;
				}
<xq>{xqdouble}  {
					ECHO;
				}
<xq>{xqinside}  {
					ECHO;
				}
<xq>{xqescape}  {
					ECHO;
				}
<xq>{xqoctesc}  {
					ECHO;
				}
<xq>{xqhexesc}  {
					ECHO;
				}
<xq>{quotecontinue} {
					ECHO;
				}
<xq>.			{
					/* This is only needed for \ just before EOF */
					ECHO;
				}

{dolqdelim}		{
					cur_state->dolqstart = pg_strdup(yytext);
					BEGIN(xdolq);
					ECHO;
				}
{dolqfailed}	{
					/* throw back all but the initial "$" */
					yyless(1);
					ECHO;
				}
<xdolq>{dolqdelim} {
					if (strcmp(yytext, cur_state->dolqstart) == 0)
					{
						free(cur_state->dolqstart);
						cur_state->dolqstart = NULL;
						BEGIN(INITIAL);
					}
					else
					{
						/*
						 * When we fail to match $...$ to dolqstart, transfer
						 * the $... part to the output, but put back the final
						 * $ for rescanning.  Consider $delim$...$junk$delim$
						 */
						yyless(yyleng-1);
					}
					ECHO;
				}
<xdolq>{dolqinside} {
					ECHO;
				}
<xdolq>{dolqfailed} {
					ECHO;
				}
<xdolq>.		{
					/* This is only needed for $ inside the quoted text */
					ECHO;
				}

{xdstart}		{
					BEGIN(xd);
					ECHO;
				}
<xd>{xdstop}	{
					BEGIN(INITIAL);
					ECHO;
				}
<xd>{xddouble}	{
					ECHO;
				}
<xd>{xdinside}	{
					ECHO;
				}

{typecast}		{
					ECHO;
				}
psqlscan.l - 源码说明

本页面展示了「PostgreSQL 8.1.4的源码适用于Linux下的开源数据库系统」中的 psqlscan.l 源码文件，该文件是用 Lex（flex）词法分析器描述语言编写的扫描器规则文件，共 1,631 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与PostgreSQL相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?