Tweak the backend scanner (and psqlscan.l, which must track the backend scanner anyway) to avoid having any backup states. According to the flex manual, this should speed things up, and indeed the backend scanner is about a third faster according to some quick profiling checks. I haven't tried to measure the speed change in psql, but it probably is similar.
21 years ago · 15e4d1e2a7
parent 38af680ad5
commit 15e4d1e2a7
2 changed files with 161 additions and 41 deletions
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -4,13 +4,27 @@
 * scan.l
 *	  lexical scanner for PostgreSQL
 *
- * XXX The rules in this file must be kept in sync with psql's lexer!!!
+ * NOTE NOTE NOTE:
+ *
+ * The rules in this file must be kept in sync with psql's lexer!!!
+ *
+ * The rules are designed so that the scanner never has to backtrack,
+ * in the sense that there is always a rule that can match the input
+ * consumed so far (the rule action may internally throw back some input
+ * with yyless(), however).  As explained in the flex manual, this makes
+ * for a useful speed increase --- about a third faster than a plain -CF
+ * lexer, in simple testing.  The extra complexity is mostly in the rules
+ * for handling float numbers and continued string literals.  If you change
+ * the lexical rules, verify that you haven't broken the no-backtrack
+ * property by running flex with the "-b" option and checking that the
+ * resulting "lex.backup" file says that no backing up is needed.
+ *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -138,6 +152,20 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}).  To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back characters
+ * beyond the quote proper.
+ */
+quote			'
+quotestop		{quote}{whitespace}*
+quotecontinue	{quote}{whitespace_with_newline}{quote}
+quotefail		{quote}{whitespace}*"-"
+
 /* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed. However, this leads to silently swallowed
@@ -148,16 +176,12 @@ whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 * validate the contents.
 */
 xbstart			[bB]{quote}
-xbstop			{quote}
 xbinside		[^']*
-xbcat			{quote}{whitespace_with_newline}{quote}

 /* Hexadecimal number
 */
 xhstart			[xX]{quote}
-xhstop			{quote}
 xhinside		[^']*
-xhcat			{quote}{whitespace_with_newline}{quote}

 /* National character
 */
@@ -165,26 +189,26 @@ xnstart			[nN]{quote}

 /* Extended quote
 * xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
 */
-quote			'
 xqstart			{quote}
-xqstop			{quote}
 xqdouble		{quote}{quote}
 xqinside		[^\\']+
 xqescape		[\\][^0-7]
 xqoctesc		[\\][0-7]{1,3}
-xqcat			{quote}{whitespace_with_newline}{quote}

 /* $foo$ style quotes ("dollar quoting")
 * The quoted string starts with $foo$ where "foo" is an optional string
 * in the form of an identifier, except that it may not contain "$", 
 * and extends to the first occurrence of an identical string.  
 * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
 */
 dolq_start		[A-Za-z\200-\377_]
 dolq_cont		[A-Za-z\200-\377_0-9]
 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
+dolqfailed		\${dolq_start}{dolq_cont}*
 dolqinside		[^$]+

 /* Double quote
@@ -242,12 +266,17 @@ operator		{op_chars}+

 /* we no longer allow unary minus in numbers. 
 * instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999 
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
 */

 integer			{digit}+
 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real			((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real			({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1		({integer}|{decimal})[Ee]
+realfail2		({integer}|{decimal})[Ee][-+]

 param			\${integer}

@@ -310,6 +339,10 @@ other			.
 					/* ignore */
 				}

+<xc>\*+			{
+					/* ignore */
+				}
+
 <xc><<EOF>>		{ yyerror("unterminated /* comment"); }

 {xbstart}		{
@@ -324,7 +357,9 @@ other			.
 					startlit();
 					addlitchar('b');
 				}
-<xb>{xbstop}	{
+<xb>{quotestop}	|
+<xb>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return BCONST;
@@ -333,8 +368,8 @@ other			.
 <xb>{xbinside}	{
 					addlit(yytext, yyleng);
 				}
-<xh>{xhcat}		|
-<xb>{xbcat}		{
+<xh>{quotecontinue}	|
+<xb>{quotecontinue}	{
 					/* ignore */
 				}
 <xb><<EOF>>		{ yyerror("unterminated bit string literal"); }
@@ -351,7 +386,9 @@ other			.
 					startlit();
 					addlitchar('x');
 				}
-<xh>{xhstop}	{
+<xh>{quotestop}	|
+<xh>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return XCONST;
@@ -365,13 +402,11 @@ other			.
 					 */
 					const ScanKeyword *keyword;

-					/* This had better be a keyword! */
+					yyless(1);				/* eat only 'n' this time */
+					/* nchar had better be a keyword! */
 					keyword = ScanKeywordLookup("nchar");
 					Assert(keyword != NULL);
 					yylval.keyword = keyword->name;
-					token_start = yytext;
-					BEGIN(xq);
-					startlit();
 					return keyword->value;
 				}

@@ -380,7 +415,9 @@ other			.
 					BEGIN(xq);
 					startlit();
 				}
-<xq>{xqstop}	{
+<xq>{quotestop}	|
+<xq>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					yylval.str = litbufdup();
 					return SCONST;
@@ -398,7 +435,7 @@ other			.
 					unsigned char c = strtoul(yytext+1, NULL, 8);
 					addlitchar(c);
 				}
-<xq>{xqcat}		{
+<xq>{quotecontinue} {
 					/* ignore */
 				}
 <xq>.			{
@@ -413,6 +450,12 @@ other			.
 					BEGIN(xdolq);
 					startlit();
 				}
+{dolqfailed}	{
+					/* throw back all but the initial "$" */
+					yyless(1);
+					/* and treat it as {other} */
+					return yytext[0];
+				}
 <xdolq>{dolqdelim} {
 					if (strcmp(yytext, dolqstart) == 0)
 					{
@@ -435,6 +478,9 @@ other			.
 <xdolq>{dolqinside} {
 					addlit(yytext, yyleng);
 				}
+<xdolq>{dolqfailed} {
+					addlit(yytext, yyleng);
+				}
 <xdolq>.		{
 					/* This is only needed for $ inside the quoted text */
 					addlitchar(yytext[0]);
@@ -576,6 +622,23 @@ other			.
 					yylval.str = pstrdup(yytext);
 					return FCONST;
 				}
+{realfail1}		{
+					/*
+					 * throw back the [Ee], and treat as {decimal}.  Note
+					 * that it is possible the input is actually {integer},
+					 * but since this case will almost certainly lead to a
+					 * syntax error anyway, we don't bother to distinguish.
+					 */
+					yyless(yyleng-1);
+					yylval.str = pstrdup(yytext);
+					return FCONST;
+				}
+{realfail2}		{
+					/* throw back the [Ee][+-], and proceed as above */
+					yyless(yyleng-2);
+					yylval.str = pstrdup(yytext);
+					return FCONST;
+				}


 {identifier}	{
--- a/src/bin/psql/psqlscan.l
+++ b/src/bin/psql/psqlscan.l
@@ -11,7 +11,9 @@
 * are (except for a few) the same as the backend's, but their actions are
 * just ECHO whereas the backend's actions generally do other things.
 *
- * XXX The rules in this file must be kept in sync with the main parser!!!
+ * XXX The rules in this file must be kept in sync with the backend lexer!!!
+ *
+ * XXX Avoid creating backtracking cases --- see the backend lexer for info.
 *
 * The most difficult aspect of this code is that we need to work in multibyte
 * encodings that are not ASCII-safe.  A "safe" encoding is one in which each
@@ -31,7 +33,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.9 2004/12/31 22:03:15 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.10 2005/05/26 01:24:29 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -207,6 +209,20 @@ special_whitespace		({space}+|{comment}{newline})
 horiz_whitespace		({horiz_space}|{comment})
 whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)

+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}).  To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back characters
+ * beyond the quote proper.
+ */
+quote			'
+quotestop		{quote}{whitespace}*
+quotecontinue	{quote}{whitespace_with_newline}{quote}
+quotefail		{quote}{whitespace}*"-"
+
 /* Bit string
 * It is tempting to scan the string for only those characters
 * which are allowed. However, this leads to silently swallowed
@@ -217,16 +233,12 @@ whitespace_with_newline	({horiz_whitespace}*{newline}{special_whitespace}*)
 * validate the contents.
 */
 xbstart			[bB]{quote}
-xbstop			{quote}
 xbinside		[^']*
-xbcat			{quote}{whitespace_with_newline}{quote}

 /* Hexadecimal number
 */
 xhstart			[xX]{quote}
-xhstop			{quote}
 xhinside		[^']*
-xhcat			{quote}{whitespace_with_newline}{quote}

 /* National character
 */
@@ -234,26 +246,26 @@ xnstart			[nN]{quote}

 /* Extended quote
 * xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
 */
-quote			'
 xqstart			{quote}
-xqstop			{quote}
 xqdouble		{quote}{quote}
 xqinside		[^\\']+
 xqescape		[\\][^0-7]
 xqoctesc		[\\][0-7]{1,3}
-xqcat			{quote}{whitespace_with_newline}{quote}

 /* $foo$ style quotes ("dollar quoting")
 * The quoted string starts with $foo$ where "foo" is an optional string
 * in the form of an identifier, except that it may not contain "$", 
 * and extends to the first occurrence of an identical string.  
 * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
 */
 dolq_start		[A-Za-z\200-\377_]
 dolq_cont		[A-Za-z\200-\377_0-9]
 dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
+dolqfailed		\${dolq_start}{dolq_cont}*
 dolqinside		[^$]+

 /* Double quote
@@ -311,12 +323,17 @@ operator		{op_chars}+

 /* we no longer allow unary minus in numbers. 
 * instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999 
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
 */

 integer			{digit}+
 decimal			(({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real			((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real			({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1		({integer}|{decimal})[Ee]
+realfail2		({integer}|{decimal})[Ee][-+]

 param			\${integer}

@@ -383,11 +400,17 @@ other			.
 					ECHO;
 				}

+<xc>\*+			{
+					ECHO;
+				}
+
 {xbstart}		{
 					BEGIN(xb);
 					ECHO;
 				}
-<xb>{xbstop}	{
+<xb>{quotestop}	|
+<xb>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					ECHO;
 				}
@@ -395,8 +418,8 @@ other			.
 <xb>{xbinside}	{
 					ECHO;
 				}
-<xh>{xhcat}		|
-<xb>{xbcat}		{
+<xh>{quotecontinue}	|
+<xb>{quotecontinue}	{
 					ECHO;
 				}

@@ -410,13 +433,15 @@ other			.
 					BEGIN(xh);
 					ECHO;
 				}
-<xh>{xhstop}	{
+<xh>{quotestop}	|
+<xh>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					ECHO;
 				}

 {xnstart}		{
-					BEGIN(xq);
+					yyless(1);				/* eat only 'n' this time */
 					ECHO;
 				}

@@ -424,7 +449,9 @@ other			.
 					BEGIN(xq);
 					ECHO;
 				}
-<xq>{xqstop}	{
+<xq>{quotestop}	|
+<xq>{quotefail} {
+					yyless(1);
 					BEGIN(INITIAL);
 					ECHO;
 				}
@@ -440,7 +467,7 @@ other			.
 <xq>{xqoctesc}  {
 					ECHO;
 				}
-<xq>{xqcat}		{
+<xq>{quotecontinue} {
 					ECHO;
 				}
 <xq>.			{
@@ -453,6 +480,11 @@ other			.
 					BEGIN(xdolq);
 					ECHO;
 				}
+{dolqfailed}	{
+					/* throw back all but the initial "$" */
+					yyless(1);
+					ECHO;
+				}
 <xdolq>{dolqdelim} {
 					if (strcmp(yytext, cur_state->dolqstart) == 0)
 					{
@@ -474,6 +506,9 @@ other			.
 <xdolq>{dolqinside} {
 					ECHO;
 				}
+<xdolq>{dolqfailed} {
+					ECHO;
+				}
 <xdolq>.		{
 					/* This is only needed for $ inside the quoted text */
 					ECHO;
@@ -636,6 +671,21 @@ other			.
 {real}			{
 					ECHO;
 				}
+{realfail1}		{
+					/*
+					 * throw back the [Ee], and treat as {decimal}.  Note
+					 * that it is possible the input is actually {integer},
+					 * but since this case will almost certainly lead to a
+					 * syntax error anyway, we don't bother to distinguish.
+					 */
+					yyless(yyleng-1);
+					ECHO;
+				}
+{realfail2}		{
+					/* throw back the [Ee][+-], and proceed as above */
+					yyless(yyleng-2);
+					ECHO;
+				}


 {identifier}	{
@@ -817,6 +867,13 @@ other			.
 										  (char) strtol(yytext + 1, NULL, 0));
 				}

+"\\"0[xX]	{
+					/* failed hex case */
+					yyless(2);
+					appendPQExpBufferChar(output_buf,
+										  (char) strtol(yytext + 1, NULL, 0));
+				}
+
 "\\".			{ emit(yytext + 1, 1); }

 {other}|\n		{ ECHO; }