Teach plpgsql's lexer about dollar-quoted literals.

Andrew Dunstan, some help from Tom Lane.
22 years ago · 5ada9ef088
parent fa7a3abe87
commit 5ada9ef088
4 changed files with 118 additions and 30 deletions
--- a/src/pl/plpgsql/src/gram.y
+++ b/src/pl/plpgsql/src/gram.y
@@ -4,7 +4,7 @@
 *						  procedural language
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.50 2003/12/23 00:01:57 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/gram.y,v 1.51 2004/02/25 18:10:51 tgl Exp $
 *
 *	  This software is copyrighted by Jan Wieck - Hamburg.
 *
@@ -1235,7 +1235,7 @@ stmt_raise		: K_RAISE lno raise_level raise_msg raise_params ';'

 raise_msg		: T_STRING
 					{
-						$$ = strdup(yytext);
+						$$ = plpgsql_get_string_value();
 					}
 				;

--- a/src/pl/plpgsql/src/pl_exec.c
+++ b/src/pl/plpgsql/src/pl_exec.c
@@ -3,7 +3,7 @@
 *			  procedural language
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.96 2004/02/24 01:44:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_exec.c,v 1.97 2004/02/25 18:10:51 tgl Exp $
 *
 *	  This software is copyrighted by Jan Wieck - Hamburg.
 *
@@ -1805,7 +1805,7 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
 	for (cp = stmt->message; *cp; cp++)
 	{
 		/*
-		 * Occurences of a single % are replaced by the next argument's
+		 * Occurrences of a single % are replaced by the next argument's
 		 * external representation. Double %'s are converted to one %.
 		 */
 		if ((c[0] = *cp) == '%')
@@ -1834,21 +1834,6 @@ exec_stmt_raise(PLpgSQL_execstate * estate, PLpgSQL_stmt_raise * stmt)
 			continue;
 		}

-		/*
-		 * Occurrences of single ' are removed. double ' are reduced to
-		 * single ones.  We must do this because the parameter stored by
-		 * the grammar is the raw T_STRING input literal, rather than the
-		 * de-lexed string as you might expect ...
-		 */
-		if (*cp == '\'')
-		{
-			cp++;
-			if (*cp == '\'')
-				plpgsql_dstring_append(&ds, c);
-			else
-				cp--;
-			continue;
-		}
 		plpgsql_dstring_append(&ds, c);
 	}

--- a/src/pl/plpgsql/src/plpgsql.h
+++ b/src/pl/plpgsql/src/plpgsql.h
@@ -3,7 +3,7 @@
 *			  procedural language
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.43 2003/11/29 19:52:12 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/pl/plpgsql/src/plpgsql.h,v 1.44 2004/02/25 18:10:51 tgl Exp $
 *
 *	  This software is copyrighted by Jan Wieck - Hamburg.
 *
@@ -694,5 +694,6 @@ extern void plpgsql_push_back_token(int token);
 extern int	plpgsql_scanner_lineno(void);
 extern void plpgsql_scanner_init(const char *str, int functype);
 extern void plpgsql_scanner_finish(void);
+extern char *plpgsql_get_string_value(void);

 #endif   /* PLPGSQL_H */
--- a/src/pl/plpgsql/src/scan.l
+++ b/src/pl/plpgsql/src/scan.l
@@ -4,7 +4,7 @@
 *			  procedural language
 *
 * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.31 2004/02/24 22:06:32 tgl Exp $
+ *    $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.32 2004/02/25 18:10:51 tgl Exp $
 *
 *    This software is copyrighted by Jan Wieck - Hamburg.
 *
@@ -57,6 +57,8 @@ static int	lookahead_token;
 static bool have_lookahead_token;
 static const char *cur_line_start;
 static int	cur_line_num;
+static char    *dolqstart;      /* current $foo$ quote start string */
+static int	dolqlen;			/* signal to plpgsql_get_string_value */

 int	plpgsql_SpaceScanned = 0;
 %}
@@ -70,7 +72,9 @@ int	plpgsql_SpaceScanned = 0;
 %option case-insensitive


-%x	IN_STRING IN_COMMENT
+%x	IN_STRING
+%x	IN_COMMENT
+%x	IN_DOLLARQUOTE

 digit			[0-9]
 ident_start		[A-Za-z\200-\377_]
@@ -84,6 +88,14 @@ param			\${digit}+

 space			[ \t\n\r\f]

+/* $foo$ style quotes ("dollar quoting")
+ * copied straight from the backend SQL parser
+ */
+dolq_start		[A-Za-z\200-\377_]
+dolq_cont		[A-Za-z\200-\377_0-9]
+dolqdelim		\$({dolq_start}{dolq_cont}*)?\$
+dolqinside		[^$]+
+
 %%
    /* ----------
     * Local variables in scanner to remember where
@@ -97,7 +109,7 @@ space			[ \t\n\r\f]
     * Reset the state when entering the scanner
     * ----------
     */
-    BEGIN INITIAL;
+    BEGIN(INITIAL);
    plpgsql_SpaceScanned = 0;

    /* ----------
@@ -247,9 +259,9 @@ dump			{ return O_DUMP;			}
 --[^\r\n]*		;

 \/\*			{ start_lineno = plpgsql_scanner_lineno();
-			  BEGIN IN_COMMENT;
+			  BEGIN(IN_COMMENT);
 			}
-<IN_COMMENT>\*\/	{ BEGIN INITIAL; plpgsql_SpaceScanned = 1; }
+<IN_COMMENT>\*\/	{ BEGIN(INITIAL); plpgsql_SpaceScanned = 1; }
 <IN_COMMENT>\n		;
 <IN_COMMENT>.		;
 <IN_COMMENT><<EOF>>	{
@@ -260,7 +272,7 @@ dump			{ return O_DUMP;			}
 			}

    /* ----------
-     * Collect anything inside of ''s and return one STRING
+     * Collect anything inside of ''s and return one STRING token
 	 *
 	 * Hacking yytext/yyleng here lets us avoid using yymore(), which is
 	 * a win for performance.  It's safe because we know the underlying
@@ -270,15 +282,18 @@ dump			{ return O_DUMP;			}
 '			{
 			  start_lineno = plpgsql_scanner_lineno();
 			  start_charpos = yytext;
-			  BEGIN IN_STRING;
+			  BEGIN(IN_STRING);
 			}
 <IN_STRING>\\.		{ }
 <IN_STRING>\\		{ /* can only happen with \ at EOF */ }
 <IN_STRING>''		{ }
 <IN_STRING>'		{
-			  yyleng -= (yytext - start_charpos);
+			  /* tell plpgsql_get_string_value it's not a dollar quote */
+			  dolqlen = 0;
+			  /* adjust yytext/yyleng to describe whole string token */
+			  yyleng += (yytext - start_charpos);
 			  yytext = start_charpos;
-			  BEGIN INITIAL;
+			  BEGIN(INITIAL);
 			  return T_STRING;
 			}
 <IN_STRING>[^'\\]+	{ }
@@ -289,6 +304,43 @@ dump			{ return O_DUMP;			}
 						 errmsg("unterminated string")));
 			}

+{dolqdelim}		{
+			  /*
+			   * Opening $foo$ delimiter: note the line and buffer position
+			   * where the literal starts, and keep a copy of the delimiter
+			   * so that candidate closing delimiters can be compared to it.
+			   */
+			  start_lineno = plpgsql_scanner_lineno();
+			  start_charpos = yytext;
+			  dolqstart = pstrdup(yytext);
+			  BEGIN(IN_DOLLARQUOTE);
+			}
+<IN_DOLLARQUOTE>{dolqdelim} {
+			  /* only a delimiter identical to the opening one ends the literal */
+			  if (strcmp(yytext, dolqstart) == 0)
+			  {
+					pfree(dolqstart);
+					/* tell plpgsql_get_string_value it is a dollar quote */
+					dolqlen = yyleng;
+					/* adjust yytext/yyleng to describe whole string token */
+					yyleng += (yytext - start_charpos);
+					yytext = start_charpos;
+					BEGIN(INITIAL);
+					return T_STRING;
+			  }
+			  else
+			  {
+					/*
+					 * When we fail to match $...$ to dolqstart, transfer
+					 * the $... part to the output, but put back the final
+					 * $ for rescanning.  Consider $delim$...$junk$delim$
+					 */
+					yyless(yyleng-1);
+			  }
+			}
+<IN_DOLLARQUOTE>{dolqinside} { /* run of text containing no $: just keep scanning */ }
+<IN_DOLLARQUOTE>.	{ /* needed for $ inside the quoted text */ }
+<IN_DOLLARQUOTE><<EOF>>	{ 
+				/* input ended inside the literal: report the line where it began */
+				plpgsql_error_lineno = start_lineno;
+				ereport(ERROR,
+						(errcode(ERRCODE_DATATYPE_MISMATCH),
+						 errmsg("unterminated dollar-quoted string")));
+			}
+
    /* ----------
     * Any unmatched character is returned as is
     * ----------
@@ -429,7 +481,6 @@ plpgsql_scanner_init(const char *str, int functype)
 	BEGIN(INITIAL);
 }

-
 /*
 * Called after parsing is done to clean up after plpgsql_scanner_init()
 */
@@ -439,3 +490,54 @@ plpgsql_scanner_finish(void)
 	yy_delete_buffer(scanbufhandle);
 	pfree(scanbuf);
 }
+
+/*
+ * Called after a T_STRING token is read to get the string literal's value
+ * as a malloc'd string.  (We make this a separate call because in many
+ * scenarios there's no need to get the decoded value.)
+ *
+ * The scanner rules leave yytext/yyleng describing the whole literal,
+ * delimiters included, and set dolqlen to the delimiter length for a
+ * $foo$...$foo$ literal or to 0 for a '...' literal.
+ *
+ * Returns the literal's contents with the delimiters stripped.  For a
+ * '...' literal, each '' is reduced to ' and each \x is reduced to x.
+ *
+ * Note: we expect the literal to be the most recently lexed token.  This
+ * would not work well if we supported multiple-token pushback or if
+ * plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
+ */
+char *
+plpgsql_get_string_value(void)
+{
+	char	   *result;
+	const char *cp;
+	int			len;
+
+	/* NOTE(review): malloc() results are used unchecked below -- confirm OOM policy */
+	if (dolqlen > 0)
+	{
+		/* Token is a $foo$...$foo$ string: copy what lies between delimiters */
+		len = yyleng - 2 * dolqlen;
+		Assert(len >= 0);
+		result = (char *) malloc(len + 1);
+		memcpy(result, yytext + dolqlen, len);
+		result[len] = '\0';
+	}
+	else
+	{
+		/* Token is a '...' string */
+		result = (char *) malloc(yyleng + 1);	/* more than enough room */
+		len = 0;
+
+		/*
+		 * Step over the opening quote before decoding.  If the loop were to
+		 * examine it, contents beginning with an even-length run of quotes
+		 * (the empty literal '' or the one-quote literal '''') would have
+		 * the opening quote paired with the following one and wrongly
+		 * emitted as a literal quote.
+		 */
+		cp = yytext;
+		if (*cp == '\'')
+			cp++;
+
+		for (; *cp; cp++)
+		{
+			if (*cp == '\'')
+			{
+				if (cp[1] == '\'')
+					result[len++] = *cp++;
+				/* else it must be the string end quote */
+			}
+			else if (*cp == '\\')
+			{
+				if (cp[1] != '\0')	/* just a paranoid check */
+					result[len++] = *(++cp);
+			}
+			else
+				result[len++] = *cp;
+		}
+		result[len] = '\0';
+	}
+	return result;
+}