@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.67 2009/02/18 11:33:04 petere Exp $
* $PostgreSQL: pgsql/src/pl/plpgsql/src/scan.l,v 1.68 2009/04/19 18:52:57 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -19,27 +19,31 @@
#include "mb/pg_wchar.h"
/* No reason to constrain amount of data slurped */
#define YY_READ_BUF_SIZE 16777216
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
#undef fprintf
#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
/*
* When we parse a token that requires multiple lexer rules to process,
* remember the token's starting position this way.
*/
#define SAVE_TOKEN_START() \
( start_lineno = plpgsql_scanner_lineno(), start_charpos = yytext )
/* Handles to the buffer that the lexer uses internally */
/* flex buffer state; handed to yy_delete_buffer() when the scanner is shut down */
static YY_BUFFER_STATE scanbufhandle;
/*
 * Buffer the scanner works on; cur_line_start initially points at it and it
 * is pfree'd at shutdown.  NOTE(review): presumably a private copy of
 * scanstr -- the allocation is outside this view, confirm.
 */
static char *scanbuf;
static const char *scanstr; /* original input string */
/*
 * Token reported exactly once, ahead of any real token, on the first scan
 * of a new source (the rules comment names T_FUNCTION or T_TRIGGER);
 * scanner_typereported records that this has already been done.
 */
static int scanner_functype;
static bool scanner_typereported;
/*
 * Pushback slot for a single token; have_pushback_token is cleared by
 * plpgsql_scanner_init.  NOTE(review): the code that fills and drains the
 * slot is not visible here.
 */
static int pushback_token;
static bool have_pushback_token;
/*
 * Line tracking state; cur_line_start begins at the start of scanbuf.
 * NOTE(review): presumably maintained by plpgsql_scanner_lineno() --
 * verify against that function.
 */
static const char *cur_line_start;
static int cur_line_num;
static int xcdepth = 0; /* depth of nesting in slash-star comments */
/* pstrdup'd copy of the opening delimiter; pfree'd when the matching close is found */
static char *dolqstart; /* current $foo$ quote start string */
/*
 * 0 for '...' / E'...' tokens, otherwise the length of the $foo$ delimiter;
 * written by the IN_STRING / IN_DOLLARQUOTE rules.
 */
static int dolqlen; /* signal to plpgsql_get_string_value */
/* Defined elsewhere; selects whether a plain '...' literal is lexed in <xq> or <xe> */
extern bool standard_conforming_strings;
/*
 * Set to true whenever whitespace or a comment has been consumed, and reset
 * to false at the top of the rules section.  Not static, so it is visible
 * to other files.
 */
bool plpgsql_SpaceScanned = false;
%}
@ -54,31 +58,73 @@ bool plpgsql_SpaceScanned = false;
%option case-insensitive
/*
* Exclusive states are a subset of the core lexer's:
* <xc> extended C-style comments
* <xq> standard quoted strings
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
*/
%x IN_STRING
%x IN_COMMENT
%x IN_DOLLARQUOTE
%x xc
%x xe
%x xq
%x xdolq
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9\$]
/*
* Definitions --- these generally must match the core lexer, but in some
* cases we can simplify, since we only care about identifying the token
* boundaries and not about deriving the represented value. Also, we
* aren't trying to lex multicharacter operators so their interactions
* with comments go away.
*/
quoted_ident (\"[^\"]*\")+
space [ \t\n\r\f]
horiz_space [ \t\f]
newline [\n\r]
non_newline [^\n\r]
identifier ({ident_start}{ident_cont}*|{quoted_ident})
comment ("--"{non_newline}* )
param \${digit}+
whitespace ({space}+|{comment})
special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
space [ \t\n\r\f]
quote '
quotestop {quote}{whitespace}*
quotecontinue {quote}{whitespace_with_newline}{quote}
quotefail {quote}{whitespace}*"-"
xestart [eE]{quote}
xeinside [^\\']+
xeescape [\\].
xqstart {quote}
xqdouble {quote}{quote}
xqinside [^']+
/* $foo$ style quotes ("dollar quoting")
* copied straight from the backend SQL parser
*/
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+
xcstart \/\*
xcstop \*+\/
xcinside [^*/]+
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9\$]
/* This is a simpler treatment of quoted identifiers than the core uses */
quoted_ident (\"[^\"]*\")+
identifier ({ident_start}{ident_cont}*|{quoted_ident})
param \${digit}+
%%
/* ----------
* Local variables in scanner to remember where
@ -95,17 +141,6 @@ dolqinside [^$]+
BEGIN(INITIAL);
plpgsql_SpaceScanned = false;
/* ----------
* On the first call to a new source report the
* function's type (T_FUNCTION or T_TRIGGER)
* ----------
*/
if (!scanner_typereported)
{
scanner_typereported = true;
return scanner_functype;
}
/* ----------
* The keyword rules
* ----------
@ -225,119 +260,134 @@ dump { return O_DUMP; }
{digit}+ { return T_NUMBER; }
\". {
plpgsql_error_lineno = plpgsql_scanner_lineno();
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("unterminated quoted identifier")));
}
/* ----------
* Ignore whitespaces but remember this happened
* ----------
*/
{space}+ { plpgsql_SpaceScanned = true; }
\". { yyerror("unterminated quoted identifier"); }
/* ----------
* Eat up comments
* Ignore whitespace (including comments) but remember this happened
* ----------
*/
--[^\r\n]* ;
\/\* { start_lineno = plpgsql_scanner_lineno();
BEGIN(IN_COMMENT);
}
<IN_COMMENT>\*\/ { BEGIN(INITIAL); plpgsql_SpaceScanned = true; }
<IN_COMMENT>\n ;
<IN_COMMENT>. ;
<IN_COMMENT><<EOF>> {
plpgsql_error_lineno = start_lineno;
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("unterminated /* comment")));
}
{whitespace} { plpgsql_SpaceScanned = true; }
/* ----------
* Collect anything inside of ''s and return one STRING token
*
* Hacking yytext/yyleng here lets us avoid using yymore(), which is
* a win for performance. It's safe because we know the underlying
* input buffer is not changing.
* Comment and literal handling is mostly copied from the core lexer
* ----------
*/
' {
start_lineno = plpgsql_scanner_lineno();
start_charpos = yytext;
BEGIN(IN_STRING);
}
[eE]' {
/* for now, treat the same as a regular literal */
start_lineno = plpgsql_scanner_lineno();
start_charpos = yytext;
BEGIN(IN_STRING);
}
<IN_STRING>\\. { }
<IN_STRING>\\ { /* can only happen with \ at EOF */ }
<IN_STRING>'' { }
<IN_STRING>' {
/* tell plpgsql_get_string_value it's not a dollar quote */
dolqlen = 0;
/* adjust yytext/yyleng to describe whole string token */
yyleng += (yytext - start_charpos);
yytext = start_charpos;
BEGIN(INITIAL);
return T_STRING;
}
<IN_STRING>[^'\\]+ { }
<IN_STRING><<EOF>> {
plpgsql_error_lineno = start_lineno;
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("unterminated quoted string")));
}
{dolqdelim} {
start_lineno = plpgsql_scanner_lineno();
start_charpos = yytext;
dolqstart = pstrdup(yytext);
BEGIN(IN_DOLLARQUOTE);
}
<IN_DOLLARQUOTE>{dolqdelim} {
if (strcmp(yytext, dolqstart) == 0)
{
pfree(dolqstart);
/* tell plpgsql_get_string_value it is a dollar quote */
dolqlen = yyleng;
{xcstart} {
/*
 * Start of a slash-star comment: enter the exclusive <xc> state with
 * the nesting depth reset.  The comment counts as whitespace for the
 * purposes of plpgsql_SpaceScanned.
 */
/* Set location in case of syntax error in comment */
SAVE_TOKEN_START();
xcdepth = 0;
BEGIN(xc);
plpgsql_SpaceScanned = true;
}
<xc>{xcstart} {
/* nested comment opener: one more level to close before we are done */
xcdepth++;
}
<xc>{xcstop} {
/*
 * xcdepth counts only the nested openers, so depth 0 means this
 * terminator closes the outermost comment.
 */
if (xcdepth <= 0)
BEGIN(INITIAL);
else
xcdepth--;
}
<xc>{xcinside} {
/* ignore */
}
<xc>\/+ {
/* ignore */
/* run of slashes that did not begin a nested comment opener */
}
<xc>\*+ {
/* ignore */
/* run of stars that did not begin a comment terminator */
}
<xc><<EOF>> { yyerror("unterminated /* comment"); }
{xqstart} {
/*
 * Opening quote of a plain '...' literal.  Remember where the token
 * starts, then pick the lexing state: <xq> has no rule for backslash
 * escapes, <xe> does, so the setting of standard_conforming_strings
 * decides whether a backslash can hide a quote.
 */
SAVE_TOKEN_START();
if (standard_conforming_strings)
BEGIN(xq);
else
BEGIN(xe);
}
{xestart} {
/* E'...' literal: always lexed with backslash escapes recognized */
SAVE_TOKEN_START();
BEGIN(xe);
}
<xq,xe>{quotestop} |
<xq,xe>{quotefail} {
yyless(1);
BEGIN(INITIAL);
/* adjust yytext/yyleng to describe whole string token */
yyleng += (yytext - start_charpos);
yytext = start_charpos;
BEGIN(INITIAL);
return T_STRING;
}
else
{
/*
* When we fail to match $...$ to dolqstart, transfer
* the $... part to the output, but put back the final
* $ for rescanning. Consider $delim$...$junk$delim$
*/
yyless(yyleng-1);
}
}
<IN_DOLLARQUOTE>{dolqinside} { }
<IN_DOLLARQUOTE>. { /* needed for $ inside the quoted text */ }
<IN_DOLLARQUOTE><<EOF>> {
plpgsql_error_lineno = start_lineno;
ereport(ERROR,
(errcode(ERRCODE_DATATYPE_MISMATCH),
errmsg("unterminated dollar-quoted string")));
}
}
<xq,xe>{xqdouble} {
/*
 * Doubled quote inside the literal.  Nothing is accumulated here or in
 * the rules below: the whole token is recovered from start_charpos
 * when the closing quote is matched, so these rules only have to
 * consume input without ending the string.
 */
}
<xq>{xqinside} {
/* run of characters other than a quote */
}
<xe>{xeinside} {
/* run of characters other than a quote or a backslash */
}
<xe>{xeescape} {
/* backslash plus the character it escapes, so an escaped quote does not end the string */
}
<xq,xe>{quotecontinue} {
/* ignore */
}
<xe>. {
/* This is only needed for \ just before EOF */
}
<xq,xe><<EOF>> { yyerror("unterminated quoted string"); }
{dolqdelim} {
/*
 * Opening $foo$ (or $$) delimiter.  Keep a private copy of it so the
 * closing delimiter can be compared against it, then lex the body in
 * the exclusive <xdolq> state.
 */
SAVE_TOKEN_START();
dolqstart = pstrdup(yytext);
BEGIN(xdolq);
}
{dolqfailed} {
/* looked like the start of a delimiter but the closing "$" never came */
/* throw back all but the initial "$" */
yyless(1);
/* and treat it as {other} */
return yytext[0];
}
<xdolq>{dolqdelim} {
/* candidate closing delimiter: it ends the string only if it equals the opener */
if (strcmp(yytext, dolqstart) == 0)
{
pfree(dolqstart);
BEGIN(INITIAL);
/* adjust yytext/yyleng to describe whole string */
yyleng += (yytext - start_charpos);
yytext = start_charpos;
return T_STRING;
}
else
{
/*
* When we fail to match $...$ to dolqstart, transfer
* the $... part to the output, but put back the final
* $ for rescanning. Consider $delim$...$junk$delim$
*/
yyless(yyleng-1);
}
}
<xdolq>{dolqinside} {
/* run of characters other than "$": part of the body, nothing to do */
}
<xdolq>{dolqfailed} {
/* "$" followed by identifier characters but no closing "$": part of the body */
}
<xdolq>. {
/* This is only needed for $ inside the quoted text */
}
<xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
/* ----------
* Any unmatched character is returned as is
* ----------
*/
. { return yytext[0]; }
. {
return yytext[0];
}
%%
@ -437,7 +487,7 @@ plpgsql_scanner_lineno(void)
* to cite in error messages.
*/
void
plpgsql_scanner_init(const char *str, int functype )
plpgsql_scanner_init(const char *str)
{
Size slen;
@ -460,9 +510,6 @@ plpgsql_scanner_init(const char *str, int functype)
/* Other setup */
scanstr = str;
scanner_functype = functype;
scanner_typereported = false;
have_pushback_token = false;
cur_line_start = scanbuf;
@ -493,77 +540,3 @@ plpgsql_scanner_finish(void)
yy_delete_buffer(scanbufhandle);
pfree(scanbuf);
}
/*
 * Called after a T_STRING token is read to get the string literal's value
 * as a palloc'd string.  (We make this a separate call because in many
 * scenarios there's no need to get the decoded value.)
 *
 * The token text is taken from yytext/yyleng.  dolqlen tells us what kind
 * of literal it is:
 *	 dolqlen > 0	$foo$...$foo$ string; dolqlen is the delimiter length and
 *					the body is returned verbatim
 *	 dolqlen == 0	'...' or E'...' string; doubled quotes collapse to one
 *					quote and a backslash makes the next character literal
 *
 * Returns a NUL-terminated, palloc'd copy of the decoded value.
 *
 * Note: we expect the literal to be the most recently lexed token.  This
 * would not work well if we supported multiple-token pushback or if
 * plpgsql_yylex() wanted to read ahead beyond a T_STRING token.
 */
char *
plpgsql_get_string_value(void)
{
	const char *cp;
	char	   *result;
	int			len;

	if (dolqlen > 0)
	{
		/* Token is a $foo$...$foo$ string: strip one delimiter from each end */
		len = yyleng - 2 * dolqlen;
		Assert(len >= 0);
		result = (char *) palloc(len + 1);
		memcpy(result, yytext + dolqlen, len);
		result[len] = '\0';
		return result;
	}

	/*
	 * Token is an E'...' or a '...' string.  Both forms are decoded by the
	 * same rules; the only difference is how many prefix characters precede
	 * the body (E plus quote, or just the quote).
	 */
	cp = yytext + ((*yytext == 'E' || *yytext == 'e') ? 2 : 1);

	/* Decoding never lengthens the text, so the token length bounds the result */
	result = (char *) palloc(yyleng + 1);	/* more than enough room */
	len = 0;
	for (; *cp; cp++)
	{
		if (*cp == '\'')
		{
			/* '' stands for one quote; store it and step over its twin */
			if (cp[1] == '\'')
				result[len++] = *cp++;
			/* else it must be string end quote */
		}
		else if (*cp == '\\')
		{
			/* keep the escaped character as-is, dropping the backslash */
			if (cp[1] != '\0')	/* just a paranoid check */
				result[len++] = *(++cp);
		}
		else
			result[len++] = *cp;
	}
	result[len] = '\0';

	return result;
}