postgresql/src/interfaces/ecpg/preproc/parser.c

232 lines
5.6 KiB
C

/*-------------------------------------------------------------------------
*
* parser.c
* Main entry point/driver for PostgreSQL grammar
*
* This should match src/backend/parser/parser.c, except that we do not
* need to bother with re-entrant interfaces.
*
* Note: ECPG doesn't report error location like the backend does.
* This file will need work if we ever want it to.
*
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/interfaces/ecpg/preproc/parser.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "preproc_extern.h"
#include "preproc.h"
static bool have_lookahead; /* is lookahead info valid? */
static int lookahead_token; /* one-token lookahead */
static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
static char *lookahead_yytext; /* start current token */
static bool check_uescapechar(unsigned char escape);
static bool ecpg_isspace(char ch);
/*
* Intermediate filter between parser and base lexer (base_yylex in scan.l).
*
* This filter is needed because in some cases the standard SQL grammar
* requires more than one token lookahead. We reduce these cases to one-token
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
*
* Using a filter is simpler than trying to recognize multiword tokens
* directly in scan.l, because we'd have to allow for comments between the
* words. Furthermore it's not clear how to do that without re-introducing
* scanner backtrack, which would cost more performance than this filter
* layer does.
*
* We also use this filter to convert UIDENT and USCONST sequences into
* plain IDENT and SCONST tokens. While that could be handled by additional
* productions in the main grammar, it's more efficient to do it like this.
*/
int
filtered_base_yylex(void)
{
int cur_token;
int next_token;
YYSTYPE cur_yylval;
YYLTYPE cur_yylloc;
char *cur_yytext;
/* Get next token --- we might already have it */
if (have_lookahead)
{
cur_token = lookahead_token;
base_yylval = lookahead_yylval;
base_yylloc = lookahead_yylloc;
base_yytext = lookahead_yytext;
have_lookahead = false;
}
else
cur_token = base_yylex();
/*
* If this token isn't one that requires lookahead, just return it.
*/
switch (cur_token)
{
case NOT:
case NULLS_P:
case WITH:
case UIDENT:
case USCONST:
break;
default:
return cur_token;
}
/* Save and restore lexer output variables around the call */
cur_yylval = base_yylval;
cur_yylloc = base_yylloc;
cur_yytext = base_yytext;
/* Get next token, saving outputs into lookahead variables */
next_token = base_yylex();
lookahead_token = next_token;
lookahead_yylval = base_yylval;
lookahead_yylloc = base_yylloc;
lookahead_yytext = base_yytext;
base_yylval = cur_yylval;
base_yylloc = cur_yylloc;
base_yytext = cur_yytext;
have_lookahead = true;
/* Replace cur_token if needed, based on lookahead */
switch (cur_token)
{
case NOT:
/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
switch (next_token)
{
case BETWEEN:
case IN_P:
case LIKE:
case ILIKE:
case SIMILAR:
cur_token = NOT_LA;
break;
}
break;
case NULLS_P:
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
switch (next_token)
{
case FIRST_P:
case LAST_P:
cur_token = NULLS_LA;
break;
}
break;
case WITH:
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
switch (next_token)
{
case TIME:
case ORDINALITY:
cur_token = WITH_LA;
break;
}
break;
case UIDENT:
case USCONST:
/* Look ahead for UESCAPE */
if (next_token == UESCAPE)
{
/* Yup, so get third token, which had better be SCONST */
const char *escstr;
/*
* Again save and restore lexer output variables around the
* call
*/
cur_yylval = base_yylval;
cur_yylloc = base_yylloc;
cur_yytext = base_yytext;
/* Get third token */
next_token = base_yylex();
if (next_token != SCONST)
mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
/*
* Save and check escape string, which the scanner returns
* with quotes
*/
escstr = base_yylval.str;
if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
base_yylval = cur_yylval;
base_yylloc = cur_yylloc;
base_yytext = cur_yytext;
/* Combine 3 tokens into 1 */
base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
/* Clear have_lookahead, thereby consuming all three tokens */
have_lookahead = false;
}
if (cur_token == UIDENT)
cur_token = IDENT;
else if (cur_token == USCONST)
cur_token = SCONST;
break;
}
return cur_token;
}
/*
* check_uescapechar() and ecpg_isspace() should match their equivalents
* in pgc.l.
*/
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
static bool
check_uescapechar(unsigned char escape)
{
if (isxdigit(escape)
|| escape == '+'
|| escape == '\''
|| escape == '"'
|| ecpg_isspace(escape))
return false;
else
return true;
}
/*
* ecpg_isspace() --- return true if flex scanner considers char whitespace
*/
static bool
ecpg_isspace(char ch)
{
if (ch == ' ' ||
ch == '\t' ||
ch == '\n' ||
ch == '\r' ||
ch == '\f')
return true;
return false;
}