/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * This should match src/backend/parser/parser.c, except that we do not
 * need to bother with re-entrant interfaces.
 *
 * Note: ECPG doesn't report error location like the backend does.
 * This file will need work if we ever want it to.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/interfaces/ecpg/preproc/parser.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres_fe.h"

#include "preproc_extern.h"
#include "preproc.h"


static bool have_lookahead; /* is lookahead info valid? */
|
|
static int lookahead_token; /* one-token lookahead */
|
|
static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
|
|
static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
|
|
static char *lookahead_yytext; /* start current token */
|
|
|
|
static bool check_uescapechar(unsigned char escape);
|
|
static bool ecpg_isspace(char ch);
|
|
|
|
|
|
/*
|
|
* Intermediate filter between parser and base lexer (base_yylex in scan.l).
|
|
*
|
|
* This filter is needed because in some cases the standard SQL grammar
|
|
* requires more than one token lookahead. We reduce these cases to one-token
|
|
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
|
|
*
|
|
* Using a filter is simpler than trying to recognize multiword tokens
|
|
* directly in scan.l, because we'd have to allow for comments between the
|
|
* words. Furthermore it's not clear how to do that without re-introducing
|
|
* scanner backtrack, which would cost more performance than this filter
|
|
* layer does.
|
|
*
|
|
* We also use this filter to convert UIDENT and USCONST sequences into
|
|
* plain IDENT and SCONST tokens. While that could be handled by additional
|
|
* productions in the main grammar, it's more efficient to do it like this.
|
|
*/
|
|
int
|
|
filtered_base_yylex(void)
|
|
{
|
|
int cur_token;
|
|
int next_token;
|
|
YYSTYPE cur_yylval;
|
|
YYLTYPE cur_yylloc;
|
|
char *cur_yytext;
|
|
|
|
/* Get next token --- we might already have it */
|
|
if (have_lookahead)
|
|
{
|
|
cur_token = lookahead_token;
|
|
base_yylval = lookahead_yylval;
|
|
base_yylloc = lookahead_yylloc;
|
|
base_yytext = lookahead_yytext;
|
|
have_lookahead = false;
|
|
}
|
|
else
|
|
cur_token = base_yylex();
|
|
|
|
/*
|
|
* If this token isn't one that requires lookahead, just return it.
|
|
*/
|
|
switch (cur_token)
|
|
{
|
|
case NOT:
|
|
case NULLS_P:
|
|
case WITH:
|
|
case UIDENT:
|
|
case USCONST:
|
|
break;
|
|
default:
|
|
return cur_token;
|
|
}
|
|
|
|
/* Save and restore lexer output variables around the call */
|
|
cur_yylval = base_yylval;
|
|
cur_yylloc = base_yylloc;
|
|
cur_yytext = base_yytext;
|
|
|
|
/* Get next token, saving outputs into lookahead variables */
|
|
next_token = base_yylex();
|
|
|
|
lookahead_token = next_token;
|
|
lookahead_yylval = base_yylval;
|
|
lookahead_yylloc = base_yylloc;
|
|
lookahead_yytext = base_yytext;
|
|
|
|
base_yylval = cur_yylval;
|
|
base_yylloc = cur_yylloc;
|
|
base_yytext = cur_yytext;
|
|
|
|
have_lookahead = true;
|
|
|
|
/* Replace cur_token if needed, based on lookahead */
|
|
switch (cur_token)
|
|
{
|
|
case NOT:
|
|
/* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
|
|
switch (next_token)
|
|
{
|
|
case BETWEEN:
|
|
case IN_P:
|
|
case LIKE:
|
|
case ILIKE:
|
|
case SIMILAR:
|
|
cur_token = NOT_LA;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case NULLS_P:
|
|
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
|
|
switch (next_token)
|
|
{
|
|
case FIRST_P:
|
|
case LAST_P:
|
|
cur_token = NULLS_LA;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case WITH:
|
|
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
|
|
switch (next_token)
|
|
{
|
|
case TIME:
|
|
case ORDINALITY:
|
|
cur_token = WITH_LA;
|
|
break;
|
|
}
|
|
break;
|
|
case UIDENT:
|
|
case USCONST:
|
|
/* Look ahead for UESCAPE */
|
|
if (next_token == UESCAPE)
|
|
{
|
|
/* Yup, so get third token, which had better be SCONST */
|
|
const char *escstr;
|
|
|
|
/*
|
|
* Again save and restore lexer output variables around the
|
|
* call
|
|
*/
|
|
cur_yylval = base_yylval;
|
|
cur_yylloc = base_yylloc;
|
|
cur_yytext = base_yytext;
|
|
|
|
/* Get third token */
|
|
next_token = base_yylex();
|
|
|
|
if (next_token != SCONST)
|
|
mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
|
|
|
|
/*
|
|
* Save and check escape string, which the scanner returns
|
|
* with quotes
|
|
*/
|
|
escstr = base_yylval.str;
|
|
if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
|
|
mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
|
|
|
|
base_yylval = cur_yylval;
|
|
base_yylloc = cur_yylloc;
|
|
base_yytext = cur_yytext;
|
|
|
|
/* Combine 3 tokens into 1 */
|
|
base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
|
|
|
|
/* Clear have_lookahead, thereby consuming all three tokens */
|
|
have_lookahead = false;
|
|
}
|
|
|
|
if (cur_token == UIDENT)
|
|
cur_token = IDENT;
|
|
else if (cur_token == USCONST)
|
|
cur_token = SCONST;
|
|
break;
|
|
}
|
|
|
|
return cur_token;
|
|
}
|
|
|
|
/*
 * check_uescapechar() and ecpg_isspace() should match their equivalents
 * in pgc.l.
 */

/*
 * is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?
 *
 * Returns false for hexadecimal digits, '+', single and double quotes, and
 * any character ecpg_isspace() treats as whitespace; true for anything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
	return !(isxdigit(escape)
			 || escape == '+'
			 || escape == '\''
			 || escape == '"'
			 || ecpg_isspace(escape));
}

/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * The accepted set is exactly space, tab, newline, carriage return and
 * form feed; every other character (including vertical tab) yields false.
 */
static bool
ecpg_isspace(char ch)
{
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}