Add current substring regular expression syntax

SQL:1999 had syntax

    SUBSTRING(text FROM pattern FOR escapechar)

but this was replaced in SQL:2003 by the more clear

    SUBSTRING(text SIMILAR pattern ESCAPE escapechar)

but this was never implemented in PostgreSQL.  This patch adds that
new syntax as an alternative in the parser, and updates documentation
and tests to indicate that this is the preferred alternative now.

Reviewed-by: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Vik Fearing <vik@postgresfriends.org>
Reviewed-by: Fabien COELHO <coelho@cri.ensmp.fr>
Discussion: https://www.postgresql.org/message-id/flat/a15db31c-d0f8-8ce0-9039-578a31758adb%402ndquadrant.com
This commit is contained in:
Peter Eisentraut 2020-06-29 11:04:42 +02:00
parent aafefb4dcb
commit 78c887679d
8 changed files with 77 additions and 34 deletions

View File

@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
t
(1 row)
SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
t
---
t

View File

@ -1602,7 +1602,7 @@ SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
t
(1 row)
SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
t
---
t

View File

@ -564,7 +564,7 @@ SELECT substring('alphabet'::citext, 3, 2) = 'ph' AS t;
SELECT substring('Thomas'::citext from 2 for 3) = 'hom' AS t;
SELECT substring('Thomas'::citext from 2) = 'homas' AS t;
SELECT substring('Thomas'::citext from '...$') = 'mas' AS t;
SELECT substring('Thomas'::citext from '%#"o_a#"_' for '#') = 'oma' AS t;
SELECT substring('Thomas'::citext similar '%#"o_a#"_' escape '#') = 'oma' AS t;
SELECT trim(' trim '::citext) = 'trim' AS t;
SELECT trim('xxxxxtrimxxxx'::citext, 'x'::citext) = 'trim' AS t;

View File

@ -2669,15 +2669,21 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
<row>
<entry role="func_table_entry"><para role="func_signature">
<function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>SIMILAR</literal> <parameter>pattern</parameter> <type>text</type> <literal>ESCAPE</literal> <parameter>escape</parameter> <type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para role="func_signature">
<function>substring</function> ( <parameter>string</parameter> <type>text</type> <literal>FROM</literal> <parameter>pattern</parameter> <type>text</type> <literal>FOR</literal> <parameter>escape</parameter> <type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para>
Extracts substring matching <acronym>SQL</acronym> regular expression;
see <xref linkend="functions-similarto-regexp"/>.
see <xref linkend="functions-similarto-regexp"/>. The first form has
been specified since SQL:2003; the second form was only in SQL:1999
and should be considered obsolete.
</para>
<para>
<literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal>
<literal>substring('Thomas' similar '%#"o_a#"_' escape '#')</literal>
<returnvalue>oma</returnvalue>
</para></entry>
</row>
@ -5160,7 +5166,11 @@ cast(-44 as bit(12)) <lineannotation>111111010100</lineannotation>
The <function>substring</function> function with three parameters
provides extraction of a substring that matches an SQL
regular expression pattern. The function can be written according
to SQL99 syntax:
to standard SQL syntax:
<synopsis>
substring(<replaceable>string</replaceable> similar <replaceable>pattern</replaceable> escape <replaceable>escape-character</replaceable>)
</synopsis>
or using the now obsolete SQL:1999 syntax:
<synopsis>
substring(<replaceable>string</replaceable> from <replaceable>pattern</replaceable> for <replaceable>escape-character</replaceable>)
</synopsis>
@ -5201,8 +5211,8 @@ substring(<replaceable>string</replaceable>, <replaceable>pattern</replaceable>,
<para>
Some examples, with <literal>#&quot;</literal> delimiting the return string:
<programlisting>
substring('foobar' from '%#"o_b#"%' for '#') <lineannotation>oob</lineannotation>
substring('foobar' from '#"o_b#"%' for '#') <lineannotation>NULL</lineannotation>
substring('foobar' similar '%#"o_b#"%' escape '#') <lineannotation>oob</lineannotation>
substring('foobar' similar '#"o_b#"%' escape '#') <lineannotation>NULL</lineannotation>
</programlisting>
</para>
</sect2>

View File

@ -182,7 +182,7 @@ CREATE FUNCTION _pg_interval_type(typid oid, mod int4) RETURNS text
AS
$$SELECT
CASE WHEN $1 IN (1186) /* interval */
THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) from 'interval[()0-9]* #"%#"' for '#'))
THEN pg_catalog.upper(substring(pg_catalog.format_type($1, $2) similar 'interval[()0-9]* #"%#"' escape '#'))
ELSE null
END$$;

View File

@ -14451,7 +14451,27 @@ position_list:
| /*EMPTY*/ { $$ = NIL; }
;
/* SUBSTRING() arguments */
/*
* SUBSTRING() arguments
*
* Note that SQL:1999 has both
*
* text FROM int FOR int
*
* and
*
* text FROM pattern FOR escape
*
* In the parser we map them both to a call to the substring() function and
* rely on type resolution to pick the right one.
*
* In SQL:2003, the second variant was changed to
*
* text SIMILAR pattern ESCAPE escape
*
* We could in theory map that to a different function internally, but
* since we still support the SQL:1999 version, we don't.
*/
substr_list:
a_expr FROM a_expr FOR a_expr
{
@ -14483,6 +14503,10 @@ substr_list:
makeTypeCast($3,
SystemTypeName("int4"), -1));
}
| a_expr SIMILAR a_expr ESCAPE a_expr
{
$$ = list_make3($1, $3, $5);
}
/*
* We also want to support generic substring functions that
* accept the usual generic list of arguments.

View File

@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
(1 row)
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
bcd
-----
bcd
(1 row)
-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd
-----
@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
(1 row)
-- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- Null inputs should return NULL
SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
True
------
t
(1 row)
SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- The first and last parts should act non-greedy
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
(1 row)
-- Vertical bar in any part affects only that part
SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-- Can't have more than two part separators
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
ERROR: SQL regular expression may not contain more than two escape-double-quote separators
CONTEXT: SQL function "substring" statement 1
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
bcdefg
--------
bcdefg
(1 row)
SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg

View File

@ -132,31 +132,33 @@ SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS "34567890";
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL
SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
-- Null inputs should return NULL
SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
-- The first and last parts should act non-greedy
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
-- Vertical bar in any part affects only that part
SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
-- Can't have more than two part separators
SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
-- substring() with just two arguments is not allowed by SQL spec;
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL