diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index c878a0ba4d..f2e545ed87 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -4121,6 +4121,14 @@ cast(-44 as bit(12)) 111111010100 special meaning of underscore and percent signs in the pattern. + + According to the SQL standard, omitting ESCAPE + means there is no escape character (rather than defaulting to a + backslash), and a zero-length ESCAPE value is + disallowed. PostgreSQL's behavior in + this regard is therefore slightly nonstandard. + + The key word ILIKE can be used instead of LIKE to make the match case-insensitive according @@ -4139,9 +4147,9 @@ cast(-44 as bit(12)) 111111010100 - There is also the prefix operator ^@ and corresponding - starts_with function which covers cases when only - searching by beginning of the string is needed. + Also see the prefix operator ^@ and corresponding + starts_with function, which are useful in cases + where simply matching the beginning of a string is needed. @@ -4172,7 +4180,7 @@ cast(-44 as bit(12)) 111111010100 It is similar to LIKE, except that it interprets the pattern using the SQL standard's definition of a regular expression. SQL regular expressions are a curious cross - between LIKE notation and common regular + between LIKE notation and common (POSIX) regular expression notation. @@ -4256,18 +4264,38 @@ cast(-44 as bit(12)) 111111010100 - As with LIKE, a backslash disables the special meaning - of any of these metacharacters; or a different escape character can - be specified with ESCAPE. + As with LIKE, a backslash disables the special + meaning of any of these metacharacters. A different escape character + can be specified with ESCAPE, or the escape + capability can be disabled by writing ESCAPE ''. + + + + According to the SQL standard, omitting ESCAPE + means there is no escape character (rather than defaulting to a + backslash), and a zero-length ESCAPE value is + disallowed. PostgreSQL's behavior in + this regard is therefore slightly nonstandard. + + + + Another nonstandard extension is that following the escape character + with a letter or digit provides access to the escape sequences + defined for POSIX regular expressions; see + , + , and + below. Some examples: -'abc' SIMILAR TO 'abc' true -'abc' SIMILAR TO 'a' false -'abc' SIMILAR TO '%(b|d)%' true -'abc' SIMILAR TO '(b|c)%' false +'abc' SIMILAR TO 'abc' true +'abc' SIMILAR TO 'a' false +'abc' SIMILAR TO '%(b|d)%' true +'abc' SIMILAR TO '(b|c)%' false +'-abc-' SIMILAR TO '%\mabc\M%' true +'xabcy' SIMILAR TO '%\mabc\M%' false diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c97bb367f8..a954acf509 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -13073,15 +13073,15 @@ a_expr: c_expr { $$ = $1; } | a_expr SIMILAR TO a_expr %prec SIMILAR { - FuncCall *n = makeFuncCall(SystemFuncName("similar_escape"), - list_make2($4, makeNullAConst(-1)), + FuncCall *n = makeFuncCall(SystemFuncName("similar_to_escape"), + list_make1($4), @2); $$ = (Node *) makeSimpleA_Expr(AEXPR_SIMILAR, "~", $1, (Node *) n, @2); } | a_expr SIMILAR TO a_expr ESCAPE a_expr %prec SIMILAR { - FuncCall *n = makeFuncCall(SystemFuncName("similar_escape"), + FuncCall *n = makeFuncCall(SystemFuncName("similar_to_escape"), list_make2($4, $6), @2); $$ = (Node *) makeSimpleA_Expr(AEXPR_SIMILAR, "~", @@ -13089,15 +13089,15 @@ a_expr: c_expr { $$ = $1; } } | a_expr NOT_LA SIMILAR TO a_expr %prec NOT_LA { - FuncCall *n = makeFuncCall(SystemFuncName("similar_escape"), - list_make2($5, makeNullAConst(-1)), + FuncCall *n = makeFuncCall(SystemFuncName("similar_to_escape"), + list_make1($5), @2); $$ = (Node *) makeSimpleA_Expr(AEXPR_SIMILAR, "!~", $1, (Node *) n, @2); } | a_expr NOT_LA SIMILAR TO a_expr ESCAPE a_expr %prec NOT_LA { - FuncCall *n = makeFuncCall(SystemFuncName("similar_escape"), + FuncCall *n = makeFuncCall(SystemFuncName("similar_to_escape"), list_make2($5, $7), @2); $$ = (Node *) makeSimpleA_Expr(AEXPR_SIMILAR, "!~", @@ -14323,9 +14323,9 @@ subquery_Op: | NOT_LA ILIKE { $$ = list_make1(makeString("!~~*")); } /* cannot put SIMILAR TO here, because SIMILAR TO is a hack. - * the regular expression is preprocessed by a function (similar_escape), + * the regular expression is preprocessed by a function (similar_to_escape), * and the ~ operator for posix regular expressions is used. - * x SIMILAR TO y -> x ~ similar_escape(y) + * x SIMILAR TO y -> x ~ similar_to_escape(y) * this transformation is made on the fly by the parser upwards. * however the SubLink structure which handles any/some/all stuff * is not ready for such a thing. diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 90a9197792..3d38aef820 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -654,15 +654,18 @@ textregexreplace(PG_FUNCTION_ARGS) } /* - * similar_escape() - * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by - * our regexp engine. + * similar_to_escape(), similar_escape() + * + * Convert a SQL "SIMILAR TO" regexp pattern to POSIX style, so it can be + * used by our regexp engine. + * + * similar_escape_internal() is the common workhorse for three SQL-exposed + * functions. esc_text can be passed as NULL to select the default escape + * (which is '\'), or as an empty string to select no escape character. */ -Datum -similar_escape(PG_FUNCTION_ARGS) +static text * +similar_escape_internal(text *pat_text, text *esc_text) { - text *pat_text; - text *esc_text; text *result; char *p, *e, @@ -673,13 +676,9 @@ similar_escape(PG_FUNCTION_ARGS) bool incharclass = false; int nquotes = 0; - /* This function is not strict, so must test explicitly */ - if (PG_ARGISNULL(0)) - PG_RETURN_NULL(); - pat_text = PG_GETARG_TEXT_PP(0); p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); - if (PG_ARGISNULL(1)) + if (esc_text == NULL) { /* No ESCAPE clause provided; default to backslash as escape */ e = "\\"; @@ -687,12 +686,11 @@ similar_escape(PG_FUNCTION_ARGS) } else { - esc_text = PG_GETARG_TEXT_PP(1); e = VARDATA_ANY(esc_text); elen = VARSIZE_ANY_EXHDR(esc_text); if (elen == 0) e = NULL; /* no escape character */ - else + else if (elen > 1) { int escape_mblen = pg_mbstrlen_with_len(e, elen); @@ -898,6 +896,65 @@ similar_escape(PG_FUNCTION_ARGS) SET_VARSIZE(result, r - ((char *) result)); + return result; +} + +/* + * similar_to_escape(pattern, escape) + */ +Datum +similar_to_escape_2(PG_FUNCTION_ARGS) +{ + text *pat_text = PG_GETARG_TEXT_PP(0); + text *esc_text = PG_GETARG_TEXT_PP(1); + text *result; + + result = similar_escape_internal(pat_text, esc_text); + + PG_RETURN_TEXT_P(result); +} + +/* + * similar_to_escape(pattern) + * Inserts a default escape character. + */ +Datum +similar_to_escape_1(PG_FUNCTION_ARGS) +{ + text *pat_text = PG_GETARG_TEXT_PP(0); + text *result; + + result = similar_escape_internal(pat_text, NULL); + + PG_RETURN_TEXT_P(result); +} + +/* + * similar_escape(pattern, escape) + * + * Legacy function for compatibility with views stored using the + * pre-v13 expansion of SIMILAR TO. Unlike the above functions, this + * is non-strict, which leads to not-per-spec handling of "ESCAPE NULL". + */ +Datum +similar_escape(PG_FUNCTION_ARGS) +{ + text *pat_text; + text *esc_text; + text *result; + + /* This function is not strict, so must test explicitly */ + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + pat_text = PG_GETARG_TEXT_PP(0); + + if (PG_ARGISNULL(1)) + esc_text = NULL; /* use default escape character */ + else + esc_text = PG_GETARG_TEXT_PP(1); + + result = similar_escape_internal(pat_text, esc_text); + PG_RETURN_TEXT_P(result); } diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 80def7d401..00cc71dcd1 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201908012 +#define CATALOG_VERSION_NO 201909071 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index cf1f409351..e6645f139c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3346,9 +3346,15 @@ proname => 'repeat', prorettype => 'text', proargtypes => 'text int4', prosrc => 'repeat' }, -{ oid => '1623', descr => 'convert SQL99 regexp pattern to POSIX style', +{ oid => '1623', descr => 'convert SQL regexp pattern to POSIX style', proname => 'similar_escape', proisstrict => 'f', prorettype => 'text', proargtypes => 'text text', prosrc => 'similar_escape' }, +{ oid => '1986', descr => 'convert SQL regexp pattern to POSIX style', + proname => 'similar_to_escape', prorettype => 'text', + proargtypes => 'text text', prosrc => 'similar_to_escape_2' }, +{ oid => '1987', descr => 'convert SQL regexp pattern to POSIX style', + proname => 'similar_to_escape', prorettype => 'text', proargtypes => 'text', + prosrc => 'similar_to_escape_1' }, { oid => '1624', proname => 'mul_d_interval', prorettype => 'interval', @@ -5771,10 +5777,10 @@ { oid => '2073', descr => 'extract text matching regular expression', proname => 'substring', prorettype => 'text', proargtypes => 'text text', prosrc => 'textregexsubstr' }, -{ oid => '2074', descr => 'extract text matching SQL99 regular expression', +{ oid => '2074', descr => 'extract text matching SQL regular expression', proname => 'substring', prolang => 'sql', prorettype => 'text', proargtypes => 'text text text', - prosrc => 'select pg_catalog.substring($1, pg_catalog.similar_escape($2, $3))' }, + prosrc => 'select pg_catalog.substring($1, pg_catalog.similar_to_escape($2, $3))' }, { oid => '2075', descr => 'convert int8 to bitstring', proname => 'bit', prorettype => 'bit', proargtypes => 'int8 int4', @@ -10554,8 +10560,7 @@ proparallel => 'r', prorettype => 'void', proargtypes => '', prosrc => 'pg_replication_origin_xact_reset' }, -{ oid => '6012', - descr => 'advance replication origin to specific location', +{ oid => '6012', descr => 'advance replication origin to specific location', proname => 'pg_replication_origin_advance', provolatile => 'v', proparallel => 'u', prorettype => 'void', proargtypes => 'text pg_lsn', prosrc => 'pg_replication_origin_advance' }, diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 486c00b3b3..2483966576 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -410,7 +410,56 @@ SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; cde (1 row) --- PostgreSQL extension to allow using back reference in replace string; +-- Check behavior of SIMILAR TO, which uses largely the same regexp variant +SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; + true +------ + t +(1 row) + +SELECT 'abcdefg' SIMILAR TO 'bcd%' AS false; + false +------- + f +(1 row) + +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '#' AS false; + false +------- + f +(1 row) + +SELECT 'abcd%' SIMILAR TO '_bcd#%' ESCAPE '#' AS true; + true +------ + t +(1 row) + +-- Postgres uses '\' as the default escape character, which is not per spec +SELECT 'abcdefg' SIMILAR TO '_bcd\%' AS false; + false +------- + f +(1 row) + +-- and an empty string to mean "no escape", which is also not per spec +SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; + true +------ + t +(1 row) + +-- these behaviors are per spec, though: +SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; + null +------ + +(1 row) + +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; +ERROR: invalid escape string +HINT: Escape string must be empty or one character. +-- Test back reference in regexp_replace SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); regexp_replace ---------------- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 5744c9f800..b5e75c344f 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -144,7 +144,20 @@ SELECT SUBSTRING('abcdefg' FROM 'c.e') AS "cde"; -- With a parenthesized subexpression, return only what matches the subexpr SELECT SUBSTRING('abcdefg' FROM 'b(.*)f') AS "cde"; --- PostgreSQL extension to allow using back reference in replace string; +-- Check behavior of SIMILAR TO, which uses largely the same regexp variant +SELECT 'abcdefg' SIMILAR TO '_bcd%' AS true; +SELECT 'abcdefg' SIMILAR TO 'bcd%' AS false; +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '#' AS false; +SELECT 'abcd%' SIMILAR TO '_bcd#%' ESCAPE '#' AS true; +-- Postgres uses '\' as the default escape character, which is not per spec +SELECT 'abcdefg' SIMILAR TO '_bcd\%' AS false; +-- and an empty string to mean "no escape", which is also not per spec +SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; +-- these behaviors are per spec, though: +SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; +SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; + +-- Test back reference in regexp_replace SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); SELECT regexp_replace('AAA', '^|$', 'Z', 'g');