Clean up formatting.c's logic for matching constant strings.

seq_search(), which is used to match input substrings to constants such as month and day names, had a lot of bizarre and unnecessary behaviors. It was mostly possible to avert our eyes from that before, but we don't want to duplicate those behaviors in the upcoming patch to allow recognition of non-English month and day names. So it's time to clean this up. In particular: * seq_search scribbled on the input string, which is a pretty dangerous thing to do, especially in the badly underdocumented way it was done here. Fortunately the input string is a temporary copy, but that was being made three subroutine levels away, making it something easy to break accidentally. The behavior is externally visible nonetheless, in the form of odd case-folding in error reports about unrecognized month/day names. The scribbling is evidently being done to save a few calls to pg_tolower, but that's such a cheap function (at least for ASCII data) that it's pretty pointless to worry about. In HEAD I switched it to be pg_ascii_tolower to ensure it is cheap in all cases; but there are corner cases in Turkish where this'd change behavior, so leave it as pg_tolower in the back branches. * seq_search insisted on knowing the case form (all-upper, all-lower, or initcap) of the constant strings, so that it didn't have to case-fold them to perform case-insensitive comparisons. This likewise seems like excessive micro-optimization, given that pg_tolower is certainly very cheap for ASCII data. It seems unsafe to assume that we know the case form that will come out of pg_locale.c for localized month/day names, so it's better just to define the comparison rule as "downcase all strings before comparing". (The choice between downcasing and upcasing is arbitrary so far as English is concerned, but it might not be in other locales, so follow citext's lead here.) * seq_search also had a parameter that'd cause it to report a match after a maximum number of characters, even if the constant string were longer than that. This was not actually used because no caller passed a value small enough to cut off a comparison. Replicating that behavior for localized month/day names seems expensive as well as useless, so let's get rid of that too. * from_char_seq_search used the maximum-length parameter to truncate the input string in error reports about not finding a matching name. This leads to rather confusing reports in many cases. Worse, it is outright dangerous if the input string isn't all-ASCII, because we risk truncating the string in the middle of a multibyte character. That'd lead either to delivering an illegible error message to the client, or to encoding-conversion failures that obscure the actual data problem. Get rid of that in favor of truncating at whitespace if any (a suggestion due to Alvaro Herrera). In addition to fixing these things, I const-ified the input string pointers of DCH_from_char and its subroutines, to make sure there aren't any other scribbling-on-input problems. The risk of generating a badly-encoded error message seems like enough of a bug to justify back-patching, so patch all supported branches. Discussion: https://postgr.es/m/29432.1579731087@sss.pgh.pa.us
2020-01-23 13:42:09 -05:00 · 2020-01-23 13:42:09 -05:00 · 4c70098ffa
parent cd69ec66c8
commit 4c70098ffa
2 changed files with 85 additions and 91 deletions
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -88,6 +88,7 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "mb/pg_wchar.h"
+#include "parser/scansup.h"
 #include "utils/builtins.h"
 #include "utils/date.h"
 #include "utils/datetime.h"
@ -317,16 +318,6 @@ static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 * Flags & Options:
 * ----------
 */
-#define ONE_UPPER		1		/* Name */
-#define ALL_UPPER		2		/* NAME */
-#define ALL_LOWER		3		/* name */
-
-#define MAX_MONTH_LEN	9
-#define MAX_MON_LEN		3
-#define MAX_DAY_LEN		9
-#define MAX_DY_LEN		3
-#define MAX_RM_LEN		4
-
 #define TH_UPPER		1
 #define TH_LOWER		2

@ -1048,7 +1039,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw,

 static void DCH_to_char(FormatNode *node, bool is_interval,
 						TmToChar *in, char *out, Oid collid);
-static void DCH_from_char(FormatNode *node, char *in, TmFromChar *out,
+static void DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
 						  bool std, bool *have_error);

 #ifdef DEBUG_TO_FROM_CHAR
@ -1059,18 +1050,18 @@ static void dump_node(FormatNode *node, int max);
 static const char *get_th(char *num, int type);
 static char *str_numth(char *dest, char *num, int type);
 static int	adjust_partial_year_to_2020(int year);
-static int	strspace_len(char *str);
+static int	strspace_len(const char *str);
 static void from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode,
 							   bool *have_error);
 static void from_char_set_int(int *dest, const int value, const FormatNode *node,
 							  bool *have_error);
-static int	from_char_parse_int_len(int *dest, char **src, const int len,
+static int	from_char_parse_int_len(int *dest, const char **src, const int len,
 									FormatNode *node, bool *have_error);
-static int	from_char_parse_int(int *dest, char **src, FormatNode *node,
+static int	from_char_parse_int(int *dest, const char **src, FormatNode *node,
 								bool *have_error);
-static int	seq_search(char *name, const char *const *array, int type, int max, int *len);
-static int	from_char_seq_search(int *dest, char **src,
-								 const char *const *array, int type, int max,
+static int	seq_search(const char *name, const char *const *array, int *len);
+static int	from_char_seq_search(int *dest, const char **src,
+								 const char *const *array,
 								 FormatNode *node, bool *have_error);
 static void do_to_timestamp(text *date_txt, text *fmt, bool std,
 							struct pg_tm *tm, fsec_t *fsec, int *fprec,
@ -2259,7 +2250,7 @@ adjust_partial_year_to_2020(int year)


 static int
-strspace_len(char *str)
+strspace_len(const char *str)
 {
 	int			len = 0;

@ -2348,12 +2339,12 @@ on_error:
 * and -1 is returned.
 */
 static int
-from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node,
+from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode *node,
 						bool *have_error)
 {
 	long		result;
 	char		copy[DCH_MAX_ITEM_SIZ + 1];
-	char	   *init = *src;
+	const char *init = *src;
 	int			used;

 	/*
@ -2370,8 +2361,11 @@ from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node,
 		 * This node is in Fill Mode, or the next node is known to be a
 		 * non-digit value, so we just slurp as many characters as we can get.
 		 */
+		char	   *endptr;
+
 		errno = 0;
-		result = strtol(init, src, 10);
+		result = strtol(init, &endptr, 10);
+		*src = endptr;
 	}
 	else
 	{
@ -2448,76 +2442,61 @@ on_error:
 * required length explicitly.
 */
 static int
-from_char_parse_int(int *dest, char **src, FormatNode *node, bool *have_error)
+from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_error)
 {
 	return from_char_parse_int_len(dest, src, node->key->len, node, have_error);
 }

-/* ----------
- * Sequential search with to upper/lower conversion
- * ----------
+/*
+ * Sequentially search null-terminated "array" for a case-insensitive match
+ * to the initial character(s) of "name".
+ *
+ * Returns array index of match, or -1 for no match.
+ *
+ * *len is set to the length of the match, or 0 for no match.
+ *
+ * Case-insensitivity is defined per pg_ascii_tolower, so this is only
+ * suitable for comparisons to ASCII strings.
 */
 static int
-seq_search(char *name, const char *const *array, int type, int max, int *len)
+seq_search(const char *name, const char *const *array, int *len)
 {
-	const char *p;
+	unsigned char firstc;
 	const char *const *a;
-	char	   *n;
-	int			last,
-				i;

 	*len = 0;

+	/* empty string can't match anything */
 	if (!*name)
 		return -1;

-	/* set first char */
-	if (type == ONE_UPPER || type == ALL_UPPER)
-		*name = pg_toupper((unsigned char) *name);
-	else if (type == ALL_LOWER)
-		*name = pg_tolower((unsigned char) *name);
+	/* we handle first char specially to gain some speed */
+	firstc = pg_ascii_tolower((unsigned char) *name);

-	for (last = 0, a = array; *a != NULL; a++)
+	for (a = array; *a != NULL; a++)
 	{
+		const char *p;
+		const char *n;
+
 		/* compare first chars */
-		if (*name != **a)
+		if (pg_ascii_tolower((unsigned char) **a) != firstc)
 			continue;

-		for (i = 1, p = *a + 1, n = name + 1;; n++, p++, i++)
+		/* compare rest of string */
+		for (p = *a + 1, n = name + 1;; p++, n++)
 		{
-			/* search fragment (max) only */
-			if (max && i == max)
-			{
-				*len = i;
-				return a - array;
-			}
-			/* full size */
+			/* return success if we matched whole array entry */
 			if (*p == '\0')
 			{
-				*len = i;
+				*len = n - name;
 				return a - array;
 			}
-			/* Not found in array 'a' */
+			/* else, must have another character in "name" ... */
 			if (*n == '\0')
 				break;
-
-			/*
-			 * Convert (but convert new chars only)
-			 */
-			if (i > last)
-			{
-				if (type == ONE_UPPER || type == ALL_LOWER)
-					*n = pg_tolower((unsigned char) *n);
-				else if (type == ALL_UPPER)
-					*n = pg_toupper((unsigned char) *n);
-				last = i;
-			}
-
-#ifdef DEBUG_TO_FROM_CHAR
-			elog(DEBUG_elog_output, "N: %c, P: %c, A: %s (%s)",
-				 *n, *p, *a, name);
-#endif
-			if (*n != *p)
+			/* ... and it must match */
+			if (pg_ascii_tolower((unsigned char) *p) !=
+				pg_ascii_tolower((unsigned char) *n))
 				break;
 		}
 	}
@ -2526,8 +2505,8 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
 }

 /*
- * Perform a sequential search in 'array' for text matching the first 'max'
- * characters of the source string.
+ * Perform a sequential search in 'array' for an entry matching the first
+ * character(s) of the 'src' string case-insensitively.
 *
 * If a match is found, copy the array index of the match into the integer
 * pointed to by 'dest', advance 'src' to the end of the part of the string
@ -2535,20 +2514,35 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
 *
 * If the string doesn't match, throw an error if 'have_error' is NULL,
 * otherwise set '*have_error' and return -1.
+ *
+ * 'node' is used only for error reports: node->key->name identifies the
+ * field type we were searching for.
 */
 static int
-from_char_seq_search(int *dest, char **src, const char *const *array, int type,
-					 int max, FormatNode *node, bool *have_error)
+from_char_seq_search(int *dest, const char **src, const char *const *array,
+					 FormatNode *node, bool *have_error)
 {
 	int			len;

-	*dest = seq_search(*src, array, type, max, &len);
+	*dest = seq_search(*src, array, &len);
+
 	if (len <= 0)
 	{
-		char		copy[DCH_MAX_ITEM_SIZ + 1];
+		/*
+		 * In the error report, truncate the string at the next whitespace (if
+		 * any) to avoid including irrelevant data.
+		 */
+		char	   *copy = pstrdup(*src);
+		char	   *c;

-		Assert(max <= DCH_MAX_ITEM_SIZ);
-		strlcpy(copy, *src, max + 1);
+		for (c = copy; *c; c++)
+		{
+			if (scanner_isspace(*c))
+			{
+				*c = '\0';
+				break;
+			}
+		}

 		RETURN_ERROR(ereport(ERROR,
 							 (errcode(ERRCODE_INVALID_DATETIME_FORMAT),
@ -3166,11 +3160,11 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
 * ----------
 */
 static void
-DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
+DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			  bool *have_error)
 {
 	FormatNode *n;
-	char	   *s;
+	const char *s;
 	int			len,
 				value;
 	bool		fx_mode = std;
@ -3279,7 +3273,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_a_m:
 			case DCH_p_m:
 				from_char_seq_search(&value, &s, ampm_strings_long,
-									 ALL_UPPER, n->key->len, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->pm, value % 2, n, have_error);
 				CHECK_ERROR;
@ -3290,7 +3284,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_am:
 			case DCH_pm:
 				from_char_seq_search(&value, &s, ampm_strings,
-									 ALL_UPPER, n->key->len, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->pm, value % 2, n, have_error);
 				CHECK_ERROR;
@ -3403,7 +3397,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_a_d:
 			case DCH_b_c:
 				from_char_seq_search(&value, &s, adbc_strings_long,
-									 ALL_UPPER, n->key->len, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->bc, value % 2, n, have_error);
 				CHECK_ERROR;
@ -3413,7 +3407,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_ad:
 			case DCH_bc:
 				from_char_seq_search(&value, &s, adbc_strings,
-									 ALL_UPPER, n->key->len, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->bc, value % 2, n, have_error);
 				CHECK_ERROR;
@ -3421,8 +3415,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_MONTH:
 			case DCH_Month:
 			case DCH_month:
-				from_char_seq_search(&value, &s, months_full, ONE_UPPER,
-									 MAX_MONTH_LEN, n, have_error);
+				from_char_seq_search(&value, &s, months_full,
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, value + 1, n, have_error);
 				CHECK_ERROR;
@ -3430,8 +3424,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_MON:
 			case DCH_Mon:
 			case DCH_mon:
-				from_char_seq_search(&value, &s, months, ONE_UPPER,
-									 MAX_MON_LEN, n, have_error);
+				from_char_seq_search(&value, &s, months,
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, value + 1, n, have_error);
 				CHECK_ERROR;
@ -3444,8 +3438,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_DAY:
 			case DCH_Day:
 			case DCH_day:
-				from_char_seq_search(&value, &s, days, ONE_UPPER,
-									 MAX_DAY_LEN, n, have_error);
+				from_char_seq_search(&value, &s, days,
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->d, value, n, have_error);
 				CHECK_ERROR;
@ -3454,8 +3448,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 			case DCH_DY:
 			case DCH_Dy:
 			case DCH_dy:
-				from_char_seq_search(&value, &s, days, ONE_UPPER,
-									 MAX_DY_LEN, n, have_error);
+				from_char_seq_search(&value, &s, days,
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->d, value, n, have_error);
 				CHECK_ERROR;
@ -3572,7 +3566,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 				break;
 			case DCH_RM:
 				from_char_seq_search(&value, &s, rm_months_upper,
-									 ALL_UPPER, MAX_RM_LEN, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
 								  n, have_error);
@ -3580,7 +3574,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out, bool std,
 				break;
 			case DCH_rm:
 				from_char_seq_search(&value, &s, rm_months_lower,
-									 ALL_LOWER, MAX_RM_LEN, n, have_error);
+									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
 								  n, have_error);
--- a/src/test/regress/expected/horology.out
+++ b/src/test/regress/expected/horology.out
@ -2616,7 +2616,7 @@ SELECT to_timestamp('2000January09Sunday', 'YYYYFMMonthDDFMDay');
 (1 row)

 SELECT to_timestamp('97/Feb/16', 'YYMonDD');
-ERROR:  invalid value "/Fe" for "Mon"
+ERROR:  invalid value "/Feb/16" for "Mon"
 DETAIL:  The given value did not match any of the allowed values for this field.
 SELECT to_timestamp('97/Feb/16', 'YY:Mon:DD');
         to_timestamp         
@ -2941,7 +2941,7 @@ SELECT to_timestamp('2000 ++ JUN', 'YYYY  MON');
 (1 row)

 SELECT to_timestamp('2000 + + JUN', 'YYYY  MON');
-ERROR:  invalid value "+ J" for "MON"
+ERROR:  invalid value "+" for "MON"
 DETAIL:  The given value did not match any of the allowed values for this field.
 SELECT to_timestamp('2000 + + JUN', 'YYYY   MON');
         to_timestamp