Re: Allow to_date() and to_timestamp() to accept localized names

Tom Lane Sat, 29 Feb 2020 13:37:46 -0800

=?UTF-8?Q?Juan_Jos=C3=A9_Santamar=C3=ADa_Flecha?= <[email protected]> 
writes:
> On Tue, Jan 28, 2020 at 9:35 PM Peter Eisentraut <
> [email protected]> wrote:
>> So I'm withdrawing my concerns with respect to this patch.  As long as
>> it can do a roundtrip conversion with to_char(), it's fine.


> We can avoid issues with non injective case conversion languages with a
> double conversion, so both strings in the comparison end up in the same
> state.
> I propose an upper/lower conversion as in the attached patch.

This seems pretty reasonable to me, with a couple of caveats:

* I don't think it's appropriate to hard-wire DEFAULT_COLLATION_OID
as the collation to do case-folding with.  For to_date/to_timestamp,
we have collatable text input so we can rely on the function's input
collation instead, at the cost of having to pass down the collation
OID through a few layers of subroutines :-(.  For parse_datetime,
I punted for now and let it use DEFAULT_COLLATION_OID, because that's
currently only called by JSONB code that probably hasn't got a usable
input collation anyway (since jsonb isn't considered collatable).

* I think it's likely worthwhile to do a quick check for an exact
match before we go through all these case-folding pushups.  If the
expected use-case is reversing to_char() output, that will win all
the time.  We're probably ahead of the game even if it only matches
a few percent of the time.

Attached v10 incorporates those improvements, plus a bit of
polishing of docs and comments.  Barring objections, this seems
committable to me.

                        regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 28035f1..8b73e05 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -5968,7 +5968,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>TM</literal> prefix</entry>
-        <entry>translation mode (print localized day and month names based on
+        <entry>translation mode (use localized day and month names based on
          <xref linkend="guc-lc-time"/>)</entry>
         <entry><literal>TMMonth</literal></entry>
        </row>
@@ -5999,9 +5999,20 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
 
      <listitem>
       <para>
-       <literal>TM</literal> does not include trailing blanks.
-       <function>to_timestamp</function> and <function>to_date</function> ignore
-       the <literal>TM</literal> modifier.
+       <literal>TM</literal> suppresses trailing blanks whether or
+       not <literal>FM</literal> is specified.
+      </para>
+     </listitem>
+
+     <listitem>
+      <para>
+       <function>to_timestamp</function> and <function>to_date</function>
+       ignore letter case in the input; so for
+       example <literal>MON</literal>, <literal>Mon</literal>,
+       and <literal>mon</literal> all accept the same strings.  When using
+       the <literal>TM</literal> modifier, case-folding is done according to
+       the rules of the function's input collation (see
+       <xref linkend="collation"/>).
       </para>
      </listitem>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index d029468..db99a6b 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1038,7 +1038,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 static void DCH_to_char(FormatNode *node, bool is_interval,
 						TmToChar *in, char *out, Oid collid);
 static void DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
-						  bool std, bool *have_error);
+						  Oid collid, bool std, bool *have_error);
 
 #ifdef DEBUG_TO_FROM_CHAR
 static void dump_index(const KeyWord *k, const int *index);
@@ -1057,11 +1057,14 @@ static int	from_char_parse_int_len(int *dest, const char **src, const int len,
 									FormatNode *node, bool *have_error);
 static int	from_char_parse_int(int *dest, const char **src, FormatNode *node,
 								bool *have_error);
-static int	seq_search(const char *name, const char *const *array, int *len);
+static int	seq_search_ascii(const char *name, const char *const *array, int *len);
+static int	seq_search_localized(const char *name, char **array, int *len,
+								 Oid collid);
 static int	from_char_seq_search(int *dest, const char **src,
 								 const char *const *array,
+								 char **localized_array, Oid collid,
 								 FormatNode *node, bool *have_error);
-static void do_to_timestamp(text *date_txt, text *fmt, bool std,
+static void do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 							struct pg_tm *tm, fsec_t *fsec, int *fprec,
 							uint32 *flags, bool *have_error);
 static char *fill_str(char *str, int c, int max);
@@ -2457,7 +2460,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er
  * suitable for comparisons to ASCII strings.
  */
 static int
-seq_search(const char *name, const char *const *array, int *len)
+seq_search_ascii(const char *name, const char *const *array, int *len)
 {
 	unsigned char firstc;
 	const char *const *a;
@@ -2503,8 +2506,89 @@ seq_search(const char *name, const char *const *array, int *len)
 }
 
 /*
- * Perform a sequential search in 'array' for an entry matching the first
- * character(s) of the 'src' string case-insensitively.
+ * Sequentially search an array of possibly non-English words for
+ * a case-insensitive match to the initial character(s) of "name".
+ *
+ * This has the same API as seq_search_ascii(), but we use a more general
+ * case-folding transformation to achieve case-insensitivity.  Case folding
+ * is done per the rules of the collation identified by "collid".
+ *
+ * The array is treated as const, but we don't declare it that way because
+ * the arrays exported by pg_locale.c aren't const.
+ */
+static int
+seq_search_localized(const char *name, char **array, int *len, Oid collid)
+{
+	char	  **a;
+	char	   *upper_name;
+	char	   *lower_name;
+
+	*len = 0;
+
+	/* empty string can't match anything */
+	if (!*name)
+		return -1;
+
+	/*
+	 * The case-folding processing done below is fairly expensive, so before
+	 * doing that, make a quick pass to see if there is an exact match.
+	 */
+	for (a = array; *a != NULL; a++)
+	{
+		int			element_len = strlen(*a);
+
+		if (strncmp(name, *a, element_len) == 0)
+		{
+			*len = element_len;
+			return a - array;
+		}
+	}
+
+	/*
+	 * Fold to upper case, then to lower case, so that we can match reliably
+	 * even in languages in which case conversions are not injective.
+	 */
+	upper_name = str_toupper(unconstify(char *, name), strlen(name), collid);
+	lower_name = str_tolower(upper_name, strlen(upper_name), collid);
+	pfree(upper_name);
+
+	for (a = array; *a != NULL; a++)
+	{
+		char	   *upper_element;
+		char	   *lower_element;
+		int			element_len;
+
+		/* Likewise upper/lower-case array element */
+		upper_element = str_toupper(*a, strlen(*a), collid);
+		lower_element = str_tolower(upper_element, strlen(upper_element),
+									collid);
+		pfree(upper_element);
+		element_len = strlen(lower_element);
+
+		/* Match? */
+		if (strncmp(lower_name, lower_element, element_len) == 0)
+		{
+			*len = element_len;
+			pfree(lower_element);
+			pfree(lower_name);
+			return a - array;
+		}
+		pfree(lower_element);
+	}
+
+	pfree(lower_name);
+	return -1;
+}
+
+/*
+ * Perform a sequential search in 'array' (or 'localized_array', if that's
+ * not NULL) for an entry matching the first character(s) of the 'src'
+ * string case-insensitively.
+ *
+ * The 'array' is presumed to be English words (all-ASCII), but
+ * if 'localized_array' is supplied, that might be non-English
+ * so we need a more expensive case-folding transformation
+ * (which will follow the rules of the collation 'collid').
  *
  * If a match is found, copy the array index of the match into the integer
  * pointed to by 'dest', advance 'src' to the end of the part of the string
@@ -2518,11 +2602,15 @@ seq_search(const char *name, const char *const *array, int *len)
  */
 static int
 from_char_seq_search(int *dest, const char **src, const char *const *array,
+					 char **localized_array, Oid collid,
 					 FormatNode *node, bool *have_error)
 {
 	int			len;
 
-	*dest = seq_search(*src, array, &len);
+	if (localized_array == NULL)
+		*dest = seq_search_ascii(*src, array, &len);
+	else
+		*dest = seq_search_localized(*src, localized_array, &len, collid);
 
 	if (len <= 0)
 	{
@@ -3147,19 +3235,20 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
 	*s = '\0';
 }
 
-/* ----------
- * Process a string as denoted by a list of FormatNodes.
+/*
+ * Process the string 'in' as denoted by the array of FormatNodes 'node[]'.
  * The TmFromChar struct pointed to by 'out' is populated with the results.
  *
+ * 'collid' identifies the collation to use, if needed.
+ * 'std' specifies standard parsing mode.
+ * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
+ *
  * Note: we currently don't have any to_interval() function, so there
  * is no need here for INVALID_FOR_INTERVAL checks.
- *
- * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
- * ----------
  */
 static void
-DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
-			  bool *have_error)
+DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
+			  Oid collid, bool std, bool *have_error)
 {
 	FormatNode *n;
 	const char *s;
@@ -3170,6 +3259,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 	/* number of extra skipped characters (more than given in format string) */
 	int			extra_skip = 0;
 
+	/* cache localized days and months */
+	cache_locale_time();
+
 	for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
 	{
 		/*
@@ -3271,6 +3363,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_a_m:
 			case DCH_p_m:
 				from_char_seq_search(&value, &s, ampm_strings_long,
+									 NULL, InvalidOid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3282,6 +3375,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_am:
 			case DCH_pm:
 				from_char_seq_search(&value, &s, ampm_strings,
+									 NULL, InvalidOid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3395,6 +3489,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_a_d:
 			case DCH_b_c:
 				from_char_seq_search(&value, &s, adbc_strings_long,
+									 NULL, InvalidOid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3405,6 +3500,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_ad:
 			case DCH_bc:
 				from_char_seq_search(&value, &s, adbc_strings,
+									 NULL, InvalidOid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3414,6 +3510,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_Month:
 			case DCH_month:
 				from_char_seq_search(&value, &s, months_full,
+									 S_TM(n->suffix) ? localized_full_months : NULL,
+									 collid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3423,6 +3521,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_Mon:
 			case DCH_mon:
 				from_char_seq_search(&value, &s, months,
+									 S_TM(n->suffix) ? localized_abbrev_months : NULL,
+									 collid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3437,6 +3537,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_Day:
 			case DCH_day:
 				from_char_seq_search(&value, &s, days,
+									 S_TM(n->suffix) ? localized_full_days : NULL,
+									 collid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->d, value, n, have_error);
@@ -3447,6 +3549,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_Dy:
 			case DCH_dy:
 				from_char_seq_search(&value, &s, days_short,
+									 S_TM(n->suffix) ? localized_abbrev_days : NULL,
+									 collid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->d, value, n, have_error);
@@ -3565,6 +3669,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
 			case DCH_RM:
 			case DCH_rm:
 				from_char_seq_search(&value, &s, rm_months_lower,
+									 NULL, InvalidOid,
 									 n, have_error);
 				CHECK_ERROR;
 				from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
@@ -4031,13 +4136,15 @@ to_timestamp(PG_FUNCTION_ARGS)
 {
 	text	   *date_txt = PG_GETARG_TEXT_PP(0);
 	text	   *fmt = PG_GETARG_TEXT_PP(1);
+	Oid			collid = PG_GET_COLLATION();
 	Timestamp	result;
 	int			tz;
 	struct pg_tm tm;
 	fsec_t		fsec;
 	int			fprec;
 
-	do_to_timestamp(date_txt, fmt, false, &tm, &fsec, &fprec, NULL, NULL);
+	do_to_timestamp(date_txt, fmt, collid, false,
+					&tm, &fsec, &fprec, NULL, NULL);
 
 	/* Use the specified time zone, if any. */
 	if (tm.tm_zone)
@@ -4072,11 +4179,13 @@ to_date(PG_FUNCTION_ARGS)
 {
 	text	   *date_txt = PG_GETARG_TEXT_PP(0);
 	text	   *fmt = PG_GETARG_TEXT_PP(1);
+	Oid			collid = PG_GET_COLLATION();
 	DateADT		result;
 	struct pg_tm tm;
 	fsec_t		fsec;
 
-	do_to_timestamp(date_txt, fmt, false, &tm, &fsec, NULL, NULL, NULL);
+	do_to_timestamp(date_txt, fmt, collid, false,
+					&tm, &fsec, NULL, NULL, NULL);
 
 	/* Prevent overflow in Julian-day routines */
 	if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday))
@@ -4116,7 +4225,13 @@ parse_datetime(text *date_txt, text *fmt, bool strict, Oid *typid,
 	int			fprec;
 	uint32		flags;
 
-	do_to_timestamp(date_txt, fmt, strict, &tm, &fsec, &fprec, &flags, have_error);
+	/*
+	 * At some point we might wish to have callers supply the collation to
+	 * use, but right now it's not clear that they'd be able to do better than
+	 * DEFAULT_COLLATION_OID anyway.
+	 */
+	do_to_timestamp(date_txt, fmt, DEFAULT_COLLATION_OID, strict,
+					&tm, &fsec, &fprec, &flags, have_error);
 	CHECK_ERROR;
 
 	*typmod = fprec ? fprec : -1;	/* fractional part precision */
@@ -4278,21 +4393,21 @@ on_error:
  * Parse the 'date_txt' according to 'fmt', return results as a struct pg_tm,
  * fractional seconds, and fractional precision.
  *
+ * 'collid' identifies the collation to use, if needed.
+ * 'std' specifies standard parsing mode.
+ * Bit mask of date/time/zone components found in 'fmt' is returned in 'flags',
+ * if that is not NULL.
+ * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
+ *
  * We parse 'fmt' into a list of FormatNodes, which is then passed to
  * DCH_from_char to populate a TmFromChar with the parsed contents of
  * 'date_txt'.
  *
  * The TmFromChar is then analysed and converted into the final results in
- * struct 'tm' and 'fsec'.
- *
- * Bit mask of date/time/zone components found in 'fmt' is returned in 'flags'.
- *
- * 'std' specifies standard parsing mode.
- *
- * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set.
+ * struct 'tm', 'fsec', and 'fprec'.
  */
 static void
-do_to_timestamp(text *date_txt, text *fmt, bool std,
+do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 				struct pg_tm *tm, fsec_t *fsec, int *fprec,
 				uint32 *flags, bool *have_error)
 {
@@ -4352,7 +4467,7 @@ do_to_timestamp(text *date_txt, text *fmt, bool std,
 		/* dump_index(DCH_keywords, DCH_index); */
 #endif
 
-		DCH_from_char(format, date_str, &tmfc, std, have_error);
+		DCH_from_char(format, date_str, &tmfc, collid, std, have_error);
 		CHECK_ERROR;
 
 		pfree(fmt_str);
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 25fb7e2..64fd3ae 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -96,11 +96,17 @@ char	   *locale_monetary;
 char	   *locale_numeric;
 char	   *locale_time;
 
-/* lc_time localization cache */
-char	   *localized_abbrev_days[7];
-char	   *localized_full_days[7];
-char	   *localized_abbrev_months[12];
-char	   *localized_full_months[12];
+/*
+ * lc_time localization cache.
+ *
+ * We use only the first 7 or 12 entries of these arrays.  The last array
+ * element is left as NULL for the convenience of outside code that wants
+ * to sequentially scan these arrays.
+ */
+char	   *localized_abbrev_days[7 + 1];
+char	   *localized_full_days[7 + 1];
+char	   *localized_abbrev_months[12 + 1];
+char	   *localized_full_months[12 + 1];
 
 /* indicates whether locale information cache is valid */
 static bool CurrentLocaleConvValid = false;
@@ -922,6 +928,8 @@ cache_locale_time(void)
 		cache_single_string(&localized_full_days[i], bufptr, encoding);
 		bufptr += MAX_L10N_DATA;
 	}
+	localized_abbrev_days[7] = NULL;
+	localized_full_days[7] = NULL;
 
 	/* localized months */
 	for (i = 0; i < 12; i++)
@@ -931,6 +939,8 @@ cache_locale_time(void)
 		cache_single_string(&localized_full_months[i], bufptr, encoding);
 		bufptr += MAX_L10N_DATA;
 	}
+	localized_abbrev_months[12] = NULL;
+	localized_full_months[12] = NULL;
 
 	CurrentLCTimeValid = true;
 }
diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out
index 37c6add..f06ae54 100644
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -461,6 +461,22 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
  01 NİS 2010
 (1 row)
 
+-- to_date
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
+  to_date   
+------------
+ 02-01-2010
+(1 row)
+
+SELECT to_date('01 Şub 2010', 'DD TMMON YYYY');
+  to_date   
+------------
+ 02-01-2010
+(1 row)
+
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+ERROR:  invalid value "1234567890ab" for "MONTH"
+DETAIL:  The given value did not match any of the allowed values for this field.
 -- backwards parsing
 CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql
index 8c26f16..cbbd220 100644
--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -182,6 +182,12 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR");
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
 
+-- to_date
+
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY');
+SELECT to_date('01 Şub 2010', 'DD TMMON YYYY');
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+
 
 -- backwards parsing

Re: Allow to_date() and to_timestamp() to accept localized names

Reply via email to