Hello,

On 14.07.2016 12:16, Pavel Stehule wrote:

last point was discussed in thread related to to_date_valid function.

Regards

Pavel

Thank you.

Here is my patch. It is a proof of concept.

Date/Time Formatting
--------------------

There are changes in date/time formatting rules:

- now to_timestamp() and to_date() skip spaces in the input string and in the formatting string unless FX option is used, as Amul Sul wrote on first message of this thread. But Ex.2 gives an error now with this patch (should we fix this too?).

- in the code space characters and separator characters have different types of FormatNode. Separator characters are characters ',', '-', '.', '/' and ':'. This is done to have different rules of formatting to space and separator characters. If FX option isn't used then PostgreSQL do not insist that separator in the formatting string should match separator in the formatting string. But count of separators should be equal with or without FX option.

- now PostgreSQL check is there a closing quote. Otherwise the error is raised.

Still PostgreSQL do not insist that text character in the formatting string should match text character in the input string. It is not obvious if this should be fixed. Because we may have different character case or character with accent mark or without accent mark. But I suppose that it is not right just check text character count. For example, there is unicode version of space character U+00A0.

Code changes
------------

- new defines:

#define NODE_TYPE_SEPARATOR     4
#define NODE_TYPE_SPACE         5

- now DCH_cache_getnew() is called after parse_format(). Because now parse_format() can raise an error and in the next attempt DCH_cache_search() could return broken cache entry.


This patch do not handle all noticed issues in this thread, since still there is not consensus about them. So this patch in a proof of concept status and it can be changed.

Of course this patch can be completely wrong. But it tries to introduce more formal rules for formatting.

I will be grateful for notes and remarks.

--
Artur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 7830334..5656245 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -6083,9 +6083,10 @@ SELECT regexp_matches('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      <listitem>
       <para>
        <function>to_timestamp</function> and <function>to_date</function>
-       skip multiple blank spaces in the input string unless the
-       <literal>FX</literal> option is used. For example,
-       <literal>to_timestamp('2000&nbsp;&nbsp;&nbsp;&nbsp;JUN', 'YYYY MON')</literal> works, but
+       skip multiple blank spaces in the input string and in the formatting
+       string unless the <literal>FX</literal> option is used. For example,
+       <literal>to_timestamp('2000&nbsp;&nbsp;&nbsp;&nbsp;JUN', 'YYYY MON')</literal>
+       and <literal>to_timestamp('2000&nbsp;JUN', 'YYYY&nbsp;&nbsp;&nbsp;&nbsp;MON')</literal> work, but
        <literal>to_timestamp('2000&nbsp;&nbsp;&nbsp;&nbsp;JUN', 'FXYYYY MON')</literal> returns an error
        because <function>to_timestamp</function> expects one space only.
        <literal>FX</literal> must be specified as the first item in
@@ -6121,6 +6122,19 @@ SELECT regexp_matches('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        skips two input characters.
       </para>
      </listitem>
+     
+     <listitem>
+      <para>
+       In <function>to_date</> and <function>to_timestamp</> separator
+       characters <literal>','</literal>, <literal>'-'</literal>,
+       <literal>'.'</literal>, <literal>'/'</literal> and <literal>':'</literal>
+       outside of double-quoted strings skip the number of input characters
+       contained in the string unless the <literal>FX</literal> option is used.
+       If <literal>FX</literal> option is specified then consumed separator
+       character in the formatting string must match the separator character
+       in the input string.
+      </para>
+     </listitem>
 
      <listitem>
       <para>
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index bbd97dc..ef39df9 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -169,6 +169,8 @@ struct FormatNode
 #define NODE_TYPE_END		1
 #define NODE_TYPE_ACTION	2
 #define NODE_TYPE_CHAR		3
+#define NODE_TYPE_SEPARATOR	4
+#define NODE_TYPE_SPACE		5
 
 #define SUFFTYPE_PREFIX		1
 #define SUFFTYPE_POSTFIX	2
@@ -947,6 +949,7 @@ typedef struct NUMProc
 static const KeyWord *index_seq_search(char *str, const KeyWord *kw,
 				 const int *index);
 static const KeySuffix *suff_search(char *str, const KeySuffix *suf, int type);
+static bool is_char_separator(char *str);
 static void NUMDesc_prepare(NUMDesc *num, FormatNode *n);
 static void parse_format(FormatNode *node, char *str, const KeyWord *kw,
 			 const KeySuffix *suf, const int *index, int ver, NUMDesc *Num);
@@ -963,7 +966,6 @@ static void dump_node(FormatNode *node, int max);
 static const char *get_th(char *num, int type);
 static char *str_numth(char *dest, char *num, int type);
 static int	adjust_partial_year_to_2020(int year);
-static int	strspace_len(char *str);
 static void from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode);
 static void from_char_set_int(int *dest, const int value, const FormatNode *node);
 static int	from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node);
@@ -1036,6 +1038,16 @@ suff_search(char *str, const KeySuffix *suf, int type)
 	return NULL;
 }
 
+static bool
+is_char_separator(char *str)
+{
+	return (*str == ',' ||
+			*str == '-' ||
+			*str == '.' ||
+			*str == '/' ||
+			*str == ':');
+}
+
 /* ----------
  * Prepare NUMDesc (number description struct) via FormatNode struct
  * ----------
@@ -1237,9 +1249,10 @@ parse_format(FormatNode *node, char *str, const KeyWord *kw,
 {
 	const KeySuffix *s;
 	FormatNode *n;
+	bool		in_text = false,
+				in_backslash = false;
 	int			node_set = 0,
-				suffix,
-				last = 0;
+				suffix;
 
 #ifdef DEBUG_TO_FROM_CHAR
 	elog(DEBUG_elog_output, "to_char/number(): run parser");
@@ -1251,6 +1264,49 @@ parse_format(FormatNode *node, char *str, const KeyWord *kw,
 	{
 		suffix = 0;
 
+		/* Previous character was a backslash */
+		if (in_backslash)
+		{
+			/* After backslash should go non-space character */
+			if (isspace(*str))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid escape sequence")));
+			in_backslash = false;
+
+			n->type = NODE_TYPE_CHAR;
+			n->character = *str;
+			n->key = NULL;
+			n->suffix = 0;
+			n++;
+			str++;
+			continue;
+		}
+		/* Previous character was a quote */
+		else if (in_text)
+		{
+			if (*str == '"')
+			{
+				str++;
+				in_text = false;
+			}
+			else if (*str == '\\')
+			{
+				str++;
+				in_backslash = true;
+			}
+			else
+			{
+				n->type = NODE_TYPE_CHAR;
+				n->character = *str;
+				n->key = NULL;
+				n->suffix = 0;
+				n++;
+				str++;
+			}
+			continue;
+		}
+
 		/*
 		 * Prefix
 		 */
@@ -1290,48 +1346,30 @@ parse_format(FormatNode *node, char *str, const KeyWord *kw,
 		}
 		else if (*str)
 		{
-			/*
-			 * Special characters '\' and '"'
-			 */
-			if (*str == '"' && last != '\\')
+			if (*str == '"')
 			{
-				int			x = 0;
-
-				while (*(++str))
-				{
-					if (*str == '"' && x != '\\')
-					{
-						str++;
-						break;
-					}
-					else if (*str == '\\' && x != '\\')
-					{
-						x = '\\';
-						continue;
-					}
-					n->type = NODE_TYPE_CHAR;
-					n->character = *str;
-					n->key = NULL;
-					n->suffix = 0;
-					++n;
-					x = *str;
-				}
+				in_text = true;
 				node_set = 0;
-				suffix = 0;
-				last = 0;
+				str++;
 			}
-			else if (*str && *str == '\\' && last != '\\' && *(str + 1) == '"')
+			else if (*str == '\\')
 			{
-				last = *str;
+				in_backslash = true;
+				node_set = 0;
 				str++;
 			}
-			else if (*str)
+			else
 			{
-				n->type = NODE_TYPE_CHAR;
+				if (ver == DCH_TYPE && is_char_separator(str))
+					n->type = NODE_TYPE_SEPARATOR;
+				else if (isspace(*str))
+					n->type = NODE_TYPE_SPACE;
+				else
+					n->type = NODE_TYPE_CHAR;
+
 				n->character = *str;
 				n->key = NULL;
 				node_set = 1;
-				last = 0;
 				str++;
 			}
 		}
@@ -1348,6 +1386,17 @@ parse_format(FormatNode *node, char *str, const KeyWord *kw,
 		}
 	}
 
+	if (in_backslash)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid escape sequence")));
+
+	/* If we didn't meet closing quotes */
+	if (in_text)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("unexpected end of format string, expected '\"' character")));
+
 	n->type = NODE_TYPE_END;
 	n->suffix = 0;
 	return;
@@ -2081,20 +2130,6 @@ adjust_partial_year_to_2020(int year)
 		return year;
 }
 
-
-static int
-strspace_len(char *str)
-{
-	int			len = 0;
-
-	while (*str && isspace((unsigned char) *str))
-	{
-		str++;
-		len++;
-	}
-	return len;
-}
-
 /*
  * Set the date mode of a from-char conversion.
  *
@@ -2164,11 +2199,6 @@ from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node)
 	char	   *init = *src;
 	int			used;
 
-	/*
-	 * Skip any whitespace before parsing the integer.
-	 */
-	*src += strspace_len(*src);
-
 	Assert(len <= DCH_MAX_ITEM_SIZ);
 	used = (int) strlcpy(copy, *src, len + 1);
 
@@ -2948,20 +2978,71 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 	char	   *s;
 	int			len,
 				value;
-	bool		fx_mode = false;
+	bool		in_space = false,
+				fx_mode = false;
 
 	for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
 	{
-		if (n->type != NODE_TYPE_ACTION)
+		if (n->type == NODE_TYPE_SPACE)
+		{
+			/*
+			 * Previous node was a space node. In non FX (fixed format) mode we
+			 * skiped all spaces in the previous step.
+			 */
+			if (in_space && !fx_mode)
+				continue;
+
+			if (!isspace(*s))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
+						 errmsg("expected space character in given string"),
+						 errdetail("The given value did not match any of the allowed "
+								   "values for space field.")));
+			s++;
+			/* If not in FX (fixed format) mode skip other spaces */
+			if (!fx_mode)
+				while (*s != '\0' && isspace(*s))
+					s++;
+
+			in_space = true;
+			continue;
+		}
+		else if (n->type == NODE_TYPE_SEPARATOR)
 		{
 			/*
-			 * Separator, so consume one character from input string.  Notice
-			 * we don't insist that the consumed character match the format's
-			 * character.
+			 * In FX mode we don't insist that consumed separator match the
+			 * format's separator character.
 			 */
+			if (!fx_mode && !is_char_separator(s))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
+						 errmsg("expected separator character in input string"),
+						 errdetail("The given value did not match any of the allowed "
+								   "values for separator field.")));
+			else if (fx_mode && n->character != *s)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
+						 errmsg("unexpected separator \"%c\", expected \"%c\"",
+								*s, n->character),
+						 errdetail("The given value did not match any of the allowed "
+								   "values for separator field.")));
 			s++;
+			in_space = false;
 			continue;
 		}
+		else if (n->type == NODE_TYPE_CHAR)
+		{
+			/*
+			 * Text character, so consume one character from input string.
+			 * Notice we don't insist that the consumed character match the
+			 * format's character.
+			 * Text field ignores FX mode.
+			 */
+			s++;
+			in_space = false;
+			continue;
+		}
+		in_space = false;
 
 		/* Ignore spaces before fields when not in FX (fixed width) mode */
 		if (!fx_mode && n->key->id != DCH_FX)
@@ -3340,15 +3421,17 @@ datetime_to_char_body(TmToChar *tmtc, text *fmt, bool is_interval, Oid collid)
 
 		if ((ent = DCH_cache_search(fmt_str)) == NULL)
 		{
-			ent = DCH_cache_getnew(fmt_str);
+			FormatNode	buf[DCH_CACHE_SIZE + 1];
 
 			/*
 			 * Not in the cache, must run parser and save a new format-picture
 			 * to the cache.
 			 */
-			parse_format(ent->format, fmt_str, DCH_keywords,
+			parse_format(buf, fmt_str, DCH_keywords,
 						 DCH_suff, DCH_index, DCH_TYPE, NULL);
 
+			ent = DCH_cache_getnew(fmt_str);
+			memcpy(ent->format, buf, sizeof(buf));
 			(ent->format + fmt_len)->type = NODE_TYPE_END;		/* Paranoia? */
 
 #ifdef DEBUG_TO_FROM_CHAR
@@ -3604,15 +3687,17 @@ do_to_timestamp(text *date_txt, text *fmt,
 
 			if ((ent = DCH_cache_search(fmt_str)) == NULL)
 			{
-				ent = DCH_cache_getnew(fmt_str);
+				FormatNode	buf[DCH_CACHE_SIZE + 1];
 
 				/*
 				 * Not in the cache, must run parser and save a new
 				 * format-picture to the cache.
 				 */
-				parse_format(ent->format, fmt_str, DCH_keywords,
+				parse_format(buf, fmt_str, DCH_keywords,
 							 DCH_suff, DCH_index, DCH_TYPE, NULL);
 
+				ent = DCH_cache_getnew(fmt_str);
+				memcpy(ent->format, buf, sizeof(buf));
 				(ent->format + fmt_len)->type = NODE_TYPE_END;	/* Paranoia? */
 #ifdef DEBUG_TO_FROM_CHAR
 				/* dump_node(ent->format, fmt_len); */
@@ -3981,32 +4066,41 @@ NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
 
 		if ((ent = NUM_cache_search(str)) == NULL)
 		{
-			ent = NUM_cache_getnew(str);
+			FormatNode	buf[NUM_CACHE_SIZE + 1];
+
+			zeroize_NUM(Num);
 
 			/*
 			 * Not in the cache, must run parser and save a new format-picture
 			 * to the cache.
 			 */
-			parse_format(ent->format, str, NUM_keywords,
-						 NULL, NUM_index, NUM_TYPE, &ent->Num);
+			parse_format(buf, str, NUM_keywords,
+						 NULL, NUM_index, NUM_TYPE, Num);
+
+			ent = NUM_cache_getnew(str);
+
+			memcpy(ent->format, buf, sizeof(buf));
+			memcpy(&ent->Num, Num, sizeof(NUMDesc));
 
 			(ent->format + len)->type = NODE_TYPE_END;	/* Paranoia? */
 		}
+		else
+		{
+			/*
+			 * Copy cache to used struct
+			 */
+			Num->flag = ent->Num.flag;
+			Num->lsign = ent->Num.lsign;
+			Num->pre = ent->Num.pre;
+			Num->post = ent->Num.post;
+			Num->pre_lsign_num = ent->Num.pre_lsign_num;
+			Num->need_locale = ent->Num.need_locale;
+			Num->multi = ent->Num.multi;
+			Num->zero_start = ent->Num.zero_start;
+			Num->zero_end = ent->Num.zero_end;
+		}
 
 		format = ent->format;
-
-		/*
-		 * Copy cache to used struct
-		 */
-		Num->flag = ent->Num.flag;
-		Num->lsign = ent->Num.lsign;
-		Num->pre = ent->Num.pre;
-		Num->post = ent->Num.post;
-		Num->pre_lsign_num = ent->Num.pre_lsign_num;
-		Num->need_locale = ent->Num.need_locale;
-		Num->multi = ent->Num.multi;
-		Num->zero_start = ent->Num.zero_start;
-		Num->zero_end = ent->Num.zero_end;
 	}
 
 #ifdef DEBUG_TO_FROM_CHAR
diff --git a/src/test/regress/expected/horology.out b/src/test/regress/expected/horology.out
index 1fe02be..d6290f2 100644
--- a/src/test/regress/expected/horology.out
+++ b/src/test/regress/expected/horology.out
@@ -2763,6 +2763,9 @@ SELECT to_timestamp('0097/Feb/16 --> 08:14:30', 'YYYY/Mon/DD --> HH:MI:SS');
  Sat Feb 16 08:14:30 0097 PST
 (1 row)
 
+SELECT to_timestamp('0097/Feb/16 --> 08:14:30', 'YYYY/Mon/DD ---> HH:MI:SS');
+ERROR:  expected separator character in input string
+DETAIL:  The given value did not match any of the allowed values for separator field.
 SELECT to_timestamp('97/2/16 8:14:30', 'FMYYYY/FMMM/FMDD FMHH:FMMI:FMSS');
          to_timestamp         
 ------------------------------
@@ -2775,8 +2778,20 @@ SELECT to_timestamp('1985 January 12', 'YYYY FMMonth DD');
  Sat Jan 12 00:00:00 1985 PST
 (1 row)
 
+SELECT to_timestamp('1985 FMMonth 12', 'YYYY "FMMonth" DD');
+         to_timestamp         
+------------------------------
+ Sat Jan 12 00:00:00 1985 PST
+(1 row)
+
+SELECT to_timestamp('1985 \ 12', 'YYYY \\ DD');
+         to_timestamp         
+------------------------------
+ Sat Jan 12 00:00:00 1985 PST
+(1 row)
+
 SELECT to_timestamp('My birthday-> Year: 1976, Month: May, Day: 16',
-                    '"My birthday-> Year" YYYY, "Month:" FMMonth, "Day:" DD');
+                    '"My birthday-> Year:" YYYY, "Month:" FMMonth, "Day:" DD');
          to_timestamp         
 ------------------------------
  Sun May 16 00:00:00 1976 PDT
@@ -2789,7 +2804,7 @@ SELECT to_timestamp('1,582nd VIII 21', 'Y,YYYth FMRM DD');
 (1 row)
 
 SELECT to_timestamp('15 "text between quote marks" 98 54 45',
-                    E'HH24 "\\text between quote marks\\"" YY MI SS');
+                    E'HH24 "\\"text between quote marks\\"" YY MI SS');
          to_timestamp         
 ------------------------------
  Thu Jan 01 15:54:45 1998 PST
@@ -2810,6 +2825,21 @@ SELECT to_timestamp('2000January09Sunday', 'YYYYFMMonthDDFMDay');
 SELECT to_timestamp('97/Feb/16', 'YYMonDD');
 ERROR:  invalid value "/Fe" for "Mon"
 DETAIL:  The given value did not match any of the allowed values for this field.
+SELECT to_timestamp('97/Feb/16', 'YY:Mon:DD');
+         to_timestamp         
+------------------------------
+ Sun Feb 16 00:00:00 1997 PST
+(1 row)
+
+SELECT to_timestamp('97/Feb/16', 'FXYY:Mon:DD');
+ERROR:  unexpected separator "/", expected ":"
+DETAIL:  The given value did not match any of the allowed values for separator field.
+SELECT to_timestamp('97/Feb/16', 'FXYY/Mon/DD');
+         to_timestamp         
+------------------------------
+ Sun Feb 16 00:00:00 1997 PST
+(1 row)
+
 SELECT to_timestamp('19971116', 'YYYYMMDD');
          to_timestamp         
 ------------------------------
@@ -2912,7 +2942,7 @@ SELECT to_timestamp('  20050302', 'YYYYMMDD');
 SELECT to_timestamp('2011-12-18 23:38:15', 'YYYY-MM-DD  HH24:MI:SS');
          to_timestamp         
 ------------------------------
- Sun Dec 18 03:38:15 2011 PST
+ Sun Dec 18 23:38:15 2011 PST
 (1 row)
 
 SELECT to_timestamp('2011-12-18  23:38:15', 'YYYY-MM-DD  HH24:MI:SS');
@@ -2942,7 +2972,7 @@ SELECT to_timestamp('2011-12-18  23:38:15', 'YYYY-MM-DD  HH24:MI:SS');
 SELECT to_timestamp('2011-12-18  23:38:15', 'YYYY-MM-DD   HH24:MI:SS');
          to_timestamp         
 ------------------------------
- Sun Dec 18 03:38:15 2011 PST
+ Sun Dec 18 23:38:15 2011 PST
 (1 row)
 
 SELECT to_date('2011 12  18', 'YYYY MM DD');
@@ -2960,13 +2990,13 @@ SELECT to_date('2011 12  18', 'YYYY MM  DD');
 SELECT to_date('2011 12  18', 'YYYY MM   DD');
   to_date   
 ------------
- 12-08-2011
+ 12-18-2011
 (1 row)
 
 SELECT to_date('2011 12 18', 'YYYY  MM DD');
   to_date   
 ------------
- 02-18-2011
+ 12-18-2011
 (1 row)
 
 SELECT to_date('2011  12 18', 'YYYY  MM DD');
diff --git a/src/test/regress/sql/horology.sql b/src/test/regress/sql/horology.sql
index c81437b..c3da350 100644
--- a/src/test/regress/sql/horology.sql
+++ b/src/test/regress/sql/horology.sql
@@ -390,17 +390,23 @@ RESET DateStyle;
 
 SELECT to_timestamp('0097/Feb/16 --> 08:14:30', 'YYYY/Mon/DD --> HH:MI:SS');
 
+SELECT to_timestamp('0097/Feb/16 --> 08:14:30', 'YYYY/Mon/DD ---> HH:MI:SS');
+
 SELECT to_timestamp('97/2/16 8:14:30', 'FMYYYY/FMMM/FMDD FMHH:FMMI:FMSS');
 
 SELECT to_timestamp('1985 January 12', 'YYYY FMMonth DD');
 
+SELECT to_timestamp('1985 FMMonth 12', 'YYYY "FMMonth" DD');
+
+SELECT to_timestamp('1985 \ 12', 'YYYY \\ DD');
+
 SELECT to_timestamp('My birthday-> Year: 1976, Month: May, Day: 16',
-                    '"My birthday-> Year" YYYY, "Month:" FMMonth, "Day:" DD');
+                    '"My birthday-> Year:" YYYY, "Month:" FMMonth, "Day:" DD');
 
 SELECT to_timestamp('1,582nd VIII 21', 'Y,YYYth FMRM DD');
 
 SELECT to_timestamp('15 "text between quote marks" 98 54 45',
-                    E'HH24 "\\text between quote marks\\"" YY MI SS');
+                    E'HH24 "\\"text between quote marks\\"" YY MI SS');
 
 SELECT to_timestamp('05121445482000', 'MMDDHH24MISSYYYY');
 
@@ -408,6 +414,12 @@ SELECT to_timestamp('2000January09Sunday', 'YYYYFMMonthDDFMDay');
 
 SELECT to_timestamp('97/Feb/16', 'YYMonDD');
 
+SELECT to_timestamp('97/Feb/16', 'YY:Mon:DD');
+
+SELECT to_timestamp('97/Feb/16', 'FXYY:Mon:DD');
+
+SELECT to_timestamp('97/Feb/16', 'FXYY/Mon/DD');
+
 SELECT to_timestamp('19971116', 'YYYYMMDD');
 
 SELECT to_timestamp('20000-1116', 'YYYY-MMDD');

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to