I wrote:
> That leads me to the attached patch.  There is more that could be done
> here --- in particular, I'd like to see the character-not-byte-count
> rule extended to literal text.  But that seems like fit material for
> a different patch.

Attached is a patch that makes formatting.c more multibyte-aware;
it now handles multibyte characters as single NODE_TYPE_CHAR format
nodes, rather than one node per byte.  This doesn't really have much
impact on the output (to_char) side, but on the input side, it
greatly simplifies treating such characters as single characters
rather than multiple ones.  An example is that (in UTF8 encoding)
previously we got

u8=# select to_number('$12.34', '€99D99');
 to_number 
-----------
      0.34
(1 row)

because the literal euro sign is 3 bytes long and was thought to be
3 literal characters.  Now we get the expected result

u8=# select to_number('$12.34', '€99D99');
 to_number 
-----------
     12.34
(1 row)

Aside from skipping 1 input character (not byte) per literal format
character, I fixed the SKIP_THth macro, allowing to_date/to_timestamp to
also follow the rule of skipping whole characters not bytes for non-data
format patterns.  There might be some other places that need similar
adjustments, but I couldn't find any.

Not sure about whether/how to add regression tests for this; it's really
impossible to add specific tests in an ASCII-only test file.  Maybe we
could put a test or two into collate.linux.utf8.sql, but it seems a bit
off topic for that, and I think that test file hardly gets run anyway.

Note this needs to be applied over the patch I posted at
https://postgr.es/m/3626.1510949...@sss.pgh.pa.us
I intend to commit that fairly soon, but it's not in right now.

                        regards, tom lane

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index cb0dbf7..ec97de0 100644
*** a/src/backend/utils/adt/formatting.c
--- b/src/backend/utils/adt/formatting.c
*************** typedef enum
*** 151,158 ****
  	FROM_CHAR_DATE_ISOWEEK		/* ISO 8601 week date */
  } FromCharDateMode;
  
- typedef struct FormatNode FormatNode;
- 
  typedef struct
  {
  	const char *name;
--- 151,156 ----
*************** typedef struct
*** 162,174 ****
  	FromCharDateMode date_mode;
  } KeyWord;
  
! struct FormatNode
  {
! 	int			type;			/* node type			*/
! 	const KeyWord *key;			/* if node type is KEYWORD	*/
! 	char		character;		/* if node type is CHAR		*/
! 	int			suffix;			/* keyword suffix		*/
! };
  
  #define NODE_TYPE_END		1
  #define NODE_TYPE_ACTION	2
--- 160,172 ----
  	FromCharDateMode date_mode;
  } KeyWord;
  
! typedef struct
  {
! 	int			type;			/* NODE_TYPE_XXX, see below */
! 	const KeyWord *key;			/* if type is ACTION */
! 	char		character[MAX_MULTIBYTE_CHAR_LEN + 1];	/* if type is CHAR */
! 	int			suffix;			/* keyword prefix/suffix code, if any */
! } FormatNode;
  
  #define NODE_TYPE_END		1
  #define NODE_TYPE_ACTION	2
*************** parse_format(FormatNode *node, const cha
*** 1282,1293 ****
  		}
  		else if (*str)
  		{
  			/*
  			 * Process double-quoted literal string, if any
  			 */
  			if (*str == '"')
  			{
! 				while (*(++str))
  				{
  					if (*str == '"')
  					{
--- 1280,1294 ----
  		}
  		else if (*str)
  		{
+ 			int			chlen;
+ 
  			/*
  			 * Process double-quoted literal string, if any
  			 */
  			if (*str == '"')
  			{
! 				str++;
! 				while (*str)
  				{
  					if (*str == '"')
  					{
*************** parse_format(FormatNode *node, const cha
*** 1297,1307 ****
  					/* backslash quotes the next character, if any */
  					if (*str == '\\' && *(str + 1))
  						str++;
  					n->type = NODE_TYPE_CHAR;
! 					n->character = *str;
  					n->key = NULL;
  					n->suffix = 0;
  					n++;
  				}
  			}
  			else
--- 1298,1311 ----
  					/* backslash quotes the next character, if any */
  					if (*str == '\\' && *(str + 1))
  						str++;
+ 					chlen = pg_mblen(str);
  					n->type = NODE_TYPE_CHAR;
! 					memcpy(n->character, str, chlen);
! 					n->character[chlen] = '\0';
  					n->key = NULL;
  					n->suffix = 0;
  					n++;
+ 					str += chlen;
  				}
  			}
  			else
*************** parse_format(FormatNode *node, const cha
*** 1312,1323 ****
  				 */
  				if (*str == '\\' && *(str + 1) == '"')
  					str++;
  				n->type = NODE_TYPE_CHAR;
! 				n->character = *str;
  				n->key = NULL;
  				n->suffix = 0;
  				n++;
! 				str++;
  			}
  		}
  	}
--- 1316,1329 ----
  				 */
  				if (*str == '\\' && *(str + 1) == '"')
  					str++;
+ 				chlen = pg_mblen(str);
  				n->type = NODE_TYPE_CHAR;
! 				memcpy(n->character, str, chlen);
! 				n->character[chlen] = '\0';
  				n->key = NULL;
  				n->suffix = 0;
  				n++;
! 				str += chlen;
  			}
  		}
  	}
*************** dump_node(FormatNode *node, int max)
*** 1349,1355 ****
  			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
  				 a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
  		else if (n->type == NODE_TYPE_CHAR)
! 			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
  		else if (n->type == NODE_TYPE_END)
  		{
  			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
--- 1355,1362 ----
  			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
  				 a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
  		else if (n->type == NODE_TYPE_CHAR)
! 			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
! 				 a, n->character);
  		else if (n->type == NODE_TYPE_END)
  		{
  			elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
*************** asc_toupper_z(const char *buff)
*** 2008,2015 ****
  	do { \
  		if (S_THth(_suf)) \
  		{ \
! 			if (*(ptr)) (ptr)++; \
! 			if (*(ptr)) (ptr)++; \
  		} \
  	} while (0)
  
--- 2015,2022 ----
  	do { \
  		if (S_THth(_suf)) \
  		{ \
! 			if (*(ptr)) (ptr) += pg_mblen(ptr); \
! 			if (*(ptr)) (ptr) += pg_mblen(ptr); \
  		} \
  	} while (0)
  
*************** is_next_separator(FormatNode *n)
*** 2076,2082 ****
  
  		return true;
  	}
! 	else if (isdigit((unsigned char) n->character))
  		return false;
  
  	return true;				/* some non-digit input (separator) */
--- 2083,2090 ----
  
  		return true;
  	}
! 	else if (n->character[1] == '\0' &&
! 			 isdigit((unsigned char) n->character[0]))
  		return false;
  
  	return true;				/* some non-digit input (separator) */
*************** DCH_to_char(FormatNode *node, bool is_in
*** 2405,2412 ****
  	{
  		if (n->type != NODE_TYPE_ACTION)
  		{
! 			*s = n->character;
! 			s++;
  			continue;
  		}
  
--- 2413,2420 ----
  	{
  		if (n->type != NODE_TYPE_ACTION)
  		{
! 			strcpy(s, n->character);
! 			s += strlen(s);
  			continue;
  		}
  
*************** DCH_from_char(FormatNode *node, char *in
*** 2974,2980 ****
  			 * we don't insist that the consumed character match the format's
  			 * character.
  			 */
! 			s++;
  			continue;
  		}
  
--- 2982,2988 ----
  			 * we don't insist that the consumed character match the format's
  			 * character.
  			 */
! 			s += pg_mblen(s);
  			continue;
  		}
  
*************** get_last_relevant_decnum(char *num)
*** 4217,4223 ****
  /*
   * These macros are used in NUM_processor() and its subsidiary routines.
   * OVERLOAD_TEST: true if we've reached end of input string
!  * AMOUNT_TEST(s): true if at least s characters remain in string
   */
  #define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
  #define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
--- 4225,4231 ----
  /*
   * These macros are used in NUM_processor() and its subsidiary routines.
   * OVERLOAD_TEST: true if we've reached end of input string
!  * AMOUNT_TEST(s): true if at least s bytes remain in string
   */
  #define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
  #define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
*************** NUM_processor(FormatNode *node, NUMDesc 
*** 4821,4829 ****
  		if (!Np->is_to_char)
  		{
  			/*
! 			 * Check at least one character remains to be scanned.  (In
! 			 * actions below, must use AMOUNT_TEST if we want to read more
! 			 * characters than that.)
  			 */
  			if (OVERLOAD_TEST)
  				break;
--- 4829,4837 ----
  		if (!Np->is_to_char)
  		{
  			/*
! 			 * Check at least one byte remains to be scanned.  (In actions
! 			 * below, must use AMOUNT_TEST if we want to read more bytes than
! 			 * that.)
  			 */
  			if (OVERLOAD_TEST)
  				break;
*************** NUM_processor(FormatNode *node, NUMDesc 
*** 5081,5092 ****
  			 * In TO_CHAR, non-pattern characters in the format are copied to
  			 * the output.  In TO_NUMBER, we skip one input character for each
  			 * non-pattern format character, whether or not it matches the
! 			 * format character.  (Currently, that's actually implemented as
! 			 * skipping one input byte per non-pattern format byte, which is
! 			 * wrong...)
  			 */
  			if (Np->is_to_char)
! 				*Np->inout_p = n->character;
  		}
  		Np->inout_p++;
  	}
--- 5089,5106 ----
  			 * In TO_CHAR, non-pattern characters in the format are copied to
  			 * the output.  In TO_NUMBER, we skip one input character for each
  			 * non-pattern format character, whether or not it matches the
! 			 * format character.
  			 */
  			if (Np->is_to_char)
! 			{
! 				strcpy(Np->inout_p, n->character);
! 				Np->inout_p += strlen(Np->inout_p);
! 			}
! 			else
! 			{
! 				Np->inout_p += pg_mblen(Np->inout_p);
! 			}
! 			continue;
  		}
  		Np->inout_p++;
  	}

Reply via email to