I wrote: > That leads me to the attached patch. There is more that could be done > here --- in particular, I'd like to see the character-not-byte-count > rule extended to literal text. But that seems like fit material for > a different patch.
Attached is a patch that makes formatting.c more multibyte-aware; it now handles multibyte characters as single NODE_TYPE_CHAR format nodes, rather than one node per byte. This doesn't really have much impact on the output (to_char) side, but on the input side, it greatly simplifies treating such characters as single characters rather than multiple ones. An example is that (in UTF8 encoding) previously we got u8=# select to_number('$12.34', '€99D99'); to_number ----------- 0.34 (1 row) because the literal euro sign is 3 bytes long and was thought to be 3 literal characters. Now we get the expected result u8=# select to_number('$12.34', '€99D99'); to_number ----------- 12.34 (1 row) Aside from skipping 1 input character (not byte) per literal format character, I fixed the SKIP_THth macro, allowing to_date/to_timestamp to also follow the rule of skipping whole characters not bytes for non-data format patterns. There might be some other places that need similar adjustments, but I couldn't find any. Not sure about whether/how to add regression tests for this; it's really impossible to add specific tests in an ASCII-only test file. Maybe we could put a test or two into collate.linux.utf8.sql, but it seems a bit off topic for that, and I think that test file hardly gets run anyway. Note this needs to be applied over the patch I posted at https://postgr.es/m/3626.1510949...@sss.pgh.pa.us I intend to commit that fairly soon, but it's not in right now. regards, tom lane
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index cb0dbf7..ec97de0 100644 *** a/src/backend/utils/adt/formatting.c --- b/src/backend/utils/adt/formatting.c *************** typedef enum *** 151,158 **** FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */ } FromCharDateMode; - typedef struct FormatNode FormatNode; - typedef struct { const char *name; --- 151,156 ---- *************** typedef struct *** 162,174 **** FromCharDateMode date_mode; } KeyWord; ! struct FormatNode { ! int type; /* node type */ ! const KeyWord *key; /* if node type is KEYWORD */ ! char character; /* if node type is CHAR */ ! int suffix; /* keyword suffix */ ! }; #define NODE_TYPE_END 1 #define NODE_TYPE_ACTION 2 --- 160,172 ---- FromCharDateMode date_mode; } KeyWord; ! typedef struct { ! int type; /* NODE_TYPE_XXX, see below */ ! const KeyWord *key; /* if type is ACTION */ ! char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */ ! int suffix; /* keyword prefix/suffix code, if any */ ! } FormatNode; #define NODE_TYPE_END 1 #define NODE_TYPE_ACTION 2 *************** parse_format(FormatNode *node, const cha *** 1282,1293 **** } else if (*str) { /* * Process double-quoted literal string, if any */ if (*str == '"') { ! while (*(++str)) { if (*str == '"') { --- 1280,1294 ---- } else if (*str) { + int chlen; + /* * Process double-quoted literal string, if any */ if (*str == '"') { ! str++; ! while (*str) { if (*str == '"') { *************** parse_format(FormatNode *node, const cha *** 1297,1307 **** /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; n->type = NODE_TYPE_CHAR; ! n->character = *str; n->key = NULL; n->suffix = 0; n++; } } else --- 1298,1311 ---- /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; + chlen = pg_mblen(str); n->type = NODE_TYPE_CHAR; ! memcpy(n->character, str, chlen); ! n->character[chlen] = '\0'; n->key = NULL; n->suffix = 0; n++; + str += chlen; } } else *************** parse_format(FormatNode *node, const cha *** 1312,1323 **** */ if (*str == '\\' && *(str + 1) == '"') str++; n->type = NODE_TYPE_CHAR; ! n->character = *str; n->key = NULL; n->suffix = 0; n++; ! str++; } } } --- 1316,1329 ---- */ if (*str == '\\' && *(str + 1) == '"') str++; + chlen = pg_mblen(str); n->type = NODE_TYPE_CHAR; ! memcpy(n->character, str, chlen); ! n->character[chlen] = '\0'; n->key = NULL; n->suffix = 0; n++; ! str += chlen; } } } *************** dump_node(FormatNode *node, int max) *** 1349,1355 **** elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)", a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix)); else if (n->type == NODE_TYPE_CHAR) ! elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character); else if (n->type == NODE_TYPE_END) { elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a); --- 1355,1362 ---- elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)", a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix)); else if (n->type == NODE_TYPE_CHAR) ! elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'", ! a, n->character); else if (n->type == NODE_TYPE_END) { elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a); *************** asc_toupper_z(const char *buff) *** 2008,2015 **** do { \ if (S_THth(_suf)) \ { \ ! if (*(ptr)) (ptr)++; \ ! if (*(ptr)) (ptr)++; \ } \ } while (0) --- 2015,2022 ---- do { \ if (S_THth(_suf)) \ { \ ! if (*(ptr)) (ptr) += pg_mblen(ptr); \ ! if (*(ptr)) (ptr) += pg_mblen(ptr); \ } \ } while (0) *************** is_next_separator(FormatNode *n) *** 2076,2082 **** return true; } ! else if (isdigit((unsigned char) n->character)) return false; return true; /* some non-digit input (separator) */ --- 2083,2090 ---- return true; } ! else if (n->character[1] == '\0' && ! isdigit((unsigned char) n->character[0])) return false; return true; /* some non-digit input (separator) */ *************** DCH_to_char(FormatNode *node, bool is_in *** 2405,2412 **** { if (n->type != NODE_TYPE_ACTION) { ! *s = n->character; ! s++; continue; } --- 2413,2420 ---- { if (n->type != NODE_TYPE_ACTION) { ! strcpy(s, n->character); ! s += strlen(s); continue; } *************** DCH_from_char(FormatNode *node, char *in *** 2974,2980 **** * we don't insist that the consumed character match the format's * character. */ ! s++; continue; } --- 2982,2988 ---- * we don't insist that the consumed character match the format's * character. */ ! s += pg_mblen(s); continue; } *************** get_last_relevant_decnum(char *num) *** 4217,4223 **** /* * These macros are used in NUM_processor() and its subsidiary routines. * OVERLOAD_TEST: true if we've reached end of input string ! * AMOUNT_TEST(s): true if at least s characters remain in string */ #define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len) #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s))) --- 4225,4231 ---- /* * These macros are used in NUM_processor() and its subsidiary routines. * OVERLOAD_TEST: true if we've reached end of input string ! * AMOUNT_TEST(s): true if at least s bytes remain in string */ #define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len) #define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s))) *************** NUM_processor(FormatNode *node, NUMDesc *** 4821,4829 **** if (!Np->is_to_char) { /* ! * Check at least one character remains to be scanned. (In ! * actions below, must use AMOUNT_TEST if we want to read more ! * characters than that.) */ if (OVERLOAD_TEST) break; --- 4829,4837 ---- if (!Np->is_to_char) { /* ! * Check at least one byte remains to be scanned. (In actions ! * below, must use AMOUNT_TEST if we want to read more bytes than ! * that.) */ if (OVERLOAD_TEST) break; *************** NUM_processor(FormatNode *node, NUMDesc *** 5081,5092 **** * In TO_CHAR, non-pattern characters in the format are copied to * the output. In TO_NUMBER, we skip one input character for each * non-pattern format character, whether or not it matches the ! * format character. (Currently, that's actually implemented as ! * skipping one input byte per non-pattern format byte, which is ! * wrong...) */ if (Np->is_to_char) ! *Np->inout_p = n->character; } Np->inout_p++; } --- 5089,5106 ---- * In TO_CHAR, non-pattern characters in the format are copied to * the output. In TO_NUMBER, we skip one input character for each * non-pattern format character, whether or not it matches the ! * format character. */ if (Np->is_to_char) ! { ! strcpy(Np->inout_p, n->character); ! Np->inout_p += strlen(Np->inout_p); ! } ! else ! { ! Np->inout_p += pg_mblen(Np->inout_p); ! } ! continue; } Np->inout_p++; }