On Wed, Sep 18, 2019 at 11:09 AM Juan José Santamaría Flecha <juanjo.santama...@gmail.com> wrote: > > On Fri, Sep 13, 2019 at 10:31 PM Alvaro Herrera > <alvhe...@2ndquadrant.com> wrote: > > > Thanks for taking a look at this. > > > I'm confused why we acquire the MONTH_DIM / etc definitions. Can't we > > just use lengthof() of the corresponding array? AFAICS it should work > > just as well. > > > > It was because of the length difference between ascii-name arrays, > which were all null-ended, and localized-name arrays. The attached > version uses lengthof(). > > > I wonder if the "compare first char" thing (seq_search_localized) really > > works when there are multibyte chars in the day/month names. I think > > the code compares just the first char ... but what if the original > > string uses those funny Unicode non-normalized letters and the locale > > translation uses normalized letters? My guess is that the first-char > > comparison will fail, but surely you'll want the name to match. > > (There's no month/day name in Spanish that doesn't start with an ASCII > > letter, but I bet there are some in other languages.) I think the > > localized comparison should avoid the first-char optimization, just > > compare the whole string all the time, and avoid possible weird issues. > > The localized search is reformulated in this version to deal with > multibyte normalization. A regression test for this issue is included.
This version is updated to optimize the need for dynamic allocation. > Regards, > > Juan José Santamaría Flecha
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 4e3e213..c470c0e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -6415,8 +6415,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); <listitem> <para> <literal>TM</literal> does not include trailing blanks. + </para> + </listitem> + + <listitem> + <para> <function>to_timestamp</function> and <function>to_date</function> ignore - the <literal>TM</literal> modifier. + the case when receiving names as an input. For example, either + <literal>to_timestamp('2000-JUN', 'YYYY-MON')</literal> or + <literal>to_timestamp('2000-Jun', 'YYYY-MON')</literal> or + <literal>to_timestamp('2000-JUN', 'YYYY-mon')</literal> work and return + the same output. </para> </listitem> diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 755ca6e..39d2d11 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -96,6 +96,7 @@ #include "utils/memutils.h" #include "utils/numeric.h" #include "utils/pg_locale.h" +#include "common/unicode_norm.h" /* ---------- * Routines type @@ -174,18 +175,17 @@ typedef struct #define CLOCK_24_HOUR 0 #define CLOCK_12_HOUR 1 - /* ---------- * Full months * ---------- */ static const char *const months_full[] = { "January", "February", "March", "April", "May", "June", "July", - "August", "September", "October", "November", "December", NULL + "August", "September", "October", "November", "December" }; static const char *const days_short[] = { - "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", NULL + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; /* ---------- @@ -217,8 +217,8 @@ static const char *const days_short[] = { * matches for BC have an odd index. So the boolean value for BC is given by * taking the array index of the match, modulo 2. */ -static const char *const adbc_strings[] = {ad_STR, bc_STR, AD_STR, BC_STR, NULL}; -static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_STR, NULL}; +static const char *const adbc_strings[] = {ad_STR, bc_STR, AD_STR, BC_STR}; +static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_STR}; /* ---------- * AM / PM @@ -244,8 +244,8 @@ static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_S * matches for PM have an odd index. So the boolean value for PM is given by * taking the array index of the match, modulo 2. */ -static const char *const ampm_strings[] = {am_STR, pm_STR, AM_STR, PM_STR, NULL}; -static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_STR, NULL}; +static const char *const ampm_strings[] = {am_STR, pm_STR, AM_STR, PM_STR}; +static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_STR}; /* ---------- * Months in roman-numeral @@ -254,10 +254,10 @@ static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_S * ---------- */ static const char *const rm_months_upper[] = -{"XII", "XI", "X", "IX", "VIII", "VII", "VI", "V", "IV", "III", "II", "I", NULL}; +{"XII", "XI", "X", "IX", "VIII", "VII", "VI", "V", "IV", "III", "II", "I"}; static const char *const rm_months_lower[] = -{"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL}; +{"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i"}; /* ---------- * Roman numbers @@ -975,7 +975,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw, static void DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid collid); -static void DCH_from_char(FormatNode *node, char *in, TmFromChar *out); +static void DCH_from_char(FormatNode *node, char *in, TmFromChar *out, Oid collid); #ifdef DEBUG_TO_FROM_CHAR static void dump_index(const KeyWord *k, const int *index); @@ -990,10 +990,12 @@ static void from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode); static void from_char_set_int(int *dest, const int value, const FormatNode *node); static int from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node); static int from_char_parse_int(int *dest, char **src, FormatNode *node); -static int seq_search(char *name, const char *const *array, int type, int max, int *len); -static int from_char_seq_search(int *dest, char **src, const char *const *array, int type, int max, FormatNode *node); +static int seq_search_sqlascii(char *name, const char *const *array, int type, int max, int *len, int dim); +static int seq_search_localized(char *name, char **array, int max, int *len, int dim, Oid collid); +static int from_char_seq_search(int *dest, char **src, const char *const *array, char **localized_array, int type, int max, + FormatNode *node, int dim, Oid collid); static void do_to_timestamp(text *date_txt, text *fmt, - struct pg_tm *tm, fsec_t *fsec); + struct pg_tm *tm, fsec_t *fsec, Oid collid); static char *fill_str(char *str, int c, int max); static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree); static char *int_to_roman(int number); @@ -2325,17 +2327,18 @@ from_char_parse_int(int *dest, char **src, FormatNode *node) } /* ---------- - * Sequential search with to upper/lower conversion + * Sequential search with to upper/lower conversion for SQL_ASCII array input * ---------- */ static int -seq_search(char *name, const char *const *array, int type, int max, int *len) +seq_search_sqlascii(char *name, const char *const *array, int type, int max, int *len, int dim) { const char *p; const char *const *a; char *n; int last, i; + int index; *len = 0; @@ -2348,7 +2351,7 @@ seq_search(char *name, const char *const *array, int type, int max, int *len) else if (type == ALL_LOWER) *name = pg_tolower((unsigned char) *name); - for (last = 0, a = array; *a != NULL; a++) + for (last = 0, a = array, index = 0; index < dim; a++, index++) { /* compare first chars */ if (*name != **a) @@ -2360,13 +2363,13 @@ seq_search(char *name, const char *const *array, int type, int max, int *len) if (max && i == max) { *len = i; - return a - array; + return index; } /* full size */ if (*p == '\0') { *len = i; - return a - array; + return index; } /* Not found in array 'a' */ if (*n == '\0') @@ -2396,6 +2399,83 @@ seq_search(char *name, const char *const *array, int type, int max, int *len) return -1; } +/* ---------- + * Sequential search with initcap conversion for localized array input + * ---------- + */ +static int +seq_search_localized(char *name, char **array, int max, int *len, int dim, Oid collid) +{ + char **a; + char *initcap_element; + char *initcap_name; + char *norm_name; + int index; + int mb_max; + int name_len; + int encoding; + int norm_len; + int element_len; + + *len = 0; + + if (!*name) + return -1; + + encoding = GetDatabaseEncoding(); + mb_max = max * pg_encoding_max_length(encoding); + name_len = strlen(name); + name_len = name_len < mb_max ? name_len : mb_max; + + /* Normalize and initcap name */ + norm_name = name; + norm_len = name_len; + if (mb_max > max && encoding == PG_UTF8) + { + pg_wchar *wchar_name; + pg_wchar *norm_wname; + size_t name_wlen; + size_t norm_wlen; + wchar_name = (pg_wchar *) palloc((name_len + 1) * sizeof(pg_wchar)); + name_wlen = pg_mb2wchar_with_len(name, wchar_name, name_len); + norm_wname = unicode_normalize_kc(wchar_name); + pfree(wchar_name); + norm_wlen = pg_wchar_strlen(norm_wname); + if (name_wlen > norm_wlen) + { + norm_name = (char *) palloc((norm_wlen + 1) * sizeof(pg_wchar)); + norm_len = pg_wchar2mb_with_len(norm_wname, norm_name, norm_wlen); + } + pfree(norm_wname); + } + initcap_name = str_initcap(norm_name, norm_len, collid); + if (name_len != norm_len) + pfree(norm_name); + + for (a = array, index = 0; index < dim; a++, index++) + { + /* Initcap element, assume it is normalized */ + element_len = strlen(*a); + initcap_element = str_initcap(*a, element_len, collid); + +#ifdef DEBUG_TO_FROM_CHAR + elog(DEBUG_elog_output, "Name: 0x%x, Normalized: 0x%x, Element: 0x%x", + (unsigned char)*name, (unsigned char)*initcap_name, (unsigned char)*initcap_element); +#endif + if (strncmp(initcap_name, initcap_element, element_len) == 0) + { + *len = element_len + name_len - norm_len; + pfree(initcap_element); + pfree(initcap_name); + return index; + } + pfree(initcap_element); + } + + pfree(initcap_name); + return -1; +} + /* * Perform a sequential search in 'array' for text matching the first 'max' * characters of the source string. @@ -2407,16 +2487,20 @@ seq_search(char *name, const char *const *array, int type, int max, int *len) * If the string doesn't match, throw an error. */ static int -from_char_seq_search(int *dest, char **src, const char *const *array, int type, int max, - FormatNode *node) +from_char_seq_search(int *dest, char **src, const char *const *array, char **localized_array, int type, int max, + FormatNode *node, int dim, Oid collid) { int len; - *dest = seq_search(*src, array, type, max, &len); + if (localized_array == NULL) + *dest = seq_search_sqlascii(*src, array, type, max, &len, dim); + else + *dest = seq_search_localized(*src, localized_array, max, &len, dim, collid); if (len <= 0) { char copy[DCH_MAX_ITEM_SIZ + 1]; + /* We use byte length, localized names encoding is ignored */ Assert(max <= DCH_MAX_ITEM_SIZ); strlcpy(copy, *src, max + 1); @@ -3017,19 +3101,24 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col * ---------- */ static void -DCH_from_char(FormatNode *node, char *in, TmFromChar *out) +DCH_from_char(FormatNode *node, char *in, TmFromChar *out, Oid collid) { FormatNode *n; char *s; int len, value; bool fx_mode = false; + char **localized_names; /* number of extra skipped characters (more than given in format string) */ int extra_skip = 0; + /* cache localized days and months */ + cache_locale_time(); for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++) { + localized_names = NULL; + /* * Ignore spaces at the beginning of the string and before fields when * not in FX (fixed width) mode. @@ -3111,8 +3200,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) case DCH_P_M: case DCH_a_m: case DCH_p_m: - from_char_seq_search(&value, &s, ampm_strings_long, - ALL_UPPER, n->key->len, n); + from_char_seq_search(&value, &s, ampm_strings_long, localized_names, + ALL_UPPER, n->key->len, n, lengthof(ampm_strings_long), collid); from_char_set_int(&out->pm, value % 2, n); out->clock = CLOCK_12_HOUR; break; @@ -3120,8 +3209,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) case DCH_PM: case DCH_am: case DCH_pm: - from_char_seq_search(&value, &s, ampm_strings, - ALL_UPPER, n->key->len, n); + from_char_seq_search(&value, &s, ampm_strings, localized_names, + ALL_UPPER, n->key->len, n, lengthof(ampm_strings), collid); from_char_set_int(&out->pm, value % 2, n); out->clock = CLOCK_12_HOUR; break; @@ -3211,30 +3300,34 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) case DCH_B_C: case DCH_a_d: case DCH_b_c: - from_char_seq_search(&value, &s, adbc_strings_long, - ALL_UPPER, n->key->len, n); + from_char_seq_search(&value, &s, adbc_strings_long, localized_names, + ALL_UPPER, n->key->len, n, lengthof(adbc_strings_long), collid); from_char_set_int(&out->bc, value % 2, n); break; case DCH_AD: case DCH_BC: case DCH_ad: case DCH_bc: - from_char_seq_search(&value, &s, adbc_strings, - ALL_UPPER, n->key->len, n); + from_char_seq_search(&value, &s, adbc_strings, localized_names, + ALL_UPPER, n->key->len, n, lengthof(adbc_strings), collid); from_char_set_int(&out->bc, value % 2, n); break; case DCH_MONTH: case DCH_Month: case DCH_month: - from_char_seq_search(&value, &s, months_full, ONE_UPPER, - MAX_MONTH_LEN, n); + if (S_TM(n->suffix)) + localized_names = localized_full_months; + from_char_seq_search(&value, &s, months_full, localized_names, + ONE_UPPER, MAX_MONTH_LEN, n, lengthof(months_full), collid); from_char_set_int(&out->mm, value + 1, n); break; case DCH_MON: case DCH_Mon: case DCH_mon: - from_char_seq_search(&value, &s, months, ONE_UPPER, - MAX_MON_LEN, n); + if (S_TM(n->suffix)) + localized_names = localized_abbrev_months; + from_char_seq_search(&value, &s, months, localized_names, + ONE_UPPER, MAX_MON_LEN, n, lengthof(months_full), collid); from_char_set_int(&out->mm, value + 1, n); break; case DCH_MM: @@ -3244,16 +3337,20 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) case DCH_DAY: case DCH_Day: case DCH_day: - from_char_seq_search(&value, &s, days, ONE_UPPER, - MAX_DAY_LEN, n); + if (S_TM(n->suffix)) + localized_names = localized_full_days; + from_char_seq_search(&value, &s, days, localized_names, + ONE_UPPER, MAX_DAY_LEN, n, lengthof(days_short), collid); from_char_set_int(&out->d, value, n); out->d++; break; case DCH_DY: case DCH_Dy: case DCH_dy: - from_char_seq_search(&value, &s, days, ONE_UPPER, - MAX_DY_LEN, n); + if (S_TM(n->suffix)) + localized_names = localized_abbrev_days; + from_char_seq_search(&value, &s, days_short, localized_names, + ONE_UPPER, MAX_DY_LEN, n, lengthof(days_short), collid); from_char_set_int(&out->d, value, n); out->d++; break; @@ -3351,13 +3448,13 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out) SKIP_THth(s, n->suffix); break; case DCH_RM: - from_char_seq_search(&value, &s, rm_months_upper, - ALL_UPPER, MAX_RM_LEN, n); + from_char_seq_search(&value, &s, rm_months_upper, localized_names, + ALL_UPPER, MAX_RM_LEN, n, lengthof(rm_months_upper), collid); from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, n); break; case DCH_rm: - from_char_seq_search(&value, &s, rm_months_lower, - ALL_LOWER, MAX_RM_LEN, n); + from_char_seq_search(&value, &s, rm_months_lower, localized_names, + ALL_LOWER, MAX_RM_LEN, n, lengthof(rm_months_lower), collid); from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, n); break; case DCH_W: @@ -3690,7 +3787,7 @@ to_timestamp(PG_FUNCTION_ARGS) struct pg_tm tm; fsec_t fsec; - do_to_timestamp(date_txt, fmt, &tm, &fsec); + do_to_timestamp(date_txt, fmt, &tm, &fsec, PG_GET_COLLATION()); /* Use the specified time zone, if any. */ if (tm.tm_zone) @@ -3725,7 +3822,7 @@ to_date(PG_FUNCTION_ARGS) struct pg_tm tm; fsec_t fsec; - do_to_timestamp(date_txt, fmt, &tm, &fsec); + do_to_timestamp(date_txt, fmt, &tm, &fsec, PG_GET_COLLATION()); /* Prevent overflow in Julian-day routines */ if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday)) @@ -3761,7 +3858,7 @@ to_date(PG_FUNCTION_ARGS) */ static void do_to_timestamp(text *date_txt, text *fmt, - struct pg_tm *tm, fsec_t *fsec) + struct pg_tm *tm, fsec_t *fsec, Oid collid) { FormatNode *format; TmFromChar tmfc; @@ -3810,11 +3907,11 @@ do_to_timestamp(text *date_txt, text *fmt, } #ifdef DEBUG_TO_FROM_CHAR - /* dump_node(format, fmt_len); */ - /* dump_index(DCH_keywords, DCH_index); */ + dump_node(format, fmt_len); + dump_index(DCH_keywords, DCH_index); #endif - DCH_from_char(format, date_str, &tmfc); + DCH_from_char(format, date_str, &tmfc, collid); pfree(fmt_str); if (!incache) diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out index ad56ff9..5319d95 100644 --- a/src/test/regress/expected/collate.linux.utf8.out +++ b/src/test/regress/expected/collate.linux.utf8.out @@ -461,6 +461,24 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); 01 NİS 2010 (1 row) +-- to_date +SET DateStyle='ISO, YMD'; +SET TimeZone='UTC'; +SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E + to_timestamp +------------------------ + 2010-02-01 00:00:00+00 +(1 row) + +SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327 + to_timestamp +------------------------ + 2010-02-01 00:00:00+00 +(1 row) + +SELECT to_timestamp('1234567890ab 2010', 'TMMONTH YYYY'); -- fail +ERROR: invalid value "123456789" for "MONTH" +DETAIL: The given value did not match any of the allowed values for this field. -- backwards parsing CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc'; CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C"; diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql index eac2f90..08b0404 100644 --- a/src/test/regress/sql/collate.linux.utf8.sql +++ b/src/test/regress/sql/collate.linux.utf8.sql @@ -182,6 +182,14 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR"); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY'); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); +-- to_date + +SET DateStyle='ISO, YMD'; +SET TimeZone='UTC'; +SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E +SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327 +SELECT to_timestamp('1234567890ab 2010', 'TMMONTH YYYY'); -- fail + -- backwards parsing