Hi,
> On Mon, Dec 9, 2024 at 3:07 AM Tomas Vondra <to...@vondra.me> wrote: > >> Thanks for a nice patch. I did a quick review today, seems almost there, >> I only have a couple minor comments: >> >> 1) Template Patterns for Numeric Formatting >> >> Why the wording change? "input between 1 and 3999" sounds fine to me. >> > input... seemed to refer to numeric input for to_char roman format. But, > after roman support in to_number function shouldn't we modify it? It is the > reason I changed it to "valid for numbers 1 to 3999". > > 2) The docs say "standard numbers" but I'm not sure what that is? We >> don't use that term anywhere else, I think. Same for "standard roman >> numeral". >> > I will change "standard numbers" to "numbers". > Note that we are following the standard form. There are other forms too. > For example, 4 can be represented as IV (standard) or IIII (non standard). > https://en.wikipedia.org/wiki/Roman_numerals#Standard_form > > >> 3) A couple comments need a bit of formatting. Multi-line comments >> should start with an empty line, some lines are a bit too long. >> > I will fix the multi-line formatting. > > >> 4) I was wondering if the regression tests check all interesting cases, >> and it seems none of the tests hit these two cases: >> >> if (vCount > 1 || lCount > 1 || dCount > 1) >> return -1; >> > SELECT to_number('viv', 'RN') test covers this if statement for the > subtraction part. But, I noticed that no test covers simple counts of V, L, > D. > I will add SELECT to_number('VV', 'RN') for that. > > >> and >> >> if (!IS_VALID_SUB_COMB(currChar, nextChar)) >> return -1; >> > Noted. SELECT to_number('IL', 'RN') test can be added. > I’ve attached a new patch that addresses comments 2, 3, and 4 from Tomas. Looking forward to your feedback. Regards, Hunaid Sohail
From 585dc7a04179be9260ddbdf01d8c7caa48c3037d Mon Sep 17 00:00:00 2001 From: Hunaid2000 <hunaid2000@gmail.com> Date: Fri, 13 Dec 2024 10:19:20 +0500 Subject: [PATCH v6] Add roman support for to_number function --- doc/src/sgml/func.sgml | 13 ++- src/backend/utils/adt/formatting.c | 159 +++++++++++++++++++++++++- src/test/regress/expected/numeric.out | 45 ++++++++ src/test/regress/sql/numeric.sql | 23 ++++ 4 files changed, 235 insertions(+), 5 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 47370e581a..5a07e741d7 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -8670,7 +8670,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); </row> <row> <entry><literal>RN</literal></entry> - <entry>Roman numeral (input between 1 and 3999)</entry> + <entry>Roman numeral (valid for numbers 1 to 3999)</entry> </row> <row> <entry><literal>TH</literal> or <literal>th</literal></entry> @@ -8798,6 +8798,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); (e.g., <literal>9.99EEEE</literal> is a valid pattern). </para> </listitem> + + <listitem> + <para> + In <function>to_number</function>, <literal>RN</literal> pattern converts roman numerals + (standard form) to numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>, + <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>). + <literal>RN</literal> cannot be used in combination with any other formatting patterns + or modifiers, with the exception of <literal>FM</literal>, which is applicable only in + <function>to_char()</function> and is ignored in <function>to_number()</function>. + </para> + </listitem> </itemizedlist> </para> diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 0dcb551511..22de73f2e6 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -49,7 +49,6 @@ * - better number building (formatting) / parsing, now it isn't * ideal code * - use Assert() - * - add support for roman number to standard number conversion * - add support for number spelling * - add support for string to string formatting (we must be better * than Oracle :-), @@ -271,6 +270,31 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL}; static const char *const numth[] = {"st", "nd", "rd", "th", NULL}; +/* ---------- + * MACRO: Check if the current and next characters + * form a valid subtraction combination for roman numerals + * ---------- + */ +#define IS_VALID_SUB_COMB(curr, next) \ + (((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \ + ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \ + ((curr) == 'C' && ((next) == 'D' || (next) == 'M'))) + +/* ---------- + * MACRO: Roman number value + * ---------- + */ +#define ROMAN_VAL(r) \ + ((r) == 'I' ? 1 : \ + (r) == 'V' ? 5 : \ + (r) == 'X' ? 10 : \ + (r) == 'L' ? 50 : \ + (r) == 'C' ? 100 : \ + (r) == 'D' ? 500 : \ + (r) == 'M' ? 1000 : 0) + +#define MAX_ROMAN_LEN 15 + /* ---------- * Flags & Options: * ---------- @@ -1075,6 +1099,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std, static char *fill_str(char *str, int c, int max); static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree); static char *int_to_roman(int number); +static int roman_to_int(char* s, int len); static void NUM_prepare_locale(NUMProc *Np); static char *get_last_relevant_decnum(char *num); static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len); @@ -1285,6 +1310,15 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n) case NUM_rn: case NUM_RN: + if (IS_ROMAN(num)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("cannot use \"RN\" twice"))); + if (IS_BLANK(num) || IS_LSIGN(num) || IS_BRACKET(num) || + IS_MINUS(num) || IS_PLUS(num) || IS_MULTI(num)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("\"RN\" is incompatible with other formats"))); num->flag |= NUM_F_ROMAN; break; @@ -5351,7 +5385,114 @@ int_to_roman(int number) return result; } +/* + * Convert a roman numeral (standard form) to an integer. + * Result is an integer between 1 and 3999. + * + * If input is invalid, return -1. + */ +static int +roman_to_int(char* s, int len) +{ + int repeatCount = 1; + int vCount = 0, lCount = 0, dCount = 0; + bool subtractionEncountered = false; + char lastSubtractedChar = 0; + int total = 0; + + /* + * Ensure the input is not too long or empty. + * 'MMMDCCCLXXXVIII' (3888) is the longest + * valid roman numeral (15 characters). + */ + if (len == 0 || len > MAX_ROMAN_LEN) + return -1; + + for (int i = 0; i < len; ++i) + { + char currChar = pg_ascii_toupper(s[i]); + int currValue = ROMAN_VAL(currChar); + + if (currValue == 0) + return -1; + + /* + * Ensure no character greater than or equal to the + * subtracted character appears after the subtraction. + */ + if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar))) + return -1; + + /* Check for invalid repetitions of characters V, L, or D. */ + if (currChar == 'V') vCount++; + if (currChar == 'L') lCount++; + if (currChar == 'D') dCount++; + if (vCount > 1 || lCount > 1 || dCount > 1) + return -1; + if (i < len - 1) + { + char nextChar = pg_ascii_toupper(s[i + 1]); + int nextValue = ROMAN_VAL(nextChar); + + if (nextValue == 0) + return -1; + + /* + * If the current value is less than the next value, + * handle subtraction. Verify valid subtractive + * combinations and update the total accordingly. + */ + if (currValue < nextValue) + { + /* Check for invalid repetitions of characters V, L, or D. */ + if (nextChar == 'V') vCount++; + if (nextChar == 'L') lCount++; + if (nextChar == 'D') dCount++; + if (vCount > 1 || lCount > 1 || dCount > 1) + return -1; + + /* + * For cases where same character is repeated + * with subtraction (e.g. 'MCCM' or 'DCCCD'). + */ + if (repeatCount > 1) + return -1; + + if (!IS_VALID_SUB_COMB(currChar, nextChar)) + return -1; + + /* + * Skip the next character as it is part of + * the subtractive combination. + */ + i++; + repeatCount = 1; + subtractionEncountered = true; + lastSubtractedChar = currChar; + total += (nextValue - currValue); + } + else + { + /* For same characters, check for repetition. */ + if (currChar == nextChar) + { + repeatCount++; + if (repeatCount > 3) + return -1; + } + else + repeatCount = 1; + total += currValue; + } + } + /* Add the value of the last character. */ + else + total += currValue; + } + + return total; +} /* ---------- * Locale @@ -5933,9 +6074,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, if (IS_ROMAN(Np->Num)) { if (!Np->is_to_char) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"RN\" not supported for input"))); + { + int roman_result = roman_to_int(inout, input_len); + if (roman_result == -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid roman numeral"))); + else + { + Np->Num->pre = sprintf(number, "%d", roman_result); + Np->Num->post = 0; + return number; + } + } Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post = Np->Num->pre = Np->out_pre_spaces = Np->sign = 0; diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index 0898107ec3..ac023d1a24 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -2384,6 +2384,51 @@ SELECT to_number('123456', '99999V99'); 1234.560000000000000000 (1 row) +-- Test for correct conversion between numbers and Roman numerals +WITH rows AS + (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i) +SELECT + bool_and(to_number(roman, 'RN') = i) as valid +FROM rows; + valid +------- + t +(1 row) + +SELECT to_number('CvIiI', 'rn'); + to_number +----------- + 108 +(1 row) + +SELECT to_number('viv', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('DCCCD', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('XIXL', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('MCCM', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('MMMM', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('VV', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('IL', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('CM', 'MIRN'); +ERROR: "RN" is incompatible with other formats +SELECT to_number('CM', 'RNRN'); +ERROR: cannot use "RN" twice +SELECT to_number(' XIV ', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('MMXX ', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('M CC', 'RN'); +ERROR: invalid roman numeral +SELECT to_number('', 'RN'); +ERROR: invalid roman numeral +SELECT to_number(' ', 'RN'); +ERROR: invalid roman numeral RESET lc_numeric; -- -- Input syntax diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql index 9da12c6b9e..379a596c31 100644 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@ -1085,6 +1085,29 @@ SELECT to_number('1234.56','L99,999.99'); SELECT to_number('1,234.56','L99,999.99'); SELECT to_number('42nd', '99th'); SELECT to_number('123456', '99999V99'); + +-- Test for correct conversion between numbers and Roman numerals +WITH rows AS + (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i) +SELECT + bool_and(to_number(roman, 'RN') = i) as valid +FROM rows; + +SELECT to_number('CvIiI', 'rn'); +SELECT to_number('viv', 'RN'); +SELECT to_number('DCCCD', 'RN'); +SELECT to_number('XIXL', 'RN'); +SELECT to_number('MCCM', 'RN'); +SELECT to_number('MMMM', 'RN'); +SELECT to_number('VV', 'RN'); +SELECT to_number('IL', 'RN'); +SELECT to_number('CM', 'MIRN'); +SELECT to_number('CM', 'RNRN'); +SELECT to_number(' XIV ', 'RN'); +SELECT to_number('MMXX ', 'RN'); +SELECT to_number('M CC', 'RN'); +SELECT to_number('', 'RN'); +SELECT to_number(' ', 'RN'); RESET lc_numeric; -- -- 2.34.1