Re: [PATCH] Add roman support for to_number function

Tom Lane Fri, 17 Jan 2025 16:28:08 -0800

Hunaid Sohail <hunaidp...@gmail.com> writes:
> I’ve attached a new patch that addresses comments 2, 3, and 4 from Tomas.


I'm still quite unhappy that this doesn't tolerate trailing
whitespace.  That's not how other format patterns work, and it makes
it impossible to round-trip the output of to_char(..., 'RN').
I think the core of the problem is that you tried to cram the logic
in where the existing "not implemented" error is thrown.  But on
inspection there is nothing sane about that entire "Roman correction"
stanza -- what it does is either useless (zeroing already-zeroed
fields) or in the wrong place (if we don't want to allow other flags
with IS_ROMAN, that should be done via an error in NUMDesc_prepare,
like other similar cases).  In the attached I moved the roman_to_int
call into NUM_processor's main loop so it's more like other format
patterns, and fixed it to not eat any more characters than it should.
This might allow putting back some format-combination capabilities,
but other than the whitespace angle I figure we can leave that for
people to request it.

                        regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 47370e581a..5678e7621a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8669,8 +8669,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         <entry>plus/minus sign in specified position</entry>
        </row>
        <row>
-        <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry><literal>RN</literal> or <literal>rn</literal></entry>
+        <entry>Roman numeral (values between 1 and 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8798,6 +8798,19 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+     <listitem>
+      <para>
+       In <function>to_number()</function>, the <literal>RN</literal>
+       pattern converts Roman numerals (in standard form) to numbers.
+       Input is case-insensitive, so <literal>RN</literal>
+       and <literal>rn</literal> are equivalent.  <literal>RN</literal>
+       cannot be used in combination with any other formatting patterns or
+       modifiers except <literal>FM</literal>, which is applicable only
+       in <function>to_char()</function> and is ignored
+       in <function>to_number()</function>.
+      </para>
+     </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 3960235e14..f885ed0448 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -257,13 +256,39 @@ static const char *const rm_months_lower[] =
 {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
 
 /* ----------
- * Roman numbers
+ * Roman numerals
  * ----------
  */
 static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL};
 static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL};
 static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL};
 
+/*
+ * MACRO: Check if the current and next characters form a valid subtraction
+ * combination for roman numerals.
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/*
+ * MACRO: Roman numeral value, or 0 if character isn't a roman numeral.
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	 (r) == 'V' ? 5 : \
+	 (r) == 'X' ? 10 : \
+	 (r) == 'L' ? 50 : \
+	 (r) == 'C' ? 100 : \
+	 (r) == 'D' ? 500 : \
+	 (r) == 'M' ? 1000 : 0)
+
+/*
+ * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 characters).
+ */
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Ordinal postfixes
  * ----------
@@ -1028,6 +1053,15 @@ typedef struct NUMProc
 #define DCH_TIMED	0x02
 #define DCH_ZONED	0x04
 
+/*
+ * These macros are used in NUM_processor() and its subsidiary routines.
+ * OVERLOAD_TEST: true if we've reached end of input string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
+ */
+#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
+#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
+
+
 /* ----------
  * Functions
  * ----------
@@ -1075,6 +1109,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int	roman_to_int(NUMProc *Np, int input_len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1285,6 +1320,10 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -1316,6 +1355,13 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 			num->flag |= NUM_F_EEEE;
 			break;
 	}
+
+	if (IS_ROMAN(num) &&
+		(num->flag & ~(NUM_F_ROMAN | NUM_F_FILLMODE)) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("\"RN\" is incompatible with other formats"),
+				 errdetail("\"RN\" may only be used together with \"FM\".")));
 }
 
 /* ----------
@@ -4956,7 +5002,7 @@ int_to_roman(int number)
 			   *result,
 				numstr[12];
 
-	result = (char *) palloc(16);
+	result = (char *) palloc(MAX_ROMAN_LEN + 1);
 	*result = '\0';
 
 	/*
@@ -4966,7 +5012,7 @@ int_to_roman(int number)
 	 */
 	if (number > 3999 || number < 1)
 	{
-		fill_str(result, '#', 15);
+		fill_str(result, '#', MAX_ROMAN_LEN);
 		return result;
 	}
 
@@ -5000,6 +5046,136 @@ int_to_roman(int number)
 	return result;
 }
 
+/*
+ * Convert a roman numeral (standard form) to an integer.
+ * Result is an integer between 1 and 3999.
+ * Np->inout_p is advanced past the characters consumed.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(NUMProc *Np, int input_len)
+{
+	int			result = 0;
+	int			len;
+	char		romanChars[MAX_ROMAN_LEN];
+	int			romanValues[MAX_ROMAN_LEN];
+	int			repeatCount = 1;
+	int			vCount = 0,
+				lCount = 0,
+				dCount = 0;
+	bool		subtractionEncountered = false;
+	int			lastSubtractedValue = 0;
+
+	/*
+	 * Collect and decode valid roman numerals, consuming at most
+	 * MAX_ROMAN_LEN characters.  We do this in a separate loop to avoid
+	 * repeated decoding and because the main loop needs to know when it's at
+	 * the last numeral.
+	 */
+	for (len = 0; len < MAX_ROMAN_LEN && !OVERLOAD_TEST; len++)
+	{
+		char		currChar = pg_ascii_toupper(*Np->inout_p);
+		int			currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			break;				/* Not a valid roman numeral. */
+		romanChars[len] = currChar;
+		romanValues[len] = currValue;
+		Np->inout_p++;
+	}
+
+	if (len == 0)
+		return -1;				/* No valid roman numerals. */
+
+	/* Check for valid combinations and compute the represented value. */
+	for (int i = 0; i < len; i++)
+	{
+		char		currChar = romanChars[i];
+		int			currValue = romanValues[i];
+
+		/*
+		 * Ensure no character greater than or equal to the subtracted
+		 * character appears after a subtraction.
+		 */
+		if (subtractionEncountered && currValue >= lastSubtractedValue)
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V')
+			vCount++;
+		if (currChar == 'L')
+			lCount++;
+		if (currChar == 'D')
+			dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
+
+		if (i < len - 1)
+		{
+			char		nextChar = romanChars[i + 1];
+			int			nextValue = romanValues[i + 1];
+
+			/*
+			 * If the current value is less than the next value, handle
+			 * subtraction. Verify valid subtractive combinations and update
+			 * the result accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V')
+					vCount++;
+				if (nextChar == 'L')
+					lCount++;
+				if (nextChar == 'D')
+					dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/*
+				 * For cases where same character is repeated with subtraction
+				 * (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/*
+				 * Skip the next character as it is part of the subtractive
+				 * combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedValue = currValue;
+				result += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				result += currValue;
+			}
+		}
+		else
+		{
+			/* Add the value of the last character. */
+			result += currValue;
+		}
+	}
+
+	return result;
+}
 
 
 /* ----------
@@ -5112,14 +5288,6 @@ get_last_relevant_decnum(char *num)
 	return result;
 }
 
-/*
- * These macros are used in NUM_processor() and its subsidiary routines.
- * OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s bytes remain in string
- */
-#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
-#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
-
 /* ----------
  * Number extraction for TO_NUMBER()
  * ----------
@@ -5576,29 +5744,6 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 		return strcpy(inout, number);
 	}
 
-	/*
-	 * Roman correction
-	 */
-	if (IS_ROMAN(Np->Num))
-	{
-		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
-
-		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
-			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
-
-		if (IS_FILLMODE(Np->Num))
-		{
-			Np->Num->flag = 0;
-			Np->Num->flag |= NUM_F_FILLMODE;
-		}
-		else
-			Np->Num->flag = 0;
-		Np->Num->flag |= NUM_F_ROMAN;
-	}
-
 	/*
 	 * Sign
 	 */
@@ -5849,28 +5994,34 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 					break;
 
 				case NUM_RN:
-					if (IS_FILLMODE(Np->Num))
-					{
-						strcpy(Np->inout_p, Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					else
-					{
-						sprintf(Np->inout_p, "%15s", Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					break;
-
 				case NUM_rn:
-					if (IS_FILLMODE(Np->Num))
+					if (Np->is_to_char)
 					{
-						strcpy(Np->inout_p, asc_tolower_z(Np->number_p));
+						const char *number_p;
+
+						if (n->key->id == NUM_rn)
+							number_p = asc_tolower_z(Np->number_p);
+						else
+							number_p = Np->number_p;
+						if (IS_FILLMODE(Np->Num))
+							strcpy(Np->inout_p, number_p);
+						else
+							sprintf(Np->inout_p, "%15s", number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					else
 					{
-						sprintf(Np->inout_p, "%15s", asc_tolower_z(Np->number_p));
-						Np->inout_p += strlen(Np->inout_p) - 1;
+						int			roman_result = roman_to_int(Np, input_len);
+
+						if (roman_result < 0)
+							ereport(ERROR,
+									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+									 errmsg("invalid Roman numeral")));
+						Np->Num->pre = sprintf(Np->number_p, "%d",
+											   roman_result);
+						Np->number_p += Np->Num->pre;
+						Np->Num->post = 0;
+						continue;	/* roman_to_int ate all the chars */
 					}
 					break;
 
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 0898107ec3..dc2b930ee4 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,66 @@ SELECT to_number('123456', '99999V99');
  1234.560000000000000000
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('MMXX  ', 'RN');
+ to_number 
+-----------
+      2020
+(1 row)
+
+SELECT to_number('  XIV  ', '  RN');
+ to_number 
+-----------
+        14
+(1 row)
+
+SELECT to_number('M CC', 'RN');
+ to_number 
+-----------
+      1000
+(1 row)
+
+-- error cases
+SELECT to_number('viv', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('VV', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('IL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+DETAIL:  "RN" may only be used together with "FM".
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('', 'RN');
+ERROR:  invalid input syntax for type numeric: " "
+SELECT to_number(' ', 'RN');
+ERROR:  invalid Roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 9da12c6b9e..8b7864d54a 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,32 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('MMXX  ', 'RN');
+SELECT to_number('  XIV  ', '  RN');
+SELECT to_number('M CC', 'RN');
+-- error cases
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('VV', 'RN');
+SELECT to_number('IL', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
+
 RESET lc_numeric;
 
 --

Re: [PATCH] Add roman support for to_number function

Reply via email to