The warnings look good, except that this one:

   $ printf '1.0\n0.9\n' | sort -s -k1,1g --debug
   sort: numbers use ‘.’ as a decimal point in this locale
   0.9
   ___
   1.0
   ___

seems overkill if we're in the C locale.

Also, shouldn't similar diagnostics be generated if the field separator is '-', or '+', or a digit in the current locale?


+  if (numeric_field_span)
+    {
+      char sep_string[2] = { 0, };
+      sep_string[0] = thousands_sep;
+      if ((tab == TAB_DEFAULT
+           && (isblank (to_uchar (thousands_sep))))
+          || tab == thousands_sep)
+        {
+          error (0, 0,
+                 _("field separator %s is treated as a "
+                   "group separator in numbers"),
+                 quote (sep_string));
+          number_locale_warned = true;
+        }
+    }

This code brought it to my attention that the GNU 'sort' has had a longstanding bug (in code that I wrote long ago - sorry!) in that thousands_sep is either -1 or an unsigned char converted to int, and this doesn't work in some unusual cases. I installed the attached patch to fix that bug, and I vaguely suspect that it fixes similar bugs in GNU 'test' and GNU 'expr'. Good thing you brought it to my attention. (Sorry, I'm too lazy and/or time-pressed and/or overconfident to write test cases....)

Anyway, with that patch installed, TAB and THOUSANDS_SEP can both be CHAR_MAX + 1 so the above code needs to be twiddled. Also, we can assume C99. So, something like following (pardon Thunderbird's line wrap):

  if (numeric_field_span
      && (tab == TAB_DEFAULT
          ? thousands_char != NON_CHAR && isblank (to_uchar (thousands_sep))
          : tab == thousands_sep))
    {
      error (0, 0,
             _("field separator %s is treated as a group separator in numbers"),
             quote (((char []) {thousands_sep, 0})));
      number_locale_warned = true;
    }

with a similar replacement to the decimal_point code.
From 6cafb122fa214baa96a67fbbd2dab6c77a961e0a Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 10 Oct 2021 15:59:56 -0700
Subject: [PATCH] sort: fix unlikely bug when '\377' < 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* gl/lib/strintcmp.c (strintcmp): Don’t assume that the input
cannot contain ((char) -1), as this equals '\377' when char is
signed (assuming 8-bit char).
* src/sort.c (decimal_point): Now char, to make it clear
that it’s always in char range now.
(NON_CHAR): New constant.
(traverse_raw_number): Return char not unsigned char;
this is simpler and could be faster.  All callers changed.
(main): Do not convert decimal_point and thousands_sep to
unsigned char, as this can mishandle comparisons on
machines where char is signed and the input data contains
((char) -1).  Use NON_CHAR, not -1, as an out-of-range value for
thousands_sep.
---
 gl/lib/strintcmp.c    |  4 +++-
 gl/lib/strnumcmp-in.h |  4 ++--
 src/sort.c            | 23 +++++++++++++----------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/gl/lib/strintcmp.c b/gl/lib/strintcmp.c
index 0ecf6d02c..215158627 100644
--- a/gl/lib/strintcmp.c
+++ b/gl/lib/strintcmp.c
@@ -21,6 +21,8 @@
 
 #include "strnumcmp-in.h"
 
+#include <limits.h>
+
 /* Compare strings A and B as integers without explicitly converting
    them to machine numbers, to avoid overflow problems and perhaps
    improve performance.  */
@@ -28,5 +30,5 @@
 int
 strintcmp (char const *a, char const *b)
 {
-  return numcompare (a, b, -1, -1);
+  return numcompare (a, b, CHAR_MAX + 1, CHAR_MAX + 1);
 }
diff --git a/gl/lib/strnumcmp-in.h b/gl/lib/strnumcmp-in.h
index 7f2d4bbe6..f1b03689c 100644
--- a/gl/lib/strnumcmp-in.h
+++ b/gl/lib/strnumcmp-in.h
@@ -106,8 +106,8 @@ fraccompare (char const *a, char const *b, char decimal_point)
 /* Compare strings A and B as numbers without explicitly converting
    them to machine numbers, to avoid overflow problems and perhaps
    improve performance.  DECIMAL_POINT is the decimal point and
-   THOUSANDS_SEP the thousands separator.  A DECIMAL_POINT of -1
-   causes comparisons to act as if there is no decimal point
+   THOUSANDS_SEP the thousands separator.  A DECIMAL_POINT outside
+   'char' range causes comparisons to act as if there is no decimal point
    character, and likewise for THOUSANDS_SEP.  */
 
 static inline int _GL_ATTRIBUTE_PURE
diff --git a/src/sort.c b/src/sort.c
index 2979400e7..d32dcfb02 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -152,9 +152,9 @@ enum
   };
 
 /* The representation of the decimal point in the current locale.  */
-static int decimal_point;
+static char decimal_point;
 
-/* Thousands separator; if -1, then there isn't one.  */
+/* Thousands separator; if outside char range, there is no separator.  */
 static int thousands_sep;
 
 /* Nonzero if the corresponding locales are hard.  */
@@ -338,6 +338,9 @@ static bool reverse;
    they were read if all keys compare equal.  */
 static bool stable;
 
+/* An int value outside char range.  */
+enum { NON_CHAR = CHAR_MAX + 1 };
+
 /* If TAB has this value, blanks separate fields.  */
 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 
@@ -1900,12 +1903,12 @@ static char const unit_order[UCHAR_LIM] =
    decimal_point chars only.  Returns the highest digit found in the number,
    or '\0' if no digit has been found.  Upon return *number points at the
    character that immediately follows after the given number.  */
-static unsigned char
+static char
 traverse_raw_number (char const **number)
 {
   char const *p = *number;
-  unsigned char ch;
-  unsigned char max_digit = '\0';
+  char ch;
+  char max_digit = '\0';
   bool ends_with_thousands_sep = false;
 
   /* Scan to end of number.
@@ -1953,7 +1956,7 @@ find_unit_order (char const *number)
 {
   bool minus_sign = (*number == '-');
   char const *p = number + minus_sign;
-  unsigned char max_digit = traverse_raw_number (&p);
+  char max_digit = traverse_raw_number (&p);
   if ('0' < max_digit)
     {
       unsigned char ch = *p;
@@ -2334,7 +2337,7 @@ debug_key (struct line const *line, struct keyfield const *key)
           else if (key->numeric || key->human_numeric)
             {
               char const *p = beg + (beg < lim && *beg == '-');
-              unsigned char max_digit = traverse_raw_number (&p);
+              char max_digit = traverse_raw_number (&p);
               if ('0' <= max_digit)
                 {
                   unsigned char ch = *p;
@@ -4229,14 +4232,14 @@ main (int argc, char **argv)
     /* If the locale doesn't define a decimal point, or if the decimal
        point is multibyte, use the C locale's decimal point.  FIXME:
        add support for multibyte decimal points.  */
-    decimal_point = to_uchar (locale->decimal_point[0]);
+    decimal_point = locale->decimal_point[0];
     if (! decimal_point || locale->decimal_point[1])
       decimal_point = '.';
 
     /* FIXME: add support for multibyte thousands separators.  */
-    thousands_sep = to_uchar (*locale->thousands_sep);
+    thousands_sep = locale->thousands_sep[0];
     if (! thousands_sep || locale->thousands_sep[1])
-      thousands_sep = -1;
+      thousands_sep = NON_CHAR;
   }
 
   have_read_stdin = false;
-- 
2.30.2

Reply via email to