The warnings look good, except that this one:
$ printf '1.0\n0.9\n' | sort -s -k1,1g --debug
sort: numbers use ‘.’ as a decimal point in this locale
0.9
___
1.0
___
seems overkill if we're in the C locale.
Also, shouldn't similar diagnostics be generated if the field separator
is '-', or '+', or a digit in the current locale?
+ if (numeric_field_span)
+ {
+ char sep_string[2] = { 0, };
+ sep_string[0] = thousands_sep;
+ if ((tab == TAB_DEFAULT
+ && (isblank (to_uchar (thousands_sep))))
+ || tab == thousands_sep)
+ {
+ error (0, 0,
+ _("field separator %s is treated as a "
+ "group separator in numbers"),
+ quote (sep_string));
+ number_locale_warned = true;
+ }
+ }
This code brought it to my attention that the GNU 'sort' has had a
longstanding bug (in code that I wrote long ago - sorry!) in that
thousands_sep is either -1 or an unsigned char converted to int, and
this doesn't work in some unusual cases. I installed the attached patch
to fix that bug, and I vaguely suspect that it fixes similar bugs in GNU
'test' and GNU 'expr'. Good thing you brought it to my attention.
(Sorry, I'm too lazy and/or time-pressed and/or overconfident to write
test cases....)
Anyway, with that patch installed, TAB and THOUSANDS_SEP can both be
CHAR_MAX + 1 so the above code needs to be twiddled. Also, we can assume
C99. So, something like following (pardon Thunderbird's line wrap):
if (numeric_field_span
&& (tab == TAB_DEFAULT
? thousands_char != NON_CHAR && isblank (to_uchar (thousands_sep))
: tab == thousands_sep))
{
error (0, 0,
_("field separator %s is treated as a group separator in numbers"),
quote (((char []) {thousands_sep, 0})));
number_locale_warned = true;
}
with a similar replacement to the decimal_point code.
From 6cafb122fa214baa96a67fbbd2dab6c77a961e0a Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 10 Oct 2021 15:59:56 -0700
Subject: [PATCH] sort: fix unlikely bug when '\377' < 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* gl/lib/strintcmp.c (strintcmp): Don’t assume that the input
cannot contain ((char) -1), as this equals '\377' when char is
signed (assuming 8-bit char).
* src/sort.c (decimal_point): Now char, to make it clear
that it’s always in char range now.
(NON_CHAR): New constant.
(traverse_raw_number): Return char not unsigned char;
this is simpler and could be faster. All callers changed.
(main): Do not convert decimal_point and thousands_sep to
unsigned char, as this can mishandle comparisons on
machines where char is signed and the input data contains
((char) -1). Use NON_CHAR, not -1, as an out-of-range value for
thousands_sep.
---
gl/lib/strintcmp.c | 4 +++-
gl/lib/strnumcmp-in.h | 4 ++--
src/sort.c | 23 +++++++++++++----------
3 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/gl/lib/strintcmp.c b/gl/lib/strintcmp.c
index 0ecf6d02c..215158627 100644
--- a/gl/lib/strintcmp.c
+++ b/gl/lib/strintcmp.c
@@ -21,6 +21,8 @@
#include "strnumcmp-in.h"
+#include <limits.h>
+
/* Compare strings A and B as integers without explicitly converting
them to machine numbers, to avoid overflow problems and perhaps
improve performance. */
@@ -28,5 +30,5 @@
int
strintcmp (char const *a, char const *b)
{
- return numcompare (a, b, -1, -1);
+ return numcompare (a, b, CHAR_MAX + 1, CHAR_MAX + 1);
}
diff --git a/gl/lib/strnumcmp-in.h b/gl/lib/strnumcmp-in.h
index 7f2d4bbe6..f1b03689c 100644
--- a/gl/lib/strnumcmp-in.h
+++ b/gl/lib/strnumcmp-in.h
@@ -106,8 +106,8 @@ fraccompare (char const *a, char const *b, char decimal_point)
/* Compare strings A and B as numbers without explicitly converting
them to machine numbers, to avoid overflow problems and perhaps
improve performance. DECIMAL_POINT is the decimal point and
- THOUSANDS_SEP the thousands separator. A DECIMAL_POINT of -1
- causes comparisons to act as if there is no decimal point
+ THOUSANDS_SEP the thousands separator. A DECIMAL_POINT outside
+ 'char' range causes comparisons to act as if there is no decimal point
character, and likewise for THOUSANDS_SEP. */
static inline int _GL_ATTRIBUTE_PURE
diff --git a/src/sort.c b/src/sort.c
index 2979400e7..d32dcfb02 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -152,9 +152,9 @@ enum
};
/* The representation of the decimal point in the current locale. */
-static int decimal_point;
+static char decimal_point;
-/* Thousands separator; if -1, then there isn't one. */
+/* Thousands separator; if outside char range, there is no separator. */
static int thousands_sep;
/* Nonzero if the corresponding locales are hard. */
@@ -338,6 +338,9 @@ static bool reverse;
they were read if all keys compare equal. */
static bool stable;
+/* An int value outside char range. */
+enum { NON_CHAR = CHAR_MAX + 1 };
+
/* If TAB has this value, blanks separate fields. */
enum { TAB_DEFAULT = CHAR_MAX + 1 };
@@ -1900,12 +1903,12 @@ static char const unit_order[UCHAR_LIM] =
decimal_point chars only. Returns the highest digit found in the number,
or '\0' if no digit has been found. Upon return *number points at the
character that immediately follows after the given number. */
-static unsigned char
+static char
traverse_raw_number (char const **number)
{
char const *p = *number;
- unsigned char ch;
- unsigned char max_digit = '\0';
+ char ch;
+ char max_digit = '\0';
bool ends_with_thousands_sep = false;
/* Scan to end of number.
@@ -1953,7 +1956,7 @@ find_unit_order (char const *number)
{
bool minus_sign = (*number == '-');
char const *p = number + minus_sign;
- unsigned char max_digit = traverse_raw_number (&p);
+ char max_digit = traverse_raw_number (&p);
if ('0' < max_digit)
{
unsigned char ch = *p;
@@ -2334,7 +2337,7 @@ debug_key (struct line const *line, struct keyfield const *key)
else if (key->numeric || key->human_numeric)
{
char const *p = beg + (beg < lim && *beg == '-');
- unsigned char max_digit = traverse_raw_number (&p);
+ char max_digit = traverse_raw_number (&p);
if ('0' <= max_digit)
{
unsigned char ch = *p;
@@ -4229,14 +4232,14 @@ main (int argc, char **argv)
/* If the locale doesn't define a decimal point, or if the decimal
point is multibyte, use the C locale's decimal point. FIXME:
add support for multibyte decimal points. */
- decimal_point = to_uchar (locale->decimal_point[0]);
+ decimal_point = locale->decimal_point[0];
if (! decimal_point || locale->decimal_point[1])
decimal_point = '.';
/* FIXME: add support for multibyte thousands separators. */
- thousands_sep = to_uchar (*locale->thousands_sep);
+ thousands_sep = locale->thousands_sep[0];
if (! thousands_sep || locale->thousands_sep[1])
- thousands_sep = -1;
+ thousands_sep = NON_CHAR;
}
have_read_stdin = false;
--
2.30.2