On 09/10/2021 23:29, Paul Eggert wrote:
On 10/9/21 5:00 AM, Pádraig Brady wrote:
On 09/10/2021 04:48, Paul Eggert wrote:
'sort' could determine the group sizes from the locale, and
reject digit strings that are formatted improperly according to the
group-size rules. (Not that I plan to write the code to do that....)
Yes I agree that would be better, but not worth it I think
as there would still be ambiguity in what was a grouping char
and what was a field separator. Also that ambiguity would
now vary across locales.
I don't see the ambiguity problem. The field separator is used to
identify fields; once the fields are identified, the thousands
separator, decimal point, etc. contribute to numeric comparison in the
usual way. So it's OK (albeit confusing) for the field separator to be
'.' or ',' or '-' or '0' or any another character that could be part of
a number.
For example, with 'sort -t 0 -k 2,2n', the digit 0 is not part of the
numeric field that is compared, and there's no ambiguity about that even
though 0 is allowed in numbers. The same idea applies to 'sort -t , -k
2,2n'.
Indeed. I dropped -t, from my later examples and confused myself.
Attached is the proposed change to add appropriate warnings in this area.
Examples now diagnosed are:
$ printf '0,9\n1,a\n' | sort -nk1 --debug -t, -s
sort: key 1 is numeric and spans multiple fields
sort: field separator ‘,’ is treated as a group separator in numbers
1,a
_
0,9
___
$ printf '1,a\n0,9\n' | LC_ALL=fr_FR.utf8 sort -gk1 --debug -t, -s
sort: key 1 is numeric and spans multiple fields
sort: field separator ‘,’ is treated as a decimal point in numbers
0,9
___
1,a
__
$ printf '1.0\n0.9\n' | sort -s -k1,1g --debug
sort: numbers use ‘.’ as a decimal point in this locale
0.9
___
1.0
___
$ printf '1.0\n0.9\n' | LC_ALL=fr_FR.utf8 sort -s -k1,1g --debug
sort: numbers use ‘,’ as a decimal point in this locale
0.9
_
1.0
_
cheers,
Pádraig
>From 06410ad77fcd80010859586cc068d8931bcf74e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sun, 10 Oct 2021 18:35:59 +0100
Subject: [PATCH] sort: --debug: add warnings about radix and grouping chars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
New warnings are added related to the handling
of thousands grouping characters, and decimal points.
Examples now diagnosed are:
$ printf '0,9\n1,a\n' | sort -nk1 --debug -t, -s
sort: key 1 is numeric and spans multiple fields
sort: field separator ‘,’ is treated as a group separator in numbers
1,a
_
0,9
___
$ printf '1,a\n0,9\n' | LC_ALL=fr_FR.utf8 sort -gk1 --debug -t, -s
sort: key 1 is numeric and spans multiple fields
sort: field separator ‘,’ is treated as a decimal point in numbers
0,9
___
1,a
__
$ printf '1.0\n0.9\n' | sort -s -k1,1g --debug
sort: numbers use ‘.’ as a decimal point in this locale
0.9
___
1.0
___
$ printf '1.0\n0.9\n' | LC_ALL=fr_FR.utf8 sort -s -k1,1g --debug
sort: numbers use ‘,’ as a decimal point in this locale
0.9
_
1.0
_
* src/sort.c (key_warnings): Add the warnings above.
* tests/misc/sort-debug-warn.sh: Add test cases.
Addresses https://bugs.gnu.org/51011
---
src/sort.c | 62 +++++++++++++++++++++++++++++++++--
tests/misc/sort-debug-warn.sh | 33 ++++++++++++++++++-
2 files changed, 92 insertions(+), 3 deletions(-)
diff --git a/src/sort.c b/src/sort.c
index 2979400e7..17bae2a57 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2422,9 +2422,15 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
struct keyfield const *key;
struct keyfield ugkey = *gkey;
unsigned long keynum = 1;
+ bool numeric_field = false;
+ bool numeric_field_span = false;
+ bool general_numeric_field_span = false;
for (key = keylist; key; key = key->next, keynum++)
{
+ if (key_numeric (key))
+ numeric_field = true;
+
if (key->traditional_used)
{
size_t sword = key->sword;
@@ -2479,8 +2485,14 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
if (!sword)
sword++;
if (!eword || sword < eword)
- error (0, 0, _("key %lu is numeric and spans multiple fields"),
- keynum);
+ {
+ error (0, 0, _("key %lu is numeric and spans multiple fields"),
+ keynum);
+ if (key->general_numeric)
+ general_numeric_field_span = true;
+ else
+ numeric_field_span = true;
+ }
}
/* Flag global options not copied or specified in any key. */
@@ -2499,6 +2511,52 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
ugkey.reverse &= !key->reverse;
}
+ /* Explicitly warn if field delimiters in this locale
+ don't constrain numbers. */
+ bool number_locale_warned = false;
+ if (numeric_field_span)
+ {
+ char sep_string[2] = { 0, };
+ sep_string[0] = thousands_sep;
+ if ((tab == TAB_DEFAULT
+ && (isblank (to_uchar (thousands_sep))))
+ || tab == thousands_sep)
+ {
+ error (0, 0,
+ _("field separator %s is treated as a "
+ "group separator in numbers"),
+ quote (sep_string));
+ number_locale_warned = true;
+ }
+ }
+ if (numeric_field_span || general_numeric_field_span)
+ {
+ char sep_string[2] = { 0, };
+ sep_string[0] = decimal_point;
+ if ((tab == TAB_DEFAULT && isblank (to_uchar (decimal_point)))
+ || tab == decimal_point)
+ {
+ error (0, 0,
+ _("field separator %s is treated as a "
+ "decimal point in numbers"),
+ quote (sep_string));
+ number_locale_warned = true;
+ }
+ }
+
+ /* Explicitly indicate the decimal point used in this locale,
+ as it suggests that robust scripts need to consider
+ setting the locale when comparing numbers. */
+ if (numeric_field && ! number_locale_warned)
+ {
+ char radix_string[2] = { 0, };
+ radix_string[0] = decimal_point;
+ error (0, 0,
+ _("numbers use %s as a decimal point in this locale"),
+ quote (radix_string));
+
+ }
+
/* Warn about ignored global options flagged above.
This clears all flags if UGKEY is the only one in the list. */
if (!default_key_compare (&ugkey)
diff --git a/tests/misc/sort-debug-warn.sh b/tests/misc/sort-debug-warn.sh
index bd1c4d8a1..8c6113934 100755
--- a/tests/misc/sort-debug-warn.sh
+++ b/tests/misc/sort-debug-warn.sh
@@ -26,30 +26,39 @@ sort: key 1 has zero width and will be ignored
2
sort: text ordering performed using simple byte comparison
sort: key 1 has zero width and will be ignored
+sort: numbers use '.' as a decimal point in this locale
3
sort: text ordering performed using simple byte comparison
sort: key 1 is numeric and spans multiple fields
+sort: numbers use '.' as a decimal point in this locale
4
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
sort: options '-bghMRrV' are ignored
5
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
sort: options '-bghMRV' are ignored
sort: option '-r' only applies to last-resort comparison
6
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
sort: option '-r' only applies to last-resort comparison
7
sort: text ordering performed using simple byte comparison
sort: leading blanks are significant in key 2; consider also specifying 'b'
+sort: numbers use '.' as a decimal point in this locale
sort: options '-bg' are ignored
8
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
9
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
sort: option '-b' is ignored
10
sort: text ordering performed using simple byte comparison
+sort: numbers use '.' as a decimal point in this locale
11
sort: text ordering performed using simple byte comparison
sort: leading blanks are significant in key 1; consider also specifying 'b'
@@ -106,7 +115,6 @@ echo 16 >> out
sort -rM --debug /dev/null 2>>out #no warning
echo 17 >> out
sort -rM -k1,1 --debug /dev/null 2>>out #no warning
-
compare exp out || fail=1
@@ -130,6 +138,7 @@ sort: text ordering performed using simple byte comparison
sort: key 1 is numeric and spans multiple fields
sort: obsolescent key '+2 -1' used; consider '-k 3,1' instead
sort: key 2 has zero width and will be ignored
+sort: numbers use '.' as a decimal point in this locale
sort: option '-b' is ignored
sort: option '-r' only applies to last-resort comparison
EOF
@@ -138,4 +147,26 @@ sort --debug -rb -k2n +2.2 -1b /dev/null 2>out
compare exp out || fail=1
+
+# check decimal point and grouping character warnings
+cat <<\EOF > exp
+sort: text ordering performed using simple byte comparison
+sort: key 1 is numeric and spans multiple fields
+sort: field separator ',' is treated as a group separator in numbers
+EOF
+
+if test $(printf '0,9\n0,8\n' | sort -ns | tail -n1) = '0,9'; then
+ # thousands_sep == ,
+ sort -nk1 -t, --debug /dev/null 2>out
+ compare exp out || fail=1
+fi
+
+cat <<\EOF > exp
+sort: text ordering performed using simple byte comparison
+sort: key 1 is numeric and spans multiple fields
+sort: field separator '.' is treated as a decimal point in numbers
+EOF
+sort -nk1 -t. --debug /dev/null 2>out
+compare exp out || fail=1
+
Exit $fail
--
2.26.2