On 17/12/2019 17:25, Roy Smith wrote:
I stopped short of actually building uniq.c from source (bootstrap, 
prerequisites, ...), but looking at the code, it looks like the call chain is:

different()
xmemcoll()
memcoll()
strcoll()

so I tried a little test at the strcoll() level:

#include <stdio.h>
#include <unistd.h>
#include <string.h>

int
main (int argc, char **argv)
{
   unsigned char null[] = {

     0342, 0201, 0277, 0341, 0265, 0230, 0313, 0241, 0313, 0241, 0
   };
   unsigned char iraq[] = {
     0334, 0245, 0334, 0235, 0334, 0252, 0334, 0220, 0334, 0251, 0};

   printf("%s\n", null);
   printf("%s\n", iraq);

   int m = strcoll(null, iraq);
   printf("m = %d\n", m);
}

That correctly says the strings are different:

$ LANG=en_US.UTF-8 ./a.out
ⁿᵘˡˡ
ܥܝܪܐܩ
m = 6






On Dec 16, 2019, at 7:46 PM, Roy Smith <r...@panix.com> wrote:

Yup, this does depend on the locale.  In my original example, I had 
LANG=en_US.UTF-8.  Setting it to C.UTF-8 gets me the right result:

$ LANG=C.UTF-8 uniq -c x
       1 "ⁿᵘˡˡ"
       1 "ܥܝܪܐܩ"


But, that doesn't fully explain what's going on.  I find it difficult to believe that there's 
any collation sequence in the world where those two strings should compare the same.  I've 
been playing around with the ICU string compare demo 
<http://demo.icu-project.org/icu-bin/locexp?_=en_US&d_=en&x=col> and can't 
reproduce this there.  Possibly I just haven't hit upon the right combination of options to 
set, but I think it's far-fetched that there's any such combination for which those two 
strings comparing equal is legitimate.

I think you ran your test on a newer glibc.
Testing on older glibc-2.22 I see the issue with strcoll() returning 0 for the 
above strings,
while it returns an expected difference on glibc-2.30 at least.

There are a few things to reason about with removing strcoll(), namely:
  buggy strcoll implementations
  inconsistent unicode normalization
  mismatched locale settings and data
  handling of characters ignored in collation order

tl;dr is that strcoll() should be removed for all these reasons,
and I've added a test for each of the 4 cases above in the attached patch,
which I'll push later.

Marking this as done.

thanks,
Pádraig
>From 439a0f7fe0b89ebc371a80b30b07e3fd8b0c1b4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sun, 23 Feb 2020 13:20:08 +0000
Subject: [PATCH] uniq: avoid strcoll() to improve performance and consistency

strcoll() is only significant to uniq(1) if it returns 0,
and it generally only does so with buggy locales or mismatched
locales and data.  Some systems may have strcoll()
return 0 for equivalent normalized unicode forms,
but for consistency across platforms strcoll() is avoided.
The various cases are defined in the new test.
This is consistent with newer POSIX standards as discussed at:
https://www.austingroupbugs.net/view.php?id=963

* src/uniq.c: s/xstrcoll/memcmp/.
* tests/local.mk: Reference the new test.
* tests/misc/uniq-collate.sh: Add a new test.
* NEWS: Mention the change in behavior.
Fixes https://bugs.gnu.org/38627
---
 NEWS                       |  3 ++
 src/uniq.c                 | 13 +-------
 tests/local.mk             |  1 +
 tests/misc/uniq-collate.sh | 64 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 12 deletions(-)
 create mode 100755 tests/misc/uniq-collate.sh

diff --git a/NEWS b/NEWS
index 8a349634e..6afb9cb6d 100644
--- a/NEWS
+++ b/NEWS
@@ -65,6 +65,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   [The old behavior was introduced in sh-utils 2.0.15 ca. 1999, predating
   coreutils package.]
 
+  uniq no longer uses strcoll() to determine string equivalence,
+  and so will operate more efficiently and consistently.
+
 ** New Features
 
   ls now supports the --time=birth option to display and sort by
diff --git a/src/uniq.c b/src/uniq.c
index 0fcf50a16..6608b6c62 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -30,7 +30,6 @@
 #include "hard-locale.h"
 #include "posixver.h"
 #include "stdio--.h"
-#include "xmemcoll.h"
 #include "xstrtol.h"
 #include "memcasecmp.h"
 #include "quote.h"
@@ -52,9 +51,6 @@
     }						\
   while (0)
 
-/* True if the LC_COLLATE locale is hard.  */
-static bool hard_LC_COLLATE;
-
 /* Number of fields to skip on each line when doing comparisons. */
 static size_t skip_fields;
 
@@ -220,7 +216,6 @@ characters.  Fields are skipped before chars.\n\
 \n\
 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
-Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
 "), stdout);
       emit_ancillary_info (PROGRAM_NAME);
     }
@@ -293,12 +288,7 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
     newlen = check_chars;
 
   if (ignore_case)
-    {
-      /* FIXME: This should invoke strcoll somehow.  */
-      return oldlen != newlen || memcasecmp (old, new, oldlen);
-    }
-  else if (hard_LC_COLLATE)
-    return xmemcoll (old, oldlen, new, newlen) != 0;
+    return oldlen != newlen || memcasecmp (old, new, oldlen);
   else
     return oldlen != newlen || memcmp (old, new, oldlen);
 }
@@ -501,7 +491,6 @@ main (int argc, char **argv)
   setlocale (LC_ALL, "");
   bindtextdomain (PACKAGE, LOCALEDIR);
   textdomain (PACKAGE);
-  hard_LC_COLLATE = hard_locale (LC_COLLATE);
 
   atexit (close_stdout);
 
diff --git a/tests/local.mk b/tests/local.mk
index bbcb9d413..0aabdaacc 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -438,6 +438,7 @@ all_tests =					\
   tests/misc/unexpand.pl			\
   tests/misc/uniq.pl				\
   tests/misc/uniq-perf.sh			\
+  tests/misc/uniq-collate.sh			\
   tests/misc/xattr.sh				\
   tests/misc/yes.sh				\
   tests/tail-2/wait.sh				\
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh
new file mode 100755
index 000000000..6767848a4
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll()
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+gen_input()
+{
+  env LC_ALL=$LOCALE_FR_UTF8 printf \
+    "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 comparing the following strings
+# which was fixed somewhere between glibc-2.22 and glibc-2.30
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# normalization in strcoll is inconsistent across platforms.
+# glibc based systems at least do _not_ normalize in strcoll,
+# while cygwin systems for example may do so.
+# á composed, decomposed
+gen_input '\u00E1\na\u0301\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with hangul
+gen_input '\uAC01\n\u1100\u1161\u11A8' > in || framework_failure_
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note if running in the wrong locale, strcoll may
+# indicate the strings match when they don't
+# I.e., cjk and hangul will now work even if
+# uniq is running in the wong locale
+# hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following on glibc-2.30 at least
+# does not print a single item as expected,
+# but testing here for illustration
+gen_input ',a\n.a\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail
-- 
2.24.1

Reply via email to