On Wed, Mar 21, 2012 at 9:59 AM, Eric Blake <ebl...@redhat.com> wrote:
>
> Missing documentation in NEWS and doc/coreutils.texi.
>
> Based on just the diffstat, this patch is non-trivial, so we'd need
> copyright assignment to the FSF from both you and from Alex Shinn, as
> co-authors of this material.  Is that something you are interested in
> pursuing?  I'm refraining from reviewing the patch itself, in case we
> need to come up with a clean-room reimplementation.

Apologies for creating this mess on the mailing list -- the patch you
glanced at was the first of two that I sent (see bug #6366 for the
second). I'm including a third here. Compared to the first, I've made
the following changes:
- Added documentation (both NEWS and doc/coreutils.texi).
- Added tests.
- Reimplemented the numeric comparison functionality by copying code
from sort.c.
Due to the reimplementation, just about all of Alex's patch has been
replaced (except for the bits where there's obviously one right way to
do it, e.g. parsing the new options). Also, since I copied most of the
code from sort.c, there's very little truly original code in the whole
patch (mostly old code blocks in new places). If a copyright
assignment is needed, I'd be happy to provide that.

Drew
From 37eaa8e579f31c163cfa15e6549e1ba9000106cb Mon Sep 17 00:00:00 2001
From: Drew Frank <drewfr...@gmail.com>
Date: Thu, 1 Mar 2012 14:24:49 -0800
Subject: [PATCH] join: add numeric sort feature (-g and -n flags).

* src/join.c: add flags and implement numeric comparison features.
* tests/misc/join: add three tests for numerically sorted key fields.
* doc/coreutils.texi: document new functionality.
* NEWS: document new functionality.
---
 NEWS               |    4 ++
 doc/coreutils.texi |   16 ++++++
 gnulib             |    2 +-
 src/join.c         |  157 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/misc/join    |    8 +++
 5 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index 5b53eb8..0b036fb 100644
--- a/NEWS
+++ b/NEWS
@@ -38,6 +38,10 @@ GNU coreutils NEWS                                    -*- 
outline -*-
   dirname now supports more than one argument.  Also the complementary
   -z option was added to delimit output items with the NUL character.
 
+  join now supports the -n and -g options, which allow joining on
+  numerically sorted columns.  Data should be ordered according to the
+  corresponding sort option.
+
 ** Bug fixes
 
   du --one-file-system (-x) would ignore any non-directory specified on
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 10be715..017118f 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5798,6 +5798,14 @@ specified format.  The header lines will not be checked 
for ordering even if
 @option{--check-order} is specified.  Also if the header lines from each file
 do not match, the heading fields from the first file will be used.
 
+@item -g
+@itemx --general-numeric-sort
+@opindex -g
+@opindex --general-numeric-sort
+Compute general numerical value when comparing keys.
+With this option, the lines of the input files must be ordered in the same way.
+Use @samp{sort -g} to produce this ordering.
+
 @item -i
 @itemx --ignore-case
 @opindex -i
@@ -5806,6 +5814,14 @@ Ignore differences in case when comparing keys.
 With this option, the lines of the input files must be ordered in the same way.
 Use @samp{sort -f} to produce this ordering.
 
+@item -n
+@itemx --numeric-sort
+@opindex -n
+@opindex --numeric-sort
+Compute string numerical value when comparing keys.
+With this option, the lines of the input files must be ordered in the same way.
+Use @samp{sort -n} to produce this ordering.
+
 @item -1 @var{field}
 @opindex -1
 Join on field @var{field} (a positive integer) of file 1.
diff --git a/gnulib b/gnulib
index 4730c3e..cbc11ff 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit 4730c3e3692b344effb72d46b3ff92db0bdb797a
+Subproject commit cbc11ff0020eb9c04caea6b3e7dc4e4281dff1f9
diff --git a/src/join.c b/src/join.c
index b92c1f8..95dac47 100644
--- a/src/join.c
+++ b/src/join.c
@@ -30,6 +30,7 @@
 #include "memcasecmp.h"
 #include "quote.h"
 #include "stdio--.h"
+#include "strnumcmp.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
 #include "argmatch.h"
@@ -47,6 +48,16 @@
   b = tmp; \
 } while (0);
 
+#define UCHAR_LIM (UCHAR_MAX + 1)
+
+#if HAVE_C99_STRTOLD
+# define long_double long double
+#else
+# define long_double double
+# undef strtold
+# define strtold strtod
+#endif
+
 /* An element of the list identifying which fields to print for each
    output line.  */
 struct outlist
@@ -100,6 +111,12 @@ static char *g_names[2];
    want to overwrite the previous buffer before we check order. */
 static struct line *spareline[2] = {NULL, NULL};
 
+/* The representation of the decimal point in the current locale.  */
+static int decimal_point;
+
+/* Thousands separator; if -1, then there isn't one.  */
+static int thousands_sep;
+
 /* True if the LC_COLLATE locale is hard.  */
 static bool hard_LC_COLLATE;
 
@@ -140,6 +157,9 @@ static struct outlist *outlist_end = &outlist_head;
    tab character whose value (when cast to unsigned char) equals TAB.  */
 static int tab = -1;
 
+/* Table of blanks.  */
+static bool blanks[UCHAR_LIM];
+
 /* If nonzero, check that the input is correctly ordered. */
 static enum
   {
@@ -158,7 +178,9 @@ enum
 
 static struct option const longopts[] =
 {
+  {"general-numeric-sort", no_argument, NULL, 'g'},
   {"ignore-case", no_argument, NULL, 'i'},
+  {"numeric-sort", no_argument, NULL, 'n'},
   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
   {"header", no_argument, NULL, HEADER_LINE_OPTION},
@@ -173,6 +195,12 @@ static struct line uni_blank;
 /* If nonzero, ignore case when comparing join fields.  */
 static bool ignore_case;
 
+/* If nonzero, treat keys as numeric values.  */
+static bool numeric_sort;
+
+/* If nonzero, treat keys as general numeric values.  */
+static bool general_numeric_sort;
+
 /* If nonzero, treat the first line of each file as column headers -
    join them without checking for ordering */
 static bool join_header_lines;
@@ -198,7 +226,9 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read 
standard input.\n\
   -e EMPTY          replace missing input fields with EMPTY\n\
 "), stdout);
       fputs (_("\
-  -i, --ignore-case  ignore differences in case when comparing fields\n\
+  -g, --general-numeric-sort  compare according to general numerical value\n\
+  -i, --ignore-case   ignore differences in case when comparing fields\n\
+  -n, --numeric-sort  compare according to string numerical value\n\
   -j FIELD          equivalent to '-1 FIELD -2 FIELD'\n\
   -o FORMAT         obey FORMAT while constructing output line\n\
   -t CHAR           use CHAR as input and output field separator\n\
@@ -303,6 +333,73 @@ freeline (struct line *line)
   line->buf.buffer = NULL;
 }
 
+/* Compare strings A and B as numbers without explicitly converting them to
+   machine numbers.  Comparatively slow for short strings, but asymptotically
+   hideously fast. Copied from sort.c. */
+
+static int
+numcompare (char const *a, char const *b)
+{
+  while (blanks[to_uchar (*a)])
+    a++;
+  while (blanks[to_uchar (*b)])
+    b++;
+
+  return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+
+/* Work around a problem whereby the long double value returned by glibc's
+   strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
+   A and B before calling strtold.  FIXME: remove this function once
+   gnulib guarantees that strtold's result is always well defined.
+   Copied from sort.c. */
+
+static int
+nan_compare (char const *sa, char const *sb)
+{
+  long_double a;
+  memset (&a, 0, sizeof a);
+  a = strtold (sa, NULL);
+
+  long_double b;
+  memset (&b, 0, sizeof b);
+  b = strtold (sb, NULL);
+
+  return memcmp (&a, &b, sizeof a);
+}
+
+
+/* Compare general numerical value by conversion to long double.
+ * Copied from sort.c. */
+
+static int
+general_numcompare (char const *sa, char const *sb)
+{
+  /* FIXME: maybe add option to try expensive FP conversion
+     only if A and B can't be compared more cheaply/accurately.  */
+
+  char *ea;
+  char *eb;
+  long_double a = strtold (sa, &ea);
+  long_double b = strtold (sb, &eb);
+
+  /* Put conversion errors at the start of the collating sequence.  */
+  if (sa == ea)
+    return sb == eb ? 0 : -1;
+  if (sb == eb)
+    return 1;
+
+  /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
+     conversion errors but before numbers; sort them by internal
+     bit-pattern, for lack of a more portable alternative.  */
+  return (a < b ? -1
+          : a > b ? 1
+          : a == b ? 0
+          : b == b ? -1
+          : a == a ? 1
+          : nan_compare (sa, sb));
+}
+
 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
    >0 if it compares greater; 0 if it compares equal.
    Report an error and exit if the comparison fails.
@@ -318,6 +415,7 @@ keycmp (struct line const *line1, struct line const *line2,
 
   size_t len1;
   size_t len2;         /* Length of fields to compare.  */
+  long double x1, x2;
   int diff;
 
   if (jf_1 < line1->nfields)
@@ -347,7 +445,19 @@ keycmp (struct line const *line1, struct line const *line2,
   if (len2 == 0)
     return 1;
 
-  if (ignore_case)
+  if (numeric_sort)
+    {
+      char end1 = beg1[len1];
+      char end2 = beg2[len2];
+      beg1[len1] = '\0'; beg2[len2] = '\0';
+      diff = numcompare (beg1, beg2);
+      beg1[len1] = end1; beg2[len2] = end2;
+    }
+  else if (general_numeric_sort)
+    {
+        diff = general_numcompare (beg1, beg2);
+    }
+  else if (ignore_case)
     {
       /* FIXME: ignore_case does not work with NLS (in particular,
          with multibyte chars).  */
@@ -360,7 +470,7 @@ keycmp (struct line const *line1, struct line const *line2,
       diff = memcmp (beg1, beg2, MIN (len1, len2));
     }
 
-  if (diff)
+  if (diff || numeric_sort || general_numeric_sort)
     return diff;
   return len1 < len2 ? -1 : len1 != len2;
 }
@@ -412,6 +522,19 @@ check_order (const struct line *prev,
     }
 }
 
+/* Initialize the character class tables. */
+
+static void
+inittables (void)
+{
+  size_t i;
+
+  for (i = 0; i < UCHAR_LIM; ++i)
+    {
+      blanks[i] = !! isblank (i);
+    }
+}
+
 static inline void
 reset_line (struct line *line)
 {
@@ -1009,6 +1132,24 @@ main (int argc, char **argv)
   textdomain (PACKAGE);
   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 
+  /* Get locale's representation of the decimal point.  */
+  {
+    struct lconv const *locale = localeconv ();
+
+    /* If the locale doesn't define a decimal point, or if the decimal
+       point is multibyte, use the C locale's decimal point.  FIXME:
+       add support for multibyte decimal points.  */
+    decimal_point = to_uchar (locale->decimal_point[0]);
+    if (! decimal_point || locale->decimal_point[1])
+      decimal_point = '.';
+
+    /* FIXME: add support for multibyte thousands separators.  */
+    thousands_sep = to_uchar (*locale->thousands_sep);
+    if (! thousands_sep || locale->thousands_sep[1])
+      thousands_sep = -1;
+  }
+  inittables ();
+
   atexit (close_stdout);
   atexit (free_spareline);
 
@@ -1017,7 +1158,7 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
-  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
+  while ((optc = getopt_long (argc, argv, "-a:e:gin1:2:j:o:t:v:",
                               longopts, NULL))
          != -1)
     {
@@ -1050,10 +1191,18 @@ main (int argc, char **argv)
           empty_filler = optarg;
           break;
 
+        case 'g':
+          general_numeric_sort = true;
+          break;
+
         case 'i':
           ignore_case = true;
           break;
 
+        case 'n':
+          numeric_sort = true;
+          break;
+
         case '1':
           set_join_field (&join_field_1, string_to_join_field (optarg));
           break;
diff --git a/tests/misc/join b/tests/misc/join
index a3fd1a8..2dce94e 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -147,6 +147,14 @@ my @tv = (
  ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"],
  "a,1,,2,3,4\nb,1,2,,3,4\n"],
 
+# Join on numerically sorted field.
+['numeric-1', '-n', ["7 s\n8 e\n10 t\n4000 f\n", "7 S\n9 N\n4000 F\n"],
+    "7 s S\n4000 f F\n", 0],
+['numeric-2', '',   ["7 s\n8 e\n10 t\n", "7 S\n9 N\n10 T\n"],
+    "7 s S\n", 1, "$prog: numeric-2.2:3: is not sorted: 10 T\n"],
+['numeric-3', '-g',   ["7 s\n5e2 f\n800 e", "70e-1 S\n500.0 F\n"],
+    "7 s S\n5e2 f F\n", 0],
+
 # From Tim Smithers: fixed in 1.22l
 ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0],
 
-- 
1.7.9.4

Reply via email to