Hi,

Bug #6366, reported over a year ago, describes a deficiency in "join":
it is unable to join on fields that are sorted numerically (rather than
lexicographically). The bug report has a patch attached -- I applied it to
the current Git head, cleaned up a few things, and added a couple of tests.

In response to previous discussions on the issue, my two cents:
* "sort" includes several other sorting criteria, but they're not likely to
be useful in this context.
* This patch doesn't implement the functionality of sort's "-g" option, so
"-n" is appropriate.
* If two values have different string representations but compare as equal
due to lack of precision... well, the person doing the join should be aware
of the limits of floating point representations and use the "sort, join,
sort -n" strategy (sort lexicographically, join, then re-sort the output
numerically). I doubt this would come up much in practice though.

The lack of this option seems like an unusually conspicuous wart and I'd
love to see it addressed. Please let me know if I can be of any more help
to that effect.

Best,
Drew Frank
From 74ed24edc2ffa224ee5175ea39fcc19e7fba705f Mon Sep 17 00:00:00 2001
From: Drew Frank <drewfr...@gmail.com>
Date: Thu, 1 Mar 2012 14:24:49 -0800
Subject: [PATCH] join: add numeric sort feature.

* src/join.c: add the -n/--numeric-sort option and implement numeric key comparison.
* tests/misc/join: add two tests for numerically sorted key fields.
This patch is based on code written by Alex Shinn
<Alex Shinn <at> gmail.com>
---
 src/join.c      |   22 +++++++++++++++++++---
 tests/misc/join |    6 ++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/join.c b/src/join.c
index b92c1f8..c65f07e 100644
--- a/src/join.c
+++ b/src/join.c
@@ -159,6 +159,7 @@ enum
 static struct option const longopts[] =
 {
   {"ignore-case", no_argument, NULL, 'i'},
+  {"numeric-sort", no_argument, NULL, 'n'},
   {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
   {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
   {"header", no_argument, NULL, HEADER_LINE_OPTION},
@@ -173,6 +174,9 @@ static struct line uni_blank;
 /* If nonzero, ignore case when comparing join fields.  */
 static bool ignore_case;
 
+/* If nonzero, treat keys as numeric values.  */
+static bool numeric_sort;
+
 /* If nonzero, treat the first line of each file as column headers -
    join them without checking for ordering */
 static bool join_header_lines;
@@ -198,7 +202,8 @@ by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
   -e EMPTY          replace missing input fields with EMPTY\n\
 "), stdout);
       fputs (_("\
-  -i, --ignore-case  ignore differences in case when comparing fields\n\
+  -i, --ignore-case   ignore differences in case when comparing fields\n\
+  -n, --numeric-sort  compare according to string numerical value\n\
   -j FIELD          equivalent to '-1 FIELD -2 FIELD'\n\
   -o FORMAT         obey FORMAT while constructing output line\n\
   -t CHAR           use CHAR as input and output field separator\n\
@@ -318,6 +323,7 @@ keycmp (struct line const *line1, struct line const *line2,
 
   size_t len1;
   size_t len2;		/* Length of fields to compare.  */
+  long double x1, x2;
   int diff;
 
   if (jf_1 < line1->nfields)
@@ -347,7 +353,13 @@ keycmp (struct line const *line1, struct line const *line2,
   if (len2 == 0)
     return 1;
 
-  if (ignore_case)
+  if (numeric_sort)
+    {
+      x1 = strtold (beg1, NULL);
+      x2 = strtold (beg2, NULL);
+      diff = x1 < x2 ? -1 : x1 != x2;
+    }
+  else if (ignore_case)
     {
       /* FIXME: ignore_case does not work with NLS (in particular,
          with multibyte chars).  */
@@ -1017,7 +1029,7 @@ main (int argc, char **argv)
   issued_disorder_warning[0] = issued_disorder_warning[1] = false;
   check_input_order = CHECK_ORDER_DEFAULT;
 
-  while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
+  while ((optc = getopt_long (argc, argv, "-a:e:in1:2:j:o:t:v:",
                               longopts, NULL))
          != -1)
     {
@@ -1054,6 +1066,10 @@ main (int argc, char **argv)
           ignore_case = true;
           break;
 
+        case 'n':
+          numeric_sort = true;
+          break;
+
         case '1':
           set_join_field (&join_field_1, string_to_join_field (optarg));
           break;
diff --git a/tests/misc/join b/tests/misc/join
index a3fd1a8..ae9ef10 100755
--- a/tests/misc/join
+++ b/tests/misc/join
@@ -147,6 +147,12 @@ my @tv = (
  ["a,1,,2\nb,1,2\n", "a,3,4\nb,3,4\n"],
  "a,1,,2,3,4\nb,1,2,,3,4\n"],
 
+# Join on numerically sorted field.
+['numeric-1', '-n', ["7 s\n8 e\n10 t\n", "7 S\n9 N\n10 T\n"],
+    "7 s S\n10 t T\n", 0],
+['numeric-2', '',   ["7 s\n8 e\n10 t\n", "7 S\n9 N\n10 T\n"],
+    "7 s S\n", 1, "$prog: numeric-2.2:3: is not sorted: 10 T\n"],
+
 # From Tim Smithers: fixed in 1.22l
 ['trailing-sp', '-t: -1 1 -2 1', ["a:x \n", "a:y \n"], "a:x :y \n", 0],
 
-- 
1.7.9.2

Reply via email to