sort --head

Mark E. Shoulson Wed, 19 Jun 2024 06:24:33 -0700

Hi. New here, to post a patch which I hope will inspire someone to do abetter job.

I can't really believe that for all these years, we still so very, veryoften run sort(1) and pipe it through head(1) or tail(1), because weonly want the top/bottom 20 lines or whatever. That means we're sortinga potentially gigantic list of lines but not caring about most of theinformation.

I'm attaching a patch that adds `--head` and `--tail` options tosort(1), so you say `sort --head=20` (which is equivalent to saying`sort --tail=-20` and vice-versa). I've worked this into sort(1) VERYVERY BADLY, I'm sorry to say, but maybe someone with better familiaritywith the code can do it better. Basically, I'm special-casing out thesituation when --head is active and doing my own thing with it, and thenlying to the rest of the program about it. But it works, anyway.

Performance: with a file consisting of 8,000,000 random strings, `sort |head -20` takes about 3.5sec (three sample consecutive runs yield 3.443,3.669, 3.372 sec; presumably the I/O cached is as cached as it can be). If we stipulate `--parallel=1` (no parallel sorting), then we get about24.5sec (24.230, 24.516, 24.157). But `sort --head=20` takes just 1.36seconds, even with parallel=1 (because I didn't bother with any parallelstuff anyway.) I think that's significant enough to be worth a look.

The algorithm is straightforward: have a buffer of k lines; as each linecomes in, if it's bigger/smaller than the kth line (or the buffer isn'tfull yet) insertion-sort it into the buffer and remove the kth line. Otherwise, don't even bother processing it further. It's an O(nk)operation at worst, and k << n, basically a constant, so it's O(n). (Mybuffer actually COPIES the lines (sorry) because just shifting aroundpointers didn't work; pointers wound up pointing to the wrong places atthe end. But this way the memory usage is limited, so it works out.)

I didn't even document it yet, please let me know what you think. Doessomeone want to do this better? Or just want me to write the docs andresubmit?


Thanks,

~mark

diff --git a/src/sort.c b/src/sort.c
index 6c273fab9..7c17b8794 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -359,6 +359,13 @@ static bool have_read_stdin;
 /* List of key field comparisons to be tried.  */
 static struct keyfield *keylist;
 
+/* For --head or --tail, the number of lines to be output. */
+/* Not size_t, since it could be negative (tail) */
+#define MAX_HEAD 256
+static int head = 0;
+static struct line **headbuf = NULL;
+static size_t currentlines = 0;
+
 /* Program used to (de)compress temp files.  Must accept -d.  */
 static char const *compress_program;
 
@@ -369,6 +376,9 @@ static bool debug;
    number are present, temp files will be used. */
 static unsigned int nmerge = NMERGE_DEFAULT;
 
+static void add_to_headbuf(struct line *line);
+static struct line *dup_line(struct line *line);
+
 /* Output an error to stderr and exit using async-signal-safe routines.
    This can be used safely from signal handlers,
    and between fork and exec of multithreaded processes.  */
@@ -542,7 +552,8 @@ enum
   NMERGE_OPTION,
   RANDOM_SOURCE_OPTION,
   SORT_OPTION,
-  PARALLEL_OPTION
+  PARALLEL_OPTION,
+  HEAD_OPTION
 };
 
 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
@@ -577,6 +588,8 @@ static struct option const long_options[] =
   {"unique", no_argument, nullptr, 'u'},
   {"zero-terminated", no_argument, nullptr, 'z'},
   {"parallel", required_argument, nullptr, PARALLEL_OPTION},
+  {"head", required_argument, nullptr, HEAD_OPTION},
+  {"tail", required_argument, nullptr, HEAD_OPTION},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {nullptr, 0, nullptr, 0},
@@ -1843,7 +1856,10 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
                       line->keybeg = line_start;
                     }
                 }
-
+              if (head)
+                {
+                  add_to_headbuf(line);
+                }
               line_start = ptr;
             }
 
@@ -1856,6 +1872,13 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
       buf->nlines = buffer_linelim (buf) - line;
       if (buf->nlines != 0)
         {
+          if (head)
+            {
+              /* The buffer does not actually grow, we didn't read anything. */
+              buf->nlines = MIN(buf->nlines, abs(head));
+              buf->used = 0;
+              return true;
+            }
           buf->left = ptr - line_start;
           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
           return true;
@@ -3616,6 +3639,15 @@ mergelines_node (struct merge_node *restrict node, size_t total_lines,
     }
   else
     {
+      if (head)
+        {
+          for(int i = 0; i < MIN(abs(head), currentlines); i++)
+            {
+              write_line(headbuf[i], tfp, temp_output);
+            }
+          exit(0);
+        }
+
       /* Merge directly to output. */
       while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
         {
@@ -4316,6 +4348,98 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
   return (char *) s;
 }
 
+static struct line *
+dup_line(struct line *line)
+{
+  /* Duplicate a line struct, allocating memory. */
+
+  struct line *newline = (struct line *)malloc (sizeof (struct line));
+  newline->text = strndup (line->text, line->length);
+  newline->length = line->length;
+  if (line->keybeg)
+    {
+      newline->keybeg = newline->text + (line->keybeg - line->text);
+    }
+  if (line->keylim)
+    {
+      newline->keylim = newline->text + (line->keylim - line->text);
+    }
+  return newline;
+}
+
+static void
+add_to_headbuf(struct line *line)
+{
+  /* insertion-sort into headbuf, which nonetheless is not to
+     exceed abs(head) lines. */
+  /* should probably COPY each line into the headbuf, so the
+     original can be freed.  Or else do better and use the originals
+     and make sure unused ones are freed.  */
+  /* COPYING NOW */
+  /* Things apparently get moved around in the buffers, and just pointing
+     at the originals doesn't work right. */
+  if (currentlines <= 0) {
+    /* STICK IN POSITION 0 */
+    headbuf[0] = dup_line(line);
+    currentlines = 1;
+    return;
+  }
+  int i = currentlines;
+  bool inserted = false;
+  while (i)
+    {
+      struct line *thisline = headbuf[i - 1];
+      int cmp = compare(line, thisline);
+      /* compare takes reversed into account, but how and whether we do the
+         insertion depends on the sign of the global "head" */
+      if ((head > 0 && cmp < 0) ||
+          (head < 0 && cmp > 0))
+        {
+          if (i == abs(head))
+            {
+              /* At the top and thisline is to be jettisoned.  Do NOT shift
+                 upward. */
+              /* DELETE the thisline buffer */
+              free(thisline->text);
+              free(thisline);
+            }
+          else
+            {
+              /* SHIFT thisline UPWARD */
+              headbuf[i] = thisline;
+              if (i == currentlines)
+                {
+                  currentlines++;
+                }
+            }
+        }
+      else
+        {
+          if (i == abs(head))
+            {
+              /* Don't bother looking! */
+              return;
+            }
+          /* Found where line goes */
+          /* INSERT line and stop looking. */
+          /* COPY the line buffer. */
+          // headbuf[i] = line;
+          headbuf[i] = dup_line(line);
+          inserted = true;
+          if (i == currentlines)
+            {
+              currentlines++;
+            }
+          break;
+        }
+      i--;
+    }
+  if (! inserted)
+    {
+      headbuf[0] = dup_line(line); /* Insert at the top. */
+    }
+}
+
 /* Initialize KEY.  */
 
 static struct keyfield *
@@ -4683,6 +4807,23 @@ main (int argc, char **argv)
           nthreads = specify_nthreads (oi, c, optarg);
           break;
 
+        case HEAD_OPTION:
+          char *endptr;
+          head = (int) strtol (optarg, &endptr, 10);
+          /* If 0, silently ignore the whole thing? */
+          if (!strncmp (argv[optind-1], "--tail", 6))
+            {
+              head = -head;
+            }
+          if (head > MAX_HEAD ||
+              head < -MAX_HEAD)
+            {
+              /* XXXX proper error message */
+              error (SORT_FAILURE, 0, _("Invalid head/tail"));
+            }
+          headbuf = (struct line **) xmalloc (abs(head) * sizeof(struct line *));
+          break;
+
         case 'u':
           unique = true;
           break;

sort --head

Reply via email to