Hi. New here, to post a patch which I hope will inspire someone to do a
better job.
I can't really believe that for all these years, we still so very, very
often run sort(1) and pipe it through head(1) or tail(1), because we
only want the top/bottom 20 lines or whatever. That means we're sorting
a potentially gigantic list of lines but not caring about most of the
information.
I'm attaching a patch that adds `--head` and `--tail` options to
sort(1), so you say `sort --head=20` (which is equivalent to saying
`sort --tail=-20` and vice-versa). I've worked this into sort(1) VERY
VERY BADLY, I'm sorry to say, but maybe someone with better familiarity
with the code can do it better. Basically, I'm special-casing out the
situation when --head is active and doing my own thing with it, and then
lying to the rest of the program about it. But it works, anyway.
Performance: with a file consisting of 8,000,000 random strings, `sort |
head -20` takes about 3.5sec (three sample consecutive runs yield 3.443,
3.669, 3.372 sec; presumably the I/O cached is as cached as it can be).
If we stipulate `--parallel=1` (no parallel sorting), then we get about
24.5sec (24.230, 24.516, 24.157). But `sort --head=20` takes just 1.36
seconds, even with parallel=1 (because I didn't bother with any parallel
stuff anyway.) I think that's significant enough to be worth a look.
The algorithm is straightforward: have a buffer of k lines; as each line
comes in, if it's bigger/smaller than the kth line (or the buffer isn't
full yet) insertion-sort it into the buffer and remove the kth line.
Otherwise, don't even bother processing it further. It's an O(nk)
operation at worst, and k << n, basically a constant, so it's O(n). (My
buffer actually COPIES the lines (sorry) because just shifting around
pointers didn't work; pointers wound up pointing to the wrong places at
the end. But this way the memory usage is limited, so it works out.)
I didn't even document it yet, please let me know what you think. Does
someone want to do this better? Or just want me to write the docs and
resubmit?
Thanks,
~mark
diff --git a/src/sort.c b/src/sort.c
index 6c273fab9..7c17b8794 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -359,6 +359,13 @@ static bool have_read_stdin;
/* List of key field comparisons to be tried. */
static struct keyfield *keylist;
+/* For --head or --tail, the number of lines to be output. */
+/* Not size_t, since it could be negative (tail) */
+#define MAX_HEAD 256
+static int head = 0;
+static struct line **headbuf = NULL;
+static size_t currentlines = 0;
+
/* Program used to (de)compress temp files. Must accept -d. */
static char const *compress_program;
@@ -369,6 +376,9 @@ static bool debug;
number are present, temp files will be used. */
static unsigned int nmerge = NMERGE_DEFAULT;
+static void add_to_headbuf(struct line *line);
+static struct line *dup_line(struct line *line);
+
/* Output an error to stderr and exit using async-signal-safe routines.
This can be used safely from signal handlers,
and between fork and exec of multithreaded processes. */
@@ -542,7 +552,8 @@ enum
NMERGE_OPTION,
RANDOM_SOURCE_OPTION,
SORT_OPTION,
- PARALLEL_OPTION
+ PARALLEL_OPTION,
+ HEAD_OPTION
};
static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
@@ -577,6 +588,8 @@ static struct option const long_options[] =
{"unique", no_argument, nullptr, 'u'},
{"zero-terminated", no_argument, nullptr, 'z'},
{"parallel", required_argument, nullptr, PARALLEL_OPTION},
+ {"head", required_argument, nullptr, HEAD_OPTION},
+ {"tail", required_argument, nullptr, HEAD_OPTION},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{nullptr, 0, nullptr, 0},
@@ -1843,7 +1856,10 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
line->keybeg = line_start;
}
}
-
+ if (head)
+ {
+ add_to_headbuf(line);
+ }
line_start = ptr;
}
@@ -1856,6 +1872,13 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
buf->nlines = buffer_linelim (buf) - line;
if (buf->nlines != 0)
{
+ if (head)
+ {
+ /* The buffer does not actually grow, we didn't read anything. */
+ buf->nlines = MIN(buf->nlines, abs(head));
+ buf->used = 0;
+ return true;
+ }
buf->left = ptr - line_start;
merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
return true;
@@ -3616,6 +3639,15 @@ mergelines_node (struct merge_node *restrict node, size_t total_lines,
}
else
{
+ if (head)
+ {
+ for(int i = 0; i < MIN(abs(head), currentlines); i++)
+ {
+ write_line(headbuf[i], tfp, temp_output);
+ }
+ exit(0);
+ }
+
/* Merge directly to output. */
while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
{
@@ -4316,6 +4348,98 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
return (char *) s;
}
+static struct line *
+dup_line(struct line *line)
+{
+ /* Duplicate a line struct, allocating memory. */
+
+ struct line *newline = (struct line *)malloc (sizeof (struct line));
+ newline->text = strndup (line->text, line->length);
+ newline->length = line->length;
+ if (line->keybeg)
+ {
+ newline->keybeg = newline->text + (line->keybeg - line->text);
+ }
+ if (line->keylim)
+ {
+ newline->keylim = newline->text + (line->keylim - line->text);
+ }
+ return newline;
+}
+
+static void
+add_to_headbuf(struct line *line)
+{
+ /* insertion-sort into headbuf, which nonetheless is not to
+ exceed abs(head) lines. */
+ /* should probably COPY each line into the headbuf, so the
+ original can be freed. Or else do better and use the originals
+ and make sure unused ones are freed. */
+ /* COPYING NOW */
+ /* Things apparently get moved around in the buffers, and just pointing
+ at the originals doesn't work right. */
+ if (currentlines <= 0) {
+ /* STICK IN POSITION 0 */
+ headbuf[0] = dup_line(line);
+ currentlines = 1;
+ return;
+ }
+ int i = currentlines;
+ bool inserted = false;
+ while (i)
+ {
+ struct line *thisline = headbuf[i - 1];
+ int cmp = compare(line, thisline);
+ /* compare takes reversed into account, but how and whether we do the
+ insertion depends on the sign of the global "head" */
+ if ((head > 0 && cmp < 0) ||
+ (head < 0 && cmp > 0))
+ {
+ if (i == abs(head))
+ {
+ /* At the top and thisline is to be jettisoned. Do NOT shift
+ upward. */
+ /* DELETE the thisline buffer */
+ free(thisline->text);
+ free(thisline);
+ }
+ else
+ {
+ /* SHIFT thisline UPWARD */
+ headbuf[i] = thisline;
+ if (i == currentlines)
+ {
+ currentlines++;
+ }
+ }
+ }
+ else
+ {
+ if (i == abs(head))
+ {
+ /* Don't bother looking! */
+ return;
+ }
+ /* Found where line goes */
+ /* INSERT line and stop looking. */
+ /* COPY the line buffer. */
+ // headbuf[i] = line;
+ headbuf[i] = dup_line(line);
+ inserted = true;
+ if (i == currentlines)
+ {
+ currentlines++;
+ }
+ break;
+ }
+ i--;
+ }
+ if (! inserted)
+ {
+ headbuf[0] = dup_line(line); /* Insert at the top. */
+ }
+}
+
/* Initialize KEY. */
static struct keyfield *
@@ -4683,6 +4807,23 @@ main (int argc, char **argv)
nthreads = specify_nthreads (oi, c, optarg);
break;
+ case HEAD_OPTION:
+ char *endptr;
+ head = (int) strtol (optarg, &endptr, 10);
+ /* If 0, silently ignore the whole thing? */
+ if (!strncmp (argv[optind-1], "--tail", 6))
+ {
+ head = -head;
+ }
+ if (head > MAX_HEAD ||
+ head < -MAX_HEAD)
+ {
+ /* XXXX proper error message */
+ error (SORT_FAILURE, 0, _("Invalid head/tail"));
+ }
+ headbuf = (struct line **) xmalloc (abs(head) * sizeof(struct line *));
+ break;
+
case 'u':
unique = true;
break;