On Tue, Apr 1, 2008 at 4:11 AM, Jim Meyering <[EMAIL PROTECTED]> wrote:
>  So please see if you can find an option name that does not start with
>  --merge, and, ideally, that doesn't influence interpretation of any
>  other option abbreviations.
>

How about --batch-size.  It doesn't have an initial substring that
coincides with more than a character of an existing longopt, and there
aren't really any other 'batches' in sort aside from these merge
batches (with the exception perhaps of input buffers which are covered
by the the --buffer-size option).  In my personal version I've been
using --nmerge, but I wasn't sure if that was user-friendly enough for
a more general audience.

I've made nmerge an unsigned int and added a check to ensure that it's
not allowed to get bigger than (SIZE_MAX / MIN_MERGE_BUFFER_SIZE).

I also realized that I had introduced tabs into tests/misc/sort-merge
where there previously hadn't been any, so I replaced those with
spaces.

Bo
From d1c257dc8c0bd6892ef252e153b72f273879c267 Mon Sep 17 00:00:00 2001
From: Bo Borgerson <[EMAIL PROTECTED]>
Date: Mon, 31 Mar 2008 16:58:21 -0400
Subject: [PATCH] sort: added --batch-size=NMERGE option.

* src/sort.c: Replace constant NMERGE with static unsigned int nmerge.
Validate and apply nmerge command-line settings. Replace some (now
variable-length) arrays with pointers to xnmalloc'd storage.
* tests/misc/sort-merge: Test new option
* doc/coreutils.texi: Describe new option
* NEWS: Advertise new option

Signed-off-by: Bo Borgerson <[EMAIL PROTECTED]>
---
 NEWS                  |    4 ++
 doc/coreutils.texi    |   16 ++++++++++
 src/sort.c            |   78 ++++++++++++++++++++++++++++++++++++++++--------
 tests/misc/sort-merge |   31 +++++++++++++++++--
 4 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index c05e0ad..c8b727c 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,10 @@ GNU coreutils NEWS                                    -*- outline -*-
   options --general-numeric-sort/-g, --month-sort/-M, --numeric-sort/-n
   and --random-sort/-R, resp.
 
+  sort accepts a new option --batch-size=NMERGE, where NMERGE
+  represents the maximum number of inputs that will be merged at once.
+  When more than NMERGE inputs are present temporary files are used.
+
 ** Improvements
 
   id and groups work around an AFS-related bug whereby those programs
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index ee7dbb2..eef8940 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3690,6 +3690,22 @@ multiple fields.
 Example:  To sort on the second field, use @option{--key=2,2}
 (@option{-k 2,2}).  See below for more examples.
 
[EMAIL PROTECTED] [EMAIL PROTECTED]
[EMAIL PROTECTED] --batch-size
[EMAIL PROTECTED] number of inputs to merge, nmerge
+Merge at most @var{nmerge} inputs at once.
+
+If more than @var{nmerge} inputs are to be merged, then temporary files
+will be used.
+
+A large value of @var{nmerge} may improve merge performance and decrease
+temporary storage utilization at the expense of increased memory usage
+and I/0.  Conversely a small value of @var{nmerge} may reduce memory
+requirements and I/0 at the expense of temporary storage consumption and
+merge performance.
+
+The value of @var{nmerge} must be at least 2.
+
 @item -o @var{output-file}
 @itemx [EMAIL PROTECTED]
 @opindex -o
diff --git a/src/sort.c b/src/sort.c
index 8b2eec5..4dc7d1b 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -223,13 +223,13 @@ static struct month monthtab[] =
 };
 
 /* During the merge phase, the number of files to merge at once. */
-#define NMERGE 16
+#define NMERGE_DEFAULT 16
 
 /* Minimum size for a merge or check buffer.  */
 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 
 /* Minimum sort size; the code might not work with smaller sizes.  */
-#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
+#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 
 /* The number of bytes needed for a merge or check buffer, which can
    function relatively efficiently even if it holds only one line.  If
@@ -281,6 +281,10 @@ static struct keyfield *keylist;
 /* Program used to (de)compress temp files.  Must accept -d.  */
 static char const *compress_program;
 
+/* Maximum number of files to merge in one go.  If more than this
+   number are present, temp files will be used. */
+static unsigned int nmerge = NMERGE_DEFAULT;
+
 static void sortlines_temp (struct line *, size_t, struct line *);
 
 /* Report MESSAGE for FILE, then clean up and exit.
@@ -344,6 +348,8 @@ Other options:\n\
                               decompress them with PROG -d\n\
   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
   -m, --merge               merge already sorted files; do not sort\n\
+      --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
+                            for more use temp files\n\
 "), stdout);
       fputs (_("\
   -o, --output=FILE         write result to FILE instead of standard output\n\
@@ -395,7 +401,8 @@ enum
   CHECK_OPTION = CHAR_MAX + 1,
   COMPRESS_PROGRAM_OPTION,
   RANDOM_SOURCE_OPTION,
-  SORT_OPTION
+  SORT_OPTION,
+  NMERGE_OPTION
 };
 
 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
@@ -419,6 +426,7 @@ static struct option const long_options[] =
   {"output", required_argument, NULL, 'o'},
   {"reverse", no_argument, NULL, 'r'},
   {"stable", no_argument, NULL, 's'},
+  {"batch-size", required_argument, NULL, NMERGE_OPTION},
   {"buffer-size", required_argument, NULL, 'S'},
   {"field-separator", required_argument, NULL, 't'},
   {"temporary-directory", required_argument, NULL, 'T'},
@@ -1045,6 +1053,36 @@ inittables (void)
 #endif
 }
 
+/* Specify how many inputs may be merged at once. */
+static void
+specify_nmerge (int oi, char c, char const *s)
+{
+  intmax_t n;
+  enum strtol_error e = xstrtoimax (s, NULL, 10, &n, NULL);
+
+  if (e == LONGINT_OK)
+    {
+      nmerge = n;
+      if (nmerge == n)
+	{
+	  if (nmerge >= 2 
+	      && nmerge <= (SIZE_MAX / MIN_MERGE_BUFFER_SIZE))
+	    {
+	      /* Need to re-check that we meet the minimum
+		 requirement for memory usage with the new,
+		 potentially larger, nmerge */
+	      sort_size = MAX (sort_size, MIN_SORT_SIZE);
+	      return;
+	    }
+	  e = LONGINT_INVALID;
+	}
+      else
+	e = LONGINT_OVERFLOW;
+    }
+
+  xstrtol_fatal (e, oi, c, long_options, s);
+}
+
 /* Specify the amount of main memory to use when sorting.  */
 static void
 specify_sort_size (int oi, char c, char const *s)
@@ -2029,15 +2067,20 @@ static void
 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
 	  FILE *ofp, char const *output_file)
 {
-  FILE *fps[NMERGE];		/* Input streams for each file.  */
-  struct buffer buffer[NMERGE];	/* Input buffers for each file. */
+  FILE **fps = xnmalloc (nmerge, sizeof *fps);
+				/* Input streams for each file.  */
+  struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
+				/* Input buffers for each file. */
   struct line saved;		/* Saved line storage for unique check. */
   struct line const *savedline = NULL;
 				/* &saved if there is a saved line. */
   size_t savealloc = 0;		/* Size allocated for the saved line. */
-  struct line const *cur[NMERGE]; /* Current line in each line table. */
-  struct line const *base[NMERGE]; /* Base of each line table.  */
-  size_t ord[NMERGE];		/* Table representing a permutation of fps,
+  struct line const **cur = xnmalloc (nmerge, sizeof *cur);
+				/* Current line in each line table. */
+  struct line const **base = xnmalloc (nmerge, sizeof *base);
+				/* Base of each line table.  */
+  size_t *ord = xnmalloc (nmerge, sizeof *ord);
+				/* Table representing a permutation of fps,
 				   such that cur[ord[0]] is the smallest line
 				   and will be next output. */
   size_t i;
@@ -2210,6 +2253,11 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
     }
 
   xfclose (ofp, output_file);
+  free(fps);
+  free(buffer);
+  free(ord);
+  free(base);
+  free(cur);
 }
 
 /* Merge into T the two sorted arrays of lines LO (with NLO members)
@@ -2397,7 +2445,7 @@ static void
 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
        char const *output_file)
 {
-  while (NMERGE < nfiles)
+  while (nmerge < nfiles)
     {
       /* Number of input files processed so far.  */
       size_t in;
@@ -2413,20 +2461,20 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles,
       size_t cheap_slots;
 
       /* Do as many NMERGE-size merges as possible.  */
-      for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
+      for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
 	{
 	  FILE *tfp;
 	  pid_t pid;
 	  char *temp = create_temp (&tfp, &pid);
-	  size_t nt = MIN (ntemps, NMERGE);
+	  size_t nt = MIN (ntemps, nmerge);
 	  ntemps -= nt;
-	  mergefps (&files[in], nt, NMERGE, tfp, temp);
+	  mergefps (&files[in], nt, nmerge, tfp, temp);
 	  files[out].name = temp;
 	  files[out].pid = pid;
 	}
 
       remainder = nfiles - in;
-      cheap_slots = NMERGE - out % NMERGE;
+      cheap_slots = nmerge - out % nmerge;
 
       if (cheap_slots < remainder)
 	{
@@ -3013,6 +3061,10 @@ main (int argc, char **argv)
 	  mergeonly = true;
 	  break;
 
+	case NMERGE_OPTION:
+	  specify_nmerge (oi, c, optarg);
+	  break;
+
 	case 'o':
 	  if (outfile && !STREQ (outfile, optarg))
 	    error (SORT_FAILURE, 0, _("multiple output files specified"));
diff --git a/tests/misc/sort-merge b/tests/misc/sort-merge
index 7ac9f84..b182c0c 100755
--- a/tests/misc/sort-merge
+++ b/tests/misc/sort-merge
@@ -1,7 +1,7 @@
 #!/bin/sh
 # Test "sort -m".
 
-# Copyright (C) 2002, 2003, 2005-2007 Free Software Foundation, Inc.
+# Copyright (C) 2002, 2003, 2005-2008 Free Software Foundation, Inc.
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -25,19 +25,44 @@ require 5.003;
 use strict;
 
 (my $program_name = $0) =~ s|.*/||;
+my $prog = 'sort';
 
 # Turn off localisation of executable's ouput.
 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
 
+# three empty files and one that says 'foo'
+my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
+
+# don't need to check for existence, since we're running in a temp dir
+my $badtmp = 'does/not/exist';
+
+# 2^64+1
+my $bigint = "18446744073709551617";
+
 my @Tests =
     (
-     ['m1', '-m', {IN=> {empty=> ''}}, {IN=> {f=> "foo\n"}}, {OUT=>"foo\n"}],
+     ['m1', '-m', @inputs, {OUT=>"foo\n"}],
+
+     # check bounds on --batch-size
+     ['m2', "-m --batch-size=1", @inputs,
+        {ERR=>"$prog: invalid --batch-size argument `1'\n"}, {EXIT=>2}],
+     ['m3', "-m --batch-size=$bigint", @inputs,
+        {ERR=>"$prog: --batch-size argument `$bigint' too large\n"},
+        {EXIT=>2}],
+
+     # This should work since nmerge >= the number of input files
+     ['m4', "-m --batch-size=4 -T$badtmp", @inputs, {OUT=>"foo\n"}],
+
+     # this should fail since nmerge < # of input files, so
+     # temp files are needed
+     ['m5', "-m --batch-size=2 -T$badtmp", @inputs,
+        {ERR_SUBST=>"s|: $badtmp/sort.+||"},
+        {ERR=>"$prog: cannot create temporary file\n"}, {EXIT=>2}],
     );
 
 my $save_temps = $ENV{DEBUG};
 my $verbose = $ENV{VERBOSE};
 
-my $prog = 'sort';
 my $fail = run_tests ($program_name, $prog, [EMAIL PROTECTED], $save_temps, $verbose);
 exit $fail;
 EOF
-- 
1.5.2.5

_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Reply via email to