Hi, I've found at times that it's useful to merge more than NMERGE (currently 16) inputs without using temp files.
I've modified my local version of sort to allow this number to be overridden on the command-line (please see enclosed patch). Is this an option that might be worth including in a future release? Thanks, Bo
--- coreutils-6.10/src/sort.c 2007-11-25 08:23:31.000000000 -0500 +++ coreutils-6.10-modified/src/sort.c 2008-03-29 16:06:45.000000000 -0400 @@ -223,13 +223,13 @@ }; /* During the merge phase, the number of files to merge at once. */ -#define NMERGE 16 +#define NMERGE_DEFAULT 16 /* Minimum size for a merge or check buffer. */ #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) /* Minimum sort size; the code might not work with smaller sizes. */ -#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE) +#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) /* The number of bytes needed for a merge or check buffer, which can function relatively efficiently even if it holds only one line. If @@ -281,6 +281,10 @@ /* Program used to (de)compress temp files. Must accept -d. */ static char const *compress_program; +/* Maximum number of files to merge in one go. If more than this + number are present, temp files will be used. */ +static int nmerge = NMERGE_DEFAULT; + static void sortlines_temp (struct line *, size_t, struct line *); /* Report MESSAGE for FILE, then clean up and exit. @@ -341,6 +345,7 @@ decompress them with PROG -d\n\ -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\ -m, --merge merge already sorted files; do not sort\n\ + --merge-batch-size=N merge at most this many inputs at once\n\ "), stdout); fputs (_("\ -o, --output=FILE write result to FILE instead of standard output\n\ @@ -391,7 +396,8 @@ { CHECK_OPTION = CHAR_MAX + 1, COMPRESS_PROGRAM_OPTION, - RANDOM_SOURCE_OPTION + RANDOM_SOURCE_OPTION, + NMERGE_OPTION }; static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z"; @@ -414,6 +420,7 @@ {"output", required_argument, NULL, 'o'}, {"reverse", no_argument, NULL, 'r'}, {"stable", no_argument, NULL, 's'}, + {"merge-batch-size", required_argument, NULL, NMERGE_OPTION}, {"buffer-size", required_argument, NULL, 'S'}, {"field-separator", required_argument, NULL, 't'}, {"temporary-directory", required_argument, NULL, 'T'}, @@ -2014,15 +2021,15 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, FILE *ofp, char const *output_file) { - FILE *fps[NMERGE]; /* Input streams for each file. */ - struct buffer buffer[NMERGE]; /* Input buffers for each file. */ + FILE *fps[nmerge]; /* Input streams for each file. */ + struct buffer buffer[nmerge]; /* Input buffers for each file. */ struct line saved; /* Saved line storage for unique check. */ struct line const *savedline = NULL; /* &saved if there is a saved line. */ size_t savealloc = 0; /* Size allocated for the saved line. */ - struct line const *cur[NMERGE]; /* Current line in each line table. */ - struct line const *base[NMERGE]; /* Base of each line table. */ - size_t ord[NMERGE]; /* Table representing a permutation of fps, + struct line const *cur[nmerge]; /* Current line in each line table. */ + struct line const *base[nmerge]; /* Base of each line table. */ + size_t ord[nmerge]; /* Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output. */ size_t i; @@ -2382,7 +2389,7 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles, char const *output_file) { - while (NMERGE < nfiles) + while (nmerge < nfiles) { /* Number of input files processed so far. */ size_t in; @@ -2390,33 +2397,33 @@ /* Number of output files generated so far. */ size_t out; - /* nfiles % NMERGE; this counts input files that are left over + /* nfiles % nmerge; this counts input files that are left over after all full-sized merges have been done. */ size_t remainder; /* Number of easily-available slots at the next loop iteration. */ size_t cheap_slots; - /* Do as many NMERGE-size merges as possible. */ - for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE) + /* Do as many nmerge-size merges as possible. */ + for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge) { FILE *tfp; pid_t pid; char *temp = create_temp (&tfp, &pid); - size_t nt = MIN (ntemps, NMERGE); + size_t nt = MIN (ntemps, nmerge); ntemps -= nt; - mergefps (&files[in], nt, NMERGE, tfp, temp); + mergefps (&files[in], nt, nmerge, tfp, temp); files[out].name = temp; files[out].pid = pid; } remainder = nfiles - in; - cheap_slots = NMERGE - out % NMERGE; + cheap_slots = nmerge - out % nmerge; if (cheap_slots < remainder) { /* So many files remain that they can't all be put into the last - NMERGE-sized output window. Do one more merge. Merge as few + nmerge-sized output window. Do one more merge. Merge as few files as possible, to avoid needless I/O. */ size_t nshortmerge = remainder - cheap_slots + 1; FILE *tfp; @@ -2430,7 +2437,7 @@ in += nshortmerge; } - /* Put the remaining input files into the last NMERGE-sized output + /* Put the remaining input files into the last nmerge-sized output window, so they will be merged in the next pass. */ memmove(&files[out], &files[in], (nfiles - in) * sizeof *files); ntemps += out; @@ -2995,6 +3002,13 @@ mergeonly = true; break; + case NMERGE_OPTION: + nmerge = strtod(optarg, NULL); + if (nmerge < 2) + error (SORT_FAILURE, 0, + _("invalid merge-batch-size: %s"), optarg); + break; + case 'o': if (outfile && !STREQ (outfile, optarg)) error (SORT_FAILURE, 0, _("multiple output files specified"));
_______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils