2008-02-16 James Youngman <[EMAIL PROTECTED]> * src/join.c (join): Support --check-order and --nocheck-order. For --check-order, verify that the input files are in sorted order. (usage): Mention --check-order and --nocheck-order. (dupline): Save a copy of the previously-read input line so that we can detect disorder on the input. (get_line): Temporarily save a copy of the previous line (by calling dupline) and check relative ordering (by calling checkorder) before returning the newly-read line. (getseq, join): Tell get_line which file we are reading from. (advance_seq): New function, factoring out some of the code commonly surrounding calls to getseq. (checkorder): New function. Verifies that a pair of consecutive input lines are in sorted order. * coreutils.texi (join invocation): Document the new options --check-order and --nocheck-order. * NEWS: Mention the new options. --- NEWS | 6 ++ doc/coreutils.texi | 19 ++++++- src/join.c | 129 ++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 128 insertions(+), 26 deletions(-)
diff --git a/NEWS b/NEWS index af27aab..0bf8dbb 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,12 @@ GNU coreutils NEWS -*- outline -*- mkdir and split now write --verbose output to stdout, not stderr. +** New features + + join now has a --check-order option which causes join to verify that + the input files are indeed sorted. The option --nocheck-order + turns the check off (the check is currently off by default). + * Noteworthy changes in release 6.10 (2008-01-22) [stable] diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 23d0ab4..0dd4587 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5149,9 +5149,16 @@ sort a file on its default join field, but if you select a non-default locale, join field, separator, or comparison options, then you should do so consistently between @command{join} and @command{sort}. -As a @acronym{GNU} extension, if the input has no unpairable lines the -sort order can be any order that considers two fields to be equal if and -only if the sort comparison described above considers them to be equal. +@c Unsorted inputs are a common cause of FAQs, but we probably +@c should not make --check-order the default, as we documented this +@c extension and so should continue to allow it +. +If the @option{--check-order} option is given, unsorted inputs will +cause a fatal error message. If the @option{--check-order} option is +not given, a @acronym{GNU} extension is available: if the input has no +unpairable lines the sort order can be any order that considers two +fields to be equal if and only if the sort comparison described above +considers them to be equal. For example: @example @@ -5188,6 +5195,12 @@ The program accepts the following options. Also see @ref{Common options}. Print a line for each unpairable line in file @var{file-number} (either @samp{1} or @samp{2}), in addition to the normal output. +@item --check-order +Check that both input files are in sorted order. 
+ +@item --nocheck-order +Do not check that both input files are in sorted order. This is the default. + @item -e @var{string} @opindex -e Replace those output fields that are missing in the input with diff --git a/src/join.c b/src/join.c index a6ca7e4..2a5147d 100644 --- a/src/join.c +++ b/src/join.c @@ -108,9 +108,21 @@ static struct outlist *outlist_end = &outlist_head; tab character whose value (when cast to unsigned char) equals TAB. */ static int tab = -1; +/* If nonzero, check that the input is correctly ordered. */ +static bool check_input_order = false; + +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION +}; + + static struct option const longopts[] = { {"ignore-case", no_argument, NULL, 'i'}, + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -122,6 +134,9 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; + +static void checkorder (const struct line *, const struct line *, int); + void usage (int status) { @@ -153,6 +168,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ -v FILENUM like -a FILENUM, but suppress joined output lines\n\ -1 FIELD join on this FIELD of file 1\n\ -2 FIELD join on this FIELD of file 2\n\ + --check-order check that the input is correctly sorted\n\ + --nocheck-order do not check that the input is correctly sorted\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -228,12 +245,49 @@ xfields (struct line *line) extract_field (line, ptr, lim - ptr); } +struct line* +dupline (const struct line *old) +{ + struct line *newline = xmalloc (sizeof *newline); + size_t i; + + /* Duplicate the buffer. 
*/ + initbuffer (&newline->buf); + newline->buf.buffer = xmalloc (old->buf.size); + newline->buf.size = old->buf.size; + memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length); + newline->buf.length = old->buf.length; + + /* Duplicate the field positions. */ + newline->fields = xmalloc (sizeof *newline->fields * old->nfields_allocated); + newline->nfields = old->nfields; + newline->nfields_allocated = old->nfields_allocated; + + for (i=0; i<old->nfields; i++) + { + newline->fields[i].len = old->fields[i].len; + newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg + - old->buf.buffer); + } + return newline; +} + +static void +freeline (struct line *line) +{ + free (line->fields); + free (line->buf.buffer); + line->buf.buffer = NULL; +} + /* Read a line from FP into LINE and split it into fields. Return true if successful. */ static bool -get_line (FILE *fp, struct line *line) +get_line (FILE *fp, struct line *line, int which) { + struct line *old = check_input_order ? dupline (line) : NULL; + initbuffer (&line->buf); if (! readlinebuffer (&line->buf, fp)) @@ -242,6 +296,8 @@ get_line (FILE *fp, struct line *line) error (EXIT_FAILURE, errno, _("read error")); free (line->buf.buffer); line->buf.buffer = NULL; + if (check_input_order) + freeline (old); return false; } @@ -249,15 +305,13 @@ get_line (FILE *fp, struct line *line) line->nfields = 0; line->fields = NULL; xfields (line); - return true; -} -static void -freeline (struct line *line) -{ - free (line->fields); - free (line->buf.buffer); - line->buf.buffer = NULL; + if (check_input_order) + { + checkorder (old, line, which); + freeline (old); + } + return true; } static void @@ -271,12 +325,12 @@ initseq (struct seq *seq) /* Read a line from FP and add it to SEQ. Return true if successful. 
*/ static bool -getseq (FILE *fp, struct seq *seq) +getseq (FILE *fp, struct seq *seq, int whichfile) { if (seq->count == seq->alloc) seq->lines = X2NREALLOC (seq->lines, &seq->alloc); - if (get_line (fp, &seq->lines[seq->count])) + if (get_line (fp, &seq->lines[seq->count], whichfile)) { ++seq->count; return true; @@ -284,6 +338,20 @@ getseq (FILE *fp, struct seq *seq) return false; } +/* Read a line from FP and add it to SEQ, as the first item if FIRST is + * true, else as the next. + */ +static bool +advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile) +{ + if (first) + { + freeline (&seq->lines[0]); + seq->count = 0; + } + return getseq (fp, seq, whichfile); +} + static void delseq (struct seq *seq) { @@ -354,6 +422,17 @@ keycmp (struct line const *line1, struct line const *line2) return len1 < len2 ? -1 : len1 != len2; } +static void +checkorder (const struct line *prev, + const struct line *current, + int whatfile) +{ + if (keycmp (prev, current) > 0) + { + error (EXIT_FAILURE, 0, _("File %d is not in sorted order"), whatfile); + } +} + /* Print field N of LINE if it exists and is nonempty, otherwise `empty_filler' if it is nonempty. */ @@ -468,9 +547,9 @@ join (FILE *fp1, FILE *fp2) /* Read the first line of each file. */ initseq (&seq1); - getseq (fp1, &seq1); + getseq (fp1, &seq1, 1); initseq (&seq2); - getseq (fp2, &seq2); + getseq (fp2, &seq2, 2); while (seq1.count && seq2.count) { @@ -480,18 +559,14 @@ join (FILE *fp1, FILE *fp2) { if (print_unpairables_1) prjoin (&seq1.lines[0], &uni_blank); - freeline (&seq1.lines[0]); - seq1.count = 0; - getseq (fp1, &seq1); + advance_seq (fp1, &seq1, true, 1); continue; } if (diff > 0) { if (print_unpairables_2) prjoin (&uni_blank, &seq2.lines[0]); - freeline (&seq2.lines[0]); - seq2.count = 0; - getseq (fp2, &seq2); + advance_seq (fp2, &seq2, true, 2); continue; } @@ -499,7 +574,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file2. 
*/ eof1 = false; do - if (!getseq (fp1, &seq1)) + if (!advance_seq (fp1, &seq1, false, 1)) { eof1 = true; ++seq1.count; @@ -511,7 +586,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file1. */ eof2 = false; do - if (!getseq (fp2, &seq2)) + if (!advance_seq (fp2, &seq2, false, 2)) { eof2 = true; ++seq2.count; @@ -554,7 +629,7 @@ join (FILE *fp1, FILE *fp2) { prjoin (&seq1.lines[0], &uni_blank); freeline (&seq1.lines[0]); - while (get_line (fp1, &line)) + while (get_line (fp1, &line, 1)) { prjoin (&line, &uni_blank); freeline (&line); @@ -565,7 +640,7 @@ join (FILE *fp1, FILE *fp2) { prjoin (&uni_blank, &seq2.lines[0]); freeline (&seq2.lines[0]); - while (get_line (fp2, &line)) + while (get_line (fp2, &line, 2)) { prjoin (&uni_blank, &line); freeline (&line); @@ -875,6 +950,14 @@ main (int argc, char **argv) } break; + case NOCHECK_ORDER_OPTION: + check_input_order = false; + break; + + case CHECK_ORDER_OPTION: + check_input_order = true; + break; + case 1: /* Non-option argument. */ add_file_name (optarg, names, operand_status, joption_count, &nfiles, &prev_optc_status, &optc_status); -- 1.5.3.8 _______________________________________________ Bug-coreutils mailing list Bug-coreutils@gnu.org http://lists.gnu.org/mailman/listinfo/bug-coreutils