Add a new command-line option, -m/--merge-delimiters, and the required logic, which allow multiple consecutive delimiters to be treated as a single delimiter. This option is valid only in cut's field mode (-f).
This new feature should make cut much more usable in various real-world applications, some of which are already mentioned in the gotchas. For example, merging the consecutive delimiters is very useful when cut is used to process the outputs of various commands. Add a whole battery of new cut tests, which cover this new feature, and add more tests for the related already existing features, to make sure no regressions are introduced. While there, clean up the comments and the whitespace in the cut tests a bit, to make them slightly more readable. Signed-off-by: Dragan Simic <dsi...@manjaro.org> --- src/cut.c | 57 +++++++++++++++++++++++-- tests/cut/cut.pl | 109 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 150 insertions(+), 16 deletions(-) diff --git a/src/cut.c b/src/cut.c index b4edbabec..e21d417aa 100644 --- a/src/cut.c +++ b/src/cut.c @@ -75,6 +75,10 @@ static size_t field_1_bufsize; with field mode. */ static bool suppress_non_delimited; +/* If true, treat multiple consecutive delimiters as a single delimiter. + This option is valid only with field mode. */ +static bool merge_delimiters; + /* If true, print all bytes, characters, or fields _except_ those that were specified. 
*/ static bool complement; @@ -113,6 +117,7 @@ static struct option const longopts[] = {"fields", required_argument, nullptr, 'f'}, {"delimiter", required_argument, nullptr, 'd'}, {"only-delimited", no_argument, nullptr, 's'}, + {"merge-delimiters", no_argument, NULL, 'm'}, {"output-delimiter", required_argument, nullptr, OUTPUT_DELIMITER_OPTION}, {"complement", no_argument, nullptr, COMPLEMENT_OPTION}, {"zero-terminated", no_argument, nullptr, 'z'}, @@ -148,9 +153,10 @@ Print selected parts of lines from each FILE to standard output.\n\ -f, --fields=LIST select only these fields; also print any line\n\ that contains no delimiter character, unless\n\ the -s option is specified\n\ - -n (ignored)\n\ "), stdout); fputs (_("\ + -m, --merge-delimiters treat multiple consecutive delimiters as one\n\ + -n (ignored)\n\ --complement complement the set of selected bytes, characters\n\ or fields\n\ "), stdout); @@ -368,6 +374,15 @@ cut_fields (FILE *stream) found_any_selected_field = true; } } + + if (merge_delimiters) + { + /* Consume any consecutive delimiters. */ + int last_c; + while ((last_c = getc (stream)) == delim); + if (last_c != EOF) + ungetc (last_c, stream); + } next_item (&field_idx); } @@ -408,8 +423,31 @@ cut_fields (FILE *stream) } if (c == delim) - next_item (&field_idx); - else if (c == line_delim || c == EOF) + { + if (merge_delimiters) + { + /* Consume any consecutive delimiters. */ + int last_c; + while ((last_c = getc (stream)) == delim); + if (last_c != EOF) + { + ungetc (last_c, stream); + next_item (&field_idx); + } + else + { + /* EOF is handled in the "if" block below. */ + c = last_c; + } + } + else + { + next_item (&field_idx); + } + } + + /* Delimiters are handled in the "if" block above. */ + if ((c != delim && c == line_delim) || c == EOF) { if (found_any_selected_field || !(suppress_non_delimited && field_idx == 1)) @@ -496,10 +534,13 @@ main (int argc, char **argv) /* By default, all non-delimited lines are printed. 
*/ suppress_non_delimited = false; + /* By default, all delimiters are treated separately. */ + merge_delimiters = false; + delim = '\0'; have_read_stdin = false; - while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, nullptr)) + while ((optc = getopt_long (argc, argv, "b:c:f:d:mnsz", longopts, nullptr)) != -1) { switch (optc) @@ -533,6 +574,10 @@ main (int argc, char **argv) output_delimiter_string = optarg; break; + case 'm': + merge_delimiters = true; + break; + case 'n': break; @@ -567,6 +612,10 @@ main (int argc, char **argv) if (suppress_non_delimited) FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ \tonly when operating on fields")); + + if (merge_delimiters) + FATAL_ERROR (_("merging consecutive delimiters makes sense\n\ +\tonly when operating on fields")); } set_fields (spec_list_string, diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index 2e019078d..cb0bd0e0d 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -67,14 +67,16 @@ my @Tests = ['a', qw(-s -d:), '-f3-', {IN=>"a:b:c\n"}, {OUT=>"c\n"}], ['b', qw(-s -d:), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b:c\n"}], ['c', qw(-s -d:), '-f1,3', {IN=>"a:b:c\n"}, {OUT=>"a:c\n"}], + # Trailing colon should not be output ['d', qw(-s -d:), '-f1,3', {IN=>"a:b:c:\n"}, {OUT=>"a:c\n"}], ['e', qw(-s -d:), '-f3-', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}], ['f', qw(-s -d:), '-f3-4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}], ['g', qw(-s -d:), '-f3,4', {IN=>"a:b:c:\n"}, {OUT=>"c:\n"}], + # Make sure -s suppresses non-delimited lines ['h', qw(-s -d:), '-f2,3', {IN=>"abc\n"}, {OUT=>""}], - # + ['i', qw(-d: -f1-3), {IN=>":::\n"}, {OUT=>"::\n"}], ['j', qw(-d: -f1-4), {IN=>":::\n"}, {OUT=>":::\n"}], ['k', qw(-d: -f2-3), {IN=>":::\n"}, {OUT=>":\n"}], @@ -87,44 +89,89 @@ my @Tests = ['r', qw(-s -d: -f2-4), {IN=>":::\n:1\n"}, {OUT=>"::\n1\n"}], ['s', qw(-s -d: -f1-4), {IN=>":::\n:a\n"}, {OUT=>":::\n:a\n"}], ['t', qw(-s -d: -f3-), {IN=>":::\n:1\n"}, {OUT=>":\n\n"}], + # Make sure it handles empty input properly, with and 
without -s. ['u', qw(-s -f3-), {IN=>""}, {OUT=>""}], ['v', '-f3-', {IN=>""}, {OUT=>""}], + # Make sure it handles empty input properly. ['w', qw(-b 1), {IN=>""}, {OUT=>""}], ['x', qw(-s -d: -f2-4), {IN=>":\n"}, {OUT=>"\n"}], + # Errors # -s may be used only with -f - ['y', qw(-s -b4), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, + ['error-suppress', qw(-s -b4), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>"$prog: suppressing non-delimited lines makes sense\n" . "\tonly when operating on fields\n$try"}], + + # -m may be used only with -f + ['error-merge', qw(-m -b4), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, + {ERR=>"$prog: merging consecutive delimiters makes sense\n" + . "\tonly when operating on fields\n$try"}], + # You must specify bytes or fields (or chars) - ['z', '', {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, + ['error-specify', '', {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>"$prog: you must specify a list of bytes, characters, or fields\n$try"} ], + # Empty field list ['empty-fl', qw(-f ''), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>$from_field1}], + # Missing field list ['missing-fl', qw(-f --), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>$inval_fld}], + # Empty byte list ['empty-bl', qw(-b ''), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>$from_pos1}], + # Missing byte list ['missing-bl', qw(-b --), {IN=>":\n"}, {OUT=>""}, {EXIT=>1}, {ERR=>$inval_pos}], # This test fails with cut from textutils-1.22. 
['empty-f1', '-f1', {IN=>""}, {OUT=>""}], - ['empty-f2', '-f2', {IN=>""}, {OUT=>""}], + # Input delimiter variants ['o-delim', qw(-d: --out=_), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b_c\n"}], ['nul-idelim', qw(-d '' --out=_), '-f2,3', {IN=>"a\0b\0c\n"}, {OUT=>"b_c\n"}], ['nul-odelim', qw(-d: --out=), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b\0c\n"}], ['multichar-od', qw(-d: --out=_._), '-f2,3', {IN=>"a:b:c\n"}, {OUT=>"b_._c\n"}], + # Consecutive input delimiters, merging disabled + ['merge-delim-no-1', qw(-d:), '-f2,3', {IN=>"a:b::c:d\n"}, {OUT=>"b:\n"}], + ['merge-delim-no-2', qw(-d:), '-f2,3', {IN=>"a:b:::c:d\n"}, {OUT=>"b:\n"}], + ['merge-delim-no-3', qw(-d:), '-f3-', {IN=>"a:b::c:d\n"}, {OUT=>":c:d\n"}], + ['merge-delim-no-4', qw(-d:), '-f3-', {IN=>"a:b::c:d:\n"}, {OUT=>":c:d:\n"}], + ['merge-delim-no-5', qw(-d:), '-f3-', {IN=>"a:b::c:d::\n"}, {OUT=>":c:d::\n"}], + ['merge-delim-no-6', qw(-d:), '-f3-', {IN=>":a:b::c:d::\n"}, {OUT=>"b::c:d::\n"}], + ['merge-delim-no-7', qw(-d:), '-f1,2', {IN=>"a:b::c:d::\n"}, {OUT=>"a:b\n"}], + ['merge-delim-no-8', qw(-d:), '-f1,2', {IN=>":a:b::c:d::\n"}, {OUT=>":a\n"}], + ['merge-delim-no-9', qw(-d:), '-f1-3', {IN=>"::a:b::c:d::\n"}, {OUT=>"::a\n"}], + ['merge-delim-no-10', qw(-d:), '-f1-5', {IN=>"::a:b::c:d::\n"}, {OUT=>"::a:b:\n"}], + ['merge-delim-no-11', qw(-d:), '-f1-6', {IN=>"::a:b::c:d::\n"}, {OUT=>"::a:b::c\n"}], + + # Consecutive input delimiters, merging enabled + ['merge-delim-yes-1', qw(-d: -m), '-f2,3', {IN=>"a:b::c:d\n"}, {OUT=>"b:c\n"}], + ['merge-delim-yes-2', qw(-d: -m), '-f2,3', {IN=>"a:b:::c:d\n"}, {OUT=>"b:c\n"}], + ['merge-delim-yes-3', qw(-d: -m), '-f3-', {IN=>"a:b::c:d\n"}, {OUT=>"c:d\n"}], + ['merge-delim-yes-4', qw(-d: -m), '-f3-', {IN=>"a:b:::c:d\n"}, {OUT=>"c:d\n"}], + ['merge-delim-yes-5', qw(-d: -m), '-f3-', {IN=>"a::b:::c:d\n"}, {OUT=>"c:d\n"}], + ['merge-delim-yes-6', qw(-d: -m), '-f2-', {IN=>"a::b:::c::d\n"}, {OUT=>"b:c:d\n"}], + ['merge-delim-yes-7', qw(-d: -m), '-f2-', {IN=>"a::b:::c::d:\n"}, 
{OUT=>"b:c:d:\n"}], + ['merge-delim-yes-8', qw(-d: -m), '-f2-', {IN=>"a::b:::c::d::\n"}, {OUT=>"b:c:d:\n"}], + ['merge-delim-yes-9', qw(-d: -m), '-f2-', {IN=>":a::b:::c::d::\n"}, {OUT=>"a:b:c:d:\n"}], + ['merge-delim-yes-10', qw(-d: -m), '-f2-', {IN=>"::a::b:::c::d::\n"}, {OUT=>"a:b:c:d:\n"}], + ['merge-delim-yes-11', qw(-d: -m), '-f1,2', {IN=>":a::b:::c::d::\n"}, {OUT=>":a\n"}], + ['merge-delim-yes-12', qw(-d: -m), '-f1,2', {IN=>"::a::b:::c::d::\n"}, {OUT=>":a\n"}], + ['merge-delim-yes-13', qw(-d: -m), '-f1-3', {IN=>"::a::b:::c::d::\n"}, {OUT=>":a:b\n"}], + ['merge-delim-yes-14', qw(-d: -m), '-f1-5', {IN=>"::a:b::c:d::\n"}, {OUT=>":a:b:c:d\n"}], + ['merge-delim-yes-15', qw(-d: -m), '-f1-6', {IN=>"::a:b::c:d::\n"}, {OUT=>":a:b:c:d:\n"}], + ['merge-delim-yes-16', qw(-d: -m), '-f1-', {IN=>":a::b:::c::d::\n"}, {OUT=>":a:b:c:d:\n"}], + ['merge-delim-yes-17', qw(-d: -m), '-f1-', {IN=>"::a::b:::c::d::\n"}, {OUT=>":a:b:c:d:\n"}], + # Ensure delim is not allowed without a field # Prior to 8.21, a NUL delim was allowed without a field ['delim-no-field1', qw(-d ''), '-b1', {EXIT=>1}, {ERR=>$nofield}], @@ -134,7 +181,7 @@ my @Tests = ['8bit-delim', '-d', "\255", '--out=_', '-f2,3', {IN=>"a\255b\255c\n"}, {OUT=>"b_c\n"}], - # newline processing for fields + # Newline processing for fields ['newline-1', '-f1-', {IN=>"a\nb"}, {OUT=>"a\nb\n"}], ['newline-2', '-f1-', {IN=>""}, {OUT=>""}], ['newline-3', '-d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}], @@ -148,18 +195,52 @@ my @Tests = ['newline-11', '-s', '-d:', '-f1,2', {IN=>"a:1\nb:2\n"}, {OUT=>"a:1\nb:2\n"}], ['newline-12', '-s', '-d:', '-f1', {IN=>"a:1\nb:"}, {OUT=>"a\nb\n"}], ['newline-13', '-d:', '-f1-', {IN=>"a1:\n:"}, {OUT=>"a1:\n:\n"}], - # newline processing for fields when -d == '\n' + + # Newline processing for fields when -d == '\n' ['newline-14', "-d'\n'", '-f1', {IN=>"a:1\nb:"}, {OUT=>"a:1\n"}], ['newline-15', '-s', "-d'\n'", '-f1', {IN=>"a:1\nb:"}, {OUT=>"a:1\n"}], ['newline-16', '-s', "-d'\n'", '-f2', 
{IN=>"\nb"}, {OUT=>"b\n"}], ['newline-17', '-s', "-d'\n'", '-f1', {IN=>"\nb"}, {OUT=>"\n"}], ['newline-18', "-d'\n'", '-f2', {IN=>"\nb"}, {OUT=>"b\n"}], ['newline-19', "-d'\n'", '-f1', {IN=>"\nb"}, {OUT=>"\n"}], - ['newline-20', '-s', "-d'\n'", '-f1-', {IN=>"\n"}, {OUT=>"\n"}], - ['newline-21', '-s', "-d'\n'", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], - ['newline-22', "-d'\n'", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], - ['newline-23', "-d'\n'", '-f1-', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], - ['newline-24', "-d'\n'", '-f1,2', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + ['newline-20', "-d'\n'", '-f1', {IN=>"b"}, {OUT=>"b\n"}], + ['newline-21', '-s', "-d'\n'", '-f1-', {IN=>"\n"}, {OUT=>"\n"}], + ['newline-22', '-s', "-d'\n'", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], + ['newline-23', "-d'\n'", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], + ['newline-24', "-d'\n'", '-f1-', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + ['newline-25', "-d'\n'", '-f1,2', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + ['newline-26', "-d'\n'", '-f1-', '--ou=:', {IN=>"a\n\nb\n"}, {OUT=>"a::b\n"}], + ['newline-27', "-d'\n'", '-f1,2', '--ou=:', {IN=>"a\nb\n\n"}, {OUT=>"a:b\n"}], + ['newline-28', "-d'\n'", '-f1-3', '--ou=:', {IN=>"a\nb\n\nc"}, {OUT=>"a:b:\n"}], + ['newline-29', "-d'\n'", '-f1-3', '--ou=:', {IN=>"a\nb\n\nc\n\n"}, {OUT=>"a:b:\n"}], + ['newline-30', "-d'\n'", '-f1-3', '--ou=:', {IN=>"\na\nb\n\nc"}, {OUT=>":a:b\n"}], + ['newline-31', "-d'\n'", '-f1-3', '--ou=:', {IN=>"\n\na\nb\n\nc\n\n"}, {OUT=>"::a\n"}], + ['newline-32', "-d'\n'", '-f1-4', '--ou=:', {IN=>"\na\nb\n\nc"}, {OUT=>":a:b:\n"}], + ['newline-33', "-d'\n'", '-f1-4', '--ou=:', {IN=>"\n\na\nb\n\nc\n\n"}, {OUT=>"::a:b\n"}], + + # Newline processing for fields when -d == '\n' and merging enabled, which + # produces the same outputs as above when there are no consecutive delimiters, + # but it is internally different + ['newline-merge-1', "-d'\n' -m", '-f1', {IN=>"a:1\nb:"}, {OUT=>"a:1\n"}], + ['newline-merge-2', '-s', "-d'\n' -m", '-f1', 
{IN=>"a:1\nb:"}, {OUT=>"a:1\n"}], + ['newline-merge-3', '-s', "-d'\n' -m", '-f2', {IN=>"\nb"}, {OUT=>"b\n"}], + ['newline-merge-4', '-s', "-d'\n' -m", '-f1', {IN=>"\nb"}, {OUT=>"\n"}], + ['newline-merge-5', "-d'\n' -m", '-f2', {IN=>"\nb"}, {OUT=>"b\n"}], + ['newline-merge-6', "-d'\n' -m", '-f1', {IN=>"\nb"}, {OUT=>"\n"}], + ['newline-merge-7', "-d'\n' -m", '-f1', {IN=>"b"}, {OUT=>"b\n"}], + ['newline-merge-8', '-s', "-d'\n' -m", '-f1-', {IN=>"\n"}, {OUT=>"\n"}], + ['newline-merge-9', '-s', "-d'\n' -m", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], + ['newline-merge-10', "-d'\n' -m", '-f1-', {IN=>"\nb"}, {OUT=>"\nb\n"}], + ['newline-merge-11', "-d'\n' -m", '-f1-', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + ['newline-merge-12', "-d'\n' -m", '-f1,2', '--ou=:', {IN=>"a\nb\n"}, {OUT=>"a:b\n"}], + ['newline-merge-13', "-d'\n' -m", '-f1-', '--ou=:', {IN=>"a\n\nb\n"}, {OUT=>"a:b\n"}], + ['newline-merge-14', "-d'\n' -m", '-f1,2', '--ou=:', {IN=>"a\nb\n\n"}, {OUT=>"a:b\n"}], + ['newline-merge-15', "-d'\n' -m", '-f1-3', '--ou=:', {IN=>"a\nb\n\nc"}, {OUT=>"a:b:c\n"}], + ['newline-merge-16', "-d'\n' -m", '-f1-3', '--ou=:', {IN=>"a\nb\n\nc\n\n"}, {OUT=>"a:b:c\n"}], + ['newline-merge-17', "-d'\n' -m", '-f1-3', '--ou=:', {IN=>"\na\nb\n\nc"}, {OUT=>":a:b\n"}], + ['newline-merge-18', "-d'\n' -m", '-f1-3', '--ou=:', {IN=>"\n\na\nb\n\nc\n\n"}, {OUT=>":a:b\n"}], + ['newline-merge-19', "-d'\n' -m", '-f1-4', '--ou=:', {IN=>"\na\nb\n\nc"}, {OUT=>":a:b:c\n"}], + ['newline-merge-20', "-d'\n' -m", '-f1-4', '--ou=:', {IN=>"\n\na\nb\n\nc\n\n"}, {OUT=>":a:b:c\n"}], # --zero-terminated ['zerot-1', "-z", '-c1', {IN=>"ab\0cd\0"}, {OUT=>"a\0c\0"}], @@ -172,23 +253,27 @@ my @Tests = # New functionality: ['out-delim1', '-c1-3,5-', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"abc:efg\n"}], + # A totally overlapped field shouldn't change anything: ['out-delim2', '-c1-3,2,5-', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"abc:efg\n"}], + # Partial overlap: index '2' is not at the start of a range. 
['out-delim3', '-c1-3,2-4,6', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"abcd:f\n"}], ['out-delim3a', '-c1-3,2-4,6-', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"abcd:fg\n"}], + # Ensure that the following two commands produce the same output. # Before an off-by-1 fix, the output from the former would not contain a ':'. ['out-delim4', '-c4-,2-3', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"bc:defg\n"}], ['out-delim5', '-c2-3,4-', '--output-d=:', {IN=>"abcdefg\n"}, {OUT=>"bc:defg\n"}], + # This test would fail for cut from coreutils-5.0.1 and earlier. ['out-delim6', '-c2,1-3', '--output-d=:', {IN=>"abc\n"}, {OUT=>"abc\n"}], - # + ['od-abut', '-b1-2,3-4', '--output-d=:', {IN=>"abcd\n"}, {OUT=>"ab:cd\n"}], ['od-overlap', '-b1-2,2', '--output-d=:', {IN=>"abc\n"}, {OUT=>"ab\n"}], ['od-overlap2', '-b1-2,2-', '--output-d=:', {IN=>"abc\n"}, {OUT=>"abc\n"}], -- 2.33.1