Here is a patch to teach grep the --files0-from=F option.
That is the same option that is already supported by wc, du and sort
in the GNU coreutils.

This option allows one to specify NUL-separated file names in the file
argument, F.  Whenever I received requests to add this option to other
coreutils programs, I refused, because one can obtain similar behavior
by using xargs like this:

  xargs -0 PROGRAM < F

The "use xargs" argument does not work for programs like wc, du and sort.
In those cases, concatenating the output from multiple invocations
(as that xargs usage would do) does not result in the same output as
using --files0-from=F, e.g., because of totals or, for sort, because
the result cannot be the concatenation of separately-sorted outputs.

For most uses of grep, it would be ok to process separate lists of
file names in batches and to concatenate the results, but not with
the --count (-c) option.  The count must reflect the total over
all files processed.

Also, more subtly, consider what would happen when the last batch of file
name inputs to xargs happens to contain only one name. The output from
that final invocation of grep (assuming no option like -h or -H) would
omit the "FILENAME:" prefix for any lines matched from that final file.

This patch depends on the perl-based test infrastructure I have just
posted, because the tests are very similar to those for the three tools
from the coreutils.
From e5bfd1d64ed1ceb1ec6423e8c89aa936e24109e5 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyer...@fb.com>
Date: Sat, 27 Jun 2015 08:44:31 -0700
Subject: [PATCH] accept new option: --files0-from=F

* src/grep.c: Include <assert.h>, "argv-iter.h" and "quotearg.h".
(usage): Describe the new option, and adjust the `Usage':
with this option, no FILE may be specified on the command line.
(wrap_state): New file-scoped global.
(wrapped_argv_iter_n_args, wrapped_argv_iter): New functions.
(main): Handle the new option.
* src/system.h (IF_LINT): Define.
* bootstrap.conf (gnulib_modules): Add argv-iter.
* configure.ac: Set the AM_CONDITIONAL variable, HAVE_PERL.
* tests/files0-from.pl: New file.
* tests/Makefile.am (TESTS): Add it.
* NEWS (New features): Mention it.
---
 NEWS                 |   5 ++
 bootstrap.conf       |   1 +
 src/grep.c           | 205 ++++++++++++++++++++++++++++++++++++++++++++++-----
 src/system.h         |   7 ++
 tests/Makefile.am    |   1 +
 tests/files0-from.pl | 100 +++++++++++++++++++++++++
 6 files changed, 301 insertions(+), 18 deletions(-)
 create mode 100755 tests/files0-from.pl

diff --git a/NEWS b/NEWS
index 35c4aad..393ebf4 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline -*-

 * Noteworthy changes in release ?.? (????-??-??) [?]

+** New features
+
+  grep accepts a new option --files0-from=FILE, where FILE contains a
+  list of NUL-terminated file names.
+
 ** Improvements

   When building grep, 'configure' now uses PCRE's pkg-config module for
diff --git a/bootstrap.conf b/bootstrap.conf
index 7842928..5e96142 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -25,6 +25,7 @@ gnulib_modules='
 alloca
 announce-gen
 argmatch
+argv-iter
 binary-io
 btowc
 c-ctype
diff --git a/src/grep.c b/src/grep.c
index a735ea5..6cd9f6a 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -26,9 +26,11 @@
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdio.h>
+#include <assert.h>
 #include "system.h"

 #include "argmatch.h"
+#include "argv-iter.h"
 #include "c-ctype.h"
 #include "closeout.h"
 #include "colorize.h"
@@ -43,6 +45,7 @@
 #include "progname.h"
 #include "propername.h"
 #include "quote.h"
+#include "quotearg.h"
 #include "safe-read.h"
 #include "search.h"
 #include "version-etc.h"
@@ -310,6 +313,7 @@ enum
   EXCLUDE_DIRECTORY_OPTION,
   EXCLUDE_OPTION,
   EXCLUDE_FROM_OPTION,
+  FILES0_FROM_OPTION,
   GROUP_SEPARATOR_OPTION,
   INCLUDE_OPTION,
   LINE_BUFFERED_OPTION,
@@ -340,6 +344,7 @@ static struct option const long_options[] =
   {"file", required_argument, NULL, 'f'},
   {"files-with-matches", no_argument, NULL, 'l'},
   {"files-without-match", no_argument, NULL, 'L'},
+  {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
   {"group-separator", required_argument, NULL, GROUP_SEPARATOR_OPTION},
   {"help", no_argument, &show_help, 1},
   {"include", required_argument, NULL, INCLUDE_OPTION},
@@ -1753,8 +1758,11 @@ usage (int status)
 {
   if (status != 0)
     {
-      fprintf (stderr, _("Usage: %s [OPTION]... PATTERN [FILE]...\n"),
-               program_name);
+      fprintf (stderr, _("\
+Usage: %s [OPTION]... PATTERN [FILE]...\n\
+  or:  %s [OPTION]... --files0-from=F PATTERN\n\
+"),
+               program_name, program_name);
       fprintf (stderr, _("Try '%s --help' for more information.\n"),
                program_name);
     }
@@ -1783,6 +1791,9 @@ Regexp selection and interpretation:\n"), program_name);
       printf (_("\
 \n\
 Miscellaneous:\n\
+      --files0-from=F       read input from the files specified by\n\
+                              NUL-terminated names in file F;\n\
+                              If F is - then read names from standard input\n\
   -s, --no-messages         suppress error messages\n\
   -v, --invert-match        select non-matching lines\n\
   -V, --version             display version information and exit\n\
@@ -2159,6 +2170,62 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
   *new_len = p - *new_keys;
 }

+/* This global and the following two wrapper functions exist solely
+   to support the case in which we are reading file names from F, given
+   --files0-from=F.  We need to know, when processing the first file,
+   if there is at least one more file name in F (to decide whether to
+   print the "FILENAME:" prefix), yet the argv-iter module provides no
+   mechanism to peek ahead into that stream.  That is the reason for the
+   wrapped_argv_iter function.  The other function is required because we
+   print the index (analogous to line number) when diagnosing an empty file
+   name, and right after we've peeked into the stream, argv_iter_n_args
+   would return one too many.  It compensates in that sole case.  */
+static unsigned int wrap_state = 0;
+
+/* See above.  */
+static size_t
+wrapped_argv_iter_n_args (struct argv_iterator const *ai)
+{
+  size_t n = argv_iter_n_args (ai);
+  return n == 2 && wrap_state == 1 ? 1 : n;
+}
+
+/* Just like argv_iter, but upon first call, set *TWO_OR_MORE
+   to true when there are two or more file names.  We must save
+   in malloc'd storage the first string, and be careful to free
+   it when this function is called again.  We deliberately do not
+   worry about leaking that value when exiting between the one-time
+   allocation and one-time free.  */
+static char *
+wrapped_argv_iter (struct argv_iterator *ai, enum argv_iter_err *err,
+                   bool *two_or_more)
+{
+  static char *f0;
+  static char *f1;
+  static enum argv_iter_err err_1;
+  if (wrap_state == 0)
+    {
+      f0 = argv_iter (ai, err);
+      if (f0)
+        {
+          f0 = xstrdup (f0); /* deliberate, possible one-time leak */
+          f1 = argv_iter (ai, &err_1);
+          *two_or_more = f1 != NULL;
+        }
+      wrap_state = 1;
+      return f0;
+    }
+  else if (wrap_state == 1)
+    {
+      wrap_state = 2;
+      *err = err_1;
+      free (f0);
+      return f1;
+    }
+
+  return argv_iter (ai, err);
+}
+
 int
 main (int argc, char **argv)
 {
@@ -2170,7 +2237,9 @@ main (int argc, char **argv)
   int prev_optind, last_recursive;
   int fread_errno;
   intmax_t default_context;
+  char *files_from = NULL;
   FILE *fp;
+
   exit_failure = EXIT_TROUBLE;
   initialize_main (&argc, &argv);
   set_program_name (argv[0]);
@@ -2468,6 +2537,10 @@ main (int argc, char **argv)
         add_exclude (excluded_directory_patterns, optarg, EXCLUDE_WILDCARDS);
         break;

+      case FILES0_FROM_OPTION:
+        files_from = optarg;
+        break;
+
       case GROUP_SEPARATOR_OPTION:
         group_separator = optarg;
         break;
@@ -2584,9 +2657,6 @@ main (int argc, char **argv)
   skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0)
                       == out_invert);

-  if ((argc - optind > 1 && !no_filenames) || with_filenames)
-    out_file = 1;
-
 #ifdef SET_BINARY
   /* Output is set to binary mode because we shouldn't convert
      NL to CR-LF pairs, especially when grepping binary files.  */
@@ -2601,26 +2671,125 @@ main (int argc, char **argv)
     devices = READ_DEVICES;

   char *const *files;
-  if (optind < argc)
+  struct argv_iterator *ai;
+  if (files_from)
     {
-      files = argv + optind;
-    }
-  else if (directories == RECURSE_DIRECTORIES && prepended < last_recursive)
-    {
-      static char *const cwd_only[] = { (char *) ".", NULL };
-      files = cwd_only;
-      omit_dot_slash = true;
+      /* When using --files0-from=F, you may not specify any files
+         on the command-line.  */
+      if (optind < argc)
+        {
+          /* Trigger with e.g., echo a|src/grep --files0-from=- PAT x  */
+          error (0, 0, _("extra operand %s"), quote (argv[optind]));
+          error (EXIT_TROUBLE, 0,
+                 _("file operands cannot be combined with --files0-from"));
+        }
+
+      if (! (STREQ (files_from, "-") || freopen (files_from, "r", stdin)))
+        error (EXIT_TROUBLE, errno, _("cannot open %s for reading"),
+               quote (files_from));
+
+      ai = argv_iter_init_stream (stdin);
     }
   else
     {
-      static char *const stdin_only[] = { (char *) "-", NULL };
-      files = stdin_only;
+      if (optind < argc)
+        files = argv + optind;
+      else if (directories == RECURSE_DIRECTORIES
+               && prepended < last_recursive)
+        {
+          static char *const cwd_only[] = { (char *) ".", NULL };
+          files = cwd_only;
+          omit_dot_slash = true;
+        }
+      else
+        {
+          static char *const stdin_only[] = { (char *) "-", NULL };
+          files = stdin_only;
+        }
+      ai = argv_iter_init_argv ((char **) files);
     }

+  if (!ai)
+    xalloc_die ();
+
+  int i;
   bool status = true;
-  do
-    status &= grep_command_line_arg (*files++);
-  while (*files != NULL);
+  for (i = 0; /* */; i++)
+    {
+      bool skip_file = false;
+      enum argv_iter_err ai_err;
+      bool two_or_more IF_LINT (= true);
+      /* Use this wrapped argv_iter function so that we know
+         when there are two or more names in the input specified via
+         --files0-from=F (required in order to set OUT_FILE).  That
+         requires caching any second name, which in turn requires
+         using the wrapped_argv_iter_n_args function below when
+         reporting the position of an offending zero-length name in
+         such an input file.  */
+      char *file_name = wrapped_argv_iter (ai, &ai_err, &two_or_more);
+      if (i == 0 && ((two_or_more && !no_filenames) || with_filenames))
+        out_file = 1;
+      if (!file_name)
+        {
+          switch (ai_err)
+            {
+            case AI_ERR_EOF:
+              goto argv_iter_done;
+            case AI_ERR_READ:
+              error (0, errno, _("%s: read error"),
+                     quotearg_colon (files_from));
+              status = false;
+              goto argv_iter_done;
+            case AI_ERR_MEM:
+              xalloc_die ();
+            default:
+              assert (!"unexpected error code from argv_iter");
+            }
+        }
+      if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
+        {
+          /* Give a better diagnostic in an unusual case:
+             printf - | grep --files0-from=- RE */
+          error (0, 0, _("when reading file names from stdin, "
+                         "no file name of %s allowed"),
+                 quote (file_name));
+          skip_file = true;
+        }
+
+      if (!file_name[0])
+        {
+          /* Diagnose a zero-length file name.  When it's one
+             among many, knowing the record number may help.
+             FIXME: currently print the record number only with
+             --files0-from=FILE.  Maybe do it for argv, too?  */
+          if (files_from == NULL)
+            error (0, 0, "%s", _("invalid zero-length file name"));
+          else
+            {
+              /* Using the standard 'filename:line-number:' prefix here is
+                 not totally appropriate, since NUL is the separator, not NL,
+                 but it might be better than nothing.  */
+              unsigned long int file_number = wrapped_argv_iter_n_args (ai);
+              error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
+                     file_number, _("invalid zero-length file name"));
+            }
+          skip_file = true;
+        }
+
+      if (skip_file)
+        errseen = true;
+      else
+        status &= grep_command_line_arg (file_name);
+    }
+ argv_iter_done:
+
+  /* No arguments on the command line is fine.  That means read from stdin.
+     However, no arguments on the --files0-from input stream means don't
+     read anything.  */
+  if (status && !files_from && wrapped_argv_iter_n_args (ai) == 0)
+    status &= grep_command_line_arg ("-");
+
+  argv_iter_free (ai);

   /* We register via atexit() to test stdout.  */
   return errseen ? EXIT_TROUBLE : status;
diff --git a/src/system.h b/src/system.h
index 15a1abb..474c6fd 100644
--- a/src/system.h
+++ b/src/system.h
@@ -107,4 +107,11 @@ static _GL_UNUSED void
 __asan_unpoison_memory_region (void const volatile *addr, size_t size) { }
 #endif

+/* Use this to suppress gcc's '...may be used before initialized' warnings. */
+#ifdef lint
+# define IF_LINT(Code) Code
+#else
+# define IF_LINT(Code) /* empty */
+#endif
+
 #endif
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ccd0196..52adad9 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -86,6 +86,7 @@ TESTS =						\
   fedora					\
   fgrep-infloop					\
   file						\
+  files0-from.pl				\
   fmbtest					\
   foad1						\
   grep-dev-null					\
diff --git a/tests/files0-from.pl b/tests/files0-from.pl
new file mode 100755
index 0000000..8e3f6cb
--- /dev/null
+++ b/tests/files0-from.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+# Exercise grep's --files0-from option.
+# FIXME: keep this file in sync with tests/misc/wc-files0-from.
+
+# Copyright (C) 2004-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+
+(my $program_name = $0) =~ s|.*/||;
+
+my $prog = 'grep';
+
+# Turn off localization of executable's output.
+@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+
+my @Tests =
+  (
+   # invalid extra command line argument
+   ['f-extra-arg', '--files0-from=- RE no-such', {IN=>"a"}, {EXIT=>2},
+    {ERR => "$prog: extra operand 'no-such'\n"
+        . "$prog: file operands cannot be combined with --files0-from\n" }
+    ],
+
+   # missing input file
+   ['missing', '--files0-from=missing RE', {EXIT=>2},
+    {ERR => "$prog: cannot open 'missing' for reading: "
+     . "No such file or directory\n"}],
+
+   # input file name of '-'
+   ['minus-in-stdin', '--files0-from=- RE', '<', {IN=>{f=>'-'}}, {EXIT=>2},
+    {ERR => "$prog: when reading file names from stdin, no file name of"
+     . " '-' allowed\n"}],
+
+   # empty input, regular file
+   ['empty', '--files0-from=@AUX@ RE', {AUX=>''}, {EXIT=>1}],
+
+   # empty input, from non-regular file
+   ['empty-nonreg', '--files0-from=/dev/null RE', {EXIT=>1}],
+
+   # one NUL
+   ['nul-1', '--files0-from=- RE', '<', {IN=>"\0"}, {EXIT=>2},
+    {ERR => "$prog: -:1: invalid zero-length file name\n"}],
+
+   # two NULs
+   ['nul-2', '--files0-from=- RE', '<', {IN=>"\0\0"}, {EXIT=>2},
+    {ERR => "$prog: -:1: invalid zero-length file name\n"
+          . "$prog: -:2: invalid zero-length file name\n"}],
+
+   # one file name, no NUL
+   ['1', '--files0-from=- RE', '<',
+    {IN=>{f=>"g"}}, {AUX=>{g=>'RE'}}, {OUT=>"RE\n"} ],
+
+   # one file name, with NUL
+   ['1a', '--files0-from=- RE', '<',
+    {IN=>{f=>"g\0"}}, {AUX=>{g=>'RE'}}, {OUT=>"RE\n"} ],
+
+   # two distinct file names, no final NUL
+   ['2-distinct', '--files0-from=- RE', '<',
+    {IN=>{f=>"g\0h"}}, {AUX=>{g=>'RE'}}, {AUX=>{h=>'RE'}},
+    {OUT=>"g:RE\nh:RE\n"} ],
+
+   # two identical file names, no final NUL
+   ['2-identical', '--files0-from=- RE', '<',
+    {IN=>{f=>"g\0g"}}, {AUX=>{g=>'RE'}}, {OUT=>"g:RE\ng:RE\n"} ],
+
+   # two identical file names, with final NUL
+   ['2a', '--files0-from=- RE', '<',
+    {IN=>{f=>"g\0g\0"}}, {AUX=>{g=>'RE'}}, {OUT=>"g:RE\ng:RE\n"} ],
+
+   # Ensure that $prog processes FILEs following a zero-length name.
+   ['zero-len', '--files0-from=- RE', '<',
+    {IN=>{f=>"\0g\0"}}, {AUX=>{g=>'RE'}}, {OUT=>"g:RE\n"},
+    {ERR => "$prog: -:1: invalid zero-length file name\n"}, {EXIT=>2} ],
+
+   # Diagnose extra file operand when using --files0-from=F
+   ['extra-file', '--files0-from=F RE', 'X',
+    {AUX=>{F=>''}},
+    {ERR => "$prog: extra operand 'X'\n" .
+     "$prog: file operands cannot be combined with --files0-from\n"},
+    {EXIT=>2} ],
+  );
+
+my $save_temps = $ENV{DEBUG};
+my $verbose = $ENV{VERBOSE};
+
+my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
+exit $fail;
-- 
2.3.7

Reply via email to