find(1): add support for terminating '+' in -exec

Pascal Stumpf Sun, 26 Jun 2011 12:14:56 -0700

This is my first try at getting support for terminating '+' in find(1)'s
-exec statement to work, as required by POSIX.  Code shamelessly
guttenberged from NetBSD, with some minor modifications.




Index: extern.h
===================================================================
RCS file: /cvs/src/usr.bin/find/extern.h,v
retrieving revision 1.16
diff -u -r1.16 extern.h
--- extern.h    21 Apr 2011 01:14:21 -0000      1.16
+++ extern.h    26 Jun 2011 19:07:30 -0000
@@ -37,10 +37,12 @@
 PLAN   *find_create(char ***);
 int     find_execute(PLAN *, char **);
 PLAN   *find_formplan(char **);
+int    find_traverse(PLAN *, int (*)(PLAN *, void *), void *);
 PLAN   *not_squish(PLAN *);
 OPTION *option(char *);
 PLAN   *or_squish(PLAN *);
 PLAN   *paren_squish(PLAN *);
+int    plan_cleanup(PLAN *, void *);
 struct stat;
 void    printlong(char *, char *, struct stat *);
 int     queryuser(char **);
Index: find.1
===================================================================
RCS file: /cvs/src/usr.bin/find/find.1,v
retrieving revision 1.82
diff -u -r1.82 find.1
--- find.1      13 Feb 2011 12:35:02 -0000      1.82
+++ find.1      26 Jun 2011 19:07:30 -0000
@@ -189,28 +189,48 @@
 .Op argument ...
 .No ;
 .Xc
-True if the program named
-.Ar utility
-returns a zero value as its exit status.
+.It Xo
+.Ic -exec Ar utility
+.Op argument ...
+.No {}
++
+.Xc
+Execute the specified
+.Ar utility .
 Optional arguments may be passed to the utility.
 The expression must be terminated by a semicolon
-.Pq Ql \&; .
+.Pq Ql \&;
+or a plus sign
+.Pq Ql \&+ .
+.Pp
+If terminated by a semicolon, the
+.Ar utility
+is executed once per path.
 If the string
 .Qq {}
 appears anywhere in the utility name or the
 arguments it is replaced by the pathname of the current file.
-.Ar utility
-will be executed from the directory from which
-.Nm
-was executed.
 .Pp
-Since
+If terminated by a plus sign
+.Pq Dq \&+ ,
+the pathnames for which the
+primary is evaluated are aggregated into sets, and
 .Ar utility
-is executed every time a match is made,
-it is often more efficient to pipe the output of
-.Nm
-to
+will be invoked once per set, similar to
 .Xr xargs 1 .
+If any invocation exits with a non-zero exit status, then
+.Nm
+will eventually do so as well, but this does not cause
+.Nm
+to exit early.
+The string
+.Dq {}
+must appear, and must appear last.
+Each set is limited to no more than 5,000 pathnames,
+and is also limited such that the invocation of
+.Ar utility
+does not exceed
+.Dv ARG_MAX .
 .It Xo
 .Ic -execdir Ar utility
 .Op argument ...
Index: find.c
===================================================================
RCS file: /cvs/src/usr.bin/find/find.c,v
retrieving revision 1.15
diff -u -r1.15 find.c
--- find.c      21 Apr 2011 01:14:21 -0000      1.15
+++ find.c      26 Jun 2011 19:07:30 -0000
@@ -148,7 +148,7 @@
     char **paths)              /* array of pathnames to traverse */
 {
        sigset_t fullset, oset;
-       int rval;
+       int r, rval;
        PLAN *p;
 
        rval = 0;
@@ -201,5 +201,45 @@
                    ;
        }
        (void)fts_close(tree);
+
+       /*
+        * Clean up any plans with leftover state.
+        * Keep the last non-zero return value.
+        */
+       if ((r = find_traverse(plan, plan_cleanup, NULL)) != 0)
+               rval = r;
        return (rval);
+}
+
+/*
+ * find_traverse --
+ *     traverse the plan tree and execute func() on all plans.  This
+ *     does not evaluate each plan's eval() function; it is intended
+ *     for operations that must run on all plans, such as state
+ *     cleanup.
+ *
+ *     If any func() returns non-zero, then so will find_traverse().
+ */
+int
+find_traverse(PLAN *plan, int (*func)(PLAN *, void *), void *arg)
+{
+       PLAN *p;
+       int r, rval;
+
+       rval = 0;
+       for (p = plan; p; p = p->next) {
+               if ((r = func(p, arg)) != 0)
+                       rval = r;
+               if (p->type == N_EXPR || p->type == N_OR) {
+                       if (p->p_data[0])
+                               if ((r = find_traverse(p->p_data[0],
+                                           func, arg)) != 0)
+                                       rval = r;
+                       if (p->p_data[1])
+                               if ((r = find_traverse(p->p_data[1],
+                                           func, arg)) != 0)
+                                       rval = r;
+               }
+       }
+       return rval;
 }
Index: find.h
===================================================================
RCS file: /cvs/src/usr.bin/find/find.h,v
retrieving revision 1.14
diff -u -r1.14 find.h
--- find.h      15 Sep 2004 18:43:25 -0000      1.14
+++ find.h      26 Jun 2011 19:07:30 -0000
@@ -54,6 +54,7 @@
 #define        F_LESSTHAN      2
 #define        F_GREATER       3
 #define        F_NEEDOK        1                       /* exec ok */
+#define F_PLUSSET      2                       /* -exec ... {} + */
 #define        F_MTFLAG        1                       /* fstype */
 #define        F_MTTYPE        2
 #define        F_ATLEAST       1                       /* perm */
@@ -77,6 +78,13 @@
                        char **_e_argv;         /* argv array */
                        char **_e_orig;         /* original strings */
                        int *_e_len;            /* allocated length */
+                       char **_ep_bxp;         /* ptr to 1st addt'l arg */
+                       char *_ep_p;            /* current buffer pointer */
+                       char *_ep_bbp;          /* begin buffer pointer */
+                       char *_ep_ebp;          /* end buffer pointer */
+                       int _ep_maxargs;        /* max #args */
+                       int _ep_narg;           /* # addt'l args */
+                       int _ep_rval;           /* return value */
                } ex;
                char *_a_data[2];               /* array of char pointers */
                char *_c_data;                  /* char pointer */
@@ -104,6 +112,15 @@
 #define        e_argv          p_un.ex._e_argv
 #define        e_orig          p_un.ex._e_orig
 #define        e_len           p_un.ex._e_len
+#define        ep_p            p_un.ex._ep_p
+#define        ep_bbp          p_un.ex._ep_bbp
+#define        ep_ebp          p_un.ex._ep_ebp
+#define        ep_bxp          p_un.ex._ep_bxp
+#define        ep_cnt          p_un.ex._ep_cnt
+#define        ep_maxargs      p_un.ex._ep_maxargs
+#define        ep_nline        p_un.ex._ep_nline
+#define        ep_narg         p_un.ex._ep_narg
+#define        ep_rval         p_un.ex._ep_rval
 
 typedef struct _option {
        char *name;                             /* option name */
Index: function.c
===================================================================
RCS file: /cvs/src/usr.bin/find/function.c,v
retrieving revision 1.36
diff -u -r1.36 function.c
--- function.c  1 Dec 2010 01:20:29 -0000       1.36
+++ function.c  26 Jun 2011 19:07:30 -0000
@@ -46,6 +46,7 @@
 #include <fts.h>
 #include <grp.h>
 #include <libgen.h>
+#include <limits.h>
 #include <pwd.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -71,6 +72,7 @@
 
 static PLAN *palloc(enum ntype, int (*)(PLAN *, FTSENT *));
 static long find_parsenum(PLAN *plan, char *option, char *vp, char *endch);
+static void run_f_exec(PLAN *plan);
 static PLAN *palloc(enum ntype t, int (*f)(PLAN *, FTSENT *));
 
 int    f_amin(PLAN *, FTSENT *);
@@ -339,38 +341,108 @@
 
 /*
  * [-exec | -ok] utility [arg ... ] ; functions --
+ * [-exec | -ok] utility [arg ... ] {} + functions --
  *
- *     True if the executed utility returns a zero value as exit status.
- *     The end of the primary expression is delimited by a semicolon.  If
- *     "{}" occurs anywhere, it gets replaced by the current pathname.
- *     The current directory for the execution of utility is the same as
- *     the current directory when the find utility was started.
+ *     If the end of the primary expression is delimited by a
+ *     semicolon: true if the executed utility returns a zero value
+ *     as exit status.  If "{}" occurs anywhere, it gets replaced by
+ *     the current pathname.
+ *
+ *     If the end of the primary expression is delimited by a plus
+ *     sign: always true. Pathnames for which the primary is
+ *     evaluated shall be aggregated into sets. The utility will be
+ *     executed once per set, with "{}" replaced by the entire set of
+ *     pathnames (as if xargs). "{}" must appear last.
  *
- *     The primary -ok is different in that it requests affirmation of the
- *     user before executing the utility.
+ *     The current directory for the execution of utility is the same
+ *     as the current directory when the find utility was started.
+ *
+ *     The primary -ok is different in that it requests affirmation
+ *     of the user before executing the utility.
  */
 int
 f_exec(PLAN *plan, FTSENT *entry)
 {
-       int cnt;
+       int cnt, l;
        pid_t pid;
        int status;
 
-       for (cnt = 0; plan->e_argv[cnt]; ++cnt)
-               if (plan->e_len[cnt])
-                       brace_subst(plan->e_orig[cnt], &plan->e_argv[cnt],
-                           entry->fts_path, plan->e_len[cnt]);
+       if (plan->flags & F_PLUSSET) {
+               /*
+                * Confirm sufficient buffer space, then copy the path
+                * to the buffer.
+                */
+               l = strlen(entry->fts_path);
+               if (plan->ep_p + l < plan->ep_ebp) {
+                       plan->ep_bxp[plan->ep_narg++] = plan->ep_p;
+                       strlcpy(plan->ep_p, entry->fts_path, l + 1);
+                       plan->ep_p += l + 1;
+
+                       if (plan->ep_narg == plan->ep_maxargs)
+                               run_f_exec(plan);
+               } else {
+                       /*
+                        * Without sufficient space to copy in the next
+                        * argument, run the command to empty out the
+                        * buffer before re-attempting the copy.
+                        */
+                       run_f_exec(plan);
+                       if (plan->ep_p + l < plan->ep_ebp) {
+                               plan->ep_bxp[plan->ep_narg++] = plan->ep_p;
+                               strlcpy(plan->ep_p, entry->fts_path, l + 1);
+                               plan->ep_p += l + 1;
+                       } else
+                               errx(1, "insufficient space for argument");
+               }
+               return (1);
+       } else {
+               for (cnt = 0; plan->e_argv[cnt]; ++cnt)
+                       if (plan->e_len[cnt])
+                               brace_subst(plan->e_orig[cnt],
+                                   &plan->e_argv[cnt],
+                                   entry->fts_path,
+                                   plan->e_len[cnt]);
+               if (plan->flags & F_NEEDOK && !queryuser(plan->e_argv))
+                       return (0);
 
-       if (plan->flags == F_NEEDOK && !queryuser(plan->e_argv))
-               return (0);
+               /* don't mix output of command with find output */
+               fflush(stdout);
+               fflush(stderr);
+
+               switch (pid = vfork()) {
+               case -1:
+                       err(1, "fork");
+                       /* NOTREACHED */
+               case 0:
+                       if (fchdir(dotfd)) {
+                               warn("chdir");
+                               _exit(1);
+                       }
+                       execvp(plan->e_argv[0], plan->e_argv);
+                       warn("%s", plan->e_argv[0]);
+                       _exit(1);
+               }
+               pid = waitpid(pid, &status, 0);
+               return (pid != -1 && WIFEXITED(status) && !WEXITSTATUS(status));
+       }
+}
 
-       /* don't mix output of command with find output */
-       fflush(stdout);
-       fflush(stderr);
+static void
+run_f_exec(PLAN *plan)
+{
+       pid_t pid;
+       int rval, status;
+
+       /* Ensure arg list is null terminated. */
+       plan->ep_bxp[plan->ep_narg] = NULL;
+
+       /* Don't mix output of command with find output. */
+       fflush(stdout);
+       fflush(stderr);
 
        switch (pid = vfork()) {
        case -1:
-               err(1, "fork");
+               err(1, "vfork");
                /* NOTREACHED */
        case 0:
                if (fchdir(dotfd)) {
@@ -381,8 +453,26 @@
                warn("%s", plan->e_argv[0]);
                _exit(1);
        }
+
+       /* Clear out the argument list. */
+       plan->ep_narg = 0;
+       plan->ep_bxp[plan->ep_narg] = NULL;
+       /* As well as the argument buffer. */
+       plan->ep_p = plan->ep_bbp;
+       *plan->ep_p = '\0';
+
        pid = waitpid(pid, &status, 0);
-       return (pid != -1 && WIFEXITED(status) && !WEXITSTATUS(status));
+       if (WIFEXITED(status))
+               rval = WEXITSTATUS(status);
+       else
+               rval = -1;
+
+       /*
+        * If we have a non-zero exit status, preserve it so find(1) can
+        * later exit with it.
+        */
+       if (rval)
+               plan->ep_rval = rval;
 }
  
 /*
@@ -391,12 +481,16 @@
  *     on the command line, one with (possibly duplicated) pointers to the
  *     argv array, and one with integer values that are lengths of the
  *     strings, but also flags meaning that the string has to be massaged.
+ *
+ *     If -exec ... {} +, use only the first array, but make it large
+ *     enough to hold 5000 args (cf. src/usr.bin/xargs/xargs.c for a
+ *     discussion), and then allocate ARG_MAX - 4K of space for args.
  */
 PLAN *
 c_exec(char *unused, char ***argvp, int isok)
 {
        PLAN *new;                      /* node returned */
-       int cnt;
+       int cnt, brace, lastbrace;
        char **argv, **ap, *p;
 
        /* make sure the current directory is readable */
@@ -407,36 +501,93 @@
     
        new = palloc(N_EXEC, f_exec);
        if (isok)
-               new->flags = F_NEEDOK;
+               new->flags |= F_NEEDOK;
 
-       for (ap = argv = *argvp;; ++ap) {
+       /*
+        * Terminate if we encounter an arg exactly equal to ";", or an
+        * arg exactly equal to "+" following an arg exactly equal to
+        * "{}".
+        */
+       for (ap = argv = *argvp, brace = 0;; ++ap) {
                if (!*ap)
-                       errx(1,
-                           "%s: no terminating \";\"", isok ? "-ok" : "-exec");
-               if (**ap == ';')
+                       errx(1, "%s: no terminating \";\" or \"+\"",
+                           isok ? "-ok" : "-exec");
+               lastbrace = brace;
+               brace = 0;
+               if (strcmp(*ap, "{}") == 0)
+                       brace = 1;
+               if (strcmp(*ap, ";") == 0)
                        break;
+               if (strcmp(*ap, "+") == 0 && lastbrace) {
+                       new->flags |= F_PLUSSET;
+                       break;
+               }
        }
 
-       cnt = ap - *argvp + 1;
-       new->e_argv = (char **)emalloc((u_int)cnt * sizeof(char *));
-       new->e_orig = (char **)emalloc((u_int)cnt * sizeof(char *));
-       new->e_len = (int *)emalloc((u_int)cnt * sizeof(int));
 
-       for (argv = *argvp, cnt = 0; argv < ap; ++argv, ++cnt) {
-               new->e_orig[cnt] = *argv;
-               for (p = *argv; *p; ++p)
-                       if (p[0] == '{' && p[1] == '}') {
-                               new->e_argv[cnt] = emalloc((u_int)MAXPATHLEN);
-                               new->e_len[cnt] = MAXPATHLEN;
-                               break;
+       /*
+        * POSIX says -ok ... {} + "need not be supported," and it does
+        * not make much sense anyway.
+        */
+       if (new->flags & F_NEEDOK && new->flags & F_PLUSSET)
+               errx(1, "-ok: terminating \"+\" not permitted.");
+
+       if (new->flags & F_PLUSSET) {
+               u_int c, bufsize;
+
+               cnt = ap - *argvp - 1;                  /* units are words */
+               new->ep_maxargs = 5000;
+               new->e_argv = (char **)emalloc((u_int)(cnt + new->ep_maxargs)
+                                               * sizeof(char **));
+
+               /* We start stuffing arguments after the user's last one. */
+               new->ep_bxp = &new->e_argv[cnt];
+               new->ep_narg = 0;
+
+               /*
+                * Count up the space of the user's arguments, and
+                * subtract that from what we allocate.
+                */
+               for (argv = *argvp, c = 0, cnt = 0;
+                    argv < ap;
+                    ++argv, ++cnt) {
+                       c += strlen(*argv) + 1;
+                       new->e_argv[cnt] = *argv;
+               }
+               bufsize = ARG_MAX - 4 * 1024 - c;
+
+
+               /*
+                * Allocate, and then initialize current, base, and
+                * end pointers.
+                */
+               new->ep_p = new->ep_bbp = malloc(bufsize + 1);
+               new->ep_ebp = new->ep_bbp + bufsize - 1;
+               new->ep_rval = 0;
+       } else { /* !F_PLUSSET */
+               cnt = ap - *argvp + 1;
+               new->e_argv = (char **)emalloc((u_int)cnt * sizeof(char *));
+               new->e_orig = (char **)emalloc((u_int)cnt * sizeof(char *));
+               new->e_len = (int *)emalloc((u_int)cnt * sizeof(int));
+
+               for (argv = *argvp, cnt = 0; argv < ap; ++argv, ++cnt) {
+                       new->e_orig[cnt] = *argv;
+                       for (p = *argv; *p; ++p)
+                               if (p[0] == '{' && p[1] == '}') {
+                                       new->e_argv[cnt] =
+                                               emalloc((u_int)MAXPATHLEN);
+                                       new->e_len[cnt] = MAXPATHLEN;
+                                       break;
+                               }
+                       if (!*p) {
+                               new->e_argv[cnt] = *argv;
+                               new->e_len[cnt] = 0;
                        }
-               if (!*p) {
-                       new->e_argv[cnt] = *argv;
-                       new->e_len[cnt] = 0;
                }
-       }
-       new->e_argv[cnt] = new->e_orig[cnt] = NULL;
+               new->e_orig[cnt] = NULL;
+       }
 
+       new->e_argv[cnt] = NULL;
        *argvp = argv + 1;
        return (new);
 }
@@ -1440,6 +1591,27 @@
 {
        return (palloc(N_OR, f_or));
 }
+
+
+/*
+ * plan_cleanup --
+ *     Check and see if the specified plan has any residual state,
+ *     and if so, clean it up as appropriate.
+ *
+ *     At the moment, only N_EXEC has state. Two kinds: 1)
+ *     lists of files to feed to subprocesses 2) State on exit
+ *     statuses of past subprocesses.
+ */
+/* ARGSUSED1 */
+int
+plan_cleanup(PLAN *plan, void *arg)
+{
+       if (plan->type==N_EXEC && plan->ep_narg)
+               run_f_exec(plan);
+
+       return plan->ep_rval;           /* Passed save exit-status up chain */
+}
+
 
 static PLAN *
 palloc(enum ntype t, int (*f)(PLAN *, FTSENT *))

find(1): add support for terminating '+' in -exec

Reply via email to