Re: kbind(2): push kernel lock down as needed

Scott Cheloha Mon, 06 Dec 2021 12:59:34 -0800

On Mon, Dec 06, 2021 at 08:35:15PM +0100, Mark Kettenis wrote:
> > Date: Sun, 5 Dec 2021 18:01:04 -0600
> > From: Scott Cheloha <[email protected]>
> > 
> > Two things in sys_kbind() need an explicit kernel lock.  First,
> > sigexit().  Second, uvm_map_extract(), because the following call
> > chain panics without it:
> > 
> > [...]
> > 
> > With this committed we can unlock kbind(2).
> > 
> > Thoughts?  ok?
> 
> To be honest, I don't think this makes sense unless you can make the
> "normal" code path lock free.  You're replacing a single
> KERNEL_LOCK/UNLOCK pair with (potentially) a bunch of them.  That may
> actually make things worse.  So I think we need to make
> uvm_map_extract() mpsafe first.


Unlocking uvm_map_extract() would improve things, yes.

I wrote a benchmark so we have something concrete to talk about.
Attached inline at the end.  The benchmark spawns 1 or more threads
that repeatedly fork(2), execve(2), and waitpid(2).  By default the
child executes /usr/bin/true, the simplest program in base that needs
ld.so.  The benchmark stops when a given number of fork/execve/waitpid
cycles have completed.  It then reports the elapsed time.

With this patch + kbind(2) unlocked the benchmark completes no faster
in any configuration.

> In case mpi@ disagrees, the unbalanced KERNEL_LOCK() before the
> sigexit() call is unfortunate.  I'd add a /* NOTREACHED */ after the
> sigexit() call to signal that the unbalanced KERNEL_LOCK() is ok.

Thanks, wasn't sure about that.  Updated.

--

Here's the benchmark.  Try something like this:

# How quickly can four threads fork+exec true(1) a
# thousand times?
$ ./forkexecwait -n4 1000 /usr/bin/true

/*
 * cc -o forkexecwait forkexecwait.c -lpthread
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/time.h>
#include <sys/wait.h>

#include <err.h>
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

pthread_t *thread;
pthread_barrier_t barrier;

char *default_argv[] = { "true", NULL };
char *default_envp[] = { NULL };

char *exec_path = "/usr/bin/true";
char **exec_argv = default_argv;

struct timespec start, stop;
unsigned long count, max;
volatile int done;

void *thread_func(void *);
void thread_join(pthread_t *, unsigned int);
void thread_spawn(pthread_t **, unsigned int, void *(*)(void *));
void usage(void);

int
main(int argc, char *argv[])
{
        struct timespec elapsed;
        const char *errstr;
        unsigned int nthreads;
        int ch, error;

        nthreads = 1;

        while ((ch = getopt(argc, argv, "n:")) != -1) {
                switch (ch) {
                case 'n':
                        nthreads = strtonum(optarg, 1, UINT_MAX, &errstr);
                        if (errstr != NULL)
                                errx(1, "nthreads is %s: %s", errstr, optarg);
                        break;
                default:
                        usage();
                }
        }
        argc -= optind;
        argv += optind;

        if (argc < 1)
                usage();

        max = strtonum(argv[0], 1, LONG_MAX, &errstr);
        if (errstr != NULL)
                errx(1, "count is %s: %s", errstr, argv[0]);
        if (argc >= 2) {
                exec_path = argv[1];
                exec_argv = &argv[1];
        }

        error = pthread_barrier_init(&barrier, NULL, nthreads);
        if (error)
                errc(1, error, "pthread_barrier_init");

        thread_spawn(&thread, nthreads, &thread_func);
        thread_join(thread, nthreads);

        error = pthread_barrier_destroy(&barrier);
        if (error)
                errc(1, error, "pthread_barrier_destroy");

        timespecsub(&stop, &start, &elapsed);
        printf("%lld.%09ld\n", (long long)elapsed.tv_sec, elapsed.tv_nsec);

        return 0;
}

void *
thread_func(void *arg)
{
        int error, status;
        pid_t pid;

        error = pthread_barrier_wait(&barrier);
        if (error) {
                if (error != PTHREAD_BARRIER_SERIAL_THREAD)
                        errc(1, error, "pthread_barrier_wait");
                clock_gettime(CLOCK_MONOTONIC, &start);
        }

        while (!done) {
                switch (pid = fork()) {
                case -1:
                        err(1, "fork");
                case 0:
                        execve(exec_path, exec_argv, default_envp);
                        error = (errno == ENOENT) ? 127 : 126;
                        err(error, "execve %s", exec_path);
                default:
                        while (waitpid(pid, &status, 0) == -1) {
                                if (errno != EINTR)
                                        err(1, "waitpid %ld", (long)pid);
                        }
                        if (atomic_inc_long_nv(&count) == max) {
                                clock_gettime(CLOCK_MONOTONIC, &stop);
                                done = 1;
                        }
                }
        }

        return NULL;
}

void
thread_join(pthread_t *thread, unsigned int nthreads)
{
        unsigned int i;
        int error;

        for (i = 0; i < nthreads; i++) {
                error = pthread_join(thread[i], NULL);
                if (error)
                        errc(1, error, "pthread_join");
        }
}

void
thread_spawn(pthread_t **threadp, unsigned int nthreads, void *(*func)(void *))
{
        pthread_t *thread;
        unsigned int i;
        int error;

        *threadp = calloc(nthreads, sizeof(**threadp));
        if (*threadp == NULL)
                err(1, NULL);
        thread = *threadp;

        for (i = 0; i < nthreads; i++) {
                error = pthread_create(&thread[i], NULL, func, NULL);
                if (error)
                        errc(1, error, "pthread_create");
        }
}

void
usage(void)
{
        fprintf(stderr, "usage: %s [-n nthreads] count [command [arg ...]]\n",
            getprogname());
        exit(1);
}

--

And here's the updated patch.

Index: uvm_mmap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.165
diff -u -p -r1.165 uvm_mmap.c
--- uvm_mmap.c  5 Dec 2021 22:00:42 -0000       1.165
+++ uvm_mmap.c  6 Dec 2021 20:24:42 -0000
@@ -1112,10 +1112,13 @@ sys_kbind(struct proc *p, void *v, regis
        if (pr->ps_kbind_addr == 0) {
                pr->ps_kbind_addr = pc;
                pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
-       } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC)
-               sigexit(p, SIGILL);
-       else if (pr->ps_kbind_cookie != SCARG(uap, proc_cookie))
+       } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
+           pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
+               KERNEL_LOCK();
                sigexit(p, SIGILL);
+               /* NOTREACHED */
+       }
+
        if (psize < sizeof(struct __kbind) || psize > sizeof(param))
                return EINVAL;
        if ((error = copyin(paramp, &param, psize)))
@@ -1169,8 +1172,11 @@ sys_kbind(struct proc *p, void *v, regis
                                vm_map_unlock(kernel_map);
                                kva = 0;
                        }
-                       if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
-                           baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
+                       KERNEL_LOCK();
+                       error = uvm_map_extract(&p->p_vmspace->vm_map,
+                           baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT);
+                       KERNEL_UNLOCK();
+                       if (error)
                                break;
                        last_baseva = baseva;
                }

Re: kbind(2): push kernel lock down as needed

Reply via email to