This adds the kernel side of the seccomp-based process handling.

Co-authored-by: Johannes Berg <johan...@sipsolutions.net>
Signed-off-by: Benjamin Berg <benja...@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.b...@intel.com>
---
 arch/um/include/shared/common-offsets.h    |   2 +
 arch/um/include/shared/os.h                |   2 +-
 arch/um/include/shared/skas/stub-data.h    |   5 +-
 arch/um/kernel/skas/mmu.c                  |   6 +-
 arch/um/kernel/skas/stub_exe.c             | 141 +++++++-
 arch/um/os-Linux/internal.h                |   4 +
 arch/um/os-Linux/skas/mem.c                |  38 ++-
 arch/um/os-Linux/skas/process.c            | 374 +++++++++++++++------
 arch/um/os-Linux/start_up.c                |   3 -
 arch/x86/um/shared/sysdep/kernel-offsets.h |   2 +
 arch/x86/um/tls_32.c                       |  23 +-
 11 files changed, 460 insertions(+), 140 deletions(-)

diff --git a/arch/um/include/shared/common-offsets.h 
b/arch/um/include/shared/common-offsets.h
index 44cb72413db4..6460a1b5b1cc 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -35,3 +35,5 @@ DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, 0);
 #endif
 
 DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES);
+
+DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index e25e81742bdd..54f712236843 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -283,7 +283,7 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, 
unsigned long len);
 
 /* skas/process.c */
 extern int is_skas_winch(int pid, int fd, void *data);
-extern int start_userspace(unsigned long stub_stack);
+extern int start_userspace(struct mm_id *mm_id);
 extern void userspace(struct uml_pt_regs *regs);
 extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void));
 extern void switch_threads(jmp_buf *me, jmp_buf *you);
diff --git a/arch/um/include/shared/skas/stub-data.h 
b/arch/um/include/shared/skas/stub-data.h
index 4a2a00556a8e..615c3054ad2a 100644
--- a/arch/um/include/shared/skas/stub-data.h
+++ b/arch/um/include/shared/skas/stub-data.h
@@ -18,6 +18,8 @@
 #define FUTEX_IN_KERN 1
 
 struct stub_init_data {
+       int seccomp;
+
        unsigned long stub_start;
 
        int stub_code_fd;
@@ -25,7 +27,8 @@ struct stub_init_data {
        int stub_data_fd;
        unsigned long stub_data_offset;
 
-       unsigned long segv_handler;
+       unsigned long signal_handler;
+       unsigned long signal_restorer;
 };
 
 #define STUB_NEXT_SYSCALL(s) \
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 62f27daf3d37..438b7a3082e6 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -39,13 +39,11 @@ int init_new_context(struct task_struct *task, struct 
mm_struct *mm)
                /* Insert into list, used for lookups when the child dies */
                list_add(&mm->context.list, &mm_list);
 
-               new_id->pid = start_userspace(stack);
+               ret = start_userspace(new_id);
        }
 
-       if (new_id->pid < 0) {
-               ret = new_id->pid;
+       if (ret < 0)
                goto out_free;
-       }
 
        /* Ensure the new MM is clean and nothing unwanted is mapped */
        unmap(new_id, 0, STUB_START);
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index 23c99b285e82..f40f2332b676 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -3,6 +3,9 @@
 #include <asm/unistd.h>
 #include <sysdep/stub.h>
 #include <stub-data.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <generated/asm-offsets.h>
 
 void _start(void);
 
@@ -25,8 +28,6 @@ noinline static void real_init(void)
        } sa = {
                /* Need to set SA_RESTORER (but the handler never returns) */
                .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
-               /* no need to mask any signals */
-               .sa_mask = 0,
        };
 
        /* set a nice name */
@@ -35,6 +36,9 @@ noinline static void real_init(void)
        /* Make sure this process dies if the kernel dies */
        stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
 
+       /* Needed in SECCOMP mode (and safe to do anyway) */
+       stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
        /* read information from STDIN and close it */
        res = stub_syscall3(__NR_read, 0,
                            (unsigned long)&init_data, sizeof(init_data));
@@ -63,18 +67,133 @@ noinline static void real_init(void)
        stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
        stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
 
-       /* register SIGSEGV handler */
-       sa.sa_handler_ = (void *) init_data.segv_handler;
-       res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
-                           sizeof(sa.sa_mask));
-       if (res != 0)
-               stub_syscall1(__NR_exit, 13);
+       /* register signal handlers */
+       sa.sa_handler_ = (void *) init_data.signal_handler;
+       sa.sa_restorer = (void *) init_data.signal_restorer;
+       if (!init_data.seccomp) {
+               /* In ptrace mode, the SIGSEGV handler never returns */
+               sa.sa_mask = 0;
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 13);
+       } else {
+               /* SECCOMP mode uses rt_sigreturn, need to mask all signals */
+               sa.sa_mask = ~0ULL;
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 14);
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 15);
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 16);
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 17);
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGILL,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 18);
+
+               res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
+                                   (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+               if (res != 0)
+                       stub_syscall1(__NR_exit, 19);
+       }
+
+       /*
+        * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
+        * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
+        */
+       if (init_data.seccomp) {
+               struct sock_filter filter[] = {
+#if __BITS_PER_LONG > 32
+                       /* [0] Load upper 32bit of instruction pointer from 
seccomp_data */
+                       BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                                (offsetof(struct seccomp_data, 
instruction_pointer) + 4)),
+
+                       /* [1] Jump forward 3 instructions if the upper address 
is not identical */
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 
(init_data.stub_start) >> 32, 0, 3),
+#endif
+                       /* [2] Load lower 32bit of instruction pointer from 
seccomp_data */
+                       BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                                (offsetof(struct seccomp_data, 
instruction_pointer))),
+
+                       /* [3] Mask out lower bits */
+                       BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
+
+                       /* [4] Jump to [6] if the lower bits are on the expected page */
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 
(init_data.stub_start) & 0xfffff000, 1, 0),
+
+                       /* [5] Trap call, allow */
+                       BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+
+                       /* [6,7] Check architecture */
+                       BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                                offsetof(struct seccomp_data, arch)),
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+                                UM_SECCOMP_ARCH_NATIVE, 1, 0),
+
+                       /* [8] Kill (for architecture check) */
+                       BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+                       /* [9] Load syscall number */
+                       BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                                offsetof(struct seccomp_data, nr)),
+
+                       /* [10-14] Check against permitted syscalls */
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+                                5, 0),
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
+                                4, 0),
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
+                                3, 0),
+#ifdef __i386__
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 
__NR_set_thread_area,
+                                2, 0),
+#else
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
+                                2, 0),
+#endif
+                       BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
+                                1, 0),
+
+                       /* [15] Not one of the permitted syscalls */
+                       BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+                       /* [16] Permitted call for the stub */
+                       BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+               };
+               struct sock_fprog prog = {
+                       .len = sizeof(filter) / sizeof(filter[0]),
+                       .filter = filter,
+               };
+
+               if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+                                 SECCOMP_FILTER_FLAG_TSYNC,
+                                 (unsigned long)&prog) != 0)
+                       stub_syscall1(__NR_exit, 20);
 
-       stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
+               /* Fall through, the exit syscall will cause SIGSYS */
+       } else {
+               stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
 
-       stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
+               stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
+       }
 
-       stub_syscall1(__NR_exit, 14);
+       stub_syscall1(__NR_exit, 30);
 
        __builtin_unreachable();
 }
diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h
index 317fca190c2b..09fa232f5695 100644
--- a/arch/um/os-Linux/internal.h
+++ b/arch/um/os-Linux/internal.h
@@ -2,6 +2,9 @@
 #ifndef __UM_OS_LINUX_INTERNAL_H
 #define __UM_OS_LINUX_INTERNAL_H
 
+#include <mm_id.h>
+#include <stub-data.h>
+
 /*
  * elf_aux.c
  */
@@ -16,5 +19,6 @@ void check_tmpexec(void);
  * skas/process.c
  */
 void wait_stub_done(int pid);
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int 
wait_sigsys);
 
 #endif /* __UM_OS_LINUX_INTERNAL_H */
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index d7f1814b0e5a..f6bce0d83a0f 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  */
 
+#include <linux/kconfig.h>
 #include <stddef.h>
 #include <unistd.h>
 #include <errno.h>
@@ -80,27 +81,32 @@ static inline long do_syscall_stub(struct mm_id *mm_idp)
        int n, i;
        int err, pid = mm_idp->pid;
 
-       n = ptrace_setregs(pid, syscall_regs);
-       if (n < 0) {
-               printk(UM_KERN_ERR "Registers - \n");
-               for (i = 0; i < MAX_REG_NR; i++)
-                       printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
-               panic("%s : PTRACE_SETREGS failed, errno = %d\n",
-                     __func__, -n);
-       }
-
        /* Inform process how much we have filled in. */
        proc_data->syscall_data_len = mm_idp->syscall_data_len;
 
-       err = ptrace(PTRACE_CONT, pid, 0, 0);
-       if (err)
-               panic("Failed to continue stub, pid = %d, errno = %d\n", pid,
-                     errno);
-
-       wait_stub_done(pid);
+       if (using_seccomp) {
+               proc_data->restart_wait = 1;
+               wait_stub_done_seccomp(mm_idp, 0, 1);
+       } else {
+               n = ptrace_setregs(pid, syscall_regs);
+               if (n < 0) {
+                       printk(UM_KERN_ERR "Registers -\n");
+                       for (i = 0; i < MAX_REG_NR; i++)
+                               printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, 
syscall_regs[i]);
+                       panic("%s : PTRACE_SETREGS failed, errno = %d\n",
+                             __func__, -n);
+               }
+
+               err = ptrace(PTRACE_CONT, pid, 0, 0);
+               if (err)
+                       panic("Failed to continue stub, pid = %d, errno = %d\n",
+                             pid, errno);
+
+               wait_stub_done(pid);
+       }
 
        /*
-        * proc_data->err will be non-zero if there was an (unexpected) error.
+        * proc_data->err will be negative if there was an (unexpected) error.
         * In that case, syscall_data_len points to the last executed syscall,
         * otherwise it will be zero (but we do not need to rely on that).
         */
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 5eb0155ff7a7..c663b67c3fd3 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (C) 2021 Benjamin Berg <benja...@sipsolutions.net>
  * Copyright (C) 2015 Thomas Meyer (tho...@m3y3r.de)
  * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  */
@@ -25,8 +26,11 @@
 #include <registers.h>
 #include <skas.h>
 #include <sysdep/stub.h>
+#include <sysdep/mcontext.h>
+#include <linux/futex.h>
 #include <linux/threads.h>
 #include <timetravel.h>
+#include <asm-generic/rwonce.h>
 #include "../internal.h"
 
 int is_skas_winch(int pid, int fd, void *data)
@@ -142,6 +146,74 @@ void wait_stub_done(int pid)
        fatal_sigsegv();
 }
 
+#ifdef CONFIG_UML_SECCOMP
+/*
+ * Wait until the seccomp stub belonging to @mm_idp hands control back.
+ *
+ * @mm_idp:      MM whose stub is waited for; mm_idp->stack is accessed as
+ *               the struct stub_data shared with the stub.
+ * @running:     non-zero if the stub is already running. If zero, the stub
+ *               is started first: data->signal is cleared, data->futex is
+ *               set to FUTEX_IN_CHILD and a FUTEX_WAKE is issued.
+ * @wait_sigsys: non-zero if the stub is expected to report SIGSYS. A
+ *               SIGALRM reported before that restarts the stub and the
+ *               wait; any other signal is treated as an error.
+ *
+ * Every failure path logs a message and calls fatal_sigsegv().
+ */
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
+{
+       struct stub_data *data = (void *)mm_idp->stack;
+       int ret;
+
+       do {
+               if (!running) {
+                       data->signal = 0;
+                       data->futex = FUTEX_IN_CHILD;
+                       CATCH_EINTR(syscall(__NR_futex, &data->futex,
+                                           FUTEX_WAKE, 1, NULL, NULL, 0));
+               }
+
+               do {
+                       /*
+                        * We need to check whether the child is still alive
+                        * before and after the FUTEX_WAIT call. Before, in
+                        * case it just died but we still updated data->futex
+                        * to FUTEX_IN_CHILD. And after, in case it died while
+                        * we were waiting (and SIGCHLD woke us up, see the
+                        * IRQ handler in mmu.c).
+                        *
+                        * Either way, if PID is negative, then we have no
+                        * choice but to kill the task.
+                        */
+                       if (__READ_ONCE(mm_idp->pid) < 0)
+                               goto out_kill;
+
+                       ret = syscall(__NR_futex, &data->futex,
+                                     FUTEX_WAIT, FUTEX_IN_CHILD,
+                                     NULL, NULL, 0);
+
+                       /*
+                        * EINTR (interrupted by a signal) and EAGAIN (the
+                        * futex word no longer held FUTEX_IN_CHILD) are
+                        * expected outcomes; anything else is a real error.
+                        * Check right after the syscall so that errno is
+                        * still the one set by FUTEX_WAIT.
+                        */
+                       if (ret < 0 && errno != EINTR && errno != EAGAIN) {
+                               printk(UM_KERN_ERR "%s : waiting for child futex failed, errno = %d\n",
+                                      __func__, errno);
+                               goto out_kill;
+                       }
+
+                       /*
+                        * A return value of 0 may be a spurious wake-up, so
+                        * only the futex word decides whether the stub is
+                        * done.
+                        */
+               } while (data->futex == FUTEX_IN_CHILD);
+
+               if (__READ_ONCE(mm_idp->pid) < 0)
+                       goto out_kill;
+
+               running = 0;
+
+               /* We may receive a SIGALRM before SIGSYS, iterate again. */
+       } while (wait_sigsys && data->signal == SIGALRM);
+
+       /* The stub reports where it stored the mcontext; never trust it blindly */
+       if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) {
+               printk(UM_KERN_ERR "%s : invalid mcontext offset\n", __func__);
+               goto out_kill;
+       }
+
+       if (wait_sigsys && data->signal != SIGSYS) {
+               printk(UM_KERN_ERR "%s : expected SIGSYS but got %d\n",
+                      __func__, data->signal);
+               goto out_kill;
+       }
+
+       return;
+
+out_kill:
+       printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n",
+              __func__, mm_idp->pid, errno);
+       fatal_sigsegv();
+}
+#endif
+
 extern unsigned long current_stub_stack(void);
 
 static void get_skas_faultinfo(int pid, struct faultinfo *fi)
@@ -181,14 +253,26 @@ static int userspace_tramp(void *stack)
        int pipe_fds[2];
        unsigned long long offset;
        struct stub_init_data init_data = {
+               .seccomp = using_seccomp,
                .stub_start = STUB_START,
-               .segv_handler = STUB_CODE +
-                               (unsigned long) stub_segv_handler -
-                               (unsigned long) __syscall_stub_start,
        };
        struct iomem_region *iomem;
        int ret;
 
+       if (using_seccomp) {
+               init_data.signal_handler = STUB_CODE +
+                                          (unsigned long) 
stub_signal_interrupt -
+                                          (unsigned long) __syscall_stub_start;
+               init_data.signal_restorer = STUB_CODE +
+                                          (unsigned long) stub_signal_restorer 
-
+                                          (unsigned long) __syscall_stub_start;
+       } else {
+               init_data.signal_handler = STUB_CODE +
+                                          (unsigned long) stub_segv_handler -
+                                          (unsigned long) __syscall_stub_start;
+               init_data.signal_restorer = 0;
+       }
+
        init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
                                              &offset);
        init_data.stub_code_offset = MMAP_OFFSET(offset);
@@ -315,8 +399,9 @@ int userspace_pid[NR_CPUS];
  *         when negative: an error number.
  * FIXME: can PIDs become negative?!
  */
-int start_userspace(unsigned long stub_stack)
+int start_userspace(struct mm_id *mm_id)
 {
+       struct stub_data *proc_data = (void *)mm_id->stack;
        void *stack;
        unsigned long sp;
        int pid, status, n, err;
@@ -335,10 +420,13 @@ int start_userspace(unsigned long stub_stack)
        /* set stack pointer to the end of the stack page, so it can grow 
downwards */
        sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
 
+       if (using_seccomp)
+               proc_data->futex = FUTEX_IN_CHILD;
+
        /* clone into new userspace process */
        pid = clone(userspace_tramp, (void *) sp,
                    CLONE_VFORK | CLONE_VM | SIGCHLD,
-                   (void *)stub_stack);
+                   (void *)mm_id->stack);
        if (pid < 0) {
                err = -errno;
                printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
@@ -346,29 +434,34 @@ int start_userspace(unsigned long stub_stack)
                return err;
        }
 
-       do {
-               CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
-               if (n < 0) {
+       if (using_seccomp) {
+               wait_stub_done_seccomp(mm_id, 1, 1);
+       } else {
+               do {
+                       CATCH_EINTR(n = waitpid(pid, &status,
+                                               WUNTRACED | __WALL));
+                       if (n < 0) {
+                               err = -errno;
+                               printk(UM_KERN_ERR "%s : wait failed, errno = 
%d\n",
+                                      __func__, errno);
+                               goto out_kill;
+                       }
+               } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
+
+               if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
+                       err = -EINVAL;
+                       printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = 
%d\n",
+                              __func__, status);
+                       goto out_kill;
+               }
+
+               if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
+                          (void *) PTRACE_O_TRACESYSGOOD) < 0) {
                        err = -errno;
-                       printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+                       printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, 
errno = %d\n",
                               __func__, errno);
                        goto out_kill;
                }
-       } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
-
-       if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
-               err = -EINVAL;
-               printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
-                      __func__, status);
-               goto out_kill;
-       }
-
-       if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
-                  (void *) PTRACE_O_TRACESYSGOOD) < 0) {
-               err = -errno;
-               printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = 
%d\n",
-                      __func__, errno);
-               goto out_kill;
        }
 
        if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
@@ -378,6 +471,8 @@ int start_userspace(unsigned long stub_stack)
                goto out_kill;
        }
 
+       mm_id->pid = pid;
+
        return pid;
 
  out_kill:
@@ -391,7 +486,9 @@ extern unsigned long tt_extra_sched_jiffies;
 void userspace(struct uml_pt_regs *regs)
 {
        int err, status, op, pid = userspace_pid[0];
-       siginfo_t si;
+       siginfo_t si_ptrace;
+       siginfo_t *si;
+       int sig;
 
        /* Handle any immediate reschedules or signals */
        interrupt_end();
@@ -422,104 +519,181 @@ void userspace(struct uml_pt_regs *regs)
 
                current_mm_sync();
 
-               /* Flush out any pending syscalls */
-               err = syscall_stub_flush(current_mm_id());
-               if (err) {
-                       if (err == -ENOMEM)
-                               report_enomem();
+               if (using_seccomp) {
+                       struct mm_id *mm_id = current_mm_id();
+                       struct stub_data *proc_data = (void *) mm_id->stack;
+                       int ret;
 
-                       printk(UM_KERN_ERR "%s - Error flushing stub syscalls: 
%d",
-                               __func__, -err);
-                       fatal_sigsegv();
-               }
+                       ret = set_stub_state(regs, proc_data, singlestepping());
+                       if (ret) {
+                               printk(UM_KERN_ERR "%s - failed to set regs: 
%d",
+                                      __func__, ret);
+                               fatal_sigsegv();
+                       }
 
-               /*
-                * This can legitimately fail if the process loads a
-                * bogus value into a segment register.  It will
-                * segfault and PTRACE_GETREGS will read that value
-                * out of the process.  However, PTRACE_SETREGS will
-                * fail.  In this case, there is nothing to do but
-                * just kill the process.
-                */
-               if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
-                       printk(UM_KERN_ERR "%s - ptrace set regs failed, errno 
= %d\n",
-                              __func__, errno);
-                       fatal_sigsegv();
-               }
+                       /* Must have been reset by the syscall caller */
+                       if (proc_data->restart_wait != 0)
+                               panic("Programming error: Flag to only run 
syscalls in child was not cleared!");
+
+                       /* Mark pending syscalls for flushing */
+                       proc_data->syscall_data_len = mm_id->syscall_data_len;
+                       mm_id->syscall_data_len = 0;
+
+                       proc_data->signal = 0;
+                       proc_data->futex = FUTEX_IN_CHILD;
+                       CATCH_EINTR(syscall(__NR_futex, &proc_data->futex,
+                                           FUTEX_WAKE, 1, NULL, NULL, 0));
+                       do {
+                               ret = syscall(__NR_futex, &proc_data->futex,
+                                             FUTEX_WAIT, FUTEX_IN_CHILD, NULL, 
NULL, 0);
+                       } while ((ret == -1 && errno == EINTR) ||
+                                proc_data->futex == FUTEX_IN_CHILD);
+
+                       sig = proc_data->signal;
+
+                       if (sig == SIGTRAP && proc_data->err != 0) {
+                               printk(UM_KERN_ERR "%s - Error flushing stub 
syscalls",
+                                      __func__);
+                               syscall_stub_dump_error(mm_id);
+                               fatal_sigsegv();
+                       }
 
-               if (put_fp_registers(pid, regs->fp)) {
-                       printk(UM_KERN_ERR "%s - ptrace set fp regs failed, 
errno = %d\n",
-                              __func__, errno);
-                       fatal_sigsegv();
-               }
+                       ret = get_stub_state(regs, proc_data, NULL);
+                       if (ret) {
+                               printk(UM_KERN_ERR "%s - failed to get regs: 
%d",
+                                      __func__, ret);
+                               fatal_sigsegv();
+                       }
 
-               if (singlestepping())
-                       op = PTRACE_SYSEMU_SINGLESTEP;
-               else
-                       op = PTRACE_SYSEMU;
+                       if (proc_data->si_offset > sizeof(proc_data->sigstack) 
- sizeof(*si))
+                               panic("%s - Invalid siginfo offset from child",
+                                     __func__);
+                       si = (void *)&proc_data->sigstack[proc_data->si_offset];
 
-               if (ptrace(op, pid, 0, 0)) {
-                       printk(UM_KERN_ERR "%s - ptrace continue failed, op = 
%d, errno = %d\n",
-                              __func__, op, errno);
-                       fatal_sigsegv();
-               }
+                       regs->is_user = 1;
 
-               CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
-               if (err < 0) {
-                       printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
-                              __func__, errno);
-                       fatal_sigsegv();
-               }
+                       /* Fill in ORIG_RAX and extract fault information */
+                       PT_SYSCALL_NR(regs->gp) = si->si_syscall;
+                       if (sig == SIGSEGV) {
+                               mcontext_t *mcontext = (void 
*)&proc_data->sigstack[proc_data->mctx_offset];
 
-               regs->is_user = 1;
-               if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
-                       printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = 
%d\n",
-                              __func__, errno);
-                       fatal_sigsegv();
-               }
+                               GET_FAULTINFO_FROM_MC(regs->faultinfo, 
mcontext);
+                       }
+               } else {
+                       /* Flush out any pending syscalls */
+                       err = syscall_stub_flush(current_mm_id());
+                       if (err) {
+                               if (err == -ENOMEM)
+                                       report_enomem();
+
+                               printk(UM_KERN_ERR "%s - Error flushing stub 
syscalls: %d",
+                                       __func__, -err);
+                               fatal_sigsegv();
+                       }
 
-               if (get_fp_registers(pid, regs->fp)) {
-                       printk(UM_KERN_ERR "%s -  get_fp_registers failed, 
errno = %d\n",
-                              __func__, errno);
-                       fatal_sigsegv();
-               }
+                       /*
+                        * This can legitimately fail if the process loads a
+                        * bogus value into a segment register.  It will
+                        * segfault and PTRACE_GETREGS will read that value
+                        * out of the process.  However, PTRACE_SETREGS will
+                        * fail.  In this case, there is nothing to do but
+                        * just kill the process.
+                        */
+                       if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
+                               printk(UM_KERN_ERR "%s - ptrace set regs 
failed, errno = %d\n",
+                                      __func__, errno);
+                               fatal_sigsegv();
+                       }
 
-               UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+                       if (put_fp_registers(pid, regs->fp)) {
+                               printk(UM_KERN_ERR "%s - ptrace set fp regs 
failed, errno = %d\n",
+                                      __func__, errno);
+                               fatal_sigsegv();
+                       }
 
-               if (WIFSTOPPED(status)) {
-                       int sig = WSTOPSIG(status);
+                       if (singlestepping())
+                               op = PTRACE_SYSEMU_SINGLESTEP;
+                       else
+                               op = PTRACE_SYSEMU;
 
-                       /* These signal handlers need the si argument.
-                        * The SIGIO and SIGALARM handlers which constitute the
-                        * majority of invocations, do not use it.
-                        */
-                       switch (sig) {
-                       case SIGSEGV:
-                       case SIGTRAP:
-                       case SIGILL:
-                       case SIGBUS:
-                       case SIGFPE:
-                       case SIGWINCH:
-                               ptrace(PTRACE_GETSIGINFO, pid, 0, (struct 
siginfo *)&si);
-                               break;
+                       if (ptrace(op, pid, 0, 0)) {
+                               printk(UM_KERN_ERR "%s - ptrace continue 
failed, op = %d, errno = %d\n",
+                                      __func__, op, errno);
+                               fatal_sigsegv();
+                       }
+
+                       CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | 
__WALL));
+                       if (err < 0) {
+                               printk(UM_KERN_ERR "%s - wait failed, errno = 
%d\n",
+                                      __func__, errno);
+                               fatal_sigsegv();
+                       }
+
+                       regs->is_user = 1;
+                       if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
+                               printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, 
errno = %d\n",
+                                      __func__, errno);
+                               fatal_sigsegv();
+                       }
+
+                       if (get_fp_registers(pid, regs->fp)) {
+                               printk(UM_KERN_ERR "%s -  get_fp_registers 
failed, errno = %d\n",
+                                      __func__, errno);
+                               fatal_sigsegv();
                        }
 
+                       if (WIFSTOPPED(status)) {
+                               sig = WSTOPSIG(status);
+
+                               /* These signal handlers need the si argument
+                                * and SIGSEGV needs the faultinfo.
+                                * The SIGIO and SIGALARM handlers which 
constitute the
+                                * majority of invocations, do not use it.
+                                */
+                               switch (sig) {
+                               case SIGSEGV:
+                                       get_skas_faultinfo(pid,
+                                                          &regs->faultinfo);
+                                       fallthrough;
+                               case SIGTRAP:
+                               case SIGILL:
+                               case SIGBUS:
+                               case SIGFPE:
+                               case SIGWINCH:
+                                       ptrace(PTRACE_GETSIGINFO, pid, 0,
+                                              (struct siginfo *)&si_ptrace);
+                                       si = &si_ptrace;
+                                       break;
+                               default:
+                                       si = NULL;
+                                       break;
+                               }
+                       } else {
+                               sig = 0;
+                       }
+               }
+
+               UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+
+               if (sig) {
                        switch (sig) {
                        case SIGSEGV:
-                               get_skas_faultinfo(pid, &regs->faultinfo);
-
-                               if (PTRACE_FULL_FAULTINFO)
-                                       (*sig_info[SIGSEGV])(SIGSEGV, (struct 
siginfo *)&si,
+                               if (using_seccomp || PTRACE_FULL_FAULTINFO)
+                                       (*sig_info[SIGSEGV])(SIGSEGV,
+                                                            (struct siginfo 
*)si,
                                                             regs);
                                else
                                        segv(regs->faultinfo, 0, 1, NULL);
 
+                               break;
+                       case SIGSYS:
+                               handle_syscall(regs);
                                break;
                        case SIGTRAP + 0x80:
                                handle_trap(pid, regs);
                                break;
                        case SIGTRAP:
-                               relay_signal(SIGTRAP, (struct siginfo *)&si, 
regs);
+                               relay_signal(SIGTRAP, (struct siginfo *)si, 
regs);
                                break;
                        case SIGALRM:
                                break;
@@ -529,7 +703,7 @@ void userspace(struct uml_pt_regs *regs)
                        case SIGFPE:
                        case SIGWINCH:
                                block_signals_trace();
-                               (*sig_info[sig])(sig, (struct siginfo *)&si, regs);
+                               (*sig_info[sig])(sig, (struct siginfo *)si, regs);
                                unblock_signals_trace();
                                break;
                        default:
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index f1064817e719..09196f1ee8c9 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -429,12 +429,9 @@ void __init os_early_checks(void)
        using_seccomp = 0;
 
        if (init_seccomp()) {
-               /* Not yet fully implemented */
-#if 0
                using_seccomp = 1;
 
                return;
-#endif
        }
 #endif
 
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index 48de3a71f845..6fd1ed400399 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -4,7 +4,9 @@
 #include <linux/elf.h>
 #include <linux/crypto.h>
 #include <linux/kbuild.h>
+#include <linux/audit.h>
 #include <asm/mman.h>
+#include <asm/seccomp.h>
 
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index fbb129023080..21cbb70cf771 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -12,6 +12,7 @@
 #include <skas.h>
 #include <sysdep/tls.h>
 #include <asm/desc.h>
+#include <stub-data.h>
 
 /*
  * If needed we can detect when it's uninitialized.
@@ -21,13 +22,27 @@
 static int host_supports_tls = -1;
 int host_gdt_entry_tls_min;
 
-static int do_set_thread_area(struct user_desc *info)
+static int do_set_thread_area(struct task_struct* task, struct user_desc *info)
 {
        int ret;
        u32 cpu;
 
+       if (info->entry_number < host_gdt_entry_tls_min ||
+           info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES)
+               return -EINVAL;
+
+       if (using_seccomp) {
+               int idx = info->entry_number - host_gdt_entry_tls_min;
+               struct stub_data *data = (void *)task->mm->context.id.stack;
+
+               data->arch_data.tls[idx] = *info;
+               data->arch_data.sync |= BIT(idx);
+
+               return 0;
+       }
+
        cpu = get_cpu();
-       ret = os_set_thread_area(info, userspace_pid[cpu]);
+       ret = os_set_thread_area(info, task->mm->context.id.pid);
        put_cpu();
 
        if (ret)
@@ -97,7 +112,7 @@ static int load_TLS(int flags, struct task_struct *to)
                if (!(flags & O_FORCE) && curr->flushed)
                        continue;
 
-               ret = do_set_thread_area(&curr->tls);
+               ret = do_set_thread_area(current, &curr->tls);
                if (ret)
                        goto out;
 
@@ -275,7 +290,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc)
                        return -EFAULT;
        }
 
-       ret = do_set_thread_area(&info);
+       ret = do_set_thread_area(current, &info);
        if (ret)
                return ret;
        return set_tls_entry(current, &info, idx, 1);
-- 
2.47.0


Reply via email to