Hello,
thanks for the update; was waiting for this. On Tue, 25 Feb 2025 03:18:25 +0900, Benjamin Berg wrote: > > This adds the kernel side of the seccomp based process handling. > > Co-authored-by: Johannes Berg <johan...@sipsolutions.net> > Signed-off-by: Benjamin Berg <benja...@sipsolutions.net> > Signed-off-by: Benjamin Berg <benjamin.b...@intel.com> (snip) > diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c > index f8ee5d612c47..0abc509e3f4c 100644 > --- a/arch/um/kernel/skas/mmu.c > +++ b/arch/um/kernel/skas/mmu.c > @@ -38,14 +38,11 @@ int init_new_context(struct task_struct *task, struct > mm_struct *mm) > scoped_guard(spinlock_irqsave, &mm_list_lock) { > /* Insert into list, used for lookups when the child dies */ > list_add(&mm->context.list, &mm_list); > - maybe this is not needed. > } > > - new_id->pid = start_userspace(stack); > - if (new_id->pid < 0) { > - ret = new_id->pid; > + ret = start_userspace(new_id); > + if (ret < 0) > goto out_free; > - } > > /* Ensure the new MM is clean and nothing unwanted is mapped */ > unmap(new_id, 0, STUB_START); > diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c > index 23c99b285e82..f40f2332b676 100644 > --- a/arch/um/kernel/skas/stub_exe.c > +++ b/arch/um/kernel/skas/stub_exe.c > @@ -3,6 +3,9 @@ > #include <asm/unistd.h> > #include <sysdep/stub.h> > #include <stub-data.h> > +#include <linux/filter.h> > +#include <linux/seccomp.h> > +#include <generated/asm-offsets.h> > > void _start(void); > > @@ -25,8 +28,6 @@ noinline static void real_init(void) > } sa = { > /* Need to set SA_RESTORER (but the handler never returns) */ > .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, > - /* no need to mask any signals */ > - .sa_mask = 0, > }; > > /* set a nice name */ > @@ -35,6 +36,9 @@ noinline static void real_init(void) > /* Make sure this process dies if the kernel dies */ > stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); > > + /* Needed in SECCOMP mode (and safe to 
do anyway) */ > + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); > + > /* read information from STDIN and close it */ > res = stub_syscall3(__NR_read, 0, > (unsigned long)&init_data, sizeof(init_data)); > @@ -63,18 +67,133 @@ noinline static void real_init(void) > stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; > stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); > > - /* register SIGSEGV handler */ > - sa.sa_handler_ = (void *) init_data.segv_handler; > - res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, > - sizeof(sa.sa_mask)); > - if (res != 0) > - stub_syscall1(__NR_exit, 13); > + /* register signal handlers */ > + sa.sa_handler_ = (void *) init_data.signal_handler; > + sa.sa_restorer = (void *) init_data.signal_restorer; > + if (!init_data.seccomp) { > + /* In ptrace mode, the SIGSEGV handler never returns */ > + sa.sa_mask = 0; > + > + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 13); > + } else { > + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ > + sa.sa_mask = ~0ULL; > + > + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 14); > + > + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 15); > + > + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 16); > + > + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 17); > + > + res = stub_syscall4(__NR_rt_sigaction, SIGILL, > + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 18); > + > + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, > 
+ (unsigned long)&sa, 0, sizeof(sa.sa_mask)); > + if (res != 0) > + stub_syscall1(__NR_exit, 19); > + } > + > + /* > + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. > + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. > + */ > + if (init_data.seccomp) { > + struct sock_filter filter[] = { > +#if __BITS_PER_LONG > 32 > + /* [0] Load upper 32bit of instruction pointer from > seccomp_data */ > + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, > + (offsetof(struct seccomp_data, > instruction_pointer) + 4)), > + > + /* [1] Jump forward 3 instructions if the upper address > is not identical */ > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, > (init_data.stub_start) >> 32, 0, 3), > +#endif > + /* [2] Load lower 32bit of instruction pointer from > seccomp_data */ > + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, > + (offsetof(struct seccomp_data, > instruction_pointer))), > + > + /* [3] Mask out lower bits */ > + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), > + > + /* [4] Jump to [6] if the lower bits are not on the > expected page */ > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, > (init_data.stub_start) & 0xfffff000, 1, 0), > + > + /* [5] Trap call, allow */ > + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), > + > + /* [6,7] Check architecture */ > + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, > + offsetof(struct seccomp_data, arch)), > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, > + UM_SECCOMP_ARCH_NATIVE, 1, 0), > + > + /* [8] Kill (for architecture check) */ > + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), > + > + /* [9] Load syscall number */ > + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, > + offsetof(struct seccomp_data, nr)), > + > + /* [10-14] Check against permitted syscalls */ > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, > + 5, 0), > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, > + 4, 0), > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, > + 3, 0), > +#ifdef __i386__ > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, > __NR_set_thread_area, > + 2, 0), > +#else > + BPF_JUMP(BPF_JMP 
| BPF_JEQ | BPF_K, __NR_arch_prctl, > + 2, 0), > +#endif > + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, > + 1, 0), I was trying to understand what you mean by 'permitted syscalls' here. Is this a list of syscalls used by UML itself, or something else? And should the list be maintained/updated if UML expands the permitted syscalls? > + /* [15] Not one of the permitted syscalls */ > + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), > + > + /* [16] Permitted call for the stub */ > + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), > + }; > + struct sock_fprog prog = { > + .len = sizeof(filter) / sizeof(filter[0]), > + .filter = filter, > + }; > + > + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, > + SECCOMP_FILTER_FLAG_TSYNC, > + (unsigned long)&prog) != 0) > + stub_syscall1(__NR_exit, 20); > > - stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); > + /* Fall through, the exit syscall will cause SIGSYS */ > + } else { > + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); > > - stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); > + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); > + } > > - stub_syscall1(__NR_exit, 14); > + stub_syscall1(__NR_exit, 30); > > __builtin_unreachable(); > } I was wondering whether I could clean up (or share) the seccomp filter code of nommu UML with this, but it seems unlikely, as the memory layout is different. I would think that the detection part might still be useful for nommu. -- Hajime