Hi,

On Fri, 2025-03-07 at 16:04 +0900, Hajime Tazaki wrote:
> thanks for the update; was waiting for this.
> 
> On Tue, 25 Feb 2025 03:18:25 +0900,
> Benjamin Berg wrote:
> > 
> > This adds the kernel side of the seccomp based process handling.
> > 
> > Co-authored-by: Johannes Berg <johan...@sipsolutions.net>
> > Signed-off-by: Benjamin Berg <benja...@sipsolutions.net>
> > Signed-off-by: Benjamin Berg <benjamin.b...@intel.com>
> 
> (snip)
> 
> > diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
> > index f8ee5d612c47..0abc509e3f4c 100644
> > --- a/arch/um/kernel/skas/mmu.c
> > +++ b/arch/um/kernel/skas/mmu.c
> > @@ -38,14 +38,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
> >  	scoped_guard(spinlock_irqsave, &mm_list_lock) {
> >  		/* Insert into list, used for lookups when the child dies */
> >  		list_add(&mm->context.list, &mm_list);
> > -
> 
> maybe this is not needed.
Oh, that is a mistake from amending patches. I changed the code and
removed this line in the wrong patch.

> >  	}
> >  
> > -	new_id->pid = start_userspace(stack);
> > -	if (new_id->pid < 0) {
> > -		ret = new_id->pid;
> > +	ret = start_userspace(new_id);
> > +	if (ret < 0)
> >  		goto out_free;
> > -	}
> >  
> >  	/* Ensure the new MM is clean and nothing unwanted is mapped */
> >  	unmap(new_id, 0, STUB_START);
> > diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
> > index 23c99b285e82..f40f2332b676 100644
> > --- a/arch/um/kernel/skas/stub_exe.c
> > +++ b/arch/um/kernel/skas/stub_exe.c
> > @@ -3,6 +3,9 @@
> >  #include <asm/unistd.h>
> >  #include <sysdep/stub.h>
> >  #include <stub-data.h>
> > +#include <linux/filter.h>
> > +#include <linux/seccomp.h>
> > +#include <generated/asm-offsets.h>
> >  
> >  void _start(void);
> >  
> > @@ -25,8 +28,6 @@ noinline static void real_init(void)
> >  	} sa = {
> >  		/* Need to set SA_RESTORER (but the handler never returns) */
> >  		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
> > -		/* no need to mask any signals */
> > -		.sa_mask = 0,
> >  	};
> >  
> >  	/* set a nice name */
> > @@ -35,6 +36,9 @@ noinline static void real_init(void)
> >  	/* Make sure this process dies if the kernel dies */
> >  	stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
> >  
> > +	/* Needed in SECCOMP mode (and safe to do anyway) */
> > +	stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> > +
> >  	/* read information from STDIN and close it */
> >  	res = stub_syscall3(__NR_read, 0,
> >  			    (unsigned long)&init_data, sizeof(init_data));
> > @@ -63,18 +67,133 @@ noinline static void real_init(void)
> >  	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
> >  	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
> >  
> > -	/* register SIGSEGV handler */
> > -	sa.sa_handler_ = (void *) init_data.segv_handler;
> > -	res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
> > -			    sizeof(sa.sa_mask));
> > -	if (res != 0)
> > -		stub_syscall1(__NR_exit, 13);
> > +	/* register signal handlers */
> > +	sa.sa_handler_ = (void *) init_data.signal_handler;
> > +	sa.sa_restorer = (void *) init_data.signal_restorer;
> > +	if (!init_data.seccomp) {
> > +		/* In ptrace mode, the SIGSEGV handler never returns */
> > +		sa.sa_mask = 0;
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 13);
> > +	} else {
> > +		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
> > +		sa.sa_mask = ~0ULL;
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 14);
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 15);
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 16);
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 17);
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 18);
> > +
> > +		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
> > +				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +		if (res != 0)
> > +			stub_syscall1(__NR_exit, 19);
> > +	}
> > +
> > +	/*
> > +	 * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
> > +	 * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
> > +	 */
> > +	if (init_data.seccomp) {
> > +		struct sock_filter filter[] = {
> > +#if __BITS_PER_LONG > 32
> > +			/* [0] Load upper 32bit of instruction pointer from seccomp_data */
> > +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +				 (offsetof(struct seccomp_data, instruction_pointer) + 4)),
> > +
> > +			/* [1] Jump forward 3 instructions if the upper address is not identical */
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
> > +#endif
> > +			/* [2] Load lower 32bit of instruction pointer from seccomp_data */
> > +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +				 (offsetof(struct seccomp_data, instruction_pointer))),
> > +
> > +			/* [3] Mask out lower bits */
> > +			BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
> > +
> > +			/* [4] Jump to [6] if the lower bits are not on the expected page */
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
> > +
> > +			/* [5] Trap call, allow */
> > +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
> > +
> > +			/* [6,7] Check architecture */
> > +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +				 offsetof(struct seccomp_data, arch)),
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
> > +				 UM_SECCOMP_ARCH_NATIVE, 1, 0),
> > +
> > +			/* [8] Kill (for architecture check) */
> > +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> > +
> > +			/* [9] Load syscall number */
> > +			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +				 offsetof(struct seccomp_data, nr)),
> > +
> > +			/* [10-14] Check against permitted syscalls */
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
> > +				 5, 0),
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
> > +				 4, 0),
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
> > +				 3, 0),
> > +#ifdef __i386__
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
> > +				 2, 0),
> > +#else
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
> > +				 2, 0),
> > +#endif
> > +			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
> > +				 1, 0),
> 
> I was trying to understand what you mean 'permitted syscalls' here.
> Is this a list of syscall used by UML itself, or something else ?
> 
> and should the list be maintained/updated if UML expands the permitted
> syscalls ?

"Permitted" here means the legitimate host syscalls that the stub code
itself needs. So yes, this is the list of syscalls that UML itself uses
(see "stub_signal_interrupt"), and it would have to be extended if the
stub ever starts using additional syscalls. The userspace process
itself is not allowed to do any host syscalls.
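If it helps to see the pattern outside of UML: below is a rough,
untested sketch (not the stub code) of the same whitelist-and-kill idea
as an ordinary x86-64 userspace program. The two allowed syscalls are
picked purely for the demo:

/* seccomp whitelist demo: allow only write() and exit_group(),
 * kill the process on any other syscall. x86-64 only. */
#include <stddef.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/audit.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* Check the architecture so the syscall numbers mean
		 * what we expect; otherwise kill the process. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, arch)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),

		/* Load the syscall number and compare it against the
		 * whitelist; everything else gets killed. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 2, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	/* Same as the PR_SET_NO_NEW_PRIVS call in the stub; needed to
	 * install a filter without CAP_SYS_ADMIN. */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	write(1, "still alive\n", 12);	/* permitted */
	syscall(__NR_exit_group, 0);	/* permitted */
}

The stub filter quoted above puts one thing in front of this: the
instruction pointer check. Any syscall made outside of the stub page
gets SECCOMP_RET_TRAP, so the resulting SIGSYS is how userspace
syscalls end up being handled, and only code on the stub page can
reach the whitelist at all.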
> > +			/* [15] Not one of the permitted syscalls */
> > +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> > +
> > +			/* [16] Permitted call for the stub */
> > +			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
> > +		};
> > +		struct sock_fprog prog = {
> > +			.len = sizeof(filter) / sizeof(filter[0]),
> > +			.filter = filter,
> > +		};
> > +
> > +		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
> > +				  SECCOMP_FILTER_FLAG_TSYNC,
> > +				  (unsigned long)&prog) != 0)
> > +			stub_syscall1(__NR_exit, 20);
> >  
> > -	stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> > +		/* Fall through, the exit syscall will cause SIGSYS */
> > +	} else {
> > +		stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> >  
> > -	stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> > +		stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> > +	}
> >  
> > -	stub_syscall1(__NR_exit, 14);
> > +	stub_syscall1(__NR_exit, 30);
> >  
> >  	__builtin_unreachable();
> >  }

> I was thinking that if I can clean up (or share) the seccomp filter
> code of nommu UML with this, but it is not likely as the memory layout
> is different. I would think that the detection part might be useful
> as well for nommu.

I would guess there is not much overlap. You might also need a 64-bit
pointer comparison to check which memory range the code is in, but that
is probably about it.

Benjamin
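P.S.: In case it is useful for nommu, the 64-bit comparison I mean
would look roughly like this in classic BPF. Untested sketch;
CODE_START/CODE_END are placeholders for whatever range nommu ends up
using, both are assumed to fall into the same 4GiB window (identical
upper 32 bits), and the +4 offset assumes a little-endian seccomp_data
layout, as in the stub filter above:

#include <stddef.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* Hypothetical code range that is allowed to make syscalls */
#define CODE_START 0x100000000000ULL
#define CODE_END   0x100000001000ULL

static struct sock_filter range_check[] = {
	/* [0] Load upper 32 bits of the instruction pointer */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
		 offsetof(struct seccomp_data, instruction_pointer) + 4),
	/* [1] If the upper bits differ, jump to [6] (not in range) */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, CODE_START >> 32, 0, 4),
	/* [2] Load lower 32 bits of the instruction pointer */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
		 offsetof(struct seccomp_data, instruction_pointer)),
	/* [3] Below the start of the range? Jump to [6] */
	BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, CODE_START & 0xffffffff, 0, 2),
	/* [4] At or past the end of the range? Jump to [6] */
	BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, CODE_END & 0xffffffff, 1, 0),
	/* [5] In range */
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	/* [6] Not in range */
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
};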