Hi,

On Fri, 2025-03-07 at 16:04 +0900, Hajime Tazaki wrote:
> thanks for the update; was waiting for this.
> 
> On Tue, 25 Feb 2025 03:18:25 +0900,
> Benjamin Berg wrote:
> > 
> > This adds the kernel side of the seccomp based process handling.
> > 
> > Co-authored-by: Johannes Berg <johan...@sipsolutions.net>
> > Signed-off-by: Benjamin Berg <benja...@sipsolutions.net>
> > Signed-off-by: Benjamin Berg <benjamin.b...@intel.com>
> (snip)
> > diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
> > index f8ee5d612c47..0abc509e3f4c 100644
> > --- a/arch/um/kernel/skas/mmu.c
> > +++ b/arch/um/kernel/skas/mmu.c
> > @@ -38,14 +38,11 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
> >     scoped_guard(spinlock_irqsave, &mm_list_lock) {
> >             /* Insert into list, used for lookups when the child dies */
> >             list_add(&mm->context.list, &mm_list);
> > -
> 
> maybe this is not needed.

Oh, that is a mistake from amending the patches: I changed the code,
but the removal of this line ended up in the wrong patch.

> >     }
> >  
> > -   new_id->pid = start_userspace(stack);
> > -   if (new_id->pid < 0) {
> > -           ret = new_id->pid;
> > +   ret = start_userspace(new_id);
> > +   if (ret < 0)
> >             goto out_free;
> > -   }
> >  
> >     /* Ensure the new MM is clean and nothing unwanted is mapped */
> >     unmap(new_id, 0, STUB_START);
> > diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
> > index 23c99b285e82..f40f2332b676 100644
> > --- a/arch/um/kernel/skas/stub_exe.c
> > +++ b/arch/um/kernel/skas/stub_exe.c
> > @@ -3,6 +3,9 @@
> >  #include <asm/unistd.h>
> >  #include <sysdep/stub.h>
> >  #include <stub-data.h>
> > +#include <linux/filter.h>
> > +#include <linux/seccomp.h>
> > +#include <generated/asm-offsets.h>
> >  
> >  void _start(void);
> >  
> > @@ -25,8 +28,6 @@ noinline static void real_init(void)
> >     } sa = {
> >             /* Need to set SA_RESTORER (but the handler never returns) */
> >             .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
> > -           /* no need to mask any signals */
> > -           .sa_mask = 0,
> >     };
> >  
> >     /* set a nice name */
> > @@ -35,6 +36,9 @@ noinline static void real_init(void)
> >     /* Make sure this process dies if the kernel dies */
> >     stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
> >  
> > +   /* Needed in SECCOMP mode (and safe to do anyway) */
> > +   stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> > +
> >     /* read information from STDIN and close it */
> >     res = stub_syscall3(__NR_read, 0,
> >                         (unsigned long)&init_data, sizeof(init_data));
> > @@ -63,18 +67,133 @@ noinline static void real_init(void)
> >     stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
> >     stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
> >  
> > -   /* register SIGSEGV handler */
> > -   sa.sa_handler_ = (void *) init_data.segv_handler;
> > -   res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
> > -                       sizeof(sa.sa_mask));
> > -   if (res != 0)
> > -           stub_syscall1(__NR_exit, 13);
> > +   /* register signal handlers */
> > +   sa.sa_handler_ = (void *) init_data.signal_handler;
> > +   sa.sa_restorer = (void *) init_data.signal_restorer;
> > +   if (!init_data.seccomp) {
> > +           /* In ptrace mode, the SIGSEGV handler never returns */
> > +           sa.sa_mask = 0;
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 13);
> > +   } else {
> > +           /* SECCOMP mode uses rt_sigreturn, need to mask all signals */
> > +           sa.sa_mask = ~0ULL;
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 14);
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 15);
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 16);
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 17);
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGILL,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 18);
> > +
> > +           res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
> > +                               (unsigned long)&sa, 0, sizeof(sa.sa_mask));
> > +           if (res != 0)
> > +                   stub_syscall1(__NR_exit, 19);
> > +   }
> > +
> > +   /*
> > +    * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
> > +    * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
> > +    */
> > +   if (init_data.seccomp) {
> > +           struct sock_filter filter[] = {
> > +#if __BITS_PER_LONG > 32
> > +                   /* [0] Load upper 32bit of instruction pointer from seccomp_data */
> > +                   BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +                            (offsetof(struct seccomp_data, instruction_pointer) + 4)),
> > +
> > +                   /* [1] Jump forward 3 instructions if the upper address is not identical */
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
> > +#endif
> > +                   /* [2] Load lower 32bit of instruction pointer from seccomp_data */
> > +                   BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +                            (offsetof(struct seccomp_data, instruction_pointer))),
> > +
> > +                   /* [3] Mask out lower bits */
> > +                   BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
> > +
> > +                   /* [4] Jump to [6] if the lower bits are not on the expected page */
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
> > +
> > +                   /* [5] Trap call, allow */
> > +                   BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
> > +
> > +                   /* [6,7] Check architecture */
> > +                   BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +                            offsetof(struct seccomp_data, arch)),
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
> > +                            UM_SECCOMP_ARCH_NATIVE, 1, 0),
> > +
> > +                   /* [8] Kill (for architecture check) */
> > +                   BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> > +
> > +                   /* [9] Load syscall number */
> > +                   BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
> > +                            offsetof(struct seccomp_data, nr)),
> > +
> > +                   /* [10-14] Check against permitted syscalls */
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
> > +                            5, 0),
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
> > +                            4, 0),
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
> > +                            3, 0),
> > +#ifdef __i386__
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
> > +                            2, 0),
> > +#else
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
> > +                            2, 0),
> > +#endif
> > +                   BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
> > +                            1, 0),
> 
> I was trying to understand what you mean by 'permitted syscalls' here.
> Is this a list of syscalls used by UML itself, or something else?
> 
> And should the list be maintained/updated if UML expands the permitted
> syscalls?

"permitted" are the legitimate syscalls that the stub code needs. So
yes, this is what UML itself uses (see "stub_signal_interrupt"). The
userspace process itself is not allowed to do any host syscalls.

> > +                   /* [15] Not one of the permitted syscalls */
> > +                   BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
> > +
> > +                   /* [16] Permitted call for the stub */
> > +                   BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
> > +           };
> > +           struct sock_fprog prog = {
> > +                   .len = sizeof(filter) / sizeof(filter[0]),
> > +                   .filter = filter,
> > +           };
> > +
> > +           if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
> > +                             SECCOMP_FILTER_FLAG_TSYNC,
> > +                             (unsigned long)&prog) != 0)
> > +                   stub_syscall1(__NR_exit, 20);
> >  
> > -   stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> > +           /* Fall through, the exit syscall will cause SIGSYS */
> > +   } else {
> > +           stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> >  
> > -   stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> > +           stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
> > +   }
> >  
> > -   stub_syscall1(__NR_exit, 14);
> > +   stub_syscall1(__NR_exit, 30);
> >  
> >     __builtin_unreachable();
> >  }
> 
> I was thinking about whether I could clean up (or share) the seccomp
> filter code of nommu UML with this, but it is not likely, as the
> memory layout is different. I would think that the detection part
> might be useful for nommu as well.

I would guess there is not much overlap. You might also need a 64-bit
pointer comparison to check which memory range the code is in, but that
is probably about it.
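
Since classic BPF only operates on 32-bit words, that 64-bit comparison
has to be split into two loads, roughly like the prologue of the filter
above (untested sketch; CODE_START is a made-up, page-aligned stand-in
for the real stub address, the +4 offset assumes little-endian as on
x86, and the final ALLOW stands in for the arch and syscall-number
checks that follow in the real filter):

#include <stddef.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#define CODE_START 0x7f0000001000ULL	/* hypothetical stub page */

static const struct sock_filter ip_check[] = {
	/* [0] Load the upper 32 bits of the instruction pointer */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
		 offsetof(struct seccomp_data, instruction_pointer) + 4),
	/* [1] Upper halves differ: cannot be the stub page, go to [5] */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, CODE_START >> 32, 0, 3),
	/* [2] Load the lower 32 bits of the instruction pointer */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
		 offsetof(struct seccomp_data, instruction_pointer)),
	/* [3] Keep only the page part of the low half */
	BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
	/* [4] On the expected page: skip the trap */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, CODE_START & 0xfffff000, 1, 0),
	/* [5] Syscall from outside the stub: SIGSYS via RET_TRAP */
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
	/* [6] Stand-in for the real filter's remaining checks */
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};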

Benjamin
