This commit adds a seccomp-based syscall handler, which serves two purposes: 1) hooking syscalls issued from userspace memory (identified by $rip), and 2) when zpoline is used, blocking and reporting syscalls that zpoline cannot translate, i.e. syscall/sysenter instructions in a) dlopen-ed code or b) JIT-generated code.
The SIGSYS signal is raised when a syscall is executed from an address between uml_reserved and high_physmem, which is where userspace memory is located. Signed-off-by: Hajime Tazaki <thehaj...@gmail.com> Signed-off-by: Kenichi Yasukata <kenichi.yasuk...@gmail.com> --- arch/um/include/shared/kern_util.h | 2 + arch/um/include/shared/os.h | 6 +++ arch/um/kernel/trap.c | 12 +++++ arch/um/kernel/um_arch.c | 4 ++ arch/um/os-Linux/process.c | 78 ++++++++++++++++++++++++++++ arch/um/os-Linux/signal.c | 22 ++++++++ arch/x86/um/os-Linux/mcontext.c | 22 ++++++++ arch/x86/um/shared/sysdep/mcontext.h | 4 ++ arch/x86/um/zpoline.c | 15 ++++++ 9 files changed, 165 insertions(+) diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index f21dc8517538..9b26386dd2ea 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -67,4 +67,6 @@ void um_idle_sleep(void); void kasan_map_memory(void *start, size_t len); +extern void trap_sigsys(struct uml_pt_regs *regs); + #endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 6874be0c38a8..c979a8b15434 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -220,6 +220,9 @@ extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); extern int os_mincore(void *addr, unsigned long len); +#ifndef CONFIG_MMU +extern int os_setup_seccomp(void); +#endif void os_set_pdeathsig(void); @@ -252,6 +255,9 @@ extern void register_pm_wake_signal(void); extern void block_signals_hard(void); extern void unblock_signals_hard(void); extern void mark_sigio_pending(void); +#ifndef CONFIG_MMU +extern int um_zpoline_enabled; +#endif /* util.c */ extern void stack_protections(unsigned long address); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index a7519b3de4bf..f23ba7f9a82d 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -310,3 +310,15 @@ void winch(int sig, struct siginfo 
*unused_si, struct uml_pt_regs *regs) { do_IRQ(WINCH_IRQ, regs); } + +void trap_sigsys(struct uml_pt_regs *regs) +{ + struct task_struct *tsk = current; + + pr_info_ratelimited("%s%s[%d]: sigsys ip %p sp %p\n", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), + (void *)UPT_IP(regs), (void *)UPT_SP(regs)); + + force_sig(SIGSYS); +} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 62ddb865eb91..d89752bf5be0 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -432,6 +432,10 @@ void __init setup_arch(char **cmdline_p) add_bootloader_randomness(rng_seed, sizeof(rng_seed)); memzero_explicit(rng_seed, sizeof(rng_seed)); } + +#ifndef CONFIG_MMU + os_setup_seccomp(); +#endif } void __init arch_cpu_finalize_init(void) diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index ef1a2f0aa06a..4e0b21b4b00c 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -17,7 +17,11 @@ #include <asm/unistd.h> #include <init.h> #include <longjmp.h> +#include <as-layout.h> #include <os.h> +#include <sys/prctl.h> +#include <linux/filter.h> +#include <linux/seccomp.h> void os_alarm_process(int pid) { @@ -209,3 +213,77 @@ void os_set_pdeathsig(void) { prctl(PR_SET_PDEATHSIG, SIGKILL); } + +#ifndef CONFIG_MMU +int os_setup_seccomp(void) +{ + int err; + unsigned long __userspace_start = uml_reserved, + __userspace_end = high_physmem; + + struct sock_filter filter[] = { + /* if (IP_high > __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, __userspace_end >> 32, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_end && IP_low >= __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_end >> 32, + 
/*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_end, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start >> 32, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_start && IP_low < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_start >> 32, + /*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* other address; trap */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP), + }; + struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + + err = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (err) + os_warn("PR_SET_NO_NEW_PRIVS (err=%d, ernro=%d)\n", + err, errno); + + err = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog); + if (err) { + os_warn("SECCOMP_SET_MODE_FILTER (err=%d, ernro=%d)\n", + err, errno); + exit(-1); + } + + set_handler(SIGSYS); + + os_info("seccomp: filter syscalls in the range: 0x%lx-0x%lx\n", + __userspace_start, __userspace_end); + + return 0; +} +#endif diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 9ea7269ffb77..c0d1fb1fc0c4 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -20,6 +20,25 @@ #include <um_malloc.h> #include <sys/ucontext.h> #include <timetravel.h> +#include 
<init.h> + +#ifndef CONFIG_MMU +static void sigsys_handler(int sig, struct siginfo *si, mcontext_t *mc) +{ + struct uml_pt_regs r; + + if (!um_zpoline_enabled) { + /* hook syscall via SIGSYS */ + mc_set_sigsys_hook(mc); + } else { + /* trap SIGSYS to userspace */ + get_regs_from_mc(&r, mc); + trap_sigsys(&r); + /* force handle signals after rt_sigreturn() */ + mc_set_regs_ip_relay(mc); + } +} +#endif void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { [SIGTRAP] = relay_signal, @@ -178,6 +197,9 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGILL] = sig_handler, [SIGFPE] = sig_handler, [SIGTRAP] = sig_handler, +#ifndef CONFIG_MMU + [SIGSYS] = sigsys_handler, +#endif [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index e80ab7d28117..d876e34a9c7a 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -4,6 +4,7 @@ #include <asm/ptrace.h> #include <sysdep/ptrace.h> #include <sysdep/mcontext.h> +#include <sysdep/syscalls.h> void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) { @@ -31,3 +32,24 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) regs->gp[CS / sizeof(unsigned long)] |= 3; #endif } + +#ifndef CONFIG_MMU +static void userspace_sigreturn(void) +{ + __asm__ volatile("movq $15, %rax"); + __asm__ volatile("call *%0" : : "r"(__kernel_vsyscall) :); +} + +void mc_set_regs_ip_relay(mcontext_t *mc) +{ + mc->gregs[REG_RIP] = (unsigned long) userspace_sigreturn; +} + +void mc_set_sigsys_hook(mcontext_t *mc) +{ + mc->gregs[REG_RSP] -= sizeof(unsigned long); + *((unsigned long *) (mc->gregs[REG_RSP])) = mc->gregs[REG_RIP]; + mc->gregs[REG_RCX] = mc->gregs[REG_RIP]; + mc->gregs[REG_RIP] = (unsigned long) __kernel_vsyscall; +} +#endif diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h index b724c54da316..0e837f4b5757 100644 --- 
a/arch/x86/um/shared/sysdep/mcontext.h +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -7,6 +7,10 @@ #define __SYS_SIGCONTEXT_X86_H extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); +#ifndef CONFIG_MMU +extern void mc_set_sigsys_hook(mcontext_t *mc); +extern void mc_set_regs_ip_relay(mcontext_t *mc); +#endif #ifdef __i386__ diff --git a/arch/x86/um/zpoline.c b/arch/x86/um/zpoline.c index 97f5345ab314..6ec44233276b 100644 --- a/arch/x86/um/zpoline.c +++ b/arch/x86/um/zpoline.c @@ -14,6 +14,7 @@ #include <sysdep/syscalls.h> #include <os.h> +int um_zpoline_enabled; /* start of trampoline code area */ static char *__zpoline_start; @@ -111,6 +112,10 @@ int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, int err = 0, count = 0; struct mm_struct *mm = current->mm; + /* zpoline disabled */ + if (!um_zpoline_enabled) + return 0; + if (down_write_killable(&mm->mmap_lock)) return -EINTR; @@ -221,3 +226,13 @@ static int __init setup_zpoline_trampoline(void) return 0; } arch_initcall(setup_zpoline_trampoline); + +static int __init zpoline_set(char *str) +{ + int val = 0; + + get_option(&str, &val); + um_zpoline_enabled = val; + return 1; +} +__setup("zpoline=", zpoline_set); -- 2.43.0