As syscall translation done by zpoline assumes that there are no direct syscalls issued by userspace code, but there would be possibly issued by 1) dlopen-ed code containing syscall instructions, or 2) JIT-generated code. This commit add a seccomp filter to prevent such syscalls from userspace code.
Signed-off-by: Hajime Tazaki <thehaj...@gmail.com> --- arch/um/include/shared/os.h | 3 ++ arch/um/kernel/um_arch.c | 4 ++ arch/um/os-Linux/process.c | 76 +++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 6874be0c38a8..5a6722f254d5 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -220,6 +220,9 @@ extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); extern int os_mincore(void *addr, unsigned long len); +#ifndef CONFIG_MMU +extern int os_setup_seccomp(void); +#endif void os_set_pdeathsig(void); diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index ec17576ce9fc..694e428ddf35 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -433,6 +433,10 @@ void __init setup_arch(char **cmdline_p) add_bootloader_randomness(rng_seed, sizeof(rng_seed)); memzero_explicit(rng_seed, sizeof(rng_seed)); } + +#ifndef CONFIG_MMU + os_setup_seccomp(); +#endif } void __init arch_cpu_finalize_init(void) diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index ef1a2f0aa06a..ed3d99301dc8 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -17,7 +17,11 @@ #include <asm/unistd.h> #include <init.h> #include <longjmp.h> +#include <as-layout.h> #include <os.h> +#include <sys/prctl.h> +#include <linux/filter.h> +#include <linux/seccomp.h> void os_alarm_process(int pid) { @@ -209,3 +213,75 @@ void os_set_pdeathsig(void) { prctl(PR_SET_PDEATHSIG, SIGKILL); } + +#ifndef CONFIG_MMU +int os_setup_seccomp(void) +{ + int err; + unsigned long __userspace_start = uml_reserved, + __userspace_end = high_physmem; + + struct sock_filter filter[] = { + /* if (IP_high > __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGT + BPF_K, __userspace_end >> 32, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_end && IP_low >= __userspace_end) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_end >> 32, + /*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_end, + /*true-skip=*/0, /*false-skip=*/1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start >> 32, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* if (IP_high == __userspace_start && IP_low < __userspace_start) allow; */ + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer) + 4), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __userspace_start >> 32, + /*true-skip=*/0, /*false-skip=*/3), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, + offsetof(struct seccomp_data, instruction_pointer)), + BPF_JUMP(BPF_JMP + BPF_JGE + BPF_K, __userspace_start, + /*true-skip=*/1, /*false-skip=*/0), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), + + /* other address; trap */ + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP), + }; + struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + + err = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (err) + os_warn("PR_SET_NO_NEW_PRIVS (err=%d, ernro=%d)\n", + err, errno); + + err = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog); + if (err) { + os_warn("SECCOMP_SET_MODE_FILTER (err=%d, ernro=%d)\n", + err, errno); + exit(-1); + } + + os_info("seccomp: filter syscalls in the range: 0x%lx-0x%lx\n", + __userspace_start, __userspace_end); + + return 0; +} +#endif -- 2.43.0