One problem with seccomp was that ptrace could be used to change a syscall after seccomp filtering had completed. This was a well documented limitation, and it was recommended to block ptrace when defining a filter to avoid this problem. This can be quite a limitation for containers or other places where ptrace is desired even under seccomp filters.
Since seccomp filtering has been split into pre-trace and trace phases (phase1 and phase2 respectively), it's possible to re-run phase1 seccomp after ptrace. This makes that change, and updates the test suite for both SECCOMP_RET_TRACE and PTRACE_SYSCALL manipulation. Signed-off-by: Kees Cook <keesc...@chromium.org> --- include/linux/seccomp.h | 6 + include/linux/tracehook.h | 8 +- kernel/seccomp.c | 42 ++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 176 ++++++++++++++++++++++++-- 4 files changed, 220 insertions(+), 12 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 2296e6b2f690..e2b72394c200 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -85,6 +85,7 @@ static inline int seccomp_mode(struct seccomp *s) #ifdef CONFIG_SECCOMP_FILTER extern void put_seccomp_filter(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); +extern int seccomp_phase1_recheck(void); #else /* CONFIG_SECCOMP_FILTER */ static inline void put_seccomp_filter(struct task_struct *tsk) { @@ -94,6 +95,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) { return; } + +static inline int seccomp_phase1_recheck(void) +{ + return 0; +} #endif /* CONFIG_SECCOMP_FILTER */ #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 26c152122a42..69b584d88508 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -48,6 +48,7 @@ #include <linux/sched.h> #include <linux/ptrace.h> +#include <linux/seccomp.h> #include <linux/security.h> #include <linux/task_work.h> #include <linux/memcontrol.h> @@ -100,7 +101,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs); + int skip; + + skip = ptrace_report_syscall(regs); + if (skip) + return skip; + return seccomp_phase1_recheck(); } /** diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7002796f14a4..6eaa3a1c5edb 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -665,6 +665,46 @@ u32 seccomp_phase1(struct seccomp_data *sd) } /** + * seccomp_phase1_recheck() - recheck phase1 in the context of ptrace + * + * This re-runs phase 1 seccomp checks in the case where ptrace may have + * just changed things out from under us. + * + * Returns 0 if the syscall should be processed or -1 to skip the syscall. + */ +int seccomp_phase1_recheck(void) +{ + u32 action; + + /* If we're not under seccomp, continue normally. */ + if (!test_thread_flag(TIF_SECCOMP)) + return 0; + + /* Pass NULL struct seccomp_data to force reload after ptrace. */ + action = seccomp_phase1(NULL); + switch (action) { + case SECCOMP_PHASE1_OK: + /* Passes seccomp, continue normally. */ + break; + case SECCOMP_PHASE1_SKIP: + /* Skip the syscall. */ + return -1; + default: + if ((action & SECCOMP_RET_ACTION) != SECCOMP_RET_TRACE) { + /* Impossible return value: kill the process. */ + do_exit(SIGSYS); + } + /* + * We've hit a trace request, but ptrace already put us + * into this state, so just continue. + */ + break; + } + + return 0; +} + +/** * seccomp_phase2() - finish slow path seccomp work for the current syscall * @phase1_result: The return value from seccomp_phase1() * @@ -701,6 +741,8 @@ int seccomp_phase2(u32 phase1_result) do_exit(SIGSYS); if (syscall_get_nr(current, regs) < 0) return -1; /* Explicit request to skip. */ + if (seccomp_phase1_recheck() < 0) + return -1; return 0; } diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 7947e568e057..b8f6c743998f 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1021,8 +1021,8 @@ void tracer_stop(int sig) typedef void tracer_func_t(struct __test_metadata *_metadata, pid_t tracee, int status, void *args); -void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, - tracer_func_t tracer_func, void *args) +void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, + tracer_func_t tracer_func, void *args, bool ptrace_syscall) { int ret = -1; struct sigaction action = { @@ -1042,12 +1042,16 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, /* Wait for attach stop */ wait(NULL); - ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, PTRACE_O_TRACESECCOMP); + ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ? + PTRACE_O_TRACESYSGOOD : + PTRACE_O_TRACESECCOMP); ASSERT_EQ(0, ret) { TH_LOG("Failed to set PTRACE_O_TRACESECCOMP"); kill(tracee, SIGKILL); } - ptrace(PTRACE_CONT, tracee, NULL, 0); + ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT, + tracee, NULL, 0); + ASSERT_EQ(0, ret); /* Unblock the tracee */ ASSERT_EQ(1, write(fd, "A", 1)); @@ -1063,12 +1067,13 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, /* Child is dead. Time to go. */ return; - /* Make sure this is a seccomp event. */ - ASSERT_EQ(true, IS_SECCOMP_EVENT(status)); + /* Check if this is a seccomp event. */ + ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status)); tracer_func(_metadata, tracee, status, args); - ret = ptrace(PTRACE_CONT, tracee, NULL, NULL); + ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT, + tracee, NULL, 0); ASSERT_EQ(0, ret); } /* Directly report the status of our test harness results. */ @@ -1079,7 +1084,7 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, void cont_handler(int num) { } pid_t setup_trace_fixture(struct __test_metadata *_metadata, - tracer_func_t func, void *args) + tracer_func_t func, void *args, bool ptrace_syscall) { char sync; int pipefd[2]; @@ -1095,7 +1100,8 @@ pid_t setup_trace_fixture(struct __test_metadata *_metadata, signal(SIGALRM, cont_handler); if (tracer_pid == 0) { close(pipefd[0]); - tracer(_metadata, pipefd[1], tracee, func, args); + start_tracer(_metadata, pipefd[1], tracee, func, args, + ptrace_syscall); syscall(__NR_exit, 0); } close(pipefd[1]); @@ -1177,7 +1183,7 @@ FIXTURE_SETUP(TRACE_poke) /* Launch tracer. */ self->tracer = setup_trace_fixture(_metadata, tracer_poke, - &self->tracer_args); + &self->tracer_args, false); } FIXTURE_TEARDOWN(TRACE_poke) @@ -1395,6 +1401,29 @@ void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee, } +void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, + int status, void *args) +{ + int ret, nr; + unsigned long msg; + static bool entry; + + /* Make sure we got an empty message. */ + ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, msg); + + /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */ + entry = !entry; + if (!entry) + return; + + nr = get_syscall(_metadata, tracee); + + if (nr == __NR_getpid) + change_syscall(_metadata, tracee, __NR_getppid); +} + FIXTURE_DATA(TRACE_syscall) { struct sock_fprog prog; pid_t tracer, mytid, mypid, parent; @@ -1436,7 +1465,8 @@ FIXTURE_SETUP(TRACE_syscall) ASSERT_NE(self->parent, self->mypid); /* Launch tracer. */ - self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL); + self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL, + false); } FIXTURE_TEARDOWN(TRACE_syscall) @@ -1496,6 +1526,130 @@ TEST_F(TRACE_syscall, syscall_dropped) EXPECT_NE(self->mytid, syscall(__NR_gettid)); } +TEST_F(TRACE_syscall, skip_after_RET_TRACE) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Install fixture filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Install "errno on getppid" filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Tracer will redirect getpid to getppid, and we should see EPERM. */ + EXPECT_EQ(-1, syscall(__NR_getpid)); + EXPECT_EQ(EPERM, errno); +} + +TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Install fixture filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Install "death on getppid" filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Tracer will redirect getpid to getppid, and we should die. */ + EXPECT_NE(self->mypid, syscall(__NR_getpid)); +} + +TEST_F(TRACE_syscall, skip_after_ptrace) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Install "errno on getppid" filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Tracer will redirect getpid to getppid, and we should see EPERM. */ + EXPECT_EQ(-1, syscall(__NR_getpid)); + EXPECT_EQ(EPERM, errno); +} + +TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Install "death on getppid" filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + /* Tracer will redirect getpid to getppid, and we should die. */ + EXPECT_NE(self->mypid, syscall(__NR_getpid)); +} + #ifndef __NR_seccomp # if defined(__i386__) # define __NR_seccomp 354 -- 2.6.3 -- Kees Cook Chrome OS & Brillo Security