After enabling GENERIC_ENTRY on PowerPC, syscall_enter_from_user_mode()
returns -1 as a sentinel to signal that seccomp or ptrace has intercepted
the syscall and already set a return value via syscall_set_return_value().
system_call_exception() was not handling this sentinel, and since -1UL
is >= NR_syscalls, the code fell into the out-of-range path and returned
-ENOSYS, overwriting the errno already placed in regs->gpr[3].

The naive fix of checking r0 == -1L before the NR_syscalls bounds check
is ambiguous: a user legitimately calling syscall(-1) also produces r0 ==
-1L, and a tracer intercepting such a call would have its injected return
value silently discarded.

Fix this properly by introducing regs->entry_flags, a kernel-internal
field in struct pt_regs (consuming one slot of the existing __pt_regs_pad
so the ABI is preserved), with SYSCALL_ENTRY_RET_SET as an out-of-band
flag. syscall_set_return_value() sets this flag whenever seccomp or ptrace
injects a return value. system_call_exception() zeros entry_flags before
calling syscall_enter_from_user_mode(), then checks and clears the flag
afterwards: if set, it returns regs->gpr[3] directly regardless of what
syscall number the user originally requested.

This handles all seccomp actions correctly:

  - SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE (no tracer), SECCOMP_RET_USER_NOTIF:
    all call syscall_set_return_value(), flag is set, injected value returned.
  - SECCOMP_RET_TRAP, SECCOMP_RET_KILL: call syscall_rollback() and deliver
    a signal; flag is not set, but the process is dying so the return value
    is irrelevant.

The fix covers both ppc32 and ppc64 with no #ifdefs.

Fixes: bee25f97ad24 ("powerpc: Enable GENERIC_ENTRY feature")
Reported-by: Michal Suchánek <[email protected]>
Closes: https://lore.kernel.org/all/[email protected]/
Signed-off-by: Mukesh Kumar Chaurasiya (IBM) <[email protected]>
---
v2 -> v3:
 - Last fix is not working for -1 syscall. Fixed that with this.
v2: https://lore.kernel.org/all/[email protected]

v1 -> v2:
 - Fix issues in the previous fix (Michal)
v1: https://lore.kernel.org/all/[email protected]

 arch/powerpc/include/asm/ptrace.h      | 22 +++++++++++++++++++++-
 arch/powerpc/include/asm/syscall.h     |  6 ++++++
 arch/powerpc/include/uapi/asm/ptrace.h |  6 ++++--
 arch/powerpc/kernel/ptrace/ptrace.c    |  2 ++
 arch/powerpc/kernel/syscall.c          | 18 ++++++++++++++++++
 5 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index fdeb97421785..1a53d5cfa8db 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -54,8 +54,9 @@ struct pt_regs
                        };
                        unsigned long result;
                        unsigned long exit_flags;
+                       unsigned long entry_flags;
                        /* Maintain 16 byte interrupt stack alignment */
-                       unsigned long __pt_regs_pad[3];
+                       unsigned long __pt_regs_pad[2];
                };
        };
 #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_KUAP)
@@ -233,6 +234,25 @@ static inline unsigned long frame_pointer(struct pt_regs 
*regs)
 #define current_pt_regs() \
        ((struct pt_regs *)((unsigned long)task_stack_page(current) + 
THREAD_SIZE) - 1)
 
+/*
+ * SYSCALL_ENTRY_RET_SET: seccomp or ptrace called syscall_set_return_value()
+ * and wants the syscall skipped; regs->gpr[3] already holds the return value.
+ */
+#define SYSCALL_ENTRY_RET_SET  BIT(0)
+
+static inline void set_syscall_entry_ret(struct pt_regs *regs)
+{
+       regs->entry_flags |= SYSCALL_ENTRY_RET_SET;
+}
+
+static inline bool test_and_clear_syscall_entry_ret(struct pt_regs *regs)
+{
+       bool set = !!(regs->entry_flags & SYSCALL_ENTRY_RET_SET);
+
+       regs->entry_flags &= ~SYSCALL_ENTRY_RET_SET;
+       return set;
+}
+
 /*
  * The 4 low bits (0xf) are available as flags to overload the trap word,
  * because interrupt vectors have minimum alignment of 0x10. TRAP_FLAGS_MASK
diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index 834fcc4f7b54..9ae79326abe3 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -98,6 +98,12 @@ static inline void syscall_set_return_value(struct 
task_struct *task,
                        regs->gpr[3] = val;
                }
        }
+       /*
+        * Mark that a return value has been explicitly set by seccomp or
+        * ptrace so that system_call_exception() can skip the syscall
+        * unconditionally, even when the user requested syscall(-1).
+        */
+       set_syscall_entry_ret(regs);
 }
 
 static inline void syscall_get_arguments(struct task_struct *task,
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
b/arch/powerpc/include/uapi/asm/ptrace.h
index a393b7f2760a..2f2a43414fe6 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -56,7 +56,8 @@ struct pt_regs
        unsigned long dsisr;            /* on 4xx/Book-E used for ESR */
        unsigned long result;           /* Result of a system call */
        unsigned long exit_flags;       /* System call exit flags */
-       unsigned long __pt_regs_pad[3]; /* Maintain 16 byte interrupt stack 
alignment */
+       unsigned long entry_flags;      /* System call entry flags */
+       unsigned long __pt_regs_pad[2]; /* Maintain 16 byte interrupt stack 
alignment */
 };
 
 #endif /* __ASSEMBLER__ */
@@ -117,7 +118,8 @@ struct pt_regs
 #define PT_DSISR 42
 #define PT_RESULT 43
 #define PT_EXIT_FLAGS 44
-#define PT_PAD 47 /* 3 times */
+#define PT_ENTRY_FLAGS 45
+#define PT_PAD 46 /* 2 times */
 #define PT_DSCR 48
 #define PT_REGS_COUNT 48
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
b/arch/powerpc/kernel/ptrace/ptrace.c
index 316d4f5ead8e..440d00690cf2 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -235,6 +235,8 @@ void __init pt_regs_check(void)
                     offsetof(struct user_pt_regs, dsisr));
        BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
                     offsetof(struct user_pt_regs, result));
+       BUILD_BUG_ON(offsetof(struct pt_regs, entry_flags) !=
+                    offsetof(struct user_pt_regs, entry_flags));
 
        BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs));
 
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index a9da2af6efa8..c7458aae199d 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -19,9 +19,27 @@ notrace long system_call_exception(struct pt_regs *regs, 
unsigned long r0)
        long ret;
        syscall_fn f;
 
+       /*
+        * Zero entry_flags before syscall_enter_from_user_mode() so that
+        * syscall_set_return_value() can set SYSCALL_ENTRY_RET_SET as an
+        * unambiguous out-of-band signal.  The field is not initialised by
+        * the entry assembly.
+        */
+       regs->entry_flags = 0;
        add_random_kstack_offset();
        r0 = syscall_enter_from_user_mode(regs, r0);
 
+       /*
+        * Seccomp or ptrace may have set a return value and requested that
+        * the syscall be skipped. syscall_set_return_value() sets
+        * SYSCALL_ENTRY_RET_SET in regs->entry_flags as an
+        * unambiguous out-of-band signal. This avoids the ambiguity of
+        * using r0 == -1 as the skip sentinel when the user themselves
+        * called syscall(-1).
+        */
+       if (unlikely(test_and_clear_syscall_entry_ret(regs)))
+               return regs->gpr[3];
+
        if (unlikely(r0 >= NR_syscalls)) {
                if (unlikely(trap_is_unsupported_scv(regs))) {
                        /* Unsupported scv vector */
-- 
2.55.0


Reply via email to