Hello,
> Honestly, I think we need a test case to be able to move forward. The > test needs to trigger an exception (FPE, segfault, whatever) and then > handle the signal. In the signal handler, verify the register state in > the mcontext is expected (RIP, RSP, FP regs), then update it to not > raise an exception again and return. The test should obviously exit > cleanly afterwards. I agree to have a test case. I played with your RFC patch ([RFC 0/2] Experimental kunit test for signal context handling), which I guess the similar one which you gave me in the past, with minor modification for nommu mode, and looks like that test passed. (none):/# /root/test-fp-save-restore TAP version 13 1..1 # pre-signal: 50 / 100, 11223344 55667788 99aabbcc ddeeff00 # sighandler: extended_size: 2700, xstate_size: 2696 # post-signal: 51200 / 100, 11233345 55677789 99abbbcd ddefff01 (should change: 1, changed: 1) ok 1 mcontext # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 I couldn't invoke this test via `kunit.py run` (which I should investigate more), but this can be a good start to have the test case which you proposed. I will follow up the highlevel discussion on how syscall/signal entry/exit code is implemented in nommu, which I think I've been explained several times but why not :) -- Hajime On Fri, 11 Jul 2025 19:05:13 +0900, Benjamin Berg wrote: > > On Fri, 2025-07-11 at 11:39 +0200, Benjamin Berg wrote: > > [SNIP] > > > > That said, I would also still like to see a higher level discussion on > > how userspace registers are saved and restored. We have two separate > > cases--interrupts/exceptions (host signals) and the syscall path--and > > both need to be well defined. My hope is still that both of these can > > use the same register save/restore mechanism. > > Now syscalls are also just signals. The crucial difference is that for > syscalls you are allowed to clobber R11 and RCX. Your current syscall > entry code uses that fact, but that does not work for other signals. > > Benjamin > > > > > Benjamin > > > > > > > > --- > > > arch/um/include/shared/kern_util.h | 4 + > > > arch/um/nommu/Makefile | 2 +- > > > arch/um/nommu/os-Linux/signal.c | 8 + > > > arch/um/nommu/trap.c | 201 > > > ++++++++++++++++++++++++ > > > arch/um/os-Linux/signal.c | 3 +- > > > arch/x86/um/nommu/do_syscall_64.c | 6 + > > > arch/x86/um/nommu/entry_64.S | 14 ++ > > > arch/x86/um/nommu/os-Linux/mcontext.c | 5 + > > > arch/x86/um/shared/sysdep/mcontext.h | 1 + > > > arch/x86/um/shared/sysdep/ptrace.h | 2 +- > > > arch/x86/um/shared/sysdep/syscalls_64.h | 1 + > > > 11 files changed, 244 insertions(+), 3 deletions(-) > > > create mode 100644 arch/um/nommu/trap.c > > > > > > diff --git a/arch/um/include/shared/kern_util.h > > > b/arch/um/include/shared/kern_util.h > > > index ec8ba1f13c58..7f55402b6385 100644 > > > --- a/arch/um/include/shared/kern_util.h > > > +++ b/arch/um/include/shared/kern_util.h > > > @@ -73,4 +73,8 @@ void um_idle_sleep(void); > > > > > > void kasan_map_memory(void *start, size_t len); > > > > > > +#ifndef CONFIG_MMU > > > +extern void nommu_relay_signal(void *ptr); > > > +#endif > > > + > > > #endif > > > diff --git a/arch/um/nommu/Makefile b/arch/um/nommu/Makefile > > > index baab7c2f57c2..096221590cfd 100644 > > > --- a/arch/um/nommu/Makefile > > > +++ b/arch/um/nommu/Makefile > > > @@ -1,3 +1,3 @@ > > > # SPDX-License-Identifier: GPL-2.0 > > > > > > -obj-y := os-Linux/ > > > +obj-y := trap.o os-Linux/ > > > diff --git a/arch/um/nommu/os-Linux/signal.c b/arch/um/nommu/os- > > > Linux/signal.c > > > index 19043b9652e2..27b6b37744b7 100644 > > > --- a/arch/um/nommu/os-Linux/signal.c > > > +++ b/arch/um/nommu/os-Linux/signal.c > > > @@ -5,6 +5,7 @@ > > > #include <os.h> > > > #include <sysdep/mcontext.h> > > > #include <sys/ucontext.h> > > > +#include <as-layout.h> > > > > > > void sigsys_handler(int sig, struct siginfo *si, > > > struct uml_pt_regs *regs, void *ptr) > > > @@ -14,3 +15,10 @@ void sigsys_handler(int sig, struct siginfo *si, > > > /* hook syscall via SIGSYS */ > > > set_mc_sigsys_hook(mc); > > > } > > > + > > > +void nommu_relay_signal(void *ptr) > > > +{ > > > + mcontext_t *mc = (mcontext_t *) ptr; > > > + > > > + set_mc_return_address(mc); > > > +} > > > diff --git a/arch/um/nommu/trap.c b/arch/um/nommu/trap.c > > > new file mode 100644 > > > index 000000000000..430297517455 > > > --- /dev/null > > > +++ b/arch/um/nommu/trap.c > > > @@ -0,0 +1,201 @@ > > > +// SPDX-License-Identifier: GPL-2.0 > > > + > > > +#include <linux/mm.h> > > > +#include <linux/sched/signal.h> > > > +#include <linux/hardirq.h> > > > +#include <linux/module.h> > > > +#include <linux/uaccess.h> > > > +#include <linux/sched/debug.h> > > > +#include <asm/current.h> > > > +#include <asm/tlbflush.h> > > > +#include <arch.h> > > > +#include <as-layout.h> > > > +#include <kern_util.h> > > > +#include <os.h> > > > +#include <skas.h> > > > + > > > +/* > > > + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM > > > by > > > + * segv(). > > > + */ > > > +int handle_page_fault(unsigned long address, unsigned long ip, > > > + int is_write, int is_user, int *code_out) > > > +{ > > > + /* !MMU has no pagefault */ > > > + return -EFAULT; > > > +} > > > + > > > +static void show_segv_info(struct uml_pt_regs *regs) > > > +{ > > > + struct task_struct *tsk = current; > > > + struct faultinfo *fi = UPT_FAULTINFO(regs); > > > + > > > + if (!unhandled_signal(tsk, SIGSEGV)) > > > + return; > > > + > > > + pr_warn_ratelimited("%s%s[%d]: segfault at %lx ip %p sp %p > > > error %x", > > > + task_pid_nr(tsk) > 1 ? KERN_INFO : > > > KERN_EMERG, > > > + tsk->comm, task_pid_nr(tsk), > > > FAULT_ADDRESS(*fi), > > > + (void *)UPT_IP(regs), (void > > > *)UPT_SP(regs), > > > + fi->error_code); > > > +} > > > + > > > +static void bad_segv(struct faultinfo fi, unsigned long ip) > > > +{ > > > + current->thread.arch.faultinfo = fi; > > > + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) > > > FAULT_ADDRESS(fi)); > > > +} > > > + > > > +void fatal_sigsegv(void) > > > +{ > > > + force_fatal_sig(SIGSEGV); > > > + do_signal(¤t->thread.regs); > > > + /* > > > + * This is to tell gcc that we're not returning - > > > do_signal > > > + * can, in general, return, but in this case, it's not, > > > since > > > + * we just got a fatal SIGSEGV queued. > > > + */ > > > + os_dump_core(); > > > +} > > > + > > > +/** > > > + * segv_handler() - the SIGSEGV handler > > > + * @sig: the signal number > > > + * @unused_si: the signal info struct; unused in this handler > > > + * @regs: the ptrace register information > > > + * > > > + * The handler first extracts the faultinfo from the UML ptrace > > > regs struct. > > > + * If the userfault did not happen in an UML userspace process, > > > bad_segv is called. > > > + * Otherwise the signal did happen in a cloned userspace process, > > > handle it. > > > + */ > > > +void segv_handler(int sig, struct siginfo *unused_si, struct > > > uml_pt_regs *regs, > > > + void *mc) > > > +{ > > > + struct faultinfo *fi = UPT_FAULTINFO(regs); > > > + > > > + /* !MMU specific part; detection of userspace */ > > > + /* mark is_user=1 when the IP is from userspace code. */ > > > + if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < > > > high_physmem) > > > + regs->is_user = 1; > > > + > > > + if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) { > > > + show_segv_info(regs); > > > + bad_segv(*fi, UPT_IP(regs)); > > > + return; > > > + } > > > + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); > > > + > > > + /* !MMU specific part; detection of userspace */ > > > + relay_signal(sig, unused_si, regs, mc); > > > +} > > > + > > > +/* > > > + * We give a *copy* of the faultinfo in the regs to segv. > > > + * This must be done, since nesting SEGVs could overwrite > > > + * the info in the regs. A pointer to the info then would > > > + * give us bad data! > > > + */ > > > +unsigned long segv(struct faultinfo fi, unsigned long ip, int > > > is_user, > > > + struct uml_pt_regs *regs, void *mc) > > > +{ > > > + int si_code; > > > + int err; > > > + int is_write = FAULT_WRITE(fi); > > > + unsigned long address = FAULT_ADDRESS(fi); > > > + > > > + if (!is_user && regs) > > > + current->thread.segv_regs = container_of(regs, > > > struct pt_regs, regs); > > > + > > > + if (current->mm == NULL) { > > > + show_regs(container_of(regs, struct pt_regs, > > > regs)); > > > + panic("Segfault with no mm"); > > > + } else if (!is_user && address > PAGE_SIZE && address < > > > TASK_SIZE) { > > > + show_regs(container_of(regs, struct pt_regs, > > > regs)); > > > + panic("Kernel tried to access user memory at addr > > > 0x%lx, ip 0x%lx", > > > + address, ip); > > > + } > > > + > > > + if (SEGV_IS_FIXABLE(&fi)) > > > + err = handle_page_fault(address, ip, is_write, > > > is_user, > > > + &si_code); > > > + else { > > > + err = -EFAULT; > > > + /* > > > + * A thread accessed NULL, we get a fault, but CR2 > > > is invalid. > > > + * This code is used in __do_copy_from_user() of > > > TT mode. > > > + * XXX tt mode is gone, so maybe this isn't needed > > > any more > > > + */ > > > + address = 0; > > > + } > > > + > > > + if (!err) > > > + goto out; > > > + else if (!is_user && arch_fixup(ip, regs)) > > > + goto out; > > > + > > > + if (!is_user) { > > > + show_regs(container_of(regs, struct pt_regs, > > > regs)); > > > + panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", > > > + address, ip); > > > + } > > > + > > > + show_segv_info(regs); > > > + > > > + if (err == -EACCES) { > > > + current->thread.arch.faultinfo = fi; > > > + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user > > > *)address); > > > + } else { > > > + WARN_ON_ONCE(err != -EFAULT); > > > + current->thread.arch.faultinfo = fi; > > > + force_sig_fault(SIGSEGV, si_code, (void __user *) > > > address); > > > + } > > > + > > > +out: > > > + if (regs) > > > + current->thread.segv_regs = NULL; > > > + > > > + return 0; > > > +} > > > + > > > +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs > > > *regs, > > > + void *mc) > > > +{ > > > + int code, err; > > > + > > > + /* !MMU specific part; detection of userspace */ > > > + /* mark is_user=1 when the IP is from userspace code. */ > > > + if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < > > > high_physmem) > > > + regs->is_user = 1; > > > + > > > + if (!UPT_IS_USER(regs)) { > > > + if (sig == SIGBUS) > > > + pr_err("Bus error - the host /dev/shm or > > > /tmp mount likely just ran out of space\n"); > > > + panic("Kernel mode signal %d", sig); > > > + } > > > + /* if is_user==1, set return to userspace sig handler to > > > relay signal */ > > > + nommu_relay_signal(mc); > > > + > > > + arch_examine_signal(sig, regs); > > > + > > > + /* Is the signal layout for the signal known? > > > + * Signal data must be scrubbed to prevent information > > > leaks. > > > + */ > > > + code = si->si_code; > > > + err = si->si_errno; > > > + if ((err == 0) && (siginfo_layout(sig, code) == > > > SIL_FAULT)) { > > > + struct faultinfo *fi = UPT_FAULTINFO(regs); > > > + > > > + current->thread.arch.faultinfo = *fi; > > > + force_sig_fault(sig, code, (void __user > > > *)FAULT_ADDRESS(*fi)); > > > + } else { > > > + pr_err("Attempted to relay unknown signal %d > > > (si_code = %d) with errno %d\n", > > > + sig, code, err); > > > + force_sig(sig); > > > + } > > > +} > > > + > > > +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs > > > *regs, > > > + void *mc) > > > +{ > > > + do_IRQ(WINCH_IRQ, regs); > > > +} > > > diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c > > > index 53e276e81b37..67dcd88b45b1 100644 > > > --- a/arch/um/os-Linux/signal.c > > > +++ b/arch/um/os-Linux/signal.c > > > @@ -40,9 +40,10 @@ static void sig_handler_common(int sig, struct > > > siginfo *si, mcontext_t *mc) > > > int save_errno = errno; > > > > > > r.is_user = 0; > > > + if (mc) > > > + get_regs_from_mc(&r, mc); > > > if (sig == SIGSEGV) { > > > /* For segfaults, we want the data from the > > > sigcontext. */ > > > - get_regs_from_mc(&r, mc); > > > GET_FAULTINFO_FROM_MC(r.faultinfo, mc); > > > } > > > > > > diff --git a/arch/x86/um/nommu/do_syscall_64.c > > > b/arch/x86/um/nommu/do_syscall_64.c > > > index 74d5bcc4508d..d77e69e097c1 100644 > > > --- a/arch/x86/um/nommu/do_syscall_64.c > > > +++ b/arch/x86/um/nommu/do_syscall_64.c > > > @@ -44,6 +44,9 @@ __visible void do_syscall_64(struct pt_regs > > > *regs) > > > /* set fs register to the original host one */ > > > os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); > > > > > > + /* save fp registers */ > > > + asm volatile("fxsaveq %0" : "=m"(*(struct _xstate *)regs- > > > >regs.fp)); > > > + > > > if (likely(syscall < NR_syscalls)) { > > > PT_REGS_SET_SYSCALL_RETURN(regs, > > > EXECUTE_SYSCALL(syscall, regs)); > > > @@ -54,6 +57,9 @@ __visible void do_syscall_64(struct pt_regs > > > *regs) > > > /* handle tasks and signals at the end */ > > > interrupt_end(); > > > > > > + /* restore fp registers */ > > > + asm volatile("fxrstorq %0" : : "m"((current- > > > >thread.regs.regs.fp))); > > > + > > > /* restore back fs register to userspace configured one */ > > > os_x86_arch_prctl(0, ARCH_SET_FS, > > > (void *)(current- > > > >thread.regs.regs.gp[FS_BASE > > > diff --git a/arch/x86/um/nommu/entry_64.S > > > b/arch/x86/um/nommu/entry_64.S > > > index 950447dfa66b..e038bc7b53ac 100644 > > > --- a/arch/x86/um/nommu/entry_64.S > > > +++ b/arch/x86/um/nommu/entry_64.S > > > @@ -111,3 +111,17 @@ ENTRY(userspace) > > > jmp *%r11 > > > > > > END(userspace) > > > + > > > +/* > > > + * this routine prepares the stack to return via host-generated > > > + * signals (e.g., SEGV, FPE) via do_signal() from interrupt_end(). > > > + */ > > > +ENTRY(__prep_sigreturn) > > > + /* > > > + * Switch to current top of stack, so "current->" points > > > + * to the right task. > > > + */ > > > + movq current_top_of_stack, %rsp > > > + > > > + jmp userspace > > > +END(__prep_sigreturn) > > > diff --git a/arch/x86/um/nommu/os-Linux/mcontext.c > > > b/arch/x86/um/nommu/os-Linux/mcontext.c > > > index c4ef877d5ea0..87fb2a35e7ff 100644 > > > --- a/arch/x86/um/nommu/os-Linux/mcontext.c > > > +++ b/arch/x86/um/nommu/os-Linux/mcontext.c > > > @@ -6,6 +6,11 @@ > > > #include <sysdep/mcontext.h> > > > #include <sysdep/syscalls.h> > > > > > > +void set_mc_return_address(mcontext_t *mc) > > > +{ > > > + mc->gregs[REG_RIP] = (unsigned long) __prep_sigreturn; > > > +} > > > + > > > void set_mc_sigsys_hook(mcontext_t *mc) > > > { > > > mc->gregs[REG_RCX] = mc->gregs[REG_RIP]; > > > diff --git a/arch/x86/um/shared/sysdep/mcontext.h > > > b/arch/x86/um/shared/sysdep/mcontext.h > > > index 9a0d6087f357..de4041b758f3 100644 > > > --- a/arch/x86/um/shared/sysdep/mcontext.h > > > +++ b/arch/x86/um/shared/sysdep/mcontext.h > > > @@ -19,6 +19,7 @@ extern int set_stub_state(struct uml_pt_regs > > > *regs, struct stub_data *data, > > > > > > #ifndef CONFIG_MMU > > > extern void set_mc_sigsys_hook(mcontext_t *mc); > > > +extern void set_mc_return_address(mcontext_t *mc); > > > #endif > > > > > > #ifdef __i386__ > > > diff --git a/arch/x86/um/shared/sysdep/ptrace.h > > > b/arch/x86/um/shared/sysdep/ptrace.h > > > index 8f7476ff6e95..7d553d9f05be 100644 > > > --- a/arch/x86/um/shared/sysdep/ptrace.h > > > +++ b/arch/x86/um/shared/sysdep/ptrace.h > > > @@ -65,7 +65,7 @@ struct uml_pt_regs { > > > int is_user; > > > > > > /* Dynamically sized FP registers (holds an XSTATE) */ > > > - unsigned long fp[]; > > > + unsigned long fp[] __attribute__((aligned(16))); > > > }; > > > > > > #define EMPTY_UML_PT_REGS { } > > > diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h > > > b/arch/x86/um/shared/sysdep/syscalls_64.h > > > index ffd80ee3b9dc..bd152422cdfb 100644 > > > --- a/arch/x86/um/shared/sysdep/syscalls_64.h > > > +++ b/arch/x86/um/shared/sysdep/syscalls_64.h > > > @@ -29,6 +29,7 @@ extern syscall_handler_t sys_arch_prctl; > > > extern void do_syscall_64(struct pt_regs *regs); > > > extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, > > > int64_t a3, > > > int64_t a4, int64_t a5, int64_t a6); > > > +extern void __prep_sigreturn(void); > > > #endif > > > > > > #endif > > >