On Fri, 2025-07-11 at 11:39 +0200, Benjamin Berg wrote:
> [SNIP]
> 
> That said, I would also still like to see a higher level discussion on
> how userspace registers are saved and restored. We have two separate
> cases--interrupts/exceptions (host signals) and the syscall path--and
> both need to be well defined. My hope is still that both of these can
> use the same register save/restore mechanism.

Now syscalls are also just signals. The crucial difference is that for 
syscalls you are allowed to clobber R11 and RCX. Your current syscall
entry code uses that fact, but that does not work for other signals.

Benjamin

> 
> Benjamin
> 
> > 
> > ---
> >  arch/um/include/shared/kern_util.h      |   4 +
> >  arch/um/nommu/Makefile                  |   2 +-
> >  arch/um/nommu/os-Linux/signal.c         |   8 +
> >  arch/um/nommu/trap.c                    | 201
> > ++++++++++++++++++++++++
> >  arch/um/os-Linux/signal.c               |   3 +-
> >  arch/x86/um/nommu/do_syscall_64.c       |   6 +
> >  arch/x86/um/nommu/entry_64.S            |  14 ++
> >  arch/x86/um/nommu/os-Linux/mcontext.c   |   5 +
> >  arch/x86/um/shared/sysdep/mcontext.h    |   1 +
> >  arch/x86/um/shared/sysdep/ptrace.h      |   2 +-
> >  arch/x86/um/shared/sysdep/syscalls_64.h |   1 +
> >  11 files changed, 244 insertions(+), 3 deletions(-)
> >  create mode 100644 arch/um/nommu/trap.c
> > 
> > diff --git a/arch/um/include/shared/kern_util.h
> > b/arch/um/include/shared/kern_util.h
> > index ec8ba1f13c58..7f55402b6385 100644
> > --- a/arch/um/include/shared/kern_util.h
> > +++ b/arch/um/include/shared/kern_util.h
> > @@ -73,4 +73,8 @@ void um_idle_sleep(void);
> >  
> >  void kasan_map_memory(void *start, size_t len);
> >  
> > +#ifndef CONFIG_MMU
> > +extern void nommu_relay_signal(void *ptr);
> > +#endif
> > +
> >  #endif
> > diff --git a/arch/um/nommu/Makefile b/arch/um/nommu/Makefile
> > index baab7c2f57c2..096221590cfd 100644
> > --- a/arch/um/nommu/Makefile
> > +++ b/arch/um/nommu/Makefile
> > @@ -1,3 +1,3 @@
> >  # SPDX-License-Identifier: GPL-2.0
> >  
> > -obj-y := os-Linux/
> > +obj-y := trap.o os-Linux/
> > diff --git a/arch/um/nommu/os-Linux/signal.c b/arch/um/nommu/os-
> > Linux/signal.c
> > index 19043b9652e2..27b6b37744b7 100644
> > --- a/arch/um/nommu/os-Linux/signal.c
> > +++ b/arch/um/nommu/os-Linux/signal.c
> > @@ -5,6 +5,7 @@
> >  #include <os.h>
> >  #include <sysdep/mcontext.h>
> >  #include <sys/ucontext.h>
> > +#include <as-layout.h>
> >  
> >  void sigsys_handler(int sig, struct siginfo *si,
> >                 struct uml_pt_regs *regs, void *ptr)
> > @@ -14,3 +15,10 @@ void sigsys_handler(int sig, struct siginfo *si,
> >     /* hook syscall via SIGSYS */
> >     set_mc_sigsys_hook(mc);
> >  }
> > +
> > +void nommu_relay_signal(void *ptr)
> > +{
> > +   mcontext_t *mc = (mcontext_t *) ptr;
> > +
> > +   set_mc_return_address(mc);
> > +}
> > diff --git a/arch/um/nommu/trap.c b/arch/um/nommu/trap.c
> > new file mode 100644
> > index 000000000000..430297517455
> > --- /dev/null
> > +++ b/arch/um/nommu/trap.c
> > @@ -0,0 +1,201 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/mm.h>
> > +#include <linux/sched/signal.h>
> > +#include <linux/hardirq.h>
> > +#include <linux/module.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/sched/debug.h>
> > +#include <asm/current.h>
> > +#include <asm/tlbflush.h>
> > +#include <arch.h>
> > +#include <as-layout.h>
> > +#include <kern_util.h>
> > +#include <os.h>
> > +#include <skas.h>
> > +
> > +/*
> > + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM
> > by
> > + * segv().
> > + */
> > +int handle_page_fault(unsigned long address, unsigned long ip,
> > +                 int is_write, int is_user, int *code_out)
> > +{
> > +   /* !MMU has no pagefault */
> > +   return -EFAULT;
> > +}
> > +
> > +static void show_segv_info(struct uml_pt_regs *regs)
> > +{
> > +   struct task_struct *tsk = current;
> > +   struct faultinfo *fi = UPT_FAULTINFO(regs);
> > +
> > +   if (!unhandled_signal(tsk, SIGSEGV))
> > +           return;
> > +
> > +   pr_warn_ratelimited("%s%s[%d]: segfault at %lx ip %p sp %p
> > error %x",
> > +                       task_pid_nr(tsk) > 1 ? KERN_INFO :
> > KERN_EMERG,
> > +                       tsk->comm, task_pid_nr(tsk),
> > FAULT_ADDRESS(*fi),
> > +                       (void *)UPT_IP(regs), (void
> > *)UPT_SP(regs),
> > +                       fi->error_code);
> > +}
> > +
> > +static void bad_segv(struct faultinfo fi, unsigned long ip)
> > +{
> > +   current->thread.arch.faultinfo = fi;
> > +   force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)
> > FAULT_ADDRESS(fi));
> > +}
> > +
> > +void fatal_sigsegv(void)
> > +{
> > +   force_fatal_sig(SIGSEGV);
> > +   do_signal(&current->thread.regs);
> > +   /*
> > +    * This is to tell gcc that we're not returning -
> > do_signal
> > +    * can, in general, return, but in this case, it's not,
> > since
> > +    * we just got a fatal SIGSEGV queued.
> > +    */
> > +   os_dump_core();
> > +}
> > +
> > +/**
> > + * segv_handler() - the SIGSEGV handler
> > + * @sig:   the signal number
> > + * @unused_si:     the signal info struct; unused in this handler
> > + * @regs:  the ptrace register information
> > + *
> > + * The handler first extracts the faultinfo from the UML ptrace
> > regs struct.
> > + * If the userfault did not happen in an UML userspace process,
> > bad_segv is called.
> > + * Otherwise the signal did happen in a cloned userspace process,
> > handle it.
> > + */
> > +void segv_handler(int sig, struct siginfo *unused_si, struct
> > uml_pt_regs *regs,
> > +             void *mc)
> > +{
> > +   struct faultinfo *fi = UPT_FAULTINFO(regs);
> > +
> > +   /* !MMU specific part; detection of userspace */
> > +   /* mark is_user=1 when the IP is from userspace code. */
> > +   if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) <
> > high_physmem)
> > +           regs->is_user = 1;
> > +
> > +   if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
> > +           show_segv_info(regs);
> > +           bad_segv(*fi, UPT_IP(regs));
> > +           return;
> > +   }
> > +   segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc);
> > +
> > +   /* !MMU specific part; detection of userspace */
> > +   relay_signal(sig, unused_si, regs, mc);
> > +}
> > +
> > +/*
> > + * We give a *copy* of the faultinfo in the regs to segv.
> > + * This must be done, since nesting SEGVs could overwrite
> > + * the info in the regs. A pointer to the info then would
> > + * give us bad data!
> > + */
> > +unsigned long segv(struct faultinfo fi, unsigned long ip, int
> > is_user,
> > +              struct uml_pt_regs *regs, void *mc)
> > +{
> > +   int si_code;
> > +   int err;
> > +   int is_write = FAULT_WRITE(fi);
> > +   unsigned long address = FAULT_ADDRESS(fi);
> > +
> > +   if (!is_user && regs)
> > +           current->thread.segv_regs = container_of(regs,
> > struct pt_regs, regs);
> > +
> > +   if (current->mm == NULL) {
> > +           show_regs(container_of(regs, struct pt_regs,
> > regs));
> > +           panic("Segfault with no mm");
> > +   } else if (!is_user && address > PAGE_SIZE && address <
> > TASK_SIZE) {
> > +           show_regs(container_of(regs, struct pt_regs,
> > regs));
> > +           panic("Kernel tried to access user memory at addr
> > 0x%lx, ip 0x%lx",
> > +                  address, ip);
> > +   }
> > +
> > +   if (SEGV_IS_FIXABLE(&fi))
> > +           err = handle_page_fault(address, ip, is_write,
> > is_user,
> > +                                   &si_code);
> > +   else {
> > +           err = -EFAULT;
> > +           /*
> > +            * A thread accessed NULL, we get a fault, but CR2
> > is invalid.
> > +            * This code is used in __do_copy_from_user() of
> > TT mode.
> > +            * XXX tt mode is gone, so maybe this isn't needed
> > any more
> > +            */
> > +           address = 0;
> > +   }
> > +
> > +   if (!err)
> > +           goto out;
> > +   else if (!is_user && arch_fixup(ip, regs))
> > +           goto out;
> > +
> > +   if (!is_user) {
> > +           show_regs(container_of(regs, struct pt_regs,
> > regs));
> > +           panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
> > +                 address, ip);
> > +   }
> > +
> > +   show_segv_info(regs);
> > +
> > +   if (err == -EACCES) {
> > +           current->thread.arch.faultinfo = fi;
> > +           force_sig_fault(SIGBUS, BUS_ADRERR, (void __user
> > *)address);
> > +   } else {
> > +           WARN_ON_ONCE(err != -EFAULT);
> > +           current->thread.arch.faultinfo = fi;
> > +           force_sig_fault(SIGSEGV, si_code, (void __user *)
> > address);
> > +   }
> > +
> > +out:
> > +   if (regs)
> > +           current->thread.segv_regs = NULL;
> > +
> > +   return 0;
> > +}
> > +
> > +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs
> > *regs,
> > +             void *mc)
> > +{
> > +   int code, err;
> > +
> > +   /* !MMU specific part; detection of userspace */
> > +   /* mark is_user=1 when the IP is from userspace code. */
> > +   if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) <
> > high_physmem)
> > +           regs->is_user = 1;
> > +
> > +   if (!UPT_IS_USER(regs)) {
> > +           if (sig == SIGBUS)
> > +                   pr_err("Bus error - the host /dev/shm or
> > /tmp mount likely just ran out of space\n");
> > +           panic("Kernel mode signal %d", sig);
> > +   }
> > +   /* if is_user==1, set return to userspace sig handler to
> > relay signal */
> > +   nommu_relay_signal(mc);
> > +
> > +   arch_examine_signal(sig, regs);
> > +
> > +   /* Is the signal layout for the signal known?
> > +    * Signal data must be scrubbed to prevent information
> > leaks.
> > +    */
> > +   code = si->si_code;
> > +   err = si->si_errno;
> > +   if ((err == 0) && (siginfo_layout(sig, code) ==
> > SIL_FAULT)) {
> > +           struct faultinfo *fi = UPT_FAULTINFO(regs);
> > +
> > +           current->thread.arch.faultinfo = *fi;
> > +           force_sig_fault(sig, code, (void __user
> > *)FAULT_ADDRESS(*fi));
> > +   } else {
> > +           pr_err("Attempted to relay unknown signal %d
> > (si_code = %d) with errno %d\n",
> > +                  sig, code, err);
> > +           force_sig(sig);
> > +   }
> > +}
> > +
> > +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs
> > *regs,
> > +      void *mc)
> > +{
> > +   do_IRQ(WINCH_IRQ, regs);
> > +}
> > diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
> > index 53e276e81b37..67dcd88b45b1 100644
> > --- a/arch/um/os-Linux/signal.c
> > +++ b/arch/um/os-Linux/signal.c
> > @@ -40,9 +40,10 @@ static void sig_handler_common(int sig, struct
> > siginfo *si, mcontext_t *mc)
> >     int save_errno = errno;
> >  
> >     r.is_user = 0;
> > +   if (mc)
> > +           get_regs_from_mc(&r, mc);
> >     if (sig == SIGSEGV) {
> >             /* For segfaults, we want the data from the
> > sigcontext. */
> > -           get_regs_from_mc(&r, mc);
> >             GET_FAULTINFO_FROM_MC(r.faultinfo, mc);
> >     }
> >  
> > diff --git a/arch/x86/um/nommu/do_syscall_64.c
> > b/arch/x86/um/nommu/do_syscall_64.c
> > index 74d5bcc4508d..d77e69e097c1 100644
> > --- a/arch/x86/um/nommu/do_syscall_64.c
> > +++ b/arch/x86/um/nommu/do_syscall_64.c
> > @@ -44,6 +44,9 @@ __visible void do_syscall_64(struct pt_regs
> > *regs)
> >     /* set fs register to the original host one */
> >     os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
> >  
> > +   /* save fp registers */
> > +   asm volatile("fxsaveq %0" : "=m"(*(struct _xstate *)regs-
> > >regs.fp));
> > +
> >     if (likely(syscall < NR_syscalls)) {
> >             PT_REGS_SET_SYSCALL_RETURN(regs,
> >                             EXECUTE_SYSCALL(syscall, regs));
> > @@ -54,6 +57,9 @@ __visible void do_syscall_64(struct pt_regs
> > *regs)
> >     /* handle tasks and signals at the end */
> >     interrupt_end();
> >  
> > +   /* restore fp registers */
> > +   asm volatile("fxrstorq %0" : : "m"((current-
> > >thread.regs.regs.fp)));
> > +
> >     /* restore back fs register to userspace configured one */
> >     os_x86_arch_prctl(0, ARCH_SET_FS,
> >                   (void *)(current-
> > >thread.regs.regs.gp[FS_BASE
> > diff --git a/arch/x86/um/nommu/entry_64.S
> > b/arch/x86/um/nommu/entry_64.S
> > index 950447dfa66b..e038bc7b53ac 100644
> > --- a/arch/x86/um/nommu/entry_64.S
> > +++ b/arch/x86/um/nommu/entry_64.S
> > @@ -111,3 +111,17 @@ ENTRY(userspace)
> >     jmp     *%r11
> >  
> >  END(userspace)
> > +
> > +/*
> > + * this routine prepares the stack to return via host-generated
> > + * signals (e.g., SEGV, FPE) via do_signal() from interrupt_end().
> > + */
> > +ENTRY(__prep_sigreturn)
> > +   /*
> > +    * Switch to current top of stack, so "current->" points
> > +    * to the right task.
> > +    */
> > +   movq    current_top_of_stack, %rsp
> > +
> > +   jmp     userspace
> > +END(__prep_sigreturn)
> > diff --git a/arch/x86/um/nommu/os-Linux/mcontext.c
> > b/arch/x86/um/nommu/os-Linux/mcontext.c
> > index c4ef877d5ea0..87fb2a35e7ff 100644
> > --- a/arch/x86/um/nommu/os-Linux/mcontext.c
> > +++ b/arch/x86/um/nommu/os-Linux/mcontext.c
> > @@ -6,6 +6,11 @@
> >  #include <sysdep/mcontext.h>
> >  #include <sysdep/syscalls.h>
> >  
> > +void set_mc_return_address(mcontext_t *mc)
> > +{
> > +   mc->gregs[REG_RIP] = (unsigned long) __prep_sigreturn;
> > +}
> > +
> >  void set_mc_sigsys_hook(mcontext_t *mc)
> >  {
> >     mc->gregs[REG_RCX] = mc->gregs[REG_RIP];
> > diff --git a/arch/x86/um/shared/sysdep/mcontext.h
> > b/arch/x86/um/shared/sysdep/mcontext.h
> > index 9a0d6087f357..de4041b758f3 100644
> > --- a/arch/x86/um/shared/sysdep/mcontext.h
> > +++ b/arch/x86/um/shared/sysdep/mcontext.h
> > @@ -19,6 +19,7 @@ extern int set_stub_state(struct uml_pt_regs
> > *regs, struct stub_data *data,
> >  
> >  #ifndef CONFIG_MMU
> >  extern void set_mc_sigsys_hook(mcontext_t *mc);
> > +extern void set_mc_return_address(mcontext_t *mc);
> >  #endif
> >  
> >  #ifdef __i386__
> > diff --git a/arch/x86/um/shared/sysdep/ptrace.h
> > b/arch/x86/um/shared/sysdep/ptrace.h
> > index 8f7476ff6e95..7d553d9f05be 100644
> > --- a/arch/x86/um/shared/sysdep/ptrace.h
> > +++ b/arch/x86/um/shared/sysdep/ptrace.h
> > @@ -65,7 +65,7 @@ struct uml_pt_regs {
> >     int is_user;
> >  
> >     /* Dynamically sized FP registers (holds an XSTATE) */
> > -   unsigned long fp[];
> > +   unsigned long fp[] __attribute__((aligned(16)));
> >  };
> >  
> >  #define EMPTY_UML_PT_REGS { }
> > diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h
> > b/arch/x86/um/shared/sysdep/syscalls_64.h
> > index ffd80ee3b9dc..bd152422cdfb 100644
> > --- a/arch/x86/um/shared/sysdep/syscalls_64.h
> > +++ b/arch/x86/um/shared/sysdep/syscalls_64.h
> > @@ -29,6 +29,7 @@ extern syscall_handler_t sys_arch_prctl;
> >  extern void do_syscall_64(struct pt_regs *regs);
> >  extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2,
> > int64_t a3,
> >                           int64_t a4, int64_t a5, int64_t a6);
> > +extern void __prep_sigreturn(void);
> >  #endif
> >  
> >  #endif
> 


Reply via email to