Hi, On Mon, 2024-11-11 at 15:27 +0900, Hajime Tazaki wrote: > As userspace on UML/!MMU also need to configure %fs register when it is > running to correctly access thread structure, host syscalls implemented > in os-Linux drivers may be puzzled when they are called. Thus it has to > configure %fs register via arch_prctl(SET_FS) on every host syscalls. > > [SNIP] > + > +/** > + * get_host_cpu_features() return true with X86_FEATURE_FSGSBASE even > + * if the kernel is older and disabled using fsgsbase instruction. > + * thus detection is based on whether SIGILL is raised or not. > + */ > +static jmp_buf jmpbuf; > +static void sigill(int sig, siginfo_t *si, void *ctx_void) > +{ > + siglongjmp(jmpbuf, 1); > +} > + > +void __init check_fsgsbase(void) > +{ > + unsigned long fsbase; > + struct sigaction sa; > + > + /* Probe FSGSBASE */ > + memset(&sa, 0, sizeof(sa)); > + sa.sa_sigaction = sigill; > + sa.sa_flags = SA_SIGINFO | SA_RESETHAND; > + sigemptyset(&sa.sa_mask); > + if (sigaction(SIGILL, &sa, 0)) > + os_warn("sigaction"); > + > + os_info("Checking FSGSBASE instructions..."); > + if (sigsetjmp(jmpbuf, 0) == 0) { > + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); > + host_has_fsgsbase = 1; > + os_info("OK\n"); > + } else { > + host_has_fsgsbase = 0; > + os_info("disabled\n"); > + } > +}
According to Documentation/arch/x86/x86_64/fsgs.rst it looks like this can also be checked using the HWCAP2_FSGSBASE bit in AT_HWCAP2. Maybe that is a bit simpler? > [SNIP] > > __visible void do_syscall_64(struct pt_regs *regs) > { > int syscall; > @@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs) > if (syscall == __NR_vfork) > stack_copy = vfork_save_stack(); > > + /* set fs register to the original host one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs); > + > if (likely(syscall < NR_syscalls)) { > PT_REGS_SET_SYSCALL_RETURN(regs, > EXECUTE_SYSCALL(syscall, regs)); > @@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs) > set_thread_flag(TIF_SIGPENDING); > interrupt_end(); > > + /* restore back fs register to userspace configured one */ > + os_x86_arch_prctl(0, ARCH_SET_FS, > + (void *)(current->thread.regs.regs.gp[FS_BASE > + / sizeof(unsigned long)])); > + > /* execve succeeded */ > if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0) > userspace(&current->thread.regs.regs); > diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c > index edb17fc73e07..d56df936a2d7 100644 > --- a/arch/x86/um/syscalls_64.c > +++ b/arch/x86/um/syscalls_64.c > @@ -12,11 +12,26 @@ > #include <asm/prctl.h> /* XXX This should get the constants from libc */ > #include <registers.h> > #include <os.h> > +#include <asm/thread_info.h> > +#include <asm/mman.h> > + > +#ifndef CONFIG_MMU > +/* > + * The guest libc can change FS, which confuses the host libc. > + * In fact, changing FS directly is not supported (check > + * man arch_prctl). So, whenever we make a host syscall, > + * we should be changing FS to the original FS (not the > + * one set by the guest libc). This original FS is stored > + * in host_fs. > + */ > +long long host_fs = -1; Right, the libc already uses it for its own thread-local storage. That is a bit annoying, as UML doesn't need threading in that sense. 
Note that similar handling needs to happen for every userspace to kernel switch. I guess the only other location is the signal handler. Benjamin > +#endif > > long arch_prctl(struct task_struct *task, int option, > unsigned long __user *arg2) > { > long ret = -EINVAL; > +#ifdef CONFIG_MMU > > switch (option) { > case ARCH_SET_FS: > @@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option, > } > > return ret; > +#else > + > + unsigned long *ptr = arg2, tmp; > + > + switch (option) { > + case ARCH_SET_FS: > + if (host_fs == -1) > + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs); > + ret = 0; > + break; > + case ARCH_SET_GS: > + ret = 0; > + break; > + case ARCH_GET_FS: > + case ARCH_GET_GS: > + ptr = &tmp; > + break; > + } > + > + ret = os_arch_prctl(0, option, ptr); > + if (ret) > + return ret; > + > + switch (option) { > + case ARCH_SET_FS: > + current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_SET_GS: > + current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] = > + (unsigned long) arg2; > + break; > + case ARCH_GET_FS: > + ret = put_user(current->thread.regs.regs.gp[FS_BASE / > sizeof(unsigned long)], arg2); > + break; > + case ARCH_GET_GS: > + ret = put_user(current->thread.regs.regs.gp[GS_BASE / > sizeof(unsigned long)], arg2); > + break; > + } > + > + return ret; > +#endif > } > > SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)