Hi, On Thu, 2024-10-17 at 15:17 +0800, David Gow wrote: > On Thu, 19 Sept 2024 at 20:45, Benjamin Berg <benja...@sipsolutions.net> > wrote: > > > > [SNIP] > > It turns out that this breaks the KUnit user alloc helpers on x86_64, > at least on my machine. > > This can be reproduced with: > ./tools/testing/kunit/kunit.py run usercopy > > Though the 32-bit version works: > ./tools/testing/kunit/kunit.py run usercopy --kconfig_add CONFIG_64BIT=n > > The error we're getting is: > start_userspace : expected SIGSTOP, got status = 139 > Could not create userspace mm > > This basically is the result of the stub_exe segfaulting very early on > in its execution. > > It seems that this is due to the stack being misaligned, and so the > generated SSE instructions are faulting. The workarounds I've tested > here include: > a) Build the stub with -mno-sse > b) Decorate real_init() with __attribute__((force_align_arg_pointer)) > c) Decorate __start() with __attribute__((naked)) > > The last one seems to validate my theory as to why this is occurring: > __start's prologue is misaligning the stack, as __start is not > actually _called_ from anything, so there's no 8-byte misalignment to > hold the return address. > > If this makes sense, I'll send a patch out with whichever the > preferred fix(es) are. My guess is that (c) is the "proper" fix, > though I'd not _miss_ SSE if we chose to disable it for the handful of > instructions here anyway.
Yeah, SSE is pointless here, but we probably should ensure the alignment is as expected. So, c) seems like the right thing to do here. I am just a bit confused about the alignment guarantees from the kernel. Are we correct to assume a 16 byte alignment there or do we also need b) just to be on the safe side? Unfortunately I have not managed to reproduce the issue. Benjamin > > Cheers, > -- David > > > arch/um/Makefile | 3 +- > > arch/um/include/shared/skas/stub-data.h | 11 ++ > > arch/um/kernel/skas/.gitignore | 2 + > > arch/um/kernel/skas/Makefile | 33 ++++- > > arch/um/kernel/skas/stub_exe.c | 88 ++++++++++++ > > arch/um/kernel/skas/stub_exe_embed.S | 11 ++ > > arch/um/os-Linux/mem.c | 2 +- > > arch/um/os-Linux/skas/process.c | 181 ++++++++++++++++---- > > ---- > > 8 files changed, 271 insertions(+), 60 deletions(-) > > create mode 100644 arch/um/kernel/skas/.gitignore > > create mode 100644 arch/um/kernel/skas/stub_exe.c > > create mode 100644 arch/um/kernel/skas/stub_exe_embed.S > > > > diff --git a/arch/um/Makefile b/arch/um/Makefile > > index 00b63bac5eff..31e367e8ab4d 100644 > > --- a/arch/um/Makefile > > +++ b/arch/um/Makefile > > @@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) - > > D__arch_um__ \ > > $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ > > -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \ > > -Din6addr_loopback=kernel_in6addr_loopback \ > > - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr > > + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \ > > + -D__close_range=kernel__close_range > > > > KBUILD_RUSTFLAGS += -Crelocation-model=pie > > > > diff --git a/arch/um/include/shared/skas/stub-data.h > > b/arch/um/include/shared/skas/stub-data.h > > index 2b6b44759dfa..3fbdda727373 100644 > > --- a/arch/um/include/shared/skas/stub-data.h > > +++ b/arch/um/include/shared/skas/stub-data.h > > @@ -12,6 +12,17 @@ > > #include <as-layout.h> > > #include <sysdep/tls.h> > > > > +struct stub_init_data { > > + unsigned long stub_start; > > + > > + int stub_code_fd; > > + unsigned long stub_code_offset; > > + int stub_data_fd; > > + unsigned long stub_data_offset; > > + > > + unsigned long segv_handler; > > +}; > > + > > #define STUB_NEXT_SYSCALL(s) \ > > ((struct stub_syscall *) (((unsigned long) s) + (s)- > > >cmd_len)) > > > > diff --git a/arch/um/kernel/skas/.gitignore > > b/arch/um/kernel/skas/.gitignore > > new file mode 100644 > > index 000000000000..c3409ced0f38 > > --- /dev/null > > +++ b/arch/um/kernel/skas/.gitignore > > @@ -0,0 +1,2 @@ > > +stub_exe > > +stub_exe.dbg > > diff --git a/arch/um/kernel/skas/Makefile > > b/arch/um/kernel/skas/Makefile > > index 6f86d53e3d69..fbb61968055f 100644 > > --- a/arch/um/kernel/skas/Makefile > > +++ b/arch/um/kernel/skas/Makefile > > @@ -3,14 +3,43 @@ > > # Copyright (C) 2002 - 2007 Jeff Dike > > (jdike@{addtoit,linux.intel}.com) > > # > > > > -obj-y := stub.o mmu.o process.o syscall.o uaccess.o > > +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ > > + stub_exe_embed.o > > + > > +# Stub executable > > + > > +stub_exe_objs-y := stub_exe.o > > + > > +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) > > + > > +# Object file containing the ELF executable > > +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe > > + > > +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE > > + $(call if_changed,stub_exe) > > + > > +$(obj)/stub_exe: OBJCOPYFLAGS := -S > > +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE > > + $(call if_changed,objcopy) > > + > > +quiet_cmd_stub_exe = STUB_EXE $@ > > + cmd_stub_exe = $(CC) -nostdlib -o $@ \ > > + $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \ > > + $(filter %.o,$^) > > + > > +STUB_EXE_LDFLAGS = -n -static > > + > > +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) > > + > > +# end > > > > # stub.o is in the stub, so it can't be built with profiling > > # GCC hardened also auto-enables -fpic, but we need %ebx so it > > can't work -> > > # disable it > > > > CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) > > -UNPROFILE_OBJS := stub.o > > +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) > > +UNPROFILE_OBJS := stub.o stub_exe.o > > KCOV_INSTRUMENT := n > > > > include $(srctree)/arch/um/scripts/Makefile.rules > > diff --git a/arch/um/kernel/skas/stub_exe.c > > b/arch/um/kernel/skas/stub_exe.c > > new file mode 100644 > > index 000000000000..bc6ba2e4d805 > > --- /dev/null > > +++ b/arch/um/kernel/skas/stub_exe.c > > @@ -0,0 +1,88 @@ > > +#include <sys/ptrace.h> > > +#include <sys/prctl.h> > > +#include <asm/unistd.h> > > +#include <sysdep/stub.h> > > +#include <stub-data.h> > > + > > +void _start(void); > > + > > +noinline static void real_init(void) > > +{ > > + struct stub_init_data init_data; > > + unsigned long res; > > + struct { > > + void *ss_sp; > > + int ss_flags; > > + size_t ss_size; > > + } stack = { > > + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, > > + }; > > + struct { > > + void *sa_handler_; > > + unsigned long sa_flags; > > + void *sa_restorer; > > + unsigned long long sa_mask; > > + } sa = { > > + /* Need to set SA_RESTORER (but the handler never > > returns) */ > > + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | > > 0x04000000, > > + /* no need to mask any signals */ > > + .sa_mask = 0, > > + }; > > + > > + /* set a nice name */ > > + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml- > > userspace"); > > + > > + /* read information from STDIN and close it */ > > + res = stub_syscall3(__NR_read, 0, > > + (unsigned long)&init_data, > > sizeof(init_data)); > > + if (res != sizeof(init_data)) > > + stub_syscall1(__NR_exit, 10); > > + > > + stub_syscall1(__NR_close, 0); > > + > > + /* map stub code + data */ > > + res = stub_syscall6(STUB_MMAP_NR, > > + init_data.stub_start, > > UM_KERN_PAGE_SIZE, > > + PROT_READ | PROT_EXEC, MAP_FIXED | > > MAP_SHARED, > > + init_data.stub_code_fd, > > init_data.stub_code_offset); > > + if (res != init_data.stub_start) > > + stub_syscall1(__NR_exit, 11); > > + > > + res = stub_syscall6(STUB_MMAP_NR, > > + init_data.stub_start + > > UM_KERN_PAGE_SIZE, > > + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, > > + PROT_READ | PROT_WRITE, MAP_FIXED | > > MAP_SHARED, > > + init_data.stub_data_fd, > > init_data.stub_data_offset); > > + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) > > + stub_syscall1(__NR_exit, 12); > > + > > + /* setup signal stack inside stub data */ > > + stack.ss_sp = (void *)init_data.stub_start + > > UM_KERN_PAGE_SIZE; > > + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); > > + > > + /* register SIGSEGV handler */ > > + sa.sa_handler_ = (void *) init_data.segv_handler; > > + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned > > long)&sa, 0, > > + sizeof(sa.sa_mask)); > > + if (res != 0) > > + stub_syscall1(__NR_exit, 13); > > + > > + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); > > + > > + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), > > SIGSTOP); > > + > > + stub_syscall1(__NR_exit, 14); > > + > > + __builtin_unreachable(); > > +} > > + > > +void _start(void) > > +{ > > + char *alloc; > > + > > + /* Make enough space for the stub (including space for > > alignment) */ > > + alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) * > > UM_KERN_PAGE_SIZE); > > + asm volatile("" : "+r,m"(alloc) : : "memory"); > > + > > + real_init(); > > +} > > diff --git a/arch/um/kernel/skas/stub_exe_embed.S > > b/arch/um/kernel/skas/stub_exe_embed.S > > new file mode 100644 > > index 000000000000..6d8914fbe8f1 > > --- /dev/null > > +++ b/arch/um/kernel/skas/stub_exe_embed.S > > @@ -0,0 +1,11 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ > > +#include <linux/init.h> > > +#include <linux/linkage.h> > > + > > +__INITDATA > > + > > +SYM_DATA_START(stub_exe_start) > > + .incbin "arch/um/kernel/skas/stub_exe" > > +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) > > + > > +__FINIT > > diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c > > index cf44d386f23c..857e3deab293 100644 > > --- a/arch/um/os-Linux/mem.c > > +++ b/arch/um/os-Linux/mem.c > > @@ -42,7 +42,7 @@ void kasan_map_memory(void *start, size_t len) > > } > > > > /* Set by make_tempfile() during early boot. */ > > -static char *tempdir = NULL; > > +char *tempdir = NULL; > > > > /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. > > */ > > static int __init check_tmpfs(const char *dir) > > diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os- > > Linux/skas/process.c > > index b6f656bcffb1..2caaa9e85752 100644 > > --- a/arch/um/os-Linux/skas/process.c > > +++ b/arch/um/os-Linux/skas/process.c > > @@ -10,8 +10,11 @@ > > #include <sched.h> > > #include <errno.h> > > #include <string.h> > > +#include <fcntl.h> > > +#include <mem_user.h> > > #include <sys/mman.h> > > #include <sys/wait.h> > > +#include <sys/stat.h> > > #include <asm/unistd.h> > > #include <as-layout.h> > > #include <init.h> > > @@ -189,69 +192,135 @@ static void handle_trap(int pid, struct > > uml_pt_regs *regs) > > > > extern char __syscall_stub_start[]; > > > > -/** > > - * userspace_tramp() - userspace trampoline > > - * @stack: pointer to the new userspace stack page > > - * > > - * The userspace trampoline is used to setup a new userspace > > process in start_userspace() after it was clone()'ed. > > - * This function will run on a temporary stack page. > > - * It ptrace()'es itself, then > > - * Two pages are mapped into the userspace address space: > > - * - STUB_CODE (with EXEC), which contains the skas stub code > > - * - STUB_DATA (with R/W), which contains a data page that is used > > to transfer certain data between the UML userspace process and the > > UML kernel. > > - * Also for the userspace process a SIGSEGV handler is installed > > to catch pagefaults in the userspace process. > > - * And last the process stops itself to give control to the UML > > kernel for this userspace process. > > - * > > - * Return: Always zero, otherwise the current userspace process is > > ended with non null exit() call > > - */ > > +static int stub_exe_fd; > > + > > static int userspace_tramp(void *stack) > > { > > - struct sigaction sa; > > - void *addr; > > - int fd; > > + char *const argv[] = { "uml-userspace", NULL }; > > + int pipe_fds[2]; > > unsigned long long offset; > > - unsigned long segv_handler = STUB_CODE + > > - (unsigned long) > > stub_segv_handler - > > - (unsigned long) > > __syscall_stub_start; > > - > > - ptrace(PTRACE_TRACEME, 0, 0, 0); > > - > > - signal(SIGTERM, SIG_DFL); > > - signal(SIGWINCH, SIG_IGN); > > - > > - fd = phys_mapping(uml_to_phys(__syscall_stub_start), > > &offset); > > - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, > > - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, > > offset); > > - if (addr == MAP_FAILED) { > > - os_info("mapping mmap stub at 0x%lx failed, errno = > > %d\n", > > - STUB_CODE, errno); > > - exit(1); > > + struct stub_init_data init_data = { > > + .stub_start = STUB_START, > > + .segv_handler = STUB_CODE + > > + (unsigned long) stub_segv_handler - > > + (unsigned long) > > __syscall_stub_start, > > + }; > > + struct iomem_region *iomem; > > + int ret; > > + > > + init_data.stub_code_fd = > > phys_mapping(uml_to_phys(__syscall_stub_start), > > + &offset); > > + init_data.stub_code_offset = MMAP_OFFSET(offset); > > + > > + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), > > &offset); > > + init_data.stub_data_offset = MMAP_OFFSET(offset); > > + > > + /* Set CLOEXEC on all FDs and then unset on all memory > > related FDs */ > > + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); > > + > > + fcntl(init_data.stub_data_fd, F_SETFD, 0); > > + for (iomem = iomem_regions; iomem; iomem = iomem->next) > > + fcntl(iomem->fd, F_SETFD, 0); > > + > > + /* Create a pipe for init_data (no CLOEXEC) and dup2 to > > STDIN */ > > + if (pipe2(pipe_fds, 0)) > > + exit(2); > > + > > + close(0); > > + if (dup2(pipe_fds[0], 0) < 0) { > > + close(pipe_fds[0]); > > + close(pipe_fds[1]); > > + exit(3); > > } > > + close(pipe_fds[0]); > > + > > + /* Write init_data and close write side */ > > + ret = write(pipe_fds[1], &init_data, sizeof(init_data)); > > + close(pipe_fds[1]); > > + > > + if (ret != sizeof(init_data)) > > + exit(4); > > + > > + execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); > > > > - fd = phys_mapping(uml_to_phys(stack), &offset); > > - addr = mmap((void *) STUB_DATA, > > - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ > > | PROT_WRITE, > > - MAP_FIXED | MAP_SHARED, fd, offset); > > - if (addr == MAP_FAILED) { > > - os_info("mapping segfault stack at 0x%lx failed, > > errno = %d\n", > > - STUB_DATA, errno); > > - exit(1); > > + exit(5); > > +} > > + > > +extern char stub_exe_start[]; > > +extern char stub_exe_end[]; > > + > > +extern char *tempdir; > > + > > +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" > > + > > +#ifndef MFD_EXEC > > +#define MFD_EXEC 0x0010U > > +#endif > > + > > +static int __init init_stub_exe_fd(void) > > +{ > > + size_t written = 0; > > + char *tmpfile = NULL; > > + > > + stub_exe_fd = memfd_create("uml-userspace", > > + MFD_EXEC | MFD_CLOEXEC | > > MFD_ALLOW_SEALING); > > + > > + if (stub_exe_fd < 0) { > > + printk(UM_KERN_INFO "Could not create executable > > memfd, using temporary file!"); > > + > > + tmpfile = malloc(strlen(tempdir) + > > + strlen(STUB_EXE_NAME_TEMPLATE) + > > 1); > > + if (tmpfile == NULL) > > + panic("Failed to allocate memory for stub > > binary name"); > > + > > + strcpy(tmpfile, tempdir); > > + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); > > + > > + stub_exe_fd = mkstemp(tmpfile); > > + if (stub_exe_fd < 0) > > + panic("Could not create temporary file for > > stub binary: %d", > > + -errno); > > } > > > > - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * > > UM_KERN_PAGE_SIZE); > > - sigemptyset(&sa.sa_mask); > > - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; > > - sa.sa_sigaction = (void *) segv_handler; > > - sa.sa_restorer = NULL; > > - if (sigaction(SIGSEGV, &sa, NULL) < 0) { > > - os_info("%s - setting SIGSEGV handler failed - > > errno = %d\n", > > - __func__, errno); > > - exit(1); > > + while (written < stub_exe_end - stub_exe_start) { > > + ssize_t res = write(stub_exe_fd, stub_exe_start + > > written, > > + stub_exe_end - stub_exe_start - > > written); > > + if (res < 0) { > > + if (errno == EINTR) > > + continue; > > + > > + if (tmpfile) > > + unlink(tmpfile); > > + panic("Failed write stub binary: %d", - > > errno); > > + } > > + > > + written += res; > > + } > > + > > + if (!tmpfile) { > > + fcntl(stub_exe_fd, F_ADD_SEALS, > > + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | > > F_SEAL_SEAL); > > + } else { > > + if (fchmod(stub_exe_fd, 00500) < 0) { > > + unlink(tmpfile); > > + panic("Could not make stub binary > > executable: %d", > > + -errno); > > + } > > + > > + close(stub_exe_fd); > > + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | > > O_NOFOLLOW); > > + if (stub_exe_fd < 0) { > > + unlink(tmpfile); > > + panic("Could not reopen stub binary: %d", - > > errno); > > + } > > + > > + unlink(tmpfile); > > + free(tmpfile); > > } > > > > - kill(os_getpid(), SIGSTOP); > > return 0; > > } > > +__initcall(init_stub_exe_fd); > > > > int userspace_pid[NR_CPUS]; > > > > @@ -270,7 +339,7 @@ int start_userspace(unsigned long stub_stack) > > { > > void *stack; > > unsigned long sp; > > - int pid, status, n, flags, err; > > + int pid, status, n, err; > > > > /* setup a temporary stack page */ > > stack = mmap(NULL, UM_KERN_PAGE_SIZE, > > @@ -286,10 +355,10 @@ int start_userspace(unsigned long stub_stack) > > /* set stack pointer to the end of the stack page, so it > > can grow downwards */ > > sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; > > > > - flags = CLONE_FILES | SIGCHLD; > > - > > /* clone into new userspace process */ > > - pid = clone(userspace_tramp, (void *) sp, flags, (void *) > > stub_stack); > > + pid = clone(userspace_tramp, (void *) sp, > > + CLONE_VFORK | CLONE_VM | SIGCHLD, > > + (void *)stub_stack); > > if (pid < 0) { > > err = -errno; > > printk(UM_KERN_ERR "%s : clone failed, errno = > > %d\n", > > -- > > 2.46.0 > > > >