On 6/19/24 11:34 PM, Benjamin Berg wrote: > From: Benjamin Berg <benjamin.b...@intel.com> > > Using clone will not undo features that have been enabled by libc. An > example of this already happening is rseq, which could cause the kernel > to read/write memory of the userspace process. In the future the > standard library might also use mseal by default to protect itself, > which would also thwart our attempts at unmapping everything. > > Solve all this by taking a step back and doing an execve into a tiny > static binary that sets up the minimal environment required for the > stub without using any standard library. That way we have a clean > execution environment that is fully under the control of UML. > > Note that this changes things a bit as the FDs are not anymore shared > with the kernel. Instead, we explicitly share the FDs for the physical > memory and all existing iomem regions. Doing this is fine, as iomem > regions cannot be added at runtime.
This is pretty cool! :) > > Signed-off-by: Benjamin Berg <benjamin.b...@intel.com> > --- > arch/um/include/shared/skas/stub-data.h | 11 ++ > arch/um/os-Linux/skas/process.c | 145 +++++++++++++++--------- > arch/x86/um/.gitignore | 2 + > arch/x86/um/Makefile | 32 +++++- > arch/x86/um/stub_elf.c | 84 ++++++++++++++ > arch/x86/um/stub_elf_embed.S | 11 ++ > 6 files changed, 227 insertions(+), 58 deletions(-) > create mode 100644 arch/x86/um/.gitignore > create mode 100644 arch/x86/um/stub_elf.c > create mode 100644 arch/x86/um/stub_elf_embed.S > > diff --git a/arch/um/include/shared/skas/stub-data.h > b/arch/um/include/shared/skas/stub-data.h > index 5e3ade3fb38b..83d210f59956 100644 > --- a/arch/um/include/shared/skas/stub-data.h > +++ b/arch/um/include/shared/skas/stub-data.h > @@ -8,6 +8,17 @@ > #ifndef __STUB_DATA_H > #define __STUB_DATA_H > > +struct stub_init_data { > + unsigned long stub_start; > + > + int stub_code_fd; > + unsigned long stub_code_offset; > + int stub_data_fd; > + unsigned long stub_data_offset; > + > + unsigned long segv_handler; > +}; > + > struct stub_data { > unsigned long offset; > int fd; > diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c > index 41a288dcfc34..6cd82ca7a43d 100644 > --- a/arch/um/os-Linux/skas/process.c > +++ b/arch/um/os-Linux/skas/process.c > @@ -23,6 +23,8 @@ > #include <skas.h> > #include <sysdep/stub.h> > #include <linux/threads.h> > +#include <fcntl.h> > +#include <mem_user.h> > #include "../internal.h" > > int is_skas_winch(int pid, int fd, void *data) > @@ -188,69 +190,100 @@ static void handle_trap(int pid, struct uml_pt_regs > *regs) > > extern char __syscall_stub_start[]; > > -/** > - * userspace_tramp() - userspace trampoline > - * @stack: pointer to the new userspace stack page > - * > - * The userspace trampoline is used to setup a new userspace process in > start_userspace() after it was clone()'ed. > - * This function will run on a temporary stack page. > - * It ptrace()'es itself, then > - * Two pages are mapped into the userspace address space: > - * - STUB_CODE (with EXEC), which contains the skas stub code > - * - STUB_DATA (with R/W), which contains a data page that is used to > transfer certain data between the UML userspace process and the UML kernel. > - * Also for the userspace process a SIGSEGV handler is installed to catch > pagefaults in the userspace process. > - * And last the process stops itself to give control to the UML kernel for > this userspace process. > - * > - * Return: Always zero, otherwise the current userspace process is ended > with non null exit() call > - */ > +int userspace_pid[NR_CPUS]; Nit: userspace_pid is already defined in this file. > + > +extern char stub_elf_start[]; > +extern char stub_elf_end[]; > + > +int stub_exec_fd; stub_exec_fd isn't used outside this file. Might be better to make it static. > + > static int userspace_tramp(void *stack) > { > - struct sigaction sa; > - void *addr; > - int fd; > + char *const argv[] = { "uml-userspace", NULL }; > + int pipe_fds[2]; > unsigned long long offset; > - unsigned long segv_handler = STUB_CODE + > - (unsigned long) stub_segv_handler - > - (unsigned long) __syscall_stub_start; > - > - ptrace(PTRACE_TRACEME, 0, 0, 0); > - > - signal(SIGTERM, SIG_DFL); > - signal(SIGWINCH, SIG_IGN); > - > - fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); > - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, > - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); > - if (addr == MAP_FAILED) { > - os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", > - STUB_CODE, errno); > - exit(1); > + struct stub_init_data init_data = { > + .stub_start = STUB_START, > + .segv_handler = STUB_CODE + > + (unsigned long) stub_segv_handler - > + (unsigned long) __syscall_stub_start, > + }; > + struct iomem_region *iomem = iomem_regions; > + int ret; > + > + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), > + &offset); > + init_data.stub_code_offset = MMAP_OFFSET(offset); > + > + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); > + init_data.stub_data_offset = MMAP_OFFSET(offset); > + > + /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ > + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); > + > + fcntl(init_data.stub_data_fd, F_SETFD, 0); > + while (iomem) { > + fcntl(init_data.stub_data_fd, F_SETFD, 0); Typo: this is supposed to be iomem->fd. > + iomem = iomem->next; > } > > - fd = phys_mapping(uml_to_phys(stack), &offset); > - addr = mmap((void *) STUB_DATA, > - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, > - MAP_FIXED | MAP_SHARED, fd, offset); > - if (addr == MAP_FAILED) { > - os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", > - STUB_DATA, errno); > - exit(1); > + /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ > + if (pipe2(pipe_fds, 0)) > + exit(2); > + > + close(0); > + if (dup2(pipe_fds[0], 0) < 0) { > + close(pipe_fds[0]); > + close(pipe_fds[1]); > + exit(3); > } > + close(pipe_fds[0]); > + > + /* Write init_data and close write side */ > + ret = write(pipe_fds[1], &init_data, sizeof(init_data)); > + close(pipe_fds[1]); > + > + if (ret != sizeof(init_data)) > + exit(4); > + > + execveat(stub_exec_fd, "", argv, NULL, AT_EMPTY_PATH); > > - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); > - sigemptyset(&sa.sa_mask); > - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; > - sa.sa_sigaction = (void *) segv_handler; > - sa.sa_restorer = NULL; > - if (sigaction(SIGSEGV, &sa, NULL) < 0) { > - os_info("%s - setting SIGSEGV handler failed - errno = %d\n", > - __func__, errno); > - exit(1); > + close(0); > + > + exit(4); Might be better to bump the exit code. > +} > + [...] > diff --git a/arch/x86/um/stub_elf.c b/arch/x86/um/stub_elf.c > new file mode 100644 > index 000000000000..b634740a6a15 > --- /dev/null > +++ b/arch/x86/um/stub_elf.c > @@ -0,0 +1,84 @@ > +#include <sys/ptrace.h> > +#include <sys/prctl.h> > +#include <asm/unistd.h> > +#include <sysdep/stub.h> > +#include <stub-data.h> > + > +void _start(void); > + > +static void real_init(void) > +{ > + struct stub_init_data init_data; > + unsigned long res; > + struct { > + void *ss_sp; > + int ss_flags; > + size_t ss_size; > + } stack; > + struct { > + void *sa_handler_; > + unsigned long sa_flags; > + void *sa_restorer; > + unsigned long sa_mask; > + } sa = {}; > + > + /* set a nice name */ > + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); > + > + /* read information from STDIN and close it */ > + stub_syscall3(__NR_read, 0, > + (unsigned long)&init_data, sizeof(init_data)); Might be better to check the number of bytes read. Regards, Tiwei