On 6/19/24 11:34 PM, Benjamin Berg wrote:
> From: Benjamin Berg <benjamin.b...@intel.com>
> 
> Using clone will not undo features that have been enabled by libc. An
> example of this already happening is rseq, which could cause the kernel
> to read/write memory of the userspace process. In the future the
> standard library might also use mseal by default to protect itself,
> which would also thwart our attempts at unmapping everything.
> 
> Solve all this by taking a step back and doing an execve into a tiny
> static binary that sets up the minimal environment required for the
> stub without using any standard library. That way we have a clean
> execution environment that is fully under the control of UML.
> 
> Note that this changes things a bit as the FDs are not anymore shared
> with the kernel. Instead, we explicitly share the FDs for the physical
> memory and all existing iomem regions. Doing this is fine, as iomem
> regions cannot be added at runtime.

This is pretty cool! :)

> 
> Signed-off-by: Benjamin Berg <benjamin.b...@intel.com>
> ---
>  arch/um/include/shared/skas/stub-data.h |  11 ++
>  arch/um/os-Linux/skas/process.c         | 145 +++++++++++++++---------
>  arch/x86/um/.gitignore                  |   2 +
>  arch/x86/um/Makefile                    |  32 +++++-
>  arch/x86/um/stub_elf.c                  |  84 ++++++++++++++
>  arch/x86/um/stub_elf_embed.S            |  11 ++
>  6 files changed, 227 insertions(+), 58 deletions(-)
>  create mode 100644 arch/x86/um/.gitignore
>  create mode 100644 arch/x86/um/stub_elf.c
>  create mode 100644 arch/x86/um/stub_elf_embed.S
> 
> diff --git a/arch/um/include/shared/skas/stub-data.h 
> b/arch/um/include/shared/skas/stub-data.h
> index 5e3ade3fb38b..83d210f59956 100644
> --- a/arch/um/include/shared/skas/stub-data.h
> +++ b/arch/um/include/shared/skas/stub-data.h
> @@ -8,6 +8,17 @@
>  #ifndef __STUB_DATA_H
>  #define __STUB_DATA_H
>  
> +struct stub_init_data {
> +     unsigned long stub_start;
> +
> +     int stub_code_fd;
> +     unsigned long stub_code_offset;
> +     int stub_data_fd;
> +     unsigned long stub_data_offset;
> +
> +     unsigned long segv_handler;
> +};
> +
>  struct stub_data {
>       unsigned long offset;
>       int fd;
> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
> index 41a288dcfc34..6cd82ca7a43d 100644
> --- a/arch/um/os-Linux/skas/process.c
> +++ b/arch/um/os-Linux/skas/process.c
> @@ -23,6 +23,8 @@
>  #include <skas.h>
>  #include <sysdep/stub.h>
>  #include <linux/threads.h>
> +#include <fcntl.h>
> +#include <mem_user.h>
>  #include "../internal.h"
>  
>  int is_skas_winch(int pid, int fd, void *data)
> @@ -188,69 +190,100 @@ static void handle_trap(int pid, struct uml_pt_regs 
> *regs)
>  
>  extern char __syscall_stub_start[];
>  
> -/**
> - * userspace_tramp() - userspace trampoline
> - * @stack:   pointer to the new userspace stack page
> - *
> - * The userspace trampoline is used to setup a new userspace process in 
> start_userspace() after it was clone()'ed.
> - * This function will run on a temporary stack page.
> - * It ptrace()'es itself, then
> - * Two pages are mapped into the userspace address space:
> - * - STUB_CODE (with EXEC), which contains the skas stub code
> - * - STUB_DATA (with R/W), which contains a data page that is used to 
> transfer certain data between the UML userspace process and the UML kernel.
> - * Also for the userspace process a SIGSEGV handler is installed to catch 
> pagefaults in the userspace process.
> - * And last the process stops itself to give control to the UML kernel for 
> this userspace process.
> - *
> - * Return: Always zero, otherwise the current userspace process is ended 
> with non null exit() call
> - */
> +int userspace_pid[NR_CPUS];

Nit: userspace_pid is already defined in this file.

> +
> +extern char stub_elf_start[];
> +extern char stub_elf_end[];
> +
> +int stub_exec_fd;

stub_exec_fd isn't used outside this file. Might be better to make it static.

> +
>  static int userspace_tramp(void *stack)
>  {
> -     struct sigaction sa;
> -     void *addr;
> -     int fd;
> +     char *const argv[] = { "uml-userspace", NULL };
> +     int pipe_fds[2];
>       unsigned long long offset;
> -     unsigned long segv_handler = STUB_CODE +
> -                                  (unsigned long) stub_segv_handler -
> -                                  (unsigned long) __syscall_stub_start;
> -
> -     ptrace(PTRACE_TRACEME, 0, 0, 0);
> -
> -     signal(SIGTERM, SIG_DFL);
> -     signal(SIGWINCH, SIG_IGN);
> -
> -     fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset);
> -     addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
> -                   PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
> -     if (addr == MAP_FAILED) {
> -             os_info("mapping mmap stub at 0x%lx failed, errno = %d\n",
> -                     STUB_CODE, errno);
> -             exit(1);
> +     struct stub_init_data init_data = {
> +             .stub_start = STUB_START,
> +             .segv_handler = STUB_CODE +
> +                             (unsigned long) stub_segv_handler -
> +                             (unsigned long) __syscall_stub_start,
> +     };
> +     struct iomem_region *iomem = iomem_regions;
> +     int ret;
> +
> +     init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
> +                                           &offset);
> +     init_data.stub_code_offset = MMAP_OFFSET(offset);
> +
> +     init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
> +     init_data.stub_data_offset = MMAP_OFFSET(offset);
> +
> +     /* Set CLOEXEC on all FDs and then unset on all memory related FDs */
> +     close_range(0, ~0U, CLOSE_RANGE_CLOEXEC);
> +
> +     fcntl(init_data.stub_data_fd, F_SETFD, 0);
> +     while (iomem) {
> +             fcntl(init_data.stub_data_fd, F_SETFD, 0);

Typo: this is supposed to be iomem->fd.

> +             iomem = iomem->next;
>       }
>  
> -     fd = phys_mapping(uml_to_phys(stack), &offset);
> -     addr = mmap((void *) STUB_DATA,
> -                 STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
> -                 MAP_FIXED | MAP_SHARED, fd, offset);
> -     if (addr == MAP_FAILED) {
> -             os_info("mapping segfault stack at 0x%lx failed, errno = %d\n",
> -                     STUB_DATA, errno);
> -             exit(1);
> +     /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
> +     if (pipe2(pipe_fds, 0))
> +             exit(2);
> +
> +     close(0);
> +     if (dup2(pipe_fds[0], 0) < 0) {
> +             close(pipe_fds[0]);
> +             close(pipe_fds[1]);
> +             exit(3);
>       }
> +     close(pipe_fds[0]);
> +
> +     /* Write init_data and close write side */
> +     ret = write(pipe_fds[1], &init_data, sizeof(init_data));
> +     close(pipe_fds[1]);
> +
> +     if (ret != sizeof(init_data))
> +             exit(4);
> +
> +     execveat(stub_exec_fd, "", argv, NULL, AT_EMPTY_PATH);
>  
> -     set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);
> -     sigemptyset(&sa.sa_mask);
> -     sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
> -     sa.sa_sigaction = (void *) segv_handler;
> -     sa.sa_restorer = NULL;
> -     if (sigaction(SIGSEGV, &sa, NULL) < 0) {
> -             os_info("%s - setting SIGSEGV handler failed - errno = %d\n",
> -                     __func__, errno);
> -             exit(1);
> +     close(0);
> +
> +     exit(4);

Might be better to bump the exit code.

> +}
> +
[...]
> diff --git a/arch/x86/um/stub_elf.c b/arch/x86/um/stub_elf.c
> new file mode 100644
> index 000000000000..b634740a6a15
> --- /dev/null
> +++ b/arch/x86/um/stub_elf.c
> @@ -0,0 +1,84 @@
> +#include <sys/ptrace.h>
> +#include <sys/prctl.h>
> +#include <asm/unistd.h>
> +#include <sysdep/stub.h>
> +#include <stub-data.h>
> +
> +void _start(void);
> +
> +static void real_init(void)
> +{
> +     struct stub_init_data init_data;
> +     unsigned long res;
> +     struct {
> +             void  *ss_sp;
> +             int    ss_flags;
> +             size_t ss_size;
> +     } stack;
> +     struct {
> +             void *sa_handler_;
> +             unsigned long sa_flags;
> +             void *sa_restorer;
> +             unsigned long sa_mask;
> +     } sa = {};
> +
> +     /* set a nice name */
> +     stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace");
> +
> +     /* read information from STDIN and close it */
> +     stub_syscall3(__NR_read, 0,
> +                   (unsigned long)&init_data, sizeof(init_data));

Might be better to check the number of bytes read.

Regards,
Tiwei

Reply via email to