From: Benjamin Berg <benjamin.b...@intel.com> Newer glibc versions enable rseq support by default. This remains enabled in the cloned child process, potentially causing the host kernel to write/read memory in the child.
It appears that this has not been an issue so far purely because the memory area used happened to be above TASK_SIZE and remains mapped. Note that a better approach would be to exec a small static binary that does not link with other libraries. Using a memfd and execveat, the binary could be embedded into UML itself, which would result in an entirely clean execution environment for userspace. Signed-off-by: Benjamin Berg <benjamin.b...@intel.com> --- v2: Improved clone logic using CLONE_VFORK --- arch/um/os-Linux/skas/process.c | 55 ++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 41a288dcfc34..11bc6f4ce5b3 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -255,6 +255,32 @@ static int userspace_tramp(void *stack) int userspace_pid[NR_CPUS]; int kill_userspace_mm[NR_CPUS]; +struct tramp_data { + int pid; + void *clone_sp; + void *stack; +}; + +static int userspace_tramp_clone_vm(void *data) +{ + struct tramp_data *tramp_data = data; + + /* + * At this point we are still in the same VM as the parent, but rseq + * has been disabled for this process. + * Continue with the clone into the new userspace process, the kernel + * continues as soon as this process quits (CLONE_VFORK). + */ + + tramp_data->pid = clone(userspace_tramp, tramp_data->clone_sp, + CLONE_PARENT | CLONE_FILES | SIGCHLD, + tramp_data->stack); + if (tramp_data->pid < 0) + tramp_data->pid = -errno; + + exit(0); +} + /** * start_userspace() - prepare a new userspace process * @stub_stack: pointer to the stub stack. 
@@ -268,9 +294,10 @@ int kill_userspace_mm[NR_CPUS]; */ int start_userspace(unsigned long stub_stack) { + struct tramp_data tramp_data; void *stack; unsigned long sp; - int pid, status, n, flags, err; + int pid, status, n, err; /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, @@ -286,10 +313,13 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - flags = CLONE_FILES | SIGCHLD; + tramp_data.stack = (void *) stub_stack; + tramp_data.clone_sp = (void *) sp; + tramp_data.pid = -EINVAL; - /* clone into new userspace process */ - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); + /* first stage CLONE_VM clone using VFORK and no signal notification */ + pid = clone(userspace_tramp_clone_vm, (void *) sp, + CLONE_VM | CLONE_FILES | CLONE_VFORK, &tramp_data); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", @@ -297,6 +327,21 @@ int start_userspace(unsigned long stub_stack) return err; } + n = waitpid(pid, &status, WUNTRACED | WNOHANG | __WCLONE); + if (n < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d, status = %d\n", + __func__, n < 0 ? errno : 0, status); + goto out_kill; + } + + pid = tramp_data.pid; + if (pid < 0) { + printk(UM_KERN_ERR "%s : second clone failed, errno = %d\n", + __func__, -pid); + return pid; + } + do { CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); if (n < 0) { @@ -305,7 +350,7 @@ int start_userspace(unsigned long stub_stack) __func__, errno); goto out_kill; } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { err = -EINVAL; -- 2.45.1