On Fri, 2023-09-22 at 13:16 +0200, Johannes Berg wrote: > From: Johannes Berg <johannes.b...@intel.com> > > While enabling PREEMPT on UML, we found that the call to > force_flush_all() cannot be done where it is, it sleeps > while atomic. > > Further investigation shows that all this seems at least > a bit roundabout and possibly wrong in the first > place - we copy from the 'current' process and then flush > when it starts running. > > What we really want is to start fresh with an empty mm > process, then have it all set up by the kernel (copying > the original mm contents as needed), and then sync it > in arch_dup_mmap(). > > We should do the same for the LDT, so need to split that > to be able to do this. > > Note that this fixes what seems like an issue - we look > at current->mm when we copy, but that doesn't seem right > in the case of clone() without copying the MM. This is > probably saved by the forced flush later right now.
Well, not clone(), but rather any execve(). The execve() happens in a new MM (do_execveat_common -> alloc_bprm -> bprm_mm_init -> mm_alloc) which is then swapped into the current task by exec_mmap. And, that explains why the flush was needed. Without it, our userspace had the parent's memory available unless it was un-/remapped (possibly even RW with CLONE_VM|CLONE_VFORK). This is obviously wrong and pointless as we discarded the information anyway. So, doing a flush in arch_dup_mmap seems much more reasonable to me (not entirely sure if it is needed). And, I doubt there is a benefit in optimizing the flush away by somehow cloning the userspace process. Benjamin > Signed-off-by: Johannes Berg <johannes.b...@intel.com> > --- > arch/um/include/asm/Kbuild | 1 - > arch/um/include/asm/mm_hooks.h | 22 ++++++ > arch/um/include/asm/mmu.h | 3 +- > arch/um/include/asm/mmu_context.h | 2 +- > arch/um/include/shared/os.h | 1 - > arch/um/kernel/process.c | 3 - > arch/um/kernel/skas/mmu.c | 35 ++++++---- > arch/um/kernel/tlb.c | 5 +- > arch/um/os-Linux/skas/process.c | 107 ---------------------------- > -- > arch/x86/um/ldt.c | 53 +++++++-------- > 10 files changed, 76 insertions(+), 156 deletions(-) > create mode 100644 arch/um/include/asm/mm_hooks.h > > diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild > index b2d834a29f3a..de8d82a6fd7b 100644 > --- a/arch/um/include/asm/Kbuild > +++ b/arch/um/include/asm/Kbuild > @@ -26,5 +26,4 @@ generic-y += switch_to.h > generic-y += topology.h > generic-y += trace_clock.h > generic-y += kprobes.h > -generic-y += mm_hooks.h > generic-y += vga.h > diff --git a/arch/um/include/asm/mm_hooks.h > b/arch/um/include/asm/mm_hooks.h > new file mode 100644 > index 000000000000..b1016520c5b8 > --- /dev/null > +++ b/arch/um/include/asm/mm_hooks.h > @@ -0,0 +1,22 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_UM_MM_HOOKS_H > +#define _ASM_UM_MM_HOOKS_H > + > +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct 
*mm); > + > +static inline void arch_exit_mmap(struct mm_struct *mm) > +{ > +} > + > +static inline void arch_unmap(struct mm_struct *mm, > + unsigned long start, unsigned long end) > +{ > +} > + > +static inline bool arch_vma_access_permitted(struct vm_area_struct > *vma, > + bool write, bool execute, bool foreign) > +{ > + /* by default, allow everything */ > + return true; > +} > +#endif /* _ASM_UM_MM_HOOKS_H */ > diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h > index 5b072aba5b65..68a710d23b5d 100644 > --- a/arch/um/include/asm/mmu.h > +++ b/arch/um/include/asm/mmu.h > @@ -18,7 +18,8 @@ typedef struct mm_context { > extern void __switch_mm(struct mm_id * mm_idp); > > /* Avoid tangled inclusion with asm/ldt.h */ > -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context > *from_mm); > +extern int init_new_ldt(struct mm_context *to_mm); > +extern int copy_ldt(struct mm_context *to_mm, struct mm_context > *from_mm); > extern void free_ldt(struct mm_context *mm); > > #endif > diff --git a/arch/um/include/asm/mmu_context.h > b/arch/um/include/asm/mmu_context.h > index 68e2eb9cfb47..8668861d4a85 100644 > --- a/arch/um/include/asm/mmu_context.h > +++ b/arch/um/include/asm/mmu_context.h > @@ -13,7 +13,7 @@ > #include <asm/mm_hooks.h> > #include <asm/mmu.h> > > -extern void force_flush_all(void); > +void force_flush_all(struct mm_struct *mm); > > #define activate_mm activate_mm > static inline void activate_mm(struct mm_struct *old, struct > mm_struct *new) > diff --git a/arch/um/include/shared/os.h > b/arch/um/include/shared/os.h > index 1a82c6548dd5..c9acc28fe47c 100644 > --- a/arch/um/include/shared/os.h > +++ b/arch/um/include/shared/os.h > @@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp, > unsigned long addr, > /* skas/process.c */ > extern int is_skas_winch(int pid, int fd, void *data); > extern int start_userspace(unsigned long stub_stack); > -extern int copy_context_skas0(unsigned long stack, int pid); > extern void 
userspace(struct uml_pt_regs *regs, unsigned long > *aux_fp_regs); > extern void new_thread(void *stack, jmp_buf *buf, void > (*handler)(void)); > extern void switch_threads(jmp_buf *me, jmp_buf *you); > diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c > index 6daffb9d8a8d..a024acd6d85c 100644 > --- a/arch/um/kernel/process.c > +++ b/arch/um/kernel/process.c > @@ -25,7 +25,6 @@ > #include <linux/threads.h> > #include <linux/resume_user_mode.h> > #include <asm/current.h> > -#include <asm/mmu_context.h> > #include <linux/uaccess.h> > #include <as-layout.h> > #include <kern_util.h> > @@ -139,8 +138,6 @@ void new_thread_handler(void) > /* Called magically, see new_thread_handler above */ > void fork_handler(void) > { > - force_flush_all(); > - > schedule_tail(current->thread.prev_sched); > > /* > diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c > index 656fe16c9b63..ac4ca203ac24 100644 > --- a/arch/um/kernel/skas/mmu.c > +++ b/arch/um/kernel/skas/mmu.c > @@ -10,13 +10,13 @@ > > #include <asm/pgalloc.h> > #include <asm/sections.h> > +#include <asm/mmu_context.h> > #include <as-layout.h> > #include <os.h> > #include <skas.h> > > int init_new_context(struct task_struct *task, struct mm_struct *mm) > { > - struct mm_context *from_mm = NULL; > struct mm_context *to_mm = &mm->context; > unsigned long stack = 0; > int ret = -ENOMEM; > @@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task, > struct mm_struct *mm) > goto out; > > to_mm->id.stack = stack; > - if (current->mm != NULL && current->mm != &init_mm) > - from_mm = ¤t->mm->context; > > + /* > + * Allocate a completely fresh mm. We'll sync the mappings > once > + * the rest of the kernel is done, in arch_copy_mm(). 
> + */ > block_signals_trace(); > - if (from_mm) > - to_mm->id.u.pid = copy_context_skas0(stack, > - from_mm- > >id.u.pid); > - else to_mm->id.u.pid = start_userspace(stack); > + to_mm->id.u.pid = start_userspace(stack); > unblock_signals_trace(); > > if (to_mm->id.u.pid < 0) { > @@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task, > struct mm_struct *mm) > goto out_free; > } > > - ret = init_new_ldt(to_mm, from_mm); > - if (ret < 0) { > - printk(KERN_ERR "init_new_context_skas - init_ldt" > - " failed, errno = %d\n", ret); > + ret = init_new_ldt(to_mm); > + if (ret) > goto out_free; > - } > > return 0; > > @@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task, > struct mm_struct *mm) > return ret; > } > > +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) > +{ > + int ret = copy_ldt(&mm->context, &oldmm->context); > + > + if (ret < 0) { > + printk(KERN_ERR "%s - copy_ldt failed, errno = %d\n", > + __func__, ret); > + return ret; > + } > + > + force_flush_all(mm); > + return 0; > +} > + > + > void destroy_context(struct mm_struct *mm) > { > struct mm_context *mmu = &mm->context; > diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c > index 34ec8e677fb9..7c0161321fd9 100644 > --- a/arch/um/kernel/tlb.c > +++ b/arch/um/kernel/tlb.c > @@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm) > fix_range(mm, vma->vm_start, vma->vm_end, 0); > } > > -void force_flush_all(void) > +void force_flush_all(struct mm_struct *mm) > { > - struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > VMA_ITERATOR(vmi, mm, 0); > > - mmap_read_lock(mm); > for_each_vma(vmi, vma) > fix_range(mm, vma->vm_start, vma->vm_end, 1); > - mmap_read_unlock(mm); > } > diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os- > Linux/skas/process.c > index f92129bbf981..403f4c6082b6 100644 > --- a/arch/um/os-Linux/skas/process.c > +++ b/arch/um/os-Linux/skas/process.c > @@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs, > 
unsigned long *aux_fp_regs) > } > } > > -static unsigned long thread_regs[MAX_REG_NR]; > -static unsigned long thread_fp_regs[FP_SIZE]; > - > -static int __init init_thread_regs(void) > -{ > - get_safe_registers(thread_regs, thread_fp_regs); > - /* Set parent's instruction pointer to start of clone-stub */ > - thread_regs[REGS_IP_INDEX] = STUB_CODE + > - (unsigned long) stub_clone_handler - > - (unsigned long) __syscall_stub_start; > - thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES * > UM_KERN_PAGE_SIZE - > - sizeof(void *); > -#ifdef __SIGNAL_FRAMESIZE > - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; > -#endif > - return 0; > -} > - > -__initcall(init_thread_regs); > - > -int copy_context_skas0(unsigned long new_stack, int pid) > -{ > - int err; > - unsigned long current_stack = current_stub_stack(); > - struct stub_data *data = (struct stub_data *) current_stack; > - struct stub_data *child_data = (struct stub_data *) > new_stack; > - unsigned long long new_offset; > - int new_fd = phys_mapping(uml_to_phys((void *)new_stack), > &new_offset); > - > - /* > - * prepare offset and fd of child's stack as argument for > parent's > - * and child's mmap2 calls > - */ > - *data = ((struct stub_data) { > - .offset = MMAP_OFFSET(new_offset), > - .fd = new_fd, > - .parent_err = -ESRCH, > - .child_err = 0, > - }); > - > - *child_data = ((struct stub_data) { > - .child_err = -ESRCH, > - }); > - > - err = ptrace_setregs(pid, thread_regs); > - if (err < 0) { > - err = -errno; > - printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = > %d, errno = %d\n", > - __func__, pid, -err); > - return err; > - } > - > - err = put_fp_registers(pid, thread_fp_regs); > - if (err < 0) { > - printk(UM_KERN_ERR "%s : put_fp_registers failed, pid > = %d, err = %d\n", > - __func__, pid, err); > - return err; > - } > - > - /* > - * Wait, until parent has finished its work: read child's pid > from > - * parent's stack, and check, if bad result. 
> - */ > - err = ptrace(PTRACE_CONT, pid, 0, 0); > - if (err) { > - err = -errno; > - printk(UM_KERN_ERR "Failed to continue new process, > pid = %d, errno = %d\n", > - pid, errno); > - return err; > - } > - > - wait_stub_done(pid); > - > - pid = data->parent_err; > - if (pid < 0) { > - printk(UM_KERN_ERR "%s - stub-parent reports error > %d\n", > - __func__, -pid); > - return pid; > - } > - > - /* > - * Wait, until child has finished too: read child's result > from > - * child's stack and check it. > - */ > - wait_stub_done(pid); > - if (child_data->child_err != STUB_DATA) { > - printk(UM_KERN_ERR "%s - stub-child %d reports error > %ld\n", > - __func__, pid, data->child_err); > - err = data->child_err; > - goto out_kill; > - } > - > - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL, > - (void *)PTRACE_O_TRACESYSGOOD) < 0) { > - err = -errno; > - printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, > errno = %d\n", > - __func__, errno); > - goto out_kill; > - } > - > - return pid; > - > - out_kill: > - os_kill_ptraced_process(pid, 1); > - return err; > -} > - > void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) > { > (*buf)[0].JB_IP = (unsigned long) handler; > diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c > index 255a44dd415a..609feaeff23b 100644 > --- a/arch/x86/um/ldt.c > +++ b/arch/x86/um/ldt.c > @@ -297,36 +297,37 @@ static void ldt_get_host_info(void) > free_pages((unsigned long)ldt, order); > } > > -long init_new_ldt(struct mm_context *new_mm, struct mm_context > *from_mm) > +int init_new_ldt(struct mm_context *new_mm) > { > - struct user_desc desc; > + struct user_desc desc = {}; > short * num_p; > - int i; > - long page, err=0; > + int err = 0; > void *addr = NULL; > > - > mutex_init(&new_mm->arch.ldt.lock); > > - if (!from_mm) { > - memset(&desc, 0, sizeof(desc)); > - /* > - * Now we try to retrieve info about the ldt, we > - * inherited from the host. 
All ldt-entries found > - * will be reset in the following loop > - */ > - ldt_get_host_info(); > - for (num_p=host_ldt_entries; *num_p != -1; num_p++) { > - desc.entry_number = *num_p; > - err = write_ldt_entry(&new_mm->id, 1, &desc, > - &addr, *(num_p + 1) == > -1); > - if (err) > - break; > - } > - new_mm->arch.ldt.entry_count = 0; > - > - goto out; > + memset(&desc, 0, sizeof(desc)); > + /* > + * Now we try to retrieve info about the ldt, we > + * inherited from the host. All ldt-entries found > + * will be reset in the following loop > + */ > + ldt_get_host_info(); > + for (num_p=host_ldt_entries; *num_p != -1; num_p++) { > + desc.entry_number = *num_p; > + err = write_ldt_entry(&new_mm->id, 1, &desc, > + &addr, *(num_p + 1) == -1); > + if (err) > + break; > } > + new_mm->arch.ldt.entry_count = 0; > + > + return err; > +} > + > +int copy_ldt(struct mm_context *new_mm, struct mm_context *from_mm) > +{ > + int err = 0; > > /* > * Our local LDT is used to supply the data for > @@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm, > struct mm_context *from_mm) > memcpy(new_mm->arch.ldt.u.entries, from_mm- > >arch.ldt.u.entries, > sizeof(new_mm->arch.ldt.u.entries)); > else { > - i = from_mm->arch.ldt.entry_count / > LDT_ENTRIES_PER_PAGE; > + int i = from_mm->arch.ldt.entry_count / > LDT_ENTRIES_PER_PAGE; > + unsigned long page; > + > while (i-->0) { > page = > __get_free_page(GFP_KERNEL|__GFP_ZERO); > if (!page) { > @@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm, > struct mm_context *from_mm) > new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; > mutex_unlock(&from_mm->arch.ldt.lock); > > - out: > return err; > } > > - > void free_ldt(struct mm_context *mm) > { > int i; _______________________________________________ linux-um mailing list linux-um@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-um