Hi, On Tue, 2025-01-14 at 20:30 +0900, Hajime Tazaki wrote: > This patchset is another spin of the nommu mode addition to UML. It hasn't > changed a lot since the last version (v5), but contains cleanups. It would > be nice to hear your opinions on it. > > There are still several limitations/issues which we already found; > here is the list of those issues. > > - memory mapped by loadable modules is not distinguished from > userspace memory.
Maybe I am missing it, but I do not yet see proper FP register handling. This will be needed for task/thread switches and also signal emission/sigreturn. I am attaching the test program that I used to verify the correct behaviour when dealing with the recent changes to FP register handling in UML. Benjamin > > -- Hajime > > v6: > - rebase to the latest uml/next tree > - more clean up on mmu/nommu for signal handling [10/13] > - rename functions of mcontext routines [06,10/13] > - added Acked-by tag for binfmt_elf_fdpic [02/13] > > v5: > - clean up stack manipulation code [05,06,07,10/13] > - > https://lore.kernel.org/linux-um/cover.1733998168.git.thehaj...@gmail.com/ > > v4: > - add arch/um/nommu, arch/x86/um/nommu to contain !MMU specific codes > - remove zpoline patch > - drop binfmt_elf_fdpic patch > - reduce ifndef CONFIG_MMU if possible > - split to elf header cleanup patch [01/13] > - fix kernel test robot warnings [06/13] > - fix coding styles [07/13] > - move task_top_of_stack definition [05/13] > - > https://lore.kernel.org/linux-um/cover.1733652929.git.thehaj...@gmail.com/ > > v3: > - > https://lore.kernel.org/linux-um/cover.1733199769.git.thehaj...@gmail.com/ > - add seccomp-based syscall hook in addition to zpoline [06/13] > - remove RFC, add a line to MAINTAINERS file > - fix kernel test robot warnings [02/13,08/13,10/13] > - add base-commit tag to cover letter > - pull the latest uml/next > - clean up SIGSEGV handling [10/13] > - detect fsgsbase availability with elf aux vector [08/13] > - simplify vdso code with macros [09/13] > > RFC v2: > - > https://lore.kernel.org/linux-um/cover.1731290567.git.thehaj...@gmail.com/ > - base branch is now uml/linux.git instead of torvalds/linux.git. 
> - reorganize the patch series to clean up > - fixed various coding styles issues > - clean up exec code path [07/13] > - fixed the crash/SIGSEGV case on userspace programs [10/13] > - add seccomp filter to limit syscall caller address [06/13] > - detect fsgsbase availability with sigsetjmp/siglongjmp [08/13] > - removes unrelated changes > - removes unneeded ifndef CONFIG_MMU > - convert UML_CONFIG_MMU to CONFIG_MMU as using uml/linux.git > - proposed a patch of maple-tree issue (resolving a limitation in RFC > v1) > > https://lore.kernel.org/linux-mm/20241108222834.3625217-1-thehaj...@gmail.com/ > > RFC: > - > https://lore.kernel.org/linux-um/cover.1729770373.git.thehaj...@gmail.com/ > > Hajime Tazaki (13): > x86/um: clean up elf specific definitions > x86/um: nommu: elf loader for fdpic > um: decouple MMU specific code from the common part > um: nommu: memory handling > x86/um: nommu: syscall handling > um: nommu: seccomp syscalls hook > x86/um: nommu: process/thread handling > um: nommu: configure fs register on host syscall invocation > x86/um/vdso: nommu: vdso memory update > x86/um: nommu: signal handling > um: change machine name for uname output > um: nommu: add documentation of nommu UML > um: nommu: plug nommu code into build system > > Documentation/virt/uml/nommu-uml.rst | 177 ++++++++++++++++++++++ > MAINTAINERS | 1 + > arch/um/Kconfig | 14 +- > arch/um/Makefile | 10 ++ > arch/um/configs/x86_64_nommu_defconfig | 64 ++++++++ > arch/um/include/asm/Kbuild | 1 + > arch/um/include/asm/futex.h | 4 + > arch/um/include/asm/mmu.h | 8 + > arch/um/include/asm/mmu_context.h | 2 + > arch/um/include/asm/ptrace-generic.h | 6 + > arch/um/include/asm/uaccess.h | 7 +- > arch/um/include/shared/kern_util.h | 12 ++ > arch/um/include/shared/os.h | 16 ++ > arch/um/kernel/Makefile | 5 +- > arch/um/kernel/mem-pgtable.c | 55 +++++++ > arch/um/kernel/mem.c | 39 +---- > arch/um/kernel/process.c | 25 ++++ > arch/um/kernel/skas/process.c | 27 ---- > arch/um/kernel/um_arch.c | 3 + 
> arch/um/nommu/Makefile | 3 + > arch/um/nommu/os-Linux/Makefile | 7 + > arch/um/nommu/os-Linux/signal.c | 28 ++++ > arch/um/nommu/trap.c | 188 > ++++++++++++++++++++++++ > arch/um/os-Linux/Makefile | 8 +- > arch/um/os-Linux/internal.h | 5 + > arch/um/os-Linux/mem.c | 4 + > arch/um/os-Linux/process.c | 149 ++++++++++++++++++- > arch/um/os-Linux/seccomp.c | 87 +++++++++++ > arch/um/os-Linux/signal.c | 31 +++- > arch/um/os-Linux/skas/process.c | 132 ----------------- > arch/um/os-Linux/start_up.c | 20 +++ > arch/um/os-Linux/util.c | 3 +- > arch/x86/um/Makefile | 7 +- > arch/x86/um/asm/elf.h | 8 +- > arch/x86/um/asm/module.h | 24 --- > arch/x86/um/nommu/Makefile | 8 + > arch/x86/um/nommu/do_syscall_64.c | 74 ++++++++++ > arch/x86/um/nommu/entry_64.S | 113 ++++++++++++++ > arch/x86/um/nommu/os-Linux/Makefile | 6 + > arch/x86/um/nommu/os-Linux/mcontext.c | 24 +++ > arch/x86/um/nommu/syscalls.h | 16 ++ > arch/x86/um/nommu/syscalls_64.c | 115 +++++++++++++++ > arch/x86/um/shared/sysdep/mcontext.h | 4 + > arch/x86/um/shared/sysdep/syscalls_64.h | 6 + > arch/x86/um/signal.c | 7 + > arch/x86/um/vdso/vma.c | 17 ++- > fs/Kconfig.binfmt | 2 +- > 47 files changed, 1328 insertions(+), 244 deletions(-) > create mode 100644 Documentation/virt/uml/nommu-uml.rst > create mode 100644 arch/um/configs/x86_64_nommu_defconfig > create mode 100644 arch/um/kernel/mem-pgtable.c > create mode 100644 arch/um/nommu/Makefile > create mode 100644 arch/um/nommu/os-Linux/Makefile > create mode 100644 arch/um/nommu/os-Linux/signal.c > create mode 100644 arch/um/nommu/trap.c > create mode 100644 arch/um/os-Linux/seccomp.c > delete mode 100644 arch/x86/um/asm/module.h > create mode 100644 arch/x86/um/nommu/Makefile > create mode 100644 arch/x86/um/nommu/do_syscall_64.c > create mode 100644 arch/x86/um/nommu/entry_64.S > create mode 100644 arch/x86/um/nommu/os-Linux/Makefile > create mode 100644 arch/x86/um/nommu/os-Linux/mcontext.c > create mode 100644 arch/x86/um/nommu/syscalls.h > create mode 100644 
arch/x86/um/nommu/syscalls_64.c > > > base-commit: 2d2b61ae38bd91217ea7cc5bc700a2b9e75b3937
/* * gcc test-signal-restore.c -o test-signal-restore-amd64 * gcc -m32 -march=i686 -lm test-signal-restore.c -o test-signal-restore-i386 */ /* Is there a better way to *not* include bits/sigcontext.h? */ #include <features.h> #undef __USE_MISC #include <asm/sigcontext.h> #include <elf.h> #include <math.h> #include <stdio.h> #include <signal.h> #include <stdlib.h> #include <sys/mman.h> #include <sys/wait.h> #include <errno.h> #include <unistd.h> #include <sys/ptrace.h> #include <sys/user.h> #include <sys/uio.h> #include <asm/unistd.h> #define ST0_EXP_ADD 10 void *scratch_page; void sighandler(int sig, siginfo_t *info, void *p) { ucontext_t *uc = p; printf("sighandler: extended_size: %d, xstate_size: %d\n", ((struct _fpstate *)uc->uc_mcontext.__fpregs)->sw_reserved.extended_size, ((struct _fpstate *)uc->uc_mcontext.__fpregs)->sw_reserved.xstate_size); uc->uc_mcontext.__fpregs->_st[0].__exponent += ST0_EXP_ADD; } int test_fp() { double num = 0.5; long ret; printf("pre-signal: %g\n", num); /* * This does kill(getpid(), SIGUSR1); with "num" being passed in AND * out of the floating point stack. We can therefore modify num by * changing st[0] when handling the signal. 
*/ #ifdef __i386__ asm volatile ( "int $0x80;" : "=t" (num), "=a" (ret) : "0" (num), "1" (__NR_kill), "b" (getpid()), "c" (SIGUSR1) : ); #else asm volatile ( "syscall;" : "=t" (num), "=a" (ret) : "0" (num), "1" (__NR_kill), "D" (getpid()), "S" (SIGUSR1) : "r11", "rcx"); #endif printf("post-signal: %g\n", num); if (num != pow(2, ST0_EXP_ADD - 1)) { printf("floating point register was not manipulated\n"); return 1; } return 0; } enum source { S_FPREGS = 0, S_FPXREGS = 1, S_GETREGS_FPREGS = 2, S_GETREGS_XFPREGS = 3, S_GETREGS_XSTATE = 4, }; int test_fp_ptrace(enum source source) { int pid, status, ret; pid = fork(); if (pid < 0) return 127; if (pid == 0) { /* child */ ptrace(PTRACE_TRACEME, 0, 0, 0); kill(getpid(), SIGSTOP); if (test_fp()) exit(1); exit(0); } /* Wait for child to stop itself */ do { ret = waitpid(pid, &status, 0); } while (ret < 0 && errno == EINTR); if (!WIFSTOPPED(status)) return 127; /* Continue until SIGUSR1 to self */ ptrace(PTRACE_CONT, pid, NULL, 0); do { ret = waitpid(pid, &status, 0); } while (ret < 0 && errno == EINTR); if (!WIFSTOPPED(status)) return 127; if (source == S_FPXREGS || source == S_GETREGS_XFPREGS) { #ifdef __i386__ struct user_fpxregs_struct *fpstate; struct iovec iov = { .iov_len = sizeof(*fpstate), }; int ret; fpstate = scratch_page + 4096 - iov.iov_len; iov.iov_base = fpstate; if (source == S_GETREGS_XFPREGS) ret = ptrace(PTRACE_GETREGSET, pid, NT_PRXFPREG, &iov); else ret = ptrace(PTRACE_GETFPXREGS, pid, NULL, fpstate); if (ret) { kill(pid, SIGKILL); if (errno == EINVAL) { printf("Getting FPX regs not supported\n"); return 0; } else { printf("Error getting FPX regs: %d\n", errno); return 127; } } ((struct _fpxreg*)&fpstate->st_space[0])->exponent += ST0_EXP_ADD; if (source == S_GETREGS_XFPREGS) ret = ptrace(PTRACE_SETREGSET, pid, NT_PRXFPREG, &iov); else ret = ptrace(PTRACE_SETFPXREGS, pid, NULL, fpstate); if (ret) return -127; #else printf("No FPXREGS on x86_64\n"); kill(pid, SIGKILL); return 127; #endif } else if (source 
== S_FPREGS || source == S_GETREGS_FPREGS) { struct _fpstate *fpstate; struct iovec iov = { .iov_len = sizeof(*fpstate), }; fpstate = scratch_page; // + 4096 - sizeof(*fpstate); iov.iov_base = fpstate; if (source == S_GETREGS_FPREGS) ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); else ret = ptrace(PTRACE_GETFPREGS, pid, NULL, fpstate); if (ret) { kill(pid, SIGKILL); if (errno == EINVAL) { printf("Getting FP regs not supported\n"); return 0; } else { printf("Error getting FPX regs: %d\n", errno); return 127; } } #ifdef __i386__ ((struct _fpreg*) &fpstate->_st[0])->exponent += ST0_EXP_ADD; #else ((struct _fpxreg*) &fpstate->st_space[0])->exponent += ST0_EXP_ADD; #endif if (source == S_GETREGS_FPREGS) ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); else ret = ptrace(PTRACE_SETFPREGS, pid, NULL, fpstate); if (ret) return 127; } else if (source == S_GETREGS_XSTATE) { #ifdef __i386__ struct user_fpxregs_struct *fpstate; #else struct user_fpregs_struct *fpstate; #endif struct iovec iov = { .iov_len = 4096, }; fpstate = scratch_page + 4096 - iov.iov_len; iov.iov_base = fpstate; ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); if (ret) { kill(pid, SIGKILL); if (errno == EINVAL) { printf("Getting XSTATE not supported\n"); return 0; } else { printf("Error getting XSTATE size: %d\n", errno); return 127; } } printf("host xstate size: %ld\n", iov.iov_len); /* Second time with the exact length (to test the kernel) */ fpstate = scratch_page + 4096 - iov.iov_len; iov.iov_base = fpstate; ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); if (ret) { printf("Error getting XSTATE: %d\n", errno); return 127; } fpstate = scratch_page + 4096 - iov.iov_len; iov.iov_base = fpstate; ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); if (ret) { kill(pid, SIGKILL); printf("Error getting XSTATE (with correct size): %d\n", errno); return 127; } #ifdef __i386__ ((struct _fpxreg *)&fpstate->st_space[0])->exponent += ST0_EXP_ADD; #else ((struct _fpxreg 
*)&fpstate->st_space[0])->exponent += ST0_EXP_ADD; #endif ret = ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); if (ret) { printf("Failed to set XSTATE: %d\n", errno); return 127; } } else { return 127; } /* Run until completion (without handling the signal) */ ptrace(PTRACE_CONT, pid, NULL, 0); do { ret = waitpid(pid, &status, 0); } while (ret < 0 && errno == EINTR); if (!WIFEXITED(status)) return 127; return WEXITSTATUS(status); } int main() { struct sigaction sa = { .sa_flags = SA_SIGINFO, .sa_handler = (void (*)(int))sighandler, }; int ret; scratch_page = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); munmap(scratch_page + 4096, 4096); sigaction(SIGUSR1, &sa, NULL); if (test_fp()) return 1; sa.sa_handler = SIG_DFL; sigaction(SIGUSR1, &sa, NULL); printf("\nmodify using ptrace PTRACE_SETFPREGS instead of sighandler:\n"); ret = test_fp_ptrace(S_FPREGS); if (ret) return ret; #ifdef __i386__ printf("\nmodify using ptrace PTRACE_SETFPXREGS instead of sighandler:\n"); ret = test_fp_ptrace(S_FPXREGS); if (ret) return ret; #endif printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_PRFPREG instead of sighandler:\n"); ret = test_fp_ptrace(S_GETREGS_FPREGS); if (ret) return ret; #ifdef __i386__ printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_XFPREGS instead of sighandler:\n"); ret = test_fp_ptrace(S_GETREGS_XFPREGS); if (ret) return ret; #endif printf("\nmodify using ptrace PTRACE_SETREGSET, via NT_X86_XSTATE instead of sighandler:\n"); ret = test_fp_ptrace(S_GETREGS_XSTATE); if (ret) return ret; return 0; }