On Fri, 2 Jun 2017 09:02:10 +0200
Heiko Carstens <heiko.carst...@de.ibm.com> wrote:

> On Thu, Jun 01, 2017 at 12:46:51PM +0200, Martin Schwidefsky wrote:
> > > Unfortunately, converting all page tables to 4k pgste page tables is
> > > not possible without provoking various race conditions.  
> > 
> > That is one approach we tried, and it was found to be buggy. The point is that
> > you are not allowed to reallocate a page table while a VMA exists that is
> > in the address range of that page table.
> > 
> > Another approach we tried is to use an ELF flag on the qemu executable.
> > That does not work either because fs/exec.c allocates and populates the
> > new mm struct for the argument pages before fs/binfmt_elf.c comes into
> > play.  
> 
> How about if you would fail the system call within arch_check_elf() if you
> detect that the binary requires pgstes (as indicated by elf flags) and then
> restart the system call?
> 
> That is: arch_check_elf() e.g. would set a thread flag that future mm's
> should be allocated with pgstes. Then do_execve() would clean up everything
> and return to entry.S. Upon return to userspace we detect this condition
> and simply restart the system call, similar to signals vs -ERESTARTSYS.
> 
> That would make do_execve() clean up everything and upon reentering it would
> allocate an mm with the pgste flag set.
> 
> Maybe this is a bit over-simplified, but might work.

This is not over-simplified at all, that does work:
--
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 69a77eecaec1..7bd182676ddd 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES
 
 config S390
        def_bool y
+       select ARCH_BINFMT_ELF_STATE
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index e8f623041769..79911231f9e6 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -151,6 +151,28 @@ extern unsigned int vdso_enabled;
         && (x)->e_ident[EI_CLASS] == ELF_CLASS)
 #define compat_start_thread    start_thread31
 
+struct arch_elf_state {
+};
+
+#define INIT_ARCH_ELF_STATE { }
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state)       \
+({                                                             \
+       struct elf64_hdr *hdr = (void*) ehdr;                   \
+       int _rc = 0;                                            \
+       if (hdr->e_ident[EI_CLASS] == ELFCLASS64 &&             \
+           (hdr->e_flags & 0x00000002) &&                      \
+           !page_table_allocate_pgste &&                       \
+           !current->mm->context.alloc_pgste) {                \
+               current->mm->context.alloc_pgste = 1;           \
+               set_pt_regs_flag(task_pt_regs(current),         \
+                                PIF_SYSCALL_RESTART);          \
+               _rc = -EAGAIN;                                  \
+       }                                                       \
+       _rc;                                                    \
+})
+
 /* For SVR4/S390 the function pointer to be registered with `atexit` is
    passed in R14. */
 #define ELF_PLAT_INIT(_r, load_addr) \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c119d564d8f2..268a5d22ce1b 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -25,7 +25,8 @@ static inline int init_new_context(struct task_struct *tsk,
        mm->context.gmap_asce = 0;
        mm->context.flush_mm = 0;
 #ifdef CONFIG_PGSTE
-       mm->context.alloc_pgste = page_table_allocate_pgste;
+       mm->context.alloc_pgste = page_table_allocate_pgste ||
+               current->mm->context.alloc_pgste;
        mm->context.has_pgste = 0;
        mm->context.use_skey = 0;
        mm->context.use_cmma = 0;
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 99bc456cc26a..24baa80f7af6 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -11,9 +11,11 @@
 
 #define PIF_SYSCALL            0       /* inside a system call */
 #define PIF_PER_TRAP           1       /* deliver sigtrap on return to user */
+#define PIF_SYSCALL_RESTART    2       /* restart the current system call */
 
 #define _PIF_SYSCALL           _BITUL(PIF_SYSCALL)
 #define _PIF_PER_TRAP          _BITUL(PIF_PER_TRAP)
+#define _PIF_SYSCALL_RESTART   _BITUL(PIF_SYSCALL_RESTART)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 0c2c3b8bfc9a..8c824b32527a 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -52,7 +52,7 @@ _TIF_TRACE    = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
                   _TIF_SYSCALL_TRACEPOINT)
 _CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
                   _CIF_ASCE_SECONDARY | _CIF_FPU)
-_PIF_WORK      = (_PIF_PER_TRAP)
+_PIF_WORK      = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 
 #define BASED(name) name-cleanup_critical(%r13)
 
@@ -342,6 +342,8 @@ ENTRY(system_call)
        jo      .Lsysc_guarded_storage
        TSTMSK  __PT_FLAGS(%r11),_PIF_PER_TRAP
        jo      .Lsysc_singlestep
+       TSTMSK  __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
+       jo      .Lsysc_syscall_restart
        TSTMSK  __TI_flags(%r12),_TIF_SIGPENDING
        jo      .Lsysc_sigpending
        TSTMSK  __TI_flags(%r12),_TIF_NOTIFY_RESUME
@@ -434,6 +436,15 @@ ENTRY(system_call)
        jg      do_per_trap
 
 #
+# _PIF_SYSCALL_RESTART is set, repeat the current system call
+#
+.Lsysc_syscall_restart:
+       ni      __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART
+       lmg     %r1,%r7,__PT_R1(%r11)   # load svc arguments
+       lg      %r2,__PT_ORIG_GPR2(%r11)
+       j       .Lsysc_do_svc
+
+#
 # call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
 # and after the system call
 #
-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

Reply via email to