Excerpts from Michael Ellerman's message of February 11, 2021 11:51 pm:
> When we enabled STRICT_KERNEL_RWX we received some reports of boot
> failures when using the Hash MMU and running under phyp. The crashes
> are intermittent, and often exhibit as a completely unresponsive
> system, or possibly an oops.
> 
> One example, which was caught in xmon:
> 
>   [   14.068327][    T1] devtmpfs: mounted
>   [   14.069302][    T1] Freeing unused kernel memory: 5568K
>   [   14.142060][  T347] BUG: Unable to handle kernel instruction fetch
>   [   14.142063][    T1] Run /sbin/init as init process
>   [   14.142074][  T347] Faulting instruction address: 0xc000000000004400
>   cpu 0x2: Vector: 400 (Instruction Access) at [c00000000c7475e0]
>       pc: c000000000004400: exc_virt_0x4400_instruction_access+0x0/0x80
>       lr: c0000000001862d4: update_rq_clock+0x44/0x110
>       sp: c00000000c747880
>      msr: 8000000040001031
>     current = 0xc00000000c60d380
>     paca    = 0xc00000001ec9de80  irqmask: 0x03  irq_happened: 0x01
>     pid     = 347, comm = kworker/2:1
>   ...
>   enter ? for help
>   [c00000000c747880] c0000000001862d4 update_rq_clock+0x44/0x110 (unreliable)
>   [c00000000c7478f0] c000000000198794 update_blocked_averages+0xb4/0x6d0
>   [c00000000c7479f0] c000000000198e40 update_nohz_stats+0x90/0xd0
>   [c00000000c747a20] c0000000001a13b4 _nohz_idle_balance+0x164/0x390
>   [c00000000c747b10] c0000000001a1af8 newidle_balance+0x478/0x610
>   [c00000000c747be0] c0000000001a1d48 pick_next_task_fair+0x58/0x480
>   [c00000000c747c40] c000000000eaab5c __schedule+0x12c/0x950
>   [c00000000c747cd0] c000000000eab3e8 schedule+0x68/0x120
>   [c00000000c747d00] c00000000016b730 worker_thread+0x130/0x640
>   [c00000000c747da0] c000000000174d50 kthread+0x1a0/0x1b0
>   [c00000000c747e10] c00000000000e0f0 ret_from_kernel_thread+0x5c/0x6c
> 
> This shows that CPU 2, which was idle, woke up and then appears to
> randomly take an instruction fault on a completely valid area of
> kernel text.
> 
> The cause turns out to be the call to hash__mark_rodata_ro(), late in
> boot. Due to the way we layout text and rodata, that function actually
> changes the permissions for all of text and rodata to read-only plus
> execute.
> 
> To do the permission change we use a hypervisor call, H_PROTECT. On
> phyp that appears to be implemented by briefly removing the mapping of
> the kernel text, before putting it back with the updated permissions.
> If any other CPU is executing during that window, it will see spurious
> faults on the kernel text and/or data, leading to crashes.
> 
> To fix it we use stop machine to collect all other CPUs, and then have
> them drop into real mode (MMU off), while we change the mapping. That
> way they are unaffected by the mapping temporarily disappearing.
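The stop_machine() glue isn't in the hunks quoted below, so for anyone
following along, here is roughly the shape I'd expect the rendezvous to
take. chmem_parms, chmem_lock, change_memory_range() and
chmem_secondary_loop() are from the patch; the rest is my guess at the
glue (modulo locking details), not the actual code:

static int change_memory_range_fn(void *data)
{
	struct change_memory_parms *parms = data;

	// Secondaries rendezvous in real mode and wait there
	if (parms->master_cpu != smp_processor_id())
		return chmem_secondary_loop(parms);

	// Wait until this is the only CPU left running with the MMU on
	while (atomic_read(&parms->cpu_counter) > 1)
		barrier();

	change_memory_range(parms->start, parms->end, parms->step,
			    parms->newpp);

	mb();

	// Zeroing the counter releases the secondaries from real mode
	atomic_set(&parms->cpu_counter, 0);

	return 0;
}

with the caller doing something like:

	mutex_lock(&chmem_lock);

	chmem_parms.start = start;
	chmem_parms.end = end;
	chmem_parms.step = step;
	chmem_parms.newpp = newpp;
	chmem_parms.master_cpu = smp_processor_id();
	chmem_parms.nr_cpus = num_online_cpus();
	atomic_set(&chmem_parms.cpu_counter, chmem_parms.nr_cpus);

	// Runs change_memory_range_fn() on every online CPU, IRQs off
	stop_machine(change_memory_range_fn, &chmem_parms, cpu_online_mask);

	mutex_unlock(&chmem_lock);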
> 
> Signed-off-by: Michael Ellerman <m...@ellerman.id.au>
> ---
>  arch/powerpc/mm/book3s64/hash_pgtable.c | 105 +++++++++++++++++++++++-
>  1 file changed, 104 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 3663d3cdffac..01de985df2c4 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -8,6 +8,7 @@
>  #include <linux/sched.h>
>  #include <linux/mm_types.h>
>  #include <linux/mm.h>
> +#include <linux/stop_machine.h>
>  
>  #include <asm/sections.h>
>  #include <asm/mmu.h>
> @@ -400,6 +401,19 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  #ifdef CONFIG_STRICT_KERNEL_RWX
> +
> +struct change_memory_parms {
> +	unsigned long start, end, newpp;
> +	unsigned int step, nr_cpus, master_cpu;
> +	atomic_t cpu_counter;
> +};
> +
> +// We'd rather this was on the stack but it has to be in the RMO
> +static struct change_memory_parms chmem_parms;
> +
> +// And therefore we need a lock to protect it from concurrent use
> +static DEFINE_MUTEX(chmem_lock);
> +
>  static void change_memory_range(unsigned long start, unsigned long end,
>  				unsigned int step, unsigned long newpp)
>  {
> @@ -414,6 +428,73 @@ static void change_memory_range(unsigned long start, unsigned long end,
>  			       mmu_kernel_ssize);
>  }
>  
> +static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
> +{
> +	unsigned long msr, tmp, flags;
> +	int *p;
> +
> +	p = &parms->cpu_counter.counter;
> +
> +	local_irq_save(flags);
> +	__hard_EE_RI_disable();
> +
> +	asm volatile (
> +	// Switch to real mode and leave interrupts off
> +	"mfmsr	%[msr]			;"
> +	"li	%[tmp], %[MSR_IR_DR]	;"
> +	"andc	%[tmp], %[msr], %[tmp]	;"
> +	"mtmsrd	%[tmp]			;"
> +
> +	// Tell the master we are in real mode
> +	"1:				"
> +	"lwarx	%[tmp], 0, %[p]		;"
> +	"addic	%[tmp], %[tmp], -1	;"
> +	"stwcx.	%[tmp], 0, %[p]		;"
> +	"bne-	1b			;"
> +
> +	// Spin until the counter goes to zero
> +	"2:				;"
> +	"lwz	%[tmp], 0(%[p])		;"
> +	"cmpwi	%[tmp], 0		;"
> +	"bne-	2b			;"
> +
> +	// Switch back to virtual mode
> +	"mtmsrd	%[msr]			;"
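Stripped of the real-mode switch, the handshake those secondaries run is
just an atomic decrement followed by a spin; in C it is nothing more than
this (a sketch only, chmem_rendezvous() is a name I made up, and it can't
actually be written this way: once MSR[IR] and MSR[DR] are clear, nothing
outside the RMA, including tracing hooks the compiler might emit, may be
touched, which is the whole reason for the asm):

static void chmem_rendezvous(struct change_memory_parms *parms)
{
	// Tell the master we have reached real mode
	atomic_dec(&parms->cpu_counter);

	// Spin until the master zeroes the counter
	while (atomic_read(&parms->cpu_counter) != 0)
		barrier();
}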
Pity we don't have something that can switch to the emergency stack, so
that we could write this stuff in C.

How's something like this suit you?

---
 arch/powerpc/kernel/misc_64.S | 22 +++++++++++++++++++++
 arch/powerpc/kernel/process.c | 37 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 070465825c21..5e911d0b0b16 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -27,6 +27,28 @@
 
 	.text
 
+#ifdef CONFIG_PPC_BOOK3S_64
+_GLOBAL(__call_realmode)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
+	mr	r1,r5
+	mtctr	r3
+	mr	r3,r4
+	mfmsr	r4
+	xori	r4,r4,(MSR_IR|MSR_DR)
+	mtmsrd	r4
+	bctrl
+	mfmsr	r4
+	xori	r4,r4,(MSR_IR|MSR_DR)
+	mtmsrd	r4
+	ld	r1,0(r1)
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+#endif
+
 _GLOBAL(call_do_softirq)
 	mflr	r0
 	std	r0,16(r1)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a66f435dabbf..260d60f665a3 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2197,6 +2197,43 @@ void show_stack(struct task_struct *tsk, unsigned long *stack,
 	put_task_stack(tsk);
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+int __call_realmode(int (*fn)(void *arg), void *arg, void *sp);
+
+/*
+ * XXX: find a better place for this
+ *
+ * Executing C code in real mode in general Book3S-64 code can only be done
+ * via this function, which switches the stack to one inside the real mode
+ * area, which may cover only a small first part of real memory on hash
+ * guest LPARs.
+ *
+ * fn must be marked NOKPROBE_SYMBOL(), must not access vmalloc or anything
+ * outside the RMA, probably shouldn't enable the MMU or interrupts, etc.,
+ * and must be very careful about calling other generic kernel or powerpc
+ * functions.
+ */
+int call_realmode(int (*fn)(void *arg), void *arg)
+{
+	unsigned long flags;
+	void *cursp, *emsp;
+	int ret;
+
+	/*
+	 * The stack switch is only really required for HPT LPAR, but do it
+	 * everywhere to help test coverage of tricky code.
+	 */
+	cursp = (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+	emsp = (void *)(local_paca->emergency_sp - THREAD_SIZE);
+
+	/* XXX check_stack_overflow(); */
+
+	if (WARN_ON_ONCE(cursp == emsp))
+		return -EBUSY;
+
+	local_irq_save(flags);
+	hard_irq_disable();
+
+	ret = __call_realmode(fn, arg, emsp);
+
+	local_irq_restore(flags);
+
+	return ret;
+}
+#endif
+
 #ifdef CONFIG_PPC64
 /* Called with hard IRQs off */
 void notrace __ppc64_runlatch_on(void)
-- 
2.23.0
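To illustrate the intended use: with call_realmode() the secondary side of
your patch could then become plain C, along the lines of the sketch below.
Untested; chmem_realmode_wait is a made-up name, and it would still need
the NOKPROBE_SYMBOL()/notrace treatment described in the comment above:

static int notrace chmem_realmode_wait(void *data)
{
	struct change_memory_parms *parms = data;

	// Tell the master we are in real mode
	atomic_dec(&parms->cpu_counter);

	// Spin here, MMU off, until the master has updated the HPTEs
	while (atomic_read(&parms->cpu_counter) != 0)
		barrier();

	return 0;
}

static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
{
	// call_realmode() does the stack switch, the hard IRQ disable and
	// the MSR[IR/DR] dance that the inline asm version did by hand
	return call_realmode(chmem_realmode_wait, parms);
}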