From: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com>

Once we get a high level MCE error event from opal, process it and figure
out whether it is recoverable or not. If yes, take corrective actions.
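The flow, in brief: opal_machine_check_early() first lets handle_mce_errors()
attempt recovery (UE handling, SLB/ERAT flush-and-reload, TLB flush), then
queues the byte-swapped event with set_mce_event() so the regular MCE
reporting path can pick it up later. Below is a minimal sketch of such a
later consumer, assuming the existing get_mce_event() /
machine_check_print_event_info() / release_mce_event() interfaces from
asm/mce.h are used unchanged; the helper name is illustrative and not part
of this patch.

#include <linux/printk.h>
#include <asm/mce.h>

/*
 * Illustrative sketch only, not part of this patch: report the event
 * queued by set_mce_event() once we are back in a context where printing
 * is safe.  Only the interfaces declared in asm/mce.h are assumed.
 */
static void example_report_queued_mce(void)
{
	struct OpalMachineCheckEvent evt;

	/* Copy out the queued event without releasing the per-cpu slot yet. */
	if (!get_mce_event(&evt, false))
		return;

	if (evt.disposition == MCE_DISPOSITION_RECOVERED)
		pr_info("Machine check recovered, logging event for the record\n");

	machine_check_print_event_info(&evt);

	/* Release the slot so it can be reused for the next event. */
	release_mce_event();
}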
TODO:
- Rework on handling of asynchronous MCE errors.
- Update opal_recover_mce() to ignore async errors.
- Update flush_and_reload_slb() to avoid SLB reload in radix mode.

Signed-off-by: Mahesh Salgaonkar <mah...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/mce.h        |  3 +++
 arch/powerpc/kernel/mce.c             | 26 +++++++++++++++++++++++
 arch/powerpc/kernel/mce_power.c       | 38 +++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal.c |  2 ++
 4 files changed, 69 insertions(+)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 36db6b0..69e4a42 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -88,9 +88,12 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t nip,
 			   uint64_t addr);
 extern int get_mce_event(struct OpalMachineCheckEvent *mce, bool release);
+extern int set_mce_event(struct OpalMachineCheckEvent *mce);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct OpalMachineCheckEvent *evt);
 extern uint64_t get_mce_fault_addr(struct OpalMachineCheckEvent *evt);
+extern long handle_mce_errors(struct pt_regs *regs,
+			      struct OpalMachineCheckEvent *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 51a7c64..36da14a3 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -166,6 +166,32 @@ int get_mce_event(struct OpalMachineCheckEvent *mce, bool release)
 	return ret;
 }
 
+int set_mce_event(struct OpalMachineCheckEvent *mce)
+{
+	int index = __this_cpu_inc_return(mce_nest_count) - 1;
+	struct OpalMachineCheckEvent *mc_evt = this_cpu_ptr(&mce_event[index]);
+	int ret = 0;
+
+	/* Sanity check */
+	if (index < 0)
+		return ret;
+
+	/* Check if we have MCE info slot within array limit. */
+	if (index < MAX_MC_EVT) {
+		/* Copy the event structure and release the original */
+		if (mce) {
+			*mc_evt = *mce;
+			/* endian conversions */
+			mc_evt->srr0 = be64_to_cpu(mce->srr0);
+			mc_evt->srr1 = be64_to_cpu(mce->srr1);
+			mc_evt->u.ue_error.effective_address =
+				be64_to_cpu(mce->u.ue_error.effective_address);
+		}
+		ret = 1;
+	}
+	return ret;
+}
+
 void release_mce_event(void)
 {
 	get_mce_event(NULL, true);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 7353991..91ed2ef 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -372,3 +372,41 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
 	save_mce_event(regs, handled, &mce_error_info, nip, addr);
 	return handled;
 }
+
+static long flush_tlb(void)
+{
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+		cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
+		handled = 1;
+	}
+	return handled;
+}
+
+long handle_mce_errors(struct pt_regs *regs, struct OpalMachineCheckEvent *evt)
+{
+	long handled = 1;
+
+	if (evt->disposition == MCE_DISPOSITION_RECOVERED)
+		return handled;
+
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		handled = mce_handle_ue_error(regs);
+		break;
+	case MCE_ERROR_TYPE_SLB:
+	case MCE_ERROR_TYPE_ERAT:
+		flush_and_reload_slb();
+		handled = 1;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		handled = flush_tlb();
+		break;
+	default:
+		handled = 0;
+	}
+	if (handled)
+		evt->disposition = MCE_DISPOSITION_RECOVERED;
+	return handled;
+}
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 263c57e..f1115c4 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -501,6 +501,8 @@ int opal_machine_check_early(struct pt_regs *regs, long *handled)
 	if (rc != OPAL_SUCCESS)
 		return -1;
 
+	*handled = handle_mce_errors(regs, &evt);
+	set_mce_event(&evt);
 	return 0;
 }