Excerpts from Ganesh Goudar's message of November 8, 2021 6:38 pm:
> In realmode mce handler we use irq_work_queue() to defer
> the processing of mce events, irq_work_queue() can only
> be called when translation is enabled because it touches
> memory outside RMA, hence we enable translation before
> calling irq_work_queue and disable on return, though it
> is not safe to do in realmode.
> 
> To avoid this, program the decrementer and call the event
> processing functions from timer handler.
> 
> Signed-off-by: Ganesh Goudar <ganes...@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/machdep.h       |  2 +
>  arch/powerpc/include/asm/mce.h           |  2 +
>  arch/powerpc/include/asm/paca.h          |  1 +
>  arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
>  arch/powerpc/kernel/time.c               |  3 ++
>  arch/powerpc/platforms/pseries/pseries.h |  1 +
>  arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
>  arch/powerpc/platforms/pseries/setup.c   |  1 +
>  8 files changed, 34 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/machdep.h 
> b/arch/powerpc/include/asm/machdep.h
> index 764f2732a821..c89cc03c0f97 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -103,6 +103,8 @@ struct machdep_calls {
>       /* Called during machine check exception to retrive fixup address. */
>       bool            (*mce_check_early_recovery)(struct pt_regs *regs);
>  
> +     void            (*machine_check_log_err)(void);
> +
>       /* Motherboard/chipset features. This is a kind of general purpose
>        * hook used to control some machine specific features (like reset
>        * lines, chip power control, etc...).
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 331d944280b8..187810f13669 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct 
> machine_check_event *evt,
>  unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>  extern void mce_common_process_ue(struct pt_regs *regs,
>                                 struct mce_error_info *mce_err);
> +extern void machine_check_raise_dec_intr(void);

No new externs on function declarations, they tell me.

>  int mce_register_notifier(struct notifier_block *nb);
>  int mce_unregister_notifier(struct notifier_block *nb);
> +void mce_run_late_handlers(void);
>  #ifdef CONFIG_PPC_BOOK3S_64
>  void flush_and_reload_slb(void);
>  void flush_erat(void);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index dc05a862e72a..f49180f8c9be 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -280,6 +280,7 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_BOOK3S_64
>       struct mce_info *mce_info;
> +     atomic_t mces_to_process;
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  } ____cacheline_aligned;
>  
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index fd829f7f25a4..45baa062ebc0 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -28,19 +28,9 @@
>  
>  #include "setup.h"
>  
> -static void machine_check_process_queued_event(struct irq_work *work);
> -static void machine_check_ue_irq_work(struct irq_work *work);
>  static void machine_check_ue_event(struct machine_check_event *evt);
>  static void machine_process_ue_event(struct work_struct *work);
>  
> -static struct irq_work mce_event_process_work = {
> -        .func = machine_check_process_queued_event,
> -};
> -
> -static struct irq_work mce_ue_event_irq_work = {
> -     .func = machine_check_ue_irq_work,
> -};
> -
>  static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>  
>  static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
> @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event 
> *mce,
>       }
>  }
>  
> +/* Raise decrementer interrupt */
> +void machine_check_raise_dec_intr(void)
> +{
> +     set_dec(1);
> +}

The problem here is a timer can be scheduled (e.g., by an external 
interrupt if it gets taken before the decrementer, then uses a
timer) and that set decr > 1. See logic in decrementer_set_next_event.

I _think_ the way to get around this would be to have the machine check
just use arch_irq_work_raise.

Then you could also only call the mce handler inside the
test_irq_work_pending() check and avoid the added function call on every 
timer. That test should also be marked unlikely come to think of it, but
that's a side patchlet.

> +
>  /*
>   * Decode and save high level MCE information into per cpu buffer which
>   * is an array of machine_check_event structure.
> @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
>       if (mce->error_type == MCE_ERROR_TYPE_UE)
>               mce->u.ue_error.ignore_event = mce_err->ignore_event;
>  
> +     atomic_inc(&local_paca->mces_to_process);
> +
>       if (!addr)
>               return;
>  
> @@ -217,7 +215,7 @@ void release_mce_event(void)
>       get_mce_event(NULL, true);
>  }
>  
> -static void machine_check_ue_irq_work(struct irq_work *work)
> +static void machine_check_ue_work(void)
>  {
>       schedule_work(&mce_ue_event_work);
>  }
> @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct 
> machine_check_event *evt)
>              evt, sizeof(*evt));
>  
>       /* Queue work to process this event later. */
> -     irq_work_queue(&mce_ue_event_irq_work);
> +     machine_check_raise_dec_intr();
>  }
>  
>  /*
> @@ -249,7 +247,6 @@ void machine_check_queue_event(void)
>  {
>       int index;
>       struct machine_check_event evt;
> -     unsigned long msr;
>  
>       if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
>               return;
> @@ -263,20 +260,7 @@ void machine_check_queue_event(void)
>       memcpy(&local_paca->mce_info->mce_event_queue[index],
>              &evt, sizeof(evt));
>  
> -     /*
> -      * Queue irq work to process this event later. Before
> -      * queuing the work enable translation for non radix LPAR,
> -      * as irq_work_queue may try to access memory outside RMO
> -      * region.
> -      */
> -     if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
> -             msr = mfmsr();
> -             mtmsr(msr | MSR_IR | MSR_DR);
> -             irq_work_queue(&mce_event_process_work);
> -             mtmsr(msr);
> -     } else {
> -             irq_work_queue(&mce_event_process_work);
> -     }

Getting rid of these things would be very nice.

> +     machine_check_raise_dec_intr();
>  }
>  
>  void mce_common_process_ue(struct pt_regs *regs,
> @@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct 
> *work)
>   * process pending MCE event from the mce event queue. This function will be
>   * called during syscall exit.
>   */
> -static void machine_check_process_queued_event(struct irq_work *work)
> +static void machine_check_process_queued_event(void)
>  {
>       int index;
>       struct machine_check_event *evt;
> @@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct 
> irq_work *work)
>       }
>  }
>  
> +void mce_run_late_handlers(void)
> +{
> +     if (unlikely(atomic_read(&local_paca->mces_to_process))) {
> +             if (ppc_md.machine_check_log_err)
> +                     ppc_md.machine_check_log_err();
> +             machine_check_process_queued_event();
> +             machine_check_ue_work();
> +             atomic_dec(&local_paca->mces_to_process);
> +     }
> +}
> +
>  void machine_check_print_event_info(struct machine_check_event *evt,
>                                   bool user_mode, bool in_guest)
>  {
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 934d8ae66cc6..2dc09d75d77c 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>               irq_work_run();
>       }
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +     mce_run_late_handlers();
> +#endif

It looks like if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) should work here?

>       now = get_tb();
>       if (now >= *next_tb) {
>               *next_tb = ~(u64)0;
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 3544778e06d0..0dc4f1027b30 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -21,6 +21,7 @@ struct pt_regs;
>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
>  extern long pseries_machine_check_realmode(struct pt_regs *regs);
> +extern void pSeries_machine_check_log_err(void);

extern can be removed.

Thanks,
Nick

>  
>  #ifdef CONFIG_SMP
>  extern void smp_init_pseries(void);
> diff --git a/arch/powerpc/platforms/pseries/ras.c 
> b/arch/powerpc/platforms/pseries/ras.c
> index 56092dccfdb8..8613f9cc5798 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
>  
>  static int ras_check_exception_token;
>  
> -static void mce_process_errlog_event(struct irq_work *work);
> -static struct irq_work mce_errlog_process_work = {
> -     .func = mce_process_errlog_event,
> -};
> -
>  #define EPOW_SENSOR_TOKEN    9
>  #define EPOW_SENSOR_INDEX    0
>  
> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, 
> struct rtas_error_log *errp)
>       error_type = mce_log->error_type;
>  
>       disposition = mce_handle_err_realmode(disposition, error_type);
> -
> -     /*
> -      * Enable translation as we will be accessing per-cpu variables
> -      * in save_mce_event() which may fall outside RMO region, also
> -      * leave it enabled because subsequently we will be queuing work
> -      * to workqueues where again per-cpu variables accessed, besides
> -      * fwnmi_release_errinfo() crashes when called in realmode on
> -      * pseries.
> -      * Note: All the realmode handling like flushing SLB entries for
> -      *       SLB multihit is done by now.
> -      */
>  out:
> -     msr = mfmsr();
> -     mtmsr(msr | MSR_IR | MSR_DR);
> -
>       disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>                                             disposition);
> -
> -     /*
> -      * Queue irq work to log this rtas event later.
> -      * irq_work_queue uses per-cpu variables, so do this in virt
> -      * mode as well.
> -      */
> -     irq_work_queue(&mce_errlog_process_work);
> -
> -     mtmsr(msr);
> -
>       return disposition;
>  }
>  
>  /*
>   * Process MCE rtas errlog event.
>   */
> -static void mce_process_errlog_event(struct irq_work *work)
> +void pSeries_machine_check_log_err(void)
>  {
>       struct rtas_error_log *err;
>  
> diff --git a/arch/powerpc/platforms/pseries/setup.c 
> b/arch/powerpc/platforms/pseries/setup.c
> index f79126f16258..54bd7bdb7e92 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -1085,6 +1085,7 @@ define_machine(pseries) {
>       .system_reset_exception = pSeries_system_reset_exception,
>       .machine_check_early    = pseries_machine_check_realmode,
>       .machine_check_exception = pSeries_machine_check_exception,
> +     .machine_check_log_err  = pSeries_machine_check_log_err,
>  #ifdef CONFIG_KEXEC_CORE
>       .machine_kexec          = pSeries_machine_kexec,
>       .kexec_cpu_down         = pseries_kexec_cpu_down,
> -- 
> 2.26.2
> 
> 

Reply via email to