On Wed, 13 Sep 2017 02:05:53 +1000
Nicholas Piggin <npig...@gmail.com> wrote:

> There are two complications. The first is that sreset from stop states
> come in with SRR1 set to do a powersave wakeup, with an sreset reason
> encoded.
> 
> The second is that threads on the same core can't be signalled directly
> so we must designate a bounce CPU to reflect the IPI back.

Here is an updated Linux patch for the latest OPAL patch. This has
a few assorted fixes as well to make it work nicely; I have rolled them
into one patch here to make it easy to apply for testing the OPAL patch.

Thanks,
Nick

---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 +
 arch/powerpc/kernel/irq.c                      | 18 ++++++
 arch/powerpc/kernel/watchdog.c                 | 30 +++++++--
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/powernv.h       |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  3 +
 arch/powerpc/platforms/powernv/smp.c           | 89 ++++++++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                       | 17 +++--
 9 files changed, 151 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..e39f4236b413 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP                         142
 #define OPAL_XIVE_RESERVED3                    143
 #define OPAL_XIVE_RESERVED4                    144
+#define OPAL_SIGNAL_SYSTEM_RESET                145
 #define OPAL_NPU_INIT_CONTEXT                  146
 #define OPAL_NPU_DESTROY_CONTEXT               147
 #define OPAL_NPU_MAP_LPAR                      148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 
*psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
                                   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..5f2c0367bab2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,10 +407,28 @@ static const u8 srr1_to_lazyirq[0x10] = {
        PACA_IRQ_HMI,
        0, 0, 0, 0, 0 };
 
+/*
+ * System reset does not have to wait for Linux interrupts
+ * to be re-enabled, so just replay it now.
+ */
+static noinline void replay_system_reset(void)
+{
+       struct pt_regs regs;
+
+       ppc_save_regs(&regs);
+
+       get_paca()->in_nmi = 1;
+       system_reset_exception(&regs);
+       get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
        unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
+       if (unlikely(idx == 4))
+               replay_system_reset();
+
        /*
         * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
         * so this can be called unconditionally with srr1 wake reason.
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..a6aa85b0cdeb 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -61,6 +61,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
  */
 static unsigned long __wd_smp_lock;
 static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck_tmp;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
@@ -97,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
        else
                dump_stack();
 
-       if (hardlockup_panic)
-               nmi_panic(regs, "Hard LOCKUP");
+       /* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -136,16 +136,29 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 
        /*
         * Try to trigger the stuck CPUs.
+        *
+        * There is a bit of a hack for OPAL here because it can not
+        * signal sibling threads. Don't try to signal those or mark
+        * them stuck, in the hope that another core will notice.
         */
+       cpumask_clear(&wd_smp_cpus_stuck_tmp);
        for_each_cpu(c, &wd_smp_cpus_pending) {
                if (c == cpu)
                        continue;
-               smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+               if (firmware_has_feature(FW_FEATURE_OPAL)) {
+                       if (cpumask_test_cpu(c, cpu_sibling_mask(cpu)))
+                               continue;
+               }
+               cpumask_set_cpu(c, &wd_smp_cpus_stuck_tmp);
+               if (!sysctl_hardlockup_all_cpu_backtrace)
+                       smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
        }
-       smp_flush_nmi_ipi(1000000);
 
        /* Take the stuck CPUs out of the watch group */
-       set_cpumask_stuck(&wd_smp_cpus_pending, tb);
+       set_cpumask_stuck(&wd_smp_cpus_stuck_tmp, tb);
+
+       if (!sysctl_hardlockup_all_cpu_backtrace)
+               smp_flush_nmi_ipi(1000000);
 
        wd_smp_unlock(&flags);
 
@@ -275,9 +288,12 @@ void arch_touch_nmi_watchdog(void)
 {
        unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
        int cpu = smp_processor_id();
+       u64 tb = get_tb();
 
-       if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-               watchdog_timer_interrupt(cpu);
+       if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+               per_cpu(wd_timer_tb, cpu) = tb;
+               wd_smp_clear_cpu_pending(cpu, tb);
+       }
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,            
OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,               OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,                      OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,                      OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,            OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,               OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,            OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,                   OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h 
b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c 
b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
        ppc_md.restart = pnv_restart;
        pm_power_off = pnv_power_off;
        ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+       ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
        ppc_md.machine_check_exception = opal_machine_check;
        ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
        ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c 
b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..9da97962c93a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,93 @@ static void __init pnv_smp_probe(void)
        }
 }
 
+static int nmi_ipi_bounce_cpu;
+static int nmi_ipi_bounce_target_core;
+static int nmi_ipi_bounce_target_exclude;
+
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+       if (nmi_ipi_bounce_cpu == smp_processor_id()) {
+               int c;
+               nmi_ipi_bounce_cpu = -1;
+               for_each_online_cpu(c) {
+                       if (!cpumask_test_cpu(c, cpu_sibling_mask(
+                                               nmi_ipi_bounce_target_core)))
+                               continue;
+                       if (c == nmi_ipi_bounce_target_exclude)
+                               continue;
+                       opal_signal_system_reset(
+                                       get_hard_smp_processor_id(c));
+                       /* can't do much with failure here */
+               }
+       }
+
+       if (smp_handle_nmi_ipi(regs))
+               return 1;
+       return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+       int64_t rc;
+
+       if (cpu >= 0) {
+               rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+               if (rc == OPAL_SUCCESS)
+                       return 1;
+               return 0;
+       } else {
+               int c;
+
+               /*
+                * Some platforms can not send NMI to sibling threads in
+                * the same core. We can designate one inter-core target
+                * to bounce NMIs back to our sibling threads.
+                */
+
+               if (cpu >= 0) {
+                       /*
+                        * Don't support bouncing unicast NMIs yet (because
+                        * that would have to raise an NMI on an unrelated
+                        * CPU. Revisit this if callers start using unicast.
+                        */
+                       return 0;
+               }
+
+               nmi_ipi_bounce_cpu = -1;
+               nmi_ipi_bounce_target_core = -1;
+               nmi_ipi_bounce_target_exclude = -1;
+
+               for_each_online_cpu(c) {
+                       if (cpumask_test_cpu(c, 
cpu_sibling_mask(smp_processor_id())))
+                               continue;
+
+                       if (nmi_ipi_bounce_cpu == -1) {
+                               nmi_ipi_bounce_cpu = c;
+                               nmi_ipi_bounce_target_core = smp_processor_id();
+                               if (cpu == NMI_IPI_ALL_OTHERS)
+                                       nmi_ipi_bounce_target_exclude = 
smp_processor_id();
+                               smp_mb();
+                       } else {
+                               rc = opal_signal_system_reset(
+                                               get_hard_smp_processor_id(c));
+                               if (rc != OPAL_SUCCESS)
+                                       return 0;
+                       }
+               }
+
+               if (nmi_ipi_bounce_cpu == -1)
+                       return 0; /* could not find a bouncer */
+               rc = opal_signal_system_reset(
+                               get_hard_smp_processor_id(nmi_ipi_bounce_cpu));
+               if (rc != OPAL_SUCCESS)
+                       return 0;
+               return 1;
+       }
+
+       return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
        .message_pass   = NULL, /* Use smp_muxed_ipi_message_pass */
        .cause_ipi      = NULL, /* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +395,8 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+       if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET))
+               pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
        smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
        secondary = 1;
+       spin_begin();
        while (secondary && !xmon_gate) {
                if (in_xmon == 0) {
-                       if (fromipi)
+                       if (fromipi) {
+                               spin_end();
                                goto leave;
+                       }
                        secondary = test_and_set_bit(0, &in_xmon);
                }
-               barrier();
+               spin_cpu_relax();
+               touch_nmi_watchdog();
        }
+       spin_end();
 
        if (!secondary && !xmon_gate) {
                /* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
                mb();
                xmon_gate = 1;
                barrier();
+               touch_nmi_watchdog();
        }
 
  cmdloop:
        while (in_xmon) {
                if (secondary) {
+                       spin_begin();
                        if (cpu == xmon_owner) {
                                if (!test_and_set_bit(0, &xmon_taken)) {
                                        secondary = 0;
+                                       spin_end();
                                        continue;
                                }
                                /* missed it */
                                while (cpu == xmon_owner)
-                                       barrier();
+                                       spin_cpu_relax();
                        }
-                       barrier();
+                       spin_cpu_relax();
+                       touch_nmi_watchdog();
                } else {
                        cmd = cmds(regs);
                        if (cmd != 0) {
-- 
2.13.3

Reply via email to