The partition suspend sequence as specified in the platform
architecture requires that all active processor threads call
H_JOIN, which:

- suspends the calling thread until it is the target of
  an H_PROD; or
- immediately returns H_CONTINUE, if the calling thread is the last to
  call H_JOIN. This thread is expected to call ibm,suspend-me to
  completely suspend the partition.

Upon returning from ibm,suspend-me the calling thread must wake all
others using H_PROD.

rtas_ibm_suspend_me_unsafe() uses on_each_cpu() to implement this
protocol, but because of its synchronizing nature this is susceptible
to deadlock versus users of stop_machine() or other callers of
on_each_cpu().

Not only is stop_machine() intended for use cases like this, it
handles error propagation and allows us to keep the data shared
between CPUs minimal: a single atomic counter which ensures exactly
one CPU will wake the others from their joined states.

Switch the migration code to use stop_machine() and a less complex
local implementation of the H_JOIN/ibm,suspend-me logic, which
carries additional benefits:

- more informative error reporting, appropriately ratelimited
- resets the lockup detector / watchdog on resume to prevent lockup
  warnings when the OS has been suspended for a time exceeding the
  threshold.

Fixes: 91dc182ca6e2 ("[PATCH] powerpc: special-case ibm,suspend-me RTAS call")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/mobility.c | 132 ++++++++++++++++++++--
 1 file changed, 125 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 1b8ae221b98a..44ca7d4e143d 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -12,9 +12,11 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
+#include <linux/nmi.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/stat.h>
+#include <linux/stop_machine.h>
 #include <linux/completion.h>
 #include <linux/device.h>
 #include <linux/delay.h>
@@ -412,6 +414,128 @@ static int wait_for_vasi_session_suspending(u64 handle)
        return ret;
 }
 
+static void prod_single(unsigned int target_cpu)
+{
+       long hvrc;
+       int hwid;
+
+       hwid = get_hard_smp_processor_id(target_cpu);
+       hvrc = plpar_hcall_norets(H_PROD, hwid);
+       if (hvrc == H_SUCCESS)
+               return;
+       pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
+                          target_cpu, hwid, hvrc);
+}
+
+static void prod_others(void)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (cpu != smp_processor_id())
+                       prod_single(cpu);
+       }
+}
+
+static u16 clamp_slb_size(void)
+{
+       u16 prev = mmu_slb_size;
+
+       slb_set_size(SLB_MIN_SIZE);
+
+       return prev;
+}
+
+static int do_suspend(void)
+{
+       u16 saved_slb_size;
+       int status;
+       int ret;
+
+       pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());
+
+       /*
+        * The destination processor model may have fewer SLB entries
+        * than the source. We reduce mmu_slb_size to a safe minimum
+        * before suspending in order to minimize the possibility of
+        * programming non-existent entries on the destination. If
+        * suspend fails, we restore it before returning. On success
+        * the OF reconfig path will update it from the new device
+        * tree after resuming on the destination.
+        */
+       saved_slb_size = clamp_slb_size();
+
+       ret = rtas_ibm_suspend_me(&status);
+       if (ret != 0) {
+               pr_err("ibm,suspend-me error: %d\n", status);
+               slb_set_size(saved_slb_size);
+       }
+
+       return ret;
+}
+
+static int do_join(void *arg)
+{
+       atomic_t *counter = arg;
+       long hvrc;
+       int ret;
+
+       /* Must ensure MSR.EE off for H_JOIN. */
+       hard_irq_disable();
+       hvrc = plpar_hcall_norets(H_JOIN);
+
+       switch (hvrc) {
+       case H_CONTINUE:
+               /*
+                * All other CPUs are offline or in H_JOIN. This CPU
+                * attempts the suspend.
+                */
+               ret = do_suspend();
+               break;
+       case H_SUCCESS:
+               /*
+                * The suspend is complete and this cpu has received a
+                * prod.
+                */
+               ret = 0;
+               break;
+       case H_BAD_MODE:
+       case H_HARDWARE:
+       default:
+               ret = -EIO;
+               pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
+                                  hvrc, smp_processor_id());
+               break;
+       }
+
+       if (atomic_inc_return(counter) == 1) {
+               pr_info("CPU %u waking all threads\n", smp_processor_id());
+               prod_others();
+       }
+       /*
+        * Execution may have been suspended for several seconds, so
+        * reset the watchdog.
+        */
+       touch_nmi_watchdog();
+       return ret;
+}
+
+static int pseries_migrate_partition(u64 handle)
+{
+       atomic_t counter = ATOMIC_INIT(0);
+       int ret;
+
+       ret = wait_for_vasi_session_suspending(handle);
+       if (ret)
+               goto out;
+
+       ret = stop_machine(do_join, &counter, cpu_online_mask);
+       if (ret == 0)
+               post_mobility_fixup();
+out:
+       return ret;
+}
+
 static ssize_t migration_store(struct class *class,
                               struct class_attribute *attr, const char *buf,
                               size_t count)
@@ -423,16 +547,10 @@ static ssize_t migration_store(struct class *class,
        if (rc)
                return rc;
 
-       rc = wait_for_vasi_session_suspending(streamid);
+       rc = pseries_migrate_partition(streamid);
        if (rc)
                return rc;
 
-       rc = rtas_ibm_suspend_me_unsafe(streamid);
-       if (rc)
-               return rc;
-
-       post_mobility_fixup();
-
        return count;
 }
 
-- 
2.25.4

Reply via email to