The partition suspend sequence as specified in the platform architecture requires that all active processor threads call H_JOIN, which:
- suspends the calling thread until it is the target of an H_PROD; or

- immediately returns H_CONTINUE, if the calling thread is the last to
  call H_JOIN. This thread is expected to call ibm,suspend-me to
  completely suspend the partition. Upon returning from ibm,suspend-me
  the calling thread must wake all others using H_PROD.

rtas_ibm_suspend_me_unsafe() uses on_each_cpu() to implement this
protocol, but because of its synchronizing nature this is susceptible
to deadlock versus users of stop_machine() or other callers of
on_each_cpu().

Not only is stop_machine() intended for use cases like this, it
handles error propagation and allows us to keep the data shared
between CPUs minimal: a single atomic counter which ensures exactly
one CPU will wake the others from their joined states.

Switch the migration code to use stop_machine() and a less complex
local implementation of the H_JOIN/ibm,suspend-me logic, which
carries additional benefits:

- more informative error reporting, appropriately ratelimited

- resets the lockup detector / watchdog on resume to prevent lockup
  warnings when the OS has been suspended for a time exceeding the
  threshold.

Fixes: 91dc182ca6e2 ("[PATCH] powerpc: special-case ibm,suspend-me RTAS call")
Signed-off-by: Nathan Lynch <nath...@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/mobility.c | 132 ++++++++++++++++++++--
 1 file changed, 125 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 1b8ae221b98a..44ca7d4e143d 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -12,9 +12,11 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
+#include <linux/nmi.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/stat.h>
+#include <linux/stop_machine.h>
 #include <linux/completion.h>
 #include <linux/device.h>
 #include <linux/delay.h>
@@ -412,6 +414,128 @@ static int wait_for_vasi_session_suspending(u64 handle)
 	return ret;
 }
 
+static void prod_single(unsigned int target_cpu)
+{
+	long hvrc;
+	int hwid;
+
+	hwid = get_hard_smp_processor_id(target_cpu);
+	hvrc = plpar_hcall_norets(H_PROD, hwid);
+	if (hvrc == H_SUCCESS)
+		return;
+	pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
+			   target_cpu, hwid, hvrc);
+}
+
+static void prod_others(void)
+{
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu) {
+		if (cpu != smp_processor_id())
+			prod_single(cpu);
+	}
+}
+
+static u16 clamp_slb_size(void)
+{
+	u16 prev = mmu_slb_size;
+
+	slb_set_size(SLB_MIN_SIZE);
+
+	return prev;
+}
+
+static int do_suspend(void)
+{
+	u16 saved_slb_size;
+	int status;
+	int ret;
+
+	pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());
+
+	/*
+	 * The destination processor model may have fewer SLB entries
+	 * than the source. We reduce mmu_slb_size to a safe minimum
+	 * before suspending in order to minimize the possibility of
+	 * programming non-existent entries on the destination. If
+	 * suspend fails, we restore it before returning. On success
+	 * the OF reconfig path will update it from the new device
+	 * tree after resuming on the destination.
+	 */
+	saved_slb_size = clamp_slb_size();
+
+	ret = rtas_ibm_suspend_me(&status);
+	if (ret != 0) {
+		pr_err("ibm,suspend-me error: %d\n", status);
+		slb_set_size(saved_slb_size);
+	}
+
+	return ret;
+}
+
+static int do_join(void *arg)
+{
+	atomic_t *counter = arg;
+	long hvrc;
+	int ret;
+
+	/* Must ensure MSR.EE off for H_JOIN. */
+	hard_irq_disable();
+	hvrc = plpar_hcall_norets(H_JOIN);
+
+	switch (hvrc) {
+	case H_CONTINUE:
+		/*
+		 * All other CPUs are offline or in H_JOIN. This CPU
+		 * attempts the suspend.
+		 */
+		ret = do_suspend();
+		break;
+	case H_SUCCESS:
+		/*
+		 * The suspend is complete and this cpu has received a
+		 * prod.
+		 */
+		ret = 0;
+		break;
+	case H_BAD_MODE:
+	case H_HARDWARE:
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
+				   hvrc, smp_processor_id());
+		break;
+	}
+
+	if (atomic_inc_return(counter) == 1) {
+		pr_info("CPU %u waking all threads\n", smp_processor_id());
+		prod_others();
+	}
+	/*
+	 * Execution may have been suspended for several seconds, so
+	 * reset the watchdog.
+	 */
+	touch_nmi_watchdog();
+	return ret;
+}
+
+static int pseries_migrate_partition(u64 handle)
+{
+	atomic_t counter = ATOMIC_INIT(0);
+	int ret;
+
+	ret = wait_for_vasi_session_suspending(handle);
+	if (ret)
+		goto out;
+
+	ret = stop_machine(do_join, &counter, cpu_online_mask);
+	if (ret == 0)
+		post_mobility_fixup();
+out:
+	return ret;
+}
+
 static ssize_t migration_store(struct class *class,
 			       struct class_attribute *attr, const char *buf,
 			       size_t count)
@@ -423,16 +547,10 @@ static ssize_t migration_store(struct class *class,
 	if (rc)
 		return rc;
 
-	rc = wait_for_vasi_session_suspending(streamid);
+	rc = pseries_migrate_partition(streamid);
 	if (rc)
 		return rc;
 
-	rc = rtas_ibm_suspend_me_unsafe(streamid);
-	if (rc)
-		return rc;
-
-	post_mobility_fixup();
-
 	return count;
 }
 
-- 
2.25.4
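
For reference, below is a minimal, self-contained sketch (not part of the
patch) of the stop_machine() pattern the commit message relies on: the
callback runs on every CPU in the supplied mask with interrupts disabled,
a shared atomic counter elects exactly one CPU for the one-time work, and
a nonzero return from any CPU is reported back to the stop_machine()
caller. The names demo_join() and stop_machine_demo_init() are
hypothetical and exist only for illustration.

/* Illustrative sketch of the stop_machine() usage pattern. */
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/stop_machine.h>

static int demo_join(void *arg)
{
	atomic_t *counter = arg;

	/* Interrupts are disabled here; keep the work short. */
	if (atomic_inc_return(counter) == 1)
		pr_info("CPU %u elected for the one-time work\n",
			smp_processor_id());

	return 0;	/* a nonzero value is propagated to the caller */
}

static int __init stop_machine_demo_init(void)
{
	atomic_t counter = ATOMIC_INIT(0);

	/* Run demo_join() on all online CPUs; block until all return. */
	return stop_machine(demo_join, &counter, cpu_online_mask);
}
module_init(stop_machine_demo_init);

MODULE_LICENSE("GPL");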