Currently the kernel detects whether it is running on a shared LPAR
platform and requests home node associativity before the scheduler
sched_domains are set up. However, between the time NUMA setup is
initialized and the request for home node associativity, workqueue
initializes its per-node cpumask. The per-node workqueue possible
cpumask may become invalid after the home node associativity request,
resulting in weird situations such as the workqueue possible cpumask
being a subset of the workqueue online cpumask.

This can be fixed by requesting home node associativity earlier, just
before NUMA setup. However, at NUMA setup time the kernel may not be in
a position to detect whether it is running on a shared LPAR platform.
So request home node associativity regardless of whether the LPAR is
shared or dedicated and, if the request fails, fall back on the device
tree property.

However, requesting home node associativity requires the CPU's hwid,
which is set in smp_setup_pacas(). Hence call smp_setup_pacas() before
numa_setup_cpus().

Signed-off-by: Srikar Dronamraju <sri...@linux.vnet.ibm.com>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Nicholas Piggin <npig...@gmail.com>
Cc: Nathan Lynch <nath...@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Reported-by: Satheesh Rajendran <sathn...@linux.vnet.ibm.com>
Reported-by: Abdul Haleem <abdha...@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/setup-common.c |  5 +++--
 arch/powerpc/mm/numa.c             | 28 +++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 1f8db66..9135dba 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -888,6 +888,9 @@ void __init setup_arch(char **cmdline_p)
        /* Check the SMT related command line arguments (ppc64). */
        check_smt_enabled();
 
+#ifdef CONFIG_SMP
+       smp_setup_pacas();
+#endif
        /* Parse memory topology */
        mem_topology_setup();
 
@@ -899,8 +902,6 @@ void __init setup_arch(char **cmdline_p)
         * so smp_release_cpus() does nothing for them.
         */
 #ifdef CONFIG_SMP
-       smp_setup_pacas();
-
        /* On BookE, setup per-core TLB data structures. */
        setup_tlb_core_data();
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88b5157..7965d3b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -461,6 +461,21 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
        return nid;
 }
 
+static int vphn_get_nid(unsigned long cpu)
+{
+       __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+       long rc;
+
+       /* Use associativity from first thread for all siblings */
+       rc = hcall_vphn(get_hard_smp_processor_id(cpu),
+                               VPHN_FLAG_VCPU, associativity);
+
+       if (rc == H_SUCCESS)
+               return  associativity_to_nid(associativity);
+
+       return NUMA_NO_NODE;
+}
+
 /*
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
@@ -490,7 +505,18 @@ static int numa_setup_cpu(unsigned long lcpu)
                        goto out;
        }
 
-       nid = of_node_to_nid_single(cpu);
+       /*
+        * On a shared lpar, the device tree might not have the correct node
+        * associativity.  At this time lppaca, or its __old_status field
+        * may not be updated. Hence request an explicit associativity
+        * irrespective of whether the lpar is shared or dedicated.  Use the
+        * device tree property as a fallback.
+        */
+       if (firmware_has_feature(FW_FEATURE_VPHN))
+               nid = vphn_get_nid(lcpu);
+
+       if (nid == NUMA_NO_NODE)
+               nid = of_node_to_nid_single(cpu);
 
 out_present:
        if (nid < 0 || !node_possible(nid))
-- 
1.8.3.1

Reply via email to