Yasuaki Ishimatsu found that with node online/offline, cpu<->node
relationship is established. Because workqueue uses a info which was
established at boot time, but it may be changed by node hotpluging.

Once pool->node points to a stale node, following allocation failure
happens.
  ==
     SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
      cache: kmalloc-192, object size: 192, buffer size: 192, default
order:
    1, min order: 0
      node 0: slabs: 6172, objs: 259224, free: 245741
      node 1: slabs: 3261, objs: 136962, free: 127656
  ==

This patch use the present cpumask directly rather than self-maintained
wq_numa_possible_cpumask, so that the wq is a simple customer of numa, and
updates per cpu workqueue pool's node affinity when numa node changed via
the notify callback that registered to node on/down event at the functions
try_online/offline_node.

Unbound workqueue's per node pool are already updated by 
wq_update_unbound_numa()
at CPU_DOWN_PREPARE of the last cpu, by existing code.

Reported-by: Yasuaki Ishimatsu <isimatu.yasu...@jp.fujitsu.com>
Signed-off-by: Gu Zheng <guz.f...@cn.fujitsu.com>
---
 include/linux/memory_hotplug.h |    8 +++
 kernel/workqueue.c             |   93 +++++++++++++++++++++++++--------------
 mm/memory_hotplug.c            |   44 +++++++++++++++++++
 3 files changed, 111 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+/* event type of node on/dwon */
+#define NODE_ON                0x00001
+#define NODE_DOWN              0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {
 
 static struct kmem_cache *pwq_cache;
 
-static cpumask_var_t *wq_numa_possible_cpumask;
-                                       /* possible CPUs of each node */
-
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const 
struct workqueue_attrs *attrs)
        pool->attrs->no_numa = false;
 
        /* if cpumask is contained inside a NUMA node, we belong to that node */
-       if (wq_numa_enabled) {
+       if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
                for_each_node(node) {
-                       if (cpumask_subset(pool->attrs->cpumask,
-                                          wq_numa_possible_cpumask[node])) {
+                       int cpu;
+                       bool is_sub_set = true;
+
+                       for_each_cpu(cpu, pool->attrs->cpumask)
+                               if (cpu_to_node(cpu) != node) {
+                                       is_sub_set = false;
+                                       break;
+                               }
+
+                       if (is_sub_set) {
                                pool->node = node;
                                break;
                        }
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct 
workqueue_attrs *attrs, int node,
        if (cpumask_empty(cpumask))
                goto use_dfl;
 
-       /* yeap, return possible CPUs in @node that @attrs wants */
-       cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+       /* yeap, return present CPUs in @node that @attrs wants */
+       get_present_cpumask_of_node(cpumask, node);
+       cpumask_and(cpumask, attrs->cpumask, cpumask);
        return !cpumask_equal(cpumask, attrs->cpumask);
 
 use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct 
notifier_block *nfb,
                                               void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
+       int node = cpu_to_node(cpu);
        struct worker_pool *pool;
        struct workqueue_struct *wq;
        int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct 
notifier_block *nfb,
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
                for_each_cpu_worker_pool(pool, cpu) {
+                       pool->node = node;
                        if (pool->nr_workers)
                                continue;
                        if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+                           unsigned long action, void *arg)
 {
-       cpumask_var_t *tbl;
-       int node, cpu;
+       int node = (unsigned long)arg;
+       struct worker_pool *pool;
+       int pi;
+       int cpu;
 
+       switch (action) {
+       case NODE_DOWN:
+               mutex_lock(&wq_pool_mutex);
+               for_each_pool(pool, pi) {
+                       if (pool->node == node) {
+                               pool->node = NUMA_NO_NODE;
+                               if (pool->cpu < 0)
+                                       hash_del(&pool->hash_node);
+                       }
+               }
+               mutex_unlock(&wq_pool_mutex);
+               break;
+       case NODE_ON:
+               mutex_lock(&wq_pool_mutex);
+               for_each_present_cpu(cpu) {
+                       if (node != cpu_to_node(cpu))
+                               continue;
+                       for_each_cpu_worker_pool(pool, cpu)
+                               pool->node = node;
+               }
+               mutex_unlock(&wq_pool_mutex);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+       .notifier_call = wq_numa_callback,
+       .priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
        if (num_possible_nodes() <= 1)
                return;
 
@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
        wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
        BUG_ON(!wq_update_unbound_numa_attrs_buf);
 
-       /*
-        * We want masks of possible CPUs of each node which isn't readily
-        * available.  Build one from cpu_to_node() which should have been
-        * fully initialized by now.
-        */
-       tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
-       BUG_ON(!tbl);
-
-       for_each_node(node)
-               BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
-                               node_online(node) ? node : NUMA_NO_NODE));
-
-       for_each_possible_cpu(cpu) {
-               node = cpu_to_node(cpu);
-               if (WARN_ON(node == NUMA_NO_NODE)) {
-                       pr_warn("workqueue: NUMA node mapping not available for 
cpu%d, disabling NUMA support\n", cpu);
-                       /* happens iff arch is bonkers, let's just proceed */
-                       return;
-               }
-               cpumask_set_cpu(cpu, tbl[node]);
-       }
-
-       wq_numa_possible_cpumask = tbl;
+       register_node_notifier(&wq_numa_nb);
        wq_numa_enabled = true;
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,44 @@ static void rollback_node_hotadd(int nid, pg_data_t 
*pgdat)
        return;
 }
 
+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+       int ret;
+
+       ret = raw_notifier_chain_register(&node_chain, nb);
+       return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+       int ret;
+
+       ret = raw_notifier_chain_unregister(&node_chain, nb);
+       return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+       int ret;
+
+       ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+       return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+       unsigned int cpu;
+
+       for_each_present_cpu(cpu)
+               if (node == cpu_to_node(cpu))
+                       cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);
 
 /**
  * try_online_node - online a node if offlined
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
        ret = register_one_node(nid);
        BUG_ON(ret);
 
+       /* notify that the node is on */
+       call_node_notify(NODE_ON, (void *)(long)nid);
+
        if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
                mutex_lock(&zonelists_mutex);
                build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
                        vfree(zone->wait_table);
        }
 
+       /* notify that the node is down */
+       call_node_notify(NODE_DOWN, (void *)(long)nid);
+
        /*
         * Since there is no way to guarentee the address of pgdat/zone is not
         * on stack of any kernel threads or used by other kernel objects
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to