We found an AA deadlock problem as shown below:

TaskA                           TaskB                           WatchDog        
                system_wq

...
css_killed_work_fn:
P(cgroup_mutex)
...
                                                                ...
                                                                
__lockup_detector_reconfigure:
                                                                
P(cpu_hotplug_lock.read)
                                                                ...
                                ...
                                percpu_down_write:
                                P(cpu_hotplug_lock.write)
                                                                                
                ...
                                                                                
                cgroup_bpf_release:
                                                                                
                P(cgroup_mutex)
                                                                smp_call_on_cpu:
                                                                Wait system_wq

cpuset_css_offline:
P(cpu_hotplug_lock.read)

WatchDog is waiting for system_wq, which is waiting for cgroup_mutex, to finish
the jobs, but the owner of cgroup_mutex is waiting for cpu_hotplug_lock.
The key point is the cpu_hotplug_lock, because the system_wq may be waiting on
other locks as well. What's more, it seems that smp_call_on_cpu doesn't need
protection from cpu_hotplug_lock. I tried to revert the old patch to fix this
problem, but I encountered some conflicts. Or should I just release and
re-acquire cpu_hotplug_lock around smp_call_on_cpu? I'm looking forward to any
suggestions :).

Fixes: e31d6883f21c ("watchdog/core, powerpc: Lock cpus across reconfiguration")

Signed-off-by: Luo Gengkun <luogeng...@huaweicloud.com>
---
 arch/powerpc/kernel/watchdog.c | 4 ++++
 kernel/watchdog.c              | 9 ---------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 8c464a5d8246..f33f532ea7fa 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -550,17 +550,21 @@ void watchdog_hardlockup_stop(void)
 {
        int cpu;
 
+       cpus_read_lock();
        for_each_cpu(cpu, &wd_cpus_enabled)
                stop_watchdog_on_cpu(cpu);
+       cpus_read_unlock();
 }
 
 void watchdog_hardlockup_start(void)
 {
        int cpu;
 
+       cpus_read_lock();
        watchdog_calc_timeouts();
        for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
                start_watchdog_on_cpu(cpu);
+       cpus_read_unlock();
 }
 
 /*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51915b44ac73..13303a932cde 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -867,7 +867,6 @@ int lockup_detector_offline_cpu(unsigned int cpu)
 
 static void __lockup_detector_reconfigure(void)
 {
-       cpus_read_lock();
        watchdog_hardlockup_stop();
 
        softlockup_stop_all();
@@ -877,12 +876,6 @@ static void __lockup_detector_reconfigure(void)
                softlockup_start_all();
 
        watchdog_hardlockup_start();
-       cpus_read_unlock();
-       /*
-        * Must be called outside the cpus locked section to prevent
-        * recursive locking in the perf code.
-        */
-       __lockup_detector_cleanup();
 }
 
 void lockup_detector_reconfigure(void)
@@ -916,11 +909,9 @@ static __init void lockup_detector_setup(void)
 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
 static void __lockup_detector_reconfigure(void)
 {
-       cpus_read_lock();
        watchdog_hardlockup_stop();
        lockup_detector_update_enable();
        watchdog_hardlockup_start();
-       cpus_read_unlock();
 }
 void lockup_detector_reconfigure(void)
 {
-- 
2.34.1

Reply via email to