[  264.483189] ======================================================
[  264.483487] WARNING: possible circular locking dependency detected
[  264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G           OE
[  264.484076] ------------------------------------------------------
[  264.484370] kworker/39:1/567 is trying to acquire lock:
[  264.484663] ffffffffc15df4b0 (mgpu_info.mutex){+.+.}, at: 
amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.485081]
               but task is already holding lock:
[  264.485670] ffff965fd31647a0 (&adev->reset_sem){++++}, at: 
amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.486074]
               which lock already depends on the new lock.

[  264.487043]
               the existing dependency chain (in reverse order) is:
[  264.487710]
               -> #3 (&adev->reset_sem){++++}:
[  264.488400]        down_write+0x49/0x120
[  264.488783]        amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.489179]        amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.489544]        process_one_work+0x29e/0x630
[  264.489910]        worker_thread+0x3c/0x3f0
[  264.490279]        kthread+0x12f/0x150
[  264.490649]        ret_from_fork+0x3a/0x50
[  264.491020]
               -> #2 (&tmp->hive_lock){+.+.}:
[  264.491764]        __mutex_lock+0x95/0xa20
[  264.492137]        mutex_lock_nested+0x1b/0x20
[  264.492553]        amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu]
[  264.492972]        amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu]
[  264.493387]        amdgpu_device_init+0x12fb/0x1e10 [amdgpu]
[  264.493807]        amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.494226]        amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.494617]        local_pci_probe+0x47/0xa0
[  264.494998]        work_for_cpu_fn+0x1a/0x30
[  264.495369]        process_one_work+0x29e/0x630
[  264.495746]        worker_thread+0x22b/0x3f0
[  264.496124]        kthread+0x12f/0x150
[  264.496504]        ret_from_fork+0x3a/0x50
[  264.496876]
               -> #1 (xgmi_mutex){+.+.}:
[  264.497596]        __mutex_lock+0x95/0xa20
[  264.497954]        mutex_lock_nested+0x1b/0x20
[  264.498346]        amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu]
[  264.498741]        amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu]
[  264.499126]        amdgpu_device_ip_late_init+0x219/0x230 [amdgpu]
[  264.499506]        amdgpu_device_init+0x1401/0x1e10 [amdgpu]
[  264.499886]        amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.500264]        amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.500608]        local_pci_probe+0x47/0xa0
[  264.500945]        work_for_cpu_fn+0x1a/0x30
[  264.501276]        process_one_work+0x29e/0x630
[  264.501603]        worker_thread+0x22b/0x3f0
[  264.501927]        kthread+0x12f/0x150
[  264.502239]        ret_from_fork+0x3a/0x50
[  264.502541]
               -> #0 (mgpu_info.mutex){+.+.}:
[  264.503126]        __lock_acquire+0x13ec/0x16e0
[  264.503411]        lock_acquire+0xb8/0x1c0
[  264.503693]        __mutex_lock+0x95/0xa20
[  264.504019]        mutex_lock_nested+0x1b/0x20
[  264.504354]        amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.504691]        amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu]
[  264.505029]        amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.505334]        process_one_work+0x29e/0x630
[  264.505617]        worker_thread+0x3c/0x3f0
[  264.505897]        kthread+0x12f/0x150
[  264.506176]        ret_from_fork+0x3a/0x50
[  264.506453]
               other info that might help us debug this:

[  264.507267] Chain exists of:
                 mgpu_info.mutex --> &tmp->hive_lock --> &adev->reset_sem

[  264.508102]  Possible unsafe locking scenario:

[  264.508664]        CPU0                    CPU1
[  264.508945]        ----                    ----
[  264.509221]   lock(&adev->reset_sem);
[  264.509524]                                lock(&tmp->hive_lock);
[  264.509818]                                lock(&adev->reset_sem);
[  264.510111]   lock(mgpu_info.mutex);
[  264.510401]
                *** DEADLOCK ***

[  264.511224] 4 locks held by kworker/39:1/567:
[  264.511499]  #0: ffff961ff5c1d348 ((wq_completion)events){+.+.}, at: 
process_one_work+0x21f/0x630
[  264.511793]  #1: ffffafa90e233e58 
((work_completion)(&con->recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  264.512100]  #2: ffffffffc16245d0 (&tmp->hive_lock){+.+.}, at: 
amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu]
[  264.512450]  #3: ffff965fd31647a0 (&adev->reset_sem){++++}, at: 
amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]

Remove the lock(&hive->hive_lock) out of amdgpu_get_xgmi_hive,
to disable its locking dependency on xgmi_mutex.

Signed-off-by: Dennis Li <dennis...@amd.com>
Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..6c572db42d92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
 {
        struct amdgpu_device *adev =
                container_of(__work, struct amdgpu_device, xgmi_reset_work);
-       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
        /* It's a bug to not have a hive within this function */
        if (WARN_ON(!hive))
@@ -4283,7 +4283,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * We always reset all schedulers for device and all devices for XGMI
         * hive so that should take care of them too.
         */
-       hive = amdgpu_get_xgmi_hive(adev, false);
+       hive = amdgpu_get_xgmi_hive(adev);
        if (hive) {
                if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
                        DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
another already in progress",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..5cd42740461c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
        struct amdgpu_device *remote_adev = NULL;
        struct amdgpu_device *adev = ras->adev;
        struct list_head device_list, *device_list_handle =  NULL;
-       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
        /* Build list of devices to query RAS related errors */
        if  (hive && adev->gmc.xgmi.num_physical_nodes > 1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 67a756f4337b..19fd5ce3e857 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -336,7 +336,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct 
amdgpu_device *adev,
 
 
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int 
lock)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 {
        int i;
        struct amdgpu_hive_info *tmp;
@@ -349,8 +349,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev, int lo
        for (i = 0 ; i < hive_count; ++i) {
                tmp = &xgmi_hives[i];
                if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
-                       if (lock)
-                               mutex_lock(&tmp->hive_lock);
                        mutex_unlock(&xgmi_mutex);
                        return tmp;
                }
@@ -374,9 +372,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev, int lo
        mutex_init(&tmp->hive_lock);
        atomic_set(&tmp->in_reset, 0);
        task_barrier_init(&tmp->tb);
-
-       if (lock)
-               mutex_lock(&tmp->hive_lock);
        tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
        tmp->hi_req_gpu = NULL;
        /*
@@ -392,7 +387,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev, int lo
 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
        int ret = 0;
-       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
        struct amdgpu_device *request_adev = hive->hi_req_gpu ?
                                                hive->hi_req_gpu : adev;
        bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
@@ -515,7 +510,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
        }
 
-       hive = amdgpu_get_xgmi_hive(adev, 1);
+       hive = amdgpu_get_xgmi_hive(adev);
        if (!hive) {
                ret = -EINVAL;
                dev_err(adev->dev,
@@ -524,6 +519,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                goto exit;
        }
 
+       mutex_lock(&hive->hive_lock);
+
        top_info = &adev->psp.xgmi_context.top_info;
 
        list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
@@ -587,10 +584,11 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
        if (!adev->gmc.xgmi.supported)
                return -EINVAL;
 
-       hive = amdgpu_get_xgmi_hive(adev, 1);
+       hive = amdgpu_get_xgmi_hive(adev);
        if (!hive)
                return -EINVAL;
 
+       mutex_lock(&hive->hive_lock);
        task_barrier_rem_task(&hive->tb);
        amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
        mutex_unlock(&hive->hive_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 61720cd4a1ee..ae3249c1aafe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -51,7 +51,7 @@ struct amdgpu_pcs_ras_field {
        uint32_t pcs_err_shift;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int 
lock);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct 
amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to