[PATCH] drm/amdgpu: Effective health check before reset

Ce Sun Tue, 29 Jul 2025 00:02:46 -0700

Move amdgpu_device_health_check into amdgpu_device_gpu_recover to
ensure that whether the device is present can be checked before reset.


The reason is:
1. During a dpc event, the device where the dpc event occurs is not
present on the bus.
2. When a dpc event and an ATHUB event occur simultaneously, the dpc thread
holds the reset domain lock when detecting the error, and the gpu recover
thread acquires the hive lock. The device is simultaneously in the
amdgpu_ras_in_recovery and occurs_dpc states, so the gpu recover thread does
not go to amdgpu_device_health_check. It waits for the reset domain lock held
by the dpc thread, but the dpc thread has not released the reset domain lock.
In the dpc callback slot_reset, the dpc thread tries to obtain the hive lock,
which is held by the gpu recover thread at this time. So a deadlock occurs.

Signed-off-by: Ce Sun <cesun...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 +++++++---------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 343155f5375c..efe98ffb679a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6128,12 +6128,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
        return ret;
 }
 
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
                                          struct list_head *device_list,
                                          struct amdgpu_hive_info *hive)
 {
        struct amdgpu_device *tmp_adev = NULL;
-       int r;
 
        /*
         * Build list of devices to reset.
@@ -6153,14 +6152,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
        } else {
                list_add_tail(&adev->reset_list, device_list);
        }
-
-       if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
-               r = amdgpu_device_health_check(device_list);
-               if (r)
-                       return r;
-       }
-
-       return 0;
 }
 
 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6453,8 +6444,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        reset_context->hive = hive;
        INIT_LIST_HEAD(&device_list);
 
-       if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
-               goto end_reset;
+       amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(&device_list);
+               if (r)
+                       goto end_reset;
+       }
 
        /* We need to lock reset domain only once both for XGMI and single device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6952,12 +6948,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
        int r = 0, i;
        u32 memsize;
 
-       /* PCI error slot reset should be skipped During RAS recovery */
-       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-           amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-           amdgpu_ras_in_recovery(adev))
-               return PCI_ERS_RESULT_RECOVERED;
-
        dev_info(adev->dev, "PCI error: slot reset callback!!\n");
 
        memset(&reset_context, 0, sizeof(reset_context));
-- 
2.34.1

[PATCH] drm/amdgpu: Effective health check before reset

Reply via email to