[PATCH] drm/amdgpu: Effective health check before reset

Ce Sun Mon, 28 Jul 2025 21:16:07 -0700

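Move the amdgpu_device_health_check() call out of
amdgpu_device_recovery_prepare() and into amdgpu_device_gpu_recover(),
so that device presence is checked before reset on non-SR-IOV devices
regardless of pcie_reset_ctx.occurs_dpc.

Also drop the early return in amdgpu_pci_slot_reset() that skipped the
slot reset during RAS recovery on GC 9.4.3/9.4.4.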


Signed-off-by: Ce Sun <cesun...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++---------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2659e3ebbe49..176712225037 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6129,12 +6129,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
        return ret;
 }
 
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
                                          struct list_head *device_list,
                                          struct amdgpu_hive_info *hive)
 {
        struct amdgpu_device *tmp_adev = NULL;
-       int r;
 
        /*
         * Build list of devices to reset.
@@ -6155,13 +6154,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
                list_add_tail(&adev->reset_list, device_list);
        }
 
-       if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
-               r = amdgpu_device_health_check(device_list);
-               if (r)
-                       return r;
-       }
-
-       return 0;
 }
 
 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6449,8 +6441,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        reset_context->hive = hive;
        INIT_LIST_HEAD(&device_list);
 
-       if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
-               goto end_reset;
+       amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(&device_list);
+               if (r)
+                       goto end_reset;
+       }
 
        /* We need to lock reset domain only once both for XGMI and single device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6956,12 +6953,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
        int r = 0, i;
        u32 memsize;
 
-       /* PCI error slot reset should be skipped During RAS recovery */
-       if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-           amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
-           amdgpu_ras_in_recovery(adev))
-               return PCI_ERS_RESULT_RECOVERED;
-
        dev_info(adev->dev, "PCI error: slot reset callback!!\n");
 
        memset(&reset_context, 0, sizeof(reset_context));
-- 
2.34.1

[PATCH] drm/amdgpu: Effective health check before reset

Reply via email to