Check if the device is present on the bus before trying to recover. It
could be that the device itself is lost from the bus in some hang
situations.

Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+       struct amdgpu_device *tmp_adev;
+       int ret = 0; /* becomes -ENODEV once any listed device fails the check */
+       u32 status; /* PCI_COMMAND dword read back from config space */
+
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+               if (PCI_POSSIBLE_ERROR(status)) {
+                       dev_err(tmp_adev->dev, "device lost from bus!\n");
+                       ret = -ENODEV; /* no break: every lost device gets logged */
+               }
+       }
+
+       return ret;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(device_list_handle);
+               if (r)
+                       goto end_reset;
+       }
+
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                                            reset_list);
        amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
 
+end_reset:
        if (hive) {
                mutex_unlock(&hive->hive_lock);
                amdgpu_put_xgmi_hive(hive);
-- 
2.25.1

Reply via email to