Perform queue reset for SDMA poison consumption.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  8 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  4 +++
 4 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6c47f7d9adcd..085bff11319a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -741,6 +741,14 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }
 
+/* Thin wrapper: pass the request straight to amdgpu_ras_poison_queue_reset(). */
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+       enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+       uint16_t node_id)
+{
+       return amdgpu_ras_poison_queue_reset(adev, block, client_id, vmid, node_id);
+}
+
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
                                        uint32_t *payload)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3fa951ede37c..f1680027399e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,6 +404,9 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, uint16_t pasid,
                        pasid_notify pasid_fn, void *data, uint32_t reset);
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, uint16_t client_id,
+                       uint16_t vmid, uint16_t node_id);
 
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6565dc7ff9cd..7e63c2fc1a62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5311,3 +5311,33 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
 
        return con->is_rma;
 }
+
+/*
+ * Reset the SDMA queue that consumed poison. Returns -EINVAL when the block
+ * or its ring cannot be queue-reset, otherwise the lookup or reset status.
+ */
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+       enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+       uint16_t node_id)
+{
+       struct amdgpu_ring *ring;
+       int sdma_inst, ret;
+
+       if (block != AMDGPU_RAS_BLOCK__SDMA ||
+           !adev->sdma.instance[0].funcs->sdma_irq_id_to_seq)
+               return -EINVAL;
+       sdma_inst = adev->sdma.instance[0].funcs->sdma_irq_id_to_seq(adev,
+                                       client_id, node_id);
+       if (sdma_inst < 0)
+               return sdma_inst;
+       ring = &adev->sdma.instance[sdma_inst].ring;
+       if (!ring->funcs->reset)
+               return -EINVAL;
+
+       ret = amdgpu_ring_reset(ring, vmid);
+       if (ret)
+               dev_warn(adev->dev,
+                       "queue reset failed in block%d (ret %d), fallback to gpu reset\n",
+                       block, ret);
+       return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..debc07767b5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -984,4 +984,8 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
                                const char *fmt, ...);
 
 bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+       enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+       uint16_t node_id);
 #endif
-- 
2.34.1

Reply via email to