Add support for a common RAS poison consumption handler.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 863bc282292d..92ce898d50bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+/* poison consumption handler */
+static void ras_interrupt_handler_poison(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       bool poison_stat = true, need_reset = true;
+       struct amdgpu_device *adev = obj->adev;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+       struct amdgpu_ras_block_object *block_obj =
+               amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+
+       if (!adev->gmc.xgmi.connected_to_cpu)
+               amdgpu_umc_poison_handler(adev, &err_data, false);
+
+       if (block_obj && block_obj->hw_ops) {
+               if (block_obj->hw_ops->query_poison_status) {
+                       poison_stat = block_obj->hw_ops->query_poison_status(adev);
+                       if (!poison_stat)
+                               dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+                                               block_obj->ras_comm.name);
+               }
+
+               if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
+                       poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+                       need_reset = poison_stat;
+               }
+       }
+
+       /* gpu reset is fallback for all failed cases */
+       if (need_reset)
+               amdgpu_ras_reset_gpu(adev);
+}
+
 static void ras_interrupt_handler_non_poison(struct ras_manager *obj,
                                struct amdgpu_iv_entry *entry)
 {
@@ -1563,8 +1595,13 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;
 
-               ras_interrupt_handler_non_poison(obj, &entry);
+               if (data->flag & (1 << AMDGPU_RAS_IH_POISON_CONSUMPTION))
+                       ras_interrupt_handler_poison(obj, &entry);
+               else
+                       ras_interrupt_handler_non_poison(obj, &entry);
        }
+
+       data->flag &= ~(1 << AMDGPU_RAS_IH_POISON_CONSUMPTION);
 }
 
 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 380f4c3020c7..562e94954650 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -516,6 +516,7 @@ struct amdgpu_ras_block_hw_ops {
        void (*reset_ras_error_count)(struct amdgpu_device *adev);
        void (*reset_ras_error_status)(struct amdgpu_device *adev);
        bool (*query_poison_status)(struct amdgpu_device *adev);
+       bool (*handle_poison_consumption)(struct amdgpu_device *adev);
 };
 
 /* work flow
-- 
2.35.1

Reply via email to