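In RAS poison mode, page retirement is handled by the IRQ handler of the
module that consumes the corrupted data.

Add an optional ras_process_cb callback to kfd2kgd_calls and implement it for
Aldebaran. On poison consumption, the KFD v9 interrupt handler calls this
callback when it is provided and falls back to a GPU reset otherwise.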

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c    | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c     | 10 ++++++++--
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h     |  1 +
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 46cd4ee6bafb..eb5e9c1b1073 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,16 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
+
+/*
+ * kgd_aldebaran_ras_process_cb - RAS processing hook installed as
+ * aldebaran_kfd2kgd.ras_process_cb.
+ *
+ * @kgd: opaque KGD handle; it is cast directly to struct amdgpu_device.
+ *
+ * Calls amdgpu_umc_process_ras_data_cb() with a zero-initialised
+ * ras_err_data and a NULL IV entry, and returns that call's result
+ * unchanged.
+ *
+ * Kept file-local: it is only reached through the kfd2kgd_calls table
+ * defined in this file.
+ */
+static int kgd_aldebaran_ras_process_cb(struct kgd_dev *kgd)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+       /* {0} clears every member without relying on the field order. */
+       struct ras_err_data err_data = {0};
+
+       return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
        .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -44,5 +54,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
        .get_atc_vmid_pasid_mapping_info =
                                kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
        .set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
-       .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+       .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
+       .ras_process_cb = kgd_aldebaran_ras_process_cb
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 12d91e53556c..851b5120927a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -231,7 +231,10 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                                if (sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
                                        sq_intr_err != 
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
                                        kfd_signal_poison_consumed_event(dev, 
pasid);
-                                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                                       if (dev->kfd2kgd->ras_process_cb)
+                                               
dev->kfd2kgd->ras_process_cb(dev->kgd);
+                                       else
+                                               
amdgpu_amdkfd_gpu_reset(dev->kgd);
                                        return;
                                }
                                break;
@@ -253,7 +256,10 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                        kfd_signal_event_interrupt(pasid, context_id0 & 
0xfffffff, 28);
                } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
                        kfd_signal_poison_consumed_event(dev, pasid);
-                       amdgpu_amdkfd_gpu_reset(dev->kgd);
+                       if (dev->kfd2kgd->ras_process_cb)
+                               dev->kfd2kgd->ras_process_cb(dev->kgd);
+                       else
+                               amdgpu_amdkfd_gpu_reset(dev->kgd);
                        return;
                }
        } else if (client_id == SOC15_IH_CLIENTID_VMC ||
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index c84bd7b2cf59..9e6525871ad4 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -301,6 +301,7 @@ struct kfd2kgd_calls {
                        int *max_waves_per_cu);
        void (*program_trap_handler_settings)(struct kgd_dev *kgd,
                        uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
+       int (*ras_process_cb)(struct kgd_dev *kgd);
 };
 
 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
-- 
2.17.1

Reply via email to