Do RAS page retirement and use GPU reset as a fallback in the UTCL2
fault handler.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index f7def0bf0730..3991f71d865b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -93,11 +93,12 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_dev *dev,
                                const uint32_t *ih_ring_entry)
 {
-       uint16_t source_id, pasid;
+       uint16_t source_id, client_id, pasid;
        int ret = -EINVAL;
        struct kfd_process *p;
 
        source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+       client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
        pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 
        p = kfd_lookup_process_by_pasid(pasid);
@@ -110,6 +111,7 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
                return;
        }
 
+       pr_debug("RAS poison consumption handling\n");
        atomic_set(&p->poison, 1);
        kfd_unref_process(p);
 
@@ -119,10 +121,14 @@ static void event_interrupt_poison_consumption(struct kfd_dev *dev,
                break;
        case SOC15_INTSRC_SDMA_ECC:
        default:
+               if (client_id == SOC15_IH_CLIENTID_UTCL2)
+                       ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                break;
        }
 
-       kfd_signal_poison_consumed_event(dev, pasid);
+       /* utcl2 page fault has its own vm fault event */
+       if (client_id != SOC15_IH_CLIENTID_UTCL2)
+               kfd_signal_poison_consumed_event(dev, pasid);
 
        /* resetting queue passes, do page retirement without gpu reset
         * resetting queue fails, fallback to gpu reset solution
@@ -314,7 +320,18 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                info.prot_write = ring_id & 0x20;
 
                kfd_smi_event_update_vmfault(dev, pasid);
-               kfd_dqm_evict_pasid(dev->dqm, pasid);
+
+               if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+                   dev->kfd2kgd->is_ras_utcl2_poison &&
+                   dev->kfd2kgd->is_ras_utcl2_poison(dev->adev, client_id)) {
+                       event_interrupt_poison_consumption(dev, ih_ring_entry);
+
+                       if (dev->kfd2kgd->utcl2_fault_clear)
+                               dev->kfd2kgd->utcl2_fault_clear(dev->adev);
+               }
+               else
+                       kfd_dqm_evict_pasid(dev->dqm, pasid);
+
                kfd_signal_vm_fault_event(dev, pasid, &info);
        }
 }
-- 
2.35.1

Reply via email to