If a fatal error is detected, packet submission won't go through, so
return an error in such cases. Also, avoid waiting for the fence when a
fatal error is detected.

Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c            |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h            |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         |  8 +++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h         |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c       | 10 +++++-----
 6 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 190039f14c30..f5f2945711be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -742,6 +742,11 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device 
*adev)
        amdgpu_device_flush_hdp(adev, NULL);
 }
 
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
+{
+       return amdgpu_ras_get_fed_status(adev);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
        enum amdgpu_ras_block block, bool reset)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e60f63ccf79a..4fb32d86cd0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -337,6 +337,7 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
                        enum amdgpu_ras_block block, bool reset);
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c0e71543389a..f4d395e38683 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1903,6 +1903,10 @@ int amdkfd_fence_wait_timeout(struct 
device_queue_manager *dqm,
        uint64_t *fence_addr =  dqm->fence_addr;
 
        while (*fence_addr != fence_value) {
+               /* Fatal err detected, this response won't come */
+               if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+                       return -EIO;
+
                if (time_after(jiffies, end_jiffies)) {
                        dev_err(dev, "qcm fence wait loop timeout expired\n");
                        /* In HWS case, this is used to halt the driver thread
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 1bea629c49ca..32c926986dbb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -286,7 +286,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
        return -ENOMEM;
 }
 
-void kq_submit_packet(struct kernel_queue *kq)
+int kq_submit_packet(struct kernel_queue *kq)
 {
 #ifdef DEBUG
        int i;
@@ -298,6 +298,10 @@ void kq_submit_packet(struct kernel_queue *kq)
        }
        pr_debug("\n");
 #endif
+       /* Fatal err detected, packet submission won't go through */
+       if (amdgpu_amdkfd_is_fed(kq->dev->adev))
+               return -EIO;
+
        if (kq->dev->kfd->device_info.doorbell_size == 8) {
                *kq->wptr64_kernel = kq->pending_wptr64;
                write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
@@ -307,6 +311,8 @@ void kq_submit_packet(struct kernel_queue *kq)
                write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
                                        kq->pending_wptr);
        }
+
+       return 0;
 }
 
 void kq_rollback_packet(struct kernel_queue *kq)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index 9a6244430845..e24ee50acdf0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -47,7 +47,7 @@
 int kq_acquire_packet_buffer(struct kernel_queue *kq,
                                size_t packet_size_in_dwords,
                                unsigned int **buffer_ptr);
-void kq_submit_packet(struct kernel_queue *kq);
+int kq_submit_packet(struct kernel_queue *kq);
 void kq_rollback_packet(struct kernel_queue *kq);
 
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 401096c103b2..d6f65f39072b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -288,7 +288,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 
        retval = pm->pmf->set_resources(pm, buffer, res);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -325,7 +325,7 @@ int pm_send_runlist(struct packet_manager *pm, struct 
list_head *dqm_queues)
        if (retval)
                goto fail_create_runlist;
 
-       kq_submit_packet(pm->priv_queue);
+       retval = kq_submit_packet(pm->priv_queue);
 
        mutex_unlock(&pm->lock);
 
@@ -361,7 +361,7 @@ int pm_send_query_status(struct packet_manager *pm, 
uint64_t fence_address,
 
        retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
@@ -392,7 +392,7 @@ int pm_update_grace_period(struct packet_manager *pm, 
uint32_t grace_period)
 
                retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
                if (!retval)
-                       kq_submit_packet(pm->priv_queue);
+                       retval = kq_submit_packet(pm->priv_queue);
                else
                        kq_rollback_packet(pm->priv_queue);
        }
@@ -421,7 +421,7 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
        retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
        if (!retval)
-               kq_submit_packet(pm->priv_queue);
+               retval = kq_submit_packet(pm->priv_queue);
        else
                kq_rollback_packet(pm->priv_queue);
 
-- 
2.25.1

Reply via email to