Perform queue reset for SDMA poison consumption.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  9 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 53 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  4 +++
 4 files changed, 69 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6c47f7d9adcd..085bff11319a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -741,6 +741,15 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 	amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }
 
+/* Thin amdkfd wrapper around amdgpu_ras_poison_queue_reset(). */
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+	enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+	uint16_t node_id)
+{
+	return amdgpu_ras_poison_queue_reset(adev, block, client_id, vmid,
+					     node_id);
+}
+
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
 					uint32_t *payload)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3fa951ede37c..f1680027399e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,6 +404,9 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
 			enum amdgpu_ras_block block, uint16_t pasid,
 			pasid_notify pasid_fn, void *data, uint32_t reset);
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint16_t client_id,
+			uint16_t vmid, uint16_t node_id);
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6565dc7ff9cd..7e63c2fc1a62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5311,3 +5311,56 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
 
 	return con->is_rma;
 }
+
+/**
+ * amdgpu_ras_poison_queue_reset - reset the ring tied to a poison consumption
+ * @adev: amdgpu device
+ * @block: RAS block the poison consumption was reported for
+ * @client_id: client id forwarded to the sdma_irq_id_to_seq() lookup
+ * @vmid: VMID handed to amdgpu_ring_reset()
+ * @node_id: node id forwarded to the sdma_irq_id_to_seq() lookup
+ *
+ * Only AMDGPU_RAS_BLOCK__SDMA is handled, and only when the SDMA code
+ * provides sdma_irq_id_to_seq() and the selected ring has a reset callback.
+ * This function only logs a failed reset; any fallback (the log message
+ * mentions a GPU reset) is left to the caller.
+ *
+ * Return: 0 on success, -EINVAL if no queue reset can be attempted, or the
+ * negative value returned by the instance lookup or by amdgpu_ring_reset().
+ */
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+		uint16_t node_id)
+{
+	struct amdgpu_ring *ring;
+	int sdma_inst, ret;
+
+	if (block != AMDGPU_RAS_BLOCK__SDMA)
+		return -EINVAL;
+
+	/* Guard the funcs table itself before looking up the hook. */
+	if (!adev->sdma.instance[0].funcs ||
+	    !adev->sdma.instance[0].funcs->sdma_irq_id_to_seq)
+		return -EINVAL;
+
+	sdma_inst = adev->sdma.instance[0].funcs->sdma_irq_id_to_seq(adev,
+							client_id, node_id);
+	if (sdma_inst < 0)
+		return sdma_inst;
+
+	/* Never index past the SDMA instances this device actually has. */
+	if (sdma_inst >= adev->sdma.num_instances)
+		return -EINVAL;
+
+	ring = &adev->sdma.instance[sdma_inst].ring;
+	if (!ring->funcs->reset)
+		return -EINVAL;
+
+	ret = amdgpu_ring_reset(ring, vmid);
+	if (ret)
+		dev_warn(adev->dev,
+			 "queue reset failed in block%d (ret %d), fallback to gpu reset\n",
+			 block, ret);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..debc07767b5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -984,4 +984,8 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
 				const char *fmt, ...);
 
 bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+		uint16_t node_id);
 #endif
-- 
2.34.1