[AMD Official Use Only - General] ----------------- Best Regards, Thomas
-----Original Message----- From: Zhang, Hawking <hawking.zh...@amd.com> Sent: Thursday, April 25, 2024 11:01 AM To: Chai, Thomas <yipeng.c...@amd.com>; amd-gfx@lists.freedesktop.org Cc: Zhou1, Tao <tao.zh...@amd.com>; Li, Candice <candice...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>; Yang, Stanley <stanley.y...@amd.com> Subject: RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption [AMD Official Use Only - General] +void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t +reset); > So we ultimately switch to above poison consumption handler for all the > existing v9 adapters, right? If so, we shall be able to make this function > backwards compatible. I'm wondering if we can just change the existing > amdgpu_amdkfd_ras_poison_consumption_handler. > Pasid_poison_consumption_handler is a little bit confusing. [Thomas] No, Only UMC_HWIP greater or equal to IP_VERSION(12, 0, 0)), it works on the new path. The IP check is in amdgpu_umc_pasid_poison_handler function. Regards, Hawking -----Original Message----- From: Chai, Thomas <yipeng.c...@amd.com> Sent: Thursday, April 18, 2024 10:59 To: amd-gfx@lists.freedesktop.org Cc: Chai, Thomas <yipeng.c...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Li, Candice <candice...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; Chai, Thomas <yipeng.c...@amd.com> Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption Prepare to handle pasid poison consumption. Signed-off-by: YiPeng Chai <yipeng.c...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 20 ++++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +++ .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 3 ++- 5 files changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 66753940bb4d..287ce431901c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) return amdgpu_ras_get_fed_status(adev); } +void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t reset) { + amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, +data, reset); } + void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint32_t reset) { - amdgpu_umc_poison_handler(adev, block, reset); + amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, + reset); } int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index ad50c7bbc326..54e15994d02b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint32_t reset); + +void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t +reset); + bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index dcda3d24bee3..8ebbca9e2e22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, return 0; } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, uint32_t reset) +int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t +reset) { int ret = AMDGPU_RAS_SUCCESS; @@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, amdgpu_ras_error_data_fini(&err_data); } else { - if (reset) { - amdgpu_umc_bad_page_polling_timeout(adev, - reset, MAX_UMC_POISON_POLLING_TIME_SYNC); - } else { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + amdgpu_ras_put_poison_req(adev, + block, pasid, pasid_fn, data, + reset); + atomic_inc(&con->page_retirement_req_cnt); wake_up(&con->page_retirement_wq); - } } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) @@ -313,6 +312,13 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, return ret; } +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint32_t reset) { + return amdgpu_umc_pasid_poison_handler(adev, + block, 0, NULL, NULL, reset); } + int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 9e77e6d48e3b..5f50c69c3cec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -106,6 +106,9 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint32_t reset); +int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t +reset); int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index c368c70df3f4..6bf4bbc3cffa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -201,7 +201,8 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, "RAS poison consumption, fall back to gpu reset flow: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); + amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev, + block, pasid, NULL, NULL, reset); } static bool context_id_expected(struct kfd_dev *dev) -- 2.34.1