On 2/21/2025 4:15 PM, jesse.zh...@amd.com wrote:
> From: "jesse.zh...@amd.com" <jesse.zh...@amd.com>
>
> - Add an `sdma_v4_4_2_update_reset_mask` function to update the reset mask.
> - Move the sysfs reset mask update to the `late_init` stage to ensure that
>   SMU initialization and capability setup are completed before checking the
>   SDMA reset capability.
> - For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware
>   version is at least 0xb0 and PMFW supports queue reset.
> - Add a TODO comment for future support of per-queue reset for IP version
>   9.4.5.
>
> This change ensures that per-queue reset is only enabled when both the MEC
> firmware and PMFW support it.
>
> Suggested-by: Jonathan Kim <jonathan....@amd.com>
> Signed-off-by: Vitaly Prosyak <vitaly.pros...@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 37 +++++++++++++++++++++++-
> 1 file changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 4fa688e00f5e..17e7e36f4477 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -107,6 +107,7 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
> +static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
>
> static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
> u32 instance, u32 offset)
> @@ -1374,6 +1375,12 @@ static int sdma_v4_4_2_late_init(struct amdgpu_ip_block *ip_block)
> if (!amdgpu_persistent_edc_harvesting_supported(adev))
> amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
>
> + /* The initialization is done in the late_init stage to ensure that the SMU
> + * initialization and capability setup are completed before we check the SDMA
> + * reset capability
> + */
> + sdma_v4_4_2_update_reset_mask(adev);
> +
> return 0;
> }
>
> @@ -1481,7 +1488,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> - /* TODO: Add queue reset mask when FW fully supports it */
> adev->sdma.supported_reset =
> amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
>
> @@ -2328,6 +2334,35 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev)
> adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
> }
>
> +/**
> + * sdma_v4_4_2_update_reset_mask - update reset mask for SDMA
> + * @adev: Pointer to the AMDGPU device structure
> + *
> + * This function updates the reset mask for SDMA and sets the supported
> + * reset types based on the IP version and firmware versions.
> + *
> + */
> +static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev)
> +{
> +
> + /*
> + * Resetting an SDMA queue relies on both the MEC firmware and the PMFW,
> + * so check both of them here to skip old MEC and PMFW versions.
> + */
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 4, 3):
> + case IP_VERSION(9, 4, 4):
> + if ((adev->gfx.mec_fw_version >= 0xb0) && amdgpu_dpm_reset_sdma_is_supported(adev))
> + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + break;
> + case IP_VERSION(9, 4, 5):

Did you mean v9.5.0?

Thanks,
Lijo

> + /* TODO: enable per-queue reset once the firmware supports it */
> + default:
> + break;
> + }
> +
> +}
> +
> const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
> .type = AMD_IP_BLOCK_TYPE_SDMA,
> .major = 4,