On 2/10/2025 1:01 PM, jesse.zh...@amd.com wrote:
> From: "jesse.zh...@amd.com" <jesse.zh...@amd.com>
> 
> This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
> The implementation includes the following key changes:
> 
> 1. Added `amdgpu_sdma_reset_queue`:
>    - Resets a specific SDMA queue by instance ID.
>    - Invokes registered pre-reset and post-reset callbacks to allow KFD and 
> AMDGPU
>      to save/restore their state during the reset process.
> 
> 2. Added `amdgpu_set_on_reset_callbacks`:
>    - Allows KFD and AMDGPU to register callback functions for pre-reset and
>      post-reset operations.
>    - Callbacks are stored in a global linked list and invoked in the correct 
> order
>      during SDMA reset.
> 
> This patch ensures that both AMDGPU and KFD can handle SDMA reset events
> gracefully, with proper state saving and restoration. It also provides a 
> flexible
> callback mechanism for future extensions.
> 
> v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
> v3: rename the `amdgpu_register_on_reset_callbacks` function to
>       `amdgpu_sdma_register_on_reset_callbacks`
>     move global reset_callback_list to struct amdgpu_sdma (Alex)
> 
> Suggested-by: Alex Deucher <alexander.deuc...@amd.com>
> Suggested-by: Jiadong Zhu <jiadong....@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 ++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
>  3 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 174badca27e7..19c8be7d72e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct 
> amdgpu_device *adev)
>                       device_remove_file(adev->dev, 
> &dev_attr_sdma_reset_mask);
>       }
>  }
> +
> +/**
> + * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
> + * @funcs: Pointer to the callback structure containing pre_reset and 
> post_reset functions
> + *
> + * This function allows KFD and AMDGPU to register their own callbacks for 
> handling
> + * pre-reset and post-reset operations. The callbacks are added to a global 
> list.
> + */
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
> struct sdma_on_reset_funcs *funcs)

Maybe simpler to keep it sdma_register_reset_callbacks?
> +{
> +     if (!funcs)
> +             return;
> +
> +     /* Initialize the list node in the callback structure */
> +     INIT_LIST_HEAD(&funcs->list);
> +
> +     /* Add the callback structure to the global list */
> +     list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
> +}
> +
> +/**
> + * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
> + * @adev: Pointer to the AMDGPU device
> + * @instance_id: ID of the SDMA engine instance to reset
> + *
> + * This function performs the following steps:
> + * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to 
> save their state.
> + * 2. Resets the specified SDMA engine instance.
> + * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
> restore their state.
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
> instance_id)
> +{
> +     struct sdma_on_reset_funcs *funcs;
> +     int ret;
> +
> +     /* Invoke all registered pre_reset callbacks */
> +     list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +             if (funcs->pre_reset) {
> +                     ret = funcs->pre_reset(adev, instance_id);
> +                     if (ret) {
> +                             dev_err(adev->dev,
> +                             "beforeReset callback failed for instance %u: 
> %d\n",
> +                                     instance_id, ret);

Please add more context like "SDMA pre-reset failed" etc. ('SDMA' needs
to be there). Same for the message down also.

Thanks,
Lijo

> +                             return ret;
> +                     }
> +             }
> +     }
> +
> +     /* Perform the SDMA reset for the specified instance */
> +     ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> +     if (ret) {
> +             dev_err(adev->dev, "Failed to reset SDMA instance %u\n", 
> instance_id);
> +             return ret;
> +     }
> +
> +     /* Invoke all registered post_reset callbacks */
> +     list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +             if (funcs->post_reset) {
> +                     ret = funcs->post_reset(adev, instance_id);
> +                     if (ret) {
> +                             dev_err(adev->dev,
> +                             "afterReset callback failed for instance %u: 
> %d\n",
> +                                     instance_id, ret);
> +                             return ret;
> +                     }
> +             }
> +     }
> +
> +     return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 5f60736051d1..fbb8b04ef2cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -98,6 +98,13 @@ struct amdgpu_sdma_ras {
>       struct amdgpu_ras_block_object ras_block;
>  };
>  
> +struct sdma_on_reset_funcs {
> +     int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +     int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +     /* Linked list node to store this structure in a list; */
> +     struct list_head list;
> +};
> +
>  struct amdgpu_sdma {
>       struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
>       struct amdgpu_irq_src   trap_irq;
> @@ -118,6 +125,7 @@ struct amdgpu_sdma {
>       struct amdgpu_sdma_ras  *ras;
>       uint32_t                *ip_dump;
>       uint32_t                supported_reset;
> +     struct list_head        reset_callback_list;
>  };
>  
>  /*
> @@ -157,6 +165,9 @@ struct amdgpu_buffer_funcs {
>                                uint32_t byte_count);
>  };
>  
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, 
> struct sdma_on_reset_funcs *funcs);
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t 
> instance_id);
> +
>  #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) 
> (adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
>  #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) 
> (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 5e0066cd6c51..64c163dd708f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1477,7 +1477,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block 
> *ip_block)
>       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>       if (r)
>               return r;
> -
> +     INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
>       return r;
>  }
>  

Reply via email to