On Wed, Apr 2, 2025 at 5:28 AM jesse.zh...@amd.com <jesse.zh...@amd.com> wrote:
>
> Since KFD no longer registers its own callbacks for SDMA resets, and only KGD 
> uses the reset mechanism,
> we can simplify the SDMA reset flow by directly calling the ring's 
> `stop_queue` and `start_queue` functions.
> This patch removes the dynamic callback mechanism and prepares for its 
> eventual deprecation.
>
> 1. **Remove Dynamic Callbacks**:
>    - The `pre_reset` and `post_reset` callback invocations in 
> `amdgpu_sdma_reset_engine` are removed.
>    - Instead, the ring's `stop_queue` and `start_queue` functions are called 
> directly during the reset process.
>
> 2. **Prepare for Deprecation of Dynamic Mechanism**:
>    - By removing the callback invocations, this patch prepares the codebase 
> for the eventual removal of the dynamic callback registration mechanism.
>
> v2: Update stop_queue/start_queue function paramters to use ring pointer 
> instead of device/instance(Christian)
>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 34 +++---------------------
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 13 ++++-----
>  3 files changed, 13 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 615c3d5c5a8d..23ea221e26de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -237,6 +237,8 @@ struct amdgpu_ring_funcs {
>         void (*patch_ce)(struct amdgpu_ring *ring, unsigned offset);
>         void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
>         int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
> +       int (*stop_queue)(struct amdgpu_ring *ring);
> +       int (*start_queue)(struct amdgpu_ring *ring);

Since this is specific to SDMA, maybe it would be cleaner to add these
to struct amdgpu_sdma_instance.  And if so, maybe rename it since it's
specific to kernel queues.  E.g.,
       int (*stop_kernel_queue)(struct amdgpu_ring *ring);
       int (*start_kernel_queue)(struct amdgpu_ring *ring);
Unless you think we may need it for other engines in the future which
only support engine level soft resets.

Alex


>         void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
>         bool (*is_guilty)(struct amdgpu_ring *ring);
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 0a9893fee828..b51fe644940f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -558,16 +558,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct 
> amdgpu_device *adev, struct
>   * @adev: Pointer to the AMDGPU device
>   * @instance_id: ID of the SDMA engine instance to reset
>   *
> - * This function performs the following steps:
> - * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to 
> save their state.
> - * 2. Resets the specified SDMA engine instance.
> - * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to 
> restore their state.
> - *
>   * Returns: 0 on success, or a negative error code on failure.
>   */
>  int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t 
> instance_id)
>  {
> -       struct sdma_on_reset_funcs *funcs;
>         int ret = 0;
>         struct amdgpu_sdma_instance *sdma_instance = 
> &adev->sdma.instance[instance_id];
>         struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
> @@ -589,18 +583,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
> uint32_t instance_id)
>                 page_sched_stopped = true;
>         }
>
> -       /* Invoke all registered pre_reset callbacks */
> -       list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> -               if (funcs->pre_reset) {
> -                       ret = funcs->pre_reset(adev, instance_id);
> -                       if (ret) {
> -                               dev_err(adev->dev,
> -                               "beforeReset callback failed for instance %u: 
> %d\n",
> -                                       instance_id, ret);
> -                               goto exit;
> -                       }
> -               }
> -       }
> +       if (gfx_ring->funcs->stop_queue)
> +               gfx_ring->funcs->stop_queue(gfx_ring);
>
>         /* Perform the SDMA reset for the specified instance */
>         ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> @@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, 
> uint32_t instance_id)
>                 goto exit;
>         }
>
> -       /* Invoke all registered post_reset callbacks */
> -       list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> -               if (funcs->post_reset) {
> -                       ret = funcs->post_reset(adev, instance_id);
> -                       if (ret) {
> -                               dev_err(adev->dev,
> -                               "afterReset callback failed for instance %u: 
> %d\n",
> -                                       instance_id, ret);
> -                               goto exit;
> -                       }
> -               }
> -       }
> +       if (gfx_ring->funcs->start_queue)
> +               gfx_ring->funcs->start_queue(gfx_ring);
>
>  exit:
>         /* Restart the scheduler's work queue for the GFX and page rings
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 688a720bbbbd..a8330504692d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1678,11 +1678,12 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
> *ring, unsigned int vmid)
>         return r;
>  }
>
> -static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t 
> instance_id)
> +static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
>  {
>         u32 inst_mask;
>         uint64_t rptr;
> -       struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
> +       struct amdgpu_device *adev = ring->adev;
> +       u32 instance_id = GET_INST(SDMA0, ring->me);
>
>         if (amdgpu_sriov_vf(adev))
>                 return -EINVAL;
> @@ -1715,11 +1716,11 @@ static int sdma_v4_4_2_stop_queue(struct 
> amdgpu_device *adev, uint32_t instance_
>         return 0;
>  }
>
> -static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t 
> instance_id)
> +static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
>  {
>         int i;
>         u32 inst_mask;
> -       struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
> +       struct amdgpu_device *adev = ring->adev;
>
>         inst_mask = 1 << ring->me;
>         udelay(50);
> @@ -1740,8 +1741,6 @@ static int sdma_v4_4_2_restore_queue(struct 
> amdgpu_device *adev, uint32_t instan
>  }
>
>  static struct sdma_on_reset_funcs sdma_v4_4_2_engine_reset_funcs = {
> -       .pre_reset = sdma_v4_4_2_stop_queue,
> -       .post_reset = sdma_v4_4_2_restore_queue,
>  };
>
>  static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev)
> @@ -2143,6 +2142,8 @@ static const struct amdgpu_ring_funcs 
> sdma_v4_4_2_ring_funcs = {
>         .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
>         .reset = sdma_v4_4_2_reset_queue,
> +       .stop_queue = sdma_v4_4_2_stop_queue,
> +       .start_queue = sdma_v4_4_2_restore_queue,
>         .is_guilty = sdma_v4_4_2_ring_is_guilty,
>  };
>
> --
> 2.25.1
>

Reply via email to