On Tue, Apr 8, 2025 at 4:47 AM jesse.zh...@amd.com <jesse.zh...@amd.com> wrote:
>
> Since KFD no longer registers its own callbacks for SDMA resets, and only
> KGD uses the reset mechanism, we can simplify the SDMA reset flow by
> directly calling the ring's `stop_queue` and `start_queue` functions.
> This patch removes the dynamic callback mechanism and prepares for its
> eventual deprecation.
>
> 1. **Remove Dynamic Callbacks**:
>    - The `pre_reset` and `post_reset` callback invocations in
>      `amdgpu_sdma_reset_engine` are removed.
>    - Instead, the ring's `stop_queue` and `start_queue` functions are
>      called directly during the reset process.
>
> 2. **Prepare for Deprecation of the Dynamic Mechanism**:
>    - By removing the callback invocations, this patch prepares the
>      codebase for the eventual removal of the dynamic callback
>      registration mechanism.
>
> v2: Update the stop_queue/start_queue function parameters to use a ring
>     pointer instead of device/instance (Christian)
> v3: The general coding style is to declare variables like "i" or "r"
>     last, i.e. longest lines first and shortest last (Christian)
> v4: Move stop_queue/start_queue to struct amdgpu_sdma_instance and
>     rename them (Alex)
>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
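For readers following the thread, the shape of the change can be modeled in a
few lines of stand-alone user-space C (hypothetical names throughout; this is
only a sketch of the dispatch pattern, not driver code): a per-instance const
function-pointer table replaces the dynamically registered callback list.

#include <stdio.h>

struct ring { int me; };

/* Mirrors the new struct amdgpu_sdma_funcs: per-instance hooks */
struct sdma_funcs {
        int (*stop_kernel_queue)(struct ring *ring);
        int (*start_kernel_queue)(struct ring *ring);
};

struct sdma_instance {
        struct ring ring;
        const struct sdma_funcs *funcs; /* set once at init, as sw_init does */
};

static int v4_stop(struct ring *ring)
{
        printf("stop kernel queue, instance %d\n", ring->me);
        return 0;
}

static int v4_start(struct ring *ring)
{
        printf("start kernel queue, instance %d\n", ring->me);
        return 0;
}

static const struct sdma_funcs v4_funcs = {
        .stop_kernel_queue = v4_stop,
        .start_kernel_queue = v4_start,
};

/* Reset path: direct calls through the instance, no callback-list walk */
static int reset_engine(struct sdma_instance *inst)
{
        if (inst->funcs && inst->funcs->stop_kernel_queue)
                inst->funcs->stop_kernel_queue(&inst->ring);
        /* ... the actual engine reset would happen here ... */
        if (inst->funcs && inst->funcs->start_kernel_queue)
                inst->funcs->start_kernel_queue(&inst->ring);
        return 0;
}

int main(void)
{
        struct sdma_instance inst = { .ring = { .me = 0 }, .funcs = &v4_funcs };
        return reset_engine(&inst);
}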
These patches look good, but I would rework the ordering a bit. I'd suggest
splitting them up like so:

1. Add the new sdma function pointers (stop_kernel_queue, start_kernel_queue,
   soft_reset) to amdgpu_sdma.h
2. Register the new sdma function pointers (stop_kernel_queue,
   start_kernel_queue, soft_reset) for each sdma IP version that needs them
3. Switch amdgpu_sdma_reset_engine() to use the new sdma function pointers
4. Optimize the queue reset and stop logic patches
5. Implement soft reset directly for the sdma 5 patches
6. Remove the old sdma reset callback mechanism

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 54 ++++++------------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  7 ++-
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 ++++++----
>  3 files changed, 34 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 0a9893fee828..541b349e0310 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -558,20 +558,14 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
>   * @adev: Pointer to the AMDGPU device
>   * @instance_id: ID of the SDMA engine instance to reset
>   *
> - * This function performs the following steps:
> - * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
> - * 2. Resets the specified SDMA engine instance.
> - * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to restore their state.
> - *
>   * Returns: 0 on success, or a negative error code on failure.
>   */
>  int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
>  {
> -        struct sdma_on_reset_funcs *funcs;
>          int ret = 0;
>          struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
> -        struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
> -        struct amdgpu_ring *page_ring = &sdma_instance->page;
> +        struct amdgpu_ring *sdma_gfx_ring = &sdma_instance->ring;
> +        struct amdgpu_ring *sdma_page_ring = &sdma_instance->page;
>          bool gfx_sched_stopped = false, page_sched_stopped = false;
>
>          mutex_lock(&sdma_instance->engine_reset_mutex);
> @@ -579,28 +573,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
>           * This ensures that no new tasks are submitted to the queues while
>           * the reset is in progress.
>           */
> -        if (!amdgpu_ring_sched_ready(gfx_ring)) {
> -                drm_sched_wqueue_stop(&gfx_ring->sched);
> +        if (!amdgpu_ring_sched_ready(sdma_gfx_ring)) {
> +                drm_sched_wqueue_stop(&sdma_gfx_ring->sched);
>                  gfx_sched_stopped = true;
>          }
>
> -        if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
> -                drm_sched_wqueue_stop(&page_ring->sched);
> +        if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(sdma_page_ring)) {
> +                drm_sched_wqueue_stop(&sdma_page_ring->sched);
>                  page_sched_stopped = true;
>          }
>
> -        /* Invoke all registered pre_reset callbacks */
> -        list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> -                if (funcs->pre_reset) {
> -                        ret = funcs->pre_reset(adev, instance_id);
> -                        if (ret) {
> -                                dev_err(adev->dev,
> -                                        "beforeReset callback failed for instance %u: %d\n",
> -                                        instance_id, ret);
> -                                goto exit;
> -                        }
> -                }
> -        }
> +        if (sdma_instance->funcs->stop_kernel_queue)
> +                sdma_instance->funcs->stop_kernel_queue(sdma_gfx_ring);
>
>          /* Perform the SDMA reset for the specified instance */
>          ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> @@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
>                  goto exit;
>          }
>
> -        /* Invoke all registered post_reset callbacks */
> -        list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> -                if (funcs->post_reset) {
> -                        ret = funcs->post_reset(adev, instance_id);
> -                        if (ret) {
> -                                dev_err(adev->dev,
> -                                        "afterReset callback failed for instance %u: %d\n",
> -                                        instance_id, ret);
> -                                goto exit;
> -                        }
> -                }
> -        }
> +        if (sdma_instance->funcs->start_kernel_queue)
> +                sdma_instance->funcs->start_kernel_queue(sdma_gfx_ring);
>
>  exit:
>          /* Restart the scheduler's work queue for the GFX and page rings
> @@ -628,11 +602,11 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
>           * to be submitted to the queues after the reset is complete.
>           */
>          if (!ret) {
> -                if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
> -                        drm_sched_wqueue_start(&gfx_ring->sched);
> +                if (gfx_sched_stopped && amdgpu_ring_sched_ready(sdma_gfx_ring)) {
> +                        drm_sched_wqueue_start(&sdma_gfx_ring->sched);
>                  }
> -                if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
> -                        drm_sched_wqueue_start(&page_ring->sched);
> +                if (page_sched_stopped && amdgpu_ring_sched_ready(sdma_page_ring)) {
> +                        drm_sched_wqueue_start(&sdma_page_ring->sched);
>                  }
>          }
>          mutex_unlock(&sdma_instance->engine_reset_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 47d56fd0589f..620fd7663526 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -50,6 +50,11 @@ enum amdgpu_sdma_irq {
>
>  #define NUM_SDMA(x) hweight32(x)
>
> +struct amdgpu_sdma_funcs {
> +        int (*stop_kernel_queue)(struct amdgpu_ring *ring);
> +        int (*start_kernel_queue)(struct amdgpu_ring *ring);
> +};
> +
>  struct amdgpu_sdma_instance {
>          /* SDMA firmware */
>          const struct firmware *fw;
> @@ -68,7 +73,7 @@ struct amdgpu_sdma_instance {
>          /* track guilty state of GFX and PAGE queues */
>          bool gfx_guilty;
>          bool page_guilty;
> -
> +        const struct amdgpu_sdma_funcs *funcs;
>  };
>
>  enum amdgpu_sdma_ras_memory_id {
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 688a720bbbbd..c663c63485f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -108,6 +108,8 @@ static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
> +static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring);
> +static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring);
>
>  static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
>                                        u32 instance, u32 offset)
> @@ -1333,6 +1335,11 @@ static bool sdma_v4_4_2_fw_support_paging_queue(struct amdgpu_device *adev)
>          }
>  }
>
> +static const struct amdgpu_sdma_funcs sdma_v4_4_2_sdma_funcs = {
> +        .stop_kernel_queue = &sdma_v4_4_2_stop_queue,
> +        .start_kernel_queue = &sdma_v4_4_2_restore_queue,
> +};
> +
>  static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
>  {
>          struct amdgpu_device *adev = ip_block->adev;
> @@ -1352,7 +1359,6 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
>          sdma_v4_4_2_set_irq_funcs(adev);
>          sdma_v4_4_2_set_ras_funcs(adev);
>          sdma_v4_4_2_set_engine_reset_funcs(adev);
> -
>          return 0;
>  }
>
> @@ -1447,6 +1453,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>                  /* Initialize guilty flags for GFX and PAGE queues */
>                  adev->sdma.instance[i].gfx_guilty = false;
>                  adev->sdma.instance[i].page_guilty = false;
> +                adev->sdma.instance[i].funcs = &sdma_v4_4_2_sdma_funcs;
>
>                  ring = &adev->sdma.instance[i].ring;
>                  ring->ring_obj = NULL;
> @@ -1678,11 +1685,12 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
>          return r;
>  }
>
> -static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)
> +static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
>  {
> +        struct amdgpu_device *adev = ring->adev;
> +        u32 instance_id = GET_INST(SDMA0, ring->me);
>          u32 inst_mask;
>          uint64_t rptr;
> -        struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
>
>          if (amdgpu_sriov_vf(adev))
>                  return -EINVAL;
> @@ -1715,11 +1723,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
>          return 0;
>  }
>
> -static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instance_id)
> +static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
>  {
> -        int i;
> +        struct amdgpu_device *adev = ring->adev;
>          u32 inst_mask;
> -        struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
> +        int i;
>
>          inst_mask = 1 << ring->me;
>          udelay(50);
> @@ -1740,8 +1748,6 @@ static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instan
>  }
>
>  static struct sdma_on_reset_funcs sdma_v4_4_2_engine_reset_funcs = {
> -        .pre_reset = sdma_v4_4_2_stop_queue,
> -        .post_reset = sdma_v4_4_2_restore_queue,
>  };
>
>  static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev)
> --
> 2.25.1
>
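A side note on the v2 change above (ring pointer instead of a device/instance
pair): the pattern is easy to see in isolation. Below is a stand-alone
user-space model (hypothetical names; not driver code) of a hook that takes
only the ring and recovers its device and instance from back-pointers, the way
sdma_v4_4_2_stop_queue() now derives adev and instance_id from ring->adev and
ring->me:

#include <stdio.h>

struct device { int num_instances; };

struct ring {
        struct device *adev;    /* back-pointer, like amdgpu_ring->adev */
        int me;                 /* instance index, like amdgpu_ring->me */
};

/* Old shape: int stop_queue(struct device *adev, unsigned int instance_id);
 * New shape: everything needed comes from the ring itself.
 */
static int stop_queue(struct ring *ring)
{
        struct device *adev = ring->adev;
        unsigned int inst_mask = 1u << ring->me;

        printf("stopping instance %d of %d (mask 0x%x)\n",
               ring->me, adev->num_instances, inst_mask);
        return 0;
}

int main(void)
{
        struct device dev = { .num_instances = 2 };
        struct ring r = { .adev = &dev, .me = 1 };
        return stop_queue(&r);
}

The design payoff is the same in the model as in the patch: callers that
already hold a ring (the common case in the reset path) pass it straight
through, and the hook cannot be handed a mismatched device/instance pair.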