On 4/30/25 17:48, Alex Deucher wrote:
> If map or unmap fails, or a user fence times out, attempt to
> reset the queue. If that fails, schedule a GPU reset.
>
> Reviewed-by: Sunil Khatri <sunil.kha...@amd.com>
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
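
The escalation policy described above comes down to roughly the following (a condensed, illustrative sketch only, not the literal hunk; every identifier is taken from amdgpu_userq_queue_reset_helper() and amdgpu_userq_gpu_reset() in the patch below, and the debug-mask early-out is omitted for brevity):

        /* Try a per-queue reset first; escalate to a full GPU reset on
         * failure or when the IP provides no per-queue reset callback.
         */
        if (amdgpu_gpu_recovery && userq_funcs->reset) {
                if (userq_funcs->reset(uq_mgr, queue))
                        gpu_reset = true;       /* queue reset failed */
        } else if (amdgpu_gpu_recovery) {
                gpu_reset = true;               /* no reset callback */
        }

        if (gpu_reset)
                amdgpu_userq_gpu_reset(adev);   /* schedules adev->userq_reset_work */
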
Acked-by: Christian König <christian.koe...@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 128 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  4 files changed, 131 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index cc26cf1bd843e..936e1a0ac02f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1251,6 +1251,7 @@ struct amdgpu_device {
>          struct list_head                userq_mgr_list;
>          struct mutex                    userq_mutex;
>          bool                            userq_halt_for_enforce_isolation;
> +        struct work_struct              userq_reset_work;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index fe68ba9997ae4..317c86c1493a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4420,6 +4420,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>          }
>
>          INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +        INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>          adev->gfx.gfx_off_req_count = 1;
>          adev->gfx.gfx_off_residency = 0;
> @@ -5777,6 +5778,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                          if (r)
>                                  goto out;
>
> +                        r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                        if (r)
> +                                goto out;
> +
>                          drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                          /*
> @@ -5999,6 +6004,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>          if (!amdgpu_sriov_vf(adev))
>                  cancel_work(&adev->reset_work);
>  #endif
> +        cancel_work(&adev->userq_reset_work);
>
>          if (adev->kfd.dev)
>                  cancel_work(&adev->kfd.reset_work);
> @@ -6109,6 +6115,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                      amdgpu_device_ip_need_full_reset(tmp_adev))
>                          amdgpu_ras_suspend(tmp_adev);
>
> +                amdgpu_userq_pre_reset(tmp_adev);
> +
>                  for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                          struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index afbe01149ed3f..4be46fa76ceba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -26,7 +26,10 @@
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
>
> +#include <drm/drm_drv.h>
> +
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -44,6 +47,44 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>          return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +
> +        if (amdgpu_device_should_recover_gpu(adev))
> +                amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                             &adev->userq_reset_work);
> +}
> +
> +static bool
> +amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
> +                                struct amdgpu_usermode_queue *queue)
> +{
> +        struct amdgpu_device *adev = uq_mgr->adev;
> +        const struct amdgpu_userq_funcs *userq_funcs =
> +                adev->userq_funcs[queue->queue_type];
> +        bool gpu_reset = false;
> +        int r;
> +
> +        if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +                dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +        } else if (amdgpu_gpu_recovery && userq_funcs->reset) {
> +                r = userq_funcs->reset(uq_mgr, queue);
> +                if (r) {
> +                        dev_err(adev->dev, "userq reset failed\n");
> +                        gpu_reset = true;
> +                } else {
> +                        dev_err(adev->dev, "userq reset succeeded\n");
> +                        atomic_inc(&adev->gpu_reset_counter);
> +                        amdgpu_userq_fence_driver_force_completion(queue);
> +                        drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
> +                }
> +        } else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
> +                gpu_reset = true;
> +        }
> +
> +        return gpu_reset;
> +}
> +
>  static int
>  amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>                            struct amdgpu_usermode_queue *queue)
> @@ -51,15 +92,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>          struct amdgpu_device *adev = uq_mgr->adev;
>          const struct amdgpu_userq_funcs *userq_funcs =
>                  adev->userq_funcs[queue->queue_type];
> +        bool gpu_reset = false;
>          int r = 0;
>
>          if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                  r = userq_funcs->unmap(uq_mgr, queue);
> -                if (r)
> +                if (r) {
>                          queue->state = AMDGPU_USERQ_STATE_HUNG;
> -                else
> +                        gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
> +                } else {
>                          queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +                }
>          }
> +
> +        if (gpu_reset)
> +                amdgpu_userq_gpu_reset(adev);
> +
>          return r;
>  }
>
> @@ -70,16 +118,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>          struct amdgpu_device *adev = uq_mgr->adev;
>          const struct amdgpu_userq_funcs *userq_funcs =
>                  adev->userq_funcs[queue->queue_type];
> +        bool gpu_reset = false;
>          int r = 0;
>
>          if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>                  r = userq_funcs->map(uq_mgr, queue);
>                  if (r) {
>                          queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                        gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
>                  } else {
>                          queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                  }
>          }
> +
> +        if (gpu_reset)
> +                amdgpu_userq_gpu_reset(adev);
> +
>          return r;
>  }
>
> @@ -698,6 +752,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>          return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +        struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                  userq_reset_work);
> +        struct amdgpu_reset_context reset_context;
> +
> +        memset(&reset_context, 0, sizeof(reset_context));
> +
> +        reset_context.method = AMD_RESET_METHOD_NONE;
> +        reset_context.reset_req_dev = adev;
> +        reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +        /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +        amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -724,22 +795,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                     struct amdgpu_eviction_fence *ev_fence)
>  {
> -        int ret;
>          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>          struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +        struct amdgpu_device *adev = uq_mgr->adev;
> +        int ret;
>
>          /* Wait for any pending userqueue fence work to finish */
>          ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -        if (ret) {
> -                DRM_ERROR("Not evicting userqueue, timeout waiting for work\n");
> -                return;
> -        }
> +        if (ret)
> +                dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
>          ret = amdgpu_userq_evict_all(uq_mgr);
> -        if (ret) {
> -                DRM_ERROR("Failed to evict userqueue\n");
> -                return;
> -        }
> +        if (ret)
> +                dev_err(adev->dev, "Failed to evict userqueue\n");
>
>          /* Signal current eviction fence */
>          amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -914,3 +982,41 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>          mutex_unlock(&adev->userq_mutex);
>          return ret;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +        const struct amdgpu_userq_funcs *userq_funcs;
> +        struct amdgpu_usermode_queue *queue;
> +        struct amdgpu_userq_mgr *uqm, *tmp;
> +        int queue_id;
> +
> +        mutex_lock(&adev->userq_mutex);
> +        list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +                cancel_delayed_work_sync(&uqm->resume_work);
> +                mutex_lock(&uqm->userq_mutex);
> +                idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                        if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                                amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                                userq_funcs = adev->userq_funcs[queue->queue_type];
> +                                userq_funcs->unmap(uqm, queue);
> +                                /* just mark all queues as hung at this point.
> +                                 * if unmap succeeds, we could map again
> +                                 * in amdgpu_userq_post_reset() if vram is not lost
> +                                 */
> +                                queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                                amdgpu_userq_fence_driver_force_completion(queue);
> +                        }
> +                }
> +                mutex_unlock(&uqm->userq_mutex);
> +        }
> +        mutex_unlock(&adev->userq_mutex);
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +        /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +         * at this point, we should be able to map it again
> +         * and continue if vram is not lost.
> +         */
> +        return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 24d201cdc9887..6ede08dd821d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -132,4 +132,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                     u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  #endif
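
One purely illustrative thought, not a blocker for the ack: going by the comment in the amdgpu_userq_post_reset() stub, I would expect it to eventually grow into something along these lines. Untested sketch, assuming the manager list/IDR walk and the static map helper keep their current shape; the identifiers are the ones used in the patch, only the control flow here is my guess:

        int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
        {
                struct amdgpu_usermode_queue *queue;
                struct amdgpu_userq_mgr *uqm, *tmp;
                int queue_id, r = 0;

                /* Nothing can be safely restored if the queue contents are gone. */
                if (vram_lost)
                        return 0;

                mutex_lock(&adev->userq_mutex);
                list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
                        mutex_lock(&uqm->userq_mutex);
                        idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
                                /* pre-reset parks queues as HUNG; whichever state
                                 * ends up meaning "safe to map again" is what would
                                 * be checked here before re-mapping.
                                 */
                                if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED)
                                        r = amdgpu_userq_map_helper(uqm, queue);
                        }
                        mutex_unlock(&uqm->userq_mutex);
                }
                mutex_unlock(&adev->userq_mutex);

                return r;
        }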