On Fri, Oct 24, 2025 at 6:01 AM Jesse.Zhang <[email protected]> wrote:
>
> From: Alex Deucher <[email protected]>
>
> This patch adds robust reset handling for user queues (userq) to improve
> recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to a full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds a userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with the existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with a HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to the device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both gfx/compute and sdma queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: add detection and reset calls when preemption/unmap fails.
>     add a per-device userq counter for each user queue type. (Alex)
> v3: make sure we hold the adev->userq_mutex when we call
>     amdgpu_userq_detect_and_reset_queues. (Alex)
>     warn if the adev->userq_mutex is not held.
> v4: make sure we have all of the uqm->userq_mutex locks held.
>     warn if the uqm->userq_mutex is not held.
>
> v5: use an array for the user queue type counters. (Alex)
>     all of the uqm->userq_mutex locks need to be held when calling
>     detect and reset. (Alex)
>
> v6: fix a lockdep warning in amdgpu_userq_fence_driver_process
>
> v7: put the queue types in an array and use a loop in
>     amdgpu_userq_detect_and_reset_queues (Lijo)
> v8: remove atomic_set(&userq_mgr->userq_count[i], 0);
>     it should already be 0 since we kzalloc the structure (Alex)
>
> Signed-off-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
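A note for readers following the series: the heart of the patch is the
per-type dispatch in amdgpu_userq_detect_and_reset_queues(). It walks a
fixed array of ring types, skips any type with no live user queues (via
the per-manager atomic counters added in v2/v5), invokes the per-IP
detect_and_reset hook, and escalates to a full GPU reset only when a
targeted reset fails. A minimal standalone sketch of that pattern, with
simplified stand-in names (ring_type, queue_funcs, and friends are not
the driver's types):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdio.h>

  enum ring_type { RING_GFX, RING_COMPUTE, RING_SDMA, RING_MAX };

  struct queue_funcs {
          /* returns 0 on success, nonzero if the targeted reset failed */
          int (*detect_and_reset)(enum ring_type type);
  };

  /* one live-queue counter per ring type, like the counters the patch
   * adds to struct amdgpu_userq_mgr */
  static atomic_int userq_count[RING_MAX];
  static const struct queue_funcs *funcs_tbl[RING_MAX];

  static int sdma_detect_and_reset(enum ring_type type)
  {
          printf("checking user queues of type %d\n", type);
          return 0;       /* pretend every queue is healthy */
  }

  static const struct queue_funcs sdma_funcs = {
          .detect_and_reset = sdma_detect_and_reset,
  };

  /* walk the types in a fixed order; escalate to a full reset only
   * when a per-queue reset fails */
  static bool detect_and_reset_queues(void)
  {
          static const enum ring_type types[] = {
                  RING_COMPUTE, RING_GFX, RING_SDMA
          };

          for (size_t i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
                  enum ring_type t = types[i];
                  const struct queue_funcs *f = funcs_tbl[t];

                  /* skip types with no live queues or no handler */
                  if (atomic_load(&userq_count[t]) > 0 &&
                      f && f->detect_and_reset && f->detect_and_reset(t))
                          return true;    /* caller schedules a full reset */
          }
          return false;
  }

  int main(void)
  {
          funcs_tbl[RING_SDMA] = &sdma_funcs;
          atomic_fetch_add(&userq_count[RING_SDMA], 1);
          printf("full reset needed: %d\n", detect_and_reset_queues());
          return 0;
  }

Checking the cheap atomic counter before the indirect call keeps the
common no-queues case nearly free, which matters now that the helper
runs in the preempt/unmap/map paths.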
Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 176 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
>  5 files changed, 179 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 50079209c472..d0fb4eb1d7c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1316,6 +1316,7 @@ struct amdgpu_device {
>         bool                            apu_prefer_gtt;
>
>         bool                            userq_halt_for_enforce_isolation;
> +       struct work_struct              userq_reset_work;
>         struct amdgpu_uid *uid_info;
>
>         /* KFD
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b8d91247f51a..8480b72258f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4608,6 +4608,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         }
>
>         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>         adev->gfx.gfx_off_req_count = 1;
>         adev->gfx.gfx_off_residency = 0;
> @@ -5990,6 +5991,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                         if (r)
>                                 goto out;
>
> +                       r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                       if (r)
> +                               goto out;
> +
>                         drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                         /*
> @@ -6212,6 +6217,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>         if (!amdgpu_sriov_vf(adev))
>                 cancel_work(&adev->reset_work);
>  #endif
> +       cancel_work(&adev->userq_reset_work);
>
>         if (adev->kfd.dev)
>                 cancel_work(&adev->kfd.reset_work);
> @@ -6332,6 +6338,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                     amdgpu_device_ip_need_full_reset(tmp_adev))
>                         amdgpu_ras_suspend(tmp_adev);
>
> +               amdgpu_userq_pre_reset(tmp_adev);
> +
>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 87b962df5460..7a27c6c4bb44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -83,6 +83,7 @@ enum amdgpu_ring_type {
>         AMDGPU_RING_TYPE_MES,
>         AMDGPU_RING_TYPE_UMSCH_MM,
>         AMDGPU_RING_TYPE_CPER,
> +       AMDGPU_RING_TYPE_MAX,
>  };
>
>  enum amdgpu_ib_pool_type {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index c50b162e80a7..188de848c229 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
>  #include <drm/drm_auth.h>
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_hmm.h"
> @@ -45,6 +47,69 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +       if (amdgpu_device_should_recover_gpu(adev)) {
> +               amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                            &adev->userq_reset_work);
> +               /* Wait for the reset job to complete */
> +               flush_work(&adev->userq_reset_work);
> +       }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const int queue_types[] = {
> +               AMDGPU_RING_TYPE_COMPUTE,
> +               AMDGPU_RING_TYPE_GFX,
> +               AMDGPU_RING_TYPE_SDMA
> +       };
> +       const int num_queue_types = ARRAY_SIZE(queue_types);
> +       bool gpu_reset = false;
> +       int r = 0;
> +       int i;
> +
> +       /* Warn if the current process mutex is not held */
> +       WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> +       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +               dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +               return 0;
> +       }
> +
> +       /*
> +        * If the GPU recovery feature is disabled system-wide,
> +        * skip all reset detection logic
> +        */
> +       if (!amdgpu_gpu_recovery)
> +               return 0;
> +
> +       /*
> +        * Iterate through all queue types to detect and reset problematic
> +        * queues, processing each queue type in the defined order
> +        */
> +       for (i = 0; i < num_queue_types; i++) {
> +               int ring_type = queue_types[i];
> +               const struct amdgpu_userq_funcs *funcs =
> +                       adev->userq_funcs[ring_type];
> +
> +               if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> +                   funcs && funcs->detect_and_reset) {
> +                       r = funcs->detect_and_reset(adev, ring_type);
> +                       if (r) {
> +                               gpu_reset = true;
> +                               break;
> +                       }
> +               }
> +       }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}
> +
>  static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
>                                            struct amdgpu_bo_va_mapping *va_map, u64 addr)
>  {
> @@ -175,17 +240,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                 r = userq_funcs->preempt(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       found_hung_queue = true;
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
>                 }
>         }
>
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -217,16 +287,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         struct amdgpu_device *adev = uq_mgr->adev;
>         const struct amdgpu_userq_funcs *userq_funcs =
>                 adev->userq_funcs[queue->queue_type];
> +       bool found_hung_queue = false;
>         int r = 0;
>
>         if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
>             (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
> -               if (r)
> +               if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> -               else
> +                       found_hung_queue = true;
> +               } else {
>                         queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +               }
>         }
> +
> +       if (found_hung_queue)
> +               amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>         return r;
>  }
>
> @@ -243,10 +320,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>                 r = userq_funcs->map(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>                 } else {
>                         queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                 }
>         }
> +
>         return r;
>  }
>
> @@ -474,10 +553,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                 amdgpu_bo_unreserve(queue->db_obj.obj);
>         }
>         amdgpu_bo_unref(&queue->db_obj.obj);
> -
> +       atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
>  #if defined(CONFIG_DEBUG_FS)
>         debugfs_remove_recursive(queue->debugfs_queue);
>  #endif
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         r = amdgpu_userq_unmap_helper(uq_mgr, queue);
>         /*TODO: It requires a reset for userq hw unmap error*/
>         if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -699,6 +779,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>         kfree(queue_name);
>
>         args->out.queue_id = qid;
> +       atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
>
>  unlock:
>         mutex_unlock(&uq_mgr->userq_mutex);
> @@ -1043,6 +1124,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         unsigned long queue_id;
>         int ret = 0, r;
>
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         /* Try to unmap all the queues in this process ctx */
>         xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) {
>                 r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> @@ -1055,6 +1137,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                 userq_reset_work);
> +       struct amdgpu_reset_context reset_context;
> +
> +       memset(&reset_context, 0, sizeof(reset_context));
> +
> +       reset_context.method = AMD_RESET_METHOD_NONE;
> +       reset_context.reset_req_dev = adev;
> +       reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -1082,22 +1181,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                    struct amdgpu_eviction_fence *ev_fence)
>  {
> -       int ret;
>         struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>         struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       int ret;
>
>         /* Wait for any pending userqueue fence work to finish */
>         ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
>         ret = amdgpu_userq_evict_all(uq_mgr);
> -       if (ret) {
> -               drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -               return;
> -       }
> +       if (ret)
> +               dev_err(adev->dev, "Failed to evict userqueue\n");
>
>         /* Signal current eviction fence */
>         amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -1131,6 +1227,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>         cancel_delayed_work_sync(&userq_mgr->resume_work);
>
>         mutex_lock(&userq_mgr->userq_mutex);
> +       amdgpu_userq_detect_and_reset_queues(userq_mgr);
>         xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) {
>                 amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -1157,6 +1254,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>                 uqm = queue->userq_mgr;
>                 cancel_delayed_work_sync(&uqm->resume_work);
>                 guard(mutex)(&uqm->userq_mutex);
> +               amdgpu_userq_detect_and_reset_queues(uqm);
>                 if (adev->in_s0ix)
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                 else
> @@ -1215,6 +1313,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                 if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                      (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
>                     (queue->xcp_id == idx)) {
> +                       amdgpu_userq_detect_and_reset_queues(uqm);
>                         r = amdgpu_userq_preempt_helper(uqm, queue);
>                         if (r)
>                                 ret = r;
> @@ -1287,3 +1386,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
>
>         return 0;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       struct amdgpu_usermode_queue *queue;
> +       struct amdgpu_userq_mgr *uqm;
> +       unsigned long queue_id;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               cancel_delayed_work_sync(&uqm->resume_work);
> +               if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                       amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       userq_funcs->unmap(uqm, queue);
> +                       /* just mark all queues as hung at this point.
> +                        * if unmap succeeds, we could map again
> +                        * in amdgpu_userq_post_reset() if vram is not lost
> +                        */
> +                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       amdgpu_userq_fence_driver_force_completion(queue);
> +               }
> +       }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +       /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +        * at this point, we should be able to map it again
> +        * and continue if vram is not lost.
> +        */
> +       struct amdgpu_userq_mgr *uqm;
> +       struct amdgpu_usermode_queue *queue;
> +       const struct amdgpu_userq_funcs *userq_funcs;
> +       unsigned long queue_id;
> +       int r = 0;
> +
> +       xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
> +               uqm = queue->userq_mgr;
> +               if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> +                       userq_funcs = adev->userq_funcs[queue->queue_type];
> +                       /* Re-map queue */
> +                       r = userq_funcs->map(uqm, queue);
> +                       if (r) {
> +                               dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
> +                               continue;
> +                       }
> +                       queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +               }
> +       }
> +
> +       return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 09da0617bfa2..c37444427a14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -106,6 +106,7 @@ struct amdgpu_userq_mgr {
>         struct amdgpu_device            *adev;
>         struct delayed_work             resume_work;
>         struct drm_file                 *file;
> +       atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>  };
>
>  struct amdgpu_db_info {
> @@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                   u32 idx);
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                    u32 idx);
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
>                                    u64 addr, u64 expected_size);
>  int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
> --
> 2.49.0
>
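Two closing notes on the new code, for anyone studying it.

First, the userq_reset_work flow: amdgpu_userq_gpu_reset() schedules the
work item on the reset domain and then flush_work()s it, so the
detect-and-reset path does not return until recovery has actually run.
The same schedule-then-flush shape, sketched as a userspace pthread
analogue (stand-in names, not the kernel workqueue API):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static int work_pending;

  static void do_gpu_recover(void)
  {
          printf("running full GPU recovery\n");
  }

  /* the "reset domain" worker: runs queued recovery work once */
  static void *reset_worker(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&lock);
          while (!work_pending)
                  pthread_cond_wait(&cond, &lock);
          pthread_mutex_unlock(&lock);

          do_gpu_recover();

          pthread_mutex_lock(&lock);
          work_pending = 0;               /* work item is now complete */
          pthread_cond_broadcast(&cond);  /* wake anyone flushing */
          pthread_mutex_unlock(&lock);
          return NULL;
  }

  /* like amdgpu_reset_domain_schedule(): queue the work item */
  static void schedule_reset_work(void)
  {
          pthread_mutex_lock(&lock);
          work_pending = 1;
          pthread_cond_broadcast(&cond);
          pthread_mutex_unlock(&lock);
  }

  /* like flush_work(): block until the queued item has run */
  static void flush_reset_work(void)
  {
          pthread_mutex_lock(&lock);
          while (work_pending)
                  pthread_cond_wait(&cond, &lock);
          pthread_mutex_unlock(&lock);
  }

  int main(void)
  {
          pthread_t worker;

          pthread_create(&worker, NULL, reset_worker, NULL);
          schedule_reset_work();
          flush_reset_work();     /* recovery is guaranteed done here */
          pthread_join(worker, NULL);
          return 0;
  }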
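Second, the pre/post handlers amount to a small state machine on
queue->state: amdgpu_userq_pre_reset() unmaps every mapped queue,
force-completes its fences, and marks it HUNG; amdgpu_userq_post_reset()
re-maps HUNG queues only when VRAM survived, since with VRAM lost the
queue buffers are gone anyway. A compact sketch of that round trip
(uq, pre_reset, and post_reset are simplified stand-ins, not the driver
functions):

  #include <stdbool.h>
  #include <stdio.h>

  enum uq_state { UQ_UNMAPPED, UQ_MAPPED, UQ_PREEMPTED, UQ_HUNG };

  struct uq { int id; enum uq_state state; };

  /* pre-reset: unmap every mapped queue and mark it hung so the
   * post-reset pass knows which queues are candidates for re-mapping */
  static void pre_reset(struct uq *qs, int n)
  {
          for (int i = 0; i < n; i++)
                  if (qs[i].state == UQ_MAPPED)
                          qs[i].state = UQ_HUNG;  /* unmap + force-complete fences */
  }

  /* post-reset: re-map hung queues only if VRAM contents survived;
   * with VRAM lost, the queue buffers are garbage, so stay hung */
  static void post_reset(struct uq *qs, int n, bool vram_lost)
  {
          for (int i = 0; i < n; i++)
                  if (qs[i].state == UQ_HUNG && !vram_lost)
                          qs[i].state = UQ_MAPPED;  /* funcs->map() here */
  }

  int main(void)
  {
          struct uq qs[] = { {0, UQ_MAPPED}, {1, UQ_PREEMPTED} };

          pre_reset(qs, 2);
          post_reset(qs, 2, /*vram_lost=*/false);
          for (int i = 0; i < 2; i++)
                  printf("queue %d state %d\n", qs[i].id, qs[i].state);
          return 0;
  }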
