On Tue, Jan 13, 2026 at 3:42 AM Jesse.Zhang <[email protected]> wrote: > > In error scenarios (e.g., malformed commands), user queue fences may never > be signaled, causing processes to wait indefinitely. To address this while > preserving the requirement of infinite fence waits, implement an independent > timeout detection mechanism: > > 1. Initialize a hang detect work when creating a user queue (one-time setup) > 2. Start the work with queue-type-specific timeout (gfx/compute/sdma) when > the last fence is created via amdgpu_userq_signal_ioctl (per-fence > timing) > 3. Trigger queue reset logic if the timer expires before the fence is signaled > > v2: make timeout per queue type (adev->gfx_timeout vs adev->compute_timeout > vs adev->sdma_timeout) to be consistent with kernel queues. (Alex) > v3: The timeout detection must be independent from the fence, e.g. you don't > wait for a timeout on the fence > but rather have the timeout start as soon as the fence is > initialized. (Christian) > v4: replace the timer with the `hang_detect_work` delayed work. > > Signed-off-by: Jesse Zhang <[email protected]>
Reviewed-by: Alex Deucher <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 70 ++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 3 + > .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 1 + > 3 files changed, 73 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index 98110f543307..664a15278c1d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -148,6 +148,69 @@ amdgpu_userq_detect_and_reset_queues(struct > amdgpu_userq_mgr *uq_mgr) > return r; > } > > +static void amdgpu_userq_hang_detect_work(struct work_struct *work) > +{ > + struct amdgpu_usermode_queue *queue = container_of(work, > + struct > amdgpu_usermode_queue, > + > hang_detect_work.work); > + struct dma_fence *fence; > + struct amdgpu_userq_mgr *uq_mgr; > + > + if (!queue || !queue->userq_mgr) > + return; > + > + uq_mgr = queue->userq_mgr; > + fence = READ_ONCE(queue->hang_detect_fence); > + /* Fence already signaled – no action needed */ > + if (!fence || dma_fence_is_signaled(fence)) > + return; > + > + mutex_lock(&uq_mgr->userq_mutex); > + amdgpu_userq_detect_and_reset_queues(uq_mgr); > + mutex_unlock(&uq_mgr->userq_mutex); > +} > + > +/* > + * Start hang detection for a user queue fence. A delayed work will be > scheduled > + * to check if the fence is still pending after the timeout period. > +*/ > +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) > +{ > + struct amdgpu_device *adev; > + unsigned long timeout_ms; > + > + if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev) > + return; > + > + adev = queue->userq_mgr->adev; > + /* Determine timeout based on queue type */ > + switch (queue->queue_type) { > + case AMDGPU_RING_TYPE_GFX: > + timeout_ms = adev->gfx_timeout; > + break; > + case AMDGPU_RING_TYPE_COMPUTE: > + timeout_ms = adev->compute_timeout; > + break; > + case AMDGPU_RING_TYPE_SDMA: > + timeout_ms = adev->sdma_timeout; > + break; > + default: > + timeout_ms = adev->gfx_timeout; > + break; > + } > + > + /* Store the fence to monitor and schedule hang detection */ > + WRITE_ONCE(queue->hang_detect_fence, queue->last_fence); > + schedule_delayed_work(&queue->hang_detect_work, > + msecs_to_jiffies(timeout_ms)); > +} > + > +static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue > *queue) > +{ > + INIT_DELAYED_WORK(&queue->hang_detect_work, > amdgpu_userq_hang_detect_work); > + queue->hang_detect_fence = NULL; > +} > + > static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue > *queue, > struct amdgpu_bo_va_mapping > *va_map, u64 addr) > { > @@ -572,7 +635,6 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id) > > cancel_delayed_work_sync(&uq_mgr->resume_work); > mutex_lock(&uq_mgr->userq_mutex); > - > queue = amdgpu_userq_find(uq_mgr, queue_id); > if (!queue) { > drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id > to destroy\n"); > @@ -580,6 +642,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id) > return -EINVAL; > } > amdgpu_userq_wait_for_last_fence(queue); > + /* Cancel any pending hang detection work and cleanup */ > + if (queue->hang_detect_fence) { > + cancel_delayed_work_sync(&queue->hang_detect_work); > + queue->hang_detect_fence = NULL; > + } > r = amdgpu_bo_reserve(queue->db_obj.obj, true); > if (!r) { > amdgpu_bo_unpin(queue->db_obj.obj); > @@ -818,6 +885,7 @@ amdgpu_userq_create(struct drm_file *filp, union > drm_amdgpu_userq *args) > queue->debugfs_queue = debugfs_create_dir(queue_name, > filp->debugfs_client); > debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, > &amdgpu_mqd_info_fops); > #endif > + amdgpu_userq_init_hang_detect_work(queue); > kfree(queue_name); > > args->out.queue_id = qid; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > index 1eaa94f8a291..06a06272b41a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > @@ -72,6 +72,8 @@ struct amdgpu_usermode_queue { > u32 xcp_id; > int priority; > struct dentry *debugfs_queue; > + struct delayed_work hang_detect_work; > + struct dma_fence *hang_detect_fence; > > struct list_head userq_va_list; > }; > @@ -146,6 +148,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct > amdgpu_device *adev, > void amdgpu_userq_reset_work(struct work_struct *work); > void amdgpu_userq_pre_reset(struct amdgpu_device *adev); > int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); > +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue > *queue); > > int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, > struct amdgpu_usermode_queue *queue, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > index 25f178536469..374fbd0e859a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > @@ -569,6 +569,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, > void *data, > > dma_fence_put(queue->last_fence); > queue->last_fence = dma_fence_get(fence); > + amdgpu_userq_start_hang_detect_work(queue); > mutex_unlock(&userq_mgr->userq_mutex); > > drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, > -- > 2.49.0 >
