On 3/19/26 09:21, Prike Liang wrote:
> In the userq destroy routine, the queue refcount
> should be 0 and the queue has already been removed
> from the manager list, so it must not be touched.
> Attempting to lock the userq mutex here would
> deadlock, as it is already held by the eviction
> suspend work, as shown in the lockdep trace below.
If I'm not completely mistaken, Sunil already took a look into this.
@Sunil, if you haven't seen this before, please take a look at this patch.
Regards,
Christian.
>
> [ 107.881652] ============================================
> [ 107.881866] WARNING: possible recursive locking detected
> [ 107.882081] 6.19.0-custom #16 Tainted: G U OE
> [ 107.882305] --------------------------------------------
> [ 107.882518] kworker/15:1/158 is trying to acquire lock:
> [ 107.882728] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at:
> amdgpu_userq_kref_destroy+0x57/0x540 [amdgpu]
> [ 107.883462]
> but task is already holding lock:
> [ 107.883701] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at:
> amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
> [ 107.884485]
> other info that might help us debug this:
> [ 107.884751] Possible unsafe locking scenario:
>
> [ 107.884993] CPU0
> [ 107.885100] ----
> [ 107.885207] lock(&userq_mgr->userq_mutex);
> [ 107.885385] lock(&userq_mgr->userq_mutex);
> [ 107.885561]
> *** DEADLOCK ***
>
> [ 107.885798] May be due to missing lock nesting notation
>
> [ 107.886069] 4 locks held by kworker/15:1/158:
> [ 107.886247] #0: ffff8f2840057558 ((wq_completion)events){+.+.}-{0:0}, at:
> process_one_work+0x455/0x650
> [ 107.886630] #1: ffffd32f01a4fe18
> ((work_completion)(&evf_mgr->suspend_work)){+.+.}-{0:0}, at:
> process_one_work+0x1f3/0x650
> [ 107.887075] #2: ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4},
> at: amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
> [ 107.887799] #3: ffffffffb8d3f700 (dma_fence_map){++++}-{0:0}, at:
> amdgpu_eviction_fence_suspend_worker+0x36/0xc0 [amdgpu]
> [ 107.888457]
>
> Signed-off-by: Prike Liang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 50 +++++++++++++++++++++--
> 1 file changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index bb5d572f5a3c..c7a9306a1c01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -148,6 +148,52 @@ amdgpu_userq_detect_and_reset_queues(struct
> amdgpu_userq_mgr *uq_mgr)
> return r;
> }
>
> +static int
> +amdgpu_userq_perq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + bool gpu_reset = false;
> + int r = 0;
> +
> + /* Warning if current process mutex is not held */
> + if (refcount_read(&queue->refcount.refcount))
> + WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> + if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> + dev_err(adev->dev, "userq reset disabled by debug mask\n");
> + return 0;
> + }
> +
> + /*
> + * If GPU recovery feature is disabled system-wide,
> + * skip all reset detection logic
> + */
> + if (!amdgpu_gpu_recovery)
> + return 0;
> +
> + /*
> + * Iterate through all queue types to detect and reset problematic
> queues
> + * Process each queue type in the defined order
> + */
> + int ring_type = queue->queue_type;
> + const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
> +
> + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
> AMDGPU_RESET_TYPE_PER_QUEUE))
> + return r;
> +
> + if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> + funcs && funcs->detect_and_reset) {
> + r = funcs->detect_and_reset(adev, ring_type);
> + if (r)
> + gpu_reset = true;
> + }
> +
> + if (gpu_reset)
> + amdgpu_userq_gpu_reset(adev);
> +
> + return r;
> +}
> static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> {
> struct amdgpu_usermode_queue *queue = container_of(work,
> @@ -627,7 +673,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_que
> /* Cancel any pending hang detection work and cleanup */
> cancel_delayed_work_sync(&queue->hang_detect_work);
>
> - mutex_lock(&uq_mgr->userq_mutex);
> queue->hang_detect_fence = NULL;
> amdgpu_userq_wait_for_last_fence(queue);
>
> @@ -649,7 +694,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_que
> #if defined(CONFIG_DEBUG_FS)
> debugfs_remove_recursive(queue->debugfs_queue);
> #endif
> - amdgpu_userq_detect_and_reset_queues(uq_mgr);
> + amdgpu_userq_perq_detect_and_reset_queues(uq_mgr, queue);
> r = amdgpu_userq_unmap_helper(queue);
> /*TODO: It requires a reset for userq hw unmap error*/
> if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -657,7 +702,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
> struct amdgpu_usermode_que
> queue->state = AMDGPU_USERQ_STATE_HUNG;
> }
> amdgpu_userq_cleanup(queue);
> - mutex_unlock(&uq_mgr->userq_mutex);
>
> pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>