On 3/19/26 09:21, Prike Liang wrote:
> In the userq destroy routine, the queue refcount
> should be 0 and the queue already removed from the
> manager list, so it must not be touched. Attempting
> to lock the userq mutex here would deadlock, as it
> is already held by the eviction suspend work, as
> shown in the following lockdep report.

If I'm not completely mistaken, Sunil has already looked into this.

@Sunil, if you haven't seen this before, please take a look at this patch.

Regards,
Christian.

> 
> [  107.881652] ============================================
> [  107.881866] WARNING: possible recursive locking detected
> [  107.882081] 6.19.0-custom #16 Tainted: G     U     OE
> [  107.882305] --------------------------------------------
> [  107.882518] kworker/15:1/158 is trying to acquire lock:
> [  107.882728] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at: 
> amdgpu_userq_kref_destroy+0x57/0x540 [amdgpu]
> [  107.883462]
>                but task is already holding lock:
> [  107.883701] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at: 
> amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
> [  107.884485]
>                other info that might help us debug this:
> [  107.884751]  Possible unsafe locking scenario:
> 
> [  107.884993]        CPU0
> [  107.885100]        ----
> [  107.885207]   lock(&userq_mgr->userq_mutex);
> [  107.885385]   lock(&userq_mgr->userq_mutex);
> [  107.885561]
>                 *** DEADLOCK ***
> 
> [  107.885798]  May be due to missing lock nesting notation
> 
> [  107.886069] 4 locks held by kworker/15:1/158:
> [  107.886247]  #0: ffff8f2840057558 ((wq_completion)events){+.+.}-{0:0}, at: 
> process_one_work+0x455/0x650
> [  107.886630]  #1: ffffd32f01a4fe18 
> ((work_completion)(&evf_mgr->suspend_work)){+.+.}-{0:0}, at: 
> process_one_work+0x1f3/0x650
> [  107.887075]  #2: ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, 
> at: amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
> [  107.887799]  #3: ffffffffb8d3f700 (dma_fence_map){++++}-{0:0}, at: 
> amdgpu_eviction_fence_suspend_worker+0x36/0xc0 [amdgpu]
> [  107.888457]
> 
> Signed-off-by: Prike Liang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 50 +++++++++++++++++++++--
>  1 file changed, 47 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index bb5d572f5a3c..c7a9306a1c01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -148,6 +148,52 @@ amdgpu_userq_detect_and_reset_queues(struct 
> amdgpu_userq_mgr *uq_mgr)
>       return r;
>  }
>  
> +static int
> +amdgpu_userq_perq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr,
> +                     struct amdgpu_usermode_queue *queue)
> +{
> +     struct amdgpu_device *adev = uq_mgr->adev;
> +     bool gpu_reset = false;
> +     int r = 0;
> +
> +     /* Warning if current process mutex is not held */
> +     if (refcount_read(&queue->refcount.refcount))
> +             WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
> +
> +     if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +             dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +             return 0;
> +     }
> +
> +     /*
> +      * If GPU recovery feature is disabled system-wide,
> +      * skip all reset detection logic
> +      */
> +     if (!amdgpu_gpu_recovery)
> +             return 0;
> +
> +     /*
> +      * Iterate through all queue types to detect and reset problematic 
> queues
> +      * Process each queue type in the defined order
> +      */
> +     int ring_type = queue->queue_type;
> +     const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
> +
> +     if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, 
> AMDGPU_RESET_TYPE_PER_QUEUE))
> +                     return r;
> +
> +     if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> +         funcs && funcs->detect_and_reset) {
> +             r = funcs->detect_and_reset(adev, ring_type);
> +             if (r)
> +                     gpu_reset = true;
> +     }
> +
> +     if (gpu_reset)
> +             amdgpu_userq_gpu_reset(adev);
> +
> +     return r;
> +}
>  static void amdgpu_userq_hang_detect_work(struct work_struct *work)
>  {
>       struct amdgpu_usermode_queue *queue = container_of(work,
> @@ -627,7 +673,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
> struct amdgpu_usermode_que
>       /* Cancel any pending hang detection work and cleanup */
>       cancel_delayed_work_sync(&queue->hang_detect_work);
>  
> -     mutex_lock(&uq_mgr->userq_mutex);
>       queue->hang_detect_fence = NULL;
>       amdgpu_userq_wait_for_last_fence(queue);
>  
> @@ -649,7 +694,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
> struct amdgpu_usermode_que
>  #if defined(CONFIG_DEBUG_FS)
>       debugfs_remove_recursive(queue->debugfs_queue);
>  #endif
> -     amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +     amdgpu_userq_perq_detect_and_reset_queues(uq_mgr, queue);
>       r = amdgpu_userq_unmap_helper(queue);
>       /*TODO: It requires a reset for userq hw unmap error*/
>       if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
> @@ -657,7 +702,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
> struct amdgpu_usermode_que
>               queue->state = AMDGPU_USERQ_STATE_HUNG;
>       }
>       amdgpu_userq_cleanup(queue);
> -     mutex_unlock(&uq_mgr->userq_mutex);
>  
>       pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>  

Reply via email to