In the userq destroy routine, the queue refcount
should be 0 and the queue has already been removed
from the manager list, so the manager list must not
be touched. Attempting to lock the userq mutex here
would deadlock, since it is already held by the
eviction fence suspend worker, as shown in the
following lockdep report.
[ 107.881652] ============================================
[ 107.881866] WARNING: possible recursive locking detected
[ 107.882081] 6.19.0-custom #16 Tainted: G U OE
[ 107.882305] --------------------------------------------
[ 107.882518] kworker/15:1/158 is trying to acquire lock:
[ 107.882728] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at:
amdgpu_userq_kref_destroy+0x57/0x540 [amdgpu]
[ 107.883462]
but task is already holding lock:
[ 107.883701] ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at:
amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
[ 107.884485]
other info that might help us debug this:
[ 107.884751] Possible unsafe locking scenario:
[ 107.884993] CPU0
[ 107.885100] ----
[ 107.885207] lock(&userq_mgr->userq_mutex);
[ 107.885385] lock(&userq_mgr->userq_mutex);
[ 107.885561]
*** DEADLOCK ***
[ 107.885798] May be due to missing lock nesting notation
[ 107.886069] 4 locks held by kworker/15:1/158:
[ 107.886247] #0: ffff8f2840057558 ((wq_completion)events){+.+.}-{0:0}, at:
process_one_work+0x455/0x650
[ 107.886630] #1: ffffd32f01a4fe18
((work_completion)(&evf_mgr->suspend_work)){+.+.}-{0:0}, at:
process_one_work+0x1f3/0x650
[ 107.887075] #2: ffff8f2854b3d110 (&userq_mgr->userq_mutex){+.+.}-{4:4}, at:
amdgpu_eviction_fence_suspend_worker+0x31/0xc0 [amdgpu]
[ 107.887799] #3: ffffffffb8d3f700 (dma_fence_map){++++}-{0:0}, at:
amdgpu_eviction_fence_suspend_worker+0x36/0xc0 [amdgpu]
[ 107.888457]
Signed-off-by: Prike Liang <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 50 +++++++++++++++++++++--
1 file changed, 47 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index bb5d572f5a3c..c7a9306a1c01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -148,6 +148,52 @@ amdgpu_userq_detect_and_reset_queues(struct
amdgpu_userq_mgr *uq_mgr)
return r;
}
+static int
+amdgpu_userq_perq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ bool gpu_reset = false;
+ int r = 0;
+
+ /* Warning if current process mutex is not held */
+ if (refcount_read(&queue->refcount.refcount))
+ WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ return 0;
+ }
+
+ /*
+ * If GPU recovery feature is disabled system-wide,
+ * skip all reset detection logic
+ */
+ if (!amdgpu_gpu_recovery)
+ return 0;
+
+ /*
+ * Iterate through all queue types to detect and reset problematic
queues
+ * Process each queue type in the defined order
+ */
+ int ring_type = queue->queue_type;
+ const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+
+ if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
AMDGPU_RESET_TYPE_PER_QUEUE))
+ return r;
+
+ if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
+ funcs && funcs->detect_and_reset) {
+ r = funcs->detect_and_reset(adev, ring_type);
+ if (r)
+ gpu_reset = true;
+ }
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
static void amdgpu_userq_hang_detect_work(struct work_struct *work)
{
struct amdgpu_usermode_queue *queue = container_of(work,
@@ -627,7 +673,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_que
/* Cancel any pending hang detection work and cleanup */
cancel_delayed_work_sync(&queue->hang_detect_work);
- mutex_lock(&uq_mgr->userq_mutex);
queue->hang_detect_fence = NULL;
amdgpu_userq_wait_for_last_fence(queue);
@@ -649,7 +694,7 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_que
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
#endif
- amdgpu_userq_detect_and_reset_queues(uq_mgr);
+ amdgpu_userq_perq_detect_and_reset_queues(uq_mgr, queue);
r = amdgpu_userq_unmap_helper(queue);
/*TODO: It requires a reset for userq hw unmap error*/
if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -657,7 +702,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_que
queue->state = AMDGPU_USERQ_STATE_HUNG;
}
amdgpu_userq_cleanup(queue);
- mutex_unlock(&uq_mgr->userq_mutex);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
--
2.34.1