In error scenarios (e.g., malformed commands), user queue fences may never
be signaled, causing processes to wait indefinitely. To address this while
preserving the requirement of infinite fence waits, implement an independent
timeout detection mechanism:
1. Initialize a hang detect timer when creating a user queue (one-time setup)
2. Start the timer with a queue-type-specific timeout (gfx/compute/sdma) when
the last fence is created via amdgpu_userq_signal_ioctl (per-fence timing)
3. Trigger queue reset logic if the timer expires before the fence is signaled
4. Clean up the timer when destroying the user queue to avoid leaks
v2: make timeout per queue type (adev->gfx_timeout vs adev->compute_timeout vs
adev->sdma_timeout) to be consistent with kernel queues. (Alex)
v3: The timeout detection must be independent of the fence, i.e. don't wait
for a timeout on the fence, but rather start the timeout as soon as the
fence is initialized. (Christian)
Signed-off-by: Jesse Zhang <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 53 ++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 4 ++
.../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 1 +
3 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 98110f543307..6e04286a707e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -148,6 +148,56 @@ amdgpu_userq_detect_and_reset_queues(struct
amdgpu_userq_mgr *uq_mgr)
return r;
}
+/*
+ * Expiry handler for a user queue's hang_detect_timer.
+ *
+ * If no fence is recorded in queue->timer_fence, or that fence has already
+ * signaled, nothing is done. Otherwise
+ * amdgpu_userq_detect_and_reset_queues() is run for the owning manager
+ * with its userq_mutex held.
+ *
+ * NOTE(review): timer_list callbacks run in atomic (softirq) context, where
+ * sleeping in mutex_lock() is not allowed - this most likely has to be
+ * deferred to a work item; confirm.
+ * NOTE(review): timer_fence is read here without userq_mutex and without
+ * taking a reference - verify the fence cannot be released while this
+ * callback is still looking at it.
+ */
+static void amdgpu_userq_hang_detect_timer_cb(struct timer_list *t)
+{
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_userq_mgr *mgr;
+	struct dma_fence *pending;
+
+	queue = timer_container_of(queue, t, hang_detect_timer);
+	pending = queue->timer_fence;
+	mgr = queue->userq_mgr;
+
+	/* Only act while a tracked fence is still outstanding. */
+	if (pending && !dma_fence_is_signaled(pending)) {
+		mutex_lock(&mgr->userq_mutex);
+		amdgpu_userq_detect_and_reset_queues(mgr);
+		mutex_unlock(&mgr->userq_mutex);
+	}
+}
+
+/*
+ * Arm (or re-arm) the hang detection timer of a user queue.
+ *
+ * Records queue->last_fence in queue->timer_fence and (re)programs
+ * hang_detect_timer with the timeout selected by queue type from the
+ * device: compute_timeout, sdma_timeout, or gfx_timeout for GFX and any
+ * other queue type.
+ *
+ * NOTE(review): the adev->*_timeout values are fed through
+ * msecs_to_jiffies() - confirm they are stored in milliseconds and are not
+ * already jiffies.
+ * NOTE(review): timer_fence merely aliases last_fence; no extra reference
+ * is taken here - verify the lifetime against the timer callback.
+ */
+void amdgpu_userq_start_hang_detect_timer(struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_device *adev = queue->userq_mgr->adev;
+	unsigned long timeout_ms;
+	unsigned long expires;
+
+	switch (queue->queue_type) {
+	case AMDGPU_RING_TYPE_COMPUTE:
+		timeout_ms = adev->compute_timeout;
+		break;
+	case AMDGPU_RING_TYPE_SDMA:
+		timeout_ms = adev->sdma_timeout;
+		break;
+	case AMDGPU_RING_TYPE_GFX:
+	default:
+		/* GFX queues and unrecognised types share the GFX timeout. */
+		timeout_ms = adev->gfx_timeout;
+		break;
+	}
+
+	/* Remember which fence this timer run is watching, then arm it. */
+	queue->timer_fence = queue->last_fence;
+	expires = jiffies + msecs_to_jiffies(timeout_ms);
+	mod_timer(&queue->hang_detect_timer, expires);
+}
+
+/*
+ * One-time setup of a queue's hang detection timer: binds
+ * amdgpu_userq_hang_detect_timer_cb() to queue->hang_detect_timer with no
+ * timer flags. The timer is only initialised here, not started.
+ */
+static void
+amdgpu_userq_init_hang_detect_timer(struct amdgpu_usermode_queue *queue)
+{
+	timer_setup(&queue->hang_detect_timer,
+		    amdgpu_userq_hang_detect_timer_cb, 0);
+}
+/*
+ * Tear down hang detection for a queue: delete hang_detect_timer
+ * synchronously, then clear the tracked fence pointer.
+ *
+ * NOTE(review): timer_delete_sync() waits for a running callback; if the
+ * caller holds userq_mutex while the callback tries to take the same mutex
+ * this could deadlock - confirm against the call site.
+ */
+static void
+amdgpu_userq_cleanup_hang_detect_timer(struct amdgpu_usermode_queue *queue)
+{
+	timer_delete_sync(&queue->hang_detect_timer);
+	queue->timer_fence = NULL;
+}
+
static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
struct amdgpu_bo_va_mapping *va_map,
u64 addr)
{
@@ -580,6 +630,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
return -EINVAL;
}
amdgpu_userq_wait_for_last_fence(queue);
+ amdgpu_userq_cleanup_hang_detect_timer(queue);
r = amdgpu_bo_reserve(queue->db_obj.obj, true);
if (!r) {
amdgpu_bo_unpin(queue->db_obj.obj);
@@ -818,8 +869,8 @@ amdgpu_userq_create(struct drm_file *filp, union
drm_amdgpu_userq *args)
queue->debugfs_queue = debugfs_create_dir(queue_name,
filp->debugfs_client);
debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue,
&amdgpu_mqd_info_fops);
#endif
+ amdgpu_userq_init_hang_detect_timer(queue);
kfree(queue_name);
-
args->out.queue_id = qid;
atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 1eaa94f8a291..605aa5e5a915 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -25,6 +25,7 @@
#ifndef AMDGPU_USERQ_H_
#define AMDGPU_USERQ_H_
#include "amdgpu_eviction_fence.h"
+#include <linux/timer.h>
#define AMDGPU_MAX_USERQ_COUNT 512
@@ -72,6 +73,8 @@ struct amdgpu_usermode_queue {
u32 xcp_id;
int priority;
struct dentry *debugfs_queue;
+ struct timer_list hang_detect_timer;
+ struct dma_fence *timer_fence;
struct list_head userq_va_list;
};
@@ -146,6 +149,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct
amdgpu_device *adev,
void amdgpu_userq_reset_work(struct work_struct *work);
void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+void amdgpu_userq_start_hang_detect_timer(struct amdgpu_usermode_queue *queue);
int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
struct amdgpu_usermode_queue *queue,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 25f178536469..9cf3991ad6d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -569,6 +569,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void
*data,
dma_fence_put(queue->last_fence);
queue->last_fence = dma_fence_get(fence);
+ amdgpu_userq_start_hang_detect_timer(queue);
mutex_unlock(&userq_mgr->userq_mutex);
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
--
2.49.0