From: Vitaly Prosyak <vitaly.pros...@amd.com>

The current implementation of drm_sched_start() uses a hardcoded -ECANCELED
to dispose of a job whose parent/hw fence is NULL: drm_sched_job_done() is
called with -ECANCELED for every job in the pending list with a NULL parent.
This makes it difficult to tell which recovery method was used, a queue
reset or a full GPU reset.

To improve this, we first try a soft recovery for a timed-out job and use
the error code -ENODATA. If soft recovery fails, we proceed with a queue
reset, where the error code for the job is still -ENODATA. Finally, for a
full GPU reset, we use the error codes -ECANCELED or -ETIME.

This patch adds the error code as a parameter to the scheduler start path
through a new helper, drm_sched_start_ex(), while drm_sched_start() keeps
the existing -ECANCELED behaviour. This allows us to differentiate between
queue-reset and GPU-reset failures, and enables user mode and test
applications to validate that the requested operation actually took place:
after a successful queue reset, the only way normal operation continues is
that the affected jobs were completed via drm_sched_job_done() with the
specific error code -ENODATA.
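
For illustration, a driver's recovery path could select the error code as
follows. This is only a sketch with a hypothetical helper name; the actual
amdgpu wiring is not part of this patch:

	/*
	 * Hypothetical driver-side helper (illustration only): restart the
	 * scheduler with an error code that tells user mode which recovery
	 * method disposed of the jobs that had a NULL parent fence.
	 */
	static void my_driver_finish_reset(struct drm_gpu_scheduler *sched,
					   bool queue_reset_succeeded)
	{
		if (queue_reset_succeeded)
			/* Soft recovery / queue reset: jobs complete with -ENODATA. */
			drm_sched_start_ex(sched, true, -ENODATA);
		else
			/* Full GPU reset: keep the historical -ECANCELED disposition. */
			drm_sched_start_ex(sched, true, -ECANCELED);
	}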

v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain
    and amdgpu_device_unlock_reset_domain to allow user mode to track the
    queue reset status and distinguish between queue reset and GPU reset.
v2: Christian suggested using the error codes -ENODATA for a queue reset
    and -ECANCELED or -ETIME for a GPU reset, as returned to
    amdgpu_cs_wait_ioctl.
v3: To meet these requirements, introduce a new function,
    drm_sched_start_ex(), with an additional parameter passed to
    dma_fence_set_error(). This lets us handle the specific error codes
    appropriately and dispose of bad jobs with an error code that reflects
    whether a queue reset or a GPU reset was performed (see the check
    sketched below).
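
    As an illustration of the resulting user-visible contract (a sketch
    only, not part of this patch), a kernel-side test could inspect the
    finished fence of a disposed job and tell the recovery paths apart:

	/* Hypothetical kernel-side check (illustration only): s_job was on
	 * the pending list with a NULL parent fence when the scheduler was
	 * restarted.
	 */
	static void my_check_disposed_job(struct drm_sched_job *s_job)
	{
		int status = dma_fence_get_status(&s_job->s_fence->finished);

		if (status == -ENODATA)
			pr_info("job disposed by soft recovery or queue reset\n");
		else if (status == -ECANCELED || status == -ETIME)
			pr_info("job disposed by a full GPU reset\n");
	}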

Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.pros...@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 19 ++++++++++++++++---
 include/drm/gpu_scheduler.h            |  1 +
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 7e90c9f95611..5a534772335a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -671,13 +671,14 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 EXPORT_SYMBOL(drm_sched_stop);
 
 /**
- * drm_sched_start - recover jobs after a reset
+ * drm_sched_start_ex - recover jobs after a reset
  *
  * @sched: scheduler instance
  * @full_recovery: proceed with complete sched restart
+ * @error: error code to set with dma_fence_set_error()
  *
  */
-void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
+void drm_sched_start_ex(struct drm_gpu_scheduler *sched, bool full_recovery, int error)
 {
        struct drm_sched_job *s_job, *tmp;
        int r;
@@ -704,7 +705,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
                                DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
                                          r);
                } else
-                       drm_sched_job_done(s_job, -ECANCELED);
+                       drm_sched_job_done(s_job, error);
        }
 
        if (full_recovery)
@@ -712,6 +713,18 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 
        drm_sched_wqueue_start(sched);
 }
+EXPORT_SYMBOL(drm_sched_start_ex);
+/**
+ * drm_sched_start - recover jobs after a reset
+ *
+ * @sched: scheduler instance
+ * @full_recovery: proceed with complete sched restart
+ *
+ */
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
+{
+       drm_sched_start_ex(sched, full_recovery, -ECANCELED);
+}
 EXPORT_SYMBOL(drm_sched_start);
 
 /**
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..444fa6761590 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -580,6 +580,7 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched);
 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
+void drm_sched_start_ex(struct drm_gpu_scheduler *sched, bool full_recovery, int error);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
 void drm_sched_reset_karma(struct drm_sched_job *bad);
-- 
2.25.1
