[PATCH] drm/amdgpu: deprecate guilty handling

Christian König Fri, 07 Mar 2025 06:59:39 -0800

The guilty handling tried to establish a second way of signaling problems with
the GPU back to userspace. This caused quite a number of issues that we had to
work around, especially lifetime issues with the drm_sched_entity.


Just drop the handling altogether and use the dma_fence based approach instead.

Signed-off-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  5 -----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c    | 25 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h    |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  9 +-------
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 5df21529b3b1..fcace736f208 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -59,11 +59,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
        if (!p->ctx)
                return -EINVAL;
 
-       if (atomic_read(&p->ctx->guilty)) {
-               amdgpu_ctx_put(p->ctx);
-               return -ECANCELED;
-       }
-
        amdgpu_sync_create(&p->sync);
        drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
                      DRM_EXEC_IGNORE_DUPLICATES, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c43d1b6e5d66..0b6eb718577a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -250,7 +250,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, 
u32 hw_ip,
        }
 
        r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, num_scheds,
-                                 &ctx->guilty);
+                                 NULL);
        if (r)
                goto error_free_entity;
 
@@ -572,6 +572,27 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
 
 #define AMDGPU_RAS_COUNTE_DELAY_MS 3000
 
+/* Return true if any initialized entity of @ctx reports an -ETIME error. */
+static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx)
+{
+       int i, j;
+
+       for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+               for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+                       struct amdgpu_ctx_entity *entity = ctx->entities[i][j];
+
+                       /* Skip slots without an entity. */
+                       if (!entity)
+                               continue;
+
+                       if (drm_sched_entity_error(&entity->entity) == -ETIME)
+                               return true;
+               }
+       }
+
+       return false;
+}
+
 static int amdgpu_ctx_query2(struct amdgpu_device *adev,
                             struct amdgpu_fpriv *fpriv, uint32_t id,
                             union drm_amdgpu_ctx_out *out)
@@ -600,7 +621,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
        if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
 
-       if (atomic_read(&ctx->guilty))
+       if (amdgpu_ctx_guilty(ctx))
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
        if (amdgpu_in_reset(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 85376baaa92f..45569cce484e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,7 +53,6 @@ struct amdgpu_ctx {
        bool                            preamble_presented;
        int32_t                         init_priority;
        int32_t                         override_priority;
-       atomic_t                        guilty;
        unsigned long                   ras_counter_ce;
        unsigned long                   ras_counter_ue;
        uint32_t                        stable_pstate;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 198d29faa754..ed65c14a4ed7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5467,14 +5467,10 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)
 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
                                 struct amdgpu_reset_context *reset_context)
 {
-       int i, r = 0;
-       struct amdgpu_job *job = NULL;
        struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
        bool need_full_reset =
                test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
-
-       if (reset_context->reset_req_dev == adev)
-               job = reset_context->job;
+       int i, r;
 
        if (amdgpu_sriov_vf(adev))
                amdgpu_virt_pre_reset(adev);
@@ -5499,9 +5495,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
 
        amdgpu_fence_driver_isr_toggle(adev, false);
 
-       if (job && job->vm)
-               drm_sched_increase_karma(&job->base);
-
        r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
        /* If reset handler not implemented, continue; otherwise return */
        if (r == -EOPNOTSUPP)
-- 
2.34.1

[PATCH] drm/amdgpu: deprecate guilty handling

Reply via email to