In the unlikely case that we run into an ENOMEM while fixing up the gang
submission dependencies, we can't clean up any more since the gang
members are already armed.

Fix this by using pre-allocated dependency slots and re-ordering the
code. Also fix a double unref, since the fence reference is also dropped
on error.

Signed-off-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 49 +++++++++++++++-----------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 82df06a72ee0..b50a5532f4c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1282,6 +1282,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 {
        struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
        struct amdgpu_job *leader = p->gang_leader;
+       u32 fence_slots[AMDGPU_CS_GANG_SIZE];
        struct amdgpu_bo_list_entry *e;
        struct drm_gem_object *gobj;
        unsigned long index;
@@ -1289,36 +1290,23 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
        uint64_t seq;
        int r;
 
-       for (i = 0; i < p->gang_size; ++i)
-               drm_sched_job_arm(&p->jobs[i]->base);
-
+       /* Preallocate the memory for the gang dependencies */
        for (i = 0; i < p->gang_size; ++i) {
-               struct dma_fence *fence;
-
-               if (p->jobs[i] == leader)
-                       continue;
-
-               fence = &p->jobs[i]->base.s_fence->scheduled;
-               dma_fence_get(fence);
-               r = drm_sched_job_add_dependency(&leader->base, fence);
-               if (r) {
-                       dma_fence_put(fence);
+               r = drm_sched_job_prealloc_dependency_slot(&leader->base,
+                                                          &fence_slots[i]);
+               if (r)
                        return r;
-               }
        }
 
-       if (p->gang_size > 1) {
-               for (i = 0; i < p->gang_size; ++i)
-                       amdgpu_job_set_gang_leader(p->jobs[i], leader);
-       }
-
-       /* No memory allocation is allowed while holding the notifier lock.
+       /*
+        * No memory allocation is allowed while holding the notifier lock.
         * The lock is held until amdgpu_cs_submit is finished and fence is
         * added to BOs.
         */
        mutex_lock(&p->adev->notifier_lock);
 
-       /* If userptr are invalidated after amdgpu_cs_parser_bos(), return
+       /*
+        * If userptr are invalidated after amdgpu_cs_parser_bos(), return
         * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
         */
        r = 0;
@@ -1333,6 +1321,25 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
                return r;
        }
 
+       for (i = 0; i < p->gang_size; ++i)
+               drm_sched_job_arm(&p->jobs[i]->base);
+
+       for (i = 0; i < p->gang_size; ++i) {
+               struct dma_fence *fence;
+
+               if (p->jobs[i] == leader)
+                       continue;
+
+               fence = dma_fence_get(&p->jobs[i]->base.s_fence->scheduled);
+               drm_sched_job_add_prealloc_dep(&leader->base, fence_slots[i],
+                                              fence);
+       }
+
+       if (p->gang_size > 1) {
+               for (i = 0; i < p->gang_size; ++i)
+                       amdgpu_job_set_gang_leader(p->jobs[i], leader);
+       }
+
        p->fence = dma_fence_get(&leader->base.s_fence->finished);
        drm_exec_for_each_locked_object(&p->exec, index, gobj) {
 
-- 
2.34.1

Reply via email to