It turns out that we can massively optimize here. The previous code was
horribly inefficient since it constantly released and re-acquired the
xarray's lock and restarted each iteration from the base of the array to
guard against concurrent modification, which in our case doesn't exist.
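For reference, xa_for_each() is built on xa_find()/xa_find_after(), which
take the RCU read lock and re-walk the tree from the root on every step,
while an XA_STATE cursor keeps its position in the tree, so the lock only
needs to be taken once around the whole walk. A minimal sketch of the two
idioms (illustrative only; the xarray "xa" and the do_something() helper
are made up for the example, not part of this patch):

	/* Old idiom: every lookup step takes the RCU read lock and
	 * re-walks the tree from the root.
	 */
	unsigned long index;
	void *entry;

	xa_for_each(&xa, index, entry)
		do_something(entry);

versus the cursor based walk:

	/* Cursor idiom: take the lock once; each step resumes from the
	 * cursor's current position instead of the root.
	 */
	XA_STATE(xas, &xa, 0);
	void *entry;

	xa_lock(&xa);
	xas_for_each(&xas, entry, ULONG_MAX)
		do_something(entry);
	xa_unlock(&xa);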
In addition to that, the xas_find() and xas_store() functions are
explicitly designed so that you can efficiently check entries and, if you
don't find a match, store a new one at the end or replace an existing one.
So use xas_for_each()/xas_store() instead of xa_for_each()/xa_alloc().
It's a bit more code, but should be much faster in the end.

Signed-off-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++--------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index f7118497e47a..cf200b1b643e 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -871,10 +871,8 @@ EXPORT_SYMBOL(drm_sched_job_arm);
 int drm_sched_job_add_dependency(struct drm_sched_job *job,
 				 struct dma_fence *fence)
 {
+	XA_STATE(xas, &job->dependencies, 0);
 	struct dma_fence *entry;
-	unsigned long index;
-	u32 id = 0;
-	int ret;
 
 	if (!fence)
 		return 0;
@@ -883,24 +881,37 @@ int drm_sched_job_add_dependency(struct drm_sched_job *job,
 	 * This lets the size of the array of deps scale with the number of
 	 * engines involved, rather than the number of BOs.
 	 */
-	xa_for_each(&job->dependencies, index, entry) {
+	xa_lock(&job->dependencies);
+	xas_for_each(&xas, entry, ULONG_MAX) {
 		if (entry->context != fence->context)
 			continue;
 
 		if (dma_fence_is_later(fence, entry)) {
 			dma_fence_put(entry);
-			xa_store(&job->dependencies, index, fence, GFP_KERNEL);
+			xas_store(&xas, fence);
 		} else {
 			dma_fence_put(fence);
 		}
-		return 0;
+		xa_unlock(&job->dependencies);
+		return xas_error(&xas);
 	}
 
-	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
-	if (ret != 0)
+retry:
+	entry = xas_store(&xas, fence);
+	xa_unlock(&job->dependencies);
+
+	/* There shouldn't be any concurrent add, so no need to loop again */
+	if (xas_nomem(&xas, GFP_KERNEL)) {
+		xa_lock(&job->dependencies);
+		goto retry;
+	}
+
+	if (xas_error(&xas))
 		dma_fence_put(fence);
+	else
+		WARN_ON(entry);
 
-	return ret;
+	return xas_error(&xas);
 }
 EXPORT_SYMBOL(drm_sched_job_add_dependency);
-- 
2.34.1