On 01.09.25 10:31, Philipp Stanner wrote:
> This reverts:
> 
> commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
> commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown")
> 
> from the drm/sched teardown leak fix series:
> 
> https://lore.kernel.org/dri-devel/20250710125412.128476-2-pha...@kernel.org/
> 
> The aforementioned series removed a blocking waitqueue from
> nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only
> prevents jobs from leaking, which the series fixed.
> 
> The waitqueue, however, also guarantees that all VM_BIND related jobs
> are finished in order, cleaning up mappings in the GPU's MMU. These jobs
> must be executed sequentially. Without the waitqueue, this is no longer
> guaranteed, because entity and scheduler teardown can race with each
> other.
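
For quick reference, here is the ordering the revert restores, condensed from the
hunks quoted further down (a simplified sketch, not a verbatim copy of the driver
code):

	/* completion path: remove the job from the list, then wake any waiter */
	static void nouveau_job_done(struct nouveau_job *job)
	{
		struct nouveau_sched *sched = job->sched;

		spin_lock(&sched->job.list.lock);
		list_del(&job->entry);
		spin_unlock(&sched->job.list.lock);

		wake_up(&sched->job.wq);
	}

	/* teardown path: block until every pending job has completed */
	static void nouveau_sched_fini(struct nouveau_sched *sched)
	{
		rmb(); /* for list_empty to work without lock */
		wait_event(sched->job.wq, list_empty(&sched->job.list.head));

		drm_sched_entity_fini(&sched->entity);
		drm_sched_fini(&sched->base);
	}

With this in place, nouveau_sched_fini() acts as a barrier: VM_BIND cleanup jobs
still on the job list run to completion before entity and scheduler are torn down.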

That sounds like exactly the kind of issue I tried to catch with the recent
dma_fence changes.

Going to keep working on that and potentially use this here as a blueprint
for something it should catch.

Regards,
Christian.

> 
> Revert all patches related to the waitqueue removal.
> 
> Fixes: bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
> Suggested-by: Danilo Krummrich <d...@kernel.org>
> Signed-off-by: Philipp Stanner <pha...@kernel.org>
> ---
> Changes in v2:
>   - Don't revert commit 89b2675198ab ("drm/nouveau: Make fence container helper usable driver-wide")
>   - Add Fixes-tag
> ---
>  drivers/gpu/drm/nouveau/nouveau_fence.c | 15 -----------
>  drivers/gpu/drm/nouveau/nouveau_fence.h |  1 -
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 35 ++++++++++---------------
>  drivers/gpu/drm/nouveau/nouveau_sched.h |  9 ++++---
>  drivers/gpu/drm/nouveau/nouveau_uvmm.c  |  8 +++---
>  5 files changed, 24 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 9f345a008717..869d4335c0f4 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence)
>       return ret;
>  }
>  
> -void
> -nouveau_fence_cancel(struct nouveau_fence *fence)
> -{
> -     struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
> -     unsigned long flags;
> -
> -     spin_lock_irqsave(&fctx->lock, flags);
> -     if (!dma_fence_is_signaled_locked(&fence->base)) {
> -             dma_fence_set_error(&fence->base, -ECANCELED);
> -             if (nouveau_fence_signal(fence))
> -                     nvif_event_block(&fctx->event);
> -     }
> -     spin_unlock_irqrestore(&fctx->lock, flags);
> -}
> -
>  bool
>  nouveau_fence_done(struct nouveau_fence *fence)
>  {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
> index 9957a919bd38..183dd43ecfff 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
> @@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **);
>  
>  int  nouveau_fence_emit(struct nouveau_fence *);
>  bool nouveau_fence_done(struct nouveau_fence *);
> -void nouveau_fence_cancel(struct nouveau_fence *fence);
>  int  nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
>  int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 0cc0bc9f9952..e60f7892f5ce 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -11,7 +11,6 @@
>  #include "nouveau_exec.h"
>  #include "nouveau_abi16.h"
>  #include "nouveau_sched.h"
> -#include "nouveau_chan.h"
>  
>  #define NOUVEAU_SCHED_JOB_TIMEOUT_MS         10000
>  
> @@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job)
>  {
>       struct nouveau_sched *sched = job->sched;
>  
> -     spin_lock(&sched->job_list.lock);
> +     spin_lock(&sched->job.list.lock);
>       list_del(&job->entry);
> -     spin_unlock(&sched->job_list.lock);
> +     spin_unlock(&sched->job.list.lock);
> +
> +     wake_up(&sched->job.wq);
>  }
>  
>  void
> @@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job)
>       }
>  
>       /* Submit was successful; add the job to the schedulers job list. */
> -     spin_lock(&sched->job_list.lock);
> -     list_add(&job->entry, &sched->job_list.head);
> -     spin_unlock(&sched->job_list.lock);
> +     spin_lock(&sched->job.list.lock);
> +     list_add(&job->entry, &sched->job.list.head);
> +     spin_unlock(&sched->job.list.lock);
>  
>       drm_sched_job_arm(&job->base);
>       job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> @@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job)
>       nouveau_job_fini(job);
>  }
>  
> -static void
> -nouveau_sched_cancel_job(struct drm_sched_job *sched_job)
> -{
> -     struct nouveau_fence *fence;
> -     struct nouveau_job *job;
> -
> -     job = to_nouveau_job(sched_job);
> -     fence = to_nouveau_fence(job->done_fence);
> -
> -     nouveau_fence_cancel(fence);
> -}
> -
>  static const struct drm_sched_backend_ops nouveau_sched_ops = {
>       .run_job = nouveau_sched_run_job,
>       .timedout_job = nouveau_sched_timedout_job,
>       .free_job = nouveau_sched_free_job,
> -     .cancel_job = nouveau_sched_cancel_job,
>  };
>  
>  static int
> @@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
>               goto fail_sched;
>  
>       mutex_init(&sched->mutex);
> -     spin_lock_init(&sched->job_list.lock);
> -     INIT_LIST_HEAD(&sched->job_list.head);
> +     spin_lock_init(&sched->job.list.lock);
> +     INIT_LIST_HEAD(&sched->job.list.head);
> +     init_waitqueue_head(&sched->job.wq);
>  
>       return 0;
>  
> @@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
>       return 0;
>  }
>  
> +
>  static void
>  nouveau_sched_fini(struct nouveau_sched *sched)
>  {
>       struct drm_gpu_scheduler *drm_sched = &sched->base;
>       struct drm_sched_entity *entity = &sched->entity;
>  
> +     rmb(); /* for list_empty to work without lock */
> +     wait_event(sched->job.wq, list_empty(&sched->job.list.head));
> +
>       drm_sched_entity_fini(entity);
>       drm_sched_fini(drm_sched);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
> index b98c3f0bef30..20cd1da8db73 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> @@ -103,9 +103,12 @@ struct nouveau_sched {
>       struct mutex mutex;
>  
>       struct {
> -             struct list_head head;
> -             spinlock_t lock;
> -     } job_list;
> +             struct {
> +                     struct list_head head;
> +                     spinlock_t lock;
> +             } list;
> +             struct wait_queue_head wq;
> +     } job;
>  };
>  
>  int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> index d94a85509176..79eefdfd08a2 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> @@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>       u64 end = addr + range;
>  
>  again:
> -     spin_lock(&sched->job_list.lock);
> -     list_for_each_entry(__job, &sched->job_list.head, entry) {
> +     spin_lock(&sched->job.list.lock);
> +     list_for_each_entry(__job, &sched->job.list.head, entry) {
>               struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job);
>  
>               list_for_each_op(op, &bind_job->ops) {
> @@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>  
>                               if (!(end <= op_addr || addr >= op_end)) {
>                                       nouveau_uvmm_bind_job_get(bind_job);
> -                                     spin_unlock(&sched->job_list.lock);
> +                                     spin_unlock(&sched->job.list.lock);
>                                       wait_for_completion(&bind_job->complete);
>                                       nouveau_uvmm_bind_job_put(bind_job);
>                                       goto again;
> @@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>                       }
>               }
>       }
> -     spin_unlock(&sched->job_list.lock);
> +     spin_unlock(&sched->job.list.lock);
>  }
>  
>  static int
