On Fri, Jun 6, 2025 at 7:39 AM Christian König <christian.koe...@amd.com> wrote: > > On 6/6/25 08:43, Alex Deucher wrote: > > Use the amdgpu fence container so we can store additional > > data in the fence. > > > > Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 30 +++++---------------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 ++++----- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 16 +++++++++++ > > 6 files changed, 32 insertions(+), 32 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > index 8e626f50b362e..f81608330a3d0 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > @@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct > > amdgpu_ring *ring) > > continue; > > } > > job = to_amdgpu_job(s_job); > > - if (preempted && (&job->hw_fence) == fence) > > + if (preempted && (&job->hw_fence.base) == fence) > > /* mark the job as preempted */ > > job->preemption_status |= AMDGPU_IB_PREEMPTED; > > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index ea565651f7459..8298e95e4543e 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -6375,7 +6375,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > > *adev, > > * > > * job->base holds a reference to parent fence > > */ > > - if (job && dma_fence_is_signaled(&job->hw_fence)) { > > + if (job && dma_fence_is_signaled(&job->hw_fence.base)) { > > job_signaled = true; > > dev_info(adev->dev, "Guilty job already signaled, skipping HW > > reset"); > > goto skip_hw_reset; > > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > index 2f24a6aa13bf6..569e0e5373927 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > @@ -41,22 +41,6 @@ > > #include "amdgpu_trace.h" > > #include "amdgpu_reset.h" > > > > -/* > > - * Fences mark an event in the GPUs pipeline and are used > > - * for GPU/CPU synchronization. When the fence is written, > > - * it is expected that all buffers associated with that fence > > - * are no longer in use by the associated ring on the GPU and > > - * that the relevant GPU caches have been flushed. > > - */ > > - > > -struct amdgpu_fence { > > - struct dma_fence base; > > - > > - /* RB, DMA, etc. */ > > - struct amdgpu_ring *ring; > > - ktime_t start_timestamp; > > -}; > > - > > Oh, that handling here is completely broken. > > The MCBP muxer overwrites fields in the job because of this ^^. > > I think that patch needs to be a bug fix we even backport.
What is broken in the muxer code? Alex > > Regards, > Christian. > > > static struct kmem_cache *amdgpu_fence_slab; > > > > int amdgpu_fence_slab_init(void) > > @@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, > > struct dma_fence **f, struct amd > > am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC); > > if (am_fence == NULL) > > return -ENOMEM; > > - fence = &am_fence->base; > > - am_fence->ring = ring; > > } else { > > /* take use of job-embedded fence */ > > - fence = &job->hw_fence; > > + am_fence = &job->hw_fence; > > } > > + fence = &am_fence->base; > > + am_fence->ring = ring; > > > > seq = ++ring->fence_drv.sync_seq; > > if (job && job->job_run_counter) { > > @@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct > > amdgpu_ring *ring) > > * it right here or we won't be able to track them in > > fence_drv > > * and they will remain unsignaled during sa_bo free. > > */ > > - job = container_of(old, struct amdgpu_job, hw_fence); > > + job = container_of(old, struct amdgpu_job, > > hw_fence.base); > > if (!job->base.s_fence && !dma_fence_is_signaled(old)) > > dma_fence_signal(old); > > RCU_INIT_POINTER(*ptr, NULL); > > @@ -780,7 +764,7 @@ static const char > > *amdgpu_fence_get_timeline_name(struct dma_fence *f) > > > > static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f) > > { > > - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); > > + struct amdgpu_job *job = container_of(f, struct amdgpu_job, > > hw_fence.base); > > > > return (const char *)to_amdgpu_ring(job->base.sched)->name; > > } > > @@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct > > dma_fence *f) > > */ > > static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f) > > { > > - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); > > + struct amdgpu_job *job = container_of(f, struct amdgpu_job, > > hw_fence.base); > > > > if > > 
(!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer)) > > > > amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched)); > > @@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu) > > struct dma_fence *f = container_of(rcu, struct dma_fence, rcu); > > > > /* free job if fence has a parent job */ > > - kfree(container_of(f, struct amdgpu_job, hw_fence)); > > + kfree(container_of(f, struct amdgpu_job, hw_fence.base)); > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > index acb21fc8b3ce5..ddb9d3269357c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > @@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) > > /* Check if any fences where initialized */ > > if (job->base.s_fence && job->base.s_fence->finished.ops) > > f = &job->base.s_fence->finished; > > - else if (job->hw_fence.ops) > > - f = &job->hw_fence; > > + else if (job->hw_fence.base.ops) > > + f = &job->hw_fence.base; > > else > > f = NULL; > > > > @@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job > > *s_job) > > amdgpu_sync_free(&job->explicit_sync); > > > > /* only put the hw fence if has embedded fence */ > > - if (!job->hw_fence.ops) > > + if (!job->hw_fence.base.ops) > > kfree(job); > > else > > - dma_fence_put(&job->hw_fence); > > + dma_fence_put(&job->hw_fence.base); > > } > > > > void amdgpu_job_set_gang_leader(struct amdgpu_job *job, > > @@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job) > > if (job->gang_submit != &job->base.s_fence->scheduled) > > dma_fence_put(job->gang_submit); > > > > - if (!job->hw_fence.ops) > > + if (!job->hw_fence.base.ops) > > kfree(job); > > else > > - dma_fence_put(&job->hw_fence); > > + dma_fence_put(&job->hw_fence.base); > > } > > > > struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job) > > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > index f2c049129661f..931fed8892cc1 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > @@ -48,7 +48,7 @@ struct amdgpu_job { > > struct drm_sched_job base; > > struct amdgpu_vm *vm; > > struct amdgpu_sync explicit_sync; > > - struct dma_fence hw_fence; > > + struct amdgpu_fence hw_fence; > > struct dma_fence *gang_submit; > > uint32_t preamble_status; > > uint32_t preemption_status; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > index b95b471107692..e1f25218943a4 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > @@ -127,6 +127,22 @@ struct amdgpu_fence_driver { > > struct dma_fence **fences; > > }; > > > > +/* > > + * Fences mark an event in the GPUs pipeline and are used > > + * for GPU/CPU synchronization. When the fence is written, > > + * it is expected that all buffers associated with that fence > > + * are no longer in use by the associated ring on the GPU and > > + * that the relevant GPU caches have been flushed. > > + */ > > + > > +struct amdgpu_fence { > > + struct dma_fence base; > > + > > + /* RB, DMA, etc. */ > > + struct amdgpu_ring *ring; > > + ktime_t start_timestamp; > > +}; > > + > > extern const struct drm_sched_backend_ops amdgpu_sched_ops; > > > > void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring); >