On Fri, Jun 6, 2025 at 7:39 AM Christian König <christian.koe...@amd.com> wrote: > > On 6/6/25 08:43, Alex Deucher wrote: > > Use the amdgpu fence container so we can store additional > > data in the fence. > > > > Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 30 +++++---------------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 ++++----- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 16 +++++++++++ > > 6 files changed, 32 insertions(+), 32 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > index 8e626f50b362e..f81608330a3d0 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > > @@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct > > amdgpu_ring *ring) > > continue; > > } > > job = to_amdgpu_job(s_job); > > - if (preempted && (&job->hw_fence) == fence) > > + if (preempted && (&job->hw_fence.base) == fence) > > /* mark the job as preempted */ > > job->preemption_status |= AMDGPU_IB_PREEMPTED; > > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index ea565651f7459..8298e95e4543e 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -6375,7 +6375,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > > *adev, > > * > > * job->base holds a reference to parent fence > > */ > > - if (job && dma_fence_is_signaled(&job->hw_fence)) { > > + if (job && dma_fence_is_signaled(&job->hw_fence.base)) { > > job_signaled = true; > > dev_info(adev->dev, "Guilty job already signaled, skipping HW > > reset"); > > goto skip_hw_reset; > > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > index 2f24a6aa13bf6..569e0e5373927 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > @@ -41,22 +41,6 @@ > > #include "amdgpu_trace.h" > > #include "amdgpu_reset.h" > > > > -/* > > - * Fences mark an event in the GPUs pipeline and are used > > - * for GPU/CPU synchronization. When the fence is written, > > - * it is expected that all buffers associated with that fence > > - * are no longer in use by the associated ring on the GPU and > > - * that the relevant GPU caches have been flushed. > > - */ > > - > > -struct amdgpu_fence { > > - struct dma_fence base; > > - > > - /* RB, DMA, etc. */ > > - struct amdgpu_ring *ring; > > - ktime_t start_timestamp; > > -}; > > - > > Oh, that handling here is completely broken. > > The MCBP muxer overwrites fields in the job because of this ^^. > > I think that patch needs to be a bug fix we even backport.
What is broken in the muxer code? Alex > > Regards, > Christian. > > > static struct kmem_cache *amdgpu_fence_slab; > > > > int amdgpu_fence_slab_init(void) > > @@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, > > struct dma_fence **f, struct amd > > am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC); > > if (am_fence == NULL) > > return -ENOMEM; > > - fence = &am_fence->base; > > - am_fence->ring = ring; > > } else { > > /* take use of job-embedded fence */ > > - fence = &job->hw_fence; > > + am_fence = &job->hw_fence; > > } > > + fence = &am_fence->base; > > + am_fence->ring = ring; > > > > seq = ++ring->fence_drv.sync_seq; > > if (job && job->job_run_counter) { > > @@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct > > amdgpu_ring *ring) > > * it right here or we won't be able to track them in > > fence_drv > > * and they will remain unsignaled during sa_bo free. > > */ > > - job = container_of(old, struct amdgpu_job, hw_fence); > > + job = container_of(old, struct amdgpu_job, > > hw_fence.base); > > if (!job->base.s_fence && !dma_fence_is_signaled(old)) > > dma_fence_signal(old); > > RCU_INIT_POINTER(*ptr, NULL); > > @@ -780,7 +764,7 @@ static const char > > *amdgpu_fence_get_timeline_name(struct dma_fence *f) > > > > static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f) > > { > > - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); > > + struct amdgpu_job *job = container_of(f, struct amdgpu_job, > > hw_fence.base); > > > > return (const char *)to_amdgpu_ring(job->base.sched)->name; > > } > > @@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct > > dma_fence *f) > > */ > > static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f) > > { > > - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); > > + struct amdgpu_job *job = container_of(f, struct amdgpu_job, > > hw_fence.base); > > > > if > > 
(!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer)) > > > > amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched)); > > @@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu) > > struct dma_fence *f = container_of(rcu, struct dma_fence, rcu); > > > > /* free job if fence has a parent job */ > > - kfree(container_of(f, struct amdgpu_job, hw_fence)); > > + kfree(container_of(f, struct amdgpu_job, hw_fence.base)); > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > index acb21fc8b3ce5..ddb9d3269357c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > @@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) > > /* Check if any fences where initialized */ > > if (job->base.s_fence && job->base.s_fence->finished.ops) > > f = &job->base.s_fence->finished; > > - else if (job->hw_fence.ops) > > - f = &job->hw_fence; > > + else if (job->hw_fence.base.ops) > > + f = &job->hw_fence.base; > > else > > f = NULL; > > > > @@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job > > *s_job) > > amdgpu_sync_free(&job->explicit_sync); > > > > /* only put the hw fence if has embedded fence */ > > - if (!job->hw_fence.ops) > > + if (!job->hw_fence.base.ops) > > kfree(job); > > else > > - dma_fence_put(&job->hw_fence); > > + dma_fence_put(&job->hw_fence.base); > > } > > > > void amdgpu_job_set_gang_leader(struct amdgpu_job *job, > > @@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job) > > if (job->gang_submit != &job->base.s_fence->scheduled) > > dma_fence_put(job->gang_submit); > > > > - if (!job->hw_fence.ops) > > + if (!job->hw_fence.base.ops) > > kfree(job); > > else > > - dma_fence_put(&job->hw_fence); > > + dma_fence_put(&job->hw_fence.base); > > } > > > > struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job) > > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > index f2c049129661f..931fed8892cc1 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > @@ -48,7 +48,7 @@ struct amdgpu_job { > > struct drm_sched_job base; > > struct amdgpu_vm *vm; > > struct amdgpu_sync explicit_sync; > > - struct dma_fence hw_fence; > > + struct amdgpu_fence hw_fence; > > struct dma_fence *gang_submit; > > uint32_t preamble_status; > > uint32_t preemption_status; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > index b95b471107692..e1f25218943a4 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > @@ -127,6 +127,22 @@ struct amdgpu_fence_driver { > > struct dma_fence **fences; > > }; > > > > +/* > > + * Fences mark an event in the GPUs pipeline and are used > > + * for GPU/CPU synchronization. When the fence is written, > > + * it is expected that all buffers associated with that fence > > + * are no longer in use by the associated ring on the GPU and > > + * that the relevant GPU caches have been flushed. > > + */ > > + > > +struct amdgpu_fence { > > + struct dma_fence base; > > + > > + /* RB, DMA, etc. */ > > + struct amdgpu_ring *ring; > > + ktime_t start_timestamp; > > +}; > > + > > extern const struct drm_sched_backend_ops amdgpu_sched_ops; > > > > void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring); >