On 15.09.25 15:33, Alex Deucher wrote:
> When we backup ring contents to reemit after a queue reset,
> we don't backup ring contents from the bad context. When
> we signal the fences, we should set an error on those
> fences as well.
>
> v2: misc cleanups
> v3: add locking for fence error, fix comment (Christian)
>
> Fixes: 77cc0da39c7c ("drm/amdgpu: track ring state associated with a fence")
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 33 ++++++++++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-
> 3 files changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index fd8cca241da62..72f0f16924476 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -758,11 +758,36 @@ void amdgpu_fence_driver_force_completion(struct
> amdgpu_ring *ring)
> * @fence: fence of the ring to signal
> *
> */
> -void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
> +void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
> {
> - dma_fence_set_error(&fence->base, -ETIME);
> - amdgpu_fence_write(fence->ring, fence->seq);
> - amdgpu_fence_process(fence->ring);
> + struct dma_fence *unprocessed;
> + struct dma_fence __rcu **ptr;
> + struct amdgpu_fence *fence;
> + struct amdgpu_ring *ring = af->ring;
> + unsigned long flags;
> + u64 i, seqno;
> +
> + seqno = amdgpu_fence_read(ring);
> +
> + spin_lock_irqsave(&ring->fence_drv.lock, flags);
> + for (i = seqno + 1; i <= ring->fence_drv.sync_seq; ++i) {
That won't work, the seqno can wrap around.
I would just go over all fences, e.g. from 0 to end of array.
The checks below should make sure that we don't try to set an error on
something already processed.
> + ptr = &ring->fence_drv.fences[i &
> ring->fence_drv.num_fences_mask];
> + rcu_read_lock();
> + unprocessed = rcu_dereference(*ptr);
> +
> + if (unprocessed && !dma_fence_is_signaled(unprocessed)) {
dma_fence_is_signaled_locked(), otherwise the function would try to lock the
same spinlock again.
> + fence = container_of(unprocessed, struct amdgpu_fence,
> base);
> +
> + if (fence == af)
> + dma_fence_set_error(&fence->base, -ETIME);
> + else if (fence->context == af->context)
> + dma_fence_set_error(&fence->base, -ECANCELED);
> + }
> + rcu_read_unlock();
> + }
> + spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
> + amdgpu_fence_write(ring, af->seq);
> + amdgpu_fence_process(ring);
That's actually wrong. We want the other fences to signal later on when we
process them.
Regards,
Christian.
> }
>
> void amdgpu_fence_save_wptr(struct dma_fence *fence)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 6379bb25bf5ce..f6c9decedbd51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -812,7 +812,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
> if (r)
> return r;
>
> - /* signal the fence of the bad job */
> + /* signal the guilty fence and set an error on all fences from the
> context */
> if (guilty_fence)
> amdgpu_fence_driver_guilty_force_completion(guilty_fence);
> /* Re-emit the non-guilty commands */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 7670f5d82b9e4..0704fd9b85fdb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops
> amdgpu_sched_ops;
> void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
> void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
> void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
> -void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
> +void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
> void amdgpu_fence_save_wptr(struct dma_fence *fence);
>
> int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);