amdgpu: don't reemit ring contents more than once

Timur Kristóf Wed, 17 Dec 2025 21:21:11 -0800

On 2025. december 15., hétfő 10:07:06 középső államokbeli zónaidő Alex Deucher 
wrote:
> If we cancel a bad job and reemit the ring contents, and
> we get another timeout, cancel everything rather than reemitting.
> The wptr markers are only relevant for the original emit.  If
> we reemit, the wptr markers are no longer correct.


I see the point of not reemitting, considering the wptrs are no longer 
correct. However, would it be possible to correct the wptrs instead?

As it is, this sounds like it would make the reset less reliable when there is 
more than one job emitted that can cause hangs.

> 
> Signed-off-by: Alex Deucher <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 22 +++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  2 ++
>  2 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index
> 1fe31d2f27060..334ddd6e48c06 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -710,6 +710,7 @@ void amdgpu_fence_driver_guilty_force_completion(struct
> amdgpu_fence *af) struct amdgpu_ring *ring = af->ring;
>       unsigned long flags;
>       u32 seq, last_seq;
> +     bool reemitted = false;
> 
>       last_seq = amdgpu_fence_read(ring) & ring-
>fence_drv.num_fences_mask;
>       seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
> @@ -727,7 +728,9 @@ void amdgpu_fence_driver_guilty_force_completion(struct
> amdgpu_fence *af) if (unprocessed &&
> !dma_fence_is_signaled_locked(unprocessed)) { fence =
> container_of(unprocessed, struct amdgpu_fence, base);
> 
> -                     if (fence == af)
> +                     if (fence->reemitted > 1)
> +                             reemitted = true;
> +                     else if (fence == af)
>                               dma_fence_set_error(&fence->base, 
-ETIME);
>                       else if (fence->context == af->context)
>                               dma_fence_set_error(&fence->base, 
-ECANCELED);
> @@ -735,9 +738,16 @@ void amdgpu_fence_driver_guilty_force_completion(struct
> amdgpu_fence *af) rcu_read_unlock();
>       } while (last_seq != seq);
>       spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
> -     /* signal the guilty fence */
> -     amdgpu_fence_write(ring, (u32)af->base.seqno);
> -     amdgpu_fence_process(ring);
> +
> +     if (reemitted) {
> +             /* if we've already reemitted once then just cancel 
everything */
> +             amdgpu_fence_driver_force_completion(af->ring);
> +             af->ring->ring_backup_entries_to_copy = 0;
> +     } else {
> +             /* signal the guilty fence */
> +             amdgpu_fence_write(ring, (u32)af->base.seqno);
> +             amdgpu_fence_process(ring);
> +     }
>  }
> 
>  void amdgpu_fence_save_wptr(struct amdgpu_fence *af)
> @@ -785,10 +795,12 @@ void amdgpu_ring_backup_unprocessed_commands(struct
> amdgpu_ring *ring, /* save everything if the ring is not guilty, otherwise
>                        * just save the content from other 
contexts.
>                        */
> -                     if (!guilty_fence || (fence->context != 
guilty_fence->context))
> +                     if (!fence->reemitted &&
> +                         (!guilty_fence || (fence->context != 
guilty_fence->context)))
>                               
amdgpu_ring_backup_unprocessed_command(ring, wptr,
>                                                               
       fence->wptr);
>                       wptr = fence->wptr;
> +                     fence->reemitted++;
>               }
>               rcu_read_unlock();
>       } while (last_seq != seq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index
> a1fb0fadb6eab..d881829528976 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -150,6 +150,8 @@ struct amdgpu_fence {
>       u64                             wptr;
>       /* fence context for resets */
>       u64                             context;
> +     /* has this fence been reemitted */
> +     unsigned int                    reemitted;
>  };
> 
>  extern const struct drm_sched_backend_ops amdgpu_sched_ops;

Re: [PATCH 1/6] drm/amdgpu: don't reemit ring contents more than once

Reply via email to