On 5/29/25 22:07, Alex Deucher wrote:
> We need to know the wptr and sequence number associated
> with a job so that we can re-emit the unprocessed state
> after a ring reset.  Pre-allocate storage space for
> the ring buffer contents and add a helper to save off
> the unprocessed state so that it can be re-emitted
> after the queue is reset.
> 
> Add a helper that ring reset callbacks can use to verify
> that the ring has reset successfully and to reemit any
> unprocessed ring contents from subsequent jobs.
> 
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 12 ++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  6 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  5 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h   |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c  | 46 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  8 ++++
>  6 files changed, 78 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 2f24a6aa13bf6..319548ac58820 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -764,6 +764,18 @@ void amdgpu_fence_driver_force_completion(struct 
> amdgpu_ring *ring)
>       amdgpu_fence_process(ring);
>  }
>  
> +/**
> + * amdgpu_fence_driver_seq_force_completion - force signal of specified 
> sequence
> + *
> + * @ring: fence of the ring to signal
> + *
> + */
> +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring, u32 
> seq)
> +{
> +     amdgpu_fence_write(ring, seq);
> +     amdgpu_fence_process(ring);
> +}
> +
>  /*
>   * Common fence implementation
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 802743efa3b39..67df82d50a74a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -306,6 +306,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, 
> unsigned int num_ibs,
>  
>       amdgpu_ring_ib_end(ring);
>       amdgpu_ring_commit(ring);
> +     /* This must be last for resets to work properly
> +      * as we need to save the wptr associated with this
> +      * job.
> +      */
> +     if (job)
> +             job->ring_wptr = ring->wptr;

First of all such state should *absolutely* not be made part of the job. That 
belongs into the HW fence.

Then we need to handle the case that one application submitted multiple jobs 
which potentially depend on each other.

I think we should rather put this logic into amdgpu_device_enforce_isolation().

Regards,
Christian.


>       return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index a0fab947143b5..f0f752284b925 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -91,6 +91,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct 
> drm_sched_job *s_job)
>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>       struct amdgpu_task_info *ti;
>       struct amdgpu_device *adev = ring->adev;
> +     struct dma_fence *fence = &job->hw_fence;
>       int idx;
>       int r;
>  
> @@ -154,8 +155,10 @@ static enum drm_gpu_sched_stat 
> amdgpu_job_timedout(struct drm_sched_job *s_job)
>               else
>                       is_guilty = true;
>  
> -             if (is_guilty)
> +             if (is_guilty) {
> +                     amdgpu_ring_backup_unprocessed_jobs(ring, 
> job->ring_wptr, fence->seqno);
>                       dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> +             }
>  
>               r = amdgpu_ring_reset(ring, job->vmid);
>               if (!r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index f2c049129661f..c2ed0edb5179d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -79,6 +79,8 @@ struct amdgpu_job {
>       /* enforce isolation */
>       bool                    enforce_isolation;
>       bool                    run_cleaner_shader;
> +     /* wptr for the job for resets */
> +     uint32_t                ring_wptr;
>  
>       uint32_t                num_ibs;
>       struct amdgpu_ib        ibs[];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 426834806fbf2..909b121d432cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -333,6 +333,12 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
> amdgpu_ring *ring,
>       /*  Initialize cached_rptr to 0 */
>       ring->cached_rptr = 0;
>  
> +     if (!ring->ring_backup) {
> +             ring->ring_backup = kvzalloc(ring->ring_size, GFP_KERNEL);
> +             if (!ring->ring_backup)
> +                     return -ENOMEM;
> +     }
> +
>       /* Allocate ring buffer */
>       if (ring->ring_obj == NULL) {
>               r = amdgpu_bo_create_kernel(adev, ring->ring_size + 
> ring->funcs->extra_dw, PAGE_SIZE,
> @@ -342,6 +348,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct 
> amdgpu_ring *ring,
>                                           (void **)&ring->ring);
>               if (r) {
>                       dev_err(adev->dev, "(%d) ring create failed\n", r);
> +                     kvfree(ring->ring_backup);
>                       return r;
>               }
>               amdgpu_ring_clear_ring(ring);
> @@ -385,6 +392,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
>       amdgpu_bo_free_kernel(&ring->ring_obj,
>                             &ring->gpu_addr,
>                             (void **)&ring->ring);
> +     kvfree(ring->ring_backup);
> +     ring->ring_backup = NULL;
>  
>       dma_fence_put(ring->vmid_wait);
>       ring->vmid_wait = NULL;
> @@ -753,3 +762,40 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
>  
>       return true;
>  }
> +
> +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring,
> +                                      u64 bad_wptr, u32 bad_seq)
> +{
> +     unsigned int entries_to_copy = ring->wptr - bad_wptr;
> +     unsigned int idx, i;
> +
> +     for (i = 0; i < entries_to_copy; i++) {
> +             idx = (bad_wptr + i) & ring->buf_mask;
> +             ring->ring_backup[i] = ring->ring[idx];
> +     }
> +     ring->ring_backup_entries_to_copy = entries_to_copy;
> +     ring->ring_backup_seq = bad_seq;
> +}
> +
> +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring)
> +{
> +     unsigned int i;
> +     int r;
> +
> +     /* signal the fence of the bad job */
> +     amdgpu_fence_driver_seq_force_completion(ring, ring->ring_backup_seq);
> +     /* verify that the ring is functional */
> +     r = amdgpu_ring_test_ring(ring);
> +     if (r)
> +             return r;
> +     /* re-emit the unprocessed ring contents */
> +     if (ring->ring_backup_entries_to_copy) {
> +             if (amdgpu_ring_alloc(ring, ring->ring_backup_entries_to_copy))
> +                     return -ENOMEM;
> +             for (i = 0; i < ring->ring_backup_entries_to_copy; i++)
> +                     amdgpu_ring_write(ring, ring->ring_backup[i]);
> +             amdgpu_ring_commit(ring);
> +     }
> +
> +     return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index b95b471107692..fd08449eee33f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -132,6 +132,8 @@ extern const struct drm_sched_backend_ops 
> amdgpu_sched_ops;
>  void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
>  void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
>  void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
> +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring,
> +                                           u32 seq);
>  
>  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
>  int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
> @@ -268,6 +270,9 @@ struct amdgpu_ring {
>  
>       struct amdgpu_bo        *ring_obj;
>       uint32_t                *ring;
> +     uint32_t                *ring_backup;
> +     uint32_t                ring_backup_seq;
> +     unsigned int            ring_backup_entries_to_copy;
>       unsigned                rptr_offs;
>       u64                     rptr_gpu_addr;
>       volatile u32            *rptr_cpu_addr;
> @@ -534,4 +539,7 @@ int amdgpu_ib_pool_init(struct amdgpu_device *adev);
>  void amdgpu_ib_pool_fini(struct amdgpu_device *adev);
>  int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
>  bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
> +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring,
> +                                      u64 bad_wptr, u32 bad_seq);
> +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring);
>  #endif

Reply via email to