On Mon, Jun 2, 2025 at 10:36 AM Christian König <christian.koe...@amd.com> wrote: > > On 5/29/25 22:07, Alex Deucher wrote: > > We need to know the wptr and sequence number associated > > with a job so that we can re-emit the unprocessed state > > after a ring reset. Pre-allocate storage space for > > the ring buffer contents and add a helper to save off > > the unprocessed state so that it can be re-emitted > > after the queue is reset. > > > > Add a helper that ring reset callbacks can use to verify > > that the ring has reset successfully and to reemit any > > unprocessed ring contents from subsequent jobs. > > > > Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 12 ++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 +++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 ++- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 46 +++++++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 8 ++++ > > 6 files changed, 78 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > index 2f24a6aa13bf6..319548ac58820 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > > @@ -764,6 +764,18 @@ void amdgpu_fence_driver_force_completion(struct > > amdgpu_ring *ring) > > amdgpu_fence_process(ring); > > } > > > > +/** > > + * amdgpu_fence_driver_seq_force_completion - force signal of specified > > sequence > > + * > > + * @ring: fence of the ring to signal > > + * > > + */ > > +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring, > > u32 seq) > > +{ > > + amdgpu_fence_write(ring, seq); > > + amdgpu_fence_process(ring); > > +} > > + > > /* > > * Common fence implementation > > */ > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > > index 802743efa3b39..67df82d50a74a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c > > @@ -306,6 +306,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, > > unsigned int num_ibs, > > > > amdgpu_ring_ib_end(ring); > > amdgpu_ring_commit(ring); > > + /* This must be last for resets to work properly > > + * as we need to save the wptr associated with this > > + * job. > > + */ > > + if (job) > > + job->ring_wptr = ring->wptr; > > First of all such state should *absolutely* not be made part of the job. That > belongs into the HW fence.
Done. Updated patches pushed here: https://gitlab.freedesktop.org/agd5f/linux/-/commits/kq_resets?ref_type=heads > > Then we need to handle the case that one application submitted multiple jobs > which potentially depend on each other. > > I think we should rather put this logic into > amdgpu_device_enforce_isolation(). I'm not quite sure I understand what you are proposing. Is the idea to track all of the jobs associated with a particular process and then when we reset a queue, skip all of the ring contents associated with those jobs and signal and set the error on all of their job fences? What about cross ring dependencies? Alex > > Regards, > Christian. > > > > return 0; > > } > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > index a0fab947143b5..f0f752284b925 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > @@ -91,6 +91,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > > drm_sched_job *s_job) > > struct amdgpu_job *job = to_amdgpu_job(s_job); > > struct amdgpu_task_info *ti; > > struct amdgpu_device *adev = ring->adev; > > + struct dma_fence *fence = &job->hw_fence; > > int idx; > > int r; > > > > @@ -154,8 +155,10 @@ static enum drm_gpu_sched_stat > > amdgpu_job_timedout(struct drm_sched_job *s_job) > > else > > is_guilty = true; > > > > - if (is_guilty) > > + if (is_guilty) { > > + amdgpu_ring_backup_unprocessed_jobs(ring, > > job->ring_wptr, fence->seqno); > > dma_fence_set_error(&s_job->s_fence->finished, > > -ETIME); > > + } > > > > r = amdgpu_ring_reset(ring, job->vmid); > > if (!r) { > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > index f2c049129661f..c2ed0edb5179d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > @@ -79,6 +79,8 @@ struct amdgpu_job { > > /* enforce isolation */ > > bool enforce_isolation; > > bool run_cleaner_shader; > > + /* wptr for the job for resets */ > > + uint32_t ring_wptr; > > > > uint32_t num_ibs; > > struct amdgpu_ib ibs[]; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > > index 426834806fbf2..909b121d432cb 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > > @@ -333,6 +333,12 @@ int amdgpu_ring_init(struct amdgpu_device *adev, > > struct amdgpu_ring *ring, > > /* Initialize cached_rptr to 0 */ > > ring->cached_rptr = 0; > > > > + if (!ring->ring_backup) { > > + ring->ring_backup = kvzalloc(ring->ring_size, GFP_KERNEL); > > + if (!ring->ring_backup) > > + return -ENOMEM; > > + } > > + > > /* Allocate ring buffer */ > > if (ring->ring_obj == NULL) { > > r = amdgpu_bo_create_kernel(adev, ring->ring_size + > > ring->funcs->extra_dw, PAGE_SIZE, > > @@ -342,6 +348,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct > > amdgpu_ring *ring, > > (void **)&ring->ring); > > if (r) { > > dev_err(adev->dev, "(%d) ring create failed\n", r); > > + kvfree(ring->ring_backup); > > return r; > > } > > amdgpu_ring_clear_ring(ring); > > @@ -385,6 +392,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) > > amdgpu_bo_free_kernel(&ring->ring_obj, > > &ring->gpu_addr, > > (void **)&ring->ring); > > + kvfree(ring->ring_backup); > > + ring->ring_backup = NULL; > > > > dma_fence_put(ring->vmid_wait); > > ring->vmid_wait = NULL; > > @@ -753,3 +762,40 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring) > > > > return true; > > } > > + > > +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring, > > + u64 bad_wptr, u32 bad_seq) > > +{ > > + unsigned int entries_to_copy = ring->wptr - bad_wptr; > > + unsigned int idx, i; > > + > > + for (i = 0; i < entries_to_copy; i++) { > > + idx = (bad_wptr + i) & ring->buf_mask; > > + ring->ring_backup[i] = ring->ring[idx]; > > + } > > + ring->ring_backup_entries_to_copy = entries_to_copy; > > + ring->ring_backup_seq = bad_seq; > > +} > > + > > +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring) > > +{ > > + unsigned int i; > > + int r; > > + > > + /* signal the fence of the bad job */ > > + amdgpu_fence_driver_seq_force_completion(ring, ring->ring_backup_seq); > > + /* verify that the ring is functional */ > > + r = amdgpu_ring_test_ring(ring); > > + if (r) > > + return r; > > + /* re-emit the unprocessed ring contents */ > > + if (ring->ring_backup_entries_to_copy) { > > + if (amdgpu_ring_alloc(ring, > > ring->ring_backup_entries_to_copy)) > > + return -ENOMEM; > > + for (i = 0; i < ring->ring_backup_entries_to_copy; i++) > > + amdgpu_ring_write(ring, ring->ring_backup[i]); > > + amdgpu_ring_commit(ring); > > + } > > + > > + return r; > > +} > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > index b95b471107692..fd08449eee33f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > > @@ -132,6 +132,8 @@ extern const struct drm_sched_backend_ops > > amdgpu_sched_ops; > > void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring); > > void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error); > > void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring); > > +void amdgpu_fence_driver_seq_force_completion(struct amdgpu_ring *ring, > > + u32 seq); > > > > int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring); > > int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, > > @@ -268,6 +270,9 @@ struct amdgpu_ring { > > > > struct amdgpu_bo *ring_obj; > > uint32_t *ring; > > + uint32_t *ring_backup; > > + uint32_t ring_backup_seq; > > + unsigned int ring_backup_entries_to_copy; > > unsigned rptr_offs; > > u64 rptr_gpu_addr; > > volatile u32 *rptr_cpu_addr; > > @@ -534,4 +539,7 @@ int amdgpu_ib_pool_init(struct amdgpu_device *adev); > > void amdgpu_ib_pool_fini(struct amdgpu_device *adev); > > int amdgpu_ib_ring_tests(struct amdgpu_device *adev); > > bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring); > > +void amdgpu_ring_backup_unprocessed_jobs(struct amdgpu_ring *ring, > > + u64 bad_wptr, u32 bad_seq); > > +int amdgpu_ring_reemit_unprocessed_jobs(struct amdgpu_ring *ring); > > #endif >