The idea here is to enable enforce isolation legacy behavior for GFX 10+, so we can adjust the behavior to better suit that. This aligns with how Windows handles resets and seems to work reliably in my testing on GFX 10+. For older chips, or if enforce isolation is disabled, soft recovery (wave kill) tends to work better. Additionally, only force completion if enforce isolation is not enabled.
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e57401ef85140..b1618a33d5f77 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -112,6 +112,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	amdgpu_job_core_dump(adev, job);
 
 	if (amdgpu_gpu_recovery &&
+	    !job->enforce_isolation &&
 	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
 		dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
 			s_job->sched->name);
@@ -161,7 +162,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		if (!r) {
 			if (is_guilty) {
 				atomic_inc(&ring->adev->gpu_reset_counter);
-				if (ring->funcs->type != AMDGPU_RING_TYPE_GFX)
+				if (!job->enforce_isolation)
 					amdgpu_fence_driver_force_completion(ring);
 			}
 			drm_sched_wqueue_start(&ring->sched);
-- 
2.49.0