-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of
Christian König
Sent: Thursday, August 23, 2018 7:24 PM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/4] drm/amdgpu: add ring soft recovery v3
Instead of hammering hard on the GPU try a soft recovery first.
v2: reorder code a bit
v3: increase timeout to 10ms, increment GPU reset counter
Signed-off-by: Christian König <christian.koe...@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 ++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25
+++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 ++++
3 files changed, 35 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 265ff90f4e01..d93e31a5c4e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct
drm_sched_job *s_job)
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
+ if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence-
parent)) {
+ DRM_ERROR("ring %s timeout, but soft recovered\n",
+ s_job->sched->name);
+ return;
+ }
+
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
job->base.sched->name, atomic_read(&ring-
fence_drv.last_seq),
ring->fence_drv.sync_seq);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 5dfd26be1eec..d445acb3d956 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -383,6 +383,31 @@ void
amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask); }
+/**
+ * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
+ *
+ * @ring: ring to try the recovery on
+ * @vmid: VMID we try to get going again
+ * @fence: timedout fence
+ *
+ * Tries to get a ring proceeding again when it is stuck.
+ */
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int
vmid,
+ struct dma_fence *fence)
+{
+ ktime_t deadline = ktime_add_us(ktime_get(), 10000);
+
+ if (!ring->funcs->soft_recovery)
+ return false;
+
+ atomic_inc(&adev->gpu_reset_counter);
+ while (!dma_fence_is_signaled(fence) &&
+ ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
+ ring->funcs->soft_recovery(ring, vmid);