If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c              | 34 ++++++++++++++++++-
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 ++++-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 ++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..5d5ce91a5dfa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1878,6 +1878,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
        spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+       const struct intel_ring *ring = rq->ring;
+       const u32 *regs = rq->hw_context->lrc_reg_state;
+
+       /* Quick spot check for the common signs of context corruption */
+       /* Saved ring control must be exactly RING_CTL_SIZE(size) | RING_VALID */
+       if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+           (RING_CTL_SIZE(ring->size) | RING_VALID))
+               return false;
+       /* Saved ring start must equal the GGTT offset of the ring's vma */
+       if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+               return false;
+       /* Only the two entries above are examined; both matched rq->ring */
+       return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1929,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
        if (!rq)
                goto out_unlock;
 
+       /*
+        * If this request hasn't started yet, e.g. it is waiting on a
+        * semaphore, we need to avoid skipping the request or else we
+        * break the signaling chain. However, if the context is corrupt
+        * the request will not restart and we will be stuck with a wedged
+        * device. It is quite often the case that if we issue a reset
+        * while the GPU is loading the context image, that context image
+        * becomes corrupt.
+        *
+        * Otherwise, if we have not started yet, the request should replay
+        * perfectly and we do not need to flag the result as being erroneous.
+        */
+       if (!i915_request_started(rq) && lrc_regs_ok(rq))
+               goto out_unlock;
+
        /*
         * If the request was innocent, we leave the request in the ELSP
         * and will try to replay it on restarting. The context image may
@@ -1924,7 +1956,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
         * image back to the expected values to skip over the guilty request.
         */
        i915_reset_request(rq, stalled);
-       if (!stalled)
+       if (!stalled && lrc_regs_ok(rq))
                goto out_unlock;
 
        /*
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
        *batch++ = upper_32_bits(vma->node.start);
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
 
-       i915_gem_chipset_flush(spin->i915);
+       if (engine->emit_init_breadcrumb &&
+           rq->timeline->has_initial_breadcrumb) {
+               err = engine->emit_init_breadcrumb(rq);
+               if (err)
+                       goto cancel_rq;
+       }
 
        err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
+       i915_gem_chipset_flush(spin->i915);
+
 cancel_rq:
        if (err) {
                i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..36c17bfe05a7 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;
 
+       if (rq->engine->emit_init_breadcrumb) {
+               err = rq->engine->emit_init_breadcrumb(rq);
+               if (err)
+                       goto cancel_rq;
+       }
+
        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 
 cancel_rq:
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to