If the ring submission is stalled on an external request, nothing can be
submitted, not even the heartbeat in the kernel context. Since nothing
is running, resetting the engine/device does not unblock the system and
is pointless. We can check whether the heartbeat request has actually
been submitted (i.e. is supposed to be running) before declaring foul.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 5136c8bf112d..f67ad937eefb 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -48,8 +48,10 @@ static void show_heartbeat(const struct i915_request *rq,
        struct drm_printer p = drm_debug_printer("heartbeat");
 
        intel_engine_dump(engine, &p,
-                         "%s heartbeat {prio:%d} not ticking\n",
+                         "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
                          engine->name,
+                         rq->fence.context,
+                         rq->fence.seqno,
                          rq->sched.attr.priority);
 }
 
@@ -76,8 +78,19 @@ static void heartbeat(struct work_struct *wrk)
                goto out;
 
        if (engine->heartbeat.systole) {
-               if (engine->schedule &&
-                   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
+               if (!i915_sw_fence_signaled(&rq->submit)) {
+                       /*
+                        * Not yet submitted, system is stalled.
+                        *
+                        * This more often happens for ring submission,
+                        * where all contexts are funnelled into a common
+                        * ringbuffer. If one context is blocked on an
+                        * external fence, not only is it not submitted,
+                        * but all other contexts, including the kernel
+                        * context, are stuck waiting for the signal.
+                        */
+               } else if (engine->schedule &&
+                          rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
                        /*
                         * Gradually raise the priority of the heartbeat to
                         * give high priority work [which presumably desires
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to