If we detect that a ring is in a valid wait for another ring, just let
it be. Eventually it will either begin to make progress again, or the
entire system will grind to a halt, at which point hangcheck will fire
as soon as the deadlock is detected.

This error was foretold by Ben in
commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuopp...@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit; it probably implies the timeout value is too
low."

v2: Make sure we don't even incur the KICK cost whilst waiting.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
Cc: Ben Widawsky <b...@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_irq.c         |   95 ++++++++++++++++++++++---------
 drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
 2 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2d1890d..05f8f75 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
                i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
        struct drm_i915_private *dev_priv = ring->dev->dev_private;
-       u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-       struct intel_ring_buffer *signaller;
-       u32 cmd, ipehr, acthd_min;
+       u32 cmd, ipehr, acthd, acthd_min;
 
        ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
        if ((ipehr & ~(0x3 << 16)) !=
            (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-               return false;
+               return NULL;
 
        /* ACTHD is likely pointing to the dword after the actual command,
         * so scan backwards until we find the MBOX.
         */
+       acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
        acthd_min = max((int)acthd - 3 * 4, 0);
        do {
                cmd = ioread32(ring->virtual_start + acthd);
@@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 
                acthd -= 4;
                if (acthd < acthd_min)
-                       return false;
+                       return NULL;
        } while (1);
 
-       signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-       return i915_seqno_passed(signaller->get_seqno(signaller, false),
-                                ioread32(ring->virtual_start+acthd+4)+1);
+       *seqno = ioread32(ring->virtual_start+acthd+4)+1;
+       return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
+}
+
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+       struct drm_i915_private *dev_priv = ring->dev->dev_private;
+       struct intel_ring_buffer *signaller;
+       u32 seqno, ctl;
+
+       ring->hangcheck.deadlock = true;
+
+       signaller = semaphore_waits_for(ring, &seqno);
+       if (signaller == NULL || signaller->hangcheck.deadlock)
+               return -1;
+
+       /* cursory check for an unkickable deadlock */
+       ctl = I915_READ_CTL(signaller);
+       if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+               return -1;
+
+       return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
 }
 
-static bool ring_hung(struct intel_ring_buffer *ring)
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+       struct intel_ring_buffer *ring;
+       int i;
+
+       for_each_ring(ring, dev_priv, i)
+               ring->hangcheck.deadlock = false;
+}
+
+static enum { pass, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 {
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
        u32 tmp;
 
+       if (ring->hangcheck.acthd != acthd)
+               return pass;
+
        if (IS_GEN2(dev))
-               return true;
+               return hung;
 
        /* Is the chip hanging on a WAIT_FOR_EVENT?
         * If so we can simply poke the RB_WAIT bit
@@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
                DRM_ERROR("Kicking stuck wait on %s\n",
                          ring->name);
                I915_WRITE_CTL(ring, tmp);
-               return false;
-       }
-
-       if (INTEL_INFO(dev)->gen >= 6 &&
-           tmp & RING_WAIT_SEMAPHORE &&
-           semaphore_passed(ring)) {
-               DRM_ERROR("Kicking stuck semaphore on %s\n",
-                         ring->name);
-               I915_WRITE_CTL(ring, tmp);
-               return false;
+               return kick;
+       }
+
+       if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+               switch (semaphore_passed(ring)) {
+               default:
+                       return hung;
+               case 1:
+                       DRM_ERROR("Kicking stuck semaphore on %s\n",
+                                 ring->name);
+                       I915_WRITE_CTL(ring, tmp);
+                       return kick;
+               case 0:
+                       return pass;
+               }
        }
 
-       return true;
+       return hung;
 }
 
 /**
@@ -2422,6 +2458,8 @@ void i915_hangcheck_elapsed(unsigned long data)
        for_each_ring(ring, dev_priv, i) {
                u32 seqno, acthd;
 
+               semaphore_clear_deadlocks(dev_priv);
+
                seqno = ring->get_seqno(ring, false);
                acthd = intel_ring_get_active_head(ring);
 
@@ -2436,16 +2474,21 @@ void i915_hangcheck_elapsed(unsigned long data)
                                        busy_count++;
                                }
                        } else {
-                               stuck[i] = ring->hangcheck.acthd == acthd;
-                               if (stuck[i]) {
+                               switch (ring_stuck(ring, acthd)) {
+                               case pass:
+                                       break;
+                               case kick:
                                        /* Every time we kick the ring, add a
                                         * small increment to the hangcheck
                                         * score so that we can catch a
                                         * batch that is repeatedly kicked.
                                         */
                                        ring->hangcheck.score += KICK;
-                                       if (ring_hung(ring))
-                                               ring->hangcheck.score += HUNG;
+                                       break;
+                               case hung:
+                                       ring->hangcheck.score += HUNG;
+                                       stuck[i] = true;
+                                       break;
                                }
                                busy_count++;
                        }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index efc403d..a3e9610 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -38,6 +38,7 @@ struct  intel_hw_status_page {
 #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
 
 struct intel_ring_hangcheck {
+       bool deadlock;
        u32 seqno;
        u32 acthd;
        int score;
-- 
1.7.10.4
