On Fri, Jun 07, 2013 at 09:55:51AM +0100, Chris Wilson wrote:
> If we detect a ring is in a valid wait for another, just let it be.
> Eventually it will either begin to progress again, or the entire system
> will come grinding to a halt and then hangcheck will fire as soon as the
> deadlock is detected.
> 
> This error was foretold by Ben in
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> Date:   Thu May 30 09:04:29 2013 +0300
> 
>     drm/i915: detect hang using per ring hangcheck_score
> 
> "If ring B is waiting on ring A via semaphore, and ring A is making
> progress, albeit slowly - the hangcheck will fire. The check will
> determine that A is moving, however ring B will appear hung because
> the ACTHD doesn't move. I honestly can't say if that's actually a
> realistic problem to hit it probably implies the timeout value is too
> low."
> 
> v2: Make sure we don't even incur the KICK cost whilst waiting.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> Cc: Ben Widawsky <b...@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   95 
> ++++++++++++++++++++++---------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    1 +
>  2 files changed, 70 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 2d1890d..05f8f75 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
>               i915_seqno_passed(seqno, ring_last_seqno(ring)));
>  }
>  
> -static bool semaphore_passed(struct intel_ring_buffer *ring)
> +static struct intel_ring_buffer *
> +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
>  {
>       struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -     u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> -     struct intel_ring_buffer *signaller;
> -     u32 cmd, ipehr, acthd_min;
> +     u32 cmd, ipehr, acthd, acthd_min;
>  
>       ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
>       if ((ipehr & ~(0x3 << 16)) !=
>           (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
> -             return false;
> +             return NULL;
>  
>       /* ACTHD is likely pointing to the dword after the actual command,
>        * so scan backwards until we find the MBOX.
>        */
> +     acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
>       acthd_min = max((int)acthd - 3 * 4, 0);
>       do {
>               cmd = ioread32(ring->virtual_start + acthd);
> @@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer 
> *ring)
>  
>               acthd -= 4;
>               if (acthd < acthd_min)
> -                     return false;
> +                     return NULL;
>       } while (1);
>  
> -     signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> -     return i915_seqno_passed(signaller->get_seqno(signaller, false),
> -                              ioread32(ring->virtual_start+acthd+4)+1);
> +     *seqno = ioread32(ring->virtual_start+acthd+4)+1;
> +     return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> +}
> +
>
I hated these since they were introduced... does this work still with
VECS? Seems at the very least %3 should be NUM_RINGS?
>
> +static int semaphore_passed(struct intel_ring_buffer *ring)
> +{
> +     struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +     struct intel_ring_buffer *signaller;
> +     u32 seqno, ctl;
> +
> +     ring->hangcheck.deadlock = true;
> +
> +     signaller = semaphore_waits_for(ring, &seqno);
> +     if (signaller == NULL || signaller->hangcheck.deadlock)
> +             return -1;
> +
> +     /* cursory check for an unkickable deadlock */
> +     ctl = I915_READ_CTL(signaller);
> +     if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
> +             return -1;
> +
> +     return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
>  }
>  
> -static bool ring_hung(struct intel_ring_buffer *ring)
> +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
> +{
> +     struct intel_ring_buffer *ring;
> +     int i;
> +
> +     for_each_ring(ring, dev_priv, i)
> +             ring->hangcheck.deadlock = false;
> +}
> +
> +static enum { pass, kick, hung } ring_stuck(struct intel_ring_buffer *ring, 
> u32 acthd)
>  {
>       struct drm_device *dev = ring->dev;
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       u32 tmp;
>  
> +     if (ring->hangcheck.acthd != acthd)
> +             return pass;
> +
>       if (IS_GEN2(dev))
> -             return true;
> +             return hung;
>  
>       /* Is the chip hanging on a WAIT_FOR_EVENT?
>        * If so we can simply poke the RB_WAIT bit
> @@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
>               DRM_ERROR("Kicking stuck wait on %s\n",
>                         ring->name);
>               I915_WRITE_CTL(ring, tmp);
> -             return false;
> -     }
> -
> -     if (INTEL_INFO(dev)->gen >= 6 &&
> -         tmp & RING_WAIT_SEMAPHORE &&
> -         semaphore_passed(ring)) {
> -             DRM_ERROR("Kicking stuck semaphore on %s\n",
> -                       ring->name);
> -             I915_WRITE_CTL(ring, tmp);
> -             return false;
> +             return kick;
> +     }
> +
> +     if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
> +             switch (semaphore_passed(ring)) {
> +             default:
> +                     return hung;
> +             case 1:
> +                     DRM_ERROR("Kicking stuck semaphore on %s\n",
> +                               ring->name);
> +                     I915_WRITE_CTL(ring, tmp);
> +                     return kick;
> +             case 0:
> +                     return pass;
> +             }
>       }
>  
> -     return true;
> +     return hung;
>  }
>  
>  /**
> @@ -2422,6 +2458,8 @@ void i915_hangcheck_elapsed(unsigned long data)
>       for_each_ring(ring, dev_priv, i) {
>               u32 seqno, acthd;
>  
> +             semaphore_clear_deadlocks(dev_priv);
> +
>               seqno = ring->get_seqno(ring, false);
>               acthd = intel_ring_get_active_head(ring);
>  
> @@ -2436,16 +2474,21 @@ void i915_hangcheck_elapsed(unsigned long data)
>                                       busy_count++;
>                               }
>                       } else {
> -                             stuck[i] = ring->hangcheck.acthd == acthd;
> -                             if (stuck[i]) {
> +                             switch (ring_stuck(ring, acthd)) {
> +                             case pass:
> +                                     break;
> +                             case kick:
>                                       /* Every time we kick the ring, add a
>                                        * small increment to the hangcheck
>                                        * score so that we can catch a
>                                        * batch that is repeatedly kicked.
>                                        */
>                                       ring->hangcheck.score += KICK;
> -                                     if (ring_hung(ring))
> -                                             ring->hangcheck.score += HUNG;
> +                                     break;
> +                             case hung:
> +                                     ring->hangcheck.score += HUNG;
> +                                     stuck[i] = true;
> +                                     break;
>                               }
>                               busy_count++;
>                       }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index efc403d..a3e9610 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -38,6 +38,7 @@ struct  intel_hw_status_page {
>  #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
>  
>  struct intel_ring_hangcheck {
> +     bool deadlock;
>       u32 seqno;
>       u32 acthd;
>       int score;
> -- 
> 1.7.10.4
> 
>

I'm going to give this another look tomorrow if Daniel doesn't merge it
before then. The patch is a bit hard for me to read, but the idea seems
like how I'd want it to behave.

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to