On Mon, May 13, 2013 at 04:32:12PM +0300, Mika Kuoppala wrote:
> Keep track of ring seqno progress and if there are no
> progress detected, declare hang. Use actual head (acthd)
> to distinguish between ring stuck and batchbuffer looping
> situation. Stuck ring will be kicked to trigger progress.
> 
> v2: use atchd to detect stuck ring from loop (Ben Widawsky)
> 
> v3: Use acthd to check when ring needs kicking.
> Declare hang on third time in order to give time for
> kick_ring to take effect.
> 
> Signed-off-by: Mika Kuoppala <mika.kuopp...@intel.com>
>
>
>
> ---
>  drivers/gpu/drm/i915/i915_irq.c         |   80 
> ++++++++++++++++++-------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |    2 +
>  2 files changed, 49 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6efe3b0..5dde61f 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -681,7 +681,6 @@ static void notify_ring(struct drm_device *dev,
>  
>       wake_up_all(&ring->irq_queue);
>       if (i915_enable_hangcheck) {
> -             dev_priv->gpu_error.hangcheck_count = 0;
>               mod_timer(&dev_priv->gpu_error.hangcheck_timer,
>                         round_jiffies_up(jiffies + 
> DRM_I915_HANGCHECK_JIFFIES));
>       }
> @@ -2381,61 +2380,76 @@ static bool i915_hangcheck_hung(struct drm_device 
> *dev)
>  
>  /**
>   * This is called when the chip hasn't reported back with completed
> - * batchbuffers in a long time. The first time this is called we simply 
> record
> - * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
> - * again, we assume the chip is wedged and try to fix it.
> + * batchbuffers in a long time. We keep track per ring seqno progress and
> + * if there are no progress, hangcheck score for that ring is increased.
> + * Further, acthd is inspected to see if the ring is stuck. On stuck case
> + * we kick the ring. If we see no progress on three subsequent calls
> + * we assume chip is wedged and try to fix it by resetting the chip.
>   */
>  void i915_hangcheck_elapsed(unsigned long data)
>  {
>       struct drm_device *dev = (struct drm_device *)data;
>       drm_i915_private_t *dev_priv = dev->dev_private;
>       struct intel_ring_buffer *ring;
> -     bool err = false, idle;
>       int i;
> -     u32 seqno[I915_NUM_RINGS];
> -     bool work_done;
> +     int busy_count = 0, rings_hung = 0;
> +     bool stuck[I915_NUM_RINGS];
>  
>       if (!i915_enable_hangcheck)
>               return;
>  
> -     idle = true;
>       for_each_ring(ring, dev_priv, i) {
> -             seqno[i] = ring->get_seqno(ring, false);
> -             idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
> -     }
> +             u32 seqno, acthd;
> +             bool idle, err = false;
> +
> +             seqno = ring->get_seqno(ring, false);
> +             acthd = intel_ring_get_active_head(ring);
> +             idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> +             stuck[i] = ring->hangcheck.acthd == acthd;
> +
> +             if (idle) {
> +                     if (err)
> +                             ring->hangcheck.score += 2;
> +                     else
> +                             ring->hangcheck.score = 0;
> +             } else {
> +                     busy_count++;
>  
> -     /* If all work is done then ACTHD clearly hasn't advanced. */
> -     if (idle) {
> -             if (err) {
> -                     if (i915_hangcheck_hung(dev))
> -                             return;
> +                     if (ring->hangcheck.seqno == seqno) {
> +                             ring->hangcheck.score++;
>  
> -                     goto repeat;
> +                             /* Kick ring if stuck*/
> +                             if (stuck[i])
> +                                     i915_hangcheck_ring_hung(ring);
> +                     } else {
> +                             ring->hangcheck.score = 0;
> +                     }
>               }
>  
> -             dev_priv->gpu_error.hangcheck_count = 0;
> -             return;
> +             ring->hangcheck.seqno = seqno;
> +             ring->hangcheck.acthd = acthd;
>       }
>  
> -     work_done = false;
>       for_each_ring(ring, dev_priv, i) {
> -             if (ring->hangcheck.seqno != seqno[i]) {
> -                     work_done = true;
> -                     ring->hangcheck.seqno = seqno[i];
> +             if (ring->hangcheck.score > 2) {
> +                     rings_hung++;
> +                     DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
> +                               stuck[i] ? "stuck" : "no progress",
> +                               stuck[i] ? "addr" : "seqno",
> +                               stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
> +                               ring->hangcheck.seqno);
>               }
>       }
>  
> -     if (!work_done) {
> -             if (i915_hangcheck_hung(dev))
> -                     return;
> -     } else {
> -             dev_priv->gpu_error.hangcheck_count = 0;
> -     }
> +     if (rings_hung)
> +             return i915_handle_error(dev, true);
>  
> -repeat:
> -     /* Reset timer case chip hangs without another request being added */
> -     mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> -               round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
> +     if (busy_count)
> +             /* Reset timer case chip hangs without another request
> +              * being added */
> +             mod_timer(&dev_priv->gpu_error.hangcheck_timer,
> +                       round_jiffies_up(jiffies +
> +                                        DRM_I915_HANGCHECK_JIFFIES));
>  }
>  
>  /* drm_dma.h hooks
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index ef374a8..5886667 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -39,6 +39,8 @@ struct  intel_hw_status_page {
>  
>  struct intel_ring_hangcheck {
>       u32 seqno;
> +     u32 acthd;
> +     int score;
>  };
>  
>  struct  intel_ring_buffer {
>

Maybe I'm just stating the functional changes of the patch, but in case
they were unintended here is what I see as potential issues:

1. If ring B is waiting on ring A via semaphore, and ring A is making
   progress, albeit slowly - the hangcheck will fire. The check will
   determine that A is moving, however ring B will appear hung because
   the ACTHD doesn't move. I honestly can't say if that's actually a
   realistic problem to hit it probably implies the timeout value is too
   low.

2. There's also another corner case on the kick. If the seqno = 2
   (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
   we try to kick it... we don't actually try to find out if the kick
   helped.

#2 doesn't worry me much, just wanted to point it out. The question then
is, did I make #1 up? If not, what should we do?

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to