On Thu, Jun 06, 2013 at 10:45:42AM +0100, Chris Wilson wrote:
> After kicking a ring, it should be free to make progress again and so
> should not be accused of being stuck until hangcheck fires once more. In
> order to catch a denial-of-service within a batch or across multiple
> batches, we still do increment the hangcheck score - just not as
> severely so that it takes multiple kicks to fail.
> 
> This should address part of Ben's justified criticism of
> 
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> Date:   Thu May 30 09:04:29 2013 +0300
> 
>     drm/i915: detect hang using per ring hangcheck_score
> 
> "There's also another corner case on the kick. If the seqno = 2
> (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
> we try to kick it... we don't actually try to find out if the kick
> helped."
> 
> v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> Cc: Ben Widawsky <b...@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_irq.c |  105 
> ++++++++++++++++++++-------------------
>  1 file changed, 53 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 85694d7..2d1890d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
>                         struct drm_i915_gem_request, list)->seqno;
>  }
>  
> -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
> -                                  u32 ring_seqno, bool *err)
> -{
> -     if (list_empty(&ring->request_list) ||
> -         i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
> -             /* Issue a wake-up to catch stuck h/w. */
> -             if (waitqueue_active(&ring->irq_queue)) {
> -                     DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> -                               ring->name);
> -                     wake_up_all(&ring->irq_queue);
> -                     *err = true;
> -             }
> -             return true;
> -     }
> -     return false;
> +static bool
> +ring_idle(struct intel_ring_buffer *ring, u32 seqno)
> +{
> +     return (list_empty(&ring->request_list) ||
> +             i915_seqno_passed(seqno, ring_last_seqno(ring)));
>  }
>  
>  static bool semaphore_passed(struct intel_ring_buffer *ring)
> @@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer 
> *ring)
>                                ioread32(ring->virtual_start+acthd+4)+1);
>  }
>  
> -static bool kick_ring(struct intel_ring_buffer *ring)
> +static bool ring_hung(struct intel_ring_buffer *ring)
>  {
>       struct drm_device *dev = ring->dev;
>       struct drm_i915_private *dev_priv = dev->dev_private;
> -     u32 tmp = I915_READ_CTL(ring);
> +     u32 tmp;
> +
> +     if (IS_GEN2(dev))
> +             return true;
> +
> +     /* Is the chip hanging on a WAIT_FOR_EVENT?
> +      * If so we can simply poke the RB_WAIT bit
> +      * and break the hang. This should work on
> +      * all but the second generation chipsets.
> +      */
> +     tmp = I915_READ_CTL(ring);
>       if (tmp & RING_WAIT) {
>               DRM_ERROR("Kicking stuck wait on %s\n",
>                         ring->name);
>               I915_WRITE_CTL(ring, tmp);
> -             return true;
> +             return false;
>       }
>  
>       if (INTEL_INFO(dev)->gen >= 6 &&
> @@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
>               DRM_ERROR("Kicking stuck semaphore on %s\n",
>                         ring->name);
>               I915_WRITE_CTL(ring, tmp);
> -             return true;
> -     }
> -     return false;
> -}
> -
> -static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
> -{
> -     if (IS_GEN2(ring->dev))
>               return false;
> +     }
>  
> -     /* Is the chip hanging on a WAIT_FOR_EVENT?
> -      * If so we can simply poke the RB_WAIT bit
> -      * and break the hang. This should work on
> -      * all but the second generation chipsets.
> -      */
> -     return !kick_ring(ring);
> +     return true;
>  }
>  
>  /**
> @@ -2423,37 +2411,50 @@ void i915_hangcheck_elapsed(unsigned long data)
>       struct intel_ring_buffer *ring;
>       int i;
>       int busy_count = 0, rings_hung = 0;
> -     bool stuck[I915_NUM_RINGS];
> +     bool stuck[I915_NUM_RINGS] = { 0 };
> +#define KICK 5
> +#define HUNG 20
> +#define FIRE 30
>  
>       if (!i915_enable_hangcheck)
>               return;
>  
>       for_each_ring(ring, dev_priv, i) {
>               u32 seqno, acthd;
> -             bool idle, err = false;
>  
>               seqno = ring->get_seqno(ring, false);
>               acthd = intel_ring_get_active_head(ring);
> -             idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> -             stuck[i] = ring->hangcheck.acthd == acthd;
> -
> -             if (idle) {
> -                     if (err)
> -                             ring->hangcheck.score += 2;
> -                     else
> -                             ring->hangcheck.score = 0;
> -             } else {
> -                     busy_count++;
>  
> -                     if (ring->hangcheck.seqno == seqno) {
> -                             ring->hangcheck.score++;
> -
> -                             /* Kick ring if stuck*/
> -                             if (stuck[i])
> -                                     i915_hangcheck_ring_hung(ring);
> +             if (ring->hangcheck.seqno == seqno) {
> +                     if (ring_idle(ring, seqno)) {
> +                             if (waitqueue_active(&ring->irq_queue)) {
> +                                     /* Issue a wake-up to catch stuck h/w. 
> */
> +                                     DRM_ERROR("Hangcheck timer elapsed... 
> %s idle\n",
> +                                               ring->name);
> +                                     wake_up_all(&ring->irq_queue);
> +                                     ring->hangcheck.score += HUNG;
> +                                     busy_count++;
> +                             }

Correct me if I'm wrong, but we shouldn't be able to get here if the
waitqueue isn't active, so perhaps a WARN for the else?

>                       } else {
> -                             ring->hangcheck.score = 0;
> +                             stuck[i] = ring->hangcheck.acthd == acthd;
> +                             if (stuck[i]) {
> +                                     /* Every time we kick the ring, add a
> +                                      * small increment to the hangcheck
> +                                      * score so that we can catch a
> +                                      * batch that is repeatedly kicked.
> +                                      */
> +                                     ring->hangcheck.score += KICK;
> +                                     if (ring_hung(ring))
> +                                             ring->hangcheck.score += HUNG;
> +                             }
> +                             busy_count++;
>                       }
> +             } else {
> +                     /* Gradually reduce the count so that we catch DoS
> +                      * attempts across multiple batches.
> +                      */
> +                     if (ring->hangcheck.score > 0)
> +                             ring->hangcheck.score--;
>               }

This else feels weird to me. I think we'd want to decrease at least by a
multiple of KICK, or HUNG if seqno is actually moving. I'm also not
certain I understand why we're worried about DoS. Did that conversation
happen somewhere I can't find?

>  
>               ring->hangcheck.seqno = seqno;
> @@ -2461,7 +2462,7 @@ void i915_hangcheck_elapsed(unsigned long data)
>       }
>  
>       for_each_ring(ring, dev_priv, i) {
> -             if (ring->hangcheck.score > 2) {
> +             if (ring->hangcheck.score > FIRE) {
>                       rings_hung++;
>                       DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
>                                 stuck[i] ? "stuck" : "no progress",
> -- 
> 1.7.10.4
> 

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to