Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Only declare a missed interrupt if we find that the GPU is idle with
> waiters and a hangcheck interval has passed in which no new user
> interrupts have been raised.
>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     | 11 +++++++----
>  drivers/gpu/drm/i915/i915_irq.c         |  7 ++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
>  3 files changed, 15 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index b421b53ca128..966fc022418c 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -730,10 +730,10 @@ static int i915_gem_request_info(struct seq_file *m, 
> void *data)
>  static void i915_ring_seqno_info(struct seq_file *m,
>                                struct intel_engine_cs *ring)
>  {
> -     if (ring->get_seqno) {
> -             seq_printf(m, "Current sequence (%s): %x\n",
> -                        ring->name, ring->get_seqno(ring));
> -     }
> +     seq_printf(m, "Current sequence (%s): %x\n",
> +                ring->name, ring->get_seqno(ring));
> +     seq_printf(m, "Current user interrupts (%s): %x\n",
> +                ring->name, READ_ONCE(ring->user_interrupts));
>  }
>  
>  static int i915_gem_seqno_info(struct seq_file *m, void *data)
> @@ -1361,6 +1361,9 @@ static int i915_hangcheck_info(struct seq_file *m, void 
> *unused)
>               seq_printf(m, "%s:\n", ring->name);
>               seq_printf(m, "\tseqno = %x [current %x]\n",
>                          ring->hangcheck.seqno, seqno[i]);
> +             seq_printf(m, "\tuser interrupts = %x [current %x]\n",
> +                        ring->hangcheck.user_interrupts,
> +                        ring->user_interrupts);
>               seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
>                          (long long)ring->hangcheck.acthd,
>                          (long long)acthd[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 07bc2cdd6252..978eebcf4594 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *ring)
>               return;
>  
>       trace_i915_gem_request_notify(ring);
> +     ring->user_interrupts++;
>  

For 100% accuracy we would need (w)mb() here?

Right now you only do READ_ONCE on the reader side, which is not
enough.

I admit that the chances of hitting this are slim to none,
but is this all to avoid an mb on the irq path?

-Mika



>       wake_up_all(&ring->irq_queue);
>  }
> @@ -3097,6 +3098,7 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>       for_each_ring(ring, dev_priv, i) {
>               u64 acthd;
>               u32 seqno;
> +             unsigned user_interrupts;
>               bool busy = true;
>  
>               semaphore_clear_deadlocks(dev_priv);
> @@ -3113,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>  
>               acthd = intel_ring_get_active_head(ring);
>               seqno = ring->get_seqno(ring);
> +             user_interrupts = READ_ONCE(ring->user_interrupts);
>  
>               if (ring->hangcheck.seqno == seqno) {
>                       if (ring_idle(ring, seqno)) {
> @@ -3120,7 +3123,8 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>  
>                               if (waitqueue_active(&ring->irq_queue)) {
>                                       /* Issue a wake-up to catch stuck h/w. 
> */
> -                                     if (!test_and_set_bit(ring->id, 
> &dev_priv->gpu_error.missed_irq_rings)) {
> +                                     if (ring->hangcheck.user_interrupts == 
> user_interrupts &&
> +                                         !test_and_set_bit(ring->id, 
> &dev_priv->gpu_error.missed_irq_rings)) {
>                                               if 
> (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
>                                                       DRM_ERROR("Hangcheck 
> timer elapsed... %s idle\n",
>                                                                 ring->name);
> @@ -3187,6 +3191,7 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>  
>               ring->hangcheck.seqno = seqno;
>               ring->hangcheck.acthd = acthd;
> +             ring->hangcheck.user_interrupts = user_interrupts;
>               busy_count += busy;
>       }
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index e1797d42054c..b30ad99a54bf 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -90,6 +90,7 @@ struct intel_ring_hangcheck {
>       u64 acthd;
>       u64 max_acthd;
>       u32 seqno;
> +     unsigned user_interrupts;
>       int score;
>       enum intel_ring_hangcheck_action action;
>       int deadlock;
> @@ -301,6 +302,7 @@ struct  intel_engine_cs {
>        * inspecting request list.
>        */
>       u32 last_submitted_seqno;
> +     unsigned user_interrupts;
>  
>       bool gpu_caches_dirty;
>  
> -- 
> 2.7.0.rc3
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to