Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Previously, we relied on only running the hangcheck while somebody was
> waiting on the GPU, in order to minimise the amount of time hangcheck
> had to run. (If nobody was watching the GPU, nobody would notice if the
> GPU wasn't responding -- eventually somebody would care and so kick
> hangcheck into action.) However, this falls apart from around commit
> 4680816be336 ("drm/i915: Wait first for submission, before waiting for
> request completion"), as not all waiters declare themselves to hangcheck
> and so we could switch off hangcheck and miss GPU hangs even when
> waiting under the struct_mutex.
>
> If we enable hangcheck from the first request submission, and let it run
> until the GPU is idle again, we forgo all the complexity involved with
> only enabling around waiters. Instead we have to be careful that we do
> not declare a GPU hang when idly waiting for the next request to become
> ready.

For the complexity part I agree that this is simple and elegant. But
I think I have not understood it fully, as I don't see the connection to
the part where we need to be careful when idly waiting for the next request.
Could you elaborate and point me to the relevant portion of the patch?

-Mika

>
> Fixes: 4680816be336 ("drm/i915: Wait first for submission, before waiting for 
> request completion")
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c          |  7 +++----
>  drivers/gpu/drm/i915/i915_gem_request.c  |  2 ++
>  drivers/gpu/drm/i915/intel_breadcrumbs.c | 11 -----------
>  drivers/gpu/drm/i915/intel_hangcheck.c   |  7 +------
>  4 files changed, 6 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 062b21408698..63308ec016a3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3322,16 +3322,15 @@ i915_gem_retire_work_handler(struct work_struct *work)
>               mutex_unlock(&dev->struct_mutex);
>       }
>  
> -     /* Keep the retire handler running until we are finally idle.
> +     /*
> +      * Keep the retire handler running until we are finally idle.
>        * We do not need to do this test under locking as in the worst-case
>        * we queue the retire worker once too often.
>        */
> -     if (READ_ONCE(dev_priv->gt.awake)) {
> -             i915_queue_hangcheck(dev_priv);
> +     if (READ_ONCE(dev_priv->gt.awake))
>               queue_delayed_work(dev_priv->wq,
>                                  &dev_priv->gt.retire_work,
>                                  round_jiffies_up_relative(HZ));
> -     }
>  }
>  
>  static void shrink_caches(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
> b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9b02310307fc..6cacd78cc849 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -393,6 +393,8 @@ static void mark_busy(struct drm_i915_private *i915)
>  
>       intel_engines_unpark(i915);
>  
> +     i915_queue_hangcheck(i915);
> +
>       queue_delayed_work(i915->wq,
>                          &i915->gt.retire_work,
>                          round_jiffies_up_relative(HZ));
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c 
> b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 86acac010bb8..62b2a20bc24e 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -149,17 +149,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list 
> *t)
>               return;
>  
>       mod_timer(&b->fake_irq, jiffies + 1);
> -
> -     /* Ensure that even if the GPU hangs, we get woken up.
> -      *
> -      * However, note that if no one is waiting, we never notice
> -      * a gpu hang. Eventually, we will have to wait for a resource
> -      * held by the GPU and so trigger a hangcheck. In the most
> -      * pathological case, this will be upon memory starvation! To
> -      * prevent this, we also queue the hangcheck from the retire
> -      * worker.
> -      */
> -     i915_queue_hangcheck(engine->i915);
>  }
>  
>  static void irq_enable(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c 
> b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 31f01d64c021..348a4f7ffb67 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -411,7 +411,6 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>       struct intel_engine_cs *engine;
>       enum intel_engine_id id;
>       unsigned int hung = 0, stuck = 0;
> -     int busy_count = 0;
>  
>       if (!i915_modparams.enable_hangcheck)
>               return;
> @@ -429,7 +428,6 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>       intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
>  
>       for_each_engine(engine, dev_priv, id) {
> -             const bool busy = intel_engine_has_waiter(engine);
>               struct intel_engine_hangcheck hc;
>  
>               semaphore_clear_deadlocks(dev_priv);
> @@ -443,16 +441,13 @@ static void i915_hangcheck_elapsed(struct work_struct 
> *work)
>                       if (hc.action != ENGINE_DEAD)
>                               stuck |= intel_engine_flag(engine);
>               }
> -
> -             busy_count += busy;
>       }
>  
>       if (hung)
>               hangcheck_declare_hang(dev_priv, hung, stuck);
>  
>       /* Reset timer in case GPU hangs without another request being added */
> -     if (busy_count)
> -             i915_queue_hangcheck(dev_priv);
> +     i915_queue_hangcheck(dev_priv);
>  }
>  
>  void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to