-----Original Message-----
From: Andi Shyti <andi.sh...@linux.intel.com> 
Sent: Wednesday, August 7, 2024 2:10 AM
To: intel-gfx <intel-...@lists.freedesktop.org>; dri-devel 
<dri-devel@lists.freedesktop.org>
Cc: Chris Wilson <chris.p.wil...@linux.intel.com>; Das, Nirmoy 
<nirmoy....@intel.com>; Cavitt, Jonathan <jonathan.cav...@intel.com>; Andi 
Shyti <andi.sh...@linux.intel.com>
Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable
> 
> From: Chris Wilson <chris.p.wil...@intel.com>
> 
> After we detect that mmio is returning all 0xff, we believe that the GPU
> has dropped off the pci bus and is dead. Mark the device as wedged such
> that we can propagate the failure back to userspace and wait for
> recovery.
> 
> Signed-off-by: Chris Wilson <chris.p.wil...@intel.com>
> Signed-off-by: Andi Shyti <andi.sh...@linux.intel.com>

LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cav...@intel.com>
-Jonathan Cavitt

> ---
>  drivers/gpu/drm/i915/gt/intel_gt.h       |  6 ++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h |  2 ++
>  drivers/gpu/drm/i915/gt/intel_reset.c    | 12 +++++++++++-
>  drivers/gpu/drm/i915/intel_uncore.c      |  7 +++++--
>  4 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index b5e114d284ad..b73555889d50 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
>  void intel_gt_bind_context_set_ready(struct intel_gt *gt);
>  void intel_gt_bind_context_set_unready(struct intel_gt *gt);
>  bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
> +
> +static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
> +{
> +     queue_work(system_highpri_wq, &gt->wedge);
> +}
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index cfdd2ad5e954..bcee084b1f27 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -292,6 +292,8 @@ struct intel_gt {
>       struct gt_defaults defaults;
>       struct kobject *sysfs_defaults;
>  
> +     struct work_struct wedge;
> +
>       struct i915_perf_gt perf;
>  
>       /** link: &ggtt.gt_list */
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 735cd23a43c6..8f1ea95471ef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
>       GT_TRACE(gt, "end\n");
>  }
>  
> +static void set_wedged_work(struct work_struct *w)
> +{
> +     struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
> +     intel_wakeref_t wf;
> +
> +     with_intel_runtime_pm(gt->uncore->rpm, wf)
> +             __intel_gt_set_wedged(gt);
> +}
> +
>  void intel_gt_set_wedged(struct intel_gt *gt)
>  {
>       intel_wakeref_t wakeref;
> @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt)
>       init_waitqueue_head(&gt->reset.queue);
>       mutex_init(&gt->reset.mutex);
>       init_srcu_struct(&gt->reset.backoff_srcu);
> +     INIT_WORK(&gt->wedge, set_wedged_work);
>  
>       /*
>        * While undesirable to wait inside the shrinker, complain anyway.
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work)
>       struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
>  
>       gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
> -     intel_gt_set_wedged(w->gt);
> +     set_wedged_work(&w->gt->wedge);
>  }
>  
>  void __intel_init_wedge(struct intel_wedge_me *w,
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 2eba289d88ad..6aa179a3e92a 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -24,6 +24,7 @@
>  #include <drm/drm_managed.h>
>  #include <linux/pm_runtime.h>
>  
> +#include "gt/intel_gt.h"
>  #include "gt/intel_engine_regs.h"
>  #include "gt/intel_gt_regs.h"
>  
> @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
>       if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
>               return;
>  
> -     if (fw_ack(d) == ~0)
> +     if (fw_ack(d) == ~0) {
>               drm_err(&d->uncore->i915->drm,
>                       "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
>                       intel_uncore_forcewake_domain_to_str(d->id));
> -     else
> +             intel_gt_set_wedged_async(d->uncore->gt);
> +     } else {
>               drm_err(&d->uncore->i915->drm,
>                       "%s: timed out waiting for forcewake ack to clear.\n",
>                       intel_uncore_forcewake_domain_to_str(d->id));
> +     }
>  
>       add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
>  }
> -- 
> 2.45.2
> 
> 

Reply via email to