Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw

Mcaulay, Alistair Fri, 15 Aug 2014 06:34:30 -0700

Hi Mika / Daniel,

below is the basic code path of a reset which has been changed by my patch:


i915_reset()
{
        ....
        i915_gem_reset() -> This used to call i915_gem_context_reset(), which 
has now been removed. 
        .....
        i915_gem_init_hw()
                .....
                i915_gem_context_enable() -> This used to return during reset. 
Now it doesn't
                .....
                        for each ring, i915_switch_context(default)
                                do_switch();
                .....

        .....
}

" I am with Daniel on this one. I don't understand how can we throw everything 
in here away."
Did you maybe mean Ben?
Daniel, I thought you were happy with the implementation, and V2 fixed your 
last concern, could you please comment.

" We need to force hw to switch to a working context, after reset, so that our 
internal state tracking matches."
I believe this patch does that using i915_switch_context, rather than the hacky 
i915_gem_context_reset()

Alistair.

> -----Original Message-----
> From: Mika Kuoppala [mailto:mika.kuopp...@linux.intel.com]
> Sent: Wednesday, August 06, 2014 5:25 PM
> To: Mcaulay, Alistair; intel-gfx@lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to
> match driver load & thaw
> 
> 
> Hi,
> 
> alistair.mcau...@intel.com writes:
> 
> > From: "McAulay, Alistair" <alistair.mcau...@intel.com>
> >
> > This patch is to address Daniels concerns over different code during reset:
> >
> > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html
> >
> > "The reason for aiming as hard as possible to use the exact same code
> > for driver load, gpu reset and runtime pm/system resume is that we've
> > simply seen too many bugs due to slight variations and unintended
> omissions."
> >
> > Tested using igt drv_hangman.
> >
> > V2: Cleaner way of preventing check_wedge returning -EAGAIN
> >
> > Signed-off-by: McAulay, Alistair <alistair.mcau...@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c         |  6 +++
> >  drivers/gpu/drm/i915/i915_drv.h         |  3 ++
> >  drivers/gpu/drm/i915/i915_gem.c         |  6 +--
> >  drivers/gpu/drm/i915/i915_gem_context.c | 42 +--------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 67 
> > +++++----------------------------
> >  drivers/gpu/drm/i915/i915_gem_gtt.h     |  3 +-
> >  6 files changed, 23 insertions(+), 104 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev)
> >                     !dev_priv->ums.mm_suspended) {
> >             dev_priv->ums.mm_suspended = 0;
> >
> > +           /* Used to prevent gem_check_wedged returning -EAGAIN
> during gpu reset */
> > +           dev_priv->gpu_error.reload_in_reset = true;
> > +
> >             ret = i915_gem_init_hw(dev);
> > +
> > +           dev_priv->gpu_error.reload_in_reset = false;
> > +
> >             mutex_unlock(&dev->struct_mutex);
> >             if (ret) {
> >                     DRM_ERROR("Failed hw init on reset %d\n", ret); diff
> --git
> > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 991b663..116daff 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1217,6 +1217,9 @@ struct i915_gpu_error {
> >
> >     /* For missed irq/seqno simulation. */
> >     unsigned int test_irq_rings;
> > +
> > +   /* Used to prevent gem_check_wedged returning -EAGAIN during
> gpu reset   */
> > +   bool reload_in_reset;
> >  };
> >
> >  enum modeset_restore {
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c
> > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error
> *error,
> >             if (i915_terminally_wedged(error))
> >                     return -EIO;
> >
> > -           return -EAGAIN;
> > +           /* Check if GPU Reset is in progress */
> > +           if (!error->reload_in_reset)
> > +                   return -EAGAIN;
> >     }
> >
> >     return 0;
> > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev)
> >     for_each_ring(ring, dev_priv, i)
> >             i915_gem_reset_ring_cleanup(dev_priv, ring);
> >
> > -   i915_gem_context_reset(dev);
> > -
> >     i915_gem_restore_fences(dev);
> >  }
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> > b/drivers/gpu/drm/i915/i915_gem_context.c
> > index de72a28..d96219f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -372,42 +372,6 @@ err_destroy:
> >     return ERR_PTR(ret);
> >  }
> >
> > -void i915_gem_context_reset(struct drm_device *dev) -{
> > -   struct drm_i915_private *dev_priv = dev->dev_private;
> > -   int i;
> > -
> > -   /* Prevent the hardware from restoring the last context (which
> hung) on
> > -    * the next switch */
> > -   for (i = 0; i < I915_NUM_RINGS; i++) {
> > -           struct intel_engine_cs *ring = &dev_priv->ring[i];
> > -           struct intel_context *dctx = ring->default_context;
> > -           struct intel_context *lctx = ring->last_context;
> > -
> > -           /* Do a fake switch to the default context */
> > -           if (lctx == dctx)
> > -                   continue;
> > -
> > -           if (!lctx)
> > -                   continue;
> > -
> > -           if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
> > -                   WARN_ON(i915_gem_obj_ggtt_pin(dctx-
> >legacy_hw_ctx.rcs_state,
> > -
> get_context_alignment(dev), 0));
> > -                   /* Fake a finish/inactive */
> > -                   dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
> > -                   dctx->legacy_hw_ctx.rcs_state->active = 0;
> > -           }
> > -
> > -           if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
> > -                   i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> > -
> > -           i915_gem_context_unreference(lctx);
> > -           i915_gem_context_reference(dctx);
> > -           ring->last_context = dctx;
> > -   }
> > -}
> > -
> 
> I am with Daniel on this one. I don't understand how can we throw
> everything in here away.
> 
> We need to force hw to switch to a working context, after reset, so that our
> internal state tracking matches.
> 
> Further, if we aim to more unification I think we should make it so that the
> initial render state will get run, also after reset.
> 
> If we cleanup the last context for each ring set default context carefully,
> i915_gem_context_enable() will then switch to default contexts and reinit
> them using the initial render state. Something like this:
> 
> void i915_gem_context_reset(struct drm_device *dev) {
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       int i;
> 
>       for (i = 0; i < I915_NUM_RINGS; i++) {
>               struct intel_engine_cs *ring = &dev_priv->ring[i];
>               struct intel_context *lctx = ring->last_context;
>               struct intel_context *dctx = ring->default_context;
> 
>               if (lctx) {
>                       if (lctx->legacy_hw_ctx.rcs_state && i == RCS)
>                               i915_gem_object_ggtt_unpin(lctx-
> >legacy_hw_ctx.rcs_state);
> 
>                       i915_gem_context_unreference(lctx);
>                       ring->last_context = NULL;
>               }
> 
>               if (dctx->legacy_hw_ctx.rcs_state && i == RCS) {
>                       dctx->legacy_hw_ctx.rcs_state->base.write_domain
> = 0;
>                       dctx->legacy_hw_ctx.rcs_state->active = 0;
>                       dctx->legacy_hw_ctx.initialized = false;
>               }
>       }
> }
> 
> The state would be closer of what we get after module reload.
> 
> -Mika
> 
> >  int i915_gem_context_init(struct drm_device *dev)  {
> >     struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10
> > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private
> *dev_priv)
> >             ppgtt->enable(ppgtt);
> >     }
> >
> > -   /* FIXME: We should make this work, even in reset */
> > -   if (i915_reset_in_progress(&dev_priv->gpu_error))
> > -           return 0;
> > -
> >     BUG_ON(!dev_priv->ring[RCS].default_context);
> >
> >     for_each_ring(ring, dev_priv, i) {
> > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring,
> >     from = ring->last_context;
> >
> >     if (USES_FULL_PPGTT(ring->dev)) {
> > -           ret = ppgtt->switch_mm(ppgtt, ring, false);
> > +           ret = ppgtt->switch_mm(ppgtt, ring);
> >             if (ret)
> >                     goto unpin_out;
> >     }
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index 5188936..450c8a9 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t
> iris_pte_encode(dma_addr_t
> > addr,
> >
> >  /* Broadwell Page Directory Pointer Descriptors */  static int
> > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry,
> > -                      uint64_t val, bool synchronous)
> > +                      uint64_t val)
> >  {
> > -   struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >     int ret;
> >
> >     BUG_ON(entry >= 4);
> >
> > -   if (synchronous) {
> > -           I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32);
> > -           I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val);
> > -           return 0;
> > -   }
> > -
> >     ret = intel_ring_begin(ring, 6);
> >     if (ret)
> >             return ret;
> > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs
> > *ring, unsigned entry,  }
> >
> >  static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -                     struct intel_engine_cs *ring,
> > -                     bool synchronous)
> > +                     struct intel_engine_cs *ring)
> >  {
> >     int i, ret;
> >
> > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,
> >
> >     for (i = used_pd - 1; i >= 0; i--) {
> >             dma_addr_t addr = ppgtt->pd_dma_addr[i];
> > -           ret = gen8_write_pdp(ring, i, addr, synchronous);
> > +           ret = gen8_write_pdp(ring, i, addr);
> >             if (ret)
> >                     return ret;
> >     }
> > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct
> > i915_hw_ppgtt *ppgtt)  }
> >
> >  static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -                    struct intel_engine_cs *ring,
> > -                    bool synchronous)
> > +                    struct intel_engine_cs *ring)
> >  {
> > -   struct drm_device *dev = ppgtt->base.dev;
> > -   struct drm_i915_private *dev_priv = dev->dev_private;
> >     int ret;
> >
> > -   /* If we're in reset, we can assume the GPU is sufficiently idle to
> > -    * manually frob these bits. Ideally we could use the ring functions,
> > -    * except our error handling makes it quite difficult (can't use
> > -    * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -    *
> > -    * FIXME: We should try not to special case reset
> > -    */
> > -   if (synchronous ||
> > -       i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -           WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -           I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -           I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -           POSTING_READ(RING_PP_DIR_BASE(ring));
> > -           return 0;
> > -   }
> > -
> >     /* NB: TLBs must be flushed and invalidated before a switch */
> >     ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >     if (ret)
> > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -                     struct intel_engine_cs *ring,
> > -                     bool synchronous)
> > +                     struct intel_engine_cs *ring)
> >  {
> > -   struct drm_device *dev = ppgtt->base.dev;
> > -   struct drm_i915_private *dev_priv = dev->dev_private;
> >     int ret;
> >
> > -   /* If we're in reset, we can assume the GPU is sufficiently idle to
> > -    * manually frob these bits. Ideally we could use the ring functions,
> > -    * except our error handling makes it quite difficult (can't use
> > -    * intel_ring_begin, ring->flush, or intel_ring_advance)
> > -    *
> > -    * FIXME: We should try not to special case reset
> > -    */
> > -   if (synchronous ||
> > -       i915_reset_in_progress(&dev_priv->gpu_error)) {
> > -           WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt);
> > -           I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> > -           I915_WRITE(RING_PP_DIR_BASE(ring),
> get_pd_offset(ppgtt));
> > -           POSTING_READ(RING_PP_DIR_BASE(ring));
> > -           return 0;
> > -   }
> > -
> >     /* NB: TLBs must be flushed and invalidated before a switch */
> >     ret = ring->flush(ring, I915_GEM_GPU_DOMAINS,
> I915_GEM_GPU_DOMAINS);
> >     if (ret)
> > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt
> > *ppgtt,  }
> >
> >  static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt,
> > -                     struct intel_engine_cs *ring,
> > -                     bool synchronous)
> > +                     struct intel_engine_cs *ring)
> >  {
> >     struct drm_device *dev = ppgtt->base.dev;
> >     struct drm_i915_private *dev_priv = dev->dev_private;
> >
> > -   if (!synchronous)
> > -           return 0;
> >
> >     I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
> >     I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ -
> 852,7
> > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt)
> >             if (USES_FULL_PPGTT(dev))
> >                     continue;
> >
> > -           ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +           ret = ppgtt->switch_mm(ppgtt, ring);
> >             if (ret)
> >                     goto err_out;
> >     }
> > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >             if (USES_FULL_PPGTT(dev))
> >                     continue;
> >
> > -           ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +           ret = ppgtt->switch_mm(ppgtt, ring);
> >             if (ret)
> >                     return ret;
> >     }
> > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt
> *ppgtt)
> >     I915_WRITE(GFX_MODE,
> _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
> >
> >     for_each_ring(ring, dev_priv, i) {
> > -           int ret = ppgtt->switch_mm(ppgtt, ring, true);
> > +           int ret = ppgtt->switch_mm(ppgtt, ring);
> >             if (ret)
> >                     return ret;
> >     }
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index 8d6f7c1..bf1e4fc 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt {
> >
> >     int (*enable)(struct i915_hw_ppgtt *ppgtt);
> >     int (*switch_mm)(struct i915_hw_ppgtt *ppgtt,
> > -                    struct intel_engine_cs *ring,
> > -                    bool synchronous);
> > +                    struct intel_engine_cs *ring);
> >     void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file
> *m);
> > };
> >
> > --
> > 2.0.0
> >
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH v2] drm/i915: Rework GPU reset sequence to match driver load & thaw

Reply via email to