On Tue, Oct 04, 2011 at 02:11:52PM -0700, Ben Widawsky wrote: > On Gen6+ we have other rings which may be in use. We haven't hung if the > blit or media ring is still going > > Before rebase: > Reviewed-by: Daniel Vetter <daniel.vet...@ffwll.ch> > Signed-off-by: Ben Widawsky <b...@bwidawsk.net>
I've spotted another goof-up besides the instdone_stuck && acthd_stuck I've overlooked last time around. > --- > drivers/gpu/drm/i915/i915_drv.h | 5 +- > drivers/gpu/drm/i915/i915_irq.c | 143 > +++++++++++++++++++++++++++------------ > 2 files changed, 102 insertions(+), 46 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 567275c..edfa8be 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -323,9 +323,8 @@ typedef struct drm_i915_private { > #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */ > struct timer_list hangcheck_timer; > int hangcheck_count; > - uint32_t last_acthd; > - uint32_t last_instdone; > - uint32_t last_instdone1; > + uint32_t last_acthd[I915_NUM_RINGS]; > + uint64_t last_instdone[I915_NUM_RINGS]; > > unsigned long cfb_size; > unsigned int cfb_fb; > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index 97e338b..6b6abe1 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -948,7 +948,7 @@ static void i915_capture_error_state(struct drm_device > *dev) > error->instdone[RCS] = I915_READ(INSTDONE_I965); > error->instps[RCS] = I915_READ(INSTPS); > error->instdone1 = I915_READ(INSTDONE1); > - error->acthd = I915_READ(ACTHD_I965); > + error->acthd[RCS] = I915_READ(ACTHD_I965); > error->bbaddr = I915_READ64(BB_ADDR); > } else { > error->ipeir[RCS] = I915_READ(IPEIR); > @@ -1666,6 +1666,85 @@ static bool kick_ring(struct intel_ring_buffer *ring) > return false; > } > > +static bool > +instdone_stuck(struct drm_device *dev) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + uint64_t instdone = 0, instdone1 = 0, vcs_instdone = 0, bcs_instdone = > 0; > + bool stuck; > + > + switch (INTEL_INFO(dev)->gen) { > + case 7: > + case 6: > + bcs_instdone = I915_READ(BCS_INSTDONE); > + case 5: > + vcs_instdone = I915_READ(VCS_INSTDONE); > + case 4: > + instdone = I915_READ(INSTDONE_I965); > 
+ instdone1 = I915_READ(INSTDONE1); > + break; > + case 3: > + case 2: > + instdone = I915_READ(INSTDONE); > + break; > + } > + > + stuck = > + (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) && > + (dev_priv->last_instdone[VCS] == vcs_instdone) && > + (dev_priv->last_instdone[BCS] == bcs_instdone); > + > + dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1; > + dev_priv->last_instdone[VCS] = vcs_instdone; > + dev_priv->last_instdone[BCS] = bcs_instdone; > + > + return stuck; > +} > + > +static bool > +acthd_stuck(struct drm_device *dev) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0; > + bool stuck = false; > + > + switch (INTEL_INFO(dev)->gen) { > + case 7: > + case 6: > + bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]); > + case 5: > + vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]); > + case 4: > + case 3: > + case 2: > + acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]); > + break; > + } > + > + stuck = > + (dev_priv->last_acthd[RCS] == acthd) && > + (dev_priv->last_acthd[VCS] == vcs_acthd) && > + (dev_priv->last_acthd[BCS] == bcs_acthd); > + > + dev_priv->last_acthd[RCS] = acthd; > + dev_priv->last_acthd[VCS] = vcs_acthd; > + dev_priv->last_acthd[BCS] = bcs_acthd; > + > + return stuck; > +} > + > +static bool gpu_stuck(struct drm_device *dev) > +{ > + #define NUM_HANGCHECKS_TO_RESET 1 > + > + struct drm_i915_private *dev_priv = dev->dev_private; > + > + if (dev_priv->hangcheck_count++ < NUM_HANGCHECKS_TO_RESET) > + return false; > + > + return acthd_stuck(dev) && instdone_stuck(dev); First check whether the GPU is stuck, then increment the hangcheck_count. If the GPU is not stuck, we also need to clear the hangcheck_count again. > +} > + > /** > * This is called when the chip hasn't reported back with completed > * batchbuffers in a long time. 
The first time this is called we simply > record > @@ -1676,13 +1755,11 @@ void i915_hangcheck_elapsed(unsigned long data) > { > struct drm_device *dev = (struct drm_device *)data; > drm_i915_private_t *dev_priv = dev->dev_private; > - uint32_t acthd, instdone, instdone1; > bool err = false; > > if (!i915_enable_hangcheck) > return; > > - /* If all work is done then ACTHD clearly hasn't advanced. */ > if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) && > i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) && > i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) { > @@ -1692,50 +1769,30 @@ void i915_hangcheck_elapsed(unsigned long data) > return; > } > > - if (INTEL_INFO(dev)->gen < 4) { > - acthd = I915_READ(ACTHD); > - instdone = I915_READ(INSTDONE); > - instdone1 = 0; > - } else { > - acthd = I915_READ(ACTHD_I965); > - instdone = I915_READ(INSTDONE_I965); > - instdone1 = I915_READ(INSTDONE1); > - } > - > - if (dev_priv->last_acthd == acthd && > - dev_priv->last_instdone == instdone && > - dev_priv->last_instdone1 == instdone1) { > - if (dev_priv->hangcheck_count++ > 1) { > - DRM_ERROR("Hangcheck timer elapsed... GPU hung\n"); > + if (gpu_stuck(dev)) { > + DRM_ERROR("Hangcheck timer elapsed... GPU hung\n"); > > - if (!IS_GEN2(dev)) { > - /* Is the chip hanging on a WAIT_FOR_EVENT? > - * If so we can simply poke the RB_WAIT bit > - * and break the hang. This should work on > - * all but the second generation chipsets. > - */ > - > - if (kick_ring(&dev_priv->ring[RCS])) > - goto repeat; > + if (!IS_GEN2(dev)) { > + /* Is the chip hanging on a WAIT_FOR_EVENT? > + * If so we can simply poke the RB_WAIT bit > + * and break the hang. This should work on > + * all but the second generation chipsets. 
> + */ > > - if (HAS_BSD(dev) && > - kick_ring(&dev_priv->ring[VCS])) > - goto repeat; > + if (kick_ring(&dev_priv->ring[RCS])) > + goto repeat; > > - if (HAS_BLT(dev) && > - kick_ring(&dev_priv->ring[BCS])) > - goto repeat; > - } > + if (HAS_BSD(dev) && > + kick_ring(&dev_priv->ring[VCS])) > + goto repeat; > > - i915_handle_error(dev, true); > - return; > + if (HAS_BLT(dev) && > + kick_ring(&dev_priv->ring[BCS])) > + goto repeat; > } > - } else { > - dev_priv->hangcheck_count = 0; > > - dev_priv->last_acthd = acthd; > - dev_priv->last_instdone = instdone; > - dev_priv->last_instdone1 = instdone1; > + i915_handle_error(dev, true); > + return; > } > > repeat: > -- > 1.7.6.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Daniel Vetter Mail: dan...@ffwll.ch Mobile: +41 (0)79 365 57 48 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx