On Mon, Dec 16, 2013 at 04:03:52PM +0000, Lister, Ian wrote:
> From f71a7de85e9d81be3aa3962c8fe2557235ff21c1 Mon Sep 17 00:00:00 2001
> Message-Id: 
> <f71a7de85e9d81be3aa3962c8fe2557235ff21c1.1387201899.git.ian.lis...@intel.com>
> In-Reply-To: <cover.1387201899.git.ian.lis...@intel.com>
> References: <cover.1387201899.git.ian.lis...@intel.com>
> From: ian-lister <ian.lis...@intel.com>
> Date: Wed, 11 Dec 2013 11:25:44 +0000
> Subject: [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands
> 
> A start timer command is inserted before the batch buffer start command
> and a stop timer command is inserted afterwards. If the batch executes
> normally then the timer will be cancelled before it can fire, however if
> the batch buffer hangs then the watchdog timer will fire and
> i915_handle_error is called to complete the recovery work.
> 
> This patch requires the per-engine TDR patches.
> 
> Signed-off-by: ian-lister <ian.lis...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        | 16 +++++++++++++---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 22 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_irq.c            |  1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |  1 +
>  4 files changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index f00b19c..6867b096 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -906,15 +906,23 @@ i915_ring_hangcheck_read(struct file *filp,
>        * have hung and been reset since boot */
>       struct drm_device *dev = filp->private_data;
>       drm_i915_private_t *dev_priv = dev->dev_private;
> -     char buf[100];
> +     char buf[200];
>       int len;
>  
>       len = scnprintf(buf, sizeof(buf),
> -             "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n",
> +             "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X,"
> +             "RCS_T=0x%08x,VCS_T=0x%08x,BCS_T=0x%08x,"
> +             "RCS_W=0x%08x,VCS_W=0x%08x,BCS_W=0x%08x\n",
>               dev_priv->gpu_error.global_resets,
>               dev_priv->ring[RCS].hangcheck.total,
>               dev_priv->ring[VCS].hangcheck.total,
> -             dev_priv->ring[BCS].hangcheck.total);
> +             dev_priv->ring[BCS].hangcheck.total,
> +             dev_priv->ring[RCS].hangcheck.tdr_count,
> +             dev_priv->ring[VCS].hangcheck.tdr_count,
> +             dev_priv->ring[BCS].hangcheck.tdr_count,
> +             dev_priv->ring[RCS].hangcheck.watchdog_count,
> +             dev_priv->ring[VCS].hangcheck.watchdog_count,
> +             dev_priv->ring[BCS].hangcheck.watchdog_count);
>  
>       return simple_read_from_buffer(ubuf, max, ppos, buf, len);
>  }
> @@ -935,6 +943,8 @@ i915_ring_hangcheck_write(struct file *filp,
>       for (i = 0; i < I915_NUM_RINGS; i++) {
>               /* Reset the hangcheck counters */
>               dev_priv->ring[i].hangcheck.total = 0;
> +             dev_priv->ring[i].hangcheck.tdr_count = 0;
> +             dev_priv->ring[i].hangcheck.watchdog_count = 0;
>       }


Looks like unrelated debug code, needs to be extracted and shuffled into
the right patch (or separate debug patch). There's more like this below.
>  
>       dev_priv->gpu_error.global_resets = 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index c9d330a..722b41e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -32,6 +32,7 @@
>  #include "i915_trace.h"
>  #include "intel_drv.h"
>  #include <linux/dma_remapping.h>
> +#include "intel_ringbuffer.h"
>  
>  struct eb_vmas {
>       struct list_head vmas;
> @@ -1196,6 +1197,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void 
> *data,
>                       goto err;
>       }
>  
> +     /* Start watchdog timer*/
> +     if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
> +             i915_enable_watchdog &&
> +             intel_ring_watchdog_supported(ring)) {

Imo just yell at userspace (with an -EINVAL) if it tries to use the
watchdog support on unsupported rings. Which means we need a driver flag
for this.

Also I don't see the value of of a module option for this (I presume
i915_enable_watchdog is this) since this is purely opt-in from userspace.
> +
> +             ret = intel_ring_start_watchdog(ring);
> +
> +             if (ret)
> +                     goto err;
> +     }
> +
>       exec_start = i915_gem_obj_offset(batch_obj, vm) +
>               args->batch_start_offset;
>       exec_len = args->batch_len;
> @@ -1220,6 +1232,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void 
> *data,
>                       goto err;
>       }
>  
> +     /* Cancel watchdog timer */
> +     if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
> +             i915_enable_watchdog &&
> +             intel_ring_watchdog_supported(ring)) {
> +
> +             ret = intel_ring_stop_watchdog(ring);
> +             if (ret)
> +                     goto err;
> +     }
> +
>       trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
>  
>       i915_gem_execbuffer_move_to_active(&eb->vmas, ring);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c26c3db..bc7d68b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2974,6 +2974,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>               busy_count += i915_hangcheck_ring_sample(ring);
>  
>               if (ring->hangcheck.score > FIRE) {
> +                     ring->hangcheck.tdr_count++;
>                       DRM_ERROR("Hang detected on %s\n",
>                                ring->name);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 473cb94..4a42638 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -72,6 +72,7 @@ struct intel_ring_hangcheck {
>       u32 total; /* Total resets applied to this ring/engine*/
>       u32 last_head; /* Head value recorded at last hang */
>       u32 status_updated;
> +     u32 tdr_count;      /* Total TDR resets for this ring */
>       u32 watchdog_threshold;
>       u32 watchdog_count; /* Total watchdog resets for this ring */
>  };
> -- 
> 1.8.5.1
> 
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to