i915: Record error state capture reason

Ben Widawsky Tue, 11 Feb 2014 18:32:56 -0800

On Mon, Feb 10, 2014 at 04:30:51PM +0200, Mika Kuoppala wrote:
> We capture error state not only when the GPU hangs but
> also on other situations as in interrupt errors and
> in situations where we can kick things forward without GPU reset.
> There will be log entry on most of these cases. But as error state
> capture might be only thing we have, if dmesg was not captured. Or as
> in GEN4 case, interrupt error can trigger error state capture without log
> entry, the exact reason why capture was made is hard to decipher.
> 
> To avoid confusion why the error state was captured, print reason
> along with error code into log and also store it into the error state.
>


I could have done with a better commit subject and message. We already
capture error state, so "Record error state capture" is a bit weird.
Also, I think the commit message glosses over the main point of why
these cases are special as opposed to hangcheck.

> References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
> Signed-off-by: Mika Kuoppala <mika.kuopp...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |    5 +-
>  drivers/gpu/drm/i915/i915_drv.h       |    9 +++-
>  drivers/gpu/drm/i915/i915_gpu_error.c |   84 
> +++++++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_irq.c       |   45 +++++++++++-------
>  4 files changed, 98 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2dc05c3..175e524 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
>  {
>       struct drm_device *dev = data;
>  
> -     DRM_INFO("Manually setting wedged to %llu\n", val);
> -     i915_handle_error(dev, val);
> -
> +     i915_handle_error(dev, val,
> +                       "Manually setting wedged to %llu", val);
>       return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2a61a29..ad41c71 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -299,6 +299,8 @@ struct drm_i915_error_state {
>       struct kref ref;
>       struct timeval time;
>  
> +     char error_msg[128];
> +
>       /* Generic register state */
>       u32 eir;
>       u32 pgtbl_er;
> @@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct 
> *work);
>  
>  /* i915_irq.c */
>  void i915_queue_hangcheck(struct drm_device *dev);
> -void i915_handle_error(struct drm_device *dev, bool wedged);
> +__printf(3, 4)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +                    const char *fmt, ...);
>  
>  void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
>                                                       int new_delay);
> @@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
>  {
>       kfree(eb->buf);
>  }
> -void i915_capture_error_state(struct drm_device *dev);
> +void i915_capture_error_state(struct drm_device *dev, bool wedge,
> +                           const char *error_msg);
>  void i915_error_state_get(struct drm_device *dev,
>                         struct i915_error_state_file_priv *error_priv);
>  void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a90971a..dce569b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -329,6 +329,7 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>               goto out;
>       }
>  
> +     err_printf(m, "%s\n", error->error_msg);
>       err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
>                  error->time.tv_usec);
>       err_printf(m, "Kernel: " UTS_RELEASE "\n");
> @@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer 
> *err,
>   * It's only a small step better than a random number in its current form.
>   */
>  static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
> -                                      struct drm_i915_error_state *error)
> +                                      struct drm_i915_error_state *error,
> +                                      int *ring_id)
>  {
>       uint32_t error_code = 0;
>       int i;
> @@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct 
> drm_i915_private *dev_priv,
>        * synchronization commands which almost always appear in the case
>        * strictly a client bug. Use instdone to differentiate those some.
>        */
> -     for (i = 0; i < I915_NUM_RINGS; i++)
> -             if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
> +     for (i = 0; i < I915_NUM_RINGS; i++) {
> +             if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
> +                     if (ring_id)
> +                             *ring_id = i;
> +
>                       return error->ring[i].ipehr ^ error->ring[i].instdone;
> +             }
> +     }
>  
>       return error_code;
>  }
> @@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct 
> drm_i915_private *dev_priv,
>       i915_get_extra_instdone(dev, error->extra_instdone);
>  }
>  
> +static void i915_error_generate_msg(struct drm_device *dev,
> +                                 struct drm_i915_error_state *error,
> +                                 bool wedge,
> +                                 const char *error_msg)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     u32 ecode;
> +     int ring_id = -1, len;
> +
> +     ecode = i915_error_generate_code(dev_priv, error, &ring_id);
> +
> +     len = scnprintf(error->error_msg, sizeof(error->error_msg),
> +                     "GPU HANG: ecode %d:0x%08x", ring_id, ecode);
> +
> +     if (ring_id != -1 && error->ring[ring_id].pid != -1)
> +                     len += scnprintf(error->error_msg + len,
> +                                      sizeof(error->error_msg) - len,
> +                                      ", in %s [%d]",
> +                                      error->ring[ring_id].comm,
> +                                      error->ring[ring_id].pid);
> +
> +     if (error_msg)

I'd drop this and just always have an error_msg. But whatever you want
is fine.

> +             len += scnprintf(error->error_msg + len,
> +                              sizeof(error->error_msg) - len,
> +                              ", reason: %s",
> +                              error_msg);
> +
> +     if (wedge)
> +             len += scnprintf(error->error_msg + len,
> +                              sizeof(error->error_msg) - len,
> +                              ", resetting GPU");

I also don't think the fact that we're resetting the GPU belongs here
(for instance, if it's disabled via modparam). On the other hand, I'd be
in favor of a taint on GPU reset for all future error states.

> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct 
> drm_i915_private *dev_priv,
>   * out a structure which becomes available in debugfs for user level tools
>   * to pick up.
>   */
> -void i915_capture_error_state(struct drm_device *dev)
> +void i915_capture_error_state(struct drm_device *dev, bool wedged,
> +                           const char *error_msg)
>  {
>       static bool warned;
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       struct drm_i915_error_state *error;
>       unsigned long flags;
> -     uint32_t ecode;
> -
> -     spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> -     error = dev_priv->gpu_error.first_error;
> -     spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
> -     if (error)
> -             return;

This seems like it should be a separate patch, or else I'm not clear why
you've introduced this change in this series. It doesn't seem to be
related to what the rest of the patch does.

>  
>       /* Account for pipe specific data like PIPE*STAT */
>       error = kzalloc(sizeof(*error), GFP_ATOMIC);
> @@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
>               return;
>       }
>  
> -     DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
> -              dev->primary->index);
>       kref_init(&error->ref);
>  
>       i915_capture_reg_state(dev_priv, error);
>       i915_gem_capture_buffers(dev_priv, error);
>       i915_gem_record_fences(dev, error);
>       i915_gem_record_rings(dev, error);
> -     ecode = i915_error_generate_code(dev_priv, error);
> -
> -     if (!warned) {
> -             DRM_INFO("GPU HANG [%x]\n", ecode);
> -             DRM_INFO("GPU hangs can indicate a bug anywhere in the entire 
> gfx stack, including userspace.\n");
> -             DRM_INFO("Please file a _new_ bug report on 
> bugs.freedesktop.org against DRI -> DRM/Intel\n");
> -             DRM_INFO("drm/i915 developers can then reassign to the right 
> component if it's not a kernel issue.\n");
> -             DRM_INFO("The gpu crash dump is required to analyze gpu hangs, 
> so please always attach it.\n");
> -             warned = true;
> -     }
>  
>       do_gettimeofday(&error->time);
>  
>       error->overlay = intel_overlay_capture_error_state(dev);
>       error->display = intel_display_capture_error_state(dev);
>  
> +     i915_error_generate_msg(dev, error, wedged, error_msg);
> +     DRM_INFO("%s\n", error->error_msg);
> +
>       spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
>       if (dev_priv->gpu_error.first_error == NULL) {
>               dev_priv->gpu_error.first_error = error;
> @@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
>       }
>       spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>  
> -     if (error)
> +     if (error) {
>               i915_error_state_free(&error->ref);
> +             return;
> +     }

Same comment as above.

> +
> +     if (!warned) {
> +             DRM_INFO("GPU hangs can indicate a bug anywhere in the entire 
> gfx stack, including userspace.\n");
> +             DRM_INFO("Please file a _new_ bug report on 
> bugs.freedesktop.org against DRI -> DRM/Intel\n");
> +             DRM_INFO("drm/i915 developers can then reassign to the right 
> component if it's not a kernel issue.\n");
> +             DRM_INFO("The gpu crash dump is required to analyze gpu hangs, 
> so please always attach it.\n");
> +             DRM_INFO("GPU crash dump saved to 
> /sys/class/drm/card%d/error\n", dev->primary->index);
> +             warned = true;
> +     }
>  }
>  
>  void i915_error_state_get(struct drm_device *dev,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d4defd8..36bba16 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>       if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
>                     GT_BSD_CS_ERROR_INTERRUPT |
>                     GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
> -             DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
> -             i915_handle_error(dev, false);
> +             i915_handle_error(dev, false, "GT error interrupt 0x%08x", 
> gt_iir);
>       }
>  
>       if (gt_iir & GT_PARITY_ERROR(dev))
> @@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct 
> drm_i915_private *dev_priv, u32 pm_iir)
>                       notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
>  
>               if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
> -                     DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
> -                     i915_handle_error(dev_priv->dev, false);
> +                     i915_handle_error(dev_priv->dev, false,
> +                                       "VEBOX CS error interrupt 0x%08x",
> +                                       pm_iir);
>               }
>       }
>  }
> @@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct 
> drm_device *dev)
>   * so userspace knows something bad happened (should trigger collection
>   * of a ring dump etc.).
>   */
> -void i915_handle_error(struct drm_device *dev, bool wedged)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> +                    const char *fmt, ...)
>  {
>       struct drm_i915_private *dev_priv = dev->dev_private;
> +     va_list args;
> +     char error_msg[80];
>  
> -     i915_capture_error_state(dev);
> +     va_start(args, fmt);
> +     vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> +     va_end(args);
> +
> +     i915_capture_error_state(dev, wedged, error_msg);

As I said above regarding always having an error_msg, I'd be in favor of
moving most of this to capture error state, and just passing along the
string - but whatever you want is fine with me.

>       i915_report_and_clear_eir(dev);
>  
>       if (wedged) {
> @@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>        */
>       tmp = I915_READ_CTL(ring);
>       if (tmp & RING_WAIT) {
> -             DRM_ERROR("Kicking stuck wait on %s\n",
> -                       ring->name);
> -             i915_handle_error(dev, false);
> +             i915_handle_error(dev, false,
> +                               "Kicking stuck wait on %s",
> +                               ring->name);
>               I915_WRITE_CTL(ring, tmp);
>               return HANGCHECK_KICK;
>       }
> @@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
>               default:
>                       return HANGCHECK_HUNG;
>               case 1:
> -                     DRM_ERROR("Kicking stuck semaphore on %s\n",
> -                               ring->name);
> -                     i915_handle_error(dev, false);
> +                     i915_handle_error(dev, false,
> +                                       "Kicking stuck semaphore on %s",
> +                                       ring->name);
>                       I915_WRITE_CTL(ring, tmp);
>                       return HANGCHECK_KICK;
>               case 0:
> @@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
>       }
>  
>       if (rings_hung)
> -             return i915_handle_error(dev, true);
> +             return i915_handle_error(dev, true, "Ring hung");
>  
>       if (busy_count)
>               /* Reset timer case chip hangs without another request
> @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
>                */
>               spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>               if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -                     i915_handle_error(dev, false);
> +                     i915_handle_error(dev, false,
> +                                       "Command parser error, iir 0x%08x",
> +                                       iir);
>  
>               for_each_pipe(pipe) {
>                       int reg = PIPESTAT(pipe);
> @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
>                */
>               spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>               if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -                     i915_handle_error(dev, false);
> +                     i915_handle_error(dev, false,
> +                                       "Command parser error, iir 0x%08x",
> +                                       iir);
>  
>               for_each_pipe(pipe) {
>                       int reg = PIPESTAT(pipe);
> @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
>                */
>               spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
>               if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -                     i915_handle_error(dev, false);
> +                     i915_handle_error(dev, false,
> +                                       "Command parser error, iir 0x%08x",
> +                                       iir);
>  

Since you're not directly using any of the DRM_ printers, we lose the
file/line. Would you care to add __FILE__/__LINE? I think it would be
helpful - not sure what others thing.

>               for_each_pipe(pipe) {
>                       int reg = PIPESTAT(pipe);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH 3/3] drm/i915: Record error state capture reason

Reply via email to