On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <ch...@chris-wilson.co.uk>
> 
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
> 
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
> 
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
> 
> v2: Use i915_gem_find_active_request (Mika)
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuopp...@intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widaw...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h       |    3 ++
>  drivers/gpu/drm/i915/i915_gem.c       |   13 +++++--
>  drivers/gpu/drm/i915/i915_gpu_error.c |   66 ++++-----------------------------
>  3 files changed, 20 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
>       }
>  }
>  
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
>  bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
>       kfree(request);
>  }
>  
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
>  {
>       struct drm_i915_gem_request *request;
> -     const u32 completed_seqno = ring->get_seqno(ring, false);
> +     u32 completed_seqno;
> +
> +     if (WARN_ON(!ring->get_seqno))
> +             return NULL;

ring->get_seqno(ring, false) ?

This looks like it's currently wrong below as well.

> +
> +     completed_seqno = ring->get_seqno(ring, false);
>  
>       list_for_each_entry(request, &ring->request_list, list) {
>               if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
>       struct drm_i915_gem_request *request;
>       bool ring_hung;
>  
> -     request = i915_gem_find_first_non_complete(ring);
> +     request = i915_gem_find_active_request(ring);
>  
>       if (request == NULL)
>               return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
>       }
>  }
>  
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> -                      struct intel_ring_buffer *ring)
> -{
> -     struct drm_device *dev = vm->dev;
> -     struct drm_i915_private *dev_priv = dev->dev_private;
> -     struct i915_hw_ppgtt *ppgtt;
> -
> -     if (INTEL_INFO(dev)->gen < 7)
> -             return i915_is_ggtt(vm);
> -
> -     /* FIXME: This ignores that the global gtt vm is also on this list. */
> -     ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> -     if (INTEL_INFO(dev)->gen >= 8) {
> -             u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> -             pdp0 |=  I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> -             return pdp0 == ppgtt->pd_dma_addr[0];
> -     } else {
> -             u32 pp_db;
> -             pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> -             return (pp_db >> 10) == ppgtt->pd_offset;
> -     }
> -}
> -
>  static struct drm_i915_error_object *
>  i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>                            struct intel_ring_buffer *ring)
>  {
> -     struct i915_address_space *vm;
> -     struct i915_vma *vma;
> -     struct drm_i915_gem_object *obj;
> -     bool found_active = false;
> -     u32 seqno;
> -
> -     if (!ring->get_seqno)
> -             return NULL;
> +     struct drm_i915_gem_request *request;
>  
>       if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> +             struct drm_i915_gem_object *obj;
>               u32 acthd = I915_READ(ACTHD);
>  
>               if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
>                       return i915_error_ggtt_object_create(dev_priv, obj);
>       }
>  
> -     seqno = ring->get_seqno(ring, false);
> -     list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> -             if (!is_active_vm(vm, ring))
> -                     continue;
> -
> -             found_active = true;
> -
> -             list_for_each_entry(vma, &vm->active_list, mm_list) {
> -                     obj = vma->obj;
> -                     if (obj->ring != ring)
> -                             continue;
> -
> -                     if (i915_seqno_passed(seqno, obj->last_read_seqno))
> -                             continue;
> -
> -                     if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> -                             continue;
> -
> -                     /* We need to copy these to an anonymous buffer as the simplest
> -                      * method to avoid being overwritten by userspace.
> -                      */
> -                     return i915_error_object_create(dev_priv, obj, vm);
> -             }
> +     request = i915_gem_find_active_request(ring);
> +     if (request) {
> +             /* We need to copy these to an anonymous buffer as the simplest
> +              * method to avoid being overwritten by userspace.
> +              */
> +             return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);

Wrap this one. It's too long even for my undiscerning eye.

>       }
>  
> -     WARN_ON(!found_active);
>       return NULL;
>  }
>  


I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.

However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.

With the above fixed:
Reviewed-by: Ben Widawsky <b...@bwidawsk.net>

-- 
Ben Widawsky, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to