If a context has recently submitted a faulty batchbuffer that was found
guilty of a GPU hang and keeps submitting more crap, ban it permanently.

v2: Store the guilty ban status as a bool in gpu_error instead of pointers
    that might become dangling before the hang is declared.

v3: Use return value for banned status instead of stashing state
    into gpu_error (Chris Wilson)

Signed-off-by: Mika Kuoppala <mika.kuopp...@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |    6 +++--
 drivers/gpu/drm/i915/i915_drv.h            |    8 ++++++-
 drivers/gpu/drm/i915/i915_gem.c            |   34 ++++++++++++++++++++--------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   13 +++++++++++
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c3e4f29..70b64fd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -867,6 +867,7 @@ int i915_reset(struct drm_device *dev)
 {
        drm_i915_private_t *dev_priv = dev->dev_private;
        bool simulated;
+       bool ctx_banned;
        int ret;
 
        if (!i915_try_reset)
@@ -874,11 +875,12 @@ int i915_reset(struct drm_device *dev)
 
        mutex_lock(&dev->struct_mutex);
 
-       i915_gem_reset(dev);
+       ctx_banned = i915_gem_reset(dev);
 
        simulated = dev_priv->gpu_error.stop_rings != 0;
 
-       if (!simulated && get_seconds() - dev_priv->gpu_error.last_reset < 5) {
+       if (!(simulated || ctx_banned) &&
+           get_seconds() - dev_priv->gpu_error.last_reset < 5) {
                DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
                ret = -ENODEV;
        } else {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8bc399c..364afff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -504,6 +504,12 @@ struct i915_ctx_hang_stats {
 
        /* This context had batch active when hang was declared */
        unsigned batch_active;
+
+       /* Time when this context was last blamed for a GPU reset */
+       unsigned long batch_active_reset_ts;
+
+       /* This context is banned from submitting more work */
+       bool banned;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -1738,7 +1744,7 @@ static inline bool i915_terminally_wedged(struct 
i915_gpu_error *error)
        return atomic_read(&error->reset_counter) == I915_WEDGED;
 }
 
-void i915_gem_reset(struct drm_device *dev);
+bool i915_gem_reset(struct drm_device *dev);
 void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_object_set_domain(struct drm_i915_gem_object *obj,
                                            uint32_t read_domains,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6144f0b..3ecf1fe 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2143,15 +2143,15 @@ static bool i915_request_guilty(struct 
drm_i915_gem_request *request,
        return false;
 }
 
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
+static bool i915_set_reset_status(struct intel_ring_buffer *ring,
                                  struct drm_i915_gem_request *request,
                                  u32 acthd)
 {
        struct i915_ctx_hang_stats *hs = NULL;
-       bool inside, guilty;
+       bool inside, guilty, banned;
 
        /* Innocent until proven guilty */
-       guilty = false;
+       guilty = banned = false;
 
        if (ring->hangcheck.last_action != wait &&
            i915_request_guilty(request, acthd, &inside)) {
@@ -2175,11 +2175,20 @@ static void i915_set_reset_status(struct 
intel_ring_buffer *ring,
                hs = &request->file_priv->hang_stats;
 
        if (hs) {
-               if (guilty)
+               if (guilty) {
+                       if (!hs->banned &&
+                           get_seconds() - hs->batch_active_reset_ts < 15) {
+                               hs->banned = banned = true;
+                               DRM_ERROR("context hanging too fast, declaring banned\n");
+                       }
                        hs->batch_active++;
-               else
+                       hs->batch_active_reset_ts = get_seconds();
+               } else {
                        hs->batch_pending++;
+               }
        }
+
+       return banned;
 }
 
 static void i915_gem_free_request(struct drm_i915_gem_request *request)
@@ -2193,11 +2202,12 @@ static void i915_gem_free_request(struct 
drm_i915_gem_request *request)
        kfree(request);
 }
 
-static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
+static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
                                      struct intel_ring_buffer *ring)
 {
        u32 completed_seqno;
        u32 acthd;
+       bool ctx_banned = false;
 
        acthd = intel_ring_get_active_head(ring);
        completed_seqno = ring->get_seqno(ring, false);
@@ -2210,7 +2220,8 @@ static void i915_gem_reset_ring_lists(struct 
drm_i915_private *dev_priv,
                                           list);
 
                if (request->seqno > completed_seqno)
-                       i915_set_reset_status(ring, request, acthd);
+                       ctx_banned |= i915_set_reset_status(ring,
+                                                           request, acthd);
 
                i915_gem_free_request(request);
        }
@@ -2224,6 +2235,8 @@ static void i915_gem_reset_ring_lists(struct 
drm_i915_private *dev_priv,
 
                i915_gem_object_move_to_inactive(obj);
        }
+
+       return ctx_banned;
 }
 
 static void i915_gem_reset_fences(struct drm_device *dev)
@@ -2247,15 +2260,16 @@ static void i915_gem_reset_fences(struct drm_device 
*dev)
        INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 }
 
-void i915_gem_reset(struct drm_device *dev)
+bool i915_gem_reset(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct drm_i915_gem_object *obj;
        struct intel_ring_buffer *ring;
        int i;
+       bool ctx_banned = false;
 
        for_each_ring(ring, dev_priv, i)
-               i915_gem_reset_ring_lists(dev_priv, ring);
+               ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring);
 
        /* Move everything out of the GPU domains to ensure we do any
         * necessary invalidation upon reuse.
@@ -2269,6 +2283,8 @@ void i915_gem_reset(struct drm_device *dev)
 
        /* The fence registers are invalidated so clear them out */
        i915_gem_reset_fences(dev);
+
+       return ctx_banned;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 87a3227..7fcd6c0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -842,6 +842,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
        struct drm_i915_gem_object *batch_obj;
        struct drm_clip_rect *cliprects = NULL;
        struct intel_ring_buffer *ring;
+       struct i915_ctx_hang_stats *hs;
        u32 ctx_id = i915_execbuffer2_get_context_id(*args);
        u32 exec_start, exec_len;
        u32 mask, flags;
@@ -1033,6 +1034,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void 
*data,
        if (ret)
                goto err;
 
+       hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
+                                            file, ctx_id);
+       if (IS_ERR(hs)) {
+               ret = PTR_ERR(hs);
+               goto err;
+       }
+
+       if (hs->banned) {
+               ret = -EIO;
+               goto err;
+       }
+
        ret = i915_switch_context(ring, file, ctx_id);
        if (ret)
                goto err;
-- 
1.7.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to