i915: clear up wedged transitions

Daniel Vetter Wed, 14 Nov 2012 08:28:50 -0800

We have two important transitions of the wedged state in the current
code:

- 0 -> 1: This means a hang has been detected, and signals to everyone
  that they please get of any locks, so that the reset work item can
  do its job.


- 1 -> 0: The reset handler has completed.

Now the last transition mixes up two states: "Reset completed and
successful" and "Reset failed". To distinguish these two we do some
tricks with the reset completion, but I simply could not convince
myself that this doesn't race under odd circumstances.

Hence split this up, and add a new terminal state indicating that the
hw is gone for good.

Also add explicit #defines for both states, update comments.

v2: Split out the reset handling bugfix for the throttle ioctl.

v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase
error which prevent this patch from actually compiling.

Signed-Off-by: Daniel Vetter <daniel.vet...@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h | 17 ++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem.c | 37 +++++++++++--------------------------
 drivers/gpu/drm/i915/i915_irq.c | 23 +++++++++++++++--------
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6958bb0..423541b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -723,11 +723,26 @@ struct i915_gpu_error {
        /* Protected by the above dev->gpu_error.lock. */
        struct drm_i915_error_state *first_error;
        struct work_struct work;
-       struct completion completion;
 
        unsigned long last_reset;
 
+       /**
+        * State variable controlling the reset flow
+        *
+        * 1 means a reset is in progress. This state will (presuming we don't
+        * have any bugs) decay into either 0 (successful reset) or 2 (hw
+        * terminally sour). All waiters on the reset_queue will be woken when
+        * that happens.
+        */
        atomic_t wedged;
+#define I915_RESET_IN_PROGRESS 1
+#define I915_WEDGED            2
+
+       /**
+        * Waitqueue to signal when the reset has completed. Used by clients
+        * that wait for dev_priv->mm.wedged to settle.
+        */
+       wait_queue_head_t reset_queue;
 
        /* For gpu hang simulation. */
        unsigned int stop_rings;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 55cdad9..6112a4f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -89,36 +89,28 @@ static void i915_gem_info_remove_obj(struct 
drm_i915_private *dev_priv,
 static int
 i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
-       struct completion *x = &error->completion;
-       unsigned long flags;
        int ret;
 
        if (!atomic_read(&error->wedged))
                return 0;
 
+#define EXIT_COND (atomic_read(&error->wedged) != I915_RESET_IN_PROGRESS)
        /*
         * Only wait 10 seconds for the gpu reset to complete to avoid hanging
         * userspace. If it takes that long something really bad is going on and
         * we should simply try to bail out and fail as gracefully as possible.
         */
-       ret = wait_for_completion_interruptible_timeout(x, 10*HZ);
+       ret = wait_event_interruptible_timeout(error->reset_queue,
+                                              EXIT_COND,
+                                              10*HZ);
        if (ret == 0) {
                DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
                return -EIO;
        } else if (ret < 0) {
                return ret;
        }
+#undef EXIT_COND
 
-       if (atomic_read(&error->wedged)) {
-               /* GPU is hung, bump the completion count to account for
-                * the token we just consumed so that we never hit zero and
-                * end up waiting upon a subsequent completion event that
-                * will never happen.
-                */
-               spin_lock_irqsave(&x->wait.lock, flags);
-               x->done++;
-               spin_unlock_irqrestore(&x->wait.lock, flags);
-       }
        return 0;
 }
 
@@ -943,23 +935,16 @@ int
 i915_gem_check_wedge(struct i915_gpu_error *error,
                     bool interruptible)
 {
-       if (atomic_read(&error->wedged)) {
-               struct completion *x = &error->completion;
-               bool recovery_complete;
-               unsigned long flags;
-
-               /* Give the error handler a chance to run. */
-               spin_lock_irqsave(&x->wait.lock, flags);
-               recovery_complete = x->done > 0;
-               spin_unlock_irqrestore(&x->wait.lock, flags);
+       unsigned wedged = atomic_read(&error->wedged);
 
+       if (wedged) {
                /* Non-interruptible callers can't handle -EAGAIN, hence return
                 * -EIO unconditionally for these. */
                if (!interruptible)
                        return -EIO;
 
-               /* Recovery complete, but still wedged means reset failure. */
-               if (recovery_complete)
+               /* Recovery complete, but the reset failed ... */
+               if (wedged == I915_WEDGED)
                        return -EIO;
 
                return -EAGAIN;
@@ -1385,7 +1370,7 @@ out:
                /* If this -EIO is due to a gpu hang, give the reset code a
                 * chance to clean up the mess. Otherwise return the proper
                 * SIGBUS. */
-               if (!atomic_read(&dev_priv->gpu_error.wedged))
+               if (atomic_read(&dev_priv->gpu_error.wedged) == I915_WEDGED)
                        return VM_FAULT_SIGBUS;
        case -EAGAIN:
                /* Give the error handler a chance to run and move the
@@ -4112,7 +4097,7 @@ i915_gem_load(struct drm_device *dev)
                INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
        INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
                          i915_gem_retire_work_handler);
-       init_completion(&dev_priv->gpu_error.completion);
+       init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 
        /* On GEN3 we really need to make sure the ARB C3 LP bit is set */
        if (IS_GEN3(dev)) {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 9d8921a..73d46f4 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -838,8 +838,10 @@ done:
  */
 static void i915_error_work_func(struct work_struct *work)
 {
-       drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
-                                                   gpu_error.work);
+       struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
+                                                   work);
+       drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
+                                                   gpu_error);
        struct drm_device *dev = dev_priv->dev;
        char *error_event[] = { "ERROR=1", NULL };
        char *reset_event[] = { "RESET=1", NULL };
@@ -847,14 +849,19 @@ static void i915_error_work_func(struct work_struct *work)
 
        kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
 
-       if (atomic_read(&dev_priv->gpu_error.wedged)) {
+       if (atomic_read(&error->wedged) == I915_RESET_IN_PROGRESS) {
                DRM_DEBUG_DRIVER("resetting chip\n");
                kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, 
reset_event);
+
                if (!i915_reset(dev)) {
-                       atomic_set(&dev_priv->gpu_error.wedged, 0);
+                       atomic_set(&error->wedged, 0);
                        kobject_uevent_env(&dev->primary->kdev.kobj, 
KOBJ_CHANGE, reset_done_event);
+               } else {
+                       atomic_set(&error->wedged,
+                                  I915_WEDGED);
                }
-               complete_all(&dev_priv->gpu_error.completion);
+
+               wake_up_all(&dev_priv->gpu_error.reset_queue);
        }
 }
 
@@ -1434,11 +1441,11 @@ void i915_handle_error(struct drm_device *dev, bool 
wedged)
        i915_report_and_clear_eir(dev);
 
        if (wedged) {
-               INIT_COMPLETION(dev_priv->gpu_error.completion);
-               atomic_set(&dev_priv->gpu_error.wedged, 1);
+               atomic_set(&dev_priv->gpu_error.wedged, I915_RESET_IN_PROGRESS);
 
                /*
-                * Wakeup waiting processes so they don't hang
+                * Wakeup waiting processes so that the reset work item
+                * doesn't deadlock trying to grab various locks.
                 */
                for_each_ring(ring, dev_priv, i)
                        wake_up_all(&ring->irq_queue);
-- 
1.7.11.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

[Intel-gfx] [PATCH 5/6] drm/i915: clear up wedged transitions

Reply via email to