Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  94 +----------
 drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
 drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
 drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
 drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
 .../gpu/drm/i915/selftests/i915_gem_context.c |   7 -
 drivers/gpu/drm/i915/selftests/i915_request.c |   7 -
 8 files changed, 6 insertions(+), 263 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index a8830d7d1617..3c89ab5ea480 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1319,9 +1319,7 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
                seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
                           engine->hangcheck.seqno, seqno[id],
                           intel_engine_last_submit(engine));
-               seq_printf(m, "\tfake irq active? %s, stalled? %s, wedged? 
%s\n",
-                          yesno(test_bit(engine->id,
-                                         
&dev_priv->gpu_error.missed_irq_rings)),
+               seq_printf(m, "\tstalled? %s, wedged? %s\n",
                           yesno(engine->hangcheck.stalled),
                           yesno(engine->hangcheck.wedged));
 
@@ -3886,94 +3884,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
                        i915_wedged_get, i915_wedged_set,
                        "%llu\n");
 
-static int
-fault_irq_set(struct drm_i915_private *i915,
-             unsigned long *irq,
-             unsigned long val)
-{
-       int err;
-
-       err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-       if (err)
-               return err;
-
-       err = i915_gem_wait_for_idle(i915,
-                                    I915_WAIT_LOCKED |
-                                    I915_WAIT_INTERRUPTIBLE,
-                                    MAX_SCHEDULE_TIMEOUT);
-       if (err)
-               goto err_unlock;
-
-       *irq = val;
-       mutex_unlock(&i915->drm.struct_mutex);
-
-       /* Flush idle worker to disarm irq */
-       drain_delayed_work(&i915->gt.idle_work);
-
-       return 0;
-
-err_unlock:
-       mutex_unlock(&i915->drm.struct_mutex);
-       return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-       struct drm_i915_private *dev_priv = data;
-
-       *val = dev_priv->gpu_error.missed_irq_rings;
-       return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-       struct drm_i915_private *i915 = data;
-
-       return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-                       i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-                       "0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-       struct drm_i915_private *dev_priv = data;
-
-       *val = dev_priv->gpu_error.test_irq_rings;
-
-       return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-       struct drm_i915_private *i915 = data;
-
-       /* GuC keeps the user interrupt permanently enabled for submission */
-       if (USES_GUC_SUBMISSION(i915))
-               return -ENODEV;
-
-       /*
-        * From icl, we can no longer individually mask interrupt generation
-        * from each engine.
-        */
-       if (INTEL_GEN(i915) >= 11)
-               return -ENODEV;
-
-       val &= INTEL_INFO(i915)->ring_mask;
-       DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-       return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-                       i915_ring_test_irq_get, i915_ring_test_irq_set,
-                       "0x%08llx\n");
-
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND     BIT(1)
 #define DROP_RETIRE    BIT(2)
@@ -4735,8 +4645,6 @@ static const struct i915_debugfs_files {
 } i915_debugfs_files[] = {
        {"i915_wedged", &i915_wedged_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
-       {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
-       {"i915_ring_test_irq", &i915_ring_test_irq_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
        {"i915_error_state", &i915_error_state_fops},
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index c6057126bb53..ccdb412f6a75 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -721,8 +721,6 @@ static void __err_print_to_sgl(struct 
drm_i915_error_state_buf *m,
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);
-       err_printf(m, "Missed interrupts: 0x%08lx\n",
-                  m->i915->gpu_error.missed_irq_rings);
 
        for (i = 0; i < error->nfence; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index cb6a6add543e..5231d433a880 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -205,8 +205,6 @@ struct i915_gpu_error {
 
        atomic_t pending_fb_pin;
 
-       unsigned long missed_irq_rings;
-
        /**
         * State variable controlling the reset flow and count
         *
@@ -275,9 +273,6 @@ struct i915_gpu_error {
         */
        wait_queue_head_t reset_queue;
 
-       /* For missed irq/seqno simulation. */
-       unsigned long test_irq_rings;
-
        struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c 
b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index f9fd967bb50c..f1bd7e51d951 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -94,7 +94,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs 
*engine)
 
        spin_lock(&b->irq_lock);
 
-       b->irq_fired = true;
        if (b->irq_armed && list_empty(&b->signalers))
                __intel_breadcrumbs_disarm_irq(b);
 
@@ -158,86 +157,6 @@ static void signal_irq_work(struct irq_work *work)
        intel_engine_breadcrumbs_irq(engine);
 }
 
-static unsigned long wait_timeout(void)
-{
-       return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
-}
-
-static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
-{
-       if (GEM_SHOW_DEBUG()) {
-               struct drm_printer p = drm_debug_printer(__func__);
-
-               intel_engine_dump(engine, &p,
-                                 "%s missed breadcrumb at %pS\n",
-                                 engine->name, __builtin_return_address(0));
-       }
-
-       set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-}
-
-static void intel_breadcrumbs_hangcheck(struct timer_list *t)
-{
-       struct intel_engine_cs *engine =
-               from_timer(engine, t, breadcrumbs.hangcheck);
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       if (!b->irq_armed)
-               return;
-
-       if (b->irq_fired)
-               goto rearm;
-
-       /*
-        * We keep the hangcheck timer alive until we disarm the irq, even
-        * if there are no waiters at present.
-        *
-        * If the waiter was currently running, assume it hasn't had a chance
-        * to process the pending interrupt (e.g, low priority task on a loaded
-        * system) and wait until it sleeps before declaring a missed interrupt.
-        *
-        * If the waiter was asleep (and not even pending a wakeup), then we
-        * must have missed an interrupt as the GPU has stopped advancing
-        * but we still have a waiter. Assuming all batches complete within
-        * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
-        */
-       synchronize_hardirq(engine->i915->drm.irq);
-       if (intel_engine_signal_breadcrumbs(engine)) {
-               missed_breadcrumb(engine);
-               mod_timer(&b->fake_irq, jiffies + 1);
-       } else {
-rearm:
-               b->irq_fired = false;
-               mod_timer(&b->hangcheck, wait_timeout());
-       }
-}
-
-static void intel_breadcrumbs_fake_irq(struct timer_list *t)
-{
-       struct intel_engine_cs *engine =
-               from_timer(engine, t, breadcrumbs.fake_irq);
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       /*
-        * The timer persists in case we cannot enable interrupts,
-        * or if we have previously seen seqno/interrupt incoherency
-        * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
-        * Here the worker will wake up every jiffie in order to kick the
-        * oldest waiter to do the coherent seqno check.
-        */
-
-       if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
-               return;
-
-       /* If the user has disabled the fake-irq, restore the hangchecking */
-       if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
-               mod_timer(&b->hangcheck, wait_timeout());
-               return;
-       }
-
-       mod_timer(&b->fake_irq, jiffies + 1);
-}
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
 {
        struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -260,43 +179,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct 
intel_engine_cs *engine)
        spin_unlock_irq(&b->irq_lock);
 }
 
-static bool use_fake_irq(const struct intel_breadcrumbs *b)
-{
-       const struct intel_engine_cs *engine =
-               container_of(b, struct intel_engine_cs, breadcrumbs);
-
-       if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-               return false;
-
-       /*
-        * Only start with the heavy weight fake irq timer if we have not
-        * seen any interrupts since enabling it the first time. If the
-        * interrupts are still arriving, it means we made a mistake in our
-        * engine->seqno_barrier(), a timing error that should be transient
-        * and unlikely to reoccur.
-        */
-       return !b->irq_fired;
-}
-
-static void enable_fake_irq(struct intel_breadcrumbs *b)
-{
-       /* Ensure we never sleep indefinitely */
-       if (!b->irq_enabled || use_fake_irq(b))
-               mod_timer(&b->fake_irq, jiffies + 1);
-       else
-               mod_timer(&b->hangcheck, wait_timeout());
-}
-
-static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
+static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
        struct intel_engine_cs *engine =
                container_of(b, struct intel_engine_cs, breadcrumbs);
-       struct drm_i915_private *i915 = engine->i915;
-       bool enabled;
 
        lockdep_assert_held(&b->irq_lock);
        if (b->irq_armed)
-               return false;
+               return;
 
        /*
         * The breadcrumb irq will be disarmed on the interrupt after the
@@ -314,16 +204,8 @@ static bool __intel_breadcrumbs_arm_irq(struct 
intel_breadcrumbs *b)
         * the driver is idle) we disarm the breadcrumbs.
         */
 
-       /* No interrupts? Kick the waiter every jiffie! */
-       enabled = false;
-       if (!b->irq_enabled++ &&
-           !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
+       if (!b->irq_enabled++)
                irq_enable(engine);
-               enabled = true;
-       }
-
-       enable_fake_irq(b);
-       return enabled;
 }
 
 void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -334,18 +216,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs 
*engine)
        INIT_LIST_HEAD(&b->signalers);
 
        init_irq_work(&b->irq_work, signal_irq_work);
-
-       timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
-       timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
-}
-
-static void cancel_fake_irq(struct intel_engine_cs *engine)
-{
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
-       del_timer_sync(&b->hangcheck);
-       clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
@@ -355,13 +225,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs 
*engine)
 
        spin_lock_irqsave(&b->irq_lock, flags);
 
-       /*
-        * Leave the fake_irq timer enabled (if it is running), but clear the
-        * bit so that it turns itself off on its next wake up and goes back
-        * to the long hangcheck interval if still required.
-        */
-       clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-
        if (b->irq_enabled)
                irq_enable(engine);
        else
@@ -372,7 +235,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs 
*engine)
 
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 {
-       cancel_fake_irq(engine);
 }
 
 bool intel_engine_enable_signaling(struct i915_request *rq)
@@ -454,7 +316,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs 
*engine,
                }
        }
        spin_unlock_irq(&b->irq_lock);
-
-       if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-               drm_printf(p, "Fake irq active\n");
 }
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c 
b/drivers/gpu/drm/i915/intel_hangcheck.c
index 7dc11fcb13de..bc8d884f24bd 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -271,6 +271,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        for_each_engine(engine, dev_priv, id) {
                struct intel_engine_hangcheck hc;
 
+               intel_engine_signal_breadcrumbs(engine);
+
                hangcheck_load_sample(engine, &hc);
                hangcheck_accumulate_sample(engine, &hc);
                hangcheck_store_sample(engine, &hc);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ca56816902e6..c1de9d6675c3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -386,14 +386,9 @@ struct intel_engine_cs {
 
                struct irq_work irq_work;
 
-               struct timer_list fake_irq; /* used after a missed interrupt */
-               struct timer_list hangcheck; /* detect missed interrupts */
-
-               unsigned int hangcheck_interrupts;
                unsigned int irq_enabled;
 
                bool irq_armed;
-               bool irq_fired;
        } breadcrumbs;
 
        struct {
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 4cba50679607..2cca234fd291 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -65,7 +65,6 @@ static int begin_live_test(struct live_test *t,
                return err;
        }
 
-       i915->gpu_error.missed_irq_rings = 0;
        t->reset_global = i915_reset_count(&i915->gpu_error);
 
        for_each_engine(engine, i915, id)
@@ -103,12 +102,6 @@ static int end_live_test(struct live_test *t)
                return -EIO;
        }
 
-       if (i915->gpu_error.missed_irq_rings) {
-               pr_err("%s(%s): Missed interrupts on engines %lx\n",
-                      t->func, t->name, i915->gpu_error.missed_irq_rings);
-               return -EIO;
-       }
-
        return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c 
b/drivers/gpu/drm/i915/selftests/i915_request.c
index 48b6159afe64..5953a47827a7 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -535,7 +535,6 @@ static int begin_live_test(struct live_test *t,
                return err;
        }
 
-       i915->gpu_error.missed_irq_rings = 0;
        t->reset_count = i915_reset_count(&i915->gpu_error);
 
        return 0;
@@ -559,12 +558,6 @@ static int end_live_test(struct live_test *t)
                return -EIO;
        }
 
-       if (i915->gpu_error.missed_irq_rings) {
-               pr_err("%s(%s): Missed interrupts on engines %lx\n",
-                      t->func, t->name, i915->gpu_error.missed_irq_rings);
-               return -EIO;
-       }
-
        return 0;
 }
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to