Add debugfs functions and embedded test infrastructure in the context event
interrupt handler for simulating the loss of context event interrupts, so that
a context submission state inconsistency can be induced. This is useful for
testing the consistency checker pre-stage to the engine hang recovery path:
in order to test that the inconsistency detection works, we first need to
induce a state inconsistency that the inconsistency checker can detect and act
upon.

Signed-off-by: Tomas Elf <tomas....@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 88 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_dma.c     |  2 +
 drivers/gpu/drm/i915/i915_drv.c     |  3 ++
 drivers/gpu/drm/i915/i915_drv.h     | 12 +++++
 drivers/gpu/drm/i915/intel_lrc.c    | 68 ++++++++++++++++++++++++++++
 5 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index edb79a7..233088e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4667,6 +4667,93 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
                        "%llu\n");
 
 static int
+i915_fake_ctx_submission_inconsistency_get(void *data, u64 *val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct intel_engine_cs *ring;
+       unsigned i; /* engine index; selects nibble i of the fake mask */
+
+       DRM_INFO("Faked inconsistent context submission state: %x\n",
+               dev_priv->gpu_error.faked_lost_ctx_event_irq);
+
+       for_each_ring(ring, dev_priv, i) {
+               u32 fake_cnt = /* IRQs still to be dropped on this engine */
+                       (dev_priv->gpu_error.faked_lost_ctx_event_irq >> 
(i<<2)) & 0xf;
+
+               DRM_INFO("%s: Faking %s [%u IRQs left to drop]\n",
+                       ring->name,
+                       fake_cnt?"enabled":"disabled",
+                       fake_cnt);
+       }
+
+       *val = (u64) dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+       return 0; /* always succeeds; *val is the raw mask for all engines */
+}
+
+static int
+i915_fake_ctx_submission_inconsistency_set(void *data, u64 val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       u32 fake_status;
+
+       /*
+        * Set up a simulated/faked lost context event interrupt. This is used
+        * to induce inconsistent HW/driver states for the context submission
+        * status consistency checker (involved as a pre-stage to GPU engine
+        * hang recovery) to detect, which is required for validation purposes.
+        *
+        * val contains the new faked_lost_ctx_event_irq word that is to be
+        * OR-ed into the already set faked_lost_ctx_event_irq word.
+        *
+        * val == 0 means clear all previously set fake bits.
+        *
+        * Each nibble contains a number between 0-15 denoting the number of
+        * interrupts left to lose on the engine that nibble corresponds to.
+        *
+        * RCS: faked_lost_ctx_event_irq[3:0]
+        * VCS: faked_lost_ctx_event_irq[7:4]
+        * BCS: faked_lost_ctx_event_irq[11:8]
+        * VECS: faked_lost_ctx_event_irq[15:12]
+        * etc
+        *
+        * The number in each nibble is decremented by the context event
+        * interrupt handler in intel_lrc.c once the faked interrupt loss is
+        * executed. If a targeted interrupt is received while the nibble
+        * for that engine is non-zero, that interrupt will be dropped
+        * without side-effects, thus inducing an inconsistency since the
+        * hardware has entered a state where removal of a context from the
+        * context queue is required but the driver is not informed of this and
+        * is therefore stuck in that state until inconsistency rectification
+        * (forced CSB checking) or reboot.
+        */
+
+       fake_status =
+               dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+       DRM_INFO("Faking lost context event IRQ (new status: %x, old status: 
%x)\n",
+               (u32) val, fake_status); /* raw input, pre-merge mask */
+
+       if (val) {
+               dev_priv->gpu_error.faked_lost_ctx_event_irq |= ((u32) val);
+       } else {
+               DRM_INFO("Clearing lost context event IRQ mask\n");
+
+               dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+       }
+
+
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_fake_ctx_submission_inconsistency_fops,
+                       i915_fake_ctx_submission_inconsistency_get,
+                       i915_fake_ctx_submission_inconsistency_set,
+                       "%llu\n"); /* mask is reported in decimal */
+
+static int
 i915_ring_stop_get(void *data, u64 *val)
 {
        struct drm_device *dev = data;
@@ -5320,6 +5407,7 @@ static const struct i915_debugfs_files {
        const struct file_operations *fops;
 } i915_debugfs_files[] = {
        {"i915_wedged", &i915_wedged_fops},
+       {"i915_fake_ctx_inconsistency", 
&i915_fake_ctx_submission_inconsistency_fops},
        {"i915_max_freq", &i915_max_freq_fops},
        {"i915_min_freq", &i915_min_freq_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1e203e7d..e8193c9 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -843,6 +843,8 @@ i915_hangcheck_init(struct drm_device *dev)
        int i;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
+       dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
        for (i = 0; i < I915_NUM_RINGS; i++) {
                struct intel_engine_cs *engine = &dev_priv->ring[i];
                struct intel_ring_hangcheck *hc = &engine->hangcheck;
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c361b19..c32c475 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -917,6 +917,9 @@ int i915_reset(struct drm_device *dev)
                }
        }
 
+       /* Clear simulated lost context event interrupts */
+       dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
        if (i915_stop_ring_allow_warn(dev_priv))
                pr_notice("drm/i915: Resetting chip after gpu hang\n");
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9219904..7ebf800 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1400,6 +1400,18 @@ struct i915_gpu_error {
 #define I915_STOP_RING_ALLOW_BAN       (1 << 31)
 #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
 
+       /*
+        * Per-engine counters for simulation of lost context event IRQs.
+        * Each engine owns one 4-bit nibble:
+        *
+        *   Bits 3:0:   Number of lost IRQs to be faked on RCS
+        *   Bits 7:4:   Number of lost IRQs to be faked on VCS
+        *   Bits 11:8:  Number of lost IRQs to be faked on BCS
+        *   Bits 15:12: Number of lost IRQs to be faked on VECS
+        *   Bits 19:16: Number of lost IRQs to be faked on VCS2
+        */
+       u32 faked_lost_ctx_event_irq;
+
        /* For missed irq/seqno simulation. */
        unsigned int test_irq_rings;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b48f74c..5bb7d6e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -700,6 +700,52 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
 }
 
 /**
+ * fake_lost_ctx_event_irq() - Check for a pending faked lost ctx event IRQ.
+ * @dev_priv: Driver private data holding gpu_error.faked_lost_ctx_event_irq.
+ * @ring: Engine to check pending faked lost IRQs for.
+ *
+ * If the nibble of dev_priv->gpu_error.faked_lost_ctx_event_irq belonging to
+ * @ring (selected by ring->id) is non-zero, decrements that nibble by one.
+ *
+ * Return:
+ *     true: If the current IRQ is to be lost.
+ *     false: If the current IRQ is to be processed as normal.
+ */
+static inline bool fake_lost_ctx_event_irq(struct drm_i915_private *dev_priv,
+                                          struct intel_engine_cs *ring)
+{
+       u32 *faked_lost_irq_mask =
+               &dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+       /*
+        * Point out the least significant bit in the nibble of the faked lost
+        * context event IRQ mask that corresponds to the engine at hand.
+        */
+       u32 engine_nibble = (ring->id << 2);
+
+       /* Check engine nibble for any pending IRQs to be simulated as lost */
+       if (*faked_lost_irq_mask & (0xf << engine_nibble)) {
+               DRM_INFO("Faked lost interrupt on %s! (%x)\n",
+                       ring->name,
+                       *faked_lost_irq_mask);
+
+               /*
+                * Consume one faked loss. The nibble is non-zero here, so the
+                * subtraction cannot borrow from another engine's nibble.
+                */
+               *faked_lost_irq_mask -= (0x1 << engine_nibble);
+
+               DRM_INFO("New fake lost irq mask: %x\n",
+                       *faked_lost_irq_mask);
+
+               /* Tell the IRQ handler to simulate lost context event IRQ */
+               return true;
+       }
+
+       return false;
+}
+
+/**
  * intel_lrc_irq_handler() - handle Context Switch interrupts
  * @ring: Engine Command Streamer to handle.
  * @do_lock: Lock execlist spinlock (if false the caller is responsible for this)
@@ -740,6 +786,23 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 
                if (status & GEN8_CTX_STATUS_PREEMPTED) {
                        if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
+                               if (fake_lost_ctx_event_irq(dev_priv, ring)) {
+                                   /*
+                                    * If we want to simulate the loss of a
+                                    * context event IRQ (only for such events
+                                    * that could affect the execlist queue,
+                                    * since this is something that could
+                                    * affect the context submission status
+                                    * consistency checker) then just exit the
+                                    * IRQ handler early with no side-effects!
+                                    * We want to pretend like this IRQ never
+                                    * happened. The next time the IRQ handler
+                                    * is entered for this engine the CSB
+                                    * events should remain in the CSB, waiting
+                                    * to be processed.
+                                    */
+                                   goto exit;
+                               }
                                if (execlists_check_remove_request(ring, 
status_id))
                                        WARN(1, "Lite Restored request removed 
from queue\n");
                        } else
@@ -748,6 +811,10 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 
                 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
                     (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
+
+                       if (fake_lost_ctx_event_irq(dev_priv, ring))
+                           goto exit;
+
                        if (execlists_check_remove_request(ring, status_id))
                                submit_contexts++;
                }
@@ -770,6 +837,7 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
                                 ((u32)ring->next_context_status_buffer &
                                  GEN8_CSB_PTR_MASK) << 8));
 
+exit:
        if (do_lock)
                spin_unlock(&ring->execlist_lock);
 
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to