On 2025-09-08 6:35 p.m., Daniele Ceraolo Spurio wrote:


On 9/2/2025 8:14 AM, Zhanjun Dong wrote:
Boolean flag access from interrupt context might have synchronous issueis
on multiple processor platform, flags modified by one core might be read
as an old value by another core. This issue on interrupt enable flag might
causes interrupt misses or leakage.
Change the interrupts.enable type to atomic to ensure memory
synchronization.

I am not convinced that making this variable atomic is actually helping. The issue is that the tasklet is running after we think we have stopped it, which is much later than us setting guc->interrupts.enabled to false, so there should be no risk of concurrent access to that variable.


Fixes: a187f13d51fa0 ("drm/i915/guc: handle interrupts from media GuC")

I'm not sure if this is the correct guilty patch. While this adds guc- >interrupts.enabled, that shouldn't have changed the timing of things around reset. The issue seems to be that a task already started isn't cleaned up properly, while that patch changes how we stop new tasks from starting.

Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14834
Signed-off-by: Zhanjun Dong <zhanjun.d...@intel.com>
Cc: Andi Shyti <andi.sh...@kernel.org>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>

---
Change history:
v4: Add back skip on disabled case for tasklet
v3: Drop skip on disabled case for tasklet
     Drop memory barrier
v2: Add skip on disabled case for tasklet
     Add memory barrier after flag changed
     Add Closes tag and typo fix
---
  drivers/gpu/drm/i915/gt/intel_gt_irq.c    |  2 +-
  drivers/gpu/drm/i915/gt/uc/intel_guc.c    | 11 +++++++----
  drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  4 ++--
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  3 +++
  drivers/gpu/drm/i915/gt/uc/intel_uc.c     |  5 +++--
  5 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/ i915/gt/intel_gt_irq.c
index 75e802e10be2..21804eec8320 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
@@ -20,7 +20,7 @@
  static void guc_irq_handler(struct intel_guc *guc, u16 iir)
  {
-    if (unlikely(!guc->interrupts.enabled))
+    if (unlikely(!atomic_read(&guc->interrupts.enabled)))
          return;
      if (iir & GUC_INTR_GUC2HOST)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/ i915/gt/uc/intel_guc.c
index f360f020d8f1..1b8d3bbfa16d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -100,8 +100,8 @@ static void gen9_enable_guc_interrupts(struct intel_guc *guc)
               gt->pm_guc_events);
      gen6_gt_pm_enable_irq(gt, gt->pm_guc_events);
      spin_unlock_irq(gt->irq_lock);
+    atomic_set(&guc->interrupts.enabled, true);

Leftover blank line

-    guc->interrupts.enabled = true;
  }
  static void gen9_disable_guc_interrupts(struct intel_guc *guc)
@@ -109,7 +109,8 @@ static void gen9_disable_guc_interrupts(struct intel_guc *guc)
      struct intel_gt *gt = guc_to_gt(guc);
      assert_rpm_wakelock_held(&gt->i915->runtime_pm);
-    guc->interrupts.enabled = false;
+    atomic_set(&guc->interrupts.enabled, false);
+

Extra blank line

    spin_lock_irq(gt->irq_lock);
@@ -146,14 +147,16 @@ static void gen11_enable_guc_interrupts(struct intel_guc *guc)
      __gen11_reset_guc_interrupts(gt);
      spin_unlock_irq(gt->irq_lock);
-    guc->interrupts.enabled = true;
+    atomic_set(&guc->interrupts.enabled, true);
+

extra blank line

  }
  static void gen11_disable_guc_interrupts(struct intel_guc *guc)
  {
      struct intel_gt *gt = guc_to_gt(guc);
-    guc->interrupts.enabled = false;
+    atomic_set(&guc->interrupts.enabled, false);
+
      intel_synchronize_irq(gt->i915);
      gen11_reset_guc_interrupts(guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/ i915/gt/uc/intel_guc.h
index 053780f562c1..40242bbb166e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -93,7 +93,7 @@ struct intel_guc {
      /** @interrupts: pointers to GuC interrupt-managing functions. */
      struct {
-        bool enabled;
+        atomic_t enabled;
          void (*reset)(struct intel_guc *guc);
          void (*enable)(struct intel_guc *guc);
          void (*disable)(struct intel_guc *guc);
@@ -393,7 +393,7 @@ static inline int intel_guc_send_busy_loop(struct intel_guc *guc,
  /* Only call this from the interrupt handler code */
  static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
  {
-    if (guc->interrupts.enabled)
+    if (atomic_read(&guc->interrupts.enabled))
          intel_guc_ct_event_handler(&guc->ct);
  }
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/ drm/i915/gt/uc/intel_guc_ct.c
index 380a11c92d63..f0ee3f1235d4 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -1326,6 +1326,9 @@ static void ct_try_receive_message(struct intel_guc_ct *ct)
  {
      int ret;
+    if (!atomic_read(&ct_to_guc(ct)->interrupts.enabled))
+        return;
+

The GEM_WARN_ON below is there to guarantee that the tasklet is not running when we don't expect it to. By adding this check above, you make the check below impossible to hit, because interrupts.enabled is always set to false before ct->enabled, so we'd always exit silently here before reaching the GEM_WARN_ON.

I see 2 alternative approaches here:

1) If you can motivate the fact that we're ok with exiting silently here without logging an error, then just remove the WARN_ON from below and exit silently if !ct->enabled 2) Make absolutely sure that the tasklet is gone and not running before setting ct->enabled to false (e.g., you could leave the tasklet as disabled and re-enable it after the GuC is re-loaded).

Daniele

I agree with Daniele that make enable flag atomic is not a fix, it looks more like an improvement, but not fix the issue in tasklet. This patch will not moving forward and I will make another patch that focus on skip the GEM_WARN_ON under some condition.

Regards,
Zhanjun Dong


      if (GEM_WARN_ON(!ct->enabled))
          return;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/ i915/gt/uc/intel_uc.c
index 4a3493e8d433..9d01c3c3d504 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -659,7 +659,8 @@ void intel_uc_runtime_suspend(struct intel_uc *uc)
      struct intel_guc *guc = &uc->guc;
      if (!intel_guc_is_ready(guc)) {
-        guc->interrupts.enabled = false;
+        atomic_set(&guc->interrupts.enabled, false);
+
          return;
      }
@@ -687,7 +688,7 @@ void intel_uc_suspend(struct intel_uc *uc)
      wake_up_all_tlb_invalidate(guc);
      if (!intel_guc_is_ready(guc)) {
-        guc->interrupts.enabled = false;
+        atomic_set(&guc->interrupts.enabled, false);
          return;
      }


Reply via email to