From: Tvrtko Ursulin <tvrtko.ursu...@intel.com>

This reduces the cost of the software engine busyness tracking
to a single no-op instruction when there are no listeners, by
gating the accounting behind a static key.

We add a new i915 ordered workqueue to be used only for tasks
not needing struct mutex.
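
For reference, a minimal sketch of the gating pattern (illustrative
names, not the identifiers used in the patch): the hot path costs a
single no-op until the first listener flips the key, and because
toggling a static branch can sleep, it must happen outside of atomic
context:

  #include <linux/static_key.h>
  #include <linux/mutex.h>

  static DEFINE_STATIC_KEY_FALSE(stats_key);
  static DEFINE_MUTEX(stats_mutex);
  static int stats_ref;

  void stats_get(void)
  {
          mutex_lock(&stats_mutex);
          if (stats_ref++ == 0)
                  static_branch_enable(&stats_key); /* may sleep */
          mutex_unlock(&stats_mutex);
  }

  void stats_put(void)
  {
          mutex_lock(&stats_mutex);
          if (--stats_ref == 0)
                  static_branch_disable(&stats_key); /* may sleep */
          mutex_unlock(&stats_mutex);
  }

  void hot_path(void)
  {
          /* A single no-op instruction while the key is false. */
          if (static_branch_unlikely(&stats_key)) {
                  /* ... expensive accounting ... */
          }
  }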

v2: Rebase and some comments.
v3: Rebase.
v4: Checkpatch fixes.
v5: Rebase.
v6: Use system_long_wq to avoid being blocked by struct_mutex
    users.
v7: Fix bad conflict resolution from last rebase. (Dmitry Rogozhkin)
v8: Rebase.
v9:
 * Fix race between unordered enable followed by disable.
   (Chris Wilson)
 * Prettify order of local variable declarations. (Chris Wilson)
v10:
 Chris Wilson:
  * Fix workqueue allocation failure handling.
  * Expand comment about critical workqueue ordering.
  * Add some GEM_BUG_ONs to document impossible conditions.
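
For reference, a sketch of how the PMU callbacks hand off to the
ordered workqueue (illustrative names; the real code uses
engine->pmu.enable_busy_stats and engine->pmu.disable_busy_stats, and
the ordering argument is spelled out in the comment added to
i915_pmu_disable() below):

  #include <linux/workqueue.h>
  #include <linux/timer.h>

  struct busy_stats {
          struct workqueue_struct *wq; /* alloc_ordered_workqueue() */
          struct work_struct enable_work;
          struct delayed_work disable_work;
  };

  /* Called with the PMU spinlock held, must not sleep. */
  void busy_stats_on(struct busy_stats *bs)
  {
          /*
           * If a delayed disable was still pending, cancelling it is
           * enough; stats never got disabled so no enable work is
           * needed.
           */
          if (!cancel_delayed_work(&bs->disable_work))
                  queue_work(bs->wq, &bs->enable_work);
  }

  /* Called with the PMU spinlock held, must not sleep. */
  void busy_stats_off(struct busy_stats *bs)
  {
          /*
           * Delay the disable to absorb rapid on/off cycles; the
           * ordered wq guarantees it cannot overtake a queued enable.
           */
          queue_delayed_work(bs->wq, &bs->disable_work,
                             round_jiffies_up_relative(HZ));
  }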

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
Reviewed-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |  13 +++-
 drivers/gpu/drm/i915/i915_drv.h         |   5 ++
 drivers/gpu/drm/i915/i915_pmu.c         |  65 ++++++++++++++++++--
 drivers/gpu/drm/i915/intel_engine_cs.c  |  17 ++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h | 101 ++++++++++++++++++++------------
 5 files changed, 158 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d373a840566a..3853038c6fdd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -804,12 +804,22 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv)
        if (dev_priv->wq == NULL)
                goto out_err;
 
+       /*
+        * Ordered workqueue for tasks not needing the global mutex but
+        * which still need to be serialized.
+        */
+       dev_priv->wq_misc = alloc_ordered_workqueue("i915-misc", 0);
+       if (dev_priv->wq_misc == NULL)
+               goto out_free_wq;
+
        dev_priv->hotplug.dp_wq = alloc_ordered_workqueue("i915-dp", 0);
        if (dev_priv->hotplug.dp_wq == NULL)
-               goto out_free_wq;
+               goto out_free_wq_misc;
 
        return 0;
 
+out_free_wq_misc:
+       destroy_workqueue(dev_priv->wq_misc);
 out_free_wq:
        destroy_workqueue(dev_priv->wq);
 out_err:
@@ -830,6 +840,7 @@ static void i915_engines_cleanup(struct drm_i915_private *i915)
 static void i915_workqueues_cleanup(struct drm_i915_private *dev_priv)
 {
        destroy_workqueue(dev_priv->hotplug.dp_wq);
+       destroy_workqueue(dev_priv->wq_misc);
        destroy_workqueue(dev_priv->wq);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2904ac4b2ae5..99989cd778a9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2356,6 +2356,11 @@ struct drm_i915_private {
         */
        struct workqueue_struct *wq;
 
+       /**
+        * @wq_misc: Workqueue for serialized tasks not needing struct mutex.
+        */
+       struct workqueue_struct *wq_misc;
+
        /* Display functions */
        struct drm_i915_display_funcs display;
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 2671e3640639..d552e415becd 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -458,11 +458,20 @@ static void i915_pmu_enable(struct perf_event *event)
                GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
                GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
                if (engine->pmu.enable_count[sample]++ == 0) {
+                       /*
+                        * Enable engine busy stats tracking if needed, or
+                        * cancel their previously scheduled disabling.
+                        *
+                        * If a delayed disable was pending, cancelling it is
+                        * enough; stats tracking is then still enabled and
+                        * there is no need to queue a new enable.
+                        */
                        if (engine_needs_busy_stats(engine) &&
                            !engine->pmu.busy_stats) {
-                               engine->pmu.busy_stats =
-                                       intel_enable_engine_stats(engine) == 0;
-                               WARN_ON_ONCE(!engine->pmu.busy_stats);
+                               engine->pmu.busy_stats = true;
+                       if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+                               queue_work(i915->wq_misc,
+                                          &engine->pmu.enable_busy_stats);
                        }
                }
        }
@@ -505,7 +514,22 @@ static void i915_pmu_disable(struct perf_event *event)
                        if (!engine_needs_busy_stats(engine) &&
                            engine->pmu.busy_stats) {
                                engine->pmu.busy_stats = false;
-                               intel_disable_engine_stats(engine);
+                               /*
+                                * We request a delayed disable so that the
+                                * rapid on/off event cycles, which can
+                                * happen when tools like perf stat start,
+                                * are handled gracefully.
+                                *
+                                * PMU enable and disable callbacks are
+                                * serialized by the spinlock, and since the
+                                * actual enable/disable runs decoupled in
+                                * workers, it is critical to use an ordered
+                                * workqueue to preserve that ordering
+                                * between the two events.
+                                */
+                               queue_delayed_work(i915->wq_misc,
+                                                  &engine->pmu.disable_busy_stats,
+                                                  round_jiffies_up_relative(HZ));
                        }
                }
        }
@@ -689,8 +713,26 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
        return 0;
 }
 
+static void __enable_busy_stats(struct work_struct *work)
+{
+       struct intel_engine_cs *engine =
+               container_of(work, typeof(*engine), pmu.enable_busy_stats);
+
+       WARN_ON_ONCE(intel_enable_engine_stats(engine));
+}
+
+static void __disable_busy_stats(struct work_struct *work)
+{
+       struct intel_engine_cs *engine =
+              container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
+
+       intel_disable_engine_stats(engine);
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
        int ret;
 
        if (INTEL_GEN(i915) <= 2) {
@@ -725,6 +767,12 @@ void i915_pmu_register(struct drm_i915_private *i915)
        i915->pmu.timer.function = i915_sample;
        i915->pmu.enable = 0;
 
+       for_each_engine(engine, i915, id) {
+               INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats);
+               INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
+                                 __disable_busy_stats);
+       }
+
        ret = perf_pmu_register(&i915->pmu.base, "i915", -1);
        if (ret == 0)
                return;
@@ -743,6 +791,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
 
 void i915_pmu_unregister(struct drm_i915_private *i915)
 {
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+
        if (!i915->pmu.base.event_init)
                return;
 
@@ -754,6 +805,12 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 
        hrtimer_cancel(&i915->pmu.timer);
 
+       for_each_engine(engine, i915, id) {
+               GEM_BUG_ON(engine->pmu.busy_stats);
+               GEM_BUG_ON(flush_work(&engine->pmu.enable_busy_stats));
+               flush_delayed_work(&engine->pmu.disable_busy_stats);
+       }
+
        perf_pmu_unregister(&i915->pmu.base);
        i915->pmu.base.event_init = NULL;
 }
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 51885629d3df..80a7b55184df 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -21,6 +21,7 @@
  * IN THE SOFTWARE.
  *
  */
+#include <linux/static_key.h>
 
 #include <drm/drm_print.h>
 
@@ -1813,6 +1814,10 @@ intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance)
        return i915->engine_class[class][instance];
 }
 
+DEFINE_STATIC_KEY_FALSE(i915_engine_stats_key);
+static DEFINE_MUTEX(i915_engine_stats_mutex);
+static int i915_engine_stats_ref;
+
 /**
  * intel_enable_engine_stats() - Enable engine busy tracking on engine
  * @engine: engine to enable stats collection
@@ -1828,6 +1833,8 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
        if (!i915_modparams.enable_execlists)
                return -ENODEV;
 
+       mutex_lock(&i915_engine_stats_mutex);
+
        spin_lock_irqsave(&engine->stats.lock, flags);
        if (engine->stats.enabled == ~0)
                goto busy;
@@ -1835,10 +1842,16 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
                engine->stats.enabled_at = ktime_get();
        spin_unlock_irqrestore(&engine->stats.lock, flags);
 
+       if (i915_engine_stats_ref++ == 0)
+               static_branch_enable(&i915_engine_stats_key);
+
+       mutex_unlock(&i915_engine_stats_mutex);
+
        return 0;
 
 busy:
        spin_unlock_irqrestore(&engine->stats.lock, flags);
+       mutex_unlock(&i915_engine_stats_mutex);
 
        return -EBUSY;
 }
@@ -1856,6 +1869,7 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine)
        if (!i915_modparams.enable_execlists)
                return;
 
+       mutex_lock(&i915_engine_stats_mutex);
        spin_lock_irqsave(&engine->stats.lock, flags);
        WARN_ON_ONCE(engine->stats.enabled == 0);
        if (--engine->stats.enabled == 0) {
@@ -1865,6 +1879,9 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine)
                engine->stats.total = 0;
        }
        spin_unlock_irqrestore(&engine->stats.lock, flags);
+       if (--i915_engine_stats_ref == 0)
+               static_branch_disable(&i915_engine_stats_key);
+       mutex_unlock(&i915_engine_stats_mutex);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b7f0cd449824..0202b1fa42fd 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -363,6 +363,22 @@ struct intel_engine_cs {
                 *              requested.
                 */
                bool busy_stats;
+               /**
+                * @enable_busy_stats: Work item for engine busy stats enabling.
+                *
+                * Since the action can sleep it needs to be decoupled from the
+                * perf API callback.
+                */
+               struct work_struct enable_busy_stats;
+               /**
+                * @disable_busy_stats: Work item for busy stats disabling.
+                *
+                * Same as with @enable_busy_stats action, with the difference
+                * that we delay it in case there are rapid enable-disable
+                * actions, which can happen during tool startup (like perf
+                * stat).
+                */
+               struct delayed_work disable_busy_stats;
        } pmu;
 
        /*
@@ -906,59 +922,68 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *p);
 struct intel_engine_cs *
 intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance);
 
+DECLARE_STATIC_KEY_FALSE(i915_engine_stats_key);
+
 static inline void intel_engine_context_in(struct intel_engine_cs *engine)
 {
        unsigned long flags;
 
-       if (READ_ONCE(engine->stats.enabled) == 0)
-               return;
+       if (static_branch_unlikely(&i915_engine_stats_key)) {
+               if (READ_ONCE(engine->stats.enabled) == 0)
+                       return;
 
-       spin_lock_irqsave(&engine->stats.lock, flags);
+               spin_lock_irqsave(&engine->stats.lock, flags);
 
-       if (engine->stats.enabled > 0) {
-               if (engine->stats.active++ == 0)
-                       engine->stats.start = ktime_get();
-               GEM_BUG_ON(engine->stats.active == 0);
-       }
+               if (engine->stats.enabled > 0) {
+                       if (engine->stats.active++ == 0)
+                               engine->stats.start = ktime_get();
+                       GEM_BUG_ON(engine->stats.active == 0);
+               }
 
-       spin_unlock_irqrestore(&engine->stats.lock, flags);
+               spin_unlock_irqrestore(&engine->stats.lock, flags);
+       }
 }
 
 static inline void intel_engine_context_out(struct intel_engine_cs *engine)
 {
        unsigned long flags;
 
-       if (READ_ONCE(engine->stats.enabled) == 0)
-               return;
-
-       spin_lock_irqsave(&engine->stats.lock, flags);
-
-       if (engine->stats.enabled > 0) {
-               ktime_t last, now = ktime_get();
-
-               if (engine->stats.active && --engine->stats.active == 0) {
-                       /*
-                        * Decrement the active context count and in case GPU
-                        * is now idle add up to the running total.
-                        */
-                       last = ktime_sub(now, engine->stats.start);
-
-                       engine->stats.total = ktime_add(engine->stats.total,
-                                                       last);
-               } else if (engine->stats.active == 0) {
-                       /*
-                        * After turning on engine stats, context out might be
-                        * the first event in which case we account from the
-                        * time stats gathering was turned on.
-                        */
-                       last = ktime_sub(now, engine->stats.enabled_at);
-
-                       engine->stats.total = ktime_add(engine->stats.total,
-                                                       last);
+       if (static_branch_unlikely(&i915_engine_stats_key)) {
+               if (READ_ONCE(engine->stats.enabled) == 0)
+                       return;
+
+               spin_lock_irqsave(&engine->stats.lock, flags);
+
+               if (engine->stats.enabled > 0) {
+                       ktime_t last, now = ktime_get();
+
+                       if (engine->stats.active &&
+                           --engine->stats.active == 0) {
+                               /*
+                                * Decrement the active context count and in
+                                * case GPU is now idle add up to the running
+                                * total.
+                                */
+                               last = ktime_sub(now, engine->stats.start);
+
+                               engine->stats.total =
+                                       ktime_add(engine->stats.total, last);
+                       } else if (engine->stats.active == 0) {
+                               /*
+                                * After turning on engine stats, context out
+                                * might be the first event in which case we
+                                * account from the time stats gathering was
+                                * turned on.
+                                */
+                               last = ktime_sub(now, engine->stats.enabled_at);
+
+                               engine->stats.total =
+                                       ktime_add(engine->stats.total, last);
+                       }
                }
-       }
 
-       spin_unlock_irqrestore(&engine->stats.lock, flags);
+               spin_unlock_irqrestore(&engine->stats.lock, flags);
+       }
 }
 
 int intel_enable_engine_stats(struct intel_engine_cs *engine);
-- 
2.9.5
