From: Tvrtko Ursulin <tvrtko.ursu...@intel.com>

We can use engine busy stats instead of the MMIO sampling timer
for better efficiency.

As a minimum this saves period * num_engines / sec MMIO reads,
and in a better case, when only engine busy samplers are active,
it enables us to not kick off the sampling timer at all.

v2: Rebase.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c         | 86 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
 2 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 0d9c0d07a432..3272ec0763bf 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -59,6 +59,11 @@ static u64 event_enabled_mask(struct perf_event *event)
        return config_enabled_mask(event->attr.config);
 }
 
+static bool supports_busy_stats(void)
+{
+       return i915.enable_execlists;
+}
+
 static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
 {
        u64 enable = i915->pmu.enable;
@@ -69,6 +74,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
 
        if (!gpu_active)
                enable &= ~ENGINE_SAMPLE_MASK;
+       else if (supports_busy_stats())
+               enable &= ~BIT(I915_SAMPLE_BUSY);
 
        return enable;
 }
@@ -132,7 +139,8 @@ static void engines_sample(struct drm_i915_private *dev_priv)
                if (enable & BIT(I915_SAMPLE_QUEUED))
                        engine->pmu.sample[I915_SAMPLE_QUEUED] += PERIOD;
 
-               if (enable & BIT(I915_SAMPLE_BUSY)) {
+               if ((enable & BIT(I915_SAMPLE_BUSY)) &&
+                   !engine->pmu.busy_stats) {
                        u32 val;
 
                        fw = grab_forcewake(dev_priv, fw);
@@ -349,14 +357,29 @@ static void i915_pmu_timer_cancel(struct perf_event *event)
        hrtimer_cancel(&hwc->hrtimer);
 }
 
+static bool engine_needs_busy_stats(struct intel_engine_cs *engine)
+{
+       return supports_busy_stats() &&
+              (engine->pmu.enable & BIT(I915_SAMPLE_BUSY));
+}
+
 static void i915_pmu_enable(struct perf_event *event)
 {
        struct drm_i915_private *i915 =
                container_of(event->pmu, typeof(*i915), pmu.base);
+       struct intel_engine_cs *engine = NULL;
        unsigned long flags;
 
        spin_lock_irqsave(&i915->pmu.lock, flags);
 
+       if (is_engine_event(event)) {
+               engine = intel_engine_lookup_user(i915,
+                                                 engine_event_class(event),
+                                                 engine_event_instance(event));
+               GEM_BUG_ON(!engine);
+               engine->pmu.enable |= BIT(engine_event_sample(event));
+       }
+
        i915->pmu.enable |= event_enabled_mask(event);
 
        if (pmu_needs_timer(i915, true) && !i915->pmu.timer_enabled) {
@@ -364,16 +387,11 @@ static void i915_pmu_enable(struct perf_event *event)
                                       ns_to_ktime(PERIOD), 0,
                                       HRTIMER_MODE_REL_PINNED);
                i915->pmu.timer_enabled = true;
-       }
-
-       if (is_engine_event(event)) {
-               struct intel_engine_cs *engine;
-
-               engine = intel_engine_lookup_user(i915,
-                                                 engine_event_class(event),
-                                                 engine_event_instance(event));
-               GEM_BUG_ON(!engine);
-               engine->pmu.enable |= BIT(engine_event_sample(event));
+       } else if (is_engine_event(event) && engine_needs_busy_stats(engine) &&
+                  !engine->pmu.busy_stats) {
+               engine->pmu.busy_stats = true;
+               if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+                       queue_work(i915->wq, &engine->pmu.enable_busy_stats);
        }
 
        spin_unlock_irqrestore(&i915->pmu.lock, flags);
@@ -399,10 +417,17 @@ static void i915_pmu_disable(struct perf_event *event)
                                                  engine_event_instance(event));
                GEM_BUG_ON(!engine);
                engine->pmu.enable &= ~BIT(engine_event_sample(event));
+               if (engine->pmu.busy_stats &&
+                   !engine_needs_busy_stats(engine)) {
+                       engine->pmu.busy_stats = false;
+                       queue_delayed_work(i915->wq,
+                                          &engine->pmu.disable_busy_stats,
+                                          round_jiffies_up_relative(2 * HZ));
+               }
                mask = 0;
                for_each_engine(engine, i915, id)
                        mask |= engine->pmu.enable;
-               mask = ~mask;
+               mask = (~mask) & ENGINE_SAMPLE_MASK;
        } else {
                mask = event_enabled_mask(event);
        }
@@ -474,6 +499,9 @@ static void i915_pmu_event_read(struct perf_event *event)
 
                if (WARN_ON_ONCE(!engine)) {
                        /* Do nothing */
+               } else if (sample == I915_SAMPLE_BUSY &&
+                          engine->pmu.busy_stats) {
+                       val = ktime_to_ns(intel_engine_get_busy_time(engine));
                } else {
                        val = engine->pmu.sample[sample];
                }
@@ -634,8 +662,27 @@ static const struct attribute_group *i915_pmu_attr_groups[] = {
         NULL
 };
 
+static void __enable_busy_stats(struct work_struct *work)
+{
+       struct intel_engine_cs *engine =
+               container_of(work, typeof(*engine), pmu.enable_busy_stats);
+
+       WARN_ON_ONCE(intel_enable_engine_stats(engine));
+}
+
+static void __disable_busy_stats(struct work_struct *work)
+{
+       struct intel_engine_cs *engine =
+              container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
+
+       intel_disable_engine_stats(engine);
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+
        if (INTEL_GEN(i915) <= 2)
                return;
 
@@ -651,6 +698,13 @@ void i915_pmu_register(struct drm_i915_private *i915)
 
        spin_lock_init(&i915->pmu.lock);
        hrtimer_init(&i915->pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+       for_each_engine(engine, i915, id) {
+               INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats);
+               INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
+                         __disable_busy_stats);
+       }
+
        i915->pmu.timer.function = i915_sample;
        i915->pmu.enable = 0;
 
@@ -660,6 +714,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
 
 void i915_pmu_unregister(struct drm_i915_private *i915)
 {
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+
        if (!i915->pmu.base.event_init)
                return;
 
@@ -669,4 +726,9 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
        i915->pmu.base.event_init = NULL;
 
        hrtimer_cancel(&i915->pmu.timer);
+
+       for_each_engine(engine, i915, id) {
+               flush_work(&engine->pmu.enable_busy_stats);
+               flush_delayed_work(&engine->pmu.disable_busy_stats);
+       }
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 68f50ec72be6..fd5d838ca7b5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -248,6 +248,9 @@ struct intel_engine_cs {
        struct {
                u32 enable;
                u64 sample[4];
+               bool busy_stats;
+               struct work_struct enable_busy_stats;
+               struct delayed_work disable_busy_stats;
        } pmu;
 
        /*
-- 
2.9.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to