On 11-03-2025 16:47, Nilawar, Badal wrote:

On 07-03-2025 00:41, sk.anir...@intel.com wrote:
From: Sk Anirban <sk.anir...@intel.com>

Refactor power measurement logic to store and compare energy values.
Introduce a threshold check to ensure the GPU enters RC6 properly.

v2:
   - Improved commit message (Badal)

v3:
  - Reorder threshold check (Badal)

Signed-off-by: Sk Anirban <sk.anir...@intel.com>
---
  drivers/gpu/drm/i915/gt/selftest_rc6.c | 59 +++++++++++++++++---------
  1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c
index 908483ab0bc8..5364e50be638 100644
--- a/drivers/gpu/drm/i915/gt/selftest_rc6.c
+++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c
@@ -33,15 +33,20 @@ int live_rc6_manual(void *arg)
  {
      struct intel_gt *gt = arg;
      struct intel_rc6 *rc6 = &gt->rc6;
-    u64 rc0_power, rc6_power;
+    struct intel_rps *rps = &gt->rps;
      intel_wakeref_t wakeref;
+    u64 sleep_time = 1000;
+    u32 rc0_freq = 0;
+    u32 rc6_freq = 0;
+    u64 rc0_power[3];
+    u64 rc6_power[3];
      bool has_power;
+    u64 threshold;
      ktime_t dt;
      u64 res[2];
      int err = 0;
-    u32 rc0_freq = 0;
-    u32 rc6_freq = 0;
-    struct intel_rps *rps = &gt->rps;
+    u64 diff;
+
        /*
       * Our claim is that we can "encourage" the GPU to enter rc6 at will.
@@ -65,9 +70,9 @@ int live_rc6_manual(void *arg)
      res[0] = rc6_residency(rc6);
        dt = ktime_get();
-    rc0_power = librapl_energy_uJ();
-    msleep(1000);
-    rc0_power = librapl_energy_uJ() - rc0_power;
+    rc0_power[0] = librapl_energy_uJ();
+    msleep(sleep_time);
+    rc0_power[1] = librapl_energy_uJ() - rc0_power[0];
      dt = ktime_sub(ktime_get(), dt);
      res[1] = rc6_residency(rc6);
      rc0_freq = intel_rps_read_actual_frequency_fw(rps);
@@ -79,11 +84,12 @@ int live_rc6_manual(void *arg)
      }
        if (has_power) {
-        rc0_power = div64_u64(NSEC_PER_SEC * rc0_power,
-                      ktime_to_ns(dt));
-        if (!rc0_power) {
+        rc0_power[2] = div64_u64(NSEC_PER_SEC * rc0_power[1],
+                     ktime_to_ns(dt));
+
+        if (!rc0_power[2]) {
              if (rc0_freq)
-                pr_debug("No power measured while in RC0! GPU Freq: %u in RC0\n",
+                pr_debug("No power measured while in RC0! GPU Freq: %uMHz in RC0\n",
                       rc0_freq);
              else
                  pr_err("No power and freq measured while in RC0\n");
@@ -98,10 +104,10 @@ int live_rc6_manual(void *arg)
      res[0] = rc6_residency(rc6);
      intel_uncore_forcewake_flush(rc6_to_uncore(rc6), FORCEWAKE_ALL);
      dt = ktime_get();
-    rc6_power = librapl_energy_uJ();
-    msleep(1000);
+    rc6_power[0] = librapl_energy_uJ();
+    msleep(sleep_time);
      rc6_freq = intel_rps_read_actual_frequency_fw(rps);
-    rc6_power = librapl_energy_uJ() - rc6_power;
+    rc6_power[1] = librapl_energy_uJ() - rc6_power[0];
      dt = ktime_sub(ktime_get(), dt);
      res[1] = rc6_residency(rc6);
      if (res[1] == res[0]) {
@@ -113,13 +119,24 @@ int live_rc6_manual(void *arg)
      }
        if (has_power) {
-        rc6_power = div64_u64(NSEC_PER_SEC * rc6_power,
-                      ktime_to_ns(dt));
-        pr_info("GPU consumed %llduW in RC0 and %llduW in RC6\n",
-            rc0_power, rc6_power);
-        if (2 * rc6_power > rc0_power) {
-            pr_err("GPU leaked energy while in RC6! GPU Freq: %u in RC6 and %u in RC0\n",
-                   rc6_freq, rc0_freq);
+        rc6_power[2] = div64_u64(NSEC_PER_SEC * rc6_power[1],
+                     ktime_to_ns(dt));
+        pr_info("GPU consumed %lluuW in RC0 and %lluuW in RC6\n",
+            rc0_power[2], rc6_power[2]);
+
+        if (2 * rc6_power[2] > rc0_power[2]) {
+            pr_err("GPU leaked energy while in RC6!\n"
+                   "GPU Freq: %uMHz in RC6 and %uMHz in RC0\n"
+                   "RC0 energy before & after sleep respectively: %lluuJ %lluuJ\n"
+                   "RC6 energy before & after sleep respectively: %lluuJ %lluuJ\n",
+                   rc6_freq, rc0_freq, rc0_power[0], rc0_power[1],
+                   rc6_power[0], rc6_power[1]);
+
+            diff = res[1] - res[0];
+            threshold = (9 * NSEC_PER_MSEC * sleep_time) / 10;
+            if (diff < threshold)
+                pr_err("Did not enter RC6 properly, RC6 start residency=%lluns, RC6 end residency=%lluns\n",
+                       res[0], res[1]);

Please check whether the reported BAT failures are related to this patch. Similar errors were seen with other selftest-related patches too.
Otherwise this looks good to me.

Reviewed-by: Badal Nilawar <badal.nila...@intel.com>

Regards,
Badal
The BAT failures were not related to these changes and have been re-reported.

Thanks,
Anirban

              err = -EINVAL;
              goto out_unlock;
          }

Reply via email to