Hi Markus,

On 24/08/2022 20:28, Markus Theil wrote:
If DPDK applications should be used with a minimal set of privileges,
using the msr kernel module on linux should not be necessary.

Since at least kernel 4.4 the rdmsr call to obtain the last non-turbo
boost frequency can be left out, if the sysfs interface is used.
Also RHEL 7 with recent kernel updates should include the sysfs interface
for this (I only looked this up for CentOS 7).

Signed-off-by: Markus Theil <markus.th...@tu-ilmenau.de>
---
  lib/power/power_pstate_cpufreq.c | 69 ++++++++++++++++++--------------
  1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c
index 78c9197695..c3d66a8f68 100644
--- a/lib/power/power_pstate_cpufreq.c
+++ b/lib/power/power_pstate_cpufreq.c
@@ -35,15 +35,9 @@
                "/sys/devices/system/cpu/cpu%u/cpufreq/cpuinfo_min_freq"
  #define POWER_SYSFILE_BASE_FREQ  \
                "/sys/devices/system/cpu/cpu%u/cpufreq/base_frequency"
+#define POWER_SYSFILE_TURBO_PCT  \
+               "/sys/devices/system/cpu/intel_pstate/turbo_pct"
  #define POWER_PSTATE_DRIVER "intel_pstate"
-#define POWER_MSR_PATH  "/dev/cpu/%u/msr"
-
-/*
- * MSR related
- */
-#define PLATFORM_INFO     0x0CE
-#define NON_TURBO_MASK    0xFF00
-#define NON_TURBO_OFFSET  0x8
enum power_state {
@@ -74,37 +68,33 @@ struct pstate_power_info {
  static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
/**
- * It is to read the specific MSR.
+ * It is to read the turbo mode percentage from sysfs
   */
-
  static int32_t
-power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
+power_read_turbo_pct(uint64_t *outVal)
  {
        int fd, ret;
-       char fullpath[PATH_MAX];
+       char val[4] = {0};
- snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
-
-       fd = open(fullpath, O_RDONLY);
+       fd = open(POWER_SYSFILE_TURBO_PCT, O_RDONLY);
if (fd < 0) {
-               RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
+               RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", 
POWER_SYSFILE_TURBO_PCT,
                                 strerror(errno));
                return fd;
        }
- ret = pread(fd, val, sizeof(uint64_t), msr);
+       ret = read(fd, val, sizeof(val));
if (ret < 0) {
-               RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
+               RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", 
POWER_SYSFILE_TURBO_PCT,
                                 strerror(errno));
                goto out;
        }
- POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
-                       fullpath, msr, lcore_id);
+       *outVal = (uint64_t) atol(val);

I'd recommend replacing atol with strtol, it's a safer implementation. It's more commonly found in DPDK code than atol.


-       POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
+       POWER_DEBUG_TRACE("power turbo pct: %"PRIu64"\n", *outVal);
out: close(fd);
        return ret;
@@ -116,8 +106,9 @@ out:        close(fd);
  static int
  power_init_for_setting_freq(struct pstate_power_info *pi)
  {
-       FILE *f_base = NULL, *f_base_max = NULL, *f_min = NULL, *f_max = NULL;
-       uint32_t base_ratio, base_max_ratio;
+       FILE *f_base = NULL, *f_base_min = NULL, *f_base_max = NULL,
+            *f_min = NULL, *f_max = NULL;
+       uint32_t base_ratio, base_min_ratio, base_max_ratio;
        uint64_t max_non_turbo;
        int ret;
@@ -130,6 +121,14 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
                goto err;
        }
+ open_core_sysfs_file(&f_base_min, "r", POWER_SYSFILE_BASE_MIN_FREQ,
+                       pi->lcore_id);
+       if (f_base_min == NULL) {
+               RTE_LOG(ERR, POWER, "failed to open %s\n",
+                               POWER_SYSFILE_BASE_MIN_FREQ);
+               goto err;
+       }
+
        open_core_sysfs_file(&f_min, "rw+", POWER_SYSFILE_MIN_FREQ,
                        pi->lcore_id);
        if (f_min == NULL) {
@@ -158,6 +157,14 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
                goto err;
        }
+ /* read base min ratio */
+       ret = read_core_sysfs_u32(f_base_min, &base_min_ratio);
+       if (ret < 0) {
+               RTE_LOG(ERR, POWER, "Failed to read %s\n",
+                               POWER_SYSFILE_BASE_MIN_FREQ);
+               goto err;
+       }
+
        /* base ratio may not exist */
        if (f_base != NULL) {
                ret = read_core_sysfs_u32(f_base, &base_ratio);
@@ -170,20 +177,22 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
                base_ratio = 0;
        }
- /* Add MSR read to detect turbo status */
-       if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0)
-               goto err;
-       /* no errors after this point */
-
        /* convert ratios to bins */
        base_max_ratio /= BUS_FREQ;
+       base_min_ratio /= BUS_FREQ;
        base_ratio /= BUS_FREQ;
/* assign file handles */
        pi->f_cur_min = f_min;
        pi->f_cur_max = f_max;
- max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
+       /* try to get turbo from global sysfs entry for less privileges than 
from MSR */
+       if (power_read_turbo_pct(&max_non_turbo) < 0)
+               goto err;
+       /* no errors after this point */
+
+       max_non_turbo = base_min_ratio
+                     + (100 - max_non_turbo) * (base_max_ratio - 
base_min_ratio) / 100;
POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo); @@ -220,6 +229,8 @@ power_init_for_setting_freq(struct pstate_power_info *pi)
  err:
        if (f_base != NULL)
                fclose(f_base);
+       if (f_base_min != NULL)
+               fclose(f_base_min);
        if (f_base_max != NULL)
                fclose(f_base_max);
        if (f_min != NULL)

Nice patch.

I've run the patched code and can confirm that the max_non_turbo value agrees with the relevant byte in the 0xCE MSR, so looks good.

Tested-By: David Hunt <david.h...@intel.com>

Also, once the atol issue mentioned above is resolved:

Acked-By: David Hunt <david.h...@intel.com>

Thanks!

Reply via email to