Hi Huisong, please see my comments inline.
Thanks. On 2024/8/9 17:50, Huisong Li wrote: > The deeper the idle state, the lower the power consumption, but the longer > the resume time. Some service are delay sensitive and very except the low > resume time, like interrupt packet receiving mode. > > And the "/sys/devices/system/cpu/cpuX/power/pm_qos_resume_latency_us" sysfs > interface is used to set and get the resume latency limit on the cpuX for > userspace. Each cpuidle governor in Linux select which idle state to enter > based on this CPU resume latency in their idle task. > > The per-CPU PM QoS API can be used to control this CPU's idle state > selection and limit just enter the shallowest idle state to low the delay > after sleep by setting strict resume latency (zero value). > > Signed-off-by: Huisong Li <lihuis...@huawei.com> > Acked-by: Morten Brørup <m...@smartsharesystems.com> > --- ... > diff --git a/lib/power/rte_power_qos.c b/lib/power/rte_power_qos.c > new file mode 100644 > index 0000000000..375746f832 > --- /dev/null > +++ b/lib/power/rte_power_qos.c > @@ -0,0 +1,114 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2024 HiSilicon Limited > + */ > + > +#include <errno.h> > +#include <stdlib.h> > +#include <string.h> > + > +#include <rte_lcore.h> > +#include <rte_log.h> > + > +#include "power_common.h" > +#include "rte_power_qos.h" > + > +#define PM_QOS_SYSFILE_RESUME_LATENCY_US \ > + "/sys/devices/system/cpu/cpu%u/power/pm_qos_resume_latency_us" > + > +int > +rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency) > +{ > + char buf[LINE_MAX]; [Review] There is no need for LINE_MAX here; buf[32] would be enough. 
> + FILE *f; > + int ret; > + > + if (!rte_lcore_is_enabled(lcore_id)) { > + POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id); > + return -EINVAL; > + } > + > + if (latency < 0) { > + POWER_LOG(ERR, "latency should be greater than and equal to 0"); > + return -EINVAL; > + } > + > + ret = open_core_sysfs_file(&f, "w", PM_QOS_SYSFILE_RESUME_LATENCY_US, > lcore_id); > + if (ret != 0) { > + POWER_LOG(ERR, "Failed to open > "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id); > + return ret; > + } > + > + /* > + * Based on the sysfs interface pm_qos_resume_latency_us under > + * @PM_QOS_SYSFILE_RESUME_LATENCY_US directory in kernel, their meanning [Review] Typo: "meanning" -> "meaning". > + * is as follows for different input string. > + * 1> the resume latency is 0 if the input is "n/a". > + * 2> the resume latency is no constraint if the input is "0". > + * 3> the resume latency is the actual value to be set. > + */ > + if (latency == 0) > + snprintf(buf, sizeof(buf), "%s", "n/a"); > + else if (latency == RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT) > + snprintf(buf, sizeof(buf), "%u", 0); > + else > + snprintf(buf, sizeof(buf), "%u", latency); > + > + ret = write_core_sysfs_s(f, buf); > + if (ret != 0) { > + POWER_LOG(ERR, "Failed to write > "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id); > + goto out; [Review] There is no need for the goto here. > + } > + > +out: > + if (f != NULL) > + fclose(f); [Review] Just call fclose(f), because f is valid here. 
> + > + return ret; > +} > + > +int > +rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id) > +{ > + char buf[LINE_MAX]; > + int latency = -1; > + FILE *f; > + int ret; > + > + if (!rte_lcore_is_enabled(lcore_id)) { > + POWER_LOG(ERR, "lcore id %u is not enabled", lcore_id); > + return -EINVAL; > + } > + > + ret = open_core_sysfs_file(&f, "r", PM_QOS_SYSFILE_RESUME_LATENCY_US, > lcore_id); > + if (ret != 0) { > + POWER_LOG(ERR, "Failed to open > "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id); > + return ret; > + } > + > + ret = read_core_sysfs_s(f, buf, sizeof(buf)); > + if (ret != 0) { > + POWER_LOG(ERR, "Failed to read > "PM_QOS_SYSFILE_RESUME_LATENCY_US, lcore_id); > + goto out; > + } > + > + /* > + * Based on the sysfs interface pm_qos_resume_latency_us under > + * @PM_QOS_SYSFILE_RESUME_LATENCY_US directory in kernel, their meanning [Review] Typo: "meanning" -> "meaning". > + * is as follows for different output string. > + * 1> the resume latency is 0 if the output is "n/a". > + * 2> the resume latency is no constraint if the output is "0". > + * 3> the resume latency is the actual value in used for other string. > + */ > + if (strcmp(buf, "n/a") == 0) > + latency = 0; > + else { > + latency = strtoul(buf, NULL, 10); > + latency = latency == 0 ? > RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT : latency; > + } > + > +out: > + if (f != NULL) > + fclose(f); [Review] Just call fclose(f), because f is valid here. > + > + return latency != -1 ? latency : ret; > +} > diff --git a/lib/power/rte_power_qos.h b/lib/power/rte_power_qos.h > new file mode 100644 > index 0000000000..990c488373 > --- /dev/null > +++ b/lib/power/rte_power_qos.h > @@ -0,0 +1,73 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2024 HiSilicon Limited > + */ > + > +#ifndef RTE_POWER_QOS_H > +#define RTE_POWER_QOS_H > + > +#include <stdint.h> > + > +#include <rte_compat.h> > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +/** > + * @file rte_power_qos.h > + * > + * PM QoS API. 
> + * > + * The CPU-wide resume latency limit has a positive impact on this CPU's idle > + * state selection in each cpuidle governor. > + * Please see the PM QoS on CPU wide in the following link: > + * > https://www.kernel.org/doc/html/latest/admin-guide/abi-testing.html?highlight=pm_qos_resume_latency_us#abi-sys-devices-power-pm-qos-resume-latency-us > + * > + * The deeper the idle state, the lower the power consumption, but the > + * longer the resume time. Some service are delay sensitive and very except > the > + * low resume time, like interrupt packet receiving mode. > + * > + * In these case, per-CPU PM QoS API can be used to control this CPU's idle > + * state selection and limit just enter the shallowest idle state to low the > + * delay after sleep by setting strict resume latency (zero value). > + */ > + > +#define RTE_POWER_QOS_STRICT_LATENCY_VALUE 0 > +#define RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT ((int)(UINT32_MAX >> > 1)) > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice. > + * > + * @param lcore_id > + * target logical core id > + * > + * @param latency > + * The latency should be greater than and equal to zero in microseconds > unit. > + * > + * @return > + * 0 on success. Otherwise negative value is returned. > + */ > +__rte_experimental > +int rte_power_qos_set_cpu_resume_latency(uint16_t lcore_id, int latency); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change without prior notice. > + * > + * Get the current resume latency of this logical core. > + * The default value in kernel is @see > RTE_POWER_QOS_RESUME_LATENCY_NO_CONSTRAINT > + * if don't set it. > + * > + * @return > + * Negative value on failure. > + * >= 0 means the actual resume latency limit on this core. 
> + */ > +__rte_experimental > +int rte_power_qos_get_cpu_resume_latency(uint16_t lcore_id); > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif /* RTE_POWER_QOS_H */ > diff --git a/lib/power/version.map b/lib/power/version.map > index c9a226614e..4e4955a4cf 100644 > --- a/lib/power/version.map > +++ b/lib/power/version.map > @@ -51,4 +51,8 @@ EXPERIMENTAL { > rte_power_set_uncore_env; > rte_power_uncore_freqs; > rte_power_unset_uncore_env; > + > + # added in 24.11 > + rte_power_qos_set_cpu_resume_latency; > + rte_power_qos_get_cpu_resume_latency; [Review] Please keep these symbols in alphabetical order. Another question: I think renaming "cpu" to "core" in these function names may be more accurate. Although sysfs exports this attribute under "cpu", in DPDK it refers to a core, and the rte_power library already has some rte_power_core_xxx names, so I think it is better to stay consistent with them. > }; >