> On 09-Oct-20 5:38 PM, Ananyev, Konstantin wrote: > >> Add a simple on/off switch that will enable saving power when no > >> packets are arriving. It is based on counting the number of empty > >> polls and, when the number reaches a certain threshold, entering an > >> architecture-defined optimized power state that will either wait > >> until a TSC timestamp expires, or until packets arrive. > >> > >> This API supports the 1 port to multiple cores use case. > >> > >> This design leverages the RX Callback mechanism, which allows three > >> different power management methodologies to coexist. > >> > >> 1. umwait/umonitor: > >> > >> The TSC timestamp is automatically calculated using current > >> link speed and RX descriptor ring size, such that the sleep > >> time is not longer than it would take for a NIC to fill its > >> entire RX descriptor ring. > >> > >> 2. Pause instruction > >> > >> Instead of moving the core into a deeper C state, this lightweight > >> method uses the Pause instruction to relieve the processor from > >> busy polling. > >> > >> 3. Frequency Scaling > >> Reuse the existing rte power library to scale up/down the core frequency > >> depending on traffic volume.
> >> > >> Signed-off-by: Liang Ma <liang.j...@intel.com> > >> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > >> --- > >> lib/librte_power/meson.build | 5 +- > >> lib/librte_power/pmd_mgmt.h | 49 ++++++ > >> lib/librte_power/rte_power_pmd_mgmt.c | 208 +++++++++++++++++++++++++ > >> lib/librte_power/rte_power_pmd_mgmt.h | 88 +++++++++++ > >> lib/librte_power/rte_power_version.map | 4 + > >> 5 files changed, 352 insertions(+), 2 deletions(-) > >> create mode 100644 lib/librte_power/pmd_mgmt.h > >> create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c > >> create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h > >> > >> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build > >> index 78c031c943..cc3c7a8646 100644 > >> --- a/lib/librte_power/meson.build > >> +++ b/lib/librte_power/meson.build > >> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c', > >> 'power_kvm_vm.c', 'guest_channel.c', > >> 'rte_power_empty_poll.c', > >> 'power_pstate_cpufreq.c', > >> +'rte_power_pmd_mgmt.c', > >> 'power_common.c') > >> -headers = files('rte_power.h','rte_power_empty_poll.h') > >> -deps += ['timer'] > >> +headers = > >> files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h') > >> +deps += ['timer' ,'ethdev'] > >> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h > >> new file mode 100644 > >> index 0000000000..756fbe20f7 > >> --- /dev/null > >> +++ b/lib/librte_power/pmd_mgmt.h > >> @@ -0,0 +1,49 @@ > >> +/* SPDX-License-Identifier: BSD-3-Clause > >> + * Copyright(c) 2010-2020 Intel Corporation > >> + */ > >> + > >> +#ifndef _PMD_MGMT_H > >> +#define _PMD_MGMT_H > >> + > >> +/** > >> + * @file > >> + * Power Management > >> + */ > >> + > >> +#ifdef __cplusplus > >> +extern "C" { > >> +#endif > >> + > >> +/** > >> + * Possible power management states of an ethdev port. > >> + */ > >> +enum pmd_mgmt_state { > >> +/** Device power management is disabled. 
*/ > >> +PMD_MGMT_DISABLED = 0, > >> +/** Device power management is enabled. */ > >> +PMD_MGMT_ENABLED, > >> +}; > >> + > >> +struct pmd_queue_cfg { > >> +enum pmd_mgmt_state pwr_mgmt_state; > >> +/**< Power mgmt Callback mode */ > >> +enum rte_power_pmd_mgmt_type cb_mode; > >> +/**< Empty poll number */ > >> +uint16_t empty_poll_stats; > >> +/**< Callback instance */ > >> +const struct rte_eth_rxtx_callback *cur_cb; > >> +} __rte_cache_aligned; > >> + > >> +struct pmd_port_cfg { > >> +int ref_cnt; > >> +struct pmd_queue_cfg *queue_cfg; > >> +} __rte_cache_aligned; > >> + > >> + > >> + > >> + > >> +#ifdef __cplusplus > >> +} > >> +#endif > >> + > >> +#endif > >> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c > >> b/lib/librte_power/rte_power_pmd_mgmt.c > >> new file mode 100644 > >> index 0000000000..35d2af46a4 > >> --- /dev/null > >> +++ b/lib/librte_power/rte_power_pmd_mgmt.c > >> @@ -0,0 +1,208 @@ > >> +/* SPDX-License-Identifier: BSD-3-Clause > >> + * Copyright(c) 2010-2020 Intel Corporation > >> + */ > >> + > >> +#include <rte_lcore.h> > >> +#include <rte_cycles.h> > >> +#include <rte_malloc.h> > >> +#include <rte_ethdev.h> > >> +#include <rte_power_intrinsics.h> > >> + > >> +#include "rte_power_pmd_mgmt.h" > >> +#include "pmd_mgmt.h" > >> + > >> + > >> +#define EMPTYPOLL_MAX 512 > >> +#define PAUSE_NUM 64 > >> + > >> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS]; > >> + > >> +static uint16_t > >> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx, > >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > >> +uint16_t max_pkts __rte_unused, void *_ __rte_unused) > >> +{ > >> + > >> +struct pmd_queue_cfg *q_conf; > >> +q_conf = &port_cfg[port_id].queue_cfg[qidx]; > >> + > >> +if (unlikely(nb_rx == 0)) { > >> +q_conf->empty_poll_stats++; > >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { > > > > > > Here and in other places - wouldn't it be better to empty_poll_max as > > configurable > > parameter, instead of constant value? 
> > It would be more flexible, but i don't think it's "better" in the sense > that providing additional options will only make using this (already > under-utilized!) API harder than it needs to be. > > > > >> +volatile void *target_addr; > >> +uint64_t expected, mask; > >> +uint16_t ret; > >> + > >> +/* > >> + * get address of next descriptor in the RX > >> + * ring for this queue, as well as expected > >> + * value and a mask. > >> + */ > >> +ret = rte_eth_get_wake_addr(port_id, qidx, > >> + &target_addr, &expected, > >> + &mask); > >> +if (ret == 0) > >> +/* -1ULL is maximum value for TSC */ > >> +rte_power_monitor(target_addr, > >> + expected, mask, > >> + 0, -1ULL); > > > > > > Why not make timeout a user specified parameter? > > This is meant to be an "easy to use" API, we were trying to keep the > amount of configuration to an absolute minimum. If the user wants to use > custom timeouts, they can do so with using rte_power_monitor API explicitly. > > > > >> +} > >> +} else > >> +q_conf->empty_poll_stats = 0; > >> + > >> +return nb_rx; > >> +} > >> + > >> +static uint16_t > >> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx, > >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > >> +uint16_t max_pkts __rte_unused, void *_ __rte_unused) > >> +{ > >> +struct pmd_queue_cfg *q_conf; > >> +int i; > >> +q_conf = &port_cfg[port_id].queue_cfg[qidx]; > >> + > >> +if (unlikely(nb_rx == 0)) { > >> +q_conf->empty_poll_stats++; > >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { > >> +for (i = 0; i < PAUSE_NUM; i++) > >> +rte_pause(); > > > > Just rte_delay_us(timeout) instead of this loop? > > Yes, seems better, thanks. 
> > > > >> +} > >> +} else > >> +q_conf->empty_poll_stats = 0; > >> + > >> +return nb_rx; > >> +} > >> + > >> +static uint16_t > >> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx, > >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > >> +uint16_t max_pkts __rte_unused, void *_ __rte_unused) > >> +{ > >> +struct pmd_queue_cfg *q_conf; > >> +q_conf = &port_cfg[port_id].queue_cfg[qidx]; > >> + > >> +if (unlikely(nb_rx == 0)) { > >> +q_conf->empty_poll_stats++; > >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { > >> +/*scale down freq */ > >> +rte_power_freq_min(rte_lcore_id()); > >> + > >> +} > >> +} else { > >> +q_conf->empty_poll_stats = 0; > >> +/* scal up freq */ > >> +rte_power_freq_max(rte_lcore_id()); > >> +} > >> + > >> +return nb_rx; > >> +} > >> + > > > > Probably worth to mention in comments that these functions enable/disable > > are not MT safe. > > Will do in v6. > > > > >> +int > >> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id, > >> +uint16_t port_id, > >> +uint16_t queue_id, > >> +enum rte_power_pmd_mgmt_type mode) > >> +{ > >> +struct rte_eth_dev *dev; > >> +struct pmd_queue_cfg *queue_cfg; > >> +int ret = 0; > >> + > >> +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); > >> +dev = &rte_eth_devices[port_id]; > >> + > >> +if (port_cfg[port_id].queue_cfg == NULL) { > >> +port_cfg[port_id].ref_cnt = 0; > >> +/* allocate memory for empty poll stats */ > >> +port_cfg[port_id].queue_cfg = rte_malloc_socket(NULL, > >> +sizeof(struct pmd_queue_cfg) > >> +* RTE_MAX_QUEUES_PER_PORT, > >> +0, dev->data->numa_node); > >> +if (port_cfg[port_id].queue_cfg == NULL) > >> +return -ENOMEM; > >> +} > >> + > >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id]; > >> + > >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) { > >> +ret = -EINVAL; > >> +goto failure_handler; > >> +} > >> + > >> +switch (mode) { > >> +case RTE_POWER_MGMT_TYPE_WAIT: > >> +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) { > >> +ret = -ENOTSUP; 
> >> +goto failure_handler; > >> +} > >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, > >> +rte_power_mgmt_umwait, NULL); > >> +break; > >> +case RTE_POWER_MGMT_TYPE_SCALE: > >> +/* init scale freq */ > >> +if (rte_power_init(lcore_id)) { > >> +ret = -EINVAL; > >> +goto failure_handler; > >> +} > >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, > >> +rte_power_mgmt_scalefreq, NULL); > >> +break; > >> +case RTE_POWER_MGMT_TYPE_PAUSE: > >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, > >> +rte_power_mgmt_pause, NULL); > >> +break; > > > > default: > > .... > > Will add in v6. > > > > >> +} > >> +queue_cfg->cb_mode = mode; > >> +port_cfg[port_id].ref_cnt++; > >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED; > >> +return ret; > >> + > >> +failure_handler: > >> +if (port_cfg[port_id].ref_cnt == 0) { > >> +rte_free(port_cfg[port_id].queue_cfg); > >> +port_cfg[port_id].queue_cfg = NULL; > >> +} > >> +return ret; > >> +} > >> + > >> +int > >> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id, > >> +uint16_t port_id, > >> +uint16_t queue_id) > >> +{ > >> +struct pmd_queue_cfg *queue_cfg; > >> + > >> +if (port_cfg[port_id].ref_cnt <= 0) > >> +return -EINVAL; > >> + > >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id]; > >> + > >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) > >> +return -EINVAL; > >> + > >> +switch (queue_cfg->cb_mode) { > >> +case RTE_POWER_MGMT_TYPE_WAIT: > > > > Think we need wakeup(lcore_id) here. > > Not sure what you mean? Could you please elaborate? > > > > >> +case RTE_POWER_MGMT_TYPE_PAUSE: > >> +rte_eth_remove_rx_callback(port_id, queue_id, > >> + queue_cfg->cur_cb); > >> +break; > >> +case RTE_POWER_MGMT_TYPE_SCALE: > >> +rte_power_freq_max(lcore_id); > >> +rte_eth_remove_rx_callback(port_id, queue_id, > >> + queue_cfg->cur_cb); > >> +rte_power_exit(lcore_id); > >> +break; > >> +} > >> +/* it's not recommend to free callback instance here. 
> >> + * it cause memory leak which is a known issue. > >> + */ > >> +queue_cfg->cur_cb = NULL; > >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED; > >> +port_cfg[port_id].ref_cnt--; > >> + > >> +if (port_cfg[port_id].ref_cnt == 0) { > >> +rte_free(port_cfg[port_id].queue_cfg); > > > > It is not safe to do so, unless device is already stopped. > > Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu > > online/offline, or...) > > Not sure what you mean. We're not freeing the callback structure, we're > freeing the local data structure holding the per-port status.
What is the difference? You are still trying to free memory that might be in use by your DP thread, which could still be executing the callback. > > > > >> +port_cfg[port_id].queue_cfg = NULL; > >> +} > >> +return 0; > >> +} > > > -- > Thanks, > Anatoly