> Add a simple on/off switch that will enable saving power when no
> packets are arriving. It is based on counting the number of empty
> polls and, when the number reaches a certain threshold, entering an
> architecture-defined optimized power state that will either wait
> until a TSC timestamp expires, or until packets arrive.
>
> This API supports the one port to multiple cores use case.
>
> This design leverages the RX callback mechanism, which allows three
> different power management methodologies to coexist:
>
> 1. umwait/umonitor:
>
>    The TSC timestamp is automatically calculated using current
>    link speed and RX descriptor ring size, such that the sleep
>    time is not longer than it would take for a NIC to fill its
>    entire RX descriptor ring.
>
> 2. Pause instruction:
>
>    Instead of moving the core into a deeper C-state, this
>    lightweight method uses the Pause instruction to relieve the
>    processor from busy polling.
>
> 3. Frequency scaling:
>
>    Reuses the existing rte_power library to scale the core
>    frequency up/down depending on traffic volume.
>
> Signed-off-by: Liang Ma <liang.j...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---
>  lib/librte_power/meson.build           |   5 +-
>  lib/librte_power/pmd_mgmt.h            |  49 ++++++
>  lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
>  lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
>  lib/librte_power/rte_power_version.map |   4 +
>  5 files changed, 352 insertions(+), 2 deletions(-)
>  create mode 100644 lib/librte_power/pmd_mgmt.h
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
>
> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> index 78c031c943..cc3c7a8646 100644
> --- a/lib/librte_power/meson.build
> +++ b/lib/librte_power/meson.build
> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>  		'power_kvm_vm.c', 'guest_channel.c',
>  		'rte_power_empty_poll.c',
>  		'power_pstate_cpufreq.c',
> +		'rte_power_pmd_mgmt.c',
>  		'power_common.c')
> -headers = files('rte_power.h','rte_power_empty_poll.h')
> -deps += ['timer']
> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
> +deps += ['timer', 'ethdev']
> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
> new file mode 100644
> index 0000000000..756fbe20f7
> --- /dev/null
> +++ b/lib/librte_power/pmd_mgmt.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#ifndef _PMD_MGMT_H
> +#define _PMD_MGMT_H
> +
> +/**
> + * @file
> + * Power Management
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Possible power management states of an ethdev port.
> + */
> +enum pmd_mgmt_state {
> +	/** Device power management is disabled. */
> +	PMD_MGMT_DISABLED = 0,
> +	/** Device power management is enabled. */
> +	PMD_MGMT_ENABLED,
> +};
> +
> +struct pmd_queue_cfg {
> +	enum pmd_mgmt_state pwr_mgmt_state;
> +	/**< Power mgmt callback mode */
> +	enum rte_power_pmd_mgmt_type cb_mode;
> +	/**< Empty poll number */
> +	uint16_t empty_poll_stats;
> +	/**< Callback instance */
> +	const struct rte_eth_rxtx_callback *cur_cb;
> +} __rte_cache_aligned;
> +
> +struct pmd_port_cfg {
> +	int ref_cnt;
> +	struct pmd_queue_cfg *queue_cfg;
> +} __rte_cache_aligned;
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
> new file mode 100644
> index 0000000000..35d2af46a4
> --- /dev/null
> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> @@ -0,0 +1,208 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +#include <rte_malloc.h>
> +#include <rte_ethdev.h>
> +#include <rte_power_intrinsics.h>
> +
> +#include "rte_power_pmd_mgmt.h"
> +#include "pmd_mgmt.h"
> +
> +#define EMPTYPOLL_MAX 512
> +#define PAUSE_NUM 64
> +
> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
> +
> +static uint16_t
> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
Here and in other places - wouldn't it be better to make empty_poll_max a
configurable parameter instead of a hard-coded constant?

> +			volatile void *target_addr;
> +			uint64_t expected, mask;
> +			uint16_t ret;
> +
> +			/*
> +			 * get address of next descriptor in the RX
> +			 * ring for this queue, as well as expected
> +			 * value and a mask.
> +			 */
> +			ret = rte_eth_get_wake_addr(port_id, qidx,
> +					&target_addr, &expected,
> +					&mask);
> +			if (ret == 0)
> +				/* -1ULL is maximum value for TSC */
> +				rte_power_monitor(target_addr,
> +						expected, mask,
> +						0, -1ULL);

Why not make the timeout a user-specified parameter?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +	int i;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			for (i = 0; i < PAUSE_NUM; i++)
> +				rte_pause();

Just rte_delay_us(timeout) instead of this loop?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			/* scale down freq */
> +			rte_power_freq_min(rte_lcore_id());
> +		}
> +	} else {
> +		q_conf->empty_poll_stats = 0;
> +		/* scale up freq */
> +		rte_power_freq_max(rte_lcore_id());
> +	}
> +
> +	return nb_rx;
> +}
> +

Probably worth mentioning in the comments that these enable/disable
functions are not MT-safe.
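
To illustrate the configurable-parameter suggestion above: something along
these lines would cover both the empty-poll threshold and the monitor
timeout (a sketch only; the setter names and defaults are made up and not
part of this patch):

#include <stdint.h>

/* Hypothetical sketch -- replaces the hard-coded EMPTYPOLL_MAX and the
 * -1ULL monitor timeout with values the application can set before the
 * callbacks are enabled.
 */
static uint16_t empty_poll_max = 512;	/* keeps EMPTYPOLL_MAX as the default */
static uint64_t monitor_tsc_max = UINT64_MAX;	/* keeps -1ULL as the default */

void
rte_power_pmd_mgmt_set_emptypoll_max(uint16_t max)
{
	empty_poll_max = max;
}

void
rte_power_pmd_mgmt_set_monitor_timeout(uint64_t tsc_cycles)
{
	monitor_tsc_max = tsc_cycles;
}

The callbacks would then compare empty_poll_stats against empty_poll_max
instead of EMPTYPOLL_MAX, and pass monitor_tsc_max as the last argument to
rte_power_monitor().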
> +int
> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id,
> +				enum rte_power_pmd_mgmt_type mode)
> +{
> +	struct rte_eth_dev *dev;
> +	struct pmd_queue_cfg *queue_cfg;
> +	int ret = 0;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +	dev = &rte_eth_devices[port_id];
> +
> +	if (port_cfg[port_id].queue_cfg == NULL) {
> +		port_cfg[port_id].ref_cnt = 0;
> +		/* allocate memory for empty poll stats */
> +		port_cfg[port_id].queue_cfg = rte_malloc_socket(NULL,
> +				sizeof(struct pmd_queue_cfg)
> +				* RTE_MAX_QUEUES_PER_PORT,
> +				0, dev->data->numa_node);
> +		if (port_cfg[port_id].queue_cfg == NULL)
> +			return -ENOMEM;
> +	}
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> +		ret = -EINVAL;
> +		goto failure_handler;
> +	}
> +
> +	switch (mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:
> +		if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
> +			ret = -ENOTSUP;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_umwait, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		/* init scale freq */
> +		if (rte_power_init(lcore_id)) {
> +			ret = -EINVAL;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_scalefreq, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_pause, NULL);
> +		break;

A default: case seems to be missing here.

> +	}
> +	queue_cfg->cb_mode = mode;
> +	port_cfg[port_id].ref_cnt++;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> +	return ret;
> +
> +failure_handler:
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);
> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return ret;
> +}
> +
> +int
> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id)
> +{
> +	struct pmd_queue_cfg *queue_cfg;
> +
> +	if (port_cfg[port_id].ref_cnt <= 0)
> +		return -EINVAL;
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
> +		return -EINVAL;
> +
> +	switch (queue_cfg->cb_mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:

I think we need a wakeup(lcore_id) here.

> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +				queue_cfg->cur_cb);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		rte_power_freq_max(lcore_id);
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +				queue_cfg->cur_cb);
> +		rte_power_exit(lcore_id);
> +		break;
> +	}
> +	/* it's not recommended to free the callback instance here.
> +	 * it causes a memory leak, which is a known issue.
> +	 */
> +	queue_cfg->cur_cb = NULL;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> +	port_cfg[port_id].ref_cnt--;
> +
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);

It is not safe to do so unless the device is already stopped. Otherwise you
need some sync mechanism here (hand-made as in the bpf lib, or RCU
online/offline, or ...).

> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return 0;
> +}
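
For context, my reading of the intended application usage is roughly the
following (a sketch only, not part of the patch; error handling is trimmed,
and the port is stopped before disable so that freeing queue_cfg is safe,
per the comment above):

#include <stdbool.h>

#include <rte_ethdev.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include "rte_power_pmd_mgmt.h"

static volatile bool stop;	/* application-defined termination flag */

static void
poll_with_power_mgmt(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *bufs[32];

	/* enable only after the port has been configured and started */
	if (rte_power_pmd_mgmt_queue_enable(rte_lcore_id(), port_id,
			queue_id, RTE_POWER_MGMT_TYPE_PAUSE) < 0)
		return;

	while (!stop) {
		/* the callback installed above runs after every burst and
		 * applies the chosen policy once enough empty polls accumulate
		 */
		uint16_t i, nb = rte_eth_rx_burst(port_id, queue_id, bufs, 32);

		for (i = 0; i < nb; i++)
			rte_pktmbuf_free(bufs[i]);	/* or process the packet */
	}

	/* stop the port first: disable may free per-port state (see above) */
	rte_eth_dev_stop(port_id);
	rte_power_pmd_mgmt_queue_disable(rte_lcore_id(), port_id, queue_id);
}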