Add a simple on/off switch that will enable saving power when no packets are arriving. It is based on counting the number of empty polls and, when the number reaches a certain threshold, entering an architecture-defined optimized power state that will either wait until a TSC timestamp expires, or when packets arrive.
This API support 1 port to multiple core use case. This design leverage RX Callback mechnaism which allow three different power management methodology co exist. 1. umwait/umonitor: The TSC timestamp is automatically calculated using current link speed and RX descriptor ring size, such that the sleep time is not longer than it would take for a NIC to fill its entire RX descriptor ring. 2. Pause instruction Instead of move the core into deeper C state, this lightweight method use Pause instruction to relief the processor from busy polling. 3. Frequency Scaling Reuse exist rte power library to scale up/down core frequency depend on traffic volume. Signed-off-by: Liang Ma <liang.j...@intel.com> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> --- lib/librte_power/meson.build | 5 +- lib/librte_power/pmd_mgmt.h | 49 ++++++ lib/librte_power/rte_power_pmd_mgmt.c | 208 +++++++++++++++++++++++++ lib/librte_power/rte_power_pmd_mgmt.h | 88 +++++++++++ lib/librte_power/rte_power_version.map | 4 + 5 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 lib/librte_power/pmd_mgmt.h create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build index 78c031c943..cc3c7a8646 100644 --- a/lib/librte_power/meson.build +++ b/lib/librte_power/meson.build @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c', 'power_kvm_vm.c', 'guest_channel.c', 'rte_power_empty_poll.c', 'power_pstate_cpufreq.c', + 'rte_power_pmd_mgmt.c', 'power_common.c') -headers = files('rte_power.h','rte_power_empty_poll.h') -deps += ['timer'] +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h') +deps += ['timer' ,'ethdev'] diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h new file mode 100644 index 0000000000..756fbe20f7 --- /dev/null +++ b/lib/librte_power/pmd_mgmt.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#ifndef _PMD_MGMT_H +#define _PMD_MGMT_H + +/** + * @file + * Power Management + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Possible power management states of an ethdev port. + */ +enum pmd_mgmt_state { + /** Device power management is disabled. */ + PMD_MGMT_DISABLED = 0, + /** Device power management is enabled. */ + PMD_MGMT_ENABLED, +}; + +struct pmd_queue_cfg { + enum pmd_mgmt_state pwr_mgmt_state; + /**< Power mgmt Callback mode */ + enum rte_power_pmd_mgmt_type cb_mode; + /**< Empty poll number */ + uint16_t empty_poll_stats; + /**< Callback instance */ + const struct rte_eth_rxtx_callback *cur_cb; +} __rte_cache_aligned; + +struct pmd_port_cfg { + int ref_cnt; + struct pmd_queue_cfg *queue_cfg; +} __rte_cache_aligned; + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c new file mode 100644 index 0000000000..35d2af46a4 --- /dev/null +++ b/lib/librte_power/rte_power_pmd_mgmt.c @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#include <rte_lcore.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_ethdev.h> +#include <rte_power_intrinsics.h> + +#include "rte_power_pmd_mgmt.h" +#include "pmd_mgmt.h" + + +#define EMPTYPOLL_MAX 512 +#define PAUSE_NUM 64 + +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS]; + +static uint16_t +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *_ __rte_unused) +{ + + struct pmd_queue_cfg *q_conf; + q_conf = &port_cfg[port_id].queue_cfg[qidx]; + + if (unlikely(nb_rx == 0)) { + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + volatile void *target_addr; + uint64_t expected, mask; + uint16_t ret; + + /* + * get address of next descriptor in the RX + * ring for this queue, as well as expected + * value and a mask. + */ + ret = rte_eth_get_wake_addr(port_id, qidx, + &target_addr, &expected, + &mask); + if (ret == 0) + /* -1ULL is maximum value for TSC */ + rte_power_monitor(target_addr, + expected, mask, + 0, -1ULL); + } + } else + q_conf->empty_poll_stats = 0; + + return nb_rx; +} + +static uint16_t +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *_ __rte_unused) +{ + struct pmd_queue_cfg *q_conf; + int i; + q_conf = &port_cfg[port_id].queue_cfg[qidx]; + + if (unlikely(nb_rx == 0)) { + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + for (i = 0; i < PAUSE_NUM; i++) + rte_pause(); + } + } else + q_conf->empty_poll_stats = 0; + + return nb_rx; +} + +static uint16_t +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *_ __rte_unused) +{ + struct pmd_queue_cfg *q_conf; + q_conf = &port_cfg[port_id].queue_cfg[qidx]; + + if (unlikely(nb_rx == 0)) { + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + /*scale down freq */ + rte_power_freq_min(rte_lcore_id()); + + } + } else { + q_conf->empty_poll_stats = 0; + /* scal up freq */ + rte_power_freq_max(rte_lcore_id()); + } + + return nb_rx; +} + +int +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id, + uint16_t port_id, + uint16_t queue_id, + enum rte_power_pmd_mgmt_type mode) +{ + struct rte_eth_dev *dev; + struct pmd_queue_cfg *queue_cfg; + int ret = 0; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); + dev = &rte_eth_devices[port_id]; + + if (port_cfg[port_id].queue_cfg == NULL) { + port_cfg[port_id].ref_cnt = 0; + /* allocate memory for empty poll stats */ + port_cfg[port_id].queue_cfg = rte_malloc_socket(NULL, + sizeof(struct pmd_queue_cfg) + * RTE_MAX_QUEUES_PER_PORT, + 0, dev->data->numa_node); + if (port_cfg[port_id].queue_cfg == NULL) + return -ENOMEM; + } + + queue_cfg = &port_cfg[port_id].queue_cfg[queue_id]; + + if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) { + ret = -EINVAL; + goto failure_handler; + } + + switch (mode) { + case RTE_POWER_MGMT_TYPE_WAIT: + if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) { + ret = -ENOTSUP; + goto failure_handler; + } + queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, + rte_power_mgmt_umwait, NULL); + break; + case RTE_POWER_MGMT_TYPE_SCALE: + /* init scale freq */ + if (rte_power_init(lcore_id)) { + ret = -EINVAL; + goto failure_handler; + } + queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, + rte_power_mgmt_scalefreq, NULL); + break; + case RTE_POWER_MGMT_TYPE_PAUSE: + queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, + rte_power_mgmt_pause, NULL); + break; + } + queue_cfg->cb_mode = mode; + port_cfg[port_id].ref_cnt++; + queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED; + return ret; + +failure_handler: + if (port_cfg[port_id].ref_cnt == 0) { + rte_free(port_cfg[port_id].queue_cfg); + port_cfg[port_id].queue_cfg = NULL; + } + return ret; +} + +int +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id, + uint16_t port_id, + uint16_t queue_id) +{ + struct pmd_queue_cfg *queue_cfg; + + if (port_cfg[port_id].ref_cnt <= 0) + return -EINVAL; + + queue_cfg = &port_cfg[port_id].queue_cfg[queue_id]; + + if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED) + return -EINVAL; + + switch (queue_cfg->cb_mode) { + case RTE_POWER_MGMT_TYPE_WAIT: + case RTE_POWER_MGMT_TYPE_PAUSE: + rte_eth_remove_rx_callback(port_id, queue_id, + queue_cfg->cur_cb); + break; + case RTE_POWER_MGMT_TYPE_SCALE: + rte_power_freq_max(lcore_id); + rte_eth_remove_rx_callback(port_id, queue_id, + queue_cfg->cur_cb); + rte_power_exit(lcore_id); + break; + } + /* it's not recommend to free callback instance here. + * it cause memory leak which is a known issue. + */ + queue_cfg->cur_cb = NULL; + queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED; + port_cfg[port_id].ref_cnt--; + + if (port_cfg[port_id].ref_cnt == 0) { + rte_free(port_cfg[port_id].queue_cfg); + port_cfg[port_id].queue_cfg = NULL; + } + return 0; +} diff --git a/lib/librte_power/rte_power_pmd_mgmt.h b/lib/librte_power/rte_power_pmd_mgmt.h new file mode 100644 index 0000000000..8b110f1148 --- /dev/null +++ b/lib/librte_power/rte_power_pmd_mgmt.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#ifndef _RTE_POWER_PMD_MGMT_H +#define _RTE_POWER_PMD_MGMT_H + +/** + * @file + * RTE PMD Power Management + */ +#include <stdint.h> +#include <stdbool.h> + +#include <rte_common.h> +#include <rte_byteorder.h> +#include <rte_log.h> +#include <rte_power.h> +#include <rte_atomic.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * PMD Power Management Type + */ +enum rte_power_pmd_mgmt_type { + /** WAIT callback mode. */ + RTE_POWER_MGMT_TYPE_WAIT = 1, + /** PAUSE callback mode. */ + RTE_POWER_MGMT_TYPE_PAUSE, + /** Freq Scaling callback mode. */ + RTE_POWER_MGMT_TYPE_SCALE, +}; + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Setup per-queue power management callback. + * @param lcore_id + * lcore_id. + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue identifier of the Ethernet device. + * @param mode + * The power management callback function type. + + * @return + * 0 on success + * <0 on error + */ + +__rte_experimental +int +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id, + uint16_t port_id, + uint16_t queue_id, + enum rte_power_pmd_mgmt_type mode); + +/** + * @warning + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice + * + * Remove per-queue power management callback. + * @param lcore_id + * lcore_id. + * @param port_id + * The port identifier of the Ethernet device. + * @param queue_id + * The queue identifier of the Ethernet device. + * @return + * 0 on success + * <0 on error + */ + +__rte_experimental +int +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id, + uint16_t port_id, + uint16_t queue_id); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map index 69ca9af616..3f2f6cd6f6 100644 --- a/lib/librte_power/rte_power_version.map +++ b/lib/librte_power/rte_power_version.map @@ -34,4 +34,8 @@ EXPERIMENTAL { rte_power_guest_channel_receive_msg; rte_power_poll_stat_fetch; rte_power_poll_stat_update; + # added in 20.11 + rte_power_pmd_mgmt_queue_enable; + rte_power_pmd_mgmt_queue_disable; + }; -- 2.17.1