> > Add a simple on/off switch that will enable saving power when no > packets are arriving. It is based on counting the number of empty > polls and, when the number reaches a certain threshold, entering an > architecture-defined optimized power state that will either wait > until a TSC timestamp expires, or when packets arrive. > > This API is limited to 1 core 1 port 1 queue use case as there is > no coordination between queues/cores in ethdev. 1 port map to multiple > core will be supported in next version. > > This design leverage RX Callback mechnaism which allow three > different power management methodology co exist. > > 1. umwait/umonitor: > > The TSC timestamp is automatically calculated using current > link speed and RX descriptor ring size, such that the sleep > time is not longer than it would take for a NIC to fill its > entire RX descriptor ring. > > 2. Pause instruction > > Instead of move the core into deeper C state, this lightweight > method use Pause instruction to releaf the processor from > busy polling. > > 3. Frequency Scaling > Reuse exist rte power library to scale up/down core frequency > depend on traffic volume. > > Signed-off-by: Liang Ma <liang.j...@intel.com> > Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > --- > lib/librte_power/meson.build | 3 +- > lib/librte_power/rte_power.h | 38 +++++ > lib/librte_power/rte_power_pmd_mgmt.c | 184 +++++++++++++++++++++++++ > lib/librte_power/rte_power_version.map | 4 + > 4 files changed, 228 insertions(+), 1 deletion(-) > create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c > > diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build > index 78c031c943..44b01afce2 100644 > --- a/lib/librte_power/meson.build > +++ b/lib/librte_power/meson.build > @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c', > 'power_kvm_vm.c', 'guest_channel.c', > 'rte_power_empty_poll.c', > 'power_pstate_cpufreq.c', > + 'rte_power_pmd_mgmt.c', > 'power_common.c') > headers = files('rte_power.h','rte_power_empty_poll.h') > -deps += ['timer'] > +deps += ['timer' ,'ethdev'] > diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h > index bbbde4dfb4..06d5a9984f 100644 > --- a/lib/librte_power/rte_power.h > +++ b/lib/librte_power/rte_power.h > @@ -14,6 +14,7 @@ > #include <rte_byteorder.h> > #include <rte_log.h> > #include <rte_string_fns.h> > +#include <rte_ethdev.h> > > #ifdef __cplusplus > extern "C" { > @@ -97,6 +98,43 @@ int rte_power_init(unsigned int lcore_id); > */ > int rte_power_exit(unsigned int lcore_id); > > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice > + * > + * Enable device power management. > + * @param lcore_id > + * lcore id. > + * @param port_id > + * The port identifier of the Ethernet device. > + * @param mode > + * The power management callback function mode. > + * @return > + * 0 on success > + * <0 on error > + */ > +__rte_experimental > +int rte_power_pmd_mgmt_enable(unsigned int lcore_id, > + uint16_t port_id, > + enum rte_eth_dev_power_mgmt_cb_mode mode); > + > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice > + * > + * Disable device power management. > + * @param lcore_id > + * lcore id. > + * @param port_id > + * The port identifier of the Ethernet device. > + * > + * @return > + * 0 on success > + * <0 on error > + */ > +__rte_experimental > +int rte_power_pmd_mgmt_disable(unsigned int lcore_id, uint16_t port_id); > + > /** > * Get the available frequencies of a specific lcore. > * Function pointer definition. Review each environments > diff --git a/lib/librte_power/rte_power_pmd_mgmt.c > b/lib/librte_power/rte_power_pmd_mgmt.c > new file mode 100644 > index 0000000000..a445153ede > --- /dev/null > +++ b/lib/librte_power/rte_power_pmd_mgmt.c > @@ -0,0 +1,184 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2020 Intel Corporation > + */ > + > +#include <rte_lcore.h> > +#include <rte_cycles.h> > +#include <rte_atomic.h> > +#include <rte_malloc.h> > +#include <rte_ethdev.h> > + > +#include "rte_power.h" > + > + > + > +static uint16_t > +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx, > + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > + uint16_t max_pkts __rte_unused, void *_ __rte_unused) > +{ > + > + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; > + > + if (unlikely(nb_rx == 0)) { > + dev->empty_poll_stats[qidx].num++;
I believe there are two fundamental issues with that approach: 1. You put metadata specific lib (power) callbacks into rte_eth_dev struct. 2. These callbacks do access rte_eth_devices[] directly. That doesn't look right to me - rte_eth_dev structure supposed to be treated as internal one librt_ether and underlying drivers and should be accessed directly by outer code. If these callbacks need some extra metadata, then it is responsibility of power library to allocate/manage these metadata. You can pass pointer to this metadata via last parameter for rte_eth_add_rx_callback(). > + if (unlikely(dev->empty_poll_stats[qidx].num > > + ETH_EMPTYPOLL_MAX)) { > + volatile void *target_addr; > + uint64_t expected, mask; > + uint16_t ret; > + > + /* > + * get address of next descriptor in the RX > + * ring for this queue, as well as expected > + * value and a mask. > + */ > + ret = (*dev->dev_ops->next_rx_desc) > + (dev->data->rx_queues[qidx], > + &target_addr, &expected, &mask); > + if (ret == 0) > + /* -1ULL is maximum value for TSC */ > + rte_power_monitor(target_addr, > + expected, mask, > + 0, -1ULL); > + } > + } else > + dev->empty_poll_stats[qidx].num = 0; > + > + return nb_rx; > +} > + > +static uint16_t > +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx, > + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > + uint16_t max_pkts __rte_unused, void *_ __rte_unused) > +{ > + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; > + > + int i; > + > + if (unlikely(nb_rx == 0)) { > + > + dev->empty_poll_stats[qidx].num++; > + > + if (unlikely(dev->empty_poll_stats[qidx].num > > + ETH_EMPTYPOLL_MAX)) { > + > + for (i = 0; i < RTE_ETH_PAUSE_NUM; i++) > + rte_pause(); > + > + } > + } else > + dev->empty_poll_stats[qidx].num = 0; > + > + return nb_rx; > +} > + > +static uint16_t > +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx, > + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, > + uint16_t max_pkts __rte_unused, void *_ __rte_unused) > +{ > + struct rte_eth_dev *dev = &rte_eth_devices[port_id]; > + > + if (unlikely(nb_rx == 0)) { > + dev->empty_poll_stats[qidx].num++; > + if (unlikely(dev->empty_poll_stats[qidx].num > > + ETH_EMPTYPOLL_MAX)) { > + > + /*scale down freq */ > + rte_power_freq_min(rte_lcore_id()); > + > + } > + } else { > + dev->empty_poll_stats[qidx].num = 0; > + /* scal up freq */ > + rte_power_freq_max(rte_lcore_id()); > + } > + > + return nb_rx; > +} > + > +int > +rte_power_pmd_mgmt_enable(unsigned int lcore_id, > + uint16_t port_id, > + enum rte_eth_dev_power_mgmt_cb_mode mode) > +{ > + struct rte_eth_dev *dev; > + > + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); > + dev = &rte_eth_devices[port_id]; > + > + if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED) > + return -EINVAL; > + /* allocate memory for empty poll stats */ > + dev->empty_poll_stats = rte_malloc_socket(NULL, > + sizeof(struct rte_eth_ep_stat) > + * RTE_MAX_QUEUES_PER_PORT, > + 0, dev->data->numa_node); > + if (dev->empty_poll_stats == NULL) > + return -ENOMEM; > + > + switch (mode) { > + case RTE_ETH_DEV_POWER_MGMT_CB_WAIT: > + if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) > + return -ENOTSUP; Here and in other places: in case of error return you don't' free your empty_poll_stats. > + dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0, Why zero for queue number, why not to pass queue_id as a parameter for that function? > + rte_power_mgmt_umwait, NULL); As I said above, instead of NULL - could be pointer to metadata struct. > + break; > + case RTE_ETH_DEV_POWER_MGMT_CB_SCALE: > + /* init scale freq */ > + if (rte_power_init(lcore_id)) > + return -EINVAL; > + dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0, > + rte_power_mgmt_scalefreq, NULL); > + break; > + case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE: > + dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0, > + rte_power_mgmt_pause, NULL); > + break; > + } > + > + dev->cb_mode = mode; > + dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED; > + return 0; > +} > + > +int > +rte_power_pmd_mgmt_disable(unsigned int lcore_id, > + uint16_t port_id) > +{ > + struct rte_eth_dev *dev; > + > + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL); > + dev = &rte_eth_devices[port_id]; > + > + /*add flag check */ > + > + if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_DISABLED) > + return -EINVAL; > + > + /* rte_free ignores NULL so safe to call without checks */ > + rte_free(dev->empty_poll_stats); You can't free callback metadata before removing the callback itself. In fact, with current rx callback code it is not safe to free it even after (we discussed it offline). > + > + switch (dev->cb_mode) { > + case RTE_ETH_DEV_POWER_MGMT_CB_WAIT: > + case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE: > + rte_eth_remove_rx_callback(port_id, 0, > + dev->cur_pwr_cb); > + break; > + case RTE_ETH_DEV_POWER_MGMT_CB_SCALE: > + rte_power_freq_max(lcore_id); Stupid q: what makes you think that lcore frequency was max, *before* you setup the callback? > + rte_eth_remove_rx_callback(port_id, 0, > + dev->cur_pwr_cb); > + if (rte_power_exit(lcore_id)) > + return -EINVAL; > + break; > + } > + > + dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED; > + dev->cur_pwr_cb = NULL; > + dev->cb_mode = 0; > + > + return 0; > +} > diff --git a/lib/librte_power/rte_power_version.map > b/lib/librte_power/rte_power_version.map > index 00ee5753e2..ade83cfd4f 100644 > --- a/lib/librte_power/rte_power_version.map > +++ b/lib/librte_power/rte_power_version.map > @@ -34,4 +34,8 @@ EXPERIMENTAL { > rte_power_guest_channel_receive_msg; > rte_power_poll_stat_fetch; > rte_power_poll_stat_update; > + # added in 20.08 > + rte_power_pmd_mgmt_disable; > + rte_power_pmd_mgmt_enable; > + > }; > -- > 2.17.1