Re: [dpdk-dev] [PATCH v3 3/6] power: add simple power management API and callback

Ananyev, Konstantin Fri, 04 Sep 2020 11:34:29 -0700

> 
> Add a simple on/off switch that will enable saving power when no
> packets are arriving. It is based on counting the number of empty
> polls and, when the number reaches a certain threshold, entering an
> architecture-defined optimized power state that will either wait
> until a TSC timestamp expires, or when packets arrive.
> 
> This API is limited to 1 core 1 port 1 queue use case as there is
> no coordination between queues/cores in ethdev. 1 port map to multiple
> core will be supported in next version.
> 
> This design leverage RX Callback mechnaism which allow three
> different power management methodology co exist.
> 
> 1. umwait/umonitor:
> 
>    The TSC timestamp is automatically calculated using current
>    link speed and RX descriptor ring size, such that the sleep
>    time is not longer than it would take for a NIC to fill its
>    entire RX descriptor ring.
> 
> 2. Pause instruction
> 
>    Instead of move the core into deeper C state, this lightweight
>    method use Pause instruction to releaf the processor from
>    busy polling.
> 
> 3. Frequency Scaling
>    Reuse exist rte power library to scale up/down core frequency
>    depend on traffic volume.
> 
> Signed-off-by: Liang Ma <liang.j...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---
>  lib/librte_power/meson.build           |   3 +-
>  lib/librte_power/rte_power.h           |  38 +++++
>  lib/librte_power/rte_power_pmd_mgmt.c  | 184 +++++++++++++++++++++++++
>  lib/librte_power/rte_power_version.map |   4 +
>  4 files changed, 228 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
> 
> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> index 78c031c943..44b01afce2 100644
> --- a/lib/librte_power/meson.build
> +++ b/lib/librte_power/meson.build
> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>               'power_kvm_vm.c', 'guest_channel.c',
>               'rte_power_empty_poll.c',
>               'power_pstate_cpufreq.c',
> +             'rte_power_pmd_mgmt.c',
>               'power_common.c')
>  headers = files('rte_power.h','rte_power_empty_poll.h')
> -deps += ['timer']
> +deps += ['timer' ,'ethdev']
> diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h
> index bbbde4dfb4..06d5a9984f 100644
> --- a/lib/librte_power/rte_power.h
> +++ b/lib/librte_power/rte_power.h
> @@ -14,6 +14,7 @@
>  #include <rte_byteorder.h>
>  #include <rte_log.h>
>  #include <rte_string_fns.h>
> +#include <rte_ethdev.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
> @@ -97,6 +98,43 @@ int rte_power_init(unsigned int lcore_id);
>   */
>  int rte_power_exit(unsigned int lcore_id);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Enable device power management.
> + * @param lcore_id
> + *   lcore id.
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + * @param mode
> + *   The power management callback function mode.
> + * @return
> + *   0 on success
> + *   <0 on error
> + */
> +__rte_experimental
> +int rte_power_pmd_mgmt_enable(unsigned int lcore_id,
> +                               uint16_t port_id,
> +                               enum rte_eth_dev_power_mgmt_cb_mode mode);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Disable device power management.
> + * @param lcore_id
> + *   lcore id.
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + *
> + * @return
> + *   0 on success
> + *   <0 on error
> + */
> +__rte_experimental
> +int rte_power_pmd_mgmt_disable(unsigned int lcore_id, uint16_t port_id);
> +
>  /**
>   * Get the available frequencies of a specific lcore.
>   * Function pointer definition. Review each environments
> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c 
> b/lib/librte_power/rte_power_pmd_mgmt.c
> new file mode 100644
> index 0000000000..a445153ede
> --- /dev/null
> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> @@ -0,0 +1,184 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +#include <rte_atomic.h>
> +#include <rte_malloc.h>
> +#include <rte_ethdev.h>
> +
> +#include "rte_power.h"
> +
> +
> +
> +static uint16_t
> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +             uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +
> +     struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +
> +     if (unlikely(nb_rx == 0)) {
> +             dev->empty_poll_stats[qidx].num++;


I believe there are two fundamental issues with that approach:
1. You put metadata specific lib (power) callbacks into rte_eth_dev struct.
2. These callbacks do access rte_eth_devices[] directly. 
That doesn't look right to me - rte_eth_dev structure supposed to be treated
as internal one librt_ether and underlying drivers and should be accessed 
directly
by outer code.
If these callbacks need some extra metadata, then it is responsibility
of power library to allocate/manage these metadata.
You can pass pointer to this metadata via last parameter for 
rte_eth_add_rx_callback().

> +             if (unlikely(dev->empty_poll_stats[qidx].num >
> +                          ETH_EMPTYPOLL_MAX)) {
> +                     volatile void *target_addr;
> +                     uint64_t expected, mask;
> +                     uint16_t ret;
> +
> +                     /*
> +                      * get address of next descriptor in the RX
> +                      * ring for this queue, as well as expected
> +                      * value and a mask.
> +                      */
> +                     ret = (*dev->dev_ops->next_rx_desc)
> +                             (dev->data->rx_queues[qidx],
> +                              &target_addr, &expected, &mask);
> +                     if (ret == 0)
> +                             /* -1ULL is maximum value for TSC */
> +                             rte_power_monitor(target_addr,
> +                                               expected, mask,
> +                                               0, -1ULL);
> +             }
> +     } else
> +             dev->empty_poll_stats[qidx].num = 0;
> +
> +     return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +             uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +     struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +
> +     int i;
> +
> +     if (unlikely(nb_rx == 0)) {
> +
> +             dev->empty_poll_stats[qidx].num++;
> +
> +             if (unlikely(dev->empty_poll_stats[qidx].num >
> +                          ETH_EMPTYPOLL_MAX)) {
> +
> +                     for (i = 0; i < RTE_ETH_PAUSE_NUM; i++)
> +                             rte_pause();
> +
> +             }
> +     } else
> +             dev->empty_poll_stats[qidx].num = 0;
> +
> +     return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +             uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +     struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +
> +     if (unlikely(nb_rx == 0)) {
> +             dev->empty_poll_stats[qidx].num++;
> +             if (unlikely(dev->empty_poll_stats[qidx].num >
> +                          ETH_EMPTYPOLL_MAX)) {
> +
> +                     /*scale down freq */
> +                     rte_power_freq_min(rte_lcore_id());
> +
> +             }
> +     } else {
> +             dev->empty_poll_stats[qidx].num = 0;
> +             /* scal up freq */
> +             rte_power_freq_max(rte_lcore_id());
> +     }
> +
> +     return nb_rx;
> +}
> +
> +int
> +rte_power_pmd_mgmt_enable(unsigned int lcore_id,
> +                     uint16_t port_id,
> +                     enum rte_eth_dev_power_mgmt_cb_mode mode)
> +{
> +     struct rte_eth_dev *dev;
> +
> +     RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +     dev = &rte_eth_devices[port_id];
> +
> +     if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
> +             return -EINVAL;
> +     /* allocate memory for empty poll stats */
> +     dev->empty_poll_stats = rte_malloc_socket(NULL,
> +                                               sizeof(struct rte_eth_ep_stat)
> +                                               * RTE_MAX_QUEUES_PER_PORT,
> +                                               0, dev->data->numa_node);
> +     if (dev->empty_poll_stats == NULL)
> +             return -ENOMEM;
> +
> +     switch (mode) {
> +     case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
> +             if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG))
> +                     return -ENOTSUP;

Here and in other places: in case of error return you don't' free your 
empty_poll_stats.

> +             dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,

Why zero for queue number, why not to pass queue_id as a parameter for that 
function?

> +                                             rte_power_mgmt_umwait, NULL);

As I said above, instead of NULL - could be pointer to metadata struct.

> +             break;
> +     case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
> +             /* init scale freq */
> +             if (rte_power_init(lcore_id))
> +                     return -EINVAL;
> +             dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
> +                                     rte_power_mgmt_scalefreq, NULL);
> +             break;
> +     case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
> +             dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
> +                                             rte_power_mgmt_pause, NULL);
> +             break;
> +     }
> +
> +     dev->cb_mode = mode;
> +     dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
> +     return 0;
> +}
> +
> +int
> +rte_power_pmd_mgmt_disable(unsigned int lcore_id,
> +                             uint16_t port_id)
> +{
> +     struct rte_eth_dev *dev;
> +
> +     RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +     dev = &rte_eth_devices[port_id];
> +
> +     /*add flag check */
> +
> +     if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_DISABLED)
> +             return -EINVAL;
> +
> +     /* rte_free ignores NULL so safe to call without checks */
> +     rte_free(dev->empty_poll_stats);

You can't free callback metadata before removing the callback itself.
In fact, with current rx callback code it is not safe to free it
even after (we discussed it offline).

> +
> +     switch (dev->cb_mode) {
> +     case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
> +     case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
> +             rte_eth_remove_rx_callback(port_id, 0,
> +                                        dev->cur_pwr_cb);
> +             break;
> +     case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
> +             rte_power_freq_max(lcore_id);

Stupid q: what makes you think that lcore frequency was max,
*before* you setup the callback?

> +             rte_eth_remove_rx_callback(port_id, 0,
> +                                        dev->cur_pwr_cb);
> +             if (rte_power_exit(lcore_id))
> +                     return -EINVAL;
> +             break;
> +     }
> +
> +     dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
> +     dev->cur_pwr_cb = NULL;
> +     dev->cb_mode = 0;
> +
> +     return 0;
> +}
> diff --git a/lib/librte_power/rte_power_version.map 
> b/lib/librte_power/rte_power_version.map
> index 00ee5753e2..ade83cfd4f 100644
> --- a/lib/librte_power/rte_power_version.map
> +++ b/lib/librte_power/rte_power_version.map
> @@ -34,4 +34,8 @@ EXPERIMENTAL {
>       rte_power_guest_channel_receive_msg;
>       rte_power_poll_stat_fetch;
>       rte_power_poll_stat_update;
> +     # added in 20.08
> +     rte_power_pmd_mgmt_disable;
> +     rte_power_pmd_mgmt_enable;
> +
>  };
> --
> 2.17.1

Re: [dpdk-dev] [PATCH v3 3/6] power: add simple power management API and callback

Reply via email to