> Add a simple on/off switch that will enable saving power when no
> packets are arriving. It is based on counting the number of empty
> polls and, when the number reaches a certain threshold, entering an
> architecture-defined optimized power state that will either wait
> until a TSC timestamp expires, or until packets arrive.
>
> This API supports the one port to multiple cores use case.
>
> This design leverages the RX callback mechanism, which allows three
> different power management methodologies to coexist:
>
> 1. umwait/umonitor:
>
>    The TSC timestamp is automatically calculated using current
>    link speed and RX descriptor ring size, such that the sleep
>    time is not longer than it would take for a NIC to fill its
>    entire RX descriptor ring.
>
> 2. Pause instruction:
>
>    Instead of moving the core into a deeper C-state, this
>    lightweight method uses the Pause instruction to relieve the
>    processor from busy polling.
>
> 3. Frequency scaling:
>
>    Reuses the existing rte_power library to scale the core
>    frequency up/down depending on traffic volume.
>
> Signed-off-by: Liang Ma <liang.j...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---
>  lib/librte_power/meson.build           |   5 +-
>  lib/librte_power/pmd_mgmt.h            |  49 ++++++
>  lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
>  lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
>  lib/librte_power/rte_power_version.map |   4 +
>  5 files changed, 352 insertions(+), 2 deletions(-)
>  create mode 100644 lib/librte_power/pmd_mgmt.h
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
>
> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> index 78c031c943..cc3c7a8646 100644
> --- a/lib/librte_power/meson.build
> +++ b/lib/librte_power/meson.build
> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>  		'power_kvm_vm.c', 'guest_channel.c',
>  		'rte_power_empty_poll.c',
>  		'power_pstate_cpufreq.c',
> +		'rte_power_pmd_mgmt.c',
>  		'power_common.c')
> -headers = files('rte_power.h','rte_power_empty_poll.h')
> -deps += ['timer']
> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
> +deps += ['timer', 'ethdev']
> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
> new file mode 100644
> index 0000000000..756fbe20f7
> --- /dev/null
> +++ b/lib/librte_power/pmd_mgmt.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#ifndef _PMD_MGMT_H
> +#define _PMD_MGMT_H
> +
> +/**
> + * @file
> + * Power Management
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Possible power management states of an ethdev port.
> + */
> +enum pmd_mgmt_state {
> +	/** Device power management is disabled. */
> +	PMD_MGMT_DISABLED = 0,
> +	/** Device power management is enabled. */
> +	PMD_MGMT_ENABLED,
> +};
> +
> +struct pmd_queue_cfg {
> +	enum pmd_mgmt_state pwr_mgmt_state;
> +	/**< Power mgmt callback mode */
> +	enum rte_power_pmd_mgmt_type cb_mode;
> +	/**< Empty poll number */
> +	uint16_t empty_poll_stats;
> +	/**< Callback instance */
> +	const struct rte_eth_rxtx_callback *cur_cb;
> +} __rte_cache_aligned;
> +
> +struct pmd_port_cfg {
> +	int ref_cnt;
> +	struct pmd_queue_cfg *queue_cfg;
> +} __rte_cache_aligned;
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
> new file mode 100644
> index 0000000000..35d2af46a4
> --- /dev/null
> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> @@ -0,0 +1,208 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +#include <rte_malloc.h>
> +#include <rte_ethdev.h>
> +#include <rte_power_intrinsics.h>
> +
> +#include "rte_power_pmd_mgmt.h"
> +#include "pmd_mgmt.h"
> +
> +#define EMPTYPOLL_MAX 512
> +#define PAUSE_NUM 64
> +
> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
> +
> +static uint16_t
> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
Here and in other places - wouldn't it be better to make empty_poll_max a
configurable parameter instead of a hard-coded constant?

> +			volatile void *target_addr;
> +			uint64_t expected, mask;
> +			uint16_t ret;
> +
> +			/*
> +			 * get address of next descriptor in the RX
> +			 * ring for this queue, as well as expected
> +			 * value and a mask.
> +			 */
> +			ret = rte_eth_get_wake_addr(port_id, qidx,
> +					&target_addr, &expected,
> +					&mask);
> +			if (ret == 0)
> +				/* -1ULL is maximum value for TSC */
> +				rte_power_monitor(target_addr,
> +						expected, mask,
> +						0, -1ULL);

Why not make the timeout a user-specified parameter?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +	int i;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			for (i = 0; i < PAUSE_NUM; i++)
> +				rte_pause();

Just rte_delay_us(timeout) instead of this loop?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_ __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			/* scale down freq */
> +			rte_power_freq_min(rte_lcore_id());
> +		}
> +	} else {
> +		q_conf->empty_poll_stats = 0;
> +		/* scale up freq */
> +		rte_power_freq_max(rte_lcore_id());
> +	}
> +
> +	return nb_rx;
> +}
> +

Probably worth mentioning in the comments that these enable/disable
functions are not MT-safe.
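
To illustrate the configurable-parameter suggestion above: something along
these lines would cover both the empty-poll threshold and the monitor
timeout (a sketch only; the setter names and defaults are made up and not
part of this patch):

#include <stdint.h>

/* Hypothetical sketch -- replaces the hard-coded EMPTYPOLL_MAX and the
 * -1ULL monitor timeout with values the application can set before the
 * callbacks are enabled.
 */
static uint16_t empty_poll_max = 512;	/* keeps EMPTYPOLL_MAX as the default */
static uint64_t monitor_tsc_max = UINT64_MAX;	/* keeps -1ULL as the default */

void
rte_power_pmd_mgmt_set_emptypoll_max(uint16_t max)
{
	empty_poll_max = max;
}

void
rte_power_pmd_mgmt_set_monitor_timeout(uint64_t tsc_cycles)
{
	monitor_tsc_max = tsc_cycles;
}

The callbacks would then compare empty_poll_stats against empty_poll_max
instead of EMPTYPOLL_MAX, and pass monitor_tsc_max as the last argument to
rte_power_monitor().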
> +int
> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id,
> +				enum rte_power_pmd_mgmt_type mode)
> +{
> +	struct rte_eth_dev *dev;
> +	struct pmd_queue_cfg *queue_cfg;
> +	int ret = 0;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +	dev = &rte_eth_devices[port_id];
> +
> +	if (port_cfg[port_id].queue_cfg == NULL) {
> +		port_cfg[port_id].ref_cnt = 0;
> +		/* allocate memory for empty poll stats */
> +		port_cfg[port_id].queue_cfg = rte_malloc_socket(NULL,
> +				sizeof(struct pmd_queue_cfg)
> +				* RTE_MAX_QUEUES_PER_PORT,
> +				0, dev->data->numa_node);
> +		if (port_cfg[port_id].queue_cfg == NULL)
> +			return -ENOMEM;
> +	}
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> +		ret = -EINVAL;
> +		goto failure_handler;
> +	}
> +
> +	switch (mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:
> +		if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
> +			ret = -ENOTSUP;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_umwait, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		/* init scale freq */
> +		if (rte_power_init(lcore_id)) {
> +			ret = -EINVAL;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_scalefreq, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +				rte_power_mgmt_pause, NULL);
> +		break;

A default: case seems to be missing here.

> +	}
> +	queue_cfg->cb_mode = mode;
> +	port_cfg[port_id].ref_cnt++;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> +	return ret;
> +
> +failure_handler:
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);
> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return ret;
> +}
> +
> +int
> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id)
> +{
> +	struct pmd_queue_cfg *queue_cfg;
> +
> +	if (port_cfg[port_id].ref_cnt <= 0)
> +		return -EINVAL;
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
> +		return -EINVAL;
> +
> +	switch (queue_cfg->cb_mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:

I think we need a wakeup(lcore_id) here.

> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +				queue_cfg->cur_cb);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		rte_power_freq_max(lcore_id);
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +				queue_cfg->cur_cb);
> +		rte_power_exit(lcore_id);
> +		break;
> +	}
> +	/* it's not recommended to free the callback instance here.
> +	 * it causes a memory leak, which is a known issue.
> +	 */
> +	queue_cfg->cur_cb = NULL;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> +	port_cfg[port_id].ref_cnt--;
> +
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);

It is not safe to do so unless the device is already stopped. Otherwise you
need some sync mechanism here (hand-made as in the bpf lib, or RCU
online/offline, or ...).

> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return 0;
> +}
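
For context, my reading of the intended application usage is roughly the
following (a sketch only, not part of the patch; error handling is trimmed,
and the port is stopped before disable so that freeing queue_cfg is safe,
per the comment above):

#include <stdbool.h>

#include <rte_ethdev.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include "rte_power_pmd_mgmt.h"

static volatile bool stop;	/* application-defined termination flag */

static void
poll_with_power_mgmt(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *bufs[32];

	/* enable only after the port has been configured and started */
	if (rte_power_pmd_mgmt_queue_enable(rte_lcore_id(), port_id,
			queue_id, RTE_POWER_MGMT_TYPE_PAUSE) < 0)
		return;

	while (!stop) {
		/* the callback installed above runs after every burst and
		 * applies the chosen policy once enough empty polls accumulate
		 */
		uint16_t i, nb = rte_eth_rx_burst(port_id, queue_id, bufs, 32);

		for (i = 0; i < nb; i++)
			rte_pktmbuf_free(bufs[i]);	/* or process the packet */
	}

	/* stop the port first: disable may free per-port state (see above) */
	rte_eth_dev_stop(port_id);
	rte_power_pmd_mgmt_queue_disable(rte_lcore_id(), port_id, queue_id);
}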