From: Liang Ma <liang.j...@intel.com>

Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will either wait
until a TSC timestamp expires, or when packets arrive.

This API mandates a core-to-single-queue mapping (that is, multiple
queues per device are supported, but they have to be polled on different
cores).

This design is using PMD RX callbacks.

1. UMWAIT/UMONITOR:

   When a certain threshold of empty polls is reached, the core will go
   into a power optimized sleep while waiting on an address of next RX
   descriptor to be written to.

2. Pause instruction

   Instead of moving the core into a deeper C-state, this method uses the
   pause instruction to avoid busy polling.

3. Frequency scaling
   Reuse existing DPDK power library to scale up/down core frequency
   depending on traffic volume.

Signed-off-by: Liang Ma <liang.j...@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---

Notes:
    v5:
    - Make error checking more robust
      - Prevent initializing scaling if ACPI or PSTATE env wasn't set
      - Prevent initializing UMWAIT path if PMD doesn't support get_wake_addr
      - Add some debug logging
    - Replace x86-specific code path to generic path using the intrinsic check

 lib/librte_power/meson.build           |   5 +-
 lib/librte_power/pmd_mgmt.h            |  38 ++++
 lib/librte_power/rte_power_pmd_mgmt.c  | 244 +++++++++++++++++++++++++
 lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++
 lib/librte_power/rte_power_version.map |   4 +
 5 files changed, 377 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_power/pmd_mgmt.h
 create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
 create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h

diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
index 78c031c943..cc3c7a8646 100644
--- a/lib/librte_power/meson.build
+++ b/lib/librte_power/meson.build
@@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
                'power_kvm_vm.c', 'guest_channel.c',
                'rte_power_empty_poll.c',
                'power_pstate_cpufreq.c',
+               'rte_power_pmd_mgmt.c',
                'power_common.c')
-headers = files('rte_power.h','rte_power_empty_poll.h')
-deps += ['timer']
+headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
+deps += ['timer', 'ethdev']
diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
new file mode 100644
index 0000000000..20be53bacf
--- /dev/null
+++ b/lib/librte_power/pmd_mgmt.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _PMD_MGMT_H
+#define _PMD_MGMT_H
+
+/**
+ * @file
+ * Power Management
+ */
+
+/* make the header self-contained: rte_eth_rxtx_callback and
+ * rte_power_pmd_mgmt_type are used below.
+ */
+#include <rte_ethdev.h>
+
+#include "rte_power_pmd_mgmt.h"
+
+/**
+ * Possible power management states of an ethdev port.
+ */
+enum pmd_mgmt_state {
+       /** Device power management is disabled. */
+       PMD_MGMT_DISABLED = 0,
+       /** Device power management is enabled. */
+       PMD_MGMT_ENABLED,
+};
+
+/** Per-queue power management configuration. */
+struct pmd_queue_cfg {
+       /** Power management state of this queue. */
+       enum pmd_mgmt_state pwr_mgmt_state;
+       /** Power management callback mode for this queue. */
+       enum rte_power_pmd_mgmt_type cb_mode;
+       /** Number of consecutive empty polls seen on this queue. */
+       uint16_t empty_poll_stats;
+       /** Rx callback instance installed on this queue. */
+       const struct rte_eth_rxtx_callback *cur_cb;
+} __rte_cache_aligned;
+
+/** Per-port power management configuration. */
+struct pmd_port_cfg {
+       /** Number of queues on this port with power management enabled. */
+       int  ref_cnt;
+       /** Per-queue configuration array (RTE_MAX_QUEUES_PER_PORT entries). */
+       struct pmd_queue_cfg *queue_cfg;
+} __rte_cache_aligned;
+
+#endif
diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
new file mode 100644
index 0000000000..07dfe7c077
--- /dev/null
+++ b/lib/librte_power/rte_power_pmd_mgmt.c
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_cpuflags.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_power_intrinsics.h>
+
+#include "rte_power_pmd_mgmt.h"
+#include "pmd_mgmt.h"
+
+
+#define EMPTYPOLL_MAX  512
+#define PAUSE_NUM  64
+
+static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
+
+/*
+ * Rx callback: after EMPTYPOLL_MAX consecutive empty polls, put the core
+ * into a power-optimized wait (UMWAIT) on the next Rx descriptor address,
+ * so it wakes when a packet arrives or the (maximum) TSC deadline expires.
+ */
+static uint16_t
+rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct pmd_queue_cfg *q_conf;
+
+       q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+       if (unlikely(nb_rx == 0)) {
+               q_conf->empty_poll_stats++;
+               if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+                       volatile void *target_addr;
+                       uint64_t expected, mask;
+                       /* rte_eth_get_wake_addr() returns int - using a
+                        * uint16_t here would truncate negative errors.
+                        */
+                       int ret;
+
+                       /*
+                        * get address of next descriptor in the RX
+                        * ring for this queue, as well as expected
+                        * value and a mask.
+                        */
+                       ret = rte_eth_get_wake_addr(port_id, qidx,
+                                       &target_addr, &expected, &mask);
+                       if (ret == 0)
+                               /* -1ULL is maximum value for TSC */
+                               rte_power_monitor(target_addr, expected,
+                                               mask, -1ULL);
+               }
+       } else
+               q_conf->empty_poll_stats = 0;
+
+       return nb_rx;
+}
+
+/*
+ * Rx callback: once enough consecutive empty polls accumulate, yield the
+ * core with a burst of pause instructions instead of busy polling.
+ */
+static uint16_t
+rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct pmd_queue_cfg *q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+       if (likely(nb_rx != 0)) {
+               /* traffic seen - reset the empty poll counter */
+               q_conf->empty_poll_stats = 0;
+               return nb_rx;
+       }
+
+       if (unlikely(++q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+               int n;
+
+               for (n = 0; n < PAUSE_NUM; n++)
+                       rte_pause();
+       }
+
+       return nb_rx;
+}
+
+/*
+ * Rx callback: scale the core frequency down after EMPTYPOLL_MAX
+ * consecutive empty polls, and back up when traffic resumes.
+ *
+ * The frequency is only changed on threshold transitions - the original
+ * code called rte_power_freq_max() on every non-empty poll, paying a
+ * sysfs-write cost per burst, and let the uint16_t counter wrap around,
+ * which could suppress the scale-up after ~64K empty polls.
+ */
+static uint16_t
+rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct pmd_queue_cfg *q_conf;
+
+       q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+       if (unlikely(nb_rx == 0)) {
+               /* saturate the counter to avoid wraparound */
+               if (q_conf->empty_poll_stats < UINT16_MAX)
+                       q_conf->empty_poll_stats++;
+               /* scale down freq - once, when crossing the threshold */
+               if (unlikely(q_conf->empty_poll_stats == EMPTYPOLL_MAX + 1))
+                       rte_power_freq_min(rte_lcore_id());
+       } else {
+               /* scale up freq - only when returning from scaled-down state */
+               if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
+                       rte_power_freq_max(rte_lcore_id());
+               q_conf->empty_poll_stats = 0;
+       }
+
+       return nb_rx;
+}
+
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
+                               uint16_t port_id,
+                               uint16_t queue_id,
+                               enum rte_power_pmd_mgmt_type mode)
+{
+       struct rte_eth_dev *dev;
+       struct pmd_queue_cfg *queue_cfg;
+       int ret = 0;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+       dev = &rte_eth_devices[port_id];
+
+       if (port_cfg[port_id].queue_cfg == NULL) {
+               port_cfg[port_id].ref_cnt = 0;
+               /* allocate memory for empty poll stats - must be zeroed,
+                * since pwr_mgmt_state of every queue is read below and
+                * PMD_MGMT_DISABLED == 0.
+                */
+               port_cfg[port_id].queue_cfg  = rte_zmalloc_socket(NULL,
+                                       sizeof(struct pmd_queue_cfg)
+                                       * RTE_MAX_QUEUES_PER_PORT,
+                                       0, dev->data->numa_node);
+               if (port_cfg[port_id].queue_cfg == NULL)
+                       return -ENOMEM;
+       }
+
+       queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
+
+       /* refuse to enable the same queue twice */
+       if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
+               ret = -EINVAL;
+               goto failure_handler;
+       }
+
+       switch (mode) {
+       case RTE_POWER_MGMT_TYPE_WAIT:
+       {
+               /* check if rte_power_monitor is supported */
+               uint64_t dummy_expected, dummy_mask;
+               struct rte_cpu_intrinsics i;
+               void *dummy_addr;
+
+               rte_cpu_get_intrinsics_support(&i);
+
+               if (!i.power_monitor) {
+                       RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
+                       ret = -ENOTSUP;
+                       goto failure_handler;
+               }
+
+               /* check if the device supports the necessary PMD API */
+               if (rte_eth_get_wake_addr(port_id, queue_id, &dummy_addr,
+                               &dummy_expected, &dummy_mask) == -ENOTSUP) {
+                       RTE_LOG(DEBUG, POWER, "The device does not support get_wake_addr\n");
+                       ret = -ENOTSUP;
+                       goto failure_handler;
+               }
+
+               queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+                                               rte_power_mgmt_umwait, NULL);
+               break;
+       }
+       case RTE_POWER_MGMT_TYPE_SCALE:
+       {
+               enum power_management_env env;
+               /* only PSTATE and ACPI modes are supported */
+               if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
+                       !rte_power_check_env_supported(PM_ENV_PSTATE_CPUFREQ)) {
+                       RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
+                       ret = -ENOTSUP;
+                       goto failure_handler;
+               }
+               /* ensure we could initialize the power library */
+               if (rte_power_init(lcore_id)) {
+                       ret = -EINVAL;
+                       goto failure_handler;
+               }
+               /* ensure we initialized the correct env */
+               env = rte_power_get_env();
+               if (env != PM_ENV_ACPI_CPUFREQ &&
+                               env != PM_ENV_PSTATE_CPUFREQ) {
+                       RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
+                       ret = -ENOTSUP;
+                       goto failure_handler;
+               }
+               queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+                                       rte_power_mgmt_scalefreq, NULL);
+               break;
+       }
+       case RTE_POWER_MGMT_TYPE_PAUSE:
+               queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+                                               rte_power_mgmt_pause, NULL);
+               break;
+       default:
+               /* without this, an invalid mode would be marked enabled
+                * below with a NULL callback.
+                */
+               RTE_LOG(DEBUG, POWER, "Invalid power management type\n");
+               ret = -EINVAL;
+               goto failure_handler;
+       }
+       /* rte_eth_add_rx_callback() returns NULL on failure */
+       if (queue_cfg->cur_cb == NULL) {
+               /* undo the power library init done above */
+               if (mode == RTE_POWER_MGMT_TYPE_SCALE)
+                       rte_power_exit(lcore_id);
+               RTE_LOG(DEBUG, POWER, "Failed to set Rx queue callback\n");
+               ret = -EINVAL;
+               goto failure_handler;
+       }
+       queue_cfg->cb_mode = mode;
+       port_cfg[port_id].ref_cnt++;
+       queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+       return ret;
+
+failure_handler:
+       /* release per-queue config if no queue on this port is enabled */
+       if (port_cfg[port_id].ref_cnt == 0) {
+               rte_free(port_cfg[port_id].queue_cfg);
+               port_cfg[port_id].queue_cfg = NULL;
+       }
+       return ret;
+}
+
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+                               uint16_t port_id,
+                               uint16_t queue_id)
+{
+       struct pmd_queue_cfg *queue_cfg;
+
+       /* validate port before indexing port_cfg[], as enable() does */
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+       /* nothing is enabled on this port */
+       if (port_cfg[port_id].ref_cnt <= 0)
+               return -EINVAL;
+
+       queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
+
+       if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
+               return -EINVAL;
+
+       switch (queue_cfg->cb_mode) {
+       case RTE_POWER_MGMT_TYPE_WAIT:
+       case RTE_POWER_MGMT_TYPE_PAUSE:
+               rte_eth_remove_rx_callback(port_id, queue_id,
+                                          queue_cfg->cur_cb);
+               break;
+       case RTE_POWER_MGMT_TYPE_SCALE:
+               /* restore full frequency before shutting scaling down */
+               rte_power_freq_max(lcore_id);
+               rte_eth_remove_rx_callback(port_id, queue_id,
+                                          queue_cfg->cur_cb);
+               rte_power_exit(lcore_id);
+               break;
+       }
+       /* the callback instance cannot be freed here because the data path
+        * may still be executing it; the resulting memory leak is a known
+        * issue.
+        */
+       queue_cfg->cur_cb = NULL;
+       queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+       port_cfg[port_id].ref_cnt--;
+
+       /* last enabled queue on this port - release per-queue config */
+       if (port_cfg[port_id].ref_cnt == 0) {
+               rte_free(port_cfg[port_id].queue_cfg);
+               port_cfg[port_id].queue_cfg = NULL;
+       }
+       return 0;
+}
diff --git a/lib/librte_power/rte_power_pmd_mgmt.h b/lib/librte_power/rte_power_pmd_mgmt.h
new file mode 100644
index 0000000000..8b110f1148
--- /dev/null
+++ b/lib/librte_power/rte_power_pmd_mgmt.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_PMD_MGMT_H
+#define _RTE_POWER_PMD_MGMT_H
+
+/**
+ * @file
+ * RTE PMD Power Management
+ */
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_power.h>
+#include <rte_atomic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * PMD Power Management Type
+ */
+enum rte_power_pmd_mgmt_type {
+       /** WAIT callback mode - sleep on the Rx descriptor address. */
+       RTE_POWER_MGMT_TYPE_WAIT = 1,
+       /** PAUSE callback mode - issue pause instructions when idle. */
+       RTE_POWER_MGMT_TYPE_PAUSE,
+       /** Freq Scaling callback mode - scale core frequency with load. */
+       RTE_POWER_MGMT_TYPE_SCALE,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Setup per-queue power management callback.
+ *
+ * @param lcore_id
+ *   The lcore the Rx queue will be polled from.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @param mode
+ *   The power management callback function type.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
+                               uint16_t port_id,
+                               uint16_t queue_id,
+                               enum rte_power_pmd_mgmt_type mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Remove per-queue power management callback.
+ *
+ * @param lcore_id
+ *   The lcore the Rx queue is polled from.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+                               uint16_t port_id,
+                               uint16_t queue_id);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map
index 69ca9af616..3f2f6cd6f6 100644
--- a/lib/librte_power/rte_power_version.map
+++ b/lib/librte_power/rte_power_version.map
@@ -34,4 +34,8 @@ EXPERIMENTAL {
        rte_power_guest_channel_receive_msg;
        rte_power_poll_stat_fetch;
        rte_power_poll_stat_update;
+       # added in 20.11
+       rte_power_pmd_mgmt_queue_disable;
+       rte_power_pmd_mgmt_queue_enable;
 };
-- 
2.17.1

Reply via email to