Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will either wait
until a TSC timestamp expires, or when packets arrive.

This API is limited to 1 core 1 port 1 queue use case as there is
no coordination between queues/cores in ethdev. 1 port map to multiple
core will be supported in next version.

This design leverage RX Callback mechnaism which allow three
different power management methodology co exist.

1. umwait/umonitor:

   The TSC timestamp is automatically calculated using current
   link speed and RX descriptor ring size, such that the sleep
   time is not longer than it would take for a NIC to fill its
   entire RX descriptor ring.

2. Pause instruction

   Instead of move the core into deeper C state, this lightweight
   method use Pause instruction to releaf the processor from
   busy polling.

3. Frequency Scaling
   Reuse exist rte power library to scale up/down core frequency
   depend on traffic volume.

Signed-off-by: Liang Ma <liang.j...@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
 lib/librte_power/meson.build           |   3 +-
 lib/librte_power/rte_power.h           |  38 +++++
 lib/librte_power/rte_power_pmd_mgmt.c  | 184 +++++++++++++++++++++++++
 lib/librte_power/rte_power_version.map |   4 +
 4 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c

diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
index 78c031c943..44b01afce2 100644
--- a/lib/librte_power/meson.build
+++ b/lib/librte_power/meson.build
@@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
                'power_kvm_vm.c', 'guest_channel.c',
                'rte_power_empty_poll.c',
                'power_pstate_cpufreq.c',
+               'rte_power_pmd_mgmt.c',
                'power_common.c')
 headers = files('rte_power.h','rte_power_empty_poll.h')
-deps += ['timer']
+deps += ['timer' ,'ethdev']
diff --git a/lib/librte_power/rte_power.h b/lib/librte_power/rte_power.h
index bbbde4dfb4..06d5a9984f 100644
--- a/lib/librte_power/rte_power.h
+++ b/lib/librte_power/rte_power.h
@@ -14,6 +14,7 @@
 #include <rte_byteorder.h>
 #include <rte_log.h>
 #include <rte_string_fns.h>
+#include <rte_ethdev.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -97,6 +98,43 @@ int rte_power_init(unsigned int lcore_id);
  */
 int rte_power_exit(unsigned int lcore_id);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Enable device power management.
+ * @param lcore_id
+ *   lcore id.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param mode
+ *   The power management callback function mode.
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_power_pmd_mgmt_enable(unsigned int lcore_id,
+                                 uint16_t port_id,
+                                 enum rte_eth_dev_power_mgmt_cb_mode mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Disable device power management.
+ * @param lcore_id
+ *   lcore id.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_power_pmd_mgmt_disable(unsigned int lcore_id, uint16_t port_id);
+
 /**
  * Get the available frequencies of a specific lcore.
  * Function pointer definition. Review each environments
diff --git a/lib/librte_power/rte_power_pmd_mgmt.c 
b/lib/librte_power/rte_power_pmd_mgmt.c
new file mode 100644
index 0000000000..a445153ede
--- /dev/null
+++ b/lib/librte_power/rte_power_pmd_mgmt.c
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_atomic.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+
+#include "rte_power.h"
+
+
+
+static uint16_t
+rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+       if (unlikely(nb_rx == 0)) {
+               dev->empty_poll_stats[qidx].num++;
+               if (unlikely(dev->empty_poll_stats[qidx].num >
+                            ETH_EMPTYPOLL_MAX)) {
+                       volatile void *target_addr;
+                       uint64_t expected, mask;
+                       uint16_t ret;
+
+                       /*
+                        * get address of next descriptor in the RX
+                        * ring for this queue, as well as expected
+                        * value and a mask.
+                        */
+                       ret = (*dev->dev_ops->next_rx_desc)
+                               (dev->data->rx_queues[qidx],
+                                &target_addr, &expected, &mask);
+                       if (ret == 0)
+                               /* -1ULL is maximum value for TSC */
+                               rte_power_monitor(target_addr,
+                                                 expected, mask,
+                                                 0, -1ULL);
+               }
+       } else
+               dev->empty_poll_stats[qidx].num = 0;
+
+       return nb_rx;
+}
+
+static uint16_t
+rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+       int i;
+
+       if (unlikely(nb_rx == 0)) {
+
+               dev->empty_poll_stats[qidx].num++;
+
+               if (unlikely(dev->empty_poll_stats[qidx].num >
+                            ETH_EMPTYPOLL_MAX)) {
+
+                       for (i = 0; i < RTE_ETH_PAUSE_NUM; i++)
+                               rte_pause();
+
+               }
+       } else
+               dev->empty_poll_stats[qidx].num = 0;
+
+       return nb_rx;
+}
+
+static uint16_t
+rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+       if (unlikely(nb_rx == 0)) {
+               dev->empty_poll_stats[qidx].num++;
+               if (unlikely(dev->empty_poll_stats[qidx].num >
+                            ETH_EMPTYPOLL_MAX)) {
+
+                       /*scale down freq */
+                       rte_power_freq_min(rte_lcore_id());
+
+               }
+       } else {
+               dev->empty_poll_stats[qidx].num = 0;
+               /* scal up freq */
+               rte_power_freq_max(rte_lcore_id());
+       }
+
+       return nb_rx;
+}
+
+int
+rte_power_pmd_mgmt_enable(unsigned int lcore_id,
+                       uint16_t port_id,
+                       enum rte_eth_dev_power_mgmt_cb_mode mode)
+{
+       struct rte_eth_dev *dev;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+       dev = &rte_eth_devices[port_id];
+
+       if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return -EINVAL;
+       /* allocate memory for empty poll stats */
+       dev->empty_poll_stats = rte_malloc_socket(NULL,
+                                                 sizeof(struct rte_eth_ep_stat)
+                                                 * RTE_MAX_QUEUES_PER_PORT,
+                                                 0, dev->data->numa_node);
+       if (dev->empty_poll_stats == NULL)
+               return -ENOMEM;
+
+       switch (mode) {
+       case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
+               if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG))
+                       return -ENOTSUP;
+               dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
+                                               rte_power_mgmt_umwait, NULL);
+               break;
+       case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
+               /* init scale freq */
+               if (rte_power_init(lcore_id))
+                       return -EINVAL;
+               dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
+                                       rte_power_mgmt_scalefreq, NULL);
+               break;
+       case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
+               dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0,
+                                               rte_power_mgmt_pause, NULL);
+               break;
+       }
+
+       dev->cb_mode = mode;
+       dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
+       return 0;
+}
+
+int
+rte_power_pmd_mgmt_disable(unsigned int lcore_id,
+                               uint16_t port_id)
+{
+       struct rte_eth_dev *dev;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+       dev = &rte_eth_devices[port_id];
+
+       /*add flag check */
+
+       if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_DISABLED)
+               return -EINVAL;
+
+       /* rte_free ignores NULL so safe to call without checks */
+       rte_free(dev->empty_poll_stats);
+
+       switch (dev->cb_mode) {
+       case RTE_ETH_DEV_POWER_MGMT_CB_WAIT:
+       case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
+               rte_eth_remove_rx_callback(port_id, 0,
+                                          dev->cur_pwr_cb);
+               break;
+       case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
+               rte_power_freq_max(lcore_id);
+               rte_eth_remove_rx_callback(port_id, 0,
+                                          dev->cur_pwr_cb);
+               if (rte_power_exit(lcore_id))
+                       return -EINVAL;
+               break;
+       }
+
+       dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
+       dev->cur_pwr_cb = NULL;
+       dev->cb_mode = 0;
+
+       return 0;
+}
diff --git a/lib/librte_power/rte_power_version.map 
b/lib/librte_power/rte_power_version.map
index 00ee5753e2..ade83cfd4f 100644
--- a/lib/librte_power/rte_power_version.map
+++ b/lib/librte_power/rte_power_version.map
@@ -34,4 +34,8 @@ EXPERIMENTAL {
        rte_power_guest_channel_receive_msg;
        rte_power_poll_stat_fetch;
        rte_power_poll_stat_update;
+       # added in 20.08
+       rte_power_pmd_mgmt_disable;
+       rte_power_pmd_mgmt_enable;
+
 };
-- 
2.17.1

Reply via email to