Add a simple on/off switch that enables saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when that number reaches a certain threshold, entering an
architecture-defined optimized power state that waits until either
a TSC timestamp expires or packets arrive.
This API is limited to the one-core, one-queue use case, as there is
no coordination between queues/cores in ethdev.

This design leverages the RX callback mechanism, which allows three
different power management methodologies to coexist.

1. umwait/umonitor:

   The TSC timestamp is automatically calculated using current
   link speed and RX descriptor ring size, such that the sleep
   time is not longer than it would take for a NIC to fill its
   entire RX descriptor ring.

2. Pause instruction

   Instead of moving the core into a deeper C-state, this lightweight
   method uses the Pause instruction to relieve the processor from
   busy polling.

3. Frequency Scaling
   Reuse the existing rte_power library to scale the core frequency
   up/down depending on traffic volume.

Signed-off-by: Liang Ma <liang.j...@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
 config/common_base                       |   4 +-
 lib/Makefile                             |   1 +
 lib/librte_ethdev/Makefile               |   2 +-
 lib/librte_ethdev/meson.build            |   2 +-
 lib/librte_ethdev/rte_ethdev.c           | 198 +++++++++++++++++++++++
 lib/librte_ethdev/rte_ethdev.h           |  59 +++++++
 lib/librte_ethdev/rte_ethdev_core.h      |  43 ++++-
 lib/librte_ethdev/rte_ethdev_version.map |   4 +
 lib/meson.build                          |   5 +-
 mk/rte.app.mk                            |   2 +-
 10 files changed, 311 insertions(+), 9 deletions(-)

diff --git a/config/common_base b/config/common_base
index f76585f16..e0948f0cb 100644
--- a/config/common_base
+++ b/config/common_base
@@ -155,7 +155,7 @@ CONFIG_RTE_MAX_ETHPORTS=32
 CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
 CONFIG_RTE_LIBRTE_IEEE1588=n
 CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
-CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
+CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=n
 CONFIG_RTE_ETHDEV_PROFILE_WITH_VTUNE=n
 
 #
@@ -978,7 +978,7 @@ CONFIG_RTE_LIBRTE_ACL_DEBUG=n
 #
 # Compile librte_power
 #
-CONFIG_RTE_LIBRTE_POWER=n
+CONFIG_RTE_LIBRTE_POWER=y
 CONFIG_RTE_LIBRTE_POWER_DEBUG=n
 CONFIG_RTE_MAX_LCORE_FREQS=64
 
diff --git a/lib/Makefile b/lib/Makefile
index 8f5b68a2d..87646698a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -28,6 +28,7 @@ DEPDIRS-librte_ethdev := librte_net librte_eal librte_mempool 
librte_ring
 DEPDIRS-librte_ethdev += librte_mbuf
 DEPDIRS-librte_ethdev += librte_kvargs
 DEPDIRS-librte_ethdev += librte_meter
+DEPDIRS-librte_ethdev += librte_power
 DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += librte_bbdev
 DEPDIRS-librte_bbdev := librte_eal librte_mempool librte_mbuf
 DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += librte_cryptodev
diff --git a/lib/librte_ethdev/Makefile b/lib/librte_ethdev/Makefile
index 47747150b..6a4ce14cf 100644
--- a/lib/librte_ethdev/Makefile
+++ b/lib/librte_ethdev/Makefile
@@ -11,7 +11,7 @@ LIB = librte_ethdev.a
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_net -lrte_eal -lrte_mempool -lrte_ring
-LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_meter -lrte_telemetry
+LDLIBS += -lrte_mbuf -lrte_kvargs -lrte_meter -lrte_telemetry -lrte_power
 
 EXPORT_MAP := rte_ethdev_version.map
 
diff --git a/lib/librte_ethdev/meson.build b/lib/librte_ethdev/meson.build
index 8fc24e8c8..e09e2395e 100644
--- a/lib/librte_ethdev/meson.build
+++ b/lib/librte_ethdev/meson.build
@@ -27,4 +27,4 @@ headers = files('rte_ethdev.h',
        'rte_tm.h',
        'rte_tm_driver.h')
 
-deps += ['net', 'kvargs', 'meter', 'telemetry']
+deps += ['net', 'kvargs', 'meter', 'telemetry', 'power']
diff --git a/lib/librte_ethdev/rte_ethdev.c b/lib/librte_ethdev/rte_ethdev.c
index 7858ad5f1..b43de88ce 100644
--- a/lib/librte_ethdev/rte_ethdev.c
+++ b/lib/librte_ethdev/rte_ethdev.c
@@ -16,6 +16,7 @@
 #include <netinet/in.h>
 
 #include <rte_byteorder.h>
+#include <rte_cpuflags.h>
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_interrupts.h>
@@ -39,6 +40,7 @@
 #include <rte_class.h>
 #include <rte_ether.h>
 #include <rte_telemetry.h>
+#include <rte_power.h>
 
 #include "rte_ethdev_trace.h"
 #include "rte_ethdev.h"
@@ -185,6 +187,100 @@ enum {
        STAT_QMAP_RX
 };
 
+
+/**
+ * Rx callback implementing UMWAIT/UMONITOR based power management.
+ *
+ * Counts consecutive empty polls on the queue. Once the count exceeds
+ * ETH_EMPTYPOLL_MAX, the driver is asked for the address of the next Rx
+ * descriptor and rte_power_monitor() is called on it. Any non-empty
+ * poll resets the counter.
+ *
+ * @param port_id
+ *   Port the callback is installed on.
+ * @param qidx
+ *   Rx queue index being polled.
+ * @param nb_rx
+ *   Number of packets returned by the preceding Rx burst.
+ *
+ * @return
+ *   nb_rx, unchanged. The value returned by an Rx callback is what
+ *   rte_eth_rx_burst() reports to its caller, so returning less would
+ *   hide packets that were already received.
+ */
+static uint16_t
+rte_ethdev_pmgmt_umait(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+       if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return nb_rx;
+
+       if (nb_rx != 0) {
+               /* traffic seen: restart the empty poll count */
+               dev->empty_poll_stats[qidx].num = 0;
+               return nb_rx;
+       }
+
+       dev->empty_poll_stats[qidx].num++;
+       if (unlikely(dev->empty_poll_stats[qidx].num > ETH_EMPTYPOLL_MAX) &&
+                       dev->dev_ops->next_rx_desc != NULL) {
+               volatile void *target_addr;
+               uint64_t expected, mask;
+               int ret;
+
+               /*
+                * get address of next descriptor in the RX
+                * ring for this queue, as well as expected
+                * value and a mask.
+                */
+               ret = (*dev->dev_ops->next_rx_desc)
+                       (dev->data->rx_queues[qidx],
+                        &target_addr, &expected, &mask);
+               if (ret == 0)
+                       /* -1ULL is maximum value for TSC */
+                       rte_power_monitor(target_addr,
+                                         expected, mask,
+                                         0, -1ULL);
+       }
+
+       return nb_rx;
+}
+
+/**
+ * Rx callback implementing pause-instruction based power management.
+ *
+ * Counts consecutive empty polls on the queue. Once the count exceeds
+ * ETH_EMPTYPOLL_MAX, every further empty poll issues RTE_ETH_PAUSE_NUM
+ * rte_pause() calls. Any non-empty poll resets the counter.
+ *
+ * @param port_id
+ *   Port the callback is installed on.
+ * @param qidx
+ *   Rx queue index being polled.
+ * @param nb_rx
+ *   Number of packets returned by the preceding Rx burst.
+ *
+ * @return
+ *   nb_rx, unchanged. The value returned by an Rx callback is what
+ *   rte_eth_rx_burst() reports to its caller, so returning less would
+ *   hide packets that were already received.
+ */
+static uint16_t
+rte_ethdev_pmgmt_pause(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+       int i;
+
+       if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return nb_rx;
+
+       if (nb_rx != 0) {
+               /* traffic seen: restart the empty poll count */
+               dev->empty_poll_stats[qidx].num = 0;
+               return nb_rx;
+       }
+
+       dev->empty_poll_stats[qidx].num++;
+       if (unlikely(dev->empty_poll_stats[qidx].num > ETH_EMPTYPOLL_MAX)) {
+               for (i = 0; i < RTE_ETH_PAUSE_NUM; i++)
+                       rte_pause();
+       }
+
+       return nb_rx;
+}
+
+/**
+ * Rx callback implementing frequency-scaling based power management.
+ *
+ * Counts consecutive empty polls on the queue. Once the count exceeds
+ * ETH_EMPTYPOLL_MAX, the polling lcore is scaled down to its minimum
+ * frequency. Any non-empty poll resets the counter and scales the lcore
+ * back up to its maximum frequency.
+ *
+ * @param port_id
+ *   Port the callback is installed on.
+ * @param qidx
+ *   Rx queue index being polled.
+ * @param nb_rx
+ *   Number of packets returned by the preceding Rx burst.
+ *
+ * @return
+ *   nb_rx, unchanged. The value returned by an Rx callback is what
+ *   rte_eth_rx_burst() reports to its caller, so returning less would
+ *   hide packets that were already received.
+ */
+static uint16_t
+rte_ethdev_pmgmt_scalefreq(uint16_t port_id, uint16_t qidx,
+               struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+               uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+       struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+       if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return nb_rx;
+
+       if (nb_rx != 0) {
+               dev->empty_poll_stats[qidx].num = 0;
+               /* scale up freq */
+               rte_power_freq_max(rte_lcore_id());
+               return nb_rx;
+       }
+
+       dev->empty_poll_stats[qidx].num++;
+       if (unlikely(dev->empty_poll_stats[qidx].num > ETH_EMPTYPOLL_MAX)) {
+               /* scale down freq */
+               rte_power_freq_min(rte_lcore_id());
+       }
+
+       return nb_rx;
+}
+
 int
 rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs_str)
 {
@@ -5113,6 +5209,108 @@ rte_eth_dev_pool_ops_supported(uint16_t port_id, const 
char *pool)
        return (*dev->dev_ops->pool_ops_supported)(dev, pool);
 }
 
+/*
+ * Enable empty-poll based power management on Rx queue 0 of a port.
+ *
+ * All validation is done before any resource is acquired, and every
+ * failure after that point releases what was acquired, so a failed call
+ * leaves the device exactly as it was.
+ *
+ * Returns 0 on success, -EINVAL for an invalid port, an unknown mode, an
+ * already enabled port or a failed rte_power_init(), -ENOTSUP when the
+ * UMWAIT mode is requested but the CPU or the driver cannot support it,
+ * -ENOMEM when the counters cannot be allocated, or the negated
+ * rte_errno of rte_eth_add_rx_callback() when the callback cannot be
+ * installed.
+ */
+int
+rte_eth_dev_power_mgmt_enable(unsigned int lcore_id,
+                             uint16_t port_id,
+                        enum rte_eth_dev_power_mgmt_cb_mode mode)
+{
+       struct rte_eth_dev *dev;
+       struct rte_eth_ep_stat *stats;
+       rte_rx_callback_fn cb_fn;
+       int ret;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+       dev = &rte_eth_devices[port_id];
+
+       /* check the state before acquiring anything, so nothing leaks */
+       if (dev->pwr_mgmt_state == RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return -EINVAL;
+
+       switch (mode) {
+       case RTE_ETH_DEV_POWER_MGMT_CB_UMWAIT:
+               /* a negative result is a lookup error, not "supported" */
+               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG) <= 0)
+                       return -ENOTSUP;
+               /* the callback needs the driver to expose its descriptor */
+               if (dev->dev_ops->next_rx_desc == NULL)
+                       return -ENOTSUP;
+               cb_fn = rte_ethdev_pmgmt_umait;
+               break;
+       case RTE_ETH_DEV_POWER_MGMT_CB_PAUSE:
+               cb_fn = rte_ethdev_pmgmt_pause;
+               break;
+       case RTE_ETH_DEV_POWER_MGMT_CB_SCALE:
+               cb_fn = rte_ethdev_pmgmt_scalefreq;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* zeroed, so every queue starts with an empty poll count of 0 */
+       stats = rte_zmalloc_socket(NULL,
+                                  sizeof(*stats) * RTE_MAX_QUEUES_PER_PORT,
+                                  0, dev->data->numa_node);
+       if (stats == NULL)
+               return -ENOMEM;
+
+       /* init scale freq */
+       if (mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE &&
+                       rte_power_init(lcore_id) != 0) {
+               ret = -EINVAL;
+               goto err_free_stats;
+       }
+
+       /* publish the counters before the callback can be invoked */
+       dev->empty_poll_stats = stats;
+       dev->cb_mode = mode;
+
+       dev->cur_pwr_cb = rte_eth_add_rx_callback(port_id, 0, cb_fn, NULL);
+       if (dev->cur_pwr_cb == NULL) {
+               ret = rte_errno != 0 ? -rte_errno : -EINVAL;
+               goto err_power_exit;
+       }
+
+       /* the callbacks stay idle until this flag is set */
+       dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_ENABLED;
+       return 0;
+
+err_power_exit:
+       if (mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE)
+               rte_power_exit(lcore_id);
+err_free_stats:
+       dev->empty_poll_stats = NULL;
+       rte_free(stats);
+       return ret;
+}
+
+/*
+ * Disable power management previously enabled on a port.
+ *
+ * Teardown order matters: the state flag is cleared and the callback is
+ * unhooked before the counters are released, so the callback never
+ * starts a new access to freed memory. The teardown always runs to
+ * completion; the first error encountered is reported afterwards, which
+ * keeps a second call from freeing the counters twice.
+ *
+ * NOTE(review): an Rx burst already running on another lcore may still
+ * be inside the callback when the counters are freed; presumably the
+ * caller has to stop polling the queue first - TODO confirm.
+ *
+ * Returns 0 on success or when power management was not enabled,
+ * -EINVAL for an invalid port or a failed rte_power_exit(), or the error
+ * returned by rte_eth_remove_rx_callback().
+ */
+int
+rte_eth_dev_power_mgmt_disable(unsigned int lcore_id,
+                              uint16_t port_id)
+{
+       struct rte_eth_dev *dev;
+       int ret;
+
+       RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+       dev = &rte_eth_devices[port_id];
+
+       if (dev->pwr_mgmt_state != RTE_ETH_DEV_POWER_MGMT_ENABLED)
+               return 0;
+
+       /* make the callback a no-op before anything is torn down */
+       dev->pwr_mgmt_state = RTE_ETH_DEV_POWER_MGMT_DISABLED;
+
+       ret = rte_eth_remove_rx_callback(port_id, 0, dev->cur_pwr_cb);
+       dev->cur_pwr_cb = NULL;
+
+       if (dev->cb_mode == RTE_ETH_DEV_POWER_MGMT_CB_SCALE) {
+               /* leave the lcore at full speed before releasing it */
+               rte_power_freq_max(lcore_id);
+               if (rte_power_exit(lcore_id) != 0 && ret == 0)
+                       ret = -EINVAL;
+       }
+
+       /* rte_free ignores NULL so safe to call without checks */
+       rte_free(dev->empty_poll_stats);
+       dev->empty_poll_stats = NULL;
+
+       return ret;
+}
+
 /**
  * A set of values to describe the possible states of a switch domain.
  */
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index 57e4a6ca5..6858c0338 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -157,6 +157,7 @@ extern "C" {
 #include <rte_common.h>
 #include <rte_config.h>
 #include <rte_ether.h>
+#include <rte_power_intrinsics.h>
 
 #include "rte_ethdev_trace_fp.h"
 #include "rte_dev_info.h"
@@ -775,6 +776,7 @@ rte_eth_rss_hf_refine(uint64_t rss_hf)
 /** Maximum nb. of vlan per mirror rule */
 #define ETH_MIRROR_MAX_VLANS       64
 
+#define ETH_EMPTYPOLL_MAX          512 /**< Empty poll number threshold */
 #define ETH_MIRROR_VIRTUAL_POOL_UP     0x01  /**< Virtual Pool uplink 
Mirroring. */
 #define ETH_MIRROR_UPLINK_PORT         0x02  /**< Uplink Port Mirroring. */
 #define ETH_MIRROR_DOWNLINK_PORT       0x04  /**< Downlink Port Mirroring. */
@@ -1603,6 +1605,25 @@ enum rte_eth_dev_state {
        RTE_ETH_DEV_REMOVED,
 };
 
+#define  RTE_ETH_PAUSE_NUM  64    /**< rte_pause() calls per idle poll */
+/**
+ * Possible power management states of an ethdev port.
+ *
+ * The power management Rx callbacks only act while the port is in the
+ * enabled state.
+ */
+enum rte_eth_dev_power_mgmt_state {
+       /** Device power management is disabled. */
+       RTE_ETH_DEV_POWER_MGMT_DISABLED = 0,
+       /** Device power management is enabled. */
+       RTE_ETH_DEV_POWER_MGMT_ENABLED,
+};
+
+/**
+ * Power management schemes that can be installed as an Rx callback.
+ */
+enum rte_eth_dev_power_mgmt_cb_mode {
+       /** Monitor the next Rx descriptor with rte_power_monitor(). */
+       RTE_ETH_DEV_POWER_MGMT_CB_UMWAIT = 0,
+       /** Issue a burst of rte_pause() calls while the queue is idle. */
+       RTE_ETH_DEV_POWER_MGMT_CB_PAUSE,
+       /** Scale the lcore frequency down while idle, up on traffic. */
+       RTE_ETH_DEV_POWER_MGMT_CB_SCALE,
+};
+
 struct rte_eth_dev_sriov {
        uint8_t active;               /**< SRIOV is active with 16, 32 or 64 
pools */
        uint8_t nb_q_per_pool;        /**< rx queue number per pool */
@@ -4415,6 +4436,40 @@ __rte_experimental
 int rte_eth_dev_hairpin_capability_get(uint16_t port_id,
                                       struct rte_eth_hairpin_cap *cap);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Enable device power management.
+ *
+ * The selected scheme is installed as an Rx callback on queue 0 of the
+ * port.
+ *
+ * @param lcore_id
+ *   The lcore handed to rte_power_init() when @p mode is
+ *   RTE_ETH_DEV_POWER_MGMT_CB_SCALE; not used by the other modes.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param mode
+ *   The power management scheme to use.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_enable(unsigned int lcore_id,
+                                 uint16_t port_id,
+                                 enum rte_eth_dev_power_mgmt_cb_mode mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Disable device power management.
+ *
+ * Calling this on a port where power management is not enabled does
+ * nothing and reports success.
+ *
+ * @param lcore_id
+ *   The lcore whose frequency is restored to maximum and released with
+ *   rte_power_exit() when the frequency scaling mode was in use; not
+ *   used by the other modes.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+__rte_experimental
+int rte_eth_dev_power_mgmt_disable(unsigned int lcore_id, uint16_t port_id);
+
 #include <rte_ethdev_core.h>
 
 /**
@@ -4535,6 +4590,7 @@ rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
        return nb_rx;
 }
 
+
 /**
  * Get the number of used descriptors of a rx queue
  *
@@ -4993,6 +5049,9 @@ rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id,
        return rte_eth_tx_buffer_flush(port_id, queue_id, buffer);
 }
 
+
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_ethdev/rte_ethdev_core.h 
b/lib/librte_ethdev/rte_ethdev_core.h
index 32407dd41..7d6d85ddc 100644
--- a/lib/librte_ethdev/rte_ethdev_core.h
+++ b/lib/librte_ethdev/rte_ethdev_core.h
@@ -603,6 +603,27 @@ typedef int (*eth_tx_hairpin_queue_setup_t)
         uint16_t nb_tx_desc,
         const struct rte_eth_hairpin_conf *hairpin_conf);
 
+/**
+ * @internal
+ * Get the next RX ring descriptor address.
+ *
+ * The three output values are passed together to rte_power_monitor()
+ * by the UMWAIT power management callback.
+ *
+ * @param rxq
+ *   ethdev queue pointer.
+ * @param tail_desc_addr
+ *   Output: address of the next descriptor in the RX ring of the queue.
+ * @param expected
+ *   Output: the expected value associated with that descriptor.
+ * @param mask
+ *   Output: the mask associated with the expected value.
+ *
+ * @return
+ *   Negative errno value on error, 0 on success.
+ *
+ * @retval 0
+ *   Success.
+ * @retval -EINVAL
+ *   Failed to get descriptor address.
+ */
+typedef int (*eth_next_rx_desc_t)
+       (void *rxq, volatile void **tail_desc_addr,
+        uint64_t *expected, uint64_t *mask);
+
 /**
  * @internal A structure containing the functions exported by an Ethernet 
driver.
  */
@@ -752,6 +773,8 @@ struct eth_dev_ops {
        /**< Set up device RX hairpin queue. */
        eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
        /**< Set up device TX hairpin queue. */
+       eth_next_rx_desc_t next_rx_desc;
+       /**< Get next RX ring descriptor address. */
 };
 
 /**
@@ -768,6 +791,14 @@ struct rte_eth_rxtx_callback {
        void *param;
 };
 
+/**
+ * @internal
+ * Structure used to hold counters for empty poll
+ *
+ * One instance is kept per RX queue; the structure is cache aligned.
+ */
+struct rte_eth_ep_stat {
+       /** Consecutive empty polls; reset to 0 by a non-empty poll. */
+       uint64_t num;
+} __rte_cache_aligned;
+
 /**
  * @internal
  * The generic data structure associated with each ethernet device.
@@ -807,8 +838,16 @@ struct rte_eth_dev {
        enum rte_eth_dev_state state; /**< Flag indicating the port state */
        void *security_ctx; /**< Context for security ops */
 
-       uint64_t reserved_64s[4]; /**< Reserved for future fields */
-       void *reserved_ptrs[4];   /**< Reserved for future fields */
+       /** Flag indicating the port power management state */
+       enum rte_eth_dev_power_mgmt_state pwr_mgmt_state;
+       /** Power management scheme selected when it was enabled */
+       enum rte_eth_dev_power_mgmt_cb_mode cb_mode;
+       /*
+        * NOTE(review): the two enums above presumably already fill the
+        * 8 bytes of the reserved_64s[] slot they replace, in which case
+        * reserved_32 shifts every later field - TODO confirm layout.
+        */
+       uint32_t reserved_32;
+       uint64_t reserved_64s[3]; /**< Reserved for future fields */
+
+       /** Per-queue empty poll counters, allocated on enable */
+       struct rte_eth_ep_stat *empty_poll_stats;
+       /** RX callback currently installed for power management */
+       const struct rte_eth_rxtx_callback *cur_pwr_cb;
+       void *reserved_ptrs[3];   /**< Reserved for future fields */
 } __rte_cache_aligned;
 
 struct rte_eth_dev_sriov;
diff --git a/lib/librte_ethdev/rte_ethdev_version.map 
b/lib/librte_ethdev/rte_ethdev_version.map
index 1212a17d3..4d5b63a5b 100644
--- a/lib/librte_ethdev/rte_ethdev_version.map
+++ b/lib/librte_ethdev/rte_ethdev_version.map
@@ -241,6 +241,10 @@ EXPERIMENTAL {
        __rte_ethdev_trace_rx_burst;
        __rte_ethdev_trace_tx_burst;
        rte_flow_get_aged_flows;
+
+       # added in 20.08
+       rte_eth_dev_power_mgmt_disable;
+       rte_eth_dev_power_mgmt_enable;
 };
 
 INTERNAL {
diff --git a/lib/meson.build b/lib/meson.build
index 3852c0156..54cc0db7d 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -14,17 +14,18 @@ libraries = [
        'eal', # everything depends on eal
        'ring',
        'rcu', # rcu depends on ring
+       'timer',   # eventdev depends on this
+       'power',   # ethdev depends on this
        'mempool', 'mbuf', 'net', 'meter', 'ethdev', 'pci', # core
        'cmdline',
        'metrics', # bitrate/latency stats depends on this
        'hash',    # efd depends on this
-       'timer',   # eventdev depends on this
        'acl', 'bbdev', 'bitratestats', 'cfgfile',
        'compressdev', 'cryptodev',
        'distributor', 'efd', 'eventdev',
        'gro', 'gso', 'ip_frag', 'jobstats',
        'kni', 'latencystats', 'lpm', 'member',
-       'power', 'pdump', 'rawdev', 'regexdev',
+       'pdump', 'rawdev', 'regexdev',
        'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
        # ipsec lib depends on net, crypto and security
        'ipsec',
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index a54425997..b87abb26e 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -58,7 +58,6 @@ endif
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METRICS)        += --no-whole-archive
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BITRATE)        += -lrte_bitratestats
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS)  += -lrte_latencystats
-_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BPF)            += -lrte_bpf
@@ -80,6 +79,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_KVARGS)         += -lrte_kvargs
 _LDLIBS-y                                   += -lrte_telemetry
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MBUF)           += -lrte_mbuf
 _LDLIBS-$(CONFIG_RTE_LIBRTE_NET)            += -lrte_net
+_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER)          += -lrte_power
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ETHER)          += -lrte_ethdev
 _LDLIBS-$(CONFIG_RTE_LIBRTE_BBDEV)          += -lrte_bbdev
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CRYPTODEV)      += -lrte_cryptodev
-- 
2.17.1

Reply via email to