From: Pavan Nikhilesh <pbhagavat...@marvell.com>

Optimize the Rx routine to use AVX2 instructions when the
underlying architecture supports them.

Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
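Reviewer note (not part of the commit): the AVX2 loop below relies on the
per-128-bit-lane behavior of _mm256_shuffle_epi8(): a control byte with its
high bit set (0xFF) zeroes the destination byte, otherwise only its low four
bits are used as a source index within the same lane, so control values 20/21
select lane-relative bytes 4/5. That is how the 16-bit length loaded into the
upper lane lands, with its two bytes swapped, at byte offsets 20-21 and 24-25
of the 32-byte store starting at mbuf->rearm_data (pkt_len and data_len in the
standard rte_mbuf layout), and why _mm256_extract_epi16(data[i], 10) reads the
same length back for the byte counter. A minimal standalone sketch of just the
shuffle semantics (illustrative only, not driver code; build with -mavx2):

    /* Demonstrates the per-lane byte shuffle used by
     * cnxk_ep_process_pkts_vec_avx(); not part of this patch.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <immintrin.h>

    int main(void)
    {
            uint8_t src[32], out[32];
            int i;

            /* Low 128-bit lane holds bytes 0x00..0x0f, high lane 0x10..0x1f. */
            for (i = 0; i < 32; i++)
                    src[i] = (uint8_t)i;

            /* Same mask as the Rx loop: 0xFF zeroes a byte, 20/21 pick
             * lane-relative bytes 4/5, 7..0 copy the low eight bytes as-is.
             */
            const __m256i mask =
                    _mm256_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 20, 21, 0xFF, 0xFF, 20,
                                    21, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                    0xFF, 0xFF, 0xFF, 7, 6, 5, 4, 3, 2, 1, 0);

            __m256i v = _mm256_loadu_si256((const __m256i *)src);
            _mm256_storeu_si256((__m256i *)out, _mm256_shuffle_epi8(v, mask));

            /* Expected: bytes 0-7 copied, 8-19 zero, 20-21 = 15 14, 22-23 zero,
             * 24-25 = 15 14, 26-31 zero, i.e. a 16-bit field duplicated with
             * its bytes swapped, as done for pkt_len/data_len in the driver.
             */
            for (i = 0; i < 32; i++)
                    printf("%02x%c", (unsigned)out[i], (i % 16 == 15) ? '\n' : ' ');
            return 0;
    }

For runtime comparison of the SSE and AVX2 paths, the EAL option
--force-max-simd-bitwidth can cap the allowed SIMD width below 256 bits so
that otx_ep_set_rx_func() keeps the SSE routine even on AVX2-capable CPUs.
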
 doc/guides/rel_notes/release_24_03.rst |   5 +
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 123 +++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build      |  12 +++
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  10 ++
 drivers/net/octeon_ep/otx_ep_rxtx.h    |   6 ++
 5 files changed, 156 insertions(+)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_avx.c

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 6f8ad27808..2191dd78e7 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -55,6 +55,11 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Updated Marvell Octeon ep driver.**
+
+  * Added SSE/AVX2 Rx routines.
+  * Updated Tx queue thresholds.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
new file mode 100644
index 0000000000..ae4615e6da
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+       struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+       uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+       const uint64_t rearm_data = droq->rearm_data;
+       struct rte_mbuf *m[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+       uint32_t pidx[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+       uint32_t idx[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+       uint16_t nb_desc = droq->nb_desc;
+       uint16_t pkts = 0;
+       uint8_t i;
+
+       idx[0] = read_idx;
+       while (pkts < new_pkts) {
+               __m256i data[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+               /* mask to shuffle from desc. to mbuf (2 descriptors)*/
+               const __m256i mask =
+                       _mm256_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 20, 21, 0xFF, 0xFF, 20,
+                                       21, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+                                       0xFF, 0xFF, 0xFF, 7, 6, 5, 4, 3, 2, 1, 0);
+
+               /* Load indexes. */
+               for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                       idx[i] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+
+               /* Prefetch next indexes. */
+               if (new_pkts - pkts > 8) {
+                       pidx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+                       for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                               pidx[i] = otx_ep_incr_index(pidx[i - 1], 1, nb_desc);
+
+                       for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) {
+                               rte_prefetch0(recv_buf_list[pidx[i]]);
+                               rte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[pidx[i]], void *));
+                       }
+               }
+
+               /* Load mbuf array. */
+               for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                       m[i] = recv_buf_list[idx[i]];
+
+               /* Load rearm data and packet length for shuffle. */
+               for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                       data[i] = _mm256_set_epi64x(0,
+                               rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+                               0, rearm_data);
+
+               /* Shuffle data to its place and sum the packet length. */
+               for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) {
+                       data[i] = _mm256_shuffle_epi8(data[i], mask);
+                       bytes_rsvd += _mm256_extract_epi16(data[i], 10);
+               }
+
+               /* Store the 256bit data to the mbuf. */
+               for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                       _mm256_storeu_si256((__m256i *)&m[i]->rearm_data, data[i]);
+
+               for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+                       rx_pkts[pkts++] = m[i];
+               idx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+       }
+       droq->read_idx = idx[0];
+
+       droq->refill_count += new_pkts;
+       droq->pkts_pending -= new_pkts;
+       /* Stats */
+       droq->stats.pkts_received += new_pkts;
+       droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+       struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+       uint16_t new_pkts, vpkts;
+
+       new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+       vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+       cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+       cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+       /* Refill RX buffers */
+       if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+               cnxk_ep_rx_refill(droq);
+
+       return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+       struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+       uint16_t new_pkts, vpkts;
+
+       new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+       vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+       cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+       cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+       /* Refill RX buffers */
+       if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+               cnxk_ep_rx_refill(droq);
+       } else {
+               /* SDP output goes into DROP state when output doorbell count
+                * goes below drop count. When door bell count is written with
+                * a value greater than drop count SDP output should come out
+                * of DROP state. Due to a race condition this is not happening.
+                * Writing doorbell register with 0 again may make SDP output
+                * come out of this state.
+                */
+
+               rte_write32(0, droq->pkts_credit_reg);
+       }
+
+       return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index feba1fdf25..e8ae56018d 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -15,6 +15,18 @@ sources = files(
 
 if arch_subdir == 'x86'
     sources += files('cnxk_ep_rx_sse.c')
+    if cc.get_define('__AVX2__', args: machine_args) != ''
+        cflags += ['-DCC_AVX2_SUPPORT']
+        sources += files('cnxk_ep_rx_avx.c')
+    elif cc.has_argument('-mavx2')
+        cflags += ['-DCC_AVX2_SUPPORT']
+        otx_ep_avx2_lib = static_library('otx_ep_avx2_lib',
+                        'cnxk_ep_rx_avx.c',
+                        dependencies: [static_rte_ethdev, static_rte_pci, static_rte_bus_pci],
+                        include_directories: includes,
+                        c_args: [cflags, '-mavx2'])
+        objs += otx_ep_avx2_lib.extract_objects('cnxk_ep_rx_avx.c')
+    endif
 endif
 
 extra_flags = ['-Wno-strict-aliasing']
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 51b34cdaa0..42a97ea110 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -54,6 +54,11 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
                eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
 #ifdef RTE_ARCH_X86
                eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_sse;
+#ifdef CC_AVX2_SUPPORT
+               if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+                   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+                       eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
+#endif
 #endif
                if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
                        eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -61,6 +66,11 @@ otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
                eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
 #ifdef RTE_ARCH_X86
                eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_sse;
+#ifdef CC_AVX2_SUPPORT
+               if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+                   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+                       eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
+#endif
 #endif
 
                if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index efc41a8275..0adcbc7814 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -51,6 +51,9 @@ cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 uint16_t
 cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
 cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
@@ -60,6 +63,9 @@ cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */
-- 
2.25.1
