From: Long Wu <long...@corigine.com>

Use AVX2 instructions to accelerate Rx performance. The
acceleration only works on X86 machines.

Signed-off-by: Peng Zhang <peng.zh...@corigine.com>
Signed-off-by: Long Wu <long...@corigine.com>
Reviewed-by: Chaoyong He <chaoyong...@corigine.com>
---
 drivers/net/nfp/nfp_ethdev.c        |   2 +-
 drivers/net/nfp/nfp_ethdev_vf.c     |   2 +-
 drivers/net/nfp/nfp_net_meta.c      |   1 +
 drivers/net/nfp/nfp_rxtx.c          |  10 ++
 drivers/net/nfp/nfp_rxtx.h          |   1 +
 drivers/net/nfp/nfp_rxtx_vec.h      |   4 +
 drivers/net/nfp/nfp_rxtx_vec_avx2.c | 252 ++++++++++++++++++++++++++++
 drivers/net/nfp/nfp_rxtx_vec_stub.c |   9 +
 8 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/drivers/net/nfp/nfp_ethdev.c b/drivers/net/nfp/nfp_ethdev.c
index a7b40af712..bd35df2dc9 100644
--- a/drivers/net/nfp/nfp_ethdev.c
+++ b/drivers/net/nfp/nfp_ethdev.c
@@ -969,7 +969,7 @@ nfp_net_ethdev_ops_mount(struct nfp_net_hw *hw,
 
        eth_dev->dev_ops = &nfp_net_eth_dev_ops;
        eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-       eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+       nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_ethdev_vf.c b/drivers/net/nfp/nfp_ethdev_vf.c
index b955624ed6..cdf5da3af7 100644
--- a/drivers/net/nfp/nfp_ethdev_vf.c
+++ b/drivers/net/nfp/nfp_ethdev_vf.c
@@ -245,7 +245,7 @@ nfp_netvf_ethdev_ops_mount(struct nfp_net_hw *hw,
 
        eth_dev->dev_ops = &nfp_netvf_eth_dev_ops;
        eth_dev->rx_queue_count = nfp_net_rx_queue_count;
-       eth_dev->rx_pkt_burst = &nfp_net_recv_pkts;
+       nfp_net_recv_pkts_set(eth_dev);
 }
 
 static int
diff --git a/drivers/net/nfp/nfp_net_meta.c b/drivers/net/nfp/nfp_net_meta.c
index b31ef56f17..07c6758d33 100644
--- a/drivers/net/nfp/nfp_net_meta.c
+++ b/drivers/net/nfp/nfp_net_meta.c
@@ -80,6 +80,7 @@ nfp_net_meta_parse_single(uint8_t *meta_base,
                rte_be32_t meta_header,
                struct nfp_net_meta_parsed *meta)
 {
+       meta->flags = 0;
        meta->flags |= (1 << NFP_NET_META_HASH);
        meta->hash_type = rte_be_to_cpu_32(meta_header);
        meta->hash = rte_be_to_cpu_32(*(rte_be32_t *)(meta_base + 4));
diff --git a/drivers/net/nfp/nfp_rxtx.c b/drivers/net/nfp/nfp_rxtx.c
index 1db79ad1cd..4fc3374987 100644
--- a/drivers/net/nfp/nfp_rxtx.c
+++ b/drivers/net/nfp/nfp_rxtx.c
@@ -17,6 +17,7 @@
 #include "nfp_ipsec.h"
 #include "nfp_logs.h"
 #include "nfp_net_meta.h"
+#include "nfp_rxtx_vec.h"
 
 /*
  * The bit format and map of nfp packet type for rxd.offload_info in Rx 
descriptor.
@@ -867,3 +868,12 @@ nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
        info->conf.offloads = dev_info.tx_offload_capa &
                        dev->data->dev_conf.txmode.offloads;
 }
+
+void
+nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev)
+{
+       if (nfp_net_get_avx2_supported())
+               eth_dev->rx_pkt_burst = nfp_net_vec_avx2_recv_pkts;
+       else
+               eth_dev->rx_pkt_burst = nfp_net_recv_pkts;
+}
diff --git a/drivers/net/nfp/nfp_rxtx.h b/drivers/net/nfp/nfp_rxtx.h
index 3ddf717da0..fff8371991 100644
--- a/drivers/net/nfp/nfp_rxtx.h
+++ b/drivers/net/nfp/nfp_rxtx.h
@@ -244,5 +244,6 @@ void nfp_net_rx_queue_info_get(struct rte_eth_dev *dev,
 void nfp_net_tx_queue_info_get(struct rte_eth_dev *dev,
                uint16_t queue_id,
                struct rte_eth_txq_info *qinfo);
+void nfp_net_recv_pkts_set(struct rte_eth_dev *eth_dev);
 
 #endif /* __NFP_RXTX_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec.h b/drivers/net/nfp/nfp_rxtx_vec.h
index c92660f963..8720662744 100644
--- a/drivers/net/nfp/nfp_rxtx_vec.h
+++ b/drivers/net/nfp/nfp_rxtx_vec.h
@@ -10,4 +10,8 @@
 
 bool nfp_net_get_avx2_supported(void);
 
+uint16_t nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+               struct rte_mbuf **rx_pkts,
+               uint16_t nb_pkts);
+
 #endif /* __NFP_RXTX_VEC_AVX2_H__ */
diff --git a/drivers/net/nfp/nfp_rxtx_vec_avx2.c 
b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
index 50638e74ab..7c18213624 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_avx2.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_avx2.c
@@ -5,9 +5,14 @@
 
 #include <stdbool.h>
 
+#include <bus_pci_driver.h>
+#include <ethdev_driver.h>
 #include <rte_cpuflags.h>
 #include <rte_vect.h>
 
+#include "nfp_logs.h"
+#include "nfp_net_common.h"
+#include "nfp_net_meta.h"
 #include "nfp_rxtx_vec.h"
 
 bool
@@ -19,3 +24,250 @@ nfp_net_get_avx2_supported(void)
 
        return false;
 }
+
+static inline void
+nfp_vec_avx2_recv_set_des1(struct nfp_net_rxq *rxq,
+               struct nfp_net_rx_desc *rxds,
+               struct rte_mbuf *rxb)
+{
+       __m128i dma;
+       __m128i dma_hi;
+       __m128i vaddr0;
+       __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+       dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb->buf_addr), 
hdr_room);
+       dma_hi = _mm_srli_epi64(dma, 32);
+       vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+       _mm_storel_epi64((void *)rxds, vaddr0);
+
+       rxq->rd_p = (rxq->rd_p + 1) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_des4(struct nfp_net_rxq *rxq,
+               struct nfp_net_rx_desc *rxds,
+               struct rte_mbuf **rxb)
+{
+       __m128i dma;
+       __m128i dma_hi;
+       __m128i vaddr0;
+       __m128i vaddr1;
+       __m128i vaddr2;
+       __m128i vaddr3;
+       __m128i vaddr0_1;
+       __m128i vaddr2_3;
+       __m256i vaddr0_3;
+       __m128i hdr_room = _mm_set_epi64x(0, RTE_PKTMBUF_HEADROOM);
+
+       dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[0]->buf_addr), 
hdr_room);
+       dma_hi = _mm_srli_epi64(dma, 32);
+       vaddr0 = _mm_unpacklo_epi32(dma_hi, dma);
+
+       dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[1]->buf_addr), 
hdr_room);
+       dma_hi = _mm_srli_epi64(dma, 32);
+       vaddr1 = _mm_unpacklo_epi32(dma_hi, dma);
+
+       dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[2]->buf_addr), 
hdr_room);
+       dma_hi = _mm_srli_epi64(dma, 32);
+       vaddr2 = _mm_unpacklo_epi32(dma_hi, dma);
+
+       dma = _mm_add_epi64(_mm_loadu_si128((__m128i *)&rxb[3]->buf_addr), 
hdr_room);
+       dma_hi = _mm_srli_epi64(dma, 32);
+       vaddr3 = _mm_unpacklo_epi32(dma_hi, dma);
+
+       vaddr0_1 = _mm_unpacklo_epi64(vaddr0, vaddr1);
+       vaddr2_3 = _mm_unpacklo_epi64(vaddr2, vaddr3);
+
+       vaddr0_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0_1),
+                       vaddr2_3, 1);
+
+       _mm256_store_si256((void *)rxds, vaddr0_3);
+
+       rxq->rd_p = (rxq->rd_p + 4) & (rxq->rx_count - 1);
+}
+
+static inline void
+nfp_vec_avx2_recv_set_rxpkt1(struct nfp_net_rxq *rxq,
+               struct nfp_net_rx_desc *rxds,
+               struct rte_mbuf *rx_pkt)
+{
+       struct nfp_net_hw *hw = rxq->hw;
+       struct nfp_net_meta_parsed meta;
+
+       rx_pkt->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+       /* Size of the whole packet. We just support 1 segment */
+       rx_pkt->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
+
+       /* Filling the received mbuf with packet info */
+       if (hw->rx_offset)
+               rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
+       else
+               rx_pkt->data_off = RTE_PKTMBUF_HEADROOM + 
NFP_DESC_META_LEN(rxds);
+
+       rx_pkt->port = rxq->port_id;
+       rx_pkt->nb_segs = 1;
+       rx_pkt->next = NULL;
+
+       nfp_net_meta_parse(rxds, rxq, hw, rx_pkt, &meta);
+
+       /* Checking the checksum flag */
+       nfp_net_rx_cksum(rxq, rxds, rx_pkt);
+}
+
+static inline void
+nfp_vec_avx2_recv1(struct nfp_net_rxq *rxq,
+               struct nfp_net_rx_desc *rxds,
+               struct rte_mbuf *rxb,
+               struct rte_mbuf *rx_pkt)
+{
+       nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkt);
+
+       nfp_vec_avx2_recv_set_des1(rxq, rxds, rxb);
+}
+
+static inline void
+nfp_vec_avx2_recv4(struct nfp_net_rxq *rxq,
+               struct nfp_net_rx_desc *rxds,
+               struct rte_mbuf **rxb,
+               struct rte_mbuf **rx_pkts)
+{
+       nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds, rx_pkts[0]);
+       nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 1, rx_pkts[1]);
+       nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 2, rx_pkts[2]);
+       nfp_vec_avx2_recv_set_rxpkt1(rxq, rxds + 3, rx_pkts[3]);
+
+       nfp_vec_avx2_recv_set_des4(rxq, rxds, rxb);
+}
+
+static inline bool
+nfp_vec_avx2_recv_check_packets4(struct nfp_net_rx_desc *rxds)
+{
+       __m256i data = _mm256_loadu_si256((void *)rxds);
+
+       if ((_mm256_extract_epi8(data, 3) & PCIE_DESC_RX_DD) == 0 ||
+                       (_mm256_extract_epi8(data, 11) & PCIE_DESC_RX_DD) == 0 
||
+                       (_mm256_extract_epi8(data, 19) & PCIE_DESC_RX_DD) == 0 
||
+                       (_mm256_extract_epi8(data, 27) & PCIE_DESC_RX_DD) == 0)
+               return false;
+
+       return true;
+}
+
+uint16_t
+nfp_net_vec_avx2_recv_pkts(void *rx_queue,
+               struct rte_mbuf **rx_pkts,
+               uint16_t nb_pkts)
+{
+       uint16_t avail;
+       uint16_t nb_hold;
+       bool burst_receive;
+       struct rte_mbuf **rxb;
+       struct nfp_net_rx_desc *rxds;
+       struct nfp_net_rxq *rxq = rx_queue;
+
+       if (unlikely(rxq == NULL)) {
+               PMD_RX_LOG(ERR, "RX Bad queue");
+               return 0;
+       }
+
+       avail = 0;
+       nb_hold = 0;
+       burst_receive = true;
+       while (avail < nb_pkts) {
+               rxds = &rxq->rxds[rxq->rd_p];
+               rxb = &rxq->rxbufs[rxq->rd_p].mbuf;
+
+               if ((_mm_extract_epi8(_mm_loadu_si128((void *)(rxds)), 3)
+                               & PCIE_DESC_RX_DD) == 0)
+                       goto recv_end;
+
+               rte_prefetch0(rxq->rxbufs[rxq->rd_p].mbuf);
+
+               if ((rxq->rd_p & 0x3) == 0) {
+                       rte_prefetch0(&rxq->rxds[rxq->rd_p]);
+                       rte_prefetch0(&rxq->rxbufs[rxq->rd_p]);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 1].mbuf);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 2].mbuf);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 3].mbuf);
+               }
+
+               if ((rxq->rd_p & 0x7) == 0) {
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 4].mbuf);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 5].mbuf);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 6].mbuf);
+                       rte_prefetch0(rxq->rxbufs[rxq->rd_p + 7].mbuf);
+               }
+
+               /*
+                * If we can not receive a burst, just receive one packet:
+                * 1. The Rx ring is coming to the tail.
+                * 2. We do not need to receive 4 packets.
+                * 3. The descriptor address is unaligned on the 32-byte boundary.
+                * 4. The Rx ring does not have 4 packets, or allocating 4 mbufs failed.
+                */
+               if ((rxq->rx_count - rxq->rd_p) < 4 ||
+                               (nb_pkts - avail) < 4 ||
+                               ((uintptr_t)rxds & 0x1F) != 0 ||
+                               !burst_receive) {
+                       _mm_storel_epi64((void *)&rx_pkts[avail],
+                                       _mm_loadu_si128((void *)rxb));
+
+                       /* Allocate a new mbuf into the software ring. */
+                       if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 1) < 0) {
+                               PMD_RX_LOG(DEBUG, "RX mbuf alloc failed 
port_id=%u queue_id=%hu",
+                                               rxq->port_id, rxq->qidx);
+                               nfp_net_mbuf_alloc_failed(rxq);
+                               goto recv_end;
+                       }
+
+                       nfp_vec_avx2_recv1(rxq, rxds, *rxb, rx_pkts[avail]);
+
+                       avail++;
+                       nb_hold++;
+                       continue;
+               }
+
+               burst_receive = nfp_vec_avx2_recv_check_packets4(rxds);
+               if (!burst_receive)
+                       continue;
+
+               _mm256_storeu_si256((void *)&rx_pkts[avail],
+                               _mm256_loadu_si256((void *)rxb));
+
+               /* Allocate 4 new mbufs into the software ring. */
+               if (rte_pktmbuf_alloc_bulk(rxq->mem_pool, rxb, 4) < 0) {
+                       burst_receive = false;
+                       continue;
+               }
+
+               nfp_vec_avx2_recv4(rxq, rxds, rxb, &rx_pkts[avail]);
+
+               avail += 4;
+               nb_hold += 4;
+       }
+
+recv_end:
+       if (nb_hold == 0)
+               return nb_hold;
+
+       PMD_RX_LOG(DEBUG, "RX port_id=%u queue_id=%u, %d packets received",
+                       rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
+
+       nb_hold += rxq->nb_rx_hold;
+
+       /*
+        * FL descriptors need to be written before incrementing the
+        * FL queue WR pointer.
+        */
+       rte_wmb();
+       if (nb_hold > rxq->rx_free_thresh) {
+               PMD_RX_LOG(DEBUG, "port=%hu queue=%hu nb_hold=%hu avail=%hu",
+                               rxq->port_id, rxq->qidx, nb_hold, avail);
+               nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
+               nb_hold = 0;
+       }
+       rxq->nb_rx_hold = nb_hold;
+
+       return avail;
+}
diff --git a/drivers/net/nfp/nfp_rxtx_vec_stub.c 
b/drivers/net/nfp/nfp_rxtx_vec_stub.c
index 1bc55b67e0..c480f61ef0 100644
--- a/drivers/net/nfp/nfp_rxtx_vec_stub.c
+++ b/drivers/net/nfp/nfp_rxtx_vec_stub.c
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 
 #include <rte_common.h>
+#include <rte_mbuf_core.h>
 
 #include "nfp_rxtx_vec.h"
 
@@ -14,3 +15,11 @@ nfp_net_get_avx2_supported(void)
 {
        return false;
 }
+
+uint16_t __rte_weak
+nfp_net_vec_avx2_recv_pkts(__rte_unused void *rx_queue,
+               __rte_unused struct rte_mbuf **rx_pkts,
+               __rte_unused uint16_t nb_pkts)
+{
+       return 0;
+}
-- 
2.39.1

Reply via email to