On Sat, Nov 25, 2023 at 10:52 PM <pbhagavat...@marvell.com> wrote: > > From: Pavan Nikhilesh <pbhagavat...@marvell.com> > > Optimize Rx routine to use SSE instructions. > > Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com> > ---
> diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c > b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c > new file mode 100644 > index 0000000000..531f75a2e0 > --- /dev/null > +++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c > @@ -0,0 +1,124 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(C) 2023 Marvell. > + */ > + > +#include "cnxk_ep_rx.h" > + > +static __rte_always_inline uint32_t > +hadd(__m128i x) > +{ > + __m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); > + __m128i sum64 = _mm_add_epi32(hi64, x); > + __m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2)); > + __m128i sum32 = _mm_add_epi32(sum64, hi32); > + return _mm_cvtsi128_si32(sum32); > +} > + > +static __rte_always_inline void > +cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq > *droq, uint16_t new_pkts) > +{ > + struct rte_mbuf **recv_buf_list = droq->recv_buf_list; > + uint32_t bytes_rsvd = 0, read_idx = droq->read_idx; > + uint32_t idx0, idx1, idx2, idx3; > + struct rte_mbuf *m0, *m1, *m2, *m3; > + uint16_t nb_desc = droq->nb_desc; > + uint16_t pkts = 0; > + > + idx0 = read_idx; > + while (pkts < new_pkts) { > + const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, > 0xFF, 0xFF, 8, 9, 0xFF, > + 0xFF, 4, 5, 0xFF, > 0xFF, 0, 1); > + const __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, > 0xFF, 9, 8, 0xFF, > + 0xFF, 1, 0, 0xFF, 0xFF, > 1, 0); > + __m128i s01, s23; > + > + idx1 = otx_ep_incr_index(idx0, 1, nb_desc); > + idx2 = otx_ep_incr_index(idx1, 1, nb_desc); > + idx3 = otx_ep_incr_index(idx2, 1, nb_desc); > + > + m0 = recv_buf_list[idx0]; > + m1 = recv_buf_list[idx1]; > + m2 = recv_buf_list[idx2]; > + m3 = recv_buf_list[idx3]; > + Please add comments explaining the SSE operations used in this section. > + s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct > otx_ep_droq_info *)->length >> 48, > + rte_pktmbuf_mtod(m1, struct > otx_ep_droq_info *)->length >> 48, > + rte_pktmbuf_mtod(m2, struct > otx_ep_droq_info *)->length >> 48, > + 
rte_pktmbuf_mtod(m0, struct > otx_ep_droq_info *)->length >> 48); > + s01 = _mm_shuffle_epi8(s01, bswap_mask); > + bytes_rsvd += hadd(s01); > + s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1)); > + s01 = _mm_shuffle_epi8(s01, cpy_mask); > + s23 = _mm_shuffle_epi8(s23, cpy_mask); > diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h > b/drivers/net/octeon_ep/otx_ep_rxtx.h > index b159c32cae..af657dba50 100644 > --- a/drivers/net/octeon_ep/otx_ep_rxtx.h > +++ b/drivers/net/octeon_ep/otx_ep_rxtx.h > @@ -48,12 +48,22 @@ cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf > **pkts, uint16_t nb_pkts) > uint16_t > cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > > +#ifdef RTE_ARCH_X86 The #ifdef guard is not needed around a function declaration and can be dropped. The same comment applies to the AVX declarations. > +uint16_t > +cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > +#endif > + > uint16_t > cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > > uint16_t > cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > > +#ifdef RTE_ARCH_X86 The #ifdef guard is not needed around a function declaration and can be dropped. The same comment applies to the AVX declarations. > +uint16_t > +cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > +#endif > + > uint16_t > cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t > budget); > #endif /* _OTX_EP_RXTX_H_ */ > -- > 2.25.1 >