Hi Zhichao, The "vector" in the subject and commit log should be revised to "SSE" with a more accurate expression.
For the code part, see below. > -----Original Message----- > From: Zeng, ZhichaoX <zhichaox.z...@intel.com> > Sent: Friday, July 22, 2022 5:30 PM > To: dev@dpdk.org > Cc: sta...@dpdk.org; Yang, Qiming <qiming.y...@intel.com>; Zeng, ZhichaoX > <zhichaox.z...@intel.com>; Richardson, Bruce <bruce.richard...@intel.com>; > Konstantin Ananyev <konstantin.v.anan...@yandex.ru>; Wu, Jingjing > <jingjing...@intel.com>; Xing, Beilei <beilei.x...@intel.com>; Zhang, Qi Z > <qi.z.zh...@intel.com>; Rong, Leyi <leyi.r...@intel.com> > Subject: [PATCH] net/iavf: fix processing vlan tci in vector path > > From: Zhichao Zeng <zhichaox.z...@intel.com> > > The vector RX path does not process vlan tci correctly when it's stored in > L2TAG2, > so that the vlan tci could not be extracted from descriptor, then would not be > put into mbuf also. > > Add processing when vlan tci is stored in L2TAG2. > > Fixes: 1162f5a0ef31 ("net/iavf: support flexible Rx descriptor in SSE path") > Cc: sta...@dpdk.org > > Signed-off-by: Zhichao Zeng <zhichaox.z...@intel.com> > --- > drivers/net/iavf/iavf_rxtx_vec_sse.c | 100 ++++++++++++++++++++++----- > 1 file changed, 83 insertions(+), 17 deletions(-) > > diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c > b/drivers/net/iavf/iavf_rxtx_vec_sse.c > index 4a5232c1d2..00ca1f0c39 100644 > --- a/drivers/net/iavf/iavf_rxtx_vec_sse.c > +++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c > @@ -208,15 +208,24 @@ flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3) > return fdir_flags; > } > > +#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC > +static inline void > +flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], __m128i > descs_bh[4], > + struct rte_mbuf **rx_pkts) > +#else > static inline void > flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], > struct rte_mbuf **rx_pkts) > +#endif > { > const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); > __m128i rearm0, rearm1, rearm2, rearm3; > > __m128i tmp_desc, flags, rss_vlan; > > + unsigned long long vlan_mask = 0; > + if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG1) > + vlan_mask = RTE_MBUF_F_RX_VLAN | Be cautious to introduce branch statement in vector path, alternative is splitting the RSS and vlan flag processing. Better to check benchmarking data w/ and w/o this flag. > RTE_MBUF_F_RX_VLAN_STRIPPED; > /* mask everything except checksum, RSS and VLAN flags. > * bit6:4 for checksum. > * bit12 for RSS indication. > @@ -259,8 +268,8 @@ flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, > __m128i descs[4], > const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0, > 0, 0, 0, 0, > 0, 0, 0, 0, > - RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN > | RTE_MBUF_F_RX_VLAN_STRIPPED, > - RTE_MBUF_F_RX_VLAN | > RTE_MBUF_F_RX_VLAN_STRIPPED, > + RTE_MBUF_F_RX_RSS_HASH | vlan_mask, > + vlan_mask, > RTE_MBUF_F_RX_RSS_HASH, 0); > > /* merge 4 descriptors */ > @@ -286,6 +295,39 @@ flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, > __m128i descs[4], > /* merge the flags */ > flags = _mm_or_si128(flags, rss_vlan); > > +#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC > + if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { > + const __m128i l2tag2_mask = > + _mm_set1_epi32(1 << > IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); > + > + const __m128i vlan_tci0_1 = > + _mm_unpacklo_epi32(descs_bh[0], descs_bh[1]); > + const __m128i vlan_tci2_3 = > + _mm_unpacklo_epi32(descs_bh[2], descs_bh[3]); > + const __m128i vlan_tci0_3 = > + _mm_unpacklo_epi64(vlan_tci0_1, vlan_tci2_3); > + > + __m128i vlan_bits = _mm_and_si128(vlan_tci0_3, l2tag2_mask); > + > + vlan_bits = _mm_srli_epi32(vlan_bits, > + > IAVF_RX_FLEX_DESC_STATUS1_L2TAG2P_S); > + > + const __m128i vlan_flags_shuf = > + _mm_set_epi8(0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, 0, 0, > + 0, 0, > + RTE_MBUF_F_RX_VLAN | > + RTE_MBUF_F_RX_VLAN_STRIPPED, > + 0); > + > + const __m128i vlan_flags = _mm_shuffle_epi8(vlan_flags_shuf, > +vlan_bits); > + > + /* merge with vlan_flags */ > + flags = _mm_or_si128(flags, vlan_flags); > + } > +#endif > + > if (rxq->fdir_enabled) { > const __m128i fdir_id0_1 = > _mm_unpackhi_epi32(descs[0], descs[1]); @@ -748,6 > +790,9 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq, > pos += IAVF_VPMD_DESCS_PER_LOOP, > rxdp += IAVF_VPMD_DESCS_PER_LOOP) { > __m128i descs[IAVF_VPMD_DESCS_PER_LOOP]; > +#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC > + __m128i descs_bh[IAVF_VPMD_DESCS_PER_LOOP]; > +#endif > __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3; > __m128i staterr, sterr_tmp1, sterr_tmp2; > /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */ @@ - > 806,8 +851,6 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq, > /* C.1 4=>2 filter staterr info only */ > sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]); > > - flex_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); > - > /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ > pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); > pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); @@ -821,36 > +864,35 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq, > * needs to load 2nd 16B of each desc for RSS hash parsing, > * will cause performance drop to get into this context. > */ > - if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) { > + if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH || > + rxq->rx_flags & > IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) { > /* load bottom half of every 32B desc */ > - const __m128i raw_desc_bh3 = > - _mm_load_si128 > + descs_bh[3] = _mm_load_si128 > ((void *)(&rxdp[3].wb.status_error1)); > rte_compiler_barrier(); > - const __m128i raw_desc_bh2 = > - _mm_load_si128 > + descs_bh[2] = _mm_load_si128 > ((void *)(&rxdp[2].wb.status_error1)); > rte_compiler_barrier(); > - const __m128i raw_desc_bh1 = > - _mm_load_si128 > + descs_bh[1] = _mm_load_si128 > ((void *)(&rxdp[1].wb.status_error1)); > rte_compiler_barrier(); > - const __m128i raw_desc_bh0 = > - _mm_load_si128 > + descs_bh[0] = _mm_load_si128 > ((void *)(&rxdp[0].wb.status_error1)); > + } > > + if (offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH) { > /** > * to shift the 32b RSS hash value to the > * highest 32b of each 128b before mask > */ > __m128i rss_hash3 = > - _mm_slli_epi64(raw_desc_bh3, 32); > + _mm_slli_epi64(descs_bh[3], 32); > __m128i rss_hash2 = > - _mm_slli_epi64(raw_desc_bh2, 32); > + _mm_slli_epi64(descs_bh[2], 32); > __m128i rss_hash1 = > - _mm_slli_epi64(raw_desc_bh1, 32); > + _mm_slli_epi64(descs_bh[1], 32); > __m128i rss_hash0 = > - _mm_slli_epi64(raw_desc_bh0, 32); > + _mm_slli_epi64(descs_bh[0], 32); > > __m128i rss_hash_msk = > _mm_set_epi32(0xFFFFFFFF, 0, 0, 0); @@ - > 869,6 +911,30 @@ _recv_raw_pkts_vec_flex_rxd(struct iavf_rx_queue *rxq, > pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1); > pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0); > } /* if() on RSS hash parsing */ > + > + if (rxq->rx_flags & IAVF_RX_FLAGS_VLAN_TAG_LOC_L2TAG2_2) > { > + /* L2TAG2_2 */ > + __m128i vlan_tci3 = _mm_slli_si128(descs_bh[3], 4); > + __m128i vlan_tci2 = _mm_slli_si128(descs_bh[2], 4); > + __m128i vlan_tci1 = _mm_slli_si128(descs_bh[1], 4); > + __m128i vlan_tci0 = _mm_slli_si128(descs_bh[0], 4); > + > + const __m128i vlan_tci_msk = _mm_set_epi32(0, > 0xFFFF0000, 0, 0); > + > + vlan_tci3 = _mm_and_si128(vlan_tci3, vlan_tci_msk); > + vlan_tci2 = _mm_and_si128(vlan_tci2, vlan_tci_msk); > + vlan_tci1 = _mm_and_si128(vlan_tci1, vlan_tci_msk); > + vlan_tci0 = _mm_and_si128(vlan_tci0, vlan_tci_msk); > + > + pkt_mb3 = _mm_or_si128(pkt_mb3, vlan_tci3); > + pkt_mb2 = _mm_or_si128(pkt_mb2, vlan_tci2); > + pkt_mb1 = _mm_or_si128(pkt_mb1, vlan_tci1); > + pkt_mb0 = _mm_or_si128(pkt_mb0, vlan_tci0); > + } > + > + flex_desc_to_olflags_v(rxq, descs, descs_bh, &rx_pkts[pos]); > #else > + flex_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); > #endif > > /* C.2 get 4 pkts staterr value */ > -- > 2.25.1