When receiving tunneled packets, the testpmd output log shows 'ol_flags' value always as 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_UNKNOWN', but expected value should be 'RX_OUTER_L4_CKSUM_GOOD' or 'RX_OUTER_L4_CKSUM_BAD'.
Adding 'RX_OUTER_L4_CKSUM_GOOD' and 'RX_OUTER_L4_CKSUM_BAD' to 'flags' for normal path, 'l3_l4_flags_shuf' for AVX2 and AVX512 vector path and 'cksum_flags' for SSE vector path to ensure that the 'ol_flags' can match correct flags. Fixes: b8b4c54ef9b0 ("net/iavf: support flexible Rx descriptor in normal path") Fixes: 1162f5a0ef31 ("net/iavf: support flexible Rx descriptor in SSE path") Fixes: 5b6e8859081d ("net/iavf: support flexible Rx descriptor in AVX path") Fixes: 9c9aa0040344 ("net/iavf: add offload path for Rx AVX512 flex descriptor") Cc: sta...@dpdk.org Signed-off-by: Zhichao Zeng <zhichaox.z...@intel.com> --- drivers/net/iavf/iavf_rxtx.c | 9 +- drivers/net/iavf/iavf_rxtx_vec_avx2.c | 118 +++++++++++++++------ drivers/net/iavf/iavf_rxtx_vec_avx512.c | 133 ++++++++++++++++++------ drivers/net/iavf/iavf_rxtx_vec_sse.c | 77 ++++++++++---- 4 files changed, 252 insertions(+), 85 deletions(-) diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 3deabe1d7e..e1681024a6 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -1277,7 +1277,9 @@ iavf_flex_rxd_error_to_pkt_flags(uint16_t stat_err0) return 0; if (likely(!(stat_err0 & IAVF_RX_FLEX_ERR0_BITS))) { - flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD); + flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD); return flags; } @@ -1294,6 +1296,11 @@ iavf_flex_rxd_error_to_pkt_flags(uint16_t stat_err0) if (unlikely(stat_err0 & (1 << IAVF_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S))) flags |= RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD; + if (unlikely(stat_err0 & (1 << IAVF_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S))) + flags |= RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD; + else + flags |= RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD; + return flags; } diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c index d6243b96e2..862f6eb0c0 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c +++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c @@ -622,43 +622,88 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq, * bit13 is for VLAN indication. */ const __m256i flags_mask = - _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13)); + _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13)); /** * data to be shuffled by the result of the flags mask shifted by 4 * bits. This gives use the l3_l4 flags. */ - const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - /* second 128-bits */ - 0, 0, 0, 0, 0, 0, 0, 0, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m256i l3_l4_flags_shuf = + _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * second 128-bits + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); const __m256i cksum_mask = - _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | + RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); /** * data to be shuffled by result of flag mask, shifted down 12. * If RSS(bit12)/VLAN(bit13) are set, @@ -836,6 +881,15 @@ _iavf_recv_raw_pkts_vec_avx2_flex_rxd(struct iavf_rx_queue *rxq, __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, _mm256_srli_epi32(flag_bits, 4)); l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); + __m256i l4_outer_mask = _mm256_set1_epi32(0x6); + __m256i l4_outer_flags = + _mm256_and_si256(l3_l4_flags, l4_outer_mask); + l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); + + __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); + + l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); + l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); /* set rss and vlan flags */ diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c index 3bfec63851..b416a716cf 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c +++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c @@ -969,45 +969,105 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq, * bit13 is for VLAN indication. */ const __m256i flags_mask = - _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13)); + _mm256_set1_epi32((0xF << 4) | (1 << 12) | (1 << 13)); #endif #ifdef IAVF_RX_CSUM_OFFLOAD /** * data to be shuffled by the result of the flags mask shifted by 4 * bits. This gives use the l3_l4 flags. */ - const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - /* second 128-bits */ - 0, 0, 0, 0, 0, 0, 0, 0, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m256i l3_l4_flags_shuf = + _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * second 128-bits + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); const __m256i cksum_mask = - _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD | - RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | + RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK); #endif #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) /** @@ -1057,6 +1117,15 @@ _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq, __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf, _mm256_srli_epi32(flag_bits, 4)); l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1); + __m256i l4_outer_mask = _mm256_set1_epi32(0x6); + __m256i l4_outer_flags = + _mm256_and_si256(l3_l4_flags, l4_outer_mask); + l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20); + + __m256i l3_l4_mask = _mm256_set1_epi32(~0x6); + + l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask); + l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags); l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask); #endif #if defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD) diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c b/drivers/net/iavf/iavf_rxtx_vec_sse.c index 4a5232c1d2..1f477eada5 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_sse.c +++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c @@ -222,39 +222,69 @@ flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], * bit12 for RSS indication. * bit13 for VLAN indication. */ - const __m128i desc_mask = _mm_set_epi32(0x3070, 0x3070, - 0x3070, 0x3070); + const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0, + 0x30f0, 0x30f0); const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD, RTE_MBUF_F_RX_IP_CKSUM_MASK | RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); /* map the checksum, rss and vlan fields to the checksum, rss * and vlan flag */ - const __m128i cksum_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, - /* shift right 1 bit to make sure it not exceed 255 */ - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | - RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, - (RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m128i cksum_flags = + _mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + /** + * shift right 20 bits to use the low two bits to indicate + * outer checksum status + * shift right 1 bit to make sure it not exceed 255 + */ + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | + RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1, + (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1); + const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, @@ -274,6 +304,13 @@ flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], flags = _mm_shuffle_epi8(cksum_flags, tmp_desc); /* then we shift left 1 bit */ flags = _mm_slli_epi32(flags, 1); + __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6); + __m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask); + l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20); + + __m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6); + __m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask); + flags = _mm_or_si128(l3_l4_flags, l4_outer_flags); /* we need to mask out the redundant bits introduced by RSS or * VLAN fields. */ @@ -325,10 +362,10 @@ flex_desc_to_olflags_v(struct iavf_rx_queue *rxq, __m128i descs[4], * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. */ - rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x10); - rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x10); - rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x10); - rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x10); + rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30); + rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30); + rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30); + rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != -- 2.25.1