When receiving tunnel packets, the testpmd terminal output log always shows
the 'ol_flags' value as 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_UNKNOWN', but the
expected value is 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD' or
'RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD'.

Add the 'RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD' and
'RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD' flags to the normal path. They are
also added to 'l3_l4_flags_shuf' for the AVX2 and AVX512 vector paths
and to 'l3_l4e_flags' for the SSE vector path, so that 'ol_flags'
reports the correct checksum status.

Fixes: 4861cde46116 ("i40e: new poll mode driver")
Fixes: daa02b5cddbb ("mbuf: add namespace to offload flags")
Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Fixes: dafadd73762e ("net/i40e: add AVX2 Rx function")
Fixes: 9966a00a0688 ("net/i40e: enable bad checksum flags in vector Rx")
Fixes: f3a85f4ce04d ("net/i40e: fix checksum flag in x86 vector Rx")
Fixes: f4356d7ca168 ("net/i40e: eliminate mbuf write on rearm")
Cc: stable@dpdk.org

Signed-off-by: Mingjin Ye <mingjinx.ye@intel.com>
---
 drivers/net/i40e/i40e_rxtx.c            |   6 +-
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 119 +++++++++++++++++-------
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 113 ++++++++++++++++------
 drivers/net/i40e/i40e_rxtx_vec_sse.c    |  73 ++++++++++-----
 4 files changed, 228 insertions(+), 83 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 788ffb51c2..e8903007f0 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -169,7 +169,9 @@ i40e_rxd_error_to_pkt_flags(uint64_t qword)
 
 #define I40E_RX_ERR_BITS 0x3f
        if (likely((error_bits & I40E_RX_ERR_BITS) == 0)) {
-               flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD);
+               flags |= (RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD);
                return flags;
        }
 
@@ -185,6 +187,8 @@ i40e_rxd_error_to_pkt_flags(uint64_t qword)
 
        if (unlikely(error_bits & (1 << I40E_RX_DESC_ERROR_EIPE_SHIFT)))
                flags |= RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD;
+       else
+               flags |= RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD;
 
        return flags;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 761edb9d20..348dad4293 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -202,7 +202,7 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
         * field (bits 22-24) are for IP/L4 checksum errors
         */
        const __m256i flags_mask = _mm256_set1_epi32(
-                       (1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));
+                       (0xF << 4) | (1 << 12) | (1 << 13));
        /*
         * data to be shuffled by result of flag mask. If VLAN bit is set,
         * (bit 2), then position 4 in this array will be used in the
@@ -228,39 +228,83 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
         * data to be shuffled by the result of the flags mask shifted by 22
         * bits.  This gives use the l3_l4 flags.
         */
-       const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-                       /* shift right 1 bit to make sure it not exceed 255 */
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       /* second 128-bits */
-                       0, 0, 0, 0, 0, 0, 0, 0,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
+               const __m256i l3_l4_flags_shuf =
+                       _mm256_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 
|
+                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       /**
+                       * second 128-bits
+                       * shift right 20 bits to use the low two bits to 
indicate
+                       * outer checksum status
+                       * shift right 1 bit to make sure it not exceed 255
+                       */
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
 
        const __m256i cksum_mask = _mm256_set1_epi32(
-                       RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD |
-                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
+                       RTE_MBUF_F_RX_IP_CKSUM_MASK | 
RTE_MBUF_F_RX_L4_CKSUM_MASK |
+                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
 
        RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */
 
@@ -407,6 +451,17 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
                __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
                                _mm256_srli_epi32(flag_bits, 22));
                l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+
+               __m256i l4_outer_mask = _mm256_set1_epi32(0x6);
+               __m256i l4_outer_flags =
+                                       _mm256_and_si256(l3_l4_flags, 
l4_outer_mask);
+               l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
+
+               __m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
+
+               l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
+               l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
+
                l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
 
                /* merge flags */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 60c97d5331..94fd309748 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -313,7 +313,7 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct 
rte_mbuf **rx_pkts,
         * field (bits 22-24) are for IP/L4 checksum errors
         */
        const __m256i flags_mask = _mm256_set1_epi32
-               ((1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));
+                       ((0xF << 4) | (1 << 12) | (1 << 13));
 
        /* data to be shuffled by result of flag mask. If VLAN bit is set,
         * (bit 2), then position 4 in this array will be used in the
@@ -339,35 +339,83 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, 
struct rte_mbuf **rx_pkts,
         * bits.  This gives use the l3_l4 flags.
         */
        const __m256i l3_l4_flags_shuf = _mm256_set_epi8
-               (0, 0, 0, 0, 0, 0, 0, 0,
-               /* shift right 1 bit to make sure it not exceed 255 */
-               (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD 
|
-                RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) 
>> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
-               RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 
1,
-               /* second 128-bits */
-               0, 0, 0, 0, 0, 0, 0, 0,
-               (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD 
|
-                RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) 
>> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
-               RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 
1);
+                       ((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
+                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       /**
+                       * second 128-bits
+                       * shift right 20 bits to use the low two bits to 
indicate
+                       * outer checksum status
+                       * shift right 1 bit to make sure it not exceed 255
+                       */
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
 
        const __m256i cksum_mask = _mm256_set1_epi32
-               (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
-               RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
-               RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
+               (RTE_MBUF_F_RX_IP_CKSUM_MASK |
+               RTE_MBUF_F_RX_L4_CKSUM_MASK |
+               RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+               RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK);
 
        uint16_t i, received;
 
@@ -535,6 +583,15 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, 
struct rte_mbuf **rx_pkts,
                __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
                                _mm256_srli_epi32(flag_bits, 22));
                l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+               __m256i l4_outer_mask = _mm256_set1_epi32(0x6);
+               __m256i l4_outer_flags =
+                               _mm256_and_si256(l3_l4_flags, l4_outer_mask);
+               l4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);
+
+               __m256i l3_l4_mask = _mm256_set1_epi32(~0x6);
+
+               l3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);
+               l3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);
                l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
 
                /* merge flags */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c 
b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index bdc979a839..65630e40a2 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -230,16 +230,16 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile 
union i40e_rx_desc *rxdp,
        const __m128i cksum_mask = _mm_set_epi32(
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
-                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
+                       RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);
 
        /* map rss and vlan type to rss hash and vlan flag */
        const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
@@ -252,20 +252,45 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile 
union i40e_rx_desc *rxdp,
                        RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR, 
RTE_MBUF_F_RX_RSS_HASH, 0, 0,
                        0, 0, RTE_MBUF_F_RX_FDIR, 0);
 
-       const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-                       /* shift right 1 bit to make sure it not exceed 255 */
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD  |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
-                        RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_BAD  | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
-                       (RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
+       const __m128i l3_l4e_flags =
+                       _mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
+                       RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       /**
+                       * shift right 20 bits to use the low two bits to 
indicate
+                       * outer checksum status
+                       * shift right 1 bit to make sure it not exceed 255
+                       */
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_BAD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
+                       RTE_MBUF_F_RX_L4_CKSUM_GOOD | 
RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_BAD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
+                       (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | 
RTE_MBUF_F_RX_L4_CKSUM_GOOD |
+                       RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);
 
        /* Unpack "status" from quadword 1, bits 0:32 */
        vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
@@ -282,6 +307,10 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile 
union i40e_rx_desc *rxdp,
        l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
        /* then we shift left 1 bit */
        l3_l4e = _mm_slli_epi32(l3_l4e, 1);
+       __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
+       __m128i l4_outer_flags = _mm_and_si128(l3_l4e, l4_outer_mask);
+       l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);
+       vlan0 = _mm_or_si128(vlan0, l4_outer_flags);
        /* we need to mask out the redundant bits */
        l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);
 
@@ -310,10 +339,10 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile 
union i40e_rx_desc *rxdp,
         * appropriate flags means that we have to do a shift and blend for
         * each mbuf before we do the write.
         */
-       rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10);
-       rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10);
-       rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10);
-       rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10);
+       rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x30);
+       rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x30);
+       rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x30);
+       rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x30);
 
        /* write the rearm data and the olflags in one write */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
-- 
2.34.1

Reply via email to