> -----Original Message-----
> From: Maxime Coquelin <maxime.coque...@redhat.com>
> Sent: Friday, April 24, 2020 7:52 PM
> To: Liu, Yong <yong....@intel.com>; Ye, Xiaolong <xiaolong...@intel.com>;
> Wang, Zhihong <zhihong.w...@intel.com>
> Cc: dev@dpdk.org; Van Haaren, Harry <harry.van.haa...@intel.com>
> Subject: Re: [PATCH v9 5/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/24/20 11:24 AM, Marvin Liu wrote:
> > Optimize packed ring Rx path with SIMD instructions. Solution of
> > optimization is pretty like vhost, is that split path into batch and
> > single functions. Batch function is further optimized by AVX512
> > instructions. Also pad desc extra structure to 16 bytes aligned, thus
> > four elements will be saved in one batch.
> >
> > Signed-off-by: Marvin Liu <yong....@intel.com>
> >
> > diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> > index c9edb84ee..102b1deab 100644
> > --- a/drivers/net/virtio/Makefile
> > +++ b/drivers/net/virtio/Makefile
> > @@ -36,6 +36,41 @@ else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM)
> $(CONFIG_RTE_ARCH_ARM64)),)
> >  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
> >  endif
> >
> > +ifneq ($(FORCE_DISABLE_AVX512), y)
> > +   CC_AVX512_SUPPORT=\
> > +   $(shell $(CC) -march=native -dM -E - </dev/null 2>&1 | \
> > +   sed '/./{H;$$!d} ; x ; /AVX512F/!d; /AVX512BW/!d; /AVX512VL/!d' | \
> > +   grep -q AVX512 && echo 1)
> > +endif
> > +
> > +ifeq ($(CC_AVX512_SUPPORT), 1)
> > +CFLAGS += -DCC_AVX512_SUPPORT
> > +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_packed_avx.c
> > +
> > +ifeq ($(RTE_TOOLCHAIN), gcc)
> > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> > +CFLAGS += -DVIRTIO_GCC_UNROLL_PRAGMA
> > +endif
> > +endif
> > +
> > +ifeq ($(RTE_TOOLCHAIN), clang)
> > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -
> ge 37 && echo 1), 1)
> > +CFLAGS += -DVIRTIO_CLANG_UNROLL_PRAGMA
> > +endif
> > +endif
> > +
> > +ifeq ($(RTE_TOOLCHAIN), icc)
> > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> > +CFLAGS += -DVIRTIO_ICC_UNROLL_PRAGMA
> > +endif
> > +endif
> > +
> > +CFLAGS_virtio_rxtx_packed_avx.o += -mavx512f -mavx512bw -mavx512vl
> > +ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1)
> > +CFLAGS_virtio_rxtx_packed_avx.o += -Wno-zero-length-bounds
> > +endif
> > +endif
> > +
> >  ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
> >  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
> > diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
> > index 15150eea1..8e68c3039 100644
> > --- a/drivers/net/virtio/meson.build
> > +++ b/drivers/net/virtio/meson.build
> > @@ -9,6 +9,20 @@ sources += files('virtio_ethdev.c',
> >  deps += ['kvargs', 'bus_pci']
> >
> >  if arch_subdir == 'x86'
> > +   if '-mno-avx512f' not in machine_args
> > +           if cc.has_argument('-mavx512f') and cc.has_argument('-
> mavx512vl') and cc.has_argument('-mavx512bw')
> > +                   cflags += ['-mavx512f', '-mavx512bw', '-mavx512vl']
> > +                   cflags += ['-DCC_AVX512_SUPPORT']
> > +                   if (toolchain == 'gcc' and
> cc.version().version_compare('>=8.3.0'))
> > +                           cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> > +                   elif (toolchain == 'clang' and
> cc.version().version_compare('>=3.7.0'))
> > +                           cflags += '-
> DVHOST_CLANG_UNROLL_PRAGMA'
> > +                   elif (toolchain == 'icc' and
> cc.version().version_compare('>=16.0.0'))
> > +                           cflags += '-DVHOST_ICC_UNROLL_PRAGMA'
> > +                   endif
> > +                   sources += files('virtio_rxtx_packed_avx.c')
> > +           endif
> > +   endif
> >     sources += files('virtio_rxtx_simple_sse.c')
> >  elif arch_subdir == 'ppc'
> >     sources += files('virtio_rxtx_simple_altivec.c')
> > diff --git a/drivers/net/virtio/virtio_ethdev.h
> b/drivers/net/virtio/virtio_ethdev.h
> > index febaf17a8..5c112cac7 100644
> > --- a/drivers/net/virtio/virtio_ethdev.h
> > +++ b/drivers/net/virtio/virtio_ethdev.h
> > @@ -105,6 +105,9 @@ uint16_t virtio_xmit_pkts_inorder(void *tx_queue,
> struct rte_mbuf **tx_pkts,
> >  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> >             uint16_t nb_pkts);
> >
> > +uint16_t virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf
> **rx_pkts,
> > +           uint16_t nb_pkts);
> > +
> >  int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
> >
> >  void virtio_interrupt_handler(void *param);
> > diff --git a/drivers/net/virtio/virtio_rxtx.c 
> > b/drivers/net/virtio/virtio_rxtx.c
> > index 84f4cf946..c9b6e7844 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -2329,3 +2329,11 @@ virtio_xmit_pkts_inorder(void *tx_queue,
> >
> >     return nb_tx;
> >  }
> > +
> > +__rte_weak uint16_t
> > +virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
> > +                       struct rte_mbuf **rx_pkts __rte_unused,
> > +                       uint16_t nb_pkts __rte_unused)
> > +{
> > +   return 0;
> > +}
> > diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c
> b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> > new file mode 100644
> > index 000000000..8a7b459eb
> > --- /dev/null
> > +++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> > @@ -0,0 +1,374 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2010-2020 Intel Corporation
> > + */
> > +
> > +#include <stdint.h>
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <errno.h>
> > +
> > +#include <rte_net.h>
> > +
> > +#include "virtio_logs.h"
> > +#include "virtio_ethdev.h"
> > +#include "virtio_pci.h"
> > +#include "virtqueue.h"
> > +
> > +#define BYTE_SIZE 8
> > +/* flag bits offset in packed ring desc higher 64bits */
> > +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> > +   offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> > +
> > +#define PACKED_FLAGS_MASK ((0ULL |
> VRING_PACKED_DESC_F_AVAIL_USED) << \
> > +   FLAGS_BITS_OFFSET)
> > +
> > +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> > +   sizeof(struct vring_packed_desc))
> > +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> > +
> > +#ifdef VIRTIO_GCC_UNROLL_PRAGMA
> > +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4")
> \
> > +   for (iter = val; iter < size; iter++)
> > +#endif
> > +
> > +#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
> > +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
> > +   for (iter = val; iter < size; iter++)
> > +#endif
> > +
> > +#ifdef VIRTIO_ICC_UNROLL_PRAGMA
> > +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
> > +   for (iter = val; iter < size; iter++)
> > +#endif
> > +
> > +#ifndef virtio_for_each_try_unroll
> > +#define virtio_for_each_try_unroll(iter, val, num) \
> > +   for (iter = val; iter < num; iter++)
> > +#endif
> > +
> > +static inline void
> > +virtio_update_batch_stats(struct virtnet_stats *stats,
> > +                     uint16_t pkt_len1,
> > +                     uint16_t pkt_len2,
> > +                     uint16_t pkt_len3,
> > +                     uint16_t pkt_len4)
> > +{
> > +   stats->bytes += pkt_len1;
> > +   stats->bytes += pkt_len2;
> > +   stats->bytes += pkt_len3;
> > +   stats->bytes += pkt_len4;
> > +}
> > +
> > +/* Optionally fill offload information in structure */
> > +static inline int
> > +virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
> > +{
> > +   struct rte_net_hdr_lens hdr_lens;
> > +   uint32_t hdrlen, ptype;
> > +   int l4_supported = 0;
> > +
> > +   /* nothing to do */
> > +   if (hdr->flags == 0)
> > +           return 0;
> 
> IIUC, the only difference with the non-vectorized version is the GSO
> support removed here.
> gso_type being in the same cacheline as flags in virtio_net_hdr, I don't
> think checking the performance gain is worth the added maintainance
> effort due to code duplication.
> 
> Please prove I'm wrong, otherwise please move virtio_rx_offload() in a
> header and use it here. Alternative if it really imapcts performance is
> to put all the shared code in a dedicated function that can be re-used
> by both implementations.
> 

Maxime,
There won't be much performance difference between the non-vectorized and vectorized versions.
The reason for adding a special vectorized version is to skip the handling of
garbage GSO packets.
As all descs are handled in a batch, a revert would be needed whenever garbage
packets are found.
That would introduce complicated logic into the vectorized path.

Regards,
Marvin

> > +
> > +   /* GSO not support in vec path, skip check */
> > +   m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
> > +
> > +   ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
> > +   m->packet_type = ptype;
> > +   if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
> > +       (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
> > +       (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
> > +           l4_supported = 1;
> > +
> > +   if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> > +           hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
> > +           if (hdr->csum_start <= hdrlen && l4_supported) {
> > +                   m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
> > +           } else {
> > +                   /* Unknown proto or tunnel, do sw cksum. We can
> assume
> > +                    * the cksum field is in the first segment since the
> > +                    * buffers we provided to the host are large enough.
> > +                    * In case of SCTP, this will be wrong since it's a CRC
> > +                    * but there's nothing we can do.
> > +                    */
> > +                   uint16_t csum = 0, off;
> > +
> > +                   rte_raw_cksum_mbuf(m, hdr->csum_start,
> > +                           rte_pktmbuf_pkt_len(m) - hdr->csum_start,
> > +                           &csum);
> > +                   if (likely(csum != 0xffff))
> > +                           csum = ~csum;
> > +                   off = hdr->csum_offset + hdr->csum_start;
> > +                   if (rte_pktmbuf_data_len(m) >= off + 1)
> > +                           *rte_pktmbuf_mtod_offset(m, uint16_t *,
> > +                                   off) = csum;
> > +           }
> > +   } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID &&
> l4_supported) {
> > +           m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
> > +   }
> > +
> > +   return 0;
> > +}
> 
> Otherwise, the patch looks okay to me.
> 
> Thanks,
> Maxime

Reply via email to