> -----Original Message-----
> From: Konstantin Ananyev <konstantin.v.anan...@yandex.ru>
> Sent: Tuesday, June 21, 2022 4:43 AM
> To: Rahul Bhansali <rbhans...@marvell.com>; dev@dpdk.org; Ruifeng Wang
> <ruifeng.w...@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jer...@marvell.com>
> Subject: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group
> functionality
> 
> External Email
> 
> ----------------------------------------------------------------------
> 17/06/2022 08:50, Rahul Bhansali пишет:
> > CC: Konstantin Ananyev
> >
> >> -----Original Message-----
> >> From: Rahul Bhansali <rbhans...@marvell.com>
> >> Sent: Friday, June 17, 2022 1:13 PM
> >> To: dev@dpdk.org; Ruifeng Wang <ruifeng.w...@arm.com>
> >> Cc: Jerin Jacob Kollanukkaran <jer...@marvell.com>; Rahul Bhansali
> >> <rbhans...@marvell.com>
> >> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group
> >> functionality
> >>
> >> This will make the packet grouping function common, so that other
> >> examples can utilize as per need.
> >>
> >> Signed-off-by: Rahul Bhansali <rbhans...@marvell.com>
> >> ---
> >> Changes in v2: New patch to address review comment.
> >>
> >>   examples/common/neon_common.h |  50 ++++++++++++
> >>   examples/common/pkt_group.h   | 139
> >> ++++++++++++++++++++++++++++++++++
> >>   examples/l3fwd/Makefile       |   5 +-
> >>   examples/l3fwd/l3fwd.h        |   2 -
> >>   examples/l3fwd/l3fwd_common.h | 129 +------------------------------
> >>   examples/l3fwd/l3fwd_neon.h   |  43 +----------
> >>   examples/meson.build          |   2 +-
> >>   7 files changed, 198 insertions(+), 172 deletions(-)  create mode
> >> 100644 examples/common/neon_common.h  create mode 100644
> >> examples/common/pkt_group.h
> >>
> >> diff --git a/examples/common/neon_common.h
> >> b/examples/common/neon_common.h new file mode 100644 index
> >> 0000000000..f01b5ab6bc
> >> --- /dev/null
> >> +++ b/examples/common/neon_common.h
> >> @@ -0,0 +1,50 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _NEON_COMMON_H_
> >> +#define _NEON_COMMON_H_
> >> +
> >> +#include "pkt_group.h"
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port in bursts of 
> >> 4.
> >> + * Suppose we have array of destination ports:
> >> + * dst_port[] = {a, b, c, d,, e, ... }
> >> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> + * We doing 4 comparisons at once and the result is 4 bit mask.
> >> + * This mask is used as an index into prebuild array of pnum values.
> >> + */
> >> +static inline uint16_t *
> >> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t
> dp1,
> >> +            uint16x8_t dp2)
> >> +{
> >> +  union {
> >> +          uint16_t u16[FWDSTEP + 1];
> >> +          uint64_t u64;
> >> +  } *pnum = (void *)pn;
> >> +
> >> +  uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> +  int32_t v;
> >> +
> >> +  dp1 = vceqq_u16(dp1, dp2);
> >> +  dp1 = vandq_u16(dp1, mask);
> >> +  v = vaddvq_u16(dp1);
> >> +
> >> +  /* update last port counter. */hh
> >> +  lp[0] += gptbl[v].lpv;
> >> +  rte_compiler_barrier();
> >> +
> >> +  /* if dest port value has changed. */
> >> +  if (v != GRPMSK) {
> >> +          pnum->u64 = gptbl[v].pnum;
> >> +          pnum->u16[FWDSTEP] = 1;
> >> +          lp = pnum->u16 + gptbl[v].idx;
> >> +  }
> >> +
> >> +  return lp;
> >> +}
> 
> Thanks for the effort.
> As I can see this function: port_groupx4() is nearly identical for all 3
> platforms: sse/nenon/altivec (except of course built-in arch-specific 
> instincts).
> In fact, even comemnts are identical.
> I wonder can we have something like:
> examples/common/<arch>/port_group.h
> and for each arch will have defined port_groupx4(...) ?
> 
Yes, It’s a good point. I was thinking to have arch in file name itself. But we 
can have arch specific directory and have different header files.
Do you want me to make changes for all 3 sse/neon/altivec or just neon ?
I can check compilation for all but functionality/perf validate for Neon only.

> >> +
> >> +#endif /* _NEON_COMMON_H_ */
> >> diff --git a/examples/common/pkt_group.h
> >> b/examples/common/pkt_group.h new file mode 100644 index
> >> 0000000000..8b26d9380f
> >> --- /dev/null
> >> +++ b/examples/common/pkt_group.h
> >> @@ -0,0 +1,139 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _PKT_GROUP_H_
> >> +#define _PKT_GROUP_H_
> >> +
> >> +#define FWDSTEP   4
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port into one 
> >> burst.
> >> + * To avoid extra latency this is done together with some other
> >> +packet
> >> + * processing, but after we made a final decision about packet's 
> >> destination.
> >> + * To do this we maintain:
> >> + * pnum - array of number of consecutive packets with the same dest
> >> +port for
> >> + * each packet in the input burst.
> >> + * lp - pointer to the last updated element in the pnum.
> >> + * dlp - dest port value lp corresponds to.
> >> + */
> >> +
> >> +#define   GRPSZ   (1 << FWDSTEP)
> >> +#define   GRPMSK  (GRPSZ - 1)
> >> +
> >> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)    do { \
> >> +  if (likely((dlp) == (dcp)[(idx)])) {         \
> >> +          (lp)[0]++;                           \
> >> +  } else {                                     \
> >> +          (dlp) = (dcp)[idx];                  \
> >> +          (lp) = (pn) + (idx);                 \
> >> +          (lp)[0] = 1;                         \
> >> +  }                                            \
> >> +} while (0)
> >> +
> >> +static const struct {
> >> +  uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> +  int32_t  idx;  /* index for new last updated elemnet. */
> >> +  uint16_t lpv;  /* add value to the last updated element. */ }
> >> +gptbl[GRPSZ] = {
> >> +  {
> >> +          /* 0: a != b, b != c, c != d, d != e */
> >> +          .pnum = UINT64_C(0x0001000100010001),
> >> +          .idx = 4,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 1: a == b, b != c, c != d, d != e */
> >> +          .pnum = UINT64_C(0x0001000100010002),
> >> +          .idx = 4,
> >> +          .lpv = 1,
> >> +  },
> >> +  {
> >> +          /* 2: a != b, b == c, c != d, d != e */
> >> +          .pnum = UINT64_C(0x0001000100020001),
> >> +          .idx = 4,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 3: a == b, b == c, c != d, d != e */
> >> +          .pnum = UINT64_C(0x0001000100020003),
> >> +          .idx = 4,
> >> +          .lpv = 2,
> >> +  },
> >> +  {
> >> +          /* 4: a != b, b != c, c == d, d != e */
> >> +          .pnum = UINT64_C(0x0001000200010001),
> >> +          .idx = 4,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 5: a == b, b != c, c == d, d != e */
> >> +          .pnum = UINT64_C(0x0001000200010002),
> >> +          .idx = 4,
> >> +          .lpv = 1,
> >> +  },
> >> +  {
> >> +          /* 6: a != b, b == c, c == d, d != e */
> >> +          .pnum = UINT64_C(0x0001000200030001),
> >> +          .idx = 4,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 7: a == b, b == c, c == d, d != e */
> >> +          .pnum = UINT64_C(0x0001000200030004),
> >> +          .idx = 4,
> >> +          .lpv = 3,
> >> +  },
> >> +  {
> >> +          /* 8: a != b, b != c, c != d, d == e */
> >> +          .pnum = UINT64_C(0x0002000100010001),
> >> +          .idx = 3,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 9: a == b, b != c, c != d, d == e */
> >> +          .pnum = UINT64_C(0x0002000100010002),
> >> +          .idx = 3,
> >> +          .lpv = 1,
> >> +  },
> >> +  {
> >> +          /* 0xa: a != b, b == c, c != d, d == e */
> >> +          .pnum = UINT64_C(0x0002000100020001),
> >> +          .idx = 3,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 0xb: a == b, b == c, c != d, d == e */
> >> +          .pnum = UINT64_C(0x0002000100020003),
> >> +          .idx = 3,
> >> +          .lpv = 2,
> >> +  },
> >> +  {
> >> +          /* 0xc: a != b, b != c, c == d, d == e */
> >> +          .pnum = UINT64_C(0x0002000300010001),
> >> +          .idx = 2,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 0xd: a == b, b != c, c == d, d == e */
> >> +          .pnum = UINT64_C(0x0002000300010002),
> >> +          .idx = 2,
> >> +          .lpv = 1,
> >> +  },
> >> +  {
> >> +          /* 0xe: a != b, b == c, c == d, d == e */
> >> +          .pnum = UINT64_C(0x0002000300040001),
> >> +          .idx = 1,
> >> +          .lpv = 0,
> >> +  },
> >> +  {
> >> +          /* 0xf: a == b, b == c, c == d, d == e */
> >> +          .pnum = UINT64_C(0x0002000300040005),
> >> +          .idx = 0,
> >> +          .lpv = 4,
> >> +  },
> >> +};
> >> +
> >> +#endif /* _PKT_GROUP_H_ */
> >> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
> >> 8efe6378e2..8dbe85c2e6 100644
> >> --- a/examples/l3fwd/Makefile
> >> +++ b/examples/l3fwd/Makefile
> >> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
> >>   static: build/$(APP)-static
> >>    ln -sf $(APP)-static build/$(APP)
> >>
> >> +INCLUDES =-I../common
> >>   PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS
> >> += -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
> 'rte_eth_link_to_str()'
> >> @@ -38,10 +39,10 @@ endif
> >>   endif
> >>
> >>   build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -  $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> >> +  $(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_SHARED)
> >>
> >>   build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -  $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> >> +  $(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_STATIC)
> >>
> >>   build:
> >>    @mkdir -p $@
> >> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
> >> 8a52c90755..40b5f32a9e 100644
> >> --- a/examples/l3fwd/l3fwd.h
> >> +++ b/examples/l3fwd/l3fwd.h
> >> @@ -44,8 +44,6 @@
> >>   /* Used to mark destination port as 'invalid'. */
> >>   #define  BAD_PORT ((uint16_t)-1)
> >>
> >> -#define FWDSTEP   4
> >> -
> >>   /* replace first 12B of the ethernet header. */
> >>   #define  MASK_ETH 0x3f
> >>
> >> diff --git a/examples/l3fwd/l3fwd_common.h
> >> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
> >> --- a/examples/l3fwd/l3fwd_common.h
> >> +++ b/examples/l3fwd/l3fwd_common.h
> >> @@ -7,6 +7,8 @@
> >>   #ifndef _L3FWD_COMMON_H_
> >>   #define _L3FWD_COMMON_H_
> >>
> >> +#include "pkt_group.h"
> >> +
> >>   #ifdef DO_RFC_1812_CHECKS
> >>
> >>   #define  IPV4_MIN_VER_IHL        0x45
> >> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr,
> >> uint16_t *dp, uint32_t ptype)
> >>   #define  rfc1812_process(mb, dp, ptype)  do { } while (0)
> >>   #endif /* DO_RFC_1812_CHECKS */
> >>
> >> -/*
> >> - * We group consecutive packets with the same destination port into one
> burst.
> >> - * To avoid extra latency this is done together with some other
> >> packet
> >> - * processing, but after we made a final decision about packet's 
> >> destination.
> >> - * To do this we maintain:
> >> - * pnum - array of number of consecutive packets with the same dest
> >> port for
> >> - * each packet in the input burst.
> >> - * lp - pointer to the last updated element in the pnum.
> >> - * dlp - dest port value lp corresponds to.
> >> - */
> >> -
> >> -#define   GRPSZ   (1 << FWDSTEP)
> >> -#define   GRPMSK  (GRPSZ - 1)
> >> -
> >> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)    do { \
> >> -  if (likely((dlp) == (dcp)[(idx)])) {             \
> >> -          (lp)[0]++;                                   \
> >> -  } else {                                         \
> >> -          (dlp) = (dcp)[idx];                          \
> >> -          (lp) = (pn) + (idx);                         \
> >> -          (lp)[0] = 1;                                 \
> >> -  }                                                \
> >> -} while (0)
> >> -
> >> -static const struct {
> >> -  uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> -  int32_t  idx;  /* index for new last updated element. */
> >> -  uint16_t lpv;  /* add value to the last updated element. */
> >> -} gptbl[GRPSZ] = {
> >> -  {
> >> -          /* 0: a != b, b != c, c != d, d != e */
> >> -          .pnum = UINT64_C(0x0001000100010001),
> >> -          .idx = 4,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 1: a == b, b != c, c != d, d != e */
> >> -          .pnum = UINT64_C(0x0001000100010002),
> >> -          .idx = 4,
> >> -          .lpv = 1,
> >> -  },
> >> -  {
> >> -          /* 2: a != b, b == c, c != d, d != e */
> >> -          .pnum = UINT64_C(0x0001000100020001),
> >> -          .idx = 4,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 3: a == b, b == c, c != d, d != e */
> >> -          .pnum = UINT64_C(0x0001000100020003),
> >> -          .idx = 4,
> >> -          .lpv = 2,
> >> -  },
> >> -  {
> >> -          /* 4: a != b, b != c, c == d, d != e */
> >> -          .pnum = UINT64_C(0x0001000200010001),
> >> -          .idx = 4,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 5: a == b, b != c, c == d, d != e */
> >> -          .pnum = UINT64_C(0x0001000200010002),
> >> -          .idx = 4,
> >> -          .lpv = 1,
> >> -  },
> >> -  {
> >> -          /* 6: a != b, b == c, c == d, d != e */
> >> -          .pnum = UINT64_C(0x0001000200030001),
> >> -          .idx = 4,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 7: a == b, b == c, c == d, d != e */
> >> -          .pnum = UINT64_C(0x0001000200030004),
> >> -          .idx = 4,
> >> -          .lpv = 3,
> >> -  },
> >> -  {
> >> -          /* 8: a != b, b != c, c != d, d == e */
> >> -          .pnum = UINT64_C(0x0002000100010001),
> >> -          .idx = 3,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 9: a == b, b != c, c != d, d == e */
> >> -          .pnum = UINT64_C(0x0002000100010002),
> >> -          .idx = 3,
> >> -          .lpv = 1,
> >> -  },
> >> -  {
> >> -          /* 0xa: a != b, b == c, c != d, d == e */
> >> -          .pnum = UINT64_C(0x0002000100020001),
> >> -          .idx = 3,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 0xb: a == b, b == c, c != d, d == e */
> >> -          .pnum = UINT64_C(0x0002000100020003),
> >> -          .idx = 3,
> >> -          .lpv = 2,
> >> -  },
> >> -  {
> >> -          /* 0xc: a != b, b != c, c == d, d == e */
> >> -          .pnum = UINT64_C(0x0002000300010001),
> >> -          .idx = 2,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 0xd: a == b, b != c, c == d, d == e */
> >> -          .pnum = UINT64_C(0x0002000300010002),
> >> -          .idx = 2,
> >> -          .lpv = 1,
> >> -  },
> >> -  {
> >> -          /* 0xe: a != b, b == c, c == d, d == e */
> >> -          .pnum = UINT64_C(0x0002000300040001),
> >> -          .idx = 1,
> >> -          .lpv = 0,
> >> -  },
> >> -  {
> >> -          /* 0xf: a == b, b == c, c == d, d == e */
> >> -          .pnum = UINT64_C(0x0002000300040005),
> >> -          .idx = 0,
> >> -          .lpv = 4,
> >> -  },
> >> -};
> >> -
> >>   static __rte_always_inline void
> >>   send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf
> *m[],
> >>            uint32_t num)
> >> diff --git a/examples/l3fwd/l3fwd_neon.h
> >> b/examples/l3fwd/l3fwd_neon.h index e3d33a5229..5fa765b640 100644
> >> --- a/examples/l3fwd/l3fwd_neon.h
> >> +++ b/examples/l3fwd/l3fwd_neon.h
> >> @@ -7,6 +7,7 @@
> >>   #define _L3FWD_NEON_H_
> >>
> >>   #include "l3fwd.h"
> >> +#include "neon_common.h"
> >>   #include "l3fwd_common.h"
> >>
> >>   /*
> >> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
> >> uint16_t dst_port[FWDSTEP])
> >>                    &dst_port[3], pkt[3]->packet_type);
> >>   }
> >>
> >> -/*
> >> - * Group consecutive packets with the same destination port in bursts of 
> >> 4.
> >> - * Suppose we have array of destination ports:
> >> - * dst_port[] = {a, b, c, d,, e, ... }
> >> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> - * We doing 4 comparisons at once and the result is 4 bit mask.
> >> - * This mask is used as an index into prebuild array of pnum values.
> >> - */
> >> -static inline uint16_t *
> >> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> >> -       uint16x8_t dp2)
> >> -{
> >> -  union {
> >> -          uint16_t u16[FWDSTEP + 1];
> >> -          uint64_t u64;
> >> -  } *pnum = (void *)pn;
> >> -
> >> -  int32_t v;
> >> -  uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> -
> >> -  dp1 = vceqq_u16(dp1, dp2);
> >> -  dp1 = vandq_u16(dp1, mask);
> >> -  v = vaddvq_u16(dp1);
> >> -
> >> -  /* update last port counter. */
> >> -  lp[0] += gptbl[v].lpv;
> >> -  rte_compiler_barrier();
> >> -
> >> -  /* if dest port value has changed. */
> >> -  if (v != GRPMSK) {
> >> -          pnum->u64 = gptbl[v].pnum;
> >> -          pnum->u16[FWDSTEP] = 1;
> >> -          lp = pnum->u16 + gptbl[v].idx;
> >> -  }
> >> -
> >> -  return lp;
> >> -}
> >> -
> >>   /**
> >>    * Process one packet:
> >>    * Update source and destination MAC addresses in the ethernet header.
> >> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>                     * <d[j-3], d[j-2], d[j-1], d[j], ... >
> >>                     */
> >>                    dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> >> -                  lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +                  lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> >> dp2);
> >>
> >>                    /*
> >>                     * dp1:
> >> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>             */
> >>            dp2 = vextq_u16(dp1, dp1, 1);
> >>            dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> >> -          lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +          lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >>
> >>            /*
> >>             * remove values added by the last repeated diff --git
> >> a/examples/meson.build b/examples/meson.build index
> >> 78de0e1f37..81e93799f2 100644
> >> --- a/examples/meson.build
> >> +++ b/examples/meson.build
> >> @@ -97,7 +97,7 @@ foreach example: examples
> >>       ldflags = default_ldflags
> >>
> >>       ext_deps = []
> >> -    includes = [include_directories(example)]
> >> +    includes = [include_directories(example, 'common')]
> >>       deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
> >>       subdir(example)
> >>
> >> --
> >> 2.25.1
> >

Reply via email to