On Thu, May 19, 2022 at 7:04 PM Medvedkin, Vladimir
<vladimir.medved...@intel.com> wrote:
>
> Hi Stanislaw, Michal,
>
> As far as I can see, this implementation almost completely repeats other
> lookupx4() implementations, except for the use of vector instructions.
>
> On my board (x86_64) in lpm_perf_autotest your implementation takes about:
> LPM LookupX4: 29.5 cycles (fails = 12.5%)
>
> replacing this code with a simple loop with rte_lpm_lookup():
>
> uint32_t nh;
> int i, ret;
>
> for (i = 0; i < 4; i++) {
>         ret = rte_lpm_lookup((struct rte_lpm *)lpm, ((rte_xmm_t)ip).u32[i], &nh);
>         hop[i] = (ret == 0) ? nh : defv;
> }
>
> works faster:
> LPM LookupX4: 22.2 cycles (fails = 12.5%)
>
> I'm wondering if this will work faster on your board (I assume it is
> RISC-V arch)?

Hi Vladimir,
On my HiFive Unmatched RISC-V board there is a marginal difference (~ -1.56%):

Our version:            210.5 cycles (fails = 12.5%)
rte_lpm_lookup version: 213.8 cycles (fails = 12.5%)

Given that x86 is faster with rte_lpm_lookup, I'll change to this
implementation in the next version.

That said, I wonder why we have different const requirements for
rte_lpm_lookup() and rte_lpm_lookupx4():

static inline int
rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop)

static inline void
rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
        uint32_t defv);

I think both should be const.
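For reference, a rough, untested sketch of what the rte_lpm_lookupx4() in
rte_lpm_scalar.h could look like in the next revision when built on top of
your loop (same includes as in the patch below; assumes the current
rte_lpm_lookup() convention of returning 0 on a hit):

static inline void
rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
        uint32_t defv)
{
        rte_xmm_t ip4;
        uint32_t nh;
        int i, ret;

        /* Use the rte_xmm_t union to index the 4 packed IPv4 addresses. */
        ip4.x = ip;
        for (i = 0; i < 4; i++) {
                /* Cast away const only because rte_lpm_lookup() lacks it. */
                ret = rte_lpm_lookup((struct rte_lpm *)lpm, ip4.u32[i], &nh);
                hop[i] = (ret == 0) ? nh : defv;
        }
}

If rte_lpm_lookup() is also made const, the cast can simply be dropped.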
>
> Thanks!
>
> On 10/05/2022 12:58, Stanislaw Kardach wrote:
> > From: Michal Mazurek <m...@semihalf.com>
> >
> > Add an implementation of the rte_lpm_lookupx4() function for platforms
> > without support for vector operations.
> >
> > This will be useful in the upcoming RISC-V port as well as any platform
> > which may want to start with a basic level of LPM support.
> >
> > Signed-off-by: Michal Mazurek <m...@semihalf.com>
> > Signed-off-by: Stanislaw Kardach <k...@semihalf.com>
> > ---
> >   doc/guides/rel_notes/release_22_07.rst |   5 +
> >   lib/lpm/meson.build                    |   1 +
> >   lib/lpm/rte_lpm.h                      |   4 +-
> >   lib/lpm/rte_lpm_scalar.h               | 122 +++++++++++++++++++++++++
> >   4 files changed, 131 insertions(+), 1 deletion(-)
> >   create mode 100644 lib/lpm/rte_lpm_scalar.h
> >
> > diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
> > index 4ae91dd94d..73e8d632f2 100644
> > --- a/doc/guides/rel_notes/release_22_07.rst
> > +++ b/doc/guides/rel_notes/release_22_07.rst
> > @@ -70,6 +70,11 @@ New Features
> >     * Added AH mode support in lookaside protocol (IPsec) for CN9K & CN10K.
> >     * Added AES-GMAC support in lookaside protocol (IPsec) for CN9K & CN10K.
> >
> > +* **Added scalar version of the LPM library.**
> > +
> > +  * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back
> > +    implementation for platforms that don't support vector operations.
> > +
> >
> >   Removed Items
> >   -------------
> > diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build
> > index 78d91d3421..6b47361fce 100644
> > --- a/lib/lpm/meson.build
> > +++ b/lib/lpm/meson.build
> > @@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h')
> >   indirect_headers += files(
> >           'rte_lpm_altivec.h',
> >           'rte_lpm_neon.h',
> > +        'rte_lpm_scalar.h',
> >           'rte_lpm_sse.h',
> >           'rte_lpm_sve.h',
> >   )
> > diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
> > index eb91960e81..b5db6a353a 100644
> > --- a/lib/lpm/rte_lpm.h
> > +++ b/lib/lpm/rte_lpm.h
> > @@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >   #endif
> >   #elif defined(RTE_ARCH_PPC_64)
> >   #include "rte_lpm_altivec.h"
> > -#else
> > +#elif defined(RTE_ARCH_X86)
> >   #include "rte_lpm_sse.h"
> > +#else
> > +#include "rte_lpm_scalar.h"
> >   #endif
> >
> >   #ifdef __cplusplus
> > diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h
> > new file mode 100644
> > index 0000000000..991b94e687
> > --- /dev/null
> > +++ b/lib/lpm/rte_lpm_scalar.h
> > @@ -0,0 +1,122 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 StarFive
> > + * Copyright(c) 2022 SiFive
> > + * Copyright(c) 2022 Semihalf
> > + */
> > +
> > +#ifndef _RTE_LPM_SCALAR_H_
> > +#define _RTE_LPM_SCALAR_H_
> > +
> > +#include <rte_branch_prediction.h>
> > +#include <rte_byteorder.h>
> > +#include <rte_common.h>
> > +#include <rte_vect.h>
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +static inline void
> > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +        uint32_t defv)
> > +{
> > +        rte_xmm_t i24;
> > +        rte_xmm_t i8;
> > +        uint32_t tbl[4];
> > +        uint64_t pt, pt2;
> > +        const uint32_t *ptbl;
> > +
> > +        const rte_xmm_t mask8 = {
> > +                .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}};
> > +
> > +        /*
> > +         * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> > +         * as one 64-bit value (0x0300000003000000).
> > +         */
> > +        const uint64_t mask_xv =
> > +                ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> > +                (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> > +
> > +        /*
> > +         * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> > +         * as one 64-bit value (0x0100000001000000).
> > +         */
> > +        const uint64_t mask_v =
> > +                ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> > +                (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> > +
> > +        /* get 4 indexes for tbl24[]. */
> > +        i24.x = ip;
> > +        i24.u32[0] >>= CHAR_BIT;
> > +        i24.u32[1] >>= CHAR_BIT;
> > +        i24.u32[2] >>= CHAR_BIT;
> > +        i24.u32[3] >>= CHAR_BIT;
> > +
> > +        /* extract values from tbl24[] */
> > +        ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[0]];
> > +        tbl[0] = *ptbl;
> > +        ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[1]];
> > +        tbl[1] = *ptbl;
> > +        ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[2]];
> > +        tbl[2] = *ptbl;
> > +        ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[3]];
> > +        tbl[3] = *ptbl;
> > +
> > +        /* get 4 indexes for tbl8[]. */
> > +        i8.x = ip;
> > +        i8.u64[0] &= mask8.u64[0];
> > +        i8.u64[1] &= mask8.u64[1];
> > +
> > +        pt = (uint64_t)tbl[0] |
> > +                (uint64_t)tbl[1] << 32;
> > +        pt2 = (uint64_t)tbl[2] |
> > +                (uint64_t)tbl[3] << 32;
> > +
> > +        /* search successfully finished for all 4 IP addresses. */
> > +        if (likely((pt & mask_xv) == mask_v) &&
> > +                        likely((pt2 & mask_xv) == mask_v)) {
> > +                *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> > +                *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> > +                return;
> > +        }
> > +
> > +        if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> > +                        RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> > +                i8.u32[0] = i8.u32[0] +
> > +                        (tbl[0] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> > +                ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> > +                tbl[0] = *ptbl;
> > +        }
> > +        if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> > +                        RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> > +                i8.u32[1] = i8.u32[1] +
> > +                        (tbl[1] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> > +                ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> > +                tbl[1] = *ptbl;
> > +        }
> > +        if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> > +                        RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> > +                i8.u32[2] = i8.u32[2] +
> > +                        (tbl[2] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> > +                ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> > +                tbl[2] = *ptbl;
> > +        }
> > +        if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> > +                        RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> > +                i8.u32[3] = i8.u32[3] +
> > +                        (tbl[3] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> > +                ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> > +                tbl[3] = *ptbl;
> > +        }
> > +
> > +        hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv;
> > +        hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv;
> > +        hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv;
> > +        hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
> > +}
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_LPM_SCALAR_H_ */
>
> --
> Regards,
> Vladimir