On Mon, 2019-02-11 at 15:30 +0800, Ruifeng Wang wrote: > > ------------------------------------------------------------------- > --- > Implemented signature compare function based on neon intrinsic. > Hash bulk lookup had 3% - 6% performance gain after optimization. > > Signed-off-by: Ruifeng Wang <ruifeng.w...@arm.com> > --- > lib/librte_hash/rte_cuckoo_hash.c | 32 > ++++++++++++++++++++++++++++++- > lib/librte_hash/rte_cuckoo_hash.h | 1 + > 2 files changed, 32 insertions(+), 1 deletion(-) > > diff --git a/lib/librte_hash/rte_cuckoo_hash.c > b/lib/librte_hash/rte_cuckoo_hash.c > index c01489ba5..5745a254f 100644 > --- a/lib/librte_hash/rte_cuckoo_hash.c > +++ b/lib/librte_hash/rte_cuckoo_hash.c > @@ -26,6 +26,9 @@ > #include <rte_spinlock.h> > #include <rte_ring.h> > #include <rte_compat.h> > +#if defined(RTE_ARCH_ARM64) > +#include <arm_neon.h> > +#endif
Including rte_vector.h instead would remove the need for the #if defined(...) guard here. > > #include "rte_hash.h" > #include "rte_cuckoo_hash.h" > @@ -407,6 +410,10 @@ rte_hash_create(const struct rte_hash_parameters > *params) > if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE2)) > h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; > else > +#elif defined(RTE_ARCH_ARM64) > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) > + h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; > + else > #endif > h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; > > @@ -1578,10 +1585,15 @@ compare_signatures(uint32_t > *prim_hash_matches, uint32_t *sec_hash_matches, > enum rte_hash_sig_compare_function sig_cmp_fn) > { > unsigned int i; > +#ifdef RTE_MACHINE_CPUFLAG_NEON > + uint16x8_t vmat, vsig, x; > + uint64x2_t x64; > + int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1}; > +#endif Is it possible to move the variable declarations down, to avoid the need for the #ifdef here? > > /* For match mask the first bit of every two bits indicates the > match */ > switch (sig_cmp_fn) { > -#ifdef RTE_MACHINE_CPUFLAG_SSE2 > +#if defined(RTE_MACHINE_CPUFLAG_SSE2) > case RTE_HASH_COMPARE_SSE: > /* Compare all signatures in the bucket */ > *prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16( > @@ -1594,6 +1606,24 @@ compare_signatures(uint32_t > *prim_hash_matches, uint32_t *sec_hash_matches, > (__m128i const *)sec_bkt- > >sig_current), > _mm_set1_epi16(sig))); > break; > +#elif defined(RTE_MACHINE_CPUFLAG_NEON) > + case RTE_HASH_COMPARE_NEON: > + vsig = vld1q_dup_u16((uint16_t const *)&sig); > + /* Compare all signatures in the primary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)prim_bkt- > >sig_current)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), > shift); > + x64 = vpaddlq_u32(vpaddlq_u16(x)); > + *prim_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) > + > + vgetq_lane_u64(x64, 1)); > + /* Compare all signatures in the secondary bucket */ > + vmat = vceqq_u16(vsig, > + vld1q_u16((uint16_t const *)sec_bkt- > 
>sig_current)); > + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), > shift); > + x64 = vpaddlq_u32(vpaddlq_u16(x)); > + *sec_hash_matches = (uint32_t)(vgetq_lane_u64(x64, 0) + > + vgetq_lane_u64(x64, 1)); > + break; > #endif > default: > for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { > diff --git a/lib/librte_hash/rte_cuckoo_hash.h > b/lib/librte_hash/rte_cuckoo_hash.h > index eacdaa8d4..0548c97f0 100644 > --- a/lib/librte_hash/rte_cuckoo_hash.h > +++ b/lib/librte_hash/rte_cuckoo_hash.h > @@ -141,6 +141,7 @@ struct rte_hash_key { > enum rte_hash_sig_compare_function { > RTE_HASH_COMPARE_SCALAR = 0, > RTE_HASH_COMPARE_SSE, > + RTE_HASH_COMPARE_NEON, > RTE_HASH_COMPARE_NUM > }; >