The current hitmask includes one padding bit per entry, an artifact of Intel's SIMD implementation (movemask produces two bits per 16-bit comparison). This patch lets non-Intel SIMD implementations (NEON) produce and consume a dense hitmask — one bit per bucket entry — avoiding the shift-by-one decode on every lookup.
Signed-off-by: Yoan Picchi <yoan.pic...@arm.com> --- .mailmap | 2 + lib/hash/rte_cuckoo_hash.c | 118 ++++++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 34 deletions(-) diff --git a/.mailmap b/.mailmap index 3f5bab26a8..b9c49aa7f6 100644 --- a/.mailmap +++ b/.mailmap @@ -485,6 +485,7 @@ Hari Kumar Vemula <hari.kumarx.vem...@intel.com> Harini Ramakrishnan <harini.ramakrish...@microsoft.com> Hariprasad Govindharajan <hariprasad.govindhara...@intel.com> Harish Patil <harish.pa...@cavium.com> <harish.pa...@qlogic.com> +Harjot Singh <harjot.si...@arm.com> Harman Kalra <hka...@marvell.com> Harneet Singh <harneet.si...@intel.com> Harold Huang <baymaxhu...@gmail.com> @@ -1602,6 +1603,7 @@ Yixue Wang <yixue.w...@intel.com> Yi Yang <yangy...@inspur.com> <yi.y.y...@intel.com> Yi Zhang <zhang.y...@zte.com.cn> Yoann Desmouceaux <ydesm...@cisco.com> +Yoan Picchi <yoan.pic...@arm.com> Yogesh Jangra <yogesh.jan...@intel.com> Yogev Chaimovich <yo...@cgstowernetworks.com> Yongjie Gu <yongjiex...@intel.com> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c index 19b23f2a97..2aa96eb862 100644 --- a/lib/hash/rte_cuckoo_hash.c +++ b/lib/hash/rte_cuckoo_hash.c @@ -1850,8 +1850,50 @@ rte_hash_free_key_with_position(const struct rte_hash *h, } +#if defined(__ARM_NEON) + +static inline void +compare_signatures_dense(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, + const struct rte_hash_bucket *prim_bkt, + const struct rte_hash_bucket *sec_bkt, + uint16_t sig, + enum rte_hash_sig_compare_function sig_cmp_fn) +{ + unsigned int i; + + /* For match mask every bits indicates the match */ + switch (sig_cmp_fn) { + case RTE_HASH_COMPARE_NEON: { + uint16x8_t vmat, vsig, x; + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + + vsig = vld1q_dup_u16((uint16_t const *)&sig); + /* Compare all signatures in the primary bucket */ + vmat = vceqq_u16(vsig, + vld1q_u16((uint16_t const *)prim_bkt->sig_current)); + x = vshlq_u16(vandq_u16(vmat, 
vdupq_n_u16(0x0001)), shift); + *prim_hash_matches = (uint32_t)(vaddvq_u16(x)); + /* Compare all signatures in the secondary bucket */ + vmat = vceqq_u16(vsig, + vld1q_u16((uint16_t const *)sec_bkt->sig_current)); + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); + *sec_hash_matches = (uint32_t)(vaddvq_u16(x)); + } + break; + default: + for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + *prim_hash_matches |= + ((sig == prim_bkt->sig_current[i]) << i); + *sec_hash_matches |= + ((sig == sec_bkt->sig_current[i]) << i); + } + } +} + +#else + static inline void -compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, const struct rte_hash_bucket *prim_bkt, const struct rte_hash_bucket *sec_bkt, uint16_t sig, @@ -1878,25 +1920,7 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, /* Extract the even-index bits only */ *sec_hash_matches &= 0x5555; break; -#elif defined(__ARM_NEON) - case RTE_HASH_COMPARE_NEON: { - uint16x8_t vmat, vsig, x; - int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1}; - - vsig = vld1q_dup_u16((uint16_t const *)&sig); - /* Compare all signatures in the primary bucket */ - vmat = vceqq_u16(vsig, - vld1q_u16((uint16_t const *)prim_bkt->sig_current)); - x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); - *prim_hash_matches = (uint32_t)(vaddvq_u16(x)); - /* Compare all signatures in the secondary bucket */ - vmat = vceqq_u16(vsig, - vld1q_u16((uint16_t const *)sec_bkt->sig_current)); - x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); - *sec_hash_matches = (uint32_t)(vaddvq_u16(x)); - } - break; -#endif +#endif /* defined(__SSE2__) */ default: for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { *prim_hash_matches |= @@ -1907,6 +1931,8 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, } } +#endif /* defined(__ARM_NEON) */ + static inline void __bulk_lookup_l(const 
struct rte_hash *h, const void **keys, const struct rte_hash_bucket **primary_bkt, @@ -1921,18 +1947,30 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; struct rte_hash_bucket *cur_bkt, *next_bkt; +#if defined(__ARM_NEON) + const int hitmask_padding = 0; +#else + const int hitmask_padding = 1; +#endif + __hash_rw_reader_lock(h); /* Compare signatures and prefetch key slot of first hit */ for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], +#if defined(__ARM_NEON) + compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i], + primary_bkt[i], secondary_bkt[i], + sig[i], h->sig_cmp_fn); +#else + compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i], primary_bkt[i], secondary_bkt[i], sig[i], h->sig_cmp_fn); +#endif if (prim_hitmask[i]) { uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -1946,7 +1984,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, if (sec_hitmask[i]) { uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -1963,7 +2001,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, while (prim_hitmask[i]) { uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[hit_index]; const struct rte_hash_key *key_slot = @@ -1985,13 +2023,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding)); } while (sec_hitmask[i]) { uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = 
secondary_bkt[i]->key_idx[hit_index]; const struct rte_hash_key *key_slot = @@ -2014,7 +2052,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); + sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding)); } next_key: continue; @@ -2069,6 +2107,12 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, struct rte_hash_bucket *cur_bkt, *next_bkt; uint32_t cnt_b, cnt_a; +#if defined(__ARM_NEON) + const int hitmask_padding = 0; +#else + const int hitmask_padding = 1; +#endif + for (i = 0; i < num_keys; i++) positions[i] = -ENOENT; @@ -2082,14 +2126,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, /* Compare signatures and prefetch key slot of first hit */ for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], +#if defined(__ARM_NEON) + compare_signatures_dense(&prim_hitmask[i], &sec_hitmask[i], primary_bkt[i], secondary_bkt[i], sig[i], h->sig_cmp_fn); +#else + compare_signatures_sparse(&prim_hitmask[i], &sec_hitmask[i], + primary_bkt[i], secondary_bkt[i], + sig[i], h->sig_cmp_fn); +#endif if (prim_hitmask[i]) { uint32_t first_hit = __builtin_ctzl(prim_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -2103,7 +2153,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, if (sec_hitmask[i]) { uint32_t first_hit = __builtin_ctzl(sec_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -2119,7 +2169,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, while (prim_hitmask[i]) { uint32_t hit_index = __builtin_ctzl(prim_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = __atomic_load_n( &primary_bkt[i]->key_idx[hit_index], @@ -2145,13 +2195,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void 
**keys, positions[i] = key_idx - 1; goto next_key; } - prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + prim_hitmask[i] &= ~(1 << (hit_index << hitmask_padding)); } while (sec_hitmask[i]) { uint32_t hit_index = __builtin_ctzl(sec_hitmask[i]) - >> 1; + >> hitmask_padding; uint32_t key_idx = __atomic_load_n( &secondary_bkt[i]->key_idx[hit_index], @@ -2178,7 +2228,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); + sec_hitmask[i] &= ~(1 << (hit_index << hitmask_padding)); } next_key: continue; -- 2.25.1