From: Harjot Singh <harjot.si...@arm.com>

- Implemented vector-length-agnostic SVE code for comparing signatures in bulk lookup.
- Added defines in the code for SVE support.
- The new optimised SVE code is 1-2 CPU cycles slower than NEON on the N2 processor.
Performance numbers from hash_perf_autotest:

Elements in Primary or Secondary Location

Results (in CPU cycles/operation)
-----------------------------------
Operations without data

Without pre-computed hash values

Keysize    Add/Lookup/Lookup_bulk
           Neon          SVE
4          93/71/26      93/71/27
8          93/70/26      93/70/27
9          94/74/27      94/74/28
13         100/80/31     100/79/32
16         100/78/30     100/78/31
32         109/110/38    108/110/39

With pre-computed hash values

Keysize    Add/Lookup/Lookup_bulk
           Neon          SVE
4          83/58/27      83/58/29
8          83/57/27      83/57/28
9          83/60/28      83/60/29
13         84/60/28      83/60/29
16         83/58/27      83/58/29
32         84/68/31      84/68/32

Signed-off-by: Harjot Singh <harjot.si...@arm.com>
Reviewed-by: Nathan Brown <nathan.br...@arm.com>
Reviewed-by: Feifei Wang <feifei.wa...@arm.com>
Reviewed-by: Jieqiang Wang <jieqiang.w...@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
--- .mailmap | 1 + lib/hash/rte_cuckoo_hash.c | 37 ++++++++++++++++++++++++++++++++++++- lib/hash/rte_cuckoo_hash.h | 1 + 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 864d33ee46..2cce48c900 100644 --- a/.mailmap +++ b/.mailmap @@ -481,6 +481,7 @@ Hari Kumar Vemula <hari.kumarx.vem...@intel.com> Harini Ramakrishnan <harini.ramakrish...@microsoft.com> Hariprasad Govindharajan <hariprasad.govindhara...@intel.com> Harish Patil <harish.pa...@cavium.com> <harish.pa...@qlogic.com> +Harjot Singh <harjot.si...@arm.com> Harman Kalra <hka...@marvell.com> Harneet Singh <harneet.si...@intel.com> Harold Huang <baymaxhu...@gmail.com> diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c index d92a903bb3..fdb06eb33e 100644 --- a/lib/hash/rte_cuckoo_hash.c +++ b/lib/hash/rte_cuckoo_hash.c @@ -435,8 +435,11 @@ rte_hash_create(const struct rte_hash_parameters *params) h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; else #elif defined(RTE_ARCH_ARM64) - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) { h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; + if 
(rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE)) + h->sig_cmp_fn = RTE_HASH_COMPARE_SVE; + } else #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; @@ -1892,6 +1895,38 @@ compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, *sec_hash_matches = (uint32_t)(vaddvq_u16(x)); } break; +#if defined(RTE_HAS_SVE_ACLE) + case RTE_HASH_COMPARE_SVE: { + svuint16_t vsign, shift, sv_prim_matches, sv_sec_matches; + svbool_t pred, p_match, s_match; + int i = 0; + uint64_t vl = svcnth(); + + vsign = svdup_u16(sig); + shift = svindex_u16(0, 2); + do { + pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES); + /* Compare all signatures in the primary bucket */ + p_match = svcmpeq_u16(pred, vsign, svld1_u16(pred, + &prim_bkt->sig_current[i])); + if (svptest_any(svptrue_b16(), p_match)) { + sv_prim_matches = svdup_u16_z(p_match, 1); + sv_prim_matches = svlsl_u16_z(pred, sv_prim_matches, shift); + *prim_hash_matches |= svorv_u16(pred, sv_prim_matches); + } + /* Compare all signatures in the secondary bucket */ + s_match = svcmpeq_u16(pred, vsign, svld1_u16(pred, + &sec_bkt->sig_current[i])); + if (svptest_any(svptrue_b16(), s_match)) { + sv_sec_matches = svdup_u16_z(s_match, 1); + sv_sec_matches = svlsl_u16_z(pred, sv_sec_matches, shift); + *sec_hash_matches |= svorv_u16(pred, sv_sec_matches); + } + i += vl; + } while (i < RTE_HASH_BUCKET_ENTRIES); + } + break; +#endif #endif default: for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h index eb2644f74b..356ec2a69e 100644 --- a/lib/hash/rte_cuckoo_hash.h +++ b/lib/hash/rte_cuckoo_hash.h @@ -148,6 +148,7 @@ enum rte_hash_sig_compare_function { RTE_HASH_COMPARE_SCALAR = 0, RTE_HASH_COMPARE_SSE, RTE_HASH_COMPARE_NEON, + RTE_HASH_COMPARE_SVE, RTE_HASH_COMPARE_NUM }; -- 2.25.1