- Implemented SVE code for comparing signatures in bulk lookup. - The new SVE code is ~5% slower than the optimized NEON code on the N2 processor with 128-bit vectors.
Signed-off-by: Yoan Picchi <yoan.pic...@arm.com> Signed-off-by: Harjot Singh <harjot.si...@arm.com> Reviewed-by: Nathan Brown <nathan.br...@arm.com> Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> --- lib/hash/arch/arm/compare_signatures.h | 58 ++++++++++++++++++++++++++ lib/hash/rte_cuckoo_hash.c | 7 +++- lib/hash/rte_cuckoo_hash.h | 1 + 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h index 2601ed68b3..140ff97b1d 100644 --- a/lib/hash/arch/arm/compare_signatures.h +++ b/lib/hash/arch/arm/compare_signatures.h @@ -47,6 +47,64 @@ compare_signatures_dense(uint16_t *hitmask_buffer, *hitmask_buffer = vaddvq_u16(hit2); } break; +#endif +#if defined(RTE_HAS_SVE_ACLE) + case RTE_HASH_COMPARE_SVE: { + svuint16_t vsign, shift, sv_matches; + svbool_t pred, match, bucket_wide_pred; + int i = 0; + uint64_t vl = svcnth(); + + vsign = svdup_u16(sig); + shift = svindex_u16(0, 1); + + if (vl >= 2 * RTE_HASH_BUCKET_ENTRIES && RTE_HASH_BUCKET_ENTRIES <= 8) { + svuint16_t primary_array_vect, secondary_array_vect; + bucket_wide_pred = svwhilelt_b16(0, RTE_HASH_BUCKET_ENTRIES); + primary_array_vect = svld1_u16(bucket_wide_pred, prim_bucket_sigs); + secondary_array_vect = svld1_u16(bucket_wide_pred, sec_bucket_sigs); + + /* We merged the two vectors so we can do both comparisons at once */ + primary_array_vect = svsplice_u16(bucket_wide_pred, + primary_array_vect, + secondary_array_vect); + pred = svwhilelt_b16(0, 2*RTE_HASH_BUCKET_ENTRIES); + + /* Compare all signatures in the buckets */ + match = svcmpeq_u16(pred, vsign, primary_array_vect); + if (svptest_any(svptrue_b16(), match)) { + sv_matches = svdup_u16(1); + sv_matches = svlsl_u16_z(match, sv_matches, shift); + *hitmask_buffer = svorv_u16(svptrue_b16(), sv_matches); + } + } else { + do { + pred = svwhilelt_b16(i, RTE_HASH_BUCKET_ENTRIES); + uint16_t lower_half = 0; + uint16_t upper_half = 0; + /* Compare all signatures in the primary 
bucket */ + match = svcmpeq_u16(pred, vsign, svld1_u16(pred, + &prim_bucket_sigs[i])); + if (svptest_any(svptrue_b16(), match)) { + sv_matches = svdup_u16(1); + sv_matches = svlsl_u16_z(match, sv_matches, shift); + lower_half = svorv_u16(svptrue_b16(), sv_matches); + } + /* Compare all signatures in the secondary bucket */ + match = svcmpeq_u16(pred, vsign, svld1_u16(pred, + &sec_bucket_sigs[i])); + if (svptest_any(svptrue_b16(), match)) { + sv_matches = svdup_u16(1); + sv_matches = svlsl_u16_z(match, sv_matches, shift); + upper_half = svorv_u16(svptrue_b16(), sv_matches) + << RTE_HASH_BUCKET_ENTRIES; + } + hitmask_buffer[i / 8] = upper_half | lower_half; + i += vl; + } while (i < RTE_HASH_BUCKET_ENTRIES); + } + } + break; #endif default: for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c index 0697743cdf..75f555ba2c 100644 --- a/lib/hash/rte_cuckoo_hash.c +++ b/lib/hash/rte_cuckoo_hash.c @@ -450,8 +450,13 @@ rte_hash_create(const struct rte_hash_parameters *params) h->sig_cmp_fn = RTE_HASH_COMPARE_SSE; else #elif defined(RTE_ARCH_ARM64) - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) { h->sig_cmp_fn = RTE_HASH_COMPARE_NEON; +#if defined(RTE_HAS_SVE_ACLE) + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SVE)) + h->sig_cmp_fn = RTE_HASH_COMPARE_SVE; +#endif + } else #endif h->sig_cmp_fn = RTE_HASH_COMPARE_SCALAR; diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h index a528f1d1a0..01ad01c258 100644 --- a/lib/hash/rte_cuckoo_hash.h +++ b/lib/hash/rte_cuckoo_hash.h @@ -139,6 +139,7 @@ enum rte_hash_sig_compare_function { RTE_HASH_COMPARE_SCALAR = 0, RTE_HASH_COMPARE_SSE, RTE_HASH_COMPARE_NEON, + RTE_HASH_COMPARE_SVE, RTE_HASH_COMPARE_NUM }; -- 2.25.1