llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-libc Author: None (pcc) <details> <summary>Changes</summary> We can use UMAXV.4S to reduce the comparison result in a single instruction. This improves performance by roughly 4% on Apple M1: Summary bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 ran 1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.02 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 (1 = original, 2 = a variant of this patch that uses UMAXV.16B, 3 = this patch) --- Full diff: https://github.com/llvm/llvm-project/pull/99260.diff 1 Files Affected: - (modified) libc/src/string/memory_utils/op_aarch64.h (+6-12) ``````````diff diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h index 1090ea2617f09..5c08a6ae48b04 100644 --- a/libc/src/string/memory_utils/op_aarch64.h +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -84,8 +84,7 @@ template <size_t Size> struct Bcmp { uint8x16_t a = vld1q_u8(_p1); uint8x16_t n = vld1q_u8(_p2); uint8x16_t an = veorq_u8(a, n); - uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an)); - return vmaxv_u32(an_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(an)); } else if constexpr (Size == 32) { auto _p1 = as_u8(p1); auto _p2 = as_u8(p2); @@ -97,12 +96,9 @@ template <size_t Size> struct Bcmp { uint8x16_t bo = veorq_u8(b, o); // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is // a difference between the two buffers. We reduce this value down to 4 - // bytes in two steps. First, calculate the saturated move value when - // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get - // a single 32 bit nonzero value if a mismatch occurred. + // bytes using the UMAXV instruction to compute the max across the vector. uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); - return vmaxv_u32(anbo_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(anbo)); } else if constexpr ((Size % BlockSize) == 0) { for (size_t offset = 0; offset < Size; offset += BlockSize) if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset)) @@ -129,8 +125,7 @@ template <size_t Size> struct Bcmp { uint8x16_t bo = veorq_u8(b, o); // anbo = (a ^ n) | (b ^ o) uint8x16_t anbo = vorrq_u8(an, bo); - uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); - return vmaxv_u32(anbo_reduced); + return vmaxvq_u32(vreinterpretq_u32_u8(anbo)); } else if constexpr (Size == 32) { auto _p1 = as_u8(p1); auto _p2 = as_u8(p2); @@ -150,9 +145,8 @@ template <size_t Size> struct Bcmp { uint8x16_t cpdq = vorrq_u8(cp, dq); // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to // a nonzero 32 bit value if a mismatch occurred. - uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); - uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); - return vmaxv_u32(abnocpdq_reduced); + uint8x16_t abnocpdq = anbo | cpdq; + return vmaxvq_u32(vreinterpretq_u32_u8(abnocpdq)); } else { static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented"); } `````````` </details> https://github.com/llvm/llvm-project/pull/99260 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits