Here's a new version of 0002 with a modified SSE2 implementation, as discussed elsewhere [0]. This allows us to remove vector8_ssub().
[0] https://postgr.es/m/aNWO7L43UevRErw_%40nathan

-- 
nathan
From a644f049add68ea78326f88d8994898c92f23c20 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 22 Sep 2025 16:17:09 -0500
Subject: [PATCH v2 1/1] Optimize vector8_has_le() on AArch64.

---
 src/include/port/simd.h | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 97c5f353022..e317e5fdfc0 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -86,7 +86,6 @@ static inline uint32 vector8_highbit_mask(const Vector8 v);
 static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
 #ifndef USE_NO_SIMD
 static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
-static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
 #endif
 
 /*
@@ -250,14 +249,13 @@ vector8_has_le(const Vector8 v, const uint8 c)
 			}
 		}
 	}
-#else
+#elif defined(USE_SSE2)
+	Vector8 umin = _mm_min_epu8(v, vector8_broadcast(c));
+	Vector8 cmpe = _mm_cmpeq_epi8(umin, v);
 
-	/*
-	 * Use saturating subtraction to find bytes <= c, which will present as
-	 * NUL bytes. This approach is a workaround for the lack of unsigned
-	 * comparison instructions on some architectures.
-	 */
-	result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
+	result = vector8_is_highbit_set(cmpe);
+#elif defined(USE_NEON)
+	result = vminvq_u8(v) <= c;
 #endif
 
 	Assert(assert_result == result);
@@ -358,24 +356,6 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 }
 #endif							/* ! USE_NO_SIMD */
 
-/*
- * Return the result of subtracting the respective elements of the input
- * vectors using saturation (i.e., if the operation would yield a value less
- * than zero, zero is returned instead). For more information on saturation
- * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
- */
-#ifndef USE_NO_SIMD
-static inline Vector8
-vector8_ssub(const Vector8 v1, const Vector8 v2)
-{
-#ifdef USE_SSE2
-	return _mm_subs_epu8(v1, v2);
-#elif defined(USE_NEON)
-	return vqsubq_u8(v1, v2);
-#endif
-}
-#endif							/* ! USE_NO_SIMD */
-
 /*
  * Return a vector with all bits set in each lane where the corresponding
  * lanes in the inputs are equal.
-- 
2.39.5 (Apple Git-154)