Here's a new version of 0002 with a modified SSE2 implementation, as
discussed elsewhere [0].  This allows us to remove vector8_ssub().

[0] https://postgr.es/m/aNWO7L43UevRErw_%40nathan

-- 
nathan
From a644f049add68ea78326f88d8994898c92f23c20 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 22 Sep 2025 16:17:09 -0500
Subject: [PATCH v2 1/1] Optimize vector8_has_le() on AArch64.

---
 src/include/port/simd.h | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 97c5f353022..e317e5fdfc0 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -86,7 +86,6 @@ static inline uint32 vector8_highbit_mask(const Vector8 v);
 static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
 #ifndef USE_NO_SIMD
 static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
-static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
 #endif
 
 /*
@@ -250,14 +249,13 @@ vector8_has_le(const Vector8 v, const uint8 c)
                        }
                }
        }
-#else
+#elif defined(USE_SSE2)
+       Vector8         umin = _mm_min_epu8(v, vector8_broadcast(c));
+       Vector8         cmpe = _mm_cmpeq_epi8(umin, v);
 
-       /*
-        * Use saturating subtraction to find bytes <= c, which will present as
-        * NUL bytes.  This approach is a workaround for the lack of unsigned
-        * comparison instructions on some architectures.
-        */
-       result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
+       result = vector8_is_highbit_set(cmpe);
+#elif defined(USE_NEON)
+       result = vminvq_u8(v) <= c;
 #endif
 
        Assert(assert_result == result);
@@ -358,24 +356,6 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 }
 #endif                                                 /* ! USE_NO_SIMD */
 
-/*
- * Return the result of subtracting the respective elements of the input
- * vectors using saturation (i.e., if the operation would yield a value less
- * than zero, zero is returned instead).  For more information on saturation
- * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
- */
-#ifndef USE_NO_SIMD
-static inline Vector8
-vector8_ssub(const Vector8 v1, const Vector8 v2)
-{
-#ifdef USE_SSE2
-       return _mm_subs_epu8(v1, v2);
-#elif defined(USE_NEON)
-       return vqsubq_u8(v1, v2);
-#endif
-}
-#endif                                                 /* ! USE_NO_SIMD */
-
 /*
  * Return a vector with all bits set in each lane where the corresponding
  * lanes in the inputs are equal.
-- 
2.39.5 (Apple Git-154)

Reply via email to