On Fri, Mar 15, 2024 at 12:41:49PM -0500, Nathan Bossart wrote:
> I've also attached the results of running this benchmark on my machine at
> HEAD, after applying 0001, and after applying both 0001 and 0002.  0001
> appears to work pretty well.  When there is a small "tail," it regresses a
> small amount, but overall, it seems to improve more cases than it harms.
> 0002 does regress searches on smaller arrays quite a bit, since it
> postpones the SIMD optimizations until the arrays are longer.  It might be
> possible to mitigate that by using 2 registers when the "tail" is long enough,
> but I have yet to try that.

The attached 0003 is a sketch of what such a mitigation might look like.  It
appears to help with the regressions nicely.  I omitted the benchmarking
patch in v3 to appease cfbot.
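
For anyone skimming the patches, the control flow they end up with is
roughly the scalar-only sketch below.  block_contains() is just a
hypothetical stand-in for the vector32_load/vector32_eq/vector32_or
sequence; the real code of course uses the helpers in simd.h.

#include <stdbool.h>
#include <stdint.h>

#define NELEM_PER_VECTOR 4		/* four uint32s per 128-bit register */

/* hypothetical stand-in for the 2- or 4-register SIMD comparison */
static bool
block_contains(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t j = 0; j < nelem; j++)
	{
		if (base[j] == key)
			return true;
	}
	return false;
}

static bool
lfind32_sketch(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	uint32_t	i = 0;
	uint32_t	block = 4 * NELEM_PER_VECTOR;
	uint32_t	tail_idx = nelem & ~(block - 1);

	/* full-width (4-register) iterations */
	for (; i < tail_idx; i += block)
	{
		if (block_contains(key, &base[i], block))
			return true;
	}

	/* 0003: keep going with 2-register iterations */
	block = 2 * NELEM_PER_VECTOR;
	tail_idx = nelem & ~(block - 1);
	for (; i < tail_idx; i += block)
	{
		if (block_contains(key, &base[i], block))
			return true;
	}

	if (i == nelem)
		return false;			/* no elements left over */

	/* 0001: re-scan an overlapping block that ends exactly at nelem */
	if (tail_idx > 0)
		return block_contains(key, &base[nelem - block], block);

	/* array smaller than one block: plain per-element loop */
	return block_contains(key, &base[i], nelem - i);
}

The overlapping re-scan at the end is what lets the "tail" be handled
with full-width comparisons instead of falling back to the per-element
loop, at the cost of re-checking a few elements.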

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
From 3817435d200af5da954d505ae66245662dea064c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:26 -0500
Subject: [PATCH v3 1/3] pg_lfind32: process "tail" with SIMD instructions

---
 src/include/port/pg_lfind.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..9d21284724 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -103,7 +103,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -117,9 +117,11 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			break;
 		}
 	}
+	i = 0;
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+retry:
+	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
 					vals2,
@@ -157,6 +159,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	if (i == nelem)
+		return false;
+	else if (tail_idx > 0)
+	{
+		tail_idx = nelem;
+		i = nelem - nelem_per_iteration;
+		goto retry;
+	}
+
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1

From a867e342db08aae501374c75c0d8f17473a6cbc9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:52 -0500
Subject: [PATCH v3 2/3] add avx2 support in simd.h

---
 src/include/port/simd.h | 58 ++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..767127b85c 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +115,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +130,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +146,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +161,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +286,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +326,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +357,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +372,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +392,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +410,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +424,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +441,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1

From edd188759e4b937089c5bc2259a401cea1f9331f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 14:27:52 -0500
Subject: [PATCH v3 3/3] optimize pg_lfind32() by processing "tail" with fewer
 vectors

---
 src/include/port/pg_lfind.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 9d21284724..9c6cce0b69 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -100,7 +100,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
 	uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -120,7 +120,6 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	i = 0;
 #endif
 
-retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -160,6 +159,32 @@ retry:
 		}
 	}
 
+retry:
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		result = vector32_or(result1, result2);
+
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result == true);
+			return true;
+		}
+	}
+
 	if (i == nelem)
 		return false;
 	else if (tail_idx > 0)
-- 
2.25.1
