On Fri, Mar 15, 2024 at 12:41:49PM -0500, Nathan Bossart wrote:
> I've also attached the results of running this benchmark on my machine at
> HEAD, after applying 0001, and after applying both 0001 and 0002.  0001
> appears to work pretty well.  When there is a small "tail," it regresses a
> small amount, but overall, it seems to improve more cases than it harms.
> 0002 does regress searches on smaller arrays quite a bit, since it
> postpones the SIMD optimizations until the arrays are longer.  It might be
> possible to mitigate by using 2 registers when the "tail" is long enough,
> but I have yet to try that.
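To spell out what a 2-register pass buys with the patches below: AVX2
registers hold 8 uint32s, so the existing 4-register loop needs at least 32
elements before it runs at all, while a 2-register pass only needs 16.  For
a 20-element array, the scan order with 0002 and 0003 applied works out to
roughly:

    4-register pass (32 elements/iteration):   skipped, since 20 < 32
    2-register pass (16 elements/iteration):   scans [0, 16)
    overlapping final block:                   re-scans [4, 20)
    scalar loop:                               not reached

leaving only arrays shorter than two registers (16 elements) on the scalar
path.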
The attached 0003 is a sketch of what such mitigation might look like.  It
appears to help with the regressions nicely.  I omitted the benchmarking
patch in v3 to appease cfbot.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 3817435d200af5da954d505ae66245662dea064c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:26 -0500
Subject: [PATCH v3 1/3] pg_lfind32: process "tail" with SIMD instructions

---
 src/include/port/pg_lfind.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..9d21284724 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -103,7 +103,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
     const uint32 nelem_per_iteration = 4 * nelem_per_vector;
 
     /* round down to multiple of elements per iteration */
-    const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+    uint32      tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
     bool        assert_result = false;
@@ -117,9 +117,11 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
             break;
         }
     }
+    i = 0;
 #endif
 
-    for (i = 0; i < tail_idx; i += nelem_per_iteration)
+retry:
+    for (; i < tail_idx; i += nelem_per_iteration)
     {
         Vector32    vals1,
                     vals2,
@@ -157,6 +159,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
             return true;
         }
     }
+
+    if (i == nelem)
+        return false;
+    else if (tail_idx > 0)
+    {
+        tail_idx = nelem;
+        i = nelem - nelem_per_iteration;
+        goto retry;
+    }
+
 #endif                          /* ! USE_NO_SIMD */
 
     /* Process the remaining elements one at a time. */
-- 
2.25.1
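For a concrete sense of what 0001 does with today's 128-bit registers (16
elements per iteration): for nelem = 36 the main loop covers [0, 32), then
tail_idx is raised to nelem, i is rewound to 36 - 16 = 20, and the same loop
re-runs once over [20, 36).  Elements 20-31 get compared twice, which is
harmless for a membership test and cheaper than finishing them one at a
time.  Arrays shorter than one full iteration still fall through to the
scalar loop, which is the case 0002 widens and 0003 narrows again.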
>From a867e342db08aae501374c75c0d8f17473a6cbc9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 12:26:52 -0500
Subject: [PATCH v3 2/3] add avx2 support in simd.h

---
 src/include/port/simd.h | 58 ++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..767127b85c 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +115,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+    *v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
     *v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
     *v = vld1q_u8(s);
@@ -120,7 +130,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    *v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
     *v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
     *v = vld1q_u32(s);
@@ -134,7 +146,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+    return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
     return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
     return vdupq_n_u8(c);
@@ -147,7 +161,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
     return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
     return vdupq_n_u32(c);
@@ -270,7 +286,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
     return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
     return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +326,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
     return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
     /*
@@ -337,7 +357,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
     return vorrq_u8(v1, v2);
@@ -350,7 +372,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
     return vorrq_u32(v1, v2);
@@ -368,7 +392,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
     return vqsubq_u8(v1, v2);
@@ -384,7 +410,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
     return vceqq_u8(v1, v2);
@@ -396,7 +424,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
     return vceqq_u32(v1, v2);
@@ -411,7 +441,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+    return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
     return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
     return vminq_u8(v1, v2);
-- 
2.25.1
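Note that 0002 only engages when the compiler is already generating AVX2
code, since it keys off __AVX2__; gcc and clang define that macro under
-mavx2 (or an -march that includes AVX2), and MSVC under /arch:AVX2, so
there is no runtime dispatch here.  A throwaway program along these lines
(an illustration only, mimicking the typedefs rather than including simd.h)
shows the size change behind the small-array regression: the 4-register
loop now needs 32 elements before it runs.

#include <stdio.h>

#if defined(__AVX2__)
#include <immintrin.h>
typedef __m256i Vector32;
#else
#include <emmintrin.h>          /* assumes x86-64, as simd.h does today */
typedef __m128i Vector32;
#endif

int
main(void)
{
    unsigned int nelem_per_vector = sizeof(Vector32) / sizeof(unsigned int);

    /* prints 4 and 16 for SSE2, 8 and 32 when built with -mavx2 */
    printf("per vector: %u, per 4-register iteration: %u\n",
           nelem_per_vector, 4 * nelem_per_vector);
    return 0;
}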
>From edd188759e4b937089c5bc2259a401cea1f9331f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 15 Mar 2024 14:27:52 -0500
Subject: [PATCH v3 3/3] optimize pg_lfind32() by processing "tail" with
 fewer vectors

---
 src/include/port/pg_lfind.h | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 9d21284724..9c6cce0b69 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -100,7 +100,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
      */
     const Vector32 keys = vector32_broadcast(key);  /* load copies of key */
     const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-    const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+    uint32      nelem_per_iteration = 4 * nelem_per_vector;
 
     /* round down to multiple of elements per iteration */
     uint32      tail_idx = nelem & ~(nelem_per_iteration - 1);
@@ -120,7 +120,6 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
     i = 0;
 #endif
 
-retry:
     for (; i < tail_idx; i += nelem_per_iteration)
     {
         Vector32    vals1,
@@ -160,6 +159,32 @@ retry:
         }
     }
 
+retry:
+    nelem_per_iteration = 2 * nelem_per_vector;
+    tail_idx = nelem & ~(nelem_per_iteration - 1);
+    for (; i < tail_idx; i += nelem_per_iteration)
+    {
+        Vector32    vals1,
+                    vals2,
+                    result1,
+                    result2,
+                    result;
+
+        vector32_load(&vals1, &base[i]);
+        vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+        result1 = vector32_eq(keys, vals1);
+        result2 = vector32_eq(keys, vals2);
+
+        result = vector32_or(result1, result2);
+
+        if (vector32_is_highbit_set(result))
+        {
+            Assert(assert_result == true);
+            return true;
+        }
+    }
+
     if (i == nelem)
         return false;
     else if (tail_idx > 0)
-- 
2.25.1
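Since the control flow is spread across 0001 and 0003, here is the combined
shape of pg_lfind32() with both applied, rewritten as a self-contained
function on raw SSE2 intrinsics instead of the simd.h helpers.  This is
only an illustration of the search order, not the patch itself; the
assert-checking scaffolding is omitted, and the names lfind32_sketch and
cmp_vector are made up for the example.

#include <emmintrin.h>          /* SSE2 intrinsics */
#include <stdbool.h>
#include <stdint.h>

/* compare one register's worth (4 uint32s) against the key */
static inline __m128i
cmp_vector(__m128i keys, const uint32_t *p)
{
    return _mm_cmpeq_epi32(keys, _mm_loadu_si128((const __m128i *) p));
}

static bool
lfind32_sketch(uint32_t key, const uint32_t *base, uint32_t nelem)
{
    const __m128i keys = _mm_set1_epi32((int32_t) key);
    const uint32_t npv = sizeof(__m128i) / sizeof(uint32_t);   /* 4 */
    uint32_t    block = 4 * npv;
    uint32_t    tail_idx = nelem & ~(block - 1);
    uint32_t    i = 0;

    /* pass 1: blocks of four registers, as before */
    for (; i < tail_idx; i += block)
    {
        __m128i     r01 = _mm_or_si128(cmp_vector(keys, &base[i]),
                                       cmp_vector(keys, &base[i + npv]));
        __m128i     r23 = _mm_or_si128(cmp_vector(keys, &base[i + 2 * npv]),
                                       cmp_vector(keys, &base[i + 3 * npv]));

        if (_mm_movemask_epi8(_mm_or_si128(r01, r23)) != 0)
            return true;
    }

    /* pass 2: blocks of two registers for the tail (0003) */
    block = 2 * npv;
    tail_idx = nelem & ~(block - 1);
retry:
    for (; i < tail_idx; i += block)
    {
        __m128i     r01 = _mm_or_si128(cmp_vector(keys, &base[i]),
                                       cmp_vector(keys, &base[i + npv]));

        if (_mm_movemask_epi8(r01) != 0)
            return true;
    }

    if (i == nelem)
        return false;
    if (tail_idx > 0)
    {
        /* rewind and re-scan one block flush with the end (0001's trick) */
        i = nelem - block;
        tail_idx = nelem;
        goto retry;
    }

    /* arrays shorter than one two-register block: scalar fallback */
    for (; i < nelem; i++)
    {
        if (base[i] == key)
            return true;
    }
    return false;
}

With 0002's AVX2 support, the same flow applies with 8-element registers,
so only arrays of fewer than 16 elements reach the scalar loop.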