Hi hackers,

This is a follow-up to the recent changes that optimized [sub]xip lookups in XidInMVCCSnapshot() on Intel hardware [0] [1]. I've attached a patch that uses ARM Advanced SIMD (Neon) intrinsic functions, where available, to speed up the search. The approach is nearly identical to the SSE2 version, and the usual benchmark [2] shows similar improvements.
    writers  head  simd
          8   866   836
         16   849   833
         32   782   822
         64   846   833
        128   805   821
        256   722   739
        512   529   674
        768   374   608
       1024   268   522

I've tested the patch on a recent macOS (M1 Pro) and Amazon Linux (Graviton2), and I've confirmed that the instructions aren't used on a Linux/Intel machine. I did add a new configure check to see if the relevant intrinsics are available, but I didn't add a runtime check like there is for the CRC instructions since the compilers I used support these intrinsics by default. (I don't think a runtime check would work very well with the inline function, anyway.) AFAICT these intrinsics are pretty standard on aarch64, although IIUC the spec indicates that they are technically optional. I suspect that a simple check for "aarch64" would be sufficient, but I haven't investigated the level of compiler support yet.

Thoughts?

[0] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b6ef167
[1] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=37a6e5d
[2] https://postgr.es/m/057a9a95-19d2-05f0-17e2-f46ff20e9...@2ndquadrant.com

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 1295f4d6eedabec1d850893d3bc86180bd33c932 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Fri, 19 Aug 2022 10:41:07 -0700 Subject: [PATCH v1 1/1] Use ARM Advanced SIMD intrinsic functions in pg_lfind32(). Use ARM Advanced SIMD intrinsic functions to speed up the search, where available. Otherwise, use a simple 'for' loop as before. As with b6ef167, this speeds up XidInMVCCSnapshot(), but any uses of pg_lfind32() will also benefit. Author: Nathan Bossart --- config/c-compiler.m4 | 25 +++++++++++++++++++++++ configure | 40 +++++++++++++++++++++++++++++++++++++ configure.ac | 2 ++ src/include/pg_config.h.in | 3 +++ src/include/port/pg_lfind.h | 35 ++++++++++++++++++++++++++++++++ src/include/port/simd.h | 4 ++++ 6 files changed, 109 insertions(+) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 69efc5bb10..e8931d7059 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -650,3 +650,28 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS + +# PGAC_ARM_ADVANCED_SIMD_INTRINSICS +# --------------------------------- +# Check if the compiler supports the vdupq_n_u32, vld1q_u32, vceqq_u32, +# vorrq_u32, and vmaxvq_u32 intrinsic functions. These instructions were first +# introduced in ARMv7. 
+AC_DEFUN([PGAC_ARM_ADVANCED_SIMD_INTRINSICS], +[AC_CACHE_CHECK([for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32], +pgac_cv_arm_advanced_simd_intrinsics, +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>], + [unsigned int val[[]] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint32x4_t keys = vdupq_n_u32(7); + uint32x4_t vals1 = vld1q_u32(val); + uint32x4_t vals2 = vld1q_u32(&val[[4]]); + uint32x4_t tmp1 = vceqq_u32(keys, vals1); + uint32x4_t tmp2 = vceqq_u32(keys, vals2); + uint32x4_t result = vorrq_u32(tmp1, tmp2); + /* return computed value to prevent the above from being optimized away */ + return vmaxvq_u32(result) != 0;])], +[pgac_cv_arm_advanced_simd_intrinsics=yes], +[pgac_cv_arm_advanced_simd_intrinsics=no])]) +if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then +AC_DEFINE(USE_ARM_ADVANCED_SIMD_INTRINSICS, 1, + [Define to 1 to use ARM Advanced SIMD (Neon) intrinsics.]) +fi])# PGAC_ARM_ADVANCED_SIMD_INTRINSICS diff --git a/configure b/configure index b28fccbc47..0924e5ae8f 100755 --- a/configure +++ b/configure @@ -18230,6 +18230,46 @@ $as_echo "slicing-by-8" >&6; } fi +# Check for ARM Advanced SIMD intrinsics. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32" >&5 +$as_echo_n "checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32... " >&6; } +if ${pgac_cv_arm_advanced_simd_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <arm_neon.h> +int +main () +{ +unsigned int val[] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint32x4_t keys = vdupq_n_u32(7); + uint32x4_t vals1 = vld1q_u32(val); + uint32x4_t vals2 = vld1q_u32(&val[4]); + uint32x4_t tmp1 = vceqq_u32(keys, vals1); + uint32x4_t tmp2 = vceqq_u32(keys, vals2); + uint32x4_t result = vorrq_u32(tmp1, tmp2); + /* return computed value to prevent the above from being optimized away */ + return vmaxvq_u32(result) != 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_advanced_simd_intrinsics=yes +else + pgac_cv_arm_advanced_simd_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_advanced_simd_intrinsics" >&5 +$as_echo "$pgac_cv_arm_advanced_simd_intrinsics" >&6; } +if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then + +$as_echo "#define USE_ARM_ADVANCED_SIMD_INTRINSICS 1" >>confdefs.h + +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/configure.ac b/configure.ac index dd368290a6..62d8b4abda 100644 --- a/configure.ac +++ b/configure.ac @@ -2172,6 +2172,8 @@ else fi AC_SUBST(PG_CRC32C_OBJS) +# Check for ARM Advanced SIMD intrinsics. +PGAC_ARM_ADVANCED_SIMD_INTRINSICS # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 26eb6a2dfe..8d04f31fff 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -702,6 +702,9 @@ /* Define to 1 if strerror_r() returns int. */ #undef STRERROR_R_INT +/* Define to 1 to use ARM Advanced SIMD (Neon) intrinsics. */ +#undef USE_ARM_ADVANCED_SIMD_INTRINSICS + /* Define to 1 to use ARMv8 CRC Extension. 
*/ #undef USE_ARMV8_CRC32C diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h index fb125977b2..51315a4fb3 100644 --- a/src/include/port/pg_lfind.h +++ b/src/include/port/pg_lfind.h @@ -82,6 +82,41 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem) } #endif /* USE_SSE2 */ +#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS + + /* + * A 16-byte register only has four 4-byte lanes. For better + * instruction-level parallelism, each loop iteration operates on a block + * of four registers. + */ + const uint32x4_t keys = vdupq_n_u32(key); /* load 4 copies of key */ + uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */ + + for (i = 0; i < iterations; i += 16) + { + /* load the next block into 4 registers holding 4 values each */ + const uint32x4_t vals1 = vld1q_u32((const uint32 *) & base[i]); + const uint32x4_t vals2 = vld1q_u32((const uint32 *) & base[i + 4]); + const uint32x4_t vals3 = vld1q_u32((const uint32 *) & base[i + 8]); + const uint32x4_t vals4 = vld1q_u32((const uint32 *) & base[i + 12]); + + /* compare each value to the key */ + const uint32x4_t result1 = vceqq_u32(keys, vals1); + const uint32x4_t result2 = vceqq_u32(keys, vals2); + const uint32x4_t result3 = vceqq_u32(keys, vals3); + const uint32x4_t result4 = vceqq_u32(keys, vals4); + + /* combine the results into a single variable */ + const uint32x4_t tmp1 = vorrq_u32(result1, result2); + const uint32x4_t tmp2 = vorrq_u32(result3, result4); + const uint32x4_t result = vorrq_u32(tmp1, tmp2); + + /* see if there was a match */ + if (vmaxvq_u32(result) != 0) + return true; + } +#endif /* USE_ARM_ADVANCED_SIMD_INTRINSICS */ + /* Process the remaining elements one at a time. 
*/ for (; i < nelem; i++) { diff --git a/src/include/port/simd.h b/src/include/port/simd.h index a571e79f57..5eee1c944f 100644 --- a/src/include/port/simd.h +++ b/src/include/port/simd.h @@ -27,4 +27,8 @@ #define USE_SSE2 #endif +#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS +#include <arm_neon.h> +#endif + #endif /* SIMD_H */ -- 2.25.1