use ARM intrinsics in pg_lfind32() where available

Nathan Bossart Fri, 19 Aug 2022 13:08:51 -0700

Hi hackers,

This is a follow-up for recent changes that optimized [sub]xip lookups in
XidInMVCCSnapshot() on Intel hardware [0] [1].  I've attached a patch that
uses ARM Advanced SIMD (Neon) intrinsic functions where available to speed
up the search.  The approach is nearly identical to the SSE2 version, and
the usual benchmark [2] shows similar improvements.


  writers  head  simd
  8        866   836
  16       849   833
  32       782   822
  64       846   833
  128      805   821
  256      722   739
  512      529   674
  768      374   608
  1024     268   522

I've tested the patch on a recent macOS (M1 Pro) and Amazon Linux
(Graviton2), and I've confirmed that the instructions aren't used on a
Linux/Intel machine.  I did add a new configure check to see if the
relevant intrinsics are available, but I didn't add a runtime check like
there is for the CRC instructions since the compilers I used support these
intrinsics by default.  (I don't think a runtime check would work very well
with the inline function, anyway.)  AFAICT these intrinsics are pretty
standard on aarch64, although IIUC the spec indicates that they are
technically optional.  I suspect that a simple check for "aarch64" would be
sufficient, but I haven't investigated the level of compiler support yet.

Thoughts?

[0] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=b6ef167
[1] https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=37a6e5d
[2] https://postgr.es/m/057a9a95-19d2-05f0-17e2-f46ff20e9...@2ndquadrant.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

From 1295f4d6eedabec1d850893d3bc86180bd33c932 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Fri, 19 Aug 2022 10:41:07 -0700
Subject: [PATCH v1 1/1] Use ARM Advanced SIMD intrinsic functions in
 pg_lfind32().

Use ARM Advanced SIMD intrinsic functions to speed up the search,
where available.  Otherwise, use a simple 'for' loop as before.  As
with b6ef167, this speeds up XidInMVCCSnapshot(), but any uses of
pg_lfind32() will also benefit.

Author: Nathan Bossart
---
 config/c-compiler.m4        | 25 +++++++++++++++++++++++
 configure                   | 40 +++++++++++++++++++++++++++++++++++++
 configure.ac                |  2 ++
 src/include/pg_config.h.in  |  3 +++
 src/include/port/pg_lfind.h | 35 ++++++++++++++++++++++++++++++++
 src/include/port/simd.h     |  4 ++++
 6 files changed, 109 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 69efc5bb10..e8931d7059 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -650,3 +650,28 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_ARM_ADVANCED_SIMD_INTRINSICS
+# ---------------------------------
+# Check if the compiler supports the vdupq_n_u32, vld1q_u32, vceqq_u32,
+# vorrq_u32, and vmaxvq_u32 intrinsic functions.  Advanced SIMD (Neon) was
+# introduced in ARMv7, but vmaxvq_u32 is only available on AArch64.
+AC_DEFUN([PGAC_ARM_ADVANCED_SIMD_INTRINSICS],
+[AC_CACHE_CHECK([for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32],
+pgac_cv_arm_advanced_simd_intrinsics,
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>],
+  [unsigned int val[[]] = {1, 2, 3, 4, 5, 6, 7, 8};
+   uint32x4_t keys = vdupq_n_u32(7);
+   uint32x4_t vals1 = vld1q_u32(val);
+   uint32x4_t vals2 = vld1q_u32(&val[[4]]);
+   uint32x4_t tmp1 = vceqq_u32(keys, vals1);
+   uint32x4_t tmp2 = vceqq_u32(keys, vals2);
+   uint32x4_t result = vorrq_u32(tmp1, tmp2);
+   /* return computed value to prevent the above from being optimized away */
+   return vmaxvq_u32(result) != 0;])],
+[pgac_cv_arm_advanced_simd_intrinsics=yes],
+[pgac_cv_arm_advanced_simd_intrinsics=no])])
+if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then
+AC_DEFINE(USE_ARM_ADVANCED_SIMD_INTRINSICS, 1,
+          [Define to 1 to use ARM Advanced SIMD (Neon) intrinsics.])
+fi])# PGAC_ARM_ADVANCED_SIMD_INTRINSICS
diff --git a/configure b/configure
index b28fccbc47..0924e5ae8f 100755
--- a/configure
+++ b/configure
@@ -18230,6 +18230,46 @@ $as_echo "slicing-by-8" >&6; }
 fi
 
 
+# Check for ARM Advanced SIMD intrinsics.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32" >&5
+$as_echo_n "checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32... " >&6; }
+if ${pgac_cv_arm_advanced_simd_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+int
+main ()
+{
+unsigned int val[] = {1, 2, 3, 4, 5, 6, 7, 8};
+   uint32x4_t keys = vdupq_n_u32(7);
+   uint32x4_t vals1 = vld1q_u32(val);
+   uint32x4_t vals2 = vld1q_u32(&val[4]);
+   uint32x4_t tmp1 = vceqq_u32(keys, vals1);
+   uint32x4_t tmp2 = vceqq_u32(keys, vals2);
+   uint32x4_t result = vorrq_u32(tmp1, tmp2);
+   /* return computed value to prevent the above from being optimized away */
+   return vmaxvq_u32(result) != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_advanced_simd_intrinsics=yes
+else
+  pgac_cv_arm_advanced_simd_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_advanced_simd_intrinsics" >&5
+$as_echo "$pgac_cv_arm_advanced_simd_intrinsics" >&6; }
+if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then
+
+$as_echo "#define USE_ARM_ADVANCED_SIMD_INTRINSICS 1" >>confdefs.h
+
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index dd368290a6..62d8b4abda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2172,6 +2172,8 @@ else
 fi
 AC_SUBST(PG_CRC32C_OBJS)
 
+# Check for ARM Advanced SIMD intrinsics.
+PGAC_ARM_ADVANCED_SIMD_INTRINSICS
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 26eb6a2dfe..8d04f31fff 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -702,6 +702,9 @@
 /* Define to 1 if strerror_r() returns int. */
 #undef STRERROR_R_INT
 
+/* Define to 1 to use ARM Advanced SIMD (Neon) intrinsics. */
+#undef USE_ARM_ADVANCED_SIMD_INTRINSICS
+
 /* Define to 1 to use ARMv8 CRC Extension. */
 #undef USE_ARMV8_CRC32C
 
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index fb125977b2..51315a4fb3 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -82,6 +82,41 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif							/* USE_SSE2 */
 
+#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS
+
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers.
+	 */
+	const		uint32x4_t keys = vdupq_n_u32(key); /* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;  /* round down to multiple of 16 */
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const		uint32x4_t vals1 = vld1q_u32((const uint32 *) & base[i]);
+		const		uint32x4_t vals2 = vld1q_u32((const uint32 *) & base[i + 4]);
+		const		uint32x4_t vals3 = vld1q_u32((const uint32 *) & base[i + 8]);
+		const		uint32x4_t vals4 = vld1q_u32((const uint32 *) & base[i + 12]);
+
+		/* compare each value to the key */
+		const		uint32x4_t result1 = vceqq_u32(keys, vals1);
+		const		uint32x4_t result2 = vceqq_u32(keys, vals2);
+		const		uint32x4_t result3 = vceqq_u32(keys, vals3);
+		const		uint32x4_t result4 = vceqq_u32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const		uint32x4_t tmp1 = vorrq_u32(result1, result2);
+		const		uint32x4_t tmp2 = vorrq_u32(result3, result4);
+		const		uint32x4_t result = vorrq_u32(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (vmaxvq_u32(result) != 0)
+			return true;
+	}
+#endif							/* USE_ARM_ADVANCED_SIMD_INTRINSICS */
+
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index a571e79f57..5eee1c944f 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -27,4 +27,8 @@
 #define USE_SSE2
 #endif
 
+#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS
+#include <arm_neon.h>
+#endif
+
 #endif							/* SIMD_H */
-- 
2.25.1

use ARM intrinsics in pg_lfind32() where available

Reply via email to