Hello,

This looks quite reasonable.  On my machine, I get the compiler test to
pass so I get a "yes" in configure; but of course my CPU doesn't support
the instructions so I get the slow variant.  So here's the patch again
with some minor artifacts fixed.

I have the following review notes:

1. we use __get_cpuid_count and __cpuidex by relying on macros
HAVE__GET_CPUID and HAVE__CPUID respectively; but those macros are (in
the current Postgres source) only used and tested for __get_cpuid and
__cpuid respectively.  So unless there's some reason to be certain that
__get_cpuid_count is always present when __get_cpuid is present, and
that __cpuidex is present when __cpuid is present, I think we need to
add new configure tests and new HAVE_ macros for these.

2. we rely on <immintrin.h> being present with no AC_CHECK_HEADER()
test.  We currently don't use this header anywhere, so I suppose we need
a test for this one as well.  (Also, I suppose if we don't have
immintrin.h we can skip the rest of it?)

3. We do the __get_cpuid_count/__cpuidex test and we also do a xgetbv
test.  The comment there claims that this is to check the results for
consistency.  But ... how would we know that the results are ever
inconsistent?  As far as I understand, if they were, we would silently
become slower.  Is this really what we want?  I'm confused about this
coding.  Maybe we do need both tests to succeed?  In that case, just
reword the comment.

I think if both tests are each considered reliable on its own, then we
could either choose one of them and stick with it, ignoring the other;
or we could use one as primary and then in a USE_ASSERT_CHECKING block
verify that the other matches and throw a WARNING if not (but what would
that tell us?).  Or something like that ... not sure.

4. It needs meson support, which I suppose consists of copying the
c-compiler.m4 test into meson.build, mimicking what the tests for CRC
instructions do.


I started a CI run with this patch applied,
https://cirrus-ci.com/build/4912499619790848
but because Meson support is missing, the compile failed
immediately:

[10:08:48.825] ccache cc -Isrc/port/libpgport_srv.a.p -Isrc/include 
-I../src/include -Isrc/include/utils -fdiagnostics-color=always -pipe 
-D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -g -fno-strict-aliasing -fwrapv 
-fexcess-precision=standard -D_GNU_SOURCE -Wmissing-prototypes -Wpointer-arith 
-Werror=vla -Wendif-labels -Wmissing-format-attribute -Wimplicit-fallthrough=3 
-Wcast-function-type -Wshadow=compatible-local -Wformat-security 
-Wdeclaration-after-statement -Wno-format-truncation -Wno-stringop-truncation 
-fPIC -pthread -DBUILDING_DLL -MD -MQ 
src/port/libpgport_srv.a.p/pg_bitutils.c.o -MF 
src/port/libpgport_srv.a.p/pg_bitutils.c.o.d -o 
src/port/libpgport_srv.a.p/pg_bitutils.c.o -c ../src/port/pg_bitutils.c
[10:08:48.825] ../src/port/pg_bitutils.c: In function ‘pg_popcount512_fast’:
[10:08:48.825] ../src/port/pg_bitutils.c:270:11: warning: AVX512F vector return 
without AVX512F enabled changes the ABI [-Wpsabi]
[10:08:48.825]   270 |  __m512i  accumulator = _mm512_setzero_si512();
[10:08:48.825]       |           ^~~~~~~~~~~
[10:08:48.825] In file included from 
/usr/lib/gcc/x86_64-linux-gnu/10/include/immintrin.h:55,
[10:08:48.825]                  from ../src/port/pg_bitutils.c:22:
[10:08:48.825] /usr/lib/gcc/x86_64-linux-gnu/10/include/avx512fintrin.h:339:1: 
error: inlining failed in call to ‘always_inline’ ‘_mm512_setzero_si512’: 
target specific option mismatch
[10:08:48.825]   339 | _mm512_setzero_si512 (void)
[10:08:48.825]       | ^~~~~~~~~~~~~~~~~~~~
[10:08:48.825] ../src/port/pg_bitutils.c:270:25: note: called from here
[10:08:48.825]   270 |  __m512i  accumulator = _mm512_setzero_si512();
[10:08:48.825]       |                         ^~~~~~~~~~~~~~~~~~~~~~


Thanks

-- 
Álvaro Herrera        Breisgau, Deutschland  —  https://www.EnterpriseDB.com/
"Siempre hay que alimentar a los dioses, aunque la tierra esté seca" (Orual)
>From 188cd244ffcff20e1cf0bc655106d4db1a51b55b Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 6 Feb 2024 19:36:59 +0100
Subject: [PATCH v3] Add support for AVX512-implemented POPCNT

Author: Paul D Amonson <paul.d.amon...@intel.com>
Discussion: https://postgr.es/m/bl1pr11mb5304097df7ea81d04c33f3d1dc...@bl1pr11mb5304.namprd11.prod.outlook.com
---
 config/c-compiler.m4   |  33 +++++++++++
 configure              |  91 ++++++++++++++++++++++++++++++
 configure.ac           |   8 +++
 src/Makefile.global.in |   1 +
 src/port/Makefile      |   5 ++
 src/port/pg_bitutils.c | 122 ++++++++++++++++++++++++++++++++++++-----
 6 files changed, 247 insertions(+), 13 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5db02b2ab7..a5a3246199 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,36 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using
+# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ.
+#
+# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq).
+# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and
+# CFLAGS_AVX512_POPCNT are set.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+#include <stdint.h>],
+  [__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_AVX512_POPCNT="$1"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 2a1ee251f2..47d91384b9 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,7 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+CFLAGS_AVX512_POPCNT
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17736,6 +17737,96 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+# Check for Intel AVX512 intrinsics to do POPCNT calculations.
+#
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+#include <stdint.h>
+int
+main ()
+{
+__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_AVX512_POPCNT=""
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512f"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+#include <stdint.h>
+int
+main ()
+{
+__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" = x"yes"; then
+  CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq -mavx512f"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+fi
+
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 52fd7af446..d5fe701c9c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2078,6 +2078,14 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
   AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
 fi
 
+# Check for Intel AVX512 intrinsics to do POPCNT calculations.
+#
+PGAC_AVX512_POPCNT_INTRINSICS([])
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512f])
+fi
+AC_SUBST(CFLAGS_AVX512_POPCNT)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..089f49b7f3 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
 CFLAGS_CRC = @CFLAGS_CRC@
+CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
 
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..6a01a7d89a 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -87,6 +87,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024)
+pg_bitutils.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+pg_bitutils_srv.o:CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 640a89561a..f8f029190f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -19,6 +19,8 @@
 #include <intrin.h>
 #endif
 
+#include <immintrin.h>
+
 #include "port/pg_bitutils.h"
 
 
@@ -110,11 +112,15 @@ static int	pg_popcount64_slow(uint64 word);
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
+static uint64 pg_popcount512_choose(const char *buf, int bytes);
 static int	pg_popcount32_fast(uint32 word);
 static int	pg_popcount64_fast(uint64 word);
+static uint64 pg_popcount512_fast(const char *buf, int bytes);
+static uint64 pg_popcount512_slow(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
+uint64		(*pg_popcount512) (const char *buf, int bytes) = pg_popcount512_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -138,6 +144,40 @@ pg_popcount_available(void)
 	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
 }
 
+/*
+ * Return true if CPUID indicates that the AVX512_POPCNT instruction is
+ * available. This is similar to the method above; see
+ * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features
+ *
+ * Finally, we make sure the xgetbv result is consistent with the CPUID
+ * results.
+ */
+static bool
+pg_popcount512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* Check for AVX512VPOPCNTDQ and AVX512F */
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+
+	if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0)
+	{
+		uint64		xcr = 0;
+		uint32		high;
+		uint32		low;
+
+__asm__ __volatile__("xgetbv\t\n":"=a"(low), "=d"(high):"c"(xcr));
+		return (low & 0xE0) != 0;
+	}							/* POPCNT 512 */
+	return false;
+}
+
 /*
  * These functions get called on the first call to pg_popcount32 etc.
  * They detect whether we can use the asm implementations, and replace
@@ -178,6 +218,17 @@ pg_popcount64_choose(uint64 word)
 	return pg_popcount64(word);
 }
 
+static uint64
+pg_popcount512_choose(const char *buf, int bytes)
+{
+	if (pg_popcount512_available())
+		pg_popcount512 = pg_popcount512_fast;
+	else
+		pg_popcount512 = pg_popcount512_slow;
+
+	return pg_popcount512(buf, bytes);
+}
+
 /*
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
@@ -212,6 +263,32 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif
 }
 
+static uint64
+pg_popcount512_fast(const char *buf, int bytes)
+{
+	uint64		popcnt = 0;
+	__m512i		accumulator = _mm512_setzero_si512();
+
+	while (bytes >= 64)
+	{
+		const		__m512i v = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i p = _mm512_popcnt_epi64(v);
+
+		accumulator = _mm512_add_epi64(accumulator, p);
+		bytes -= 64;
+		buf += 64;
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accumulator);
+	bytes = bytes % 64;
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+	return popcnt;
+}
+
 #endif							/* TRY_POPCNT_FAST */
 
 
@@ -265,6 +342,31 @@ pg_popcount64_slow(uint64 word)
 #endif							/* HAVE__BUILTIN_POPCOUNT */
 }
 
+static uint64
+pg_popcount512_slow(const char *buf, int bytes)
+{
+	uint64		popcnt = 0;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64(*words++);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+	return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -286,6 +388,12 @@ pg_popcount64(uint64 word)
 	return pg_popcount64_slow(word);
 }
 
+uint64
+pg_popcount512(const char *buf, int bytes)
+{
+	return pg_popcount512_slow(buf, bytes);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
 
 /*
@@ -298,19 +406,7 @@ pg_popcount(const char *buf, int bytes)
 	uint64		popcnt = 0;
 
 #if SIZEOF_VOID_P >= 8
-	/* Process in 64-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(8, buf))
-	{
-		const uint64 *words = (const uint64 *) buf;
-
-		while (bytes >= 8)
-		{
-			popcnt += pg_popcount64(*words++);
-			bytes -= 8;
-		}
-
-		buf = (const char *) words;
-	}
+	return pg_popcount512(buf, bytes);
 #else
 	/* Process in 32-bit chunks if the buffer is aligned. */
 	if (buf == (const char *) TYPEALIGN(4, buf))
-- 
2.39.2

Reply via email to