Here is a v14 of the patch that I think is beginning to approach something committable. Besides general review and testing, there are two things that I'd like to bring up:
* The latest patch set from Paul Amonson appeared to support MSVC in the meson build, but not the autoconf one. I don't have much expertise here, so the v14 patch doesn't have any autoconf/meson support for MSVC, which I thought might be okay for now. IIUC we assume that 64-bit/MSVC builds can always compile the x86_64 popcount code, but I don't know whether that's safe for AVX512. * I think we need to verify there isn't a huge performance regression for smaller arrays. IIUC those will still require an AVX512 instruction or two as well as a function call, which might add some noticeable overhead. -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 9b5725e36aa8cff7caeb8683e11cd09bd5bda745 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 27 Mar 2024 16:39:24 -0500 Subject: [PATCH v14 1/1] AVX512 popcount support --- config/c-compiler.m4 | 34 +++++++ configure | 165 +++++++++++++++++++++++++++++++++ configure.ac | 34 +++++++ meson.build | 59 ++++++++++++ src/Makefile.global.in | 1 + src/include/pg_config.h.in | 9 ++ src/include/port/pg_bitutils.h | 20 ++++ src/makefiles/meson.build | 1 + src/port/Makefile | 6 ++ src/port/meson.build | 6 +- src/port/pg_bitutils.c | 56 ++++------- src/port/pg_popcount_avx512.c | 98 ++++++++++++++++++++ 12 files changed, 451 insertions(+), 38 deletions(-) create mode 100644 src/port/pg_popcount_avx512.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 3268a780bb..f881e7ec28 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,37 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# ----------------------------- +# Check if the compiler supports the AVX512 POPCNT instructions using the +# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64, +# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions. +# +# An optional compiler flag can be passed as argument +# (e.g., -mavx512vpopcntdq). If the intrinsics are supported, sets +# pgac_avx512_popcnt_intrinsics and CFLAGS_AVX512_POPCNT. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>], + [const char buf@<:@sizeof(__m512i)@:>@; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_AVX512_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS diff --git a/configure b/configure index 36feeafbb2..189264b86e 100755 --- a/configure +++ b/configure @@ -647,6 +647,7 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +CFLAGS_AVX512_POPCNT LIBOBJS OPENSSL ZSTD @@ -17404,6 +17405,41 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h fi +# Check for x86 cpuid_count instruction +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <cpuid.h> +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__get_cpuid_count="yes" +else + pgac_cv__get_cpuid_count="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 $as_echo_n "checking for __cpuid... " >&6; } if ${pgac_cv__cpuid+:} false; then : @@ -17438,6 +17474,135 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +$as_echo_n "checking for __cpuidex... " >&6; } +if ${pgac_cv__cpuidex+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <intrin.h> +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__cpuidex="yes" +else + pgac_cv__cpuidex="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 +$as_echo "$pgac_cv__cpuidex" >&6; } +if test x"$pgac_cv__cpuidex" = x"yes"; then + +$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + +fi + +# Check for AVX512 popcount intrinsics +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +const char buf[sizeof(__m512i)]; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_AVX512_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +const char buf[sizeof(__m512i)]; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then + CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq" + pgac_avx512_popcnt_intrinsics=yes +fi + +fi +if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then + +$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 57f734879e..ced39c9055 100644 --- a/configure.ac +++ b/configure.ac @@ -2052,6 +2052,18 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) fi +# Check for x86 cpuid_count instruction +AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + ]])], + [pgac_cv__get_cpuid_count="yes"], + [pgac_cv__get_cpuid_count="no"])]) +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) +fi + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2063,6 +2075,28 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +fi + +# Check for AVX512 popcount intrinsics +# +PGAC_AVX512_POPCNT_INTRINSICS([]) +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq]) +fi +if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.]) +fi +AC_SUBST(CFLAGS_AVX512_POPCNT) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/meson.build b/meson.build index 18b5be842e..2399b90d6e 100644 --- a/meson.build +++ b/meson.build @@ -1783,6 +1783,30 @@ elif cc.links(''' endif +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +if cc.links(''' + #include <cpuid.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include <intrin.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -1996,6 +2020,41 @@ int main(void) endif +############################################################### +# Check for the availability of AVX512 popcount intrinsics. +############################################################### + +cflags_avx512_popcnt = [] +if host_cpu == 'x86' or host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> + +int main(void) +{ + const char buf[sizeof(__m512i)]; + INT64 popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +''' + + if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))]) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq']) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cflags_avx512_popcnt += '-mavx512vpopcntdq' + endif + +endif + ############################################################### # Select CRC-32C implementation. diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..a6c0c4a692 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -262,6 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ +CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@ CFLAGS_CRC = @CFLAGS_CRC@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 591e1ca3df..133d8ba071 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -555,9 +555,15 @@ /* Define to 1 if you have __cpuid. */ #undef HAVE__CPUID +/* Define to 1 if you have __cpuidex. */ +#undef HAVE__CPUIDEX + /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + /* Define to 1 if your compiler understands _Static_assert. */ #undef HAVE__STATIC_ASSERT @@ -680,6 +686,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX512 popcount instructions with a runtime check. */ +#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 53e5239717..c69a85e08e 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,12 +298,32 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * We can also try to use the AVX512 popcount instruction on some systems. + * The implementation of that is located in its own file because it may + * require special compiler flags that we don't want to apply to any other + * files. + * + * NB: We assume that there's no hope of AVX512 popcount support if the "fast" + * implementations aren't available. + */ +#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK) +#if defined(HAVE__GET_CPUID_COUNT) || defined(HAVE__CPUIDEX) +#define TRY_POPCNT_AVX512 1 +extern bool pg_popcount_avx512_available(void); +extern uint64 pg_popcount_avx512(const char *buf, int bytes); +#endif +#endif + #ifdef TRY_POPCNT_FAST /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes); +/* Export pg_popcount_fast() for use in the AVX512 implementation. */ +extern uint64 pg_popcount_fast(const char *buf, int bytes); + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d..c2345cc95f 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -99,6 +99,7 @@ pgxs_kv = { 'PERMIT_DECLARATION_AFTER_STATEMENT': ' '.join(cflags_no_decl_after_statement), + 'CFLAGS_AVX512_POPCNT': ' '.join(cflags_avx512_popcnt), 'CFLAGS_CRC': ' '.join(cflags_crc), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..fd2c59aec6 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ @@ -92,6 +93,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_popcount_avx512.o need CFLAGS_AVX512_POPCNT +pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef..4e69fe8e91 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', @@ -84,6 +85,7 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'avx512_popcnt'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -98,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'avx512_popcnt': cflags_avx512_popcnt} +pgport_sources_cflags = {'crc': [], 'avx512_popcnt': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 1197696e97..61cd049553 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -114,7 +114,6 @@ static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static inline int pg_popcount32_fast(uint32 word); static inline int pg_popcount64_fast(uint64 word); -static uint64 pg_popcount_fast(const char *buf, int bytes); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; @@ -142,20 +141,18 @@ pg_popcount_available(void) return (exx[2] & (1 << 23)) != 0; /* POPCNT */ } -/* - * These functions get called on the first call to pg_popcount32 etc. - * They detect whether we can use the asm implementations, and replace - * the function pointers so that subsequent calls are routed directly to - * the chosen implementation. - */ -static int -pg_popcount32_choose(uint32 word) +static inline void +choose_popcount_functions(void) { if (pg_popcount_available()) { pg_popcount32 = pg_popcount32_fast; pg_popcount64 = pg_popcount64_fast; pg_popcount = pg_popcount_fast; +#ifdef TRY_POPCNT_AVX512 + if (pg_popcount_avx512_available()) + pg_popcount = pg_popcount_avx512; +#endif } else { @@ -163,45 +160,32 @@ pg_popcount32_choose(uint32 word) pg_popcount64 = pg_popcount64_slow; pg_popcount = pg_popcount_slow; } +} +/* + * These functions get called on the first call to pg_popcount32 etc. + * They detect whether we can use the asm implementations, and replace + * the function pointers so that subsequent calls are routed directly to + * the chosen implementation. + */ +static int +pg_popcount32_choose(uint32 word) +{ + choose_popcount_functions(); return pg_popcount32(word); } static int pg_popcount64_choose(uint64 word) { - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - pg_popcount = pg_popcount_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - pg_popcount = pg_popcount_slow; - } - + choose_popcount_functions(); return pg_popcount64(word); } static uint64 pg_popcount_choose(const char *buf, int bytes) { - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - pg_popcount = pg_popcount_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - pg_popcount = pg_popcount_slow; - } - + choose_popcount_functions(); return pg_popcount(buf, bytes); } @@ -243,7 +227,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount_fast * Returns the number of 1-bits in buf */ -static uint64 +uint64 pg_popcount_fast(const char *buf, int bytes) { uint64 popcnt = 0; diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c new file mode 100644 index 0000000000..66ca92c029 --- /dev/null +++ b/src/port/pg_popcount_avx512.c @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512.c + * Holds the pg_popcount() implementation that uses AVX512 instructions. + * + * Copyright (c) 2019-2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#ifdef HAVE__GET_CPUID_COUNT +#include <cpuid.h> +#endif + +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +#include <immintrin.h> +#endif + +#ifdef HAVE__CPUIDEX +#include <intrin.h> +#endif + +#include "port/pg_bitutils.h" + +/* + * XXX: Someday we should figure out how to determine whether this file needs + * to compiled at configure-time instead of relying on macros that are + * determined at compile-time. + */ +#ifdef TRY_POPCNT_AVX512 + +/* + * Return true if CPUID indicates that the AVX512 POPCNT instruction is + * available. + */ +bool +pg_popcount_avx512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + + if ((exx[1] & (1 << 16)) != 0 && /* avx512f */ + (exx[2] & (1 << 14)) != 0) /* avx512vpopcntdq */ + { + /* + * We also need to check that the OS has enabled support for the ZMM + * registers. + */ +#ifdef _MSC_VER + return (_xgetbv(0) & 0xe0) != 0; +#else + uint64 xcr = 0; + uint32 high; + uint32 low; + +__asm__ __volatile__(" xgetbv\n":"=a"(low), "=d"(high):"c"(xcr)); + return (low & 0xe0) != 0; +#endif + } + + return false; +} + +/* + * pg_popcount_avx512 + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_avx512(const char *buf, int bytes) +{ + uint64 popcnt; + __m512i accum = _mm512_setzero_si512(); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + + accum = _mm512_add_epi64(accum, cnt); + buf += sizeof(__m512i); + } + + popcnt = _mm512_reduce_add_epi64(accum); + return popcnt + pg_popcount_fast(buf, bytes); +} + +#endif /* TRY_POPCNT_AVX512 */ -- 2.25.1