On Wed, Oct 30, 2024 at 04:10:10PM -0500, Nathan Bossart wrote:
> On Wed, Oct 30, 2024 at 08:53:10PM +0000, Raghuveer Devulapalli wrote:
>> BTW, I just realized function attributes for xsave and avx512 don't work
>> on MSVC (see
>> https://developercommunity.visualstudio.com/t/support-function-target-attribute-and-mutiversioning/10130630).
>> Not sure if you care about it. Its an easy fix (see
>> https://gcc.godbolt.org/z/Pebdj3vMx).
> 
> Oh, good catch.  IIUC we only need to check for #ifndef _MSC_VER in the
> configure programs for meson.  pg_attribute_target will be empty on MSVC,
> and I believe we only support meson builds there.

Here is an updated patch with this change.

-- 
nathan
>From 8cf7c08220a9c0a1dec809794af2dfb719981923 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 16 Oct 2024 15:57:55 -0500
Subject: [PATCH v2 1/1] use __attribute__((target(...))) for AVX-512 stuff

---
 config/c-compiler.m4                 |  60 +++++-----
 configure                            | 163 ++++++---------------------
 configure.ac                         |  17 +--
 meson.build                          |  21 ++--
 src/Makefile.global.in               |   5 -
 src/include/c.h                      |  10 ++
 src/makefiles/meson.build            |   4 +-
 src/port/Makefile                    |  12 +-
 src/port/meson.build                 |   7 +-
 src/port/pg_popcount_avx512.c        |  86 +++++++++++++-
 src/port/pg_popcount_avx512_choose.c | 102 -----------------
 11 files changed, 175 insertions(+), 312 deletions(-)
 delete mode 100644 src/port/pg_popcount_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 10f8c7bd0a..aa90f8ef33 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -700,20 +700,20 @@ undefine([Ac_cachevar])dnl
 # Check if the compiler supports the XSAVE instructions using the _xgetbv
 # intrinsic function.
 #
-# An optional compiler flag can be passed as argument (e.g., -mxsave).  If the
-# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
+# If the intrinsics are supported, sets pgac_xsave_intrinsics.
 AC_DEFUN([PGAC_XSAVE_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
-  [return _xgetbv(0) & 0xe0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics])])dnl
+AC_CACHE_CHECK([for _xgetbv], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    __attribute__((target("xsave")))
+    static int xsave_test(void)
+    {
+      return _xgetbv(0) & 0xe0;
+    }],
+  [return xsave_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_XSAVE="$1"
   pgac_xsave_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
@@ -725,29 +725,27 @@ undefine([Ac_cachevar])dnl
 # _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
 # _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
 #
-# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
-# -mavx512bw).  If the intrinsics are supported, sets
-# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+# If the intrinsics are supported, sets pgac_avx512_popcnt_intrinsics.
 AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
-  [const char buf@<:@sizeof(__m512i)@:>@;
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, 
(const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    __attribute__((target("avx512vpopcntdq","avx512bw")))
+    static int popcount_test(void)
+    {
+      const char buf@<:@sizeof(__m512i)@:>@;
+      PG_INT64_TYPE popcnt = 0;
+      __m512i accum = _mm512_setzero_si512();
+      const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 
0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+      const __m512i cnt = _mm512_popcnt_epi64(val);
+      accum = _mm512_add_epi64(accum, cnt);
+      popcnt = _mm512_reduce_add_epi64(accum);
+      return (int) popcnt;
+    }],
+  [return popcount_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_POPCNT="$1"
   pgac_avx512_popcnt_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
diff --git a/configure b/configure
index 268ac94ae6..354c8740e1 100755
--- a/configure
+++ b/configure
@@ -647,9 +647,6 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
-PG_POPCNT_OBJS
-CFLAGS_POPCNT
-CFLAGS_XSAVE
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17266,185 +17263,99 @@ fi
 
 # Check for XSAVE intrinsics
 #
-CFLAGS_XSAVE=""
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
-if ${pgac_cv_xsave_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
+$as_echo_n "checking for _xgetbv... " >&6; }
+if ${pgac_cv_xsave_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <immintrin.h>
-int
-main ()
-{
-return _xgetbv(0) & 0xe0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_xsave_intrinsics_=yes
-else
-  pgac_cv_xsave_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" 
>&5
-$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
-if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
-  CFLAGS_XSAVE=""
-  pgac_xsave_intrinsics=yes
-fi
-
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with 
CFLAGS=-mxsave" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
-if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mxsave"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <immintrin.h>
+    __attribute__((target("xsave")))
+    static int xsave_test(void)
+    {
+      return _xgetbv(0) & 0xe0;
+    }
 int
 main ()
 {
-return _xgetbv(0) & 0xe0;
+return xsave_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_xsave_intrinsics__mxsave=yes
+  pgac_cv_xsave_intrinsics=yes
 else
-  pgac_cv_xsave_intrinsics__mxsave=no
+  pgac_cv_xsave_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_xsave_intrinsics__mxsave" >&5
-$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
-if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
-  CFLAGS_XSAVE="-mxsave"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics" >&5
+$as_echo "$pgac_cv_xsave_intrinsics" >&6; }
+if test x"$pgac_cv_xsave_intrinsics" = x"yes"; then
   pgac_xsave_intrinsics=yes
 fi
 
-fi
 if test x"$pgac_xsave_intrinsics" = x"yes"; then
 
 $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
 
 fi
 
-
 # Check for AVX-512 popcount intrinsics
 #
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
 if test x"$host_cpu" = x"x86_64"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 
with CFLAGS=" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64" 
>&5
+$as_echo_n "checking for _mm512_popcnt_epi64... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <immintrin.h>
-int
-main ()
-{
-const char buf[sizeof(__m512i)];
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, 
(const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_avx512_popcnt_intrinsics_=yes
-else
-  pgac_cv_avx512_popcnt_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_avx512_popcnt_intrinsics_" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
-if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
-  CFLAGS_POPCNT=""
-  pgac_avx512_popcnt_intrinsics=yes
-fi
-
-  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 
with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq 
-mavx512bw... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false; 
then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <immintrin.h>
+    __attribute__((target("avx512vpopcntdq","avx512bw")))
+    static int popcount_test(void)
+    {
+      const char buf[sizeof(__m512i)];
+      PG_INT64_TYPE popcnt = 0;
+      __m512i accum = _mm512_setzero_si512();
+      const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 
0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+      const __m512i cnt = _mm512_popcnt_epi64(val);
+      accum = _mm512_add_epi64(accum, cnt);
+      popcnt = _mm512_reduce_add_epi64(accum);
+      return (int) popcnt;
+    }
 int
 main ()
 {
-const char buf[sizeof(__m512i)];
-   PG_INT64_TYPE popcnt = 0;
-   __m512i accum = _mm512_setzero_si512();
-   const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, 
(const __m512i *) buf);
-   const __m512i cnt = _mm512_popcnt_epi64(val);
-   accum = _mm512_add_epi64(accum, cnt);
-   popcnt = _mm512_reduce_add_epi64(accum);
-   /* return computed value, to prevent the above being optimized away */
-   return popcnt == 0;
+return popcount_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes
+  pgac_cv_avx512_popcnt_intrinsics=yes
 else
-  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no
+  pgac_cv_avx512_popcnt_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6; 
}
-if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" = 
x"yes"; then
-  CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_avx512_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics" = x"yes"; then
   pgac_avx512_popcnt_intrinsics=yes
 fi
 
-  fi
   if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
-    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
 
 $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
   fi
 fi
 
-
-
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 3c89b54bf1..d2ee63ab03 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2048,32 +2048,19 @@ fi
 
 # Check for XSAVE intrinsics
 #
-CFLAGS_XSAVE=""
-PGAC_XSAVE_INTRINSICS([])
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
-  PGAC_XSAVE_INTRINSICS([-mxsave])
-fi
+PGAC_XSAVE_INTRINSICS()
 if test x"$pgac_xsave_intrinsics" = x"yes"; then
   AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE 
intrinsics.])
 fi
-AC_SUBST(CFLAGS_XSAVE)
 
 # Check for AVX-512 popcount intrinsics
 #
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
 if test x"$host_cpu" = x"x86_64"; then
-  PGAC_AVX512_POPCNT_INTRINSICS([])
-  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
-    PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
-  fi
+  PGAC_AVX512_POPCNT_INTRINSICS()
   if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
-    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
     AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use 
AVX-512 popcount instructions with a runtime check.])
   fi
 fi
-AC_SUBST(CFLAGS_POPCNT)
-AC_SUBST(PG_POPCNT_OBJS)
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
diff --git a/meson.build b/meson.build
index bb9d7f5a8e..ef162bd7a7 100644
--- a/meson.build
+++ b/meson.build
@@ -2153,25 +2153,22 @@ endforeach
 # Check for the availability of XSAVE intrinsics.
 ###############################################################
 
-cflags_xsave = []
 if host_cpu == 'x86' or host_cpu == 'x86_64'
 
   prog = '''
 #include <immintrin.h>
 
+#ifndef _MSC_VER
+__attribute__((target("xsave")))
+#endif
 int main(void)
 {
     return _xgetbv(0) & 0xe0;
 }
 '''
 
-  if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
-        args: test_c_args)
-    cdata.set('HAVE_XSAVE_INTRINSICS', 1)
-  elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
-        args: test_c_args + ['-mxsave'])
+  if cc.links(prog, name: 'XSAVE intrinsics', args: test_c_args)
     cdata.set('HAVE_XSAVE_INTRINSICS', 1)
-    cflags_xsave += '-mxsave'
   endif
 
 endif
@@ -2181,12 +2178,14 @@ endif
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
 
-cflags_popcnt = []
 if host_cpu == 'x86_64'
 
   prog = '''
 #include <immintrin.h>
 
+#ifndef _MSC_VER
+__attribute__((target("avx512vpopcntdq","avx512bw")))
+#endif
 int main(void)
 {
     const char buf[sizeof(__m512i)];
@@ -2201,13 +2200,9 @@ int main(void)
 }
 '''
 
-  if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq 
-mavx512bw',
+  if cc.links(prog, name: 'AVX-512 popcount',
         args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
     cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
-  elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq 
-mavx512bw',
-        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] 
+ ['-mavx512vpopcntdq'] + ['-mavx512bw'])
-    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
-    cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw']
   endif
 
 endif
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 42f50b4976..45696247e9 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,9 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
-CFLAGS_POPCNT = @CFLAGS_POPCNT@
 CFLAGS_CRC = @CFLAGS_CRC@
-CFLAGS_XSAVE = @CFLAGS_XSAVE@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@
 CXXFLAGS = @CXXFLAGS@
@@ -762,9 +760,6 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
-# files needed for the chosen popcount implementation
-PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
-
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
diff --git a/src/include/c.h b/src/include/c.h
index 55dec71a6d..6f5ca25542 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -174,6 +174,16 @@
 #define pg_attribute_nonnull(...)
 #endif
 
+/*
+ * pg_attribute_target allows specifying different target options that the
+ * function should be compiled with (e.g., for using special CPU instructions).
+ */
+#if __has_attribute (target)
+#define pg_attribute_target(...) __attribute__((target(__VA_ARGS__)))
+#else
+#define pg_attribute_target(...)
+#endif
+
 /*
  * Append PG_USED_FOR_ASSERTS_ONLY to definitions of variables that are only
  * used in assert-enabled builds, to avoid compiler warnings about unused
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index 850e927584..479aa08420 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -102,10 +102,8 @@ pgxs_kv = {
     ' '.join(cflags_no_missing_var_decls),
 
   'CFLAGS_CRC': ' '.join(cflags_crc),
-  'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
-  'CFLAGS_XSAVE': ' '.join(cflags_xsave),
 
   'LDFLAGS': var_ldflags,
   'LDFLAGS_EX': var_ldflags_ex,
@@ -181,7 +179,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated 
makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
   'DTRACEFLAGS', # only server has dtrace probes
 
   'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 
'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index 9324ec2d9f..366c814bd9 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,13 +38,13 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
        $(LIBOBJS) \
        $(PG_CRC32C_OBJS) \
-       $(PG_POPCNT_OBJS) \
        bsearch_arg.o \
        chklocale.o \
        inet_net_ntop.o \
        noblock.o \
        path.o \
        pg_bitutils.o \
+       pg_popcount_avx512.o \
        pg_strong_random.o \
        pgcheckdir.o \
        pgmkdirp.o \
@@ -92,16 +92,6 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
-# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
-pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
-
-# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
-pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
-
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 1150966ab7..83a0632520 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
@@ -84,8 +85,6 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
-  ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
-  ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 
'xsave'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -100,8 +99,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': 
cflags_xsave}
-pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
+pgport_cflags = {'crc': cflags_crc}
+pgport_sources_cflags = {'crc': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index 9d3149e2d0..b598e86554 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -12,7 +12,17 @@
  */
 #include "c.h"
 
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 #include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
 
 #include "port/pg_bitutils.h"
 
@@ -21,12 +31,82 @@
  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
  * the function pointers that are only used when TRY_POPCNT_FAST is set.
  */
-#ifdef TRY_POPCNT_FAST
+#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK)
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+       unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+       __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+       __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+       return (exx[2] & (1 << 27)) != 0;       /* osxsave */
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+zmm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+       return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+       return false;
+#endif
+}
+
+/*
+ * Does CPUID say there's support for AVX-512 popcount and byte-and-word
+ * instructions?
+ */
+static inline bool
+avx512_popcnt_available(void)
+{
+       unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID_COUNT)
+       __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+       __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+       return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
+               (exx[1] & (1 << 30)) != 0;      /* avx512-bw */
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_popcount() implementation.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+       return xsave_available() &&
+               zmm_regs_available() &&
+               avx512_popcnt_available();
+}
 
 /*
  * pg_popcount_avx512
  *             Returns the number of 1-bits in buf
  */
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
 uint64
 pg_popcount_avx512(const char *buf, int bytes)
 {
@@ -82,6 +162,7 @@ pg_popcount_avx512(const char *buf, int bytes)
  * pg_popcount_masked_avx512
  *             Returns the number of 1-bits in buf after applying the mask to 
each byte
  */
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
 uint64
 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
 {
@@ -138,4 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 
mask)
        return _mm512_reduce_add_epi64(accum);
 }
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* TRY_POPCNT_FAST &&
+                                                                * 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_avx512_choose.c 
b/src/port/pg_popcount_avx512_choose.c
deleted file mode 100644
index b37107803a..0000000000
--- a/src/port/pg_popcount_avx512_choose.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * pg_popcount_avx512_choose.c
- *    Test whether we can use the AVX-512 pg_popcount() implementation.
- *
- * Copyright (c) 2024, PostgreSQL Global Development Group
- *
- * IDENTIFICATION
- *    src/port/pg_popcount_avx512_choose.c
- *
- *-------------------------------------------------------------------------
- */
-#include "c.h"
-
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
-#ifdef HAVE_XSAVE_INTRINSICS
-#include <immintrin.h>
-#endif
-
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
-#include "port/pg_bitutils.h"
-
-/*
- * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
- * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_FAST is set.
- */
-#ifdef TRY_POPCNT_FAST
-
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
-       unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-       __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-       __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-       return (exx[2] & (1 << 27)) != 0;       /* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
-       return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
-       return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
-       unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
-       __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
-       __cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
-       return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
-               (exx[1] & (1 << 30)) != 0;      /* avx512-bw */
-}
-
-/*
- * Returns true if the CPU supports the instructions required for the AVX-512
- * pg_popcount() implementation.
- */
-bool
-pg_popcount_avx512_available(void)
-{
-       return xsave_available() &&
-               zmm_regs_available() &&
-               avx512_popcnt_available();
-}
-
-#endif                                                 /* TRY_POPCNT_FAST */
-- 
2.39.5 (Apple Git-154)

Reply via email to