Here's what I have staged for commit (except for the commit message, which
needs some work).

-- 
nathan
>From c405f5a41cca9ce5de9c0e69e00d11b16ba93986 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Tue, 26 Nov 2024 14:17:03 -0600
Subject: [PATCH v6 1/1] Use __attribute__((target(...))) for SSE4.2 support.

Author: Raghuveer Devulapalli
Discussion: 
https://postgr.es/m/PH8PR11MB8286BE735A463468415D46B5FB5C2%40PH8PR11MB8286.namprd11.prod.outlook.com
---
 config/c-compiler.m4       | 32 +++++++------
 configure                  | 93 ++++++++++++--------------------------
 configure.ac               | 19 ++++----
 meson.build                | 22 ++++++---
 src/port/Makefile          |  5 --
 src/port/meson.build       |  2 +-
 src/port/pg_crc32c_sse42.c |  1 +
 7 files changed, 72 insertions(+), 102 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index a129edb88e..309d5b04b4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -605,24 +605,26 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
 # test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
 # the other ones are, on x86-64 platforms)
 #
-# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
-# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
+# If the intrinsics are supported, sets pgac_sse42_crc32_intrinsics.
 AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], 
[Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
-  [unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("sse4.2")))
+    #endif
+    static int crc32_sse42_test(void)
+    {
+      unsigned int crc = 0;
+      crc = _mm_crc32_u8(crc, 0);
+      crc = _mm_crc32_u32(crc, 0);
+      /* return computed value, to prevent the above being optimized away */
+      return crc == 0;
+    }],
+  [return crc32_sse42_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_CRC="$1"
   pgac_sse42_crc32_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
diff --git a/configure b/configure
index 199d666aa7..cb89866649 100755
--- a/configure
+++ b/configure
@@ -17375,87 +17375,47 @@ fi
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and 
_mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; 
}
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and 
_mm_crc32_u32" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_sse42_crc32_intrinsics_=yes
-else
-  pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
-  CFLAGS_CRC=""
-  pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and 
_mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with 
CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <nmmintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("sse4.2")))
+    #endif
+    static int crc32_sse42_test(void)
+    {
+      unsigned int crc = 0;
+      crc = _mm_crc32_u8(crc, 0);
+      crc = _mm_crc32_u32(crc, 0);
+      /* return computed value, to prevent the above being optimized away */
+      return crc == 0;
+    }
 int
 main ()
 {
-unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;
+return crc32_sse42_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+  pgac_cv_sse42_crc32_intrinsics=yes
 else
-  pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+  pgac_cv_sse42_crc32_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
-  CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
   pgac_sse42_crc32_intrinsics=yes
 fi
 
-fi
 
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
@@ -17658,15 +17618,20 @@ fi
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
 # use the special CRC instructions for calculating CRC-32C. If we're not
 # targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
 #
 # Similarly, if we are targeting an ARM processor that has the CRC
 # instructions that are part of the ARMv8 CRC Extension, use them. And if
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags.  Perhaps we can re-evaluate this decision after some
+# time has passed.
+#
 # You can skip the runtime check by setting the appropriate USE_*_CRC32 flag 
to 1
 # in the template or configure command line.
 #
diff --git a/configure.ac b/configure.ac
index 4f56bb5062..5785600faf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2069,13 +2069,7 @@ fi
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
-  PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
+PGAC_SSE42_CRC32_INTRINSICS()
 
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
@@ -2112,15 +2106,20 @@ AC_SUBST(CFLAGS_CRC)
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
 # use the special CRC instructions for calculating CRC-32C. If we're not
 # targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
 #
 # Similarly, if we are targeting an ARM processor that has the CRC
 # instructions that are part of the ARMv8 CRC Extension, use them. And if
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags.  Perhaps we can re-evaluate this decision after some
+# time has passed.
+#
 # You can skip the runtime check by setting the appropriate USE_*_CRC32 flag 
to 1
 # in the template or configure command line.
 #
diff --git a/meson.build b/meson.build
index 83e61d0f4a..d6dc8c8525 100644
--- a/meson.build
+++ b/meson.build
@@ -2211,14 +2211,19 @@ endif
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
 # use the special CRC instructions for calculating CRC-32C. If we're not
 # targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
-# implementations and select which one to use at runtime, depending on whether
-# SSE 4.2 is supported by the processor we're running on.
+# the SSE intrinsics, compile both implementations and select which one to use
+# at runtime, depending on whether SSE 4.2 is supported by the processor we're
+# running on.
 #
 # Similarly, if we are targeting an ARM processor that has the CRC
 # instructions that are part of the ARMv8 CRC Extension, use them. And if
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
+#
+# Note that we do not use __attribute__((target("..."))) for the ARM CRC
+# instructions because until Clang 16.0.0, using the intrinsics still requires
+# special -march flags.  Perhaps we can re-evaluate this decision after some
+# time has passed.
 ###############################################################
 
 have_optimized_crc = false
@@ -2234,6 +2239,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
     prog = '''
 #include <nmmintrin.h>
 
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
 int main(void)
 {
     unsigned int crc = 0;
@@ -2244,16 +2252,16 @@ int main(void)
 }
 '''
 
-    if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+    if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
           args: test_c_args)
+      # Do not use Intel SSE 4.2
+    elif (cc.get_define('__SSE4_2__') != '')
       # Use Intel SSE 4.2 unconditionally.
       cdata.set('USE_SSE42_CRC32C', 1)
       have_optimized_crc = true
-    elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
-          args: test_c_args + ['-msse4.2'])
+    else
       # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed 
for
       # the runtime check.
-      cflags_crc += '-msse4.2'
       cdata.set('USE_SSE42_CRC32C', false)
       cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
       have_optimized_crc = true
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
        rm -f $@
        $(AR) $(AROPT) $@ $^
 
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..c5bceed9cd 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -82,7 +82,7 @@ endif
 replace_funcs_pos = [
   # x86/x64
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
-  ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..dcc4904a82 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -19,6 +19,7 @@
 #include "port/pg_crc32c.h"
 
 pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
-- 
2.39.5 (Apple Git-154)

Reply via email to