From fda615dc6260f91cb53c1c84b438dbb8251eb09b Mon Sep 17 00:00:00 2001
From: Dave Cramer <davecramer@gmail.com>
Date: Sun, 13 Jul 2025 06:33:17 -0400
Subject: [PATCH v4] Enable PostgreSQL build on Windows 11 ARM64 with MSVC

Add support for the ARM64 architecture on Windows 11 using the MSVC compiler,
addressing build issues and implementing proper memory synchronization
semantics for this platform.

Changes:

1. Spinlock delay implementation (s_lock.h)
   - Implement spin_delay() for ARM64 MSVC using __isb(_ARM64_BARRIER_SY)
   - ISB (Instruction Synchronization Barrier) provides an effective delay
     hint for spin-wait loops on ARM64, flushing the instruction pipeline
     to reduce power consumption on high-core-count systems

2. CRC32C hardware acceleration (meson.build, pg_crc32c_armv8.c)
   - Detect CRC32C capability at build time for MSVC ARM64
   - Fall back to the software implementation if hardware acceleration
     is unavailable

3. Spinlock release memory barrier (s_lock.h)
   - Replace compiler barrier (_ReadWriteBarrier) with hardware barrier
     (__dmb(_ARM64_BARRIER_SY)) in S_UNLOCK macro for ARM64 MSVC
   - Compiler barriers are insufficient on ARM64's weak memory model;
     __dmb ensures all prior memory operations complete before the lock
     is released
   - Prevents memory reordering across cores, ensuring data visibility
     to threads acquiring the released lock
   - Critical for correctness in concurrent operations (e.g., WAL
     synchronization)

This patch brings ARM64 MSVC support to parity with existing GCC/Clang
implementations on the same architecture.

Author: Greg Burd <greg@burd.me>
Author: Dave Cramer <davecramer@gmail.com>
Discussion: https://postgr.es/m/3c576ad7-d2da-4137-b791-5821da7cc370%40app.fastmail.com
Reference: https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics#BarrierRestrictions
---
 doc/src/sgml/installation.sgml |  2 +-
 meson.build                    | 81 ++++++++++++++++++++++++++--------
 src/include/storage/s_lock.h   | 29 ++++++++++--
 src/port/pg_crc32c_armv8.c     |  6 +++
 src/tools/msvc_gendef.pl       |  8 ++--
 5 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index fe8d73e1f8c..3f8d512a906 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -3967,7 +3967,7 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib"
    <sect3 id="install-windows-full-64-bit">
     <title>Special Considerations for 64-Bit Windows</title>
     <para>
-     PostgreSQL will only build for the x64 architecture on 64-bit Windows.
+     PostgreSQL will only build for the x64 and ARM64 architectures on 64-bit Windows.
     </para>
     <para>
      Mixing 32- and 64-bit versions in the same build tree is not supported.
diff --git a/meson.build b/meson.build
index 6e7ddd74683..f71be89da97 100644
--- a/meson.build
+++ b/meson.build
@@ -2494,7 +2494,11 @@ int main(void)
 elif host_cpu == 'arm' or host_cpu == 'aarch64'
 
   prog = '''
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
 #include <arm_acle.h>
+#endif
 unsigned int crc;
 
 int main(void)
@@ -2509,25 +2513,64 @@ int main(void)
 }
 '''
 
-  if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
-      args: test_c_args)
-    # Use ARM CRC Extension unconditionally
-    cdata.set('USE_ARMV8_CRC32C', 1)
-    have_optimized_crc = true
-  elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc+simd',
-      args: test_c_args + ['-march=armv8-a+crc+simd'])
-    # Use ARM CRC Extension, with runtime check
-    cflags_crc += '-march=armv8-a+crc+simd'
-    cdata.set('USE_ARMV8_CRC32C', false)
-    cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
-    have_optimized_crc = true
-  elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc',
-      args: test_c_args + ['-march=armv8-a+crc'])
-    # Use ARM CRC Extension, with runtime check
-    cflags_crc += '-march=armv8-a+crc'
-    cdata.set('USE_ARMV8_CRC32C', false)
-    cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
-    have_optimized_crc = true
+  if cc.get_id() == 'msvc'
+    # MSVC: Intrinsic availability check for ARM64
+    if host_machine.cpu_family() == 'aarch64'
+      # Probe intrin.h for __crc32cb; plain C types avoid needing <stdint.h>
+      crc32c_test_msvc = '''
+        #include <intrin.h>
+        int main(void) {
+          unsigned int crc = 0;
+          unsigned char data = 0;
+          crc = __crc32cb(crc, data);
+          return 0;
+        }
+      '''
+      if cc.links(crc32c_test_msvc, name: '__crc32cb intrinsic available')
+        cdata.set('USE_ARMV8_CRC32C', 1)
+        have_optimized_crc = true
+        message('Using ARM64 CRC32C hardware acceleration (MSVC)')
+      else
+        message('CRC32C intrinsics not available on this MSVC ARM64 build')
+      endif
+    endif
+
+  else
+    # GCC/Clang paths (arm and aarch64): Try progressively with weaker requirements
+
+    # First: Try without any special flags (built-in support)
+    if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
+        args: test_c_args)
+      cdata.set('USE_ARMV8_CRC32C', 1)
+      have_optimized_crc = true
+      message('Using ARM CRC32C without flags (built-in support)')
+
+    # Second: Try with -march=armv8-a+crc+simd (newer toolchains)
+    elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc+simd',
+        args: test_c_args + ['-march=armv8-a+crc+simd'])
+      cflags_crc += '-march=armv8-a+crc+simd'
+      cdata.set('USE_ARMV8_CRC32C', false)
+      cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+      message('Using ARM CRC32C with runtime check (-march=armv8-a+crc+simd)')
+
+    # Third: Try with -march=armv8-a+crc (basic flag)
+    elif cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd with -march=armv8-a+crc',
+        args: test_c_args + ['-march=armv8-a+crc'])
+      cflags_crc += '-march=armv8-a+crc'
+      cdata.set('USE_ARMV8_CRC32C', false)
+      cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+      message('Using ARM CRC32C with runtime check (-march=armv8-a+crc)')
+
+    else
+      message('CRC32C optimization not available for this ARM GCC/Clang build')
+    endif
+  endif
+
+  # Fallback: Use software CRC if no hardware acceleration found
+  if not have_optimized_crc
+    message('CRC32C: Using software implementation')
   endif
 
 elif host_cpu == 'loongarch64'
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 7f8f566bd40..029f2fa9729 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -602,15 +602,32 @@ typedef LONG slock_t;
 
 #define SPIN_DELAY() spin_delay()
 
-/* If using Visual C++ on Win64, inline assembly is unavailable.
- * Use a _mm_pause intrinsic instead of rep nop.
+/*
+ * If using Visual C++ on Win64, inline assembly is unavailable.
+ * Use architecture specific intrinsics.
  */
 #if defined(_WIN64)
+/*
+ * For Arm64, use __isb intrinsic. See aarch64 inline assembly definition for details.
+ */
+#ifdef _M_ARM64
+
+static __forceinline void
+spin_delay(void)
+{
+	/* Reference: https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics#BarrierRestrictions */
+	__isb(_ARM64_BARRIER_SY);
+}
+#else
+/*
+ * For x64, use _mm_pause intrinsic instead of rep nop.
+ */
 static __forceinline void
 spin_delay(void)
 {
 	_mm_pause();
 }
+#endif
 #else
 static __forceinline void
 spin_delay(void)
@@ -623,9 +640,13 @@ spin_delay(void)
 #include <intrin.h>
 #pragma intrinsic(_ReadWriteBarrier)
 
-#define S_UNLOCK(lock)	\
+#ifdef _M_ARM64
+#define S_UNLOCK(lock) \
+	do { __dmb(_ARM64_BARRIER_SY); (*(lock)) = 0; } while (0)
+#else
+#define S_UNLOCK(lock) \
 	do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
-
+#endif
 #endif
 
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..29a91dca62f 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -14,7 +14,13 @@
  */
 #include "c.h"
 
+#ifdef _MSC_VER
+ /* MSVC ARM64 intrinsics */
+#include <intrin.h>
+#else
+ /* GCC/Clang: Use ACLE intrinsics from arm_acle.h */
 #include <arm_acle.h>
+#endif
 
 #include "port/pg_crc32c.h"
 
diff --git a/src/tools/msvc_gendef.pl b/src/tools/msvc_gendef.pl
index 868aad51b09..c92c94c4775 100644
--- a/src/tools/msvc_gendef.pl
+++ b/src/tools/msvc_gendef.pl
@@ -118,9 +118,9 @@ sub writedef
 	{
 		my $isdata = $def->{$f} eq 'data';
 
-		# Strip the leading underscore for win32, but not x64
+		# Strip the leading underscore for win32, but not for x64 or aarch64
 		$f =~ s/^_//
-		  unless ($arch eq "x86_64");
+		  unless ($arch eq "x86_64" || $arch eq "aarch64");
 
 		# Emit just the name if it's a function symbol, or emit the name
 		# decorated with the DATA option for variables.
@@ -141,7 +141,7 @@ sub writedef
 sub usage
 {
 	die("Usage: msvc_gendef.pl --arch <arch> --deffile <deffile> --tempdir <tempdir> files-or-directories\n"
-		  . "    arch: x86 | x86_64\n"
+		  . "    arch: x86 | x86_64 | aarch64\n"
 		  . "    deffile: path of the generated file\n"
 		  . "    tempdir: directory for temporary files\n"
 		  . "    files or directories: object files or directory containing object files\n"
@@ -158,7 +158,7 @@ GetOptions(
 	'tempdir:s' => \$tempdir,) or usage();
 
 usage("arch: $arch")
-  unless ($arch eq 'x86' || $arch eq 'x86_64');
+  unless ($arch eq 'x86' || $arch eq 'x86_64' || $arch eq 'aarch64');
 
 my @files;
 
-- 
2.52.0.windows.1

