From 85dc82d7ad53c1bd19725883125f5f19a506973e Mon Sep 17 00:00:00 2001
From: Dave Cramer <davecramer@gmail.com>
Date: Sun, 13 Jul 2025 06:33:17 -0400
Subject: [PATCH v7] Enable the Microsoft Windows ARM64/MSVC platform

Add support for the ARM64 architecture on Windows 11 using MSVC compiler
addressing build issues and implementing proper memory synchronization
semantics for this platform.

* Implement spin_delay() with __isb(_ARM64_BARRIER_SY) intrinsic to emit
  the "ISB SY" instruction which matches the GCC/Clang approach to
  spinloop delay and emperical evidence that it out-scales the YIELD
  instruction in practice.

* Unconditionally choose to use the MSVC supplied intrinsic
  for CRC32 on ARM64.

* Implement the S_UNLOCK() macro using the InterlockedExchange()
  intrinsic.

Author: Greg Burd <greg@burd.me>
Author: Dave Cramer <davecramer@gmail.com>
Discussion: https://postgr.es/m/3c576ad7-d2da-4137-b791-5821da7cc370%40app.fastmail.com
---
 doc/src/sgml/installation.sgml |  2 +-
 meson.build                    |  9 +++--
 src/include/storage/s_lock.h   | 60 +++++++++++++++++++++++++++-------
 src/port/pg_crc32c_armv8.c     |  6 ++++
 src/tools/msvc_gendef.pl       |  8 ++---
 5 files changed, 67 insertions(+), 18 deletions(-)

diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index fe8d73e1f8c..3f8d512a906 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -3967,7 +3967,7 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib"
    <sect3 id="install-windows-full-64-bit">
     <title>Special Considerations for 64-Bit Windows</title>
     <para>
-     PostgreSQL will only build for the x64 architecture on 64-bit Windows.
+     PostgreSQL will only build for the x64 and ARM64 architectures on 64-bit Windows.
     </para>
     <para>
      Mixing 32- and 64-bit versions in the same build tree is not supported.
diff --git a/meson.build b/meson.build
index 718150e3ac0..e57c233b124 100644
--- a/meson.build
+++ b/meson.build
@@ -2512,7 +2512,12 @@ int main(void)
 }
 '''
 
-  if cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
+  # Check first for a MSVC/ARM64 combo because the test prog above won't
+  # compile (as it doesn't '#ifdef _MSC_VER #include <intrin.h>'), which
+  # is okay as we know for a fact that this platform combo supports the
+  # intrinsic for ARM64 CRC the test performs, so use that unconditionally.
+  if (host_machine.cpu_family() == 'aarch64' and cc.get_id() == 'msvc') or
+        cc.links(prog, name: '__crc32cb, __crc32ch, __crc32cw, and __crc32cd without -march=armv8-a+crc',
       args: test_c_args)
     # Use ARM CRC Extension unconditionally
     cdata.set('USE_ARMV8_CRC32C', 1)
@@ -2531,7 +2536,7 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C', false)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
-  endif
+endif
 
 elif host_cpu == 'loongarch64'
 
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 7f8f566bd40..50262cca887 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -594,7 +594,8 @@ tas(volatile slock_t *lock)
 
 #if !defined(HAS_TEST_AND_SET)	/* We didn't trigger above, let's try here */
 
-#ifdef _MSC_VER
+/* When compiling for Microsoft Windows using MSVC */
+#if defined(_MSC_VER)
 typedef LONG slock_t;
 
 #define HAS_TEST_AND_SET
@@ -602,34 +603,71 @@ typedef LONG slock_t;
 
 #define SPIN_DELAY() spin_delay()
 
-/* If using Visual C++ on Win64, inline assembly is unavailable.
- * Use a _mm_pause intrinsic instead of rep nop.
+/*
+ * _InterlockedExchange() generates a full memory barrier (or release
+ * semantics that ensures all prior memory operations are visible to
+ * other cores before the lock is released.
+ */
+#define S_UNLOCK(lock) (InterlockedExchange(lock, 0))
+
+#if defined(_WIN64) /* Microsoft Windows x64 */
+
+#if defined(_M_ARM64) /* aarch64 */
+/*
+ * While there is support for a __yield() intrinsic for MSVC/ARM64[1], there
+ * is a wealth of real-world testing across databases and languages as well
+ * as a blog post by ARM[2] suggest that ISB is the most scalable and power
+ * friendly instruction to use for spinlock delay loops. So we use the only
+ * supported intrinsic/flag combination availble for this platform combo[3].
+ * This matches what we do above when compiling with either GCC or Clang.
+ *
+ * [1] https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
+ * [2] https://developer.arm.com/community/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
+ * [3] https://github.com/MicrosoftDocs/cpp-docs/blob/main/docs/intrinsics/arm64-intrinsics.md
+ */
+static __forceinline void
+spin_delay(void)
+{
+	__isb(_ARM64_BARRIER_SY);
+}
+
+#elif defined(_M_X64) /* x86-64 */
+
+/*
+ * Use _mm_pause() intrinsic for x86-64. This emits the PAUSE instruction,
+ * which improves performance in spin-wait loops by preventing pipeline flush
+ * on Hyper-Threading systems.
  */
-#if defined(_WIN64)
 static __forceinline void
 spin_delay(void)
 {
 	_mm_pause();
 }
-#else
+
+#endif /* defined(_M_ARM64|_M_X64) */
+
+#else /* !defined(_WIN64) */
+
+#ifdef _M_IX86 /* x86-specific */
+
+/* Use no-op for MSVC 32bit x86 */
 static __forceinline void
 spin_delay(void)
 {
 	/* See comment for gcc code. Same code, MASM syntax */
 	__asm rep nop;
 }
-#endif
 
 #include <intrin.h>
 #pragma intrinsic(_ReadWriteBarrier)
 
-#define S_UNLOCK(lock)	\
+#define S_UNLOCK(lock) \
 	do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
 
-#endif
-
-
-#endif	/* !defined(HAS_TEST_AND_SET) */
+#endif /* defined(_M_IX86) */
+#endif /* defined(_WIN64) */
+#endif /* defined(_MSC_VER) */
+#endif /* !defined(HAS_TEST_AND_SET) */
 
 
 /* Blow up if we didn't have any way to do spinlocks */
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..29a91dca62f 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -14,7 +14,13 @@
  */
 #include "c.h"
 
+#ifdef _MSC_VER
+ /* MSVC ARM64 intrinsics */
+#include <intrin.h>
+#else
+ /* GCC/Clang: Use ACLE intrinsics from arm_acle.h */
 #include <arm_acle.h>
+#endif
 
 #include "port/pg_crc32c.h"
 
diff --git a/src/tools/msvc_gendef.pl b/src/tools/msvc_gendef.pl
index 868aad51b09..c92c94c4775 100644
--- a/src/tools/msvc_gendef.pl
+++ b/src/tools/msvc_gendef.pl
@@ -118,9 +118,9 @@ sub writedef
 	{
 		my $isdata = $def->{$f} eq 'data';
 
-		# Strip the leading underscore for win32, but not x64
+		# Strip the leading underscore for win32, but not x64 and aarch64
 		$f =~ s/^_//
-		  unless ($arch eq "x86_64");
+		  unless ($arch eq "x86_64" || $arch eq "aarch64");
 
 		# Emit just the name if it's a function symbol, or emit the name
 		# decorated with the DATA option for variables.
@@ -141,7 +141,7 @@ sub writedef
 sub usage
 {
 	die("Usage: msvc_gendef.pl --arch <arch> --deffile <deffile> --tempdir <tempdir> files-or-directories\n"
-		  . "    arch: x86 | x86_64\n"
+		  . "    arch: x86 | x86_64 | aarch64\n"
 		  . "    deffile: path of the generated file\n"
 		  . "    tempdir: directory for temporary files\n"
 		  . "    files or directories: object files or directory containing object files\n"
@@ -158,7 +158,7 @@ GetOptions(
 	'tempdir:s' => \$tempdir,) or usage();
 
 usage("arch: $arch")
-  unless ($arch eq 'x86' || $arch eq 'x86_64');
+  unless ($arch eq 'x86' || $arch eq 'x86_64' || $arch eq 'aarch64');
 
 my @files;
 
-- 
2.52.0.windows.1

