We did something similar for x86 for v18, and here is some progress
towards Arm support.

0001: Like e2809e3a101 -- inline the computation for small
constant-length inputs, to compensate for the fact that 0002 will do a
runtime check even when the usual CRC extension is targeted. There is
a difference from x86, however: on Arm we currently align on an 8-byte
boundary before looping over 8-byte chunks, and that alignment branch
prevents the compiler from unrolling the loop. We could use 4-byte
chunks to get around that, but it's not clear which way is best, so
I've coded it so it's easy to try both ways.
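
For illustration, here is an untested sketch of the 4-byte-only
alternative for the inlined path (the helper name is made up; it
assumes arm_acle.h and our uint32 typedef are in scope):

static inline pg_crc32c
pg_comp_crc32c_small_w(pg_crc32c crc, const unsigned char *p, size_t len)
{
	/* no alignment branch, so a constant len can be fully unrolled */
	for (; len >= 4; p += 4, len -= 4)
		crc = __crc32cw(crc, *(const uint32 *) p);
	for (; len > 0; --len)
		crc = __crc32cb(crc, *p++);
	return crc;
}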

0002: Like 3c6e8c12389 and in fact uses the same program to generate
the code, by specifying Neon instructions with the Arm "crypto"
extension instead. There are some interesting differences from x86
here as well:
- The upstream implementation chose to use inline assembly instead of
intrinsics for some reason. I initially thought that was a way to get
broader compiler support, but it turns out you still need to pass the
relevant flags to get the assembly to link. (An untested sketch of an
intrinsics equivalent is after this list.)
- I only have Meson support for now, since I used macOS on CI for
testing. That OS and compiler combination apparently targets the CRC
extension by default, but the PMULL runtime check uses Linux-only
headers, I believe, so for testing I previously hacked the choose
function to return true. The choose function in 0002 is untested in
this form.
- On x86 it could be fairly costly to align on a cacheline boundary
before beginning the main loop, so I elected to skip that for
short-ish inputs in PG18. On Arm the main loop uses four 16-byte
accumulators, so the patch follows upstream and always aligns on a
16-byte boundary.
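
As for the inline assembly above, this is roughly what one of the
pmull+eor helpers (clmul_lo_e in the generated code) might look like
with intrinsics (untested; the name is mine, and it assumes vmull_p64
is available under the same +crypto flags):

static inline uint64x2_t
clmul_lo_intrin(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
	/* carryless multiply of the low 64-bit lanes, then xor in c */
	poly128_t	p = vmull_p64((poly64_t) vgetq_lane_u64(a, 0),
							  (poly64_t) vgetq_lane_u64(b, 0));

	return veorq_u64(vreinterpretq_u64_p128(p), c);
}

Whether the compiler keeps the eor next to the pmull the way the
inline assembly forces it to is a separate question.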

0003: An afterthought regarding the above-mentioned alignment: this is
an alternative preamble that might shave a couple of cycles for
4-byte-aligned inputs, e.g. WAL.

--
John Naylor
Amazon Web Services
From 9f3096da8fbed3f1457e7d52dfd91e6b29557239 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 27 Apr 2025 04:04:28 +0700
Subject: [PATCH v1 1/3] Inline CRC computation for small fixed-length input on
 Arm

In a similar vein to e2809e3a1. One difference is that the dispatch
function requires 4-byte alignment to prevent unnecessary branching
in the preamble. This corresponds to the alignment of WAL records.
---
 src/include/port/pg_crc32c.h | 44 +++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 82313bb7fcf..9f021db83b2 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -114,11 +114,53 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 /* Use ARMv8 CRC Extension instructions. */
 
 #define COMP_CRC32C(crc, data, len)							\
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	/* require 4-byte alignment to avoid a long preamble */
+	if (__builtin_constant_p(len) &&
+		PointerIsAligned(data, uint32) &&
+		len < 32)
+	{
+		const unsigned char *p = data;
+
+		/*
+		 * For small constant inputs, inline the computation to avoid a
+		 * function call and allow the compiler to unroll loops.
+		 */
+#if 1
+
+		/*
+		 * WIP: is it better to avoid branching by unrolling the loop and
+		 * processing only 4 bytes per iteration?
+		 */
+		if (!PointerIsAligned(p, uint64) && len > 4)
+		{
+			crc = __crc32cw(crc, *(const uint32 *) p);
+			p += 4;
+			len -= 4;
+		}
+#if SIZEOF_VOID_P >= 8
+		for (; len >= 8; p += 8, len -= 8)
+			crc = __crc32cd(crc, *(const uint64 *) p);
+#endif
+#endif
+		for (; len >= 4; p += 4, len -= 4)
+			crc = __crc32cw(crc, *(const uint32 *) p);
+		for (; len > 0; --len)
+			crc = __crc32cb(crc, *p++);
+		return crc;
+	}
+	else
+		return pg_comp_crc32c_armv8(crc, data, len);
+}
+
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
 
-- 
2.49.0

From d447d42e5b0a89e4c2d0e6c40e38980df558b899 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 9 May 2025 19:48:26 +0700
Subject: [PATCH v1 2/3] Compute CRC32C on ARM using the Crypto Extension where
 available

---
 meson.build                       |  32 ++++++++
 src/include/port/pg_crc32c.h      |  24 ++++--
 src/port/meson.build              |   1 +
 src/port/pg_crc32c_armv8.c        | 125 ++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c |  36 +++++++++
 5 files changed, 213 insertions(+), 5 deletions(-)

diff --git a/meson.build b/meson.build
index 12de5e80c31..5f7a10f4c8d 100644
--- a/meson.build
+++ b/meson.build
@@ -2523,6 +2523,38 @@ int main(void)
     have_optimized_crc = true
   endif
 
+  # Check if the compiler supports the ARMv8 Crypto Extension's carryless
+  # multiplication (PMULL) and exclusive-or instructions used for computing
+  # CRC.  We test with cflags_crc since we also need __crc32cd.
+  prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t	a;
+uint64x2_t	b;
+uint64x2_t	c;
+
+int main(void)
+{
+    uint64x2_t	r;
+    uint64x2_t	r2;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    /* return computed value, to prevent the above being optimized away */
+    r = veorq_u64(r, r2);
+    return __crc32cd(0, vgetq_lane_u64(r, 0));
+}
+'''
+
+  if cc.links(prog,
+      name: 'PMULL CRC32C',
+      args: test_c_args + ['-march=armv8-a+crc+simd+crypto'])
+    # Use ARM CRYPTO Extension, with runtime check
+    cflags_crc += '-march=armv8-a+crc+simd+crypto'
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 9f021db83b2..def3de5dfb9 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so we can inline those in some cases.
+ */
+
+#include <arm_acle.h>
 
 #define COMP_CRC32C(crc, data, len)							\
 	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 static inline
 pg_crc32c
@@ -132,13 +141,14 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 
 		/*
 		 * For small constant inputs, inline the computation to avoid a
-		 * function call and allow the compiler to unroll loops.
+		 * function call.
 		 */
 #if 1
 
 		/*
-		 * WIP: is it better to avoid branching by unrolling the loop and
-		 * processing only 4 bytes per iteration?
+		 * WIP: Unlike x86, using 8-byte variants requires 8-byte alignment,
+		 * so the compiler cannot unroll the loop unless we restrict ourselves
+		 * to the 4-byte variant. Needs testing to choose the best one.
 		 */
 		if (!PointerIsAligned(p, uint64) && len > 4)
 		{
@@ -158,7 +168,8 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 		return crc;
 	}
 	else
-		return pg_comp_crc32c_armv8(crc, data, len);
+		/* Otherwise, use a runtime check for PMULL instructions. */
+		return pg_comp_crc32c(crc, data, len);
 }
 
 #elif defined(USE_LOONGARCH_CRC32C)
@@ -183,6 +194,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index fc7b059fee5..8b0ac6b931a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -94,6 +94,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..f67de2016b4 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -15,6 +15,9 @@
 #include "c.h"
 
 #include <arm_acle.h>
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
 
 #include "port/pg_crc32c.h"
 
@@ -73,3 +76,125 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration and alignment macro
+ *   - match whitespace to our project style
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* Align to 16 bytes to prevent straddling cacheline boundaries. */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		uint64x2_t	x0 = vld1q_u64((const uint64_t *) buf),
+					y0;
+		uint64x2_t	x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+					y1;
+		uint64x2_t	x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+					y2;
+		uint64x2_t	x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+					y3;
+		uint64x2_t	k;
+
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+			k = vld1q_u64(k_);
+		}
+
+
+		/* pgindent doesn't like this: */
+		x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+		/*
+		 * a possible alternative?
+		 *
+		 * x0 = veorq_u64((uint64x2_t) vsetq_lane_u32(crc0, vdupq_n_u32(0), 0), x0);
+		 */
+
+		buf += 64;
+
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+			y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+			y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+			y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+		y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+		crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index ec12be1bbc3..9a1656a2685 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -107,6 +107,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#ifdef __aarch64__
+
+#ifdef HAVE_ELF_AUX_INFO
+	unsigned long value;
+
+	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+		(value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+	return false;
+#endif
+
+#else
+	return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -114,11 +135,26 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+#if defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 	if (pg_crc32c_armv8_available())
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
 
+#elif defined(USE_ARMV8_CRC32C)
+	/*
+	 * On macOS, compilers may emit CRC instructions without extra CFLAGS, but
+	 * we can't use Linux-isms to detect CPU features, so set the plain CRC
+	 * implementation here as the fallback in case PMULL is unavailable.
+	 */
+	pg_comp_crc32c = pg_comp_crc32c_armv8;
+#endif
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+	if (pg_pmull_available())
+		pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+
 	return pg_comp_crc32c(crc, data, len);
 }
 
-- 
2.49.0

From a0aad5bf044170e6a8ceaf368fcfff68177cb9fb Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 14 May 2025 01:59:41 +0700
Subject: [PATCH v1 3/3] WIP: Attempt alignment preamble better suited to WAL

---
 src/port/pg_crc32c_armv8.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index f67de2016b4..da6116efa21 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -119,6 +119,24 @@ pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
 	const char *buf = data;
 
 	/* Align to 16 bytes to prevent straddling cacheline boundaries. */
+#if 1
+
+	/*
+	 * WIP: WAL is 4-byte aligned, so in that case the first loop will be
+	 * skipped. Is this better?
+	 */
+	for (; len && ((uintptr_t) buf & 3); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	while (((uintptr_t) buf & 12) && len >= 4)
+	{
+		crc0 = __crc32cw(crc0, *(const uint32_t *) buf);
+		buf += 4;
+		len -= 4;
+	}
+#else
+	/* original */
 	for (; len && ((uintptr_t) buf & 7); --len)
 	{
 		crc0 = __crc32cb(crc0, *buf++);
@@ -129,6 +147,7 @@ pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
 		buf += 8;
 		len -= 8;
 	}
+#endif
 
 	if (len >= 64)
 	{
-- 
2.49.0
