Re: CRC32C Parallel Computation Optimization on ARM

John Naylor Tue, 18 Mar 2025 04:51:34 -0700

On Thu, Mar 13, 2025 at 12:50 AM Devulapalli, Raghuveer
<raghuveer.devulapa...@intel.com> wrote:
>
> > > Intel has contributed SSE4.2 CRC32C [1] and AVX-512 CRC32C [2] based on
> > > similar techniques to postgres.
> >
> > ...this is a restatement of facts we already know. I'm guessing the intended
> > takeaway is "since Intel submitted an implementation to us based on paper A,
> > then we are free to separately also use a technique from paper B (which
> > cites patents)".
>
> Yes.
>
> > The original proposal that started this thread is below, and I'd like to
> > give that author credit for initiating that work
>
> Yup, that should be fine.


Thank you for confirming. I've attached v10, which has mostly
polishing and comment writing, and a draft commit message. The lookup
table and software carryless multiplication routine are still in
pg_crc32c_sb8.c, which is now built unconditionally. That's good
foreshadowing of future pclmul/pmull support, as I've found building
that file everywhere makes some things simpler anyway. That file name
has become a bit of a misnomer, and I've thought of renaming it to
*_common.c or perhaps *_fallback.c, since the addition from this
patch is still kind of a fallback where we won't have the hardware
needed for faster algorithms, as discussed elsewhere.

0002-3 puts the relevant parts into a header so that the hardware
details can be abstracted away. These would be squashed, but I've kept
them separate here for comparison.

--
John Naylor
Amazon Web Services

From f246bba2d9090116f8914c20114ecc3f4b9daeea Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 18 Mar 2025 12:56:32 +0700
Subject: [PATCH v10 2/3] Use template file for parallel CRC computation

---
 src/port/pg_crc32c_armv8.c    | 65 ++++++++++++++++------------------
 src/port/pg_crc32c_parallel.h | 66 +++++++++++++++++++++++++++++++++++
 src/port/pg_crc32c_sb8.c      |  2 ++
 src/port/pg_crc32c_sse42.c    | 65 +++++++---------------------------
 4 files changed, 112 insertions(+), 86 deletions(-)
 create mode 100644 src/port/pg_crc32c_parallel.h

diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 0265a2a13d7..4767406bef3 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -18,13 +18,42 @@
 
 #include "port/pg_crc32c.h"
 
+#define DEBUG_CRC				/* XXX not for commit */
+
+static pg_crc32c pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len);
+
+
 pg_crc32c
 pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	pg_crc32c	crc0 = crc;
+
+#ifdef DEBUG_CRC
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+#endif
+
+/* min size to compute multiple segments in parallel */
+#define MIN_PARALLEL_LENGTH 600
+
+#define PG_CRC32C_1B(c, w) __crc32cb(c, w)
+#define PG_CRC32C_8B(c, w) __crc32cd(c, w)
+#include "pg_crc32c_parallel.h"
+
+	crc0 = pg_comp_crc32c_armv8_tail(crc0, p, len);
+
+#ifdef DEBUG_CRC
+	Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
+#endif
+
+	return crc0;
+}
+
+static pg_crc32c
+pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
-	const size_t min_blocklen = 42; /* Min size to consider interleaving */
-	const pg_crc32c orig_crc = crc; // XXX not for commit
 
 	/*
 	 * ARMv8 doesn't require alignment, but aligned memory access is
@@ -50,36 +79,6 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		p += 4;
 	}
 
-	/* See pg_crc32c_sse42.c for explanation */
-	while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend)
-	{
-		const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER);
-		const uint64 *in64 = (const uint64 *) (p);
-		pg_crc32c	crc0 = crc,
-					crc1 = 0,
-					crc2 = 0;
-		uint64		mul0,
-					mul1,
-					precompute;
-
-		for (int i = 0; i < block_len; i++, in64++)
-		{
-			crc0 = __crc32cd(crc0, *(in64));
-			crc1 = __crc32cd(crc1, *(in64 + block_len));
-			crc2 = __crc32cd(crc2, *(in64 + block_len * 2));
-		}
-
-		precompute = combine_crc_lookup[block_len - 1];
-		mul0 = pg_clmul(crc0, (uint32) precompute);
-		mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
-
-		crc0 = __crc32cd(0, mul0);
-		crc1 = __crc32cd(0, mul1);
-		crc = crc0 ^ crc1 ^ crc2;
-
-		p += block_len * CRC_BYTES_PER_ITER;
-	}
-
 	/* Process eight bytes at a time, as far as we can. */
 	while (p + 8 <= pend)
 	{
@@ -103,7 +102,5 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		crc = __crc32cb(crc, *p);
 	}
 
-	// XXX not for commit
-	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len));
 	return crc;
 }
diff --git a/src/port/pg_crc32c_parallel.h b/src/port/pg_crc32c_parallel.h
new file mode 100644
index 00000000000..caee564726e
--- /dev/null
+++ b/src/port/pg_crc32c_parallel.h
@@ -0,0 +1,66 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_parallel.h
+ *	  Hardware-independent template for parallel CRC computation.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ *	  src/port/pg_crc32c_parallel.h
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * NB: no include guard.  This is a code fragment expanded inside a function
+ * body; the includer provides "p", "len" and "crc0" and must define
+ * MIN_PARALLEL_LENGTH, PG_CRC32C_1B and PG_CRC32C_8B beforehand.
+ */
+if (unlikely(len >= MIN_PARALLEL_LENGTH))
+{
+	/*
+	 * Align pointer regardless of architecture to avoid straddling cacheline
+	 * boundaries, since we issue three loads per loop iteration below.
+	 */
+	for (; (uintptr_t) p & 7; len--)
+		crc0 = PG_CRC32C_1B(crc0, *p++);
+
+	/*
+	 * A CRC instruction can be issued every cycle on many architectures, but
+	 * the latency of its result will take several cycles. We can take
+	 * advantage of this by dividing the input into 3 equal blocks and
+	 * computing the CRC of each independently.
+	 */
+	while (len >= MIN_PARALLEL_LENGTH)
+	{
+		const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
+									 len / CRC_BYTES_PER_ITER);
+		const uint64 *in64 = (const uint64 *) (p);
+		pg_crc32c	crc1 = 0,
+					crc2 = 0;
+		uint64		mul0,
+					mul1,
+					precompute;
+
+		for (int i = 0; i < block_len; i++, in64++)
+		{
+			crc0 = PG_CRC32C_8B(crc0, *(in64));
+			crc1 = PG_CRC32C_8B(crc1, *(in64 + block_len));
+			crc2 = PG_CRC32C_8B(crc2, *(in64 + block_len * 2));
+		}
+
+		/*
+		 * Combine the partial CRCs using carryless multiplication on
+		 * pre-computed length-specific constants.
+		 */
+		precompute = combine_crc_lookup[block_len - 1];
+		mul0 = pg_clmul(crc0, (uint32) precompute);
+		mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
+		crc0 = PG_CRC32C_8B(0, mul0);
+		crc0 ^= PG_CRC32C_8B(0, mul1);
+		crc0 ^= crc2;
+
+		p += block_len * CRC_BYTES_PER_ITER;
+		len -= block_len * CRC_BYTES_PER_ITER;
+	}
+}
diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c
index 004fe92d70b..d0ec8c5bed0 100644
--- a/src/port/pg_crc32c_sb8.c
+++ b/src/port/pg_crc32c_sb8.c
@@ -1169,6 +1169,8 @@ static const uint32 pg_crc32c_table[8][256] = {
 };
 
 
+/* platform-independent infrastructure for parallel CRC computation */
+
 /*
  * Carryless multiplication in software
  */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index f674d3f71d7..3fe8716601f 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -18,8 +18,7 @@
 
 #include "port/pg_crc32c.h"
 
-/* min size to compute multiple segments in parallel */
-#define MIN_PARALLEL_LENGTH 600
+#define DEBUG_CRC				/* XXX not for commit */
 
 static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len);
 
@@ -31,64 +30,26 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	const unsigned char *p = data;
 	pg_crc32c	crc0 = crc;
 
-	/* XXX not for commit */
+#ifdef DEBUG_CRC
 	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+#endif
 
 #if SIZEOF_VOID_P >= 8
-	if (unlikely(len >= MIN_PARALLEL_LENGTH))
-	{
-		/*
-		 * Align pointer to avoid straddling cacheline boundaries, since we
-		 * issue three loads per loop iteration below.
-		 */
-		for (; (uintptr_t) p & 7; len--)
-			crc0 = _mm_crc32_u8(crc0, *p++);
-
-		/*
-		 * A CRC instruction can be issued every cycle but the latency of its
-		 * result will take several cycles. We can take advantage of this by
-		 * dividing the input into 3 equal blocks and computing the CRC of
-		 * each independently.
-		 */
-		while (len >= MIN_PARALLEL_LENGTH)
-		{
-			const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
-										 len / CRC_BYTES_PER_ITER);
-			const uint64 *in64 = (const uint64 *) (p);
-			pg_crc32c	crc1 = 0,
-						crc2 = 0;
-			uint64		mul0,
-						mul1,
-						precompute;
-
-			for (int i = 0; i < block_len; i++, in64++)
-			{
-				crc0 = _mm_crc32_u64(crc0, *(in64));
-				crc1 = _mm_crc32_u64(crc1, *(in64 + block_len));
-				crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2));
-			}
-
-			/*
-			 * Combine the partial CRCs using carryless multiplication on
-			 * pre-computed length-specific constants.
-			 */
-			precompute = combine_crc_lookup[block_len - 1];
-			mul0 = pg_clmul(crc0, (uint32) precompute);
-			mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
-			crc0 = _mm_crc32_u64(0, mul0);
-			crc0 ^= _mm_crc32_u64(0, mul1);
-			crc0 ^= crc2;
-
-			p += block_len * CRC_BYTES_PER_ITER;
-			len -= block_len * CRC_BYTES_PER_ITER;
-		}
-	}
+
+/* min size to compute multiple segments in parallel */
+#define MIN_PARALLEL_LENGTH 600
+
+#define PG_CRC32C_1B(c, w) _mm_crc32_u8(c, w)
+#define PG_CRC32C_8B(c, w) _mm_crc32_u64(c, w)
+#include "pg_crc32c_parallel.h"
+
 #endif
 
 	crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len);
 
-	/* XXX not for commit */
+#ifdef DEBUG_CRC
 	Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
+#endif
 
 	return crc0;
 }
-- 
2.48.1

From 2306c2a94e1610b76a1b7584bbaea63d8bf57302 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 11 Dec 2024 11:30:33 +0700
Subject: [PATCH v10 1/3] Execute hardware CRC computation in parallel

CRC computations on the current input word depend not only on that
input, but also on the CRC of the previous input. This means that
the speed is limited by the latency of the CRC instruction.

Most modern CPUs can start executing a new CRC instruction before a
currently executing one has finished, i.e. the reciprocal throughput
is lower than latency. By computing partial CRCs of non-overlapping
segments of the input, we can achieve the full throughput that the
CPU is capable of. To preserve the correctness of the result, however,
we must recombine the partial results using carryless multiplication
with constants specific to the input length. We get these from a
lookup table of pre-computed CRCs. Because of the overhead of the
recombination step, parallelism is only faster with inputs of at
least a few hundred bytes.

For now we only implement parallelism for x86 and Arm. It might be
worthwhile to apply this technique to LoongArch, depending on the
throughput of CRC on that platform.

XXX The lookup table and supporting code are found in pg_crc32c_sb8.c,
which is now built unconditionally on all platforms. Perhaps
s/sb8/common/ ?

This technique originated from the Intel white paper "Fast CRC
Computation for iSCSI Polynomial Using CRC32 Instruction", by Vinodh
Gopal et al, 2011. Thanks to Raghuveer Devulapalli for assistance in
verifying the usability of this technique from a legal perspective.

Xiang Gao's original proposal was specific to the Arm architecture,
computed in fixed-size chunks of 1024 bytes, and required hardware
support for carryless multiplication. I added support for x86 and
a wider range of chunk sizes, and switched to pure C for carryless
multiplication. The portability of the latter is important for two
reasons: 1) We may want to use this technique on architectures that
don't have hardware carryless multiplication and 2) This is intended as
a fallback, since if hardware carryless multiplication is available,
there are other algorithms that are useful on much smaller inputs
than this one.

Author: Xiang Gao <xiang....@arm.com>
Author: John Naylor <johncnaylo...@gmail.com>
Reviewed-by: Nathan Bossart <nathandboss...@gmail.com>
Discussion: https://postgr.es/m/db9pr08mb6991329a73923bf8ed4b3422f5...@db9pr08mb6991.eurprd08.prod.outlook.com
---
 configure                    |   5 +-
 configure.ac                 |   5 +-
 src/include/port/pg_crc32c.h |  12 ++++
 src/port/Makefile            |   1 +
 src/port/meson.build         |   6 +-
 src/port/pg_crc32c_armv8.c   |  34 +++++++++++
 src/port/pg_crc32c_sb8.c     | 111 +++++++++++++++++++++++++++++++++++
 src/port/pg_crc32c_sse42.c   |  77 +++++++++++++++++++++++-
 8 files changed, 239 insertions(+), 12 deletions(-)

diff --git a/configure b/configure
index 93fddd69981..3403bb0c931 100755
--- a/configure
+++ b/configure
@@ -17692,7 +17692,7 @@ else
 
 $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
@@ -17708,7 +17708,7 @@ $as_echo "ARMv8 CRC instructions" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
@@ -17723,7 +17723,6 @@ $as_echo "LoongArch CRCC instructions" >&6; }
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
           { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
         fi
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..855949b7d74 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2156,7 +2156,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
 else
   if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
     AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
@@ -2166,7 +2166,7 @@ else
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
         AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
         if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
@@ -2175,7 +2175,6 @@ else
           AC_MSG_RESULT(LoongArch CRCC instructions)
         else
           AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
           AC_MSG_RESULT(slicing-by-8)
         fi
       fi
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..e6c149c71f1 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -41,6 +41,8 @@ typedef uint32 pg_crc32c;
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
 #define COMP_CRC32C(crc, data, len) \
@@ -107,4 +109,14 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len)
 
 #endif
 
+/* semi-private to files in src/port that compute CRCs in parallel */
+
+#define CRC_BYTES_PER_ITER (3 * sizeof(uint64))
+/* for parallel computation, max number of words per block for recombination */
+#define CRC_MAX_BLOCK_LEN 350
+
+extern PGDLLIMPORT const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN];
+
+extern uint64 pg_clmul(uint32 a, uint32 b);
+
 #endif							/* PG_CRC32C_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c224319512..a8e2467a866 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_crc32c_sb8.o \
 	pg_popcount_avx512.o \
 	pg_strong_random.o \
 	pgcheckdir.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d43..8aed1de2d1d 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_crc32c_sb8.c',
   'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
@@ -84,19 +85,14 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
-  ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
-  ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
   # loongarch
   ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
-
-  # generic fallback
-  ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
 pgport_cflags = {'crc': cflags_crc}
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..0265a2a13d7 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -23,6 +23,8 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
+	const size_t min_blocklen = 42; /* Min size to consider interleaving */
+	const pg_crc32c orig_crc = crc; // XXX not for commit
 
 	/*
 	 * ARMv8 doesn't require alignment, but aligned memory access is
@@ -48,6 +50,36 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		p += 4;
 	}
 
+	/* See pg_crc32c_sse42.c for explanation */
+	while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend)
+	{
+		const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER);
+		const uint64 *in64 = (const uint64 *) (p);
+		pg_crc32c	crc0 = crc,
+					crc1 = 0,
+					crc2 = 0;
+		uint64		mul0,
+					mul1,
+					precompute;
+
+		for (int i = 0; i < block_len; i++, in64++)
+		{
+			crc0 = __crc32cd(crc0, *(in64));
+			crc1 = __crc32cd(crc1, *(in64 + block_len));
+			crc2 = __crc32cd(crc2, *(in64 + block_len * 2));
+		}
+
+		precompute = combine_crc_lookup[block_len - 1];
+		mul0 = pg_clmul(crc0, (uint32) precompute);
+		mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
+
+		crc0 = __crc32cd(0, mul0);
+		crc1 = __crc32cd(0, mul1);
+		crc = crc0 ^ crc1 ^ crc2;
+
+		p += block_len * CRC_BYTES_PER_ITER;
+	}
+
 	/* Process eight bytes at a time, as far as we can. */
 	while (p + 8 <= pend)
 	{
@@ -71,5 +103,7 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 		crc = __crc32cb(crc, *p);
 	}
 
+	// XXX not for commit
+	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len));
 	return crc;
 }
diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c
index 19659d186a0..004fe92d70b 100644
--- a/src/port/pg_crc32c_sb8.c
+++ b/src/port/pg_crc32c_sb8.c
@@ -1167,3 +1167,114 @@ static const uint32 pg_crc32c_table[8][256] = {
 	}
 #endif							/* WORDS_BIGENDIAN */
 };
+
+
+/*
+ * Carryless multiplication in software
+ */
+uint64
+pg_clmul(uint32 a, uint32 b)
+{
+	uint64		result = 0;
+
+	for (uint32 i = 0; i < 32; i++)
+		if ((a >> i) & 1)
+			result ^= (uint64) b << i;
+
+	return result;
+}
+
+/*
+ * Lookup table for combining partial CRC computations
+ */
+const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN] =
+{
+	0x00000001493c7d27, 0x493c7d27ba4fc28e, 0xf20c0dfeddc0152b, 0xba4fc28e9e4addf8,
+	0x3da6d0cb39d3b296, 0xddc0152b0715ce53, 0x1c291d0447db8317, 0x9e4addf80d3b6092,
+	0x740eef02c96cfdc0, 0x39d3b296878a92a7, 0x083a6eecdaece73e, 0x0715ce53ab7aff2a,
+	0xc49f4f672162d385, 0x47db831783348832, 0x2ad91c30299847d5, 0x0d3b6092b9e02b86,
+	0x6992cea218b33a4e, 0xc96cfdc0b6dd949b, 0x7e90804878d9ccb7, 0x878a92a7bac2fd7b,
+	0x1b3d8f29a60ce07b, 0xdaece73ece7f39f4, 0xf1d0f55e61d82e56, 0xab7aff2ad270f1a2,
+	0xa87ab8a8c619809d, 0x2162d3852b3cac5d, 0x8462d80065863b64, 0x833488321b03397f,
+	0x71d111a8ebb883bd, 0x299847d5b3e32c28, 0xffd852c6064f7f26, 0xb9e02b86dd7e3b0c,
+	0xdcb17aa4f285651c, 0x18b33a4e10746f3c, 0xf37c5aeec7a68855, 0xb6dd949b271d9844,
+	0x6051d5a28e766a0c, 0x78d9ccb793a5f730, 0x18b0d4ff6cb08e5c, 0xbac2fd7b6b749fb2,
+	0x21f3d99c1393e203, 0xa60ce07bcec3662e, 0x8f15801496c515bb, 0xce7f39f4e6fc4e6a,
+	0xa00457f78227bb8a, 0x61d82e56b0cd4768, 0x8d6d2c4339c7ff35, 0xd270f1a2d7a4825c,
+	0x00ac29cf0ab3844b, 0xc619809d0167d312, 0xe9adf796f6076544, 0x2b3cac5d26f6a60a,
+	0x96638b34a741c1bf, 0x65863b6498d8d9cb, 0xe0e9f35149c3cc9c, 0x1b03397f68bce87a,
+	0x9af01f2d57a3d037, 0xebb883bd6956fc3b, 0x2cff42cf42d98888, 0xb3e32c283771e98f,
+	0x88f25a3ab42ae3d9, 0x064f7f262178513a, 0x4e36f0b0e0ac139e, 0xdd7e3b0c170076fa,
+	0xbd6f81f8444dd413, 0xf285651c6f345e45, 0x91c9bd4b41d17b64, 0x10746f3cff0dba97,
+	0x885f087ba2b73df1, 0xc7a68855f872e54c, 0x4c1449321e41e9fc, 0x271d984486d8e4d2,
+	0x52148f02651bd98b, 0x8e766a0c5bb8f1bc, 0xa3c6f37aa90fd27a, 0x93a5f730b3af077a,
+	0xd7c0557f4984d782, 0x6cb08e5cca6ef3ac, 0x63ded06a234e0b26, 0x6b749fb2dd66cbbb,
+	0x4d56973c4597456a, 0x1393e203e9e28eb4, 0x9669c9df7b3ff57a, 0xcec3662ec9c8b782,
+	0xe417f38a3f70cc6f, 0x96c515bb93e106a4, 0x4b9e0f7162ec6c6d, 0xe6fc4e6ad813b325,
+	0xd104b8fc0df04680, 0x8227bb8a2342001e, 0x5b3977300a2a8d7e, 0xb0cd47686d9a4957,
+	0xe78eb416e8b6368b, 0x39c7ff35d2c3ed1a, 0x61ff0e01995a5724, 0xd7a4825c9ef68d35,
+	0x8d96551c0c139b31, 0x0ab3844bf2271e60, 0x0bf80dd20b0bf8ca, 0x0167d3122664fd8b,
+	0x8821abeded64812d, 0xf607654402ee03b2, 0x6a45d2b28604ae0f, 0x26f6a60a363bd6b3,
+	0xd8d26619135c83fd, 0xa741c1bf5fabe670, 0xde87806c35ec3279, 0x98d8d9cb00bcf5f6,
+	0x143387548ae00689, 0x49c3cc9c17f27698, 0x5bd2011f58ca5f00, 0x68bce87aaa7c7ad5,
+	0xdd07448eb5cfca28, 0x57a3d037ded288f8, 0xdde8f5b959f229bc, 0x6956fc3b6d390dec,
+	0xa3e3e02c37170390, 0x42d988886353c1cc, 0xd73c7beac4584f5c, 0x3771e98ff48642e9,
+	0x80ff0093531377e2, 0xb42ae3d9dd35bc8d, 0x8fe4c34db25b29f2, 0x2178513a9a5ede41,
+	0xdf99fc11a563905d, 0xe0ac139e45cddf4e, 0x6c23e841acfa3103, 0x170076faa51b6135,
+	0xfe314258dfd94fb2, 0x444dd41380f2886b, 0x0d8373a067969a6a, 0x6f345e45021ac5ef,
+	0x19e3635ee8310afa, 0x41d17b6475451b04, 0x29f268b48e1450f7, 0xff0dba97cbbe4ee1,
+	0x1dc0632a3a83de21, 0xa2b73df1e0cdcf86, 0x1614f396453c1679, 0xf872e54cdefba41c,
+	0x9e2993d3613eee91, 0x1e41e9fcddaf5114, 0x6bebd73c1f1dd124, 0x86d8e4d2bedc6ba1,
+	0x63ae91e6eca08ffe, 0x651bd98b3ae30875, 0xf8c9da7a0cd1526a, 0x5bb8f1bcb1630f04,
+	0x945a19c1ff47317b, 0xa90fd27ad6c3a807, 0xee8213b79a7781e0, 0xb3af077a63d097e9,
+	0x93781dc71d31175f, 0x4984d78294eb256e, 0xccc4a1b913184649, 0xca6ef3ac4be7fd90,
+	0xa2c2d9717d5c1d64, 0x234e0b2680ba859a, 0x1cad44526eeed1c9, 0xdd66cbbb22c3799f,
+	0x74922601d8ecc578, 0x4597456ab3a6da94, 0xc55f7eabcaf933fe, 0xe9e28eb450bfaade,
+	0xa19623292e7d11a7, 0x7b3ff57a7d14748f, 0x2d37074932d8041c, 0xc9c8b782889774e1,
+	0x397d84a16cc8a0ff, 0x3f70cc6f5aa1f3cf, 0x791132708a074012, 0x93e106a433bc58b3,
+	0xbc8178039f2b002a, 0x62ec6c6dbd0bb25f, 0x88eb3c0760bf0a6a, 0xd813b3258515c07f,
+	0x6e4cb6303be3c09b, 0x0df04680d8440525, 0x71971d5c682d085d, 0x2342001e465a4eee,
+	0xf33b8bc628b5de82, 0x0a2a8d7e077d54e0, 0x9fb3bbc02e5f3c8c, 0x6d9a4957c00df280,
+	0x6ef22b23d0a37f43, 0xe8b6368ba52f58ec, 0xce2df76800712e86, 0xd2c3ed1ad6748e82,
+	0xe53a4fc747972100, 0x995a572451aeef66, 0xbe60a91a71900712, 0x9ef68d35359674f7,
+	0x1dfa0a15647fbd15, 0x0c139b311baaa809, 0x8ec52396469aef86, 0xf2271e6086d42d06,
+	0x0e766b114aba1470, 0x0b0bf8ca1c2cce0a, 0x475846a4aa0cd2d3, 0x2664fd8bf8448640,
+	0xb2a3dfa6ac4fcdec, 0xed64812de81cf154, 0xdc1a160cc2c7385c, 0x02ee03b295ffd7dc,
+	0x79afdf1c91de6176, 0x8604ae0f84ee89ac, 0x07ac6e46533e308d, 0x363bd6b35f0e0438,
+	0x15f85253604d6e09, 0x135c83fdaeb3e622, 0x1bec24dd4263eb04, 0x5fabe67050c2cb16,
+	0x4c36cd5b6667afe7, 0x35ec32791a6889b8, 0xe0a22e29de42c92a, 0x00bcf5f67f47463d,
+	0x7c2b6ed9b82b6080, 0x8ae00689828d550b, 0x06ff88fddca2b4da, 0x17f276984ac726eb,
+	0xf7317cf0529295e6, 0x58ca5f005e9f28eb, 0x61b6e40b40c14fff, 0xaa7c7ad596a1f19b,
+	0xde8a97f8997157e1, 0xb5cfca28b0ed8196, 0x88f61445097e41e6, 0xded288f84ce8bfe5,
+	0xd4520e9ee36841ad, 0x59f229bcd1a9427c, 0x0c592bd593f3319c, 0x6d390decb58ac6fe,
+	0x38edfaf3e3809241, 0x37170390f22fd3e2, 0x72cbfcdb83c2df88, 0x6353c1ccd6b1825a,
+	0x348331a54e4ff232, 0xc4584f5c6664d9c1, 0xc3977c19836b5a6e, 0xf48642e923d5e7e5,
+	0xdafaea7c65065343, 0x531377e21495d20d, 0x73db4c04a29c82eb, 0xdd35bc8df370b37f,
+	0x72675ce8ea6dd7dc, 0xb25b29f2e9415bce, 0x3ec2ff8396309b0f, 0x9a5ede41c776b648,
+	0xe8c7a017c22c52c5, 0xa563905dcecfcd43, 0xcf4bfaefd8311ee7, 0x45cddf4e24e6fe8f,
+	0x6bde1ac7d0c6d7c9, 0xacfa310345aa5d4a, 0xae1175c2cf067065, 0xa51b613582f89c77,
+	0xf7506984a348c84e, 0xdfd94fb2d07737ea, 0xe0863e5636069dd2, 0x80f2886bc4cedd32,
+	0xd7e661ae9a97be47, 0x67969a6af45cd585, 0x01afc14f93f36e2b, 0x021ac5ef195bc82d,
+	0xd2fd8e3ce622aaca, 0xe8310afa23912612, 0xc4eb27b2a1fd0859, 0x75451b04a2edbd17,
+	0x632098732cefbfdd, 0x8e1450f7f36d84e2, 0xf29971cf9664532d, 0xcbbe4ee1cfeff4b3,
+	0xaf6939d96737eead, 0x3a83de21f52d28d3, 0x650ef6c5fb3bb2c8, 0xe0cdcf864a9d4498,
+	0x36e108faaef471c1, 0x453c16790d08bb68, 0x09c20a6c3b6c03be, 0xdefba41c4de20a7c,
+	0x0a1a6a8877792405, 0x613eee91b95a9eb0, 0x286d109b11f2bc8f, 0xddaf51147956e76a,
+	0x9fd51b88032a8058, 0x1f1dd1241b93589c, 0x4860285dcc66546f, 0xbedc6ba1005bb964,
+	0x6e221adc28198362, 0xeca08ffe3f2e57b1, 0x0e0a10735f54bb14, 0x3ae3087599e44d01,
+	0x37f194212591f073, 0x0cd1526a0871bd30, 0xe9bbf6481fb48d12, 0xb1630f043888ed03,
+	0x0fa0277f1e22167e, 0xff47317bd272cadf, 0xeb2fb89a8653215c, 0xd6c3a807f6e6d69e,
+	0x1c47ed30b9f5bf62, 0x9a7781e09dde6145, 0x271cfb40ed49c4d3, 0x63d097e91ad321bc,
+	0xea1cb6e7f206e4b8, 0x1d31175f60165873, 0x9f737f83159dff70, 0x94eb256ee1a468d0,
+	0xd4619bbcda11f51b, 0x1318464993189d18, 0x794dd0f2ac4f4691, 0x4be7fd9087d07ae0,
+	0xb6d42cd90db9f589, 0x7d5c1d64c51a240c, 0x8b9be230ab819fbb, 0x80ba859a5cc19671,
+	0xd4617a4c46183f0a, 0x6eeed1c993e7c448, 0xb9f93bd067fe6e36, 0x22c3799fca110698,
+	0xb8b67c1c8a2acc83, 0xd8ecc578eb75a090, 0xc5ca433a3e18bd99, 0xb3a6da94480e7e4d,
+	0x5e5dcd9560bced33, 0xcaf933fe0bf69a3f, 0x7b589372dbf59471, 0x50bfaadea00bae3d,
+	0x0afb7f3b4b7df256, 0x2e7d11a77959fe2a, 0x0f97c69068e3d179, 0x7d14748fa160e585,
+	0x3e254fe4cac39a0b, 0x32d8041cf4229b7a, 0x141e8512007ca0f9, 0x889774e10e0126b2,
+	0xe5e25bd082fe946e, 0x6cc8a0ff013b3856, 0xacf1231667c69966, 0x5aa1f3cffa0f2bd0,
+	0x7b454cb35d4c91fc, 0x8a07401279d95f64, 0x311709b8807121c0, 0x33bc58b3b0a3f16d,
+	0x9948a7d2618e0996, 0x9f2b002a2308bca9, 0x809cef1f343272b3, 0xbd0bb25ff5c40599,
+	0x11dde5b740c1c64c, 0x60bf0a6a71cc89e8,
+};
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..f674d3f71d7 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -18,10 +18,85 @@
 
 #include "port/pg_crc32c.h"
 
-pg_attribute_no_sanitize_alignment()
+/* min size to compute multiple segments in parallel */
+#define MIN_PARALLEL_LENGTH 600
+
+static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len);
+
+
 pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	pg_crc32c	crc0 = crc;
+
+	/* XXX not for commit */
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+
+#if SIZEOF_VOID_P >= 8
+	if (unlikely(len >= MIN_PARALLEL_LENGTH))
+	{
+		/*
+		 * Align pointer to avoid straddling cacheline boundaries, since we
+		 * issue three loads per loop iteration below.
+		 */
+		for (; (uintptr_t) p & 7; len--)
+			crc0 = _mm_crc32_u8(crc0, *p++);
+
+		/*
+		 * A CRC instruction can be issued every cycle but the latency of its
+		 * result will take several cycles. We can take advantage of this by
+		 * dividing the input into 3 equal blocks and computing the CRC of
+		 * each independently.
+		 */
+		while (len >= MIN_PARALLEL_LENGTH)
+		{
+			const size_t block_len = Min(CRC_MAX_BLOCK_LEN,
+										 len / CRC_BYTES_PER_ITER);
+			const uint64 *in64 = (const uint64 *) (p);
+			pg_crc32c	crc1 = 0,
+						crc2 = 0;
+			uint64		mul0,
+						mul1,
+						precompute;
+
+			for (int i = 0; i < block_len; i++, in64++)
+			{
+				crc0 = _mm_crc32_u64(crc0, *(in64));
+				crc1 = _mm_crc32_u64(crc1, *(in64 + block_len));
+				crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2));
+			}
+
+			/*
+			 * Combine the partial CRCs using carryless multiplication on
+			 * pre-computed length-specific constants.
+			 */
+			precompute = combine_crc_lookup[block_len - 1];
+			mul0 = pg_clmul(crc0, (uint32) precompute);
+			mul1 = pg_clmul(crc1, (uint32) (precompute >> 32));
+			crc0 = _mm_crc32_u64(0, mul0);
+			crc0 ^= _mm_crc32_u64(0, mul1);
+			crc0 ^= crc2;
+
+			p += block_len * CRC_BYTES_PER_ITER;
+			len -= block_len * CRC_BYTES_PER_ITER;
+		}
+	}
+#endif
+
+	crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len);
+
+	/* XXX not for commit */
+	Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len));
+
+	return crc0;
+}
+
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
+static pg_crc32c
+pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
-- 
2.48.1

From 93b46829b80d33be5afee1156dbdf074f097fbdd Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 18 Mar 2025 18:41:35 +0700
Subject: [PATCH v10 3/3] Fix headerscheck

---
 src/tools/pginclude/headerscheck | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck
index 9e86d049362..3883ec558e5 100755
--- a/src/tools/pginclude/headerscheck
+++ b/src/tools/pginclude/headerscheck
@@ -122,6 +122,9 @@ do
 	test "$f" = src/include/nodes/nodetags.h && continue
 	test "$f" = src/backend/nodes/nodetags.h && continue
 
+	# pg_crc32c_parallel.h contains just a code fragment
+	test "$f" = src/port/pg_crc32c_parallel.h && continue
+
 	# These files are not meant to be included standalone, because
 	# they contain lists that might have multiple use-cases.
 	test "$f" = src/include/access/rmgrlist.h && continue
-- 
2.48.1

Re: CRC32C Parallel Computation Optimization on ARM

Reply via email to