On Wed, Mar 5, 2025 at 10:52 PM Nathan Bossart <nathandboss...@gmail.com> wrote:
>
> On Wed, Mar 05, 2025 at 08:51:21AM +0700, John Naylor wrote:
> > That was my hunch too, but I wanted to be more sure, so I modified the
> > benchmark so it doesn't know the address of the next calculation until
> > it finishes the last calculation so we can hopefully see the latency
> > caused by indirection. It also does an additional calculation on
> > constant 20 bytes, like the WAL header. I also tweaked the length each
> > iteration so the branch predictor maybe has a harder time predicting
> > the constant 20 input. And to make it more challenging, I removed the
> > part that inlined all small inputs, so it inlines only constant
> > inputs:
>
> Would you mind sharing this test?

The test script is the same as here, except I only ran small lengths:

https://www.postgresql.org/message-id/CANWCAZahvhE-%2BhtZiUyzPiS5e45ukx5877mD-dHr-KSX6LcdjQ%40mail.gmail.com

...but I must have forgotten to attach the slightly tweaked patch set,
which I've done now. 0002 modifies the 0001 test module and 0006
reverts inlining non-constant input from 0005, just to see if I could
find a regression from indirection, which I didn't. If we don't need
it, it'd be better to avoid inlining loops to keep from bloating the
binary.

> It sounds like you are running a
> workload with a mix of constant/inlined calls and function pointer calls to
> simulate typical usage for WAL, but I'm not 100% sure I'm understanding you
> correctly.

Exactly.

--
John Naylor
Amazon Web Services
From c5cd6e44028eaf11efc2cf4fc49c87101b49c97f Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 5 Mar 2025 08:21:54 +0700
Subject: [PATCH v12 6/6] Only inline for constant input (partial revert)

---
 src/include/port/pg_crc32c.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 26b676dddc9..01192831ca3 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -66,12 +66,11 @@ static inline
 pg_crc32c
 pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 {
-	if (len < 64)
+	if (__builtin_constant_p(len) && len < 64)
 	{
 		/*
-		 * For small inputs, inline the computation to avoid the runtime
-		 * check. This also allows the compiler to unroll loops for constant
-		 * input.
+		 * For small constant inputs, inline the computation. This allows the
+		 * compiler to unroll loops.
 		 */
 		return pg_comp_crc32c_sse42_inline(crc, data, len);
 	}
-- 
2.48.1

From eafea75fc761fd51fa67311af794cf0f7dec40aa Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v12 4/6] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com>
Author: John Naylor <johncnaylo...@gmail.com>
Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com
---
 src/include/port/pg_crc32c.h      | 30 ++++++++---
 src/port/pg_crc32c_sse42.c        | 88 +++++++++++++++++++++++++++++++
 src/port/pg_crc32c_sse42_choose.c | 26 ++++-----
 3 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 5ccc79295c0..fe0e1b6b275 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -37,6 +37,11 @@
 
 typedef uint32 pg_crc32c;
 
+/* WIP: configure checks */
+#ifdef __x86_64__
+#define USE_PCLMUL_WITH_RUNTIME_CHECK
+#endif
+
 /* The INIT and EQ macros are the same for all implementations. */
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
@@ -68,6 +73,23 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 		return pg_comp_crc32c_sse42(crc, data, len);
 }
 
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel SSE 4.2 or PCLMUL instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
+#endif
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
@@ -86,7 +108,7 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
  * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
@@ -98,13 +120,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
 
 #else
 /*
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 6a35f7fdc67..b56da2f6934 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,6 +15,7 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 #include "port/pg_crc32c_sse42_impl.h"
@@ -26,3 +27,90 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
 	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
+
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4e */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+pg_attribute_target("sse4.2,pclmul")
+pg_crc32c
+pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
+	const char *buf = data;
+
+	// This prolog is trying to avoid loads straddling
+	// cache lines, but it doesn't seem worth it if
+	// we're trying to be fast on small inputs as well
+#if 0
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = _mm_crc32_u8(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+#endif
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_sse42_inline(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d4249..abea0f90eb3 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -30,8 +30,12 @@
 
 #include "port/pg_crc32c.h"
 
-static bool
-pg_crc32c_sse42_available(void)
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
@@ -43,18 +47,14 @@ pg_crc32c_sse42_available(void)
 #error cpuid instruction not available
 #endif
 
-	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
-}
-
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
-	if (pg_crc32c_sse42_available())
+	if ((exx[2] & (1 << 20)) != 0)	/* SSE 4.2 */
+	{
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+		if ((exx[2] & (1 << 1)) != 0)	/* PCLMUL */
+			pg_comp_crc32c = pg_comp_crc32c_pclmul;
+#endif
+	}
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
 
-- 
2.48.1

From ebbd072d558574f78bd4489c3431a13fd831f254 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 28 Feb 2025 16:27:30 +0700
Subject: [PATCH v12 3/6] Inline CRC computation for small fixed-length input

---
 src/include/port/pg_crc32c.h            | 21 ++++++-
 src/include/port/pg_crc32c_sse42_impl.h | 74 +++++++++++++++++++++++++
 src/port/pg_crc32c_sse42.c              | 46 +--------------
 3 files changed, 96 insertions(+), 45 deletions(-)
 create mode 100644 src/include/port/pg_crc32c_sse42_impl.h

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..5ccc79295c0 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -43,12 +43,31 @@ typedef uint32 pg_crc32c;
 
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
+
+#include "pg_crc32c_sse42_impl.h"
+
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	if (__builtin_constant_p(len) && len < 64)
+	{
+		/*
+		 * For small constant inputs, inline the computation. This allows the
+		 * compiler to unroll loops.
+		 */
+		return pg_comp_crc32c_sse42_inline(crc, data, len);
+	}
+	else
+		return pg_comp_crc32c_sse42(crc, data, len);
+}
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
diff --git a/src/include/port/pg_crc32c_sse42_impl.h b/src/include/port/pg_crc32c_sse42_impl.h
new file mode 100644
index 00000000000..e10ad777618
--- /dev/null
+++ b/src/include/port/pg_crc32c_sse42_impl.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_sse42_impl.h
+ *	  Inline implementation of CRC computation using SSE 4.2
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_crc32c_sse42_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CRC32C_SSE42_IMPL_H
+#define PG_CRC32C_SSE42_IMPL_H
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
+static inline
+pg_crc32c
+pg_comp_crc32c_sse42_inline(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+#ifdef __x86_64__
+	while (p + 8 <= pend)
+	{
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#else
+
+	/*
+	 * Process four bytes at a time. (The eight byte instruction is not
+	 * available on the 32-bit x86 architecture).
+	 */
+	while (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#endif							/* __x86_64__ */
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+#endif							/* PG_CRC32C_SSE42_IMPL_H */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..6a35f7fdc67 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -17,54 +17,12 @@
 #include <nmmintrin.h>
 
 #include "port/pg_crc32c.h"
+#include "port/pg_crc32c_sse42_impl.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
-	const unsigned char *p = data;
-	const unsigned char *pend = p + len;
-
-	/*
-	 * Process eight bytes of data at a time.
-	 *
-	 * NB: We do unaligned accesses here. The Intel architecture allows that,
-	 * and performance testing didn't show any performance gain from aligning
-	 * the begin address.
-	 */
-#ifdef __x86_64__
-	while (p + 8 <= pend)
-	{
-		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
-		p += 8;
-	}
-
-	/* Process remaining full four bytes if any */
-	if (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#else
-
-	/*
-	 * Process four bytes at a time. (The eight byte instruction is not
-	 * available on the 32-bit x86 architecture).
-	 */
-	while (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#endif							/* __x86_64__ */
-
-	/* Process any remaining bytes one at a time. */
-	while (p < pend)
-	{
-		crc = _mm_crc32_u8(crc, *p);
-		p++;
-	}
-
-	return crc;
+	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
-- 
2.48.1

From e38654507d2efad4b5ad75548e0c388c3db9cfe5 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 28 Feb 2025 18:27:40 +0700
Subject: [PATCH v12 5/6] Use runtime check even when we have SSE 4.2 at
 compile time

This allows us to use PCLMUL for longer inputs. Short inputs are
inlined to avoid the indirection through a function pointer.
---
 configure                         |  2 +-
 configure.ac                      |  2 +-
 src/include/port/pg_crc32c.h      | 15 +++++++++++----
 src/port/meson.build              |  1 +
 src/port/pg_crc32c_sse42_choose.c |  2 ++
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/configure b/configure
index 93fddd69981..91c0ffc8272 100755
--- a/configure
+++ b/configure
@@ -17684,7 +17684,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
 
 $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
 
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..a85bdbd4ff6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2151,7 +2151,7 @@ fi
 AC_MSG_CHECKING([which CRC-32C implementation to use])
 if test x"$USE_SSE42_CRC32C" = x"1"; then
   AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
   AC_MSG_RESULT(SSE 4.2)
 else
   if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index fe0e1b6b275..26b676dddc9 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -55,22 +55,29 @@ typedef uint32 pg_crc32c;
 	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 static inline
 pg_crc32c
 pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 {
-	if (__builtin_constant_p(len) && len < 64)
+	if (len < 64)
 	{
 		/*
-		 * For small constant inputs, inline the computation. This allows the
-		 * compiler to unroll loops.
+		 * For small inputs, inline the computation to avoid the runtime
+		 * check. This also allows the compiler to unroll loops for constant
+		 * input.
 		 */
 		return pg_comp_crc32c_sse42_inline(crc, data, len);
 	}
 	else
-		return pg_comp_crc32c_sse42(crc, data, len);
+		/* For larger inputs, use a runtime check for PCLMUL instructions. */
+		return pg_comp_crc32c(crc, data, len);
 }
 
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d43..8d70a4d510e 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -83,6 +83,7 @@ replace_funcs_pos = [
   # x86/x64
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index abea0f90eb3..89a48c76894 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -55,8 +55,10 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 			pg_comp_crc32c = pg_comp_crc32c_pclmul;
 #endif
 	}
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.48.1

From 7a9b94677da30db0f8c296fe71037f65b157bc1c Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 5 Mar 2025 07:52:52 +0700
Subject: [PATCH v12 2/6] Attempt to make benchmark more sensitive to latency

---
 contrib/test_crc32c/test_crc32c.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c
index 28bc42de314..3e5ebad4e39 100644
--- a/contrib/test_crc32c/test_crc32c.c
+++ b/contrib/test_crc32c/test_crc32c.c
@@ -21,13 +21,13 @@ drive_crc32c(PG_FUNCTION_ARGS)
 {
 	int64			count	= PG_GETARG_INT32(0);
 	int64			num		= PG_GETARG_INT32(1);
-	char*		data	= malloc((size_t)num);
-	pg_crc32c crc;
+	char*		data	= malloc((size_t)num + 256);
+	pg_crc32c crc = 0;
 	pg_prng_state state;
 	uint64 seed = 42;
 	pg_prng_seed(&state, seed);
 	/* set random data */
-	for (uint64 i = 0; i < num; i++)
+	for (uint64 i = 0; i < num + 256; i++)
 	{
 		data[i] = pg_prng_uint32(&state) % 255;
 	}
@@ -36,11 +36,15 @@ drive_crc32c(PG_FUNCTION_ARGS)
 
 	while(count--)
 	{
-		INIT_CRC32C(crc);
-		COMP_CRC32C(crc, data, num);
-		FIN_CRC32C(crc);
+		size_t delta = crc & 7;
+
+		/* make both pointer and length unpredictable */
+		COMP_CRC32C(crc, data + delta, num + delta);
+		/* simulate WAL header */
+		COMP_CRC32C(crc, data + delta, 20);
 	}
 
+	FIN_CRC32C(crc);
 	free((void *)data);
 
 	PG_RETURN_INT64((int64_t)crc);
-- 
2.48.1

From 2d8b2ad3e967231d1498953f6563b36b94977445 Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amon...@intel.com>
Date: Mon, 6 May 2024 08:34:17 -0700
Subject: [PATCH v12 1/6] Add a Postgres SQL function for crc32c benchmarking

Add a drive_crc32c() function to use for benchmarking crc32c
computation. The function takes 2 arguments:

(1) count: num of times CRC32C is computed in a loop.
(2) num: #bytes in the buffer to calculate crc over.

XXX not for commit

Extracted from a patch by  Raghuveer Devulapalli
---
 contrib/meson.build                          |  1 +
 contrib/test_crc32c/Makefile                 | 20 +++++++
 contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++
 contrib/test_crc32c/meson.build              | 34 ++++++++++++
 contrib/test_crc32c/sql/test_crc32c.sql      |  3 ++
 contrib/test_crc32c/test_crc32c--1.0.sql     |  1 +
 contrib/test_crc32c/test_crc32c.c            | 47 ++++++++++++++++
 contrib/test_crc32c/test_crc32c.control      |  4 ++
 8 files changed, 167 insertions(+)
 create mode 100644 contrib/test_crc32c/Makefile
 create mode 100644 contrib/test_crc32c/expected/test_crc32c.out
 create mode 100644 contrib/test_crc32c/meson.build
 create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql
 create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql
 create mode 100644 contrib/test_crc32c/test_crc32c.c
 create mode 100644 contrib/test_crc32c/test_crc32c.control

diff --git a/contrib/meson.build b/contrib/meson.build
index 1ba73ebd67a..06673db0625 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
   'install_dir': contrib_doc_dir,
 }
 
+subdir('test_crc32c')
 subdir('amcheck')
 subdir('auth_delay')
 subdir('auto_explain')
diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile
new file mode 100644
index 00000000000..5b747c6184a
--- /dev/null
+++ b/contrib/test_crc32c/Makefile
@@ -0,0 +1,20 @@
+MODULE_big = test_crc32c
+OBJS = test_crc32c.o
+PGFILEDESC = "test"
+EXTENSION = test_crc32c
+DATA = test_crc32c--1.0.sql
+
+first: all
+
+# test_crc32c.o:	CFLAGS+=-g
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_crc32c
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out
new file mode 100644
index 00000000000..dff6bb3133b
--- /dev/null
+++ b/contrib/test_crc32c/expected/test_crc32c.out
@@ -0,0 +1,57 @@
+CREATE EXTENSION test_crc32c;
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
+ drive_crc32c 
+--------------
+    532139994
+   2103623867
+    785984197
+   2686825890
+   3213049059
+   3819630168
+   1389234603
+    534072900
+   2930108140
+   2496889855
+   1475239611
+    136366931
+   3067402116
+   2012717871
+   3682416023
+   2054270645
+   1817339875
+   4100939569
+   1192727539
+   3636976218
+    369764421
+   3161609879
+   1067984880
+   1235066769
+   3138425899
+    648132037
+   4203750233
+   1330187888
+   2683521348
+   1951644495
+   2574090107
+   3904902018
+   3772697795
+   1644686344
+   2868962106
+   3369218491
+   3902689890
+   3456411865
+    141004025
+   1504497996
+   3782655204
+   3544797610
+   3429174879
+   2524728016
+   3935861181
+     25498897
+    692684159
+    345705535
+   2761600287
+   2654632420
+   3945991399
+(51 rows)
+
diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build
new file mode 100644
index 00000000000..d7bec4ba1cb
--- /dev/null
+++ b/contrib/test_crc32c/meson.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+test_crc32c_sources = files(
+  'test_crc32c.c',
+)
+
+if host_system == 'windows'
+  test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_crc32c',
+    '--FILEDESC', 'test_crc32c - test code for crc32c library',])
+endif
+
+test_crc32c = shared_module('test_crc32c',
+  test_crc32c_sources,
+  kwargs: contrib_mod_args,
+)
+contrib_targets += test_crc32c
+
+install_data(
+  'test_crc32c.control',
+  'test_crc32c--1.0.sql',
+  kwargs: contrib_data_args,
+)
+
+tests += {
+  'name': 'test_crc32c',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_crc32c',
+    ],
+  },
+}
diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql
new file mode 100644
index 00000000000..95c6dfe4488
--- /dev/null
+++ b/contrib/test_crc32c/sql/test_crc32c.sql
@@ -0,0 +1,3 @@
+CREATE EXTENSION test_crc32c;
+
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql
new file mode 100644
index 00000000000..52b9772f908
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c--1.0.sql
@@ -0,0 +1 @@
+CREATE FUNCTION drive_crc32c  (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c
new file mode 100644
index 00000000000..28bc42de314
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.c
@@ -0,0 +1,47 @@
+/* select drive_crc32c(1000000, 1024); */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "port/pg_crc32c.h"
+#include "common/pg_prng.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * drive_crc32c(count: int, num: int) returns bigint
+ *
+ * count is the number of loops to perform
+ *
+ * num is the number of bytes in the buffer to calculate
+ * crc32c over.
+ */
+PG_FUNCTION_INFO_V1(drive_crc32c);
+Datum
+drive_crc32c(PG_FUNCTION_ARGS)
+{
+	int64			count	= PG_GETARG_INT32(0);
+	int64			num		= PG_GETARG_INT32(1);
+	char*		data	= malloc((size_t)num);
+	pg_crc32c crc;
+	pg_prng_state state;
+	uint64 seed = 42;
+	pg_prng_seed(&state, seed);
+	/* set random data */
+	for (uint64 i = 0; i < num; i++)
+	{
+		data[i] = pg_prng_uint32(&state) % 255;
+	}
+
+	INIT_CRC32C(crc);
+
+	while(count--)
+	{
+		INIT_CRC32C(crc);
+		COMP_CRC32C(crc, data, num);
+		FIN_CRC32C(crc);
+	}
+
+	free((void *)data);
+
+	PG_RETURN_INT64((int64_t)crc);
+}
diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control
new file mode 100644
index 00000000000..878a077ee18
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_crc32c'
+relocatable = true
-- 
2.48.1

Reply via email to