Hi Raghuveer,

You raised some interesting points, which deserve a thoughtful
response. After sleeping on it, however, I came to the conclusion that
a sweeping change in runtime checks, with either of our approaches,
has downsides and unresolved questions. Perhaps we can come back to it
at a later time. For this release cycle, I took a step back and tried
to think of the least invasive way to solve the immediate problem,
which is: How to allow existing builds with "-msse4.2" to take
advantage of CLMUL while not adding overhead. Here's what I came up
with in v11:

0001: same benchmark module as before
0002: For SSE4.2 builds, arrange for small constant-length input to use
an inlined path, so that the compiler can emit unrolled loops at any
call site. This is particularly important for the WAL insertion lock,
so this is possibly committable on its own just for that.
0003: the PCLMUL path, only for runtime-check builds
0004: the PCLMUL path for SSE4.2 builds. This uses a function pointer
for long-ish input (64 bytes or more) and the same inlined path as above
for short input (whether constant or not). So it gets the best of both worlds.

There is also a separate issue:

On Tue, Feb 25, 2025 at 6:05 PM John Naylor <johncnaylo...@gmail.com> wrote:
>
> Another thing I found in Agner's manuals: AMD Zen, even as recently as
> Zen 4, don't have as good a microarchitecture for PCLMUL, so if anyone
> with such a machine would like to help test the cutoff

David Rowley shared some results off-list, which are: Zen 4 is very
good with this algorithm even at 64 bytes input length, but Zen 2
regresses up to maybe 256 bytes. Having a large cutoff to cover all
bases makes this less useful, and that was one of my reservations
about AVX-512. However, with the corsix generator I found it's
possible to specify AVX-512 with a single accumulator (rather than 4),
which still gives a minimum input of 64 bytes, so I'll plan to put
something together to demonstrate.

(Older machines could use the 3-way stream algorithm as a fallback on
long inputs, as I've mentioned elsewhere, assuming that's legally
unencumbered.)

-- 
John Naylor
Amazon Web Services
From f96ae8bf6913796c5d724ff9da25bbc79927ff1c Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 28 Feb 2025 18:27:40 +0700
Subject: [PATCH v11 4/4] Use runtime check even when we have SSE 4.2 at
 compile time

This allows us to use PCLMUL for longer inputs. Short inputs are
inlined to avoid the indirection through a function pointer.
---
 configure                         |  2 +-
 configure.ac                      |  2 +-
 src/include/port/pg_crc32c.h      | 15 +++++++++++----
 src/port/meson.build              |  1 +
 src/port/pg_crc32c_sse42_choose.c |  2 ++
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/configure b/configure
index 93fddd69981..91c0ffc8272 100755
--- a/configure
+++ b/configure
@@ -17684,7 +17684,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
 
 $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
 
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..a85bdbd4ff6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2151,7 +2151,7 @@ fi
 AC_MSG_CHECKING([which CRC-32C implementation to use])
 if test x"$USE_SSE42_CRC32C" = x"1"; then
   AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
   AC_MSG_RESULT(SSE 4.2)
 else
   if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index fe0e1b6b275..26b676dddc9 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -55,22 +55,29 @@ typedef uint32 pg_crc32c;
 	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 static inline
 pg_crc32c
 pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 {
-	if (__builtin_constant_p(len) && len < 64)
+	if (len < 64)
 	{
 		/*
-		 * For small constant inputs, inline the computation. This allows the
-		 * compiler to unroll loops.
+		 * For small inputs, inline the computation to avoid the runtime
+		 * check. This also allows the compiler to unroll loops for constant
+		 * input.
 		 */
 		return pg_comp_crc32c_sse42_inline(crc, data, len);
 	}
 	else
-		return pg_comp_crc32c_sse42(crc, data, len);
+		/* For larger inputs, use a runtime check for PCLMUL instructions. */
+		return pg_comp_crc32c(crc, data, len);
 }
 
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d43..8d70a4d510e 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -83,6 +83,7 @@ replace_funcs_pos = [
   # x86/x64
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index abea0f90eb3..89a48c76894 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -55,8 +55,10 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 			pg_comp_crc32c = pg_comp_crc32c_pclmul;
 #endif
 	}
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.48.1

From 9bf92b1ee09810ae734e6463fe01ad9858fa7168 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v11 3/4] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com>
Author: John Naylor <johncnaylo...@gmail.com>
Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com
---
 src/include/port/pg_crc32c.h      | 30 ++++++++---
 src/port/pg_crc32c_sse42.c        | 88 +++++++++++++++++++++++++++++++
 src/port/pg_crc32c_sse42_choose.c | 26 ++++-----
 3 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 5ccc79295c0..fe0e1b6b275 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -37,6 +37,11 @@
 
 typedef uint32 pg_crc32c;
 
+/* WIP: configure checks */
+#ifdef __x86_64__
+#define USE_PCLMUL_WITH_RUNTIME_CHECK
+#endif
+
 /* The INIT and EQ macros are the same for all implementations. */
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
@@ -68,6 +73,23 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 		return pg_comp_crc32c_sse42(crc, data, len);
 }
 
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel SSE 4.2 or PCLMUL instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
+#endif
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
@@ -86,7 +108,7 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
  * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
@@ -98,13 +120,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
 
 #else
 /*
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 6a35f7fdc67..b56da2f6934 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,6 +15,7 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 #include "port/pg_crc32c_sse42_impl.h"
@@ -26,3 +27,90 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
 	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
+
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4e */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+pg_attribute_target("sse4.2,pclmul")
+pg_crc32c
+pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
+	const char *buf = data;
+
+	// This prolog is trying to avoid loads straddling
+	// cache lines, but it doesn't seem worth it if
+	// we're trying to be fast on small inputs as well
+#if 0
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = _mm_crc32_u8(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+#endif
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_sse42_inline(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d4249..abea0f90eb3 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -30,8 +30,12 @@
 
 #include "port/pg_crc32c.h"
 
-static bool
-pg_crc32c_sse42_available(void)
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
@@ -43,18 +47,14 @@ pg_crc32c_sse42_available(void)
 #error cpuid instruction not available
 #endif
 
-	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
-}
-
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
-	if (pg_crc32c_sse42_available())
+	if ((exx[2] & (1 << 20)) != 0)	/* SSE 4.2 */
+	{
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
+#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+		if ((exx[2] & (1 << 1)) != 0)	/* PCLMUL */
+			pg_comp_crc32c = pg_comp_crc32c_pclmul;
+#endif
+	}
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
 
-- 
2.48.1

From 7cd98c95678752dac778c3e4c36cc3ae8d93639d Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 28 Feb 2025 16:27:30 +0700
Subject: [PATCH v11 2/4] Inline CRC computation for small fixed-length input

---
 src/include/port/pg_crc32c.h            | 21 ++++++-
 src/include/port/pg_crc32c_sse42_impl.h | 74 +++++++++++++++++++++++++
 src/port/pg_crc32c_sse42.c              | 46 +--------------
 3 files changed, 96 insertions(+), 45 deletions(-)
 create mode 100644 src/include/port/pg_crc32c_sse42_impl.h

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..5ccc79295c0 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -43,12 +43,31 @@ typedef uint32 pg_crc32c;
 
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
+
+#include "pg_crc32c_sse42_impl.h"
+
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	if (__builtin_constant_p(len) && len < 64)
+	{
+		/*
+		 * For small constant inputs, inline the computation. This allows the
+		 * compiler to unroll loops.
+		 */
+		return pg_comp_crc32c_sse42_inline(crc, data, len);
+	}
+	else
+		return pg_comp_crc32c_sse42(crc, data, len);
+}
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
diff --git a/src/include/port/pg_crc32c_sse42_impl.h b/src/include/port/pg_crc32c_sse42_impl.h
new file mode 100644
index 00000000000..e10ad777618
--- /dev/null
+++ b/src/include/port/pg_crc32c_sse42_impl.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_sse42_impl.h
+ *	  Inline implementation of CRC computation using SSE 4.2
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_crc32c_sse42_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CRC32C_SSE42_IMPL_H
+#define PG_CRC32C_SSE42_IMPL_H
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
+static inline
+pg_crc32c
+pg_comp_crc32c_sse42_inline(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+#ifdef __x86_64__
+	while (p + 8 <= pend)
+	{
+		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#else
+
+	/*
+	 * Process four bytes at a time. (The eight byte instruction is not
+	 * available on the 32-bit x86 architecture).
+	 */
+	while (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
+		p += 4;
+	}
+#endif							/* __x86_64__ */
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+#endif							/* PG_CRC32C_SSE42_IMPL_H */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..6a35f7fdc67 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -17,54 +17,12 @@
 #include <nmmintrin.h>
 
 #include "port/pg_crc32c.h"
+#include "port/pg_crc32c_sse42_impl.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
-	const unsigned char *p = data;
-	const unsigned char *pend = p + len;
-
-	/*
-	 * Process eight bytes of data at a time.
-	 *
-	 * NB: We do unaligned accesses here. The Intel architecture allows that,
-	 * and performance testing didn't show any performance gain from aligning
-	 * the begin address.
-	 */
-#ifdef __x86_64__
-	while (p + 8 <= pend)
-	{
-		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
-		p += 8;
-	}
-
-	/* Process remaining full four bytes if any */
-	if (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#else
-
-	/*
-	 * Process four bytes at a time. (The eight byte instruction is not
-	 * available on the 32-bit x86 architecture).
-	 */
-	while (p + 4 <= pend)
-	{
-		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
-		p += 4;
-	}
-#endif							/* __x86_64__ */
-
-	/* Process any remaining bytes one at a time. */
-	while (p < pend)
-	{
-		crc = _mm_crc32_u8(crc, *p);
-		p++;
-	}
-
-	return crc;
+	return pg_comp_crc32c_sse42_inline(crc, data, len);
 }
-- 
2.48.1

From f176a0cfadcb0a9971746f23677de7d4278a5391 Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amon...@intel.com>
Date: Mon, 6 May 2024 08:34:17 -0700
Subject: [PATCH v11 1/4] Add a Postgres SQL function for crc32c benchmarking

Add a drive_crc32c() function to use for benchmarking crc32c
computation. The function takes 2 arguments:

(1) count: num of times CRC32C is computed in a loop.
(2) num: #bytes in the buffer to calculate crc over.

XXX not for commit

Extracted from a patch by Raghuveer Devulapalli
---
 contrib/meson.build                          |  1 +
 contrib/test_crc32c/Makefile                 | 20 +++++++
 contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++
 contrib/test_crc32c/meson.build              | 34 ++++++++++++
 contrib/test_crc32c/sql/test_crc32c.sql      |  3 ++
 contrib/test_crc32c/test_crc32c--1.0.sql     |  1 +
 contrib/test_crc32c/test_crc32c.c            | 47 ++++++++++++++++
 contrib/test_crc32c/test_crc32c.control      |  4 ++
 8 files changed, 167 insertions(+)
 create mode 100644 contrib/test_crc32c/Makefile
 create mode 100644 contrib/test_crc32c/expected/test_crc32c.out
 create mode 100644 contrib/test_crc32c/meson.build
 create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql
 create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql
 create mode 100644 contrib/test_crc32c/test_crc32c.c
 create mode 100644 contrib/test_crc32c/test_crc32c.control

diff --git a/contrib/meson.build b/contrib/meson.build
index 1ba73ebd67a..06673db0625 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
   'install_dir': contrib_doc_dir,
 }
 
+subdir('test_crc32c')
 subdir('amcheck')
 subdir('auth_delay')
 subdir('auto_explain')
diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile
new file mode 100644
index 00000000000..5b747c6184a
--- /dev/null
+++ b/contrib/test_crc32c/Makefile
@@ -0,0 +1,20 @@
+MODULE_big = test_crc32c
+OBJS = test_crc32c.o
+PGFILEDESC = "test"
+EXTENSION = test_crc32c
+DATA = test_crc32c--1.0.sql
+
+first: all
+
+# test_crc32c.o:	CFLAGS+=-g
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_crc32c
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out
new file mode 100644
index 00000000000..dff6bb3133b
--- /dev/null
+++ b/contrib/test_crc32c/expected/test_crc32c.out
@@ -0,0 +1,57 @@
+CREATE EXTENSION test_crc32c;
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
+ drive_crc32c 
+--------------
+    532139994
+   2103623867
+    785984197
+   2686825890
+   3213049059
+   3819630168
+   1389234603
+    534072900
+   2930108140
+   2496889855
+   1475239611
+    136366931
+   3067402116
+   2012717871
+   3682416023
+   2054270645
+   1817339875
+   4100939569
+   1192727539
+   3636976218
+    369764421
+   3161609879
+   1067984880
+   1235066769
+   3138425899
+    648132037
+   4203750233
+   1330187888
+   2683521348
+   1951644495
+   2574090107
+   3904902018
+   3772697795
+   1644686344
+   2868962106
+   3369218491
+   3902689890
+   3456411865
+    141004025
+   1504497996
+   3782655204
+   3544797610
+   3429174879
+   2524728016
+   3935861181
+     25498897
+    692684159
+    345705535
+   2761600287
+   2654632420
+   3945991399
+(51 rows)
+
diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build
new file mode 100644
index 00000000000..d7bec4ba1cb
--- /dev/null
+++ b/contrib/test_crc32c/meson.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+test_crc32c_sources = files(
+  'test_crc32c.c',
+)
+
+if host_system == 'windows'
+  test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_crc32c',
+    '--FILEDESC', 'test_crc32c - test code for crc32c library',])
+endif
+
+test_crc32c = shared_module('test_crc32c',
+  test_crc32c_sources,
+  kwargs: contrib_mod_args,
+)
+contrib_targets += test_crc32c
+
+install_data(
+  'test_crc32c.control',
+  'test_crc32c--1.0.sql',
+  kwargs: contrib_data_args,
+)
+
+tests += {
+  'name': 'test_crc32c',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_crc32c',
+    ],
+  },
+}
diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql
new file mode 100644
index 00000000000..95c6dfe4488
--- /dev/null
+++ b/contrib/test_crc32c/sql/test_crc32c.sql
@@ -0,0 +1,3 @@
+CREATE EXTENSION test_crc32c;
+
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql
new file mode 100644
index 00000000000..52b9772f908
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c--1.0.sql
@@ -0,0 +1 @@
+CREATE FUNCTION drive_crc32c  (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c
new file mode 100644
index 00000000000..28bc42de314
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.c
@@ -0,0 +1,47 @@
+/* select drive_crc32c(1000000, 1024); */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "port/pg_crc32c.h"
+#include "common/pg_prng.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * drive_crc32c(count: int, num: int) returns bigint
+ *
+ * count is the number of loops to perform
+ *
+ * num is the number of bytes in the buffer to calculate
+ * crc32c over.
+ */
+PG_FUNCTION_INFO_V1(drive_crc32c);
+Datum
+drive_crc32c(PG_FUNCTION_ARGS)
+{
+	int64			count	= PG_GETARG_INT32(0);
+	int64			num		= PG_GETARG_INT32(1);
+	char*		data	= malloc((size_t)num);
+	pg_crc32c crc;
+	pg_prng_state state;
+	uint64 seed = 42;
+	pg_prng_seed(&state, seed);
+	/* set random data */
+	for (uint64 i = 0; i < num; i++)
+	{
+		data[i] = pg_prng_uint32(&state) % 255;
+	}
+
+	INIT_CRC32C(crc);
+
+	while(count--)
+	{
+		INIT_CRC32C(crc);
+		COMP_CRC32C(crc, data, num);
+		FIN_CRC32C(crc);
+	}
+
+	free((void *)data);
+
+	PG_RETURN_INT64((int64_t)crc);
+}
diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control
new file mode 100644
index 00000000000..878a077ee18
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_crc32c'
+relocatable = true
-- 
2.48.1

Reply via email to