On Thu, Feb 13, 2025 at 4:18 AM Nathan Bossart <nathandboss...@gmail.com> wrote:
>
> I think the idea behind USE_SSE42_CRC32C is to avoid the function pointer
> overhead if possible.  I looked at switching to always using runtime checks
> for this stuff, and we concluded that we'd better not [0].
>
> [0] https://postgr.es/m/flat/20231030161706.GA3011%40nathanxps13

For short lengths, I tried unrolling the loop into a switch statement,
as in the attached v5-0006 (the other new patches are fixes for CI).
That usually looks faster for me, but not on the length used under the
WAL insert lock. Usual caveat: Using small fixed-sized lengths in
benchmarks can be misleading, because branches are more easily
predicted.

It seems that to always use runtime checks, we'd need to use
branching rather than function pointers, as has been proposed
elsewhere.

master:
20
latency average = 3.622 ms
latency average = 3.573 ms
latency average = 3.599 ms
64
latency average = 7.791 ms
latency average = 7.920 ms
latency average = 7.888 ms
80
latency average = 8.076 ms
latency average = 8.140 ms
latency average = 8.150 ms
96
latency average = 8.853 ms
latency average = 8.897 ms
latency average = 8.914 ms
112
latency average = 9.867 ms
latency average = 9.825 ms
latency average = 9.869 ms

v5:
20
latency average = 4.550 ms
latency average = 4.327 ms
latency average = 4.320 ms
64
latency average = 5.064 ms
latency average = 4.934 ms
latency average = 5.020 ms
80
latency average = 4.904 ms
latency average = 4.786 ms
latency average = 4.942 ms
96
latency average = 5.392 ms
latency average = 5.376 ms
latency average = 5.367 ms
112
latency average = 5.730 ms
latency average = 5.859 ms
latency average = 5.734 ms



--
John Naylor
Amazon Web Services
From acb63cddd8c8220db97ae0b012bf4f2fb5174e8a Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 17:07:49 +0700
Subject: [PATCH v5 5/8] Improve CRC32C performance on x86_64

The current SSE4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.

The PCLMULQDQ instruction has been widely available since 2011 (almost
as old as SSE 4.2), so this commit now requires that, as well as SSE
4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Author: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
---
 config/c-compiler.m4              | 7 ++++++-
 configure                         | 7 ++++++-
 meson.build                       | 7 +++++--
 src/port/pg_crc32c_sse42.c        | 4 ++++
 src/port/pg_crc32c_sse42_choose.c | 9 ++++++---
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..8b255b5cc8 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -557,14 +557,19 @@ AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
 [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
 AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }],
diff --git a/configure b/configure
index 0ffcaeb436..3f2a2a515e 100755
--- a/configure
+++ b/configure
@@ -17059,14 +17059,19 @@ else
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <nmmintrin.h>
+    #include <wmmintrin.h>
     #if defined(__has_attribute) && __has_attribute (target)
-    __attribute__((target("sse4.2")))
+    __attribute__((target("sse4.2,pclmul")))
     #endif
     static int crc32_sse42_test(void)
+
     {
+      __m128i x1 = _mm_set1_epi32(1);
       unsigned int crc = 0;
       crc = _mm_crc32_u8(crc, 0);
       crc = _mm_crc32_u32(crc, 0);
+      x1 = _mm_clmulepi64_si128(x1, x1, 0x00);
+      crc = crc + _mm_extract_epi32(x1, 1);
       /* return computed value, to prevent the above being optimized away */
       return crc == 0;
     }
diff --git a/meson.build b/meson.build
index 1ceadb9a83..456c3fafc3 100644
--- a/meson.build
+++ b/meson.build
@@ -2227,15 +2227,18 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
 
     prog = '''
 #include <nmmintrin.h>
-
+#include <wmmintrin.h>
 #if defined(__has_attribute) && __has_attribute (target)
-__attribute__((target("sse4.2")))
+__attribute__((target("sse4.2,pclmul")))
 #endif
 int main(void)
 {
+    __m128i x1 = _mm_set1_epi32(1);
     unsigned int crc = 0;
     crc = _mm_crc32_u8(crc, 0);
     crc = _mm_crc32_u32(crc, 0);
+    x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul
+    crc = crc + _mm_extract_epi32(x1, 1);
     /* return computed value, to prevent the above being optimized away */
     return crc == 0;
 }
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7250eccf6b..05b11b47cb 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -3,6 +3,10 @@
  * pg_crc32c_sse42.c
  *	  Compute CRC-32C checksum using Intel SSE 4.2 instructions.
  *
+ * 	  For longer inputs, we use carryless multiplication on SIMD registers,
+ *	  based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ *	  Instruction" V. Gopal, E. Ozturk, et al., 2009
+ *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d424..95cfe63493 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -31,7 +31,7 @@
 #include "port/pg_crc32c.h"
 
 static bool
-pg_crc32c_sse42_available(void)
+pg_crc32c_sse42_pclmul_available(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
@@ -43,7 +43,10 @@ pg_crc32c_sse42_available(void)
 #error cpuid instruction not available
 #endif
 
-	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+	bool		sse42 = (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+	bool		pclmul = (exx[2] & (1 << 1)) != 0;	/* PCLMULQDQ */
+
+	return sse42 && pclmul;
 }
 
 /*
@@ -53,7 +56,7 @@ pg_crc32c_sse42_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
-	if (pg_crc32c_sse42_available())
+	if (pg_crc32c_sse42_pclmul_available())
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
-- 
2.48.1

From 2c8289de7f612ac01e9bbe5cc86a39571d171925 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Thu, 13 Feb 2025 15:53:20 +0700
Subject: [PATCH v5 8/8] Allow dev test to build on Windows for CI XXX not for
 commit

---
 src/port/pg_crc32c_sse42_choose.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 95cfe63493..5833b92638 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -64,4 +64,4 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 	return pg_comp_crc32c(crc, data, len);
 }
 
-pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+PGDLLIMPORT pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
-- 
2.48.1

From aefccb195cfec8532d85695768fdd49faae17a46 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Thu, 13 Feb 2025 13:52:54 +0700
Subject: [PATCH v5 6/8] Unroll tail

---
 src/port/pg_crc32c_sse42.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 05b11b47cb..0fb9c16dfc 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -23,6 +23,9 @@
 
 #include "port/pg_crc32c.h"
 
+#define PCLMUL_THRESHOLD 128
+#define CRC_CASE(n) do {crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) (p - (n)*sizeof(uint64_t))));} while(0)
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 static pg_crc32c
@@ -39,10 +42,30 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
 	 * the begin address.
 	 */
 #ifdef __x86_64__
-	while (p + 8 <= pend)
+
+	/* set p to end of last word boundary */
+	p = pend - len % (sizeof(uint64_t));
+	Assert (len < PCLMUL_THRESHOLD);
+
+	switch (len / sizeof(uint64_t))
 	{
-		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
-		p += 8;
+		case 15: CRC_CASE(15); /* FALLTHROUGH */
+		case 14: CRC_CASE(14); /* FALLTHROUGH */
+		case 13: CRC_CASE(13); /* FALLTHROUGH */
+		case 12: CRC_CASE(12); /* FALLTHROUGH */
+		case 11: CRC_CASE(11); /* FALLTHROUGH */
+		case 10: CRC_CASE(10); /* FALLTHROUGH */
+		case 9: CRC_CASE(9); /* FALLTHROUGH */
+		case 8: CRC_CASE(8); /* FALLTHROUGH */
+		case 7: CRC_CASE(7); /* FALLTHROUGH */
+		case 6: CRC_CASE(6); /* FALLTHROUGH */
+		case 5: CRC_CASE(5); /* FALLTHROUGH */
+		case 4: CRC_CASE(4); /* FALLTHROUGH */
+		case 3: CRC_CASE(3); /* FALLTHROUGH */
+		case 2: CRC_CASE(2); /* FALLTHROUGH */
+		case 1: CRC_CASE(1); /* FALLTHROUGH */
+		case 0: break;
+		default: pg_unreachable();
 	}
 
 	/* Process remaining full four bytes if any */
@@ -90,7 +113,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
 	size_t		len = length;
 	const unsigned char *buf = data;
 
-	if (len >= 128)
+	if (len >= PCLMUL_THRESHOLD)
 	{
 		/* First vector chunk. */
 		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
-- 
2.48.1

From 2da25b18739c95384d236c783c188526e7f5f641 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Thu, 13 Feb 2025 15:40:02 +0700
Subject: [PATCH v5 7/8] Fix 32-bit build

---
 src/port/pg_crc32c_sse42.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 0fb9c16dfc..fe7e8165ec 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -113,6 +113,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
 	size_t		len = length;
 	const unsigned char *buf = data;
 
+#if SIZEOF_VOID_P >= 8
 	if (len >= PCLMUL_THRESHOLD)
 	{
 		/* First vector chunk. */
@@ -160,6 +161,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
 		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
 		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
 	}
+#endif /* SIZEOF_VOID_P */
 
 	return pg_comp_crc32c_sse42_tail(crc0, buf, len);
 }
-- 
2.48.1

From a09e918bab5b6aac134c28bebd4b6f60ed05bfc9 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 16:03:52 +0700
Subject: [PATCH v5 4/8] Run pgindent XXX Some lines are still really long

---
 src/port/pg_crc32c_sse42.c | 95 +++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 42 deletions(-)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 3395617301..7250eccf6b 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -79,49 +79,60 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
 
 pg_attribute_target("sse4.2,pclmul")
 pg_crc32c
-pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) {
+pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
+{
 	/* adjust names to match generated code */
-	pg_crc32c crc0 = crc;
-	size_t len = length;
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
 	const unsigned char *buf = data;
 
-  if (len >= 128) {
-    /* First vector chunk. */
-    __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
-    __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
-    __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
-    __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
-    __m128i k;
-    k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
-    x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
-    buf += 64;
-    len -= 64;
-    /* Main loop. */
-    while (len >= 64) {
-      y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-      y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
-      y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-      y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
-      y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
-      y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
-      y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
-      y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
-      buf += 64;
-      len -= 64;
-    }
-    /* Reduce x0 ... x3 to just x0. */
-    k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
-    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-    y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-    y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
-    y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
-    k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
-    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-    y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
-    /* Reduce 128 bits to 32 bits, and multiply by x^32. */
-    crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
-    crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
-  }
-
-  return pg_comp_crc32c_sse42_tail(crc0, buf, len);
+	if (len >= 128)
+	{
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		len -= 64;
+
+		/* Main loop. */
+		while (len >= 64)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+			len -= 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+	}
+
+	return pg_comp_crc32c_sse42_tail(crc0, buf, len);
 }
-- 
2.48.1

From 57952d1f89f0c3a4a2d28399344e9335f8bee72b Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v5 2/8] Vendor SSE implementation from
 https://github.com/corsix/fast-crc32/

---
 src/port/pg_crc32c_sse42.c | 77 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df3..6cc39de175 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -68,3 +68,80 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4 */
+/* MIT licensed */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <nmmintrin.h>
+#include <wmmintrin.h>
+
+#if defined(_MSC_VER)
+#define CRC_AINLINE static __forceinline
+#define CRC_ALIGN(n) __declspec(align(n))
+#else
+#define CRC_AINLINE static __inline __attribute__((always_inline))
+#define CRC_ALIGN(n) __attribute__((aligned(n)))
+#endif
+#define CRC_EXPORT extern
+
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
+  crc0 = ~crc0;
+  for (; len && ((uintptr_t)buf & 7); --len) {
+    crc0 = _mm_crc32_u8(crc0, *buf++);
+  }
+  if (((uintptr_t)buf & 8) && len >= 8) {
+    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
+    buf += 8;
+    len -= 8;
+  }
+  if (len >= 64) {
+    /* First vector chunk. */
+    __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
+    __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
+    __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
+    __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
+    __m128i k;
+    k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+    x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+    buf += 64;
+    len -= 64;
+    /* Main loop. */
+    while (len >= 64) {
+      y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+      y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+      y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+      y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+      y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
+      y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
+      y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
+      y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
+      buf += 64;
+      len -= 64;
+    }
+    /* Reduce x0 ... x3 to just x0. */
+    k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+    y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+    y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+    k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+    /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+    crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+    crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+  }
+  for (; len >= 8; buf += 8, len -= 8) {
+    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
+  }
+  for (; len; --len) {
+    crc0 = _mm_crc32_u8(crc0, *buf++);
+  }
+  return ~crc0;
+}
-- 
2.48.1

From 543752f816e3f9f0e312dac2be14fabb7c56101e Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:27 +0700
Subject: [PATCH v5 3/8] Adjust previous commit to match our style, add
 128-byte threshold

---
 src/port/pg_crc32c_sse42.c | 48 +++++++++++---------------------------
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 6cc39de175..3395617301 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,13 +15,14 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
-pg_crc32c
-pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
+static pg_crc32c
+pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
@@ -73,34 +74,18 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 /* ./generate -i sse -p crc32c -a v4 */
 /* MIT licensed */
 
-#include <stddef.h>
-#include <stdint.h>
-#include <nmmintrin.h>
-#include <wmmintrin.h>
-
-#if defined(_MSC_VER)
-#define CRC_AINLINE static __forceinline
-#define CRC_ALIGN(n) __declspec(align(n))
-#else
-#define CRC_AINLINE static __inline __attribute__((always_inline))
-#define CRC_ALIGN(n) __attribute__((aligned(n)))
-#endif
-#define CRC_EXPORT extern
-
 #define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
 #define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
 
-CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
-  crc0 = ~crc0;
-  for (; len && ((uintptr_t)buf & 7); --len) {
-    crc0 = _mm_crc32_u8(crc0, *buf++);
-  }
-  if (((uintptr_t)buf & 8) && len >= 8) {
-    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
-    buf += 8;
-    len -= 8;
-  }
-  if (len >= 64) {
+pg_attribute_target("sse4.2,pclmul")
+pg_crc32c
+pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) {
+	/* adjust names to match generated code */
+	pg_crc32c crc0 = crc;
+	size_t len = length;
+	const unsigned char *buf = data;
+
+  if (len >= 128) {
     /* First vector chunk. */
     __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
     __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
@@ -137,11 +122,6 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) {
     crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
     crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
   }
-  for (; len >= 8; buf += 8, len -= 8) {
-    crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf);
-  }
-  for (; len; --len) {
-    crc0 = _mm_crc32_u8(crc0, *buf++);
-  }
-  return ~crc0;
+
+  return pg_comp_crc32c_sse42_tail(crc0, buf, len);
 }
-- 
2.48.1

From 3a27b748ec17feff4547d7ab2689d80ba6d55665 Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amonson@intel.com>
Date: Mon, 6 May 2024 08:34:17 -0700
Subject: [PATCH v5 1/8] Add a Postgres SQL function for crc32c benchmarking

Add a drive_crc32c() function to use for benchmarking crc32c
computation. The function takes 2 arguments:

(1) count: number of times the CRC32C is computed in a loop.
(2) num: number of bytes in the buffer to compute the CRC over.

XXX not for commit

Extracted from a patch by Raghuveer Devulapalli
---
 contrib/meson.build                          |  1 +
 contrib/test_crc32c/Makefile                 | 20 +++++++
 contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++
 contrib/test_crc32c/meson.build              | 34 ++++++++++++
 contrib/test_crc32c/sql/test_crc32c.sql      |  3 ++
 contrib/test_crc32c/test_crc32c--1.0.sql     |  1 +
 contrib/test_crc32c/test_crc32c.c            | 47 ++++++++++++++++
 contrib/test_crc32c/test_crc32c.control      |  4 ++
 8 files changed, 167 insertions(+)
 create mode 100644 contrib/test_crc32c/Makefile
 create mode 100644 contrib/test_crc32c/expected/test_crc32c.out
 create mode 100644 contrib/test_crc32c/meson.build
 create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql
 create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql
 create mode 100644 contrib/test_crc32c/test_crc32c.c
 create mode 100644 contrib/test_crc32c/test_crc32c.control

diff --git a/contrib/meson.build b/contrib/meson.build
index 1ba73ebd67..06673db062 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
   'install_dir': contrib_doc_dir,
 }
 
+subdir('test_crc32c')
 subdir('amcheck')
 subdir('auth_delay')
 subdir('auto_explain')
diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile
new file mode 100644
index 0000000000..5b747c6184
--- /dev/null
+++ b/contrib/test_crc32c/Makefile
@@ -0,0 +1,20 @@
+MODULE_big = test_crc32c
+OBJS = test_crc32c.o
+PGFILEDESC = "test"
+EXTENSION = test_crc32c
+DATA = test_crc32c--1.0.sql
+
+first: all
+
+# test_crc32c.o:	CFLAGS+=-g
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_crc32c
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out
new file mode 100644
index 0000000000..dff6bb3133
--- /dev/null
+++ b/contrib/test_crc32c/expected/test_crc32c.out
@@ -0,0 +1,57 @@
+CREATE EXTENSION test_crc32c;
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
+ drive_crc32c 
+--------------
+    532139994
+   2103623867
+    785984197
+   2686825890
+   3213049059
+   3819630168
+   1389234603
+    534072900
+   2930108140
+   2496889855
+   1475239611
+    136366931
+   3067402116
+   2012717871
+   3682416023
+   2054270645
+   1817339875
+   4100939569
+   1192727539
+   3636976218
+    369764421
+   3161609879
+   1067984880
+   1235066769
+   3138425899
+    648132037
+   4203750233
+   1330187888
+   2683521348
+   1951644495
+   2574090107
+   3904902018
+   3772697795
+   1644686344
+   2868962106
+   3369218491
+   3902689890
+   3456411865
+    141004025
+   1504497996
+   3782655204
+   3544797610
+   3429174879
+   2524728016
+   3935861181
+     25498897
+    692684159
+    345705535
+   2761600287
+   2654632420
+   3945991399
+(51 rows)
+
diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build
new file mode 100644
index 0000000000..d7bec4ba1c
--- /dev/null
+++ b/contrib/test_crc32c/meson.build
@@ -0,0 +1,34 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+test_crc32c_sources = files(
+  'test_crc32c.c',
+)
+
+if host_system == 'windows'
+  test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_crc32c',
+    '--FILEDESC', 'test_crc32c - test code for crc32c library',])
+endif
+
+test_crc32c = shared_module('test_crc32c',
+  test_crc32c_sources,
+  kwargs: contrib_mod_args,
+)
+contrib_targets += test_crc32c
+
+install_data(
+  'test_crc32c.control',
+  'test_crc32c--1.0.sql',
+  kwargs: contrib_data_args,
+)
+
+tests += {
+  'name': 'test_crc32c',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_crc32c',
+    ],
+  },
+}
diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql
new file mode 100644
index 0000000000..95c6dfe448
--- /dev/null
+++ b/contrib/test_crc32c/sql/test_crc32c.sql
@@ -0,0 +1,3 @@
+CREATE EXTENSION test_crc32c;
+
+select drive_crc32c(1, i) from generate_series(100, 300, 4) i;
diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql
new file mode 100644
index 0000000000..52b9772f90
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c--1.0.sql
@@ -0,0 +1 @@
+CREATE FUNCTION drive_crc32c  (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c
new file mode 100644
index 0000000000..b350caf5ce
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.c
@@ -0,0 +1,47 @@
+/* select drive_crc32c(1000000, 1024); */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "port/pg_crc32c.h"
+#include "common/pg_prng.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * drive_crc32c(count: int, num: int) returns bigint
+ *
+ * count is the number of loops to perform
+ *
+ * num is the number of bytes in the buffer to calculate
+ * crc32c over.
+ */
+PG_FUNCTION_INFO_V1(drive_crc32c);
+Datum
+drive_crc32c(PG_FUNCTION_ARGS)
+{
+	int64			count	= PG_GETARG_INT64(0);
+	int64			num		= PG_GETARG_INT64(1);
+	char*		data	= malloc((size_t)num);
+	pg_crc32c crc;
+	pg_prng_state state;
+	uint64 seed = 42;
+	pg_prng_seed(&state, seed);
+	/* set random data */
+	for (uint64 i = 0; i < num; i++)
+	{
+		data[i] = pg_prng_uint32(&state) % 255;
+	}
+
+	INIT_CRC32C(crc);
+
+	while(count--)
+	{
+		INIT_CRC32C(crc);
+		COMP_CRC32C(crc, data, num);
+		FIN_CRC32C(crc);
+	}
+
+	free((void *)data);
+
+	PG_RETURN_INT64((int64_t)crc);
+}
diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control
new file mode 100644
index 0000000000..878a077ee1
--- /dev/null
+++ b/contrib/test_crc32c/test_crc32c.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_crc32c'
+relocatable = true
-- 
2.48.1

Reply via email to