On Thu, Feb 13, 2025 at 4:18 AM Nathan Bossart <nathandboss...@gmail.com> wrote: > > I think the idea behind USE_SSE42_CRC32C is to avoid the function pointer > overhead if possible. I looked at switching to always using runtime checks > for this stuff, and we concluded that we'd better not [0]. > > [0] https://postgr.es/m/flat/20231030161706.GA3011%40nathanxps13
For short lengths, I tried unrolling the loop into a switch statement, as in the attached v5-0006 (the other new patches are fixes for CI). That usually looks faster for me, but not on the length used under the WAL insert lock. Usual caveat: Using small fixed-sized lengths in benchmarks can be misleading, because branches are more easily predicted. It seems like for always using runtime checks we'd need to use branching, rather than function pointers, as has been proposed elsewhere. master: 20 latency average = 3.622 ms latency average = 3.573 ms latency average = 3.599 ms 64 latency average = 7.791 ms latency average = 7.920 ms latency average = 7.888 ms 80 latency average = 8.076 ms latency average = 8.140 ms latency average = 8.150 ms 96 latency average = 8.853 ms latency average = 8.897 ms latency average = 8.914 ms 112 latency average = 9.867 ms latency average = 9.825 ms latency average = 9.869 ms v5: 20 latency average = 4.550 ms latency average = 4.327 ms latency average = 4.320 ms 64 latency average = 5.064 ms latency average = 4.934 ms latency average = 5.020 ms 80 latency average = 4.904 ms latency average = 4.786 ms latency average = 4.942 ms 96 latency average = 5.392 ms latency average = 5.376 ms latency average = 5.367 ms 112 latency average = 5.730 ms latency average = 5.859 ms latency average = 5.734 ms -- John Naylor Amazon Web Services
From acb63cddd8c8220db97ae0b012bf4f2fb5174e8a Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Wed, 12 Feb 2025 17:07:49 +0700 Subject: [PATCH v5 5/8] Improve CRC32C performance on x86_64 The current SSE4.2 implementation of CRC32C relies on the native CRC32 instruction, which operates on 8 bytes at a time. We can get a substantial speedup on longer inputs by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. The PCLMULQDQ instruction has been widely available since 2011 (almost as old as SSE 4.2), so this commit now requires that, as well as SSE 4.2, to build pg_crc32c_sse42.c. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> Author: John Naylor <johncnaylorls@gmail.com> Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com --- config/c-compiler.m4 | 7 ++++++- configure | 7 ++++++- meson.build | 7 +++++-- src/port/pg_crc32c_sse42.c | 4 ++++ src/port/pg_crc32c_sse42_choose.c | 9 ++++++--- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c1..8b255b5cc8 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -557,14 +557,19 @@ AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], [define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h> + #include <wmmintrin.h> #if defined(__has_attribute) && __has_attribute (target) - __attribute__((target("sse4.2"))) + __attribute__((target("sse4.2,pclmul"))) #endif static int 
crc32_sse42_test(void) + { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; }], diff --git a/configure b/configure index 0ffcaeb436..3f2a2a515e 100755 --- a/configure +++ b/configure @@ -17059,14 +17059,19 @@ else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include <nmmintrin.h> + #include <wmmintrin.h> #if defined(__has_attribute) && __has_attribute (target) - __attribute__((target("sse4.2"))) + __attribute__((target("sse4.2,pclmul"))) #endif static int crc32_sse42_test(void) + { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; } diff --git a/meson.build b/meson.build index 1ceadb9a83..456c3fafc3 100644 --- a/meson.build +++ b/meson.build @@ -2227,15 +2227,18 @@ if host_cpu == 'x86' or host_cpu == 'x86_64' prog = ''' #include <nmmintrin.h> - +#include <wmmintrin.h> #if defined(__has_attribute) && __has_attribute (target) -__attribute__((target("sse4.2"))) +__attribute__((target("sse4.2,pclmul"))) #endif int main(void) { + __m128i x1 = _mm_set1_epi32(1); unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + x1 = _mm_clmulepi64_si128(x1, x1, 0x00); // pclmul + crc = crc + _mm_extract_epi32(x1, 1); /* return computed value, to prevent the above being optimized away */ return crc == 0; } diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 7250eccf6b..05b11b47cb 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -3,6 +3,10 @@ * pg_crc32c_sse42.c * Compute CRC-32C checksum using Intel SSE 4.2 
instructions. * + * For longer inputs, we use carryless multiplication on SIMD registers, + * based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ + * Instruction" V. Gopal, E. Ozturk, et al., 2009 + * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d424..95cfe63493 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -31,7 +31,7 @@ #include "port/pg_crc32c.h" static bool -pg_crc32c_sse42_available(void) +pg_crc32c_sse42_pclmul_available(void) { unsigned int exx[4] = {0, 0, 0, 0}; @@ -43,7 +43,10 @@ pg_crc32c_sse42_available(void) #error cpuid instruction not available #endif - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ + bool sse42 = (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ + bool pclmul = (exx[2] & (1 << 1)) != 0; /* PCLMULQDQ */ + + return sse42 && pclmul; } /* @@ -53,7 +56,7 @@ pg_crc32c_sse42_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { - if (pg_crc32c_sse42_available()) + if (pg_crc32c_sse42_pclmul_available()) pg_comp_crc32c = pg_comp_crc32c_sse42; else pg_comp_crc32c = pg_comp_crc32c_sb8; -- 2.48.1
From 2c8289de7f612ac01e9bbe5cc86a39571d171925 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Thu, 13 Feb 2025 15:53:20 +0700 Subject: [PATCH v5 8/8] Allow dev test to build on Windows for CI XXX not for commit --- src/port/pg_crc32c_sse42_choose.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 95cfe63493..5833b92638 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -64,4 +64,4 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) return pg_comp_crc32c(crc, data, len); } -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; +PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; -- 2.48.1
From aefccb195cfec8532d85695768fdd49faae17a46 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Thu, 13 Feb 2025 13:52:54 +0700 Subject: [PATCH v5 6/8] Unroll tail --- src/port/pg_crc32c_sse42.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 05b11b47cb..0fb9c16dfc 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -23,6 +23,9 @@ #include "port/pg_crc32c.h" +#define PCLMUL_THRESHOLD 128 +#define CRC_CASE(n) do {crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) (p - (n)*sizeof(uint64_t))));} while(0) + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") static pg_crc32c @@ -39,10 +42,30 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) * the begin address. */ #ifdef __x86_64__ - while (p + 8 <= pend) + + /* set p to end of last word boundary */ + p = pend - len % (sizeof(uint64_t)); + Assert (len < PCLMUL_THRESHOLD); + + switch (len / sizeof(uint64_t)) { - crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p)); - p += 8; + case 15: CRC_CASE(15); /* FALLTHROUGH */ + case 14: CRC_CASE(14); /* FALLTHROUGH */ + case 13: CRC_CASE(13); /* FALLTHROUGH */ + case 12: CRC_CASE(12); /* FALLTHROUGH */ + case 11: CRC_CASE(11); /* FALLTHROUGH */ + case 10: CRC_CASE(10); /* FALLTHROUGH */ + case 9: CRC_CASE(9); /* FALLTHROUGH */ + case 8: CRC_CASE(8); /* FALLTHROUGH */ + case 7: CRC_CASE(7); /* FALLTHROUGH */ + case 6: CRC_CASE(6); /* FALLTHROUGH */ + case 5: CRC_CASE(5); /* FALLTHROUGH */ + case 4: CRC_CASE(4); /* FALLTHROUGH */ + case 3: CRC_CASE(3); /* FALLTHROUGH */ + case 2: CRC_CASE(2); /* FALLTHROUGH */ + case 1: CRC_CASE(1); /* FALLTHROUGH */ + case 0: break; + default: pg_unreachable(); } /* Process remaining full four bytes if any */ @@ -90,7 +113,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) size_t len = length; const unsigned char *buf 
= data; - if (len >= 128) + if (len >= PCLMUL_THRESHOLD) { /* First vector chunk. */ __m128i x0 = _mm_loadu_si128((const __m128i *) buf), -- 2.48.1
From 2da25b18739c95384d236c783c188526e7f5f641 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Thu, 13 Feb 2025 15:40:02 +0700 Subject: [PATCH v5 7/8] Fix 32-bit build --- src/port/pg_crc32c_sse42.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 0fb9c16dfc..fe7e8165ec 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -113,6 +113,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) size_t len = length; const unsigned char *buf = data; +#if SIZEOF_VOID_P >= 8 if (len >= PCLMUL_THRESHOLD) { /* First vector chunk. */ @@ -160,6 +161,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); } +#endif /* SIZEOF_VOID_P */ return pg_comp_crc32c_sse42_tail(crc0, buf, len); } -- 2.48.1
From a09e918bab5b6aac134c28bebd4b6f60ed05bfc9 Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Wed, 12 Feb 2025 16:03:52 +0700 Subject: [PATCH v5 4/8] Run pgindent XXX Some lines are still really long --- src/port/pg_crc32c_sse42.c | 95 +++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 42 deletions(-) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 3395617301..7250eccf6b 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -79,49 +79,60 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) pg_attribute_target("sse4.2,pclmul") pg_crc32c -pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) { +pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length) +{ /* adjust names to match generated code */ - pg_crc32c crc0 = crc; - size_t len = length; + pg_crc32c crc0 = crc; + size_t len = length; const unsigned char *buf = data; - if (len >= 128) { - /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - buf += 64; - len -= 64; - /* Main loop. 
*/ - while (len >= 64) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3); - buf += 64; - len -= 64; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); - crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); - } - - return pg_comp_crc32c_sse42_tail(crc0, buf, len); + if (len >= 128) + { + /* First vector chunk. */ + __m128i x0 = _mm_loadu_si128((const __m128i *) buf), + y0; + __m128i x1 = _mm_loadu_si128((const __m128i *) (buf + 16)), + y1; + __m128i x2 = _mm_loadu_si128((const __m128i *) (buf + 32)), + y2; + __m128i x3 = _mm_loadu_si128((const __m128i *) (buf + 48)), + y3; + __m128i k; + + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + len -= 64; + + /* Main loop. 
*/ + while (len >= 64) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + len -= 64; + } + + /* Reduce x0 ... x3 to just x0. */ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + } + + return pg_comp_crc32c_sse42_tail(crc0, buf, len); } -- 2.48.1
From 57952d1f89f0c3a4a2d28399344e9335f8bee72b Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Wed, 12 Feb 2025 15:27:16 +0700 Subject: [PATCH v5 2/8] Vendor SSE implementation from https://github.com/corsix/fast-crc32/ --- src/port/pg_crc32c_sse42.c | 77 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df3..6cc39de175 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -68,3 +68,80 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i sse -p crc32c -a v4 */ +/* MIT licensed */ + +#include <stddef.h> +#include <stdint.h> +#include <nmmintrin.h> +#include <wmmintrin.h> + +#if defined(_MSC_VER) +#define CRC_AINLINE static __forceinline +#define CRC_ALIGN(n) __declspec(align(n)) +#else +#define CRC_AINLINE static __inline __attribute__((always_inline)) +#define CRC_ALIGN(n) __attribute__((aligned(n))) +#endif +#define CRC_EXPORT extern + +#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) +#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) + +CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) { + crc0 = ~crc0; + for (; len && ((uintptr_t)buf & 7); --len) { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + if (((uintptr_t)buf & 8) && len >= 8) { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); + buf += 8; + len -= 8; + } + if (len >= 64) { + /* First vector chunk. 
*/ + __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; + __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1; + __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2; + __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3; + __m128i k; + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + len -= 64; + /* Main loop. */ + while (len >= 64) { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + len -= 64; + } + /* Reduce x0 ... x3 to just x0. */ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + } + for (; len >= 8; buf += 8, len -= 8) { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); + } + for (; len; --len) { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + return ~crc0; +} -- 2.48.1
From 543752f816e3f9f0e312dac2be14fabb7c56101e Mon Sep 17 00:00:00 2001 From: John Naylor <john.naylor@postgresql.org> Date: Wed, 12 Feb 2025 15:27:27 +0700 Subject: [PATCH v5 3/8] Adjust previous commit to match our style, add 128-byte threshold --- src/port/pg_crc32c_sse42.c | 48 +++++++++++--------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 6cc39de175..3395617301 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,13 +15,14 @@ #include "c.h" #include <nmmintrin.h> +#include <wmmintrin.h> #include "port/pg_crc32c.h" pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") -pg_crc32c -pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) +static pg_crc32c +pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; @@ -73,34 +74,18 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) /* ./generate -i sse -p crc32c -a v4 */ /* MIT licensed */ -#include <stddef.h> -#include <stdint.h> -#include <nmmintrin.h> -#include <wmmintrin.h> - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - #define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) #define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) -CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 64) { +pg_attribute_target("sse4.2,pclmul") +pg_crc32c +pg_comp_crc32c_sse42(pg_crc32c crc, const 
void *data, size_t length) { + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const unsigned char *buf = data; + + if (len >= 128) { /* First vector chunk. */ __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0; __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1; @@ -137,11 +122,6 @@ CRC_EXPORT uint32_t crc32_impl(uint32_t crc0, const char* buf, size_t len) { crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; + + return pg_comp_crc32c_sse42_tail(crc0, buf, len); } -- 2.48.1
From 3a27b748ec17feff4547d7ab2689d80ba6d55665 Mon Sep 17 00:00:00 2001 From: Paul Amonson <paul.d.amonson@intel.com> Date: Mon, 6 May 2024 08:34:17 -0700 Subject: [PATCH v5 1/8] Add a Postgres SQL function for crc32c benchmarking Add a drive_crc32c() function to use for benchmarking crc32c computation. The function takes 2 arguments: (1) count: num of times CRC32C is computed in a loop. (2) num: #bytes in the buffer to calculate crc over. XXX not for commit Extracted from a patch by Raghuveer Devulapalli --- contrib/meson.build | 1 + contrib/test_crc32c/Makefile | 20 +++++++ contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++ contrib/test_crc32c/meson.build | 34 ++++++++++++ contrib/test_crc32c/sql/test_crc32c.sql | 3 ++ contrib/test_crc32c/test_crc32c--1.0.sql | 1 + contrib/test_crc32c/test_crc32c.c | 47 ++++++++++++++++ contrib/test_crc32c/test_crc32c.control | 4 ++ 8 files changed, 167 insertions(+) create mode 100644 contrib/test_crc32c/Makefile create mode 100644 contrib/test_crc32c/expected/test_crc32c.out create mode 100644 contrib/test_crc32c/meson.build create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql create mode 100644 contrib/test_crc32c/test_crc32c.c create mode 100644 contrib/test_crc32c/test_crc32c.control diff --git a/contrib/meson.build b/contrib/meson.build index 1ba73ebd67..06673db062 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -12,6 +12,7 @@ contrib_doc_args = { 'install_dir': contrib_doc_dir, } +subdir('test_crc32c') subdir('amcheck') subdir('auth_delay') subdir('auto_explain') diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile new file mode 100644 index 0000000000..5b747c6184 --- /dev/null +++ b/contrib/test_crc32c/Makefile @@ -0,0 +1,20 @@ +MODULE_big = test_crc32c +OBJS = test_crc32c.o +PGFILEDESC = "test" +EXTENSION = test_crc32c +DATA = test_crc32c--1.0.sql + +first: all + +# test_crc32c.o: CFLAGS+=-g + +ifdef 
USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_crc32c +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out new file mode 100644 index 0000000000..dff6bb3133 --- /dev/null +++ b/contrib/test_crc32c/expected/test_crc32c.out @@ -0,0 +1,57 @@ +CREATE EXTENSION test_crc32c; +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; + drive_crc32c +-------------- + 532139994 + 2103623867 + 785984197 + 2686825890 + 3213049059 + 3819630168 + 1389234603 + 534072900 + 2930108140 + 2496889855 + 1475239611 + 136366931 + 3067402116 + 2012717871 + 3682416023 + 2054270645 + 1817339875 + 4100939569 + 1192727539 + 3636976218 + 369764421 + 3161609879 + 1067984880 + 1235066769 + 3138425899 + 648132037 + 4203750233 + 1330187888 + 2683521348 + 1951644495 + 2574090107 + 3904902018 + 3772697795 + 1644686344 + 2868962106 + 3369218491 + 3902689890 + 3456411865 + 141004025 + 1504497996 + 3782655204 + 3544797610 + 3429174879 + 2524728016 + 3935861181 + 25498897 + 692684159 + 345705535 + 2761600287 + 2654632420 + 3945991399 +(51 rows) + diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build new file mode 100644 index 0000000000..d7bec4ba1c --- /dev/null +++ b/contrib/test_crc32c/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +test_crc32c_sources = files( + 'test_crc32c.c', +) + +if host_system == 'windows' + test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_crc32c', + '--FILEDESC', 'test_crc32c - test code for crc32c library',]) +endif + +test_crc32c = shared_module('test_crc32c', + test_crc32c_sources, + kwargs: contrib_mod_args, +) +contrib_targets += test_crc32c + +install_data( + 'test_crc32c.control', + 
'test_crc32c--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'test_crc32c', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_crc32c', + ], + }, +} diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql new file mode 100644 index 0000000000..95c6dfe448 --- /dev/null +++ b/contrib/test_crc32c/sql/test_crc32c.sql @@ -0,0 +1,3 @@ +CREATE EXTENSION test_crc32c; + +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql new file mode 100644 index 0000000000..52b9772f90 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c--1.0.sql @@ -0,0 +1 @@ +CREATE FUNCTION drive_crc32c (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c new file mode 100644 index 0000000000..b350caf5ce --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.c @@ -0,0 +1,47 @@ +/* select drive_crc32c(1000000, 1024); */ + +#include "postgres.h" +#include "fmgr.h" +#include "port/pg_crc32c.h" +#include "common/pg_prng.h" + +PG_MODULE_MAGIC; + +/* + * drive_crc32c(count: int, num: int) returns bigint + * + * count is the number of loops to perform + * + * num is the number of bytes in the buffer to calculate + * crc32c over. 
+ */ +PG_FUNCTION_INFO_V1(drive_crc32c); +Datum +drive_crc32c(PG_FUNCTION_ARGS) +{ + int64 count = PG_GETARG_INT64(0); + int64 num = PG_GETARG_INT64(1); + char* data = malloc((size_t)num); + pg_crc32c crc; + pg_prng_state state; + uint64 seed = 42; + pg_prng_seed(&state, seed); + /* set random data */ + for (uint64 i = 0; i < num; i++) + { + data[i] = pg_prng_uint32(&state) % 255; + } + + INIT_CRC32C(crc); + + while(count--) + { + INIT_CRC32C(crc); + COMP_CRC32C(crc, data, num); + FIN_CRC32C(crc); + } + + free((void *)data); + + PG_RETURN_INT64((int64_t)crc); +} diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control new file mode 100644 index 0000000000..878a077ee1 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.control @@ -0,0 +1,4 @@ +comment = 'test' +default_version = '1.0' +module_pathname = '$libdir/test_crc32c' +relocatable = true -- 2.48.1