On Wed, Mar 5, 2025 at 10:52 PM Nathan Bossart <nathandboss...@gmail.com> wrote: > > On Wed, Mar 05, 2025 at 08:51:21AM +0700, John Naylor wrote: > > That was my hunch too, but I wanted to be more sure, so I modified the > > benchmark so it doesn't know the address of the next calculation until > > it finishes the last calculation so we can hopefully see the latency > > caused by indirection. It also does an additional calculation on > > constant 20 bytes, like the WAL header. I also tweaked the length each > > iteration so the branch predictor maybe has a harder time predicting > > the constant 20 input. And to make it more challenging, I removed the > > part that inlined all small inputs, so it inlines only constant > > inputs: > > Would you mind sharing this test?
The test script is the same as here, except I only ran small lengths: https://www.postgresql.org/message-id/CANWCAZahvhE-%2BhtZiUyzPiS5e45ukx5877mD-dHr-KSX6LcdjQ%40mail.gmail.com ...but I must have forgotten to attach the slightly tweaked patch set, which I've done now. 0002 modifies the 0001 test module and 0006 reverts inlining non-constant input from 0005, just to see if I could find a regression from indirection, which I didn't. If we don't need it, it'd be better to avoid inlining loops to keep from bloating the binary. > It sounds like you are running a > workload with a mix of constant/inlined calls and function pointer calls to > simulate typical usage for WAL, but I'm not 100% sure I'm understanding you > correctly. Exactly. -- John Naylor Amazon Web Services
From c5cd6e44028eaf11efc2cf4fc49c87101b49c97f Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 5 Mar 2025 08:21:54 +0700 Subject: [PATCH v12 6/6] Only inline for constant input (partial revert) --- src/include/port/pg_crc32c.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 26b676dddc9..01192831ca3 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -66,12 +66,11 @@ static inline pg_crc32c pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) { - if (len < 64) + if (__builtin_constant_p(len) && len < 64) { /* - * For small inputs, inline the computation to avoid the runtime - * check. This also allows the compiler to unroll loops for constant - * input. + * For small constant inputs, inline the computation. This allows the + * compiler to unroll loops. */ return pg_comp_crc32c_sse42_inline(crc, data, len); } -- 2.48.1
From eafea75fc761fd51fa67311af794cf0f7dec40aa Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 12 Feb 2025 15:27:16 +0700 Subject: [PATCH v12 4/6] Improve CRC32C performance on x86_64 The current SSE4.2 implementation of CRC32C relies on the native CRC32 instruction, which operates on 8 bytes at a time. We can get a substantial speedup on longer inputs by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. The PCLMULQDQ instruction has been widely available since 2011 (almost as old as SSE 4.2), so this commit now requires that, as well as SSE 4.2, to build pg_crc32c_sse42.c. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: John Naylor <johncnaylo...@gmail.com> Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com --- src/include/port/pg_crc32c.h | 30 ++++++++--- src/port/pg_crc32c_sse42.c | 88 +++++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42_choose.c | 26 ++++----- 3 files changed, 124 insertions(+), 20 deletions(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 5ccc79295c0..fe0e1b6b275 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -37,6 +37,11 @@ typedef uint32 pg_crc32c; +/* WIP: configure checks */ +#ifdef __x86_64__ +#define USE_PCLMUL_WITH_RUNTIME_CHECK +#endif + /* The INIT and EQ macros are the same for all implementations. 
*/ #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) @@ -68,6 +73,23 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) return pg_comp_crc32c_sse42(crc, data, len); } +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) + +/* + * Use Intel SSE 4.2 or PCLMUL instructions, but perform a runtime check first + * to check that they are available. + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ @@ -86,7 +108,7 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first @@ -98,13 +120,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* diff --git 
a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 6a35f7fdc67..b56da2f6934 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,6 +15,7 @@ #include "c.h" #include <nmmintrin.h> +#include <wmmintrin.h> #include "port/pg_crc32c.h" #include "port/pg_crc32c_sse42_impl.h" @@ -26,3 +27,90 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) { return pg_comp_crc32c_sse42_inline(crc, data, len); } + +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i sse -p crc32c -a v4e */ +/* MIT licensed */ + +#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) +#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) + +pg_attribute_target("sse4.2,pclmul") +pg_crc32c +pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const char *buf = data; + + // This prolog is trying to avoid loads straddling + // cache lines, but it doesn't seem worth it if + // we're trying to be fast on small inputs as well +#if 0 + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + if (((uintptr_t) buf & 8) && len >= 8) + { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } +#endif + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + + /* First vector chunk. */ + __m128i x0 = _mm_loadu_si128((const __m128i *) buf), + y0; + __m128i x1 = _mm_loadu_si128((const __m128i *) (buf + 16)), + y1; + __m128i x2 = _mm_loadu_si128((const __m128i *) (buf + 32)), + y2; + __m128i x3 = _mm_loadu_si128((const __m128i *) (buf + 48)), + y3; + __m128i k; + + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + /* Main loop. 
*/ + while (buf <= limit) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_sse42_inline(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d4249..abea0f90eb3 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -30,8 +30,12 @@ #include "port/pg_crc32c.h" -static bool -pg_crc32c_sse42_available(void) +/* + * This gets called on the first call. It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. 
+ */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { unsigned int exx[4] = {0, 0, 0, 0}; @@ -43,18 +47,14 @@ pg_crc32c_sse42_available(void) #error cpuid instruction not available #endif - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ -} - -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_sse42_available()) + if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ + { pg_comp_crc32c = pg_comp_crc32c_sse42; +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK + if ((exx[2] & (1 << 1)) != 0) /* PCLMUL */ + pg_comp_crc32c = pg_comp_crc32c_pclmul; +#endif + } else pg_comp_crc32c = pg_comp_crc32c_sb8; -- 2.48.1
From ebbd072d558574f78bd4489c3431a13fd831f254 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 28 Feb 2025 16:27:30 +0700 Subject: [PATCH v12 3/6] Inline CRC computation for small fixed-length input --- src/include/port/pg_crc32c.h | 21 ++++++- src/include/port/pg_crc32c_sse42_impl.h | 74 +++++++++++++++++++++++++ src/port/pg_crc32c_sse42.c | 46 +-------------- 3 files changed, 96 insertions(+), 45 deletions(-) create mode 100644 src/include/port/pg_crc32c_sse42_impl.h diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 65ebeacf4b1..5ccc79295c0 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -43,12 +43,31 @@ typedef uint32 pg_crc32c; #if defined(USE_SSE42_CRC32C) /* Use Intel SSE4.2 instructions. */ + +#include "pg_crc32c_sse42_impl.h" + #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_sse42((crc), (data), (len))) + ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +static inline +pg_crc32c +pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) +{ + if (__builtin_constant_p(len) && len < 64) + { + /* + * For small constant inputs, inline the computation. This allows the + * compiler to unroll loops. + */ + return pg_comp_crc32c_sse42_inline(crc, data, len); + } + else + return pg_comp_crc32c_sse42(crc, data, len); +} + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. 
*/ diff --git a/src/include/port/pg_crc32c_sse42_impl.h b/src/include/port/pg_crc32c_sse42_impl.h new file mode 100644 index 00000000000..e10ad777618 --- /dev/null +++ b/src/include/port/pg_crc32c_sse42_impl.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_sse42_impl.h + * Inline implementation of CRC computation using SSE 4.2 + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc32c_sse42_impl.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_CRC32C_SSE42_IMPL_H +#define PG_CRC32C_SSE42_IMPL_H + +#include "c.h" + +#include <nmmintrin.h> + +pg_attribute_no_sanitize_alignment() +pg_attribute_target("sse4.2") +static inline +pg_crc32c +pg_comp_crc32c_sse42_inline(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + const unsigned char *pend = p + len; + + /* + * Process eight bytes of data at a time. + * + * NB: We do unaligned accesses here. The Intel architecture allows that, + * and performance testing didn't show any performance gain from aligning + * the begin address. + */ +#ifdef __x86_64__ + while (p + 8 <= pend) + { + crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p)); + p += 8; + } + + /* Process remaining full four bytes if any */ + if (p + 4 <= pend) + { + crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); + p += 4; + } +#else + + /* + * Process four bytes at a time. (The eight byte instruction is not + * available on the 32-bit x86 architecture). + */ + while (p + 4 <= pend) + { + crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); + p += 4; + } +#endif /* __x86_64__ */ + + /* Process any remaining bytes one at a time. 
*/ + while (p < pend) + { + crc = _mm_crc32_u8(crc, *p); + p++; + } + + return crc; +} + +#endif /* PG_CRC32C_SSE42_IMPL_H */ diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31..6a35f7fdc67 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -17,54 +17,12 @@ #include <nmmintrin.h> #include "port/pg_crc32c.h" +#include "port/pg_crc32c_sse42_impl.h" pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) { - const unsigned char *p = data; - const unsigned char *pend = p + len; - - /* - * Process eight bytes of data at a time. - * - * NB: We do unaligned accesses here. The Intel architecture allows that, - * and performance testing didn't show any performance gain from aligning - * the begin address. - */ -#ifdef __x86_64__ - while (p + 8 <= pend) - { - crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p)); - p += 8; - } - - /* Process remaining full four bytes if any */ - if (p + 4 <= pend) - { - crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); - p += 4; - } -#else - - /* - * Process four bytes at a time. (The eight byte instruction is not - * available on the 32-bit x86 architecture). - */ - while (p + 4 <= pend) - { - crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); - p += 4; - } -#endif /* __x86_64__ */ - - /* Process any remaining bytes one at a time. */ - while (p < pend) - { - crc = _mm_crc32_u8(crc, *p); - p++; - } - - return crc; + return pg_comp_crc32c_sse42_inline(crc, data, len); } -- 2.48.1
From e38654507d2efad4b5ad75548e0c388c3db9cfe5 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 28 Feb 2025 18:27:40 +0700 Subject: [PATCH v12 5/6] Use runtime check even when we have SSE 4.2 at compile time This allows us to use PCLMUL for longer inputs. Short inputs are inlined to avoid the indirection through a function pointer. --- configure | 2 +- configure.ac | 2 +- src/include/port/pg_crc32c.h | 15 +++++++++++---- src/port/meson.build | 1 + src/port/pg_crc32c_sse42_choose.c | 2 ++ 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/configure b/configure index 93fddd69981..91c0ffc8272 100755 --- a/configure +++ b/configure @@ -17684,7 +17684,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..a85bdbd4ff6 100644 --- a/configure.ac +++ b/configure.ac @@ -2151,7 +2151,7 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index fe0e1b6b275..26b676dddc9 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -55,22 +55,29 @@ typedef uint32 pg_crc32c; ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); 
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif static inline pg_crc32c pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) { - if (__builtin_constant_p(len) && len < 64) + if (len < 64) { /* - * For small constant inputs, inline the computation. This allows the - * compiler to unroll loops. + * For small inputs, inline the computation to avoid the runtime + * check. This also allows the compiler to unroll loops for constant + * input. */ return pg_comp_crc32c_sse42_inline(crc, data, len); } else - return pg_comp_crc32c_sse42(crc, data, len); + /* For larger inputs, use a runtime check for PCLMUL instructions. */ + return pg_comp_crc32c(crc, data, len); } #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..8d70a4d510e 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -83,6 +83,7 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index abea0f90eb3..89a48c76894 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -55,8 +55,10 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) pg_comp_crc32c = pg_comp_crc32c_pclmul; #endif } +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK else pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif return pg_comp_crc32c(crc, data, len); } -- 2.48.1
From 7a9b94677da30db0f8c296fe71037f65b157bc1c Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 5 Mar 2025 07:52:52 +0700 Subject: [PATCH v12 2/6] Attempt to make benchmark more sensitive to latency --- contrib/test_crc32c/test_crc32c.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c index 28bc42de314..3e5ebad4e39 100644 --- a/contrib/test_crc32c/test_crc32c.c +++ b/contrib/test_crc32c/test_crc32c.c @@ -21,13 +21,13 @@ drive_crc32c(PG_FUNCTION_ARGS) { int64 count = PG_GETARG_INT32(0); int64 num = PG_GETARG_INT32(1); - char* data = malloc((size_t)num); - pg_crc32c crc; + char* data = malloc((size_t)num + 256); + pg_crc32c crc = 0; pg_prng_state state; uint64 seed = 42; pg_prng_seed(&state, seed); /* set random data */ - for (uint64 i = 0; i < num; i++) + for (uint64 i = 0; i < num + 256; i++) { data[i] = pg_prng_uint32(&state) % 255; } @@ -36,11 +36,15 @@ drive_crc32c(PG_FUNCTION_ARGS) while(count--) { - INIT_CRC32C(crc); - COMP_CRC32C(crc, data, num); - FIN_CRC32C(crc); + size_t delta = crc & 7; + + /* make both pointer and length unpredictable */ + COMP_CRC32C(crc, data + delta, num + delta); + /* simulate WAL header */ + COMP_CRC32C(crc, data + delta, 20); } + FIN_CRC32C(crc); free((void *)data); PG_RETURN_INT64((int64_t)crc); -- 2.48.1
From 2d8b2ad3e967231d1498953f6563b36b94977445 Mon Sep 17 00:00:00 2001 From: Paul Amonson <paul.d.amon...@intel.com> Date: Mon, 6 May 2024 08:34:17 -0700 Subject: [PATCH v12 1/6] Add a Postgres SQL function for crc32c benchmarking Add a drive_crc32c() function to use for benchmarking crc32c computation. The function takes 2 arguments: (1) count: num of times CRC32C is computed in a loop. (2) num: #bytes in the buffer to calculate crc over. XXX not for commit Extracted from a patch by Raghuveer Devulapalli --- contrib/meson.build | 1 + contrib/test_crc32c/Makefile | 20 +++++++ contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++ contrib/test_crc32c/meson.build | 34 ++++++++++++ contrib/test_crc32c/sql/test_crc32c.sql | 3 ++ contrib/test_crc32c/test_crc32c--1.0.sql | 1 + contrib/test_crc32c/test_crc32c.c | 47 ++++++++++++++++ contrib/test_crc32c/test_crc32c.control | 4 ++ 8 files changed, 167 insertions(+) create mode 100644 contrib/test_crc32c/Makefile create mode 100644 contrib/test_crc32c/expected/test_crc32c.out create mode 100644 contrib/test_crc32c/meson.build create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql create mode 100644 contrib/test_crc32c/test_crc32c.c create mode 100644 contrib/test_crc32c/test_crc32c.control diff --git a/contrib/meson.build b/contrib/meson.build index 1ba73ebd67a..06673db0625 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -12,6 +12,7 @@ contrib_doc_args = { 'install_dir': contrib_doc_dir, } +subdir('test_crc32c') subdir('amcheck') subdir('auth_delay') subdir('auto_explain') diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile new file mode 100644 index 00000000000..5b747c6184a --- /dev/null +++ b/contrib/test_crc32c/Makefile @@ -0,0 +1,20 @@ +MODULE_big = test_crc32c +OBJS = test_crc32c.o +PGFILEDESC = "test" +EXTENSION = test_crc32c +DATA = test_crc32c--1.0.sql + +first: all + +# test_crc32c.o: CFLAGS+=-g + 
+ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_crc32c +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out new file mode 100644 index 00000000000..dff6bb3133b --- /dev/null +++ b/contrib/test_crc32c/expected/test_crc32c.out @@ -0,0 +1,57 @@ +CREATE EXTENSION test_crc32c; +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; + drive_crc32c +-------------- + 532139994 + 2103623867 + 785984197 + 2686825890 + 3213049059 + 3819630168 + 1389234603 + 534072900 + 2930108140 + 2496889855 + 1475239611 + 136366931 + 3067402116 + 2012717871 + 3682416023 + 2054270645 + 1817339875 + 4100939569 + 1192727539 + 3636976218 + 369764421 + 3161609879 + 1067984880 + 1235066769 + 3138425899 + 648132037 + 4203750233 + 1330187888 + 2683521348 + 1951644495 + 2574090107 + 3904902018 + 3772697795 + 1644686344 + 2868962106 + 3369218491 + 3902689890 + 3456411865 + 141004025 + 1504497996 + 3782655204 + 3544797610 + 3429174879 + 2524728016 + 3935861181 + 25498897 + 692684159 + 345705535 + 2761600287 + 2654632420 + 3945991399 +(51 rows) + diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build new file mode 100644 index 00000000000..d7bec4ba1cb --- /dev/null +++ b/contrib/test_crc32c/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +test_crc32c_sources = files( + 'test_crc32c.c', +) + +if host_system == 'windows' + test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_crc32c', + '--FILEDESC', 'test_crc32c - test code for crc32c library',]) +endif + +test_crc32c = shared_module('test_crc32c', + test_crc32c_sources, + kwargs: contrib_mod_args, +) +contrib_targets += test_crc32c + +install_data( + 'test_crc32c.control', + 
'test_crc32c--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'test_crc32c', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_crc32c', + ], + }, +} diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql new file mode 100644 index 00000000000..95c6dfe4488 --- /dev/null +++ b/contrib/test_crc32c/sql/test_crc32c.sql @@ -0,0 +1,3 @@ +CREATE EXTENSION test_crc32c; + +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql new file mode 100644 index 00000000000..52b9772f908 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c--1.0.sql @@ -0,0 +1 @@ +CREATE FUNCTION drive_crc32c (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c new file mode 100644 index 00000000000..28bc42de314 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.c @@ -0,0 +1,47 @@ +/* select drive_crc32c(1000000, 1024); */ + +#include "postgres.h" +#include "fmgr.h" +#include "port/pg_crc32c.h" +#include "common/pg_prng.h" + +PG_MODULE_MAGIC; + +/* + * drive_crc32c(count: int, num: int) returns bigint + * + * count is the nuimber of loops to perform + * + * num is the number byte in the buffer to calculate + * crc32c over. 
+ */ +PG_FUNCTION_INFO_V1(drive_crc32c); +Datum +drive_crc32c(PG_FUNCTION_ARGS) +{ + int64 count = PG_GETARG_INT32(0); + int64 num = PG_GETARG_INT32(1); + char* data = malloc((size_t)num); + pg_crc32c crc; + pg_prng_state state; + uint64 seed = 42; + pg_prng_seed(&state, seed); + /* set random data */ + for (uint64 i = 0; i < num; i++) + { + data[i] = pg_prng_uint32(&state) % 255; + } + + INIT_CRC32C(crc); + + while(count--) + { + INIT_CRC32C(crc); + COMP_CRC32C(crc, data, num); + FIN_CRC32C(crc); + } + + free((void *)data); + + PG_RETURN_INT64((int64_t)crc); +} diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control new file mode 100644 index 00000000000..878a077ee18 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.control @@ -0,0 +1,4 @@ +comment = 'test' +default_version = '1.0' +module_pathname = '$libdir/test_crc32c' +relocatable = true -- 2.48.1