On Thu, Mar 13, 2025 at 12:50 AM Devulapalli, Raghuveer <raghuveer.devulapa...@intel.com> wrote: > > > > Intel has contributed SSE4.2 CRC32C [1] and AVX-512 CRC32C [2] based on > > similar techniques to postgres. > > > > ...this is a restatement of facts we already know. I'm guessing the intended > > takeaway is "since Intel submitted an implementation to us based on paper A, > > then we are free to separately also use a technique from paper B (which > > cites > > patents)". > > Yes. > > > The original proposal that started this thread is below, and I'd like to > > give that > > author credit for initiating that work > > Yup, that should be fine.
Thank you for confirming. I've attached v10, which has mostly polishing and comment writing, and a draft commit message. The lookup table and software carryless multiplication routine are still in pg_crc32c_sb.c , which is now built unconditionally. That's good foreshadowing of future pclmul/pmull support, as I've found building that file everywhere makes some things simpler anyway. That file has become a bit of a misnomer, and I've thought of renaming it to *_common.c or perhaps *_fallback.c , since the addition from this patch is still kind of a fallback where we won't have the hardware needed for faster algorithms, as discussed elsewhere. 0002-3 puts the relevant parts into a header so that the hardware details can be abstracted away. These would be squashed, but I've kept them separate here for comparison. -- John Naylor Amazon Web Services
From f246bba2d9090116f8914c20114ecc3f4b9daeea Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 18 Mar 2025 12:56:32 +0700 Subject: [PATCH v10 2/3] Use template file for parallel CRC computation --- src/port/pg_crc32c_armv8.c | 65 ++++++++++++++++------------------ src/port/pg_crc32c_parallel.h | 66 +++++++++++++++++++++++++++++++++++ src/port/pg_crc32c_sb8.c | 2 ++ src/port/pg_crc32c_sse42.c | 65 +++++++--------------------------- 4 files changed, 112 insertions(+), 86 deletions(-) create mode 100644 src/port/pg_crc32c_parallel.h diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 0265a2a13d7..4767406bef3 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -18,13 +18,42 @@ #include "port/pg_crc32c.h" +#define DEBUG_CRC /* XXX not for commit */ + +static pg_crc32c pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len); + + pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + pg_crc32c crc0 = crc; + +#ifdef DEBUG_CRC + const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len; +#endif + +/* min size to compute multiple segments in parallel */ +#define MIN_PARALLEL_LENGTH 600 + +#define PG_CRC32C_1B(c, w) __crc32cb(c, w) +#define PG_CRC32C_8B(c, w) __crc32cd(c, w) +#include "pg_crc32c_parallel.h" + + crc0 = pg_comp_crc32c_armv8_tail(crc0, p, len); + +#ifdef DEBUG_CRC + Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len)); +#endif + + return crc0; +} + +static pg_crc32c +pg_comp_crc32c_armv8_tail(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; - const size_t min_blocklen = 42; /* Min size to consider interleaving */ - const pg_crc32c orig_crc = crc; // XXX not for commit /* * ARMv8 doesn't require alignment, but aligned memory access is @@ -50,36 +79,6 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) p += 4; } - /* See pg_crc32c_sse42.c for explanation */ - while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend) - { - const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER); - const uint64 *in64 = (const uint64 *) (p); - pg_crc32c crc0 = crc, - crc1 = 0, - crc2 = 0; - uint64 mul0, - mul1, - precompute; - - for (int i = 0; i < block_len; i++, in64++) - { - crc0 = __crc32cd(crc0, *(in64)); - crc1 = __crc32cd(crc1, *(in64 + block_len)); - crc2 = __crc32cd(crc2, *(in64 + block_len * 2)); - } - - precompute = combine_crc_lookup[block_len - 1]; - mul0 = pg_clmul(crc0, (uint32) precompute); - mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); - - crc0 = __crc32cd(0, mul0); - crc1 = __crc32cd(0, mul1); - crc = crc0 ^ crc1 ^ crc2; - - p += block_len * CRC_BYTES_PER_ITER; - } - /* Process eight bytes at a time, as far as we can. */ while (p + 8 <= pend) { @@ -103,7 +102,5 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) crc = __crc32cb(crc, *p); } - // XXX not for commit - Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len)); return crc; } diff --git a/src/port/pg_crc32c_parallel.h b/src/port/pg_crc32c_parallel.h new file mode 100644 index 00000000000..caee564726e --- /dev/null +++ b/src/port/pg_crc32c_parallel.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_parallel.h + * Hardware-independent template for parallel CRC computation. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * src/port/pg_crc32c_parallel.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_CRC32C_H +#define PG_CRC32C_H + +if (unlikely(len >= MIN_PARALLEL_LENGTH)) +{ + /* + * Align pointer regardless of architecture to avoid straddling cacheline + * boundaries, since we issue three loads per loop iteration below. + */ + for (; (uintptr_t) p & 7; len--) + crc0 = PG_CRC32C_1B(crc0, *p++); + + /* + * A CRC instruction can be issued every cycle on many architectures, but + * the latency of its result will take several cycles. We can take + * advantage of this by dividing the input into 3 equal blocks and + * computing the CRC of each independently. + */ + while (len >= MIN_PARALLEL_LENGTH) + { + const size_t block_len = Min(CRC_MAX_BLOCK_LEN, + len / CRC_BYTES_PER_ITER); + const uint64 *in64 = (const uint64 *) (p); + pg_crc32c crc1 = 0, + crc2 = 0; + uint64 mul0, + mul1, + precompute; + + for (int i = 0; i < block_len; i++, in64++) + { + crc0 = PG_CRC32C_8B(crc0, *(in64)); + crc1 = PG_CRC32C_8B(crc1, *(in64 + block_len)); + crc2 = PG_CRC32C_8B(crc2, *(in64 + block_len * 2)); + } + + /* + * Combine the partial CRCs using carryless multiplication on + * pre-computed length-specific constants. + */ + precompute = combine_crc_lookup[block_len - 1]; + mul0 = pg_clmul(crc0, (uint32) precompute); + mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); + crc0 = PG_CRC32C_8B(0, mul0); + crc0 ^= PG_CRC32C_8B(0, mul1); + crc0 ^= crc2; + + p += block_len * CRC_BYTES_PER_ITER; + len -= block_len * CRC_BYTES_PER_ITER; + } +} + +#endif /* PG_CRC32C_H */ diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c index 004fe92d70b..d0ec8c5bed0 100644 --- a/src/port/pg_crc32c_sb8.c +++ b/src/port/pg_crc32c_sb8.c @@ -1169,6 +1169,8 @@ static const uint32 pg_crc32c_table[8][256] = { }; +/* platform-independent infrastructure for parallel CRC computation */ + /* * Carryless multiplication in software */ diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index f674d3f71d7..3fe8716601f 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -18,8 +18,7 @@ #include "port/pg_crc32c.h" -/* min size to compute multiple segments in parallel */ -#define MIN_PARALLEL_LENGTH 600 +#define DEBUG_CRC /* XXX not for commit */ static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len); @@ -31,64 +30,26 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) const unsigned char *p = data; pg_crc32c crc0 = crc; - /* XXX not for commit */ +#ifdef DEBUG_CRC const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len; +#endif #if SIZEOF_VOID_P >= 8 - if (unlikely(len >= MIN_PARALLEL_LENGTH)) - { - /* - * Align pointer to avoid straddling cacheline boundaries, since we - * issue three loads per loop iteration below. - */ - for (; (uintptr_t) p & 7; len--) - crc0 = _mm_crc32_u8(crc0, *p++); - - /* - * A CRC instruction can be issued every cycle but the latency of its - * result will take several cycles. We can take advantage of this by - * dividing the input into 3 equal blocks and computing the CRC of - * each independently. - */ - while (len >= MIN_PARALLEL_LENGTH) - { - const size_t block_len = Min(CRC_MAX_BLOCK_LEN, - len / CRC_BYTES_PER_ITER); - const uint64 *in64 = (const uint64 *) (p); - pg_crc32c crc1 = 0, - crc2 = 0; - uint64 mul0, - mul1, - precompute; - - for (int i = 0; i < block_len; i++, in64++) - { - crc0 = _mm_crc32_u64(crc0, *(in64)); - crc1 = _mm_crc32_u64(crc1, *(in64 + block_len)); - crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2)); - } - - /* - * Combine the partial CRCs using carryless multiplication on - * pre-computed length-specific constants. - */ - precompute = combine_crc_lookup[block_len - 1]; - mul0 = pg_clmul(crc0, (uint32) precompute); - mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); - crc0 = _mm_crc32_u64(0, mul0); - crc0 ^= _mm_crc32_u64(0, mul1); - crc0 ^= crc2; - - p += block_len * CRC_BYTES_PER_ITER; - len -= block_len * CRC_BYTES_PER_ITER; - } - } + +/* min size to compute multiple segments in parallel */ +#define MIN_PARALLEL_LENGTH 600 + +#define PG_CRC32C_1B(c, w) _mm_crc32_u8(c, w) +#define PG_CRC32C_8B(c, w) _mm_crc32_u64(c, w) +#include "pg_crc32c_parallel.h" + #endif crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len); - /* XXX not for commit */ +#ifdef DEBUG_CRC Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len)); +#endif return crc0; } -- 2.48.1
From 2306c2a94e1610b76a1b7584bbaea63d8bf57302 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 11 Dec 2024 11:30:33 +0700 Subject: [PATCH v10 1/3] Execute hardware CRC computation in parallel CRC computations on the current input word depend not only on that input, but also on the CRC of the previous input. This means that the speed is limited by the latency of the CRC instruction. Most modern CPUs can start executing a new CRC instruction before a currently executing one has finished, i.e. the reciprocal throughput is lower than latency. By computing partial CRCs of non-overlapping segments of the input, we can achieve the full throughput that the CPU is capable of. To preserve the correctness of the result, however, we must recombine the partial results using carryless multiplication with constants specific to the input length. We get these from a lookup table of pre-computed CRCs. Because of the overhead of the recombinination step, parallelism is only faster with inputs of at least a few hundred bytes. For now we only implement parallelism for x86 and Arm. It might be worthwhile to apply this technique to LoongArch, depending on the throughput of CRC on that platform. XXX The lookup table and supporting code is found in pg_crc32c_sb.c, which is now built unconditionally on all platforms. Perhaps s/sb8/common/ ? This technique originated from the Intel white paper "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction", by Vinodh Gopal et al, 2011. Thanks to Raghuveer Devulapalli for assistance in verifying the usability of this technique from a legal perspective. Xiang Gao's original proposal was specific to the Arm architecture, computed in fixed-size chunks of 1024 bytes, and required hardware support for carryless multiplication. I added support for x86 and a wider range of chunk sizes, and switched to pure C for carryless multiplication. The portability of the latter is important for two reasons: 1) We may want to use this technique on architectures that don't have hardware carryless multiplication and 2) This is intended as a fallback, since if hardware carryless multiplication is available, there are other algorithms that are useful on much smaller inputs than this one. Author: Xiang Gao <xiang....@arm.com> Author: John Naylor <johncnaylo...@gmail.com> Reviewed-by: Nathan Bossart <nathandboss...@gmail.com> Discussion: https://postgr.es/m/db9pr08mb6991329a73923bf8ed4b3422f5...@db9pr08mb6991.eurprd08.prod.outlook.com --- configure | 5 +- configure.ac | 5 +- src/include/port/pg_crc32c.h | 12 ++++ src/port/Makefile | 1 + src/port/meson.build | 6 +- src/port/pg_crc32c_armv8.c | 34 +++++++++++ src/port/pg_crc32c_sb8.c | 111 +++++++++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42.c | 77 +++++++++++++++++++++++- 8 files changed, 239 insertions(+), 12 deletions(-) diff --git a/configure b/configure index 93fddd69981..3403bb0c931 100755 --- a/configure +++ b/configure @@ -17692,7 +17692,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else @@ -17708,7 +17708,7 @@ $as_echo "ARMv8 CRC instructions" >&6; } $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else @@ -17723,7 +17723,6 @@ $as_echo "LoongArch CRCC instructions" >&6; } $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } fi diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..855949b7d74 100644 --- a/configure.ac +++ b/configure.ac @@ -2156,7 +2156,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then @@ -2166,7 +2166,7 @@ else else if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else if test x"$USE_LOONGARCH_CRC32C" = x"1"; then @@ -2175,7 +2175,6 @@ else AC_MSG_RESULT(LoongArch CRCC instructions) else AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" AC_MSG_RESULT(slicing-by-8) fi fi diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 65ebeacf4b1..e6c149c71f1 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -41,6 +41,8 @@ typedef uint32 pg_crc32c; #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); + #if defined(USE_SSE42_CRC32C) /* Use Intel SSE4.2 instructions. */ #define COMP_CRC32C(crc, data, len) \ @@ -107,4 +109,14 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len) #endif +/* semi-private to files in src/port that compute CRCs in parallel */ + +#define CRC_BYTES_PER_ITER (3 * sizeof(uint64)) +/* for parallel computation, max number of words per block for recombination */ +#define CRC_MAX_BLOCK_LEN 350 + +extern PGDLLIMPORT const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN]; + +extern uint64 pg_clmul(uint32 a, uint32 b); + #endif /* PG_CRC32C_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..a8e2467a866 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_crc32c_sb8.o \ pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..8aed1de2d1d 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_crc32c_sb8.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', @@ -84,19 +85,14 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], - - # generic fallback - ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] pgport_cflags = {'crc': cflags_crc} diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 5ba070bb99d..0265a2a13d7 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -23,6 +23,8 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; + const size_t min_blocklen = 42; /* Min size to consider interleaving */ + const pg_crc32c orig_crc = crc; // XXX not for commit /* * ARMv8 doesn't require alignment, but aligned memory access is @@ -48,6 +50,36 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) p += 4; } + /* See pg_crc32c_sse42.c for explanation */ + while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend) + { + const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER); + const uint64 *in64 = (const uint64 *) (p); + pg_crc32c crc0 = crc, + crc1 = 0, + crc2 = 0; + uint64 mul0, + mul1, + precompute; + + for (int i = 0; i < block_len; i++, in64++) + { + crc0 = __crc32cd(crc0, *(in64)); + crc1 = __crc32cd(crc1, *(in64 + block_len)); + crc2 = __crc32cd(crc2, *(in64 + block_len * 2)); + } + + precompute = combine_crc_lookup[block_len - 1]; + mul0 = pg_clmul(crc0, (uint32) precompute); + mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); + + crc0 = __crc32cd(0, mul0); + crc1 = __crc32cd(0, mul1); + crc = crc0 ^ crc1 ^ crc2; + + p += block_len * CRC_BYTES_PER_ITER; + } + /* Process eight bytes at a time, as far as we can. */ while (p + 8 <= pend) { @@ -71,5 +103,7 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) crc = __crc32cb(crc, *p); } + // XXX not for commit + Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len)); return crc; } diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c index 19659d186a0..004fe92d70b 100644 --- a/src/port/pg_crc32c_sb8.c +++ b/src/port/pg_crc32c_sb8.c @@ -1167,3 +1167,114 @@ static const uint32 pg_crc32c_table[8][256] = { } #endif /* WORDS_BIGENDIAN */ }; + + +/* + * Carryless multiplication in software + */ +uint64 +pg_clmul(uint32 a, uint32 b) +{ + uint64 result = 0; + + for (uint32 i = 0; i < 32; i++) + if ((a >> i) & 1) + result ^= (uint64) b << i; + + return result; +} + +/* + * Lookup table for combining partial CRC computations + */ +const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN] = +{ + 0x00000001493c7d27, 0x493c7d27ba4fc28e, 0xf20c0dfeddc0152b, 0xba4fc28e9e4addf8, + 0x3da6d0cb39d3b296, 0xddc0152b0715ce53, 0x1c291d0447db8317, 0x9e4addf80d3b6092, + 0x740eef02c96cfdc0, 0x39d3b296878a92a7, 0x083a6eecdaece73e, 0x0715ce53ab7aff2a, + 0xc49f4f672162d385, 0x47db831783348832, 0x2ad91c30299847d5, 0x0d3b6092b9e02b86, + 0x6992cea218b33a4e, 0xc96cfdc0b6dd949b, 0x7e90804878d9ccb7, 0x878a92a7bac2fd7b, + 0x1b3d8f29a60ce07b, 0xdaece73ece7f39f4, 0xf1d0f55e61d82e56, 0xab7aff2ad270f1a2, + 0xa87ab8a8c619809d, 0x2162d3852b3cac5d, 0x8462d80065863b64, 0x833488321b03397f, + 0x71d111a8ebb883bd, 0x299847d5b3e32c28, 0xffd852c6064f7f26, 0xb9e02b86dd7e3b0c, + 0xdcb17aa4f285651c, 0x18b33a4e10746f3c, 0xf37c5aeec7a68855, 0xb6dd949b271d9844, + 0x6051d5a28e766a0c, 0x78d9ccb793a5f730, 0x18b0d4ff6cb08e5c, 0xbac2fd7b6b749fb2, + 0x21f3d99c1393e203, 0xa60ce07bcec3662e, 0x8f15801496c515bb, 0xce7f39f4e6fc4e6a, + 0xa00457f78227bb8a, 0x61d82e56b0cd4768, 0x8d6d2c4339c7ff35, 0xd270f1a2d7a4825c, + 0x00ac29cf0ab3844b, 0xc619809d0167d312, 0xe9adf796f6076544, 0x2b3cac5d26f6a60a, + 0x96638b34a741c1bf, 0x65863b6498d8d9cb, 0xe0e9f35149c3cc9c, 0x1b03397f68bce87a, + 0x9af01f2d57a3d037, 0xebb883bd6956fc3b, 0x2cff42cf42d98888, 0xb3e32c283771e98f, + 0x88f25a3ab42ae3d9, 0x064f7f262178513a, 0x4e36f0b0e0ac139e, 0xdd7e3b0c170076fa, + 0xbd6f81f8444dd413, 0xf285651c6f345e45, 0x91c9bd4b41d17b64, 0x10746f3cff0dba97, + 0x885f087ba2b73df1, 0xc7a68855f872e54c, 0x4c1449321e41e9fc, 0x271d984486d8e4d2, + 0x52148f02651bd98b, 0x8e766a0c5bb8f1bc, 0xa3c6f37aa90fd27a, 0x93a5f730b3af077a, + 0xd7c0557f4984d782, 0x6cb08e5cca6ef3ac, 0x63ded06a234e0b26, 0x6b749fb2dd66cbbb, + 0x4d56973c4597456a, 0x1393e203e9e28eb4, 0x9669c9df7b3ff57a, 0xcec3662ec9c8b782, + 0xe417f38a3f70cc6f, 0x96c515bb93e106a4, 0x4b9e0f7162ec6c6d, 0xe6fc4e6ad813b325, + 0xd104b8fc0df04680, 0x8227bb8a2342001e, 0x5b3977300a2a8d7e, 0xb0cd47686d9a4957, + 0xe78eb416e8b6368b, 0x39c7ff35d2c3ed1a, 0x61ff0e01995a5724, 0xd7a4825c9ef68d35, + 0x8d96551c0c139b31, 0x0ab3844bf2271e60, 0x0bf80dd20b0bf8ca, 0x0167d3122664fd8b, + 0x8821abeded64812d, 0xf607654402ee03b2, 0x6a45d2b28604ae0f, 0x26f6a60a363bd6b3, + 0xd8d26619135c83fd, 0xa741c1bf5fabe670, 0xde87806c35ec3279, 0x98d8d9cb00bcf5f6, + 0x143387548ae00689, 0x49c3cc9c17f27698, 0x5bd2011f58ca5f00, 0x68bce87aaa7c7ad5, + 0xdd07448eb5cfca28, 0x57a3d037ded288f8, 0xdde8f5b959f229bc, 0x6956fc3b6d390dec, + 0xa3e3e02c37170390, 0x42d988886353c1cc, 0xd73c7beac4584f5c, 0x3771e98ff48642e9, + 0x80ff0093531377e2, 0xb42ae3d9dd35bc8d, 0x8fe4c34db25b29f2, 0x2178513a9a5ede41, + 0xdf99fc11a563905d, 0xe0ac139e45cddf4e, 0x6c23e841acfa3103, 0x170076faa51b6135, + 0xfe314258dfd94fb2, 0x444dd41380f2886b, 0x0d8373a067969a6a, 0x6f345e45021ac5ef, + 0x19e3635ee8310afa, 0x41d17b6475451b04, 0x29f268b48e1450f7, 0xff0dba97cbbe4ee1, + 0x1dc0632a3a83de21, 0xa2b73df1e0cdcf86, 0x1614f396453c1679, 0xf872e54cdefba41c, + 0x9e2993d3613eee91, 0x1e41e9fcddaf5114, 0x6bebd73c1f1dd124, 0x86d8e4d2bedc6ba1, + 0x63ae91e6eca08ffe, 0x651bd98b3ae30875, 0xf8c9da7a0cd1526a, 0x5bb8f1bcb1630f04, + 0x945a19c1ff47317b, 0xa90fd27ad6c3a807, 0xee8213b79a7781e0, 0xb3af077a63d097e9, + 0x93781dc71d31175f, 0x4984d78294eb256e, 0xccc4a1b913184649, 0xca6ef3ac4be7fd90, + 0xa2c2d9717d5c1d64, 0x234e0b2680ba859a, 0x1cad44526eeed1c9, 0xdd66cbbb22c3799f, + 0x74922601d8ecc578, 0x4597456ab3a6da94, 0xc55f7eabcaf933fe, 0xe9e28eb450bfaade, + 0xa19623292e7d11a7, 0x7b3ff57a7d14748f, 0x2d37074932d8041c, 0xc9c8b782889774e1, + 0x397d84a16cc8a0ff, 0x3f70cc6f5aa1f3cf, 0x791132708a074012, 0x93e106a433bc58b3, + 0xbc8178039f2b002a, 0x62ec6c6dbd0bb25f, 0x88eb3c0760bf0a6a, 0xd813b3258515c07f, + 0x6e4cb6303be3c09b, 0x0df04680d8440525, 0x71971d5c682d085d, 0x2342001e465a4eee, + 0xf33b8bc628b5de82, 0x0a2a8d7e077d54e0, 0x9fb3bbc02e5f3c8c, 0x6d9a4957c00df280, + 0x6ef22b23d0a37f43, 0xe8b6368ba52f58ec, 0xce2df76800712e86, 0xd2c3ed1ad6748e82, + 0xe53a4fc747972100, 0x995a572451aeef66, 0xbe60a91a71900712, 0x9ef68d35359674f7, + 0x1dfa0a15647fbd15, 0x0c139b311baaa809, 0x8ec52396469aef86, 0xf2271e6086d42d06, + 0x0e766b114aba1470, 0x0b0bf8ca1c2cce0a, 0x475846a4aa0cd2d3, 0x2664fd8bf8448640, + 0xb2a3dfa6ac4fcdec, 0xed64812de81cf154, 0xdc1a160cc2c7385c, 0x02ee03b295ffd7dc, + 0x79afdf1c91de6176, 0x8604ae0f84ee89ac, 0x07ac6e46533e308d, 0x363bd6b35f0e0438, + 0x15f85253604d6e09, 0x135c83fdaeb3e622, 0x1bec24dd4263eb04, 0x5fabe67050c2cb16, + 0x4c36cd5b6667afe7, 0x35ec32791a6889b8, 0xe0a22e29de42c92a, 0x00bcf5f67f47463d, + 0x7c2b6ed9b82b6080, 0x8ae00689828d550b, 0x06ff88fddca2b4da, 0x17f276984ac726eb, + 0xf7317cf0529295e6, 0x58ca5f005e9f28eb, 0x61b6e40b40c14fff, 0xaa7c7ad596a1f19b, + 0xde8a97f8997157e1, 0xb5cfca28b0ed8196, 0x88f61445097e41e6, 0xded288f84ce8bfe5, + 0xd4520e9ee36841ad, 0x59f229bcd1a9427c, 0x0c592bd593f3319c, 0x6d390decb58ac6fe, + 0x38edfaf3e3809241, 0x37170390f22fd3e2, 0x72cbfcdb83c2df88, 0x6353c1ccd6b1825a, + 0x348331a54e4ff232, 0xc4584f5c6664d9c1, 0xc3977c19836b5a6e, 0xf48642e923d5e7e5, + 0xdafaea7c65065343, 0x531377e21495d20d, 0x73db4c04a29c82eb, 0xdd35bc8df370b37f, + 0x72675ce8ea6dd7dc, 0xb25b29f2e9415bce, 0x3ec2ff8396309b0f, 0x9a5ede41c776b648, + 0xe8c7a017c22c52c5, 0xa563905dcecfcd43, 0xcf4bfaefd8311ee7, 0x45cddf4e24e6fe8f, + 0x6bde1ac7d0c6d7c9, 0xacfa310345aa5d4a, 0xae1175c2cf067065, 0xa51b613582f89c77, + 0xf7506984a348c84e, 0xdfd94fb2d07737ea, 0xe0863e5636069dd2, 0x80f2886bc4cedd32, + 0xd7e661ae9a97be47, 0x67969a6af45cd585, 0x01afc14f93f36e2b, 0x021ac5ef195bc82d, + 0xd2fd8e3ce622aaca, 0xe8310afa23912612, 0xc4eb27b2a1fd0859, 0x75451b04a2edbd17, + 0x632098732cefbfdd, 0x8e1450f7f36d84e2, 0xf29971cf9664532d, 0xcbbe4ee1cfeff4b3, + 0xaf6939d96737eead, 0x3a83de21f52d28d3, 0x650ef6c5fb3bb2c8, 0xe0cdcf864a9d4498, + 0x36e108faaef471c1, 0x453c16790d08bb68, 0x09c20a6c3b6c03be, 0xdefba41c4de20a7c, + 0x0a1a6a8877792405, 0x613eee91b95a9eb0, 0x286d109b11f2bc8f, 0xddaf51147956e76a, + 0x9fd51b88032a8058, 0x1f1dd1241b93589c, 0x4860285dcc66546f, 0xbedc6ba1005bb964, + 0x6e221adc28198362, 0xeca08ffe3f2e57b1, 0x0e0a10735f54bb14, 0x3ae3087599e44d01, + 0x37f194212591f073, 0x0cd1526a0871bd30, 0xe9bbf6481fb48d12, 0xb1630f043888ed03, + 0x0fa0277f1e22167e, 0xff47317bd272cadf, 0xeb2fb89a8653215c, 0xd6c3a807f6e6d69e, + 0x1c47ed30b9f5bf62, 0x9a7781e09dde6145, 0x271cfb40ed49c4d3, 0x63d097e91ad321bc, + 0xea1cb6e7f206e4b8, 0x1d31175f60165873, 0x9f737f83159dff70, 0x94eb256ee1a468d0, + 0xd4619bbcda11f51b, 0x1318464993189d18, 0x794dd0f2ac4f4691, 0x4be7fd9087d07ae0, + 0xb6d42cd90db9f589, 0x7d5c1d64c51a240c, 0x8b9be230ab819fbb, 0x80ba859a5cc19671, + 0xd4617a4c46183f0a, 0x6eeed1c993e7c448, 0xb9f93bd067fe6e36, 0x22c3799fca110698, + 0xb8b67c1c8a2acc83, 0xd8ecc578eb75a090, 0xc5ca433a3e18bd99, 0xb3a6da94480e7e4d, + 0x5e5dcd9560bced33, 0xcaf933fe0bf69a3f, 0x7b589372dbf59471, 0x50bfaadea00bae3d, + 0x0afb7f3b4b7df256, 0x2e7d11a77959fe2a, 0x0f97c69068e3d179, 0x7d14748fa160e585, + 0x3e254fe4cac39a0b, 0x32d8041cf4229b7a, 0x141e8512007ca0f9, 0x889774e10e0126b2, + 0xe5e25bd082fe946e, 0x6cc8a0ff013b3856, 0xacf1231667c69966, 0x5aa1f3cffa0f2bd0, + 0x7b454cb35d4c91fc, 0x8a07401279d95f64, 0x311709b8807121c0, 0x33bc58b3b0a3f16d, + 0x9948a7d2618e0996, 0x9f2b002a2308bca9, 0x809cef1f343272b3, 0xbd0bb25ff5c40599, + 0x11dde5b740c1c64c, 0x60bf0a6a71cc89e8, +}; diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31..f674d3f71d7 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -18,10 +18,85 @@ #include "port/pg_crc32c.h" -pg_attribute_no_sanitize_alignment() +/* min size to compute multiple segments in parallel */ +#define MIN_PARALLEL_LENGTH 600 + +static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len); + + pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + pg_crc32c crc0 = crc; + + /* XXX not for commit */ + const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len; + +#if SIZEOF_VOID_P >= 8 + if (unlikely(len >= MIN_PARALLEL_LENGTH)) + { + /* + * Align pointer to avoid straddling cacheline boundaries, since we + * issue three loads per loop iteration below. + */ + for (; (uintptr_t) p & 7; len--) + crc0 = _mm_crc32_u8(crc0, *p++); + + /* + * A CRC instruction can be issued every cycle but the latency of its + * result will take several cycles. We can take advantage of this by + * dividing the input into 3 equal blocks and computing the CRC of + * each independently. + */ + while (len >= MIN_PARALLEL_LENGTH) + { + const size_t block_len = Min(CRC_MAX_BLOCK_LEN, + len / CRC_BYTES_PER_ITER); + const uint64 *in64 = (const uint64 *) (p); + pg_crc32c crc1 = 0, + crc2 = 0; + uint64 mul0, + mul1, + precompute; + + for (int i = 0; i < block_len; i++, in64++) + { + crc0 = _mm_crc32_u64(crc0, *(in64)); + crc1 = _mm_crc32_u64(crc1, *(in64 + block_len)); + crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2)); + } + + /* + * Combine the partial CRCs using carryless multiplication on + * pre-computed length-specific constants. + */ + precompute = combine_crc_lookup[block_len - 1]; + mul0 = pg_clmul(crc0, (uint32) precompute); + mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); + crc0 = _mm_crc32_u64(0, mul0); + crc0 ^= _mm_crc32_u64(0, mul1); + crc0 ^= crc2; + + p += block_len * CRC_BYTES_PER_ITER; + len -= block_len * CRC_BYTES_PER_ITER; + } + } +#endif + + crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len); + + /* XXX not for commit */ + Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len)); + + return crc0; +} + +pg_attribute_no_sanitize_alignment() +pg_attribute_target("sse4.2") +static pg_crc32c +pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; -- 2.48.1
From 93b46829b80d33be5afee1156dbdf074f097fbdd Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 18 Mar 2025 18:41:35 +0700 Subject: [PATCH v10 3/3] Fix headerscheck --- src/tools/pginclude/headerscheck | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 9e86d049362..3883ec558e5 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -122,6 +122,9 @@ do test "$f" = src/include/nodes/nodetags.h && continue test "$f" = src/backend/nodes/nodetags.h && continue + # pg_crc32c_parallel.h contains just a code fragment + test "$f" = src/port/pg_crc32c_parallel.h && continue + # These files are not meant to be included standalone, because # they contain lists that might have multiple use-cases. test "$f" = src/include/access/rmgrlist.h && continue -- 2.48.1