On Thu, Feb 13, 2025 at 5:19 AM Nathan Bossart <nathandboss...@gmail.com> wrote: > > On Wed, Feb 12, 2025 at 10:12:20PM +0000, Devulapalli, Raghuveer wrote: > >> Well, I suspect the AVX-512 version will pretty much always need the > >> runtime > >> check given that it's not available on a lot of newer hardware and requires > >> a > >> bunch of extra runtime checks (see pg_popcount_avx512.c). But it might be > >> worth doing for PCLMUL. Otherwise, I think we'd have to leave out the > >> PCLMUL > >> optimizations if built with -msse4.2 -mpclmul because we don't want to > >> regress > >> existing -msse4.2 users with a runtime check. > > > > Sounds good to me. Although, users building with just -msse4.2 will now > > encounter > > an additional pclmul runtime check. That would be a regression unless they > > update to > > building with both -msse4.2 and -mpclmul. > > My thinking was that building with just -msse4.2 would cause the existing > SSE 4.2 implementation to be used (without the function pointer). That's > admittedly a bit goofy because they'd miss out on the PCLMUL optimization, > but things at least don't get any worse for them.
I tried using branching for the runtime check, and this looks like the way to go: - Existing -msse4.2 builders will still call directly, but inside the function there is a length check and only for long input will it do a runtime check for pclmul. - This smooths the way for -msse4.2 (and the equivalent on Arm) to inline calls with short constant input (e.g. WAL insert lock), although I've not done that here. - This can be a simple starting point for consolidating runtime checks, as was proposed for popcount in the AVX-512 CRC thread, but with branching. My model was Andres' sketch here: https://www.postgresql.org/message-id/20240731023918.ixsfbeuub6e76one%40awork3.anarazel.de -- John Naylor Amazon Web Services
From f327b7fcb588100d2dc7483369cfd36380210715 Mon Sep 17 00:00:00 2001 From: Paul Amonson <paul.d.amon...@intel.com> Date: Mon, 6 May 2024 08:34:17 -0700 Subject: [PATCH v6 2/3] Add a Postgres SQL function for crc32c benchmarking Add a drive_crc32c() function to use for benchmarking crc32c computation. The function takes 2 arguments: (1) count: num of times CRC32C is computed in a loop. (2) num: #bytes in the buffer to calculate crc over. XXX not for commit Extracted from a patch by Raghuveer Devulapalli --- contrib/meson.build | 1 + contrib/test_crc32c/Makefile | 20 +++++++ contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++ contrib/test_crc32c/meson.build | 34 ++++++++++++ contrib/test_crc32c/sql/test_crc32c.sql | 3 ++ contrib/test_crc32c/test_crc32c--1.0.sql | 1 + contrib/test_crc32c/test_crc32c.c | 47 ++++++++++++++++ contrib/test_crc32c/test_crc32c.control | 4 ++ 8 files changed, 167 insertions(+) create mode 100644 contrib/test_crc32c/Makefile create mode 100644 contrib/test_crc32c/expected/test_crc32c.out create mode 100644 contrib/test_crc32c/meson.build create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql create mode 100644 contrib/test_crc32c/test_crc32c.c create mode 100644 contrib/test_crc32c/test_crc32c.control diff --git a/contrib/meson.build b/contrib/meson.build index 1ba73ebd67..06673db062 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -12,6 +12,7 @@ contrib_doc_args = { 'install_dir': contrib_doc_dir, } +subdir('test_crc32c') subdir('amcheck') subdir('auth_delay') subdir('auto_explain') diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile new file mode 100644 index 0000000000..5b747c6184 --- /dev/null +++ b/contrib/test_crc32c/Makefile @@ -0,0 +1,20 @@ +MODULE_big = test_crc32c +OBJS = test_crc32c.o +PGFILEDESC = "test" +EXTENSION = test_crc32c +DATA = test_crc32c--1.0.sql + +first: all + +# test_crc32c.o: CFLAGS+=-g + +ifdef 
USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_crc32c +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out new file mode 100644 index 0000000000..dff6bb3133 --- /dev/null +++ b/contrib/test_crc32c/expected/test_crc32c.out @@ -0,0 +1,57 @@ +CREATE EXTENSION test_crc32c; +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; + drive_crc32c +-------------- + 532139994 + 2103623867 + 785984197 + 2686825890 + 3213049059 + 3819630168 + 1389234603 + 534072900 + 2930108140 + 2496889855 + 1475239611 + 136366931 + 3067402116 + 2012717871 + 3682416023 + 2054270645 + 1817339875 + 4100939569 + 1192727539 + 3636976218 + 369764421 + 3161609879 + 1067984880 + 1235066769 + 3138425899 + 648132037 + 4203750233 + 1330187888 + 2683521348 + 1951644495 + 2574090107 + 3904902018 + 3772697795 + 1644686344 + 2868962106 + 3369218491 + 3902689890 + 3456411865 + 141004025 + 1504497996 + 3782655204 + 3544797610 + 3429174879 + 2524728016 + 3935861181 + 25498897 + 692684159 + 345705535 + 2761600287 + 2654632420 + 3945991399 +(51 rows) + diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build new file mode 100644 index 0000000000..d7bec4ba1c --- /dev/null +++ b/contrib/test_crc32c/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +test_crc32c_sources = files( + 'test_crc32c.c', +) + +if host_system == 'windows' + test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_crc32c', + '--FILEDESC', 'test_crc32c - test code for crc32c library',]) +endif + +test_crc32c = shared_module('test_crc32c', + test_crc32c_sources, + kwargs: contrib_mod_args, +) +contrib_targets += test_crc32c + +install_data( + 'test_crc32c.control', + 
'test_crc32c--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'test_crc32c', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_crc32c', + ], + }, +} diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql new file mode 100644 index 0000000000..95c6dfe448 --- /dev/null +++ b/contrib/test_crc32c/sql/test_crc32c.sql @@ -0,0 +1,3 @@ +CREATE EXTENSION test_crc32c; + +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql new file mode 100644 index 0000000000..52b9772f90 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c--1.0.sql @@ -0,0 +1 @@ +CREATE FUNCTION drive_crc32c (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c new file mode 100644 index 0000000000..b350caf5ce --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.c @@ -0,0 +1,47 @@ +/* select drive_crc32c(1000000, 1024); */ + +#include "postgres.h" +#include "fmgr.h" +#include "port/pg_crc32c.h" +#include "common/pg_prng.h" + +PG_MODULE_MAGIC; + +/* + * drive_crc32c(count: int, num: int) returns bigint + * + * count is the nuimber of loops to perform + * + * num is the number byte in the buffer to calculate + * crc32c over. 
+ */ +PG_FUNCTION_INFO_V1(drive_crc32c); +Datum +drive_crc32c(PG_FUNCTION_ARGS) +{ + int64 count = PG_GETARG_INT64(0); + int64 num = PG_GETARG_INT64(1); + char* data = malloc((size_t)num); + pg_crc32c crc; + pg_prng_state state; + uint64 seed = 42; + pg_prng_seed(&state, seed); + /* set random data */ + for (uint64 i = 0; i < num; i++) + { + data[i] = pg_prng_uint32(&state) % 255; + } + + INIT_CRC32C(crc); + + while(count--) + { + INIT_CRC32C(crc); + COMP_CRC32C(crc, data, num); + FIN_CRC32C(crc); + } + + free((void *)data); + + PG_RETURN_INT64((int64_t)crc); +} diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control new file mode 100644 index 0000000000..878a077ee1 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.control @@ -0,0 +1,4 @@ +comment = 'test' +default_version = '1.0' +module_pathname = '$libdir/test_crc32c' +relocatable = true -- 2.48.1
From d3ee691a067fb1f41d3e8e4377df1a67962cf5c7 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 12 Feb 2025 15:27:16 +0700 Subject: [PATCH v6 3/3] Improve CRC32C performance on x86_64 The current SSE4.2 implementation of CRC32C relies on the native CRC32 instruction, which operates on 8 bytes at a time. We can get a substantial speedup on longer inputs by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. The PCLMULQDQ instruction has been widely available since 2011 (almost as old as SSE 4.2), so this commit now requires that, as well as SSE 4.2, to build pg_crc32c_sse42.c. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: John Naylor <johncnaylo...@gmail.com> Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com --- src/include/port/pg_cpu.h | 1 + src/include/port/pg_crc32c.h | 1 + src/port/pg_cpu.c | 3 + src/port/pg_crc32c_sse42.c | 95 +++++++++++++++++++++++++++ src/port/pg_crc32c_sse42_choose.c | 16 +++++ src/test/regress/expected/strings.out | 24 +++++++ src/test/regress/sql/strings.sql | 4 ++ 7 files changed, 144 insertions(+) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index 45ce9d3c50..0d8137ebb3 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -16,6 +16,7 @@ #define PGCPUCAP_INIT (1 << 0) #define PGCPUCAP_CRC32C (1 << 1) +#define PGCPUCAP_CLMUL (1 << 2) extern uint32 pg_cpucap; extern void pg_cpucap_initialize(void); diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index db155d690e..068a653605 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -58,6 +58,7 @@ extern pg_crc32c 
pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len) #endif extern bool pg_crc32c_sse42_available(void); +extern bool pg_crc32c_pclmul_available(void); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) diff --git a/src/port/pg_cpu.c b/src/port/pg_cpu.c index c948335743..52944b2d4e 100644 --- a/src/port/pg_cpu.c +++ b/src/port/pg_cpu.c @@ -31,6 +31,9 @@ pg_cpucap_crc32c(void) if (pg_crc32c_sse42_available()) pg_cpucap |= PGCPUCAP_CRC32C; + if (pg_crc32c_pclmul_available()) + pg_cpucap |= PGCPUCAP_CLMUL; + #elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) if (pg_crc32c_armv8_available()) pg_cpucap |= PGCPUCAP_CRC32C; diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df3..66ddb7ec87 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,9 +15,19 @@ #include "c.h" #include <nmmintrin.h> +#include <wmmintrin.h> #include "port/pg_crc32c.h" +static pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length); + +/* WIP: configure checks */ +#ifdef __x86_64__ +#define HAVE_PCLMUL_RUNTIME +#endif + +#define PCLMUL_THRESHOLD 128 + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c @@ -25,6 +35,17 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; + const pg_crc32c orig_crc = crc; /* XXX not for commit */ + const size_t orig_len = len; + +#ifdef HAVE_PCLMUL_RUNTIME + if (len >= PCLMUL_THRESHOLD && (pg_cpucap & PGCPUCAP_CLMUL)) + { + crc = pg_comp_crc32c_pclmul(crc, data, len); + len %= 64; + p = pend - len; + } +#endif /* * Process eight bytes of data at a time. 
@@ -66,5 +87,79 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) p++; } + /* XXX not for commit */ + Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, orig_len)); + return crc; } + +#ifdef HAVE_PCLMUL_RUNTIME + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i sse -p crc32c -a v4 */ +/* MIT licensed */ + +#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) +#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) + +pg_attribute_target("sse4.2,pclmul") +static pg_crc32c +pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const unsigned char *buf = data; + + if (len >= 64) + { + /* First vector chunk. */ + __m128i x0 = _mm_loadu_si128((const __m128i *) buf), + y0; + __m128i x1 = _mm_loadu_si128((const __m128i *) (buf + 16)), + y1; + __m128i x2 = _mm_loadu_si128((const __m128i *) (buf + 32)), + y2; + __m128i x3 = _mm_loadu_si128((const __m128i *) (buf + 48)), + y3; + __m128i k; + + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + len -= 64; + + /* Main loop. */ + while (len >= 64) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + len -= 64; + } + + /* Reduce x0 ... x3 to just x0. 
*/ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + } + + return crc0; +} + +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index f4d3215bc5..59a5a31c00 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -40,3 +40,19 @@ pg_crc32c_sse42_available(void) return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ } + +bool +pg_crc32c_pclmul_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + + return (exx[2] & (1 << 1)) != 0; /* PCLMUL */ +} diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index b65bb2d536..662bd37ace 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2282,6 +2282,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.'); 419469235 (1 row) +SELECT crc32c(repeat('A', 80)::bytea); + crc32c +------------ + 3799127650 +(1 row) + +SELECT crc32c(repeat('A', 127)::bytea); + crc32c +----------- + 291820082 +(1 row) + +SELECT crc32c(repeat('A', 128)::bytea); + crc32c +----------- + 816091258 +(1 row) + +SELECT crc32c(repeat('A', 129)::bytea); + crc32c +------------ + 4213642571 +(1 row) + -- -- encode/decode -- diff --git a/src/test/regress/sql/strings.sql 
b/src/test/regress/sql/strings.sql index 8e0f3a0e75..26f86dc92e 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -727,6 +727,10 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.'); SELECT crc32c(''); SELECT crc32c('The quick brown fox jumps over the lazy dog.'); +SELECT crc32c(repeat('A', 80)::bytea); +SELECT crc32c(repeat('A', 127)::bytea); +SELECT crc32c(repeat('A', 128)::bytea); +SELECT crc32c(repeat('A', 129)::bytea); -- -- encode/decode -- 2.48.1
From 527e157966e9ce0df8aae6aac8ed833af9cd53fb Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Sat, 15 Feb 2025 19:18:16 +0700 Subject: [PATCH v6 1/3] Dispatch CRC computation by branching rather than indirect calls --- configure | 2 +- configure.ac | 2 +- src/backend/postmaster/postmaster.c | 4 ++ src/bin/pg_basebackup/pg_basebackup.c | 3 + src/bin/pg_basebackup/pg_createsubscriber.c | 3 + src/bin/pg_checksums/pg_checksums.c | 3 + src/bin/pg_combinebackup/pg_combinebackup.c | 3 + src/bin/pg_controldata/pg_controldata.c | 3 + src/bin/pg_ctl/pg_ctl.c | 3 + src/bin/pg_resetwal/pg_resetwal.c | 3 + src/bin/pg_rewind/pg_rewind.c | 3 + src/bin/pg_verifybackup/pg_verifybackup.c | 3 + src/bin/pg_waldump/pg_waldump.c | 3 + src/bin/pg_walsummary/pg_walsummary.c | 4 ++ src/include/port/pg_cpu.h | 23 ++++++ src/include/port/pg_crc32c.h | 78 +++++++++++++++------ src/port/Makefile | 1 + src/port/meson.build | 4 ++ src/port/pg_cpu.c | 54 ++++++++++++++ src/port/pg_crc32c_armv8_choose.c | 26 +------ src/port/pg_crc32c_sse42_choose.c | 26 +------ 21 files changed, 182 insertions(+), 72 deletions(-) create mode 100644 src/include/port/pg_cpu.h create mode 100644 src/port/pg_cpu.c diff --git a/configure b/configure index 0ffcaeb436..41aad7b4d7 100755 --- a/configure +++ b/configure @@ -17352,7 +17352,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else diff --git a/configure.ac b/configure.ac index f56681e0d9..efa8249360 100644 --- a/configure.ac +++ b/configure.ac @@ -2110,7 +2110,7 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + 
PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index bb22b13ade..c218f15f97 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -99,6 +99,7 @@ #include "pg_getopt.h" #include "pgstat.h" #include "port/pg_bswap.h" +#include "port/pg_cpu.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/pgarch.h" @@ -1951,6 +1952,9 @@ InitProcessGlobals(void) #ifndef WIN32 srandom(pg_prng_uint32(&pg_global_prng_state)); #endif + + /* detect CPU capabilities */ + pg_cpucap_initialize(); } /* diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index dc0c805137..8d4b3718b6 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2405,6 +2405,9 @@ main(int argc, char **argv) progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_basebackup")); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index 2d881d54f5..04e550ef75 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -1906,6 +1906,9 @@ main(int argc, char **argv) progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_basebackup")); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index e1acb6e933..eb88aeedb5 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ 
b/src/bin/pg_checksums/pg_checksums.c @@ -453,6 +453,9 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_checksums")); progname = get_progname(argv[0]); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 5864ec574f..ee24dba231 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -166,6 +166,9 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_combinebackup")); handle_help_version_opts(argc, argv, progname, help); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + memset(&opt, 0, sizeof(opt)); opt.manifest_checksums = CHECKSUM_TYPE_CRC32C; opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC; diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index cf11ab3f2e..deb6b16ae6 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -112,6 +112,9 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_controldata")); progname = get_progname(argv[0]); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 8a405ff122..7dc4da932e 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -2226,6 +2226,9 @@ main(int argc, char **argv) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_ctl")); start_time = time(NULL); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + /* * save argv[0] so do_start() can look for the postmaster if necessary. we * don't look for postmaster here because in many cases we won't need it. 
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index ed73607a46..52bcaadf69 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -123,6 +123,9 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_resetwal")); progname = get_progname(argv[0]); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index cae81cd6cb..f6c755883c 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -158,6 +158,9 @@ main(int argc, char **argv) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind")); progname = get_progname(argv[0]); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + /* Process command-line arguments */ if (argc > 1) { diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c index 7c720ab98b..d44a87e83a 100644 --- a/src/bin/pg_verifybackup/pg_verifybackup.c +++ b/src/bin/pg_verifybackup/pg_verifybackup.c @@ -144,6 +144,9 @@ main(int argc, char **argv) memset(&context, 0, sizeof(context)); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 51fb76efc4..10c529a5fa 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -835,6 +835,9 @@ main(int argc, char **argv) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_waldump")); progname = get_progname(argv[0]); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) diff --git a/src/bin/pg_walsummary/pg_walsummary.c b/src/bin/pg_walsummary/pg_walsummary.c index cd7a6b147b..a38565ea6d 100644 --- 
a/src/bin/pg_walsummary/pg_walsummary.c +++ b/src/bin/pg_walsummary/pg_walsummary.c @@ -20,6 +20,7 @@ #include "common/logging.h" #include "fe_utils/option_utils.h" #include "getopt_long.h" +#include "port/pg_cpu.h" typedef struct ws_options { @@ -69,6 +70,9 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_walsummary")); handle_help_version_opts(argc, argv, progname, help); + /* detect CPU capabilities */ + pg_cpucap_initialize(); + /* process command-line options */ while ((c = getopt_long(argc, argv, "iq", long_options, &optindex)) != -1) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h new file mode 100644 index 0000000000..45ce9d3c50 --- /dev/null +++ b/src/include/port/pg_cpu.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu.h + * Runtime detection of CPU capabilities. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * src/include/port/pg_cpu.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_CPU_H +#define PG_CPU_H + +#define PGCPUCAP_INIT (1 << 0) +#define PGCPUCAP_CRC32C (1 << 1) + +extern uint32 pg_cpucap; +extern void pg_cpucap_initialize(void); + +#endif /* PG_CPU_H */ diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 65ebeacf4b..db155d690e 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -34,6 +34,7 @@ #define PG_CRC32C_H #include "port/pg_bswap.h" +#include "port/pg_cpu.h" typedef uint32 pg_crc32c; @@ -41,52 +42,55 @@ typedef uint32 pg_crc32c; #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) -#if defined(USE_SSE42_CRC32C) +#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) /* Use Intel SSE4.2 instructions. 
*/ #define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) +#define COMP_CRC32C_HW(crc, data, len) \ ((crc) = pg_comp_crc32c_sse42((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +#if defined(USE_SSE42_CRC32C) +#define HAVE_CRC_COMPTIME +#else +#define HAVE_CRC_RUNTIME +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +#endif + +extern bool pg_crc32c_sse42_available(void); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_ARMV8_CRC32C) +#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* Use ARMv8 CRC Extension instructions. */ #define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) +#define COMP_CRC32C_HW(crc, data, len) \ ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +#if defined(USE_ARMV8_CRC32C) +#define HAVE_CRC_COMPTIME +#else +#define HAVE_CRC_RUNTIME +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +#endif + +extern bool pg_crc32c_armv8_available(void); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ #define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) +#define COMP_CRC32C_HW(crc, data, len) \ ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +#define HAVE_CRC_COMPTIME extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) - -/* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first - * to check that they are available. 
- */ -#define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c((crc), (data), (len))) -#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) - -extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); -extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif - #else /* * Use slicing-by-8 algorithm. @@ -105,6 +109,36 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +#endif /* end of CPU-specfic symbols */ + +#if defined(HAVE_CRC_COMPTIME) || defined(HAVE_CRC_RUNTIME) +/* + * Check if the CPU we're running on supports special + * instructions for CRC-32C computation. Otherwise, fall + * back to the pure software implementation (slicing-by-8). + */ +static inline pg_crc32c +pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) +{ + /* + * If this is firing in a frontend program, first look if you forgot a + * call to pg_cpucap_initialize() in main(). See for example + * src/bin/pg_controldata/pg_controldata.c. 
+ */ + Assert(pg_cpucap & PGCPUCAP_INIT); + + { +#if defined(HAVE_CRC_COMPTIME) + Assert(pg_cpucap & PGCPUCAP_CRC32C); + return COMP_CRC32C_HW(crc, data, len); +#else + if (pg_cpucap & PGCPUCAP_CRC32C) + return COMP_CRC32C_HW(crc, data, len); + else + return pg_comp_crc32c_sb8(crc, data, len); #endif + } +} +#endif /* HAVE_CRC_COMPTIME || HAVE_CRC_RUNTIME */ #endif /* PG_CRC32C_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 4c22431951..2ac79ecb0f 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_cpu.o \ pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d4..02ae206760 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', @@ -83,12 +84,15 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + # WIP sometime we'll need to build these based on host_cpu + ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_cpu.c b/src/port/pg_cpu.c new file mode 100644 index 0000000000..c948335743 --- /dev/null +++ b/src/port/pg_cpu.c @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu.c + * Runtime detection of CPU capabilities. 
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * src/port/pg_cpu.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_cpu.h" +#include "port/pg_crc32c.h" + + +/* starts uninitialized so we can detect errors of omission */ +uint32 pg_cpucap = 0; + +/* + * Check if hardware instructions for CRC computation are available. + */ +static void +pg_cpucap_crc32c(void) +{ + /* WIP: It seems like we should use CPU arch symbols instead */ +#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) + if (pg_crc32c_sse42_available()) + pg_cpucap |= PGCPUCAP_CRC32C; + +#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) + if (pg_crc32c_armv8_available()) + pg_cpucap |= PGCPUCAP_CRC32C; + +#elif defined(USE_LOONGARCH_CRC32C) + pg_cpucap |= PGCPUCAP_CRC32C; +#endif +} + +/* + * This needs to be called in main() for every + * program that calls a function that dispatches + * according to CPU features. + */ +void +pg_cpucap_initialize(void) +{ + pg_cpucap = PGCPUCAP_INIT; + + pg_cpucap_crc32c(); +} diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index ec12be1bbc..e3654427c3 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -1,12 +1,7 @@ /*------------------------------------------------------------------------- * * pg_crc32c_armv8_choose.c - * Choose between ARMv8 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports the ARMv8 - * CRC Extension. If it does, use the special instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). + * Check if the CPU we're running on supports the ARMv8 CRC Extension. 
* * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -40,7 +35,7 @@ #include "port/pg_crc32c.h" -static bool +bool pg_crc32c_armv8_available(void) { #if defined(HAVE_ELF_AUX_INFO) @@ -106,20 +101,3 @@ pg_crc32c_armv8_available(void) return false; #endif } - -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_armv8_available()) - pg_comp_crc32c = pg_comp_crc32c_armv8; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; - - return pg_comp_crc32c(crc, data, len); -} - -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d424..f4d3215bc5 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -1,12 +1,7 @@ /*------------------------------------------------------------------------- * * pg_crc32c_sse42_choose.c - * Choose between Intel SSE 4.2 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports Intel SSE - * 4.2. If it does, use the special SSE instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). + * Check if the CPU we're running on supports SSE4.2. * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -30,7 +25,7 @@ #include "port/pg_crc32c.h" -static bool +bool pg_crc32c_sse42_available(void) { unsigned int exx[4] = {0, 0, 0, 0}; @@ -45,20 +40,3 @@ pg_crc32c_sse42_available(void) return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ } - -/* - * This gets called on the first call. 
It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_sse42_available()) - pg_comp_crc32c = pg_comp_crc32c_sse42; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; - - return pg_comp_crc32c(crc, data, len); -} - -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; -- 2.48.1