We did something similar for x86 for v18, and here is some progress towards Arm support.
0001: Like e2809e3a101 -- inline small constant inputs to compensate for the fact that 0002 will do a runtime check even if the usual CRC extension is targeted. There is a difference from x86, however: on Arm we currently align on 8-byte boundaries before looping on 8-byte chunks. That requirement would prevent loop unrolling. We could use 4-byte chunks to get around that, but it's not clear which way is best. I've coded it so it's easy to try both ways (a rough sketch of the 4-byte-only idea is below, after the signature).

0002: Like 3c6e8c12389, and in fact it uses the same program to generate the code, by specifying Neon instructions with the Arm "crypto" extension instead. There are some interesting differences from x86 here as well:

- The upstream implementation chose to use inline assembly instead of intrinsics for some reason. I initially thought that was a way to get broader compiler support, but it turns out you still need to pass the relevant flags to get the assembly to link.

- I only have Meson support for now, since I used macOS on CI to test. That OS and compiler combination apparently targets the CRC extension, but the PMULL instruction runtime check uses Linux-only headers, I believe, so previously I hacked the choose function to return true for testing. The choose function in 0002 is untested in this form.

- On x86 it could be fairly costly to align on a cacheline boundary before beginning the main loop, so I elected to skip that for short-ish inputs in PG18. On Arm the main loop uses 4 16-byte accumulators, so the patch acts like upstream and always aligns on 16-byte boundaries.

0003: An afterthought regarding the above-mentioned alignment, this is an alternative preamble that might shave a couple of cycles for 4-byte aligned inputs, e.g. WAL.

--
John Naylor
Amazon Web Services
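[For illustration, a minimal sketch of the 4-byte-chunk alternative mentioned for 0001. The helper name is hypothetical and standard C types are used instead of the c.h typedefs; the caller is assumed to pass a 4-byte-aligned pointer, as the dispatch function in 0001 already requires. The point is that with no 8-byte step there is no alignment branch, so a constant "len" lets the compiler fully unroll both loops.]

#include <stddef.h>
#include <stdint.h>
#include <arm_acle.h>           /* __crc32cw/__crc32cb; needs a +crc target */

/* Sketch only: 4-byte-chunk variant of the inlined small-input path. */
static inline uint32_t
crc32c_small_words_sketch(uint32_t crc, const void *data, size_t len)
{
    const unsigned char *p = data;

    /* word at a time; fully unrollable when len is a compile-time constant */
    for (; len >= 4; p += 4, len -= 4)
        crc = __crc32cw(crc, *(const uint32_t *) p);
    /* 0-3 trailing bytes */
    for (; len > 0; --len)
        crc = __crc32cb(crc, *p++);
    return crc;
}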
From 9f3096da8fbed3f1457e7d52dfd91e6b29557239 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 27 Apr 2025 04:04:28 +0700
Subject: [PATCH v1 1/3] Inline CRC computation for small fixed-length input
 on Arm

Similar vein to e2809e3a1. One difference is that the dispatch
function requires 4-byte alignment to prevent unnecessary branching
in the preamble. This corresponds to the alignment of WAL records.
---
 src/include/port/pg_crc32c.h | 44 +++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 82313bb7fcf..9f021db83b2 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -114,11 +114,53 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 
 /* Use ARMv8 CRC Extension instructions. */
 #define COMP_CRC32C(crc, data, len) \
-    ((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+    ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+    /* require 4-byte alignment to avoid a long preamble */
+    if (__builtin_constant_p(len) &&
+        PointerIsAligned(data, uint32) &&
+        len < 32)
+    {
+        const unsigned char *p = data;
+
+        /*
+         * For small constant inputs, inline the computation to avoid a
+         * function call and allow the compiler to unroll loops.
+         */
+#if 1
+
+        /*
+         * WIP: is it better to avoid branching by unrolling the loop and
+         * processing only 4-bytes per iteration?
+         */
+        if (!PointerIsAligned(p, uint64) && len > 4)
+        {
+            crc = __crc32cw(crc, *(uint32 *) p);
+            p += 4;
+            len -= 4;
+        }
+#if SIZEOF_VOID_P >= 8
+        for (; len >= 8; p += 8, len -= 8)
+            crc = __crc32cd(crc, *(const uint64 *) p);
+#endif
+#endif
+        for (; len >= 4; p += 4, len -= 4)
+            crc = __crc32cw(crc, *(const uint32 *) p);
+        for (; len > 0; --len)
+            crc = __crc32cb(crc, *p++);
+        return crc;
+    }
+    else
+        return pg_comp_crc32c_armv8(crc, data, len);
+}
+
 #elif defined(USE_LOONGARCH_CRC32C)
 
 /* Use LoongArch CRCC instructions. */
-- 
2.49.0
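[As a usage note: the constant-length case 0001 targets looks like the following at a call site. DemoHeader and demo_header_crc are made-up names for illustration only; INIT_CRC32C, COMP_CRC32C, and FIN_CRC32C are the existing macros from port/pg_crc32c.h.]

#include "c.h"
#include "port/pg_crc32c.h"

/* Hypothetical struct, standing in for any small fixed-size header. */
typedef struct DemoHeader
{
    uint32      id;
    uint32      flags;
    uint64      lsn;
    uint32      payload_len;
} DemoHeader;                   /* sizeof() is 24 here: constant and < 32 */

static pg_crc32c
demo_header_crc(const DemoHeader *hdr)
{
    pg_crc32c   crc;

    INIT_CRC32C(crc);
    /* constant length and 4-byte-aligned pointer: takes 0001's inlined path */
    COMP_CRC32C(crc, hdr, sizeof(DemoHeader));
    FIN_CRC32C(crc);
    return crc;
}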
From d447d42e5b0a89e4c2d0e6c40e38980df558b899 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Fri, 9 May 2025 19:48:26 +0700
Subject: [PATCH v1 2/3] Compute CRC32C on ARM using the Crypto Extension
 where available

---
 meson.build                       |  32 ++++++++
 src/include/port/pg_crc32c.h      |  24 ++++--
 src/port/meson.build              |   1 +
 src/port/pg_crc32c_armv8.c        | 125 ++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c |  36 +++++++++
 5 files changed, 213 insertions(+), 5 deletions(-)

diff --git a/meson.build b/meson.build
index 12de5e80c31..5f7a10f4c8d 100644
--- a/meson.build
+++ b/meson.build
@@ -2523,6 +2523,38 @@ int main(void)
     have_optimized_crc = true
   endif
 
+  # Check if the compiler supports ARMv8 CRYPTO carryless multiplication
+  # and three-way exclusive-or instructions used for computing CRC.
+  # We test with cflags_crc since we also need __crc32cd.
+  prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+
+int main(void)
+{
+  uint64x2_t r;
+  uint64x2_t r2;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+  /* return computed value, to prevent the above being optimized away */
+  r = veorq_u64(r, r2);
+  return __crc32cd(0, vgetq_lane_u64(r, 0));
+}
+'''
+
+  if cc.links(prog,
+      name: 'PMULL CRC32C',
+      args: test_c_args + ['-march=armv8-a+crc+simd+crypto'])
+    # Use ARM CRYPTO Extension, with runtime check
+    cflags_crc += '-march=armv8-a+crc+simd+crypto'
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 9f021db83b2..def3de5dfb9 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so we can inline those in some cases.
+ */
+
+#include <arm_acle.h>
 
 #define COMP_CRC32C(crc, data, len) \
     ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 static inline
 pg_crc32c
@@ -132,13 +141,14 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 
         /*
          * For small constant inputs, inline the computation to avoid a
-         * function call and allow the compiler to unroll loops.
+         * function call.
          */
 #if 1
 
         /*
-         * WIP: is it better to avoid branching by unrolling the loop and
-         * processing only 4-bytes per iteration?
+         * WIP: Unlike x86, using 8-byte variants requires 8-byte alignment,
+         * so the compiler cannot unroll the loop unless we restrict ourselves
+         * to the 4-byte variant. Needs testing to choose the best one.
          */
         if (!PointerIsAligned(p, uint64) && len > 4)
         {
@@ -158,7 +168,8 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
         return crc;
     }
     else
-        return pg_comp_crc32c_armv8(crc, data, len);
+        /* Otherwise, use a runtime check for PMULL instructions. */
+        return pg_comp_crc32c(crc, data, len);
 }
 
 #elif defined(USE_LOONGARCH_CRC32C)
@@ -183,6 +194,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index fc7b059fee5..8b0ac6b931a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -94,6 +94,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 5ba070bb99d..f67de2016b4 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -15,6 +15,9 @@
 #include "c.h"
 
 #include <arm_acle.h>
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
 
 #include "port/pg_crc32c.h"
 
@@ -73,3 +76,125 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
     return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ * - match our function declaration and alignment macro
+ * - match whitespace to our project style
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+    uint64x2_t r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+    return r;
+}
+
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+    uint64x2_t r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+    return r;
+}
+
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+    /* adjust names to match generated code */
+    pg_crc32c   crc0 = crc;
+    const char *buf = data;
+
+    /* Align to 16 bytes to prevent straddling cacheline boundaries. */
+    for (; len && ((uintptr_t) buf & 7); --len)
+    {
+        crc0 = __crc32cb(crc0, *buf++);
+    }
+    if (((uintptr_t) buf & 8) && len >= 8)
+    {
+        crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+        buf += 8;
+        len -= 8;
+    }
+
+    if (len >= 64)
+    {
+        const char *end = buf + len;
+        const char *limit = buf + len - 64;
+
+        /* First vector chunk. */
+        uint64x2_t  x0 = vld1q_u64((const uint64_t *) buf),
+                    y0;
+        uint64x2_t  x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+                    y1;
+        uint64x2_t  x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+                    y2;
+        uint64x2_t  x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+                    y3;
+        uint64x2_t  k;
+
+        {
+            static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+            k = vld1q_u64(k_);
+        }
+
+
+        /* pgindent doesn't like this: */
+        x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+        /*
+         * a possible alternative?
+         *
+         * x0 = veorq_u64((uint64x2_t) vsetq_lane_u32(crc0, vdupq_n_u32(0), 0), x0);
+         */
+
+        buf += 64;
+
+        /* Main loop. */
+        while (buf <= limit)
+        {
+            y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+            y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+            y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+            y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+            buf += 64;
+        }
+
+        /* Reduce x0 ... x3 to just x0. */
+        {
+            static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+            k = vld1q_u64(k_);
+        }
+        y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+        y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+        {
+            static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+            k = vld1q_u64(k_);
+        }
+        y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+        /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+        crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+        crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+        len = end - buf;
+    }
+
+    return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index ec12be1bbc3..9a1656a2685 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -107,6 +107,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#ifdef __aarch64__
+
+#ifdef HAVE_ELF_AUX_INFO
+    unsigned long value;
+
+    return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+        (value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+    return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+    return false;
+#endif
+
+#else
+    return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
  */
@@ -114,11 +135,26 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+#if defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
     if (pg_crc32c_armv8_available())
         pg_comp_crc32c = pg_comp_crc32c_armv8;
     else
         pg_comp_crc32c = pg_comp_crc32c_sb8;
+#elif defined(USE_ARMV8_CRC32C)
+    /*
+     * On MacOS, compilers may emit CRC instructions without extra CFLAGS, but
+     * we can't use Linux-isms to detect CPU features, so we just set it here
+     * as a fallback for PMULL.
+     */
+    pg_comp_crc32c = pg_comp_crc32c_armv8;
+#endif
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+    if (pg_pmull_available())
+        pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+
     return pg_comp_crc32c(crc, data, len);
 }
-- 
2.49.0
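[Regarding the "Linux-only headers" limitation of the runtime check: on Apple silicon a possible alternative is sysctlbyname(). The following is only a sketch of an extra branch for pg_pmull_available(); the hw.optional.arm.FEAT_PMULL key is an assumption based on Apple's documented hw.optional.arm.FEAT_* names and is untested here.]

#include <stdbool.h>

#if defined(__APPLE__)
#include <sys/sysctl.h>
#endif

/* Sketch: macOS branch for detecting PMULL; the sysctl key is an assumption. */
static bool
pg_pmull_available_macos(void)
{
#if defined(__APPLE__) && defined(__aarch64__)
    int         value = 0;
    size_t      size = sizeof(value);

    if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &value, &size, NULL, 0) == 0)
        return value != 0;
    return false;               /* unknown key: assume not available */
#else
    return false;
#endif
}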
From a0aad5bf044170e6a8ceaf368fcfff68177cb9fb Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 14 May 2025 01:59:41 +0700
Subject: [PATCH v1 3/3] WIP: Attempt alignment preamble better suited to WAL

---
 src/port/pg_crc32c_armv8.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index f67de2016b4..da6116efa21 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -119,6 +119,24 @@ pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
     const char *buf = data;
 
     /* Align to 16 bytes to prevent straddling cacheline boundaries. */
+#if 1
+
+    /*
+     * WIP: WAL is 4-byte aligned, so in that case the first loop will be
+     * skipped. Is this better?
+     */
+    for (; len && ((uintptr_t) buf & 3); --len)
+    {
+        crc0 = __crc32cb(crc0, *buf++);
+    }
+    while (((uintptr_t) buf & 12) && len >= 4)
+    {
+        crc0 = __crc32cw(crc0, *(const uint32_t *) buf);
+        buf += 4;
+        len -= 4;
+    }
+#else
+    /* original */
     for (; len && ((uintptr_t) buf & 7); --len)
     {
        crc0 = __crc32cb(crc0, *buf++);
     }
     if (((uintptr_t) buf & 8) && len >= 8)
     {
        crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
@@ -129,6 +147,7 @@ pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
        buf += 8;
        len -= 8;
     }
+#endif
 
     if (len >= 64)
     {
-- 
2.49.0
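[To compare the two preambles and catch alignment mistakes, something like the following throwaway check could be used. It is only a sketch: it assumes it is built inside a patched tree that defines USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, is linked with src/port, and runs on hardware with PMULL. It cross-checks pg_comp_crc32c_pmull against pg_comp_crc32c_armv8 at every starting alignment, plus the standard CRC-32C check value.]

#include <stdio.h>

#include "c.h"
#include "port/pg_crc32c.h"

int
main(void)
{
    unsigned char buf[4096 + 16];
    pg_crc32c   crc = 0xFFFFFFFF;

    /* deterministic filler */
    for (size_t i = 0; i < sizeof(buf); i++)
        buf[i] = (unsigned char) (i * 131 + 17);

    /* every small offset, so both alignment preambles get exercised */
    for (size_t off = 0; off < 16; off++)
    {
        for (size_t len = 0; len <= 4096; len += 61)
        {
            pg_crc32c   c1 = 0xFFFFFFFF;
            pg_crc32c   c2 = 0xFFFFFFFF;

            c1 = pg_comp_crc32c_pmull(c1, buf + off, len);
            c2 = pg_comp_crc32c_armv8(c2, buf + off, len);
            if (c1 != c2)
            {
                printf("mismatch: off=%zu len=%zu\n", off, len);
                return 1;
            }
        }
    }

    /* standard CRC-32C check value for "123456789" is 0xE3069283 */
    crc = pg_comp_crc32c_pmull(crc, "123456789", 9);
    crc ^= 0xFFFFFFFF;
    printf("check value: 0x%08X\n", (unsigned) crc);

    return 0;
}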