Here is a new patch set. 0001 is the currently-proposed patch from the other thread [0] for determining SSE2 support. 0002 introduces the optimized linear search function. And 0003 makes use of the new function for the [sub]xip lookups in XidInMVCCSnapshot().
[0] https://postgr.es/m/CAFBsxsGktHL7%3DJXbgnKTi_uL0VRPcH4FSAqc6yK-3%2BJYfqPPjA%40mail.gmail.com -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From bd523948876f801b7f1b909f399b2cc41acf06cf Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 3 Aug 2022 11:07:40 +0700 Subject: [PATCH v5 1/3] Support SSE2 intrinsics where available SSE2 vector instructions are part of the spec for the 64-bit x86 architecture. Until now we have relied on the compiler to autovectorize in some limited situations, but some useful coding idioms can only be expressed explicitly via compiler intrinsics. To this end, add a header that defines USE_SSE2 when available. While x86-only for now, we can add other architectures in the future. This will also be the intended place for low-level hepler functions that use vector operations. Reviewed by Nathan Bossart Discussion: https://www.postgresql.org/message-id/CAFBsxsE2G_H_5Wbw%2BNOPm70-BK4xxKf86-mRzY%3DL2sLoQqM%2B-Q%40mail.gmail.com --- src/include/port/simd.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/include/port/simd.h diff --git a/src/include/port/simd.h b/src/include/port/simd.h new file mode 100644 index 0000000000..a571e79f57 --- /dev/null +++ b/src/include/port/simd.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------- + * + * simd.h + * Support for platform-specific vector operations. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/simd.h + * + *------------------------------------------------------------------------- + */ +#ifndef SIMD_H +#define SIMD_H + +/* + * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume + * that compilers targeting this architecture understand SSE2 intrinsics. + * + * We use emmintrin.h rather than the comprehensive header immintrin.h in + * order to exclude extensions beyond SSE2. This is because MSVC, at least, + * will allow the use of intrinsics that haven't been enabled at compile + * time. + */ +#if (defined(__x86_64__) || defined(_M_AMD64)) +#include <emmintrin.h> +#define USE_SSE2 +#endif + +#endif /* SIMD_H */ -- 2.25.1
>From 89d17ba8a669b53814551284f8f8c82192eb1402 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Wed, 3 Aug 2022 09:49:04 -0700 Subject: [PATCH v5 2/3] Introduce optimized routine for linear searches through an array of integers. If SSE2 is available, this function uses it to speed up the search. Otherwise, it uses a simple 'for' loop. This is a prerequisite for a follow-up commit that will use this function to optimize [sub]xip lookups in XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such an optimization. It might be worthwhile to add an ARM-specific code path to this function in the future. Author: Nathan Bossart Reviewed by: Andres Freund, John Naylor Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13 --- src/include/utils/linearsearch.h | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/include/utils/linearsearch.h diff --git a/src/include/utils/linearsearch.h b/src/include/utils/linearsearch.h new file mode 100644 index 0000000000..51298b4355 --- /dev/null +++ b/src/include/utils/linearsearch.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * linearsearch.h + * Optimized linear search routines. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/include/utils/linearsearch.h + * + *------------------------------------------------------------------------- + */ +#ifndef LINEARSEARCH_H +#define LINEARSEARCH_H + +#include "port/simd.h" + +/* + * pg_linearsearch_uint32 + * + * Returns true if there is an element in 'base' that equals 'key'. Otherwise, + * returns false. + * + * Since pg_attribute_no_sanitize_alignment() is only intended for x86-specific + * code, we surround it with an SSE2 check. + */ +#ifdef USE_SSE2 +pg_attribute_no_sanitize_alignment() +#endif +static inline bool +pg_linearsearch_uint32(uint32 key, uint32 *base, uint32 nelem) +{ + uint32 i = 0; + + /* If possible, use SSE2 intrinsics to speed up the search. */ +#ifdef USE_SSE2 + __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */ + uint32 its = nelem & ~0xF; /* round down to multiple of 16 */ + + for (; i < its; i += 16) + { + /* load the next 16 values into __m128i variables */ + __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]); + __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]); + __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]); + __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]); + + /* perform the comparisons */ + __m128i result1 = _mm_cmpeq_epi32(keys, vals1); + __m128i result2 = _mm_cmpeq_epi32(keys, vals2); + __m128i result3 = _mm_cmpeq_epi32(keys, vals3); + __m128i result4 = _mm_cmpeq_epi32(keys, vals4); + + /* shrink the results into a single variable */ + __m128i tmp1 = _mm_packs_epi32(result1, result2); + __m128i tmp2 = _mm_packs_epi32(result3, result4); + __m128i result = _mm_packs_epi16(tmp1, tmp2); + + /* see if there was a match */ + if (_mm_movemask_epi8(result) != 0) + return true; + } +#endif + + /* Process the remaining elements the slow way. */ + for (; i < nelem; i++) + { + if (key == base[i]) + return true; + } + + return false; +} + +#endif /* LINEARSEARCH_H */ -- 2.25.1
>From aadb77a991174771d62442904e32ac6fca64571f Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Wed, 3 Aug 2022 09:59:28 -0700 Subject: [PATCH v5 3/3] Optimize linear searches in XidInMVCCSnapshot(). This change makes use of the recently-introduced optimized linear search routine to speed up searches through the [sub]xip arrays when possible, which should improve performance significantly when the arrays are large. Author: Nathan Bossart Reviewed by: Andres Freund, John Naylor Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13 --- src/backend/utils/time/snapmgr.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 5bc2a15160..834c8867d4 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -63,6 +63,7 @@ #include "storage/sinvaladt.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/linearsearch.h" #include "utils/memutils.h" #include "utils/old_snapshot.h" #include "utils/rel.h" @@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc) bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) { - uint32 i; - /* * Make a quick range check to eliminate most XIDs without looking at the * xip arrays. Note that this is OK even if we convert a subxact XID to @@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) if (!snapshot->suboverflowed) { /* we have full data, so search subxip */ - int32 j; - - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt)) + return true; /* not there, fall through to search xip[] */ } @@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } - for (i = 0; i < snapshot->xcnt; i++) - { - if (TransactionIdEquals(xid, snapshot->xip[i])) - return true; - } + if (pg_linearsearch_uint32(xid, snapshot->xip, snapshot->xcnt)) + return true; } else { - int32 j; - /* * In recovery we store all xids in the subxact array because it is by * far the bigger array, and we mostly don't know which xids are @@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) * indeterminate xid. We don't know whether it's top level or subxact * but it doesn't matter. If it's present, the xid is visible. */ - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt)) + return true; } return false; -- 2.25.1