On Fri, Aug 05, 2022 at 11:02:15AM +0700, John Naylor wrote: > That is a good point. Maybe potential helpers in simd.h should only deal > specifically with vector registers, with it's users providing C fallbacks. > I don't have any good ideas of where to put the new function, though.
I moved it to src/include/port for now since that's where files like pg_bswap.h live. > Hmm, I didn't know about those. lfind() is similar enough that it would > make sense to have pg_lfind32() etc in src/include/port/pg_lsearch.h, at > least for the v4 version that returns the pointer. We already declare > bsearch_arg() in src/include/port.h and that's another kind of array > search. Returning bool is different enough to have a different name. > pg_lfind32_ispresent()? *_noptr()? Meh. > > Having said all that, the man page under BUGS [1] says "The naming is > unfortunate." I went ahead and renamed it to pg_lfind32() and switched it back to returning the pointer. That felt the cleanest from the naming perspective, but as Andres noted, it might not be as fast as just looking for the presence of the element. I modified my small testing program to perform many searches on small arrays, and I wasn't able to identify any impact, so perhaps thіs is good enough. Thoughts? -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 205737c56c7e49e8de25e6b4afca6a96abbb4e60 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Wed, 3 Aug 2022 09:49:04 -0700 Subject: [PATCH v7 1/2] Introduce optimized routine for linear searches through an array of integers. If SSE2 is available, this function uses it to speed up the search. Otherwise, it uses a simple 'for' loop. This is a prerequisite for a follow-up commit that will use this function to optimize [sub]xip lookups in XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such an optimization. It might be worthwhile to add an ARM-specific code path to this function in the future. Author: Nathan Bossart Reviewed by: Andres Freund, John Naylor Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13 --- src/include/port/pg_lfind.h | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/include/port/pg_lfind.h diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h new file mode 100644 index 0000000000..27721490a6 --- /dev/null +++ b/src/include/port/pg_lfind.h @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * pg_lfind.h + * Optimized linear search routines. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/port/pg_lfind.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_LFIND_H +#define PG_LFIND_H + +#ifdef USE_SSE2 +#include "port/pg_bitutils.h" +#endif + +/* + * pg_lfind32 + * + * Returns the address of the first element in 'base' that equals 'key', or + * NULL if no match is found. + */ +static inline uint32 * +pg_lfind32(uint32 key, uint32 *base, uint32 nelem) +{ + uint32 i = 0; + + /* If possible, use SSE2 intrinsics to speed up the search. */ +#ifdef USE_SSE2 + __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */ + uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */ + + for (; i < iterations; i += 16) + { + /* load the next 16 values into __m128i variables */ + __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]); + __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]); + __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]); + __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]); + + /* perform the comparisons */ + __m128i result1 = _mm_cmpeq_epi32(keys, vals1); + __m128i result2 = _mm_cmpeq_epi32(keys, vals2); + __m128i result3 = _mm_cmpeq_epi32(keys, vals3); + __m128i result4 = _mm_cmpeq_epi32(keys, vals4); + + /* shrink the results into a single variable */ + __m128i tmp1 = _mm_packs_epi32(result1, result2); + __m128i tmp2 = _mm_packs_epi32(result3, result4); + __m128i tmp3 = _mm_packs_epi16(tmp1, tmp2); + uint32 result = _mm_movemask_epi8(tmp3); + + /* see if there was a match */ + if (result != 0) + return &base[i + pg_rightmost_one_pos32(result)]; + } +#endif + + /* Process the remaining elements the slow way. */ + for (; i < nelem; i++) + { + if (key == base[i]) + return &base[i]; + } + + return NULL; +} + +#endif /* PG_LFIND_H */ -- 2.25.1
>From abbce6208a5463f4d5da177d05f167d08e7eee2d Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nathandboss...@gmail.com> Date: Wed, 3 Aug 2022 09:59:28 -0700 Subject: [PATCH v7 2/2] Optimize linear searches in XidInMVCCSnapshot(). This change makes use of the recently-introduced optimized linear search routine to speed up searches through the [sub]xip arrays when possible, which should improve performance significantly when the arrays are large. Author: Nathan Bossart Reviewed by: Andres Freund, John Naylor Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13 --- src/backend/utils/time/snapmgr.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 5bc2a15160..9b504c9745 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -56,6 +56,7 @@ #include "datatype/timestamp.h" #include "lib/pairingheap.h" #include "miscadmin.h" +#include "port/pg_lfind.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc) bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) { - uint32 i; - /* * Make a quick range check to eliminate most XIDs without looking at the * xip arrays. Note that this is OK even if we convert a subxact XID to @@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) if (!snapshot->suboverflowed) { /* we have full data, so search subxip */ - int32 j; - - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt)) + return true; /* not there, fall through to search xip[] */ } @@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } - for (i = 0; i < snapshot->xcnt; i++) - { - if (TransactionIdEquals(xid, snapshot->xip[i])) - return true; - } + if (pg_lfind32(xid, snapshot->xip, snapshot->xcnt)) + return true; } else { - int32 j; - /* * In recovery we store all xids in the subxact array because it is by * far the bigger array, and we mostly don't know which xids are @@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) * indeterminate xid. We don't know whether it's top level or subxact * but it doesn't matter. If it's present, the xid is visible. */ - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt)) + return true; } return false; -- 2.25.1