On Tue, Aug 9, 2022 at 10:51 AM Nathan Bossart <nathandboss...@gmail.com> wrote:
> Fixed in v10.
I decided I wasn't quite comfortable changing snapshot handling without
further guarantees. To this end, 0002 in the attached v11 is an addendum
that adds assert checking (it also runs pgindent and does some
comment-smithing). As I suspected, make check-world passes even with
purposefully screwed-up coding.

0003 uses pg_lfind32() in syscache.c, and I verified that sticking in
the wrong answer leads to a crash in assert-enabled builds in short
order. For that reason, I'd like to throw this (or something else
suitable) at the buildfarm first. It's simpler than the
qsort/qunique/binary search that was there before, which is nice, but I
haven't tried to test performance.

-- 
John Naylor
EDB: http://www.enterprisedb.com
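P.S. To make the cross-checking idea concrete outside the tree, here is
a distilled, hypothetical harness (not one of the attached patches; all
names are made up). The reference loop computes the expected answer, and
the deliberately broken "optimized" search stands in for the screwed-up
coding mentioned above. Built without -DNDEBUG, the assert aborts on the
first mismatch (key 1):

/*
 * lfind_check.c -- hypothetical standalone harness, not part of the
 * patch set.  Cross-check an "optimized" search against a trivial
 * reference implementation, in the same spirit as the asserts in 0002.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* trivial reference implementation: defines the right answer */
static bool
lfind32_ref(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t i = 0; i < nelem; i++)
		if (base[i] == key)
			return true;
	return false;
}

/* deliberately broken "optimized" search: skips every other element */
static bool
lfind32_broken(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t i = 0; i < nelem; i += 2)
		if (base[i] == key)
			return true;
	return false;
}

int
main(void)
{
	uint32_t	vals[32];

	for (uint32_t i = 0; i < 32; i++)
		vals[i] = i;

	/* aborts at key 1: the broken search never looks at odd indexes */
	for (uint32_t key = 0; key < 40; key++)
		assert(lfind32_broken(key, vals, 32) == lfind32_ref(key, vals, 32));

	printf("optimized search agrees with the reference\n");
	return 0;
}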
From 1c89b9b7d3c71bb1a703caaf7c01c93bc9e2515f Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 9 Aug 2022 11:51:52 +0700
Subject: [PATCH v11 2/4] Add assert checking, run pgindent, comment changes

---
 src/include/port/pg_lfind.h | 78 ++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 22 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index 24de544f63..fb125977b2 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -18,51 +18,85 @@
 /*
  * pg_lfind32
  *
- * Returns true if there is an element in 'base' that equals 'key'. Otherwise,
- * returns false.
+ * Return true if there is an element in 'base' that equals 'key', otherwise
+ * return false.
  */
 static inline bool
 pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 {
 	uint32		i = 0;
 
-	/* If possible, use SSE2 intrinsics to speed up the search. */
+	/* Use SIMD intrinsics where available. */
 #ifdef USE_SSE2
-	const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
-	uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
 
-	for (; i < iterations; i += 16)
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers. Testing has shown this is ~40% faster than using a
+	 * block of two registers.
+	 */
+	const __m128i keys = _mm_set1_epi32(key);	/* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;	/* round down to multiple of 16 */
+
+#if defined(USE_ASSERT_CHECKING)
+	bool		assert_result = false;
+
+	/* pre-compute the result for assert checking */
+	for (i = 0; i < nelem; i++)
 	{
-		/* load the next 16 values into __m128i variables */
-		const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
-		const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
-		const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
-		const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+		if (key == base[i])
+		{
+			assert_result = true;
+			break;
+		}
+	}
+#endif
 
-		/* perform the comparisons */
-		const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
-		const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
-		const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
-		const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const __m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]);
+		const __m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]);
+		const __m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]);
+		const __m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]);
 
-		/* shrink the results into a single variable */
-		const __m128i tmp1 = _mm_or_si128(result1, result2);
-		const __m128i tmp2 = _mm_or_si128(result3, result4);
-		const __m128i result = _mm_or_si128(tmp1, tmp2);
+		/* compare each value to the key */
+		const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const __m128i tmp1 = _mm_or_si128(result1, result2);
+		const __m128i tmp2 = _mm_or_si128(result3, result4);
+		const __m128i result = _mm_or_si128(tmp1, tmp2);
 
 		/* see if there was a match */
 		if (_mm_movemask_epi8(result) != 0)
+		{
+#if defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
 			return true;
+		}
 	}
-#endif
+#endif							/* USE_SSE2 */
 
-	/* Process the remaining elements the slow way. */
+	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
 		if (key == base[i])
+		{
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
 			return true;
+		}
 	}
 
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+	Assert(assert_result == false);
+#endif
 	return false;
 }
-- 
2.36.1
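For the record, the "purposefully screwed-up coding" above can be as
simple as inverting the match test in the SSE2 block (a hypothetical
mutation, not part of any patch):

 		/* see if there was a match */
-		if (_mm_movemask_epi8(result) != 0)
+		if (_mm_movemask_epi8(result) == 0)	/* deliberately wrong */

With 0002 applied, the new Asserts catch this on the first search that
covers 16 or more elements in an assert-enabled build; without something
like 0003 to exercise such searches, check-world alone may never reach
that path.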
From ff77224f9227bcff88a68e63f39754296810351c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:59:28 -0700
Subject: [PATCH v11 4/4] Optimize linear searches in XidInMVCCSnapshot().

This change makes use of the recently-introduced optimized linear
search routine to speed up searches through the [sub]xip arrays when
possible, which should improve performance significantly when the
arrays are large.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/backend/utils/time/snapmgr.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5bc2a15160..9b504c9745 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -56,6 +56,7 @@
 #include "datatype/timestamp.h"
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
+#include "port/pg_lfind.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 bool
 XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
-	uint32		i;
-
 	/*
 	 * Make a quick range check to eliminate most XIDs without looking at the
 	 * xip arrays.  Note that this is OK even if we convert a subxact XID to
@@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		if (!snapshot->suboverflowed)
 		{
 			/* we have full data, so search subxip */
-			int32		j;
-
-			for (j = 0; j < snapshot->subxcnt; j++)
-			{
-				if (TransactionIdEquals(xid, snapshot->subxip[j]))
-					return true;
-			}
+			if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+				return true;
 
 			/* not there, fall through to search xip[] */
 		}
@@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 				return false;
 		}
 
-		for (i = 0; i < snapshot->xcnt; i++)
-		{
-			if (TransactionIdEquals(xid, snapshot->xip[i]))
-				return true;
-		}
+		if (pg_lfind32(xid, snapshot->xip, snapshot->xcnt))
+			return true;
 	}
 	else
 	{
-		int32		j;
-
 		/*
 		 * In recovery we store all xids in the subxact array because it is by
 		 * far the bigger array, and we mostly don't know which xids are
@@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		 * indeterminate xid.  We don't know whether it's top level or subxact
 		 * but it doesn't matter.  If it's present, the xid is visible.
 		 */
-		for (j = 0; j < snapshot->subxcnt; j++)
-		{
-			if (TransactionIdEquals(xid, snapshot->subxip[j]))
-				return true;
-		}
+		if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+			return true;
 	}
 
 	return false;
-- 
2.36.1
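The "large arrays" case is easy to poke at outside the server. Below is
a rough, hypothetical micro-benchmark (not from this thread;
bench_lfind.c and both function names are made up) that inlines a copy
of the blocked SSE2 search using standard types, so it builds without
the PostgreSQL headers; the numbers it prints are only indicative. A
missing key is the worst case, since every element must be inspected.
Build with something like gcc -O2 bench_lfind.c (SSE2 is implied on
x86-64):

#include <emmintrin.h>			/* SSE2 intrinsics */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* one element at a time, as on non-SSE2 platforms */
static bool
lfind32_scalar(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t i = 0; i < nelem; i++)
		if (base[i] == key)
			return true;
	return false;
}

/* four registers of four lanes per iteration, as in 0001/0002 */
static bool
lfind32_sse2(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	const __m128i keys = _mm_set1_epi32(key);
	uint32_t	iterations = nelem & ~0xF;	/* round down to multiple of 16 */
	uint32_t	i = 0;

	for (; i < iterations; i += 16)
	{
		__m128i		vals1 = _mm_loadu_si128((const __m128i *) &base[i]);
		__m128i		vals2 = _mm_loadu_si128((const __m128i *) &base[i + 4]);
		__m128i		vals3 = _mm_loadu_si128((const __m128i *) &base[i + 8]);
		__m128i		vals4 = _mm_loadu_si128((const __m128i *) &base[i + 12]);
		__m128i		result =
			_mm_or_si128(_mm_or_si128(_mm_cmpeq_epi32(keys, vals1),
									  _mm_cmpeq_epi32(keys, vals2)),
						 _mm_or_si128(_mm_cmpeq_epi32(keys, vals3),
									  _mm_cmpeq_epi32(keys, vals4)));

		if (_mm_movemask_epi8(result) != 0)
			return true;
	}
	for (; i < nelem; i++)		/* scalar tail */
		if (base[i] == key)
			return true;
	return false;
}

int
main(void)
{
	enum {N = 8192, LOOPS = 50000};
	static uint32_t xids[N];
	volatile bool found = false;
	clock_t		start;

	for (uint32_t i = 0; i < N; i++)
		xids[i] = i * 2;		/* all even: an odd key is never found */

	start = clock();
	for (int l = 0; l < LOOPS; l++)
		found |= lfind32_scalar(1, xids, N);
	printf("scalar: %.2fs\n", (double) (clock() - start) / CLOCKS_PER_SEC);

	start = clock();
	for (int l = 0; l < LOOPS; l++)
		found |= lfind32_sse2(1, xids, N);
	printf("sse2:   %.2fs\n", (double) (clock() - start) / CLOCKS_PER_SEC);

	return found;				/* defeat dead-code elimination */
}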
From 216415cff82aef98c02d9e953f956e8311b454d8 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:49:04 -0700
Subject: [PATCH v11 1/4] Introduce optimized routine for linear searches
 through an array of integers.

If SSE2 is available, this function uses it to speed up the search.
Otherwise, it uses a simple 'for' loop.  This is a prerequisite for a
follow-up commit that will use this function to optimize [sub]xip
lookups in XidInMVCCSnapshot(), but it can be used anywhere that might
benefit from such an optimization.

It might be worthwhile to add an ARM-specific code path to this
function in the future.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/include/port/pg_lfind.h | 69 +++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 src/include/port/pg_lfind.h

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
new file mode 100644
index 0000000000..24de544f63
--- /dev/null
+++ b/src/include/port/pg_lfind.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#include "port/simd.h"
+
+/*
+ * pg_lfind32
+ *
+ * Returns true if there is an element in 'base' that equals 'key'. Otherwise,
+ * returns false.
+ */
+static inline bool
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* If possible, use SSE2 intrinsics to speed up the search. */
+#ifdef USE_SSE2
+	const __m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
+	uint32 iterations = nelem & ~0xF; /* round down to multiple of 16 */
+
+	for (; i < iterations; i += 16)
+	{
+		/* load the next 16 values into __m128i variables */
+		const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
+		const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
+		const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
+		const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+
+		/* perform the comparisons */
+		const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* shrink the results into a single variable */
+		const __m128i tmp1 = _mm_or_si128(result1, result2);
+		const __m128i tmp2 = _mm_or_si128(result3, result4);
+		const __m128i result = _mm_or_si128(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (_mm_movemask_epi8(result) != 0)
+			return true;
+	}
+#endif
+
+	/* Process the remaining elements the slow way. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+			return true;
+	}
+
+	return false;
+}
+
+#endif							/* PG_LFIND_H */
-- 
2.36.1
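For callers the interface is deliberately small. A hypothetical usage
sketch follows (xid_is_cached is a made-up helper, not in any patch):
since TransactionId is a typedef for uint32, such arrays can be handed
to pg_lfind32() without casts. Note also that the SSE2 loop only covers
the first nelem & ~0xF elements and shares 'i' with the scalar loop,
which then finishes the tail.

#include "postgres.h"

#include "port/pg_lfind.h"

/*
 * Hypothetical caller: is 'xid' among the first 'nxids' entries of
 * 'xids'?  TransactionId is a typedef for uint32, so this compiles
 * without casts.
 */
static bool
xid_is_cached(TransactionId xid, TransactionId *xids, uint32 nxids)
{
	return pg_lfind32(xid, xids, nxids);
}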
From f714fa6fdb860c89b3bb8bc54465a43e24bf9496 Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 9 Aug 2022 12:37:22 +0700
Subject: [PATCH v11 3/4] Use optimized linear search for testing if relations
 have a syscache

XXX: not for commit just yet
---
 src/backend/utils/cache/syscache.c | 69 +++---------------------------
 1 file changed, 5 insertions(+), 64 deletions(-)

diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 1912b12146..d354161c54 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -76,6 +76,7 @@
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
 #include "lib/qunique.h"
+#include "port/pg_lfind.h"
 #include "utils/catcache.h"
 #include "utils/rel.h"
 #include "utils/syscache.h"
@@ -1044,16 +1045,14 @@ static CatCache *SysCache[SysCacheSize];
 
 static bool CacheInitialized = false;
 
-/* Sorted array of OIDs of tables that have caches on them */
+/* Array of OIDs of tables that have caches on them */
 static Oid	SysCacheRelationOid[SysCacheSize];
 static int	SysCacheRelationOidSize;
 
-/* Sorted array of OIDs of tables and indexes used by caches */
+/* Array of OIDs of tables and indexes used by caches */
 static Oid	SysCacheSupportingRelOid[SysCacheSize * 2];
 static int	SysCacheSupportingRelOidSize;
 
-static int	oid_compare(const void *a, const void *b);
-
 
 /*
  * InitCatalogCache - initialize the caches
@@ -1100,19 +1099,6 @@ InitCatalogCache(void)
 	Assert(SysCacheRelationOidSize <= lengthof(SysCacheRelationOid));
 	Assert(SysCacheSupportingRelOidSize <= lengthof(SysCacheSupportingRelOid));
 
-	/* Sort and de-dup OID arrays, so we can use binary search. */
-	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
-			 sizeof(Oid), oid_compare);
-	SysCacheRelationOidSize =
-		qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
-				oid_compare);
-
-	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
-			 sizeof(Oid), oid_compare);
-	SysCacheSupportingRelOidSize =
-		qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
-				sizeof(Oid), oid_compare);
-
 	CacheInitialized = true;
 }
 
@@ -1552,22 +1538,7 @@ RelationInvalidatesSnapshotsOnly(Oid relid)
 bool
 RelationHasSysCache(Oid relid)
 {
-	int			low = 0,
-				high = SysCacheRelationOidSize - 1;
-
-	while (low <= high)
-	{
-		int			middle = low + (high - low) / 2;
-
-		if (SysCacheRelationOid[middle] == relid)
-			return true;
-		if (SysCacheRelationOid[middle] < relid)
-			low = middle + 1;
-		else
-			high = middle - 1;
-	}
-
-	return false;
+	return pg_lfind32(relid, SysCacheRelationOid, SysCacheRelationOidSize);
 }
 
 /*
@@ -1577,35 +1548,5 @@ RelationHasSysCache(Oid relid)
 bool
 RelationSupportsSysCache(Oid relid)
 {
-	int			low = 0,
-				high = SysCacheSupportingRelOidSize - 1;
-
-	while (low <= high)
-	{
-		int			middle = low + (high - low) / 2;
-
-		if (SysCacheSupportingRelOid[middle] == relid)
-			return true;
-		if (SysCacheSupportingRelOid[middle] < relid)
-			low = middle + 1;
-		else
-			high = middle - 1;
-	}
-
-	return false;
-}
-
-
-/*
- * OID comparator for pg_qsort
- */
-static int
-oid_compare(const void *a, const void *b)
-{
-	Oid			oa = *((const Oid *) a);
-	Oid			ob = *((const Oid *) b);
-
-	if (oa == ob)
-		return 0;
-	return (oa > ob) ? 1 : -1;
+	return pg_lfind32(relid, SysCacheSupportingRelOid, SysCacheSupportingRelOidSize);
 }
-- 
2.36.1
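One behavioral note on 0003: with the pg_qsort/qunique step gone, the
OID arrays stay in declaration order and may now contain duplicates
(SysCacheSupportingRelOid in particular, since caches can share
supporting relations). That is harmless for a membership test, because
pg_lfind32() cares about neither order nor uniqueness; the tiny
hypothetical check below (lfind32_ref is a made-up stand-in with the
same contract) illustrates the point. Whether a blocked linear scan
over arrays of at most SysCacheSize and SysCacheSize * 2 entries
actually beats the old binary search is exactly the performance
question left open above.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t Oid;			/* Oid is a 32-bit unsigned integer */

/* stand-in with the same contract as pg_lfind32() */
static bool
lfind32_ref(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t i = 0; i < nelem; i++)
		if (base[i] == key)
			return true;
	return false;
}

int
main(void)
{
	/*
	 * Unsorted and containing a duplicate -- legal input now that the
	 * arrays are no longer sorted and de-duplicated.
	 */
	Oid			rels[] = {1259, 1247, 1259, 2615, 1255};

	assert(lfind32_ref(1247, rels, 5));		/* found regardless of order */
	assert(!lfind32_ref(9999, rels, 5));	/* absent */
	return 0;
}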