Re: optimize lookups in snapshot [sub]xip arrays

Nathan Bossart Wed, 03 Aug 2022 10:12:17 -0700

Here is a new patch set.  0001 is the currently-proposed patch from the
other thread [0] for determining SSE2 support.  0002 introduces the
optimized linear search function.  And 0003 makes use of the new function
for the [sub]xip lookups in XidInMVCCSnapshot().


[0] 
https://postgr.es/m/CAFBsxsGktHL7%3DJXbgnKTi_uL0VRPcH4FSAqc6yK-3%2BJYfqPPjA%40mail.gmail.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

>From bd523948876f801b7f1b909f399b2cc41acf06cf Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Wed, 3 Aug 2022 11:07:40 +0700
Subject: [PATCH v5 1/3] Support SSE2 intrinsics where available

SSE2 vector instructions are part of the spec for the 64-bit x86
architecture. Until now we have relied on the compiler to autovectorize
in some limited situations, but some useful coding idioms can only be
expressed explicitly via compiler intrinsics. To this end, add a header
that defines USE_SSE2 when available. While x86-only for now, we can
add other architectures in the future. This will also be the intended
place for low-level hepler functions that use vector operations.

Reviewed by Nathan Bossart

Discussion: https://www.postgresql.org/message-id/CAFBsxsE2G_H_5Wbw%2BNOPm70-BK4xxKf86-mRzY%3DL2sLoQqM%2B-Q%40mail.gmail.com
---
 src/include/port/simd.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 src/include/port/simd.h

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
new file mode 100644
index 0000000000..a571e79f57
--- /dev/null
+++ b/src/include/port/simd.h
@@ -0,0 +1,30 @@
+/*-------------------------------------------------------------------------
+ *
+ * simd.h
+ *	  Support for platform-specific vector operations.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/simd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SIMD_H
+#define SIMD_H
+
+/*
+ * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
+ * that compilers targeting this architecture understand SSE2 intrinsics.
+ *
+ * We use emmintrin.h rather than the comprehensive header immintrin.h in
+ * order to exclude extensions beyond SSE2. This is because MSVC, at least,
+ * will allow the use of intrinsics that haven't been enabled at compile
+ * time.
+ */
+#if (defined(__x86_64__) || defined(_M_AMD64))
+#include <emmintrin.h>
+#define USE_SSE2
+#endif
+
+#endif							/* SIMD_H */
-- 
2.25.1

>From 89d17ba8a669b53814551284f8f8c82192eb1402 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 3 Aug 2022 09:49:04 -0700
Subject: [PATCH v5 2/3] Introduce optimized routine for linear searches
 through an array of integers.

If SSE2 is available, this function uses it to speed up the search.  Otherwise,
it uses a simple 'for' loop.  This is a prerequisite for a follow-up commit
that will use this function to optimize [sub]xip lookups in
XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such
an optimization.

It might be worthwhile to add an ARM-specific code path to this function in the
future.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/include/utils/linearsearch.h | 76 ++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 src/include/utils/linearsearch.h

diff --git a/src/include/utils/linearsearch.h b/src/include/utils/linearsearch.h
new file mode 100644
index 0000000000..51298b4355
--- /dev/null
+++ b/src/include/utils/linearsearch.h
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * linearsearch.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/utils/linearsearch.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LINEARSEARCH_H
+#define LINEARSEARCH_H
+
+#include "port/simd.h"
+
+/*
+ * pg_linearsearch_uint32
+ *
+ * Returns true if there is an element in 'base' that equals 'key'.  Otherwise,
+ * returns false.
+ *
+ * Since pg_attribute_no_sanitize_alignment() is only intended for x86-specific
+ * code, we surround it with an SSE2 check.
+ */
+#ifdef USE_SSE2
+pg_attribute_no_sanitize_alignment()
+#endif
+static inline bool
+pg_linearsearch_uint32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* If possible, use SSE2 intrinsics to speed up the search. */
+#ifdef USE_SSE2
+	__m128i		keys = _mm_set1_epi32(key);	/* load 4 copies of key */
+	uint32		its = nelem & ~0xF;			/* round down to multiple of 16 */
+
+	for (; i < its; i += 16)
+	{
+		/* load the next 16 values into __m128i variables */
+		__m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
+		__m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
+		__m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
+		__m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+
+		/* perform the comparisons */
+		__m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		__m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		__m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		__m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* shrink the results into a single variable */
+		__m128i tmp1 = _mm_packs_epi32(result1, result2);
+		__m128i tmp2 = _mm_packs_epi32(result3, result4);
+		__m128i result = _mm_packs_epi16(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (_mm_movemask_epi8(result) != 0)
+			return true;
+	}
+#endif
+
+	/* Process the remaining elements the slow way. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+			return true;
+	}
+
+	return false;
+}
+
+#endif							/* LINEARSEARCH_H */
-- 
2.25.1

>From aadb77a991174771d62442904e32ac6fca64571f Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 3 Aug 2022 09:59:28 -0700
Subject: [PATCH v5 3/3] Optimize linear searches in XidInMVCCSnapshot().

This change makes use of the recently-introduced optimized linear search
routine to speed up searches through the [sub]xip arrays when possible, which
should improve performance significantly when the arrays are large.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/backend/utils/time/snapmgr.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5bc2a15160..834c8867d4 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -63,6 +63,7 @@
 #include "storage/sinvaladt.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/linearsearch.h"
 #include "utils/memutils.h"
 #include "utils/old_snapshot.h"
 #include "utils/rel.h"
@@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 bool
 XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
-	uint32		i;
-
 	/*
 	 * Make a quick range check to eliminate most XIDs without looking at the
 	 * xip arrays.  Note that this is OK even if we convert a subxact XID to
@@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		if (!snapshot->suboverflowed)
 		{
 			/* we have full data, so search subxip */
-			int32		j;
-
-			for (j = 0; j < snapshot->subxcnt; j++)
-			{
-				if (TransactionIdEquals(xid, snapshot->subxip[j]))
-					return true;
-			}
+			if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt))
+				return true;
 
 			/* not there, fall through to search xip[] */
 		}
@@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 				return false;
 		}
 
-		for (i = 0; i < snapshot->xcnt; i++)
-		{
-			if (TransactionIdEquals(xid, snapshot->xip[i]))
-				return true;
-		}
+		if (pg_linearsearch_uint32(xid, snapshot->xip, snapshot->xcnt))
+			return true;
 	}
 	else
 	{
-		int32		j;
-
 		/*
 		 * In recovery we store all xids in the subxact array because it is by
 		 * far the bigger array, and we mostly don't know which xids are
@@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		 * indeterminate xid. We don't know whether it's top level or subxact
 		 * but it doesn't matter. If it's present, the xid is visible.
 		 */
-		for (j = 0; j < snapshot->subxcnt; j++)
-		{
-			if (TransactionIdEquals(xid, snapshot->subxip[j]))
-				return true;
-		}
+		if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt))
+			return true;
 	}
 
 	return false;
-- 
2.25.1

Re: optimize lookups in snapshot [sub]xip arrays

Reply via email to