On Tue, Aug 09, 2022 at 01:00:37PM -0700, Nathan Bossart wrote:
> Your adjustments in 0002 seem reasonable to me.  I think it makes sense to
> ensure there is test coverage for pg_lfind32(), but I don't know if that
> syscache code is the right choice.  For non-USE_SSE2 builds, it might make
> these lookups more expensive.  I'll look around to see if there are any
> other suitable candidates.

One option might be to create a small test module for pg_lfind32().  Here
is an example.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 0b15a0569318cc846980b7771ff809ef5d2d505c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:49:04 -0700
Subject: [PATCH v12 1/2] Introduce optimized routine for linear searches
 through an array of integers.

If SSE2 is available, this function uses it to speed up the search.  Otherwise,
it uses a simple 'for' loop.  This is a prerequisite for a follow-up commit
that will use this function to optimize [sub]xip lookups in
XidInMVCCSnapshot(), but it can be used anywhere that might benefit from such
an optimization.

It might be worthwhile to add an ARM-specific code path to this function in the
future.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/include/port/pg_lfind.h                   | 103 ++++++++++++++++++
 src/test/modules/Makefile                     |   1 +
 src/test/modules/test_lfind/.gitignore        |   4 +
 src/test/modules/test_lfind/Makefile          |  23 ++++
 .../test_lfind/expected/test_lfind.out        |  12 ++
 .../modules/test_lfind/sql/test_lfind.sql     |   8 ++
 .../modules/test_lfind/test_lfind--1.0.sql    |   8 ++
 src/test/modules/test_lfind/test_lfind.c      |  52 +++++++++
 .../modules/test_lfind/test_lfind.control     |   4 +
 9 files changed, 215 insertions(+)
 create mode 100644 src/include/port/pg_lfind.h
 create mode 100644 src/test/modules/test_lfind/.gitignore
 create mode 100644 src/test/modules/test_lfind/Makefile
 create mode 100644 src/test/modules/test_lfind/expected/test_lfind.out
 create mode 100644 src/test/modules/test_lfind/sql/test_lfind.sql
 create mode 100644 src/test/modules/test_lfind/test_lfind--1.0.sql
 create mode 100644 src/test/modules/test_lfind/test_lfind.c
 create mode 100644 src/test/modules/test_lfind/test_lfind.control

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
new file mode 100644
index 0000000000..fb125977b2
--- /dev/null
+++ b/src/include/port/pg_lfind.h
@@ -0,0 +1,103 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#include "port/simd.h"
+
+/*
+ * pg_lfind32
+ *
+ * Return true if there is an element in 'base' that equals 'key', otherwise
+ * return false.
+ */
+static inline bool
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* Use SIMD intrinsics where available. */
+#ifdef USE_SSE2
+
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers. Testing has showed this is ~40% faster than using a
+	 * block of two registers.
+	 */
+	const		__m128i keys = _mm_set1_epi32(key); /* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;	/* round down to multiple of 16 */
+
+#if defined(USE_ASSERT_CHECKING)
+	bool		assert_result = false;
+
+	/* pre-compute the result for assert checking */
+	for (i = 0; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+			assert_result = true;
+			break;
+		}
+	}
+#endif
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const		__m128i vals1 = _mm_loadu_si128((__m128i *) & base[i]);
+		const		__m128i vals2 = _mm_loadu_si128((__m128i *) & base[i + 4]);
+		const		__m128i vals3 = _mm_loadu_si128((__m128i *) & base[i + 8]);
+		const		__m128i vals4 = _mm_loadu_si128((__m128i *) & base[i + 12]);
+
+		/* compare each value to the key */
+		const		__m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		const		__m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		const		__m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		const		__m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const		__m128i tmp1 = _mm_or_si128(result1, result2);
+		const		__m128i tmp2 = _mm_or_si128(result3, result4);
+		const		__m128i result = _mm_or_si128(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (_mm_movemask_epi8(result) != 0)
+		{
+#if defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+#endif							/* USE_SSE2 */
+
+	/* Process the remaining elements one at a time. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+	Assert(assert_result == false);
+#endif
+	return false;
+}
+
+#endif							/* PG_LFIND_H */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 9090226daa..6c31c8707c 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -19,6 +19,7 @@ SUBDIRS = \
 		  test_extensions \
 		  test_ginpostinglist \
 		  test_integerset \
+		  test_lfind \
 		  test_misc \
 		  test_oat_hooks \
 		  test_parser \
diff --git a/src/test/modules/test_lfind/.gitignore b/src/test/modules/test_lfind/.gitignore
new file mode 100644
index 0000000000..5dcb3ff972
--- /dev/null
+++ b/src/test/modules/test_lfind/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_lfind/Makefile b/src/test/modules/test_lfind/Makefile
new file mode 100644
index 0000000000..00ba56ff74
--- /dev/null
+++ b/src/test/modules/test_lfind/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_lfind/Makefile
+
+MODULE_big = test_lfind
+OBJS = \
+	$(WIN32RES) \
+	test_lfind.o
+PGFILEDESC = "test_lfind - test code for optimized linear search functions"
+
+EXTENSION = test_lfind
+DATA = test_lfind--1.0.sql
+
+REGRESS = test_lfind
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_lfind
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_lfind/expected/test_lfind.out b/src/test/modules/test_lfind/expected/test_lfind.out
new file mode 100644
index 0000000000..222c8fd7ff
--- /dev/null
+++ b/src/test/modules/test_lfind/expected/test_lfind.out
@@ -0,0 +1,12 @@
+CREATE EXTENSION test_lfind;
+--
+-- These tests don't produce any interesting output.  We're checking that
+-- the operations complete without crashing or hanging and that none of their
+-- internal sanity tests fail.
+--
+SELECT test_lfind();
+ test_lfind 
+------------
+ 
+(1 row)
+
diff --git a/src/test/modules/test_lfind/sql/test_lfind.sql b/src/test/modules/test_lfind/sql/test_lfind.sql
new file mode 100644
index 0000000000..899f1dd49b
--- /dev/null
+++ b/src/test/modules/test_lfind/sql/test_lfind.sql
@@ -0,0 +1,8 @@
+CREATE EXTENSION test_lfind;
+
+--
+-- These tests don't produce any interesting output.  We're checking that
+-- the operations complete without crashing or hanging and that none of their
+-- internal sanity tests fail.
+--
+SELECT test_lfind();
diff --git a/src/test/modules/test_lfind/test_lfind--1.0.sql b/src/test/modules/test_lfind/test_lfind--1.0.sql
new file mode 100644
index 0000000000..d82ab0567e
--- /dev/null
+++ b/src/test/modules/test_lfind/test_lfind--1.0.sql
@@ -0,0 +1,8 @@
+/* src/test/modules/test_lfind/test_lfind--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_lfind" to load this file. \quit
+
+CREATE FUNCTION test_lfind()
+	RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_lfind/test_lfind.c b/src/test/modules/test_lfind/test_lfind.c
new file mode 100644
index 0000000000..a000746fb8
--- /dev/null
+++ b/src/test/modules/test_lfind/test_lfind.c
@@ -0,0 +1,52 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_lfind.c
+ *		Test correctness of optimized linear search functions.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/test/modules/test_lfind/test_lfind.c
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "port/pg_lfind.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_lfind);
+
+Datum
+test_lfind(PG_FUNCTION_ARGS)
+{
+#define TEST_ARRAY_SIZE 135
+	uint32		test_array[TEST_ARRAY_SIZE] = {0};
+
+	test_array[8] = 1;
+	test_array[64] = 2;
+	test_array[TEST_ARRAY_SIZE - 1] = 3;
+
+	if (pg_lfind32(1, test_array, 4))
+		elog(ERROR, "pg_lfind32() found nonexistent element");
+	if (!pg_lfind32(1, test_array, TEST_ARRAY_SIZE))
+		elog(ERROR, "pg_lfind32() did not find existing element");
+
+	if (pg_lfind32(2, test_array, 32))
+		elog(ERROR, "pg_lfind32() found nonexistent element");
+	if (!pg_lfind32(2, test_array, TEST_ARRAY_SIZE))
+		elog(ERROR, "pg_lfind32() did not find existing element");
+
+	if (pg_lfind32(3, test_array, 96))
+		elog(ERROR, "pg_lfind32() found nonexistent element");
+	if (!pg_lfind32(3, test_array, TEST_ARRAY_SIZE))
+		elog(ERROR, "pg_lfind32() did not find existing element");
+
+	if (pg_lfind32(4, test_array, TEST_ARRAY_SIZE))
+		elog(ERROR, "pg_lfind32() found nonexistent element");
+
+	PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_lfind/test_lfind.control b/src/test/modules/test_lfind/test_lfind.control
new file mode 100644
index 0000000000..d8b57dfca2
--- /dev/null
+++ b/src/test/modules/test_lfind/test_lfind.control
@@ -0,0 +1,4 @@
+comment = 'Test code for optimized linear search functions'
+default_version = '1.0'
+module_pathname = '$libdir/test_lfind'
+relocatable = true
-- 
2.25.1

>From 89b0ec186f7013aa6ba66c1733130d715161d8f0 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Wed, 3 Aug 2022 09:59:28 -0700
Subject: [PATCH v12 2/2] Optimize linear searches in XidInMVCCSnapshot().

This change makes use of the recently-introduced optimized linear search
routine to speed up searches through the [sub]xip arrays when possible, which
should improve performance significantly when the arrays are large.

Author: Nathan Bossart
Reviewed by: Andres Freund, John Naylor, Bharath Rupireddy, Masahiko Sawada
Discussion: https://postgr.es/m/20220713170950.GA3116318%40nathanxps13
---
 src/backend/utils/time/snapmgr.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5bc2a15160..9b504c9745 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -56,6 +56,7 @@
 #include "datatype/timestamp.h"
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
+#include "port/pg_lfind.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 bool
 XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
-	uint32		i;
-
 	/*
 	 * Make a quick range check to eliminate most XIDs without looking at the
 	 * xip arrays.  Note that this is OK even if we convert a subxact XID to
@@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		if (!snapshot->suboverflowed)
 		{
 			/* we have full data, so search subxip */
-			int32		j;
-
-			for (j = 0; j < snapshot->subxcnt; j++)
-			{
-				if (TransactionIdEquals(xid, snapshot->subxip[j]))
-					return true;
-			}
+			if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+				return true;
 
 			/* not there, fall through to search xip[] */
 		}
@@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 				return false;
 		}
 
-		for (i = 0; i < snapshot->xcnt; i++)
-		{
-			if (TransactionIdEquals(xid, snapshot->xip[i]))
-				return true;
-		}
+		if (pg_lfind32(xid, snapshot->xip, snapshot->xcnt))
+			return true;
 	}
 	else
 	{
-		int32		j;
-
 		/*
 		 * In recovery we store all xids in the subxact array because it is by
 		 * far the bigger array, and we mostly don't know which xids are
@@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		 * indeterminate xid. We don't know whether it's top level or subxact
 		 * but it doesn't matter. If it's present, the xid is visible.
 		 */
-		for (j = 0; j < snapshot->subxcnt; j++)
-		{
-			if (TransactionIdEquals(xid, snapshot->subxip[j]))
-				return true;
-		}
+		if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+			return true;
 	}
 
 	return false;
-- 
2.25.1

Reply via email to