On Fri, Jul 29, 2022 at 10:38:11PM -0700, Nathan Bossart wrote:
> On Sat, Jul 30, 2022 at 12:02:02PM +0700, John Naylor wrote:
>> I'll be the first to say it's not committable and needs some thought. Since
>> there are several recently proposed patches that take advantage of SSE2, it
>> seems time for me to open a new thread and get that prerequisite settled.
>> I'll do that next week.
> 
> Awesome.  I will help test and review.

While this prerequisite is worked out [0], here is a new patch set.  I've
added an 0002 in which I've made use of the proposed SSE2 linear search
function in several other areas.  I haven't done any additional performance
analysis, and it's likely I'm missing some eligible code locations, but at
the very least, this demonstrates the reusability of the new function.

[0] 
https://postgr.es/m/CAFBsxsE2G_H_5Wbw%2BNOPm70-BK4xxKf86-mRzY%3DL2sLoQqM%2B-Q%40mail.gmail.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From a2b2ab3777f689775c2e731822aefe2ab500e8ee Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Thu, 28 Jul 2022 12:15:47 -0700
Subject: [PATCH v4 1/2] Use SSE2 intrinsics for XidInMVCCSnapshot().

This optimizes the linear searches through the [sub]xip arrays when
possible, which should improve performance significantly when the
arrays are large.
---
 src/backend/utils/time/snapmgr.c | 28 +++---------
 src/include/c.h                  |  8 ++++
 src/include/utils/linearsearch.h | 78 ++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 21 deletions(-)
 create mode 100644 src/include/utils/linearsearch.h

diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 5bc2a15160..834c8867d4 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -63,6 +63,7 @@
 #include "storage/sinvaladt.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/linearsearch.h"
 #include "utils/memutils.h"
 #include "utils/old_snapshot.h"
 #include "utils/rel.h"
@@ -2284,8 +2285,6 @@ RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc)
 bool
 XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 {
-	uint32		i;
-
 	/*
 	 * Make a quick range check to eliminate most XIDs without looking at the
 	 * xip arrays.  Note that this is OK even if we convert a subxact XID to
@@ -2317,13 +2316,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		if (!snapshot->suboverflowed)
 		{
 			/* we have full data, so search subxip */
-			int32		j;
-
-			for (j = 0; j < snapshot->subxcnt; j++)
-			{
-				if (TransactionIdEquals(xid, snapshot->subxip[j]))
-					return true;
-			}
+			if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt))
+				return true;
 
 			/* not there, fall through to search xip[] */
 		}
@@ -2344,16 +2338,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 				return false;
 		}
 
-		for (i = 0; i < snapshot->xcnt; i++)
-		{
-			if (TransactionIdEquals(xid, snapshot->xip[i]))
-				return true;
-		}
+		if (pg_linearsearch_uint32(xid, snapshot->xip, snapshot->xcnt))
+			return true;
 	}
 	else
 	{
-		int32		j;
-
 		/*
 		 * In recovery we store all xids in the subxact array because it is by
 		 * far the bigger array, and we mostly don't know which xids are
@@ -2383,11 +2372,8 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 		 * indeterminate xid. We don't know whether it's top level or subxact
 		 * but it doesn't matter. If it's present, the xid is visible.
 		 */
-		for (j = 0; j < snapshot->subxcnt; j++)
-		{
-			if (TransactionIdEquals(xid, snapshot->subxip[j]))
-				return true;
-		}
+		if (pg_linearsearch_uint32(xid, snapshot->subxip, snapshot->subxcnt))
+			return true;
 	}
 
 	return false;
diff --git a/src/include/c.h b/src/include/c.h
index d35405f191..2c1a47bc28 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -371,6 +371,14 @@ typedef void (*pg_funcptr_t) (void);
 #endif
 #endif
 
+/*
+ * Are SSE2 intrinsics available?
+ */
+#if (defined(__x86_64__) || defined(_M_AMD64))
+#include <emmintrin.h>
+#define USE_SSE2
+#endif
+
 
 /* ----------------------------------------------------------------
  *				Section 2:	bool, true, false
diff --git a/src/include/utils/linearsearch.h b/src/include/utils/linearsearch.h
new file mode 100644
index 0000000000..65b0092a65
--- /dev/null
+++ b/src/include/utils/linearsearch.h
@@ -0,0 +1,78 @@
+/*-------------------------------------------------------------------------
+ *
+ * linearsearch.h
+ *	  Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/include/utils/linearsearch.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LINEARSEARCH_H
+#define LINEARSEARCH_H
+
+#include "c.h"
+
+#ifdef USE_SSE2
+#include "port/pg_bitutils.h"
+#endif
+
+/*
+ * pg_linearsearch_uint32
+ *
+ * Returns the address of the first element in 'base' that equals 'key', or
+ * NULL if no match is found.
+ */
+#ifdef USE_SSE2
+pg_attribute_no_sanitize_alignment()
+#endif
+static inline uint32 *
+pg_linearsearch_uint32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* If possible, use SSE2 intrinsics to speed up the search. */
+#ifdef USE_SSE2
+	__m128i		keys = _mm_set1_epi32(key);	/* load 4 copies of key */
+	uint32		its = nelem & ~0xF;			/* round down to multiple of 16 */
+
+	for (; i < its; i += 16)
+	{
+		/* load the next 16 values into __m128i variables */
+		__m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
+		__m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
+		__m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
+		__m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+
+		/* perform the comparisons */
+		__m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		__m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		__m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		__m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* shrink the results into an integer */
+		__m128i tmp1 = _mm_packs_epi32(result1, result2);
+		__m128i tmp2 = _mm_packs_epi32(result3, result4);
+		__m128i tmp3 = _mm_packs_epi16(tmp1, tmp2);
+		uint32 result = _mm_movemask_epi8(tmp3);
+
+		/* if found, return a pointer to the matching element */
+		if (result != 0)
+			return &base[i + pg_rightmost_one_pos32(result)];
+	}
+#endif
+
+	/* Process the remaining elements the slow way. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+			return &base[i];
+	}
+
+	return NULL;
+}
+
+#endif							/* LINEARSEARCH_H */
-- 
2.25.1

>From 571f831fc1d55660afb1531c17572847ec57b446 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandboss...@gmail.com>
Date: Tue, 2 Aug 2022 14:56:53 -0700
Subject: [PATCH v4 2/2] SIMD-ify various linear searches through arrays of
 integers.

---
 src/backend/access/transam/slru.c    | 15 ++++++---------
 src/backend/commands/extension.c     | 25 +++++++------------------
 src/backend/commands/tsearchcmds.c   | 13 +++++--------
 src/backend/executor/execParallel.c  | 23 +++++++++++++++--------
 src/backend/parser/parse_coerce.c    | 11 ++++++-----
 src/backend/storage/ipc/procarray.c  |  8 +++-----
 src/backend/storage/lmgr/predicate.c |  9 +++------
 src/backend/utils/adt/lockfuncs.c    | 27 ++++++++++++---------------
 8 files changed, 57 insertions(+), 74 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index b65cb49d7f..84b0487f4d 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -59,6 +59,7 @@
 #include "pgstat.h"
 #include "storage/fd.h"
 #include "storage/shmem.h"
+#include "utils/linearsearch.h"
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
@@ -812,16 +813,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
 	 */
 	if (fdata)
 	{
-		int			i;
+		int		   *match;
 
-		for (i = 0; i < fdata->num_files; i++)
-		{
-			if (fdata->segno[i] == segno)
-			{
-				fd = fdata->fd[i];
-				break;
-			}
-		}
+		if ((match = (int *) pg_linearsearch_uint32(segno,
+													(uint32 *) fdata->segno,
+													fdata->num_files)))
+			fd = fdata->fd[match - fdata->segno];
 	}
 
 	if (fd < 0)
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 6b6720c690..06e045c0e0 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -60,6 +60,7 @@
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/linearsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -2446,7 +2447,7 @@ pg_extension_config_dump(PG_FUNCTION_ARGS)
 	{
 		/* Modify or extend existing extconfig array */
 		Oid		   *arrayData;
-		int			i;
+		Oid		   *match;
 
 		a = DatumGetArrayTypeP(arrayDatum);
 
@@ -2461,14 +2462,8 @@ pg_extension_config_dump(PG_FUNCTION_ARGS)
 
 		arrayIndex = arrayLength + 1;	/* set up to add after end */
 
-		for (i = 0; i < arrayLength; i++)
-		{
-			if (arrayData[i] == tableoid)
-			{
-				arrayIndex = i + 1; /* replace this element instead */
-				break;
-			}
-		}
+		if ((match = pg_linearsearch_uint32(tableoid, arrayData, arrayLength)))
+			arrayIndex = (match - arrayData) + 1; /* replace this element instead */
 
 		a = array_set(a, 1, &arrayIndex,
 					  elementDatum,
@@ -2582,7 +2577,7 @@ extension_config_remove(Oid extensionoid, Oid tableoid)
 	else
 	{
 		Oid		   *arrayData;
-		int			i;
+		Oid		   *match;
 
 		a = DatumGetArrayTypeP(arrayDatum);
 
@@ -2597,14 +2592,8 @@ extension_config_remove(Oid extensionoid, Oid tableoid)
 
 		arrayIndex = -1;		/* flag for no deletion needed */
 
-		for (i = 0; i < arrayLength; i++)
-		{
-			if (arrayData[i] == tableoid)
-			{
-				arrayIndex = i; /* index to remove */
-				break;
-			}
-		}
+		if ((match = pg_linearsearch_uint32(tableoid, arrayData, arrayLength)))
+			arrayIndex = match - arrayData; /* index to remove */
 	}
 
 	/* If tableoid is not in extconfig, nothing to do */
diff --git a/src/backend/commands/tsearchcmds.c b/src/backend/commands/tsearchcmds.c
index 4cc4e3c00f..6ae41bdd56 100644
--- a/src/backend/commands/tsearchcmds.c
+++ b/src/backend/commands/tsearchcmds.c
@@ -44,6 +44,7 @@
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/linearsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "utils/syscache.h"
@@ -1302,14 +1303,10 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt,
 			{
 				bool		tokmatch = false;
 
-				for (j = 0; j < ntoken; j++)
-				{
-					if (cfgmap->maptokentype == tokens[j])
-					{
-						tokmatch = true;
-						break;
-					}
-				}
+				if (pg_linearsearch_uint32(cfgmap->maptokentype,
+										   (uint32 *) tokens, ntoken))
+					tokmatch = true;
+
 				if (!tokmatch)
 					continue;
 			}
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index f1fd7f7e8b..2c9cbe6d52 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -47,6 +47,7 @@
 #include "tcop/tcopprot.h"
 #include "utils/datum.h"
 #include "utils/dsa.h"
+#include "utils/linearsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
@@ -1022,12 +1023,15 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 	int			ibytes;
 	int			plan_node_id = planstate->plan->plan_node_id;
 	MemoryContext oldcontext;
+	int		   *match;
 
 	/* Find the instrumentation for this node. */
-	for (i = 0; i < instrumentation->num_plan_nodes; ++i)
-		if (instrumentation->plan_node_id[i] == plan_node_id)
-			break;
-	if (i >= instrumentation->num_plan_nodes)
+	match = (int *) pg_linearsearch_uint32(plan_node_id,
+										   (uint32 *) instrumentation->plan_node_id,
+										   instrumentation->num_plan_nodes);
+	if (match)
+		i = match - instrumentation->plan_node_id;
+	else
 		elog(ERROR, "plan node %d not found", plan_node_id);
 
 	/* Accumulate the statistics from all workers. */
@@ -1265,6 +1269,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 	int			i;
 	int			plan_node_id = planstate->plan->plan_node_id;
 	Instrumentation *instrument;
+	int		   *match;
 
 	InstrEndLoop(planstate->instrument);
 
@@ -1274,10 +1279,12 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 	 * we're pushing down sufficiently large plan trees.  For now, do it the
 	 * slow, dumb way.
 	 */
-	for (i = 0; i < instrumentation->num_plan_nodes; ++i)
-		if (instrumentation->plan_node_id[i] == plan_node_id)
-			break;
-	if (i >= instrumentation->num_plan_nodes)
+	match = (int *) pg_linearsearch_uint32(plan_node_id,
+										   (uint32 *) instrumentation->plan_node_id,
+										   instrumentation->num_plan_nodes);
+	if (match)
+		i = match - instrumentation->plan_node_id;
+	else
 		elog(ERROR, "plan node %d not found", plan_node_id);
 
 	/*
diff --git a/src/backend/parser/parse_coerce.c b/src/backend/parser/parse_coerce.c
index c4e958e4aa..fd9291a4a4 100644
--- a/src/backend/parser/parse_coerce.c
+++ b/src/backend/parser/parse_coerce.c
@@ -27,6 +27,7 @@
 #include "utils/builtins.h"
 #include "utils/datum.h"		/* needed for datumIsEqual() */
 #include "utils/fmgroids.h"
+#include "utils/linearsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
 #include "utils/typcache.h"
@@ -2920,11 +2921,11 @@ check_valid_internal_signature(Oid ret_type,
 {
 	if (ret_type == INTERNALOID)
 	{
-		for (int i = 0; i < nargs; i++)
-		{
-			if (declared_arg_types[i] == ret_type)
-				return NULL;	/* OK */
-		}
+		if (pg_linearsearch_uint32(ret_type,
+								   unconstify(Oid *, declared_arg_types),
+								   nargs))
+			return NULL;		/* OK */
+
 		return pstrdup(_("A result of type internal requires at least one input of type internal."));
 	}
 	else
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 0555b02a8d..157dab61d1 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -63,6 +63,7 @@
 #include "storage/spin.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
+#include "utils/linearsearch.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
@@ -1588,11 +1589,8 @@ TransactionIdIsInProgress(TransactionId xid)
 	Assert(TransactionIdIsValid(topxid));
 	if (!TransactionIdEquals(topxid, xid))
 	{
-		for (int i = 0; i < nxids; i++)
-		{
-			if (TransactionIdEquals(xids[i], topxid))
-				return true;
-		}
+		if (pg_linearsearch_uint32(topxid, xids, nxids))
+			return true;
 	}
 
 	cachedXidIsNotInProgress = xid;
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 5136da6ea3..11d3412650 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -207,6 +207,7 @@
 #include "storage/predicate_internals.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
+#include "utils/linearsearch.h"
 #include "utils/rel.h"
 #include "utils/snapmgr.h"
 
@@ -4065,7 +4066,6 @@ static bool
 XidIsConcurrent(TransactionId xid)
 {
 	Snapshot	snap;
-	uint32		i;
 
 	Assert(TransactionIdIsValid(xid));
 	Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
@@ -4078,11 +4078,8 @@ XidIsConcurrent(TransactionId xid)
 	if (TransactionIdFollowsOrEquals(xid, snap->xmax))
 		return true;
 
-	for (i = 0; i < snap->xcnt; i++)
-	{
-		if (xid == snap->xip[i])
-			return true;
-	}
+	if (pg_linearsearch_uint32(xid, snap->xip, snap->xcnt))
+		return true;
 
 	return false;
 }
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index f9b324efec..48cb0c87bb 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -20,6 +20,7 @@
 #include "storage/predicate_internals.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
+#include "utils/linearsearch.h"
 
 
 /*
@@ -506,17 +507,15 @@ pg_blocking_pids(PG_FUNCTION_ARGS)
 			{
 				/* conflict in lock requests; who's in front in wait queue? */
 				bool		ahead = false;
-				int			k;
 
-				for (k = 0; k < bproc->num_waiters; k++)
+				if (pg_linearsearch_uint32(instance->pid,
+										   (uint32 *) preceding_waiters,
+										   bproc->num_waiters))
 				{
-					if (preceding_waiters[k] == instance->pid)
-					{
-						/* soft block: this entry is ahead of blocked proc */
-						ahead = true;
-						break;
-					}
+					/* soft block: this entry is ahead of blocked proc */
+					ahead = true;
 				}
+
 				if (!ahead)
 					continue;	/* not blocked by this entry */
 			}
@@ -598,8 +597,7 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 	int			num_interesting_pids;
 	int			num_blocking_pids;
 	int			dummy;
-	int			i,
-				j;
+	int			i;
 
 	/* Validate the passed-in array */
 	Assert(ARR_ELEMTYPE(interesting_pids_a) == INT4OID);
@@ -632,11 +630,10 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 	 * for a match.
 	 */
 	for (i = 0; i < num_blocking_pids; i++)
-		for (j = 0; j < num_interesting_pids; j++)
-		{
-			if (blocking_pids[i] == interesting_pids[j])
-				PG_RETURN_BOOL(true);
-		}
+		if (pg_linearsearch_uint32(blocking_pids[i],
+								   (uint32 *) interesting_pids,
+								   num_interesting_pids))
+			PG_RETURN_BOOL(true);
 
 	/*
 	 * Check if blocked_pid is waiting for a safe snapshot.  We could in
-- 
2.25.1

Reply via email to