On Mon, 29 Jun 2020 at 12:17, Julien Rouhaud <rjuju...@gmail.com> wrote:

> On Mon, Jun 29, 2020 at 10:55 AM Fujii Masao
> <masao.fu...@oss.nttdata.com> wrote:
> >
> > On 2020/06/29 16:05, Julien Rouhaud wrote:
> > > On Mon, Jun 29, 2020 at 7:49 AM Tharakan, Robins <tha...@amazon.com> wrote:
> > >>
> > >> During a fully-cached SELECT-only test using pgbench, Postgres v13Beta1 shows
> >
> > Thanks for the benchmark!
> >
> >
> > >> a ~45% performance drop [2] at high DB connection counts (when compared with v12.3).
> >
> > That's bad :(
> >
> >
> > >>
> > >> Disabling pg_stat_statements.track_planning (which is 'On' by default)
> > >> brings the TPS numbers up to v12.3 levels.
> > >>
> > >> The inflection point (in this test-case) is 128 connections, beyond which the
> > >> TPS numbers are consistently low. Looking at the mailing list [1], this issue
> > >> didn't surface earlier, possibly since the regression is trivial at low connection counts.
> > >>
> > >> It would be great if this could be optimized further, or track_planning
> > >> disabled (by default) so as to not trip users upgrading from v12 with pg_stat_statements
> > >> enabled (but otherwise not particularly interested in track_planning).
> >
> > Your benchmark result seems to suggest that the cause of the problem is
> > the contention on the per-query spinlock in pgss_store(). Right?
> > This lock contention is likely to happen when multiple sessions run
> > the same queries.
> >
> > One idea to reduce that lock contention is to separate the per-query spinlock
> > into two; one for planning, and the other for execution. pgss_store()
> > determines which lock to use based on the given "kind" argument.
> > To make this idea work, every pgss counter like shared_blks_hit would
> > also need to be separated into two, i.e., one for planning and one for execution.
>
> This can probably remove some overhead, but won't it eventually hit
> the same issue when multiple connections try to plan the same query,
> given the small number of distinct queries and the very short execution time?
> It would also significantly increase the shared memory consumption.
>
> I'm wondering if we could instead use atomics to store the counters.
> The only downside is that we won't guarantee per-row consistency
> anymore, which may be problematic.
>
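
For what it's worth, a per-counter atomics version might look roughly like
the sketch below. This is only an illustration: the struct and field names
are made up rather than the real pg_stat_statements ones, and the actual
Counters struct also holds double-valued fields (timings, stddev) that
would need compare-and-exchange loops instead of a plain fetch-add.

/* Sketch only: hypothetical lock-free counters, not the actual pgss structs. */
#include "postgres.h"
#include "port/atomics.h"

typedef struct AtomicCountersSketch
{
	/* initialized with pg_atomic_init_u64() when the entry is created */
	pg_atomic_uint64 calls;		/* number of executions of the query */
	pg_atomic_uint64 rows;		/* total rows retrieved or affected */
} AtomicCountersSketch;

static void
count_execution(AtomicCountersSketch *c, int64 nrows)
{
	/* Each counter is updated independently; no per-entry lock is taken. */
	pg_atomic_fetch_add_u64(&c->calls, 1);
	pg_atomic_fetch_add_u64(&c->rows, nrows);
}

The per-row consistency caveat is exactly that: a concurrent reader of the
view could then see calls and rows captured at slightly different moments.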


The problem looks to be that spinlocks behave terribly when the CPU is
oversubscribed and the spinlock is contended. A process holding the spinlock
can easily get scheduled out, leading to excessive spinning by everyone else.
I think a simple thing to try would be to replace the spinlock with an LWLock.
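
Something roughly along these lines; just a sketch under assumed names, not
the actual pg_stat_statements entry layout (the real entry keeps a Counters
struct behind an slock_t, and the LWLock would need LWLockInitialize() with
a registered tranche when the entry is created):

/* Hypothetical: a per-entry LWLock instead of the slock_t mutex. */
#include "postgres.h"
#include "storage/lwlock.h"

typedef struct PgssEntrySketch
{
	LWLock		lock;			/* set up with LWLockInitialize() */
	int64		calls;
	double		total_time;
} PgssEntrySketch;

static void
record_execution(PgssEntrySketch *entry, double elapsed_ms)
{
	/*
	 * Unlike a spinlock, an LWLock puts waiters to sleep, so an
	 * oversubscribed CPU does not degenerate into a spinning pile-up.
	 */
	LWLockAcquire(&entry->lock, LW_EXCLUSIVE);
	entry->calls++;
	entry->total_time += elapsed_ms;
	LWLockRelease(&entry->lock);
}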

I did a prototype patch that replaces spinlocks with futexes, but was not
able to find a workload where it mattered. We have done a great job at
eliminating spinlocks from contended code paths. Robins, perhaps you could
try it to see if it reduces the regression you are observing. The patch is
against the v13 stable branch.

-- 
Ants Aasma
Senior Database Engineer
www.cybertec-postgresql.com
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 7fac0703419..56d45b7cfce 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -90,6 +90,7 @@ s_lock_stuck(const char *file, int line, const char *func)
 int
 s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 {
+#ifndef HAS_FUTEX
 	SpinDelayStatus delayStatus;
 
 	init_spin_delay(&delayStatus, file, line, func);
@@ -102,6 +103,8 @@ s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 	finish_spin_delay(&delayStatus);
 
 	return delayStatus.delays;
+#endif
+	elog(FATAL, "s_lock() should not be called when futexes are in use");
 }
 
 #ifdef USE_DEFAULT_S_UNLOCK
@@ -218,6 +221,71 @@ update_spins_per_delay(int shared_spins_per_delay)
 	return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
 }
 
+#ifdef HAS_FUTEX
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+
+static int
+futex(volatile uint32 *uaddr, int futex_op, int val,
+	  const struct timespec *timeout, int *uaddr2, int val3)
+{
+	return syscall(SYS_futex, uaddr, futex_op, val,
+				   timeout, uaddr2, val3);
+}
+
+int
+futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func)
+{
+	int i = 0, s;
+	/*
+	 * Lock states: 0 = free, 1 = locked, 2 = locked with possible waiters.
+	 * First spin a bit in user space; the holder is likely still running.
+	 */
+	if (likely(current < 2))
+	{
+		uint32 expected;
+		for (i = 0; i < DEFAULT_SPINS_PER_DELAY; i++)
+		{
+			SPIN_DELAY();
+			expected = lock->value;
+			if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 1))
+				return i;
+		}
+
+		while (expected != 2 && !pg_atomic_compare_exchange_u32(lock, &expected, 2)) {
+			if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 2))
+				return i;
+		}
+	}
+
+	/* At this point the lock value is 2, so the holder will wake us on release. */
+	while (true)
+	{
+		uint32 expected = 0;
+		s = futex(&(lock->value), FUTEX_WAIT, 2, NULL, NULL, 0);
+		if (s == -1 && errno != EAGAIN && errno != EINTR)
+			elog(FATAL, "Futex wait failed with error: %m");
+
+		/* Take the lock in contended state (2) so our unlock wakes any other waiters. */
+		if (pg_atomic_compare_exchange_u32(lock, &expected, 2))
+			break;
+
+	}
+
+	return i;
+}
+
+int futex_unlock(volatile slock_t *lock, uint32 current)
+{
+	/* S_UNLOCK() has already reset the lock to 0; just wake one waiter. */
+	if (futex(&(lock->value), FUTEX_WAKE, 1, NULL, NULL, 0) == -1)
+		elog(FATAL, "Futex wake failed with error: %m");
+
+	return 0;
+}
+
+#endif /* HAS_FUTEX */
 
 /*
  * Various TAS implementations that cannot live in s_lock.h as no inline
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 31a5ca6fb32..ba0b0d7ef91 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -205,6 +205,52 @@ spin_delay(void)
 #ifdef __x86_64__		/* AMD Opteron, Intel EM64T */
 #define HAS_TEST_AND_SET
 
+#if defined(__linux__)
+#define HAS_FUTEX 1 	/* TODO: move to configure to check for old kernels */
+#endif
+
+#ifdef HAS_FUTEX
+
+#include "port/atomics.h"
+
+typedef pg_atomic_uint32 slock_t;
+
+#define S_LOCK(lock) \
+	do { \
+		uint32 expected = 0; \
+		if (unlikely(!pg_atomic_compare_exchange_u32((lock), &expected, 1))) \
+			futex_lock((lock), expected, __FILE__, __LINE__, PG_FUNCNAME_MACRO); \
+	} while (0)
+
+
+#define S_UNLOCK(lock) \
+	do { \
+		uint32 actual = pg_atomic_exchange_u32((lock), 0); \
+		if (unlikely(actual == 2)) \
+			futex_unlock((lock), actual); \
+	} while (0)
+extern int futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func);
+extern int futex_unlock(volatile slock_t *lock, uint32 current);
+
+/* TAS only needed for regress */
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+	register uint32 _res = 1;
+
+	__asm__ __volatile__(
+		"	lock			\n"
+		"	xchgl	%0,%1	\n"
+:		"+q"(_res), "+m"(*lock)
+:		/* no inputs */
+:		"memory", "cc");
+	return (int) _res;
+}
+
+
+#else							/* !HAS_FUTEX */
 typedef unsigned char slock_t;
 
 #define TAS(lock) tas(lock)
@@ -247,6 +293,7 @@ spin_delay(void)
 		" rep; nop			\n");
 }
 
+#endif   /* HAS_FUTEX */
 #endif	 /* __x86_64__ */
 
 
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 02397f2eb10..a5ba0212460 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -832,7 +832,11 @@ test_spinlock(void)
 		S_UNLOCK(&struct_w_lock.lock);
 
 		/* and that "contended" acquisition works */
+#ifdef HAS_FUTEX
+		futex_lock(&struct_w_lock.lock, 1, "testfile", 17, "testfunc");
+#else
 		s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc");
+#endif
 		S_UNLOCK(&struct_w_lock.lock);
 
 		/*
