On Thu, Jun 11, 2020 at 5:37 AM Robert Haas <robertmh...@gmail.com> wrote:
> On Tue, Jun 9, 2020 at 6:03 PM Thomas Munro <thomas.mu...@gmail.com> wrote:
> > That all makes sense.  Now I'm wondering if I should use exactly that
> > word in the GUC... dynamic_shared_memory_preallocate?
>
> I tend to prefer verb-object rather than object-verb word ordering,
> because that's how English normally works, but I realize this is not a
> unanimous view.

It's pretty much just me and Yoda against all the rest of you, so
let's try preallocate_dynamic_shared_memory.  I guess it could also be
min_dynamic_shared_memory to drop the verb.  Other ideas welcome.

> It's a little strange because the fact of preallocating it makes it
> not dynamic any more. I don't know what to do about that.

Well, it's not dynamic at the operating system level, but it's still
dynamic in the sense that PostgreSQL code can get some and give it
back, and there's no change from the point of view of any DSM client
code.

Admittedly, the shared memory architecture is a bit confusing.  We
have main shared memory, DSM memory, DSA memory that is inside main
shared memory with extra DSMs as required, DSA memory that is inside a
DSM and creates extra DSMs as required, and with this patch also DSMs
that are inside main shared memory.  Not to mention palloc and
MemoryContexts and all that.  As you probably remember I once managed
to give an internal presentation at EDB for one hour of solid talking
about all the different kinds of allocators and what they're good for.
It was like a Möbius slide deck already.

Here's a version that adds some documentation.
From 8f222062b60d6674cd9f46e716a56201ef498f84 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 9 Apr 2020 14:12:38 +1200
Subject: [PATCH v2] Preallocate some DSM space at startup.

Create an optional region in the main shared memory segment that can be
used to acquire and release "fast" DSM segments, and can benefit from
huge pages allocated at cluster startup time, if configured.  Fall back
to the existing mechanisms when that space is full.  The size is
controlled by preallocate_dynamic_shared_memory, defaulting to 0.

Main region DSM segments initially contain whatever garbage the memory
held last time they were used, rather than zeroes.  That change revealed
that DSA areas failed to initialize themselves correctly in memory that
wasn't zeroed first, so fix that problem.

Reviewed-by: Robert Haas <robertmh...@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGLAE2QBv-WgGp%2BD9P_J-%3Dyne3zof9nfMaqq1h3EGHFXYQ%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      |  26 +++
 src/backend/storage/ipc/dsm.c                 | 184 ++++++++++++++++--
 src/backend/storage/ipc/dsm_impl.c            |   3 +
 src/backend/storage/ipc/ipci.c                |   3 +
 src/backend/utils/misc/guc.c                  |  11 ++
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/backend/utils/mmgr/dsa.c                  |   5 +-
 src/include/storage/dsm.h                     |   3 +
 src/include/storage/dsm_impl.h                |   1 +
 9 files changed, 213 insertions(+), 25 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 783bf7a12b..35d342a694 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1831,6 +1831,32 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-preallocate-dynamic-shared-memory" xreflabel="preallocate_dynamic_shared_memory">
+      <term><varname>preallocate_dynamic_shared_memory</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>preallocate_dynamic_shared_memory</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of memory that should be allocated at server
+        startup time for use by parallel queries.  When this memory region is
+        insufficient or exhausted by concurrent parallel queries, new
+        parallel queries try to allocate extra shared memory temporarily from
+        the operating system using the method configured with
+        <varname>dynamic_shared_memory_type</varname>, which may be slower
+        due to memory management overheads.  Memory that is allocated with
+        <varname>preallocate_dynamic_shared_memory</varname> is affected by the
+        <varname>huge_pages</varname> setting on operating systems where that
+        is supported, and may be more likely to benefit from larger pages on
+        operating systems where page size is managed automatically.  Larger
+        memory pages can improve the performance of parallel hash joins.
+        The default value is <literal>0</literal> (none).  This parameter can
+        only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
      </sect2>
 
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
index ef64d08357..4f87ece3b3 100644
--- a/src/backend/storage/ipc/dsm.c
+++ b/src/backend/storage/ipc/dsm.c
@@ -35,10 +35,12 @@
 
 #include "lib/ilist.h"
 #include "miscadmin.h"
+#include "port/pg_bitutils.h"
 #include "storage/dsm.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
+#include "utils/freepage.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/resowner_private.h"
@@ -76,6 +78,8 @@ typedef struct dsm_control_item
 {
 	dsm_handle	handle;
 	uint32		refcnt;			/* 2+ = active, 1 = moribund, 0 = gone */
+	size_t		first_page;
+	size_t		npages;
 	void	   *impl_private_pm_handle; /* only needed on Windows */
 	bool		pinned;
 } dsm_control_item;
@@ -95,10 +99,15 @@ static dsm_segment *dsm_create_descriptor(void);
 static bool dsm_control_segment_sane(dsm_control_header *control,
 									 Size mapped_size);
 static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
 
 /* Has this backend initialized the dynamic shared memory system yet? */
 static bool dsm_init_done = false;
 
+/* The address range of DSM space in the main shared memory segment. */
+static void *dsm_main_space_begin = NULL;
+
 /*
  * List of dynamic shared memory segments used by this backend.
  *
@@ -247,8 +256,12 @@ dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
 		if (refcnt == 0)
 			continue;
 
-		/* Log debugging information. */
+		/* If it was using the main shmem area, there is nothing to do. */
 		handle = old_control->item[i].handle;
+		if (is_main_region_dsm_handle(handle))
+			continue;
+
+		/* Log debugging information. */
 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
 			 handle, refcnt);
 
@@ -348,8 +361,11 @@ dsm_postmaster_shutdown(int code, Datum arg)
 		if (dsm_control->item[i].refcnt == 0)
 			continue;
 
-		/* Log debugging information. */
 		handle = dsm_control->item[i].handle;
+		if (is_main_region_dsm_handle(handle))
+			continue;
+
+		/* Log debugging information. */
 		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
 			 handle);
 
@@ -418,6 +434,45 @@ dsm_set_control_handle(dsm_handle h)
 }
 #endif
 
+/*
+ * Size in bytes of the main shared memory region set aside for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+	return (size_t) preallocate_dynamic_shared_memory * 1024 * 1024;
+}
+
+/*
+ * Set up the preallocated DSM region inside the main shared memory segment.
+ */
+void
+dsm_shmem_init(void)
+{
+	size_t		region_size = dsm_estimate_size();
+	bool		already_exists;
+
+	/* Nothing to set up when no space was requested. */
+	if (region_size == 0)
+		return;
+
+	dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", region_size,
+										   &already_exists);
+	if (!already_exists)
+	{
+		FreePageManager *manager = (FreePageManager *) dsm_main_space_begin;
+		size_t		total_pages = region_size / FPM_PAGE_SIZE;
+		size_t		header_pages;
+
+		/* Round the FreePageManager's own footprint up to whole pages. */
+		header_pages = (sizeof(FreePageManager) + FPM_PAGE_SIZE - 1) /
+			FPM_PAGE_SIZE;
+		/* Initialize it, then hand it every page after its own. */
+		FreePageManagerInitialize(manager, dsm_main_space_begin);
+		FreePageManagerPut(manager, header_pages, total_pages - header_pages);
+	}
+}
+
 /*
  * Create a new dynamic shared memory segment.
  *
@@ -434,6 +489,10 @@ dsm_create(Size size, int flags)
 	dsm_segment *seg;
 	uint32		i;
 	uint32		nitems;
+	size_t		npages = 0;
+	size_t		first_page = 0;
+	FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+	bool		using_main_dsm_region = false;
 
 	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
 	Assert(IsUnderPostmaster);
@@ -444,20 +503,48 @@ dsm_create(Size size, int flags)
 	/* Create a new segment descriptor. */
 	seg = dsm_create_descriptor();
 
-	/* Loop until we find an unused segment identifier. */
-	for (;;)
+	/*
+	 * Lock the control segment while we try to allocate from the main shared
+	 * memory area, if configured.
+	 */
+	if (dsm_main_space_fpm)
 	{
-		Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
-		seg->handle = random();
-		if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
-			continue;
-		if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
-						&seg->mapped_address, &seg->mapped_size, ERROR))
-			break;
+		npages = size / FPM_PAGE_SIZE;
+		if (size % FPM_PAGE_SIZE > 0)
+			++npages;
+
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+		if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+		{
+			/* We can carve out a piece of the main shared memory segment. */
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = npages * FPM_PAGE_SIZE;
+			using_main_dsm_region = true;
+			/* We'll choose a handle below. */
+		}
 	}
 
-	/* Lock the control segment so we can register the new segment. */
-	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	if (!using_main_dsm_region)
+	{
+		/*
+		 * We need to create a new memory segment.  Loop until we find an
+		 * unused segment identifier.
+		 */
+		if (dsm_main_space_fpm)
+			LWLockRelease(DynamicSharedMemoryControlLock);
+		for (;;)
+		{
+			Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+			seg->handle = random() & ~1;	/* Even numbers only */
+			if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
+				continue;
+			if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+							&seg->mapped_address, &seg->mapped_size, ERROR))
+				break;
+		}
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	}
 
 	/* Search the control segment for an unused slot. */
 	nitems = dsm_control->nitems;
@@ -465,6 +552,14 @@ dsm_create(Size size, int flags)
 	{
 		if (dsm_control->item[i].refcnt == 0)
 		{
+			if (using_main_dsm_region)
+			{
+				seg->handle = make_main_region_dsm_handle(i);
+				dsm_control->item[i].first_page = first_page;
+				dsm_control->item[i].npages = npages;
+			}
+			else
+				Assert(!is_main_region_dsm_handle(seg->handle));
 			dsm_control->item[i].handle = seg->handle;
 			/* refcnt of 1 triggers destruction, so start at 2 */
 			dsm_control->item[i].refcnt = 2;
@@ -479,9 +574,12 @@ dsm_create(Size size, int flags)
 	/* Verify that we can support an additional mapping. */
 	if (nitems >= dsm_control->maxitems)
 	{
+		if (using_main_dsm_region)
+			FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
 		LWLockRelease(DynamicSharedMemoryControlLock);
-		dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
-					&seg->mapped_address, &seg->mapped_size, WARNING);
+		if (!using_main_dsm_region)
+			dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
 		if (seg->resowner != NULL)
 			ResourceOwnerForgetDSM(seg->resowner, seg);
 		dlist_delete(&seg->node);
@@ -495,6 +593,12 @@ dsm_create(Size size, int flags)
 	}
 
 	/* Enter the handle into a new array slot. */
+	if (using_main_dsm_region)
+	{
+		seg->handle = make_main_region_dsm_handle(nitems);
+		dsm_control->item[i].first_page = first_page;
+		dsm_control->item[i].npages = npages;
+	}
 	dsm_control->item[nitems].handle = seg->handle;
 	/* refcnt of 1 triggers destruction, so start at 2 */
 	dsm_control->item[nitems].refcnt = 2;
@@ -580,6 +684,12 @@ dsm_attach(dsm_handle h)
 		/* Otherwise we've found a match. */
 		dsm_control->item[i].refcnt++;
 		seg->control_slot = i;
+		if (is_main_region_dsm_handle(seg->handle))
+		{
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				dsm_control->item[i].first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
+		}
 		break;
 	}
 	LWLockRelease(DynamicSharedMemoryControlLock);
@@ -597,8 +707,9 @@ dsm_attach(dsm_handle h)
 	}
 
 	/* Here's where we actually try to map the segment. */
-	dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
-				&seg->mapped_address, &seg->mapped_size, ERROR);
+	if (!is_main_region_dsm_handle(seg->handle))
+		dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+					&seg->mapped_address, &seg->mapped_size, ERROR);
 
 	return seg;
 }
@@ -688,8 +799,9 @@ dsm_detach(dsm_segment *seg)
 	 */
 	if (seg->mapped_address != NULL)
 	{
-		dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
-					&seg->mapped_address, &seg->mapped_size, WARNING);
+		if (!is_main_region_dsm_handle(seg->handle))
+			dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
 		seg->impl_private = NULL;
 		seg->mapped_address = NULL;
 		seg->mapped_size = 0;
@@ -729,10 +841,15 @@ dsm_detach(dsm_segment *seg)
 			 * other reason, the postmaster may not have any better luck than
 			 * we did.  There's not much we can do about that, though.
 			 */
-			if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+			if (is_main_region_dsm_handle(seg->handle) ||
+				dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
 							&seg->mapped_address, &seg->mapped_size, WARNING))
 			{
 				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+				if (is_main_region_dsm_handle(seg->handle))
+					FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+									   dsm_control->item[control_slot].first_page,
+									   dsm_control->item[control_slot].npages);
 				Assert(dsm_control->item[control_slot].handle == seg->handle);
 				Assert(dsm_control->item[control_slot].refcnt == 1);
 				dsm_control->item[control_slot].refcnt = 0;
@@ -894,10 +1011,15 @@ dsm_unpin_segment(dsm_handle handle)
 		 * pass the mapped size, mapped address, and private data as NULL
 		 * here.
 		 */
-		if (dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+		if (is_main_region_dsm_handle(handle) ||
+			dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
 						&junk_mapped_address, &junk_mapped_size, WARNING))
 		{
 			LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+			if (is_main_region_dsm_handle(handle))
+				FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+								   dsm_control->item[control_slot].first_page,
+								   dsm_control->item[control_slot].npages);
 			Assert(dsm_control->item[control_slot].handle == handle);
 			Assert(dsm_control->item[control_slot].refcnt == 1);
 			dsm_control->item[control_slot].refcnt = 0;
@@ -1094,3 +1216,23 @@ dsm_control_bytes_needed(uint32 nitems)
 	return offsetof(dsm_control_header, item)
 		+ sizeof(dsm_control_item) * (uint64) nitems;
 }
+
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+	/*
+	 * Build an odd handle, so it can't collide with the even handles used for
+	 * segments created by dsm_impl_op().  The slot number goes in the next
+	 * bits, keeping live main region handles distinct; slot < maxitems can
+	 * need pg_leftmost_one_pos32(maxitems) + 1 bits, so the random bits start
+	 * above those.  Random high bits make reuse of a recently destroyed
+	 * handle unlikely.  Unsigned arithmetic avoids signed shift overflow.
+	 */
+	return 1 | ((dsm_handle) slot << 1) | ((dsm_handle) random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 2));
+}
+
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+	return (handle & 1) != 0;	/* main region handles are the odd ones */
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 1972aecbed..24952c698c 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -113,6 +113,9 @@ const struct config_enum_entry dynamic_shared_memory_options[] = {
 /* Implementation selector. */
 int			dynamic_shared_memory_type;
 
+/* Amount of space reserved for DSM segments in the main area. */
+int			preallocate_dynamic_shared_memory;
+
 /* Size of buffer to be used for zero-filling. */
 #define ZBUFFER_SIZE				8192
 
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 427b0d59cd..3c3b102b2b 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -119,6 +119,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, SpinlockSemaSize());
 		size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
 												 sizeof(ShmemIndexEnt)));
+		size = add_size(size, dsm_estimate_size());
 		size = add_size(size, BufferShmemSize());
 		size = add_size(size, LockShmemSize());
 		size = add_size(size, PredicateLockShmemSize());
@@ -208,6 +209,8 @@ CreateSharedMemoryAndSemaphores(void)
 	 */
 	InitShmemIndex();
 
+	dsm_shmem_init();
+
 	/*
 	 * Set up xlog, clog, and buffers
 	 */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 75fc6f11d6..811cd0c88c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2236,6 +2236,17 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"preallocate_dynamic_shared_memory", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Amount of dynamic shared memory reserved at startup."),
+			NULL,
+			GUC_UNIT_MB
+		},
+		&preallocate_dynamic_shared_memory,
+		0, 0, MAX_KILOBYTES,
+		NULL, NULL, NULL
+	},
+
 	/*
 	 * We sometimes multiply the number of shared buffers by two without
 	 * checking for overflow, so we mustn't allow more than INT_MAX / 2.
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 3a25287a39..6e23d7c188 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -145,6 +145,8 @@
 					#   windows
 					#   mmap
 					# (change requires restart)
+#preallocate_dynamic_shared_memory = 0MB	# zero disables the feature
+					# (change requires restart)
 
 # - Disk -
 
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
index b7ad8e62ef..6e5e412429 100644
--- a/src/backend/utils/mmgr/dsa.c
+++ b/src/backend/utils/mmgr/dsa.c
@@ -1223,6 +1223,7 @@ create_internal(void *place, size_t size,
 	 * space.
 	 */
 	control = (dsa_area_control *) place;
+	memset(place, 0, sizeof(*control));
 	control->segment_header.magic =
 		DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
 	control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
@@ -1233,14 +1234,10 @@ create_internal(void *place, size_t size,
 	control->handle = control_handle;
 	control->max_total_segment_size = (size_t) -1;
 	control->total_segment_size = size;
-	memset(&control->segment_handles[0], 0,
-		   sizeof(dsm_handle) * DSA_MAX_SEGMENTS);
 	control->segment_handles[0] = control_handle;
 	for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
 		control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
-	control->high_segment_index = 0;
 	control->refcnt = 1;
-	control->freed_segment_counter = 0;
 	control->lwlock_tranche_id = tranche_id;
 
 	/*
diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h
index 408c0543a6..0455576f4a 100644
--- a/src/include/storage/dsm.h
+++ b/src/include/storage/dsm.h
@@ -29,6 +29,9 @@ extern void dsm_postmaster_startup(struct PGShmemHeader *);
 extern void dsm_backend_shutdown(void);
 extern void dsm_detach_all(void);
 
+extern size_t dsm_estimate_size(void);
+extern void dsm_shmem_init(void);
+
 #ifdef EXEC_BACKEND
 extern void dsm_set_control_handle(dsm_handle h);
 #endif
diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h
index 562cb781a8..5593ac80e7 100644
--- a/src/include/storage/dsm_impl.h
+++ b/src/include/storage/dsm_impl.h
@@ -40,6 +40,7 @@
 
 /* GUC. */
 extern int	dynamic_shared_memory_type;
+extern int	preallocate_dynamic_shared_memory;
 
 /*
  * Directory for on-disk state.
-- 
2.20.1

Reply via email to