Hi,

On 2021-07-18 19:23:31 +0200, Tomas Vondra wrote:
> Sounds great! Thanks for investigating this and for the improvements.
> 
> It might be good to do some experiments to see how the changes affect memory
> consumption for practical workloads. I'm willing to spend soem time on that,
> if needed.

I've attached my changes. They're in a rough shape right now, but I
think good enough for an initial look.

Greetings,

Andres Freund
>From af4cd1f0b64cd52d7eab342493e3dfd6b0d8388e Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 19 Jul 2021 12:55:51 -0700
Subject: [PATCH 1/2] WIP: optimize allocations by separating hot from cold
 paths.

---
 src/include/nodes/memnodes.h        |   4 +-
 src/include/utils/memutils.h        |  13 +
 src/backend/utils/mmgr/aset.c       | 537 ++++++++++++++--------------
 src/backend/utils/mmgr/generation.c |  22 +-
 src/backend/utils/mmgr/mcxt.c       | 179 +++-------
 src/backend/utils/mmgr/slab.c       |  14 +-
 6 files changed, 354 insertions(+), 415 deletions(-)

diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index e6a757d6a07..8a42d2ff999 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -57,10 +57,10 @@ typedef void (*MemoryStatsPrintFunc) (MemoryContext context, void *passthru,
 
 typedef struct MemoryContextMethods
 {
-	void	   *(*alloc) (MemoryContext context, Size size);
+	void	   *(*alloc) (MemoryContext context, Size size, int flags);
 	/* call this free_p in case someone #define's free() */
 	void		(*free_p) (MemoryContext context, void *pointer);
-	void	   *(*realloc) (MemoryContext context, void *pointer, Size size);
+	void	   *(*realloc) (MemoryContext context, void *pointer, Size size, int flags);
 	void		(*reset) (MemoryContext context);
 	void		(*delete_context) (MemoryContext context);
 	Size		(*get_chunk_space) (MemoryContext context, void *pointer);
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index ff872274d44..2f75b4cca46 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -147,6 +147,19 @@ extern void MemoryContextCreate(MemoryContext node,
 
 extern void HandleLogMemoryContextInterrupt(void);
 extern void ProcessLogMemoryContextInterrupt(void);
+extern void *MemoryContextAllocationFailure(MemoryContext context, Size size, int flags);
+
+extern void MemoryContextSizeFailure(MemoryContext context, Size size, int flags) pg_attribute_noreturn();
+
+static inline void
+MemoryContextCheckSize(MemoryContext context, Size size, int flags)
+{
+	if (unlikely(!AllocSizeIsValid(size)))
+	{
+		if (!(flags & MCXT_ALLOC_HUGE) || !AllocHugeSizeIsValid(size))
+			MemoryContextSizeFailure(context, size, flags);
+	}
+}
 
 /*
  * Memory-context-type-specific functions
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 77872e77bcd..00878354392 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -263,9 +263,9 @@ static AllocSetFreeList context_freelists[2] =
 /*
  * These functions implement the MemoryContext API for AllocSet contexts.
  */
-static void *AllocSetAlloc(MemoryContext context, Size size);
+static void *AllocSetAlloc(MemoryContext context, Size size, int flags);
 static void AllocSetFree(MemoryContext context, void *pointer);
-static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size);
+static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size, int flags);
 static void AllocSetReset(MemoryContext context);
 static void AllocSetDelete(MemoryContext context);
 static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer);
@@ -704,266 +704,10 @@ AllocSetDelete(MemoryContext context)
 	free(set);
 }
 
-/*
- * AllocSetAlloc
- *		Returns pointer to allocated memory of given size or NULL if
- *		request could not be completed; memory is added to the set.
- *
- * No request may exceed:
- *		MAXALIGN_DOWN(SIZE_MAX) - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ
- * All callers use a much-lower limit.
- *
- * Note: when using valgrind, it doesn't matter how the returned allocation
- * is marked, as mcxt.c will set it to UNDEFINED.  In some paths we will
- * return space that is marked NOACCESS - AllocSetRealloc has to beware!
- */
-static void *
-AllocSetAlloc(MemoryContext context, Size size)
+static inline void *
+AllocSetAllocReturnChunk(AllocSet set, Size size, AllocChunk chunk, Size chunk_size)
 {
-	AllocSet	set = (AllocSet) context;
-	AllocBlock	block;
-	AllocChunk	chunk;
-	int			fidx;
-	Size		chunk_size;
-	Size		blksize;
-
-	AssertArg(AllocSetIsValid(set));
-
-	/*
-	 * If requested size exceeds maximum for chunks, allocate an entire block
-	 * for this request.
-	 */
-	if (size > set->allocChunkLimit)
-	{
-		chunk_size = MAXALIGN(size);
-		blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
-		block = (AllocBlock) malloc(blksize);
-		if (block == NULL)
-			return NULL;
-
-		context->mem_allocated += blksize;
-
-		block->aset = set;
-		block->freeptr = block->endptr = ((char *) block) + blksize;
-
-		chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
-		chunk->aset = set;
-		chunk->size = chunk_size;
-#ifdef MEMORY_CONTEXT_CHECKING
-		chunk->requested_size = size;
-		/* set mark to catch clobber of "unused" space */
-		if (size < chunk_size)
-			set_sentinel(AllocChunkGetPointer(chunk), size);
-#endif
-#ifdef RANDOMIZE_ALLOCATED_MEMORY
-		/* fill the allocated space with junk */
-		randomize_mem((char *) AllocChunkGetPointer(chunk), size);
-#endif
-
-		/*
-		 * Stick the new block underneath the active allocation block, if any,
-		 * so that we don't lose the use of the space remaining therein.
-		 */
-		if (set->blocks != NULL)
-		{
-			block->prev = set->blocks;
-			block->next = set->blocks->next;
-			if (block->next)
-				block->next->prev = block;
-			set->blocks->next = block;
-		}
-		else
-		{
-			block->prev = NULL;
-			block->next = NULL;
-			set->blocks = block;
-		}
-
-		/* Ensure any padding bytes are marked NOACCESS. */
-		VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size,
-								   chunk_size - size);
-
-		/* Disallow external access to private part of chunk header. */
-		VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
-
-		return AllocChunkGetPointer(chunk);
-	}
-
-	/*
-	 * Request is small enough to be treated as a chunk.  Look in the
-	 * corresponding free list to see if there is a free chunk we could reuse.
-	 * If one is found, remove it from the free list, make it again a member
-	 * of the alloc set and return its data address.
-	 */
-	fidx = AllocSetFreeIndex(size);
-	chunk = set->freelist[fidx];
-	if (chunk != NULL)
-	{
-		Assert(chunk->size >= size);
-
-		set->freelist[fidx] = (AllocChunk) chunk->aset;
-
-		chunk->aset = (void *) set;
-
-#ifdef MEMORY_CONTEXT_CHECKING
-		chunk->requested_size = size;
-		/* set mark to catch clobber of "unused" space */
-		if (size < chunk->size)
-			set_sentinel(AllocChunkGetPointer(chunk), size);
-#endif
-#ifdef RANDOMIZE_ALLOCATED_MEMORY
-		/* fill the allocated space with junk */
-		randomize_mem((char *) AllocChunkGetPointer(chunk), size);
-#endif
-
-		/* Ensure any padding bytes are marked NOACCESS. */
-		VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size,
-								   chunk->size - size);
-
-		/* Disallow external access to private part of chunk header. */
-		VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
-
-		return AllocChunkGetPointer(chunk);
-	}
-
-	/*
-	 * Choose the actual chunk size to allocate.
-	 */
-	chunk_size = (1 << ALLOC_MINBITS) << fidx;
-	Assert(chunk_size >= size);
-
-	/*
-	 * If there is enough room in the active allocation block, we will put the
-	 * chunk into that block.  Else must start a new one.
-	 */
-	if ((block = set->blocks) != NULL)
-	{
-		Size		availspace = block->endptr - block->freeptr;
-
-		if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ))
-		{
-			/*
-			 * The existing active (top) block does not have enough room for
-			 * the requested allocation, but it might still have a useful
-			 * amount of space in it.  Once we push it down in the block list,
-			 * we'll never try to allocate more space from it. So, before we
-			 * do that, carve up its free space into chunks that we can put on
-			 * the set's freelists.
-			 *
-			 * Because we can only get here when there's less than
-			 * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
-			 * more than ALLOCSET_NUM_FREELISTS-1 times.
-			 */
-			while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
-			{
-				Size		availchunk = availspace - ALLOC_CHUNKHDRSZ;
-				int			a_fidx = AllocSetFreeIndex(availchunk);
-
-				/*
-				 * In most cases, we'll get back the index of the next larger
-				 * freelist than the one we need to put this chunk on.  The
-				 * exception is when availchunk is exactly a power of 2.
-				 */
-				if (availchunk != ((Size) 1 << (a_fidx + ALLOC_MINBITS)))
-				{
-					a_fidx--;
-					Assert(a_fidx >= 0);
-					availchunk = ((Size) 1 << (a_fidx + ALLOC_MINBITS));
-				}
-
-				chunk = (AllocChunk) (block->freeptr);
-
-				/* Prepare to initialize the chunk header. */
-				VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
-
-				block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
-				availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
-
-				chunk->size = availchunk;
-#ifdef MEMORY_CONTEXT_CHECKING
-				chunk->requested_size = 0;	/* mark it free */
-#endif
-				chunk->aset = (void *) set->freelist[a_fidx];
-				set->freelist[a_fidx] = chunk;
-			}
-
-			/* Mark that we need to create a new block */
-			block = NULL;
-		}
-	}
-
-	/*
-	 * Time to create a new regular (multi-chunk) block?
-	 */
-	if (block == NULL)
-	{
-		Size		required_size;
-
-		/*
-		 * The first such block has size initBlockSize, and we double the
-		 * space in each succeeding block, but not more than maxBlockSize.
-		 */
-		blksize = set->nextBlockSize;
-		set->nextBlockSize <<= 1;
-		if (set->nextBlockSize > set->maxBlockSize)
-			set->nextBlockSize = set->maxBlockSize;
-
-		/*
-		 * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
-		 * space... but try to keep it a power of 2.
-		 */
-		required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
-		while (blksize < required_size)
-			blksize <<= 1;
-
-		/* Try to allocate it */
-		block = (AllocBlock) malloc(blksize);
-
-		/*
-		 * We could be asking for pretty big blocks here, so cope if malloc
-		 * fails.  But give up if there's less than 1 MB or so available...
-		 */
-		while (block == NULL && blksize > 1024 * 1024)
-		{
-			blksize >>= 1;
-			if (blksize < required_size)
-				break;
-			block = (AllocBlock) malloc(blksize);
-		}
-
-		if (block == NULL)
-			return NULL;
-
-		context->mem_allocated += blksize;
-
-		block->aset = set;
-		block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
-		block->endptr = ((char *) block) + blksize;
-
-		/* Mark unallocated space NOACCESS. */
-		VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
-								   blksize - ALLOC_BLOCKHDRSZ);
-
-		block->prev = NULL;
-		block->next = set->blocks;
-		if (block->next)
-			block->next->prev = block;
-		set->blocks = block;
-	}
-
-	/*
-	 * OK, do the allocation
-	 */
-	chunk = (AllocChunk) (block->freeptr);
-
-	/* Prepare to initialize the chunk header. */
-	VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
-
-	block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
-	Assert(block->freeptr <= block->endptr);
-
 	chunk->aset = (void *) set;
-	chunk->size = chunk_size;
 #ifdef MEMORY_CONTEXT_CHECKING
 	chunk->requested_size = size;
 	/* set mark to catch clobber of "unused" space */
@@ -985,6 +729,273 @@ AllocSetAlloc(MemoryContext context, Size size)
 	return AllocChunkGetPointer(chunk);
 }
 
+static void * pg_noinline
+AllocSetAllocLarge(AllocSet	set, Size size, int flags)
+{
+	AllocBlock	block;
+	AllocChunk	chunk;
+	Size		chunk_size;
+	Size		blksize;
+
+	/* check size, only allocation path where the limits could be hit */
+	MemoryContextCheckSize(&set->header, size, flags);
+
+	AssertArg(AllocSetIsValid(set));
+
+	chunk_size = MAXALIGN(size);
+	blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+	block = (AllocBlock) malloc(blksize);
+	if (block == NULL)
+		return NULL;
+
+	set->header.mem_allocated += blksize;
+
+	block->aset = set;
+	block->freeptr = block->endptr = ((char *) block) + blksize;
+
+	/*
+	 * Stick the new block underneath the active allocation block, if any,
+	 * so that we don't lose the use of the space remaining therein.
+	 */
+	if (set->blocks != NULL)
+	{
+		block->prev = set->blocks;
+		block->next = set->blocks->next;
+		if (block->next)
+			block->next->prev = block;
+		set->blocks->next = block;
+	}
+	else
+	{
+		block->prev = NULL;
+		block->next = NULL;
+		set->blocks = block;
+	}
+
+	chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+	chunk->size = chunk_size;
+
+	return AllocSetAllocReturnChunk(set, size, chunk, chunk_size);
+}
+
+static void * pg_noinline
+AllocSetAllocFromNewBlock(AllocSet set, Size size, Size chunk_size)
+{
+	AllocBlock	block;
+	Size		blksize;
+	Size		required_size;
+	AllocChunk	chunk;
+
+	/*
+	 * The first such block has size initBlockSize, and we double the
+	 * space in each succeeding block, but not more than maxBlockSize.
+	 */
+	blksize = set->nextBlockSize;
+	set->nextBlockSize <<= 1;
+	if (set->nextBlockSize > set->maxBlockSize)
+		set->nextBlockSize = set->maxBlockSize;
+
+	/*
+	 * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
+	 * space... but try to keep it a power of 2.
+	 */
+	required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+	while (blksize < required_size)
+		blksize <<= 1;
+
+	/* Try to allocate it */
+	block = (AllocBlock) malloc(blksize);
+
+	/*
+	 * We could be asking for pretty big blocks here, so cope if malloc
+	 * fails.  But give up if there's less than 1 MB or so available...
+	 */
+	while (block == NULL && blksize > 1024 * 1024)
+	{
+		blksize >>= 1;
+		if (blksize < required_size)
+			break;
+		block = (AllocBlock) malloc(blksize);
+	}
+
+	if (block == NULL)
+		return NULL;
+
+	set->header.mem_allocated += blksize;
+
+	block->aset = set;
+	block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+	block->endptr = ((char *) block) + blksize;
+
+	/* Mark unallocated space NOACCESS. */
+	VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+							   blksize - ALLOC_BLOCKHDRSZ);
+
+	block->prev = NULL;
+	block->next = set->blocks;
+	if (block->next)
+		block->next->prev = block;
+	set->blocks = block;
+
+	/*
+	 * OK, do the allocation
+	 */
+	chunk = (AllocChunk) (block->freeptr);
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+	block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+	Assert(block->freeptr <= block->endptr);
+
+	chunk->size = chunk_size;
+
+	return AllocSetAllocReturnChunk(set, size, chunk, chunk_size);
+}
+
+static void * pg_noinline
+AllocSetAllocCarveOldAndAlloc(AllocSet set, Size size, Size chunk_size, AllocBlock	block, Size availspace)
+{
+	AllocChunk	chunk;
+
+	/*
+	 * The existing active (top) block does not have enough room for
+	 * the requested allocation, but it might still have a useful
+	 * amount of space in it.  Once we push it down in the block list,
+	 * we'll never try to allocate more space from it. So, before we
+	 * do that, carve up its free space into chunks that we can put on
+	 * the set's freelists.
+	 *
+	 * Because we can only get here when there's less than
+	 * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
+	 * more than ALLOCSET_NUM_FREELISTS-1 times.
+	 */
+	while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
+	{
+		Size		availchunk = availspace - ALLOC_CHUNKHDRSZ;
+		int			a_fidx = AllocSetFreeIndex(availchunk);
+
+		/*
+		 * In most cases, we'll get back the index of the next larger
+		 * freelist than the one we need to put this chunk on.  The
+		 * exception is when availchunk is exactly a power of 2.
+		 */
+		if (availchunk != ((Size) 1 << (a_fidx + ALLOC_MINBITS)))
+		{
+			a_fidx--;
+			Assert(a_fidx >= 0);
+			availchunk = ((Size) 1 << (a_fidx + ALLOC_MINBITS));
+		}
+
+		chunk = (AllocChunk) (block->freeptr);
+
+		/* Prepare to initialize the chunk header. */
+		VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+		block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
+		availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
+
+		chunk->size = availchunk;
+#ifdef MEMORY_CONTEXT_CHECKING
+		chunk->requested_size = 0;	/* mark it free */
+#endif
+		chunk->aset = (void *) set->freelist[a_fidx];
+		set->freelist[a_fidx] = chunk;
+	}
+
+	return AllocSetAllocFromNewBlock(set, size, chunk_size);
+}
+
+/*
+ * AllocSetAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ *
+ * Note: when using valgrind, it doesn't matter how the returned allocation
+ * is marked, as mcxt.c will set it to UNDEFINED.  In some paths we will
+ * return space that is marked NOACCESS - AllocSetRealloc has to beware!
+ */
+static void *
+AllocSetAlloc(MemoryContext context, Size size, int flags)
+{
+	AllocSet	set = (AllocSet) context;
+	AllocBlock	block;
+	AllocChunk	chunk;
+	int			fidx;
+	Size		chunk_size;
+
+	AssertArg(AllocSetIsValid(set));
+
+	/*
+	 * If requested size exceeds maximum for chunks, allocate an entire block
+	 * for this request.
+	 */
+	if (unlikely(size > set->allocChunkLimit))
+		return AllocSetAllocLarge(set, size, flags);
+
+	/*
+	 * Request is small enough to be treated as a chunk.  Look in the
+	 * corresponding free list to see if there is a free chunk we could reuse.
+	 * If one is found, remove it from the free list, make it again a member
+	 * of the alloc set and return its data address.
+	 */
+	fidx = AllocSetFreeIndex(size);
+	chunk = set->freelist[fidx];
+	if (chunk != NULL)
+	{
+		Assert(chunk->size >= size);
+
+		set->freelist[fidx] = (AllocChunk) chunk->aset;
+
+		return AllocSetAllocReturnChunk(set, size, chunk, chunk->size);
+	}
+
+	/*
+	 * Choose the actual chunk size to allocate.
+	 */
+	chunk_size = (1 << ALLOC_MINBITS) << fidx;
+	Assert(chunk_size >= size);
+
+	/*
+	 * If there is enough room in the active allocation block, we will put the
+	 * chunk into that block.  Else must start a new one.
+	 */
+	if ((block = set->blocks) != NULL)
+	{
+		Size		availspace = block->endptr - block->freeptr;
+
+		if (unlikely(availspace < (chunk_size + ALLOC_CHUNKHDRSZ)))
+			return AllocSetAllocCarveOldAndAlloc(set, size, chunk_size,
+												 block, availspace);
+	}
+	else if (unlikely(block == NULL))
+	{
+		/*
+		 * Time to create a new regular (multi-chunk) block.
+		 */
+		return AllocSetAllocFromNewBlock(set, size, chunk_size);
+	}
+
+	/*
+	 * OK, do the allocation
+	 */
+	chunk = (AllocChunk) (block->freeptr);
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+	chunk->size = chunk_size;
+
+	block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+	Assert(block->freeptr <= block->endptr);
+
+	return AllocSetAllocReturnChunk(set, size, chunk, chunk_size);
+}
+
 /*
  * AllocSetFree
  *		Frees allocated memory; memory is removed from the set.
@@ -1072,7 +1083,7 @@ AllocSetFree(MemoryContext context, void *pointer)
  * request size.)
  */
 static void *
-AllocSetRealloc(MemoryContext context, void *pointer, Size size)
+AllocSetRealloc(MemoryContext context, void *pointer, Size size, int flags)
 {
 	AllocSet	set = (AllocSet) context;
 	AllocChunk	chunk = AllocPointerGetChunk(pointer);
@@ -1081,6 +1092,8 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size)
 	/* Allow access to private part of chunk header. */
 	VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN);
 
+	MemoryContextCheckSize(context, size, flags);
+
 	oldsize = chunk->size;
 
 #ifdef MEMORY_CONTEXT_CHECKING
@@ -1260,7 +1273,7 @@ AllocSetRealloc(MemoryContext context, void *pointer, Size size)
 		AllocPointer newPointer;
 
 		/* allocate new chunk */
-		newPointer = AllocSetAlloc((MemoryContext) set, size);
+		newPointer = AllocSetAlloc((MemoryContext) set, size, flags);
 
 		/* leave immediately if request was not completed */
 		if (newPointer == NULL)
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
index 584cd614da0..5dde15654d7 100644
--- a/src/backend/utils/mmgr/generation.c
+++ b/src/backend/utils/mmgr/generation.c
@@ -146,9 +146,9 @@ struct GenerationChunk
 /*
  * These functions implement the MemoryContext API for Generation contexts.
  */
-static void *GenerationAlloc(MemoryContext context, Size size);
+static void *GenerationAlloc(MemoryContext context, Size size, int flags);
 static void GenerationFree(MemoryContext context, void *pointer);
-static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size, int flags);
 static void GenerationReset(MemoryContext context);
 static void GenerationDelete(MemoryContext context);
 static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
@@ -323,7 +323,7 @@ GenerationDelete(MemoryContext context)
  * return space that is marked NOACCESS - GenerationRealloc has to beware!
  */
 static void *
-GenerationAlloc(MemoryContext context, Size size)
+GenerationAlloc(MemoryContext context, Size size, int flags)
 {
 	GenerationContext *set = (GenerationContext *) context;
 	GenerationBlock *block;
@@ -335,9 +335,12 @@ GenerationAlloc(MemoryContext context, Size size)
 	{
 		Size		blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
 
+		/* Check the request size; this is the only allocation path where the size limits can be exceeded. */
+
+
 		block = (GenerationBlock *) malloc(blksize);
 		if (block == NULL)
-			return NULL;
+			return MemoryContextAllocationFailure(context, size, flags);
 
 		context->mem_allocated += blksize;
 
@@ -392,7 +395,7 @@ GenerationAlloc(MemoryContext context, Size size)
 		block = (GenerationBlock *) malloc(blksize);
 
 		if (block == NULL)
-			return NULL;
+			return MemoryContextAllocationFailure(context, size, flags);
 
 		context->mem_allocated += blksize;
 
@@ -520,13 +523,15 @@ GenerationFree(MemoryContext context, void *pointer)
  *		into the old chunk - in that case we just update chunk header.
  */
 static void *
-GenerationRealloc(MemoryContext context, void *pointer, Size size)
+GenerationRealloc(MemoryContext context, void *pointer, Size size, int flags)
 {
 	GenerationContext *set = (GenerationContext *) context;
 	GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
 	GenerationPointer newPointer;
 	Size		oldsize;
 
+	MemoryContextCheckSize(context, size, flags);
+
 	/* Allow access to private part of chunk header. */
 	VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN);
 
@@ -596,14 +601,15 @@ GenerationRealloc(MemoryContext context, void *pointer, Size size)
 	}
 
 	/* allocate new chunk */
-	newPointer = GenerationAlloc((MemoryContext) set, size);
+	newPointer = GenerationAlloc((MemoryContext) set, size, flags);
 
 	/* leave immediately if request was not completed */
 	if (newPointer == NULL)
 	{
 		/* Disallow external access to private part of chunk header. */
 		VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
-		return NULL;
+		/* XXX: GenerationAlloc() has already reported the failure unless MCXT_ALLOC_NO_OOM was set; is this second call redundant? */
+		return MemoryContextAllocationFailure(context, size, flags);
 	}
 
 	/*
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index 6919a732804..13125c4a562 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -867,28 +867,9 @@ MemoryContextAlloc(MemoryContext context, Size size)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-
-		/*
-		 * Here, and elsewhere in this module, we show the target context's
-		 * "name" but not its "ident" (if any) in user-visible error messages.
-		 * The "ident" string might contain security-sensitive data, such as
-		 * values in SQL commands.
-		 */
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, 0);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -910,21 +891,10 @@ MemoryContextAllocZero(MemoryContext context, Size size)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, 0);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -948,21 +918,10 @@ MemoryContextAllocZeroAligned(MemoryContext context, Size size)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, 0);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -983,26 +942,11 @@ MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
-		((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
+	ret = context->methods->alloc(context, size, flags);
 	if (unlikely(ret == NULL))
-	{
-		if ((flags & MCXT_ALLOC_NO_OOM) == 0)
-		{
-			MemoryContextStats(TopMemoryContext);
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory"),
-					 errdetail("Failed on request of size %zu in memory context \"%s\".",
-							   size, context->name)));
-		}
 		return NULL;
-	}
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -1067,22 +1011,10 @@ palloc(Size size)
 
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
-
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, 0);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -1099,21 +1031,10 @@ palloc0(Size size)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, 0);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -1132,26 +1053,11 @@ palloc_extended(Size size, int flags)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
-		((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
 
-	ret = context->methods->alloc(context, size);
+	ret = context->methods->alloc(context, size, 0);
 	if (unlikely(ret == NULL))
-	{
-		if ((flags & MCXT_ALLOC_NO_OOM) == 0)
-		{
-			MemoryContextStats(TopMemoryContext);
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory"),
-					 errdetail("Failed on request of size %zu in memory context \"%s\".",
-							   size, context->name)));
-		}
 		return NULL;
-	}
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -1184,24 +1090,13 @@ repalloc(void *pointer, Size size)
 	MemoryContext context = GetMemoryChunkContext(pointer);
 	void	   *ret;
 
-	if (!AllocSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	AssertNotInCriticalSection(context);
 
 	/* isReset must be false already */
 	Assert(!context->isReset);
 
-	ret = context->methods->realloc(context, pointer, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->realloc(context, pointer, size, 0);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
 
@@ -1222,21 +1117,9 @@ MemoryContextAllocHuge(MemoryContext context, Size size)
 	AssertArg(MemoryContextIsValid(context));
 	AssertNotInCriticalSection(context);
 
-	if (!AllocHugeSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	context->isReset = false;
-
-	ret = context->methods->alloc(context, size);
-	if (unlikely(ret == NULL))
-	{
-		MemoryContextStats(TopMemoryContext);
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed on request of size %zu in memory context \"%s\".",
-						   size, context->name)));
-	}
+	ret = context->methods->alloc(context, size, MCXT_ALLOC_HUGE);
+	Assert(ret != NULL);
 
 	VALGRIND_MEMPOOL_ALLOC(context, ret, size);
 
@@ -1254,16 +1137,23 @@ repalloc_huge(void *pointer, Size size)
 	MemoryContext context = GetMemoryChunkContext(pointer);
 	void	   *ret;
 
-	if (!AllocHugeSizeIsValid(size))
-		elog(ERROR, "invalid memory alloc request size %zu", size);
-
 	AssertNotInCriticalSection(context);
 
 	/* isReset must be false already */
 	Assert(!context->isReset);
 
-	ret = context->methods->realloc(context, pointer, size);
-	if (unlikely(ret == NULL))
+	ret = context->methods->realloc(context, pointer, size, MCXT_ALLOC_HUGE);
+	Assert(ret != NULL);
+
+	VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+
+	return ret;
+}
+
+void *
+MemoryContextAllocationFailure(MemoryContext context, Size size, int flags)
+{
+	if ((flags & MCXT_ALLOC_NO_OOM) == 0)
 	{
 		MemoryContextStats(TopMemoryContext);
 		ereport(ERROR,
@@ -1273,9 +1163,13 @@ repalloc_huge(void *pointer, Size size)
 						   size, context->name)));
 	}
 
-	VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+	return NULL;
+}
 
-	return ret;
+void
+MemoryContextSizeFailure(MemoryContext context, Size size, int flags)
+{
+	elog(ERROR, "invalid memory alloc request size %zu", size);
 }
 
 /*
@@ -1298,6 +1192,13 @@ MemoryContextStrdup(MemoryContext context, const char *string)
 char *
 pstrdup(const char *in)
 {
+	/*
+	 * Here, and elsewhere in this module, we show the target context's
+	 * "name" but not its "ident" (if any) in user-visible error messages.
+	 * The "ident" string might contain security-sensitive data, such as
+	 * values in SQL commands.
+	 */
+
 	return MemoryContextStrdup(CurrentMemoryContext, in);
 }
 
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index 553dd7f6674..e4b8275045f 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -126,9 +126,9 @@ typedef struct SlabChunk
 /*
  * These functions implement the MemoryContext API for Slab contexts.
  */
-static void *SlabAlloc(MemoryContext context, Size size);
+static void *SlabAlloc(MemoryContext context, Size size, int flags);
 static void SlabFree(MemoryContext context, void *pointer);
-static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size, int flags);
 static void SlabReset(MemoryContext context);
 static void SlabDelete(MemoryContext context);
 static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
@@ -337,7 +337,7 @@ SlabDelete(MemoryContext context)
  *		request could not be completed; memory is added to the slab.
  */
 static void *
-SlabAlloc(MemoryContext context, Size size)
+SlabAlloc(MemoryContext context, Size size, int flags)
 {
 	SlabContext *slab = castNode(SlabContext, context);
 	SlabBlock  *block;
@@ -346,6 +346,12 @@ SlabAlloc(MemoryContext context, Size size)
 
 	Assert(slab);
 
+	/*
+	 * XXX: Probably no need to check for huge allocations, we only support
+	 * one size? Which could theoretically be huge, but that'd not make
+	 * sense...
+	 */
+
 	Assert((slab->minFreeChunks >= 0) &&
 		   (slab->minFreeChunks < slab->chunksPerBlock));
 
@@ -583,7 +589,7 @@ SlabFree(MemoryContext context, void *pointer)
  * realloc is usually used to enlarge the chunk.
  */
 static void *
-SlabRealloc(MemoryContext context, void *pointer, Size size)
+SlabRealloc(MemoryContext context, void *pointer, Size size, int flags)
 {
 	SlabContext *slab = castNode(SlabContext, context);
 
-- 
2.32.0.rc2

>From 8331119ab405eb8e29cb7a76ea8bfbf7a8914d68 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 19 Jul 2021 13:48:14 -0700
Subject: [PATCH 2/2] WIP: slab performance.

Discussion: https://postgr.es/m/20210717194333.mr5io3zup3kxa...@alap3.anarazel.de
---
 src/backend/utils/mmgr/slab.c | 454 ++++++++++++++++++++++------------
 1 file changed, 295 insertions(+), 159 deletions(-)

diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
index e4b8275045f..c979553dcdc 100644
--- a/src/backend/utils/mmgr/slab.c
+++ b/src/backend/utils/mmgr/slab.c
@@ -23,15 +23,18 @@
  *	global (context) level. This is possible as the chunk size (and thus also
  *	the number of chunks per block) is fixed.
  *
- *	On each block, free chunks are tracked in a simple linked list. Contents
- *	of free chunks is replaced with an index of the next free chunk, forming
- *	a very simple linked list. Each block also contains a counter of free
- *	chunks. Combined with the local block-level freelist, it makes it trivial
- *	to eventually free the whole block.
+ *	On each block, never-allocated chunks are tracked by a simple offset, and
+ *	free chunks are tracked in a simple linked list. The offset approach
+ *	avoids needing to iterate over all chunks when allocating a new block,
+ *	which would cause page faults and cache pollution. The contents of a free
+ *	chunk are replaced with a pointer to the next free chunk, forming a very
+ *	simple linked list. Each block also contains a counter of free chunks.
+ *	Combined with the local block-level freelist, it makes it trivial to
+ *	eventually free the whole block.
  *
  *	At the context level, we use 'freelist' to track blocks ordered by number
  *	of free chunks, starting with blocks having a single allocated chunk, and
- *	with completely full blocks on the tail.
+ *	with completely full blocks on the tail. XXX
  *
  *	This also allows various optimizations - for example when searching for
  *	free chunk, the allocator reuses space from the fullest blocks first, in
@@ -44,7 +47,7 @@
  *	case this performs as if the pointer was not maintained.
  *
  *	We cache the freelist index for the blocks with the fewest free chunks
- *	(minFreeChunks), so that we don't have to search the freelist on every
+ *	(minFreeChunksIndex), so that we don't have to search the freelist on every
  *	SlabAlloc() call, which is quite expensive.
  *
  *-------------------------------------------------------------------------
@@ -56,6 +59,17 @@
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
+struct SlabBlock;
+struct SlabChunk;
+
+/*
+ * Number of actual freelists + 1, for full blocks. Full blocks are always at
+ * offset 0.
+ */
+#define SLAB_FREELIST_COUNT 9
+
+#define SLAB_RETAIN_EMPTY_BLOCK_COUNT 10
+
 /*
  * SlabContext is a specialized implementation of MemoryContext.
  */
@@ -68,13 +82,16 @@ typedef struct SlabContext
 	Size		blockSize;		/* block size */
 	Size		headerSize;		/* allocated size of context header */
 	int			chunksPerBlock; /* number of chunks per block */
-	int			minFreeChunks;	/* min number of free chunks in any block */
+	int			minFreeChunksIndex;	/* lowest freelist index with free chunks (0 = none) */
 	int			nblocks;		/* number of blocks allocated */
 #ifdef MEMORY_CONTEXT_CHECKING
 	bool	   *freechunks;		/* bitmap of free chunks in a block */
 #endif
+	int			nfreeblocks;
+	dlist_head	freeblocks;
+	int			freelist_shift;
 	/* blocks with free space, grouped by number of free chunks: */
-	dlist_head	freelist[FLEXIBLE_ARRAY_MEMBER];
+	dlist_head	freelist[SLAB_FREELIST_COUNT];
 } SlabContext;
 
 /*
@@ -83,13 +100,15 @@ typedef struct SlabContext
  *
  * node: doubly-linked list of blocks in global freelist
  * nfree: number of free chunks in this block
- * firstFreeChunk: index of the first free chunk
+ * firstFreeChunk: first free chunk
  */
 typedef struct SlabBlock
 {
 	dlist_node	node;			/* doubly-linked list */
-	int			nfree;			/* number of free chunks */
-	int			firstFreeChunk; /* index of the first free chunk in the block */
+	int			nfree;			/* number of chunks on freelist + unused */
+	int			nunused;		/* number of unused chunks */
+	struct SlabChunk  *unused;	/* */
+	struct SlabChunk  *firstFreeChunk; /* first free chunk in the block */
 } SlabBlock;
 
 /*
@@ -123,6 +142,35 @@ typedef struct SlabChunk
 #define SlabChunkIndex(slab, block, chunk)	\
 	(((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)
 
+static inline uint8
+SlabFreelistIndex(SlabContext *slab, int freecount)
+{
+	uint8 index;
+
+	Assert(freecount <= slab->chunksPerBlock);
+
+	/*
+	 * Ensure, without a branch, that index 0 is only used for blocks entirely
+	 * without free chunks.
+	 * XXX: There probably is a cheaper way to do this. Needing to shift twice
+	 * by slab->freelist_shift isn't great.
+	 */
+	index = (freecount + (1 << slab->freelist_shift) - 1) >> slab->freelist_shift;
+
+	if (freecount == 0)
+		Assert(index == 0);
+	else
+		Assert(index > 0 && index < (SLAB_FREELIST_COUNT));
+
+	return index;
+}
+
+static inline dlist_head*
+SlabFreelist(SlabContext *slab, int freecount)
+{
+	return &slab->freelist[SlabFreelistIndex(slab, freecount)];
+}
+
 /*
  * These functions implement the MemoryContext API for Slab contexts.
  */
@@ -179,7 +227,6 @@ SlabContextCreate(MemoryContext parent,
 {
 	int			chunksPerBlock;
 	Size		fullChunkSize;
-	Size		freelistSize;
 	Size		headerSize;
 	SlabContext *slab;
 	int			i;
@@ -192,11 +239,11 @@ SlabContextCreate(MemoryContext parent,
 					 "padding calculation in SlabChunk is wrong");
 
 	/* Make sure the linked list node fits inside a freed chunk */
-	if (chunkSize < sizeof(int))
-		chunkSize = sizeof(int);
+	if (chunkSize < MAXALIGN(sizeof(void *)))
+		chunkSize = MAXALIGN(sizeof(void *));
 
 	/* chunk, including SLAB header (both addresses nicely aligned) */
-	fullChunkSize = sizeof(SlabChunk) + MAXALIGN(chunkSize);
+	fullChunkSize = sizeof(SlabChunk) + chunkSize;
 
 	/* Make sure the block can store at least one chunk. */
 	if (blockSize < fullChunkSize + sizeof(SlabBlock))
@@ -206,16 +253,14 @@ SlabContextCreate(MemoryContext parent,
 	/* Compute maximum number of chunks per block */
 	chunksPerBlock = (blockSize - sizeof(SlabBlock)) / fullChunkSize;
 
-	/* The freelist starts with 0, ends with chunksPerBlock. */
-	freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1);
-
 	/*
 	 * Allocate the context header.  Unlike aset.c, we never try to combine
 	 * this with the first regular block; not worth the extra complication.
+	 * XXX: What's the evidence for that?
 	 */
 
 	/* Size of the memory context header */
-	headerSize = offsetof(SlabContext, freelist) + freelistSize;
+	headerSize = sizeof(SlabContext);
 
 #ifdef MEMORY_CONTEXT_CHECKING
 
@@ -249,17 +294,34 @@ SlabContextCreate(MemoryContext parent,
 	slab->blockSize = blockSize;
 	slab->headerSize = headerSize;
 	slab->chunksPerBlock = chunksPerBlock;
-	slab->minFreeChunks = 0;
+	slab->minFreeChunksIndex = 0;
 	slab->nblocks = 0;
+	slab->nfreeblocks = 0;
+
+	/*
+	 * Compute a shift that guarantees that shifting chunksPerBlock by it
+	 * yields a value smaller than SLAB_FREELIST_COUNT - 1 (one freelist is used for full blocks).
+	 */
+	slab->freelist_shift = 0;
+	while ((slab->chunksPerBlock >> slab->freelist_shift) >= (SLAB_FREELIST_COUNT - 1))
+		slab->freelist_shift++;
+
+#if 0
+	elog(LOG, "freelist shift for %d chunks of size %zu is %d, block size %zu",
+		 slab->chunksPerBlock, slab->fullChunkSize, slab->freelist_shift,
+		 slab->blockSize);
+#endif
+
+	dlist_init(&slab->freeblocks);
 
 	/* initialize the freelist slots */
-	for (i = 0; i < (slab->chunksPerBlock + 1); i++)
+	for (i = 0; i < SLAB_FREELIST_COUNT; i++)
 		dlist_init(&slab->freelist[i]);
 
 #ifdef MEMORY_CONTEXT_CHECKING
 	/* set the freechunks pointer right after the freelists array */
 	slab->freechunks
-		= (bool *) slab + offsetof(SlabContext, freelist) + freelistSize;
+		= (bool *) slab + sizeof(SlabContext);
 #endif
 
 	/* Finally, do the type-independent part of context creation */
@@ -292,8 +354,27 @@ SlabReset(MemoryContext context)
 	SlabCheck(context);
 #endif
 
+	/* release retained empty blocks */
+	{
+		dlist_mutable_iter miter;
+
+		dlist_foreach_modify(miter, &slab->freeblocks)
+		{
+			SlabBlock  *block = dlist_container(SlabBlock, node, miter.cur);
+
+			dlist_delete(miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, slab->blockSize);
+#endif
+			free(block);
+			slab->nblocks--;
+			context->mem_allocated -= slab->blockSize;
+		}
+	}
+
 	/* walk over freelists and free the blocks */
-	for (i = 0; i <= slab->chunksPerBlock; i++)
+	for (i = 0; i < SLAB_FREELIST_COUNT; i++)
 	{
 		dlist_mutable_iter miter;
 
@@ -312,7 +393,7 @@ SlabReset(MemoryContext context)
 		}
 	}
 
-	slab->minFreeChunks = 0;
+	slab->minFreeChunksIndex = 0;
 
 	Assert(slab->nblocks == 0);
 	Assert(context->mem_allocated == 0);
@@ -342,7 +423,6 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	SlabContext *slab = castNode(SlabContext, context);
 	SlabBlock  *block;
 	SlabChunk  *chunk;
-	int			idx;
 
 	Assert(slab);
 
@@ -352,8 +432,8 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	 * sense...
 	 */
 
-	Assert((slab->minFreeChunks >= 0) &&
-		   (slab->minFreeChunks < slab->chunksPerBlock));
+	Assert((slab->minFreeChunksIndex >= 0) &&
+		   (slab->minFreeChunksIndex < SLAB_FREELIST_COUNT));
 
 	/* make sure we only allow correct request size */
 	if (size != slab->chunkSize)
@@ -367,25 +447,33 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	 * slab->minFreeChunks == 0 means there are no blocks with free chunks,
 	 * thanks to how minFreeChunks is updated at the end of SlabAlloc().
 	 */
-	if (slab->minFreeChunks == 0)
+	if (unlikely(slab->minFreeChunksIndex == 0))
 	{
-		block = (SlabBlock *) malloc(slab->blockSize);
+		if (slab->nfreeblocks > 0)
+		{
+			dlist_node *node;
 
-		if (block == NULL)
-			return NULL;
+			node = dlist_pop_head_node(&slab->freeblocks);
+			block = dlist_container(SlabBlock, node, node);
+			slab->nfreeblocks--;
+		}
+		else
+		{
+			block = (SlabBlock *) malloc(slab->blockSize);
+
+			if (unlikely(block == NULL))
+				return NULL;
+
+			slab->nblocks += 1;
+			context->mem_allocated += slab->blockSize;
+		}
 
 		block->nfree = slab->chunksPerBlock;
-		block->firstFreeChunk = 0;
+		block->firstFreeChunk = NULL;
+		block->nunused = slab->chunksPerBlock;
+		block->unused = (SlabChunk *) SlabBlockStart(block);
 
-		/*
-		 * Put all the chunks on a freelist. Walk the chunks and point each
-		 * one to the next one.
-		 */
-		for (idx = 0; idx < slab->chunksPerBlock; idx++)
-		{
-			chunk = SlabBlockGetChunk(slab, block, idx);
-			*(int32 *) SlabChunkGetPointer(chunk) = (idx + 1);
-		}
+		slab->minFreeChunksIndex = SlabFreelistIndex(slab, slab->chunksPerBlock);
 
 		/*
 		 * And add it to the last freelist with all chunks empty.
@@ -393,32 +481,54 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 		 * We know there are no blocks in the freelist, otherwise we wouldn't
 		 * need a new block.
 		 */
-		Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock]));
+		Assert(dlist_is_empty(SlabFreelist(slab, slab->chunksPerBlock)));
 
-		dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node);
+		dlist_push_head(SlabFreelist(slab, slab->chunksPerBlock),
+						&block->node);
 
-		slab->minFreeChunks = slab->chunksPerBlock;
-		slab->nblocks += 1;
-		context->mem_allocated += slab->blockSize;
+		chunk = block->unused;
+		block->unused = (SlabChunk *)(((char *) block->unused) + slab->fullChunkSize);
+		block->nunused--;
+	}
+	else
+	{
+		/* grab the block from the freelist */
+		block = dlist_head_element(SlabBlock, node,
+								   &slab->freelist[slab->minFreeChunksIndex]);
+
+		/* make sure we actually got a valid block, with matching nfree */
+		Assert(block != NULL);
+		Assert(slab->minFreeChunksIndex == SlabFreelistIndex(slab, block->nfree));
+		Assert(block->nfree > 0);
+
+		/* we know index of the first free chunk in the block */
+		if (block->nunused > 0)
+		{
+			chunk = block->unused;
+			block->unused = (SlabChunk *)(((char *) block->unused) + slab->fullChunkSize);
+			block->nunused--;
+		}
+		else
+		{
+			chunk = block->firstFreeChunk;
+
+			/*
+			 * Remove the chunk from the freelist head. A pointer to the next
+			 * free chunk is stored in the chunk itself.
+			 */
+			VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(void*));
+			block->firstFreeChunk = *(SlabChunk **) SlabChunkGetPointer(chunk);
+		}
 	}
 
-	/* grab the block from the freelist (even the new block is there) */
-	block = dlist_head_element(SlabBlock, node,
-							   &slab->freelist[slab->minFreeChunks]);
+	/* make sure the chunk is in the block and that it's marked as empty (XXX?) */
+	Assert((char *) chunk >= SlabBlockStart(block));
+	Assert((char *) chunk < (((char *)block) + slab->blockSize));
 
-	/* make sure we actually got a valid block, with matching nfree */
-	Assert(block != NULL);
-	Assert(slab->minFreeChunks == block->nfree);
-	Assert(block->nfree > 0);
-
-	/* we know index of the first free chunk in the block */
-	idx = block->firstFreeChunk;
-
-	/* make sure the chunk index is valid, and that it's marked as empty */
-	Assert((idx >= 0) && (idx < slab->chunksPerBlock));
-
-	/* compute the chunk location block start (after the block header) */
-	chunk = SlabBlockGetChunk(slab, block, idx);
+	Assert(block->firstFreeChunk == NULL || (
+			   (block->firstFreeChunk >= (SlabChunk *) SlabBlockStart(block)) &&
+			   block->firstFreeChunk <= (SlabChunk *) (((char *)block) + slab->blockSize))
+		);
 
 	/*
 	 * Update the block nfree count, and also the minFreeChunks as we've
@@ -426,48 +536,6 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	 * (because that's how we chose the block).
 	 */
 	block->nfree--;
-	slab->minFreeChunks = block->nfree;
-
-	/*
-	 * Remove the chunk from the freelist head. The index of the next free
-	 * chunk is stored in the chunk itself.
-	 */
-	VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
-	block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk);
-
-	Assert(block->firstFreeChunk >= 0);
-	Assert(block->firstFreeChunk <= slab->chunksPerBlock);
-
-	Assert((block->nfree != 0 &&
-			block->firstFreeChunk < slab->chunksPerBlock) ||
-		   (block->nfree == 0 &&
-			block->firstFreeChunk == slab->chunksPerBlock));
-
-	/* move the whole block to the right place in the freelist */
-	dlist_delete(&block->node);
-	dlist_push_head(&slab->freelist[block->nfree], &block->node);
-
-	/*
-	 * And finally update minFreeChunks, i.e. the index to the block with the
-	 * lowest number of free chunks. We only need to do that when the block
-	 * got full (otherwise we know the current block is the right one). We'll
-	 * simply walk the freelist until we find a non-empty entry.
-	 */
-	if (slab->minFreeChunks == 0)
-	{
-		for (idx = 1; idx <= slab->chunksPerBlock; idx++)
-		{
-			if (dlist_is_empty(&slab->freelist[idx]))
-				continue;
-
-			/* found a non-empty freelist */
-			slab->minFreeChunks = idx;
-			break;
-		}
-	}
-
-	if (slab->minFreeChunks == slab->chunksPerBlock)
-		slab->minFreeChunks = 0;
 
 	/* Prepare to initialize the chunk header. */
 	VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(SlabChunk));
@@ -475,6 +543,45 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	chunk->block = block;
 	chunk->slab = slab;
 
+	Assert(slab->minFreeChunksIndex == SlabFreelistIndex(slab, block->nfree + 1));
+
+	/* move the whole block to the right place in the freelist */
+	if (unlikely(SlabFreelistIndex(slab, block->nfree) != slab->minFreeChunksIndex))
+	{
+		slab->minFreeChunksIndex = SlabFreelistIndex(slab, block->nfree);
+		dlist_delete(&block->node);
+		dlist_push_head(SlabFreelist(slab, block->nfree), &block->node);
+
+
+		/*
+		 * And finally update minFreeChunksIndex, i.e. the index of the freelist
+		 * with the fewest free chunks. We only need to do that when the block
+		 * got full (otherwise we know the current block is the right one). We'll
+		 * simply walk the freelist until we find a non-empty entry.
+		 */
+		if (slab->minFreeChunksIndex == 0)
+		{
+			for (int idx = 1; idx < SLAB_FREELIST_COUNT; idx++)
+			{
+				if (dlist_is_empty(&slab->freelist[idx]))
+					continue;
+
+				/* found a non-empty freelist */
+				slab->minFreeChunksIndex = idx;
+				break;
+			}
+		}
+	}
+
+#if 0
+	/*
+	 * FIXME: I don't understand what this ever did? It should be unreachable
+	 * I think?
+	 */
+	if (slab->minFreeChunks == slab->chunksPerBlock)
+		slab->minFreeChunks = 0;
+#endif
+
 #ifdef MEMORY_CONTEXT_CHECKING
 	/* slab mark to catch clobber of "unused" space */
 	if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
@@ -496,6 +603,61 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 	return SlabChunkGetPointer(chunk);
 }
 
+static void pg_noinline
+SlabFreeSlow(SlabContext *slab, SlabBlock  *block)
+{
+	dlist_delete(&block->node);
+
+	/*
+	 * See if we need to update minFreeChunksIndex for the slab - we only
+	 * need to do that if the block had that number of free chunks
+	 * before we freed one. In that case, we check if there still are blocks
+	 * in the original freelist and we either keep the current value (if there
+	 * still are blocks) or increment it by one (the new block is still the
+	 * one with minimum free chunks).
+	 *
+	 * The one exception is when the block will get completely free - in that
+	 * case we will free it, so we can't use it for minFreeChunksIndex. It however
+	 * means there are no more blocks with free chunks.
+	 */
+	if (slab->minFreeChunksIndex == SlabFreelistIndex(slab, block->nfree - 1))
+	{
+		/* Have we removed the last chunk from the freelist? */
+		if (dlist_is_empty(&slab->freelist[slab->minFreeChunksIndex]))
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == slab->chunksPerBlock)
+				slab->minFreeChunksIndex = 0;
+			else
+				slab->minFreeChunksIndex++;
+		}
+	}
+
+	/* If the block is now completely empty, free it (in a way). */
+	if (block->nfree == slab->chunksPerBlock)
+	{
+		/*
+		 * To avoid constantly freeing/allocating blocks in bursty patterns
+	 * (most crucially in cases of repeatedly allocating and freeing a
+		 * single chunk), retain a small number of blocks.
+		 */
+		if (slab->nfreeblocks < SLAB_RETAIN_EMPTY_BLOCK_COUNT)
+		{
+			dlist_push_head(&slab->freeblocks, &block->node);
+			slab->nfreeblocks++;
+		}
+		else
+		{
+			slab->nblocks--;
+			slab->header.mem_allocated -= slab->blockSize;
+			free(block);
+		}
+	}
+	else
+		dlist_push_head(SlabFreelist(slab, block->nfree), &block->node);
+
+}
+
 /*
  * SlabFree
  *		Frees allocated memory; memory is removed from the slab.
@@ -503,7 +665,6 @@ SlabAlloc(MemoryContext context, Size size, int flags)
 static void
 SlabFree(MemoryContext context, void *pointer)
 {
-	int			idx;
 	SlabContext *slab = castNode(SlabContext, context);
 	SlabChunk  *chunk = SlabPointerGetChunk(pointer);
 	SlabBlock  *block = chunk->block;
@@ -516,61 +677,26 @@ SlabFree(MemoryContext context, void *pointer)
 				 slab->header.name, chunk);
 #endif
 
-	/* compute index of the chunk with respect to block start */
-	idx = SlabChunkIndex(slab, block, chunk);
-
 	/* add chunk to freelist, and update block nfree count */
-	*(int32 *) pointer = block->firstFreeChunk;
-	block->firstFreeChunk = idx;
+	*(SlabChunk **) pointer = block->firstFreeChunk;
+	block->firstFreeChunk = chunk;
 	block->nfree++;
 
 	Assert(block->nfree > 0);
 	Assert(block->nfree <= slab->chunksPerBlock);
 
 #ifdef CLOBBER_FREED_MEMORY
-	/* XXX don't wipe the int32 index, used for block-level freelist */
-	wipe_mem((char *) pointer + sizeof(int32),
-			 slab->chunkSize - sizeof(int32));
+	/* XXX don't wipe the SlabChunk* pointer, used for block-level freelist */
+	wipe_mem((char *) pointer + sizeof(SlabChunk*),
+			 slab->chunkSize - sizeof(SlabChunk*));
 #endif
 
 	/* remove the block from a freelist */
-	dlist_delete(&block->node);
-
-	/*
-	 * See if we need to update the minFreeChunks field for the slab - we only
-	 * need to do that if there the block had that number of free chunks
-	 * before we freed one. In that case, we check if there still are blocks
-	 * in the original freelist and we either keep the current value (if there
-	 * still are blocks) or increment it by one (the new block is still the
-	 * one with minimum free chunks).
-	 *
-	 * The one exception is when the block will get completely free - in that
-	 * case we will free it, se we can't use it for minFreeChunks. It however
-	 * means there are no more blocks with free chunks.
-	 */
-	if (slab->minFreeChunks == (block->nfree - 1))
+	if (SlabFreelistIndex(slab, block->nfree) != SlabFreelistIndex(slab, block->nfree - 1))
 	{
-		/* Have we removed the last chunk from the freelist? */
-		if (dlist_is_empty(&slab->freelist[slab->minFreeChunks]))
-		{
-			/* but if we made the block entirely free, we'll free it */
-			if (block->nfree == slab->chunksPerBlock)
-				slab->minFreeChunks = 0;
-			else
-				slab->minFreeChunks++;
-		}
+		SlabFreeSlow(slab, block);
 	}
 
-	/* If the block is now completely empty, free it. */
-	if (block->nfree == slab->chunksPerBlock)
-	{
-		free(block);
-		slab->nblocks--;
-		context->mem_allocated -= slab->blockSize;
-	}
-	else
-		dlist_push_head(&slab->freelist[block->nfree], &block->node);
-
 	Assert(slab->nblocks >= 0);
 	Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
 }
@@ -657,7 +783,7 @@ SlabStats(MemoryContext context,
 	/* Include context header in totalspace */
 	totalspace = slab->headerSize;
 
-	for (i = 0; i <= slab->chunksPerBlock; i++)
+	for (i = 0; i < SLAB_FREELIST_COUNT; i++)
 	{
 		dlist_iter	iter;
 
@@ -714,7 +840,7 @@ SlabCheck(MemoryContext context)
 	Assert(slab->chunksPerBlock > 0);
 
 	/* walk all the freelists */
-	for (i = 0; i <= slab->chunksPerBlock; i++)
+	for (i = 0; i < SLAB_FREELIST_COUNT; i++)
 	{
 		int			j,
 					nfree;
@@ -723,20 +849,19 @@ SlabCheck(MemoryContext context)
 		/* walk all blocks on this freelist */
 		dlist_foreach(iter, &slab->freelist[i])
 		{
-			int			idx;
 			SlabBlock  *block = dlist_container(SlabBlock, node, iter.cur);
+			SlabChunk  *cur_chunk;
 
 			/*
 			 * Make sure the number of free chunks (in the block header)
 			 * matches position in the freelist.
 			 */
-			if (block->nfree != i)
+			if (SlabFreelistIndex(slab, block->nfree) != i)
 				elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
 					 name, block->nfree, block, i);
 
 			/* reset the bitmap of free chunks for this block */
 			memset(slab->freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
-			idx = block->firstFreeChunk;
 
 			/*
 			 * Now walk through the chunks, count the free ones and also
@@ -744,20 +869,31 @@ SlabCheck(MemoryContext context)
 			 * freelist is stored within the chunks themselves, we have to
 			 * walk through the chunks and construct our own bitmap.
 			 */
-
+			cur_chunk = block->firstFreeChunk;
 			nfree = 0;
-			while (idx < slab->chunksPerBlock)
+			while (cur_chunk != NULL)
 			{
-				SlabChunk  *chunk;
+				int idx = SlabChunkIndex(slab, block, cur_chunk);
 
 				/* count the chunk as free, add it to the bitmap */
 				nfree++;
 				slab->freechunks[idx] = true;
 
 				/* read index of the next free chunk */
-				chunk = SlabBlockGetChunk(slab, block, idx);
-				VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
-				idx = *(int32 *) SlabChunkGetPointer(chunk);
+				VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(cur_chunk), sizeof(SlabChunk **));
+				cur_chunk = *(SlabChunk **) SlabChunkGetPointer(cur_chunk);
+			}
+
+			cur_chunk = block->unused;
+			for (j = 0; j < block->nunused; j++)
+			{
+				int idx = SlabChunkIndex(slab, block, cur_chunk);
+
+				/* count the chunk as free, add it to the bitmap */
+				nfree++;
+				slab->freechunks[idx] = true;
+
+				cur_chunk = (SlabChunk *)(((char *) cur_chunk) + slab->fullChunkSize);
 			}
 
 			for (j = 0; j < slab->chunksPerBlock; j++)
-- 
2.32.0.rc2

Reply via email to