From d97ea94fc77b50d3b95159497331ef7dc5a93fae Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Tue, 19 Jul 2016 00:15:54 +0200
Subject: [PATCH 1/2] simple slab allocator (fixed-size allocations)

improvements based on review by Petr Jelinek

- Better comment explaining how free chunks are tracked, etc.
- Removed unused SlabPointerIsValid macro.
- Added comment to SlabBlockData, explaining the fields.
- Modified SlabChunkData comment, explaining relation to StandardChunkHeader.
- Modified SlabContextCreate() to compute chunksPerBlock a bit more efficiently.
- Implemented SlabCheck().
- Replaced two Assert() calls with elog().
---
 src/backend/replication/logical/reorderbuffer.c | 163 ++--
 src/backend/utils/mmgr/Makefile                 |   2 +-
 src/backend/utils/mmgr/slab.c                   | 940 ++++++++++++++++++++++++
 src/include/nodes/memnodes.h                    |   2 +-
 src/include/nodes/nodes.h                       |   1 +
 src/include/replication/reorderbuffer.h         |  13 +
 src/include/utils/memutils.h                    |  11 +
 7 files changed, 1039 insertions(+), 93 deletions(-)
 create mode 100644 src/backend/utils/mmgr/slab.c

diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 9b430b9..00e2b7b 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -135,6 +135,9 @@ typedef struct ReorderBufferDiskChange
 	/* data follows */
 } ReorderBufferDiskChange;
 
+/* 10k tuples seems like a reasonable value (~80MB with MaxHeapTupleSize) */
+#define		TUPLES_PER_GENERATION		10000
+
 /*
  * Maximum number of changes kept in memory, per transaction. After that,
  * changes are spooled to disk.
@@ -156,9 +159,6 @@ static const Size max_changes_in_memory = 4096;
  * major bottleneck, especially when spilling to disk while decoding batch
  * workloads.
  */
-static const Size max_cached_changes = 4096 * 2;
-static const Size max_cached_tuplebufs = 4096 * 2;		/* ~8MB */
-static const Size max_cached_transactions = 512;
 
 
 /* ---------------------------------------
@@ -241,6 +241,28 @@ ReorderBufferAllocate(void)
 
 	buffer->context = new_ctx;
 
+	buffer->change_context = SlabContextCreate(new_ctx,
+									"Change",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferChange));
+
+	buffer->txn_context = SlabContextCreate(new_ctx,
+									"TXN",
+									SLAB_DEFAULT_BLOCK_SIZE,
+									sizeof(ReorderBufferTXN));
+
+	buffer->tup_context_slab = SlabContextCreate(new_ctx,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + MaxHeapTupleSize);
+
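+	/*
+	 * Tuples that do not fit into the current slab chunk size are allocated
+	 * from this ordinary aset context instead (see ReorderBufferGetTupleBuf).
+	 */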
+	buffer->tup_context_oversized = AllocSetContextCreate(new_ctx,
+									"TuplesOversized",
+									ALLOCSET_DEFAULT_MINSIZE,
+									ALLOCSET_DEFAULT_INITSIZE,
+									ALLOCSET_DEFAULT_MAXSIZE);
+
 	hash_ctl.keysize = sizeof(TransactionId);
 	hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
 	hash_ctl.hcxt = buffer->context;
@@ -260,11 +282,17 @@ ReorderBufferAllocate(void)
 
 	buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
+	buffer->tuples_count = 0;
+	buffer->tuples_size = 0;
+
 	dlist_init(&buffer->toplevel_by_lsn);
 	dlist_init(&buffer->cached_transactions);
 	dlist_init(&buffer->cached_changes);
 	slist_init(&buffer->cached_tuplebufs);
 
+	/* initial tuple chunk size, matching the slab context created above */
+	buffer->current_size = MaxHeapTupleSize;
+
 	return buffer;
 }
 
@@ -291,19 +319,8 @@ ReorderBufferGetTXN(ReorderBuffer *rb)
 {
 	ReorderBufferTXN *txn;
 
-	/* check the slab cache */
-	if (rb->nr_cached_transactions > 0)
-	{
-		rb->nr_cached_transactions--;
-		txn = (ReorderBufferTXN *)
-			dlist_container(ReorderBufferTXN, node,
-							dlist_pop_head_node(&rb->cached_transactions));
-	}
-	else
-	{
-		txn = (ReorderBufferTXN *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferTXN));
-	}
+	txn = (ReorderBufferTXN *)
+			MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
 
 	memset(txn, 0, sizeof(ReorderBufferTXN));
 
@@ -344,18 +361,7 @@ ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
 		txn->invalidations = NULL;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_transactions < max_cached_transactions)
-	{
-		rb->nr_cached_transactions++;
-		dlist_push_head(&rb->cached_transactions, &txn->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(txn, sizeof(ReorderBufferTXN));
-		VALGRIND_MAKE_MEM_DEFINED(&txn->node, sizeof(txn->node));
-	}
-	else
-	{
-		pfree(txn);
-	}
+	pfree(txn);
 }
 
 /*
@@ -366,19 +372,8 @@ ReorderBufferGetChange(ReorderBuffer *rb)
 {
 	ReorderBufferChange *change;
 
-	/* check the slab cache */
-	if (rb->nr_cached_changes)
-	{
-		rb->nr_cached_changes--;
-		change = (ReorderBufferChange *)
-			dlist_container(ReorderBufferChange, node,
-							dlist_pop_head_node(&rb->cached_changes));
-	}
-	else
-	{
-		change = (ReorderBufferChange *)
-			MemoryContextAlloc(rb->context, sizeof(ReorderBufferChange));
-	}
+	change = (ReorderBufferChange *)
+			MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
 
 	memset(change, 0, sizeof(ReorderBufferChange));
 	return change;
@@ -434,21 +429,9 @@ ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change)
 			break;
 	}
 
-	/* check whether to put into the slab cache */
-	if (rb->nr_cached_changes < max_cached_changes)
-	{
-		rb->nr_cached_changes++;
-		dlist_push_head(&rb->cached_changes, &change->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(change, sizeof(ReorderBufferChange));
-		VALGRIND_MAKE_MEM_DEFINED(&change->node, sizeof(change->node));
-	}
-	else
-	{
-		pfree(change);
-	}
+	pfree(change);
 }
 
-
 /*
  * Get an unused, possibly preallocated, ReorderBufferTupleBuf fitting at
  * least a tuple of size tuple_len (excluding header overhead).
@@ -461,37 +444,49 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
 	alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-	/*
-	 * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-	 * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-	 * generated for oldtuples can be bigger, as they don't have out-of-line
-	 * toast columns.
-	 */
-	if (alloc_len < MaxHeapTupleSize)
-		alloc_len = MaxHeapTupleSize;
+	/* see if we need to allocate a new context generation */
+	if (rb->tuples_count == TUPLES_PER_GENERATION)
+	{
+		Size	new_size;
+		Size	avg_length = (rb->tuples_size / rb->tuples_count);
+
+		/* mark the current SLAB context for automatic destruction */
+		SlabAutodestruct(rb->tup_context_slab);
+
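+		/*
+		 * The old context is deleted automatically (by SlabFree) as soon as
+		 * the last tuple allocated from it is freed, so tuples from the
+		 * previous generation remain valid until then.
+		 */
+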
+		/* assume +50% is enough slack to fit most tuples into the slab context */
+		new_size = MAXALIGN(avg_length * 1.5);
+
+		rb->current_size = new_size;
+		rb->tup_context_slab = SlabContextCreate(rb->context,
+									"TuplesSlab",
+									SLAB_LARGE_BLOCK_SIZE,
+									sizeof(ReorderBufferTupleBuf) +
+									MAXIMUM_ALIGNOF + rb->current_size);
+
+		/*
+		 * We could also recreate the aset context, with block sizes set so
+		 * that palloc() always does malloc(), but it's not clear that would
+		 * be a win.
+		 */
+
+		rb->tuples_count = 0;
+		rb->tuples_size = 0;
+	}
 
+	rb->tuples_count += 1;
+	rb->tuples_size  += alloc_len;
 
-	/* if small enough, check the slab cache */
+	/* if small enough, allocate the tuple from the current slab context */
-	if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
+	if (alloc_len <= rb->current_size)
 	{
-		rb->nr_cached_tuplebufs--;
-		tuple = slist_container(ReorderBufferTupleBuf, node,
-								slist_pop_head_node(&rb->cached_tuplebufs));
-		Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-		memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-		VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
+		tuple = (ReorderBufferTupleBuf *)
+			MemoryContextAlloc(rb->tup_context_slab,
+							   sizeof(ReorderBufferTupleBuf) +
+							   MAXIMUM_ALIGNOF + rb->current_size);
+		tuple->alloc_tuple_size = rb->current_size;
 		tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-		memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
 	}
 	else
 	{
 		tuple = (ReorderBufferTupleBuf *)
-			MemoryContextAlloc(rb->context,
+			MemoryContextAlloc(rb->tup_context_oversized,
 							   sizeof(ReorderBufferTupleBuf) +
 							   MAXIMUM_ALIGNOF + alloc_len);
 		tuple->alloc_tuple_size = alloc_len;
@@ -510,21 +505,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-	/* check whether to put into the slab cache, oversized tuples never are */
-	if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-		rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-	{
-		rb->nr_cached_tuplebufs++;
-		slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-		VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-		VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-	}
-	else
-	{
-		pfree(tuple);
-	}
+	pfree(tuple);
 }
 
 /*
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
index b2403e1..321289f 100644
--- a/src/backend/utils/mmgr/Makefile
+++ b/src/backend/utils/mmgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o
+OBJS = aset.o mcxt.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..bde0ea1
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,940 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ *	  SLAB allocator definitions.
+ *
+ * SLAB is a custom MemoryContext implementation designed for cases where
+ * all allocations have the same (fixed) size.
+ *
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mmgr/slab.c
+ *
+ *
+ *	The constant allocation size allows significant simplification and various
+ *	optimizations that are not possible in AllocSet. Firstly, we can get rid
+ *	of the doubling of chunk sizes and carve the blocks into chunks of exactly
+ *	the right size (plus alignment), not wasting any memory.
+ *
+ *	The information about free chunks is maintained both at the block level and
+ *	global (context) level. This is possible as the chunk size (and thus also
+ *	the number of chunks per block) is fixed.
+ *
+ *	Each block includes a simple bitmap tracking which chunks are used/free.
+ *	This makes it trivial to check if all chunks on the block are free, and
+ *	eventually free the whole block (which is almost impossible with AllocSet,
+ *	as it stores free chunks from all blocks in a single global freelist).
+ *
+ *	At the context level, we use a 'freelist' array to track blocks, grouped
+ *	by the number of free chunks. For example, freelist[0] is a list of
+ *	completely full blocks, freelist[1] a list of blocks with a single free
+ *	chunk, etc.
+ *
+ *	This also allows various optimizations - for example, when searching for
+ *	a free chunk, the allocator reuses space from the fullest blocks first, in
+ *	the hope that some of the less full blocks will become completely empty
+ *	(and can be returned to the OS).
+ *
+ *	For each block, we maintain the index of the first free chunk - this is
+ *	quite cheap and allows us to skip all the preceding used chunks, eliminating
+ *	a significant number of lookups in many common usage patterns. In the worst
+ *	case this performs as if the index was not maintained.
+ *
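+ *	A minimal usage sketch (illustrative only - "MyStruct" is merely a
+ *	placeholder, not something defined by this patch):
+ *
+ *		MemoryContext slab;
+ *		MyStruct   *ptr;
+ *
+ *		slab = SlabContextCreate(CurrentMemoryContext, "MyStructs",
+ *								 SLAB_DEFAULT_BLOCK_SIZE, sizeof(MyStruct));
+ *
+ *		ptr = (MyStruct *) MemoryContextAlloc(slab, sizeof(MyStruct));
+ *		...
+ *		pfree(ptr);
+ *
+ *	or simply MemoryContextDelete(slab) to release everything at once.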
+ *
+ *	About CLOBBER_FREED_MEMORY:
+ *
+ *	If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ *	This is useful for catching places that reference already-freed memory.
+ *
+ *	About MEMORY_CONTEXT_CHECKING:
+ *
+ *	Since the chunk size is rounded up to MAXALIGN, there is often some
+ *	unused space immediately after the requested data area.
+ *	Thus, if someone makes the common error of writing past what they've
+ *	requested, the problem is likely to go unnoticed ... until the day when
+ *	there *isn't* any wasted space, perhaps because of different memory
+ *	alignment on a new platform, or some other effect.  To catch this sort
+ *	of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ *	the requested space whenever the request is less than the actual chunk
+ *	size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ *	About USE_VALGRIND and Valgrind client requests:
+ *
+ *	Valgrind provides "client request" macros that exchange information with
+ *	the host Valgrind (if any).  Under !USE_VALGRIND, memdebug.h stubs out
+ *	currently-used macros.
+ *
+ *	When running under Valgrind, we want a NOACCESS memory region both before
+ *	and after the allocation.  The chunk header is tempting as the preceding
+ *	region, but mcxt.c expects to able to examine the standard chunk header
+ *	fields.  Therefore, we use, when available, the requested_size field and
+ *	any subsequent padding.  requested_size is made NOACCESS before returning
+ *	a chunk pointer to a caller.  However, to reduce client request traffic,
+ *	it is kept DEFINED in chunks on the free list.
+ *
+ *	The rounded-up capacity of the chunk usually acts as a post-allocation
+ *	NOACCESS region.  If the request consumes precisely the entire chunk,
+ *	there is no such region; another chunk header may immediately follow.  In
+ *	that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ *	See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+#define SLAB_BLOCKHDRSZ	MAXALIGN(sizeof(SlabBlockData))
+#define SLAB_CHUNKHDRSZ	MAXALIGN(sizeof(SlabChunkData))
+
+/* Portion of SLAB_CHUNKHDRSZ examined outside slab.c. */
+#define SLAB_CHUNK_PUBLIC	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+
+/* Portion of SLAB_CHUNKHDRSZ excluding trailing padding. */
+#ifdef MEMORY_CONTEXT_CHECKING
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, requested_size) + sizeof(Size))
+#else
+#define SLAB_CHUNK_USED	\
+	(offsetof(SlabChunkData, size) + sizeof(Size))
+#endif
+
+typedef struct SlabBlockData *SlabBlock;		/* forward reference */
+typedef struct SlabChunkData *SlabChunk;
+
+/*
+ * SlabPointer
+ *		Aligned pointer which may be a member of a slab context.
+ */
+typedef void *SlabPointer;
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+	MemoryContextData header;	/* Standard memory-context fields */
+	/* Allocation parameters for this context: */
+	Size		chunkSize;		/* chunk size */
+	Size		fullChunkSize;	/* chunk size including header and alignment */
+	Size		blockSize;		/* block size */
+	int			chunksPerBlock;	/* number of chunks per block */
+	int			minFreeCount;	/* min number of free chunks in any block */
+	int			nblocks;		/* number of blocks allocated */
+	bool		autodestruct;	/* destruct after freeing the last block */
+	/* Info about storage allocated in this context: */
+	SlabBlock	freelist[1];	/* free lists (block-level) */
+} SlabContext;
+
+typedef SlabContext *Slab;
+
+/*
+ * SlabBlockData
+ *		Structure of a single block in SLAB allocator.
+ *
+ * slab: context owning this block
+ * prev, next: used for doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: index of the first free chunk in the block
+ * bitmapptr: pointer to the free bitmap (tracking free chunks)
+ */
+typedef struct SlabBlockData
+{
+	Slab		slab;			/* slab that owns this block */
+	SlabBlock	prev;			/* previous block in slab's block list */
+	SlabBlock	next;			/* next block in slab's blocks list */
+	int			nfree;			/* number of free chunks */
+	int			firstFreeChunk;	/* index of the first free chunk in the block */
+	char	   *bitmapptr;		/* pointer to free bitmap */
+}	SlabBlockData;
+
+/*
+ * SlabChunk
+ *		The prefix of each piece of memory in an SlabBlock
+ *
+ * NB: this MUST match StandardChunkHeader as defined by utils/memutils.h.
+ * However it's possible to add fields in front of the StandardChunkHeader
+ * fields, which is how we store the pointer to the owning block here.
+ */
+typedef struct SlabChunkData
+{
+	/* block owning this chunk */
+	void	   *block;
+	/* slab is the owning slab context */
+	void	   *slab;
+	/* size is always the size of the usable space in the chunk */
+	Size		size;
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* when debugging memory usage, also store actual requested size */
+	/* this is zero in a free chunk */
+	Size		requested_size;
+#endif
+}	SlabChunkData;
+
+
+/*
+ * SlabIsValid
+ *		True iff set is valid allocation set.
+ */
+#define SlabIsValid(set) PointerIsValid(set)
+
+#define SlabPointerGetChunk(ptr)	\
+					((SlabChunk)(((char *)(ptr)) - SLAB_CHUNKHDRSZ))
+#define SlabChunkGetPointer(chk)	\
+					((SlabPointer)(((char *)(chk)) + SLAB_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabInit(MemoryContext context);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static MemoryContextMethods SlabMethods = {
+	SlabAlloc,
+	SlabFree,
+	SlabRealloc,
+	SlabInit,
+	SlabReset,
+	SlabDelete,
+	SlabGetChunkSpace,
+	SlabIsEmpty,
+	SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+	,SlabCheck
+#endif
+};
+
+/* ----------
+ * Debug macros
+ * ----------
+ */
+#ifdef HAVE_ALLOCINFO
+#define SlabFreeInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabFree: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#define SlabAllocInfo(_cxt, _chunk) \
+			fprintf(stderr, "SlabAlloc: %s: %p, %d\n", \
+				(_cxt)->header.name, (_chunk), (_chunk)->size)
+#else
+#define SlabFreeInfo(_cxt, _chunk)
+#define SlabAllocInfo(_cxt, _chunk)
+#endif
+
+
+#ifdef CLOBBER_FREED_MEMORY
+
+/* Wipe freed memory for debugging purposes */
+static void
+wipe_mem(void *ptr, size_t size)
+{
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	memset(ptr, 0x7F, size);
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, size);
+}
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void
+set_sentinel(void *base, Size offset)
+{
+	char	   *ptr = (char *) base + offset;
+
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, 1);
+	*ptr = 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+}
+
+static bool
+sentinel_ok(const void *base, Size offset)
+{
+	const char *ptr = (const char *) base + offset;
+	bool		ret;
+
+	VALGRIND_MAKE_MEM_DEFINED(ptr, 1);
+	ret = *ptr == 0x7E;
+	VALGRIND_MAKE_MEM_NOACCESS(ptr, 1);
+
+	return ret;
+}
+#endif
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data.  It's not really
+ * very random, just a repeating sequence with a length that's prime.  What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it.  Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward.  Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary.  UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+static void
+randomize_mem(char *ptr, size_t size)
+{
+	static int	save_ctr = 1;
+	size_t		remaining = size;
+	int			ctr;
+
+	ctr = save_ctr;
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+	while (remaining-- > 0)
+	{
+		*ptr++ = ctr;
+		if (++ctr > 251)
+			ctr = 1;
+	}
+	VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+	save_ctr = ctr;
+}
+#endif   /* RANDOMIZE_ALLOCATED_MEMORY */
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * SlabContextCreate
+ *		Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (for debugging --- string will be copied)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize)
+{
+	int		chunksPerBlock;
+	Size	fullChunkSize;
+	Slab	set;
+
+	/* chunk, including SLAB header (both addresses nicely aligned) */
+	fullChunkSize = MAXALIGN(sizeof(SlabChunkData) + MAXALIGN(chunkSize));
+
+	/* make sure the block can store at least one chunk (plus a byte of bitmap) */
+	if (blockSize - sizeof(SlabBlockData) < fullChunkSize + 1)
+		elog(ERROR, "block size %zu for slab is too small for chunks %zu",
+					blockSize, chunkSize);
+
+	/* so how many chunks can we fit into a block, including header and bitmap? */
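+	/*
+	 * Each chunk needs fullChunkSize bytes plus one bit in the bitmap, so we
+	 * solve
+	 *
+	 *		chunksPerBlock * fullChunkSize + (chunksPerBlock + 7) / 8
+	 *			<= blockSize - sizeof(SlabBlockData)
+	 *
+	 * for chunksPerBlock; the "- 7" accounts for rounding the bitmap up to
+	 * whole bytes.
+	 */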
+	chunksPerBlock
+		=  (8 * (blockSize - sizeof(SlabBlockData)) - 7) / (8 * fullChunkSize + 1);
+
+	/* if we can't fit at least one chunk into the block, we're hosed */
+	Assert(chunksPerBlock > 0);
+
+	/* make sure the chunks (and bitmap) actually fit on the block  */
+	Assert(fullChunkSize * chunksPerBlock + ((chunksPerBlock + 7) / 8) + sizeof(SlabBlockData) <= blockSize);
+
+	/* Do the type-independent part of context creation */
+	set = (Slab) MemoryContextCreate(T_SlabContext,
+									 /* allocate context and freelist at once */
+									 (offsetof(SlabContext, freelist) + sizeof(SlabBlock) * (chunksPerBlock + 1)),
+									 &SlabMethods,
+									 parent,
+									 name);
+
+	set->blockSize = blockSize;
+	set->chunkSize = chunkSize;
+	set->fullChunkSize = fullChunkSize;
+	set->chunksPerBlock = chunksPerBlock;
+	set->nblocks = 0;
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	return (MemoryContext) set;
+}
+
+/*
+ * SlabInit
+ *		Context-type-specific initialization routine. SlabContext does not
+ *		need anything extra, at this moment.
+ */
+static void
+SlabInit(MemoryContext context)
+{
+	/*
+	 * Since MemoryContextCreate already zeroed the context node, we don't
+	 * have to do anything here: it's already OK.
+	 */
+}
+
+/*
+ * SlabReset
+ *		Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+	int		i;
+	Slab	set = (Slab) context;
+
+	AssertArg(SlabIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Check for corruption and leaks before freeing */
+	SlabCheck(context);
+#endif
+
+	/* walk over freelists and free the blocks */
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		set->freelist[i] = NULL;
+
+		while (block != NULL)
+		{
+			SlabBlock	next = block->next;
+
+			/* Normal case, release the block */
+#ifdef CLOBBER_FREED_MEMORY
+			wipe_mem(block, set->blockSize);
+#endif
+			free(block);
+			set->nblocks--;
+
+			block = next;
+		}
+	}
+
+	set->minFreeCount = 0;
+	set->autodestruct = false;
+
+	Assert(set->nblocks == 0);
+}
+
+/*
+ * SlabDelete
+ *		Frees all memory which is allocated in the given set, in preparation
+ *		for deletion of the set. We simply call SlabReset().
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+	/* just reset the context */
+	SlabReset(context);
+}
+
+/*
+ * Operations on the freelist - a block is kept in set->freelist[block->nfree],
+ * so whenever nfree changes, the block has to be moved to the matching list.
+ */
+static void
+remove_from_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	/* either it has a previous block, or it's the first block in list */
+	if (block->prev)
+		block->prev->next = block->next;
+	else
+		set->freelist[nfree_old] = block->next;
+
+	/* if it has a next block, update it too */
+	if (block->next)
+		block->next->prev = block->prev;
+
+	block->prev = NULL;
+	block->next = NULL;
+}
+
+static void
+add_to_freelist(Slab set, SlabBlock block)
+{
+	/* add the block to the proper freelist bin */
+	if (set->freelist[block->nfree])
+		set->freelist[block->nfree]->prev = block;
+
+	block->next = set->freelist[block->nfree];
+	set->freelist[block->nfree] = block;
+}
+
+static void
+move_in_freelist(Slab set, SlabBlock block, int nfree_old)
+{
+	remove_from_freelist(set, block, nfree_old);
+	add_to_freelist(set, block);
+}
+
+
+/*
+ * SlabAlloc
+ *		Returns pointer to allocated memory of given size or NULL if
+ *		request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ *		MAXALIGN_DOWN(SIZE_MAX) - SLAB_BLOCKHDRSZ - SLAB_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+	Slab	set = (Slab) context;
+	SlabBlock	block;
+	SlabChunk	chunk;
+	int			idx;
+
+	AssertArg(SlabIsValid(set));
+
+	Assert((set->minFreeCount >= 0) && (set->minFreeCount < set->chunksPerBlock));
+
+	/* make sure we only allow correct request size */
+	if (size != set->chunkSize)
+		elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
+					size, set->chunkSize);
+
+	/*
+	 * If there are no free chunks in any existing block, create a new block
+	 * and put it to the last freelist bucket.
+	 */
+	if (set->minFreeCount == 0)
+	{
+		block = (SlabBlock)malloc(set->blockSize);
+
+		if (block == NULL)
+			return NULL;
+
+		memset(block, 0, set->blockSize);
+
+		block->slab = set;
+		block->nfree = set->chunksPerBlock;
+		block->prev = NULL;
+		block->next = NULL;
+		block->firstFreeChunk = 0;
+
+		/* the free bitmap is placed at the end */
+		block->bitmapptr
+			= ((char *) block) + set->blockSize - ((set->chunksPerBlock + 7) / 8);
+
+		/*
+		 * And add it to the last freelist, i.e. the one for blocks with all
+		 * chunks free (we know there are no other blocks with free chunks,
+		 * otherwise we wouldn't be allocating a new one).
+		 */
+		set->freelist[set->chunksPerBlock] = block;
+		set->minFreeCount = set->chunksPerBlock;
+		set->nblocks += 1;
+	}
+
+	/* grab a block from the freelist (a newly allocated block ends up there too) */
+	block = set->freelist[set->minFreeCount];
+
+	/* make sure we actually got a valid block, with matching nfree */
+	Assert(block != NULL);
+	Assert(set->minFreeCount == block->nfree);
+	Assert(block->nfree > 0);
+
+	Assert((char*)block < block->bitmapptr);
+	Assert((char*)block + set->blockSize > block->bitmapptr);
+
+	/* we know the first free chunk */
+	idx = block->firstFreeChunk;
+
+	/* make sure the chunk index is valid, and that it's marked as empty */
+	Assert((idx >= 0) && (idx < set->chunksPerBlock));
+	Assert(!((block->bitmapptr[idx/8] & (0x01 << (idx % 8)))));
+
+	/* mark the chunk as used (set 1 to the bit) */
+	block->bitmapptr[idx/8] |= (0x01 << (idx % 8));
+
+	/* compute the chunk location from the block start (after the block header) */
+	chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData)
+									  + (idx * set->fullChunkSize));
+
+	/*
+	 * update the block nfree count, and also the minFreeCount as we've
+	 * decreased nfree for a block with the minimum count
+	 */
+	block->nfree--;
+	set->minFreeCount = block->nfree;
+
+	/*
+	 * We also need to find the next free chunk for the next alloc call
+	 * (unless the block just got full, in which case we simply set the index
+	 * to chunksPerBlock).
+	 */
+	if (block->nfree == 0)
+		block->firstFreeChunk = set->chunksPerBlock;
+	else
+	{
+		/* look for the next free chunk in the block, after the first one */
+		while ((++block->firstFreeChunk) < set->chunksPerBlock)
+		{
+			int byte = block->firstFreeChunk / 8;
+			int bit  = block->firstFreeChunk % 8;
+
+			/* stop at the first zero bit (unused chunk) */
+			if (! (block->bitmapptr[byte] & (0x01 << bit)))
+				break;
+		}
+
+		/* must have found the free chunk */
+		Assert(block->firstFreeChunk != set->chunksPerBlock);
+	}
+
+	/* move the block to the right place in the freelist */
+	move_in_freelist(set, block, (block->nfree + 1));
+
+	/* if the block got completely full, find the new minimum across freelists */
+	if (set->minFreeCount == 0)
+		for (idx = 1; idx <= set->chunksPerBlock; idx++)
+			if (set->freelist[idx])
+			{
+				set->minFreeCount = idx;
+				break;
+			}
+
+	if (set->minFreeCount == set->chunksPerBlock)
+		set->minFreeCount = 0;
+
+	/* Prepare to initialize the chunk header. */
+	VALGRIND_MAKE_MEM_UNDEFINED(chunk, SLAB_CHUNK_USED);
+
+	chunk->slab = (void *) set;
+	chunk->block = (void *) block;
+	chunk->size = MAXALIGN(size);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	chunk->requested_size = size;
+	VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+							   sizeof(chunk->requested_size));
+	/* set mark to catch clobber of "unused" space */
+	if (size < chunk->size)
+		set_sentinel(SlabChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+	/* fill the allocated space with junk */
+	randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+	SlabAllocInfo(set, chunk);
+	return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ *		Frees allocated memory; memory is removed from the set.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+	int		idx;
+	Slab	set = (Slab) context;
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+	SlabBlock	block = chunk->block;
+
+	SlabFreeInfo(set, chunk);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+							  sizeof(chunk->requested_size));
+	/* Test for someone scribbling on unused space in chunk */
+	if (chunk->requested_size < chunk->size)
+		if (!sentinel_ok(pointer, chunk->requested_size))
+			elog(WARNING, "detected write past chunk end in %s %p",
+				 set->header.name, chunk);
+#endif
+
+	/* compute the chunk index relative to the block start */
+	idx = ((char*)chunk - ((char*)block + sizeof(SlabBlockData))) / set->fullChunkSize;
+
+	Assert((block->bitmapptr[idx/8] & (0x01 << (idx % 8))));
+
+	/* mark the chunk as unused (clear the bit), and update the block's nfree count */
+	block->bitmapptr[idx/8] ^= (0x01 << (idx % 8));
+	block->nfree++;
+	block->firstFreeChunk = Min(block->firstFreeChunk, idx);
+
+	Assert(block->nfree > 0);
+	Assert(block->nfree <= set->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+	wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+	/* Reset requested_size to 0 in chunks that are on freelist */
+	chunk->requested_size = 0;
+#endif
+
+	/* now decide what to do with the block */
+
+	/*
+	 * See if we need to update the minFreeCount field for the set - we only
+	 * need to do that if the block had the minimum number of free chunks
+	 * before we freed one. In that case we check whether other blocks with
+	 * that number of free chunks remain - it's enough to check whether the
+	 * block has siblings in its freelist. If not, we bump the value by one,
+	 * as this block still has the fewest free chunks (or reset it to 0 if
+	 * the block became completely free and gets released below).
+	 */
+	if (set->minFreeCount == (block->nfree-1))
+		if ((block->prev == NULL) && (block->next == NULL)) /* no other blocks */
+		{
+			/* but if we made the block entirely free, we'll free it */
+			if (block->nfree == set->chunksPerBlock)
+				set->minFreeCount = 0;
+			else
+				set->minFreeCount++;
+		}
+
+	/* remove the block from a freelist */
+	remove_from_freelist(set, block, block->nfree-1);
+
+	/* If the block is now completely empty, free it. */
+	if (block->nfree == set->chunksPerBlock)
+	{
+		free(block);
+		set->nblocks--;
+	}
+	else
+		add_to_freelist(set, block);
+
+	Assert(set->nblocks >= 0);
+
+	/*
+	 * If the context is marked for autodestruct and we've just released its
+	 * last block, delete the whole context - but only if it has no children.
+	 */
+	if (set->autodestruct && (set->nblocks == 0) && (context->firstchild == NULL))
+		MemoryContextDelete(context);
+}
+
+/*
+ * SlabRealloc
+ *		As Slab is designed for allocating equally-sized chunks of memory, it
+ *		can't really do an actual realloc. However we try to be gentle and
+ *		allow calls with exactly the same size as in that case we can simply
+ *		return the same chunk. When the size differs, we throw an error.
+ *
+ *	XXX We might be even gentler and allow cases when (size < chunkSize).
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+	Slab	set = (Slab)context;
+
+	/* can't do actual realloc with slab, but let's try to be gentle */
+	if (size == set->chunkSize)
+		return pointer;
+
+	elog(ERROR, "slab allocator does not support realloc()");
+
+	return NULL;				/* keep compiler quiet */
+}
+
+/*
+ * SlabGetChunkSpace
+ *		Given a currently-allocated chunk, determine the total space
+ *		it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+	SlabChunk	chunk = SlabPointerGetChunk(pointer);
+
+	return chunk->size + SLAB_CHUNKHDRSZ;
+}
+
+/*
+ * SlabIsEmpty
+ *		Is a Slab context empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+	Slab		set = (Slab) context;
+
+	return (set->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ *		Compute stats about memory consumption of a Slab context.
+ *
+ * level: recursion level (0 at top level); used for print indentation.
+ * print: true to print stats to stderr.
+ * totals: if not NULL, add stats about this Slab into *totals.
+ */
+static void
+SlabStats(MemoryContext context, int level, bool print,
+			  MemoryContextCounters *totals)
+{
+	Slab		set = (Slab) context;
+	Size		nblocks = 0;
+	Size		freechunks = 0;
+	Size		totalspace = 0;
+	Size		freespace = 0;
+	int			i;
+
+	for (i = 0; i <= set->chunksPerBlock; i++)
+	{
+		SlabBlock block = set->freelist[i];
+		while (block != NULL)
+		{
+			nblocks++;
+			totalspace += set->blockSize;
+			freespace += set->fullChunkSize * block->nfree;
+			freechunks += block->nfree;
+			block = block->next;
+		}
+	}
+
+	if (print)
+	{
+		int			i;
+
+		for (i = 0; i < level; i++)
+			fprintf(stderr, "  ");
+		fprintf(stderr,
+			"%s: %zu total in %zd blocks; %zu free (%zd chunks); %zu used; autodestruct %d\n",
+				set->header.name, totalspace, nblocks, freespace, freechunks,
+				totalspace - freespace, set->autodestruct);
+	}
+
+	if (totals)
+	{
+		totals->nblocks += nblocks;
+		totals->freechunks += freechunks;
+		totals->totalspace += totalspace;
+		totals->freespace += freespace;
+	}
+}
+
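+/*
+ * SlabAutodestruct
+ *		Mark the context for automatic destruction - the context gets deleted
+ *		once its last block is freed (see SlabFree).
+ */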
+void
+SlabAutodestruct(MemoryContext context)
+{
+	Slab	set = (Slab)context;
+
+	Assert(IsA(set, SlabContext));
+
+	set->autodestruct = true;
+}
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ *		Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL.  Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+	int i;
+	Slab	slab = (Slab) context;
+	char	   *name = slab->header.name;
+
+	for (i = 0; i <= slab->chunksPerBlock; i++)
+	{
+		int j, nfree;
+		SlabBlock block = slab->freelist[i];
+
+		/* no entries in this freelist slot */
+		if (! block)
+			continue;
+
+		if (block->slab != (void *) slab)
+			elog(WARNING, "problem in slab %s: bogus slab link in block %p",
+				 name, block);
+
+		/*
+		 * Make sure the number of free chunks (in the block header) matches
+		 * position in the freelist.
+		 */
+		if (block->nfree != i)
+			elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+				 name, block->nfree, block, i);
+
+		/*
+		 * Now walk through the chunks, count the free ones and also perform
+		 * some additional checks for the used ones.
+		 */
+
+		nfree = 0;
+		for (j = 0; j < slab->chunksPerBlock; j++)
+		{
+			/* a non-zero bit in the bitmap means the chunk is used */
+			if ((block->bitmapptr[j/8] & (0x01 << (j % 8))) != 0)
+			{
+				SlabChunk chunk = (SlabChunk) ((char*)block + sizeof(SlabBlockData)
+								  + (j * slab->fullChunkSize));
+
+				/* make requested_size readable for the checks below */
+				VALGRIND_MAKE_MEM_DEFINED(&chunk->requested_size,
+										  sizeof(chunk->requested_size));
+
+				/* chunks have both block and slab pointers, so check both of them */
+
+				if (chunk->block != block)
+					elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+						 name, block, chunk);
+
+				if (chunk->slab != slab)
+					elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* now make sure the chunk size is correct */
+				if (chunk->size != MAXALIGN(slab->chunkSize))
+					elog(WARNING, "problem in slab %s: bogus chunk size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* now make sure the requested size is correct */
+				if (chunk->requested_size != slab->chunkSize)
+					elog(WARNING, "problem in slab %s: bogus chunk requested size in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* there might be a sentinel (thanks to alignment) */
+				if (chunk->requested_size < chunk->size &&
+					!sentinel_ok(chunk, SLAB_CHUNKHDRSZ + chunk->requested_size))
+					elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+						 name, block, chunk);
+
+				/* disallow access to requested_size again */
+				VALGRIND_MAKE_MEM_NOACCESS(&chunk->requested_size,
+										   sizeof(chunk->requested_size));
+			}
+			else
+				/* free chunk */
+				nfree += 1;
+		}
+
+		/*
+		 * Make sure we got the expected number of free chunks (as tracked in
+		 * the block header).
+		 */
+		if (nfree != block->nfree)
+			elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+				 name, block->nfree, block, nfree);
+	}
+}
+
+#endif   /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h
index ba069cc..92a7478 100644
--- a/src/include/nodes/memnodes.h
+++ b/src/include/nodes/memnodes.h
@@ -96,6 +96,6 @@ typedef struct MemoryContextData
  */
 #define MemoryContextIsValid(context) \
 	((context) != NULL && \
-	 (IsA((context), AllocSetContext)))
+	 (IsA((context), AllocSetContext) || IsA((context), SlabContext)))
 
 #endif   /* MEMNODES_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 88297bb..2cbbfec 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -275,6 +275,7 @@ typedef enum NodeTag
 	 */
 	T_MemoryContext = 600,
 	T_AllocSetContext,
+	T_SlabContext,
 
 	/*
 	 * TAGS FOR VALUE NODES (value.h)
diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h
index 9e209ae..e8a8d77 100644
--- a/src/include/replication/reorderbuffer.h
+++ b/src/include/replication/reorderbuffer.h
@@ -331,6 +331,19 @@ struct ReorderBuffer
 	MemoryContext context;
 
 	/*
+	 * Memory contexts for specific object types: slab contexts for changes,
+	 * TXNs and (current-generation) tuples, plus an aset context for
+	 * oversized tuples.
+	 */
+	MemoryContext change_context;
+	MemoryContext txn_context;
+	MemoryContext tup_context_slab;
+	MemoryContext tup_context_oversized;
+
+	/* counters for the current generation of tuples */
+	int		tuples_count;	/* number of tuples in this generation */
+	Size	tuples_size;	/* total (requested) size of those tuples */
+	Size	current_size;	/* tuple chunk size of the current slab context */
+
+	/*
 	 * Data structure slab cache.
 	 *
 	 * We allocate/deallocate some structures very frequently, to avoid bigger
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index e6334a2..fd2c9c2 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -135,6 +135,14 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
 					  Size initBlockSize,
 					  Size maxBlockSize);
 
+/* slab.c */
+extern MemoryContext SlabContextCreate(MemoryContext parent,
+					  const char *name,
+					  Size blockSize,
+					  Size chunkSize);
+
+extern void SlabAutodestruct(MemoryContext context);
+
 /*
  * Recommended default alloc parameters, suitable for "ordinary" contexts
  * that might hold quite a lot of data.
@@ -171,4 +179,7 @@ extern MemoryContext AllocSetContextCreate(MemoryContext parent,
  */
 #define ALLOCSET_SEPARATE_THRESHOLD  8192
 
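+/*
+ * Block sizes for slab memory contexts (see slab.c).
+ */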
+#define SLAB_DEFAULT_BLOCK_SIZE		8192
+#define SLAB_LARGE_BLOCK_SIZE		(8 * 1024 * 1024)
+
 #endif   /* MEMUTILS_H */
-- 
2.5.5

