commit 52a1753ffbdaf6fa4d02ed0accded0feb634068c
Author: mithun <mithun@localhost.localdomain>
Date:   Thu Oct 27 16:58:53 2016 +0530

    Proposal pg_autoprewarm

diff --git a/contrib/pg_autoprewarm/Makefile b/contrib/pg_autoprewarm/Makefile
new file mode 100644
index 0000000..2e754b0
--- /dev/null
+++ b/contrib/pg_autoprewarm/Makefile
@@ -0,0 +1,15 @@
+# contrib/pg_autorewarm/Makefile
+
+MODULE_big = pg_autoprewarm
+OBJS = pg_autoprewarm.o $(WIN32RES)
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_autoprewarm
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_autoprewarm/README b/contrib/pg_autoprewarm/README
new file mode 100644
index 0000000..2fda8ea
--- /dev/null
+++ b/contrib/pg_autoprewarm/README
@@ -0,0 +1,27 @@
+# pg_autoprewarm.
+
+This a PostgreSQL contrib module which automatically dump all of the blocknums
+present in buffer pool at the time of server shutdown(smart and fast mode only,
+to be enhanced to dump at regular interval.)
+and load these blocks when server restarts.
+
+Design:
+------
+We have created a BG worker Auto Pre-warmer which during shutdown dumps all the
+blocknum in buffer pool after sorting same.
+Format of each entry is <DatabaseId,TableSpaceId,RelationId,Forknum,BlockNum>.
+Auto Pre-warmer is started as soon as the postmaster is started we do not wait
+for recovery to finish and database to reach a consistent state. If there is a
+"dump_file" to load we start loading each block entry to buffer pool until
+there is a free buffer. This way we do not replace any new blocks which was
+loaded either by recovery process or querying clients.
+
+HOW TO USE:
+-----------
+Build and add the pg_autoprewarm to shared_preload_libraries. Auto Pre-warmer
+process automatically do dumping of buffer pool block info and load them when
+restarted.
+
+TO DO:
+------
+Add functionality to dump based on timer at regular interval.
\ No newline at end of file
diff --git a/contrib/pg_autoprewarm/pg_autoprewarm.c b/contrib/pg_autoprewarm/pg_autoprewarm.c
new file mode 100644
index 0000000..f23a8db
--- /dev/null
+++ b/contrib/pg_autoprewarm/pg_autoprewarm.c
@@ -0,0 +1,465 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_autoprewarm.c
+ *	Automatically dumps and load buffer pool.
+ *
+ *	contrib/pg_autoprewarm/pg_autoprewarm.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/resowner.h"
+#include "catalog/pg_class.h"
+#include <unistd.h>
+
+PG_MODULE_MAGIC;
+
+static void AutoPreWarmerMain (Datum main_arg);
+static bool
+load_block(RelFileNode rnode, char reltype, ForkNumber forkNum,
+		   BlockNumber blockNum);
+
+/* Primary functions */
+void            _PG_init(void);
+
+/* Secondary/supporting functions */
+static void     sigtermHandler(SIGNAL_ARGS);
+
+/* flags set by signal handlers */
+static volatile sig_atomic_t got_sigterm = false;
+
+/*
+ *	Signal handler for SIGTERM
+ *	set our latch to wake it up.
+ */
+static void
+sigtermHandler(SIGNAL_ARGS)
+{
+	int save_errno = errno;
+	got_sigterm = true;
+
+	if (MyProc)
+		SetLatch(&MyProc->procLatch);
+
+	errno = save_errno;
+}
+
+/* Meta-data of each persistent page buffer which is dumped and used to load. */
+typedef struct BlockInfoRecord
+{
+	Oid			database;	/* datbase */
+	Oid			spcNode;	/* tablespace */
+	Oid			filenode;	/* relation */
+	ForkNumber	forknum;	/* fork number */
+	BlockNumber	blocknum;	/* block number */
+}BlockInfoRecord;
+
+/* Try loading only once during startup. If any error do not retry. */
+static bool avoid_loading = false;
+
+/*
+ * And avoid dumping if we receive sigterm while loading. Also do not re-try if
+ * dump has failed previously.
+ */
+static bool avoid_dumping = false;
+
+/* compare member elements to check if they are not equal. */
+#define cmp_member_elem(fld)	\
+do { \
+	if (a->fld < b->fld)		\
+		return -1;				\
+	else if (a->fld > b->fld)	\
+		return 1;				\
+} while(0);
+
+/*
+ * sort_cmp_func - compare function used while qsorting BlockInfoRecord objects.
+ */
+static int
+sort_cmp_func(const void *p, const void *q)
+{
+	BlockInfoRecord *a = (BlockInfoRecord *) p;
+	BlockInfoRecord *b = (BlockInfoRecord *) q;
+
+	cmp_member_elem(database);
+	cmp_member_elem(spcNode);
+	cmp_member_elem(filenode);
+	cmp_member_elem(forknum);
+	cmp_member_elem(blocknum);
+	return 0;
+}
+
+#define DEFAULT_DUMP_FILENAME "autoprewarm"
+
+/*
+ *	load_block -	Load a given block.
+ */
+bool
+load_block(RelFileNode rnode, char reltype, ForkNumber forkNum,
+		   BlockNumber blockNum)
+{
+	Buffer      buffer;
+
+	/* Load the page only if there exist a free buffer. We do not want to
+	 * replace an existing buffer. */
+	if (have_free_buffer())
+	{
+		SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+
+		/*
+		 * Check if fork exists first otherwise we will not be able to use one
+		 * free buffer for each non existing block.
+		 */
+		if  (smgrexists(smgr, forkNum))
+		{
+			buffer = ReadBufferForPrewarm(smgr, reltype,
+										  forkNum, blockNum,
+										  RBM_NORMAL, NULL);
+			if (!BufferIsValid(buffer))
+				elog(LOG, "\n Skipped the buff page. \n");
+			else
+				ReleaseBuffer(buffer);
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ *	load_now - Main routine which reads from dump file and load each block.
+ *	We try to load each blocknum read from DEFAULT_DUMP_FILENAME until we have
+ *	any free buffer left or SIGTERM is received. If we fail to load a block we
+ *	ignore the ERROR and try to load next blocknum. This is because there is
+ *	possibility that corresponding blocknum might have been deleted.
+ */
+static void load_now(void)
+{
+	static char dump_file_path[MAXPGPATH];
+	FILE *file = NULL;
+	uint32 i, num_buffers = 0;
+
+	avoid_loading = true;
+
+	/* Check if file exists and open file in read mode. */
+	snprintf(dump_file_path, sizeof(dump_file_path), "%s.save", DEFAULT_DUMP_FILENAME);
+	file = fopen(dump_file_path, PG_BINARY_R);
+
+	if (!file)
+		return;	/* No file to load. */
+
+	if (fscanf(file,"<<%u>>", &num_buffers) != 1)
+	{
+		fclose(file);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("Error reading num of elements in \"%s\" for autoprewarm : %m", dump_file_path)));
+	}
+
+	elog(LOG, "Num buffers : %d \n", num_buffers);
+
+	for (i = 0; i < num_buffers; i++)
+	{
+		RelFileNode	rnode;
+		uint32		forknum;
+		BlockNumber	blocknum;
+		bool		have_free_buf = true;
+
+		if (got_sigterm)
+		{
+			/*
+			 * Received shutdown while we were still loading the buffers.
+			 * No need to dump at this stage.
+			 */
+			avoid_dumping = true;
+			break;
+		}
+
+		if(!have_free_buf)
+			break;
+
+		/* Get next block. */
+		if (5 != fscanf(file, "%u,%u,%u,%u,%u\n", &rnode.dbNode, &rnode.spcNode,
+							 &rnode.relNode, &forknum, &blocknum))
+			break;	/* No more valid entry hence stop processing. */
+
+		PG_TRY();
+		{
+			have_free_buf = load_block(rnode, RELPERSISTENCE_PERMANENT,
+									   (ForkNumber)forknum, blocknum);
+		}
+		PG_CATCH();
+		{
+			/* Any error handle it and then try to load next buffer. */
+
+			/* Prevent interrupts while cleaning up */
+			HOLD_INTERRUPTS();
+
+			/* Report the error to the server log */
+			EmitErrorReport();
+
+			LWLockReleaseAll();
+			AbortBufferIO();
+			UnlockBuffers();
+
+			/* buffer pins are released here. */
+			ResourceOwnerRelease(CurrentResourceOwner,
+								 RESOURCE_RELEASE_BEFORE_LOCKS,
+								 false, true);
+			FlushErrorState();
+
+			/* Now we can allow interrupts again */
+			RESUME_INTERRUPTS();
+		}
+		PG_END_TRY();
+	}
+
+	fclose(file);
+
+	elog(LOG, "loaded");
+	return;
+}
+
+
+/*
+ *	dump_now - Main routine which goes through each buffer header and dump
+ *	their metadata in the format.
+ *	<DatabaseId,TableSpaceId,RelationId,Forknum,BlockNum>. We Sort these data
+ *	and then dump them. Sorting is necessary as it facilitates sequential read
+ *	during load. Unlike load if we encounter any error we abort the dump.
+ */
+static void dump_now(void)
+{
+	static char		dump_file_path[MAXPGPATH],
+					transient_dump_file_path[MAXPGPATH];
+	uint32			i;
+	int				ret;
+	uint32			num_buffers;
+	BlockInfoRecord	*block_info_array;
+	BufferDesc		*bufHdr;
+	FILE			*file = NULL;
+
+	avoid_dumping = true;
+	block_info_array = (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers);
+
+	if (!block_info_array)
+		elog(ERROR, "Out of Memory!");
+
+	for (num_buffers = 0, i = 0; i < NBuffers; i++)
+	{
+		uint32 buf_state;
+
+		bufHdr = GetBufferDescriptor(i);
+
+		/* Lock each buffer header before inspecting. */
+		buf_state = LockBufHdr(bufHdr);
+
+		/* Only valid and persistant page buffers are dumped. */
+		if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID) &&
+			(buf_state & BM_PERMANENT))
+		{
+			block_info_array[num_buffers].database = bufHdr->tag.rnode.dbNode;
+			block_info_array[num_buffers].spcNode  = bufHdr->tag.rnode.spcNode;
+			block_info_array[num_buffers].filenode = bufHdr->tag.rnode.relNode;
+			block_info_array[num_buffers].forknum  = bufHdr->tag.forkNum;
+			block_info_array[num_buffers].blocknum = bufHdr->tag.blockNum;
+			++num_buffers;
+		}
+
+		UnlockBufHdr(bufHdr, buf_state);
+	}
+
+	/* Sorting now only to avoid sorting while loading. */
+	pg_qsort(block_info_array, num_buffers, sizeof(BlockInfoRecord), sort_cmp_func);
+
+	snprintf(transient_dump_file_path, sizeof(dump_file_path),
+			 "%s.save.tmp", DEFAULT_DUMP_FILENAME);
+	file = fopen(transient_dump_file_path, "w");
+	if (file == NULL)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open \"%s\": %m", dump_file_path)));
+
+	snprintf(dump_file_path, sizeof(dump_file_path),
+			 "%s.save", DEFAULT_DUMP_FILENAME);
+
+	/* Write num_buffers first and then BlockMetaInfoRecords. */
+	ret = fprintf(file, "<<%u>>\n", num_buffers);
+	if (ret < 0)
+	{
+		fclose(file);
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("error writing to \"%s\" : %m", dump_file_path)));
+	}
+
+	for (i = 0; i < num_buffers; i++)
+	{
+		ret = fprintf(file, "%u,%u,%u,%u,%u\n",
+							block_info_array[i].database,
+							block_info_array[i].spcNode,
+							block_info_array[i].filenode,
+							(uint32)block_info_array[i].forknum,
+							block_info_array[i].blocknum);
+		if (ret < 0)
+		{
+			fclose(file);
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					errmsg("error writing to \"%s\" : %m", dump_file_path)));
+		}
+	}
+
+	pfree(block_info_array);
+
+	/*
+	 * Rename transient_dump_file_path to dump_file_path to make things
+	 * permanent.
+	 */
+	ret = fclose(file);
+	if (ret != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("error closing \"%s\" : %m", transient_dump_file_path)));
+
+	ret = unlink(dump_file_path);
+	if (ret != 0 && errno != ENOENT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("unlink \"%s\" failed : %m", dump_file_path)));
+
+	ret = rename(transient_dump_file_path, dump_file_path);
+	if (ret != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("Failed to rename \"%s\" to \"%s\" : %m",
+					   transient_dump_file_path, dump_file_path)));
+
+	elog(LOG, "Buffer Dump: saved metadata of %d blocks", num_buffers);
+}
+
+/* Extension's entry point. */
+void _PG_init(void)
+{
+	BackgroundWorker auto_prewarm;
+
+	/* Register AutoPreWarmer. */
+	MemSet(&auto_prewarm, 0, sizeof(auto_prewarm));
+	auto_prewarm.bgw_main_arg = Int32GetDatum(0);
+	auto_prewarm.bgw_flags      = BGWORKER_SHMEM_ACCESS;
+
+	/* Register the Auto Pre-warmer background worker */
+	auto_prewarm.bgw_start_time = BgWorkerStart_PostmasterStart;
+	auto_prewarm.bgw_restart_time   = 0;  /* Keep the Auto Pre-warmer running */
+	auto_prewarm.bgw_main           = AutoPreWarmerMain;
+	snprintf(auto_prewarm.bgw_name, BGW_MAXLEN, "Auto Pre-warmer");
+	RegisterBackgroundWorker(&auto_prewarm);
+}
+
+/*
+ * AutoPreWarmerMain -- Main entry point of Auto-prewarmer process.
+ *						This is invoked as a background worker.
+ */
+static void AutoPreWarmerMain (Datum main_arg)
+{
+	MemoryContext	autoprewarmer_context;
+	sigjmp_buf		local_sigjmp_buf;
+
+	/* Establish signal handlers before unblocking signals. */
+	pqsignal(SIGTERM, sigtermHandler);
+
+	/*
+	 * Create a resource owner to keep track of our resources.
+	 */
+	CurrentResourceOwner = ResourceOwnerCreate(NULL, "AutoPreWarmer");
+
+	/*
+	 * Create a memory context that we will do all our work in.  We do this so
+	 * that we can reset the context during error recovery and thereby avoid
+	 * possible memory leaks.
+	 */
+	autoprewarmer_context = AllocSetContextCreate(TopMemoryContext,
+												 "AutoPreWarmer",
+												 ALLOCSET_DEFAULT_MINSIZE,
+												 ALLOCSET_DEFAULT_INITSIZE,
+												 ALLOCSET_DEFAULT_MAXSIZE);
+	MemoryContextSwitchTo(autoprewarmer_context);
+
+
+	/*
+	 * If an exception is encountered, processing resumes here.
+	 * See notes in postgres.c about the design of this coding.
+	 */
+	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+	{
+		/* Since not using PG_TRY, must reset error stack by hand */
+		error_context_stack = NULL;
+
+		/* Prevent interrupts while cleaning up */
+		HOLD_INTERRUPTS();
+
+		/* Report the error to the server log */
+		EmitErrorReport();
+
+		LWLockReleaseAll();
+		AbortBufferIO();
+		UnlockBuffers();
+
+		/* buffer pins are released here. */
+		ResourceOwnerRelease(CurrentResourceOwner,
+							 RESOURCE_RELEASE_BEFORE_LOCKS,
+							 false, true);
+		AtEOXact_Buffers(false);
+		AtEOXact_SMgr();
+
+		MemoryContextSwitchTo(autoprewarmer_context);
+		FlushErrorState();
+
+		/* Flush any leaked data in the top-level context */
+		MemoryContextResetAndDeleteChildren(autoprewarmer_context);
+
+		/* Now we can allow interrupts again */
+		RESUME_INTERRUPTS();
+
+		/* Close all open files after any error. */
+		smgrcloseall();
+
+		/* Error while dumping is treated as fatal hence do proc_exit */
+		if (avoid_dumping)
+			proc_exit(0);
+	}
+
+	/* We can now handle ereport(ERROR) */
+	PG_exception_stack = &local_sigjmp_buf;
+
+	/* We're now ready to receive signals */
+	BackgroundWorkerUnblockSignals();
+	if (!avoid_loading)
+		load_now();
+	while (!got_sigterm)
+	{
+		int rc;
+		ResetLatch(&MyProc->procLatch);
+		rc = WaitLatch(&MyProc->procLatch,
+						WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+						10 * 1000L, PG_WAIT_EXTENSION);
+
+		if (rc & WL_POSTMASTER_DEATH)
+			proc_exit(1);
+	}
+
+	if (!avoid_dumping)
+		dump_now();
+}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 2b63cd3..f319953 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -693,6 +693,20 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * ReadBufferForPrewarm -- This new interface is for pg_autoprewarm.
+ */
+Buffer
+ReadBufferForPrewarm(SMgrRelation smgr, char relpersistence,
+					 ForkNumber forkNum, BlockNumber blockNum,
+					 ReadBufferMode mode, BufferAccessStrategy strategy)
+{
+	bool        hit;
+
+	return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum,
+							 mode, strategy, &hit);
+}
+
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 88b90dc..da66094 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -169,6 +169,19 @@ ClockSweepTick(void)
 }
 
 /*
+ * have_free_buffer -- This function check whether there is a free buffer in
+ * buffer pool. Used by pg_autoprewarm module.
+ */
+bool
+have_free_buffer()
+{
+	if	(StrategyControl->firstFreeBuffer >= 0)
+		return true;
+	else
+		return false;
+}
+
+/*
  * StrategyGetBuffer
  *
  *	Called by the bufmgr to get the next candidate buffer to use in
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index e0dfb2f..a156bf2 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -316,6 +316,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
+extern bool have_free_buffer(void);
 
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 7b6ba96..2c70914 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -16,6 +16,7 @@
 
 #include "storage/block.h"
 #include "storage/buf.h"
+#include "storage/smgr.h"
 #include "storage/bufpage.h"
 #include "storage/relfilenode.h"
 #include "utils/relcache.h"
@@ -183,6 +184,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer ReadBufferForPrewarm(SMgrRelation smgr, char relpersistence,
+								   ForkNumber forkNum, BlockNumber blockNum,
+								   ReadBufferMode mode,
+								   BufferAccessStrategy strategy);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
