From 84dcc240c51c568a192da8d11a49144b027a87de Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 11:17:28 +0100
Subject: [PATCH v7b 2/3] Extend pg_buffercache with new view
 pg_buffercache_numa to show NUMA zone for indvidual buffer.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Co-authored-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 contrib/pg_buffercache/Makefile               |   3 +-
 .../expected/pg_buffercache.out               |  26 +
 contrib/pg_buffercache/meson.build            |   1 +
 .../pg_buffercache--1.5--1.6.sql              |  42 ++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 470 +++++++++++++-----
 contrib/pg_buffercache/sql/pg_buffercache.sql |  16 +
 src/backend/utils/misc/guc_tables.c           |   2 +-
 src/include/storage/pg_shmem.h                |   1 +
 9 files changed, 430 insertions(+), 133 deletions(-)
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e..2a33602537 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,7 +8,8 @@ OBJS = \
 EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
-	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+	pg_buffercache--1.5--1.6.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index b745dc69ea..4da569c20a 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,18 @@ from pg_buffercache;
  t
 (1 row)
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET client_min_messages;
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -34,6 +46,11 @@ SELECT * FROM pg_buffercache_summary();
 ERROR:  permission denied for function pg_buffercache_summary
 SELECT * FROM pg_buffercache_usage_counts();
 ERROR:  permission denied for function pg_buffercache_usage_counts
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT * FROM pg_buffercache_numa;
+ERROR:  permission denied for view pg_buffercache_numa
+RESET client_min_messages;
 RESET role;
 -- Check that pg_monitor is allowed to query view / function
 SET ROLE pg_monitor;
@@ -55,3 +72,12 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
  t
 (1 row)
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET client_min_messages;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe4871..9b2e939341 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
   'pg_buffercache--1.2.sql',
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
+  'pg_buffercache--1.5--1.6.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 0000000000..52f63aa258
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,42 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_buffercache" to load this file. \quit
+
+-- Register the new function.
+DROP VIEW pg_buffercache;
+DROP FUNCTION pg_buffercache_pages();
+
+CREATE OR REPLACE FUNCTION pg_buffercache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE OR REPLACE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4);
+
+CREATE OR REPLACE VIEW pg_buffercache_numa AS
+	SELECT P.* FROM pg_buffercache_numa_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, numa_zone_id int4);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
+GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77d..b030ba3a6f 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 3ae0a018e1..96b14f9ed4 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -11,12 +11,14 @@
 #include "access/htup_details.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "port/pg_numa.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/pg_shmem.h"
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
@@ -43,6 +45,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int32		numa_zone_id;
 } BufferCachePagesRec;
 
 
@@ -61,84 +64,250 @@ typedef struct
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
-Datum
-pg_buffercache_pages(PG_FUNCTION_ARGS)
+/*
+ * Helper routine to map Buffers into addresses that can be
+ * later consumed by pg_numa_query_pages()
+ *
+ * Many buffers can point to the same page (in case of
+ * BLCKSZ < 4kB), but we want to also query just first
+ * address.
+ *
+ * In order to get reliable results we also need to touch
+ * memory pages, so that inquiry about NUMA zone doesn't
+ * return -2.
+ */
+static inline void
+pg_buffercache_numa_prepare_ptrs(int i, float pages_per_blk, Size os_page_size,
+								 void **os_page_ptrs, bool firstUseInBackend)
+{
+	int			j = 0,
+				blk2page = (int) i * pages_per_blk;
+
+	do
+	{
+		if (os_page_ptrs[blk2page + j] == 0)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			/* NBuffers count start really from 1 */
+			os_page_ptrs[blk2page + j] = (char *) BufferGetBlock(i + 1) + (os_page_size * j);
+
+			/* We just need to do it only once in backend */
+			if (firstUseInBackend == true)
+				pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2page + j]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+		j++;
+	} while (j < (int) pages_per_blk);
+}
+
+/*
+ * Helper routine for pg_buffercache_(numa_)pages.
+ *
+ * We need  fcinfo here and we pass it here with PG_FUNCTION_ARGS
+ */
+static BufferCachePagesContext *
+init_buffercache_entries(FuncCallContext *funcctx, PG_FUNCTION_ARGS)
 {
-	FuncCallContext *funcctx;
-	Datum		result;
-	MemoryContext oldcontext;
 	BufferCachePagesContext *fctx;	/* User function context. */
+	MemoryContext oldcontext;
 	TupleDesc	tupledesc;
 	TupleDesc	expected_tupledesc;
-	HeapTuple	tuple;
 
-	if (SRF_IS_FIRSTCALL())
-	{
-		int			i;
+	/*
+	 * Switch context when allocating stuff to be used in later calls
+	 */
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
-		funcctx = SRF_FIRSTCALL_INIT();
+	/* Create a user function context for cross-call persistence */
+	fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+
+	/*
+	 * To smoothly support upgrades from version 1.0 of this extension
+	 * transparently handle the (non-)existence of the pinning_backends
+	 * column. We unfortunately have to get the result type for that... - we
+	 * can't use the result type determined by the function definition without
+	 * potentially crashing when somebody uses the old (or even wrong)
+	 * function definition though.
+	 */
+	if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
+		expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
+		elog(ERROR, "incorrect number of output arguments");
+
+	/* Construct a tuple descriptor for the result rows. */
+	tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
+					   INT2OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
+					   BOOLOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
+					   INT2OID, -1, 0);
+
+	if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1)
+		TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
+						   INT4OID, -1, 0);
+	if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id",
+						   INT4OID, -1, 0);
+
+	fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+	/* Allocate NBuffers worth of BufferCachePagesRec records. */
+	fctx->record = (BufferCachePagesRec *)
+		MemoryContextAllocHuge(CurrentMemoryContext,
+							   sizeof(BufferCachePagesRec) * NBuffers);
+
+	/* Set max calls and remember the user function context. */
+	funcctx->max_calls = NBuffers;
+	funcctx->user_fctx = fctx;
+
+	/*
+	 * Return to original context when allocating transient memory
+	 */
+	MemoryContextSwitchTo(oldcontext);
+	return fctx;
+}
 
-		/* Switch context when allocating stuff to be used in later calls */
-		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+/*
+ * Helper routine for pg_buffercache_(numa_)pages
+ */
+static void
+populate_buffercache_entry(int i, BufferCachePagesContext *fctx)
+{
+	BufferDesc *bufHdr;
+	uint32		buf_state;
+
+	bufHdr = GetBufferDescriptor(i);
+	/* Lock each buffer header before inspecting. */
+	buf_state = LockBufHdr(bufHdr);
+
+	fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+	fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+	fctx->record[i].reltablespace = bufHdr->tag.spcOid;
+	fctx->record[i].reldatabase = bufHdr->tag.dbOid;
+	fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
+	fctx->record[i].blocknum = bufHdr->tag.blockNum;
+	fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+	fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+	if (buf_state & BM_DIRTY)
+		fctx->record[i].isdirty = true;
+	else
+		fctx->record[i].isdirty = false;
 
-		/* Create a user function context for cross-call persistence */
-		fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+	/*
+	 * Note if the buffer is valid, and has storage created
+	 */
+	if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+		fctx->record[i].isvalid = true;
+	else
+		fctx->record[i].isvalid = false;
+
+	fctx->record[i].numa_zone_id = -1;
+
+	UnlockBufHdr(bufHdr, buf_state);
+}
+
+/*
+ * Helper routine for pg_buffercache_(numa_)pages
+ */
+static Datum
+get_buffercache_tuple(int i, BufferCachePagesContext *fctx)
+{
+	Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
+	bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];
+	HeapTuple	tuple;
+
+	values[0] = Int32GetDatum(fctx->record[i].bufferid);
+	nulls[0] = false;
+
+	/*
+	 * Set all fields except the bufferid to null if the buffer is unused or
+	 * not valid.
+	 */
+	if (fctx->record[i].blocknum == InvalidBlockNumber ||
+		fctx->record[i].isvalid == false)
+	{
+		nulls[1] = true;
+		nulls[2] = true;
+		nulls[3] = true;
+		nulls[4] = true;
+		nulls[5] = true;
+		nulls[6] = true;
+		nulls[7] = true;
 
 		/*
-		 * To smoothly support upgrades from version 1.0 of this extension
-		 * transparently handle the (non-)existence of the pinning_backends
-		 * column. We unfortunately have to get the result type for that... -
-		 * we can't use the result type determined by the function definition
-		 * without potentially crashing when somebody uses the old (or even
-		 * wrong) function definition though.
+		 * unused for v1.0 callers, but the array is always long enough
 		 */
-		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
-			elog(ERROR, "return type must be a row type");
+		nulls[8] = true;
+		nulls[9] = true;
+	}
+	else
+	{
+		values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
+		nulls[1] = false;
+		values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
+		nulls[2] = false;
+		values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
+		nulls[3] = false;
+		values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
+		nulls[4] = false;
+		values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
+		nulls[5] = false;
+		values[6] = BoolGetDatum(fctx->record[i].isdirty);
+		nulls[6] = false;
+		values[7] = Int16GetDatum(fctx->record[i].usagecount);
+		nulls[7] = false;
 
-		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
-			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
-			elog(ERROR, "incorrect number of output arguments");
+		/*
+		 * unused for v1.0 callers, but the array is always long enough
+		 */
+		values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+		nulls[8] = false;
+		values[9] = Int32GetDatum(fctx->record[i].numa_zone_id);
+		nulls[9] = false;
+	}
 
-		/* Construct a tuple descriptor for the result rows. */
-		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
-						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
-						   INT2OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
-						   INT8OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
-						   BOOLOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
-						   INT2OID, -1, 0);
-
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
-			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
-							   INT4OID, -1, 0);
-
-		fctx->tupdesc = BlessTupleDesc(tupledesc);
-
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
-		fctx->record = (BufferCachePagesRec *)
-			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
-
-		/* Return to original context when allocating transient memory */
-		MemoryContextSwitchTo(oldcontext);
+	/* Build and return the tuple. */
+	tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+	return HeapTupleGetDatum(tuple);
+}
+
+/*
+ * When updating this routine please sync it with below one:
+ * pg_buffercache_numa_pages()
+ */
+Datum
+pg_buffercache_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	BufferCachePagesContext *fctx;	/* User function context. */
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		fctx = init_buffercache_entries(funcctx, fcinfo);
 
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
@@ -149,36 +318,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 * locks, so the information of each buffer is self-consistent.
 		 */
 		for (i = 0; i < NBuffers; i++)
-		{
-			BufferDesc *bufHdr;
-			uint32		buf_state;
-
-			bufHdr = GetBufferDescriptor(i);
-			/* Lock each buffer header before inspecting. */
-			buf_state = LockBufHdr(bufHdr);
-
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
-
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
-
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
-
-			UnlockBufHdr(bufHdr, buf_state);
-		}
+			populate_buffercache_entry(i, fctx);
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -188,59 +328,129 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (funcctx->call_cntr < funcctx->max_calls)
 	{
+		Datum		result;
 		uint32		i = funcctx->call_cntr;
-		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
-		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];
 
-		values[0] = Int32GetDatum(fctx->record[i].bufferid);
-		nulls[0] = false;
+		result = get_buffercache_tuple(i, fctx);
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+	{
+		SRF_RETURN_DONE(funcctx);
+	}
+}
+
+/*
+ * This is almost identical to the above, but performs
+ * NUMA inuqiry about memory mappings
+ */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	BufferCachePagesContext *fctx;	/* User function context. */
+	bool		query_numa = true;
+	static bool firstUseInBackend = true;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+		Size		os_page_size = 0;
+		void	  **os_page_ptrs = NULL;
+		int		   *os_pages_status = NULL;
+		int			os_page_count = 0;
+		float		pages_per_blk = 0;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		if (pg_numa_init() == -1)
+		{
+			elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable.");
+			query_numa = false;
+		}
+		fctx = init_buffercache_entries(funcctx, fcinfo);
+
+		if (query_numa)
+		{
+			/*
+			 * This is for gathering some NUMA statistics. We might be using
+			 * various DB block sizes (4kB, 8kB , .. 32kB) that end up being
+			 * allocated in various different OS memory pages sizes, so first
+			 * we need to understand the OS memory page size before calling
+			 * move_pages()
+			 */
+			os_page_size = pg_numa_get_pagesize();
+			os_page_count = ((uint64) NBuffers * BLCKSZ) / os_page_size;
+			pages_per_blk = (float) BLCKSZ / os_page_size;
+
+			elog(DEBUG1, "NUMA: os_page_count=%d os_page_size=%zu pages_per_blk=%f",
+				 os_page_count, os_page_size, pages_per_blk);
+
+			os_page_ptrs = palloc(sizeof(void *) * os_page_count);
+			os_pages_status = palloc(sizeof(int) * os_page_count);
+			memset(os_page_ptrs, 0, sizeof(void *) * os_page_count);
+
+			/*
+			 * If we ever get 0xff back from kernel inquiry, then we probably
+			 * have bug in our buffers to OS page mapping code here
+			 */
+			memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
+			if (firstUseInBackend == true)
+				elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
+		}
 
 		/*
-		 * Set all fields except the bufferid to null if the buffer is unused
-		 * or not valid.
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
 		 */
-		if (fctx->record[i].blocknum == InvalidBlockNumber ||
-			fctx->record[i].isvalid == false)
+		for (i = 0; i < NBuffers; i++)
 		{
-			nulls[1] = true;
-			nulls[2] = true;
-			nulls[3] = true;
-			nulls[4] = true;
-			nulls[5] = true;
-			nulls[6] = true;
-			nulls[7] = true;
-			/* unused for v1.0 callers, but the array is always long enough */
-			nulls[8] = true;
+			populate_buffercache_entry(i, fctx);
+			if (query_numa)
+				pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size, os_page_ptrs, firstUseInBackend);
 		}
-		else
+
+		if (query_numa)
 		{
-			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
-			nulls[1] = false;
-			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
-			nulls[2] = false;
-			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
-			nulls[3] = false;
-			values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
-			nulls[4] = false;
-			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
-			nulls[5] = false;
-			values[6] = BoolGetDatum(fctx->record[i].isdirty);
-			nulls[6] = false;
-			values[7] = Int16GetDatum(fctx->record[i].usagecount);
-			nulls[7] = false;
-			/* unused for v1.0 callers, but the array is always long enough */
-			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
-			nulls[8] = false;
+			if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1)
+				elog(ERROR, "failed NUMA pages inquiry: %m");
+
+			for (i = 0; i < NBuffers; i++)
+			{
+				int			blk2page = (int) i * pages_per_blk;
+
+				/*
+				 * Technically we can get errors too here and pass that to
+				 * user. Also we could somehow report single DB block spanning
+				 * more than one NUMA zone, but it should be rare.
+				 */
+				fctx->record[i].numa_zone_id = os_pages_status[blk2page];
+			}
 		}
+	}
 
-		/* Build and return the tuple. */
-		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
-		result = HeapTupleGetDatum(tuple);
+	funcctx = SRF_PERCALL_SETUP();
 
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		Datum		result;
+		uint32		i = funcctx->call_cntr;
+
+		result = get_buffercache_tuple(i, fctx);
 		SRF_RETURN_NEXT(funcctx, result);
 	}
 	else
+	{
+		firstUseInBackend = false;
 		SRF_RETURN_DONE(funcctx);
+	}
 }
 
 Datum
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 944fbb1bea..d982048425 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,14 @@ select count(*) = (select setting::bigint
                    where name = 'shared_buffers')
 from pg_buffercache;
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+RESET client_min_messages;
+
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -19,6 +27,10 @@ SELECT * FROM pg_buffercache;
 SELECT * FROM pg_buffercache_pages() AS p (wrong int);
 SELECT * FROM pg_buffercache_summary();
 SELECT * FROM pg_buffercache_usage_counts();
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT * FROM pg_buffercache_numa;
+RESET client_min_messages;
 RESET role;
 
 -- Check that pg_monitor is allowed to query view / function
@@ -26,3 +38,7 @@ SET ROLE pg_monitor;
 SELECT count(*) > 0 FROM pg_buffercache;
 SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
 SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET client_min_messages;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index ad25cbb39c..dd34c79f52 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -563,7 +563,7 @@ static int	ssl_renegotiation_limit;
  */
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
-static int	huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
 /*
  * These variables are all dummies that don't do anything, except in some
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index b99ebc9e86..5f7d4b83a6 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -45,6 +45,7 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
+extern PGDLLIMPORT int huge_pages_status;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
-- 
2.39.5

