From df7f25e69e7e39d564e78783dab4d4fa4287dc4d Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 11:17:28 +0100
Subject: [PATCH v10 2/3] Extend pg_buffercache with new view
 pg_buffercache_numa to show NUMA zone for indvidual buffer.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 contrib/pg_buffercache/Makefile               |   3 +-
 .../expected/pg_buffercache_numa.out          |  28 ++
 .../expected/pg_buffercache_numa_1.out        |   3 +
 contrib/pg_buffercache/meson.build            |   2 +
 .../pg_buffercache--1.5--1.6.sql              |  42 ++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 462 +++++++++++++-----
 .../sql/pg_buffercache_numa.sql               |  21 +
 doc/src/sgml/pgbuffercache.sgml               |  64 ++-
 9 files changed, 493 insertions(+), 134 deletions(-)
 create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa.out
 create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
 create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_numa.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e5..2a33602537e 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,7 +8,8 @@ OBJS = \
 EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
-	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+	pg_buffercache--1.5--1.6.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
new file mode 100644
index 00000000000..d4de5ea52fc
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
@@ -0,0 +1,28 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ERROR:  permission denied for view pg_buffercache_numa
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
new file mode 100644
index 00000000000..6dd6824b4e4
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe48717..7cd039a1df9 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
   'pg_buffercache--1.2.sql',
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
+  'pg_buffercache--1.5--1.6.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
@@ -34,6 +35,7 @@ tests += {
   'regress': {
     'sql': [
       'pg_buffercache',
+      'pg_buffercache_numa',
     ],
   },
 }
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 00000000000..52f63aa258c
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,42 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_buffercache" to load this file. \quit
+
+-- Register the new function.
+DROP VIEW pg_buffercache;
+DROP FUNCTION pg_buffercache_pages();
+
+CREATE OR REPLACE FUNCTION pg_buffercache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE OR REPLACE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4);
+
+CREATE OR REPLACE VIEW pg_buffercache_numa AS
+	SELECT P.* FROM pg_buffercache_numa_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, numa_zone_id int4);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
+GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd..b030ba3a6fa 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 3ae0a018e10..9cc04320d63 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -11,12 +11,12 @@
 #include "access/htup_details.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "port/pg_numa.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
-
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
@@ -43,6 +43,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int32		numa_zone_id;
 } BufferCachePagesRec;
 
 
@@ -61,84 +62,249 @@ typedef struct
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
-Datum
-pg_buffercache_pages(PG_FUNCTION_ARGS)
+/*
+ * Helper routine to map Buffers into addresses that can be
+ * later consumed by pg_numa_query_pages()
+ *
+ * Many buffers can point to the same page (in case of
+ * BLCKSZ < 4kB), but we want to also query just first
+ * address.
+ *
+ * In order to get reliable results we also need to touch
+ * memory pages, so that inquiry about NUMA zone doesn't
+ * return -2.
+ */
+static inline void
+pg_buffercache_numa_prepare_ptrs(int buffer_id, float pages_per_blk, Size os_page_size,
+								 void **os_page_ptrs, bool firstUseInBackend)
+{
+	size_t blk2page = (size_t)(buffer_id * pages_per_blk);
+
+	for (size_t j = 0; j < pages_per_blk; j++)
+	{
+		size_t blk2pageoff = blk2page + j;
+		if (os_page_ptrs[blk2pageoff] == 0)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			/* NBuffers count start really from 1 */
+			os_page_ptrs[blk2pageoff] = (char *) BufferGetBlock(buffer_id + 1) + (os_page_size * j);
+
+			/* We just need to do it only once in backend */
+			if (firstUseInBackend)
+				pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2pageoff]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+	}
+}
+
+/*
+ * Helper routine for pg_buffercache_(numa_)pages.
+ *
+ * We need  fcinfo here and we pass it here with PG_FUNCTION_ARGS
+ */
+static BufferCachePagesContext *
+pg_buffercache_init_entries(FuncCallContext *funcctx, PG_FUNCTION_ARGS)
 {
-	FuncCallContext *funcctx;
-	Datum		result;
-	MemoryContext oldcontext;
 	BufferCachePagesContext *fctx;	/* User function context. */
+	MemoryContext oldcontext;
 	TupleDesc	tupledesc;
 	TupleDesc	expected_tupledesc;
-	HeapTuple	tuple;
 
-	if (SRF_IS_FIRSTCALL())
-	{
-		int			i;
+	/*
+	 * Switch context when allocating stuff to be used in later calls
+	 */
+	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
-		funcctx = SRF_FIRSTCALL_INIT();
+	/* Create a user function context for cross-call persistence */
+	fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
 
-		/* Switch context when allocating stuff to be used in later calls */
-		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+	/*
+	 * To smoothly support upgrades from version 1.0 of this extension
+	 * transparently handle the (non-)existence of the pinning_backends
+	 * column. We unfortunately have to get the result type for that... - we
+	 * can't use the result type determined by the function definition without
+	 * potentially crashing when somebody uses the old (or even wrong)
+	 * function definition though.
+	 */
+	if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
 
-		/* Create a user function context for cross-call persistence */
-		fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+	if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
+		expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
+		elog(ERROR, "incorrect number of output arguments");
+
+	/* Construct a tuple descriptor for the result rows. */
+	tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+					   INT4OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+					   OIDOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
+					   INT2OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
+					   BOOLOID, -1, 0);
+	TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
+					   INT2OID, -1, 0);
+
+	if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1)
+		TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
+						   INT4OID, -1, 0);
+	if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id",
+						   INT4OID, -1, 0);
+
+	fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+	/* Allocate NBuffers worth of BufferCachePagesRec records. */
+	fctx->record = (BufferCachePagesRec *)
+		MemoryContextAllocHuge(CurrentMemoryContext,
+							   sizeof(BufferCachePagesRec) * NBuffers);
+
+	/* Set max calls and remember the user function context. */
+	funcctx->max_calls = NBuffers;
+	funcctx->user_fctx = fctx;
+
+	/*
+	 * Return to original context when allocating transient memory
+	 */
+	MemoryContextSwitchTo(oldcontext);
+	return fctx;
+}
+
+/*
+ * Helper routine for pg_buffercache_(numa_)pages
+ */
+static void
+pg_buffercache_build_tuple(int i, BufferCachePagesContext *fctx)
+{
+	BufferDesc *bufHdr;
+	uint32		buf_state;
+
+	bufHdr = GetBufferDescriptor(i);
+	/* Lock each buffer header before inspecting. */
+	buf_state = LockBufHdr(bufHdr);
+
+	fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+	fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+	fctx->record[i].reltablespace = bufHdr->tag.spcOid;
+	fctx->record[i].reldatabase = bufHdr->tag.dbOid;
+	fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
+	fctx->record[i].blocknum = bufHdr->tag.blockNum;
+	fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+	fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+	if (buf_state & BM_DIRTY)
+		fctx->record[i].isdirty = true;
+	else
+		fctx->record[i].isdirty = false;
+
+	/*
+	 * Note if the buffer is valid, and has storage created
+	 */
+	if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+		fctx->record[i].isvalid = true;
+	else
+		fctx->record[i].isvalid = false;
+
+	fctx->record[i].numa_zone_id = -1;
+
+	UnlockBufHdr(bufHdr, buf_state);
+}
+
+/*
+ * Helper routine for pg_buffercache_(numa_)pages
+ */
+static Datum
+get_buffercache_tuple(int i, BufferCachePagesContext *fctx)
+{
+	Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
+	bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];
+	HeapTuple	tuple;
+
+	values[0] = Int32GetDatum(fctx->record[i].bufferid);
+	nulls[0] = false;
+
+	/*
+	 * Set all fields except the bufferid to null if the buffer is unused or
+	 * not valid.
+	 */
+	if (fctx->record[i].blocknum == InvalidBlockNumber ||
+		fctx->record[i].isvalid == false)
+	{
+		nulls[1] = true;
+		nulls[2] = true;
+		nulls[3] = true;
+		nulls[4] = true;
+		nulls[5] = true;
+		nulls[6] = true;
+		nulls[7] = true;
 
 		/*
-		 * To smoothly support upgrades from version 1.0 of this extension
-		 * transparently handle the (non-)existence of the pinning_backends
-		 * column. We unfortunately have to get the result type for that... -
-		 * we can't use the result type determined by the function definition
-		 * without potentially crashing when somebody uses the old (or even
-		 * wrong) function definition though.
+		 * unused for v1.0 callers, but the array is always long enough
 		 */
-		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
-			elog(ERROR, "return type must be a row type");
+		nulls[8] = true;
+		nulls[9] = true;
+	}
+	else
+	{
+		values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
+		nulls[1] = false;
+		values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
+		nulls[2] = false;
+		values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
+		nulls[3] = false;
+		values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
+		nulls[4] = false;
+		values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
+		nulls[5] = false;
+		values[6] = BoolGetDatum(fctx->record[i].isdirty);
+		nulls[6] = false;
+		values[7] = Int16GetDatum(fctx->record[i].usagecount);
+		nulls[7] = false;
 
-		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
-			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
-			elog(ERROR, "incorrect number of output arguments");
+		/*
+		 * unused for v1.0 callers, but the array is always long enough
+		 */
+		values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+		nulls[8] = false;
+		values[9] = Int32GetDatum(fctx->record[i].numa_zone_id);
+		nulls[9] = false;
+	}
 
-		/* Construct a tuple descriptor for the result rows. */
-		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
-						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
-						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
-						   INT2OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
-						   INT8OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
-						   BOOLOID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
-						   INT2OID, -1, 0);
-
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
-			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
-							   INT4OID, -1, 0);
-
-		fctx->tupdesc = BlessTupleDesc(tupledesc);
-
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
-		fctx->record = (BufferCachePagesRec *)
-			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
-
-		/* Return to original context when allocating transient memory */
-		MemoryContextSwitchTo(oldcontext);
+	/* Build and return the tuple. */
+	tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+	return HeapTupleGetDatum(tuple);
+}
+
+/*
+ * When updating this routine please sync it with below one:
+ * pg_buffercache_numa_pages()
+ */
+Datum
+pg_buffercache_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	BufferCachePagesContext *fctx;	/* User function context. */
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		fctx = pg_buffercache_init_entries(funcctx, fcinfo);
 
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
@@ -149,36 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 * locks, so the information of each buffer is self-consistent.
 		 */
 		for (i = 0; i < NBuffers; i++)
-		{
-			BufferDesc *bufHdr;
-			uint32		buf_state;
-
-			bufHdr = GetBufferDescriptor(i);
-			/* Lock each buffer header before inspecting. */
-			buf_state = LockBufHdr(bufHdr);
-
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
-
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
-
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
-
-			UnlockBufHdr(bufHdr, buf_state);
-		}
+			pg_buffercache_build_tuple(i, fctx);
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -188,59 +325,122 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (funcctx->call_cntr < funcctx->max_calls)
 	{
+		Datum		result;
 		uint32		i = funcctx->call_cntr;
-		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
-		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];
 
-		values[0] = Int32GetDatum(fctx->record[i].bufferid);
-		nulls[0] = false;
+		result = get_buffercache_tuple(i, fctx);
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+	{
+		SRF_RETURN_DONE(funcctx);
+	}
+}
+
+/*
+ * This is almost identical to the above, but performs
+ * NUMA inuqiry about memory mappings
+ */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	BufferCachePagesContext *fctx;	/* User function context. */
+	/*
+	 * To get reliable results we need to "touch pages" once, see
+	 * comments nearby pg_buffercache_numa_prepare_ptrs().
+	 */
+	static bool firstUseInBackend = true;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+		Size		os_page_size = 0;
+		void	  **os_page_ptrs = NULL;
+		int		   *os_pages_status = NULL;
+		int			os_page_count = 0;
+		float		pages_per_blk = 0;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		if (pg_numa_init() == -1)
+			elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable.");
+		fctx = pg_buffercache_init_entries(funcctx, fcinfo);
+
+		/*
+			* This is for gathering some NUMA statistics. We might be using
+			* various DB block sizes (4kB, 8kB , .. 32kB) that end up being
+			* allocated in various different OS memory pages sizes, so first
+			* we need to understand the OS memory page size before calling
+			* move_pages()
+			*/
+		os_page_size = pg_numa_get_pagesize();
+		os_page_count = ((uint64) NBuffers * BLCKSZ) / os_page_size;
+		pages_per_blk = (float) BLCKSZ / os_page_size;
+
+		elog(DEBUG1, "NUMA: os_page_count=%d os_page_size=%zu pages_per_blk=%f",
+				os_page_count, os_page_size, pages_per_blk);
+
+		os_page_ptrs = palloc(sizeof(void *) * os_page_count);
+		os_pages_status = palloc(sizeof(int) * os_page_count);
+		memset(os_page_ptrs, 0, sizeof(void *) * os_page_count);
+
+		/*
+			* If we ever get 0xff back from kernel inquiry, then we probably
+			* have bug in our buffers to OS page mapping code here
+			*/
+		memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
+		if (firstUseInBackend)
+			elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
 
 		/*
-		 * Set all fields except the bufferid to null if the buffer is unused
-		 * or not valid.
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
 		 */
-		if (fctx->record[i].blocknum == InvalidBlockNumber ||
-			fctx->record[i].isvalid == false)
+		for (i = 0; i < NBuffers; i++)
 		{
-			nulls[1] = true;
-			nulls[2] = true;
-			nulls[3] = true;
-			nulls[4] = true;
-			nulls[5] = true;
-			nulls[6] = true;
-			nulls[7] = true;
-			/* unused for v1.0 callers, but the array is always long enough */
-			nulls[8] = true;
+			pg_buffercache_build_tuple(i, fctx);
+			pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size, os_page_ptrs, firstUseInBackend);
 		}
-		else
+
+		if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry: %m");
+
+		for (i = 0; i < NBuffers; i++)
 		{
-			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
-			nulls[1] = false;
-			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
-			nulls[2] = false;
-			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
-			nulls[3] = false;
-			values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
-			nulls[4] = false;
-			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
-			nulls[5] = false;
-			values[6] = BoolGetDatum(fctx->record[i].isdirty);
-			nulls[6] = false;
-			values[7] = Int16GetDatum(fctx->record[i].usagecount);
-			nulls[7] = false;
-			/* unused for v1.0 callers, but the array is always long enough */
-			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
-			nulls[8] = false;
+			int			blk2page = (int) i * pages_per_blk;
+
+			/*
+				* Technically we can get errors too here and pass that to
+				* user. Also we could somehow report single DB block spanning
+				* more than one NUMA zone, but it should be rare.
+				*/
+			fctx->record[i].numa_zone_id = os_pages_status[blk2page];
 		}
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
 
-		/* Build and return the tuple. */
-		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
-		result = HeapTupleGetDatum(tuple);
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
 
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		Datum		result;
+		uint32		i = funcctx->call_cntr;
+
+		result = get_buffercache_tuple(i, fctx);
 		SRF_RETURN_NEXT(funcctx, result);
 	}
 	else
+	{
+		firstUseInBackend = false;
 		SRF_RETURN_DONE(funcctx);
+	}
 }
 
 Datum
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
new file mode 100644
index 00000000000..e2e8cd6a241
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
@@ -0,0 +1,21 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
+
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 802a5112d77..75978a6eaed 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -30,7 +30,9 @@
  <para>
   This module provides the <function>pg_buffercache_pages()</function>
   function (wrapped in the <structname>pg_buffercache</structname> view),
-  the <function>pg_buffercache_summary()</function> function, the
+  <function>pg_buffercache_numa_pages()</function> function (wrapped in the
+  <structname>pg_buffercache_numa</structname> view), the
+  <function>pg_buffercache_summary()</function> function, the
   <function>pg_buffercache_usage_counts()</function> function and
   the <function>pg_buffercache_evict()</function> function.
  </para>
@@ -42,6 +44,13 @@
   convenient use.
  </para>
 
+ <para>
+  The similiar <function>pg_buffercache_numa_pages()</function> is a slower
+  variant of the above, but also can provide NUMA node ID for shared buffer entry.
+  The <structname>pg_buffercache_numa</structname> view wraps the function for
+  convenient use.
+ </para>
+
  <para>
   The <function>pg_buffercache_summary()</function> function returns a single
   row summarizing the state of the shared buffer cache.
@@ -200,6 +209,59 @@
   </para>
  </sect2>
 
+ <sect2 id="pgbuffercache-pg-buffercache_numa">
+  <title>The <structname>pg_buffercache_numa</structname> View</title>
+
+  <para>
+   The definitions of the columns exposed are almost identical to the previous
+   <structname>pg_buffercache</structname> view, but this one includes one additional
+   column numa_zone_id as defined in <xref linkend="pgbuffercache-numa-columns"/>.
+  </para>
+
+  <table id="pgbuffercache-numa-columns">
+   <title><structname>pg_buffercache_numa</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_zone_id</structfield> <type>integer</type>
+      </para>
+      <para>
+       ID (number) of the NUMA node for this particular buffer. NULL if the shared buffer
+       has not been used yet.On systems without NUMA this usually returns 0.
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   This is clone version of the original pg_buffercache view, however it provides
+   additional <structfield>numa_zone_id</structfield> column. Fetching this
+   information from OS is costly and might take much longer and querying it is not
+   recommended by automated or monitoring systems.
+  </para>
+
+  <para>
+   As NUMA node ID inquiry for each page requires memory pages to be paged-in, first
+   execution of this function can take long time especially on systems with bigint
+   shared_buffers and without huge_pages enabled.
+  </para>
+
+ </sect2>
+
  <sect2 id="pgbuffercache-summary">
   <title>The <function>pg_buffercache_summary()</function> Function</title>
 
-- 
2.39.5

