From 1cabc553ea4a7e185fd83f1ab081521820fe6229 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 14:20:18 +0100
Subject: [PATCH v25 4/6] Introduce pg_shmem_allocations_numa view

Introduce new pg_shmem_alloctions_numa view with information about how
shared memory is distributed across NUMA nodes.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 doc/src/sgml/system-views.sgml           |  79 ++++++++++++
 src/backend/catalog/system_views.sql     |   8 ++
 src/backend/storage/ipc/shmem.c          | 152 +++++++++++++++++++++++
 src/include/catalog/pg_proc.dat          |   8 ++
 src/test/regress/expected/numa.out       |  12 ++
 src/test/regress/expected/numa_1.out     |   3 +
 src/test/regress/expected/privileges.out |  16 ++-
 src/test/regress/expected/rules.out      |   4 +
 src/test/regress/parallel_schedule       |   2 +-
 src/test/regress/sql/numa.sql            |   9 ++
 src/test/regress/sql/privileges.sql      |   6 +-
 11 files changed, 294 insertions(+), 5 deletions(-)
 create mode 100644 src/test/regress/expected/numa.out
 create mode 100644 src/test/regress/expected/numa_1.out
 create mode 100644 src/test/regress/sql/numa.sql

diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 4f336ee0adf..a83365ae24a 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -181,6 +181,11 @@
       <entry>shared memory allocations</entry>
      </row>
 
+     <row>
+      <entry><link linkend="view-pg-shmem-allocations-numa"><structname>pg_shmem_allocations_numa</structname></link></entry>
+      <entry>NUMA node mappings for shared memory allocations</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
       <entry>planner statistics</entry>
@@ -4051,6 +4056,80 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
   </para>
  </sect1>
 
+ <sect1 id="view-pg-shmem-allocations-numa">
+  <title><structname>pg_shmem_allocations_numa</structname></title>
+
+  <indexterm zone="view-pg-shmem-allocations-numa">
+   <primary>pg_shmem_allocations_numa</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_shmem_allocations_numa</structname> shows how shared
+   memory allocations in the server's main shared memory segment are distributed
+   across NUMA nodes. This includes both memory allocated by
+   <productname>PostgreSQL</productname> itself and memory allocated
+   by extensions using the mechanisms detailed in
+   <xref linkend="xfunc-shared-addin" />.
+  </para>
+
+  <para>
+   Note that this view does not include memory allocated using the dynamic
+   shared memory infrastructure.
+  </para>
+
+  <table>
+   <title><structname>pg_shmem_allocations_numa</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>name</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory allocation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>node_id</structfield> <type>int4</type>
+      </para>
+      <para>
+      ID of <acronym>NUMA</acronym> node
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>size</structfield> <type>int4</type>
+      </para>
+      <para>
+       Size of the allocation on this particular NUMA memory node in bytes
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   By default, the <structname>pg_shmem_allocations_numa</structname> view can be
+   read only by superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
  <sect1 id="view-pg-stats">
   <title><structname>pg_stats</structname></title>
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 273008db37f..08f780a2e63 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
 
+CREATE VIEW pg_shmem_allocations_numa AS
+    SELECT * FROM pg_get_shmem_allocations_numa();
+
+REVOKE ALL ON pg_shmem_allocations_numa FROM PUBLIC;
+GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats;
+
 CREATE VIEW pg_backend_memory_contexts AS
     SELECT * FROM pg_get_backend_memory_contexts();
 
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..5d979423bd9 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -68,6 +68,7 @@
 #include "fmgr.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "port/pg_numa.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t    *ShmemLock;			/* spinlock for shared memory and LWLock
 
 static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
 
+/* To get reliable results for NUMA inquiry we need to "touch pages" once */
+static bool firstNumaTouch = true;
 
 /*
  *	InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,152 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
 	return (Datum) 0;
 }
+
+/* SQL SRF showing NUMA memory nodes for allocated shared memory */
+Datum
+pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
+	 * the OS may have different memory page sizes.
+	 *
+	 * To correctly map between them, we need to: 1. Determine the OS memory
+	 * page size 2. Calculate how many OS pages are used by all buffer blocks
+	 * 3. Calculate how many OS pages are contained within each database
+	 * block.
+	 *
+	 * This information is needed before calling move_pages() for NUMA memory
+	 * node inquiry.
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate memory for page pointers and status based on total shared
+	 * memory size. This simplified approach allocates enough space for all
+	 * pages in shared memory rather than calculating the exact requirements
+	 * for each segment.
+	 *
+	 * XXX Isn't this wasteful? But there probably is one large segment of
+	 * shared memory, much larger than the rest anyway.
+	 */
+	shm_total_page_count = ShmemSegHdr->totalsize / os_page_size;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+
+		/* XXX I assume we use TYPEALIGN as a way to round to whole pages.
+		 * It's a bit misleading to call that "aligned", no? */
+
+		/* Get number of OS aligned pages */
+		shm_ent_page_count
+			= TYPEALIGN(os_page_size, ent->allocated_size) / os_page_size;
+
+		/*
+		 * If we get ever 0xff back from kernel inquiry, then we probably have
+		 * bug in our buffers to OS page mapping code here.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		/*
+		 * Setup page_ptrs[] with pointers to all OS pages for this segment,
+		 * and get the NUMA status using pg_numa_query_pages.
+		 *
+		 * In order to get reliable results we also need to touch memory
+		 * pages, so that inquiry about NUMA memory node doesn't return -2
+		 * (ENOENT, which indicates unmapped/unallocated pages).
+		 */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = (char *) ent->location + (i * os_page_size);
+
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count number of NUMA nodes used for this shared memory entry */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s < 0 || s > max_nodes)
+			{
+				elog(ERROR, "invalid NUMA node id outside of allowed range "
+							"[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+			}
+
+			nodes[s]++;
+		}
+
+		/*
+		 * Add one entry for each NUMA node, including those without allocated
+		 * memory for this segment.
+		 */
+		for (i = 0; i <= max_nodes; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = i;
+			values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	/*
+	 * We are ignoring the following memory regions (as compared to
+	 * pg_get_shmem_allocations()): 1. output shared memory allocated but not
+	 * counted via the shmem index 2. output as-of-yet unused shared memory.
+	 *
+	 * XXX Not quite sure why this is at the end, and what "output memory"
+	 * refers to.
+	 */
+
+	LWLockRelease(ShmemIndexLock);
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index dfc59ea0cc8..a93075c675c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8546,6 +8546,14 @@
   proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool',
   proargtypes => '', prosrc => 'pg_numa_available' },
 
+# shared memory usage with NUMA info
+{ oid => '9686', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_allocations_numa', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,node_id,size}',
+  prosrc => 'pg_get_shmem_allocations_numa' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644
index 00000000000..668172f7d79
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,12 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
+ ok 
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644
index 00000000000..6dd6824b4e4
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
index 1fddb13b6ae..c25062c288f 100644
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3219,8 +3219,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 -- clean up
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
 -- switch to superuser
 \c -
 CREATE ROLE regress_readallstats;
@@ -3242,6 +3242,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  f
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
 GRANT pg_read_all_stats TO regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
  has_table_privilege 
@@ -3261,6 +3267,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  t
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
 SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 673c63b8d1b..abfdc97abc5 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name,
     size,
     allocated_size
    FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_allocations_numa| SELECT name,
+    node_id,
+    size
+   FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, node_id, size);
 pg_stat_activity| SELECT s.datid,
     d.datname,
     s.pid,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 0a35f2f8f6a..0f38caa0d24 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
 # The stats test resets stats, so nothing else needing stats access can be in
 # this group.
 # ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
 
 # event_trigger depends on create_am and cannot run concurrently with
 # any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql
new file mode 100644
index 00000000000..034098783fb
--- /dev/null
+++ b/src/test/regress/sql/numa.sql
@@ -0,0 +1,9 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa;
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
index 85d7280f35f..f337aa67c13 100644
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@@ -1947,8 +1947,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
 
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_allocations_numa and pg_backend_memory_contexts.
 
 -- switch to superuser
 \c -
@@ -1958,12 +1958,14 @@ CREATE ROLE regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no
 
 GRANT pg_read_all_stats TO regress_readallstats;
 
 SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes
 
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
-- 
2.39.5

