From 189be69e5a256bf966ab7f883f6635f377e0a6bc Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 14:20:18 +0100
Subject: [PATCH v10 3/3] Add pg_shmem_numa_allocations to show NUMA zones for
 shared memory allocations

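The new view reports, for each named allocation in the server's main shared
memory segment, how many bytes currently reside on each NUMA node, as
obtained through the libnuma wrappers (move_pages()).  To get reliable
results, each backend touches the relevant pages once before querying their
status.

A typical use is to check how the largest allocations are spread across
nodes, for example (illustrative query only; the names and sizes reported
depend on the configuration and the NUMA topology of the machine):

    SELECT name, numa_zone_id, numa_size
      FROM pg_shmem_numa_allocations
     ORDER BY numa_size DESC;
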
Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 doc/src/sgml/system-views.sgml           |  78 ++++++++++++++
 src/backend/catalog/system_views.sql     |   8 ++
 src/backend/storage/ipc/shmem.c          | 123 +++++++++++++++++++++++
 src/include/catalog/pg_proc.dat          |   8 ++
 src/test/regress/expected/numa.out       |  12 +++
 src/test/regress/expected/numa_1.out     |   3 +
 src/test/regress/expected/privileges.out |  16 ++-
 src/test/regress/expected/rules.out      |   4 +
 src/test/regress/parallel_schedule       |   2 +-
 src/test/regress/sql/numa.sql            |  10 ++
 src/test/regress/sql/privileges.sql      |   6 +-
 11 files changed, 265 insertions(+), 5 deletions(-)
 create mode 100644 src/test/regress/expected/numa.out
 create mode 100644 src/test/regress/expected/numa_1.out
 create mode 100644 src/test/regress/sql/numa.sql

diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 3f5a306247e..5164083131a 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -176,6 +176,11 @@
       <entry>shared memory allocations</entry>
      </row>
 
+     <row>
+      <entry><link linkend="view-pg-shmem-numa-allocations"><structname>pg_shmem_numa_allocations</structname></link></entry>
+      <entry>NUMA mappings for shared memory allocations</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-stats"><structname>pg_stats</structname></link></entry>
       <entry>planner statistics</entry>
@@ -3746,6 +3751,79 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
   </para>
  </sect1>
 
+ <sect1 id="view-pg-shmem-numa-allocations">
+  <title><structname>pg_shmem_numa_allocations</structname></title>
+
+  <indexterm zone="view-pg-shmem-numa-allocations">
+   <primary>pg_shmem_numa_allocations</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_shmem_numa_allocations</structname> view shows how
+   allocations made from the server's main shared memory segment are
+   distributed across NUMA nodes.
+   This includes both memory allocated by <productname>PostgreSQL</productname>
+   itself and memory allocated by extensions using the mechanisms detailed in
+   <xref linkend="xfunc-shared-addin" />.
+  </para>
+
+  <para>
+   Note that this view does not include memory allocated using the dynamic
+   shared memory infrastructure.
+  </para>
+
+  <table>
+   <title><structname>pg_shmem_numa_allocations</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>name</structfield> <type>text</type>
+      </para>
+      <para>
+       The name of the shared memory allocation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_zone_id</structfield> <type>int4</type>
+      </para>
+      <para>
+       ID of the NUMA node.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>numa_size</structfield> <type>int8</type>
+      </para>
+      <para>
+       Size of the allocation on this particular NUMA node, in bytes.
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
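+  <para>
+   For example, the following query shows how the individual allocations are
+   spread across NUMA nodes; the names and sizes reported depend on the
+   server configuration and on the NUMA topology of the machine:
+<programlisting>
+SELECT * FROM pg_shmem_numa_allocations ORDER BY numa_size DESC;
+</programlisting>
+  </para>
+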
+  <para>
+   By default, the <structname>pg_shmem_numa_allocations</structname> view can be
+   read only by superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
  <sect1 id="view-pg-stats">
   <title><structname>pg_stats</structname></title>
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index a4d2cfdcaf5..cc014a62dc2 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
 
+CREATE VIEW pg_shmem_numa_allocations AS
+    SELECT * FROM pg_get_shmem_numa_allocations();
+
+REVOKE ALL ON pg_shmem_numa_allocations FROM PUBLIC;
+GRANT SELECT ON pg_shmem_numa_allocations TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() TO pg_read_all_stats;
+
 CREATE VIEW pg_backend_memory_contexts AS
     SELECT * FROM pg_get_backend_memory_contexts();
 
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 4c9c3cb320f..61e603fc42a 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -579,3 +579,126 @@ pg_numa_available(PG_FUNCTION_ARGS)
 	PG_RETURN_BOOL(true);
 }
 
+/* SQL SRF showing NUMA zones for allocated shared memory */
+Datum
+pg_get_shmem_numa_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	int			shm_total_page_count,
+				shm_ent_page_count,
+				max_zones;
+	/* To get reliable results we need to touch the pages once per backend */
+	static bool firstUseInBackend = true;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	if (pg_numa_init() == -1)
+	{
+		elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this platform; some NUMA data might be unavailable");
+		return (Datum) 0;
+	}
+	max_zones = pg_numa_get_max_node();
+
+	/*
+	 * This is for gathering some NUMA statistics.  We might be using various
+	 * DB block sizes (4kB, 8kB, ..., 32kB) that end up being backed by OS
+	 * memory pages of a different size, so we need to determine the OS memory
+	 * page size before calling move_pages().
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Preallocate the work arrays all at once, without working out which
+	 * shared memory entry is the biggest (technically the minimum
+	 * shared_buffers can be as low as 16 x BLCKSZ).
+	 */
+	shm_total_page_count = ShmemSegHdr->totalsize / os_page_size;
+	page_ptrs = palloc(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+	memset(page_ptrs, 0, sizeof(void *) * shm_total_page_count);
+
+	if (firstUseInBackend)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+		Size		*zones;
+
+		shm_ent_page_count = ent->allocated_size / os_page_size;
+		/* It is always at least 1 page */
+		shm_ent_page_count = shm_ent_page_count == 0 ? 1 : shm_ent_page_count;
+
+		/*
+		 * If we ever get 0xff back from the kernel inquiry, then we probably
+		 * have a bug in our buffer-to-OS-page mapping code here.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			/*
+			 * In order to get reliable results we also need to touch the
+			 * memory pages, so that the NUMA zone inquiry doesn't return -2
+			 * (ENOENT).
+			 */
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = (char *) ent->location + (i * os_page_size);
+			if (firstUseInBackend)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry: %m");
+
+		zones = palloc0(sizeof(Size) * (max_zones + 1));
+		/* Count the number of pages on each NUMA zone for this entry */
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			if (s >= 0)
+				zones[s]++;
+		}
+
+		for (i = 0; i <= max_zones; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = Int32GetDatum(i);
+			values[2] = Int64GetDatum(zones[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+
+		pfree(zones);
+	}
+
+	/*
+	 * XXX: Unlike pg_get_shmem_allocations(), this NUMA version does not
+	 * report the following regions: 1. shared memory allocated but not
+	 * tracked via the shmem index, and 2. as-of-yet unused shared memory.
+	 */
+
+	LWLockRelease(ShmemIndexLock);
+	firstUseInBackend = false;
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 38612d8ae12..7fd68702be9 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8493,6 +8493,14 @@
   proname => 'pg_numa_available', provolatile => 'v', prorettype => 'bool',
   proargtypes => '', prosrc => 'pg_numa_available' },
 
+# shared memory usage with NUMA info
+{ oid => '5101', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_numa_allocations', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_zone_id,numa_size}',
+  prosrc => 'pg_get_shmem_numa_allocations' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out
new file mode 100644
index 00000000000..fb882c5b771
--- /dev/null
+++ b/src/test/regress/expected/numa.out
@@ -0,0 +1,12 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- switch to superuser
+\c -
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations;
+ ok 
+----
+ t
+(1 row)
+
diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out
new file mode 100644
index 00000000000..6dd6824b4e4
--- /dev/null
+++ b/src/test/regress/expected/numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
index a76256405fe..38ff8bcabe8 100644
--- a/src/test/regress/expected/privileges.out
+++ b/src/test/regress/expected/privileges.out
@@ -3127,8 +3127,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 -- clean up
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_numa_allocations and pg_backend_memory_contexts.
 -- switch to superuser
 \c -
 CREATE ROLE regress_readallstats;
@@ -3144,6 +3144,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  f
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
 GRANT pg_read_all_stats TO regress_readallstats;
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
  has_table_privilege 
@@ -3157,6 +3163,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
  t
 (1 row)
 
+SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
 SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 62f69ac20b2..b63c6e0f744 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1740,6 +1740,10 @@ pg_shmem_allocations| SELECT name,
     size,
     allocated_size
    FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+pg_shmem_numa_allocations| SELECT name,
+    numa_zone_id,
+    numa_size
+   FROM pg_get_shmem_numa_allocations() pg_get_shmem_numa_allocations(name, numa_zone_id, numa_size);
 pg_stat_activity| SELECT s.datid,
     d.datname,
     s.pid,
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 37b6d21e1f9..c07a4c7633a 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -119,7 +119,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr
 # The stats test resets stats, so nothing else needing stats access can be in
 # this group.
 # ----------
-test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate
+test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa
 
 # event_trigger depends on create_am and cannot run concurrently with
 # any test that runs DDL
diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql
new file mode 100644
index 00000000000..e748434c2fe
--- /dev/null
+++ b/src/test/regress/sql/numa.sql
@@ -0,0 +1,10 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- switch to superuser
+\c -
+
+SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations;
+
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
index d195aaf1377..28261fd774b 100644
--- a/src/test/regress/sql/privileges.sql
+++ b/src/test/regress/sql/privileges.sql
@@ -1911,8 +1911,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user;
 DROP TABLE lock_table;
 DROP USER regress_locktable_user;
 
--- test to check privileges of system views pg_shmem_allocations and
--- pg_backend_memory_contexts.
+-- test to check privileges of system views pg_shmem_allocations,
+-- pg_shmem_numa_allocations and pg_backend_memory_contexts.
 
 -- switch to superuser
 \c -
@@ -1921,11 +1921,13 @@ CREATE ROLE regress_readallstats;
 
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no
 
 GRANT pg_read_all_stats TO regress_readallstats;
 
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
+SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes
 
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
-- 
2.39.5

