Re: To Tomas Vondra
> This is acting up on Debian's 32-bit architectures, namely i386, armel
> and armhf:
... and x32 (x86_64 instruction set with 32-bit pointers). > SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; > +ERROR: invalid NUMA node id outside of allowed range [0, 0]: -14 > > -14 seems to be -EFAULT, and move_pages(2) says: > -EFAULT > This is a zero page or the memory area is not mapped by the > process. I did some debugging on i386 and made it print the page numbers: SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 35 +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 36 ... +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 32768 +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 32769 So it works for the first few pages and then the rest is EFAULT. I think the pg_numa_touch_mem_if_required() hack might not be enough to force the pages to be allocated. Changing that to a memcpy() didn't help. Is there some optimization that zero pages aren't allocated until being written to? Why do we try to force the pages to be allocated at all? This is just a monitoring function, it should not change the actual system state. Why not just skip any page where the status is <0 ? The attached patch removes that logic. Regression tests pass, but we probably have to think about whether to report these negative numbers as-is or perhaps convert them to NULL. Christoph
From 189b70a625169687335606019f8d01aeb9a7e9fe Mon Sep 17 00:00:00 2001 From: Christoph Berg <m...@debian.org> Date: Mon, 23 Jun 2025 16:37:17 +0200 Subject: [PATCH] Don't force-allocate pages for pg_get_shmem_allocations_numa We tried to force all shared memory pages to be allocated on the first call to pg_get_shmem_allocations_numa (and pg_buffercache_numa_pages), but on 32-bit architectures that still left pages with status "EFAULT" behind. Instead of forcing the system into some state, just report that state from these functions. --- contrib/pg_buffercache/pg_buffercache_pages.c | 21 ---------------- src/backend/storage/ipc/shmem.c | 25 ++----------------- src/include/port/pg_numa.h | 16 ------------ 3 files changed, 2 insertions(+), 60 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 4b007f6e1b0..be927176f26 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -102,10 +102,6 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); -/* Only need to touch memory once per backend process lifetime */ -static bool firstNumaTouch = true; - - Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { @@ -294,10 +290,6 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * * We expect both sizes (for buffers and memory pages) to be a power-of-2, so * one is always a multiple of the other. - * - * In order to get reliable results we also need to touch memory pages, so - * that the inquiry about NUMA memory node doesn't return -2 (which indicates - * unmapped/unallocated pages). 
*/ Datum pg_buffercache_numa_pages(PG_FUNCTION_ARGS) @@ -320,7 +312,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) uint64 os_page_count; int pages_per_buffer; int max_entries; - volatile uint64 touch pg_attribute_unused(); char *startptr, *endptr; @@ -370,14 +361,8 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Fill pointers for all the memory pages. */ idx = 0; for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) - { os_page_ptrs[idx++] = ptr; - /* Only need to touch memory once per backend process lifetime */ - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, ptr); - } - Assert(idx == os_page_count); elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " @@ -438,9 +423,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); - if (firstNumaTouch) - elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); - /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -503,9 +485,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Set max calls and remember the user function context. 
*/ funcctx->max_calls = idx; funcctx->user_fctx = fctx; - - /* Remember this backend touched the pages */ - firstNumaTouch = false; } funcctx = SRF_PERCALL_SETUP(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76..1e02d97e61f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -90,9 +90,6 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ -/* To get reliable results for NUMA inquiry we need to "touch pages" once */ -static bool firstNumaTouch = true; - Datum pg_numa_available(PG_FUNCTION_ARGS); /* @@ -634,9 +631,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); pages_status = palloc(sizeof(int) * shm_total_page_count); - if (firstNumaTouch) - elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts"); - LWLockAcquire(ShmemIndexLock, LW_SHARED); hash_seq_init(&hstat, ShmemIndex); @@ -672,21 +666,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) /* * Setup page_ptrs[] with pointers to all OS pages for this segment, * and get the NUMA status using pg_numa_query_pages. - * - * In order to get reliable results we also need to touch memory - * pages, so that inquiry about NUMA memory node doesn't return -2 - * (ENOENT, which indicates unmapped/unallocated pages). 
*/ for (i = 0; i < shm_ent_page_count; i++) { volatile uint64 touch pg_attribute_unused(); page_ptrs[i] = startptr + (i * os_page_size); - - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); - - CHECK_FOR_INTERRUPTS(); } if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) @@ -700,13 +685,8 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) int s = pages_status[i]; /* Ensure we are adding only valid index to the array */ - if (s < 0 || s > max_nodes) - { - elog(ERROR, "invalid NUMA node id outside of allowed range " - "[0, " UINT64_FORMAT "]: %d", max_nodes, s); - } - - nodes[s]++; + if (s >= 0 && s <= max_nodes) + nodes[s]++; } /* @@ -725,7 +705,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) } LWLockRelease(ShmemIndexLock); - firstNumaTouch = false; return (Datum) 0; } diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 40f1d324dcf..bebc4203184 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -18,20 +18,4 @@ extern PGDLLIMPORT int pg_numa_init(void); extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); extern PGDLLIMPORT int pg_numa_get_max_node(void); -#ifdef USE_LIBNUMA - -/* - * This is required on Linux, before pg_numa_query_pages() as we - * need to page-fault before move_pages(2) syscall returns valid results. - */ -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ - ro_volatile_var = *(volatile uint64 *) ptr - -#else - -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ - do {} while(0) - -#endif - #endif /* PG_NUMA_H */ -- 2.47.2