Re: To Tomas Vondra
> This is acting up on Debian's 32-bit architectures, namely i386, armel
> and armhf:
... and x32 (x86_64 instruction set with 32-bit pointers). > SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; > +ERROR: invalid NUMA node id outside of allowed range [0, 0]: -14 > > -14 seems to be -EFAULT, and move_pages(2) says: > -EFAULT > This is a zero page or the memory area is not mapped by the > process. I did some debugging on i386 and made it print the page numbers: SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations_numa; +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 35 +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 36 ... +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 32768 +WARNING: invalid NUMA node id outside of allowed range [0, 0]: -14 for page 32769 So it works for the first few pages and then the rest is EFAULT. I think the pg_numa_touch_mem_if_required() hack might not be enough to force the pages to be allocated. Changing that to a memcpy() didn't help. Is there some optimization that zero pages aren't allocated until being written to? Why do we try to force the pages to be allocated at all? This is just a monitoring function, it should not change the actual system state. Why not just skip any page where the status is <0 ? The attached patch removes that logic. Regression tests pass, but we probably have to think about whether to report these negative numbers as-is or perhaps convert them to NULL. Christoph
From 189b70a625169687335606019f8d01aeb9a7e9fe Mon Sep 17 00:00:00 2001 From: Christoph Berg <m...@debian.org> Date: Mon, 23 Jun 2025 16:37:17 +0200 Subject: [PATCH] Don't force-allocate pages for pg_get_shmem_allocations_numa We tried to force all shared memory pages to be allocated on the first call to pg_get_shmem_allocations_numa (and pg_buffercache_numa_pages), but on 32-bit architectures that still left pages with status "EFAULT" behind. Instead of forcing the system into some state, just report that state from these functions. --- contrib/pg_buffercache/pg_buffercache_pages.c | 21 ---------------- src/backend/storage/ipc/shmem.c | 25 ++----------------- src/include/port/pg_numa.h | 16 ------------ 3 files changed, 2 insertions(+), 60 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 4b007f6e1b0..be927176f26 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -102,10 +102,6 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); -/* Only need to touch memory once per backend process lifetime */ -static bool firstNumaTouch = true; - - Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { @@ -294,10 +290,6 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * * We expect both sizes (for buffers and memory pages) to be a power-of-2, so * one is always a multiple of the other. - * - * In order to get reliable results we also need to touch memory pages, so - * that the inquiry about NUMA memory node doesn't return -2 (which indicates - * unmapped/unallocated pages). 
*/ Datum pg_buffercache_numa_pages(PG_FUNCTION_ARGS) @@ -320,7 +312,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) uint64 os_page_count; int pages_per_buffer; int max_entries; - volatile uint64 touch pg_attribute_unused(); char *startptr, *endptr; @@ -370,14 +361,8 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Fill pointers for all the memory pages. */ idx = 0; for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) - { os_page_ptrs[idx++] = ptr; - /* Only need to touch memory once per backend process lifetime */ - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, ptr); - } - Assert(idx == os_page_count); elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " @@ -438,9 +423,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); - if (firstNumaTouch) - elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); - /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -503,9 +485,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Set max calls and remember the user function context. 
*/ funcctx->max_calls = idx; funcctx->user_fctx = fctx; - - /* Remember this backend touched the pages */ - firstNumaTouch = false; } funcctx = SRF_PERCALL_SETUP(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76..1e02d97e61f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -90,9 +90,6 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ -/* To get reliable results for NUMA inquiry we need to "touch pages" once */ -static bool firstNumaTouch = true; - Datum pg_numa_available(PG_FUNCTION_ARGS); /* @@ -634,9 +631,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); pages_status = palloc(sizeof(int) * shm_total_page_count); - if (firstNumaTouch) - elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts"); - LWLockAcquire(ShmemIndexLock, LW_SHARED); hash_seq_init(&hstat, ShmemIndex); @@ -672,21 +666,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) /* * Setup page_ptrs[] with pointers to all OS pages for this segment, * and get the NUMA status using pg_numa_query_pages. - * - * In order to get reliable results we also need to touch memory - * pages, so that inquiry about NUMA memory node doesn't return -2 - * (ENOENT, which indicates unmapped/unallocated pages). 
*/ for (i = 0; i < shm_ent_page_count; i++) { volatile uint64 touch pg_attribute_unused(); page_ptrs[i] = startptr + (i * os_page_size); - - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); - - CHECK_FOR_INTERRUPTS(); } if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1) @@ -700,13 +685,8 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) int s = pages_status[i]; /* Ensure we are adding only valid index to the array */ - if (s < 0 || s > max_nodes) - { - elog(ERROR, "invalid NUMA node id outside of allowed range " - "[0, " UINT64_FORMAT "]: %d", max_nodes, s); - } - - nodes[s]++; + if (s >= 0 && s <= max_nodes) + nodes[s]++; } /* @@ -725,7 +705,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) } LWLockRelease(ShmemIndexLock); - firstNumaTouch = false; return (Datum) 0; } diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 40f1d324dcf..bebc4203184 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -18,20 +18,4 @@ extern PGDLLIMPORT int pg_numa_init(void); extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); extern PGDLLIMPORT int pg_numa_get_max_node(void); -#ifdef USE_LIBNUMA - -/* - * This is required on Linux, before pg_numa_query_pages() as we - * need to page-fault before move_pages(2) syscall returns valid results. - */ -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ - ro_volatile_var = *(volatile uint64 *) ptr - -#else - -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ - do {} while(0) - -#endif - #endif /* PG_NUMA_H */ -- 2.47.2