Re: [PATCH v3 07/26] mm: drop CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
On Thu, 1 Aug 2024 09:08:07 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > There are no users of HAVE_ARCH_NODEDATA_EXTENSION left, so > arch_alloc_nodedata() and arch_refresh_nodedata() are not needed > anymore. > > Replace the call to arch_alloc_nodedata() in free_area_init() with > memblock_alloc(), remove arch_refresh_nodedata() and cleanup > include/linux/memory_hotplug.h from the associated ifdefery. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Hi Mike, This has an accidental (I assume) functional change and if you have an initially offline node it all goes wrong. > --- > include/linux/memory_hotplug.h | 48 -- > mm/mm_init.c | 3 +-- > 2 files changed, 1 insertion(+), 50 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index ebe876930e78..b27ddce5d324 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -16,54 +16,6 @@ struct resource; > struct vmem_altmap; > struct dev_pagemap; > > -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION > -/* > - * For supporting node-hotadd, we have to allocate a new pgdat. > - * > - * If an arch has generic style NODE_DATA(), > - * node_data[nid] = kzalloc() works well. But it depends on the architecture. > - * > - * In general, generic_alloc_nodedata() is used. > - * > - */ > -extern pg_data_t *arch_alloc_nodedata(int nid); > -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); > - > -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ > - > -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) > - > -#ifdef CONFIG_NUMA > -/* > - * XXX: node aware allocation can't work well to get new node's memory at > this time. > - * Because, pgdat for the new node is not allocated/initialized yet itself. > - * To use new node's memory, more consideration will be necessary. > - */ > -#define generic_alloc_nodedata(nid) \ > -({ \ > - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);\ > -}) > - > -extern pg_data_t *node_data[]; > -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) > -{ > - node_data[nid] = pgdat; > -} > - > -#else /* !CONFIG_NUMA */ > - > -/* never called */ > -static inline pg_data_t *generic_alloc_nodedata(int nid) > -{ > - BUG(); > - return NULL; > -} > -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) > -{ > -} > -#endif /* CONFIG_NUMA */ > -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ > - > #ifdef CONFIG_MEMORY_HOTPLUG > struct page *pfn_to_online_page(unsigned long pfn); > > diff --git a/mm/mm_init.c b/mm/mm_init.c > index 75c3bd42799b..bcc2f2dd8021 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -1838,11 +1838,10 @@ void __init free_area_init(unsigned long > *max_zone_pfn) > > if (!node_online(nid)) { > /* Allocator not initialized yet */ > - pgdat = arch_alloc_nodedata(nid); > + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); > if (!pgdat) > panic("Cannot allocate %zuB for node %d.\n", > sizeof(*pgdat), nid); > - arch_refresh_nodedata(nid, pgdat); This allocates pgdat but never sets node_data[nid] to it and promptly leaks it on the line below. Just to sanity check this I spun up a qemu machine with no memory initially present on some nodes and it went boom as you'd expect. I tested with addition of NODE_DATA(nid) = pgdat; and it all seems to work as expected. Jonathan > } > > pgdat = NODE_DATA(nid);
Re: [PATCH v3 09/26] arch, mm: pull out allocation of NODE_DATA to generic code
On Thu, 1 Aug 2024 09:08:09 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Architectures that support NUMA duplicate the code that allocates > NODE_DATA on the node-local memory with slight variations in reporting > of the addresses where the memory was allocated. > > Use x86 version as the basis for the generic alloc_node_data() function > and call this function in architecture specific numa initialization. > > Round up node data size to SMP_CACHE_BYTES rather than to PAGE_SIZE like > x86 used to do since the bootmem era when allocation granularity was > PAGE_SIZE anyway. > > Signed-off-by: Mike Rapoport (Microsoft) > Acked-by: David Hildenbrand > Reviewed-by: Jonathan Cameron > Tested-by: Zi Yan # for x86_64 and arm64 One comment unrelated to this patch set as such, just made more obvious by it. > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c > index 0744a9a2944b..3c1da08304d0 100644 > --- a/arch/powerpc/mm/numa.c > +++ b/arch/powerpc/mm/numa.c > @@ -1093,27 +1093,9 @@ void __init dump_numa_cpu_topology(void) > static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) > { > u64 spanned_pages = end_pfn - start_pfn; Trivial, but might as well squash this local variable into the single place it's used. > - const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); > - u64 nd_pa; > - void *nd; > - int tnid; > - > - nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); > - if (!nd_pa) > - panic("Cannot allocate %zu bytes for node %d data\n", > - nd_size, nid); > - > - nd = __va(nd_pa); > - > - /* report and initialize */ > - pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", > - nd_pa, nd_pa + nd_size - 1); > - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); > - if (tnid != nid) > - pr_info("NODE_DATA(%d) on node %d\n", nid, tnid); > - > - node_data[nid] = nd; > - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); > + > + alloc_node_data(nid); > + > NODE_DATA(nid)->node_id = nid; > NODE_DATA(nid)->node_start_pfn = start_pfn; > NODE_DATA(nid)->node_spanned_pages = spanned_pages;
Re: [PATCH v3 10/26] x86/numa: simplify numa_distance allocation
On Thu, 1 Aug 2024 09:08:10 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Allocation of numa_distance uses memblock_phys_alloc_range() to limit > allocation to be below the last mapped page. > > But NUMA initializaition runs after the direct map is populated and > there is also code in setup_arch() that adjusts memblock limit to > reflect how much memory is already mapped in the direct map. > > Simplify the allocation of numa_distance and use plain memblock_alloc(). > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Seems sensible. FWIW (which might just be me not bothering to read this one again ;) Reviewed-by: Jonathan Cameron
Re: [PATCH v3 17/26] mm: introduce numa_memblks
On Thu, 1 Aug 2024 09:08:17 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Move code dealing with numa_memblks from arch/x86 to mm/ and add Kconfig > options to let x86 select it in its Kconfig. > > This code will be later reused by arch_numa. > > No functional changes. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron
Re: [PATCH v3 18/26] mm: move numa_distance and related code from x86 to numa_memblks
On Thu, 1 Aug 2024 09:08:18 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Move code dealing with numa_distance array from arch/x86 to > mm/numa_memblks.c > > This code will be later reused by arch_numa. > > No functional changes. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 As you say, simple code move and I'll cope with the confusion of stuff that isn't numa_memblk in that file given the convenience. Reviewed-by: Jonathan Cameron
Re: [PATCH v3 11/26] x86/numa: use get_pfn_range_for_nid to verify that node spans memory
On Thu, 1 Aug 2024 09:08:11 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Instead of looping over numa_meminfo array to detect node's start and > end addresses use get_pfn_range_for_init(). > > This is shorter and make it easier to lift numa_memblks to generic code. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Fair enough given code a few lines up has set the node for all the memblocks so this should I think give the same effective result. Reviewed-by: Jonathan Cameron > --- > arch/x86/mm/numa.c | 13 +++-- > 1 file changed, 3 insertions(+), 10 deletions(-) > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c > index edfc38803779..cfe7e5477cf8 100644 > --- a/arch/x86/mm/numa.c > +++ b/arch/x86/mm/numa.c > @@ -521,17 +521,10 @@ static int __init numa_register_memblks(struct > numa_meminfo *mi) > > /* Finally register nodes. */ > for_each_node_mask(nid, node_possible_map) { > - u64 start = PFN_PHYS(max_pfn); > - u64 end = 0; > + unsigned long start_pfn, end_pfn; > > - for (i = 0; i < mi->nr_blks; i++) { > - if (nid != mi->blk[i].nid) > - continue; > - start = min(mi->blk[i].start, start); > - end = max(mi->blk[i].end, end); > - } > - > - if (start >= end) > + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); > + if (start_pfn >= end_pfn) > continue; > > alloc_node_data(nid);
Re: [PATCH v3 20/26] mm: numa_memblks: introduce numa_memblks_init
On Thu, 1 Aug 2024 09:08:20 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Move most of x86::numa_init() to numa_memblks so that the latter will be > more self-contained. > > With this numa_memblk data structures should not be exposed to the > architecture specific code. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Just code motion as expected. Reviewed-by: Jonathan Cameron
Re: [PATCH v3 21/26] mm: numa_memblks: make several functions and variables static
On Thu, 1 Aug 2024 09:08:21 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Make functions and variables that are exclusively used by numa_memblks > static. > > Move numa_nodemask_from_meminfo() before its callers to avoid forward > declaration. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Good. I was wondering why some of this internal detail was in the header. Much nicer after this. Reviewed-by: Jonathan Cameron
Re: [PATCH v3 22/26] mm: numa_memblks: use memblock_{start,end}_of_DRAM() when sanitizing meminfo
On Thu, 1 Aug 2024 09:08:22 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > numa_cleanup_meminfo() moves blocks outside system RAM to > numa_reserved_meminfo and it uses 0 and PFN_PHYS(max_pfn) to determine > the memory boundaries. > > Replace the memory range boundaries with more portable > memblock_start_of_DRAM() and memblock_end_of_DRAM(). > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Makes sense Reviewed-by: Jonathan Cameron > --- > mm/numa_memblks.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c > index e97665a5e8ce..e4358ad92233 100644 > --- a/mm/numa_memblks.c > +++ b/mm/numa_memblks.c > @@ -212,8 +212,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) > */ > int __init numa_cleanup_meminfo(struct numa_meminfo *mi) > { > - const u64 low = 0; > - const u64 high = PFN_PHYS(max_pfn); > + const u64 low = memblock_start_of_DRAM(); > + const u64 high = memblock_end_of_DRAM(); > int i, j, k; > > /* first, trim all entries */
Re: [PATCH v3 19/26] mm: introduce numa_emulation
On Thu, 1 Aug 2024 09:08:19 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Move numa_emulation codfrom arch/x86 to mm/numa_emulation.c > > This code will be later reused by arch_numa. > > No functional changes. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 I ran some basic tests on ARM with this. Seems to do the job. Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Works on both ACPI and dsdt boots.
Re: [PATCH v3 23/26] of, numa: return -EINVAL when no numa-node-id is found
On Thu, 1 Aug 2024 09:08:23 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Currently of_numa_parse_memory_nodes() returns 0 if no "memory" node in > device tree contains "numa-node-id" property. This makes of_numa_init() > to return "success" despite no NUMA nodes were actually parsed and set > up. > > arch_numa workarounds this by returning an error if numa_nodes_parsed is > empty. > > numa_memblks however would WARN() in such case and since it will be used > by arch_numa shortly, such warning is not desirable. > > Make sure of_numa_init() returns -EINVAL when no NUMA node information > was found in the device tree. > > Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Jonathan Cameron
Re: [PATCH v3 24/26] arch_numa: switch over to numa_memblks
On Thu, 1 Aug 2024 09:08:24 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Until now arch_numa was directly translating firmware NUMA information > to memblock. > > Using numa_memblks as an intermediate step has a few advantages: > * alignment with more battle tested x86 implementation > * availability of NUMA emulation > * maintaining node information for not yet populated memory > > Replace current functionality related to numa_add_memblk() and > __node_distance() with the implementation based on numa_memblks and add > functions required by numa_emulation. > > Signed-off-by: Mike Rapoport (Microsoft) > Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron
Re: [PATCH v3 26/26] docs: move numa=fake description to kernel-parameters.txt
On Thu, 1 Aug 2024 09:08:26 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > NUMA emulation can be now enabled on arm64 and riscv in addition to x86. > > Move description of numa=fake parameters from x86 documentation of > admin-guide/kernel-parameters.txt > > Suggested-by: Zi Yan > Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Jonathan Cameron
Re: [PATCH v3 00/26] mm: introduce numa_memblks
On Thu, 1 Aug 2024 09:08:00 +0300 Mike Rapoport wrote: > From: "Mike Rapoport (Microsoft)" > > Hi, > > Following the discussion about handling of CXL fixed memory windows on > arm64 [1] I decided to bite the bullet and move numa_memblks from x86 to > the generic code so they will be available on arm64/riscv and maybe on > loongarch sometime later. > > While it could be possible to use memblock to describe CXL memory windows, > it currently lacks notion of unpopulated memory ranges and numa_memblks > does implement this. > > Another reason to make numa_memblks generic is that both arch_numa (arm64 > and riscv) and loongarch use trimmed copy of x86 code although there is no > fundamental reason why the same code cannot be used on all these platforms. > Having numa_memblks in mm/ will make it's interaction with ACPI and FDT > more consistent and I believe will reduce maintenance burden. > > And with generic numa_memblks it is (almost) straightforward to enable NUMA > emulation on arm64 and riscv. Tested-by: Jonathan Cameron #arm64 + CXL via QEMU With that one fix in patch 7. Feel free to figure out which patches actually got tested by that (or tag them all - I'll pretend I tested ip27 :) Jonathan
Re: [PATCH v13] mm: report per-page metadata information
++ nvdimm, linux-cxl, Yu Zhang On Wed, Jun 05, 2024 at 10:27:51PM +, Sourav Panda wrote: > Today, we do not have any observability of per-page metadata > and how much it takes away from the machine capacity. Thus, > we want to describe the amount of memory that is going towards > per-page metadata, which can vary depending on build > configuration, machine architecture, and system use. > > This patch adds 2 fields to /proc/vmstat that can used as shown > below: > > Accounting per-page metadata allocated by boot-allocator: > /proc/vmstat:nr_memmap_boot * PAGE_SIZE > > Accounting per-page metadata allocated by buddy-allocator: > /proc/vmstat:nr_memmap * PAGE_SIZE > > Accounting total Perpage metadata allocated on the machine: > (/proc/vmstat:nr_memmap_boot + >/proc/vmstat:nr_memmap) * PAGE_SIZE > > Utility for userspace: > > Observability: Describe the amount of memory overhead that is > going to per-page metadata on the system at any given time since > this overhead is not currently observable. > > Debugging: Tracking the changes or absolute value in struct pages > can help detect anomalies as they can be correlated with other > metrics in the machine (e.g., memtotal, number of huge pages, > etc). > > page_ext overheads: Some kernel features such as page_owner > page_table_check that use page_ext can be optionally enabled via > kernel parameters. Having the total per-page metadata information > helps users precisely measure impact. Furthermore, page-metadata > metrics will reflect the amount of struct pages reliquished > (or overhead reduced) when hugetlbfs pages are reserved which > will vary depending on whether hugetlb vmemmap optimization is > enabled or not. > > For background and results see: > lore.kernel.org/all/20240220214558.3377482-1-souravpa...@google.com > > Acked-by: David Rientjes > Signed-off-by: Sourav Panda > Reviewed-by: Pasha Tatashin This patch is leading to Oops in 6.11-rc1 when CONFIG_MEMORY_HOTPLUG is enabled. Folks hitting it have had success with reverting this patch. Disabling CONFIG_MEMORY_HOTPLUG is not a long term solution. Reported here: https://lore.kernel.org/linux-cxl/CAHj4cs9Ax1=CoJkgBGP_+sNu6-6=6v=_l-zbzy0bvld3wuw...@mail.gmail.com/ A bit of detail below, follow above link for more: dmesg: [ 1408.632268] Oops: general protection fault, probably for non-canonical address 0xdc005650: [#1] PREEMPT SMP KASAN PTI [ 1408.644006] KASAN: probably user-memory-access in range [0x0002b280-0x0002b287] [ 1408.652699] CPU: 26 UID: 0 PID: 1868 Comm: ndctl Not tainted 6.11.0-rc1 #1 [ 1408.659571] Hardware name: Dell Inc. PowerEdge R640/08HT8T, BIOS 2.20.1 09/13/2023 [ 1408.667136] RIP: 0010:mod_node_page_state+0x2a/0x110 [ 1408.672112] Code: 0f 1f 44 00 00 48 b8 00 00 00 00 00 fc ff df 41 54 55 48 89 fd 48 81 c7 80 b2 02 00 53 48 89 f9 89 d3 48 c1 e9 03 48 83 ec 10 <80> 3c 01 00 0f 85 b8 00 00 00 48 8b bd 80 b2 02 00 41 89 f0 83 ee [ 1408.690856] RSP: 0018:c900246d7388 EFLAGS: 00010286 [ 1408.696088] RAX: dc00 RBX: fe00 RCX: 5650 [ 1408.703222] RDX: fe00 RSI: 002f RDI: 0002b280 [ 1408.710353] RBP: R08: 88a06ffcb1c8 R09: 1218c681 [ 1408.717486] R10: 93d922bf R11: 88855e790f10 R12: 03ff [ 1408.724619] R13: 1920048dae7b R14: ea0081e0 R15: 90c63408 [ 1408.731750] FS: 7f753c219200() GS:889bf2a0() knlGS: [ 1408.739834] CS: 0010 DS: ES: CR0: 80050033 [ 1408.745581] CR2: 559f5902a5a8 CR3: 0001292f0006 CR4: 007706f0 [ 1408.752713] DR0: DR1: DR2: [ 1408.759843] DR3: DR6: fffe0ff0 DR7: 0400 [ 1408.766976] PKRU: 5554 [ 1408.769690] Call Trace: [ 1408.772143] [ 1408.774248] ? die_addr+0x3d/0xa0 [ 1408.777577] ? exc_general_protection+0x150/0x230 [ 1408.782297] ? asm_exc_general_protection+0x22/0x30 [ 1408.787182] ? mod_node_page_state+0x2a/0x110 [ 1408.791548] section_deactivate+0x519/0x780 [ 1408.795740] ? __pfx_section_deactivate+0x10/0x10 [ 1408.800449] __remove_pages+0x6c/0xa0 [ 1408.804119] arch_remove_memory+0x1a/0x70 [ 1408.808141] pageunmap_range+0x2ad/0x5e0 [ 1408.812067] memunmap_pages+0x320/0x5a0 [ 1408.815909] release_nodes+0xd6/0x170 [ 1408.819581] ? lockdep_hardirqs_on+0x78/0x100 [ 1408.823941] devres_release_all+0x106/0x170 [ 1408.828126] ? __pfx_devres_release_all+0x10/0x10 [ 1408.832834] device_unbind_cleanup+0x16/0x1a0 [ 1408.837198] device_release_driver_internal+0x3d5/0x530 [ 1408.842423] ? klist_put+0xf7/0x170 [ 1408.845916] bus_remove_device+0x1ed/0x3f0 [ 1408.850017] device_del+0x33b/0x8c0 [ 1408.853518] ? __pfx_device_del+0x10/0x10 [ 1408.857532] unregister_dev_dax+0x112/0x210 [ 1408.861722] release_nodes+0xd6/0x170 [ 1408.865387] ? lockdep_hardirqs_on+0x78/0x100 [ 1408.8697