Hey Nathan, > > Regardless of your change: at boot time, this set of calls to > set_numa_node() and set_numa_mem() is redundant, right? Because > smp_prepare_cpus() has: > > for_each_possible_cpu(cpu) { > ... > if (cpu_present(cpu)) { > set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]); > set_cpu_numa_mem(cpu, > local_memory_node(numa_cpu_lookup_table[cpu])); > } > > I would rather that, when onlining a CPU that happens to have been > dynamically added after boot, we enter start_secondary() with conditions > equivalent to those at boot time. Or as close to that as is practical. > > So I'd suggest that pseries_add_processor() be made to update > these things when the CPUs are marked present, before onlining them.
I did try updating at pseries_add_processor when things were being marked as present. But looks like the zonelists may not be updated at that time. I saw couple of crashes in local_memory_node() when dplar adding CPUs. (Appending the patch that causes these crash to this mail for your reference) [ 293.669901] Kernel attempted to read user page (1508) - exploit attempt? (uid: 0) [1309/17174] [ 293.669925] BUG: Kernel NULL pointer dereference on read at 0x00001508 [ 293.669935] Faulting instruction address: 0xc000000000484ae0 [ 293.669947] Oops: Kernel access of bad area, sig: 11 [#1] [ 293.669956] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries [ 293.669969] Modules linked in: nft_counter nft_compat rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_ fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set nf_tables nfnetlink dm_multipath pseries_rng xts vmx_c rypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c sd_mod t10_pi sg ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod fuse [ 293.670101] CPU: 17 PID: 3183 Comm: drmgr Not tainted 5.12.0-rc5-master+ #2 [ 293.670113] NIP: c000000000484ae0 LR: c00000000010a548 CTR: 00000000006dd130 [ 293.670121] REGS: c0000025a9997160 TRAP: 0300 Not tainted (5.12.0-rc5-master+) [ 293.670131] MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 48008422 XER: 00000008 [ 293.670155] CFAR: c00000000010a544 DAR: 0000000000001508 DSISR: 40000000 IRQMASK: 0 [ 293.670155] GPR00: c00000000010a548 c0000025a9997400 c000000001f55500 0000000000000000 [ 293.670155] GPR04: c000000001a3c598 0000000000000006 0000000000000027 c0000035873b8018 [ 293.670155] GPR08: 0000000000000023 c0000000020464f8 00000035894d0000 c000003584c8ffe8 [ 293.670155] GPR12: 0000000028008424 c0000035bf22ce80 0000000000000000 0000000000000000 [ 293.670155] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 293.670155] GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000001fbc160 [ 293.670155] GPR24: 0000000000000002 0000000000000200 c000000001fc04c8 0000000000000001 [ 293.670155] GPR28: c0000000010c6fc8 c000000001fc08a0 c000002f8fb99e60 0000000000000040 [ 293.670263] NIP [c000000000484ae0] local_memory_node+0x20/0x80 [ 293.670281] LR [c00000000010a548] pseries_add_processor+0x368/0x3b0 [ 293.670292] Call Trace: [ 293.670297] [c0000025a9997400] [c00000000010a524] pseries_add_processor+0x344/0x3b0 (unreliable) [ 293.670311] [c0000025a99976c0] [c00000000010a790] pseries_smp_notifier+0x200/0x2a0 [ 293.670322] [c0000025a9997780] [c000000000197288] blocking_notifier_call_chain+0xa8/0x110 [ 293.670335] [c0000025a99977d0] [c000000000b27010] of_attach_node+0xc0/0x110 [ 293.670347] [c0000025a9997830] [c0000000001032a0] dlpar_attach_node+0x30/0x70 [ 293.670358] [c0000025a99978a0] [c000000000109ac0] dlpar_cpu_add+0x1d0/0x510 [ 293.670368] [c0000025a9997980] [c00000000010a898] dlpar_cpu+0x68/0x6e0 [ 293.670377] [c0000025a9997a80] [c0000000001036b8] handle_dlpar_errorlog+0xf8/0x190 [ 293.670388] [c0000025a9997af0] [c000000000103928] dlpar_store+0x178/0x4a0 [ 293.670396] [c0000025a9997bb0] [c0000000007df050] kobj_attr_store+0x30/0x50 [ 293.670408] [c0000025a9997bd0] [c00000000062f0b0] sysfs_kf_write+0x70/0xb0 [ 293.670421] [c0000025a9997c10] [c00000000062d4e0] kernfs_fop_write_iter+0x1d0/0x280 [ 293.670432] [c0000025a9997c60] [c00000000051673c] new_sync_write+0x14c/0x1d0 [ 293.670445] [c0000025a9997d00] [c000000000519df4] vfs_write+0x264/0x380 [ 293.670455] [c0000025a9997d60] [c00000000051a0ec] ksys_write+0x7c/0x140 [ 293.670464] [c0000025a9997db0] [c000000000032af0] system_call_exception+0x150/0x290 [ 293.670475] [c0000025a9997e10] [c00000000000d45c] system_call_common+0xec/0x278 [ 293.670486] --- interrupt: c00 at 0x20000025bd74 That leaves us with just 2 options for now. 1. Update numa_mem later and only update numa_node here. - Over a longer period of time, this would be more confusing since we may lose track of why we are splitting the set of numa_node and numa_mem. or 2. Use my earlier patch. My choice would be to go with my earlier patch. Please do let me know your thoughts on the same. -- Thanks and Regards Srikar Dronamraju diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 3beeb030cd78..1cdd83703f93 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -44,6 +44,7 @@ extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct device *dev, int nid); extern void sysfs_remove_device_from_node(struct device *dev, int nid); +extern int numa_setup_cpu(unsigned long cpu); static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) { @@ -81,6 +82,11 @@ static inline void sysfs_remove_device_from_node(struct device *dev, { } +static int numa_setup_cpu(unsigned long cpu) +{ + return first_online_node; +} + static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5a4d59a1070d..0d0c71ba4672 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1539,9 +1539,6 @@ void start_secondary(void *unused) shared_caches = true; } - set_numa_node(numa_cpu_lookup_table[cpu]); - set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); - smp_wmb(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..11914a28db67 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -501,7 +501,7 @@ static int vphn_get_nid(long unused) * Figure out to which domain a cpu belongs and stick it there. * Return the id of the domain used. */ -static int numa_setup_cpu(unsigned long lcpu) +int numa_setup_cpu(unsigned long lcpu) { struct device_node *cpu; int fcpu = cpu_first_thread_sibling(lcpu); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 12cbffd3c2e3..311fbe916d5b 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -198,9 +198,14 @@ static int pseries_add_processor(struct device_node *np) } for_each_cpu(cpu, tmp) { + int nid; + BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); + nid = numa_setup_cpu(cpu); + set_cpu_numa_node(cpu, nid); + set_cpu_numa_mem(cpu, local_memory_node(nid)); } err = 0; out_unlock: