Hey Nathan,

>
> Regardless of your change: at boot time, this set of calls to
> set_numa_node() and set_numa_mem() is redundant, right? Because
> smp_prepare_cpus() has:
>
>       for_each_possible_cpu(cpu) {
>               ...
>               if (cpu_present(cpu)) {
>                       set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
>                       set_cpu_numa_mem(cpu,
>                               local_memory_node(numa_cpu_lookup_table[cpu]));
>               }
>
> I would rather that, when onlining a CPU that happens to have been
> dynamically added after boot, we enter start_secondary() with conditions
> equivalent to those at boot time. Or as close to that as is practical.
>
> So I'd suggest that pseries_add_processor() be made to update
> these things when the CPUs are marked present, before onlining them.

I did try updating at pseries_add_processor when things were being marked as
present. But looks like the zonelists may not be updated at that time.
I saw couple of crashes in local_memory_node() when dplar adding CPUs.
(Appending the patch that causes these crash to this mail for your reference)

[  293.669901] Kernel attempted to read user page (1508) - exploit attempt? 
(uid: 0)                                                                        
                                             [1309/17174]
[  293.669925] BUG: Kernel NULL pointer dereference on read at 0x00001508
[  293.669935] Faulting instruction address: 0xc000000000484ae0
[  293.669947] Oops: Kernel access of bad area, sig: 11 [#1]
[  293.669956] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
[  293.669969] Modules linked in: nft_counter nft_compat rpadlpar_io rpaphp
mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag
af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_ fib_ipv6
nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set
nf_tables nfnetlink dm_multipath pseries_rng xts vmx_c rypto uio_pdrv_genirq
uio binfmt_misc ip_tables xfs libcrc32c sd_mod t10_pi sg ibmvscsi ibmveth
scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod fuse
[  293.670101] CPU: 17 PID: 3183 Comm: drmgr Not tainted 5.12.0-rc5-master+ #2
[  293.670113] NIP:  c000000000484ae0 LR: c00000000010a548 CTR: 00000000006dd130
[  293.670121] REGS: c0000025a9997160 TRAP: 0300   Not tainted  
(5.12.0-rc5-master+)
[  293.670131] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 48008422  
XER: 00000008
[  293.670155] CFAR: c00000000010a544 DAR: 0000000000001508 DSISR: 40000000 
IRQMASK: 0
[  293.670155] GPR00: c00000000010a548 c0000025a9997400 c000000001f55500 
0000000000000000
[  293.670155] GPR04: c000000001a3c598 0000000000000006 0000000000000027 
c0000035873b8018
[  293.670155] GPR08: 0000000000000023 c0000000020464f8 00000035894d0000 
c000003584c8ffe8
[  293.670155] GPR12: 0000000028008424 c0000035bf22ce80 0000000000000000 
0000000000000000
[  293.670155] GPR16: 0000000000000000 0000000000000000 0000000000000000 
0000000000000000
[  293.670155] GPR20: 0000000000000000 0000000000000000 0000000000000000 
c000000001fbc160
[  293.670155] GPR24: 0000000000000002 0000000000000200 c000000001fc04c8 
0000000000000001
[  293.670155] GPR28: c0000000010c6fc8 c000000001fc08a0 c000002f8fb99e60 
0000000000000040
[  293.670263] NIP [c000000000484ae0] local_memory_node+0x20/0x80
[  293.670281] LR [c00000000010a548] pseries_add_processor+0x368/0x3b0
[  293.670292] Call Trace:
[  293.670297] [c0000025a9997400] [c00000000010a524] 
pseries_add_processor+0x344/0x3b0 (unreliable)
[  293.670311] [c0000025a99976c0] [c00000000010a790] 
pseries_smp_notifier+0x200/0x2a0
[  293.670322] [c0000025a9997780] [c000000000197288] 
blocking_notifier_call_chain+0xa8/0x110
[  293.670335] [c0000025a99977d0] [c000000000b27010] of_attach_node+0xc0/0x110
[  293.670347] [c0000025a9997830] [c0000000001032a0] dlpar_attach_node+0x30/0x70
[  293.670358] [c0000025a99978a0] [c000000000109ac0] dlpar_cpu_add+0x1d0/0x510
[  293.670368] [c0000025a9997980] [c00000000010a898] dlpar_cpu+0x68/0x6e0
[  293.670377] [c0000025a9997a80] [c0000000001036b8] 
handle_dlpar_errorlog+0xf8/0x190
[  293.670388] [c0000025a9997af0] [c000000000103928] dlpar_store+0x178/0x4a0
[  293.670396] [c0000025a9997bb0] [c0000000007df050] kobj_attr_store+0x30/0x50
[  293.670408] [c0000025a9997bd0] [c00000000062f0b0] sysfs_kf_write+0x70/0xb0
[  293.670421] [c0000025a9997c10] [c00000000062d4e0] 
kernfs_fop_write_iter+0x1d0/0x280
[  293.670432] [c0000025a9997c60] [c00000000051673c] new_sync_write+0x14c/0x1d0
[  293.670445] [c0000025a9997d00] [c000000000519df4] vfs_write+0x264/0x380
[  293.670455] [c0000025a9997d60] [c00000000051a0ec] ksys_write+0x7c/0x140
[  293.670464] [c0000025a9997db0] [c000000000032af0] 
system_call_exception+0x150/0x290
[  293.670475] [c0000025a9997e10] [c00000000000d45c] 
system_call_common+0xec/0x278
[  293.670486] --- interrupt: c00 at 0x20000025bd74

That leaves us with just 2 options for now.
1. Update numa_mem later and only update numa_node here.
- Over a longer period of time, this would be more confusing since we
may lose track of why we are splitting the set of numa_node and numa_mem.

or
2. Use my earlier patch.

My choice would be to go with my earlier patch.
Please do let me know your thoughts on the same.

-- 
Thanks and Regards
Srikar Dronamraju

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 3beeb030cd78..1cdd83703f93 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -44,6 +44,7 @@ extern void __init dump_numa_cpu_topology(void);
 
 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
+extern int numa_setup_cpu(unsigned long cpu);
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -81,6 +82,11 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }
 
+static int numa_setup_cpu(unsigned long cpu)
+{
+       return first_online_node;
+}
+
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
 static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5a4d59a1070d..0d0c71ba4672 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1539,9 +1539,6 @@ void start_secondary(void *unused)
                        shared_caches = true;
        }
 
-       set_numa_node(numa_cpu_lookup_table[cpu]);
-       set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
-
        smp_wmb();
        notify_cpu_starting(cpu);
        set_cpu_online(cpu, true);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..11914a28db67 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -501,7 +501,7 @@ static int vphn_get_nid(long unused)
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int numa_setup_cpu(unsigned long lcpu)
+int numa_setup_cpu(unsigned long lcpu)
 {
        struct device_node *cpu;
        int fcpu = cpu_first_thread_sibling(lcpu);
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 12cbffd3c2e3..311fbe916d5b 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -198,9 +198,14 @@ static int pseries_add_processor(struct device_node *np)
        }
 
        for_each_cpu(cpu, tmp) {
+               int nid;
+
                BUG_ON(cpu_present(cpu));
                set_cpu_present(cpu, true);
                set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
+               nid = numa_setup_cpu(cpu);
+               set_cpu_numa_node(cpu, nid);
+               set_cpu_numa_mem(cpu, local_memory_node(nid));
        }
        err = 0;
 out_unlock:

Reply via email to