Sparse node numbering occurs on powerpc in practice under PowerVM. In order to emulate the same NUMA topology under QEMU, the assumption that NUMA node IDs are contiguous has to be removed. QEMU was recently modified to reject requests for sparse NUMA node numbering.
Leverage the present field in NodeInfo, which indicates whether a given nodeid was specified on the command-line. Adjust the iteration of various NUMA loops to use the maximum known NUMA ID rather than the number of NUMA nodes. numa.c::set_numa_nodes() has become a bit more convoluted in order to round-robin the CPUs over the known nodes when the CPU assignment is not specified by the user. Note that the x86 code needs changes for both sparse node numbering and memoryless nodes, and the ppc code needs changes for memoryless nodes. ppc also requires node 0 to be present on the command-line (or else it fails with: "qemu: pSeries SLOF firmware requires >= 128M guest RMA (Real Mode Area memory)"). Alexey has a series to address the latter. Examples: Before: mlock=off -numa node,nodeid=3 -numa node,nodeid=0 -smp 4 qemu-system-ppc64-base: numa: Node ID missing: 2 After: (qemu) info numa 2 nodes node 0 cpus: 1 3 node 0 size: 2048 MB node 3 cpus: 0 2 node 3 size: 2048 MB available: 2 nodes (0,3) node 0 cpus: 1 3 node 0 size: 2030 MB node 0 free: 1934 MB node 3 cpus: 0 2 node 3 size: 2045 MB node 3 free: 1957 MB node distances: node 0 3 0: 10 40 3: 40 10 Signed-off-by: Nishanth Aravamudan <n...@linux.vnet.ibm.com> Cc: Eduardo Habkost <ehabk...@redhat.com> Cc: Hu Tao <hu...@cn.fujitsu.com> Cc: Alexey Kardashevskiy <a...@ozlabs.ru> Cc: "Michael S. Tsirkin" <m...@redhat.com> Cc: Anton Blanchard <an...@samba.org> Cc: David Rientjes <rient...@google.com> Cc: Igor Mammedov <imamm...@redhat.com> Cc: qemu-devel@nongnu.org Cc: qemu-...@nongnu.org --- I understand that 2.1 is in freeze, so I'm not requesting this be applied, but would like feedback to have this ready to queue up for the next release. v1 -> v2: Modify set_numa_nodes loop for round-robin'ing CPUs. v2 -> v3: Updated changelog to indicate problem being solved. Updated memory_region_allocate_system_memory based upon feedback from Hu. Updated set_numa_nodes loop again to be simpler based upon feedback from Hu. 
Fixed bug with a mix of nodes with nodeid specified and without, where the same nodeid would be used by the explicit specification and the auto-numbering code. v3 -> v4: Rename nb_numa_nodes to num_numa_nodes to help catch usage. Rebased to master/origin as Eduardo's patches to support the Present flag have gone upstream. Disable sparse node numbering on i386, enable it on ppc. v4 -> v5: Split rename of nb_numa_nodes to separate patch. Fix checkpatch warnings. diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 12472c6..cdefafe 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1121,6 +1121,18 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, guest_info->ram_size = below_4g_mem_size + above_4g_mem_size; guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); guest_info->apic_xrupt_override = kvm_allows_irq0_override(); + /* No support for sparse NUMA node IDs yet: */ + for (i = max_numa_nodeid - 1; i >= 0; i--) { + /* Report large node IDs first, to make mistakes easier to spot */ + if (!numa_info[i].present) { + error_report("numa: Node ID missing: %d", i); + exit(EXIT_FAILURE); + } + } + + /* This must be always true if all nodes are present */ + assert(num_numa_nodes == max_numa_nodeid); + guest_info->numa_nodes = num_numa_nodes; guest_info->node_mem = g_malloc0(guest_info->numa_nodes * sizeof *guest_info->node_mem); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4b74fd6..9a247f8 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -642,7 +642,10 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) /* RAM: Node 1 and beyond */ mem_start = node0_size; - for (i = 1; i < num_numa_nodes; i++) { + for (i = 1; i < max_numa_nodeid; i++) { + if (!numa_info[i].present) { + continue; + } mem_reg_property[0] = cpu_to_be64(mem_start); if (mem_start >= ram_size) { node_size = 0; diff --git a/monitor.c b/monitor.c index 392677a..53955e4 100644 --- a/monitor.c +++ b/monitor.c @@ -1949,7 +1949,10 @@ static void do_info_numa(Monitor *mon, const 
QDict *qdict) CPUState *cpu; monitor_printf(mon, "%d nodes\n", num_numa_nodes); - for (i = 0; i < num_numa_nodes; i++) { + for (i = 0; i < max_numa_nodeid; i++) { + if (!numa_info[i].present) { + continue; + } monitor_printf(mon, "node %d cpus:", i); CPU_FOREACH(cpu) { if (cpu->numa_node == i) { diff --git a/numa.c b/numa.c index 5930df0..a689e52 100644 --- a/numa.c +++ b/numa.c @@ -53,7 +53,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) if (node->has_nodeid) { nodenr = node->nodeid; } else { - nodenr = num_numa_nodes; + nodenr = 0; + while (numa_info[nodenr].present) { + nodenr++; + } } if (nodenr >= MAX_NODES) { @@ -160,22 +163,10 @@ error: void set_numa_nodes(void) { - int i; + int i, j; assert(max_numa_nodeid <= MAX_NODES); - /* No support for sparse NUMA node IDs yet: */ - for (i = max_numa_nodeid - 1; i >= 0; i--) { - /* Report large node IDs first, to make mistakes easier to spot */ - if (!numa_info[i].present) { - error_report("numa: Node ID missing: %d", i); - exit(1); - } - } - - /* This must be always true if all nodes are present: */ - assert(num_numa_nodes == max_numa_nodeid); - if (num_numa_nodes > 0) { uint64_t numa_total; @@ -186,27 +177,30 @@ void set_numa_nodes(void) /* If no memory size is given for any node, assume the default case * and distribute the available memory equally across all nodes */ - for (i = 0; i < num_numa_nodes; i++) { - if (numa_info[i].node_mem != 0) { + for (i = 0; i < max_numa_nodeid; i++) { + if (numa_info[i].present && numa_info[i].node_mem != 0) { break; } } - if (i == num_numa_nodes) { + if (i == max_numa_nodeid) { uint64_t usedmem = 0; /* On Linux, each node's border has to be 8MB aligned, * the final node gets the rest. 
*/ - for (i = 0; i < num_numa_nodes - 1; i++) { - numa_info[i].node_mem = (ram_size / num_numa_nodes) & - ~((1 << 23UL) - 1); - usedmem += numa_info[i].node_mem; + for (i = 0; i < max_numa_nodeid - 1; i++) { + if (numa_info[i].present) { + numa_info[i].node_mem = (ram_size / num_numa_nodes) & + ~((1 << 23UL) - 1); + usedmem += numa_info[i].node_mem; + } } + assert(numa_info[i].present); numa_info[i].node_mem = ram_size - usedmem; } numa_total = 0; - for (i = 0; i < num_numa_nodes; i++) { + for (i = 0; i < max_numa_nodeid; i++) { numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { @@ -216,8 +210,9 @@ void set_numa_nodes(void) exit(1); } - for (i = 0; i < num_numa_nodes; i++) { - if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { + for (i = 0; i < max_numa_nodeid; i++) { + if (numa_info[i].present && + !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { break; } } @@ -225,9 +220,12 @@ void set_numa_nodes(void) * must cope with this anyway, because there are BIOSes out there in * real machines which also use this scheme. 
*/ - if (i == num_numa_nodes) { - for (i = 0; i < max_cpus; i++) { - set_bit(i, numa_info[i % num_numa_nodes].node_cpu); + if (i == max_numa_nodeid) { + for (i = 0, j = 0; i < max_cpus; i++) { + do { + j = (j + 1) % (max_numa_nodeid); + } while (!numa_info[j].present); + set_bit(i, numa_info[j].node_cpu); } } } @@ -239,8 +237,9 @@ void set_numa_modes(void) int i; CPU_FOREACH(cpu) { - for (i = 0; i < num_numa_nodes; i++) { - if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { + for (i = 0; i < max_numa_nodeid; i++) { + if (numa_info[i].present && + test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { cpu->numa_node = i; } } @@ -288,10 +287,13 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, } memory_region_init(mr, owner, name, ram_size); - for (i = 0; i < MAX_NODES; i++) { + for (i = 0; i < max_numa_nodeid; i++) { Error *local_err = NULL; uint64_t size = numa_info[i].node_mem; HostMemoryBackend *backend = numa_info[i].node_memdev; + if (!numa_info[i].present) { + continue; + } if (!backend) { continue; }