Since commit dd0247e0 (pc: acpi: mark all possible CPUs as enabled in
SRAT), the Linux kernel actually tries to use the CPU-to-node mapping
from the QEMU-provided SRAT table instead of discarding it, and in some
cases that breaks build_sched_domains(), which expects a sane mapping
where cores/threads belonging to the same socket are on the same NUMA
node.

With the current default round-robin mapping of VCPUs to nodes, the
guest ends up with cores/threads belonging to the same socket on
different NUMA nodes. For example, with the following CLI:

  qemu-kvm -m 4G -smp 5,sockets=2,cores=4,threads=1,maxcpus=8 \
           -numa node,nodeid=0 -numa node,nodeid=1
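To make the breakage concrete, the default "i % nb_numa_nodes" rule in
numa.c yields the following assignment for that CLI (socket IDs follow
from sockets=2,cores=4,threads=1; this table is illustrative, not part
of the patch):

  VCPU:    0  1  2  3  4  5  6  7
  socket:  0  0  0  0  1  1  1  1
  node:    0  1  0  1  0  1  0  1

i.e. the four cores of each socket end up split across both NUMA nodes.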
With such a mapping, 2.6.32-based kernels will hang on boot due to an
incorrectly built sched_group list in update_sd_lb_stats(), so the
comment in QEMU justifying the dumb default mapping:

  "guest OSes must cope with this anyway, because there are BIOSes
   out there in real machines which also use this scheme."

isn't really valid.

Replacing the default mapping with a manual one, where VCPUs belonging
to the same socket are on the same NUMA node, fixes the issue for
guests that can't handle such a nonsensical topology, i.e. changing
the CLI to:

  -numa node,nodeid=0,cpus=0-3 -numa node,nodeid=1,cpus=4-7

So instead of simply scattering VCPUs around the nodes, map same-socket
VCPUs to the same NUMA node, which is what a guest would expect from
sane hardware/BIOS.

Signed-off-by: Igor Mammedov <imamm...@redhat.com>
---
v2:
 - add machine callback cpu_index_to_socket_id() and use it instead of
   stub approach
---
 hw/i386/pc.c          |  9 +++++++++
 include/hw/boards.h   |  5 +++++
 include/sysemu/numa.h |  3 ++-
 numa.c                | 18 +++++++++++++-----
 vl.c                  |  2 +-
 5 files changed, 30 insertions(+), 7 deletions(-)
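[Editor's note, not part of the patch: a minimal standalone sketch of
the two distribution rules compared above, the old round-robin rule and
the by-socket rule this patch introduces. It assumes that
x86_topo_ids_from_idx() derives pkg_id as cpu_index / (cores * threads),
i.e. consecutive cpu_index values fill threads, then cores, then
sockets; all names below are illustrative.]

#include <stdio.h>

/* Stand-ins for the -smp/-numa values from the example CLI above. */
enum { SMP_CORES = 4, SMP_THREADS = 1, MAX_CPUS = 8, NB_NUMA_NODES = 2 };

/* Package computation equivalent to the pkg_id that
 * x86_topo_ids_from_idx() returns for this topology. */
static unsigned cpu_index_to_socket_id(unsigned cpu_index)
{
    return cpu_index / (SMP_CORES * SMP_THREADS);
}

int main(void)
{
    for (unsigned i = 0; i < MAX_CPUS; i++) {
        unsigned rr_node = i % NB_NUMA_NODES;          /* old default */
        unsigned sock_node =                           /* this patch  */
            cpu_index_to_socket_id(i) % NB_NUMA_NODES;
        printf("VCPU %u: socket %u, round-robin node %u, by-socket node %u\n",
               i, cpu_index_to_socket_id(i), rr_node, sock_node);
    }
    return 0;
}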
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 4b46c29..a52d2af 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1851,6 +1851,14 @@ static void pc_machine_initfn(Object *obj)
                              NULL, NULL);
 }
 
+static unsigned pc_cpu_index_to_socket_id(unsigned cpu_index)
+{
+    unsigned pkg_id, core_id, smt_id;
+    x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index,
+                          &pkg_id, &core_id, &smt_id);
+    return pkg_id;
+}
+
 static void pc_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -1859,6 +1867,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
 
     pcmc->get_hotplug_handler = mc->get_hotplug_handler;
     mc->get_hotplug_handler = pc_get_hotpug_handler;
+    mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
     hc->plug = pc_machine_device_plug_cb;
     hc->unplug_request = pc_machine_device_unplug_request_cb;
     hc->unplug = pc_machine_device_unplug_cb;
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 1feea2b..78838d1 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -82,6 +82,10 @@ bool machine_mem_merge(MachineState *machine);
  *    of HotplugHandler object, which handles hotplug operation
  *    for a given @dev. It may return NULL if @dev doesn't require
  *    any actions to be performed by hotplug handler.
+ * @cpu_index_to_socket_id:
+ *    used to provide @cpu_index to socket number mapping, allowing
+ *    a machine to group CPU threads belonging to the same socket/package
+ *    Returns: socket number given cpu_index belongs to.
  */
 struct MachineClass {
     /*< private >*/
@@ -118,6 +122,7 @@ struct MachineClass {
 
     HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
                                            DeviceState *dev);
+    unsigned (*cpu_index_to_socket_id)(unsigned cpu_index);
 };
 
 /**
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 5633b85..6523b4d 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -6,6 +6,7 @@
 #include "qemu/option.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/hostmem.h"
+#include "hw/boards.h"
 
 extern int nb_numa_nodes; /* Number of NUMA nodes */
 
@@ -16,7 +17,7 @@ typedef struct node_info {
     bool present;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
-void parse_numa_opts(void);
+void parse_numa_opts(MachineClass *mc);
 void numa_post_machine_init(void);
 void query_numa_node_mem(uint64_t node_mem[]);
 extern QemuOptsList qemu_numa_opts;
diff --git a/numa.c b/numa.c
index ffbec68..f1f571a 100644
--- a/numa.c
+++ b/numa.c
@@ -165,7 +165,7 @@ error:
     return -1;
 }
 
-void parse_numa_opts(void)
+void parse_numa_opts(MachineClass *mc)
 {
     int i;
 
@@ -233,13 +233,21 @@ void parse_numa_opts(void)
             break;
         }
     }
-    /* assigning the VCPUs round-robin is easier to implement, guest OSes
-     * must cope with this anyway, because there are BIOSes out there in
-     * real machines which also use this scheme.
+    /* Historically VCPUs were assigned in round-robin order to NUMA
+     * nodes. However it causes issues with guest not handling it nice
+     * in case where cores/threads from a multicore CPU appear on
+     * different nodes. So allow boards to override default distribution
+     * rule grouping VCPUs by socket so that VCPUs from the same socket
+     * would be on the same node.
      */
     if (i == nb_numa_nodes) {
         for (i = 0; i < max_cpus; i++) {
-            set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
+            unsigned node_id = i % nb_numa_nodes;
+            if (mc->cpu_index_to_socket_id) {
+                node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes;
+            }
+
+            set_bit(i, numa_info[node_id].node_cpu);
         }
     }
 }
diff --git a/vl.c b/vl.c
index 694deb4..ab02de9 100644
--- a/vl.c
+++ b/vl.c
@@ -4168,7 +4168,7 @@ int main(int argc, char **argv, char **envp)
     default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS);
     default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS);
 
-    parse_numa_opts();
+    parse_numa_opts(machine_class);
 
     if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, 1) != 0) {
         exit(1);
-- 
1.8.3.1