Currently, QEMU does not provide a clear command to set the vNUMA distance for a guest, although we already have the `-numa` option to configure vNUMA nodes.
vNUMA distance makes sense in certain scenarios. Today, however, if we create a guest with 4 vNUMA nodes and check the NUMA info via `numactl -H`, we see:

```
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10
```

Since QEMU does not build a SLIT table, the guest kernel falls back to treating every local node as distance 10 and every remote node as distance 20. This looks a little strange once you have seen the distances on an actual physical machine with 4 NUMA nodes. My machine shows:

```
node distances:
node   0   1   2   3
  0:  10  21  31  41
  1:  21  10  21  31
  2:  31  21  10  21
  3:  41  31  21  10
```

This patch adds SLIT table support to QEMU and provides an additional `dist` option for the `-numa` command, allowing users to set the vNUMA distance from the QEMU command line. With this patch, a user who wants to create a guest with several vNUMA nodes and also set the distances among those nodes would use a command line like:

```
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=0,policy=bind,id=node0 \
-numa node,nodeid=0,cpus=0,memdev=node0 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=1,policy=bind,id=node1 \
-numa node,nodeid=1,cpus=1,memdev=node1 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=2,policy=bind,id=node2 \
-numa node,nodeid=2,cpus=2,memdev=node2 \
-object memory-backend-ram,size=1G,prealloc=yes,host-nodes=3,policy=bind,id=node3 \
-numa node,nodeid=3,cpus=3,memdev=node3 \
-numa dist,a=0,b=1,val=21 \
-numa dist,a=0,b=2,val=31 \
-numa dist,a=0,b=3,val=41 \
-numa dist,a=1,b=0,val=21 \
...
```
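Two notes on the example (not part of the patch itself): node pairs without an explicit `dist` option fall back to the defaults applied by default_numa_distance() below, 10 for a node's distance to itself and 20 for remote nodes, so only non-default distances need to be listed. Assuming the elided `dist` options above are filled in symmetrically for the remaining remote pairs, `numactl -H` in the guest should then report the same matrix as the physical machine:

```
node distances:
node   0   1   2   3
  0:  10  21  31  41
  1:  21  10  21  31
  2:  31  21  10  21
  3:  41  31  21  10
```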
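As a quick sanity check that does not depend on `numactl`, a Linux guest also exposes each node's distance row through sysfs; with the configuration above, reading node 0's row should give:

```
$ cat /sys/devices/system/node/node0/distance
10 21 31 41
```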
Thanks,
-He

Signed-off-by: He Chen <he.c...@linux.intel.com>
---
 hw/i386/acpi-build.c        | 28 ++++++++++++++++++++++++++
 include/hw/acpi/acpi-defs.h |  9 +++++++++
 include/sysemu/numa.h       |  1 +
 numa.c                      | 48 +++++++++++++++++++++++++++++++++++++++++++++
 qapi-schema.json            | 24 +++++++++++++++++++++--
 qemu-options.hx             |  5 ++++-
 6 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2073108..7ced37d 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2396,6 +2396,32 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 }
 
 static void
+build_slit(GArray *table_data, BIOSLinker *linker, MachineState *machine)
+{
+    struct AcpiSystemLocalityDistanceTable *slit;
+    uint8_t *entry;
+    int slit_start, slit_data_len, i, j;
+    slit_start = table_data->len;
+
+    slit = acpi_data_push(table_data, sizeof(*slit));
+    slit->nb_localities = nb_numa_nodes;
+
+    slit_data_len = sizeof(uint8_t) * nb_numa_nodes * nb_numa_nodes;
+    entry = acpi_data_push(table_data, slit_data_len);
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            entry[i * nb_numa_nodes + j] = numa_info[i].distance[j];
+        }
+    }
+
+    build_header(linker, table_data,
+                 (void *)(table_data->data + slit_start),
+                 "SLIT",
+                 table_data->len - slit_start, 1, NULL, NULL);
+}
+
+static void
 build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
@@ -2678,6 +2704,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
     if (pcms->numa_nodes) {
         acpi_add_table(table_offsets, tables_blob);
         build_srat(tables_blob, tables->linker, machine);
+        acpi_add_table(table_offsets, tables_blob);
+        build_slit(tables_blob, tables->linker, machine);
     }
     if (acpi_get_mcfg(&mcfg)) {
         acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 4cc3630..b183a8f 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -527,6 +527,15 @@ struct AcpiSratProcessorGiccAffinity
 
 typedef struct AcpiSratProcessorGiccAffinity AcpiSratProcessorGiccAffinity;
 
+/*
+ * SLIT (NUMA distance description) table
+ */
+struct AcpiSystemLocalityDistanceTable {
+    ACPI_TABLE_HEADER_DEF
+    uint64_t nb_localities;
+} QEMU_PACKED;
+typedef struct AcpiSystemLocalityDistanceTable AcpiSystemLocalityDistanceTable;
+
 /* PCI fw r3.0 MCFG table. */
 /* Subtable */
 struct AcpiMcfgAllocation {
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..2f7a941 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -21,6 +21,7 @@ typedef struct node_info {
     struct HostMemoryBackend *node_memdev;
     bool present;
     QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+    uint8_t distance[MAX_NODES];
 } NodeInfo;
 
 extern NodeInfo numa_info[MAX_NODES];
diff --git a/numa.c b/numa.c
index e01cb54..897657a 100644
--- a/numa.c
+++ b/numa.c
@@ -50,6 +50,9 @@ static int have_memdevs = -1;
 static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                              * For all nodes, nodeid < max_numa_nodeid
                              */
+static int min_numa_distance = 10;
+static int def_numa_distance = 20;
+static int max_numa_distance = 255;
 int nb_numa_nodes;
 NodeInfo numa_info[MAX_NODES];
 
@@ -208,10 +211,33 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
         numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
         numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
     }
+
     numa_info[nodenr].present = true;
     max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 }
 
+static void numa_distance_parse(NumaDistOptions *dist, QemuOpts *opts, Error **errp)
+{
+    uint8_t a = dist->a;
+    uint8_t b = dist->b;
+    uint8_t val = dist->val;
+
+    if (a >= MAX_NODES || b >= MAX_NODES) {
+        error_setg(errp, "NUMA node index out of range: %d",
+                   a > b ? a : b);
+        return;
+    }
+
+    if (val > max_numa_distance || val < min_numa_distance) {
+        error_setg(errp,
+                   "NUMA distance (%d) out of range (%d)~(%d)",
+                   val, min_numa_distance, max_numa_distance);
+        return;
+    }
+
+    numa_info[a].distance[b] = val;
+}
+
 static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
 {
     NumaOptions *object = NULL;
@@ -235,6 +261,12 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
         }
         nb_numa_nodes++;
         break;
+    case NUMA_OPTIONS_TYPE_DIST:
+        numa_distance_parse(&object->u.dist, opts, &err);
+        if (err) {
+            goto end;
+        }
+        break;
     default:
         abort();
     }
@@ -294,6 +326,21 @@ static void validate_numa_cpus(void)
     g_free(seen_cpus);
 }
 
+static void default_numa_distance(void)
+{
+    int i, j;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        for (j = 0; j < nb_numa_nodes; j++) {
+            if (i == j) {
+                numa_info[i].distance[j] = min_numa_distance;
+            } else if (numa_info[i].distance[j] < min_numa_distance) {
+                numa_info[i].distance[j] = def_numa_distance;
+            }
+        }
+    }
+}
+
 void parse_numa_opts(MachineClass *mc)
 {
     int i;
@@ -390,6 +437,7 @@ void parse_numa_opts(MachineClass *mc)
         }
 
         validate_numa_cpus();
+        default_numa_distance();
     } else {
         numa_set_mem_node_id(0, ram_size, 0);
     }
diff --git a/qapi-schema.json b/qapi-schema.json
index 32b4a4b..2988304 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5647,7 +5647,7 @@
 # Since: 2.1
 ##
 { 'enum': 'NumaOptionsType',
-  'data': [ 'node' ] }
+  'data': [ 'node', 'dist' ] }
 
 ##
 # @NumaOptions:
@@ -5660,7 +5660,8 @@
   'base': { 'type': 'NumaOptionsType' },
   'discriminator': 'type',
   'data': {
-    'node': 'NumaNodeOptions' }}
+    'node': 'NumaNodeOptions',
+    'dist': 'NumaDistOptions' }}
 
 ##
 # @NumaNodeOptions:
@@ -5689,6 +5690,25 @@
     '*memdev': 'str' }}
 
 ##
+# @NumaDistOptions:
+#
+# Set the distance between 2 NUMA nodes. (for OptsVisitor)
+#
+# @a: first NUMA node.
+#
+# @b: second NUMA node.
+#
+# @val: NUMA distance between the 2 given NUMA nodes.
+#
+# Since: 2.9
+##
+{ 'struct': 'NumaDistOptions',
+  'data': {
+   'a': 'uint8',
+   'b': 'uint8',
+   'val': 'uint8' }}
+
+##
 # @HostMemPolicy:
 #
 # Host memory policy types
diff --git a/qemu-options.hx b/qemu-options.hx
index 8dd8ee3..0de5cf8 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -139,12 +139,15 @@
 ETEXI
 
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
     "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
-    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n", QEMU_ARCH_ALL)
+    "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n"
+    "-numa dist,a=firstnode,b=secondnode,val=distance\n", QEMU_ARCH_ALL)
 STEXI
 @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
 @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}]
+@itemx -numa dist,a=@var{firstnode},b=@var{secondnode},val=@var{distance}
 @findex -numa
 Define a NUMA node and assign RAM and VCPUs to it.
+Set the NUMA distance between 2 NUMA nodes.
 @var{firstcpu} and @var{lastcpu} are CPU indexes. Each
 @samp{cpus} option represent a contiguous range of CPU indexes
--
2.7.4