On Tue, Aug 06, 2019 at 09:29:45PM +1000, Daniel Black wrote: > Replace all node_id assumptions with lookups from > machinestate->numa_state->nodes[] > and remove aspects that assume a sequential numbering of nodes. This enables > non-sequential NUMA node number topologies to be created. > > Default assignments of CPU->nodeid (get_default_cpu_node_id) now return > a nodeid from the machinestate->numa_state->nodes[]. > > x86 will use the node id as the Proximity Domain (which the > Linux kernel will map down to sequential node numbers). Both HMAT and > SLIT ACPI data are entered based on this nodeid. In Linux kernel > output look at the SRAT/HMAT: and PXM: references in the kernel early boot. > > Small enhancements were made to error messages to be more explicit > about errors in node specification. > > CC: Tao Xu <tao3...@intel.com> > CC: Liu Jingqi <jingqi....@intel.com> > Signed-off-by: Daniel Black <dan...@linux.ibm.com>
I have no real opinion on whether this is a good idea overall. But, if we go for it then the ppc parts are Acked-by: David Gibson <da...@gibson.dropbear.id.au> > > --- > Based-on: 20190614155626.27932-1-tao3...@intel.com > ([PATCH RESEND v8 00/11] Build ACPI Heterogeneous Memory Attribute Table > (HMAT)) > > Test script: > > #!/bin/bash > set -x -v > > QEMUHOME=${HOME}/repos/qemu/ > # optional but make it easy to install/run numactl --hardware > #ALPINE_NET="" > ALPINE_NET="ip=dhcp > alpine_repo=http://dl-cdn.alpinelinux.org/alpine/edge/main/" > > ALPINE_HOME=${HOME}/repos/alpine/alpine-netboot-3.10.1- > > # x86 / armv7 - no CONFIG_NUMA=y support in kernel > # Kernel configs: https://git.alpinelinux.org/aports/tree/main/linux-vanilla/ > # s390x - no numa support in QEMU > for ARCH in x86_64 aarch64 ppc64le > do > if [ ! -d ${ALPINE_HOME}${ARCH} ] > then > mkdir ${ALPINE_HOME}${ARCH} > wget > http://dl-cdn.alpinelinux.org/alpine/v3.10/releases/${ARCH}/alpine-netboot-3.10.1-${ARCH}.tar.gz > -O - | tar -zxf - -C ${ALPINE_HOME}${ARCH} > fi > done > > if [ ! -x ${ALPINE_HOME}i386 ] > then > ln -s ${ALPINE_HOME}x86 ${ALPINE_HOME}i386 > fi > > if [ ! -x ${ALPINE_HOME}arm ] > then > ln -s ${ALPINE_HOME}armv7 ${ALPINE_HOME}arm > fi > > if [ ! -x ${ALPINE_HOME}ppc64 ] > then > ln -s ${ALPINE_HOME}ppc64le ${ALPINE_HOME}ppc64 > fi > > # Note "virtual" kernels don't have numa enabled > run() > { > NUMA=$1 > ARCH=$2 > ARGS=$3 > CONSOLE=$4 > #echo \ > ${QEMUHOME}/${ARCH}-softmmu/qemu-system-${ARCH} \ > ${ARGS} \ > -kernel ${ALPINE_HOME}${ARCH}/boot/vmlinuz-vanilla \ > -initrd ${ALPINE_HOME}${ARCH}/boot/initramfs-vanilla \ > -append "${CONSOLE} ${ALPINE_NET}" \ > -m 2G \ > ${NUMA} > echo > } > > # This ends up as odd: > # ends up with both CPUs are on same node > # as 0 and 8 % 2 (nodes) are the same > # in short - don't run legacy with gaps with > # odd numa node numbers (like 0 and 8). 
> run_legacy() > { > run "-smp 2,cores=3,sockets=2,maxcpus=6 \ > -numa node,mem=1G \ > -numa node,mem=1G,nodeid=8 \ > -numa dist,src=0,dst=8,val=21" "$@" > } > > run_memdev_implicit_core() > { > run "-smp cpus=6,maxcpus=8,cores=4,sockets=2 \ > -object memory-backend-ram,id=ram0,size=1G \ > -object memory-backend-ram,id=ram1,size=1G \ > -numa node,memdev=ram0,nodeid=0 \ > -numa node,memdev=ram1,nodeid=8 \ > -numa dist,src=0,dst=8,val=21" "$@" > } > > run_memdev_explicit_core() > { > run "-smp cpus=6,maxcpus=8,cores=4,sockets=2 \ > -object memory-backend-ram,id=ram0,size=1G \ > -object memory-backend-ram,id=ram1,size=1G \ > -numa node,memdev=ram0,cpus=0-3,nodeid=0 \ > -numa node,memdev=ram1,cpus=4-7,nodeid=8 \ > -numa dist,src=0,dst=8,val=21" "$@" > } > > # hmat isn't added until kernel-5.2-rc1 and requires > # CONFIG_ACPI_HMAT > run_hmat_lb() > { > run "-smp 2,sockets=2 \ > -m 128M,slots=2,maxmem=1G \ > -kernel ${HOME}/repos/linux/vmlinux \ > -object memory-backend-ram,size=64M,id=m0 \ > -object memory-backend-ram,size=64M,id=m1 \ > -numa node,nodeid=3,memdev=m0 \ > -numa node,nodeid=4,memdev=m1,initiator=3 \ > -numa cpu,node-id=3,socket-id=0 \ > -numa cpu,node-id=3,socket-id=1 \ > -numa > hmat-lb,initiator=3,target=3,hierarchy=memory,data-type=access-latency,base-lat=1000,latency=5 > \ > -numa > hmat-lb,initiator=3,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5 > \ > -numa > hmat-lb,initiator=3,target=4,hierarchy=memory,data-type=access-latency,base-lat=1,latency=15 > \ > -numa > hmat-lb,initiator=3,target=4,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10 > \ > -numa > hmat-cache,node-id=3,size=0x20000,total=1,level=1,assoc=direct,policy=write-back,line=8 > \ > -numa > hmat-cache,node-id=4,size=0x20000,total=1,level=1,assoc=direct,policy=write-back,line=8" > "$@" > } > > > for arch in x86_64 ppc64 aarch64 s390x; do killall qemu-system-$arch; done > killall vncviewer > > # i386 Alpine kernels don't have NUMA > 
#run_memdev_implicit_core i386 "-machine pc -nographic" console=ttyS0 > # armv7 kernel's don't have NUMA > #run_legacy arm "-machine virt -cpu cortex-a15 -nographic" console=ttyAMA0 > > # GOOD > run_legacy x86_64 "-machine pc -nographic" console=ttyS0 > run_memdev_implicit_core x86_64 "-machine pc -nographic" console=ttyS0 > run_memdev_explicit_core x86_64 "-machine pc -nographic" console=ttyS0 > > run_hmat_lb x86_64 "-machine pc -nographic" console=ttyS0 > > # GOOD > run_legacy aarch64 "-machine virt -cpu cortex-a57 -nographic" console=ttyAMA0 > run_memdev_implicit_core aarch64 "-machine virt -cpu cortex-a57 -nographic" > console=ttyAMA0 > run_memdev_explicit_core aarch64 "-machine virt -cpu cortex-a57 -nographic" > console=ttyAMA0 > > # PPC not doing numa distance (not a regression) > (sleep 1; vncviewer :0) & > > # GOOD > run_legacy ppc64 "-machine pseries -cpu POWER9 -display vnc=:0" "numa=debug" > run_memdev_implicit_core ppc64 "-machine pseries -cpu POWER9 -display > vnc=:0" "numa=debug" > run_memdev_explicit_core ppc64 "-machine pseries -cpu POWER9 -display > vnc=:0" "numa=debug" > > # ON P8 ppc64le host: > # run_memdev_implicit_core ppc64 "-machine pseries -cpu host -accel kvm > -display vnc=:0" "numa=debug" > > # Couldn't boot Alpine ARM kernel on this machine type: > # arm sbsa ref - appears to be a BMC so not really a numa target? > # seems ok looking at the results of sbsa_ref_get_default_cpu_node_id however > it display no > # output when booting > > # run_legacy aarch64 "-machine sbsa-ref -nographic" console=ttyAMA0 > > # Then run: > # sh -c 'apk add numactl-tools && numactl --hardware' > # > # alternately examine results in: > # ls -la /sys/devices/system/node/node*/cpu* > # more /sys/devices/system/node/node*/distance > # > # x86 node numbers are renumbered by kernel. 
To view > # acpi mappings: > # dmesg | egrep -A 2 '(HMAT|SRAT|PXM):' > --- > hw/acpi/aml-build.c | 31 ++++++--- > hw/acpi/hmat.c | 14 +++-- > hw/arm/boot.c | 3 +- > hw/arm/sbsa-ref.c | 6 +- > hw/arm/virt-acpi-build.c | 3 +- > hw/arm/virt.c | 6 +- > hw/core/machine.c | 40 +++++++++--- > hw/core/numa.c | 132 +++++++++++++++++++-------------------- > hw/i386/acpi-build.c | 12 ++-- > hw/i386/pc.c | 2 +- > hw/ppc/spapr.c | 12 ++-- > include/sysemu/numa.h | 2 + > 12 files changed, 154 insertions(+), 109 deletions(-) > > diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c > index 26ccc1a3e2..512c76e3dd 100644 > --- a/hw/acpi/aml-build.c > +++ b/hw/acpi/aml-build.c > @@ -1728,19 +1728,34 @@ void build_srat_memory(AcpiSratMemoryAffinity > *numamem, uint64_t base, > */ > void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms) > { > - int slit_start, i, j; > + int slit_start, i, j, src, dst, largest; > slit_start = table_data->len; > int nb_numa_nodes = ms->numa_state->num_nodes; > > acpi_data_push(table_data, sizeof(AcpiTableHeader)); > > - build_append_int_noprefix(table_data, nb_numa_nodes, 8); > - for (i = 0; i < nb_numa_nodes; i++) { > - for (j = 0; j < nb_numa_nodes; j++) { > - assert(ms->numa_state->nodes[i].distance[j]); > - build_append_int_noprefix(table_data, > - ms->numa_state->nodes[i].distance[j], > - 1); > + for (largest = 0, i = 0; i < nb_numa_nodes; i++) > + if (largest < ms->numa_state->nodes[i].nodeid) { > + largest = ms->numa_state->nodes[i].nodeid; > + } > + > + /* number of entries is largest + 1 as nodes start at 0 */ > + build_append_int_noprefix(table_data, largest + 1, 8); > + > + for (i = 0; i <= largest; i++) { > + src = find_numa(i, ms->numa_state); > + for (j = 0; j <= largest; j++) { > + dst = find_numa(j, ms->numa_state); > + > + if (dst == MAX_NODES || src == MAX_NODES) { > + /* 255 is unreachable. Linux expects 10 in self-maps entries > */ > + build_append_int_noprefix(table_data, > + i == j ? 
NUMA_DISTANCE_MIN : 255, > 1); > + } else { > + assert(ms->numa_state->nodes[src].distance[dst]); > + build_append_int_noprefix(table_data, > + ms->numa_state->nodes[src].distance[dst], 1); > + } > } > } > > diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c > index 01a6552d51..0042be48d2 100644 > --- a/hw/acpi/hmat.c > +++ b/hw/acpi/hmat.c > @@ -73,7 +73,8 @@ static void build_hmat_mpda(GArray *table_data, uint16_t > flags, int initiator, > */ > static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, > uint32_t num_initiator, uint32_t num_target, > - uint32_t *initiator_pxm, int type) > + uint32_t *initiator_pxm, int type, > + NumaState *numa_state) > { > uint32_t s = num_initiator; > uint32_t t = num_target; > @@ -114,12 +115,12 @@ static void build_hmat_lb(GArray *table_data, > HMAT_LB_Info *hmat_lb, > > /* Target Proximity Domain List */ > for (i = 0; i < t; i++) { > - build_append_int_noprefix(table_data, i, 4); > + build_append_int_noprefix(table_data, numa_state->nodes[i].nodeid, > 4); > } > > /* Latency or Bandwidth Entries */ > for (i = 0; i < s; i++) { > - m = initiator_pxm[i]; > + m = find_numa(initiator_pxm[i], numa_state); > for (n = 0; n < t; n++) { > uint16_t entry; > > @@ -199,12 +200,13 @@ static void hmat_build_table_structs(GArray > *table_data, NumaState *nstat) > flags |= HMAT_PROX_INIT_VALID; > } > > - build_hmat_mpda(table_data, flags, nstat->nodes[i].initiator, i); > + build_hmat_mpda(table_data, flags, nstat->nodes[i].initiator, > + nstat->nodes[i].nodeid); > } > > for (i = 0; i < nstat->num_nodes; i++) { > if (nstat->nodes[i].has_cpu) { > - initiator_pxm[num_initiator++] = i; > + initiator_pxm[num_initiator++] = nstat->nodes[i].nodeid; > } > } > > @@ -220,7 +222,7 @@ static void hmat_build_table_structs(GArray *table_data, > NumaState *nstat) > > if (numa_hmat_lb) { > build_hmat_lb(table_data, numa_hmat_lb, num_initiator, > - nstat->num_nodes, initiator_pxm, type); > + nstat->num_nodes, initiator_pxm, type, nstat); > } > } > } > 
diff --git a/hw/arm/boot.c b/hw/arm/boot.c > index 6472aa441e..1d92001930 100644 > --- a/hw/arm/boot.c > +++ b/hw/arm/boot.c > @@ -603,7 +603,8 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info > *binfo, > for (i = 0; i < ms->numa_state->num_nodes; i++) { > mem_len = ms->numa_state->nodes[i].node_mem; > rc = fdt_add_memory_node(fdt, acells, mem_base, > - scells, mem_len, i); > + scells, mem_len, > + ms->numa_state->nodes[i].nodeid); > if (rc < 0) { > fprintf(stderr, "couldn't add /memory@%"PRIx64" node\n", > mem_base); > diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c > index 3a243e6a53..f2c3a6fefa 100644 > --- a/hw/arm/sbsa-ref.c > +++ b/hw/arm/sbsa-ref.c > @@ -166,8 +166,8 @@ static void create_fdt(SBSAMachineState *sms) > for (i = 0; i < nb_numa_nodes; i++) { > for (j = 0; j < nb_numa_nodes; j++) { > idx = (i * nb_numa_nodes + j) * 3; > - matrix[idx + 0] = cpu_to_be32(i); > - matrix[idx + 1] = cpu_to_be32(j); > + matrix[idx + 0] = > cpu_to_be32(ms->numa_state->nodes[i].nodeid); > + matrix[idx + 1] = > cpu_to_be32(ms->numa_state->nodes[j].nodeid); > matrix[idx + 2] = > cpu_to_be32(ms->numa_state->nodes[i].distance[j]); > } > @@ -762,7 +762,7 @@ sbsa_ref_cpu_index_to_props(MachineState *ms, unsigned > cpu_index) > static int64_t > sbsa_ref_get_default_cpu_node_id(const MachineState *ms, int idx) > { > - return idx % ms->numa_state->num_nodes; > + return ms->numa_state->nodes[idx % ms->numa_state->num_nodes].nodeid; > } > > static void sbsa_ref_instance_init(Object *obj) > diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c > index 89899ec4c1..0384339867 100644 > --- a/hw/arm/virt-acpi-build.c > +++ b/hw/arm/virt-acpi-build.c > @@ -537,7 +537,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > if (ms->numa_state->nodes[i].node_mem > 0) { > numamem = acpi_data_push(table_data, sizeof(*numamem)); > build_srat_memory(numamem, mem_base, > - ms->numa_state->nodes[i].node_mem, i, > + 
ms->numa_state->nodes[i].node_mem, > + ms->numa_state->nodes[i].nodeid, > MEM_AFFINITY_ENABLED); > mem_base += ms->numa_state->nodes[i].node_mem; > } > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 46f39e20bc..1a2db6447f 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -240,8 +240,8 @@ static void create_fdt(VirtMachineState *vms) > for (i = 0; i < nb_numa_nodes; i++) { > for (j = 0; j < nb_numa_nodes; j++) { > idx = (i * nb_numa_nodes + j) * 3; > - matrix[idx + 0] = cpu_to_be32(i); > - matrix[idx + 1] = cpu_to_be32(j); > + matrix[idx + 0] = > cpu_to_be32(ms->numa_state->nodes[i].nodeid); > + matrix[idx + 1] = > cpu_to_be32(ms->numa_state->nodes[j].nodeid); > matrix[idx + 2] = > cpu_to_be32(ms->numa_state->nodes[i].distance[j]); > } > @@ -1845,7 +1845,7 @@ virt_cpu_index_to_props(MachineState *ms, unsigned > cpu_index) > > static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) > { > - return idx % ms->numa_state->num_nodes; > + return ms->numa_state->nodes[idx % ms->numa_state->num_nodes].nodeid; > } > > static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) > diff --git a/hw/core/machine.c b/hw/core/machine.c > index b36d9a1ec8..faf6e05d84 100644 > --- a/hw/core/machine.c > +++ b/hw/core/machine.c > @@ -643,11 +643,19 @@ void machine_set_cpu_numa_node(MachineState *machine, > NodeInfo *numa_info = machine->numa_state->nodes; > bool match = false; > int i; > + int node_id = find_numa(props->node_id, machine->numa_state); > > if (!mc->possible_cpu_arch_ids) { > error_setg(errp, "mapping of CPUs to NUMA node is not supported"); > return; > } > + if (node_id == MAX_NODES) { > + if (props->has_node_id) { > + node_id = props->node_id; > + } else { > + node_id = machine->numa_state->num_nodes; > + } > + } > > /* disabling node mapping is not supported, forbid it */ > assert(props->has_node_id); > @@ -711,15 +719,15 @@ void machine_set_cpu_numa_node(MachineState *machine, > slot->props.node_id = props->node_id; > 
slot->props.has_node_id = props->has_node_id; > > - if (numa_info[props->node_id].initiator_valid && > - (props->node_id != numa_info[props->node_id].initiator)) { > + if (numa_info[node_id].initiator_valid && > + (props->node_id != numa_info[node_id].initiator)) { > error_setg(errp, "The initiator of CPU NUMA node %" PRId64 > " should be itself.", props->node_id); > return; > } > - numa_info[props->node_id].initiator_valid = true; > - numa_info[props->node_id].has_cpu = true; > - numa_info[props->node_id].initiator = props->node_id; > + numa_info[node_id].initiator_valid = true; > + numa_info[node_id].has_cpu = true; > + numa_info[node_id].initiator = props->node_id; > } > > if (!match) { > @@ -1097,14 +1105,28 @@ static void machine_numa_finish_cpu_init(MachineState > *machine) > } > > for (i = 0; i < machine->numa_state->num_nodes; i++) { > - if (numa_info[i].initiator_valid && > - !numa_info[numa_info[i].initiator].has_cpu) { > - error_report("The initiator-id %"PRIu16 " of NUMA node %d" > - " does not exist.", numa_info[i].initiator, i); > + int node_id; > + if (!numa_info[i].initiator_valid) { > + continue; > + } > + node_id = find_numa(numa_info[i].initiator, machine->numa_state); > + if (node_id == MAX_NODES) { > + error_report("The NUMA node %" PRIu16 " initiator node (id %" > PRIu16 > + ") does not exist", numa_info[i].nodeid, > + numa_info[i].initiator); > + error_printf("\n"); > + > + exit(1); > + } > + if (!numa_info[node_id].has_cpu) { > + error_report("The NUMA node %" PRIu16 " initiator node (id %" > PRIu16 > + ") has no cpus", numa_info[i].nodeid, > + numa_info[i].initiator); > error_printf("\n"); > > exit(1); > } > + > } > > if (s->len && !qtest_enabled()) { > diff --git a/hw/core/numa.c b/hw/core/numa.c > index 75db35ac19..50a156f39f 100644 > --- a/hw/core/numa.c > +++ b/hw/core/numa.c > @@ -48,9 +48,19 @@ QemuOptsList qemu_numa_opts = { > > static int have_memdevs; > static int have_mem; > -static int max_numa_nodeid; /* Highest specified NUMA 
node ID, plus one. > - * For all nodes, nodeid < max_numa_nodeid > - */ > + > +int find_numa(uint16_t node, NumaState *numa_state) > +{ > + NodeInfo *numa_info = numa_state->nodes; > + int nb_numa_nodes = numa_state->num_nodes; > + > + for (int i = 0; i < nb_numa_nodes; i++) { > + if (numa_info[i].present && numa_info[i].nodeid == node) { > + return i; > + } > + } > + return MAX_NODES; > +} > > static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, > Error **errp) > @@ -61,20 +71,18 @@ static void parse_numa_node(MachineState *ms, > NumaNodeOptions *node, > MachineClass *mc = MACHINE_GET_CLASS(ms); > unsigned int max_cpus = ms->smp.max_cpus; > NodeInfo *numa_info = ms->numa_state->nodes; > + int nb_numa_nodes = ms->numa_state->num_nodes; > > - if (node->has_nodeid) { > - nodenr = node->nodeid; > - } else { > - nodenr = ms->numa_state->num_nodes; > - } > + nodenr = ms->numa_state->num_nodes; > > - if (nodenr >= MAX_NODES) { > + if (nb_numa_nodes >= MAX_NODES) { > error_setg(errp, "Max number of NUMA nodes reached: %" > PRIu16 "", nodenr); > return; > } > > - if (numa_info[nodenr].present) { > + if (node->has_nodeid && > + find_numa(node->nodeid, ms->numa_state) != MAX_NODES) { > error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr); > return; > } > @@ -93,7 +101,7 @@ static void parse_numa_node(MachineState *ms, > NumaNodeOptions *node, > return; > } > props = mc->cpu_index_to_instance_props(ms, cpus->value); > - props.node_id = nodenr; > + props.node_id = node->has_nodeid ? node->nodeid : nodenr; > props.has_node_id = true; > machine_set_cpu_numa_node(ms, &props, &err); > if (err) { > @@ -143,26 +151,26 @@ static void parse_numa_node(MachineState *ms, > NumaNodeOptions *node, > numa_info[nodenr].initiator = node->initiator; > } > numa_info[nodenr].present = true; > - max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); > + numa_info[nodenr].nodeid = node->has_nodeid ? 
node->nodeid : > nb_numa_nodes; > ms->numa_state->num_nodes++; > } > > static > void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error > **errp) > { > - uint16_t src = dist->src; > - uint16_t dst = dist->dst; > + int src = find_numa(dist->src, ms->numa_state); > + int dst = find_numa(dist->dst, ms->numa_state); > uint8_t val = dist->val; > NodeInfo *numa_info = ms->numa_state->nodes; > > - if (src >= MAX_NODES || dst >= MAX_NODES) { > - error_setg(errp, "Parameter '%s' expects an integer between 0 and > %d", > - src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1); > + if (src >= MAX_NODES || !numa_info[src].present) { > + error_setg(errp, "Source NUMA node is missing. " > + "Please use '-numa node' option to declare it first."); > return; > } > > - if (!numa_info[src].present || !numa_info[dst].present) { > - error_setg(errp, "Source/Destination NUMA node is missing. " > + if (dst >= MAX_NODES || !numa_info[dst].present) { > + error_setg(errp, "Destination NUMA node is missing. 
" > "Please use '-numa node' option to declare it first."); > return; > } > @@ -175,8 +183,8 @@ void parse_numa_distance(MachineState *ms, > NumaDistOptions *dist, Error **errp) > } > > if (src == dst && val != NUMA_DISTANCE_MIN) { > - error_setg(errp, "Local distance of node %d should be %d.", > - src, NUMA_DISTANCE_MIN); > + error_setg(errp, "Local distance of node %" PRIu16 " should be %d.", > + dist->src, NUMA_DISTANCE_MIN); > return; > } > > @@ -187,9 +195,10 @@ void parse_numa_distance(MachineState *ms, > NumaDistOptions *dist, Error **errp) > void parse_numa_hmat_lb(MachineState *ms, NumaHmatLBOptions *node, > Error **errp) > { > - int nb_numa_nodes = ms->numa_state->num_nodes; > NodeInfo *numa_info = ms->numa_state->nodes; > HMAT_LB_Info *hmat_lb = NULL; > + int initiator = find_numa(node->initiator, ms->numa_state); > + int target = find_numa(node->target, ms->numa_state); > > if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { > if (!node->has_latency) { > @@ -225,26 +234,26 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > } > } > > - if (node->initiator >= nb_numa_nodes) { > + if (initiator >= MAX_NODES) { > error_setg(errp, "Invalid initiator=%" > - PRIu16 ", it should be less than %d.", > - node->initiator, nb_numa_nodes); > + PRIu16 ", not found.", > + node->initiator); > return; > } > - if (!numa_info[node->initiator].has_cpu) { > + if (!numa_info[initiator].has_cpu) { > error_setg(errp, "Invalid initiator=%" > PRIu16 ", it isn't an initiator proximity domain.", > node->initiator); > return; > } > > - if (node->target >= nb_numa_nodes) { > + if (target >= MAX_NODES) { > error_setg(errp, "Invalid target=%" > - PRIu16 ", it should be less than %d.", > - node->target, nb_numa_nodes); > + PRIu16 ", not found", > + node->target); > return; > } > - if (!numa_info[node->target].initiator_valid) { > + if (!numa_info[target].initiator_valid) { > error_setg(errp, "Invalid target=%" > PRIu16 ", it hasn't a valid initiator proximity 
domain.", > node->target); > @@ -257,7 +266,7 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > if (!hmat_lb) { > hmat_lb = g_malloc0(sizeof(*hmat_lb)); > ms->numa_state->hmat_lb[node->hierarchy][node->data_type] = > hmat_lb; > - } else if (hmat_lb->latency[node->initiator][node->target]) { > + } else if (hmat_lb->latency[initiator][target]) { > error_setg(errp, "Duplicate configuration of the latency for " > "initiator=%" PRIu16 " and target=%" PRIu16 ".", > node->initiator, node->target); > @@ -269,7 +278,7 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > hmat_lb->base_lat = node->base_lat; > } > > - hmat_lb->latency[node->initiator][node->target] = node->latency; > + hmat_lb->latency[initiator][target] = node->latency; > } > > if (node->has_bandwidth) { > @@ -278,7 +287,7 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > if (!hmat_lb) { > hmat_lb = g_malloc0(sizeof(*hmat_lb)); > ms->numa_state->hmat_lb[node->hierarchy][node->data_type] = > hmat_lb; > - } else if (hmat_lb->bandwidth[node->initiator][node->target]) { > + } else if (hmat_lb->bandwidth[initiator][target]) { > error_setg(errp, "Duplicate configuration of the bandwidth for " > "initiator=%" PRIu16 " and target=%" PRIu16 ".", > node->initiator, node->target); > @@ -295,7 +304,7 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > } > } > > - hmat_lb->bandwidth[node->initiator][node->target] = node->bandwidth; > + hmat_lb->bandwidth[initiator][target] = node->bandwidth; > } > > if (hmat_lb) { > @@ -307,13 +316,13 @@ void parse_numa_hmat_lb(MachineState *ms, > NumaHmatLBOptions *node, > void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, > Error **errp) > { > - int nb_numa_nodes = ms->numa_state->num_nodes; > HMAT_Cache_Info *hmat_cache = NULL; > + int node_id = find_numa(node->node_id, ms->numa_state); > > - if (node->node_id >= nb_numa_nodes) { > + if (node_id >= MAX_NODES) { > 
error_setg(errp, "Invalid node-id=%" PRIu32 > - ", it should be less than %d.", > - node->node_id, nb_numa_nodes); > + ", not found.", > + node->node_id); > return; > } > > @@ -330,7 +339,7 @@ void parse_numa_hmat_cache(MachineState *ms, > NumaHmatCacheOptions *node, > node->level, node->total); > return; > } > - if (ms->numa_state->hmat_cache[node->node_id][node->level]) { > + if (ms->numa_state->hmat_cache[node_id][node->level]) { > error_setg(errp, "Duplicate configuration of the side cache for " > "node-id=%" PRIu32 " and level=%" PRIu8 ".", > node->node_id, node->level); > @@ -338,15 +347,15 @@ void parse_numa_hmat_cache(MachineState *ms, > NumaHmatCacheOptions *node, > } > > if ((node->level > 1) && > - ms->numa_state->hmat_cache[node->node_id][node->level - 1] && > + ms->numa_state->hmat_cache[node_id][node->level - 1] && > (node->size >= > - ms->numa_state->hmat_cache[node->node_id][node->level - > 1]->size)) { > + ms->numa_state->hmat_cache[node_id][node->level - 1]->size)) { > error_setg(errp, "Invalid size=0x%" PRIx64 > ", the size of level=%" PRIu8 > " should be less than the size(0x%" PRIx64 > ") of level=%" PRIu8 ".", > node->size, node->level, > - ms->numa_state->hmat_cache[node->node_id] > + ms->numa_state->hmat_cache[node_id] > [node->level - 1]->size, > node->level - 1); > return; > @@ -362,7 +371,7 @@ void parse_numa_hmat_cache(MachineState *ms, > NumaHmatCacheOptions *node, > hmat_cache->write_policy = node->policy; > hmat_cache->line_size = node->line; > > - ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; > + ms->numa_state->hmat_cache[node_id][node->level] = hmat_cache; > } > > void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp) > @@ -393,7 +402,7 @@ void set_numa_options(MachineState *ms, NumaOptions > *object, Error **errp) > error_setg(&err, "Missing mandatory node-id property"); > goto end; > } > - if (!ms->numa_state->nodes[object->u.cpu.node_id].present) { > + if 
(find_numa(object->u.cpu.node_id, ms->numa_state) == MAX_NODES) { > error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be > " > "defined with -numa node,nodeid=ID before it's used with " > "-numa cpu,node-id=ID", object->u.cpu.node_id); > @@ -472,10 +481,11 @@ static void validate_numa_distance(MachineState *ms) > if (numa_info[src].distance[dst] == 0 && > numa_info[dst].distance[src] == 0) { > if (src != dst) { > - error_report("The distance between node %d and %d is " > - "missing, at least one distance value " > - "between each nodes should be provided.", > - src, dst); > + error_report("The distance between node %" PRIu16 > + " and %" PRIu16 " is missing, at least one " > + "distance value between each nodes should > be " > + "provided.", > + numa_info[src].nodeid, > numa_info[dst].nodeid); > exit(EXIT_FAILURE); > } > } > @@ -493,9 +503,11 @@ static void validate_numa_distance(MachineState *ms) > for (src = 0; src < nb_numa_nodes; src++) { > for (dst = 0; dst < nb_numa_nodes; dst++) { > if (src != dst && numa_info[src].distance[dst] == 0) { > - error_report("At least one asymmetrical pair of " > - "distances is given, please provide distances " > - "for both directions of all node pairs."); > + error_report("At least one asymmetrical pair (%" PRIu16 > + ", %" PRIu16 ") of distances is given, please " > + "provide distances for both directions of all > node " > + "pairs.", > + numa_info[src].nodeid, numa_info[dst].nodeid); > exit(EXIT_FAILURE); > } > } > @@ -587,27 +599,11 @@ void numa_complete_configuration(MachineState *ms) > parse_numa_node(ms, &node, &error_abort); > } > > - assert(max_numa_nodeid <= MAX_NODES); > - > - /* No support for sparse NUMA node IDs yet: */ > - for (i = max_numa_nodeid - 1; i >= 0; i--) { > - /* Report large node IDs first, to make mistakes easier to spot */ > - if (!numa_info[i].present) { > - error_report("numa: Node ID missing: %d", i); > - exit(1); > - } > - } > - > - /* This must be always true if all nodes are 
present: */ > - assert(ms->numa_state->num_nodes == max_numa_nodeid); > + assert(ms->numa_state->num_nodes <= MAX_NODES); > > if (ms->numa_state->num_nodes > 0) { > uint64_t numa_total; > > - if (ms->numa_state->num_nodes > MAX_NODES) { > - ms->numa_state->num_nodes = MAX_NODES; > - } > - > /* If no memory size is given for any node, assume the default case > * and distribute the available memory equally across all nodes > */ > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > index 90ad0dff99..f4a906c72e 100644 > --- a/hw/i386/acpi-build.c > +++ b/hw/i386/acpi-build.c > @@ -2361,6 +2361,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, > MachineState *machine) > numa_start = table_data->len; > > for (i = 1; i < pcms->numa_nodes + 1; ++i) { > + int nodeid = machine->numa_state->nodes[i - 1].nodeid; > mem_base = next_base; > mem_len = pcms->node_mem[i - 1]; > next_base = mem_base + mem_len; > @@ -2371,7 +2372,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, > MachineState *machine) > mem_len -= next_base - HOLE_640K_START; > if (mem_len > 0) { > numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, mem_base, mem_len, i - 1, > + build_srat_memory(numamem, mem_base, mem_len, nodeid, > MEM_AFFINITY_ENABLED); > } > > @@ -2390,7 +2391,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, > MachineState *machine) > mem_len -= next_base - pcms->below_4g_mem_size; > if (mem_len > 0) { > numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, mem_base, mem_len, i - 1, > + build_srat_memory(numamem, mem_base, mem_len, nodeid, > MEM_AFFINITY_ENABLED); > } > mem_base = 1ULL << 32; > @@ -2400,7 +2401,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, > MachineState *machine) > > if (mem_len > 0) { > numamem = acpi_data_push(table_data, sizeof *numamem); > - build_srat_memory(numamem, mem_base, mem_len, i - 1, > + build_srat_memory(numamem, mem_base, mem_len, nodeid, > 
MEM_AFFINITY_ENABLED); > } > } > @@ -2421,8 +2422,9 @@ build_srat(GArray *table_data, BIOSLinker *linker, > MachineState *machine) > if (hotplugabble_address_space_size) { > numamem = acpi_data_push(table_data, sizeof *numamem); > build_srat_memory(numamem, machine->device_memory->base, > - hotplugabble_address_space_size, pcms->numa_nodes > - 1, > - MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); > + hotplugabble_address_space_size, > + machine->numa_state->nodes[pcms->numa_nodes - 1].nodeid, > + MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); > } > > build_header(linker, table_data, > diff --git a/hw/i386/pc.c b/hw/i386/pc.c > index c3f5a70a56..5b8db454b7 100644 > --- a/hw/i386/pc.c > +++ b/hw/i386/pc.c > @@ -2850,7 +2850,7 @@ static int64_t pc_get_default_cpu_node_id(const > MachineState *ms, int idx) > x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id, > pcms->smp_dies, ms->smp.cores, > ms->smp.threads, &topo); > - return topo.pkg_id % ms->numa_state->num_nodes; > + return ms->numa_state->nodes[topo.pkg_id % > ms->numa_state->num_nodes].nodeid; > } > > static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms) > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index f607ca567b..ef4802698c 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -424,7 +424,8 @@ static int spapr_populate_memory(SpaprMachineState > *spapr, void *fdt) > if (!mem_start) { > /* spapr_machine_init() checks for rma_size <= node0_size > * already */ > - spapr_populate_memory_node(fdt, i, 0, spapr->rma_size); > + spapr_populate_memory_node(fdt, nodes[i].nodeid, 0, > + spapr->rma_size); > mem_start += spapr->rma_size; > node_size -= spapr->rma_size; > } > @@ -436,7 +437,8 @@ static int spapr_populate_memory(SpaprMachineState > *spapr, void *fdt) > sizetmp = 1ULL << ctzl(mem_start); > } > > - spapr_populate_memory_node(fdt, i, mem_start, sizetmp); > + spapr_populate_memory_node(fdt, nodes[i].nodeid, mem_start, > + sizetmp); > node_size -= sizetmp; > mem_start 
+= sizetmp; > } > @@ -2543,7 +2545,8 @@ static void spapr_validate_node_memory(MachineState > *machine, Error **errp) > error_setg(errp, > "Node %d memory size 0x%" PRIx64 > " is not aligned to %" PRIu64 " MiB", > - i, machine->numa_state->nodes[i].node_mem, > + machine->numa_state->nodes[i].nodeid, > + machine->numa_state->nodes[i].node_mem, > SPAPR_MEMORY_BLOCK_SIZE / MiB); > return; > } > @@ -4140,7 +4143,8 @@ spapr_cpu_index_to_props(MachineState *machine, > unsigned cpu_index) > > static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx) > { > - return idx / ms->smp.cores % ms->numa_state->num_nodes; > + return ms->numa_state->nodes[ > + idx / ms->smp.cores % ms->numa_state->num_nodes].nodeid; > } > > static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState > *machine) > diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h > index 9009bbdee3..7474f2c5b6 100644 > --- a/include/sysemu/numa.h > +++ b/include/sysemu/numa.h > @@ -13,6 +13,7 @@ struct NodeInfo { > bool has_cpu; > bool initiator_valid; > uint16_t initiator; > + uint16_t nodeid; > uint8_t distance[MAX_NODES]; > }; > > @@ -39,6 +40,7 @@ struct NumaState { > }; > typedef struct NumaState NumaState; > > +int find_numa(uint16_t node, NumaState *numa_state); > void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp); > void parse_numa_opts(MachineState *ms); > void parse_numa_hmat_lb(MachineState *ms, NumaHmatLBOptions *node, -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
signature.asc
Description: PGP signature