Re: [Qemu-devel] [PATCH v2] numa: numa nodeid need not be sequential

David Gibson Wed, 07 Aug 2019 23:30:45 -0700

On Tue, Aug 06, 2019 at 09:29:45PM +1000, Daniel Black wrote:
> Replace all node_id assumptions with lookups from 
> machinestate->numa_state->nodes[]
> and remove aspects that assume a sequential numbering of nodes. This enables
> non-sequential NUMA node number topologies to be created.
> 
> Default assignments of CPU->nodeid (get_default_cpu_node_id) now return
> a nodeid from the machinestate->numa_state->nodes[].
> 
> x86 will use the node id as the Proximity Domain (which the
> Linux kernel will map down to sequential node numbers). Both HMAT and
> SLIT ACPI data are entered based on this nodeid. In Linux kernel
> output look at the SRAT/HMAT: and PXM: references in the kernel early boot.
> 
> Small enhancements were made to error messages to be more explicit
> about errors in node specification.
> 
> CC: Tao Xu <tao3...@intel.com>
> CC: Liu Jingqi <jingqi....@intel.com>
> Signed-off-by: Daniel Black <dan...@linux.ibm.com>


I have no real opinion on whether this is a good idea overall.  But,
if we go for it then the ppc parts are

Acked-by: David Gibson <da...@gibson.dropbear.id.au>

> 
> ---
> Based-on: 20190614155626.27932-1-tao3...@intel.com
> ([PATCH RESEND v8 00/11] Build ACPI Heterogeneous Memory Attribute Table
> (HMAT))
> 
> Test script:
> 
> #!/bin/bash
> set -x -v
> 
> QEMUHOME=${HOME}/repos/qemu/
> # optional but make it easy to install/run numactl --hardware
> #ALPINE_NET=""
> ALPINE_NET="ip=dhcp 
> alpine_repo=http://dl-cdn.alpinelinux.org/alpine/edge/main/";
> 
> ALPINE_HOME=${HOME}/repos/alpine/alpine-netboot-3.10.1-
> 
> # x86 / armv7 - no CONFIG_NUMA=y support in kernel
> # Kernel configs: https://git.alpinelinux.org/aports/tree/main/linux-vanilla/
> # s390x - no numa support in QEMU
> for ARCH in x86_64 aarch64 ppc64le
> do
>  if [ ! -d ${ALPINE_HOME}${ARCH} ]
>  then
>    mkdir ${ALPINE_HOME}${ARCH}
>    wget 
> http://dl-cdn.alpinelinux.org/alpine/v3.10/releases/${ARCH}/alpine-netboot-3.10.1-${ARCH}.tar.gz
>  -O - | tar -zxf - -C ${ALPINE_HOME}${ARCH}
>  fi
> done
> 
> if [ ! -x ${ALPINE_HOME}i386 ]
> then
>   ln -s ${ALPINE_HOME}x86 ${ALPINE_HOME}i386
> fi
> 
> if [ ! -x ${ALPINE_HOME}arm ]
> then
>   ln -s ${ALPINE_HOME}armv7 ${ALPINE_HOME}arm
> fi
> 
> if [ ! -x ${ALPINE_HOME}ppc64 ]
> then
>   ln -s ${ALPINE_HOME}ppc64le ${ALPINE_HOME}ppc64
> fi
> 
> # Note "virtual" kernels don't have numa enabled
> run()
> {
>   NUMA=$1
>   ARCH=$2
>   ARGS=$3
>   CONSOLE=$4
>   #echo \
>   ${QEMUHOME}/${ARCH}-softmmu/qemu-system-${ARCH} \
>     ${ARGS} \
>     -kernel ${ALPINE_HOME}${ARCH}/boot/vmlinuz-vanilla  \
>     -initrd ${ALPINE_HOME}${ARCH}/boot/initramfs-vanilla \
>     -append "${CONSOLE} ${ALPINE_NET}" \
>     -m 2G \
>     ${NUMA}
>   echo
> }
> 
> # This ends up odd:
> # both CPUs end up on the same node
> # as 0 and 8 % 2 (nodes) are the same
> # in short - don't run legacy with gaps with
> # odd numa node numbers (like 0 and 8).
> run_legacy()
> {
>   run "-smp 2,cores=3,sockets=2,maxcpus=6 \
>        -numa node,mem=1G \
>        -numa node,mem=1G,nodeid=8 \
>        -numa dist,src=0,dst=8,val=21" "$@"
> }
> 
> run_memdev_implicit_core()
> {
>   run "-smp cpus=6,maxcpus=8,cores=4,sockets=2 \
>        -object memory-backend-ram,id=ram0,size=1G \
>        -object memory-backend-ram,id=ram1,size=1G \
>        -numa node,memdev=ram0,nodeid=0 \
>        -numa node,memdev=ram1,nodeid=8 \
>        -numa dist,src=0,dst=8,val=21" "$@"
> }
> 
> run_memdev_explicit_core()
> {
>   run "-smp cpus=6,maxcpus=8,cores=4,sockets=2 \
>        -object memory-backend-ram,id=ram0,size=1G \
>        -object memory-backend-ram,id=ram1,size=1G \
>        -numa node,memdev=ram0,cpus=0-3,nodeid=0 \
>        -numa node,memdev=ram1,cpus=4-7,nodeid=8 \
>        -numa dist,src=0,dst=8,val=21" "$@"
> }
> 
> # hmat isn't added until kernel-5.2-rc1 and requires
> # CONFIG_ACPI_HMAT
> run_hmat_lb()
> {
>   run "-smp 2,sockets=2 \
>        -m 128M,slots=2,maxmem=1G \
>        -kernel ${HOME}/repos/linux/vmlinux \
>        -object memory-backend-ram,size=64M,id=m0 \
>        -object memory-backend-ram,size=64M,id=m1 \
>        -numa node,nodeid=3,memdev=m0 \
>        -numa node,nodeid=4,memdev=m1,initiator=3 \
>        -numa cpu,node-id=3,socket-id=0 \
>        -numa cpu,node-id=3,socket-id=1 \
>        -numa 
> hmat-lb,initiator=3,target=3,hierarchy=memory,data-type=access-latency,base-lat=1000,latency=5
>  \
>        -numa 
> hmat-lb,initiator=3,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
>  \
>        -numa 
> hmat-lb,initiator=3,target=4,hierarchy=memory,data-type=access-latency,base-lat=1,latency=15
>  \
>        -numa 
> hmat-lb,initiator=3,target=4,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
>  \
>        -numa 
> hmat-cache,node-id=3,size=0x20000,total=1,level=1,assoc=direct,policy=write-back,line=8
>  \
>        -numa 
> hmat-cache,node-id=4,size=0x20000,total=1,level=1,assoc=direct,policy=write-back,line=8"
>  "$@"
> }
> 
> 
> for arch in x86_64 ppc64 aarch64 s390x; do killall qemu-system-$arch; done
> killall vncviewer
> 
> # i386 Alpine kernels don't have NUMA
> #run_memdev_implicit_core i386 "-machine pc -nographic" console=ttyS0
> # armv7 kernels don't have NUMA
> #run_legacy arm "-machine virt -cpu cortex-a15 -nographic" console=ttyAMA0
> 
> # GOOD
> run_legacy x86_64 "-machine pc -nographic" console=ttyS0
> run_memdev_implicit_core x86_64 "-machine pc -nographic" console=ttyS0
> run_memdev_explicit_core x86_64 "-machine pc -nographic" console=ttyS0
> 
> run_hmat_lb x86_64 "-machine pc -nographic" console=ttyS0
> 
> # GOOD
> run_legacy aarch64 "-machine virt -cpu cortex-a57 -nographic" console=ttyAMA0
> run_memdev_implicit_core  aarch64 "-machine virt -cpu cortex-a57 -nographic" 
> console=ttyAMA0
> run_memdev_explicit_core  aarch64 "-machine virt -cpu cortex-a57 -nographic" 
> console=ttyAMA0
> 
> # PPC not doing numa distance (not a regression)
> (sleep 1; vncviewer :0) &
> 
> # GOOD
>  run_legacy ppc64 "-machine pseries -cpu POWER9 -display vnc=:0" "numa=debug"
>  run_memdev_implicit_core ppc64 "-machine pseries -cpu POWER9 -display 
> vnc=:0" "numa=debug"
>  run_memdev_explicit_core ppc64 "-machine pseries -cpu POWER9 -display 
> vnc=:0" "numa=debug"
> 
> # ON P8 ppc64le host:
> # run_memdev_implicit_core ppc64 "-machine pseries -cpu host -accel kvm 
> -display vnc=:0" "numa=debug"
> 
> # Couldn't boot Alpine ARM kernel on this machine type:
> # arm sbsa ref - appears to be a BMC so not really a numa target?
> # seems ok looking at the results of sbsa_ref_get_default_cpu_node_id however 
> it display no
> # output when booting
> 
> # run_legacy aarch64 "-machine sbsa-ref -nographic" console=ttyAMA0
> 
> # Then run:
> # sh -c 'apk add numactl-tools && numactl --hardware'
> #
> # alternately examine results in:
> # ls -la /sys/devices/system/node/node*/cpu*
> # more /sys/devices/system/node/node*/distance
> #
> # x86 node numbers are renumbered by kernel. To view
> # acpi mappings:
> # dmesg | egrep -A 2 '(HMAT|SRAT|PXM):'
> ---
>  hw/acpi/aml-build.c      |  31 ++++++---
>  hw/acpi/hmat.c           |  14 +++--
>  hw/arm/boot.c            |   3 +-
>  hw/arm/sbsa-ref.c        |   6 +-
>  hw/arm/virt-acpi-build.c |   3 +-
>  hw/arm/virt.c            |   6 +-
>  hw/core/machine.c        |  40 +++++++++---
>  hw/core/numa.c           | 132 +++++++++++++++++++--------------------
>  hw/i386/acpi-build.c     |  12 ++--
>  hw/i386/pc.c             |   2 +-
>  hw/ppc/spapr.c           |  12 ++--
>  include/sysemu/numa.h    |   2 +
>  12 files changed, 154 insertions(+), 109 deletions(-)
> 
> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
> index 26ccc1a3e2..512c76e3dd 100644
> --- a/hw/acpi/aml-build.c
> +++ b/hw/acpi/aml-build.c
> @@ -1728,19 +1728,34 @@ void build_srat_memory(AcpiSratMemoryAffinity 
> *numamem, uint64_t base,
>   */
>  void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms)
>  {
> -    int slit_start, i, j;
> +    int slit_start, i, j, src, dst, largest;
>      slit_start = table_data->len;
>      int nb_numa_nodes = ms->numa_state->num_nodes;
>  
>      acpi_data_push(table_data, sizeof(AcpiTableHeader));
>  
> -    build_append_int_noprefix(table_data, nb_numa_nodes, 8);
> -    for (i = 0; i < nb_numa_nodes; i++) {
> -        for (j = 0; j < nb_numa_nodes; j++) {
> -            assert(ms->numa_state->nodes[i].distance[j]);
> -            build_append_int_noprefix(table_data,
> -                                      ms->numa_state->nodes[i].distance[j],
> -                                      1);
> +    for (largest = 0, i = 0; i < nb_numa_nodes; i++)
> +        if (largest < ms->numa_state->nodes[i].nodeid) {
> +            largest = ms->numa_state->nodes[i].nodeid;
> +        }
> +
> +    /* number of entries is largest + 1 as nodes start at 0 */
> +    build_append_int_noprefix(table_data, largest + 1, 8);
> +
> +    for (i = 0; i <= largest; i++) {
> +        src = find_numa(i, ms->numa_state);
> +        for (j = 0; j <= largest; j++) {
> +            dst = find_numa(j, ms->numa_state);
> +
> +            if (dst == MAX_NODES || src == MAX_NODES) {
> +                /* 255 is unreachable. Linux expects 10 in self-maps entries 
> */
> +                build_append_int_noprefix(table_data,
> +                                          i == j ? NUMA_DISTANCE_MIN : 255, 
> 1);
> +            } else {
> +                assert(ms->numa_state->nodes[src].distance[dst]);
> +                build_append_int_noprefix(table_data,
> +                    ms->numa_state->nodes[src].distance[dst], 1);
> +            }
>          }
>      }
>  
> diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c
> index 01a6552d51..0042be48d2 100644
> --- a/hw/acpi/hmat.c
> +++ b/hw/acpi/hmat.c
> @@ -73,7 +73,8 @@ static void build_hmat_mpda(GArray *table_data, uint16_t 
> flags, int initiator,
>   */
>  static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb,
>                            uint32_t num_initiator, uint32_t num_target,
> -                          uint32_t *initiator_pxm, int type)
> +                          uint32_t *initiator_pxm, int type,
> +                          NumaState *numa_state)
>  {
>      uint32_t s = num_initiator;
>      uint32_t t = num_target;
> @@ -114,12 +115,12 @@ static void build_hmat_lb(GArray *table_data, 
> HMAT_LB_Info *hmat_lb,
>  
>      /* Target Proximity Domain List */
>      for (i = 0; i < t; i++) {
> -        build_append_int_noprefix(table_data, i, 4);
> +        build_append_int_noprefix(table_data, numa_state->nodes[i].nodeid, 
> 4);
>      }
>  
>      /* Latency or Bandwidth Entries */
>      for (i = 0; i < s; i++) {
> -        m = initiator_pxm[i];
> +        m = find_numa(initiator_pxm[i], numa_state);
>          for (n = 0; n < t; n++) {
>              uint16_t entry;
>  
> @@ -199,12 +200,13 @@ static void hmat_build_table_structs(GArray 
> *table_data, NumaState *nstat)
>              flags |= HMAT_PROX_INIT_VALID;
>          }
>  
> -        build_hmat_mpda(table_data, flags, nstat->nodes[i].initiator, i);
> +        build_hmat_mpda(table_data, flags, nstat->nodes[i].initiator,
> +                        nstat->nodes[i].nodeid);
>      }
>  
>      for (i = 0; i < nstat->num_nodes; i++) {
>          if (nstat->nodes[i].has_cpu) {
> -            initiator_pxm[num_initiator++] = i;
> +            initiator_pxm[num_initiator++] = nstat->nodes[i].nodeid;
>          }
>      }
>  
> @@ -220,7 +222,7 @@ static void hmat_build_table_structs(GArray *table_data, 
> NumaState *nstat)
>  
>              if (numa_hmat_lb) {
>                  build_hmat_lb(table_data, numa_hmat_lb, num_initiator,
> -                              nstat->num_nodes, initiator_pxm, type);
> +                              nstat->num_nodes, initiator_pxm, type, nstat);
>              }
>          }
>      }
> diff --git a/hw/arm/boot.c b/hw/arm/boot.c
> index 6472aa441e..1d92001930 100644
> --- a/hw/arm/boot.c
> +++ b/hw/arm/boot.c
> @@ -603,7 +603,8 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info 
> *binfo,
>          for (i = 0; i < ms->numa_state->num_nodes; i++) {
>              mem_len = ms->numa_state->nodes[i].node_mem;
>              rc = fdt_add_memory_node(fdt, acells, mem_base,
> -                                     scells, mem_len, i);
> +                                     scells, mem_len,
> +                                     ms->numa_state->nodes[i].nodeid);
>              if (rc < 0) {
>                  fprintf(stderr, "couldn't add /memory@%"PRIx64" node\n",
>                          mem_base);
> diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
> index 3a243e6a53..f2c3a6fefa 100644
> --- a/hw/arm/sbsa-ref.c
> +++ b/hw/arm/sbsa-ref.c
> @@ -166,8 +166,8 @@ static void create_fdt(SBSAMachineState *sms)
>          for (i = 0; i < nb_numa_nodes; i++) {
>              for (j = 0; j < nb_numa_nodes; j++) {
>                  idx = (i * nb_numa_nodes + j) * 3;
> -                matrix[idx + 0] = cpu_to_be32(i);
> -                matrix[idx + 1] = cpu_to_be32(j);
> +                matrix[idx + 0] = 
> cpu_to_be32(ms->numa_state->nodes[i].nodeid);
> +                matrix[idx + 1] = 
> cpu_to_be32(ms->numa_state->nodes[j].nodeid);
>                  matrix[idx + 2] =
>                      cpu_to_be32(ms->numa_state->nodes[i].distance[j]);
>              }
> @@ -762,7 +762,7 @@ sbsa_ref_cpu_index_to_props(MachineState *ms, unsigned 
> cpu_index)
>  static int64_t
>  sbsa_ref_get_default_cpu_node_id(const MachineState *ms, int idx)
>  {
> -    return idx % ms->numa_state->num_nodes;
> +    return ms->numa_state->nodes[idx % ms->numa_state->num_nodes].nodeid;
>  }
>  
>  static void sbsa_ref_instance_init(Object *obj)
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index 89899ec4c1..0384339867 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -537,7 +537,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
> VirtMachineState *vms)
>          if (ms->numa_state->nodes[i].node_mem > 0) {
>              numamem = acpi_data_push(table_data, sizeof(*numamem));
>              build_srat_memory(numamem, mem_base,
> -                              ms->numa_state->nodes[i].node_mem, i,
> +                              ms->numa_state->nodes[i].node_mem,
> +                              ms->numa_state->nodes[i].nodeid,
>                                MEM_AFFINITY_ENABLED);
>              mem_base += ms->numa_state->nodes[i].node_mem;
>          }
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 46f39e20bc..1a2db6447f 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -240,8 +240,8 @@ static void create_fdt(VirtMachineState *vms)
>          for (i = 0; i < nb_numa_nodes; i++) {
>              for (j = 0; j < nb_numa_nodes; j++) {
>                  idx = (i * nb_numa_nodes + j) * 3;
> -                matrix[idx + 0] = cpu_to_be32(i);
> -                matrix[idx + 1] = cpu_to_be32(j);
> +                matrix[idx + 0] = 
> cpu_to_be32(ms->numa_state->nodes[i].nodeid);
> +                matrix[idx + 1] = 
> cpu_to_be32(ms->numa_state->nodes[j].nodeid);
>                  matrix[idx + 2] =
>                      cpu_to_be32(ms->numa_state->nodes[i].distance[j]);
>              }
> @@ -1845,7 +1845,7 @@ virt_cpu_index_to_props(MachineState *ms, unsigned 
> cpu_index)
>  
>  static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx)
>  {
> -    return idx % ms->numa_state->num_nodes;
> +    return ms->numa_state->nodes[idx % ms->numa_state->num_nodes].nodeid;
>  }
>  
>  static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms)
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index b36d9a1ec8..faf6e05d84 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -643,11 +643,19 @@ void machine_set_cpu_numa_node(MachineState *machine,
>      NodeInfo *numa_info = machine->numa_state->nodes;
>      bool match = false;
>      int i;
> +    int node_id = find_numa(props->node_id, machine->numa_state);
>  
>      if (!mc->possible_cpu_arch_ids) {
>          error_setg(errp, "mapping of CPUs to NUMA node is not supported");
>          return;
>      }
> +    if (node_id == MAX_NODES) {
> +        if (props->has_node_id) {
> +            node_id = props->node_id;
> +        } else {
> +            node_id = machine->numa_state->num_nodes;
> +        }
> +    }
>  
>      /* disabling node mapping is not supported, forbid it */
>      assert(props->has_node_id);
> @@ -711,15 +719,15 @@ void machine_set_cpu_numa_node(MachineState *machine,
>          slot->props.node_id = props->node_id;
>          slot->props.has_node_id = props->has_node_id;
>  
> -        if (numa_info[props->node_id].initiator_valid &&
> -            (props->node_id != numa_info[props->node_id].initiator)) {
> +        if (numa_info[node_id].initiator_valid &&
> +            (props->node_id != numa_info[node_id].initiator)) {
>              error_setg(errp, "The initiator of CPU NUMA node %" PRId64
>                         " should be itself.", props->node_id);
>              return;
>          }
> -        numa_info[props->node_id].initiator_valid = true;
> -        numa_info[props->node_id].has_cpu = true;
> -        numa_info[props->node_id].initiator = props->node_id;
> +        numa_info[node_id].initiator_valid = true;
> +        numa_info[node_id].has_cpu = true;
> +        numa_info[node_id].initiator = props->node_id;
>      }
>  
>      if (!match) {
> @@ -1097,14 +1105,28 @@ static void machine_numa_finish_cpu_init(MachineState 
> *machine)
>      }
>  
>      for (i = 0; i < machine->numa_state->num_nodes; i++) {
> -        if (numa_info[i].initiator_valid &&
> -            !numa_info[numa_info[i].initiator].has_cpu) {
> -            error_report("The initiator-id %"PRIu16 " of NUMA node %d"
> -                         " does not exist.", numa_info[i].initiator, i);
> +        int node_id;
> +        if (!numa_info[i].initiator_valid) {
> +            continue;
> +        }
> +        node_id = find_numa(numa_info[i].initiator, machine->numa_state);
> +        if (node_id == MAX_NODES) {
> +            error_report("The NUMA node %" PRIu16 " initiator node (id %" 
> PRIu16
> +                         ") does not exist", numa_info[i].nodeid,
> +                         numa_info[i].initiator);
> +            error_printf("\n");
> +
> +            exit(1);
> +        }
> +        if (!numa_info[node_id].has_cpu) {
> +            error_report("The NUMA node %" PRIu16 " initiator node (id %" 
> PRIu16
> +                         ") has no cpus", numa_info[i].nodeid,
> +                         numa_info[i].initiator);
>              error_printf("\n");
>  
>              exit(1);
>          }
> +
>      }
>  
>      if (s->len && !qtest_enabled()) {
> diff --git a/hw/core/numa.c b/hw/core/numa.c
> index 75db35ac19..50a156f39f 100644
> --- a/hw/core/numa.c
> +++ b/hw/core/numa.c
> @@ -48,9 +48,19 @@ QemuOptsList qemu_numa_opts = {
>  
>  static int have_memdevs;
>  static int have_mem;
> -static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
> -                             * For all nodes, nodeid < max_numa_nodeid
> -                             */
> +
> +int find_numa(uint16_t node, NumaState *numa_state)
> +{
> +    NodeInfo *numa_info = numa_state->nodes;
> +    int nb_numa_nodes = numa_state->num_nodes;
> +
> +    for (int i = 0; i < nb_numa_nodes; i++) {
> +        if (numa_info[i].present && numa_info[i].nodeid == node) {
> +            return i;
> +        }
> +    }
> +    return MAX_NODES;
> +}
>  
>  static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
>                              Error **errp)
> @@ -61,20 +71,18 @@ static void parse_numa_node(MachineState *ms, 
> NumaNodeOptions *node,
>      MachineClass *mc = MACHINE_GET_CLASS(ms);
>      unsigned int max_cpus = ms->smp.max_cpus;
>      NodeInfo *numa_info = ms->numa_state->nodes;
> +    int nb_numa_nodes = ms->numa_state->num_nodes;
>  
> -    if (node->has_nodeid) {
> -        nodenr = node->nodeid;
> -    } else {
> -        nodenr = ms->numa_state->num_nodes;
> -    }
> +    nodenr = ms->numa_state->num_nodes;
>  
> -    if (nodenr >= MAX_NODES) {
> +    if (nb_numa_nodes >= MAX_NODES) {
>          error_setg(errp, "Max number of NUMA nodes reached: %"
>                     PRIu16 "", nodenr);
>          return;
>      }
>  
> -    if (numa_info[nodenr].present) {
> +    if (node->has_nodeid &&
> +        find_numa(node->nodeid, ms->numa_state) != MAX_NODES) {
>          error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
>          return;
>      }
> @@ -93,7 +101,7 @@ static void parse_numa_node(MachineState *ms, 
> NumaNodeOptions *node,
>              return;
>          }
>          props = mc->cpu_index_to_instance_props(ms, cpus->value);
> -        props.node_id = nodenr;
> +        props.node_id = node->has_nodeid ? node->nodeid : nodenr;
>          props.has_node_id = true;
>          machine_set_cpu_numa_node(ms, &props, &err);
>          if (err) {
> @@ -143,26 +151,26 @@ static void parse_numa_node(MachineState *ms, 
> NumaNodeOptions *node,
>          numa_info[nodenr].initiator = node->initiator;
>      }
>      numa_info[nodenr].present = true;
> -    max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
> +    numa_info[nodenr].nodeid = node->has_nodeid ? node->nodeid : 
> nb_numa_nodes;
>      ms->numa_state->num_nodes++;
>  }
>  
>  static
>  void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error 
> **errp)
>  {
> -    uint16_t src = dist->src;
> -    uint16_t dst = dist->dst;
> +    int src = find_numa(dist->src, ms->numa_state);
> +    int dst = find_numa(dist->dst, ms->numa_state);
>      uint8_t val = dist->val;
>      NodeInfo *numa_info = ms->numa_state->nodes;
>  
> -    if (src >= MAX_NODES || dst >= MAX_NODES) {
> -        error_setg(errp, "Parameter '%s' expects an integer between 0 and 
> %d",
> -                   src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1);
> +    if (src >= MAX_NODES || !numa_info[src].present) {
> +        error_setg(errp, "Source NUMA node is missing. "
> +                   "Please use '-numa node' option to declare it first.");
>          return;
>      }
>  
> -    if (!numa_info[src].present || !numa_info[dst].present) {
> -        error_setg(errp, "Source/Destination NUMA node is missing. "
> +    if (dst >= MAX_NODES || !numa_info[dst].present) {
> +        error_setg(errp, "Destination NUMA node is missing. "
>                     "Please use '-numa node' option to declare it first.");
>          return;
>      }
> @@ -175,8 +183,8 @@ void parse_numa_distance(MachineState *ms, 
> NumaDistOptions *dist, Error **errp)
>      }
>  
>      if (src == dst && val != NUMA_DISTANCE_MIN) {
> -        error_setg(errp, "Local distance of node %d should be %d.",
> -                   src, NUMA_DISTANCE_MIN);
> +        error_setg(errp, "Local distance of node %" PRIu16 " should be %d.",
> +                   dist->src, NUMA_DISTANCE_MIN);
>          return;
>      }
>  
> @@ -187,9 +195,10 @@ void parse_numa_distance(MachineState *ms, 
> NumaDistOptions *dist, Error **errp)
>  void parse_numa_hmat_lb(MachineState *ms, NumaHmatLBOptions *node,
>                          Error **errp)
>  {
> -    int nb_numa_nodes = ms->numa_state->num_nodes;
>      NodeInfo *numa_info = ms->numa_state->nodes;
>      HMAT_LB_Info *hmat_lb = NULL;
> +    int initiator = find_numa(node->initiator, ms->numa_state);
> +    int target = find_numa(node->target, ms->numa_state);
>  
>      if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
>          if (!node->has_latency) {
> @@ -225,26 +234,26 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>          }
>      }
>  
> -    if (node->initiator >= nb_numa_nodes) {
> +    if (initiator >= MAX_NODES) {
>          error_setg(errp, "Invalid initiator=%"
> -                   PRIu16 ", it should be less than %d.",
> -                   node->initiator, nb_numa_nodes);
> +                   PRIu16 ", not found.",
> +                   node->initiator);
>          return;
>      }
> -    if (!numa_info[node->initiator].has_cpu) {
> +    if (!numa_info[initiator].has_cpu) {
>          error_setg(errp, "Invalid initiator=%"
>                     PRIu16 ", it isn't an initiator proximity domain.",
>                     node->initiator);
>          return;
>      }
>  
> -    if (node->target >= nb_numa_nodes) {
> +    if (target >= MAX_NODES) {
>          error_setg(errp, "Invalid target=%"
> -                   PRIu16 ", it should be less than %d.",
> -                   node->target, nb_numa_nodes);
> +                   PRIu16 ", not found",
> +                   node->target);
>          return;
>      }
> -    if (!numa_info[node->target].initiator_valid) {
> +    if (!numa_info[target].initiator_valid) {
>          error_setg(errp, "Invalid target=%"
>                     PRIu16 ", it hasn't a valid initiator proximity domain.",
>                     node->target);
> @@ -257,7 +266,7 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>          if (!hmat_lb) {
>              hmat_lb = g_malloc0(sizeof(*hmat_lb));
>              ms->numa_state->hmat_lb[node->hierarchy][node->data_type] = 
> hmat_lb;
> -        } else if (hmat_lb->latency[node->initiator][node->target]) {
> +        } else if (hmat_lb->latency[initiator][target]) {
>              error_setg(errp, "Duplicate configuration of the latency for "
>                         "initiator=%" PRIu16 " and target=%" PRIu16 ".",
>                         node->initiator, node->target);
> @@ -269,7 +278,7 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>              hmat_lb->base_lat = node->base_lat;
>          }
>  
> -        hmat_lb->latency[node->initiator][node->target] = node->latency;
> +        hmat_lb->latency[initiator][target] = node->latency;
>      }
>  
>      if (node->has_bandwidth) {
> @@ -278,7 +287,7 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>          if (!hmat_lb) {
>              hmat_lb = g_malloc0(sizeof(*hmat_lb));
>              ms->numa_state->hmat_lb[node->hierarchy][node->data_type] = 
> hmat_lb;
> -        } else if (hmat_lb->bandwidth[node->initiator][node->target]) {
> +        } else if (hmat_lb->bandwidth[initiator][target]) {
>              error_setg(errp, "Duplicate configuration of the bandwidth for "
>                         "initiator=%" PRIu16 " and target=%" PRIu16 ".",
>                         node->initiator, node->target);
> @@ -295,7 +304,7 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>              }
>          }
>  
> -        hmat_lb->bandwidth[node->initiator][node->target] = node->bandwidth;
> +        hmat_lb->bandwidth[initiator][target] = node->bandwidth;
>      }
>  
>      if (hmat_lb) {
> @@ -307,13 +316,13 @@ void parse_numa_hmat_lb(MachineState *ms, 
> NumaHmatLBOptions *node,
>  void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
>                             Error **errp)
>  {
> -    int nb_numa_nodes = ms->numa_state->num_nodes;
>      HMAT_Cache_Info *hmat_cache = NULL;
> +    int node_id = find_numa(node->node_id, ms->numa_state);
>  
> -    if (node->node_id >= nb_numa_nodes) {
> +    if (node_id >= MAX_NODES) {
>          error_setg(errp, "Invalid node-id=%" PRIu32
> -                   ", it should be less than %d.",
> -                   node->node_id, nb_numa_nodes);
> +                   ", not found.",
> +                   node->node_id);
>          return;
>      }
>  
> @@ -330,7 +339,7 @@ void parse_numa_hmat_cache(MachineState *ms, 
> NumaHmatCacheOptions *node,
>                     node->level, node->total);
>          return;
>      }
> -    if (ms->numa_state->hmat_cache[node->node_id][node->level]) {
> +    if (ms->numa_state->hmat_cache[node_id][node->level]) {
>          error_setg(errp, "Duplicate configuration of the side cache for "
>                     "node-id=%" PRIu32 " and level=%" PRIu8 ".",
>                     node->node_id, node->level);
> @@ -338,15 +347,15 @@ void parse_numa_hmat_cache(MachineState *ms, 
> NumaHmatCacheOptions *node,
>      }
>  
>      if ((node->level > 1) &&
> -        ms->numa_state->hmat_cache[node->node_id][node->level - 1] &&
> +        ms->numa_state->hmat_cache[node_id][node->level - 1] &&
>          (node->size >=
> -            ms->numa_state->hmat_cache[node->node_id][node->level - 
> 1]->size)) {
> +            ms->numa_state->hmat_cache[node_id][node->level - 1]->size)) {
>          error_setg(errp, "Invalid size=0x%" PRIx64
>                     ", the size of level=%" PRIu8
>                     " should be less than the size(0x%" PRIx64
>                     ") of level=%" PRIu8 ".",
>                     node->size, node->level,
> -                   ms->numa_state->hmat_cache[node->node_id]
> +                   ms->numa_state->hmat_cache[node_id]
>                                               [node->level - 1]->size,
>                     node->level - 1);
>          return;
> @@ -362,7 +371,7 @@ void parse_numa_hmat_cache(MachineState *ms, 
> NumaHmatCacheOptions *node,
>      hmat_cache->write_policy = node->policy;
>      hmat_cache->line_size = node->line;
>  
> -    ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache;
> +    ms->numa_state->hmat_cache[node_id][node->level] = hmat_cache;
>  }
>  
>  void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
> @@ -393,7 +402,7 @@ void set_numa_options(MachineState *ms, NumaOptions 
> *object, Error **errp)
>              error_setg(&err, "Missing mandatory node-id property");
>              goto end;
>          }
> -        if (!ms->numa_state->nodes[object->u.cpu.node_id].present) {
> +        if (find_numa(object->u.cpu.node_id, ms->numa_state) == MAX_NODES) {
>              error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be 
> "
>                  "defined with -numa node,nodeid=ID before it's used with "
>                  "-numa cpu,node-id=ID", object->u.cpu.node_id);
> @@ -472,10 +481,11 @@ static void validate_numa_distance(MachineState *ms)
>              if (numa_info[src].distance[dst] == 0 &&
>                  numa_info[dst].distance[src] == 0) {
>                  if (src != dst) {
> -                    error_report("The distance between node %d and %d is "
> -                                 "missing, at least one distance value "
> -                                 "between each nodes should be provided.",
> -                                 src, dst);
> +                    error_report("The distance between node %" PRIu16
> +                                 " and %" PRIu16 " is missing, at least one "
> +                                 "distance value between each nodes should 
> be "
> +                                 "provided.",
> +                                 numa_info[src].nodeid, 
> numa_info[dst].nodeid);
>                      exit(EXIT_FAILURE);
>                  }
>              }
> @@ -493,9 +503,11 @@ static void validate_numa_distance(MachineState *ms)
>          for (src = 0; src < nb_numa_nodes; src++) {
>              for (dst = 0; dst < nb_numa_nodes; dst++) {
>                  if (src != dst && numa_info[src].distance[dst] == 0) {
> -                    error_report("At least one asymmetrical pair of "
> -                            "distances is given, please provide distances "
> -                            "for both directions of all node pairs.");
> +                    error_report("At least one asymmetrical pair (%" PRIu16
> +                            ", %" PRIu16 ") of distances is given, please "
> +                            "provide distances for both directions of all node "
> +                            "pairs.",
> +                            numa_info[src].nodeid, numa_info[dst].nodeid);
>                      exit(EXIT_FAILURE);
>                  }
>              }
> @@ -587,27 +599,11 @@ void numa_complete_configuration(MachineState *ms)
>              parse_numa_node(ms, &node, &error_abort);
>      }
>  
> -    assert(max_numa_nodeid <= MAX_NODES);
> -
> -    /* No support for sparse NUMA node IDs yet: */
> -    for (i = max_numa_nodeid - 1; i >= 0; i--) {
> -        /* Report large node IDs first, to make mistakes easier to spot */
> -        if (!numa_info[i].present) {
> -            error_report("numa: Node ID missing: %d", i);
> -            exit(1);
> -        }
> -    }
> -
> -    /* This must be always true if all nodes are present: */
> -    assert(ms->numa_state->num_nodes == max_numa_nodeid);
> +    assert(ms->numa_state->num_nodes <= MAX_NODES);
>  
>      if (ms->numa_state->num_nodes > 0) {
>          uint64_t numa_total;
>  
> -        if (ms->numa_state->num_nodes > MAX_NODES) {
> -            ms->numa_state->num_nodes = MAX_NODES;
> -        }
> -
>          /* If no memory size is given for any node, assume the default case
>           * and distribute the available memory equally across all nodes
>           */
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 90ad0dff99..f4a906c72e 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -2361,6 +2361,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>      numa_start = table_data->len;
>  
>      for (i = 1; i < pcms->numa_nodes + 1; ++i) {
> +        int nodeid = machine->numa_state->nodes[i - 1].nodeid;
>          mem_base = next_base;
>          mem_len = pcms->node_mem[i - 1];
>          next_base = mem_base + mem_len;
> @@ -2371,7 +2372,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>              mem_len -= next_base - HOLE_640K_START;
>              if (mem_len > 0) {
>                  numamem = acpi_data_push(table_data, sizeof *numamem);
> -                build_srat_memory(numamem, mem_base, mem_len, i - 1,
> +                build_srat_memory(numamem, mem_base, mem_len, nodeid,
>                                    MEM_AFFINITY_ENABLED);
>              }
>  
> @@ -2390,7 +2391,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>              mem_len -= next_base - pcms->below_4g_mem_size;
>              if (mem_len > 0) {
>                  numamem = acpi_data_push(table_data, sizeof *numamem);
> -                build_srat_memory(numamem, mem_base, mem_len, i - 1,
> +                build_srat_memory(numamem, mem_base, mem_len, nodeid,
>                                    MEM_AFFINITY_ENABLED);
>              }
>              mem_base = 1ULL << 32;
> @@ -2400,7 +2401,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>  
>          if (mem_len > 0) {
>              numamem = acpi_data_push(table_data, sizeof *numamem);
> -            build_srat_memory(numamem, mem_base, mem_len, i - 1,
> +            build_srat_memory(numamem, mem_base, mem_len, nodeid,
>                                MEM_AFFINITY_ENABLED);
>          }
>      }
> @@ -2421,8 +2422,9 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
>      if (hotplugabble_address_space_size) {
>          numamem = acpi_data_push(table_data, sizeof *numamem);
>          build_srat_memory(numamem, machine->device_memory->base,
> -                          hotplugabble_address_space_size, pcms->numa_nodes - 1,
> -                          MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
> +            hotplugabble_address_space_size,
> +            machine->numa_state->nodes[pcms->numa_nodes - 1].nodeid,
> +            MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
>      }
>  
>      build_header(linker, table_data,
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index c3f5a70a56..5b8db454b7 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -2850,7 +2850,7 @@ static int64_t pc_get_default_cpu_node_id(const MachineState *ms, int idx)
>     x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id,
>                              pcms->smp_dies, ms->smp.cores,
>                              ms->smp.threads, &topo);
> -   return topo.pkg_id % ms->numa_state->num_nodes;
> +   return ms->numa_state->nodes[topo.pkg_id % ms->numa_state->num_nodes].nodeid;
>  }
>  
>  static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms)
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index f607ca567b..ef4802698c 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -424,7 +424,8 @@ static int spapr_populate_memory(SpaprMachineState *spapr, void *fdt)
>          if (!mem_start) {
>              /* spapr_machine_init() checks for rma_size <= node0_size
>               * already */
> -            spapr_populate_memory_node(fdt, i, 0, spapr->rma_size);
> +            spapr_populate_memory_node(fdt, nodes[i].nodeid, 0,
> +                                       spapr->rma_size);
>              mem_start += spapr->rma_size;
>              node_size -= spapr->rma_size;
>          }
> @@ -436,7 +437,8 @@ static int spapr_populate_memory(SpaprMachineState *spapr, void *fdt)
>                  sizetmp = 1ULL << ctzl(mem_start);
>              }
>  
> -            spapr_populate_memory_node(fdt, i, mem_start, sizetmp);
> +            spapr_populate_memory_node(fdt, nodes[i].nodeid, mem_start,
> +                                       sizetmp);
>              node_size -= sizetmp;
>              mem_start += sizetmp;
>          }
> @@ -2543,7 +2545,8 @@ static void spapr_validate_node_memory(MachineState *machine, Error **errp)
>              error_setg(errp,
>                         "Node %d memory size 0x%" PRIx64
>                         " is not aligned to %" PRIu64 " MiB",
> -                       i, machine->numa_state->nodes[i].node_mem,
> +                       machine->numa_state->nodes[i].nodeid,
> +                       machine->numa_state->nodes[i].node_mem,
>                         SPAPR_MEMORY_BLOCK_SIZE / MiB);
>              return;
>          }
> @@ -4140,7 +4143,8 @@ spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
>  
>  static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
>  {
> -    return idx / ms->smp.cores % ms->numa_state->num_nodes;
> +    return ms->numa_state->nodes[
> +        idx / ms->smp.cores % ms->numa_state->num_nodes].nodeid;
>  }
>  
>  static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
> diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
> index 9009bbdee3..7474f2c5b6 100644
> --- a/include/sysemu/numa.h
> +++ b/include/sysemu/numa.h
> @@ -13,6 +13,7 @@ struct NodeInfo {
>      bool has_cpu;
>      bool initiator_valid;
>      uint16_t initiator;
> +    uint16_t nodeid;
>      uint8_t distance[MAX_NODES];
>  };
>  
> @@ -39,6 +40,7 @@ struct NumaState {
>  };
>  typedef struct NumaState NumaState;
>  
> +int find_numa(uint16_t node, NumaState *numa_state);
>  void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
>  void parse_numa_opts(MachineState *ms);
>  void parse_numa_hmat_lb(MachineState *ms, NumaHmatLBOptions *node,

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v2] numa: numa nodeid need not be sequential

Reply via email to