On Fri, 10 Sep 2021 16:55:36 -0300 Daniel Henrique Barboza <danielhb...@gmail.com> wrote:
> Introducing a new NUMA affinity, FORM2, requires a new mechanism to
> switch between affinity modes after CAS. Also, we want FORM2 data
> structures and functions to be completely separated from the existing
> FORM1 code, allowing us to avoid adding new code that inherits the
> existing complexity of FORM1.
>
> At the same time, it's also desirable to minimize the amount of changes
> made in the write_dt() functions that are used to write ibm,associativity
> of the resources, RTAS artifacts and h_home_node_associativity. These
> functions can work in the same way in both NUMA affinity modes, as long
> as we use a similar data structure and parametrize it properly depending
> on the affinity mode selected.
>
> This patch introduces spapr_numa_associativity_reset() to start this
> process. This function will be used to switch to the chosen NUMA
> affinity after CAS and after migrating the guest. To do that, the
> existing 'numa_assoc_array' is renamed to 'FORM1_assoc_array' and will
> hold FORM1 data that is populated at associativity_init().
> 'numa_assoc_array' is now a pointer that can be switched between the
> existing affinity arrays. We don't have FORM2 data structures yet, so
> 'numa_assoc_array' will always point to 'FORM1_assoc_array'.
>
> We also take the precaution of pointing 'numa_assoc_array' to
> 'FORM1_assoc_array' at associativity_init() time, before CAS, so as not
> to change FORM1 availability for existing guests.
>
> A small change in spapr_numa_write_associativity_dt() is made to reflect
> the fact that 'numa_assoc_array' is now a pointer and we must be
> explicit about the size being written in the DT.
>
> Signed-off-by: Daniel Henrique Barboza <danielhb...@gmail.com>
> ---
>  hw/ppc/spapr.c              | 14 +++++++++++++
>  hw/ppc/spapr_hcall.c        |  7 +++++++
>  hw/ppc/spapr_numa.c         | 42 +++++++++++++++++++++++++++++--------
>  include/hw/ppc/spapr.h      |  3 ++-
>  include/hw/ppc/spapr_numa.h |  1 +
>  5 files changed, 57 insertions(+), 10 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d39fd4e644..5afbb76cab 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1786,6 +1786,20 @@ static int spapr_post_load(void *opaque, int version_id)
>          return err;
>      }
>
> +    /*
> +     * NUMA affinity selection is made at CAS time. There is no reliable
> +     * way of telling whether the guest already went through CAS before
> +     * migration due to how spapr_ov5_cas_needed works: a FORM1 guest can
> +     * be migrated with ov5_cas empty regardless of going through CAS
> +     * first.
> +     *
> +     * One solution is to call numa_associativity_reset(). The downside
> +     * is that a guest migrated before CAS will reset it again when going
> +     * through it, but since it's a lightweight operation it's worth being
> +     * a little redundant to be safe. Also, this isn't a hot path.
> +     */
> +    spapr_numa_associativity_reset(spapr);
> +
>      return err;
>  }
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 0e9a5b2e40..82ab92ddba 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -17,6 +17,7 @@
>  #include "kvm_ppc.h"
>  #include "hw/ppc/fdt.h"
>  #include "hw/ppc/spapr_ovec.h"
> +#include "hw/ppc/spapr_numa.h"
>  #include "mmu-book3s-v3.h"
>  #include "hw/mem/memory-device.h"
>
> @@ -1197,6 +1198,12 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>      spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00);
>      spapr_ovec_cleanup(ov1_guest);
>
> +    /*
> +     * Reset numa_assoc_array now that we know which NUMA affinity
> +     * the guest will use.
> +     */
> +    spapr_numa_associativity_reset(spapr);
> +
>      /*
>       * Ensure the guest asks for an interrupt mode we support;
>       * otherwise terminate the boot.
> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
> index fb6059550f..327952ba9e 100644
> --- a/hw/ppc/spapr_numa.c
> +++ b/hw/ppc/spapr_numa.c
> @@ -97,7 +97,7 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr)
>       */
>      for (i = 1; i < nb_numa_nodes; i++) {
>          for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
> -            spapr->numa_assoc_array[i][j] = cpu_to_be32(i);
> +            spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i);
>          }
>      }
>
> @@ -149,8 +149,8 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr)
>               * and going up to 0x1.
>               */
>              for (i = n_level; i > 0; i--) {
> -                assoc_src = spapr->numa_assoc_array[src][i];
> -                spapr->numa_assoc_array[dst][i] = assoc_src;
> +                assoc_src = spapr->FORM1_assoc_array[src][i];
> +                spapr->FORM1_assoc_array[dst][i] = assoc_src;
>              }
>          }
>      }
> @@ -167,6 +167,11 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>      int nb_numa_nodes = machine->numa_state->num_nodes;
>      int i, j, max_nodes_with_gpus;
>
> +    /* init FORM1_assoc_array */
> +    for (i = 0; i < MAX_NODES + NVGPU_MAX_NUM; i++) {
> +        spapr->FORM1_assoc_array[i] = g_new0(uint32_t, NUMA_ASSOC_SIZE);

Why dynamic allocation, since you have a fixed size?

> +    }
> +
>      /*
>       * For all associativity arrays: first position is the size,
>       * position MAX_DISTANCE_REF_POINTS is always the numa_id,
> @@ -177,8 +182,8 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>       * 'i' will be a valid node_id set by the user.
>       */
>      for (i = 0; i < nb_numa_nodes; i++) {
> -        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> -        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
> +        spapr->FORM1_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> +        spapr->FORM1_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
>      }
>
>      /*
> @@ -192,15 +197,15 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>      max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;
>
>      for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
> -        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> +        spapr->FORM1_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
>
>          for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
>              uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
>                                   SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
> -            spapr->numa_assoc_array[i][j] = gpu_assoc;
> +            spapr->FORM1_assoc_array[i][j] = gpu_assoc;
>          }
>
> -        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
> +        spapr->FORM1_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
>      }
>
>      /*
> @@ -227,14 +232,33 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,
>                                     MachineState *machine)
>  {
>      spapr_numa_FORM1_affinity_init(spapr, machine);
> +
> +    /*
> +     * Default to FORM1 affinity until CAS. We'll call affinity_reset()
> +     * during CAS when we're sure about which NUMA affinity the guest
> +     * is going to use.
> +     *
> +     * This step is a failsafe - guests in the wild were able to read
> +     * FORM1 affinity info before CAS for a long time. Since affinity_reset()
> +     * is just a pointer switch between data that was already populated
> +     * here, this is an acceptable overhead to be on the safer side.
> +     */
> +    spapr->numa_assoc_array = spapr->FORM1_assoc_array;

The right way to do that is to call spapr_numa_associativity_reset() from
spapr_machine_reset(), because we want to revert to FORM1 each time the
guest is rebooted.
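Something along these lines (untested sketch; the exact spot inside
spapr_machine_reset() and the surrounding context lines are approximate):

    @@ static void spapr_machine_reset(MachineState *machine)
         SpaprMachineState *spapr = SPAPR_MACHINE(machine);
         ...
    +    /* Affinity is re-negotiated at CAS, so default back to FORM1 */
    +    spapr_numa_associativity_reset(spapr);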
> +}
> +
> +void spapr_numa_associativity_reset(SpaprMachineState *spapr)
> +{
> +    /* No FORM2 affinity implemented yet */

This seems quite obvious at this point, not sure the comment helps.

> +    spapr->numa_assoc_array = spapr->FORM1_assoc_array;
> +}
>
>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
>                                         int offset, int nodeid)
>  {
> +    /* Hardcode the size of FORM1 associativity array for now */
>      _FDT((fdt_setprop(fdt, offset, "ibm,associativity",
>                        spapr->numa_assoc_array[nodeid],
> -                      sizeof(spapr->numa_assoc_array[nodeid]))));
> +                      NUMA_ASSOC_SIZE * sizeof(uint32_t))));

Rather than doing this temporary change that gets undone in a later patch,
I suggest you introduce get_numa_assoc_size() in a preliminary patch and
use it here already:

-                      NUMA_ASSOC_SIZE * sizeof(uint32_t))));
+                      get_numa_assoc_size(spapr) * sizeof(uint32_t))));

This will simplify the review.

>  }
>
>  static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr,
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 637652ad16..8a9490f0bf 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -249,7 +249,8 @@ struct SpaprMachineState {
>      unsigned gpu_numa_id;
>      SpaprTpmProxy *tpm_proxy;
>
> -    uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];
> +    uint32_t *FORM1_assoc_array[MAX_NODES + NVGPU_MAX_NUM];

As said above, I really don't see the point in not having:

    uint32_t FORM1_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];

> +    uint32_t **numa_assoc_array;
>
>      Error *fwnmi_migration_blocker;
>  };
> diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h
> index 6f9f02d3de..ccf3e4eae8 100644
> --- a/include/hw/ppc/spapr_numa.h
> +++ b/include/hw/ppc/spapr_numa.h
> @@ -24,6 +24,7 @@
>   */
>  void spapr_numa_associativity_init(SpaprMachineState *spapr,
>                                     MachineState *machine);
> +void spapr_numa_associativity_reset(SpaprMachineState *spapr);
>  void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas);
>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
>                                         int offset, int nodeid);
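FWIW, the preliminary helper I have in mind could be as simple as this
(a sketch only; the name comes from my suggestion above, and the FORM2
case would be added later in your series):

    static int get_numa_assoc_size(SpaprMachineState *spapr)
    {
        /*
         * Only FORM1 exists at this point in the series, so the size is
         * constant. The FORM2 patch would extend this to check which
         * affinity was negotiated at CAS.
         */
        return NUMA_ASSOC_SIZE;
    }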