On Fri, 10 Sep 2021 16:55:36 -0300 Daniel Henrique Barboza <danielhb...@gmail.com> wrote:
> Introducing a new NUMA affinity, FORM2, requires a new mechanism to
> switch between affinity modes after CAS. Also, we want FORM2 data
> structures and functions to be completely separated from the existing
> FORM1 code, allowing us to avoid adding new code that inherits the
> existing complexity of FORM1.
>
> At the same time, it's also desirable to minimize the amount of changes
> made in the write_dt() functions that are used to write ibm,associativity
> of the resources, RTAS artifacts and h_home_node_associativity. These
> functions can work in the same way in both NUMA affinity modes, as long
> as we use a similar data structure and parametrize it properly depending
> on the affinity mode selected.
>
> This patch introduces spapr_numa_associativity_reset() to start this
> process. This function will be used to switch to the chosen NUMA
> affinity after CAS and after migrating the guest. To do that, the
> existing 'numa_assoc_array' is renamed to 'FORM1_assoc_array' and will
> hold FORM1 data that is populated at associativity_init().
> 'numa_assoc_array' is now a pointer that can be switched between the
> existing affinity arrays. We don't have FORM2 data structures yet, so
> 'numa_assoc_array' will always point to 'FORM1_assoc_array'.
>
> We also take the precaution of pointing 'numa_assoc_array' to
> 'FORM1_assoc_array' at associativity_init() time, before CAS, so as not
> to change FORM1 availability for existing guests.
>
> A small change in spapr_numa_write_associativity_dt() is made to reflect
> the fact that 'numa_assoc_array' is now a pointer and we must be
> explicit about the size being written in the DT.
>
> Signed-off-by: Daniel Henrique Barboza <danielhb...@gmail.com>
> ---
>  hw/ppc/spapr.c              | 14 +++++++++++++
>  hw/ppc/spapr_hcall.c        |  7 +++++++
>  hw/ppc/spapr_numa.c         | 42 +++++++++++++++++++++++++++++--------
>  include/hw/ppc/spapr.h      |  3 ++-
>  include/hw/ppc/spapr_numa.h |  1 +
>  5 files changed, 57 insertions(+), 10 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index d39fd4e644..5afbb76cab 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1786,6 +1786,20 @@ static int spapr_post_load(void *opaque, int version_id)
>          return err;
>      }
>
> +    /*
> +     * NUMA affinity selection is made at CAS time. There is no reliable
> +     * way of telling whether the guest already went through CAS before
> +     * migration due to how spapr_ov5_cas_needed works: a FORM1 guest can
> +     * be migrated with ov5_cas empty regardless of going through CAS
> +     * first.
> +     *
> +     * One solution is to call numa_associativity_reset(). The downside
> +     * is that a guest migrated before CAS will reset it again when going
> +     * through it, but since it's a lightweight operation it's worth being
> +     * a little redundant to be safe. Also, this isn't a hot path.
> +     */
> +    spapr_numa_associativity_reset(spapr);
> +
>      return err;
>  }
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 0e9a5b2e40..82ab92ddba 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -17,6 +17,7 @@
>  #include "kvm_ppc.h"
>  #include "hw/ppc/fdt.h"
>  #include "hw/ppc/spapr_ovec.h"
> +#include "hw/ppc/spapr_numa.h"
>  #include "mmu-book3s-v3.h"
>  #include "hw/mem/memory-device.h"
>
> @@ -1197,6 +1198,12 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu,
>      spapr->cas_pre_isa3_guest = !spapr_ovec_test(ov1_guest, OV1_PPC_3_00);
>      spapr_ovec_cleanup(ov1_guest);
>
> +    /*
> +     * Reset numa_assoc_array now that we know which NUMA affinity
> +     * the guest will use.
> +     */
> +    spapr_numa_associativity_reset(spapr);
> +
>      /*
>       * Ensure the guest asks for an interrupt mode we support;
>       * otherwise terminate the boot.
> diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
> index fb6059550f..327952ba9e 100644
> --- a/hw/ppc/spapr_numa.c
> +++ b/hw/ppc/spapr_numa.c
> @@ -97,7 +97,7 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr)
>       */
>      for (i = 1; i < nb_numa_nodes; i++) {
>          for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
> -            spapr->numa_assoc_array[i][j] = cpu_to_be32(i);
> +            spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i);
>          }
>      }
>
> @@ -149,8 +149,8 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr)
>               * and going up to 0x1.
>               */
>              for (i = n_level; i > 0; i--) {
> -                assoc_src = spapr->numa_assoc_array[src][i];
> -                spapr->numa_assoc_array[dst][i] = assoc_src;
> +                assoc_src = spapr->FORM1_assoc_array[src][i];
> +                spapr->FORM1_assoc_array[dst][i] = assoc_src;
>              }
>          }
>      }
> @@ -167,6 +167,11 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>      int nb_numa_nodes = machine->numa_state->num_nodes;
>      int i, j, max_nodes_with_gpus;
>
> +    /* init FORM1_assoc_array */
> +    for (i = 0; i < MAX_NODES + NVGPU_MAX_NUM; i++) {
> +        spapr->FORM1_assoc_array[i] = g_new0(uint32_t, NUMA_ASSOC_SIZE);

Why dynamic allocation, since you have a fixed size?

> +    }
> +
>      /*
>       * For all associativity arrays: first position is the size,
>       * position MAX_DISTANCE_REF_POINTS is always the numa_id,
> @@ -177,8 +182,8 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>       * 'i' will be a valid node_id set by the user.
>       */
>      for (i = 0; i < nb_numa_nodes; i++) {
> -        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> -        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
> +        spapr->FORM1_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> +        spapr->FORM1_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
>      }
>
>      /*
> @@ -192,15 +197,15 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr,
>      max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;
>
>      for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
> -        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
> +        spapr->FORM1_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
>
>          for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
>              uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
>                                   SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
> -            spapr->numa_assoc_array[i][j] = gpu_assoc;
> +            spapr->FORM1_assoc_array[i][j] = gpu_assoc;
>          }
>
> -        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
> +        spapr->FORM1_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
>      }
>
>      /*
> @@ -227,14 +232,33 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,
>                                     MachineState *machine)
>  {
>      spapr_numa_FORM1_affinity_init(spapr, machine);
> +
> +    /*
> +     * Default to FORM1 affinity until CAS. We'll call affinity_reset()
> +     * during CAS when we're sure about which NUMA affinity the guest
> +     * is going to use.
> +     *
> +     * This step is a failsafe - guests in the wild were able to read
> +     * FORM1 affinity info before CAS for a long time. Since affinity_reset()
> +     * is just a pointer switch between data that was already populated
> +     * here, this is an acceptable overhead to be on the safer side.
> +     */
> +    spapr->numa_assoc_array = spapr->FORM1_assoc_array;

The right way to do that is to call spapr_numa_associativity_reset() from
spapr_machine_reset(), because we want to revert to FORM1 each time the
guest is rebooted.
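Something along these lines (untested sketch; the exact spot inside
spapr_machine_reset() and the surrounding context lines are approximate):

    @@ static void spapr_machine_reset(MachineState *machine)
         SpaprMachineState *spapr = SPAPR_MACHINE(machine);
         ...
    +    /* Affinity is re-negotiated at CAS, so default back to FORM1 */
    +    spapr_numa_associativity_reset(spapr);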
> +}
> +
> +void spapr_numa_associativity_reset(SpaprMachineState *spapr)
> +{
> +    /* No FORM2 affinity implemented yet */

This seems quite obvious at this point, not sure the comment helps.

> +    spapr->numa_assoc_array = spapr->FORM1_assoc_array;
> +}
>
>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
>                                         int offset, int nodeid)
>  {
> +    /* Hardcode the size of FORM1 associativity array for now */
>      _FDT((fdt_setprop(fdt, offset, "ibm,associativity",
>                        spapr->numa_assoc_array[nodeid],
> -                      sizeof(spapr->numa_assoc_array[nodeid]))));
> +                      NUMA_ASSOC_SIZE * sizeof(uint32_t))));

Rather than doing this temporary change that gets undone in a later patch,
I suggest you introduce get_numa_assoc_size() in a preliminary patch and
use it here already:

-                      NUMA_ASSOC_SIZE * sizeof(uint32_t))));
+                      get_numa_assoc_size(spapr) * sizeof(uint32_t))));

This will simplify the review.

>  }
>
>  static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr,
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 637652ad16..8a9490f0bf 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -249,7 +249,8 @@ struct SpaprMachineState {
>      unsigned gpu_numa_id;
>      SpaprTpmProxy *tpm_proxy;
>
> -    uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];
> +    uint32_t *FORM1_assoc_array[MAX_NODES + NVGPU_MAX_NUM];

As said above, I really don't see the point in not having:

    uint32_t FORM1_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];

> +    uint32_t **numa_assoc_array;
>
>      Error *fwnmi_migration_blocker;
>  };
> diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h
> index 6f9f02d3de..ccf3e4eae8 100644
> --- a/include/hw/ppc/spapr_numa.h
> +++ b/include/hw/ppc/spapr_numa.h
> @@ -24,6 +24,7 @@
>   */
>  void spapr_numa_associativity_init(SpaprMachineState *spapr,
>                                     MachineState *machine);
> +void spapr_numa_associativity_reset(SpaprMachineState *spapr);
>  void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas);
>  void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
>                                         int offset, int nodeid);
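FWIW, the preliminary helper I have in mind could be as simple as this
(a sketch only; the name comes from my suggestion above, and the FORM2
case would be added later in your series):

    static int get_numa_assoc_size(SpaprMachineState *spapr)
    {
        /*
         * Only FORM1 exists at this point in the series, so the size is
         * constant. The FORM2 patch would extend this to check which
         * affinity was negotiated at CAS.
         */
        return NUMA_ASSOC_SIZE;
    }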