Fabiano Rosas <faro...@linux.ibm.com> writes: > QEMU reports MMU support to the guest via the ibm,architecture-vec-5 > property of the /chosen node. Byte number 26 specifies Radix Table > Expansions, currently only GTSE (Guest Translation Shootdown > Enable). This feature determines whether the tlbie instruction (and > others) are HV privileged. > > Up until now, we always reported GTSE=1 to guests, even after > support for GTSE=0 was added. As part of that support, a kernel > command line option, radix_hcall_invalidate=on, was introduced that overrides > the GTSE value received via CAS. So a guest can run with GTSE=0 and > use the H_RPT_INVALIDATE hcall instead of tlbie. > > In this scenario, having GTSE always set to 1 by QEMU leads to a crash > when running nested KVM guests because KVM does not allow a nested > hypervisor to set GTSE support for its nested guests. So a nested > guest always uses the same value for LPCR_GTSE as its HV. Since the > nested HV disabled GTSE, but the L2 QEMU always reports GTSE=1, we run > into a crash when: > > L1 LPCR_GTSE=0 > L2 LPCR_GTSE=0 > L2 CAS GTSE=1 > > The nested guest will run 'tlbie' and crash because the HW looks at > LPCR_GTSE, which is clear. > > Having GTSE disabled in the L1 and enabled in the L2 is not an option > because the whole purpose of GTSE is to disallow access to tlbie and > we cannot allow L1 to spawn L2s that can access features that L1 > itself cannot. > > We also cannot have the guest check the LPCR bit, because LPCR is > HV-privileged. > > So this patch goes through the most intuitive route, which is to have > QEMU ask KVM about GTSE support and advertise the correct value to the > guest. A new KVM_CAP_PPC_GTSE capability is being added. > > TCG continues to always enable GTSE. 
> > Signed-off-by: Fabiano Rosas <faro...@linux.ibm.com> > --- > hw/ppc/spapr.c | 38 +++++++++++++++++++++++++++++++------- > target/ppc/kvm.c | 8 ++++++++ > target/ppc/kvm_ppc.h | 6 ++++++ > 3 files changed, 45 insertions(+), 7 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 4cc204f90d..3e95a1831f 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -971,7 +971,7 @@ static void > spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt, > 23, 0x00, /* XICS / XIVE mode */ > 24, 0x00, /* Hash/Radix, filled in below. */ > 25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */ > - 26, 0x40, /* Radix options: GTSE == yes. */ > + 26, 0x00, /* Radix options, filled in below. */ > }; > > if (spapr->irq->xics && spapr->irq->xive) { > @@ -1000,10 +1000,16 @@ static void > spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt, > } else { > val[3] = 0x00; /* Hash */ > } > + > + if (kvmppc_has_cap_gtse()) { > + val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */; > + }
This needs the same treatment as below to support kernels that don't know about the cap. Also, look at that semicolon! =D > } else { > /* V3 MMU supports both hash and radix in tcg (with dynamic > switching) */ > val[3] = 0xC0; > + val[7] = 0x40 /* OV5_MMU_RADIX_GTSE */; > } > + > _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support", > val, sizeof(val))); > } > @@ -2824,14 +2830,32 @@ static void spapr_machine_init(MachineState *machine) > /* Init numa_assoc_array */ > spapr_numa_associativity_init(spapr, machine); > > - if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) && > - ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, > + if (ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, > spapr->max_compat_pvr)) { > - spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300); > - /* KVM and TCG always allow GTSE with radix... */ > - spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); > + > + /* TCG always supports Radix w/ GTSE */ > + if (!kvm_enabled()) { > + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300); > + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); > + } else { > + if (kvmppc_has_cap_mmu_radix()) { > + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300); > + } > + > + /* > + * Only disable Guest Translation Shootdown if KVM > + * supports the H_RPT_INVALIDATE hypercall, otherwise we'd > + * leave the guest with no way to make TLB invalidations. > + */ > + if (kvmppc_has_cap_rpt_invalidate()) { > + if (kvmppc_has_cap_gtse()) { > + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); > + } > + } else { > + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); > + } > + } > } > - /* ... but not with hash (currently). 
*/ > > if (kvm_enabled()) { > /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */ > diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c > index dc93b99189..91582c4b15 100644 > --- a/target/ppc/kvm.c > +++ b/target/ppc/kvm.c > @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv; > static int cap_large_decr; > static int cap_fwnmi; > static int cap_rpt_invalidate; > +static int cap_gtse; > > static uint32_t debug_inst_opcode; > > @@ -154,6 +155,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s) > } > > cap_rpt_invalidate = kvm_vm_check_extension(s, > KVM_CAP_PPC_RPT_INVALIDATE); > + cap_gtse = kvm_vm_check_extension(s, KVM_CAP_PPC_GTSE); > + > kvm_ppc_register_host_cpu_type(); > > return 0; > @@ -2397,6 +2400,11 @@ bool kvmppc_has_cap_mmu_hash_v3(void) > return cap_mmu_hash_v3; > } > > +bool kvmppc_has_cap_gtse(void) > +{ > + return cap_gtse; > +} > + > static bool kvmppc_power8_host(void) > { > bool ret = false; > diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h > index ee9325bf9a..7d6980edb7 100644 > --- a/target/ppc/kvm_ppc.h > +++ b/target/ppc/kvm_ppc.h > @@ -63,6 +63,7 @@ bool kvmppc_has_cap_fixup_hcalls(void); > bool kvmppc_has_cap_htm(void); > bool kvmppc_has_cap_mmu_radix(void); > bool kvmppc_has_cap_mmu_hash_v3(void); > +bool kvmppc_has_cap_gtse(void); > bool kvmppc_has_cap_xive(void); > int kvmppc_get_cap_safe_cache(void); > int kvmppc_get_cap_safe_bounds_check(void); > @@ -343,6 +344,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void) > return false; > } > > +static inline bool kvmppc_has_cap_gtse(void) > +{ > + return false; > +} > + > static inline bool kvmppc_has_cap_xive(void) > { > return false;