Nicholas Piggin <npig...@gmail.com> writes: > This implements the nested-hv hcall API for spapr under TCG. > It's still a bit rough around the edges, concept seems to work. > > Some HV exceptions can be raised now in the TCG spapr machine when > running a nested guest. The main ones are the lev==1 syscall, the > hdecr, hdsi and hisi, and h_virt external interrupts. These are > dealt with in the interrupt delivery code by noticing MSR[HV] raised > and instead of switching the machine to HV mode, it exits the > H_ENTER_NESTED hcall with the interrupt vector as return value as > required by the hcall API. > > Address translation is provided by the 2-level page table walker > that is implemented for the pnv machine. The partition scope page > table is pointed to the L1's partition scope, and a few tests have > to take into account that nested-hv translations are 2-level. This > could perhaps be tidied up a bit e.g., with a 'bool two_level = ...' > but it's surprisingly little code. > > There is no TLB tagging between L1 and L2 translations at the moment > so the TLB is flushed on any L1<->L2 transition (hcall entry and exit). > > XXX: stop doing atomic RC on page table walks (not for nested but in general) > > not-yet-Signed-off-by: Nicholas Piggin <npig...@gmail.com>
Hi Nick, Sorry it took me so long to get to this. It think overall it looks good. I have some comments about code structure that you probably already know about. I don't have answers to your XXX questions but I think it's ok to handle them later individually as they come up, after this is merged. > --- > hw/ppc/ppc.c | 20 +++ > hw/ppc/spapr.c | 16 ++ > hw/ppc/spapr_caps.c | 5 +- > hw/ppc/spapr_hcall.c | 316 +++++++++++++++++++++++++++++++++++++ > include/hw/ppc/ppc.h | 3 + > include/hw/ppc/spapr.h | 75 ++++++++- > target/ppc/cpu.h | 6 + > target/ppc/excp_helper.c | 60 ++++--- > target/ppc/helper_regs.c | 1 + > target/ppc/mmu-book3s-v3.c | 20 ++- > target/ppc/mmu-radix64.c | 15 +- > 11 files changed, 499 insertions(+), 38 deletions(-) > > diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c > index a7c262db93..135900a6f4 100644 > --- a/hw/ppc/ppc.c > +++ b/hw/ppc/ppc.c > @@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, > uint32_t freq) > return &cpu_ppc_set_tb_clk; > } > > +void cpu_ppc_hdecr_init (CPUPPCState *env) > +{ > + PowerPCCPU *cpu = env_archcpu(env); > + > + assert(env->tb_env->hdecr_timer == NULL); > + > + env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, > &cpu_ppc_hdecr_cb, > + cpu); > +} > + > +void cpu_ppc_hdecr_exit (CPUPPCState *env) > +{ > + PowerPCCPU *cpu = env_archcpu(env); > + > + timer_free(env->tb_env->hdecr_timer); > + env->tb_env->hdecr_timer = NULL; > + > + cpu_ppc_hdecr_lower(cpu); > +} > + > /* Specific helpers for POWER & PowerPC 601 RTC */ > void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value) > { > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 3d6ec309dd..f0c3f726f2 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1273,6 +1273,8 @@ static void > emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp, > if (msr_pr) { > hcall_dprintf("Hypercall made with MSR[PR]=1\n"); > env->gpr[3] = H_PRIVILEGE; > + } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) { > + spapr_enter_nested(cpu); This looks like it could be plumbed along with spapr_hypercall below. > } else { > env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]); > } > @@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id) > return NULL; > } > > +static bool spapr_cpu_in_nested(PowerPCCPU *cpu) > +{ > + return cpu->in_spapr_nested; > +} > + > +static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + return spapr->nested_ptcr; > +} > + > static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) > { > SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); > @@ -4573,6 +4586,9 @@ static void spapr_machine_class_init(ObjectClass *oc, > void *data) > fwc->get_dev_path = spapr_get_fw_dev_path; > nc->nmi_monitor_handler = spapr_nmi; > smc->phb_placement = spapr_phb_placement; > + vhc->cpu_in_nested = spapr_cpu_in_nested; > + vhc->get_nested_ptcr = spapr_get_nested_ptcr; > + vhc->exit_nested = spapr_exit_nested; > vhc->hypercall = emulate_spapr_hypercall; > vhc->hpt_mask = spapr_hpt_mask; > vhc->map_hptes = spapr_map_hptes; > diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c > index ed7c077a0d..a665245f6f 100644 > --- a/hw/ppc/spapr_caps.c > +++ b/hw/ppc/spapr_caps.c > @@ -450,10 +450,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState > *spapr, > return; > } > > - if (tcg_enabled()) { > - error_setg(errp, "No Nested KVM-HV support in TCG"); > - error_append_hint(errp, "Try appending -machine > cap-nested-hv=off\n"); > - } else if (kvm_enabled()) { > + if (!tcg_enabled()) { > if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, > spapr->max_compat_pvr)) { > error_setg(errp, "Nested KVM-HV only supported on POWER9"); > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 222c1b6bbd..8ffb13ada0 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -9,6 +9,7 @@ > #include "qemu/error-report.h" > #include "exec/exec-all.h" > #include "helper_regs.h" > +#include "hw/ppc/ppc.h" > #include "hw/ppc/spapr.h" > #include "hw/ppc/spapr_cpu_core.h" > #include "mmu-hash64.h" > @@ -1497,6 +1498,317 @@ static void hypercall_register_softmmu(void) > } > #endif > > +/* TCG only */ > +#define PRTS_MASK 0x1f > + > +static target_ulong h_set_ptbl(PowerPCCPU *cpu, > + SpaprMachineState *spapr, > + target_ulong opcode, > + target_ulong *args) > +{ > + target_ulong ptcr = args[0]; > + > + if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) { > + return H_FUNCTION; > + } > + > + if ((ptcr & PRTS_MASK) + 12 - 4 > 12) { > + return H_PARAMETER; > + } > + > + spapr->nested_ptcr = ptcr; /* Save new partition table */ > + > + return H_SUCCESS; > +} > + > +static target_ulong h_tlb_invalidate(PowerPCCPU *cpu, > + SpaprMachineState *spapr, > + target_ulong opcode, > + target_ulong *args) > +{ > + CPUState *cs = CPU(cpu); > + > + /* > + * The spapr virtual hypervisor nested HV implementation retains no > + * translation state except for TLB. This might be optimised to > + * invalidate fewer entries, but at the moment it's not important > + * because L1<->L2 transitions always flush the entire TLB for now. > + */ > + tlb_flush(cs); > + > + return H_SUCCESS; > +} > + > +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu, > + SpaprMachineState *spapr, > + target_ulong opcode, > + target_ulong *args) > +{ > + /* > + * This HCALL is not required, L1 KVM will take a slow path and walk the > + * page tables manually to do the data copy. > + */ Kind of unrelated to the patch itself but I thought the point of this hcall was less about performance and more about Ln not having access to the KVM memslots for the Ln+2. KVM accesses a guest address space by using the GFN to find the memory slot and from there the userspace address that is part of the memslot. Accesses from Ln -> Ln+1 can use that mechanism because any Ln will have slots registered for its guests. For L0 -> L2, we can go via radix quadrants because the L0 is the real HV (that's how L0 handles this hcall after all). For L1 -> L3, we need the hcall since L1 cannot access quadrants 1 and 2 and it does not have the memslot so it cannot use the QEMU address. Does that make sense? > + return H_FUNCTION; > +} > + > +void spapr_enter_nested(PowerPCCPU *cpu) > +{ > + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); > + CPUState *cs = CPU(cpu); > + CPUPPCState *env = &cpu->env; > + target_ulong hv_ptr = env->gpr[4]; > + target_ulong regs_ptr = env->gpr[5]; > + target_ulong hdec, now = cpu_ppc_load_tbl(env); > + struct kvmppc_hv_guest_state *hvstate; > + struct kvmppc_hv_guest_state hv_state; > + struct kvmppc_pt_regs *regs; > + hwaddr len; > + uint32_t cr; > + int i; > + > + if (cpu->in_spapr_nested) { > + env->gpr[3] = H_FUNCTION; > + return; > + } > + if (spapr->nested_ptcr == 0) { > + env->gpr[3] = H_NOT_AVAILABLE; > + return; > + } > + > + len = sizeof(*hvstate); > + hvstate = cpu_physical_memory_map(hv_ptr, &len, true); > + if (!hvstate || len != sizeof(*hvstate)) { > + env->gpr[3] = H_PARAMETER; > + return; > + } > + > + memcpy(&hv_state, hvstate, len); > + > + cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len > */); > + > + if (hv_state.version != HV_GUEST_STATE_VERSION) { KVM uses > here. Should we do the same? Daniel and I have been discussing if we need some sort of version negotiation for nested KVM since recently someone tried to use an L0 version 1 with an L1 version 2: https://gitlab.com/qemu-project/qemu/-/issues/860 > + env->gpr[3] = H_PARAMETER; > + return; > + } > + <snip> > +/* 64-bit powerpc pt_regs struct used by nested HV */ > +struct kvmppc_pt_regs { > + uint64_t gpr[32]; > + uint64_t nip; > + uint64_t msr; > + uint64_t orig_gpr3; /* Used for restarting system calls */ > + uint64_t ctr; > + uint64_t link; > + uint64_t xer; > + uint64_t ccr; > + uint64_t softe; /* Soft enabled/disabled */ > + uint64_t trap; /* Reason for being here */ > + uint64_t dar; /* Fault registers */ > + uint64_t dsisr; /* on 4xx/Book-E used for ESR */ There's no 4xx/booke support for spapr nested. > + uint64_t result; /* Result of a system call */ > +}; > > typedef struct SpaprDeviceTreeUpdateHeader { > uint32_t version_id; > @@ -604,6 +673,10 @@ typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, > SpaprMachineState *sm, > void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn); > target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode, > target_ulong *args); > + > +void spapr_enter_nested(PowerPCCPU *cpu); > +void spapr_exit_nested(PowerPCCPU *cpu, int excp); > + > target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState > *spapr, > target_ulong shift); > target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState > *spapr, > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > index dcd83b503c..1806a8e776 100644 > --- a/target/ppc/cpu.h > +++ b/target/ppc/cpu.h > @@ -1306,6 +1306,9 @@ struct PowerPCCPU { > bool pre_2_10_migration; > bool pre_3_0_migration; > int32_t mig_slb_nr; > + > + bool in_spapr_nested; > + CPUPPCState *nested_host_state; > }; > > > @@ -1316,6 +1319,9 @@ PowerPCCPUClass > *ppc_cpu_get_family_class(PowerPCCPUClass *pcc); > #ifndef CONFIG_USER_ONLY > struct PPCVirtualHypervisorClass { > InterfaceClass parent; > + bool (*cpu_in_nested)(PowerPCCPU *cpu); > + target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid); > + void (*exit_nested)(PowerPCCPU *cpu, int excp); > void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu); > hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp); > const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp, > diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c > index c107953dec..239c253dbc 100644 > --- a/target/ppc/excp_helper.c > +++ b/target/ppc/excp_helper.c > @@ -361,8 +361,8 @@ static void ppc_excp_apply_ail(PowerPCCPU *cpu, int > excp_model, int excp, > #endif > } > > -static void powerpc_set_excp_state(PowerPCCPU *cpu, > - target_ulong vector, target_ulong > msr) > +static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp, > + target_ulong vector, target_ulong msr) > { > CPUState *cs = CPU(cpu); > CPUPPCState *env = &cpu->env; > @@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu, > * will prevent setting of the HV bit which some exceptions might need > * to do. > */ > - env->msr = msr & env->msr_mask; > - hreg_compute_hflags(env); > - env->nip = vector; > + if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) { > + PPCVirtualHypervisorClass *vhc = > + PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp); > + // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call > + vhc->exit_nested(cpu, excp); > + } else { > + env->nip = vector; > + env->msr = msr & env->msr_mask; > + hreg_compute_hflags(env); > + } > + All of this should fit better at the end of powerpc_excp_books. Even if we need to duplicate it for fwnmi_machine_check. > /* Reset exception state */ > cs->exception_index = POWERPC_EXCP_NONE; > env->error_code = 0; > @@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp) > /* Save MSR */ > env->spr[srr1] = msr; > > - powerpc_set_excp_state(cpu, vector, new_msr); > + powerpc_set_excp_state(cpu, excp, vector, new_msr); > } > > static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp) > @@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp) > /* Save MSR */ > env->spr[SPR_SRR1] = msr; > > - powerpc_set_excp_state(cpu, vector, new_msr); > + powerpc_set_excp_state(cpu, excp, vector, new_msr); > } > > #ifdef TARGET_PPC64 > @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp) > env->nip += 4; > > /* "PAPR mode" built-in hypercall emulation */ > - if ((lev == 1) && cpu->vhyp) { > + if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) { > PPCVirtualHypervisorClass *vhc = > PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp); > vhc->hypercall(cpu->vhyp, cpu); > @@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int > excp) > break; > } > > - /* Sanity check */ > - if (!(env->msr_mask & MSR_HVB)) { > - if (new_msr & MSR_HVB) { > - cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with " > - "no HV support\n", excp); > - } > - if (srr0 == SPR_HSRR0) { > - cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with " > - "no HV support\n", excp); > - } > - } > - > /* > * Sort out endianness of interrupt, this differs depending on the > * CPU, the HV mode, etc... > @@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int > excp) > /* This can update new_msr and vector if AIL applies */ > ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector); > > - powerpc_set_excp_state(cpu, vector, new_msr); > + powerpc_set_excp_state(cpu, excp, vector, new_msr); > + > + /* Sanity check */ > + if (!(env->msr_mask & MSR_HVB)) { > + if (env->msr & MSR_HVB) { > + cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with " > + "no HV support\n", excp); > + } FYI, this doesn't exist anymore. > + if (0 && srr0 == SPR_HSRR0) { > + cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with " > + "no HV support\n", excp); > + } msr_mask should have MSR_HVB independently of MSR_HV being actually set, so this shouldn't be causing any problems. Is it? > + } > } > #else > static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp) > @@ -1517,7 +1525,7 @@ static inline void powerpc_excp_legacy(PowerPCCPU *cpu, > int excp) > /* This can update new_msr and vector if AIL applies */ > ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector); > > - powerpc_set_excp_state(cpu, vector, new_msr); > + powerpc_set_excp_state(cpu, excp, vector, new_msr); > } > > static void powerpc_excp(PowerPCCPU *cpu, int excp) > @@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env) > /* HEIC blocks delivery to the hypervisor */ > if ((async_deliver && !(heic && msr_hv && !msr_pr)) || > (env->has_hv_mode && msr_hv == 0 && !lpes0)) { > - powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL); > + if (cpu->in_spapr_nested) { > + powerpc_excp(cpu, POWERPC_EXCP_HVIRT); > + } else { > + powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL); > + } This function affects other CPUs, so it's better to leave this as it were and change the exception inside powerpc_excp_books, like we do for HV_EMUL. > return; > } > } > @@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, > target_ulong vector) > msr |= (1ULL << MSR_LE); > } > > - powerpc_set_excp_state(cpu, vector, msr); > + powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr); > } > > bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request) > diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c > index 5b12cb03c9..7f92606522 100644 > --- a/target/ppc/helper_regs.c > +++ b/target/ppc/helper_regs.c > @@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState > *env) > immu_idx |= msr & (1 << MSR_IS) ? 2 : 0; > dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0; > } else { > + /* Could have nested IDX instead of HV to avoid tlb flush on nested > enter/exit? */ Sounds like a good idea. However I don't know enough about softmmu stuff to point a direction here. > dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0; > immu_idx = dmmu_idx; > immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;