Nicholas Piggin <npig...@gmail.com> writes:

> In the interest of minimising the amount of code that is run in
> "real-mode", don't handle hcalls in real mode in the P9 path.
>
> On POWER8 and earlier, exiting HV real mode and switching to host mode
> is much more expensive, because on those processors HV interrupts reach
> the hypervisor with the MMU off, the other threads in the core must be
> pulled out of the guest, and all SLBs must be saved, ERATs invalidated,
> and the host SLB reloaded before the MMU is re-enabled in host mode.
> Hash guests also require a lot of hcalls to run, and so does the XICS
> interrupt controller.
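
To put a number on the XICS point above: a XICS guest takes at least
two hypervisor exits per external interrupt, an H_XIRR to fetch it and
an H_EOI to complete it. A rough user-space model of that flow, with a
hypothetical stand-in for the guest's hcall stub (opcode values as in
hvcall.h):

    #include <stdio.h>

    #define H_XIRR 0x74   /* fetch/ack the most favored pending interrupt */
    #define H_EOI  0x68   /* signal end-of-interrupt */

    /* Hypothetical stand-in for the guest's sc-1 hcall path. */
    static unsigned long hcall_sketch(unsigned long opcode)
    {
            printf("hcall 0x%lx -> exit to the hypervisor\n", opcode);
            return 0;
    }

    static void xics_handle_external_irq(void)
    {
            unsigned long xirr = hcall_sketch(H_XIRR);  /* exit #1 */
            (void)xirr;  /* ... dispatch the irq encoded in xirr ... */
            hcall_sketch(H_EOI);                        /* exit #2 */
    }

    int main(void)
    {
            xics_handle_external_irq();
            return 0;
    }

On P8 these go through the real-mode handlers; with this patch, the P9
path handles them in virtual mode instead.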
>
> By contrast, POWER9 has independent thread switching, and in radix mode
> the hypervisor is already in a host virtual memory mode when the HV
> interrupt is taken. Radix + xive guests don't need hcalls to handle
> interrupts or manage translations.
>
> So it is much less important to handle hcalls in real mode on P9.
>
> Signed-off-by: Nicholas Piggin <npig...@gmail.com>

I tried this again in the L2 with xive=off and it works as expected now.

Tested-by: Fabiano Rosas <faro...@linux.ibm.com>

> ---
>  arch/powerpc/include/asm/kvm_ppc.h      |  5 ++
>  arch/powerpc/kvm/book3s_hv.c            | 57 ++++++++++++++++----
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |  5 ++
>  arch/powerpc/kvm/book3s_xive.c          | 70 +++++++++++++++++++++++++
>  4 files changed, 127 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 73b1ca5a6471..db6646c2ade2 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -607,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm);
>  extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
>  extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
>  extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
> +extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req);
>  extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
>  extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
>  extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
> @@ -639,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
>  static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
>  static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
>       { return 0; }
> +static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +     { return 0; }
>  #endif
>
>  #ifdef CONFIG_KVM_XIVE
> @@ -673,6 +676,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>                              int level, bool line_status);
>  extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
>  extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
> +extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu);
>
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>  {
> @@ -714,6 +718,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
>                                     int level, bool line_status) { return -ENODEV; }
>  static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
>  static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
> +static inline void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu) { }
>
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>       { return 0; }
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index fa7614c37e08..17739aaee3d8 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1142,12 +1142,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  }
>
>  /*
> - * Handle H_CEDE in the nested virtualization case where we haven't
> - * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
> + * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
> + * handlers in book3s_hv_rmhandlers.S.
> + *
>   * This has to be done early, not in kvmppc_pseries_do_hcall(), so
>   * that the cede logic in kvmppc_run_single_vcpu() works properly.
>   */
> -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
> +static void kvmppc_cede(struct kvm_vcpu *vcpu)
>  {
>       vcpu->arch.shregs.msr |= MSR_EE;
>       vcpu->arch.ceded = 1;
> @@ -1403,9 +1404,15 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>               /* hcall - punt to userspace */
>               int i;
>
> -             /* hypercall with MSR_PR has already been handled in rmode,
> -              * and never reaches here.
> -              */
> +             if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
> +                     /*
> +                      * Guest userspace executed sc 1, reflect it back as a
> +                      * privileged program check interrupt.
> +                      */
> +                     kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
> +                     r = RESUME_GUEST;
> +                     break;
> +             }
>
>               run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
>               for (i = 0; i < 9; ++i)
> @@ -3663,6 +3670,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
>       return trap;
>  }
>
> +static inline bool hcall_is_xics(unsigned long req)
> +{
> +     return (req == H_EOI || req == H_CPPR || req == H_IPI ||
> +             req == H_IPOLL || req == H_XIRR || req == H_XIRR_X);
> +}
> +
>  /*
>   * Virtual-mode guest entry for POWER9 and later when the host and
>   * guest are both using the radix MMU.  The LPIDR has already been set.
> @@ -3774,15 +3787,36 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>               /* H_CEDE has to be handled now, not later */
>               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
>                   kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
> -                     kvmppc_nested_cede(vcpu);
> +                     kvmppc_cede(vcpu);
>                       kvmppc_set_gpr(vcpu, 3, 0);
>                       trap = 0;
>               }
>       } else {
>               kvmppc_xive_push_vcpu(vcpu);
>               trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
> +             if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
> +                 !(vcpu->arch.shregs.msr & MSR_PR)) {
> +                     unsigned long req = kvmppc_get_gpr(vcpu, 3);
> +
> +                     /* H_CEDE has to be handled now, not later */
> +                     if (req == H_CEDE) {
> +                             kvmppc_cede(vcpu);
> +                             kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
> +                             kvmppc_set_gpr(vcpu, 3, 0);
> +                             trap = 0;
> +
> +                     /* XICS hcalls must be handled before xive is pulled */
> +                     } else if (hcall_is_xics(req)) {
> +                             int ret;
> +
> +                             ret = kvmppc_xive_xics_hcall(vcpu, req);
> +                             if (ret != H_TOO_HARD) {
> +                                     kvmppc_set_gpr(vcpu, 3, ret);
> +                                     trap = 0;
> +                             }
> +                     }
> +             }
>               kvmppc_xive_pull_vcpu(vcpu);
> -
>       }
>
>       vcpu->arch.slb_max = 0;
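
The dispatch order this hunk sets up is worth spelling out. A
self-contained model, with stub types and hypothetical stand-ins for
the cede and XICS handlers (not the kernel code itself):

    #include <assert.h>
    #include <stdbool.h>

    enum { H_SUCCESS = 0, H_TOO_HARD = 9999 };  /* values as in hvcall.h */
    #define H_CEDE 0xE0
    #define H_XIRR 0x74

    /* Abbreviated hcall_is_xics(); the patch also lists H_EOI, H_CPPR,
     * H_IPI, H_IPOLL and H_XIRR_X. */
    static bool hcall_is_xics(unsigned long req)
    {
            return req == H_XIRR;
    }

    /* Hypothetical stand-ins for kvmppc_cede()/kvmppc_xive_cede_vcpu()
     * and kvmppc_xive_xics_hcall(). */
    static void cede_stub(void) { }
    static long xics_hcall_stub(unsigned long req) { (void)req; return H_SUCCESS; }

    /* Returns true if the hcall was consumed before the xive pull. */
    static bool p9_virt_mode_hcall(unsigned long req, long *r3)
    {
            if (req == H_CEDE) {            /* handled now, not later */
                    cede_stub();
                    *r3 = 0;
                    return true;
            }
            if (hcall_is_xics(req)) {       /* must run before the pull */
                    long ret = xics_hcall_stub(req);
                    if (ret != H_TOO_HARD) {
                            *r3 = ret;
                            return true;
                    }
            }
            return false;   /* trap stays BOOK3S_INTERRUPT_SYSCALL */
    }

    int main(void)
    {
            long r3 = -1;
            assert(p9_virt_mode_hcall(H_XIRR, &r3) && r3 == H_SUCCESS);
            return 0;
    }

Anything not consumed here keeps trap == BOOK3S_INTERRUPT_SYSCALL and
goes down the existing kvmppc_pseries_do_hcall()/userspace path after
the XIVE context is pulled.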
> @@ -4442,8 +4476,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>               else
>                       r = kvmppc_run_vcpu(vcpu);
>
> -             if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
> -                 !(vcpu->arch.shregs.msr & MSR_PR)) {
> +             if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
> +                     if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
> +                             r = RESUME_GUEST;
> +                             continue;
> +                     }
>                       trace_kvm_hcall_enter(vcpu);
>                       r = kvmppc_pseries_do_hcall(vcpu);
>                       trace_kvm_hcall_exit(vcpu, r);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index c11597f815e4..2d0d14ed1d92 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1397,9 +1397,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>       mr      r4,r9
>       bge     fast_guest_return
>  2:
> +     /* If we came in through the P9 short path, no real mode hcalls */
> +     lwz     r0, STACK_SLOT_SHORT_PATH(r1)
> +     cmpwi   r0, 0
> +     bne     no_try_real
>       /* See if this is an hcall we can handle in real mode */
>       cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
>       beq     hcall_try_real_mode
> +no_try_real:
>
>       /* Hypervisor doorbell - exit only if host IPI flag set */
>       cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 741bf1f4387a..dcc07ceaf5ca 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -158,6 +158,40 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
>
> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
> +{
> +     void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
> +
> +     if (!esc_vaddr)
> +             return;
> +
> +     /* we are using XIVE with single escalation */
> +
> +     if (vcpu->arch.xive_esc_on) {
> +             /*
> +              * If we still have a pending escalation, abort the cede,
> +              * and we must set PQ to 10 rather than 00 so that we don't
> +              * potentially end up with two entries for the escalation
> +              * interrupt in the XIVE interrupt queue.  In that case
> +              * we also don't want to set xive_esc_on to 1 here in
> +              * case we race with xive_esc_irq().
> +              */
> +             vcpu->arch.ceded = 0;
> +             /*
> +              * The escalation interrupts are special as we don't EOI them.
> +              * There is no need to use the load-after-store ordering offset
> +              * to set PQ to 10 as we won't use StoreEOI.
> +              */
> +             __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
> +     } else {
> +             vcpu->arch.xive_esc_on = true;
> +             mb();
> +             __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
> +     }
> +     mb();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
> +
>  /*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
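
For readers less familiar with XIVE: a load from the ESB page at a
XIVE_ESB_SET_PQ_xx offset atomically sets the source's two PQ bits and
returns the previous value, which is what the two __raw_readq() calls
in kvmppc_xive_cede_vcpu() above rely on. A toy model of just that
semantic (plain C, not the real MMIO-backed operation):

    #include <stdio.h>

    /* P = an event was already presented, Q = a further event is queued */
    enum pq { PQ_00 = 0, PQ_01 = 1, PQ_10 = 2, PQ_11 = 3 };

    /* Mimic "load sets PQ, returns the old PQ" on an ESB page. */
    static enum pq esb_set_pq(enum pq *state, enum pq new_pq)
    {
            enum pq old = *state;
            *state = new_pq;
            return old;
    }

    int main(void)
    {
            enum pq esc = PQ_10;    /* escalation already fired: P set */

            /* pending escalation: leave PQ at 10 and abort the cede, so
             * the escalation cannot be queued a second time */
            esb_set_pq(&esc, PQ_10);

            /* clean cede: re-arm the escalation source with PQ = 00 */
            esb_set_pq(&esc, PQ_00);
            printf("PQ is now %d\n", esc);
            return 0;
    }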
> @@ -2106,6 +2140,42 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>       return 0;
>  }
>
> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +{
> +     struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +     /*
> +      * This test covers the case in which a vCPU does XICS hcalls without
> +      * QEMU having connected the vCPU to a XICS ICP. The ICP is the KVM
> +      * XICS device on P8 or XICS-on-XIVE on P9. It catches QEMU errors when
> +      * the interrupt mode is negotiated; we don't want the OS to do XICS
> +      * hcalls after having negotiated the XIVE interrupt mode.
> +      */
> +     if (!kvmppc_xics_enabled(vcpu))
> +             return H_TOO_HARD;
> +
> +     switch (req) {
> +     case H_XIRR:
> +             return xive_vm_h_xirr(vcpu);
> +     case H_CPPR:
> +             return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
> +     case H_EOI:
> +             return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
> +     case H_IPI:
> +             return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
> +                                       kvmppc_get_gpr(vcpu, 5));
> +     case H_IPOLL:
> +             return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
> +     case H_XIRR_X:
> +             xive_vm_h_xirr(vcpu);
> +             kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
> +             return H_SUCCESS;
> +     }
> +
> +     return H_UNSUPPORTED;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
> +
>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>  {
>       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
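
One detail in the switch worth calling out: H_XIRR_X is H_XIRR plus a
timestamp, and the value returned in r5 is the guest's view of the
timebase, i.e. the host timebase plus the vcore's tb_offset. A minimal
illustration, with a hypothetical stand-in for get_tb():

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in; the real get_tb() reads the TB register. */
    static uint64_t get_tb_stub(void) { return 1000000; }

    int main(void)
    {
            uint64_t tb_offset = 42;    /* per-vcore guest TB offset */
            /* what the hunk above puts in gpr 5 for H_XIRR_X */
            uint64_t r5 = get_tb_stub() + tb_offset;
            printf("guest timebase: %llu\n", (unsigned long long)r5);
            return 0;
    }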
