On Fri, Feb 22, 2019 at 12:28:27PM +0100, Cédric Le Goater wrote:
> The user interface exposes a new capability to let QEMU connect the
> vCPU to the XIVE KVM device if required. The capability is only
> advertised on a PowerNV Hypervisor as support for nested guests
> (pseries KVM Hypervisor) is not yet available.
> 
> Internally, the interface to the new KVM device is protected with a
> new interrupt mode: KVMPPC_IRQ_XIVE.
> 
> Signed-off-by: Cédric Le Goater <c...@kaod.org>
> ---
>  arch/powerpc/include/asm/kvm_host.h   |   1 +
>  arch/powerpc/include/asm/kvm_ppc.h    |  13 +++
>  arch/powerpc/kvm/book3s_xive.h        |   6 ++
>  include/uapi/linux/kvm.h              |   1 +
>  arch/powerpc/kvm/book3s_xive.c        |  67 +++++++-----
>  arch/powerpc/kvm/book3s_xive_native.c | 144 ++++++++++++++++++++++++++
>  arch/powerpc/kvm/powerpc.c            |  33 ++++++
>  Documentation/virtual/kvm/api.txt     |   9 ++
>  8 files changed, 246 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index 9f75a75a07f2..eb8581be0ee8 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -448,6 +448,7 @@ struct kvmppc_passthru_irqmap {
>  #define KVMPPC_IRQ_DEFAULT   0
>  #define KVMPPC_IRQ_MPIC              1
>  #define KVMPPC_IRQ_XICS              2 /* Includes a XIVE option */
> +#define KVMPPC_IRQ_XIVE              3 /* XIVE native exploitation mode */
>  
>  #define MMIO_HPTE_CACHE_SIZE 4
>  
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 4b72ddde7dc1..1e61877fe147 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -594,6 +594,14 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int 
> irq_source_id, u32 irq,
>                              int level, bool line_status);
>  extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
>  
> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
> +{
> +     return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE;
> +}
> +
> +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> +                                        struct kvm_vcpu *vcpu, u32 cpu);
> +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
>  extern void kvmppc_xive_native_init_module(void);
>  extern void kvmppc_xive_native_exit_module(void);
>  
> @@ -621,6 +629,11 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, 
> int irq_source_id, u32 ir
>                                     int level, bool line_status) { return 
> -ENODEV; }
>  static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
>  
> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
> +     { return 0; }
> +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> +                       struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
> +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
>  static inline void kvmppc_xive_native_init_module(void) { }
>  static inline void kvmppc_xive_native_exit_module(void) { }
>  
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> index a08ae6fd4c51..bcb1bbcf0359 100644
> --- a/arch/powerpc/kvm/book3s_xive.h
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -248,5 +248,11 @@ extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, 
> unsigned long server,
>  extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
>  extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
>  
> +/*
> + * Common Xive routines for XICS-over-XIVE and XIVE native
> + */
> +void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
> +int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
> +
>  #endif /* CONFIG_KVM_XICS */
>  #endif /* _KVM_PPC_BOOK3S_XICS_H */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index e6368163d3a0..52bf74a1616e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_ARM_VM_IPA_SIZE 165
>  #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166
>  #define KVM_CAP_HYPERV_CPUID 167
> +#define KVM_CAP_PPC_IRQ_XIVE 168
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index f78d002f0fe0..d1cc18a5b1c4 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -1049,7 +1049,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned 
> long guest_irq,
>  }
>  EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
>  
> -static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
> +void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
>  {
>       struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>       struct kvm *kvm = vcpu->kvm;
> @@ -1883,6 +1883,43 @@ static int kvmppc_xive_create(struct kvm_device *dev, 
> u32 type)
>       return 0;
>  }
>  
> +int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
> +{
> +     struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +     unsigned int i;
> +
> +     for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> +             struct xive_q *q = &xc->queues[i];
> +             u32 i0, i1, idx;
> +
> +             if (!q->qpage && !xc->esc_virq[i])
> +                     continue;
> +
> +             seq_printf(m, " [q%d]: ", i);
> +
> +             if (q->qpage) {
> +                     idx = q->idx;
> +                     i0 = be32_to_cpup(q->qpage + idx);
> +                     idx = (idx + 1) & q->msk;
> +                     i1 = be32_to_cpup(q->qpage + idx);
> +                     seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
> +                                i0, i1);
> +             }
> +             if (xc->esc_virq[i]) {
> +                     struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
> +                     struct xive_irq_data *xd =
> +                             irq_data_get_irq_handler_data(d);
> +                     u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
> +
> +                     seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
> +                                (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
> +                                (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
> +                                xc->esc_virq[i], pq, xd->eoi_page);
> +                     seq_puts(m, "\n");
> +             }
> +     }
> +     return 0;
> +}
>  
>  static int xive_debug_show(struct seq_file *m, void *private)
>  {
> @@ -1908,7 +1945,6 @@ static int xive_debug_show(struct seq_file *m, void 
> *private)
>  
>       kvm_for_each_vcpu(i, vcpu, kvm) {
>               struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> -             unsigned int i;
>  
>               if (!xc)
>                       continue;
> @@ -1918,33 +1954,8 @@ static int xive_debug_show(struct seq_file *m, void 
> *private)
>                          xc->server_num, xc->cppr, xc->hw_cppr,
>                          xc->mfrr, xc->pending,
>                          xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
> -             for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> -                     struct xive_q *q = &xc->queues[i];
> -                     u32 i0, i1, idx;
>  
> -                     if (!q->qpage && !xc->esc_virq[i])
> -                             continue;
> -
> -                     seq_printf(m, " [q%d]: ", i);
> -
> -                     if (q->qpage) {
> -                             idx = q->idx;
> -                             i0 = be32_to_cpup(q->qpage + idx);
> -                             idx = (idx + 1) & q->msk;
> -                             i1 = be32_to_cpup(q->qpage + idx);
> -                             seq_printf(m, "T=%d %08x %08x... \n", 
> q->toggle, i0, i1);
> -                     }
> -                     if (xc->esc_virq[i]) {
> -                             struct irq_data *d = 
> irq_get_irq_data(xc->esc_virq[i]);
> -                             struct xive_irq_data *xd = 
> irq_data_get_irq_handler_data(d);
> -                             u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
> -                             seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
> -                                        (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
> -                                        (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
> -                                        xc->esc_virq[i], pq, xd->eoi_page);
> -                             seq_printf(m, "\n");
> -                     }
> -             }
> +             kvmppc_xive_debug_show_queues(m, vcpu);
>  
>               t_rm_h_xirr += xc->stat_rm_h_xirr;
>               t_rm_h_ipoll += xc->stat_rm_h_ipoll;
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
> b/arch/powerpc/kvm/book3s_xive_native.c
> index e475ce83ad14..1f3da47a4a6a 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -31,6 +31,128 @@
>  
>  #include "book3s_xive.h"
>  
> +static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
> +{
> +     struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +     struct xive_q *q = &xc->queues[prio];
> +
> +     xive_native_disable_queue(xc->vp_id, q, prio);
> +     if (q->qpage) {
> +             put_page(virt_to_page(q->qpage));
> +             q->qpage = NULL;
> +     }
> +}
> +
> +void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
> +{
> +     struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +     int i;
> +
> +     if (!kvmppc_xive_enabled(vcpu))
> +             return;
> +
> +     if (!xc)
> +             return;
> +
> +     pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
> +
> +     /* Ensure no interrupt is still routed to that VP */
> +     xc->valid = false;
> +     kvmppc_xive_disable_vcpu_interrupts(vcpu);
> +
> +     /* Disable the VP */
> +     xive_native_disable_vp(xc->vp_id);
> +
> +     /* Free the queues & associated interrupts */
> +     for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> +             /* Free the escalation irq */
> +             if (xc->esc_virq[i]) {
> +                     free_irq(xc->esc_virq[i], vcpu);
> +                     irq_dispose_mapping(xc->esc_virq[i]);
> +                     kfree(xc->esc_virq_names[i]);
> +                     xc->esc_virq[i] = 0;
> +             }
> +
> +             /* Free the queue */
> +             kvmppc_xive_native_cleanup_queue(vcpu, i);
> +     }
> +
> +     /* Free the VP */
> +     kfree(xc);
> +
> +     /* Cleanup the vcpu */
> +     vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> +     vcpu->arch.xive_vcpu = NULL;
> +}
> +
> +int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> +                                 struct kvm_vcpu *vcpu, u32 cpu)
> +{
> +     struct kvmppc_xive *xive = dev->private;
> +     struct kvmppc_xive_vcpu *xc;
> +     int rc;
> +
> +     pr_devel("native_connect_vcpu(cpu=%d)\n", cpu);
> +
> +     if (dev->ops != &kvm_xive_native_ops) {
> +             pr_devel("Wrong ops !\n");
> +             return -EPERM;
> +     }
> +     if (xive->kvm != vcpu->kvm)
> +             return -EPERM;
> +     if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
> +             return -EBUSY;
> +     if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {

You haven't taken kvm->lock yet, so couldn't a race mean that a
duplicate server gets inserted after you make this check?

> +             pr_devel("Duplicate !\n");
> +             return -EEXIST;
> +     }
> +     if (cpu >= KVM_MAX_VCPUS) {
> +             pr_devel("Out of bounds !\n");
> +             return -EINVAL;
> +     }
> +     xc = kzalloc(sizeof(*xc), GFP_KERNEL);
> +     if (!xc)
> +             return -ENOMEM;
> +
> +     mutex_lock(&vcpu->kvm->lock);
> +     vcpu->arch.xive_vcpu = xc;

Similarly, you don't verify that vcpu->arch.xive_vcpu is still NULL
after taking the lock, so couldn't another thread race in and make a
connect which gets clobbered here?

> +     xc->xive = xive;
> +     xc->vcpu = vcpu;
> +     xc->server_num = cpu;
> +     xc->vp_id = xive->vp_base + cpu;

Hrm.  This ties the internal VP id to the userspace-chosen server
number, which isn't ideal.  It puts a constraint on those server
numbers that you wouldn't otherwise have.

> +     xc->valid = true;
> +
> +     rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
> +     if (rc) {
> +             pr_err("Failed to get VP info from OPAL: %d\n", rc);
> +             goto bail;
> +     }
> +
> +     /*
> +      * Enable the VP first as the single escalation mode will
> +      * affect escalation interrupts numbering
> +      */
> +     rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
> +     if (rc) {
> +             pr_err("Failed to enable VP in OPAL: %d\n", rc);
> +             goto bail;
> +     }
> +
> +     /* Configure VCPU fields for use by assembly push/pull */
> +     vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
> +     vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
> +
> +     /* TODO: initialize queues ? */
> +
> +bail:
> +     vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
> +     mutex_unlock(&vcpu->kvm->lock);
> +     if (rc)
> +             kvmppc_xive_native_cleanup_vcpu(vcpu);
> +
> +     return rc;
> +}
> +
>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>                                      struct kvm_device_attr *attr)
>  {
> @@ -126,10 +248,32 @@ static int xive_native_debug_show(struct seq_file *m, 
> void *private)
>  {
>       struct kvmppc_xive *xive = m->private;
>       struct kvm *kvm = xive->kvm;
> +     struct kvm_vcpu *vcpu;
> +     unsigned int i;
>  
>       if (!kvm)
>               return 0;
>  
> +     seq_puts(m, "=========\nVCPU state\n=========\n");
> +
> +     kvm_for_each_vcpu(i, vcpu, kvm) {
> +             struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> +             if (!xc)
> +                     continue;
> +
> +             seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x 
> PIPR=%02x w01=%016llx w2=%08x\n",
> +                        xc->server_num,
> +                        vcpu->arch.xive_saved_state.nsr,
> +                        vcpu->arch.xive_saved_state.cppr,
> +                        vcpu->arch.xive_saved_state.ipb,
> +                        vcpu->arch.xive_saved_state.pipr,
> +                        vcpu->arch.xive_saved_state.w01,
> +                        (u32) vcpu->arch.xive_cam_word);
> +
> +             kvmppc_xive_debug_show_queues(m, vcpu);
> +     }
> +
>       return 0;
>  }
>  
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 8c69af10f91d..a38a643a24dd 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -570,6 +570,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>       case KVM_CAP_PPC_GET_CPU_CHAR:
>               r = 1;
>               break;
> +#ifdef CONFIG_KVM_XIVE
> +     case KVM_CAP_PPC_IRQ_XIVE:
> +             /* only for PowerNV */
> +             r = !!cpu_has_feature(CPU_FTR_HVMODE);
> +             break;
> +#endif
>  
>       case KVM_CAP_PPC_ALLOC_HTAB:
>               r = hv_enabled;
> @@ -753,6 +759,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
>               else
>                       kvmppc_xics_free_icp(vcpu);
>               break;
> +     case KVMPPC_IRQ_XIVE:
> +             kvmppc_xive_native_cleanup_vcpu(vcpu);
> +             break;
>       }
>  
>       kvmppc_core_vcpu_free(vcpu);
> @@ -1941,6 +1950,30 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu 
> *vcpu,
>               break;
>       }
>  #endif /* CONFIG_KVM_XICS */
> +#ifdef CONFIG_KVM_XIVE
> +     case KVM_CAP_PPC_IRQ_XIVE: {
> +             struct fd f;
> +             struct kvm_device *dev;
> +
> +             r = -EBADF;
> +             f = fdget(cap->args[0]);
> +             if (!f.file)
> +                     break;
> +
> +             r = -ENXIO;
> +             if (!xive_enabled())
> +                     break;
> +
> +             r = -EPERM;
> +             dev = kvm_device_from_filp(f.file);
> +             if (dev)
> +                     r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
> +                                                         cap->args[1]);
> +
> +             fdput(f);
> +             break;
> +     }
> +#endif /* CONFIG_KVM_XIVE */
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>       case KVM_CAP_PPC_FWNMI:
>               r = -EINVAL;
> diff --git a/Documentation/virtual/kvm/api.txt 
> b/Documentation/virtual/kvm/api.txt
> index 356156f5c52d..1db1435769b4 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4458,6 +4458,15 @@ struct kvm_sync_regs {
>          struct kvm_vcpu_events events;
>  };
>  
> +6.75 KVM_CAP_PPC_IRQ_XIVE
> +
> +Architectures: ppc
> +Target: vcpu
> +Parameters: args[0] is the XIVE device fd
> +            args[1] is the XIVE CPU number (server ID) for this vcpu
> +
> +This capability connects the vcpu to an in-kernel XIVE device.
> +
>  7. Capabilities that can be enabled on VMs
>  ------------------------------------------
>  

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to