On Mon, Mar 20, 2017 at 05:49:14PM +1100, Benjamin Herrenschmidt wrote: > This patch makes KVM capable of using the XIVE interrupt controller > to provide the standard PAPR "XICS" style hypercalls. It is necessary > for proper operations when the host uses XIVE natively. > > This has been lightly tested on an actual system, including PCI > pass-through with a TG3 device. > > Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Looks good overall, some comments below... > --- > arch/powerpc/include/asm/kvm_book3s_asm.h | 2 + > arch/powerpc/include/asm/kvm_host.h | 28 +- > arch/powerpc/include/asm/kvm_ppc.h | 38 + > arch/powerpc/include/asm/xive.h | 11 +- > arch/powerpc/kernel/asm-offsets.c | 10 + > arch/powerpc/kvm/Makefile | 4 +- > arch/powerpc/kvm/book3s.c | 73 +- > arch/powerpc/kvm/book3s_hv.c | 52 +- > arch/powerpc/kvm/book3s_hv_builtin.c | 108 ++ > arch/powerpc/kvm/book3s_hv_rm_xics.c | 10 +- > arch/powerpc/kvm/book3s_hv_rm_xive.c | 47 + > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 60 +- > arch/powerpc/kvm/book3s_rtas.c | 21 +- > arch/powerpc/kvm/book3s_xics.c | 35 +- > arch/powerpc/kvm/book3s_xics.h | 5 + > arch/powerpc/kvm/book3s_xive.c | 1898 > +++++++++++++++++++++++++++++ > arch/powerpc/kvm/book3s_xive.h | 251 ++++ > arch/powerpc/kvm/book3s_xive_template.c | 490 ++++++++ > arch/powerpc/kvm/irq.h | 1 + > arch/powerpc/kvm/powerpc.c | 17 +- > arch/powerpc/platforms/powernv/opal.c | 1 + > arch/powerpc/sysdev/xive/common.c | 131 +- > arch/powerpc/sysdev/xive/native.c | 92 +- > include/linux/kvm_host.h | 1 - > virt/kvm/kvm_main.c | 4 - > 25 files changed, 3305 insertions(+), 85 deletions(-) > create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c > create mode 100644 arch/powerpc/kvm/book3s_xive.c > create mode 100644 arch/powerpc/kvm/book3s_xive.h > create mode 100644 arch/powerpc/kvm/book3s_xive_template.c > > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h > b/arch/powerpc/include/asm/kvm_book3s_asm.h > index 0593d94..e719002 100644 > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h > @@ -111,6 +111,8 @@ struct kvmppc_host_state { > struct kvm_vcpu *kvm_vcpu; > struct kvmppc_vcore *kvm_vcore; > void __iomem *xics_phys; > + void __iomem *xive_tm_area_phys; > + void __iomem *xive_tm_area_virt; Does this cause the paca to become a cacheline larger? (Not that there is much alternative to having these fields.) > u32 saved_xirr; > u64 dabr; > u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > index 7bba8f4..fc491ac 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -205,6 +205,12 @@ struct kvmppc_spapr_tce_table { > /* XICS components, defined in book3s_xics.c */ > struct kvmppc_xics; > struct kvmppc_icp; > +extern struct kvm_device_ops kvm_xics_ops; > + > +/* XIVE components, defined in book3s_xive.c */ > +struct kvmppc_xive; > +struct kvmppc_xive_vcpu; > +extern struct kvm_device_ops kvm_xive_ops; > > struct kvmppc_passthru_irqmap; > > @@ -293,6 +299,7 @@ struct kvm_arch { > #endif > #ifdef CONFIG_KVM_XICS > struct kvmppc_xics *xics; > + struct kvmppc_xive *xive; > struct kvmppc_passthru_irqmap *pimap; > #endif > struct kvmppc_ops *kvm_ops; > @@ -421,7 +428,7 @@ struct kvmppc_passthru_irqmap { > > #define KVMPPC_IRQ_DEFAULT 0 > #define KVMPPC_IRQ_MPIC 1 > -#define KVMPPC_IRQ_XICS 2 > +#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ > > #define MMIO_HPTE_CACHE_SIZE 4 > > @@ -443,6 +450,21 @@ struct mmio_hpte_cache { > > struct openpic; > > +/* QW0 and QW1 of a context */ > +union xive_qw01 { > + struct { > + u8 nsr; > + u8 cppr; > + u8 ipb; > + u8 lsmfb; > + u8 ack; > + u8 inc; > + u8 age; > + u8 pipr; > + }; > + __be64 qw; > +}; This is slightly confusing because a "QW" (quadword) would normally be 128 bits, but this union is 64 bits. 
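For what it's worth, the asm below stores this at TM_QW1_OS, so it looks like these are really words 0 and 1 of the OS ring (QW1). A comment to that effect (and maybe a name like xive_qw1_w01 -- just a suggestion) would make the 64-bit size less surprising, e.g.:

	/*
	 * Words 0 and 1 of the OS ring (QW1) of a thread context.
	 * Note: this is a single 64-bit doubleword, not a full
	 * 128-bit quadword.
	 */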
> + > struct kvm_vcpu_arch { > ulong host_stack; > u32 host_pid; > @@ -688,6 +710,10 @@ struct kvm_vcpu_arch { > struct openpic *mpic; /* KVM_IRQ_MPIC */ > #ifdef CONFIG_KVM_XICS > struct kvmppc_icp *icp; /* XICS presentation controller */ > + struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ > + __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ > + u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */ > + union xive_qw01 xive_saved_state; /* W0..1 of XIVE state */ > #endif > > #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > diff --git a/arch/powerpc/include/asm/kvm_ppc.h > b/arch/powerpc/include/asm/kvm_ppc.h > index c387799..2fcf6cf 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -225,6 +225,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, > struct kvm_interrupt *irq); > extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user > *argp); > extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu); > extern void kvmppc_rtas_tokens_free(struct kvm *kvm); > + > extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, > u32 priority); > extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, > @@ -232,6 +233,15 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 > irq, u32 *server, > extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq); > extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq); > > +extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, > + u32 priority); > +extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, > + u32 *priority); Might be worth a comment here to explain that the first xive is eXternal Interrupt Virtualization Engine and the second xive is eXternal Interrupt Vector Entry. > +extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq); > +extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq); > +extern void kvmppc_xive_init_module(void); > +extern void kvmppc_xive_exit_module(void); > + > void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu); > void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu); > > @@ -412,6 +422,14 @@ static inline void kvmppc_set_xics_phys(int cpu, > unsigned long addr) > paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr; > } > > +static inline void kvmppc_set_xive_tm_area(int cpu, > + unsigned long phys_addr, > + void __iomem *virt_addr) > +{ > + paca[cpu].kvm_hstate.xive_tm_area_phys = (void __iomem *)phys_addr; > + paca[cpu].kvm_hstate.xive_tm_area_virt = virt_addr; > +} > + > static inline u32 kvmppc_get_xics_latch(void) > { > u32 xirr; > @@ -442,6 +460,9 @@ static inline void __init kvm_cma_reserve(void) > static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) > {} > > +static inline void kvmppc_set_xive_tm_area_phys(int cpu, unsigned long addr) > +{} Shouldn't this be kvmppc_set_xive_tm_area to match the other definition? 
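i.e. presumably the empty stub wants the same name and signature as the real definition above, something like:

static inline void kvmppc_set_xive_tm_area(int cpu, unsigned long phys_addr,
					   void __iomem *virt_addr)
{}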
> + > static inline u32 kvmppc_get_xics_latch(void) > { > return 0; > @@ -492,6 +513,21 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu > *vcpu, __be32 xirr, > struct kvmppc_irq_map *irq_map, > struct kvmppc_passthru_irqmap *pimap, > bool *again); > +extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > + struct kvm_vcpu *vcpu, u32 cpu); > +extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); > +extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, > + struct irq_desc *host_desc); > +extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, > + struct irq_desc *host_desc); > +extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); > +extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); > + > +extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, > + int level, bool line_status); > +extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, > + int level, bool line_status); > + > extern int h_ipi_redirect; > #else > static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( > @@ -546,6 +582,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned > long flags, > long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, > unsigned long slb_v, unsigned int status, bool > data); > unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); > +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu); > +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); > int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > unsigned long mfrr); > int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); > diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h > index b1604b73..94b5cca 100644 > --- a/arch/powerpc/include/asm/xive.h > +++ b/arch/powerpc/include/asm/xive.h > @@ -55,7 +55,8 @@ struct xive_q { > #define XIVE_ESB_SET_PQ_01 0xd00 > #define XIVE_ESB_SET_PQ_10 0xe00 > #define XIVE_ESB_SET_PQ_11 0xf00 > -#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 > +#define XIVE_ESB_SOFT_MASK XIVE_ESB_SET_PQ_10 > +#define XIVE_ESB_HARD_MASK XIVE_ESB_SET_PQ_01 What's the difference between a "soft" mask and a "hard" mask? 
> > extern bool __xive_enabled; > > @@ -88,11 +89,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct > xive_q *q, u8 prio, > __be32 *qpage, u32 order, bool > can_escalate); > extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); > > -extern bool __xive_irq_trigger(struct xive_irq_data *xd); > -extern bool __xive_irq_retrigger(struct xive_irq_data *xd); > -extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); > - > +extern void xive_native_sync_source(u32 hw_irq); > extern bool is_xive_irq(struct irq_chip *chip); > +extern int xive_native_enable_vp(u32 vp_id); > +extern int xive_native_disable_vp(u32 vp_id); > +extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 > *out_chip_id); > > #else > > diff --git a/arch/powerpc/kernel/asm-offsets.c > b/arch/powerpc/kernel/asm-offsets.c > index 4367e7d..59fa705 100644 > --- a/arch/powerpc/kernel/asm-offsets.c > +++ b/arch/powerpc/kernel/asm-offsets.c > @@ -630,6 +630,8 @@ int main(void) > HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); > HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); > HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); > + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_PHYS, xive_tm_area_phys); > + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_VIRT, xive_tm_area_virt); > HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); > HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); > HSTATE_FIELD(HSTATE_PTID, ptid); > @@ -715,6 +717,14 @@ int main(void) > OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6); > #endif > > +#ifdef CONFIG_KVM_XICS > + DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu, > + arch.xive_saved_state)); > + DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, > + arch.xive_cam_word)); > + DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); > +#endif > + > #ifdef CONFIG_KVM_EXIT_TIMING > OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu); > OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl); > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile > index b87ccde..ef89c8c 100644 > --- a/arch/powerpc/kvm/Makefile > +++ b/arch/powerpc/kvm/Makefile > @@ -74,7 +74,7 @@ kvm-hv-y += \ > book3s_64_mmu_radix.o > > kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ > - book3s_hv_rm_xics.o > + book3s_hv_rm_xics.o book3s_hv_rm_xive.o > > ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ > @@ -87,7 +87,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) > += \ > endif > > kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ > - book3s_xics.o > + book3s_xics.o book3s_xive.o > > kvm-book3s_64-module-objs := \ > $(common-objs-y) \ > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index aedacef..e459ec4 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -35,6 +35,7 @@ > #include <asm/kvm_book3s.h> > #include <asm/mmu_context.h> > #include <asm/page.h> > +#include <asm/xive.h> > > #include "book3s.h" > #include "trace.h" > @@ -578,11 +579,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, > break; > #ifdef CONFIG_KVM_XICS > case KVM_REG_PPC_ICP_STATE: > - if (!vcpu->arch.icp) { > + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { > r = -ENXIO; > break; > } > - *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); > + if (xive_enabled()) > + *val = get_reg_val(id, > kvmppc_xive_get_icp(vcpu)); > + else > + *val = get_reg_val(id, > kvmppc_xics_get_icp(vcpu)); > break; > #endif /* CONFIG_KVM_XICS */ > case KVM_REG_PPC_FSCR: > @@ -648,12 +652,14 @@ int 
kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, > #endif /* CONFIG_VSX */ > #ifdef CONFIG_KVM_XICS > case KVM_REG_PPC_ICP_STATE: > - if (!vcpu->arch.icp) { > + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { > r = -ENXIO; > break; > } > - r = kvmppc_xics_set_icp(vcpu, > - set_reg_val(id, *val)); > + if (xive_enabled()) > + r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, > *val)); > + else > + r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, > *val)); > break; > #endif /* CONFIG_KVM_XICS */ > case KVM_REG_PPC_FSCR: > @@ -924,6 +930,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, > unsigned long hcall) > return kvm->arch.kvm_ops->hcall_implemented(hcall); > } > > +#ifdef CONFIG_KVM_XICS > +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, > + bool line_status) > +{ > + if (xive_enabled()) > + return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, > + line_status); > + else > + return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level, > + line_status); > +} > + > +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, > + struct kvm *kvm, int irq_source_id, > + int level, bool line_status) > +{ > + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, > + level, line_status); > +} > +static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e, > + struct kvm *kvm, int irq_source_id, int level, > + bool line_status) > +{ > + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); > +} > + > +int kvm_irq_map_gsi(struct kvm *kvm, > + struct kvm_kernel_irq_routing_entry *entries, int gsi) > +{ > + entries->gsi = gsi; > + entries->type = KVM_IRQ_ROUTING_IRQCHIP; > + entries->set = kvmppc_book3s_set_irq; > + entries->irqchip.irqchip = 0; > + entries->irqchip.pin = gsi; > + return 1; > +} > + > +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) > +{ > + return pin; > +} > + > +#endif /* CONFIG_KVM_XICS */ > + > static int kvmppc_book3s_init(void) > { > int r; > @@ -934,12 +984,23 @@ static int kvmppc_book3s_init(void) > #ifdef CONFIG_KVM_BOOK3S_32_HANDLER > r = kvmppc_book3s_init_pr(); > #endif > - return r; > > +#ifdef CONFIG_KVM_XICS > + if (xive_enabled()) { > + kvmppc_xive_init_module(); > + kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); > + } else > + kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); > +#endif > + return r; > } > > static void kvmppc_book3s_exit(void) > { > +#ifdef CONFIG_KVM_XICS > + if (xive_enabled()) > + kvmppc_xive_exit_module(); > +#endif > #ifdef CONFIG_KVM_BOOK3S_32_HANDLER > kvmppc_book3s_exit_pr(); > #endif > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index fadb75a..5c340c2 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -67,6 +67,7 @@ > #include <asm/mmu.h> > #include <asm/opal.h> > #include <asm/xics.h> > +#include <asm/xive.h> > > #include "book3s.h" > > @@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > case H_IPOLL: > case H_XIRR_X: > if (kvmppc_xics_enabled(vcpu)) { > + if (xive_enabled()) { > + ret = H_NOT_AVAILABLE; > + return RESUME_GUEST; > + } > ret = kvmppc_xics_hcall(vcpu, req); > break; > } > @@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, > struct kvm_vcpu *vcpu) > r = kvmppc_book3s_hv_page_fault(run, vcpu, > vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); > srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); > - } else if (r == RESUME_PASSTHROUGH) > - r = kvmppc_xics_rm_complete(vcpu, 0); > + } else if (r == 
RESUME_PASSTHROUGH) { > + if (WARN_ON(xive_enabled())) > + r = H_SUCCESS; > + else > + r = kvmppc_xics_rm_complete(vcpu, 0); > + } > } while (is_kvmppc_resume_guest(r)); > > out: > @@ -3400,10 +3409,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) > /* > * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) > * Set HVICE bit to enable hypervisor virtualization interrupts. > + * Set HEIC to prevent OS interrupts to go to hypervisor (should > + * be unnecessary but better safe than sorry in case we re-enable > + * EE in HV mode with this LPCR still set) > */ > if (cpu_has_feature(CPU_FTR_ARCH_300)) { > lpcr &= ~LPCR_VPM0; > - lpcr |= LPCR_HVICE; > + lpcr |= LPCR_HVICE | LPCR_HEIC; > + > + /* If xive is enabled, we route 0x500 interrupts directly > + * to the guest > + */ > + if (xive_enabled()) > + lpcr |= LPCR_LPES; > } > > /* > @@ -3533,7 +3551,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int > host_irq, int guest_gsi) > struct kvmppc_irq_map *irq_map; > struct kvmppc_passthru_irqmap *pimap; > struct irq_chip *chip; > - int i; > + int i, rc = 0; > > if (!kvm_irq_bypass) > return 1; > @@ -3558,10 +3576,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, > int host_irq, int guest_gsi) > /* > * For now, we only support interrupts for which the EOI operation > * is an OPAL call followed by a write to XIRR, since that's > - * what our real-mode EOI code does. > + * what our real-mode EOI code does, or a XIVE interrupt > */ > chip = irq_data_get_irq_chip(&desc->irq_data); > - if (!chip || !is_pnv_opal_msi(chip)) { > + if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { > pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map > for (%d,%d)\n", > host_irq, guest_gsi); > mutex_unlock(&kvm->lock); > @@ -3603,7 +3621,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, > int host_irq, int guest_gsi) > if (i == pimap->n_mapped) > pimap->n_mapped++; > > - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); > + if (xive_enabled()) > + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); > + else > + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); > + printk("set mapped for IRQ %d -> %d returned %d\n", > + host_irq, guest_gsi, rc); This seems like a debugging thing that should be removed or turned into a DBG(). > + if (rc) > + irq_map->r_hwirq = 0; > > mutex_unlock(&kvm->lock); > > @@ -3614,7 +3639,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int > host_irq, int guest_gsi) > { > struct irq_desc *desc; > struct kvmppc_passthru_irqmap *pimap; > - int i; > + int i, rc = 0; > > if (!kvm_irq_bypass) > return 0; > @@ -3641,9 +3666,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, > int host_irq, int guest_gsi) > return -ENODEV; > } > > - kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); > + if (xive_enabled()) > + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, > pimap->mapped[i].desc); > + else > + kvmppc_xics_clr_mapped(kvm, guest_gsi, > pimap->mapped[i].r_hwirq); > > - /* invalidate the entry */ > + /* invalidate the entry (what do do on error from the above ?) */ > pimap->mapped[i].r_hwirq = 0; > > /* > @@ -3652,7 +3680,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int > host_irq, int guest_gsi) > */ > > mutex_unlock(&kvm->lock); > - return 0; > + return rc; > } > > static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer > *cons, > @@ -3930,7 +3958,7 @@ static int kvmppc_book3s_init_hv(void) > * indirectly, via OPAL. 
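Something along these lines, for instance (pr_debug() here is just one option):

	if (xive_enabled())
		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
	else
		kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
	pr_debug("set mapped for IRQ %d -> %d returned %d\n",
		 host_irq, guest_gsi, rc);
	if (rc)
		irq_map->r_hwirq = 0;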
> */ > #ifdef CONFIG_SMP > - if (!get_paca()->kvm_hstate.xics_phys) { > + if (!xive_enabled() && !get_paca()->kvm_hstate.xics_phys) { > struct device_node *np; > > np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); > diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c > b/arch/powerpc/kvm/book3s_hv_builtin.c > index d48f9b6..8de7ed4 100644 > --- a/arch/powerpc/kvm/book3s_hv_builtin.c > +++ b/arch/powerpc/kvm/book3s_hv_builtin.c > @@ -23,6 +23,7 @@ > #include <asm/kvm_book3s.h> > #include <asm/archrandom.h> > #include <asm/xics.h> > +#include <asm/xive.h> > #include <asm/dbell.h> > #include <asm/cputhreads.h> > #include <asm/io.h> > @@ -31,6 +32,24 @@ > > #define KVM_CMA_CHUNK_ORDER 18 > > +#include "book3s_xics.h" > +#include "book3s_xive.h" > + > +/* > + * The XIVE module will populate these when it loads > + */ > +unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu); > +unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long > server); > +int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr); > +int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); > +int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); > +EXPORT_SYMBOL_GPL(__xive_vm_h_xirr); > +EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll); > +EXPORT_SYMBOL_GPL(__xive_vm_h_ipi); > +EXPORT_SYMBOL_GPL(__xive_vm_h_cppr); > +EXPORT_SYMBOL_GPL(__xive_vm_h_eoi); > + > /* > * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) > * should be power of 2. > @@ -209,6 +228,7 @@ void kvmhv_rm_send_ipi(int cpu) > __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); > return; > } > + > /* On POWER8 for IPIs to threads in the same core, use msgsnd. */ > if (cpu_has_feature(CPU_FTR_ARCH_207S) && > cpu_first_thread_sibling(cpu) == > @@ -218,6 +238,10 @@ void kvmhv_rm_send_ipi(int cpu) > return; > } > > + /* We should never reach this */ > + if (WARN_ON_ONCE(xive_enabled())) > + return; > + > /* Else poke the target with an IPI */ > xics_phys = paca[cpu].kvm_hstate.xics_phys; > if (xics_phys) > @@ -398,6 +422,9 @@ static long kvmppc_read_one_intr(bool *again) > u8 host_ipi; > int64_t rc; > > + if (xive_enabled()) > + return 1; Why not do this in kvmppc_read_intr() rather than here? > + > /* see if a host IPI is pending */ > host_ipi = local_paca->kvm_hstate.host_ipi; > if (host_ipi) > @@ -482,3 +509,84 @@ static long kvmppc_read_one_intr(bool *again) > > return kvmppc_check_passthru(xisr, xirr, again); > } > + > +static inline bool is_rm(void) > +{ > + return !(mfmsr() & MSR_DR); > +} > + > +/* XXX FIXME: The xive_vm_* calls are in a module... 
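i.e. something like this (just a sketch, from memory of what kvmppc_read_intr() looks like):

long kvmppc_read_intr(void)
{
	long ret = 0;
	long rc;
	bool again = true;

	/* With XIVE enabled there is no XICS XIRR to read here;
	 * any external interrupt taken at this point is for the host.
	 */
	if (xive_enabled())
		return 1;

	while (again) {
		again = false;
		rc = kvmppc_read_one_intr(&again);
		if (rc && (ret == 0 || rc > ret))
			ret = rc;
	}
	return ret;
}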
*/ > + > +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) > +{ > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_xirr(vcpu); > + if (unlikely(!__xive_vm_h_xirr)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_xirr(vcpu); > + } else > + return xics_rm_h_xirr(vcpu); > +} > + > +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) > +{ > + vcpu->arch.gpr[5] = get_tb(); > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_xirr(vcpu); > + if (unlikely(!__xive_vm_h_xirr)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_xirr(vcpu); > + } else > + return xics_rm_h_xirr(vcpu); > +} > + > +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) > +{ > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_ipoll(vcpu, server); > + if (unlikely(!__xive_vm_h_ipoll)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_ipoll(vcpu, server); > + } else > + return H_TOO_HARD; > +} > + > +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr) > +{ > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_ipi(vcpu, server, mfrr); > + if (unlikely(!__xive_vm_h_ipi)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_ipi(vcpu, server, mfrr); > + } else > + return xics_rm_h_ipi(vcpu, server, mfrr); > +} > + > +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) > +{ > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_cppr(vcpu, cppr); > + if (unlikely(!__xive_vm_h_cppr)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_cppr(vcpu, cppr); > + } else > + return xics_rm_h_cppr(vcpu, cppr); > +} > + > +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) > +{ > + if (xive_enabled()) { > + if (is_rm()) > + return xive_rm_h_eoi(vcpu, xirr); > + if (unlikely(!__xive_vm_h_eoi)) > + return H_NOT_AVAILABLE; > + return __xive_vm_h_eoi(vcpu, xirr); > + } else > + return xics_rm_h_eoi(vcpu, xirr); > +} > diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c > b/arch/powerpc/kvm/book3s_hv_rm_xics.c > index 3a1a463..f806880 100644 > --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c > +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c > @@ -485,7 +485,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, > struct kvmppc_icp *icp, > } > > > -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) > +unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu) > { > union kvmppc_icp_state old_state, new_state; > struct kvmppc_xics *xics = vcpu->kvm->arch.xics; > @@ -523,8 +523,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) > return check_too_hard(xics, icp); > } > > -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > - unsigned long mfrr) > +int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr) > { > union kvmppc_icp_state old_state, new_state; > struct kvmppc_xics *xics = vcpu->kvm->arch.xics; > @@ -610,7 +610,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long > server, > return check_too_hard(xics, this_icp); > } > > -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) > +int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) > { > union kvmppc_icp_state old_state, new_state; > struct kvmppc_xics *xics = vcpu->kvm->arch.xics; > @@ -730,7 +730,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) > return check_too_hard(xics, icp); > } > > -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) > +int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) > { > struct kvmppc_xics *xics = 
vcpu->kvm->arch.xics; > struct kvmppc_icp *icp = vcpu->arch.icp; > diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c > b/arch/powerpc/kvm/book3s_hv_rm_xive.c > new file mode 100644 > index 0000000..6390f71 > --- /dev/null > +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c > @@ -0,0 +1,47 @@ > +#include <linux/kernel.h> > +#include <linux/kvm_host.h> > +#include <linux/err.h> > +#include <linux/kernel_stat.h> > + > +#include <asm/kvm_book3s.h> > +#include <asm/kvm_ppc.h> > +#include <asm/hvcall.h> > +#include <asm/xics.h> > +#include <asm/debug.h> > +#include <asm/synch.h> > +#include <asm/cputhreads.h> > +#include <asm/pgtable.h> > +#include <asm/ppc-opcode.h> > +#include <asm/pnv-pci.h> > +#include <asm/opal.h> > +#include <asm/smp.h> > +#include <asm/asm-prototypes.h> > +#include <asm/xive.h> > + > +#include "book3s_xive.h" > +#include "../sysdev/xive/xive-regs.h" > + > +/* XXX */ > +#include <asm/udbg.h> > +//#define DBG(fmt...) udbg_printf(fmt) > +#define DBG(fmt...) do { } while(0) > + > +static inline void __iomem *get_tm_area_phys(void) > +{ > + return local_paca->kvm_hstate.xive_tm_area_phys; > +} > + > +#undef XIVE_RUNTIME_CHECKS > +#define X_PFX xive_rm_ > +#define X_STATIC > +#define X_STAT_PFX stat_rm_ > +#define __x_tm_area get_tm_area_phys() > +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) > +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) > +#define __x_readb __raw_rm_readb > +#define __x_writeb __raw_rm_writeb > +#define __x_readw __raw_rm_readw > +#define __x_readq __raw_rm_readq > +#define __x_writeq __raw_rm_writeq > + > +#include "book3s_xive_template.c" > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > index 720b9c0..c06cccd 100644 > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > @@ -31,6 +31,8 @@ > #include <asm/tm.h> > #include <asm/opal.h> > > +#include "../sysdev/xive/xive-regs.h" > + > #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) > > /* Values in HSTATE_NAPPING(r13) */ > @@ -982,6 +984,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) > cmpwi r3, 512 /* 1 microsecond */ > blt hdec_soon > > +#ifdef CONFIG_KVM_XICS > + /* We are entering the guest on that thread, push VCPU to XIVE */ > + ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13) > + cmpldi cr0, r10, r0 > + beq no_xive > + ld r11, VCPU_XIVE_SAVED_STATE(r4) > + li r9, TM_QW1_OS > + stdcix r11,r9,r10 > + eieio > + lwz r11, VCPU_XIVE_CAM_WORD(r4) > + li r9, TM_QW1_OS + TM_WORD2 > + stwcix r11,r9,r10 > + li r9, 1 > + stw r9, VCPU_XIVE_PUSHED(r4) > +no_xive: > +#endif /* CONFIG_KVM_XICS */ > + > deliver_guest_interrupt: > ld r6, VCPU_CTR(r4) > ld r7, VCPU_XER(r4) > @@ -1319,6 +1338,38 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) > blt deliver_guest_interrupt > > guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ > +#ifdef CONFIG_KVM_XICS > + /* We are exiting, pull the VP from the XIVE */ > + lwz r0, VCPU_XIVE_PUSHED(r9) > + cmpwi cr0, r0, 0 > + beq 1f > + li r7, TM_SPC_PULL_OS_CTX > + li r6, TM_QW1_OS > + mfmsr r0 > + andi. r0, r0, MSR_IR /* in real mode? */ > + beq 2f > + ld r10, HSTATE_XIVE_TM_AREA_VIRT(r13) > + cmpldi cr0, r10, 0 > + beq 1f > + lwzx r11, r7, r10 > + eieio > + ldx r11, r6, r10 I assume you meant to do these two loads into the same target register, but I don't know why, so a comment would be useful. 
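If the intent is that the first (32-bit) load from TM_SPC_PULL_OS_CTX pulls the OS context out of the TIMA (value discarded) and the second (64-bit) load from TM_QW1_OS fetches W0/W1 so they can be saved below, then comments to that effect would do it -- assuming I've guessed the intent correctly:

	/* First load to pull the context (we ignore the value) */
	lwzx	r11, r7, r10
	eieio
	/* Second load to recover the context state (Words 0 and 1) */
	ldx	r11, r6, r10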
> + b 3f > +2: ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13) > + cmpldi cr0, r10, 0 > + beq 1f > + lwzcix r11, r7, r10 > + eieio > + ldcix r11, r6, r10 > +3: std r11, VCPU_XIVE_SAVED_STATE(r9) > + /* Fixup some of the state for the next load */ > + li r10, 0 > + li r0, 0xff > + stw r10, VCPU_XIVE_PUSHED(r9) > + stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) > + stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) > +1: > +#endif /* CONFIG_KVM_XICS */ > /* Save more register state */ > mfdar r6 > mfdsisr r7 > @@ -2035,7 +2086,7 @@ hcall_real_table: > .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table > .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table > .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table > - .long 0 /* 0x70 - H_IPOLL */ > + .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table > .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table > #else > .long 0 /* 0x64 - H_EOI */ > @@ -2205,7 +2256,11 @@ hcall_real_table: > .long 0 /* 0x2f0 */ > .long 0 /* 0x2f4 */ > .long 0 /* 0x2f8 */ > - .long 0 /* 0x2fc */ > +#ifdef CONFIG_KVM_XICS > + .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table > +#else > + .long 0 /* 0x2fc - H_XIRR_X*/ > +#endif > .long DOTSYM(kvmppc_h_random) - hcall_real_table > .globl hcall_real_table_end > hcall_real_table_end: > @@ -2980,6 +3035,7 @@ kvmppc_fix_pmao: > isync > blr > > + Gratuitous extra blank line. > #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING > /* > * Start timing an activity > diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c > index 20528701..2d3b2b1 100644 > --- a/arch/powerpc/kvm/book3s_rtas.c > +++ b/arch/powerpc/kvm/book3s_rtas.c > @@ -16,6 +16,7 @@ > #include <asm/kvm_ppc.h> > #include <asm/hvcall.h> > #include <asm/rtas.h> > +#include <asm/xive.h> > > #ifdef CONFIG_KVM_XICS > static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) > @@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, > struct rtas_args *args) > server = be32_to_cpu(args->args[1]); > priority = be32_to_cpu(args->args[2]); > > - rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); > + if (xive_enabled()) > + rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); > + else > + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); > if (rc) > rc = -3; > out: > @@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, > struct rtas_args *args) > irq = be32_to_cpu(args->args[0]); > > server = priority = 0; > - rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); > + if (xive_enabled()) > + rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); > + else > + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); > if (rc) { > rc = -3; > goto out; > @@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct > rtas_args *args) > > irq = be32_to_cpu(args->args[0]); > > - rc = kvmppc_xics_int_off(vcpu->kvm, irq); > + if (xive_enabled()) > + rc = kvmppc_xive_int_off(vcpu->kvm, irq); > + else > + rc = kvmppc_xics_int_off(vcpu->kvm, irq); > if (rc) > rc = -3; > out: > @@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct > rtas_args *args) > > irq = be32_to_cpu(args->args[0]); > > - rc = kvmppc_xics_int_on(vcpu->kvm, irq); > + if (xive_enabled()) > + rc = kvmppc_xive_int_on(vcpu->kvm, irq); > + else > + rc = kvmppc_xics_int_on(vcpu->kvm, irq); > if (rc) > rc = -3; > out: > diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c > index ef4fd52..e6829c4 100644 > --- a/arch/powerpc/kvm/book3s_xics.c > +++ b/arch/powerpc/kvm/book3s_xics.c > @@ 
-1307,8 +1307,8 @@ static int xics_set_source(struct kvmppc_xics *xics, > long irq, u64 addr) > return 0; > } > > -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, > - bool line_status) > +int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int > level, > + bool line_status) > { > struct kvmppc_xics *xics = kvm->arch.xics; > > @@ -1317,14 +1317,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, > u32 irq, int level, > return ics_deliver_irq(xics, irq, level); > } > > -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, > - struct kvm *kvm, int irq_source_id, > - int level, bool line_status) > -{ > - return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, > - level, line_status); > -} > - > static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr > *attr) > { > struct kvmppc_xics *xics = dev->private; > @@ -1458,29 +1450,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) > vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; > } > > -static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e, > - struct kvm *kvm, int irq_source_id, int level, > - bool line_status) > -{ > - return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); > -} > - > -int kvm_irq_map_gsi(struct kvm *kvm, > - struct kvm_kernel_irq_routing_entry *entries, int gsi) > -{ > - entries->gsi = gsi; > - entries->type = KVM_IRQ_ROUTING_IRQCHIP; > - entries->set = xics_set_irq; > - entries->irqchip.irqchip = 0; > - entries->irqchip.pin = gsi; > - return 1; > -} > - > -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) > -{ > - return pin; > -} > - > void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq, > unsigned long host_irq) > { > diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h > index ec5474c..5016676 100644 > --- a/arch/powerpc/kvm/book3s_xics.h > +++ b/arch/powerpc/kvm/book3s_xics.h > @@ -144,5 +144,10 @@ static inline struct kvmppc_ics > *kvmppc_xics_find_ics(struct kvmppc_xics *xics, > return ics; > } > > +extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu); > +extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr); > +extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); > +extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); > > #endif /* _KVM_PPC_BOOK3S_XICS_H */ > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > new file mode 100644 > index 0000000..acc882d > --- /dev/null > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -0,0 +1,1898 @@ > +/* > + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/kernel.h> > +#include <linux/kvm_host.h> > +#include <linux/err.h> > +#include <linux/gfp.h> > +#include <linux/spinlock.h> > +#include <linux/delay.h> > +#include <linux/percpu.h> > +#include <linux/cpumask.h> > +#include <asm/uaccess.h> > +#include <asm/kvm_book3s.h> > +#include <asm/kvm_ppc.h> > +#include <asm/hvcall.h> > +#include <asm/xics.h> > +#include <asm/xive.h> > +#include <asm/debug.h> > +#include <asm/time.h> > +#include <asm/opal.h> > + > +#include <linux/debugfs.h> > +#include <linux/seq_file.h> > + > +#include "book3s_xive.h" > +#include "../sysdev/xive/xive-regs.h" > + > +//#define DBG(fmt...) 
printk("KVM/XIVE: " fmt) > +#define DBG(fmt...) do { } while(0) > + > +#ifdef XIVE_RUNTIME_CHECKS > +#define xive_assert(cond) WARN_ON(!(cond)) > +#else > +#define xive_assert(cond) (false) > +#endif > + > +/* > + * Virtual mode variants of the hcalls for use on radix/radix > + * with AIL. They require the VCPU's VP to be "pushed" > + * > + * We still instanciate them here because we use some of the > + * generated utility functions as well in this file. > + */ > +#define XIVE_RUNTIME_CHECKS > +#define X_PFX xive_vm_ > +#define X_STATIC static > +#define X_STAT_PFX stat_vm_ > +#define __x_tm_area xive_tm_area > +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) > +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) > +#define __x_readb __raw_readb > +#define __x_writeb __raw_writeb > +#define __x_readw __raw_readw > +#define __x_readq __raw_readq > +#define __x_writeq __raw_writeq > + > +#include "book3s_xive_template.c" > + > +/* We leave a gap of a couple of interrupts in the queue to > + * account for the IPI and additional safety guard > + */ > +#define XIVE_Q_GAP 2 > + > +/* > + * This is a simple trigger for a generic XIVE IRQ. This must > + * only be called for interrupts that support a trigger page > + */ > +static bool xive_irq_trigger(struct xive_irq_data *xd) > +{ > + /* This should be only for MSIs */ > + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) > + return false; > + > + /* Those interrupts should always have a trigger page */ > + if (WARN_ON(!xd->trig_mmio)) > + return false; > + > + out_be64(xd->trig_mmio, 0); > + > + return true; > +} > + > +static irqreturn_t xive_esc_irq(int irq, void *data) > +{ > + struct kvm_vcpu *vcpu = data; > + > + /* We use the existing H_PROD mechanism to wake up the target */ > + vcpu->arch.prodded = 1; > + smp_mb(); > + if (vcpu->arch.ceded) > + kvmppc_fast_vcpu_kick(vcpu); > + > + return IRQ_HANDLED; > +} > + > +static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct xive_q *q = &xc->queues[prio]; > + char *name = NULL; > + int rc; > + > + /* Already there ? 
*/ > + if (xc->esc_virq[prio]) > + return 0; > + > + /* Hook up the escalation interrupt */ > + xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq); > + if (!xc->esc_virq[prio]) { > + pr_err("XIVE-KVM: Failed to map escalation interrupt" > + " for queue %d of VCPU %d\n", > + prio, xc->server_num); > + return -EIO; > + } > + > + /* > + * Future improvement: start with them disabled > + * and handle DD2 and later scheme of merged escalation > + * interrupts > + */ > + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d\n", > + vcpu->kvm->arch.lpid, xc->server_num, prio); > + if (!name) { > + pr_err("XIVE-KVM: Failed to allocate escalation irq name" > + " for queue %d of VCPU %d\n", > + prio, xc->server_num); > + rc = -ENOMEM; > + goto error; > + } > + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, > + IRQF_NO_THREAD, name, vcpu); > + if (rc) { > + pr_err("XIVE-KVM: Failed to request escalation interrupt" > + " for queue %d of VCPU %d\n", > + prio, xc->server_num); > + goto error; > + } > + xc->esc_virq_names[prio] = name; > + return 0; > + error: > + irq_dispose_mapping(xc->esc_virq[prio]); > + xc->esc_virq[prio] = 0; > + kfree(name); > + return rc; > +} > + > +static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct kvmppc_xive *xive = xc->xive; > + struct xive_q *q = &xc->queues[prio]; > + void *qpage; > + int rc; > + > + if (WARN_ON(q->qpage)) > + return 0; > + > + /* Allocate the queue and retrieve infos on current node for now */ > + qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_alloc_order); Possibly q_page_order would be a better name than q_alloc_order. > + if (!qpage) { > + pr_err("XIVE-KVM: Failed to allocate queue %d for VCPU %d\n", > + prio, xc->server_num); > + return -ENOMEM;; > + } > + memset(qpage, 0, 1 << xive->q_order); > + > + /* > + * Reconfigure the queue. This will set q->qpage only once the > + * queue is fully configured. This is a requirement for prio 0 > + * as we will stop doing EOIs for every IPI as soon as we observe > + * qpage being non-NULL, and instead will only EOI when we receive > + * corresponding queue 0 entries > + */ > + rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage, > + xive->q_order, true); > + if (rc) > + pr_err("XIVE-KVM: Failed to configure queue %d for VCPU %d\n", > + prio, xc->server_num); > + return rc; > +} > + > +/* Called with kvm_lock held */ > +static int xive_check_provisioning(struct kvm *kvm, u8 prio) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvm_vcpu *vcpu; > + int i, rc; > + > + lockdep_assert_held(&kvm->lock); > + > + /* Already provisioned ? */ > + if (xive->qmap & (1 << prio)) > + return 0; > + > + DBG("Provisioning prio... 
%d\n", prio); > + > + /* Provision each VCPU and enable escalations */ > + kvm_for_each_vcpu(i, vcpu, kvm) { > + if (!vcpu->arch.xive_vcpu) > + continue; > + rc = xive_provision_queue(vcpu, prio); > + if (rc == 0) > + xive_attach_escalation(vcpu, prio); > + if (rc) > + return rc; > + } > + > + /* Order previous stores and mark it as provisioned */ > + mb(); > + xive->qmap |= (1 << prio); > + return 0; > +} > + > +static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio) > +{ > + struct kvm_vcpu *vcpu; > + struct kvmppc_xive_vcpu *xc; > + struct xive_q *q; > + > + /* Locate target server */ > + vcpu = kvmppc_xive_find_server(kvm, server); > + if (!vcpu) { > + pr_warn("%s: Can't find server %d\n", __func__, server); > + return; > + } > + xc = vcpu->arch.xive_vcpu; > + if (WARN_ON(!xc)) > + return; > + > + q = &xc->queues[prio]; > + atomic_inc(&q->pending_count); > +} > + > +static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct xive_q *q; > + u32 max; > + > + if (WARN_ON(!xc)) > + return -ENXIO; > + if (!xc->valid) > + return -ENXIO; > + > + q = &xc->queues[prio]; > + if (WARN_ON(!q->qpage)) > + return -ENXIO; > + > + /* Calculate max number of interrupts in that queue. */ > + max = (q->msk + 1) - XIVE_Q_GAP; > + return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; > +} > + > +static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) > +{ > + struct kvm_vcpu *vcpu; > + int i, rc; > + > + /* Locate target server */ > + vcpu = kvmppc_xive_find_server(kvm, *server); > + if (!vcpu) { > + DBG("Can't find server %d\n", *server); > + return -EINVAL; > + } > + > + DBG("Finding irq target on 0x%x/%d...\n", *server, prio); > + > + /* Try pick it */ > + rc = xive_try_pick_queue(vcpu, prio); > + if (rc == 0) > + return rc; > + > + DBG(" .. failed, looking up candidate...\n"); > + > + /* Failed, pick another VCPU */ > + kvm_for_each_vcpu(i, vcpu, kvm) { > + if (!vcpu->arch.xive_vcpu) > + continue; > + rc = xive_try_pick_queue(vcpu, prio); > + if (rc == 0) { > + *server = vcpu->arch.xive_vcpu->server_num; > + DBG(" found on 0x%x/%d\n", *server, prio); > + return rc; > + } > + } > + DBG(" no available target !\n"); > + > + /* No available target ! */ > + return -EBUSY; > +} > + > +static u8 xive_lock_and_mask(struct kvmppc_xive *xive, > + struct kvmppc_xive_src_block *sb, > + struct kvmppc_xive_irq_state *state) > +{ > + struct xive_irq_data *xd; > + u32 hw_num; > + u8 old_prio; > + u64 val; > + > + /* > + * Take the lock, set masked, try again if racing > + * with H_EOI > + */ > + for (;;) { > + arch_spin_lock(&sb->lock); > + old_prio = state->guest_priority; > + state->guest_priority = MASKED; > + mb(); > + if (!state->in_eoi) > + break; > + state->guest_priority = old_prio; > + arch_spin_unlock(&sb->lock); > + } > + > + /* No change ? Bail */ > + if (old_prio == MASKED) > + return old_prio; > + > + /* Get the right irq */ > + kvmppc_xive_select_irq(state, &hw_num, &xd); > + > + /* > + * If the interrupt is marked as needing masking via > + * firmware, we do it here. Firmware masking however > + * is "lossy", it won't return the old p and q bits > + * and won't set the interrupt to a state where it will > + * record queued ones. If this is an issue we should do > + * lazy masking instead. > + * > + * For now, we work around this in unmask by forcing > + * an interrupt whenever we unmask a non-LSI via FW > + * (if ever). 
> + */ > + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { > + xive_native_configure_irq(hw_num, > + xive->vp_base + state->act_server, > + MASKED, state->number); > + /* set old_p so we can track if an H_EOI was done */ > + state->old_p = true; > + state->old_q = false; > + } else { > + /* Set PQ to 10, return old P and old Q and remember them */ > + val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10); > + state->old_p = !!(val & 2); > + state->old_q = !!(val & 1); > + > + /* > + * Synchronize hardware to sensure the queues are updated > + * when masking > + */ > + xive_native_sync_source(hw_num); > + } > + > + return old_prio; > +} > + > +static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb, > + struct kvmppc_xive_irq_state *state) > +{ > + /* > + * Take the lock try again if racing with H_EOI > + */ > + for (;;) { > + arch_spin_lock(&sb->lock); > + if (!state->in_eoi) > + break; > + arch_spin_unlock(&sb->lock); > + } > +} > + > +static void xive_finish_unmask(struct kvmppc_xive *xive, > + struct kvmppc_xive_src_block *sb, > + struct kvmppc_xive_irq_state *state, > + u8 prio) > +{ > + struct xive_irq_data *xd; > + u32 hw_num; > + > + /* If we aren't changing a thing, move on */ > + if (state->guest_priority != MASKED) > + goto bail; > + > + /* Get the right irq */ > + kvmppc_xive_select_irq(state, &hw_num, &xd); > + > + /* > + * See command in xive_lock_and_mask() concerning masking > + * via firmware. > + */ > + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { > + xive_native_configure_irq(hw_num, > + xive->vp_base + state->act_server, > + state->act_priority, state->number); > + /* If an EOI is needed, do it here */ > + if (!state->old_p) > + xive_vm_source_eoi(hw_num, xd); > + /* If this is not an LSI, force a trigger */ > + if (!(xd->flags & OPAL_XIVE_IRQ_LSI)) > + xive_irq_trigger(xd); > + goto bail; > + } > + > + /* Old Q set, set PQ to 11 */ > + if (state->old_q) > + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); > + > + /* > + * If not old P, then perform an "effective" EOI, > + * on the source. This will handle the cases where > + * FW EOI is needed. > + */ > + if (!state->old_p) > + xive_vm_source_eoi(hw_num, xd); > + > + /* Synchronize ordering and mark unmasked */ > + mb(); > + bail: > + state->guest_priority = prio; > +} > + > +/* > + * Target an interrupt to a given server/prio, this will fallback > + * to another server if necessary and perform the HW targetting > + * updates as needed > + * > + * NOTE: Must be called with the state lock held > + */ > +static int xive_target_interrupt(struct kvm *kvm, > + struct kvmppc_xive_irq_state *state, > + u32 server, u8 prio) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + u32 hw_num; > + int rc; > + > + /* > + * This will return a tentative server and actual > + * priority. The count for that new target will have > + * already been incremented. > + */ > + rc = xive_select_target(kvm, &server, prio); > + > + /* We failed to find a target ? Not much we can do > + * at least until we support the GIQ. > + */ > + if (rc) > + return rc; > + > + /* > + * Increment the old queue pending count if there > + * was one so that the old queue count gets adjusted later > + * when observed to be empty. 
> + */ > + if (state->act_priority != MASKED) > + xive_inc_q_pending(kvm, > + state->act_server, > + state->act_priority); > + /* > + * Update state and HW > + */ > + state->act_priority = prio; > + state->act_server = server; > + > + /* Get the right irq */ > + kvmppc_xive_select_irq(state, &hw_num, NULL); > + > + return xive_native_configure_irq(hw_num, > + xive->vp_base + server, > + prio, state->number); > +} > + > +/* > + * Targetting rules: In order to avoid losing track of > + * pending interrupts accross mask and unmask, which would > + * allow queue overflows, we implement the following rules: > + * > + * - Unless it was never enabled (or we run out of capacity) > + * an interrupt is always targetted at a valid server/queue > + * pair even when "masked" by the guest. This pair tends to > + * be the last one used but it can be changed under some > + * circumstances. That allows us to separate targetting > + * from masking, we only handle accounting during (re)targetting, > + * this also allows us to let an interrupt drain into its target > + * queue after masking, avoiding complex schemes to remove > + * interrupts out of remote processor queues. > + * > + * - When masking, we set PQ to 10 and save the previous value > + * of P and Q. > + * > + * - When unmasking, if saved Q was set, we set PQ to 11 > + * otherwise we leave PQ to the HW state which will be either > + * 10 if nothing happened or 11 if the interrupt fired while > + * masked. Effectively we are OR'ing the previous Q into the > + * HW Q. > + * > + * Then if saved P is clear, we do an effective EOI (Q->P->Trigger) > + * which will unmask the interrupt and shoot a new one if Q was > + * set. > + * > + * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11, > + * effectively meaning an H_EOI from the guest is still expected > + * for that interrupt). > + * > + * - If H_EOI occurs while masked, we clear the saved P. > + * > + * - When changing target, we account on the new target and > + * increment a separate "pending" counter on the old one. > + * This pending counter will be used to decrement the old > + * target's count when its queue has been observed empty. > + */ > + > +int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, > + u32 priority) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u8 new_act_prio; > + int rc = 0; > + u16 idx; > + > + if (!xive) > + return -ENODEV; > + > + DBG("set_xive ! irq 0x%x server 0x%x prio %d\n", > + irq, server, priority); > + > + /* First, check provisioning of queues */ > + if (priority != MASKED) > + rc = xive_check_provisioning(xive->kvm, > + xive_prio_from_guest(priority)); > + if (rc) { > + DBG(" provisioning failure %d !\n", rc); > + return rc; > + } > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + > + /* > + * We first handle masking/unmasking since the locking > + * might need to be retried due to EOIs, we'll handle > + * targetting changes later. These functions will return > + * with the SB lock held. > + * > + * xive_lock_and_mask() will also set state->guest_priority > + * but won't otherwise change other fields of the state. > + * > + * xive_lock_for_unmask will not actually unmask, this will > + * be done later by xive_finish_unmask() once the targetting > + * has been done, so we don't try to unmask an interrupt > + * that hasn't yet been targetted. 
> + */ > + if (priority == MASKED) > + xive_lock_and_mask(xive, sb, state); > + else > + xive_lock_for_unmask(sb, state); > + > + > + /* > + * Then we handle targetting. > + * > + * First calculate a new "actual priority" > + */ > + new_act_prio = state->act_priority; > + if (priority != MASKED) > + new_act_prio = xive_prio_from_guest(priority); > + > + DBG(" new_act_prio=%x act_server=%x act_prio=%x\n", > + new_act_prio, state->act_server, state->act_priority); > + > + /* > + * Then check if we actually need to change anything, > + * > + * The condition for re-targetting the interrupt is that > + * we have a valid new priority (new_act_prio is not 0xff) > + * and either the server or the priority changed. > + * > + * Note: If act_priority was ff and the new priority is > + * also ff, we don't do anything and leave the interrupt > + * untargetted. An attempt of doing an int_on on an > + * untargetted interrupt will fail. If that is a problem > + * we could initialize interrupts with valid default > + */ > + > + if (new_act_prio != MASKED && > + (state->act_server != server || > + state->act_priority != new_act_prio)) > + rc = xive_target_interrupt(kvm, state, server, new_act_prio); > + > + /* > + * Perform the final unmasking of the interrupt source > + * if necessary > + */ > + if (priority != MASKED) > + xive_finish_unmask(xive, sb, state, priority); > + > + /* > + * Finally Update saved_priority to match. Only int_on/off > + * set this field to a different value. > + */ > + state->saved_priority = priority; > + > + arch_spin_unlock(&sb->lock); > + return rc; > +} > + > +int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, > + u32 *priority) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + if (!xive) > + return -ENODEV; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + arch_spin_lock(&sb->lock); > + *server = state->guest_server; > + *priority = state->guest_priority; > + arch_spin_unlock(&sb->lock); > + > + return 0; > +} > + > +int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + if (!xive) > + return -ENODEV; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + > + DBG("int_on(irq=0x%x)\n", irq); > + > + /* > + * Check if interrupt was not targetted > + */ > + if (state->act_priority == MASKED) { > + DBG("int_on on untargetted interrupt\n"); > + return -EINVAL; > + } > + > + /* If saved_priority is 0xff, do nothing */ > + if (state->saved_priority == MASKED) > + return 0; > + > + /* > + * Lock and unmask it. 
> + */ > + xive_lock_for_unmask(sb, state); > + xive_finish_unmask(xive, sb, state, state->saved_priority); > + arch_spin_unlock(&sb->lock); > + > + return 0; > +} > + > +int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + if (!xive) > + return -ENODEV; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + > + DBG("int_off(irq=0x%x)\n", irq); > + > + /* > + * Lock and mask > + */ > + state->saved_priority = xive_lock_and_mask(xive, sb, state); > + arch_spin_unlock(&sb->lock); > + > + return 0; > +} > + > +static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq) > +{ > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return false; > + state = &sb->irq_state[idx]; > + if (!state->valid) > + return false; > + > + /* > + * Trigger the IPI. This assumes we never restore a pass-through > + * interrupt which should be safe enough > + */ > + xive_irq_trigger(&state->ipi_data); > + > + return true; > +} > + > +u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + > + if (!xc) > + return 0; > + > + /* Return the per-cpu state for state saving/migration */ > + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | > + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; > +} > + > +int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; > + u8 cppr, mfrr; > + u32 xisr; > + > + if (!xc || !xive) > + return -ENOENT; > + > + /* Grab individual state fields. We don't use pending_pri */ > + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; > + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & > + KVM_REG_PPC_ICP_XISR_MASK; > + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; > + > + DBG("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n", > + xc->server_num, cppr, mfrr, xisr); > + > + /* > + * We can't update the state of a "pushed" VCPU, but that > + * shouldn't happen. > + */ > + if (WARN_ON(vcpu->arch.xive_pushed)) > + return -EIO; > + > + /* Update VCPU HW saved state */ > + vcpu->arch.xive_saved_state.cppr = cppr; > + xc->hw_cppr = xc->cppr = cppr; > + > + /* > + * Update MFRR state. If it's not 0xff, we mark the VCPU as > + * having a pending MFRR change, which will re-evaluate the > + * target. The VCPU will thus potentially get a spurious > + * interrupt but that's not a big deal. > + */ > + xc->mfrr = mfrr; > + if (mfrr < cppr) > + xive_irq_trigger(&xc->vp_ipi_data); > + > + /* > + * Now saved XIRR is "interesting". It means there's something in > + * the legacy "1 element" queue... for an IPI we simply ignore it, > + * as the MFRR restore will handle that. For anything else we need > + * to force a resend of the source. > + * However the source may not have been setup yet. If that's the > + * case, we keep that info and increment a counter in the xive to > + * tell subsequent xive_set_source() to go look. 
> + */ > + if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) { > + xc->delayed_irq = xisr; > + xive->delayed_irqs++; > + DBG(" xisr restore delayed\n"); > + } > + > + return 0; > +} > + > +int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, > + struct irq_desc *host_desc) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + struct irq_data *host_data = irq_desc_get_irq_data(host_desc); > + unsigned int host_irq = irq_desc_get_irq(host_desc); > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); > + u16 idx; > + u8 prio; > + int rc; > + > + if (!xive) > + return -ENODEV; > + > + DBG("set_mapped girq 0x%lx host HW irq 0x%x...\n", guest_irq, hw_irq); > + > + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + > + /* > + * Mark the passed-through interrupt as going to a VCPU, > + * this will prevent further EOIs and similar operations > + * from the XIVE code. It will also mask the interrupt > + * to either PQ=10 or 11 state, the latter if the interrupt > + * is pending. This will allow us to unmask or retrigger it > + * after routing it to the guest with a simple EOI. > + * > + * The "state" argument is a "token", all it needs is to be > + * non-NULL to switch to passed-through or NULL for the > + * other way around. We may not yet have an actual VCPU > + * target here and we don't really care. > + */ > + rc = irq_set_vcpu_affinity(host_irq, state); > + if (rc) { > + pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); > + return rc; > + } > + > + /* > + * Mask and read state of IPI. We need to know if its P bit > + * is set as that means it's potentially already using a > + * queue entry in the target > + */ > + prio = xive_lock_and_mask(xive, sb, state); > + DBG(" old IPI prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q); > + > + /* Turn the IPI hard off */ > + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); > + > + /* Grab info about irq */ > + state->pt_number = hw_irq; > + state->pt_data = irq_data_get_irq_handler_data(host_data); > + > + /* > + * Configure the IRQ to match the existing configuration of > + * the IPI if it was already targetted. Otherwise this will > + * mask the interrupt in a lossy way (act_priority is 0xff) > + * which is fine for a never started interrupt. > + */ > + xive_native_configure_irq(hw_irq, > + xive->vp_base + state->act_server, > + state->act_priority, state->number); > + > + /* > + * We do an EOI to enable the interrupt (and retrigger if needed) > + * if the guest has the interrupt unmasked and the P bit was *not* > + * set in the IPI. 
If it was set, we know a slot may still be in > + * use in the target queue thus we have to wait for a guest > + * originated EOI > + */ > + if (prio != MASKED && !state->old_p) > + xive_vm_source_eoi(hw_irq, state->pt_data); > + > + /* Clear old_p/old_q as they are no longer relevant */ > + state->old_p = state->old_q = false; > + > + /* Restore guest prio (unlocks EOI) */ > + mb(); > + state->guest_priority = prio; > + arch_spin_unlock(&sb->lock); > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); > + > +int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, > + struct irq_desc *host_desc) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + unsigned int host_irq = irq_desc_get_irq(host_desc); > + u16 idx; > + u8 prio; > + int rc; > + > + if (!xive) > + return -ENODEV; > + > + DBG("clr_mapped girq 0x%lx...\n", guest_irq); > + > + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); > + if (!sb) > + return -EINVAL; > + state = &sb->irq_state[idx]; > + > + /* > + * Mask and read state of IRQ. We need to know if its P bit > + * is set as that means it's potentially already using a > + * queue entry in the target > + */ > + prio = xive_lock_and_mask(xive, sb, state); > + DBG(" old IRQ prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q); > + > + /* > + * If old_p is set, the interrupt is pending, we switch it to > + * PQ=11. This will force a resend in the host so the interrupt > + * isn't lost to whatver host driver may pick it up > + */ > + if (state->old_p) > + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11); > + > + /* Relase the passed-through interrupt to the host */ ^^^^^^ Release > + rc = irq_set_vcpu_affinity(host_irq, NULL); > + if (rc) { > + pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); > + return rc; > + } > + > + /* Forget about the IRQ */ > + state->pt_number = 0; > + state->pt_data = NULL; > + > + /* Reconfigure the IPI */ > + xive_native_configure_irq(state->ipi_number, > + xive->vp_base + state->act_server, > + state->act_priority, state->number); > + > + /* > + * If old_p is set (we have a queue entry potentially > + * occupied) or the interrupt is masked, we set the IPI > + * to PQ=10 state. Otherwise we just re-enable it (PQ=00). 
> + */ > + if (prio == MASKED || state->old_p) > + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10); > + else > + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00); > + > + /* Restore guest prio (unlocks EOI) */ > + mb(); > + state->guest_priority = prio; > + arch_spin_unlock(&sb->lock); > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); > + > +static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct kvm *kvm = vcpu->kvm; > + struct kvmppc_xive *xive = kvm->arch.xive; > + int i, j; > + > + for (i = 0; i <= xive->max_sbid; i++) { > + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; > + > + if (!sb) > + continue; > + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { > + struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; > + > + if (!state->valid) > + continue; > + if (state->act_priority == MASKED) > + continue; > + if (state->act_server != xc->server_num) > + continue; > + > + /* Clean it up */ > + arch_spin_lock(&sb->lock); > + state->act_priority = MASKED; > + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); > + xive_native_configure_irq(state->ipi_number, 0, MASKED, > 0); > + if (state->pt_number) { > + xive_vm_esb_load(state->pt_data, > XIVE_ESB_SET_PQ_01); > + xive_native_configure_irq(state->pt_number, 0, > MASKED, 0); > + } > + arch_spin_unlock(&sb->lock); > + } > + } > +} > + > +void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct kvmppc_xive *xive = xc->xive; > + int i; > + > + DBG("cleanup_vcpu(cpu=%d)\n", xc->server_num); > + > + /* Ensure no interrupt is still routed to that VP */ > + xc->valid = false; > + kvmppc_xive_disable_vcpu_interrupts(vcpu); > + > + /* Mask the VP IPI */ > + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); > + > + /* Disable the VP */ > + xive_native_disable_vp(xc->vp_id); > + > + /* Free the queues & associated interrupts */ > + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { > + struct xive_q *q = &xc->queues[i]; > + > + /* Free the escalation irq */ > + if (xc->esc_virq[i]) { > + free_irq(xc->esc_virq[i], vcpu); > + irq_dispose_mapping(xc->esc_virq[i]); > + kfree(xc->esc_virq_names[i]); > + } > + /* Free the queue */ > + xive_native_disable_queue(xc->vp_id, q, i); > + if (q->qpage) { > + free_pages((unsigned long)q->qpage, > + xive->q_alloc_order); > + q->qpage = NULL; > + } > + } > + > + /* Free the IPI */ > + if (xc->vp_ipi) { > + xive_cleanup_irq_data(&xc->vp_ipi_data); > + xive_native_free_irq(xc->vp_ipi); > + } > + /* Free the VP */ > + kfree(xc); > +} > + > +int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > + struct kvm_vcpu *vcpu, u32 cpu) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvmppc_xive_vcpu *xc; > + int i, r = -EBUSY; > + > + DBG("connect_vcpu(cpu=%d)\n", cpu); > + > + if (dev->ops != &kvm_xive_ops) { > + DBG("Wrong ops !\n"); > + return -EPERM; > + } > + if (xive->kvm != vcpu->kvm) > + return -EPERM; > + if (vcpu->arch.irq_type) > + return -EBUSY; > + if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { > + DBG("Duplicate !\n"); > + return -EEXIST; > + } > + if (cpu >= KVM_MAX_VCPUS) { > + DBG("Out of bounds !\n"); > + return -EINVAL; > + } > + xc = kzalloc(sizeof(*xc), GFP_KERNEL); > + if (!xc) > + return -ENOMEM; > + > + /* We need to synchronize with queue provisioning */ > + mutex_lock(&vcpu->kvm->lock); > + vcpu->arch.xive_vcpu = xc; > + xc->xive = xive; > + xc->vcpu = vcpu; > + xc->server_num = cpu; > + 
xc->vp_id = xive->vp_base + cpu; > + xc->mfrr = 0xff; > + xc->valid = true; > + > + r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); > + if (r) > + goto bail; > + > + /* Configure VCPU fields for use by assembly push/pull */ > + vcpu->arch.xive_saved_state.qw = cpu_to_be64(0xff000000); > + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); > + > + /* Allocate IPI */ > + xc->vp_ipi = xive_native_alloc_irq(); > + if (!xc->vp_ipi) { > + r = -EIO; > + goto bail; > + } > + DBG(" IPI=0x%x\n", xc->vp_ipi); > + > + r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data); > + if (r) > + goto bail; > + > + /* > + * Initialize queues. Initially we set them all for no queueing > + * and we enable escalation for queue 0 only which we'll use for > + * our mfrr change notifications. If the VCPU is hot-plugged, we > + * do handle provisioning however. > + */ > + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { > + struct xive_q *q = &xc->queues[i]; > + > + /* Is queue already enabled ? Provision it */ > + if (xive->qmap & (1 << i)) { > + r = xive_provision_queue(vcpu, i); > + if (r == 0) > + xive_attach_escalation(vcpu, i); > + if (r) > + goto bail; > + } else { > + r = xive_native_configure_queue(xc->vp_id, > + q, i, NULL, 0, true); > + if (r) { > + pr_err("XIVE-KVM: Failed to configure queue %d" > + " for VCPU %d\n", > + i, cpu); > + goto bail; > + } > + } > + } > + > + /* If not done above, attach priority 0 escalation */ > + r = xive_attach_escalation(vcpu, 0); > + if (r) > + goto bail; > + > + /* Enable the VP */ > + r = xive_native_enable_vp(xc->vp_id); > + if (r) > + goto bail; > + > + /* Route the IPI */ > + r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); > + if (!r) > + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); > + > + bail: > + mutex_unlock(&vcpu->kvm->lock); > + if (r) { > + kvmppc_xive_cleanup_vcpu(vcpu); > + return r; > + } > + > + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; > + return 0; > +} > + > +/* > + * Scanning of queues before/after migration save > + */ > +static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq) > +{ > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return; > + > + state = &sb->irq_state[idx]; > + > + /* Some sanity checking */ > + if (!state->valid) { > + pr_err("XIVE/XIVE: invalid irq 0x%x in cpu queue!\n", irq); > + return; > + } > + > + /* > + * If the interrupt is in a queue it should have P set. > + * We warn so that gets reported. A backtrace isn't useful > + * so no need to use a WARN_ON. > + */ > + if (!state->saved_p) > + pr_err("KVM/XIVE: Interrupt 0x%x is marked in a queue" > + " but P not set !\n", irq); > + > + /* Set flag */ > + state->in_queue = true; > +} > + > +static void xive_pre_scan_mask_irq(struct kvmppc_xive *xive, > + struct kvmppc_xive_src_block *sb, > + u32 irq) > +{ > + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; > + > + if (!state->valid) > + return; > + > + /* Mask and save state, this will also sync HW queues */ > + state->saved_scan_prio = xive_lock_and_mask(xive, sb, state); > + > + /* Transfer P and Q */ > + state->saved_p = state->old_p; > + state->saved_q = state->old_q; > + > + /* Unlock */ > + arch_spin_unlock(&sb->lock); > +} > + > +static void xive_pre_scan_unmask_irq(struct kvmppc_xive *xive, I think a better name would be "xive_pre_save_unmask", since this is actually called after the scan. 
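For symmetry, xive_pre_scan_mask_irq() could then become
xive_pre_save_mask_irq() as well, since both helpers belong to the same
pre-save pass, i.e. something like (naming sketch only, no functional
change intended):

    static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
                                       struct kvmppc_xive_src_block *sb,
                                       u32 irq)

    static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
                                         struct kvmppc_xive_src_block *sb,
                                         u32 irq)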
> + struct kvmppc_xive_src_block *sb, > + u32 irq) > +{ > + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; > + > + if (!state->valid) > + return; > + > + /* > + * Lock / exclude EOI (not technically necessary if the > + * guest isn't running concurrently. If this becomes a > + * performance issue we can probably remove the lock. > + */ > + xive_lock_for_unmask(sb, state); > + > + /* Restore mask/prio if it wasn't masked */ > + if (state->saved_scan_prio != MASKED) > + xive_finish_unmask(xive, sb, state, state->saved_scan_prio); > + > + /* Unlock */ > + arch_spin_unlock(&sb->lock); > +} > + > +static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q) > +{ > + u32 idx = q->idx; > + u32 toggle = q->toggle; > + u32 irq; > + > + do { > + irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle); > + if (irq > XICS_IPI) > + xive_pre_save_set_queued(xive, irq); > + } while(irq); > +} > + > +static void xive_pre_save_scan(struct kvmppc_xive *xive) > +{ > + struct kvm_vcpu *vcpu = NULL; > + int i, j; > + > + /* > + * See comment in xive_get_source() about how this > + * work. Collect a stable state for all interrupts > + */ > + for (i = 0; i <= xive->max_sbid; i++) { > + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; > + if (!sb) > + continue; > + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) > + xive_pre_scan_mask_irq(xive, sb, j); > + } > + > + /* Then scan the queues and update the "in_queue" flag */ > + kvm_for_each_vcpu(i, vcpu, xive->kvm) { > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + if (!xc) > + continue; > + for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { > + if (xc->queues[i].qpage) > + xive_pre_save_queue(xive, &xc->queues[i]); > + } > + } > + > + /* Finally restore interrupt states */ > + for (i = 0; i <= xive->max_sbid; i++) { > + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; > + if (!sb) > + continue; > + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) > + xive_pre_scan_unmask_irq(xive, sb, j); > + } > +} > + > +static void xive_post_save_scan(struct kvmppc_xive *xive) > +{ > + u32 i, j; > + > + /* Clear all the in_queue flags */ > + for (i = 0; i <= xive->max_sbid; i++) { > + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; > + if (!sb) > + continue; > + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) > + sb->irq_state[j].in_queue = false; > + } > + > + /* Next get_source() will do a new scan */ > + xive->saved_src_count = 0; > +} > + > +/* > + * This returns the source configuration and state to user space. > + */ > +static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) > +{ > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u64 __user *ubufp = (u64 __user *) addr; > + u64 val, prio; > + u16 idx; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -ENOENT; > + > + state = &sb->irq_state[idx]; > + > + if (!state->valid) > + return -ENOENT; > + > + DBG("get_source(%ld)...\n", irq); > + > + /* > + * So to properly save the state into something that looks like a > + * XICS migration stream we cannot treat interrupts individually. > + * > + * We need, instead, mask them all (& save their previous PQ state) > + * to get a stable state in the HW, then sync them to ensure that > + * any interrupt that had already fired hits its queue, and finally > + * scan all the queues to collect which interrupts are still present > + * in the queues, so we can set the "pending" flag on them and > + * they can be resent on restore. 
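Also, in xive_pre_save_scan() above, the inner loop iterates over the
queues with j but indexes them with i (the vcpu iterator). Presumably this
was meant to be:

        for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
            if (xc->queues[j].qpage)
                xive_pre_save_queue(xive, &xc->queues[j]);
        }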
> + * > + * So we do it all when the "first" interrupt gets saved, all the > + * state is collected at that point, the rest of xive_get_source() > + * will merely collect and convert that state to the expected > + * userspace bit mask. > + */ > + if (xive->saved_src_count == 0) > + xive_pre_save_scan(xive); > + xive->saved_src_count++; > + > + /* Convert saved state into something compatible with xics */ > + val = state->guest_server; > + prio = state->saved_scan_prio; > + > + if (prio == MASKED) { > + val |= KVM_XICS_MASKED; > + prio = state->saved_priority; > + } > + val |= prio << KVM_XICS_PRIORITY_SHIFT; > + if (state->lsi) { > + val |= KVM_XICS_LEVEL_SENSITIVE; > + if (state->saved_p) > + val |= KVM_XICS_PENDING; > + } else { > + if (state->saved_p) > + val |= KVM_XICS_PRESENTED; > + > + if (state->saved_q) > + val |= KVM_XICS_QUEUED; > + > + /* > + * We mark it pending (which will attempt a re-delivery) > + * if we are in a queue *or* we were masked and had > + * Q set which is equivalent to the XICS "masked pending" > + * state > + */ > + if (state->in_queue || (prio == MASKED && state->saved_q)) > + val |= KVM_XICS_PENDING; > + } > + > + /* > + * If that was the last interrupt saved, reset the > + * in_queue flags > + */ > + if (xive->saved_src_count == xive->src_count) > + xive_post_save_scan(xive); > + > + /* Copy the result to userspace */ > + if (put_user(val, ubufp)) > + return -EFAULT; > + > + return 0; > +} > + > +static struct kvmppc_xive_src_block *xive_create_src_block(struct > kvmppc_xive *xive, > + int irq) > +{ > + struct kvm *kvm = xive->kvm; > + struct kvmppc_xive_src_block *sb; > + int i, bid; > + > + bid = irq >> KVMPPC_XICS_ICS_SHIFT; > + > + mutex_lock(&kvm->lock); > + > + /* block already exists - somebody else got here first */ > + if (xive->src_blocks[bid]) > + goto out; > + > + /* Create the ICS */ > + sb = kzalloc(sizeof(*sb), GFP_KERNEL); > + if (!sb) > + goto out; > + > + sb->id = bid; > + > + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { > + sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; > + sb->irq_state[i].guest_priority = MASKED; > + sb->irq_state[i].saved_priority = MASKED; > + sb->irq_state[i].act_priority = MASKED; > + } > + smp_wmb(); > + xive->src_blocks[bid] = sb; > + > + if (bid > xive->max_sbid) > + xive->max_sbid = bid; > + > + out: > + mutex_unlock(&kvm->lock); > + return xive->src_blocks[bid]; > +} > + > +static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq) > +{ > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu = NULL; > + int i; > + > + kvm_for_each_vcpu(i, vcpu, kvm) { > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + > + if (!xc) > + continue; > + > + if (xc->delayed_irq == irq) { > + xc->delayed_irq = 0; > + xive->delayed_irqs--; > + return true; > + } > + } > + return false; > +} > + > +static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) > +{ > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u64 __user *ubufp = (u64 __user *) addr; > + u16 idx; > + u64 val; > + u8 act_prio, guest_prio; > + u32 server; > + int rc = 0; > + > + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) > + return -ENOENT; > + > + DBG("set_source(irq=0x%lx)\n", irq); > + > + /* Find the source */ > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) { > + DBG("No source, creating source block...\n"); > + sb = xive_create_src_block(xive, irq); > + if (!sb) { > + DBG("Failed to create block...\n"); > + return -ENOMEM; > + } > + } > + state 
= &sb->irq_state[idx]; > + > + /* Read user passed data */ > + if (get_user(val, ubufp)) { > + DBG("fault getting user info !\n"); > + return -EFAULT; > + } > + > + server = val & KVM_XICS_DESTINATION_MASK; > + guest_prio = val >> KVM_XICS_PRIORITY_SHIFT; > + > + DBG(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", > + val, server, guest_prio); > + /* > + * If the source doesn't already have an IPI, allocate > + * one and get the corresponding data > + */ > + if (!state->ipi_number) { > + state->ipi_number = xive_native_alloc_irq(); > + if (state->ipi_number == 0) { > + DBG("Failed to allocate IPI !\n"); > + return -ENOMEM; > + } > + xive_native_populate_irq_data(state->ipi_number, > &state->ipi_data); > + DBG(" src_ipi=0x%x\n", state->ipi_number); > + } > + > + /* > + * We use lock_and_mask() to set us in the right masked > + * state. We will override that state from the saved state > + * further down, but this will handle the cases of interrupts > + * that need FW masking. We set the initial guest_priority to > + * 0 before calling it to ensure it actually performs the masking. > + */ > + state->guest_priority = 0; > + xive_lock_and_mask(xive, sb, state); > + > + /* > + * Now, we select a target if we have one. If we don't we > + * leave the interrupt untargetted. It means that an interrupt > + * can become "untargetted" accross migration if it was masked > + * by set_xive() but there is little we can do about it. > + */ > + > + /* First convert prio and mark interrupt as untargetted */ > + act_prio = xive_prio_from_guest(guest_prio); > + state->act_priority = MASKED; > + state->guest_server = server; > + > + /* > + * We need to drop the lock due to the mutex below. Hopefully > + * nothing is touching that interrupt yet since it hasn't been > + * advertized to a running guest yet > + */ > + arch_spin_unlock(&sb->lock); > + > + /* If we have a priority target the interrupt */ > + if (act_prio != MASKED) { > + /* First, check provisioning of queues */ > + mutex_lock(&xive->kvm->lock); > + rc = xive_check_provisioning(xive->kvm, act_prio); > + mutex_unlock(&xive->kvm->lock); > + > + /* Target interrupt */ > + if (rc == 0) > + rc = xive_target_interrupt(xive->kvm, state, > + server, act_prio); > + /* > + * If provisioning or targetting failed, leave it > + * alone and masked. It will remain disabled until > + * the guest re-targets it. > + */ > + } > + > + /* > + * Find out if this was a delayed irq stashed in an ICP, > + * in which case, treat it as pending > + */ > + if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) { > + val |= KVM_XICS_PENDING; > + DBG(" Found delayed ! forcing PENDING !\n"); > + } > + > + /* Cleanup the SW state */ > + state->old_p = false; > + state->old_q = false; > + state->lsi = false; > + state->asserted = false; > + > + /* Restore LSI state */ > + if (val & KVM_XICS_LEVEL_SENSITIVE) { > + state->lsi = true; > + if (val & KVM_XICS_PENDING) > + state->asserted = true; > + DBG(" LSI ! Asserted=%d\n", state->asserted); > + } > + > + /* > + * Restore P and Q. If the interrupt was pending, we > + * force both P and Q, which will trigger a resend. > + * > + * That means that a guest that had both an interrupt > + * pending (queued) and Q set will restore with only > + * one instance of that interrupt instead of 2, but that > + * is perfectly fine as coalescing interrupts that haven't > + * been presented yet is always allowed. 
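Nit: the debug format string in xive_set_source() above looks like it has
the width specifier on the wrong side of the '%', i.e. presumably:

    DBG(" val=0x%016llx (server=0x%x, guest_prio=%d)\n",
        val, server, guest_prio);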
> + */ > + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) > + state->old_p = true; > + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) > + state->old_q = true; > + > + DBG(" P=%d, Q=%d\n", state->old_p, state->old_q); > + > + /* > + * If the interrupt was unmasked, update guest priority and > + * perform the appropriate state transition and do a > + * re-trigger if necessary. > + */ > + if (val & KVM_XICS_MASKED) { > + DBG(" masked, saving prio\n"); > + state->guest_priority = MASKED; > + state->saved_priority = guest_prio; > + } else { > + DBG(" unmasked, restoring to prio %d\n", guest_prio); > + xive_finish_unmask(xive, sb, state, guest_prio); > + state->saved_priority = guest_prio; > + } > + > + /* Increment the number of valid sources and mark this one valid */ > + if (!state->valid) > + xive->src_count++; > + state->valid = true; > + > + return 0; > +} > + > +int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int > level, > + bool line_status) > +{ > + struct kvmppc_xive *xive = kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + u16 idx; > + > + if (!xive) > + return -ENODEV; > + > + sb = kvmppc_xive_find_source(xive, irq, &idx); > + if (!sb) > + return -EINVAL; > + > + /* Perform locklessly .... (we need to do some RCUisms here...) */ > + state = &sb->irq_state[idx]; > + if (!state->valid) > + return -EINVAL; > + > + /* We don't allow a trigger on a passed-through interrupt */ > + if (state->pt_number) > + return -EINVAL; > + > + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) > + state->asserted = 1; > + else if (level == 0 || level == KVM_INTERRUPT_UNSET) { > + state->asserted = 0; > + return 0; > + } > + > + /* Trigger the IPI */ > + xive_irq_trigger(&state->ipi_data); > + > + return 0; > +} > + > +static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr > *attr) > +{ > + struct kvmppc_xive *xive = dev->private; > + > + /* We honor the existing XICS ioctl */ > + switch (attr->group) { > + case KVM_DEV_XICS_GRP_SOURCES: > + return xive_set_source(xive, attr->attr, attr->addr); > + } > + return -ENXIO; > +} > + > +static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr > *attr) > +{ > + struct kvmppc_xive *xive = dev->private; > + > + /* We honor the existing XICS ioctl */ > + switch (attr->group) { > + case KVM_DEV_XICS_GRP_SOURCES: > + return xive_get_source(xive, attr->attr, attr->addr); > + } > + return -ENXIO; > +} > + > +static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr > *attr) > +{ > + /* We honor the same limits as XICS, at least for now */ > + switch (attr->group) { > + case KVM_DEV_XICS_GRP_SOURCES: > + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && > + attr->attr < KVMPPC_XICS_NR_IRQS) > + return 0; > + break; > + } > + return -ENXIO; > +} > + > +static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) > +{ > + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); > + xive_native_configure_irq(hw_num, 0, MASKED, 0); > + xive_cleanup_irq_data(xd); > +} > + > +static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) > +{ > + int i; > + > + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { > + struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; > + > + if (!state->valid) > + continue; > + > + kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); > + xive_native_free_irq(state->ipi_number); > + > + /* Pass-through, cleanup too */ > + if (state->pt_number) > + 
kvmppc_xive_cleanup_irq(state->pt_number, > state->pt_data); > + > + state->valid = false; > + } > +} > + > +static void kvmppc_xive_free(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = dev->private; > + struct kvm *kvm = xive->kvm; > + int i; > + > + debugfs_remove(xive->dentry); > + > + if (kvm) > + kvm->arch.xive = NULL; > + > + /* Mask and free interrupts */ > + for (i = 0; i <= xive->max_sbid; i++) { > + if (xive->src_blocks[i]) > + kvmppc_xive_free_sources(xive->src_blocks[i]); > + kfree(xive->src_blocks[i]); > + xive->src_blocks[i] = NULL; > + } > + > + if (xive->vp_base != XIVE_INVALID_VP) > + xive_native_free_vp_block(xive->vp_base); > + > + > + kfree(xive); > + kfree(dev); > +} > + > +static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > +{ > + struct kvmppc_xive *xive; > + struct kvm *kvm = dev->kvm; > + int ret = 0; > + > + DBG("Creating xive for partition\n"); > + > + xive = kzalloc(sizeof(*xive), GFP_KERNEL); > + if (!xive) > + return -ENOMEM; > + > + dev->private = xive; > + xive->dev = dev; > + xive->kvm = kvm; > + > + /* Already there ? */ > + if (kvm->arch.xive) > + ret = -EEXIST; > + else > + kvm->arch.xive = xive; > + > + /* We use the default queue size set by the host */ > + xive->q_order = xive_native_default_eq_shift(); > + if (xive->q_order < PAGE_SHIFT) > + xive->q_alloc_order = 0; > + else > + xive->q_alloc_order = xive->q_order - PAGE_SHIFT; > + > + /* Allocate a bunch of VPs */ > + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); > + DBG("VP_Base=%x\n", xive->vp_base); > + if (xive->vp_base == XIVE_INVALID_VP) > + ret = -ENOMEM; > + > + if (ret) { > + kfree(xive); > + return ret; > + } > + > + return 0; > +} > + > + > +static int xive_debug_show(struct seq_file *m, void *private) > +{ > + struct kvmppc_xive *xive = m->private; > + struct kvm *kvm = xive->kvm; > + struct kvm_vcpu *vcpu; > + u64 t_rm_h_xirr = 0; > + u64 t_rm_h_ipoll = 0; > + u64 t_rm_h_cppr = 0; > + u64 t_rm_h_eoi = 0; > + u64 t_rm_h_ipi = 0; > + u64 t_vm_h_xirr = 0; > + u64 t_vm_h_ipoll = 0; > + u64 t_vm_h_cppr = 0; > + u64 t_vm_h_eoi = 0; > + u64 t_vm_h_ipi = 0; > + unsigned int i; > + > + if (!kvm) > + return 0; > + > + seq_printf(m, "=========\nVCPU state\n=========\n"); > + > + kvm_for_each_vcpu(i, vcpu, kvm) { > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + > + if (!xc) > + continue; > + > + seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" > + " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", > + xc->server_num, xc->cppr, xc->hw_cppr, > + xc->mfrr, xc->pending, > + xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); > + t_rm_h_xirr += xc->stat_rm_h_xirr; > + t_rm_h_ipoll += xc->stat_rm_h_ipoll; > + t_rm_h_cppr += xc->stat_rm_h_cppr; > + t_rm_h_eoi += xc->stat_rm_h_eoi; > + t_rm_h_ipi += xc->stat_rm_h_ipi; > + t_vm_h_xirr += xc->stat_vm_h_xirr; > + t_vm_h_ipoll += xc->stat_vm_h_ipoll; > + t_vm_h_cppr += xc->stat_vm_h_cppr; > + t_vm_h_eoi += xc->stat_vm_h_eoi; > + t_vm_h_ipi += xc->stat_vm_h_ipi; > + } > + > + seq_printf(m, "Hcalls totals\n"); > + seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr); > + seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, > t_vm_h_ipoll); > + seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr); > + seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi); > + seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi); > + > + return 0; > +} > + > +static int xive_debug_open(struct inode *inode, struct file *file) > +{ > + return single_open(file, xive_debug_show, 
inode->i_private); > +} > + > +static const struct file_operations xive_debug_fops = { > + .open = xive_debug_open, > + .read = seq_read, > + .llseek = seq_lseek, > + .release = single_release, > +}; > + > +static void xive_debugfs_init(struct kvmppc_xive *xive) > +{ > + char *name; > + > + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); > + if (!name) { > + pr_err("%s: no memory for name\n", __func__); > + return; > + } > + > + xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, > + xive, &xive_debug_fops); > + > + pr_debug("%s: created %s\n", __func__, name); > + kfree(name); > +} > + > +static void kvmppc_xive_init(struct kvm_device *dev) > +{ > + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private; > + > + /* Register some debug interfaces */ > + xive_debugfs_init(xive); > +} > + > +struct kvm_device_ops kvm_xive_ops = { > + .name = "kvm-xive", > + .create = kvmppc_xive_create, > + .init = kvmppc_xive_init, > + .destroy = kvmppc_xive_free, > + .set_attr = xive_set_attr, > + .get_attr = xive_get_attr, > + .has_attr = xive_has_attr, > +}; > + > +void kvmppc_xive_init_module(void) > +{ > + __xive_vm_h_xirr = xive_vm_h_xirr; > + __xive_vm_h_ipoll = xive_vm_h_ipoll; > + __xive_vm_h_ipi = xive_vm_h_ipi; > + __xive_vm_h_cppr = xive_vm_h_cppr; > + __xive_vm_h_eoi = xive_vm_h_eoi; > +} > + > +void kvmppc_xive_exit_module(void) > +{ > + __xive_vm_h_xirr = NULL; > + __xive_vm_h_ipoll = NULL; > + __xive_vm_h_ipi = NULL; > + __xive_vm_h_cppr = NULL; > + __xive_vm_h_eoi = NULL; > +} > diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h > new file mode 100644 > index 0000000..2b7fdbd > --- /dev/null > +++ b/arch/powerpc/kvm/book3s_xive.h > @@ -0,0 +1,251 @@ > +/* > + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + */ > + > +#ifndef _KVM_PPC_BOOK3S_XIVE_H > +#define _KVM_PPC_BOOK3S_XIVE_H > + > +#include "book3s_xics.h" > + > +/* State for one guest irq source. > + * > + * For each guest source we allocate a HW interrupt in the XIVE > + * which we use for all SW triggers. It will be unused for > + * pass-through but it's easier to keep around as the same > + * guest interrupt can alternatively be emulated or pass-through > + * if a physical device is hot unplugged and replaced with an > + * emulated one. > + * > + * This state structure is very similar to the XICS one with > + * additional XIVE specific tracking. 
> + */ > +struct kvmppc_xive_irq_state { > + bool valid; /* Interrupt entry is valid */ > + > + u32 number; /* Guest IRQ number */ > + u32 ipi_number; /* XIVE IPI HW number */ > + struct xive_irq_data ipi_data; /* XIVE IPI associated data */ > + u32 pt_number; /* XIVE Pass-through number if any */ > + struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ > + > + /* Targetting as set by guest */ > + u32 guest_server; /* Current guest selected target */ > + u8 guest_priority; /* Guest set priority */ > + u8 saved_priority; /* Saved priority when masking */ > + > + /* Actual targetting */ > + u32 act_server; /* Actual server */ > + u8 act_priority; /* Actual priority */ > + > + /* Various state bits */ > + bool in_eoi; /* Synchronize with H_EOI */ > + bool old_p; /* P bit state when masking */ > + bool old_q; /* Q bit state when masking */ > + bool lsi; /* level-sensitive interrupt */ > + bool asserted; /* Only for emulated LSI: current state > */ > + > + /* Saved for migration state */ > + bool in_queue; > + bool saved_p; > + bool saved_q; > + u8 saved_scan_prio; > +}; > + > +/* Select the "right" interrupt (IPI vs. passthrough) */ > +static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state > *state, > + u32 *out_hw_irq, > + struct xive_irq_data **out_xd) > +{ > + if (state->pt_number) { > + if (out_hw_irq) > + *out_hw_irq = state->pt_number; > + if (out_xd) > + *out_xd = state->pt_data; > + } else { > + if (out_hw_irq) > + *out_hw_irq = state->ipi_number; > + if (out_xd) > + *out_xd = &state->ipi_data; > + } > +} > + > +/* This corresponds to an "ICS" in XICS terminology, we use it > + * as a mean to break up source information into multiple structures > + */ > +struct kvmppc_xive_src_block { > + arch_spinlock_t lock; > + u16 id; > + struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; > +}; > + > + > +struct kvmppc_xive { > + struct kvm *kvm; > + struct kvm_device *dev; > + struct dentry *dentry; > + > + /* VP block associated with the VM */ > + u32 vp_base; > + > + /* Blocks of sources */ > + struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1]; > + u32 max_sbid; > + > + /* > + * For state save, we lazily scan the queues on the first interrupt > + * being migrated. We don't have a clean way to reset that flags > + * so we keep track of the number of valid sources and how many of > + * them were migrated so we can reset when all of them have been > + * processed. > + */ > + u32 src_count; > + u32 saved_src_count; > + > + /* > + * Some irqs are delayed on restore until the source is created, > + * keep track here of how many of them > + */ > + u32 delayed_irqs; > + > + /* Which queues (priorities) are in use by the guest */ > + u8 qmap; > + > + /* Queue orders */ > + u32 q_order; > + u32 q_alloc_order; > + > +}; > + > +#define KVMPPC_XIVE_Q_COUNT 8 > + > +struct kvmppc_xive_vcpu { > + struct kvmppc_xive *xive; > + struct kvm_vcpu *vcpu; > + bool valid; > + > + /* Server number. This is the HW CPU ID from a guest perspective */ > + u32 server_num; > + > + /* HW VP corresponding to this VCPU. This is the base of the VP > + * block plus the server number > + */ > + u32 vp_id; > + u32 vp_chip_id; > + u32 vp_cam; > + > + /* IPI used for sending ... 
IPIs */ > + u32 vp_ipi; > + struct xive_irq_data vp_ipi_data; > + > + /* Local emulation state */ > + uint8_t cppr; /* guest CPPR */ > + uint8_t hw_cppr;/* Hardware CPPR */ > + uint8_t mfrr; > + uint8_t pending; > + > + /* Each VP has 8 queues though we only provision some */ > + struct xive_q queues[KVMPPC_XIVE_Q_COUNT]; > + u32 esc_virq[KVMPPC_XIVE_Q_COUNT]; > + char *esc_virq_names[KVMPPC_XIVE_Q_COUNT]; > + > + /* Stash a delayed irq on restore from migration (see set_icp) */ > + u32 delayed_irq; > + > + /* Stats */ > + u64 stat_rm_h_xirr; > + u64 stat_rm_h_ipoll; > + u64 stat_rm_h_cppr; > + u64 stat_rm_h_eoi; > + u64 stat_rm_h_ipi; > + u64 stat_vm_h_xirr; > + u64 stat_vm_h_ipoll; > + u64 stat_vm_h_cppr; > + u64 stat_vm_h_eoi; > + u64 stat_vm_h_ipi; > +}; > + > +static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 > nr) > +{ > + struct kvm_vcpu *vcpu = NULL; > + int i; > + > + kvm_for_each_vcpu(i, vcpu, kvm) { > + if (vcpu->arch.xive_vcpu && nr == > vcpu->arch.xive_vcpu->server_num) > + return vcpu; > + } > + return NULL; > +} > + > +static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct > kvmppc_xive *xive, > + u32 irq, u16 *source) > +{ > + u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT; > + u16 src = irq & KVMPPC_XICS_SRC_MASK; > + > + if (source) > + *source = src; > + if (bid > KVMPPC_XICS_MAX_ICS_ID) > + return NULL; > + return xive->src_blocks[bid]; > +} > + > +/* > + * Mapping between guest priorities and host priorities > + * is as follow. > + * > + * Guest request for 0...6 are honored. Guest request for anything > + * higher results in a priority of 7 being applied. > + * > + * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb > + * in order to match AIX expectations > + * > + * Similar mapping is done for CPPR values > + */ > +static inline u8 xive_prio_from_guest(u8 prio) > +{ > + if (prio == 0xff || prio < 8) > + return prio; > + return 7; > +} > + > +static inline u8 xive_prio_to_guest(u8 prio) > +{ > + if (prio == 0xff || prio < 7) > + return prio; > + return 0xb; > +} > + > +static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 > *toggle) > +{ > + u32 cur; > + > + if (!qpage) > + return 0; > + cur = be32_to_cpup(qpage + *idx); > + if ((cur >> 31) == *toggle) > + return 0; > + *idx = (*idx + 1) & msk; > + if (*idx == 0) > + (*toggle) ^= 1; > + return cur & 0x7fffffff; > +} > + > +extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu); > +extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long > server); > +extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr); > +extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); > +extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); > + > +extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu); > +extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned > long server); > +extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr); > +extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); > +extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); > + > +#endif /* _KVM_PPC_BOOK3S_XICS_H */ > diff --git a/arch/powerpc/kvm/book3s_xive_template.c > b/arch/powerpc/kvm/book3s_xive_template.c > new file mode 100644 > index 0000000..b28c264 > --- /dev/null > +++ b/arch/powerpc/kvm/book3s_xive_template.c > @@ -0,0 +1,490 @@ > +/* > + * Copyright 2017 Benjamin Herrenschmidt, IBM 
Corporation > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License, version 2, as > + * published by the Free Software Foundation. > + */ > + > +/* File to be included by other .c files */ > + > +#define XGLUE(a,b) a##b > +#define GLUE(a,b) XGLUE(a,b) > + > +static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) > +{ > + u8 cppr; > + u16 ack; > + > + /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */ > + > + /* Perform the acknowledge OS to register cycle. */ > + ack = be16_to_cpu(__x_readw(__x_tm_area + TM_SPC_ACK_OS_REG)); > + > + /* Synchronize subsequent queue accesses */ > + mb(); > + > + /* XXX Check grouping level */ > + > + /* Anything ? */ > + if (!((ack >> 8) & TM_QW1_NSR_EO)) > + return; > + > + /* Grab CPPR of the most favored pending interrupt */ > + cppr = ack & 0xff; > + if (cppr < 8) > + xc->pending |= 1 << cppr; > + > +#ifdef XIVE_RUNTIME_CHECKS > + /* Check consistency */ > + if (cppr >= xc->hw_cppr) > + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", > + smp_processor_id(), cppr, xc->hw_cppr); > +#endif > + > + /* Update our image of the HW CPPR. We don't yet modify > + * xc->cppr, this will be done as we scan for interrupts > + * in the queues. > + */ > + xc->hw_cppr = cppr; > +} > + > +static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset) > +{ > + u64 val; > + > + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) > + offset |= offset << 4; > + > + val =__x_readq(__x_eoi_page(xd) + offset); > +#ifdef __LITTLE_ENDIAN__ > + val >>= 64-8; > +#endif > + return (u8)val; > +} > + > + > +static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) > +{ > + /* If the XIVE supports the new "store EOI facility, use it */ > + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) > + __x_writeq(0, __x_eoi_page(xd)); > + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { > + opal_int_eoi(hw_irq); > + } else { > + uint64_t eoi_val; > + > + /* Otherwise for EOI, we use the special MMIO that does > + * a clear of both P and Q and returns the old Q. > + * > + * This allows us to then do a re-trigger if Q was set > + * rather than synthetizing an interrupt in software > + */ > + eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); > + if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1)) > + return; > + > + /* Re-trigger */ > + if (__x_trig_page(xd)) > + __x_writeq(0, __x_trig_page(xd)); > + } > + > +} > + > +enum { > + scan_fetch, > + scan_poll, > + scan_eoi, > +}; > + > +static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc, > + u8 pending, int scan_type) > +{ > + u32 hirq = 0; > + u8 prio = 0xff; > + > + /* Find highest pending priority */ > + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { > + struct xive_q *q; > + u32 idx, toggle; > + __be32 *qpage; > + > + /* > + * If pending is 0 this will return 0xff which is what > + * we want > + */ > + prio = ffs(pending) - 1; > + > + /* > + * If the most favoured prio we found pending is less > + * favored (or equal) than a pending IPI, we return > + * the IPI instead. > + * > + * Note: If pending was 0 and mfrr is 0xff, we will > + * not spurriously take an IPI because mfrr cannot > + * then be smaller than cppr. 
> + */ > + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { > + prio = xc->mfrr; > + hirq = XICS_IPI; > + break; > + } > + > + /* Don't scan past the guest cppr */ > + if (prio >= xc->cppr || prio > 7) > + break; > + > + /* Grab queue and pointers */ > + q = &xc->queues[prio]; > + idx = q->idx; > + toggle = q->toggle; > + > + /* > + * Snapshot the queue page. The test further down for EOI > + * must use the same "copy" that was used by __xive_read_eq > + * since qpage can be set concurrently and we don't want > + * to miss an EOI. > + */ > + qpage = READ_ONCE(q->qpage); > + > + skip_ipi: > + /* Try to fetch from the queue. Will return 0 for a > + * non-queueing priority (ie, qpage = 0) > + */ > + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); > + > + /* > + * If this was a signal for an MFFR change done by > + * H_IPI we skip it. Additionally, if we were fetching > + * we EOI it now, thus re-enabling reception of a new > + * such signal. > + * > + * We also need to do that if prio is 0 and we had no > + * page for the queue. In this case, we have non-queued > + * IPI that needs to be EOId. > + * > + * This is safe because if we have another pending MFRR > + * change that wasn't observed above, the Q bit will have > + * been set and another occurrence of the IPI will trigger. > + */ > + if (hirq == XICS_IPI || (prio == 0 && !qpage)) { > + if (scan_type == scan_fetch) > + GLUE(X_PFX,source_eoi)(xc->vp_ipi, > + &xc->vp_ipi_data); > + /* Loop back on same queue with updated idx/toggle */ > +#ifdef XIVE_RUNTIME_CHECKS > + WARN_ON(hirq && hirq != XICS_IPI); > +#endif > + if (hirq) > + goto skip_ipi; > + } > + > + /* If fetching, update queue pointers */ > + if (scan_type == scan_fetch) { > + q->idx = idx; > + q->toggle = toggle; > + } > + > + /* Something found, stop searching */ > + if (hirq) > + break; > + > + /* Clear the pending bit on the now empty queue */ > + pending &= ~(1 << prio); > + > + /* > + * Check if the queue count needs adjusting due to > + * interrupts being moved away. > + */ > + if (atomic_read(&q->pending_count)) { > + int p = atomic_xchg(&q->pending_count, 0); > + if (p) { > +#ifdef XIVE_RUNTIME_CHECKS > + WARN_ON(p > atomic_read(&q->count)); > +#endif > + atomic_sub(p, &q->count); > + } > + } > + } > + > + /* If we are just taking a "peek", do nothing else */ > + if (scan_type == scan_poll) > + return hirq; > + > + /* Update the pending bits */ > + xc->pending = pending; > + > + /* If this is an EOI that's it, no CPPR adjustment done here, > + * all we needed was cleanup the stale pending bits and check > + * if there's anything left. > + */ > + if (scan_type == scan_eoi) > + return hirq; > + > + /* If we found an interrupt, adjust what the guest CPPR should > + * be as if we had just fetched that interrupt from HW > + */ > + if (hirq) > + xc->cppr = prio; > + /* > + * If it was an IPI the HW CPPR might have been lowered too much > + * as the HW interrupt we use for IPIs is routed to priority 0. > + * > + * We re-sync it here. 
> + */ > + if (xc->cppr != xc->hw_cppr) { > + xc->hw_cppr = xc->cppr; > + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR); > + } > + > + return hirq; > +} > + > +X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + u8 old_cppr; > + u32 hirq; > + > + DBG("H_XIRR\n"); > + > + xc->GLUE(X_STAT_PFX,h_xirr)++; > + > + /* First collect pending bits from HW */ > + GLUE(X_PFX,ack_pending)(xc); > + > + /* Cleanup the old-style bits if needed (they may have been > + * set by pull or an escalation interrupts) > + */ > + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) > + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, > + &vcpu->arch.pending_exceptions); > + > + DBG(" new pending=0x%02x hw_cppr=%d cppr=%d\n", > + xc->pending, xc->hw_cppr, xc->cppr); > + > + /* Grab previous CPPR and reverse map it */ > + old_cppr = xive_prio_to_guest(xc->cppr); > + > + /* Scan for actual interrupts */ > + hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch); > + > + DBG(" got hirq=0x%x hw_cppr=%d cppr=%d\n", > + hirq, xc->hw_cppr, xc->cppr); > + > +#ifdef XIVE_RUNTIME_CHECKS > + /* That should never hit */ > + if (hirq & 0xff000000) > + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); > +#endif > + > + /* > + * XXX We could check if the interrupt is masked here and > + * filter it. If we chose to do so, we would need to do: > + * > + * if (masked) { > + * lock(); > + * if (masked) { > + * old_Q = true; > + * hirq = 0; > + * } > + * unlock(); > + * } > + */ > + > + /* Return interrupt and old CPPR in GPR4 */ > + vcpu->arch.gpr[4] = hirq | (old_cppr << 24); > + > + return H_SUCCESS; > +} > + > +X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned > long server) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + u8 pending = xc->pending; > + u32 hirq; > + u8 pipr; > + > + DBG("H_IPOLL(server=%ld)\n", server); > + > + xc->GLUE(X_STAT_PFX,h_ipoll)++; > + > + /* Grab the target VCPU if not the current one */ > + if (xc->server_num != server) { > + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); > + if (!vcpu) > + return H_PARAMETER; > + xc = vcpu->arch.xive_vcpu; > + > + /* Scan all priorities */ > + pending = 0xff; > + } else { > + /* Grab pending interrupt if any */ > + pipr = __x_readb(__x_tm_area + TM_QW1_OS + TM_PIPR); > + if (pipr < 8) > + pending |= 1 << pipr; > + } > + > + hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll); > + > + /* Return interrupt and old CPPR in GPR4 */ > + vcpu->arch.gpr[4] = hirq | (xc->cppr << 24); > + > + return H_SUCCESS; > +} > + > +static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) > +{ > + u8 pending, prio; > + > + pending = xc->pending; > + if (xc->mfrr != 0xff) { > + if (xc->mfrr < 8) > + pending |= 1 << xc->mfrr; > + else > + pending |= 0x80; > + } > + if (!pending) > + return; > + prio = ffs(pending) - 1; > + > + __x_writeb(prio, __x_tm_area + TM_SPC_SET_OS_PENDING); > +} > + > +X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + u8 old_cppr; > + > + DBG("H_CPPR(cppr=%ld)\n", cppr); > + > + xc->GLUE(X_STAT_PFX,h_cppr)++; > + > + /* Map CPPR */ > + cppr = xive_prio_from_guest(cppr); > + > + /* Remember old and update SW state */ > + old_cppr = xc->cppr; > + xc->cppr = cppr; > + > + /* > + * We are masking less, we need to look for pending things > + * to deliver and set VP pending bits accordingly to trigger 
> + * a new interrupt otherwise we might miss MFRR changes for > + * which we have optimized out sending an IPI signal. > + */ > + if (cppr > old_cppr) > + GLUE(X_PFX,push_pending_to_hw)(xc); > + > + /* Apply new CPPR */ > + xc->hw_cppr = cppr; > + __x_writeb(cppr, __x_tm_area + TM_QW1_OS + TM_CPPR); > + > + return H_SUCCESS; > +} > + > +X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) > +{ > + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; > + struct kvmppc_xive_src_block *sb; > + struct kvmppc_xive_irq_state *state; > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + struct xive_irq_data *xd; > + u8 new_cppr = xirr >> 24; > + u32 irq = xirr & 0x00ffffff, hw_num; > + u16 src; > + int rc = 0; > + > + DBG("H_EOI(xirr=%08lx)\n", xirr); > + > + xc->GLUE(X_STAT_PFX,h_eoi)++; > + > + xc->cppr = xive_prio_from_guest(new_cppr); > + > + /* > + * IPIs are synthetized from MFRR and thus don't need > + * any special EOI handling. The underlying interrupt > + * used to signal MFRR changes is EOId when fetched from > + * the queue. > + */ > + if (irq == XICS_IPI || irq == 0) > + goto bail; > + > + /* Find interrupt source */ > + sb = kvmppc_xive_find_source(xive, irq, &src); > + if (!sb) { > + DBG(" source not found !\n"); > + rc = H_PARAMETER; > + goto bail; > + } > + state = &sb->irq_state[src]; > + kvmppc_xive_select_irq(state, &hw_num, &xd); > + > + state->in_eoi = true; > + mb(); > + > + again: > + if (state->guest_priority == MASKED) { > + arch_spin_lock(&sb->lock); > + if (state->guest_priority != MASKED) { > + arch_spin_unlock(&sb->lock); > + goto again; > + } > + DBG(" EOI on saved P...\n"); > + > + /* Clear old_p, that will cause unmask to perform an EOI */ > + state->old_p = false; > + > + arch_spin_unlock(&sb->lock); > + } else { > + DBG(" EOI on source...\n"); > + > + /* Perform EOI on the source */ > + GLUE(X_PFX,source_eoi)(hw_num, xd); > + > + /* If it's an emulated LSI, check level and resend */ > + if (state->lsi && state->asserted) > + __x_writeq(0, __x_trig_page(xd)); > + > + } > + > + mb(); > + state->in_eoi = false; > + bail: > + > + /* Re-evaluate pending IRQs and update HW */ > + GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi); > + GLUE(X_PFX,push_pending_to_hw)(xc); > + DBG(" after scan pending=%02x\n", xc->pending); > + > + /* Apply new CPPR */ > + xc->hw_cppr = xc->cppr; > + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR); > + > + return rc; > +} > + > +X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, > + unsigned long mfrr) > +{ > + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; > + > + DBG("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); > + > + xc->GLUE(X_STAT_PFX,h_ipi)++; > + > + /* Find target */ > + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); > + if (!vcpu) > + return H_PARAMETER; > + xc = vcpu->arch.xive_vcpu; > + > + /* Locklessly write over MFRR */ > + xc->mfrr = mfrr; > + > + /* Shoot the IPI if most favored than target cppr */ > + if (mfrr < xc->cppr) > + __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); > + > + return H_SUCCESS; > +} > diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h > index 5a9a10b..3f1be85 100644 > --- a/arch/powerpc/kvm/irq.h > +++ b/arch/powerpc/kvm/irq.h > @@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) > #endif > #ifdef CONFIG_KVM_XICS > ret = ret || (kvm->arch.xics != NULL); > + ret = ret || (kvm->arch.xive != NULL); > #endif > smp_rmb(); > return ret; > diff --git a/arch/powerpc/kvm/powerpc.c 
b/arch/powerpc/kvm/powerpc.c > index 95c91a9..de79bd72 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -37,6 +37,8 @@ > #include <asm/cputhreads.h> > #include <asm/irqflags.h> > #include <asm/iommu.h> > +#include <asm/xive.h> > + > #include "timing.h" > #include "irq.h" > #include "../mm/mmu_decl.h" > @@ -699,7 +701,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) > kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); > break; > case KVMPPC_IRQ_XICS: > - kvmppc_xics_free_icp(vcpu); > + if (xive_enabled()) > + kvmppc_xive_cleanup_vcpu(vcpu); > + else > + kvmppc_xics_free_icp(vcpu); > break; > } > > @@ -1219,8 +1224,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu > *vcpu, > > r = -EPERM; > dev = kvm_device_from_filp(f.file); > - if (dev) > - r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); > + if (dev) { > + if (xive_enabled()) > + r = kvmppc_xive_connect_vcpu(dev, vcpu, > cap->args[1]); > + else > + r = kvmppc_xics_connect_vcpu(dev, vcpu, > cap->args[1]); > + } > > fdput(f); > break; > @@ -1244,7 +1253,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) > return true; > #endif > #ifdef CONFIG_KVM_XICS > - if (kvm->arch.xics) > + if (kvm->arch.xics || kvm->arch.xive) > return true; > #endif > return false; > diff --git a/arch/powerpc/platforms/powernv/opal.c > b/arch/powerpc/platforms/powernv/opal.c > index e0f856b..d71cd77 100644 > --- a/arch/powerpc/platforms/powernv/opal.c > +++ b/arch/powerpc/platforms/powernv/opal.c > @@ -890,3 +890,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind); > EXPORT_SYMBOL_GPL(opal_write_oppanel_async); > /* Export this for KVM */ > EXPORT_SYMBOL_GPL(opal_int_set_mfrr); > +EXPORT_SYMBOL_GPL(opal_int_eoi); > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index 96037e0..6429cd3 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -45,12 +45,14 @@ > #endif > > bool __xive_enabled; > +EXPORT_SYMBOL_GPL(__xive_enabled); > bool xive_cmdline_disabled; > > /* We use only one priority for now */ > static u8 xive_irq_priority; > > void __iomem *xive_tm_area; > +EXPORT_SYMBOL_GPL(xive_tm_area); > u32 xive_tm_offset; > static const struct xive_ops *xive_ops; > static struct irq_domain *xive_irq_domain; > @@ -304,7 +306,7 @@ static void xive_irq_eoi(struct irq_data *d) > DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", > d->irq, irqd_to_hwirq(d), xc->pending_prio); > > - if (!irqd_irq_disabled(d)) > + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) > xive_do_source_eoi(irqd_to_hwirq(d), xd); > > /* > @@ -579,9 +581,10 @@ static int xive_irq_set_affinity(struct irq_data *d, > * Only configure the irq if it's not currently passed-through to > * a KVM guest > */ > - rc = xive_ops->configure_irq(hw_irq, > - get_hard_smp_processor_id(target), > - xive_irq_priority, d->irq); > + if (!irqd_is_forwarded_to_vcpu(d)) > + rc = xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(target), > + xive_irq_priority, d->irq); > if (rc < 0) { > pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq); > return rc; > @@ -661,6 +664,123 @@ static int xive_irq_retrigger(struct irq_data *d) > return 1; > } > > +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + int rc; > + u8 pq; > + > + /* > + * We only support this on interrupts that do not require > + * firmware calls for masking and 
unmasking > + */ > + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) > + return -EIO; > + > + /* > + * This is called by KVM with state non-NULL for enabling > + * pass-through or NULL for disabling it > + */ > + if (state) { > + irqd_set_forwarded_to_vcpu(d); > + > + /* Set it to PQ=10 state to prevent further sends */ > + pq = xive_poke_esb(xd, 0xe00); Use XIVE_ESB_SET_PQ_xx constants in these xive_poke_esb() calls (as you have done elsewhere). > + > + /* No target ? nothing to do */ > + if (xd->target == XIVE_INVALID_TARGET) { > + /* > + * An untargetted interrupt should have been > + * also masked at the source > + */ > + WARN_ON(pq & 2); > + > + return 0; > + } > + > + /* > + * If P was set, adjust state to PQ=11 to indicate > + * that a resend is needed for the interrupt to reach > + * the guest. Also remember the value of P. > + * > + * This also tells us that it's in flight to a host queue > + * or has already been fetched but hasn't been EOIed yet > + * by the host. This it's potentially using up a host > + * queue slot. This is important to know because as long > + * as this is the case, we must not hard-unmask it when > + * "returning" that interrupt to the host. > + * > + * This saved_p is cleared by the host EOI, when we know > + * for sure the queue slot is no longer in use. > + */ > + if (pq & 2) { > + pq = xive_poke_esb(xd, 0xf00); > + xd->saved_p = true; > + > + /* > + * Sync the XIVE source HW to ensure the interrupt > + * has gone through the EAS before we change its > + * target to the guest. That should guarantee us > + * that we *will* eventually get an EOI for it on > + * the host. Otherwise there would be a small window > + * for P to be seen here but the interrupt going > + * to the guest queue. > + */ > + if (xive_ops->sync_source) > + xive_ops->sync_source(hw_irq); > + } else > + xd->saved_p = false; > + } else { > + irqd_clr_forwarded_to_vcpu(d); > + > + /* No host target ? hard mask and return */ > + if (xd->target == XIVE_INVALID_TARGET) { > + xive_do_source_set_mask(xd, true); > + return 0; > + } > + > + /* > + * Sync the XIVE source HW to ensure the interrupt > + * has gone through the EAS before we change its > + * target to the host. > + */ > + if (xive_ops->sync_source) > + xive_ops->sync_source(hw_irq); > + > + /* > + * By convention we are called with the interrupt in > + * a PQ=10 or PQ=11 state, ie, it won't fire and will > + * have latched in Q whether there's a pending HW > + * interrupt or not. > + * > + * First reconfigure the target. > + */ > + rc = xive_ops->configure_irq(hw_irq, > + > get_hard_smp_processor_id(xd->target), > + xive_irq_priority, d->irq); > + if (rc) > + return rc; > + > + /* > + * Then if saved_p is not set, effectively re-enable the > + * interrupt with an EOI. If it is set, we know there is > + * still a message in a host queue somewhere that will be > + * EOId eventually. > + * > + * Note: We don't check irqd_irq_disabled(). Effectively, > + * we *will* let the irq get through even if masked if the > + * HW is still firing it in order to deal with the whole > + * saved_p business properly. If the interrupt triggers > + * while masked, the generic code will re-mask it anyway. 
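To illustrate the earlier point about the xive_poke_esb() calls in this
function: assuming XIVE_ESB_SET_PQ_10 and XIVE_ESB_SET_PQ_11 are defined as
the 0xe00 and 0xf00 offsets used here, the two calls would simply become:

    /* Set it to PQ=10 state to prevent further sends */
    pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
    ...
    if (pq & 2) {
        pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);

which keeps the PQ state transitions greppable alongside the other ESB
manipulations in the series.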
> + */ > + if (!xd->saved_p) > + xive_do_source_eoi(hw_irq, xd); > + > + } > + return 0; > +} > + > static struct irq_chip xive_irq_chip = { > .name = "XIVE-IRQ", > .irq_startup = xive_irq_startup, > @@ -671,12 +791,14 @@ static struct irq_chip xive_irq_chip = { > .irq_set_affinity = xive_irq_set_affinity, > .irq_set_type = xive_irq_set_type, > .irq_retrigger = xive_irq_retrigger, > + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, > }; > > bool is_xive_irq(struct irq_chip *chip) > { > return chip == &xive_irq_chip; > } > +EXPORT_SYMBOL_GPL(is_xive_irq); > > void xive_cleanup_irq_data(struct xive_irq_data *xd) > { > @@ -691,6 +813,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd) > xd->trig_mmio = NULL; > } > } > +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); > > static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) > { > diff --git a/arch/powerpc/sysdev/xive/native.c > b/arch/powerpc/sysdev/xive/native.c > index 26cc6bf..0130af8 100644 > --- a/arch/powerpc/sysdev/xive/native.c > +++ b/arch/powerpc/sysdev/xive/native.c > @@ -27,6 +27,7 @@ > #include <asm/errno.h> > #include <asm/xive.h> > #include <asm/opal.h> > +#include <asm/kvm_ppc.h> > > #include "xive-regs.h" > #include "xive-internal.h" > @@ -98,6 +99,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct > xive_irq_data *data) > } > return 0; > } > +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data); > > int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) > { > @@ -111,6 +113,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 > prio, u32 sw_irq) > } > return rc == 0 ? 0 : -ENXIO; > } > +EXPORT_SYMBOL_GPL(xive_native_configure_irq); > + > > /* This can be called multiple time to change a queue configuration */ > int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, > @@ -187,6 +191,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q > *q, u8 prio, > fail: > return rc; > } > +EXPORT_SYMBOL_GPL(xive_native_configure_queue); > > static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) > { > @@ -211,6 +216,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q > *q, u8 prio) > iounmap(q->eoi_mmio); > q->eoi_mmio = NULL; > } > +EXPORT_SYMBOL_GPL(xive_native_disable_queue); > > static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 > prio) > { > @@ -297,6 +303,7 @@ u32 xive_native_alloc_irq(void) > return 0; > return rc; > } > +EXPORT_SYMBOL_GPL(xive_native_alloc_irq); > > void xive_native_free_irq(u32 irq) > { > @@ -307,6 +314,7 @@ void xive_native_free_irq(u32 irq) > msleep(1); > } > } > +EXPORT_SYMBOL_GPL(xive_native_free_irq); > > static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) > { > @@ -406,10 +414,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, > struct xive_cpu *xc) > } > } > > -static void xive_native_sync_source(u32 hw_irq) > +void xive_native_sync_source(u32 hw_irq) > { > opal_xive_sync(XIVE_SYNC_EAS, hw_irq); > } > +EXPORT_SYMBOL_GPL(xive_native_sync_source); > > static const struct xive_ops xive_native_ops = { > .populate_irq_data = xive_native_populate_irq_data, > @@ -468,10 +477,38 @@ static bool xive_parse_provisioning(struct device_node > *np) > return true; > } > > +static void xive_native_setup_pools(void) > +{ > + u32 max_pir = 0; > + unsigned int cpu; > + > + /* > + * The HW won't let us enable OS VPs for KVM is we don't > + * have enabled pool VPs so let's do that. 
First we find > + * out our highest HW CPU ID > + */ > + for_each_possible_cpu(cpu) { > + u32 hw_id = get_hard_smp_processor_id(cpu); > + if (hw_id > max_pir) > + max_pir = hw_id; > + } > + > + /* Allocate a pool big enough */ > + pr_debug("XIVE: Allocating VP block for pool size %d\n", > + max_pir + 1); > + xive_pool_vps = xive_native_alloc_vp_block(max_pir + 1); > + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) > + pr_err("XIVE: No pool VPsvp KVM might not function\n"); > + > + pr_debug("XIVE: Pool VPs allocated at 0x%x for max_pir 0x%x\n", > + xive_pool_vps, max_pir); > +} > + > u32 xive_native_default_eq_shift(void) > { > return xive_queue_shift; > } > +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); > > bool xive_native_init(void) > { > @@ -481,7 +518,7 @@ bool xive_native_init(void) > struct property *prop; > u8 max_prio = 7; > const __be32 *p; > - u32 val; > + u32 val, cpu; > s64 rc; > > if (xive_cmdline_disabled) > @@ -517,6 +554,10 @@ bool xive_native_init(void) > break; > } > > + /* Configure TM areas for KVM */ > + for_each_possible_cpu(cpu) > + kvmppc_set_xive_tm_area(cpu, r.start, tm_area); > + > /* Grab size of provisionning pages */ > xive_parse_provisioning(np); > > @@ -528,6 +569,9 @@ bool xive_native_init(void) > return false; > } > > + /* Setup some dummy HV pool VPs */ > + xive_native_setup_pools(); > + > /* Initialize XIVE core with our backend */ > if (!xive_core_init(&xive_native_ops, tm_area, TM_QW3_HV_PHYS, > max_prio)) { > @@ -602,3 +646,47 @@ void xive_native_free_vp_block(u32 vp_base) > pr_warn("XIVE: OPAL error %lld freeing VP block\n", rc); > } > EXPORT_SYMBOL_GPL(xive_native_free_vp_block); > + > +int xive_native_enable_vp(u32 vp_id) > +{ > + s64 rc; > + > + for (;;) { > + rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); > + if (rc != OPAL_BUSY) > + break; > + msleep(1); > + } > + return rc ? -EIO : 0; > +} > +EXPORT_SYMBOL_GPL(xive_native_enable_vp); > + > +int xive_native_disable_vp(u32 vp_id) > +{ > + s64 rc; > + > + for (;;) { > + rc = opal_xive_set_vp_info(vp_id, 0, 0); > + if (rc != OPAL_BUSY) > + break; > + msleep(1); > + } > + return rc ? 
-EIO : 0; > +} > +EXPORT_SYMBOL_GPL(xive_native_disable_vp); > + > +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) > +{ > + __be64 vp_cam_be; > + __be32 vp_chip_id_be; > + s64 rc; > + > + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, > &vp_chip_id_be); > + if (rc) > + return -EIO; > + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu; > + *out_chip_id = be32_to_cpu(vp_chip_id_be); > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(xive_native_get_vp_info); > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 2c14ad9..d1a6e55 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, > u32 type); > void kvm_unregister_device_ops(u32 type); > > extern struct kvm_device_ops kvm_mpic_ops; > -extern struct kvm_device_ops kvm_xics_ops; > extern struct kvm_device_ops kvm_arm_vgic_v2_ops; > extern struct kvm_device_ops kvm_arm_vgic_v3_ops; > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index a17d787..1b0da57 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2839,10 +2839,6 @@ static struct kvm_device_ops > *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { > [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, > [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, > #endif > - > -#ifdef CONFIG_KVM_XICS > - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, > -#endif > }; > > int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) > -- > 2.9.3
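
Following up on the xive_poke_esb() comment above, and purely to
illustrate what I had in mind: a minimal sketch of the two call sites in
xive_irq_set_vcpu_affinity(), assuming the XIVE_ESB_SET_PQ_* definitions
(and, if you have one, a P-bit constant such as XIVE_ESB_VAL_P) in
xive-regs.h correspond to the raw values used in the patch (0xe00 sets
PQ=10, 0xf00 sets PQ=11, and 0x2 is the P bit in the returned value):

	/* Set it to PQ=10 state to prevent further sends */
	pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
	...
	/* If P was set, move to PQ=11 so we remember a resend is needed */
	if (pq & XIVE_ESB_VAL_P) {
		pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
		xd->saved_p = true;
	}

The WARN_ON(pq & 2) in the untargeted case could get the same treatment,
so the P-bit test reads symbolically rather than as a magic number.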