Benjamin Herrenschmidt <b...@kernel.crashing.org> writes:

> The XIVE interrupt controller is the new interrupt controller
> found in POWER9. It supports advanced virtualization capabilities
> among other things.
>
> Currently we use a set of firmware calls that simulate the old
> "XICS" interrupt controller but this is fairly inefficient.
>
> This adds the framework for using XIVE along with a native
> backend which OPAL for configuration. Later, a backend allowing
               ^
               calls?

> the use in a KVM or PowerVM guest will also be provided.
>
> This disables some fast path for interrupts in KVM when XIVE is
> enabled as these rely on the firmware emulation code which is no
> longer available when the XIVE is used natively by Linux.
>
> A latter patch will make KVM also directly exploit the XIVE, thus
> recovering the lost performance (and more).
>
> Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> ---
>  arch/powerpc/include/asm/xive.h          |  116 +++
>  arch/powerpc/include/asm/xmon.h          |    2 +
>  arch/powerpc/platforms/powernv/Kconfig   |    2 +
>  arch/powerpc/platforms/powernv/setup.c   |   15 +-
>  arch/powerpc/platforms/powernv/smp.c     |   39 +-
>  arch/powerpc/sysdev/Kconfig              |    1 +
>  arch/powerpc/sysdev/Makefile             |    1 +
>  arch/powerpc/sysdev/xive/Kconfig         |    7 +
>  arch/powerpc/sysdev/xive/Makefile        |    4 +
>  arch/powerpc/sysdev/xive/common.c        | 1175 
> ++++++++++++++++++++++++++++++
>  arch/powerpc/sysdev/xive/native.c        |  604 +++++++++++++++
>  arch/powerpc/sysdev/xive/xive-internal.h |   51 ++
>  arch/powerpc/sysdev/xive/xive-regs.h     |   88 +++
>  arch/powerpc/xmon/xmon.c                 |   93 ++-
>  14 files changed, 2186 insertions(+), 12 deletions(-)

I'm not going to review this in one go, given it's 10:30pm already.

So just a few things that hit me straight away.

> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> new file mode 100644
> index 0000000..b1604b73
> --- /dev/null
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -0,0 +1,116 @@

Copyright missing.

> +#ifndef _ASM_POWERPC_XIVE_H
> +#define _ASM_POWERPC_XIVE_H
> +
> +#define XIVE_INVALID_VP      0xffffffff
> +
> +#ifdef CONFIG_PPC_XIVE
> +
> +extern void __iomem *xive_tm_area;

I think Paul already commented on "tm" being an overly used acronym.

> +extern u32 xive_tm_offset;
> +
> +/*
> + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
> + * have it stored in the xive_cpu structure. We also cache
> + * for normal interrupts the current target CPU.
> + */
> +struct xive_irq_data {
> +     /* Setup by backend */
> +     u64 flags;
> +#define XIVE_IRQ_FLAG_STORE_EOI      0x01
> +#define XIVE_IRQ_FLAG_LSI    0x02
> +#define XIVE_IRQ_FLAG_SHIFT_BUG      0x04
> +#define XIVE_IRQ_FLAG_MASK_FW        0x08
> +#define XIVE_IRQ_FLAG_EOI_FW 0x10

I don't love that style; I'd prefer them just prior to the struct.

> +     u64 eoi_page;
> +     void __iomem *eoi_mmio;
> +     u64 trig_page;
> +     void __iomem *trig_mmio;
> +     u32 esb_shift;
> +     int src_chip;

Why not space out the members like you do in xive_q below? I think that
looks better given you have the long __iomem lines.

> +
> +     /* Setup/used by frontend */
> +     int target;
> +     bool saved_p;
> +};
> +#define XIVE_INVALID_CHIP_ID -1
> +
> +/* A queue tracking structure in a CPU */
> +struct xive_q {
> +     __be32                  *qpage;
> +     u32                     msk;
> +     u32                     idx;
> +     u32                     toggle;
> +     u64                     eoi_phys;
> +     void __iomem            *eoi_mmio;
> +     u32                     esc_irq;
> +     atomic_t                count;
> +     atomic_t                pending_count;
> +};
> +
> +/*
> + * "magic" ESB MMIO offsets

What's an ESB?

> + */
> +#define XIVE_ESB_GET         0x800
> +#define XIVE_ESB_SET_PQ_00   0xc00
> +#define XIVE_ESB_SET_PQ_01   0xd00
> +#define XIVE_ESB_SET_PQ_10   0xe00
> +#define XIVE_ESB_SET_PQ_11   0xf00
> +#define XIVE_ESB_MASK                XIVE_ESB_SET_PQ_01
> +
> +extern bool __xive_enabled;
> +
> +static inline bool xive_enabled(void) { return __xive_enabled; }
> +
> +extern bool xive_native_init(void);
> +extern void xive_smp_probe(void);
> +extern int  xive_smp_prepare_cpu(unsigned int cpu);
> +extern void xive_smp_setup_cpu(void);
> +extern void xive_smp_disable_cpu(void);
> +extern void xive_kexec_teardown_cpu(int secondary);
> +extern void xive_shutdown(void);
> +extern void xive_flush_interrupt(void);
> +
> +/* xmon hook */
> +extern void xmon_xive_do_dump(int cpu);
> +
> +/* APIs used by KVM */
> +extern u32 xive_native_default_eq_shift(void);
> +extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
> +extern void xive_native_free_vp_block(u32 vp_base);
> +extern int xive_native_populate_irq_data(u32 hw_irq,
> +                                      struct xive_irq_data *data);
> +extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
> +extern u32 xive_native_alloc_irq(void);
> +extern void xive_native_free_irq(u32 irq);
> +extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 
> sw_irq);
> +
> +extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> +                                    __be32 *qpage, u32 order, bool 
> can_escalate);
> +extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
> +
> +extern bool __xive_irq_trigger(struct xive_irq_data *xd);
> +extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
> +extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
> +
> +extern bool is_xive_irq(struct irq_chip *chip);
> +
> +#else
> +
> +static inline bool xive_enabled(void) { return false; }
> +
> +static inline bool xive_native_init(void) { return false; }
> +static inline void xive_smp_probe(void) { }
> +extern inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
> +static inline void xive_smp_setup_cpu(void) { }
> +static inline void xive_smp_disable_cpu(void) { }
> +static inline void xive_kexec_teardown_cpu(int secondary) { }
> +static inline void xive_shutdown(void) { }
> +static inline void xive_flush_interrupt(void) { }
> +
> +static inline u32 xive_native_alloc_vp_block(u32 max_vcpus)
> +    { return XIVE_INVALID_VP; }
> +static inline void xive_native_free_vp_block(u32 vp_base) { }
> +
> +#endif
> +
> +#endif /* _ASM_POWERPC_XIVE_H */
> diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
> index 5eb8e59..eb42a0c 100644
> --- a/arch/powerpc/include/asm/xmon.h
> +++ b/arch/powerpc/include/asm/xmon.h
> @@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head 
> *list) { };
>  extern int cpus_are_in_xmon(void);
>  #endif
>  
> +extern void xmon_printf(const char *format, ...);
> +
>  #endif /* __KERNEL __ */
>  #endif /* __ASM_POWERPC_XMON_H */
> diff --git a/arch/powerpc/platforms/powernv/Kconfig 
> b/arch/powerpc/platforms/powernv/Kconfig
> index 3a07e4d..81ee2ed 100644
> --- a/arch/powerpc/platforms/powernv/Kconfig
> +++ b/arch/powerpc/platforms/powernv/Kconfig
> @@ -4,6 +4,8 @@ config PPC_POWERNV
>       select PPC_NATIVE
>       select PPC_XICS
>       select PPC_ICP_NATIVE
> +     select PPC_XIVE
> +     select PPC_XIVE_NATIVE
>       select PPC_P7_NAP
>       select PCI
>       select PCI_MSI
> diff --git a/arch/powerpc/platforms/powernv/setup.c 
> b/arch/powerpc/platforms/powernv/setup.c
> index d50c7d9..adceac9 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -32,6 +32,7 @@
>  #include <asm/machdep.h>
>  #include <asm/firmware.h>
>  #include <asm/xics.h>
> +#include <asm/xive.h>
>  #include <asm/opal.h>
>  #include <asm/kexec.h>
>  #include <asm/smp.h>
> @@ -76,7 +77,9 @@ static void __init pnv_init(void)
>  
>  static void __init pnv_init_IRQ(void)
>  {
> -     xics_init();
> +     /* Try using a XIVE if available, otherwise use a XICS */
> +     if (!xive_native_init())
> +             xics_init();
>  
>       WARN_ON(!ppc_md.get_irq);
>  }
> @@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
>  
>  static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
>  {
> -     xics_kexec_teardown_cpu(secondary);
> +     if (xive_enabled())
> +             xive_kexec_teardown_cpu(secondary);
> +     else
> +             xics_kexec_teardown_cpu(secondary);
>  
>       /* On OPAL, we return all CPUs to firmware */
> -
>       if (!firmware_has_feature(FW_FEATURE_OPAL))
>               return;
>  
> @@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int 
> secondary)
>               /* Primary waits for the secondaries to have reached OPAL */
>               pnv_kexec_wait_secondaries_down();
>  
> +             /* Switch XIVE back to emulation mode */
> +             if (xive_enabled())
> +                     xive_shutdown();
> +
>               /*
>                * We might be running as little-endian - now that interrupts
>                * are disabled, reset the HILE bit to big-endian so we don't
> diff --git a/arch/powerpc/platforms/powernv/smp.c 
> b/arch/powerpc/platforms/powernv/smp.c
> index 8b67e1e..f571955 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -29,6 +29,7 @@
>  #include <asm/vdso_datapage.h>
>  #include <asm/cputhreads.h>
>  #include <asm/xics.h>
> +#include <asm/xive.h>
>  #include <asm/opal.h>
>  #include <asm/runlatch.h>
>  #include <asm/code-patching.h>
> @@ -47,7 +48,9 @@
>  
>  static void pnv_smp_setup_cpu(int cpu)
>  {
> -     if (cpu != boot_cpuid)
> +     if (xive_enabled())
> +             xive_smp_setup_cpu();
> +     else if (cpu != boot_cpuid)
>               xics_setup_cpu();
>  
>  #ifdef CONFIG_PPC_DOORBELL
> @@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void)
>       vdso_data->processorCount--;
>       if (cpu == boot_cpuid)
>               boot_cpuid = cpumask_any(cpu_online_mask);
> -     xics_migrate_irqs_away();
> +     if (xive_enabled())
> +             xive_smp_disable_cpu();
> +     else
> +             xics_migrate_irqs_away();
>       return 0;
>  }
>  
> @@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void)
>               if (((srr1 & wmask) == SRR1_WAKEEE) ||
>                   ((srr1 & wmask) == SRR1_WAKEHVI) ||
>                   (local_paca->irq_happened & PACA_IRQ_EE)) {
> -                     if (cpu_has_feature(CPU_FTR_ARCH_300))
> -                             icp_opal_flush_interrupt();
> -                     else
> +                     if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +                             if (xive_enabled())
> +                                     xive_flush_interrupt();
> +                             else
> +                                     icp_opal_flush_interrupt();
> +                     } else
>                               icp_native_flush_interrupt();
>               } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
>                       unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
> @@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr)
>       return smp_generic_cpu_bootable(nr);
>  }
>  
> +static int pnv_smp_prepare_cpu(int cpu)
> +{
> +     if (xive_enabled())
> +             return xive_smp_prepare_cpu(cpu);
> +     return 0;
> +}
> +
> +static void __init pnv_smp_probe(void)
> +{
> +     if (xive_enabled())
> +             xive_smp_probe();
> +     else
> +             xics_smp_probe();
> +}
> +
>  static struct smp_ops_t pnv_smp_ops = {
>       .message_pass   = smp_muxed_ipi_message_pass,
> -     .cause_ipi      = NULL, /* Filled at runtime by xics_smp_probe() */
> -     .probe          = xics_smp_probe,
> +     .cause_ipi      = NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
> +     .probe          = pnv_smp_probe,
> +     .prepare_cpu    = pnv_smp_prepare_cpu,
>       .kick_cpu       = pnv_smp_kick_cpu,
>       .setup_cpu      = pnv_smp_setup_cpu,
>       .cpu_bootable   = pnv_cpu_bootable,
> diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
> index 52dc165..caf882e 100644
> --- a/arch/powerpc/sysdev/Kconfig
> +++ b/arch/powerpc/sysdev/Kconfig
> @@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
>       default y if PPC_POWERNV
>  
>  source "arch/powerpc/sysdev/xics/Kconfig"
> +source "arch/powerpc/sysdev/xive/Kconfig"
>  
>  config PPC_SCOM
>       bool
> diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
> index a254824..c0ae11d 100644
> --- a/arch/powerpc/sysdev/Makefile
> +++ b/arch/powerpc/sysdev/Makefile
> @@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)       += 
> udbg_memcons.o
>  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
>  
>  obj-$(CONFIG_PPC_XICS)               += xics/
> +obj-$(CONFIG_PPC_XIVE)               += xive/
>  
>  obj-$(CONFIG_GE_FPGA)                += ge/
> diff --git a/arch/powerpc/sysdev/xive/Kconfig 
> b/arch/powerpc/sysdev/xive/Kconfig
> new file mode 100644
> index 0000000..c8816c8
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/Kconfig
> @@ -0,0 +1,7 @@
> +config PPC_XIVE
> +       def_bool n
> +       select PPC_SMP_MUXED_IPI
> +       select HARDIRQS_SW_RESEND
> +
> +config PPC_XIVE_NATIVE
> +       def_bool n
> diff --git a/arch/powerpc/sysdev/xive/Makefile 
> b/arch/powerpc/sysdev/xive/Makefile
> new file mode 100644
> index 0000000..3fab303
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/Makefile
> @@ -0,0 +1,4 @@
> +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
> +
> +obj-y                                += common.o
> +obj-$(CONFIG_PPC_XIVE_NATIVE)        += native.o
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> new file mode 100644
> index 0000000..96037e0
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -0,0 +1,1175 @@
> +/*
> + * Copyright 2016,2017 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */

If here you put:

#define pr_fmt(fmt) "xive: " fmt

Then you can drop the prefix from every pr_xxx() in the whole file.

> +#include <linux/types.h>
> +#include <linux/threads.h>
> +#include <linux/kernel.h>
> +#include <linux/irq.h>
> +#include <linux/debugfs.h>

Unused?

> +#include <linux/smp.h>
> +#include <linux/interrupt.h>
> +#include <linux/seq_file.h>

Unused?

> +#include <linux/init.h>
> +#include <linux/cpu.h>
> +#include <linux/of.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/msi.h>
> +
> +#include <asm/prom.h>
> +#include <asm/io.h>
> +#include <asm/smp.h>
> +#include <asm/machdep.h>
> +#include <asm/irq.h>
> +#include <asm/errno.h>
> +#include <asm/xive.h>
> +#include <asm/xmon.h>
> +
> +#include "xive-regs.h"
> +#include "xive-internal.h"
> +
> +#undef DEBUG_FLUSH
> +#undef DEBUG_ALL
> +
> +#define DBG(fmt...)          pr_devel("XIVE: " fmt)
> +
> +#ifdef DEBUG_ALL
> +#define DBG_VERBOSE(fmt...)  pr_devel("XIVE: " fmt)
> +#else
> +#define DBG_VERBOSE(fmt...)  do { } while(0)
> +#endif
> +
> +bool __xive_enabled;
> +bool xive_cmdline_disabled;
> +
> +/* We use only one priority for now */
> +static u8 xive_irq_priority;
> +
> +void __iomem *xive_tm_area;
> +u32 xive_tm_offset;
> +static const struct xive_ops *xive_ops;
> +static struct irq_domain *xive_irq_domain;
> +
> +/* The IPIs all use the same logical irq number */
> +static u32 xive_ipi_irq;
> +
> +/* Xive state for each CPU */
> +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
> +
> +/*
> + * A "disabled" interrupt should never fire, to catch problems
> + * we set its logical number to this
> + */
> +#define XIVE_BAD_IRQ         0x7fffffff

Can it be anything? How about 0x7fbadbad ?

> +#define XIVE_MAX_IRQ         (XIVE_BAD_IRQ - 1)
> +
> +/* An invalid CPU target */
> +#define XIVE_INVALID_TARGET  (-1)
> +
> +static u32 xive_read_eq(struct xive_q *q, u8 prio, bool just_peek)

Can it have a doc comment? And tell me what an EQ is?

> +{
> +     u32 cur;
> +
> +     if (!q->qpage)
> +             return 0;

A newline or ..

> +     cur = be32_to_cpup(q->qpage + q->idx);
> +     if ((cur >> 31) == q->toggle)
> +             return 0;

.. two wouldn't hurt here.

> +     if (!just_peek) {
> +             q->idx = (q->idx + 1) & q->msk;
> +             if (q->idx == 0)
> +                     q->toggle ^= 1;
> +     }
> +     return cur & 0x7fffffff;

Is that XIVE_BAD_IRQ?

> +}
> +
> +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
> +{
> +     u32 hirq = 0;

Is that a hwirq or something different?

> +     u8 prio;
> +
> +     /* Find highest pending priority */
> +     while (xc->pending_prio != 0) {
> +             struct xive_q *q;
> +
> +             prio = ffs(xc->pending_prio) - 1;
> +             DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
> +
> +             /* Try to fetch */
> +             hirq = xive_read_eq(&xc->queue[prio], prio, just_peek);
> +
> +             /* Found something ? That's it */
> +             if (hirq)
> +                     break;
> +
> +             /* Clear pending bits */
> +             xc->pending_prio &= ~(1 << prio);
> +
> +             /*
> +              * Check if the queue count needs adjusting due to
> +              * interrupts being moved away.
> +              */
> +             q = &xc->queue[prio];
> +             if (atomic_read(&q->pending_count)) {
> +                     int p = atomic_xchg(&q->pending_count, 0);
> +                     if (p) {
> +                             WARN_ON(p > atomic_read(&q->count));
> +                             atomic_sub(p, &q->count);

I am not sure what's going on there.

> +                     }
> +             }
> +     }
> +
> +     /* If nothing was found, set CPPR to 0xff */

Would be nice to spell out CPPR somewhere.

> +     if (hirq == 0)
> +             prio = 0xff;
> +
> +     /* Update HW CPPR to match if necessary */
> +     if (prio != xc->cppr) {
> +             DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
> +             xc->cppr = prio;
> +             out_8(xive_tm_area + xive_tm_offset + TM_CPPR, prio);

What's the out_8() doing? I was expecting it to use xc, or something per-cpu.

> +     }
> +
> +     return hirq;
> +}
> +
> +#ifdef CONFIG_XMON
> +static void xive_dump_eq(const char *name, struct xive_q *q)
> +{
> +     u32 i0, i1, idx;
> +
> +     if (!q->qpage)
> +             return;
> +     idx = q->idx;
> +     i0 = be32_to_cpup(q->qpage + idx);
> +     idx = (idx + 1) & q->msk;
> +     i1 = be32_to_cpup(q->qpage + idx);
> +     xmon_printf("  %s Q T=%d %08x %08x ...\n", name,
> +                 q->toggle, i0, i1);
> +}
> +
> +void xmon_xive_do_dump(int cpu)
> +{
> +     struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +     struct xive_irq_data *xd;
> +     uint64_t val, offset;

u64 ?

> +
> +     xmon_printf("XIVE state for CPU %d:\n", cpu);
> +     xmon_printf("  pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
> +     xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
> +     xd = &xc->ipi_data;
> +     offset = 0x800;
> +     if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +             offset |= offset << 4;
> +     val = in_be64(xd->eoi_mmio + offset);
> +     xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
> +                 val & 2 ? 'P' : 'p',
> +                 val & 1 ? 'Q' : 'q');
> +}
> +#endif /* CONFIG_XMON */
> +
> +static void xive_update_pending_irqs(struct xive_cpu *xc)
> +{
> +     u8 he, cppr;
> +     u16 ack;
> +
> +     /* Perform the acknowledge hypervisor to register cycle */
> +     ack = be16_to_cpu(__raw_readw(xive_tm_area + TM_SPC_ACK_HV_REG));
> +
> +     /* Synchronize subsequent queue accesses */
> +     mb();
> +
> +     DBG_VERBOSE("CPU %d get_irq, ack=%04x\n", smp_processor_id(), ack);
> +
> +     /* Check the HE field */
> +     cppr = ack & 0xff;
> +     he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
> +     switch(he) {
> +     case TM_QW3_NSR_HE_NONE:
> +             break;
> +     case TM_QW3_NSR_HE_PHYS:
> +             if (cppr == 0xff)
> +                     return;
> +             xc->pending_prio |= 1 << cppr;
> +             if (cppr >= xc->cppr)
> +                     pr_err("XIVE: CPU %d odd ack CPPR, got %d at %d\n",
> +                            smp_processor_id(), cppr, xc->cppr);
> +             xc->cppr = cppr;
> +             break;
> +     case TM_QW3_NSR_HE_POOL:
> +     case TM_QW3_NSR_HE_LSI:
> +             pr_err("XIVE: CPU %d got unexpected interrupt type HE=%d\n",
> +                    smp_processor_id(), he);
> +             return;
> +     }
> +}
> +
> +static unsigned int xive_get_irq(void)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +     u32 hirq;
> +
> +     /*
> +      * This can be called either as a result of a HW interrupt or
> +      * as a "replay" because EOI decided there was still something
> +      * in one of the queues.
> +      *
> +      * First we perform an ACK cycle in order to update our mask
> +      * of pending priorities. This will also have the effect of
> +      * updating the CPPR to the most favored pending interrupts.
> +      *
> +      * In the future, if we have a way to differenciate a first
> +      * entry (on HW interrupt) from a replay triggered by EOI,
> +      * we could skip this on replays unless we soft-mask tells us
> +      * that a new HW interrupt occurred.
> +      */
> +     xive_update_pending_irqs(xc);
> +
> +     DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
> +
> +     hirq = xive_scan_interrupts(xc, false);
> +
> +     DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
> +         hirq, xc->pending_prio);
> +
> +     /* Return pending interrupt if any */
> +     if (hirq == XIVE_BAD_IRQ)
> +             return 0;
> +     return hirq;
> +}
> +
> +
> +static void xive_do_queue_eoi(struct xive_cpu *xc)
> +{
> +     if (xive_scan_interrupts(xc, true) != 0) {
> +             DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
> +             force_external_irq_replay();
> +     }
> +}
> +
> +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
> +{
> +     u64 val;
> +
> +     if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +             offset |= offset << 4;
> +
> +     val = in_be64(xd->eoi_mmio + offset);
> +
> +     return (u8)val;
> +}
> +
> +static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
> +{
> +     /* If the XIVE supports the new "store EOI facility, use it */
> +     if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> +             out_be64(xd->eoi_mmio, 0);
> +     else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
> +             if (WARN_ON_ONCE(!xive_ops->eoi))
> +                     return;
> +             xive_ops->eoi(hw_irq);
> +     } else {
> +             uint8_t eoi_val;

u8?

> +
> +             /*
> +              * Otherwise for EOI, we use the special MMIO that does
> +              * a clear of both P and Q and returns the old Q.
> +              *
> +              * This allows us to then do a re-trigger if Q was set
> +              * rather than synthetizing an interrupt in software
> +              */
> +             eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
> +             DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
> +
> +             if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1))
> +                     return;
> +
> +             /* Re-trigger */
> +             if (xd->trig_mmio)
> +                     out_be64(xd->trig_mmio, 0);
> +     }
> +
> +}
> +
> +static void xive_irq_eoi(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +     DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
> +                 d->irq, irqd_to_hwirq(d), xc->pending_prio);
> +
> +     if (!irqd_irq_disabled(d))
> +             xive_do_source_eoi(irqd_to_hwirq(d), xd);
> +
> +     /*
> +      * Clear saved_p to indicate that it's no longer occupying
> +      * a queue slot on the target queue
> +      */
> +     xd->saved_p = false;
> +
> +     xive_do_queue_eoi(xc);
> +}
> +
> +static void xive_do_source_set_mask(struct xive_irq_data *xd,
> +                                 bool masked)
> +{
> +     if (masked)
> +             xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
> +     else
> +             xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
> +}
> +
> +static bool xive_try_pick_target(int cpu)
> +{
> +     struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +     struct xive_q *q = &xc->queue[xive_irq_priority];
> +     int max;
> +
> +     /* Calculate max number of interrupts in that queue.
> +      *
> +      * We leave a gap of 1 just in case...
> +      */
> +     max = (q->msk + 1) - 1;
> +     return !!atomic_add_unless(&q->count, 1, max);
> +}
> +
> +static void xive_dec_target_count(int cpu)
> +{
> +     struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +     struct xive_q *q = &xc->queue[xive_irq_priority];
> +
> +     if (WARN_ON(cpu < 0))
> +             return;
> +
> +     /*
> +      * We increment the "pending count" which will be used
> +      * to decrement the target queue count whenever it's next
> +      * processed and found empty. This ensure that we don't
> +      * decrement while we still have the interrupt there
> +      * occupying a slot.
> +      */
> +     atomic_inc(&q->pending_count);
> +}
> +
> +static int xive_find_target_in_mask(const struct cpumask *mask,
> +                                 unsigned int fuzz)
> +{
> +     int cpu, first, num, i;
> +
> +     /* Pick up a starting point CPU in the mask based on  fuzz */
> +     num = cpumask_weight(mask);
> +     first = (fuzz++) % num;
> +
> +     /* Locate it */
> +     cpu = cpumask_first(mask);
> +     for (i = 0; i < first; i++)
> +             cpu = cpumask_next(cpu, mask);
> +     first = cpu;
> +
> +     /*
> +      * Now go through the entire mask until we find a valid
> +      * target.
> +      */
> +     for (;;) {
> +             /*
> +              * We re-check online as the fallback case passes us
> +              * an untested affinity mask
> +              */
> +             if (cpu_online(cpu) && xive_try_pick_target(cpu))
> +                     return cpu;
> +             cpu = cpumask_next(cpu, mask);
> +             if (cpu == first)
> +                     break;
> +     }
> +     return -1;
> +}
> +
> +static int xive_pick_irq_target(struct irq_data *d,
> +                             const struct cpumask *affinity)
> +{
> +     static unsigned int fuzz;
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +     cpumask_var_t mask;
> +     int cpu = -1;
> +
> +     /*
> +      * Pick a target CPU for an interrupt. This is done at
> +      * startup or if the affinity is changed in a way that
> +      * invalidates the current target.
> +      */
> +
> +     /* If we have chip IDs, first we try to build a mask of
> +      * CPUs matching ther CPU and find a target in there
> +      */
> +     if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
> +             zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
> +             /* Build a mask of matching chip IDs */
> +             for_each_cpu_and(cpu, affinity, cpu_online_mask) {
> +                     struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +                     if (xc->chip_id == xd->src_chip)
> +                             cpumask_set_cpu(cpu, mask);
> +             }
> +             /* Try to find a target */
> +             if (!cpumask_empty(mask))
> +                     cpu = xive_find_target_in_mask(mask, fuzz++);
> +             free_cpumask_var(mask);
> +             if (cpu >= 0)
> +                     return cpu;
> +             fuzz--;
> +     }
> +
> +     /* No chip IDs, fallback to using the affinity mask */
> +     return xive_find_target_in_mask(affinity, fuzz++);
> +}
> +
> +static unsigned int xive_irq_startup(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +     unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +     int target, rc;
> +
> +     DBG("xive_irq_startup: irq %d [0x%x] data @%p\n",
> +         d->irq, hw_irq, d);
> +
> +#ifdef CONFIG_PCI_MSI
> +     /*
> +      * The generic MSI code returns with the interrupt disabled on the
> +      * card, using the MSI mask bits. Firmware doesn't appear to unmask
> +      * at that level, so we do it here by hand.
> +      */
> +     if (irq_data_get_msi_desc(d))
> +             pci_msi_unmask_irq(d);
> +#endif
> +
> +     /* Pick a target */
> +     target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
> +     if (target == XIVE_INVALID_TARGET) {
> +             /* Try again breaking affinity */
> +             target = xive_pick_irq_target(d, cpu_online_mask);
> +             if (target == XIVE_INVALID_TARGET)
> +                     return -ENXIO;
> +             pr_warn("XIVE: irq %d started with broken affinity\n",
> +                     d->irq);
> +     }
> +     xd->target = target;
> +
> +     /*
> +      * Configure the logical number to be the Linux IRQ number
> +      * and set the target queue
> +      */
> +     rc = xive_ops->configure_irq(hw_irq,
> +                                  get_hard_smp_processor_id(target),
> +                                  xive_irq_priority, d->irq);
> +     if (rc)
> +             return rc;
> +
> +     /* Unmask the ESB */
> +     xive_do_source_set_mask(xd, false);
> +
> +     return 0;
> +}
> +
> +static void xive_irq_shutdown(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +     unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +
> +     DBG("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
> +         d->irq, hw_irq, d);
> +
> +     if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
> +             return;
> +
> +     /* Mask the interrupt at the source */
> +     xive_do_source_set_mask(xd, true);
> +
> +     /* Mask the interrupt in HW in the IVT/EAS */
> +     xive_ops->configure_irq(hw_irq,
> +                             get_hard_smp_processor_id(xd->target),
> +                             0xff, hw_irq);
> +
> +     xive_dec_target_count(xd->target);
> +     xd->target = XIVE_INVALID_TARGET;
> +}
> +
> +static void xive_irq_unmask(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +     DBG("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
> +
> +     /*
> +      * This is a workaround for PCI LSI problems on P9, for
> +      * these, we call FW to set the mask. The problems might
> +      * be fixed by P9 DD2.0, if that is the case, we will make
> +      * this a DD1 workaround only
> +      */
> +     if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
> +             unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +             xive_ops->configure_irq(hw_irq,
> +                                     get_hard_smp_processor_id(xd->target),
> +                                     xive_irq_priority, d->irq);
> +             return;
> +     }
> +
> +     xive_do_source_set_mask(xd, false);
> +}
> +
> +static void xive_irq_mask(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +     DBG("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
> +
> +     /*
> +      * This is a workaround for PCI LSI problems on P9, for
> +      * these, we call OPAL to set the mask. The problems might
> +      * be fixed by P9 DD2.0, if that is the case, we will make
> +      * this a DD1 workaround only
> +      */
> +     if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
> +             unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +             xive_ops->configure_irq(hw_irq,
> +                                     get_hard_smp_processor_id(xd->target),
> +                                     0xff, d->irq);
> +             return;
> +     }
> +
> +     xive_do_source_set_mask(xd, true);
> +}
> +
> +static int xive_irq_set_affinity(struct irq_data *d,
> +                              const struct cpumask *cpumask,
> +                              bool force)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +     unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +     u32 target, old_target;
> +     int rc = 0;
> +
> +     DBG("xive_irq_set_affinity: irq %d\n", d->irq);
> +
> +     /* Is this valid ? */
> +     if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
> +             return -EINVAL;
> +
> +     /* If existing target is already in the new mask, and is
> +      * online then do nothing.
> +      */
> +     if (cpu_online(xd->target) &&
> +         cpumask_test_cpu(xd->target, cpumask))
> +             return IRQ_SET_MASK_OK;
> +
> +     /* Pick a new target */
> +     target = xive_pick_irq_target(d, cpumask);
> +
> +     /* No target found */
> +     if (target == XIVE_INVALID_TARGET)
> +             return -ENXIO;
> +
> +     old_target = xd->target;
> +
> +     /*
> +      * Only configure the irq if it's not currently passed-through to
> +      * a KVM guest
> +      */
> +     rc = xive_ops->configure_irq(hw_irq,
> +                                  get_hard_smp_processor_id(target),
> +                                  xive_irq_priority, d->irq);
> +     if (rc < 0) {
> +             pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq);
> +             return rc;
> +     }
> +
> +     DBG("  target: 0x%x\n", target);
> +     xd->target = target;
> +
> +     /* Give up previous target */
> +     if (old_target != XIVE_INVALID_TARGET)
> +         xive_dec_target_count(old_target);
> +
> +     return IRQ_SET_MASK_OK;
> +}
> +
> +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +     /*
> +      * We only support these. This has really no effect other than setting
> +      * the corresponding descriptor bits mind you but those will in turn
> +      * affect the resend function when re-enabling an edge interrupt.
> +      *
> +      * Set the default to edge as explained in map().
> +      */
> +     if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
> +             flow_type = IRQ_TYPE_EDGE_RISING;
> +
> +     if (flow_type != IRQ_TYPE_EDGE_RISING &&
> +         flow_type != IRQ_TYPE_LEVEL_LOW)
> +             return -EINVAL;
> +
> +     irqd_set_trigger_type(d, flow_type);
> +
> +     /*
> +      * Double check it matches what the FW thinks
> +      *
> +      * NOTE: We don't know yet if the PAPR interface will provide
> +      * the LSI vs MSI information apart from the device-tree so
> +      * this check might have to move into an optional backend call
> +      * that is specific to the native backend
> +      */
> +     if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
> +         !!(xd->flags & XIVE_IRQ_FLAG_LSI))
> +             pr_warn("XIVE: Interrupt %d (HW 0x%x) type mismatch,"
> +                     " Linux says %s, FW says %s\n",
> +                     d->irq, (u32)irqd_to_hwirq(d),
> +                     (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
> +                     (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
> +
> +     return IRQ_SET_MASK_OK_NOCOPY;
> +}
> +
> +static int xive_irq_retrigger(struct irq_data *d)
> +{
> +     struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +     /* This should be only for MSIs */
> +     if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
> +             return 0;
> +
> +     /*
> +      * To perform a retrigger, we first set the PQ bits to
> +      * 11, then perform an EOI.
> +      */
> +     xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
> +
> +     /*
> +      * Note: We pass "0" to the hw_irq argument in order to
> +      * avoid calling into the backend EOI code which we don't
> +      * want to do in the case of a re-trigger. Backends typically
> +      * only do EOI for LSIs anyway.
> +      */
> +     xive_do_source_eoi(0, xd);
> +
> +     return 1;
> +}
> +
> +static struct irq_chip xive_irq_chip = {
> +     .name = "XIVE-IRQ",
> +     .irq_startup = xive_irq_startup,
> +     .irq_shutdown = xive_irq_shutdown,
> +     .irq_eoi = xive_irq_eoi,
> +     .irq_mask = xive_irq_mask,
> +     .irq_unmask = xive_irq_unmask,
> +     .irq_set_affinity = xive_irq_set_affinity,
> +     .irq_set_type = xive_irq_set_type,
> +     .irq_retrigger = xive_irq_retrigger,
> +};
> +
> +bool is_xive_irq(struct irq_chip *chip)
> +{
> +     return chip == &xive_irq_chip;
> +}
> +
> +void xive_cleanup_irq_data(struct xive_irq_data *xd)
> +{
> +     if (xd->eoi_mmio) {
> +             iounmap(xd->eoi_mmio);
> +             if (xd->eoi_mmio == xd->trig_mmio)
> +                     xd->trig_mmio = NULL;
> +             xd->eoi_mmio = NULL;
> +     }
> +     if (xd->trig_mmio) {
> +             iounmap(xd->trig_mmio);
> +             xd->trig_mmio = NULL;
> +     }
> +}
> +
> +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
> +{
> +     struct xive_irq_data *xd;
> +     int rc;
> +
> +     xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
> +     if (!xd)
> +             return -ENOMEM;
> +     rc = xive_ops->populate_irq_data(hw, xd);
> +     if (rc) {
> +             kfree(xd);
> +             return rc;
> +     }
> +     xd->target = XIVE_INVALID_TARGET;
> +     irq_set_handler_data(virq, xd);
> +
> +     return 0;
> +}
> +
> +static void xive_irq_free_data(unsigned int virq)
> +{
> +     struct xive_irq_data *xd = irq_get_handler_data(virq);
> +
> +     if (!xd)
> +             return;
> +     irq_set_handler_data(virq, NULL);
> +     xive_cleanup_irq_data(xd);
> +     kfree(xd);
> +}
> +
> +#ifdef CONFIG_SMP
> +
> +static void xive_cause_ipi(int cpu, unsigned long msg)
> +{
> +     struct xive_cpu *xc;
> +     struct xive_irq_data *xd;
> +
> +     xc = per_cpu(xive_cpu, cpu);
> +
> +     DBG_VERBOSE("IPI msg#%ld CPU %d -> %d (HW IRQ 0x%x)\n",
> +                 msg, smp_processor_id(), cpu, xc->hw_ipi);
> +
> +     xd = &xc->ipi_data;
> +     if (WARN_ON(!xd->trig_mmio))
> +             return;
> +     out_be64(xd->trig_mmio, 0);
> +}
> +
> +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
> +{
> +     return smp_ipi_demux();
> +}
> +
> +static void xive_ipi_eoi(struct irq_data *d)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +     /* Handle possible race with unplug and drop stale IPIs */
> +     if (!xc)
> +             return;
> +     xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
> +     xive_do_queue_eoi(xc);
> +}
> +
> +static void xive_ipi_unmask(struct irq_data *d)
> +{
> +     /* Nothing to do, we never mask IPIs, but the callback
> +      * must exist
> +      */
> +}
> +
> +static void xive_ipi_mask(struct irq_data *d)
> +{
> +     /* Nothing to do, we never mask IPIs, but the callback
> +      * must exist
> +      */
> +}
> +
> +static struct irq_chip xive_ipi_chip = {
> +     .name = "XIVE-IPI",
> +     .irq_eoi = xive_ipi_eoi,
> +     .irq_mask = xive_ipi_mask,
> +     .irq_unmask = xive_ipi_unmask,
> +};
> +
> +static void __init xive_request_ipi(void)
> +{
> +     unsigned int virq;
> +
> +     /* Initialize it */
> +     virq = irq_create_mapping(xive_irq_domain, 0);
> +     xive_ipi_irq = virq;
> +
> +     BUG_ON(request_irq(virq, xive_muxed_ipi_action,
> +                        IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> +}
> +
> +static int xive_setup_cpu_ipi(unsigned int cpu)
> +{
> +     struct xive_cpu *xc;
> +     int rc;
> +
> +     pr_debug("XIVE: Setting up IPI for CPU %d\n", cpu);
> +
> +     xc = per_cpu(xive_cpu, cpu);
> +
> +     /* Check if we are already setup */
> +     if (xc->hw_ipi != 0)
> +             return 0;
> +
> +     /* Grab an IPI from the backend, this will populate xc->hw_ipi */
> +     if (xive_ops->get_ipi(cpu, xc))
> +             return -EIO;
> +
> +     /* Populate the IRQ data in the xive_cpu structure and
> +      * configure the HW / enable the IPIs
> +      */
> +     rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
> +     if (rc) {
> +             pr_err("XIVE: Failed to populate IPI data on CPU %d\n", cpu);
> +             return -EIO;
> +     }
> +     rc = xive_ops->configure_irq(xc->hw_ipi,
> +                                  get_hard_smp_processor_id(cpu),
> +                                  xive_irq_priority, xive_ipi_irq);
> +     if (rc) {
> +             pr_err("XIVE: Failed to map IPI CPU %d\n", cpu);
> +             return -EIO;
> +     }
> +     DBG("XIVE: CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
> +         xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
> +
> +     /* Unmask it */
> +     xive_do_source_set_mask(&xc->ipi_data, false);
> +
> +     return 0;
> +}
> +
> +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
> +{
> +     /* Disable the IPI and free the IRQ data */
> +
> +     /* Already cleaned up ? */
> +     if (xc->hw_ipi == 0)
> +             return;
> +
> +     /* Mask the IPI */
> +     xive_do_source_set_mask(&xc->ipi_data, true);
> +
> +     /*
> +      * Note: We don't call xive_cleanup_irq_data() to free
> +      * the mappings as this is called from an IPI on kexec
> +      * which is not a safe environment to call iounmap()
> +      */
> +
> +     /* Deconfigure/mask in the backend */
> +     xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
> +                             0xff, xive_ipi_irq);
> +
> +     /* Free the IPIs in the backend */
> +     xive_ops->put_ipi(cpu, xc);
> +}
> +
> +void __init xive_smp_probe(void)
> +{
> +     smp_ops->cause_ipi = xive_cause_ipi;
> +
> +     /* Register the IPI */
> +     xive_request_ipi();
> +
> +     /* Allocate and setup IPI for the boot CPU */
> +     xive_setup_cpu_ipi(smp_processor_id());
> +}
> +
> +#endif /* CONFIG_SMP */
> +
> +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
> +                            irq_hw_number_t hw)
> +{
> +     int rc;
> +
> +     /*
> +      * Mark interrupts as edge sensitive by default so that resend
> +      * actually works. Will fix that up below if needed.
> +      */
> +     irq_clear_status_flags(virq, IRQ_LEVEL);
> +
> +     /* IPIs are special and come up with HW number 0 */
> +     if (hw == 0) {
> +             /*
> +              * IPIs are marked per-cpu. We use separate HW interrupts under
> +              * the hood but associated with the same "linux" interrupt
> +              */
> +             irq_set_chip_and_handler(virq, &xive_ipi_chip,
> +                                      handle_percpu_irq);
> +             return 0;
> +     }
> +
> +     rc = xive_irq_alloc_data(virq, hw);
> +     if (rc)
> +             return rc;
> +
> +     irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
> +
> +     return 0;
> +}
> +
> +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
> +{
> +     struct irq_data *data = irq_get_irq_data(virq);
> +     unsigned int hw_irq;
> +
> +     if (!data)
> +             return;
> +     hw_irq = (unsigned int)irqd_to_hwirq(data);
> +     if (hw_irq)
> +             xive_irq_free_data(virq);
> +}
> +
> +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node 
> *ct,
> +                              const u32 *intspec, unsigned int intsize,
> +                              irq_hw_number_t *out_hwirq, unsigned int 
> *out_flags)
> +
> +{
> +     *out_hwirq = intspec[0];
> +
> +     /*
> +      * If intsize is at least 2, we look for the type in the second cell,
> +      * we assume the LSB indicates a level interrupt.
> +      */
> +     if (intsize > 1) {
> +             if (intspec[1] & 1)
> +                     *out_flags = IRQ_TYPE_LEVEL_LOW;
> +             else
> +                     *out_flags = IRQ_TYPE_EDGE_RISING;
> +     } else
> +             *out_flags = IRQ_TYPE_LEVEL_LOW;
> +
> +     return 0;
> +}
> +
> +static int xive_irq_domain_match(struct irq_domain *h, struct device_node 
> *node,
> +                              enum irq_domain_bus_token bus_token)
> +{
> +     return xive_ops->match(node);
> +}
> +
> +static const struct irq_domain_ops xive_irq_domain_ops = {
> +     .match = xive_irq_domain_match,
> +     .map = xive_irq_domain_map,
> +     .unmap = xive_irq_domain_unmap,
> +     .xlate = xive_irq_domain_xlate,
> +};
> +
> +static void __init xive_init_host(void)
> +{
> +     xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
> +                                            &xive_irq_domain_ops, NULL);
> +     BUG_ON(xive_irq_domain == NULL);
> +     irq_set_default_host(xive_irq_domain);
> +}
> +
> +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
> +{
> +     if (xc->queue[xive_irq_priority].qpage)
> +             xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
> +}
> +
> +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
> +{
> +     int rc = 0;
> +
> +     /* We set up 1 queue for now with a 64k page */
> +     if (!xc->queue[xive_irq_priority].qpage)
> +             rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
> +
> +     return rc;
> +}
> +
> +static int xive_prepare_cpu(unsigned int cpu)
> +{
> +     struct xive_cpu *xc;
> +
> +     xc = per_cpu(xive_cpu, cpu);
> +     if (!xc) {
> +             struct device_node *np;
> +
> +             xc = kzalloc_node(sizeof(struct xive_cpu),
> +                               GFP_KERNEL, cpu_to_node(cpu));
> +             if (!xc)
> +                     return -ENOMEM;
> +             np = of_get_cpu_node(cpu, NULL);
> +             if (np)
> +                     xc->chip_id = of_get_ibm_chip_id(np);
> +             of_node_put(np);
> +
> +             per_cpu(xive_cpu, cpu) = xc;
> +     }
> +
> +     /* Setup EQs if not already */
> +     return xive_setup_cpu_queues(cpu, xc);
> +}
> +
> +static void xive_setup_cpu(void)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +     /* Debug: Dump the TM state */
> +     DBG("CPU %d [HW 0x%02x] VT=%02x\n",
> +         smp_processor_id(), hard_smp_processor_id(),
> +         in_8(xive_tm_area + xive_tm_offset + TM_WORD2));
> +
> +     /* The backend might have additional things to do */
> +     if (xive_ops->setup_cpu)
> +             xive_ops->setup_cpu(smp_processor_id(), xc);
> +
> +     /* Set CPPR to 0xff to enable flow of interrupts */
> +     xc->cppr = 0xff;
> +     out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0xff);
> +}
> +
> +#ifdef CONFIG_SMP
> +void xive_smp_setup_cpu(void)
> +{
> +     DBG("XIVE: SMP setup CPU %d\n", smp_processor_id());
> +
> +     /* This will have already been done on the boot CPU */
> +     if (smp_processor_id() != boot_cpuid)
> +             xive_setup_cpu();
> +
> +}
> +
> +int xive_smp_prepare_cpu(unsigned int cpu)
> +{
> +     int rc;
> +
> +     /* Allocate per-CPU data and queues */
> +     rc = xive_prepare_cpu(cpu);
> +     if (rc)
> +             return rc;
> +
> +     /* Allocate and setup IPI for the new CPU */
> +     return xive_setup_cpu_ipi(cpu);
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
> +{
> +     u32 irq;
> +
> +     /* We assume local irqs are disabled */
> +     WARN_ON(!irqs_disabled());
> +
> +     /* Check what's already in the CPU queue */
> +     while ((irq = xive_scan_interrupts(xc, false)) != 0) {
> +             /*
> +              * We need to re-route that interrupt to its new destination.
> +              * First get and lock the descriptor
> +              */
> +             struct irq_desc *desc = irq_to_desc(irq);
> +             struct irq_data *d = irq_desc_get_irq_data(desc);
> +             struct xive_irq_data *xd;
> +             unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +
> +             /*
> +              * Ignore anything that isn't a XIVE irq and ignore
> +              * IPIs, so they can just be dropped.
> +              */
> +             if (d->domain != xive_irq_domain || hw_irq == 0)
> +                     continue;
> +#ifdef DEBUG_FLUSH
> +             pr_info("CPU %d: Got irq %d while offline, re-routing...\n",
> +                     cpu, irq);
> +#endif
> +             raw_spin_lock(&desc->lock);
> +             xd = irq_desc_get_handler_data(desc);
> +
> +             /* For LSIs, we EOI, this will cause a resend if it's
> +              * still asserted. Otherwise do an MSI retrigger
> +              */
> +             if (xd->flags & XIVE_IRQ_FLAG_LSI)
> +                     xive_do_source_eoi(irqd_to_hwirq(d), xd);
> +             else
> +                     xive_irq_retrigger(d);
> +             raw_spin_unlock(&desc->lock);
> +     }
> +}
> +
> +void xive_smp_disable_cpu(void)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +     unsigned int cpu = smp_processor_id();
> +
> +     /* Migrate interrupts away from the CPU */
> +     irq_migrate_all_off_this_cpu();
> +
> +     /* Set CPPR to 0 to disable flow of interrupts */
> +     xc->cppr = 0;
> +     out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0);
> +
> +     /* Flush everything still in the queue */
> +     xive_flush_cpu_queue(cpu, xc);
> +
> +     /* Re-enable CPPR  */
> +     xc->cppr = 0xff;
> +     out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0xff);
> +}
> +
> +void xive_flush_interrupt(void)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +     unsigned int cpu = smp_processor_id();
> +
> +     /* Called if an interrupt occurs while the CPU is hot unplugged */
> +     xive_flush_cpu_queue(cpu, xc);
> +}
> +
> +#endif /* CONFIG_HOTPLUG_CPU */
> +
> +#endif /* CONFIG_SMP */
> +
> +void xive_kexec_teardown_cpu(int secondary)
> +{
> +     struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +     unsigned int cpu = smp_processor_id();
> +
> +     /* Set CPPR to 0 to disable flow of interrupts */
> +     xc->cppr = 0;
> +     out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0);
> +
> +     /* Backend cleanup if any */
> +     if (xive_ops->teardown_cpu)
> +             xive_ops->teardown_cpu(cpu, xc);
> +
> +     /* Get rid of IPI */
> +     xive_cleanup_cpu_ipi(cpu, xc);
> +
> +     /* Disable and free the queues */
> +     xive_cleanup_cpu_queues(cpu, xc);
> +}
> +
> +void xive_shutdown(void)
> +{
> +     xive_ops->shutdown();
> +}
> +
> +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 
> offset,
> +                 u8 max_prio)
> +{
> +     xive_tm_area = area;
> +     xive_tm_offset = offset;
> +     xive_ops = ops;
> +     xive_irq_priority = max_prio;
> +
> +     ppc_md.get_irq = xive_get_irq;
> +     __xive_enabled = true;
> +
> +     DBG("Initializing host..\n");
> +     xive_init_host();
> +
> +     DBG("Initializing boot CPU..\n");
> +
> +     /* Allocate per-CPU data and queues */
> +     xive_prepare_cpu(smp_processor_id());
> +
> +     /* Get ready for interrupts */
> +     xive_setup_cpu();
> +
> +     pr_info("XIVE: Interrupt handling initialized with %s backend\n",
> +             xive_ops->name);
> +     pr_info("XIVE: Using priority %d for all interrupts\n", max_prio);
> +
> +     return true;
> +}
> +
> +static int __init xive_off(char *arg)
> +{
> +     xive_cmdline_disabled = true;
> +     return 0;
> +}
> +__setup("xive=off", xive_off);
> diff --git a/arch/powerpc/sysdev/xive/native.c 
> b/arch/powerpc/sysdev/xive/native.c
> new file mode 100644
> index 0000000..26cc6bf
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -0,0 +1,604 @@
> +/*
> + * Copyright 2016,2017 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +#include <linux/types.h>
> +#include <linux/irq.h>
> +#include <linux/debugfs.h>

Unused?

> +#include <linux/smp.h>
> +#include <linux/interrupt.h>
> +#include <linux/seq_file.h>

Unused?

> +#include <linux/init.h>
> +#include <linux/of.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/delay.h>
> +#include <linux/cpumask.h>
> +#include <linux/mm.h>
> +
> +#include <asm/prom.h>
> +#include <asm/io.h>
> +#include <asm/smp.h>
> +#include <asm/irq.h>
> +#include <asm/errno.h>
> +#include <asm/xive.h>
> +#include <asm/opal.h>
> +
> +#include "xive-regs.h"
> +#include "xive-internal.h"
> +
> +#define DBG(fmt...)  pr_devel("XIVE: " fmt)
> +
> +/* Enable this for using queue MMIO page for EOI. We don't currently
> + * use it as we always notify
> + */
> +#undef USE_QUEUE_MMIO

Dead code? Or do we want to keep it?


> +static u32 xive_provision_size;
> +static u32 *xive_provision_chips;
> +static u32 xive_provision_chip_count;
> +static u32 xive_queue_shift;
> +static u32 xive_pool_vps = XIVE_INVALID_VP;
> +static struct kmem_cache *xive_provision_cache;
> +
> +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
> +{
> +     __be64 flags, eoi_page, trig_page;
> +     __be32 esb_shift, src_chip;
> +     u64 opal_flags;
> +     s64 rc;
> +
> +     memset(data, 0, sizeof(*data));
> +
> +     rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
> +                                 &esb_shift, &src_chip);
> +     if (rc) {
> +             pr_err("XIVE: opal_xive_get_irq_info(0x%x) returned %lld\n",
> +                    hw_irq, rc);
> +             return -EINVAL;
> +     }
> +
> +     opal_flags = be64_to_cpu(flags);
> +     if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
> +             data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
> +     if (opal_flags & OPAL_XIVE_IRQ_LSI)
> +             data->flags |= XIVE_IRQ_FLAG_LSI;
> +     if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
> +             data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
> +     if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
> +             data->flags |= XIVE_IRQ_FLAG_MASK_FW;
> +     if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
> +             data->flags |= XIVE_IRQ_FLAG_EOI_FW;
> +     data->eoi_page = be64_to_cpu(eoi_page);
> +     data->trig_page = be64_to_cpu(trig_page);
> +     data->esb_shift = be32_to_cpu(esb_shift);
> +     data->src_chip = be32_to_cpu(src_chip);
> +
> +     data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
> +     if (!data->eoi_mmio) {
> +             pr_err("XIVE: Failed to map EOI page for irq 0x%x\n", hw_irq);
> +             return -ENOMEM;
> +     }
> +
> +     if (!data->trig_page)
> +             return 0;
> +     if (data->trig_page == data->eoi_page) {
> +             data->trig_mmio = data->eoi_mmio;
> +             return 0;
> +     }
> +
> +     data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
> +     if (!data->trig_mmio) {
> +             pr_err("XIVE: Failed to map trigger page for irq 0x%x\n", 
> hw_irq);
> +             return -ENOMEM;
> +     }
> +     return 0;
> +}
> +
> +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> +{
> +     s64 rc;
> +
> +     for (;;) {
> +             rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
> +             if (rc != OPAL_BUSY)
> +                     break;
> +             msleep(1);
> +     }
> +     return rc == 0 ? 0 : -ENXIO;
> +}
> +
> +/* This can be called multiple times to change a queue configuration */
> +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> +                             __be32 *qpage, u32 order, bool can_escalate)
> +{
> +     s64 rc = 0;
> +     __be64 qeoi_page_be;
> +     __be32 esc_irq_be;
> +     u64 flags, qpage_phys;
> +
> +     /* If there's an actual queue page, clean it */
> +     if (order) {
> +             BUG_ON(!qpage);

Can't we just return an error?

> +             qpage_phys = __pa(qpage);
> +     } else
> +             qpage_phys = 0;
> +
> +     /* Initialize the rest of the fields */
> +     q->msk = order ? ((1u << (order - 2)) - 1) : 0;
> +     q->idx = 0;
> +     q->toggle = 0;
> +
> +     rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
> +                                   &qeoi_page_be,
> +                                   &esc_irq_be,
> +                                   NULL);
> +     if (rc) {
> +             pr_err("XIVE: Error %lld getting queue info prio %d\n",
> +                    rc, prio);
> +             rc = -EIO;
> +             goto fail;
> +     }
> +     q->eoi_phys = be64_to_cpu(qeoi_page_be);
> +
> +#ifdef USE_QUEUE_MMIO
> +     if (!q->eoi_mmio)
> +             q->eoi_mmio = ioremap(q->eoi_phys, PAGE_SIZE);
> +     if (!q->eoi_mmio) {
> +             pr_err("XIVE: Failed to map queue MMIO prio %d CPU %d\n",
> +                    rc, prio, cpu);
> +             rc = -ENOMEM;
> +             goto fail;
> +     }
> +#endif /* USE_QUEUE_MMIO */
> +
> +
...
> +static bool xive_parse_provisioning(struct device_node *np)
> +{
> +     int rc;
> +
> +     if (of_property_read_u32(np, "ibm,xive-provision-page-size",
> +                              &xive_provision_size) < 0)
> +             return true;
> +     rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
> +     if (rc < 0) {
> +             pr_err("XIVE: Error %d getting provision chips array\n", rc);
> +             return false;
> +     }
> +     xive_provision_chip_count = rc;
> +     if (rc == 0)
> +             return true;
> +
> +     xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
> +                                    GFP_KERNEL);
> +     BUG_ON(!xive_provision_chips);

return false?

> +
> +     rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
> +                                     xive_provision_chips,
> +                                     xive_provision_chip_count);
...
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h 
> b/arch/powerpc/sysdev/xive/xive-internal.h
> new file mode 100644
> index 0000000..e736fc5
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -0,0 +1,51 @@

Copyright missing.

> +#ifndef __XIVE_INTERNAL_H
> +#define __XIVE_INTERNAL_H
...
> diff --git a/arch/powerpc/sysdev/xive/xive-regs.h 
> b/arch/powerpc/sysdev/xive/xive-regs.h
> new file mode 100644
> index 0000000..f1edb23
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/xive-regs.h
> @@ -0,0 +1,88 @@

Copyright missing.

> +#ifndef __XIVE_REGS_H__
> +#define __XIVE_REGS_H__
...
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 16321ad..c71e919 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
...
> +
> +static void dump_one_xive_irq(uint32_t num)

u32?

> +{
> +     int64_t rc;
> +     __be64 vp;
> +     uint8_t prio;

u8?


zzzzz ...

cheers

Reply via email to