On Wed, 3 Mar 2021 18:48:57 +0100 Cédric Le Goater <c...@kaod.org> wrote:
> ipistorm [*] can be used to benchmark the raw interrupt rate of an > interrupt controller by measuring the number of IPIs a system can > sustain. When applied to the XIVE interrupt controller of POWER9 and > POWER10 systems, a significant drop of the interrupt rate can be > observed when crossing the second node boundary. > > This is due to the fact that a single IPI interrupt is used for all > CPUs of the system. The structure is shared and the cache line updates > impact greatly the traffic between nodes and the overall IPI > performance. > > As a workaround, the impact can be reduced by deactivating the IRQ > lockup detector ("noirqdebug") which does a lot of accounting in the > Linux IRQ descriptor structure and is responsible for most of the > performance penalty. > > As a fix, this proposal allocates an IPI interrupt per node, to be > shared by all CPUs of that node. It solves the scaling issue, the IRQ > lockup detector still has an impact but the XIVE interrupt rate scales > linearly. It also improves the "noirqdebug" case as showed in the > tables below. > > * P9 DD2.2 - 2s * 64 threads > > "noirqdebug" > Mint/s Mint/s > chips cpus IPI/sys IPI/chip IPI/chip IPI/sys > -------------------------------------------------------------- > 1 0-15 4.984023 4.875405 4.996536 5.048892 > 0-31 10.879164 10.544040 10.757632 11.037859 > 0-47 15.345301 14.688764 14.926520 15.310053 > 0-63 17.064907 17.066812 17.613416 17.874511 > 2 0-79 11.768764 21.650749 22.689120 22.566508 > 0-95 10.616812 26.878789 28.434703 28.320324 > 0-111 10.151693 31.397803 31.771773 32.388122 > 0-127 9.948502 33.139336 34.875716 35.224548 > > * P10 DD1 - 4s (not homogeneous) 352 threads > > "noirqdebug" > Mint/s Mint/s > chips cpus IPI/sys IPI/chip IPI/chip IPI/sys > -------------------------------------------------------------- > 1 0-15 2.409402 2.364108 2.383303 2.395091 > 0-31 6.028325 6.046075 6.089999 6.073750 > 0-47 8.655178 8.644531 8.712830 8.724702 > 0-63 11.629652 11.735953 12.088203 12.055979 > 0-79 14.392321 14.729959 14.986701 14.973073 > 0-95 12.604158 13.004034 17.528748 17.568095 > 2 0-111 9.767753 13.719831 19.968606 20.024218 > 0-127 6.744566 16.418854 22.898066 22.995110 > 0-143 6.005699 19.174421 25.425622 25.417541 > 0-159 5.649719 21.938836 27.952662 28.059603 > 0-175 5.441410 24.109484 31.133915 31.127996 > 3 0-191 5.318341 24.405322 33.999221 33.775354 > 0-207 5.191382 26.449769 36.050161 35.867307 > 0-223 5.102790 29.356943 39.544135 39.508169 > 0-239 5.035295 31.933051 42.135075 42.071975 > 0-255 4.969209 34.477367 44.655395 44.757074 > 4 0-271 4.907652 35.887016 47.080545 47.318537 > 0-287 4.839581 38.076137 50.464307 50.636219 > 0-303 4.786031 40.881319 53.478684 53.310759 > 0-319 4.743750 43.448424 56.388102 55.973969 > 0-335 4.709936 45.623532 59.400930 58.926857 > 0-351 4.681413 45.646151 62.035804 61.830057 > > [*] https://github.com/antonblanchard/ipistorm > > Signed-off-by: Cédric Le Goater <c...@kaod.org> > --- > arch/powerpc/sysdev/xive/xive-internal.h | 2 -- > arch/powerpc/sysdev/xive/common.c | 39 ++++++++++++++++++------ > 2 files changed, 30 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/sysdev/xive/xive-internal.h > b/arch/powerpc/sysdev/xive/xive-internal.h > index 9cf57c722faa..b3a456fdd3a5 100644 > --- a/arch/powerpc/sysdev/xive/xive-internal.h > +++ b/arch/powerpc/sysdev/xive/xive-internal.h > @@ -5,8 +5,6 @@ > #ifndef __XIVE_INTERNAL_H > #define __XIVE_INTERNAL_H > > -#define XIVE_IPI_HW_IRQ 0 /* interrupt source # for IPIs */ > - > /* > * A "disabled" interrupt should never fire, to catch problems > * we set its logical number to this > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index 8eefd152b947..c27f7bb0494b 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain; > #ifdef CONFIG_SMP > static struct irq_domain *xive_ipi_irq_domain; > > -/* The IPIs all use the same logical irq number */ > -static u32 xive_ipi_irq; > +/* The IPIs use the same logical irq number when on the same chip */ > +static struct xive_ipi_desc { > + unsigned int irq; > + char name[8]; /* enough bytes to fit IPI-XXX */ So this assumes that the node number that node is <= 999 ? This is certainly the case for now since CONFIG_NODES_SHIFT is 8 on ppc64 but starting with 10, you'd have truncated names. What about deriving the size of name[] from CONFIG_NODES_SHIFT ? Apart from that, LGTM. Probably not worth to respin just for this. I also could give a try in a KVM guest. Topology passed to QEMU: -smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \ -numa node,nodeid=0,cpus=0-4 \ -numa node,nodeid=1,cpus=4-7 Topology observed in guest with lstopo : Package L#0 NUMANode L#0 (P#0 30GB) L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0 PU L#0 (P#0) PU L#1 (P#1) L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1 PU L#2 (P#2) PU L#3 (P#3) Package L#1 NUMANode L#1 (P#1 32GB) L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2 PU L#4 (P#4) PU L#5 (P#5) L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3 PU L#6 (P#6) PU L#7 (P#7) Interrupts in guest: $ cat /proc/interrupts CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 16: 1023 871 1042 749 0 0 0 0 XIVE-IPI 0 Edge IPI-0 17: 0 0 0 0 2123 1019 1263 1288 XIVE-IPI 1 Edge IPI-1 IPIs are mapped to the appropriate nodes, and the numbers indicate that everything is working as expected. Reviewed-and-tested-by: Greg Kurz <gr...@kaod.org> > +} *xive_ipis; > + > +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu) > +{ > + return xive_ipis[cpu_to_node(cpu)].irq; > +} > #endif > > /* Xive state for each CPU */ > @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops > xive_ipi_irq_domain_ops = { > > static void __init xive_request_ipi(void) > { > - unsigned int virq; > + unsigned int node; > > - xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1, > + xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids, > &xive_ipi_irq_domain_ops, > NULL); > if (WARN_ON(xive_ipi_irq_domain == NULL)) > return; > > - /* Initialize it */ > - virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ); > - xive_ipi_irq = virq; > + xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | > __GFP_NOFAIL); > + for_each_node(node) { > + struct xive_ipi_desc *xid = &xive_ipis[node]; > + irq_hw_number_t node_ipi_hwirq = node; > + > + /* > + * Map one IPI interrupt per node for all cpus of that node. > + * Since the HW interrupt number doesn't have any meaning, > + * simply use the node number. > + */ > + xid->irq = irq_create_mapping(xive_ipi_irq_domain, > node_ipi_hwirq); > + snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); > > - WARN_ON(request_irq(virq, xive_muxed_ipi_action, > - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); > + WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action, > + IRQF_PERCPU | IRQF_NO_THREAD, xid->name, > NULL)); > + } > } > > static int xive_setup_cpu_ipi(unsigned int cpu) > { > struct xive_cpu *xc; > int rc; > + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); > > pr_debug("Setting up IPI for CPU %d\n", cpu); > > @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu) > > static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) > { > + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); > + > /* Disable the IPI and free the IRQ data */ > > /* Already cleaned up ? */