On Tue, 10 Sep 2019 08:13:26 +0200 Cédric Le Goater <c...@kaod.org> wrote:
> A typical pseries VM with 16 vCPUs, one disk, one network adapater > uses less than 100 interrupts but the whole IRQ number space of the > QEMU machine is allocated at reset time and it is 8K wide. This is > wasting considerably the global IRQ space of the overall system which > has 1M interrupts per socket on a POWER9. > > To optimise the HW resources, only request at the KVM level interrupts > which have been claimed. This will help up increase the maximum number > of VMs per system. > > To keep migration compatibility, we introduce a machine class > attribute to adapt the reset behavior on older pseries machines. > This can be achieved in a simpler way, see below. > Signed-off-by: Cédric Le Goater <c...@kaod.org> > --- > include/hw/ppc/spapr.h | 1 + > include/hw/ppc/spapr_xive.h | 1 + > include/hw/ppc/xics.h | 6 ++++++ > hw/intc/spapr_xive.c | 1 + > hw/intc/spapr_xive_kvm.c | 34 +++++++++++++++++++++++++++++++--- > hw/intc/xics.c | 1 + > hw/intc/xics_kvm.c | 14 ++++++++++++++ > hw/ppc/spapr.c | 1 + > hw/ppc/spapr_irq.c | 13 +++++++------ > 9 files changed, 63 insertions(+), 9 deletions(-) > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 03111fd55bc8..6c622954a60c 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -131,6 +131,7 @@ struct SpaprMachineClass { > SpaprResizeHpt resize_hpt_default; > SpaprCapabilities default_caps; > SpaprIrq *irq; > + bool irq_reset_all; > }; > > /** > diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h > index bfd40f01d882..b33913eb0f28 100644 > --- a/include/hw/ppc/spapr_xive.h > +++ b/include/hw/ppc/spapr_xive.h > @@ -45,6 +45,7 @@ typedef struct SpaprXive { > void *tm_mmap; > MemoryRegion tm_mmio_kvm; > VMChangeStateEntry *change; > + bool reset_all; > } SpaprXive; > > /* > diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h > index f2a8d6a4b4f9..856815362406 100644 > --- a/include/hw/ppc/xics.h > +++ b/include/hw/ppc/xics.h > @@ -117,6 +117,7 @@ struct ICSState { > DeviceState parent_obj; > /*< public >*/ > uint32_t nr_irqs; > + bool reset_all; > uint32_t offset; > ICSIRQState *irqs; > XICSFabric *xics; > @@ -179,6 +180,11 @@ void ics_simple_write_xive(ICSState *ics, int nr, int > server, > uint8_t priority, uint8_t saved_priority); > void ics_simple_set_irq(void *opaque, int srcno, int val); > > +static inline bool ics_irq_free(ICSState *ics, uint32_t srcno) > +{ > + return !(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK); > +} > + > void ics_set_irq_type(ICSState *ics, int srcno, bool lsi); > void icp_pic_print_info(ICPState *icp, Monitor *mon); > void ics_pic_print_info(ICSState *ics, Monitor *mon); > diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c > index c1c97192a7d2..b717d9e09314 100644 > --- a/hw/intc/spapr_xive.c > +++ b/hw/intc/spapr_xive.c > @@ -492,6 +492,7 @@ static Property spapr_xive_properties[] = { > DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0), > DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE), > DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE), > + DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false), s/ICSState/SpaprXive > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c > index 17af4d19f54e..225abce36270 100644 > --- a/hw/intc/spapr_xive_kvm.c > +++ b/hw/intc/spapr_xive_kvm.c > @@ -253,13 +253,23 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int > srcno, Error **errp) > true, errp); > } > > +static bool xive_source_skip_reset(SpaprXive *xive, int srcno) > +{ > + return !xive->reset_all && !xive_eas_is_valid(&xive->eat[srcno]); > +} > + > static void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) > { > + SpaprXive *xive = SPAPR_XIVE(xsrc->xive); > int i; > > for (i = 0; i < xsrc->nr_irqs; i++) { > Error *local_err = NULL; > > + if (xive_source_skip_reset(xive, i)) { > + continue; > + } > + > kvmppc_xive_source_reset_one(xsrc, i, &local_err); > if (local_err) { > error_propagate(errp, local_err); > @@ -328,11 +338,18 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int > srcno, uint32_t offset, > > static void kvmppc_xive_source_get_state(XiveSource *xsrc) > { > + SpaprXive *xive = SPAPR_XIVE(xsrc->xive); > int i; > > for (i = 0; i < xsrc->nr_irqs; i++) { > + uint8_t pq; > + > + if (xive_source_skip_reset(xive, i)) { This looks a bit weird to "skip reset" in a function that isn't supposed to reset anything... Is it really necessary to check the reset_all flag actually ? I guess checking the EAS is valid should be enough. > + continue; > + } > + > /* Perform a load without side effect to retrieve the PQ bits */ > - uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); > + pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); > > /* and save PQ locally */ > xive_source_esb_set(xsrc, i, pq); > @@ -521,9 +538,14 @@ static void kvmppc_xive_change_state_handler(void > *opaque, int running, > */ > if (running) { > for (i = 0; i < xsrc->nr_irqs; i++) { > - uint8_t pq = xive_source_esb_get(xsrc, i); > + uint8_t pq; > uint8_t old_pq; > > + if (xive_source_skip_reset(xive, i)) { Same here ? > + continue; > + } > + > + pq = xive_source_esb_get(xsrc, i); > old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)); > > /* > @@ -545,7 +567,13 @@ static void kvmppc_xive_change_state_handler(void > *opaque, int running, > * migration is in progress. > */ > for (i = 0; i < xsrc->nr_irqs; i++) { > - uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); > + uint8_t pq; > + > + if (xive_source_skip_reset(xive, i)) { and here ? > + continue; > + } > + > + pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); > > /* > * PQ is set to PENDING to possibly catch a triggered > diff --git a/hw/intc/xics.c b/hw/intc/xics.c > index b2fca2975cc4..ae3f83cf0aad 100644 > --- a/hw/intc/xics.c > +++ b/hw/intc/xics.c > @@ -746,6 +746,7 @@ static const VMStateDescription vmstate_ics_base = { > > static Property ics_base_properties[] = { > DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0), > + DEFINE_PROP_BOOL("reset-all", ICSState, reset_all, false), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c > index a4d2e876cc5f..aa017b99801c 100644 > --- a/hw/intc/xics_kvm.c > +++ b/hw/intc/xics_kvm.c > @@ -177,6 +177,12 @@ void icp_kvm_realize(DeviceState *dev, Error **errp) > /* > * ICS-KVM > */ > + > +static bool ics_irq_skip_reset(ICSState *ics, int srcno) > +{ > + return !ics->reset_all && ics_irq_free(ics, srcno); > +} > + > void ics_get_kvm_state(ICSState *ics) > { > uint64_t state; > @@ -190,6 +196,10 @@ void ics_get_kvm_state(ICSState *ics) > for (i = 0; i < ics->nr_irqs; i++) { > ICSIRQState *irq = &ics->irqs[i]; > > + if (ics_irq_skip_reset(ics, i)) { And here ? > + continue; > + } > + > kvm_device_access(kernel_xics_fd, KVM_DEV_XICS_GRP_SOURCES, > i + ics->offset, &state, false, &error_fatal); > > @@ -301,6 +311,10 @@ int ics_set_kvm_state(ICSState *ics, Error **errp) > Error *local_err = NULL; > int ret; > > + if (ics_irq_skip_reset(ics, i)) { > + continue; > + } > + > ret = ics_set_kvm_state_one(ics, i, &local_err); > if (ret < 0) { > error_propagate(errp, local_err); > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 7124053b43b7..af89f3b6c698 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -4594,6 +4594,7 @@ static void > spapr_machine_4_1_class_options(MachineClass *mc) > smc->linux_pci_probe = false; > compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len); > compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); > + smc->irq_reset_all = true; Drop this property and add two lines in compat static: static GlobalProperty compat[] = { /* Only allow 4kiB and 64kiB IOMMU pagesizes */ { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" }, { TYPE_SPAPR_XIVE, "reset-all", "on" }, { TYPE_ICS_BASE, "reset-all", "on" }, }; This ensures that any instance of SpaprXive or ICSState created by a pseries-4.1 machine has reset_all set to true. > } > > DEFINE_SPAPR_MACHINE(4_1, "4.1", false); > diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c > index 06fe2432bae5..8c3682613404 100644 > --- a/hw/ppc/spapr_irq.c > +++ b/hw/ppc/spapr_irq.c > @@ -97,11 +97,13 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, > int nr_irqs, > { > Object *obj; > Error *local_err = NULL; > + bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all; > ... and you can drop all the propagation code here... > obj = object_new(TYPE_ICS_SIMPLE); > object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort); > object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr), > &error_fatal); > + object_property_set_bool(obj, reset_all, "reset-all", &error_fatal); > object_property_set_int(obj, nr_irqs, "nr-irqs", &error_fatal); > object_property_set_bool(obj, true, "realized", &local_err); > if (local_err) { > @@ -114,9 +116,6 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, > int nr_irqs, > xics_spapr_init(spapr); > } > > -#define ICS_IRQ_FREE(ics, srcno) \ > - (!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK))) > - > static int spapr_irq_claim_xics(SpaprMachineState *spapr, int irq, bool lsi, > Error **errp) > { > @@ -129,7 +128,7 @@ static int spapr_irq_claim_xics(SpaprMachineState *spapr, > int irq, bool lsi, > return -1; > } > > - if (!ICS_IRQ_FREE(ics, irq - ics->offset)) { > + if (!ics_irq_free(ics, irq - ics->offset)) { > error_setg(errp, "IRQ %d is not free", irq); > return -1; > } > @@ -147,7 +146,7 @@ static void spapr_irq_free_xics(SpaprMachineState *spapr, > int irq, int num) > if (ics_valid_irq(ics, irq)) { > trace_spapr_irq_free(0, irq, num); > for (i = srcno; i < srcno + num; ++i) { > - if (ICS_IRQ_FREE(ics, i)) { > + if (ics_irq_free(ics, i)) { > trace_spapr_irq_free_warn(0, i); > } > memset(&ics->irqs[i], 0, sizeof(ICSIRQState)); > @@ -270,9 +269,11 @@ static void spapr_irq_init_xive(SpaprMachineState > *spapr, int nr_irqs, > { > uint32_t nr_servers = spapr_max_server_number(spapr); > DeviceState *dev; > + bool reset_all = SPAPR_MACHINE_GET_CLASS(spapr)->irq_reset_all; ... and here. > int i; > > dev = qdev_create(NULL, TYPE_SPAPR_XIVE); > + object_property_set_bool(OBJECT(dev), reset_all, "reset-all", > &error_fatal); > qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs); > /* > * 8 XIVE END structures per CPU. One for each available priority > @@ -767,7 +768,7 @@ static int ics_find_free_block(ICSState *ics, int num, > int alignnum) > return -1; > } > for (i = first; i < first + num; ++i) { > - if (!ICS_IRQ_FREE(ics, i)) { > + if (!ics_irq_free(ics, i)) { > break; > } > }