On Wed, 2018-07-18 at 16:12 +1000, David Gibson wrote: > On Thu, Jun 28, 2018 at 10:36:32AM +0200, Cédric Le Goater wrote: > > From: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Can you trim your replies ? It's really hard to find your comments in such a long patch... > > diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h > > index 6ac8a9392da6..966a996c2eac 100644 > > --- a/include/hw/ppc/xics.h > > +++ b/include/hw/ppc/xics.h > > @@ -194,6 +194,7 @@ void icp_set_mfrr(ICPState *icp, uint8_t mfrr); > > uint32_t icp_accept(ICPState *ss); > > uint32_t icp_ipoll(ICPState *ss, uint32_t *mfrr); > > void icp_eoi(ICPState *icp, uint32_t xirr); > > +void icp_irq(ICSState *ics, int server, int nr, uint8_t priority); > > Hrm... are you sure you need to expose this? See further down... > > +/* > > + * The CONFIG_DATA register expects little endian accesses, but as the > > + * region is big endian, we have to swap the value. > > + */ > > +static void pnv_phb3_config_write(PnvPHB3 *phb, unsigned off, > > + unsigned size, uint64_t val) > > +{ > > + uint32_t cfg_addr, limit; > > + PCIDevice *pdev; > > + > > + pdev = pnv_phb3_find_cfg_dev(phb); > > + if (!pdev) { > > + return; > > + } > > + cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xfff; > > + cfg_addr |= off; > > This looks weird - there are callers of this that appear to have low > bits in 'off', then you're ORing it with overlapping low bits. Should be ffc like the read case. > > > + limit = pci_config_size(pdev); > > + if (limit <= cfg_addr) { > > + /* conventional pci device can be behind pcie-to-pci bridge. > > + 256 <= addr < 4K has no effects. */ > > + return; > > + } > > + switch (size) { > > + case 1: > > + break; > > + case 2: > > + val = bswap16(val); > > + break; > > + case 4: > > + val = bswap32(val); > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + pci_host_config_write_common(pdev, cfg_addr, limit, val, size); > > +} > > + > > +static uint64_t pnv_phb3_config_read(PnvPHB3 *phb, unsigned off, > > + unsigned size) > > +{ > > + uint32_t cfg_addr, limit; > > + PCIDevice *pdev; > > + uint64_t val; > > + > > + pdev = pnv_phb3_find_cfg_dev(phb); > > + if (!pdev) { > > + return ~0ull; > > + } > > + cfg_addr = (phb->regs[PHB_CONFIG_ADDRESS >> 3] >> 32) & 0xffc; > > + cfg_addr |= off; > > This looks better, should it be 0xffc in the write path as well? > > > + limit = pci_config_size(pdev); > > + if (limit <= cfg_addr) { > > + /* conventional pci device can be behind pcie-to-pci bridge. > > + 256 <= addr < 4K has no effects. */ > > + return ~0ull; > > + } > > + val = pci_host_config_read_common(pdev, cfg_addr, limit, size); > > + switch (size) { > > + case 1: > > + return val; > > + case 2: > > + return bswap16(val); > > + case 4: > > + return bswap32(val); > > + default: > > + g_assert_not_reached(); > > + } > > +} > > + > > +static void pnv_phb3_check_m32(PnvPHB3 *phb) > > +{ > > + uint64_t base, start, size; > > + MemoryRegion *parent; > > + PnvPBCQState *pbcq = &phb->pbcq; > > + > > + if (phb->m32_mapped) { > > + memory_region_del_subregion(phb->mr_m32.container, &phb->mr_m32); > > + phb->m32_mapped = false; > > Could you use memory_region_set_enabled() rather than having to add > and delete the subregion and keep track of it in this separate > variable? There was a reason for that but it's years old and I forgot... I think when re-enabled it moves around, among other things. So it's not more efficient. > > + } > > + > > + /* Disabled ? move on with life ... */ > > Trivial: "nothing to do" seems to be the standard way to express this. > Even trivialler: usual English rules don't put a space before '?' or > '!' punctuation. No, that's the tasteless English rule :-) It shall be overridden by anybody interested in making things actually readable :-) > > + > > +static void pnv_phb3_lxivt_write(PnvPHB3 *phb, unsigned idx, uint64_t val) > > +{ > > + ICSState *ics = &phb->lsis; > > + uint8_t server, prio; > > + > > + phb->ioda_LXIVT[idx] = val & (IODA2_LXIVT_SERVER | > > + IODA2_LXIVT_PRIORITY | > > + IODA2_LXIVT_NODE_ID); > > + server = GETFIELD(IODA2_LXIVT_SERVER, val); > > + prio = GETFIELD(IODA2_LXIVT_PRIORITY, val); > > + > > + /* > > + * The low order 2 bits are the link pointer (Type II interrupts). > > I don't think we've currently implemented link pointers in our xics > code. Do we need to do that? Not until somebody uses them (other than pHyp). > > + * Shift back to get a valid IRQ server. > > + */ > > + server >>= 2; > > + > > + ics_simple_write_xive(ics, idx, server, prio, prio); > > +} > > + > > +static uint64_t *pnv_phb3_ioda_access(PnvPHB3 *phb, > > + unsigned *out_table, unsigned > > *out_idx) > > +{ > > + uint64_t adreg = phb->regs[PHB_IODA_ADDR >> 3]; > > + unsigned int index = GETFIELD(PHB_IODA_AD_TADR, adreg); > > + unsigned int table = GETFIELD(PHB_IODA_AD_TSEL, adreg); > > + unsigned int mask; > > + uint64_t *tptr = NULL; > > + > > + switch (table) { > > + case IODA2_TBL_LIST: > > + tptr = phb->ioda_LIST; > > + mask = 7; > > I'd suggest hex for the masks. This is more readable when matched with the documentation but not a big deal. > > + break; > > + case IODA2_TBL_LXIVT: > > + tptr = phb->ioda_LXIVT; > > + mask = 7; > > + break; > > + case IODA2_TBL_IVC_CAM: > > + case IODA2_TBL_RBA: > > + mask = 31; > > + break; > > + case IODA2_TBL_RCAM: > > + mask = 63; > > + break; > > + case IODA2_TBL_MRT: > > + mask = 7; > > + break; > > + case IODA2_TBL_PESTA: > > + case IODA2_TBL_PESTB: > > + mask = 255; > > + break; > > + case IODA2_TBL_TVT: > > + tptr = phb->ioda_TVT; > > + mask = 511; > > + break; > > + case IODA2_TBL_TCAM: > > + case IODA2_TBL_TDR: > > + mask = 63; > > + break; > > + case IODA2_TBL_M64BT: > > + tptr = phb->ioda_M64BT; > > + mask = 15; > > + break; > > + case IODA2_TBL_M32DT: > > + tptr = phb->ioda_MDT; > > + mask = 255; > > + break; > > + case IODA2_TBL_PEEV: > > + tptr = phb->ioda_PEEV; > > + mask = 3; > > + break; > > + default: > > + return NULL; > > + } > > + index &= mask; > > Do we want to silently mask, rather than logging an error if there's > an access out of bounds for a particular table? We could do both, as to behave like the HW but also flag an error. But you have to be careful with the auto-increment below. > > + if (out_idx) { > > + *out_idx = index; > > + } > > + if (out_table) { > > + *out_table = table; > > + } > > + if (adreg & PHB_IODA_AD_AUTOINC) { > > + index = (index + 1) & mask; > > + adreg = SETFIELD(PHB_IODA_AD_TADR, adreg, index); > > + } > > + if (tptr) { > > + tptr += index; > > + } > > + phb->regs[PHB_IODA_ADDR >> 3] = adreg; > > + return tptr; > > +} ../.. > > + if ((comp & mask) != comp) { > > + qemu_log_mask(LOG_GUEST_ERROR, > > + "IRQ compare bits not in mask: comp=0x%x mask=0x%x", > > + comp, mask); > > + comp &= mask; > > + } > > + /* Setup LSI offset */ > > + ics->offset = comp + global; > > Oh.. changing ICS offset at runtime. I hadn't considered that case.. As above, see further down. > > + /* Special case configuration data */ > > + if ((off & 0xfffc) == PHB_CONFIG_DATA) { > > + pnv_phb3_config_write(phb, off & 0x3, size, val); > > + return; > > + } > > + > > + /* Other registers are 64-bit only */ > > + if (size != 8 || off & 0x7) { > > + qemu_log_mask(LOG_GUEST_ERROR, > > + "Invalid register access, offset: 0x%"PRIx64" size: > > %d", > > + off, size); > > + return; > > + } > > + > > + /* Handle masking */ > > + switch (off) { > > + case PHB_M64_UPPER_BITS: > > Couldn't you handle this in the switch below - it should be ok to > momentarily have the invalid bits set in your reg case, as long as you > mask them before applying the side-effects, yes? That felt easier that way ... > That said... shouldn't you filter our invalid or read-only regs before > updating the cache? Well, I had a reason for doing it that way, I do have a vague memory of that but I can't remember it now :-) > > +/* > > + * MSI/MSIX memory region implementation. > > + * The handler handles both MSI and MSIX. > > + */ > > +static void pnv_phb3_msi_write(void *opaque, hwaddr addr, > > + uint64_t data, unsigned size) > > +{ > > + PnvPhb3DMASpace *ds = opaque; > > + > > + /* Resolve PE# */ > > + if (!pnv_phb3_resolve_pe(ds)) { > > + qemu_log_mask(LOG_GUEST_ERROR, > > + "Failed to resolve PE# for bus @%p (%d) devfn 0x%x", > > + ds->bus, pci_bus_num(ds->bus), ds->devfn); > > + return; > > + } > > + > > + pnv_phb3_msi_send(&ds->phb->msis, addr, data, ds->pe_num); > > +} > > + > > +static const MemoryRegionOps pnv_phb3_msi_ops = { > > + /* There is no .read as the read result is undefined by PCI spec */ > > What will qemu do if it hits a NULL read here? The behaviour may be > undefind, but we'd prefer it didn't crash qemu.. Yeah. > > + .read = NULL, > > + .write = pnv_phb3_msi_write, > > + .endianness = DEVICE_LITTLE_ENDIAN > > +}; > > + > > +static AddressSpace *pnv_phb3_dma_iommu(PCIBus *bus, void *opaque, int > > devfn) > > +{ > > + PnvPHB3 *phb = opaque; > > + PnvPhb3DMASpace *ds; > > + > > + QLIST_FOREACH(ds, &phb->dma_spaces, list) { > > + if (ds->bus == bus && ds->devfn == devfn) { > > + break; > > + } > > + } > > + > > + if (ds == NULL) { > > + ds = g_malloc0(sizeof(PnvPhb3DMASpace)); > > + ds->bus = bus; > > + ds->devfn = devfn; > > + ds->pe_num = PHB_INVALID_PE; > > + ds->phb = phb; > > + memory_region_init_iommu(&ds->dma_mr, sizeof(ds->dma_mr), > > + TYPE_PNV_PHB3_IOMMU_MEMORY_REGION, > > + OBJECT(phb), "phb3_iommu", UINT64_MAX); > > + address_space_init(&ds->dma_as, MEMORY_REGION(&ds->dma_mr), > > + "phb3_iommu"); > > + memory_region_init_io(&ds->msi32_mr, OBJECT(phb), > > &pnv_phb3_msi_ops, > > + ds, "msi32", 0x10000); > > + memory_region_init_io(&ds->msi64_mr, OBJECT(phb), > > &pnv_phb3_msi_ops, > > + ds, "msi64", 0x100000); > > + pnv_phb3_update_msi_regions(ds); > > + > > + QLIST_INSERT_HEAD(&phb->dma_spaces, ds, list); > > + } > > + return &ds->dma_as; > > +} > > + > > +static void pnv_phb3_instance_init(Object *obj) > > +{ > > + PnvPHB3 *phb = PNV_PHB3(obj); > > + > > + /* Create LSI source */ > > + object_initialize(&phb->lsis, sizeof(phb->lsis), TYPE_ICS_SIMPLE); > > + object_property_add_child(obj, "ics-phb-lsi", OBJECT(&phb->lsis), > > NULL); > > + > > + /* Default init ... will be fixed by HW inits */ > > + phb->lsis.offset = 0; > > + > > + /* Create MSI source */ > > + object_initialize(&phb->msis, sizeof(phb->msis), TYPE_PHB3_MSI); > > + object_property_add_const_link(OBJECT(&phb->msis), "phb", obj, > > + &error_abort); > > + object_property_add_child(obj, "ics-phb-msi", OBJECT(&phb->msis), > > NULL); > > + > > + /* Create PBCQ */ > > + object_initialize(&phb->pbcq, sizeof(phb->pbcq), TYPE_PNV_PBCQ); > > + object_property_add_const_link(OBJECT(&phb->pbcq), "phb", obj, > > + &error_abort); > > + object_property_add_child(obj, "pbcq", OBJECT(&phb->pbcq), NULL); > > + > > + QLIST_INIT(&phb->dma_spaces); > > +} > > + > > +/* > > + * This could be done under pnv_pbcq_realize > > + */ > > +static void pnv_phb3_pci_create(PnvPHB3 *phb, Error **errp) > > +{ > > + PCIHostState *pcih = PCI_HOST_BRIDGE(phb); > > + PCIDevice *brdev; > > + PCIDevice *pdev; > > + PCIBus *parent; > > + uint8_t chassis = phb->chip_id * 4 + phb->phb_id; > > + uint8_t chassis_nr = 128; > > + > > + /* Add root complex */ > > + pdev = pci_create(pcih->bus, 0, TYPE_PNV_PHB3_RC); > > + object_property_add_child(OBJECT(phb), "phb3-rc", OBJECT(pdev), NULL); > > + qdev_prop_set_uint8(DEVICE(pdev), "chassis", chassis); > > + qdev_prop_set_uint16(DEVICE(pdev), "slot", 1); > > + qdev_init_nofail(DEVICE(pdev)); > > + > > + /* Setup bus for that chip */ > > + parent = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); > > + > > + brdev = pci_create(parent, 0, "pci-bridge"); > > What is this pci bridge representing? I know PCI-e PHBs typically > have a pseudo P2P bridge right under them, but isn't that represnted > by the root complex above? > > > + object_property_add_child(OBJECT(parent), "pci-bridge", OBJECT(brdev), > > + NULL); > > + qdev_prop_set_uint8(DEVICE(brdev), PCI_BRIDGE_DEV_PROP_CHASSIS_NR, > > + chassis_nr); > > + qdev_init_nofail(DEVICE(brdev)); > > +} > > + > > +static void pnv_phb3_realize(DeviceState *dev, Error **errp) > > +{ > > + PnvPHB3 *phb = PNV_PHB3(dev); > > + PCIHostState *pci = PCI_HOST_BRIDGE(dev); > > + Object *xics = OBJECT(qdev_get_machine()); > > + Error *local_err = NULL; > > + int i; > > + > > + memory_region_init(&phb->pci_mmio, OBJECT(phb), "pci-mmio", > > + PCI_MMIO_TOTAL_SIZE); > > + > > + /* PHB3 doesn't support IO space. However, qemu gets very upset if > > + * we don't have an IO region to anchor IO BARs onto so we just > > + * initialize one which we never hook up to anything > > + */ > > + memory_region_init(&phb->pci_io, OBJECT(phb), "pci-io", 0x10000); > > + > > + memory_region_init_io(&phb->mr_regs, OBJECT(phb), &pnv_phb3_reg_ops, > > phb, > > + "phb3-regs", 0x1000); > > + > > + object_property_set_int(OBJECT(&phb->lsis), PNV_PHB3_NUM_LSI, > > "nr-irqs", > > + &error_abort); > > + object_property_add_const_link(OBJECT(&phb->lsis), "xics", xics, > > + &error_abort); > > + object_property_set_bool(OBJECT(&phb->lsis), true, "realized", > > &local_err); > > + if (local_err) { > > + error_propagate(errp, local_err); > > + return; > > + } > > + > > + for (i = 0; i < PNV_PHB3_NUM_LSI; i++) { > > + ics_set_irq_type(&phb->lsis, i, true); > > + } > > + > > + object_property_add_const_link(OBJECT(&phb->msis), "xics", xics, > > + &error_abort); > > + object_property_set_int(OBJECT(&phb->msis), PHB3_MAX_MSI, "nr-irqs", > > + &error_abort); > > + object_property_set_bool(OBJECT(&phb->msis), true, "realized", > > &local_err); > > + if (local_err) { > > + error_propagate(errp, local_err); > > + return; > > + } > > + > > + object_property_set_bool(OBJECT(&phb->pbcq), true, "realized", > > &local_err); > > + if (local_err) { > > + error_propagate(errp, local_err); > > + return; > > + } > > + > > + pci->bus = pci_register_root_bus(dev, "phb3-root-bus", > > + pnv_phb3_set_irq, pnv_phb3_map_irq, phb, > > + &phb->pci_mmio, &phb->pci_io, > > + 0, 4, TYPE_PNV_PHB3_ROOT_BUS); > > + pci_setup_iommu(pci->bus, pnv_phb3_dma_iommu, phb); > > + > > + /* Setup the PCI busses */ > > + pnv_phb3_pci_create(phb, &local_err); > > + if (local_err) { > > + error_propagate(errp, local_err); > > + return; > > + } > > +} > > + > > +void pnv_phb3_update_regions(PnvPHB3 *phb) > > +{ > > + PnvPBCQState *pbcq = &phb->pbcq; > > + > > + /* Unmap first always */ > > + if (phb->regs_mapped) { > > + memory_region_del_subregion(&pbcq->phbbar, &phb->mr_regs); > > + phb->regs_mapped = false; > > + } > > + > > + /* Map registers if enabled */ > > + if (pbcq->phb_mapped) { > > + /* XXX We should use the PHB BAR 2 register but we don't ... */ > > + memory_region_add_subregion(&pbcq->phbbar, 0, &phb->mr_regs); > > + phb->regs_mapped = true; > > + } > > + > > + /* Check/update m32 */ > > + if (phb->m32_mapped) { > > + pnv_phb3_check_m32(phb); > > + } > > +} > > + > > +static const char *pnv_phb3_root_bus_path(PCIHostState *host_bridge, > > + PCIBus *rootbus) > > +{ > > + PnvPHB3 *phb = PNV_PHB3(host_bridge); > > + > > + snprintf(phb->bus_path, sizeof(phb->bus_path), "00%02x:%02x", > > + phb->chip_id, phb->phb_id); > > + return phb->bus_path; > > +} > > + > > +static void pnv_phb3_get_phb_id(Object *obj, Visitor *v, const char *name, > > + void *opaque, Error **errp) > > +{ > > + Property *prop = opaque; > > + uint32_t *ptr = qdev_get_prop_ptr(DEVICE(obj), prop); > > + > > + visit_type_uint32(v, name, ptr, errp); > > +} > > + > > +static void pnv_phb3_set_phb_id(Object *obj, Visitor *v, const char *name, > > + void *opaque, Error **errp) > > +{ > > + PnvPHB3 *phb = PNV_PHB3(obj); > > + uint32_t phb_id; > > + Error *local_err = NULL; > > + > > + visit_type_uint32(v, name, &phb_id, &local_err); > > + if (local_err) { > > + error_propagate(errp, local_err); > > + return; > > + } > > + > > + /* > > + * Limit to a maximum of 6 PHBs per chip > > + */ > > + if (phb_id >= PNV8_CHIP_PHB3_MAX) { > > + error_setg(errp, "invalid PHB index: '%d'", phb_id); > > + return; > > + } > > + > > + phb->phb_id = phb_id; > > +} > > + > > +static const PropertyInfo pnv_phb3_phb_id_propinfo = { > > + .name = "irq", > > + .get = pnv_phb3_get_phb_id, > > + .set = pnv_phb3_set_phb_id, > > +}; > > Can't you use a static DeviceProps style property for this, which is a > bit simpler? > > > + /* > > + * The low order 2 bits are the link pointer (Type II interrupts). > > + * Shift back to get a valid IRQ server. > > + */ > > + server >>= 2; > > + > > + switch (pq) { > > + case 0: /* 00 */ > > + if (prio == 0xff) { > > + /* Masked, set Q */ > > + phb3_msi_set_q(msi, srcno); > > + } else { > > + /* Enabled, set P and send */ > > + phb3_msi_set_p(msi, srcno, gen); > > + icp_irq(ics, server, srcno + ics->offset, prio); > > Can't you just pulse the right qirq here, which will use the core ICS > logic to propagate to the ICP? Ok, interrupts ... sooooo... This code predates a pile of XICS work you guys did to start with. Now, this is an ICS subclass, so why shouldn't it directly poke at the target ICP ? It's an alternate to the normal ICS since it behaves a bit differently (PQ bits & all). Cheers, Ben.