On Tue, Nov 28, 2017 at 06:33:06PM +0000, Cédric Le Goater wrote: > On 11/28/2017 06:38 AM, David Gibson wrote: > > On Thu, Nov 23, 2017 at 02:29:40PM +0100, Cédric Le Goater wrote: > >> Each interrupt source is associated with a two bit state machine > >> called an Event State Buffer (ESB). The bits are named "P" (pending) > >> and "Q" (queued) and can be controlled by MMIO. It is used to trigger > >> events. See code for more details on the states and transitions. > >> > >> The MMIO space for the ESB translation is 512GB large on baremetal > >> (powernv) systems and the BAR depends on the chip id. In our model for > >> the sPAPR machine, we choose to only map a sub memory region for the > >> provisionned IRQ numbers and to use the mapping address of chip 0 on a > >> real system. The OS will get the address of the MMIO page of the ESB > >> entry associated with an IRQ using the H_INT_GET_SOURCE_INFO hcall. > >> > >> Signed-off-by: Cédric Le Goater <c...@kaod.org> > >> --- > >> hw/intc/spapr_xive.c | 268 > >> +++++++++++++++++++++++++++++++++++++++++++- > >> include/hw/ppc/spapr_xive.h | 8 ++ > >> 2 files changed, 275 insertions(+), 1 deletion(-) > >> > >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c > >> index 66c533fb1d78..f45f50fd017e 100644 > >> --- a/hw/intc/spapr_xive.c > >> +++ b/hw/intc/spapr_xive.c > >> @@ -32,6 +32,216 @@ static void spapr_xive_irq(sPAPRXive *xive, int lisn) > >> } > >> > >> /* > >> + * "magic" Event State Buffer (ESB) MMIO offsets. > >> + * > >> + * Each interrupt source has a 2-bit state machine called ESB > >> + * which can be controlled by MMIO. It's made of 2 bits, P and > >> + * Q. P indicates that an interrupt is pending (has been sent > >> + * to a queue and is waiting for an EOI). Q indicates that the > >> + * interrupt has been triggered while pending. > >> + * > >> + * This acts as a coalescing mechanism in order to guarantee > >> + * that a given interrupt only occurs at most once in a queue. > >> + * > >> + * When doing an EOI, the Q bit will indicate if the interrupt > >> + * needs to be re-triggered. > >> + * > >> + * The following offsets into the ESB MMIO allow to read or > >> + * manipulate the PQ bits. They must be used with an 8-bytes > >> + * load instruction. They all return the previous state of the > >> + * interrupt (atomically). > >> + * > >> + * Additionally, some ESB pages support doing an EOI via a > >> + * store at 0 and some ESBs support doing a trigger via a > >> + * separate trigger page. > >> + */ > >> +#define XIVE_ESB_GET 0x800 > >> +#define XIVE_ESB_SET_PQ_00 0xc00 > >> +#define XIVE_ESB_SET_PQ_01 0xd00 > >> +#define XIVE_ESB_SET_PQ_10 0xe00 > >> +#define XIVE_ESB_SET_PQ_11 0xf00 > >> + > >> +#define XIVE_ESB_VAL_P 0x2 > >> +#define XIVE_ESB_VAL_Q 0x1 > >> + > >> +#define XIVE_ESB_RESET 0x0 > >> +#define XIVE_ESB_PENDING XIVE_ESB_VAL_P > >> +#define XIVE_ESB_QUEUED (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q) > >> +#define XIVE_ESB_OFF XIVE_ESB_VAL_Q > >> + > >> +static uint8_t spapr_xive_pq_get(sPAPRXive *xive, uint32_t lisn) > >> +{ > >> + uint32_t byte = lisn / 4; > >> + uint32_t bit = (lisn % 4) * 2; > >> + > >> + assert(byte < xive->sbe_size); > >> + > >> + return (xive->sbe[byte] >> bit) & 0x3; > >> +} > >> + > >> +static uint8_t spapr_xive_pq_set(sPAPRXive *xive, uint32_t lisn, uint8_t > >> pq) > >> +{ > >> + uint32_t byte = lisn / 4; > >> + uint32_t bit = (lisn % 4) * 2; > >> + uint8_t old, new; > >> + > >> + assert(byte < xive->sbe_size); > >> + > >> + old = xive->sbe[byte]; > >> + > >> + new = xive->sbe[byte] & ~(0x3 << bit); > >> + new |= (pq & 0x3) << bit; > >> + > >> + xive->sbe[byte] = new; > >> + > >> + return (old >> bit) & 0x3; > >> +} > >> + > >> +static bool spapr_xive_pq_eoi(sPAPRXive *xive, uint32_t lisn) > >> +{ > >> + uint8_t old_pq = spapr_xive_pq_get(xive, lisn); > >> + > >> + switch (old_pq) { > >> + case XIVE_ESB_RESET: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET); > >> + return false; > >> + case XIVE_ESB_PENDING: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_RESET); > >> + return false; > >> + case XIVE_ESB_QUEUED: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING); > >> + return true; > >> + case XIVE_ESB_OFF: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF); > >> + return false; > >> + default: > >> + g_assert_not_reached(); > >> + } > >> +} > >> + > >> +static bool spapr_xive_pq_trigger(sPAPRXive *xive, uint32_t lisn) > >> +{ > >> + uint8_t old_pq = spapr_xive_pq_get(xive, lisn); > >> + > >> + switch (old_pq) { > >> + case XIVE_ESB_RESET: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_PENDING); > >> + return true; > >> + case XIVE_ESB_PENDING: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED); > >> + return true; > >> + case XIVE_ESB_QUEUED: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_QUEUED); > >> + return true; > >> + case XIVE_ESB_OFF: > >> + spapr_xive_pq_set(xive, lisn, XIVE_ESB_OFF); > >> + return false; > >> + default: > >> + g_assert_not_reached(); > >> + } > >> +} > >> + > >> +/* > >> + * XIVE Interrupt Source MMIOs > >> + */ > >> +static void spapr_xive_source_eoi(sPAPRXive *xive, uint32_t lisn) > >> +{ > >> + if (spapr_xive_irq_is_lsi(xive, lisn)) { > >> + xive->status[lisn] &= ~XIVE_STATUS_SENT; > >> + } > >> +} > >> + > >> +/* TODO: handle second page > >> + * > >> + * Some HW use a separate page for trigger. We only support the case > >> + * in which the trigger can be done in the same page as the EOI. > >> + */ > >> +static uint64_t spapr_xive_esb_read(void *opaque, hwaddr addr, unsigned > >> size) > >> +{ > >> + sPAPRXive *xive = SPAPR_XIVE(opaque); > >> + uint32_t offset = addr & 0xF00; > >> + uint32_t lisn = addr >> xive->esb_shift; > >> + XiveIVE *ive; > >> + uint64_t ret = -1; > >> + > >> + ive = spapr_xive_get_ive(xive, lisn); > >> + if (!ive || !(ive->w & IVE_VALID)) { > >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn); > >> + goto out; > >> + } > >> + > >> + switch (offset) { > >> + case 0: > >> + spapr_xive_source_eoi(xive, lisn); > > > > Hrm. I don't love that you're dealing with clearing that LSI bit > > here, but setting it at a different level. > > > > The state machines are doing my head in a bit, is there any way > > you could derive the STATUS_SENT bit from the PQ bits? > > Yes. I should. > > I am also lacking a guest driver to exercise these LSIs so I didn't > pay a lot of attention to level interrupts. Any idea ?
How about an old-school emulated PCI device? Maybe rtl8139? > >> + /* return TRUE or FALSE depending on PQ value */ > >> + ret = spapr_xive_pq_eoi(xive, lisn); > >> + break; > >> + > >> + case XIVE_ESB_GET: > >> + ret = spapr_xive_pq_get(xive, lisn); > >> + break; > >> + > >> + case XIVE_ESB_SET_PQ_00: > >> + case XIVE_ESB_SET_PQ_01: > >> + case XIVE_ESB_SET_PQ_10: > >> + case XIVE_ESB_SET_PQ_11: > >> + ret = spapr_xive_pq_set(xive, lisn, (offset >> 8) & 0x3); > >> + break; > >> + default: > >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", > >> offset); > >> + } > >> + > >> +out: > >> + return ret; > >> +} > >> + > >> +static void spapr_xive_esb_write(void *opaque, hwaddr addr, > >> + uint64_t value, unsigned size) > >> +{ > >> + sPAPRXive *xive = SPAPR_XIVE(opaque); > >> + uint32_t offset = addr & 0xF00; > >> + uint32_t lisn = addr >> xive->esb_shift; > >> + XiveIVE *ive; > >> + bool notify = false; > >> + > >> + ive = spapr_xive_get_ive(xive, lisn); > >> + if (!ive || !(ive->w & IVE_VALID)) { > >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn); > >> + return; > >> + } > >> + > >> + switch (offset) { > >> + case 0: > >> + /* TODO: should we trigger even if the IVE is masked ? */ > >> + notify = spapr_xive_pq_trigger(xive, lisn); > >> + break; > >> + default: > >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr > >> %d\n", > >> + offset); > >> + return; > >> + } > >> + > >> + if (notify && !(ive->w & IVE_MASKED)) { > >> + qemu_irq_pulse(xive->qirqs[lisn]); > >> + } > >> +} > >> + > >> +static const MemoryRegionOps spapr_xive_esb_ops = { > >> + .read = spapr_xive_esb_read, > >> + .write = spapr_xive_esb_write, > >> + .endianness = DEVICE_BIG_ENDIAN, > >> + .valid = { > >> + .min_access_size = 8, > >> + .max_access_size = 8, > >> + }, > >> + .impl = { > >> + .min_access_size = 8, > >> + .max_access_size = 8, > >> + }, > >> +}; > >> + > >> +/* > >> * XIVE Interrupt Source > >> */ > >> static void spapr_xive_source_set_irq_msi(sPAPRXive *xive, int lisn, int > >> val) > >> @@ -70,6 +280,33 @@ static void spapr_xive_source_set_irq(void *opaque, > >> int lisn, int val) > >> /* > >> * Main XIVE object > >> */ > >> +#define P9_MMIO_BASE 0x006000000000000ull > >> + > >> +/* VC BAR contains set translations for the ESBs and the EQs. */ > >> +#define VC_BAR_DEFAULT 0x10000000000ull > >> +#define VC_BAR_SIZE 0x08000000000ull > >> +#define ESB_SHIFT 16 /* One 64k page. OPAL has two */ > >> + > >> +static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset, > >> + unsigned size) > >> +{ > >> + qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " [%u]\n", > >> + __func__, offset, size); > >> + return 0; > >> +} > >> + > >> +static void spapr_xive_esb_default_write(void *opaque, hwaddr offset, > >> + uint64_t value, unsigned size) > >> +{ > >> + qemu_log_mask(LOG_UNIMP, "%s: 0x%" HWADDR_PRIx " <- 0x%" PRIx64 " > >> [%u]\n", > >> + __func__, offset, value, size); > >> +} > >> + > >> +static const MemoryRegionOps spapr_xive_esb_default_ops = { > >> + .read = spapr_xive_esb_default_read, > >> + .write = spapr_xive_esb_default_write, > >> + .endianness = DEVICE_BIG_ENDIAN, > > > > I think you should at least have a valid access size field here. > > yes. > > >> +}; > >> > >> void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon) > >> { > >> @@ -77,14 +314,19 @@ void spapr_xive_pic_print_info(sPAPRXive *xive, > >> Monitor *mon) > >> > >> for (i = 0; i < xive->nr_irqs; i++) { > >> XiveIVE *ive = &xive->ivt[i]; > >> + uint8_t pq; > >> > >> if (!(ive->w & IVE_VALID)) { > >> continue; > >> } > >> > >> - monitor_printf(mon, " %4x %s %s %08x %08x\n", i, > >> + pq = spapr_xive_pq_get(xive, i); > >> + > >> + monitor_printf(mon, " %4x %s %s %c%c %08x %08x\n", i, > >> spapr_xive_irq_is_lsi(xive, i) ? "LSI" : "MSI", > >> ive->w & IVE_MASKED ? "M" : " ", > >> + pq & XIVE_ESB_VAL_P ? 'P' : '-', > >> + pq & XIVE_ESB_VAL_Q ? 'Q' : '-', > >> (int) GETFIELD(IVE_EQ_INDEX, ive->w), > >> (int) GETFIELD(IVE_EQ_DATA, ive->w)); > >> } > >> @@ -104,6 +346,9 @@ void spapr_xive_reset(void *dev) > >> ive->w |= IVE_MASKED; > >> } > >> } > >> + > >> + /* SBEs are initialized to 0b01 which corresponds to "ints off" */ > >> + memset(xive->sbe, 0x55, xive->sbe_size); > >> } > >> > >> static void spapr_xive_realize(DeviceState *dev, Error **errp) > >> @@ -123,6 +368,26 @@ static void spapr_xive_realize(DeviceState *dev, > >> Error **errp) > >> /* Allocate the IVT (Interrupt Virtualization Table) */ > >> xive->ivt = g_malloc0(xive->nr_irqs * sizeof(XiveIVE)); > >> > >> + /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */ > >> + xive->sbe_size = DIV_ROUND_UP(xive->nr_irqs, 4); > >> + xive->sbe = g_malloc0(xive->sbe_size); > >> + > >> + /* VC BAR. That's the full window but we will only map the > >> + * subregions in use. */ > >> + xive->esb_base = (P9_MMIO_BASE | VC_BAR_DEFAULT); > >> + xive->esb_shift = ESB_SHIFT; > > > > Any point to having this as a variable, if it's always the same size? > > Well, it is related to the page size and we could have a HW configuration > with two pages, with one specific page for trigger. But we don't need to > model this, yet. > > The 'esb_shift' field is used in different places. I would rather > keep it. Hm, ok. > >> + > >> + /* Install default memory region handlers to log bogus access */ > >> + memory_region_init_io(&xive->esb_mr, NULL, > >> &spapr_xive_esb_default_ops, > >> + NULL, "xive.esb.full", VC_BAR_SIZE); > >> + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->esb_mr); > >> + > >> + /* Install the ESB memory region in the overall one */ > >> + memory_region_init_io(&xive->esb_iomem, OBJECT(xive), > >> &spapr_xive_esb_ops, > >> + xive, "xive.esb", > >> + (1ull << xive->esb_shift) * xive->nr_irqs); > >> + memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem); > > > > Is there a benegit to to having these nested regions, rather than just > > validating the lisn in the read/write functions? > > No. this was to represent the full ESB space of the system but we can > reduce it to one region. This is not a problem I think. > > >> qemu_register_reset(spapr_xive_reset, dev); > >> } > >> > >> @@ -152,6 +417,7 @@ static const VMStateDescription vmstate_spapr_xive = { > >> VMSTATE_STRUCT_VARRAY_UINT32_ALLOC(ivt, sPAPRXive, nr_irqs, 1, > >> vmstate_spapr_xive_ive, > >> XiveIVE), > >> VMSTATE_VBUFFER_UINT32(status, sPAPRXive, 1, NULL, nr_irqs), > >> + VMSTATE_VBUFFER_UINT32(sbe, sPAPRXive, 1, NULL, sbe_size), > >> VMSTATE_END_OF_LIST() > >> }, > >> }; > >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h > >> index 6a799cdaba66..84c910e62e56 100644 > >> --- a/include/hw/ppc/spapr_xive.h > >> +++ b/include/hw/ppc/spapr_xive.h > >> @@ -42,6 +42,14 @@ struct sPAPRXive { > >> > >> /* XIVE internal tables */ > >> XiveIVE *ivt; > >> + uint8_t *sbe; > >> + uint32_t sbe_size; > > > > sbe_size is derivable from nr_irqs, so I don't think there's a point > > to storing it separately . > > I needed the value for the vmstate macros. Ah, right. > > Thanks, > > C. > > > >> + > >> + /* ESB memory region */ > >> + uint32_t esb_shift; > >> + hwaddr esb_base; > >> + MemoryRegion esb_mr; > >> + MemoryRegion esb_iomem; > >> }; > >> > >> static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn) > > > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
signature.asc
Description: PGP signature