On 07/24/2017 06:29 AM, David Gibson wrote: > On Wed, Jul 05, 2017 at 07:13:20PM +0200, Cédric Le Goater wrote: >> Each interrupt source is associated with a 2-bit state machine called >> an Event State Buffer (ESB). It is controlled by MMIO to trigger >> events. >> >> See code for more details on the states. >> >> Signed-off-by: Cédric Le Goater <c...@kaod.org> >> --- >> hw/intc/xive.c | 230 >> ++++++++++++++++++++++++++++++++++++++++++++++++++ >> include/hw/ppc/xive.h | 3 + >> 2 files changed, 233 insertions(+) >> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c >> index 9ff14c0da595..816031b8ac81 100644 >> --- a/hw/intc/xive.c >> +++ b/hw/intc/xive.c >> @@ -32,6 +32,226 @@ static void xive_icp_irq(XiveICSState *xs, int lisn) >> } >> >> /* >> + * "magic" Event State Buffer (ESB) MMIO offsets. >> + * >> + * Each interrupt source has a 2-bit state machine called ESB >> + * which can be controlled by MMIO. It's made of 2 bits, P and >> + * Q. P indicates that an interrupt is pending (has been sent >> + * to a queue and is waiting for an EOI). Q indicates that the >> + * interrupt has been triggered while pending. >> + * >> + * This acts as a coalescing mechanism in order to guarantee >> + * that a given interrupt only occurs at most once in a queue. >> + * >> + * When doing an EOI, the Q bit will indicate if the interrupt >> + * needs to be re-triggered. >> + * >> + * The following offsets into the ESB MMIO allow to read or >> + * manipulate the PQ bits. They must be used with an 8-bytes >> + * load instruction. They all return the previous state of the >> + * interrupt (atomically). >> + * >> + * Additionally, some ESB pages support doing an EOI via a >> + * store at 0 and some ESBs support doing a trigger via a >> + * separate trigger page. >> + */ >> +#define XIVE_ESB_GET 0x800 >> +#define XIVE_ESB_SET_PQ_00 0xc00 >> +#define XIVE_ESB_SET_PQ_01 0xd00 >> +#define XIVE_ESB_SET_PQ_10 0xe00 >> +#define XIVE_ESB_SET_PQ_11 0xf00 >> + >> +#define XIVE_ESB_VAL_P 0x2 >> +#define XIVE_ESB_VAL_Q 0x1 >> + >> +#define XIVE_ESB_RESET 0x0 >> +#define XIVE_ESB_PENDING 0x2 >> +#define XIVE_ESB_QUEUED 0x3 >> +#define XIVE_ESB_OFF 0x1 >> + >> +static uint8_t xive_pq_get(XIVE *x, uint32_t lisn) >> +{ >> + uint32_t idx = lisn; >> + uint32_t byte = idx / 4; >> + uint32_t bit = (idx % 4) * 2; >> + uint8_t* pqs = (uint8_t *) x->sbe; >> + >> + return (pqs[byte] >> bit) & 0x3; >> +} >> + >> +static void xive_pq_set(XIVE *x, uint32_t lisn, uint8_t pq) >> +{ >> + uint32_t idx = lisn; >> + uint32_t byte = idx / 4; >> + uint32_t bit = (idx % 4) * 2; >> + uint8_t* pqs = (uint8_t *) x->sbe; >> + >> + pqs[byte] &= ~(0x3 << bit); >> + pqs[byte] |= (pq & 0x3) << bit; > > I know it probably amounts to the same thing given the context, but > I'd be more comfortable with a temporary and an obviously atomic > update than two writes to the real state variable.
yes. I will look better. >> +} >> + >> +static bool xive_pq_eoi(XIVE *x, uint32_t lisn) >> +{ >> + uint8_t old_pq = xive_pq_get(x, lisn); >> + >> + switch (old_pq) { >> + case XIVE_ESB_RESET: >> + xive_pq_set(x, lisn, XIVE_ESB_RESET); >> + return false; >> + case XIVE_ESB_PENDING: >> + xive_pq_set(x, lisn, XIVE_ESB_RESET); >> + return false; >> + case XIVE_ESB_QUEUED: >> + xive_pq_set(x, lisn, XIVE_ESB_PENDING); >> + return true; >> + case XIVE_ESB_OFF: >> + xive_pq_set(x, lisn, XIVE_ESB_OFF); >> + return false; >> + default: >> + g_assert_not_reached(); >> + } >> +} >> + >> +static bool xive_pq_trigger(XIVE *x, uint32_t lisn) >> +{ >> + uint8_t old_pq = xive_pq_get(x, lisn); >> + >> + switch (old_pq) { >> + case XIVE_ESB_RESET: >> + xive_pq_set(x, lisn, XIVE_ESB_PENDING); >> + return true; >> + case XIVE_ESB_PENDING: >> + xive_pq_set(x, lisn, XIVE_ESB_QUEUED); >> + return true; >> + case XIVE_ESB_QUEUED: >> + xive_pq_set(x, lisn, XIVE_ESB_QUEUED); >> + return true; >> + case XIVE_ESB_OFF: >> + xive_pq_set(x, lisn, XIVE_ESB_OFF); >> + return false; >> + default: >> + g_assert_not_reached(); >> + } >> +} >> + >> +/* >> + * XIVE Interrupt Source MMIOs >> + */ >> +static void xive_ics_eoi(XiveICSState *xs, uint32_t srcno) >> +{ >> + ICSIRQState *irq = &ICS_BASE(xs)->irqs[srcno]; >> + >> + if (irq->flags & XICS_FLAGS_IRQ_LSI) { >> + irq->status &= ~XICS_STATUS_SENT; >> + } >> +} >> + >> +/* TODO: handle second page */ > > Is this comment still relevent? Some HW have a second page to trigger the event. I am not sure we need to model it though. I will make some inquiries. >> +static uint64_t xive_esb_read(void *opaque, hwaddr addr, unsigned size) >> +{ >> + XiveICSState *xs = ICS_XIVE(opaque); >> + XIVE *x = xs->xive; >> + uint32_t offset = addr & 0xF00; >> + uint32_t srcno = addr >> xs->esb_shift; >> + uint32_t lisn = srcno + ICS_BASE(xs)->offset; >> + XiveIVE *ive; >> + uint64_t ret = -1; >> + >> + ive = xive_get_ive(x, lisn); >> + if (!ive || !(ive->w & IVE_VALID)) { >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn); >> + goto out; >> + } >> + >> + if (srcno >= ICS_BASE(xs)->nr_irqs) { >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "XIVE: invalid IRQ number: %d/%d lisn: %d\n", >> + srcno, ICS_BASE(xs)->nr_irqs, lisn); >> + goto out; >> + } >> + >> + switch (offset) { >> + case 0: >> + xive_ics_eoi(xs, srcno); >> + >> + /* return TRUE or FALSE depending on PQ value */ >> + ret = xive_pq_eoi(x, lisn); >> + break; >> + >> + case XIVE_ESB_GET: >> + ret = xive_pq_get(x, lisn); >> + break; >> + >> + case XIVE_ESB_SET_PQ_00: >> + case XIVE_ESB_SET_PQ_01: >> + case XIVE_ESB_SET_PQ_10: >> + case XIVE_ESB_SET_PQ_11: >> + ret = xive_pq_get(x, lisn); >> + xive_pq_set(x, lisn, (offset >> 8) & 0x3); > > Again I'd prefer xive_pq_set() return the old value itself, for more > obvious atomicity. yes. ok. > >> + break; >> + default: >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB addr %d\n", >> offset); >> + } >> + >> +out: >> + return ret; >> +} >> + >> +static void xive_esb_write(void *opaque, hwaddr addr, >> + uint64_t value, unsigned size) >> +{ >> + XiveICSState *xs = ICS_XIVE(opaque); >> + XIVE *x = xs->xive; >> + uint32_t offset = addr & 0xF00; >> + uint32_t srcno = addr >> xs->esb_shift; >> + uint32_t lisn = srcno + ICS_BASE(xs)->offset; >> + XiveIVE *ive; >> + bool notify = false; >> + >> + ive = xive_get_ive(x, lisn); >> + if (!ive || !(ive->w & IVE_VALID)) { >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid LISN %d\n", lisn); >> + return; >> + } > > Having this code associated with the individual ICS look directly at > the IVE table in the core xive object seems a bit dubious. The IVE table holds the validity and mask status of the interrupt entries, so we need that lookup. However, (continues below) ... > This also > points out another mismatch between the re-used ICS code and the new > XIVE code: ICS gathers all the per-source-irq flags/state into the > irqstate structure, whereas xive has per-irq information in the > centralized ecb and IVE tables. There can certainly be good reasons > for that, but using both at once is kind of clunky. I understand that you would rather put the esbs in the source they belong to. That is the case on real HW but it makes the modeling a bit more difficult. We would need to choose a MMIO address to give to the guest OS. I had some issues with the allocator (I need to look at this problem closer). It might also be an "issue" for KVM. Ben talked about maintaining all the esbs of a guest under a single memory region to be able to map the pages in the host. Any how, I agree this is another point to discuss in the sPAPR model. Thanks, C. >> + if (srcno >= ICS_BASE(xs)->nr_irqs) { >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "XIVE: invalid IRQ number: %d/%d lisn: %d\n", >> + srcno, ICS_BASE(xs)->nr_irqs, lisn); >> + return; >> + } >> + >> + switch (offset) { >> + case 0: >> + /* TODO: should we trigger even if the IVE is masked ? */ >> + notify = xive_pq_trigger(x, lisn); >> + break; >> + default: >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid ESB write addr %d\n", >> + offset); >> + return; >> + } >> + >> + if (notify && !(ive->w & IVE_MASKED)) { >> + qemu_irq_pulse(ICS_BASE(xs)->qirqs[srcno]); >> + } >> +} >> + >> +static const MemoryRegionOps xive_esb_ops = { >> + .read = xive_esb_read, >> + .write = xive_esb_write, >> + .endianness = DEVICE_BIG_ENDIAN, >> + .valid = { >> + .min_access_size = 8, >> + .max_access_size = 8, >> + }, >> + .impl = { >> + .min_access_size = 8, >> + .max_access_size = 8, >> + }, >> +}; >> + >> +/* >> * XIVE Interrupt Source >> */ >> static void xive_ics_set_irq_msi(XiveICSState *xs, int srcno, int val) >> @@ -106,15 +326,25 @@ static void xive_ics_realize(ICSState *ics, Error >> **errp) >> return; >> } >> >> + if (!xs->esb_shift) { >> + error_setg(errp, "ESB page size needs to be greater 0"); >> + return; >> + } >> + >> ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState)); >> ics->qirqs = qemu_allocate_irqs(xive_ics_set_irq, xs, ics->nr_irqs); >> >> + memory_region_init_io(&xs->esb_iomem, OBJECT(xs), &xive_esb_ops, xs, >> + "xive.esb", >> + (1ull << xs->esb_shift) * ICS_BASE(xs)->nr_irqs); >> + >> qemu_register_reset(xive_ics_reset, xs); >> } >> >> static Property xive_ics_properties[] = { >> DEFINE_PROP_UINT32("nr-irqs", ICSState, nr_irqs, 0), >> DEFINE_PROP_UINT32("irq-base", ICSState, offset, 0), >> + DEFINE_PROP_UINT32("shift", XiveICSState, esb_shift, 0), >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h >> index 544cc6e0c796..5303d96f5f59 100644 >> --- a/include/hw/ppc/xive.h >> +++ b/include/hw/ppc/xive.h >> @@ -33,6 +33,9 @@ typedef struct XiveICSState XiveICSState; >> struct XiveICSState { >> ICSState parent_obj; >> >> + uint32_t esb_shift; >> + MemoryRegion esb_iomem; >> + >> XIVE *xive; >> }; >> >