On 07/19/2017 05:24 AM, David Gibson wrote:
> On Wed, Jul 05, 2017 at 07:13:18PM +0200, Cédric Le Goater wrote:
>> The XIVE interrupt controller of the POWER9 uses a set of tables to
>> redirect exceptions from event sources to CPU threads, among which
>> we chose to model:
>>
>> - the State Bit Entries (SBE), also known as Event State Buffer
>>   (ESB). This is a two-bit state machine for each event source
>>   which is used to trigger events. The bits are named "P" (pending)
>>   and "Q" (queued) and can be controlled by MMIO.
>>
>> - the Interrupt Virtualization Entry (IVE) table, also known as
>>   Event Assignment Structure (EAS). This table is indexed by the
>>   IRQ number and is looked up to find the Event Queue associated
>>   with a triggered event.
>>
>> - the Event Queue Descriptor (EQD) table, also known as Event
>>   Notification Descriptor (END). The EQD contains fields that
>>   specify the Event Queue on which event data is posted (and later
>>   pulled by the OS) and also a target (or VPD) to notify.
>>
>> An additional table was not modeled, but we might need it to
>> support the H_INT_SET_OS_REPORTING_LINE hcall:
>>
>> - the Virtual Processor Descriptor (VPD) table, also known as
>>   Notification Virtual Target (NVT).
>>
>> The XIVE object is expanded with the tables described above. The
>> size of each table depends on the number of provisioned IRQs and
>> the maximum number of CPUs in the system. The indexing is very
>> basic and might need to be improved for the EQs.
>>
>> Signed-off-by: Cédric Le Goater <c...@kaod.org>
>> ---
>>  hw/intc/xive-internal.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/intc/xive.c          | 72 +++++++++++++++++++++++++++++++++++++
>>  2 files changed, 167 insertions(+)
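
To make the SBE description above a bit more concrete, the trigger
transitions of the P/Q pair look roughly like this. This is only a
sketch of my understanding, not code from this patch, and the names
are invented:

    #include <stdbool.h>
    #include <stdint.h>

    /* P/Q trigger transitions (sketch):
     *   00 (reset)   -> 10, event is forwarded
     *   10 (pending) -> 11, event is recorded, not forwarded
     *   11 (queued)  -> 11, likewise
     *   01 ("ints off", the reset value) -> 01, nothing happens
     */
    enum { PQ_RESET = 0x0, PQ_OFF = 0x1, PQ_PENDING = 0x2, PQ_QUEUED = 0x3 };

    /* Returns true when the event should be forwarded for delivery */
    static bool esb_trigger(uint8_t *pq)
    {
        switch (*pq & 0x3) {
        case PQ_RESET:       /* nothing in flight: record and notify */
            *pq = PQ_PENDING;
            return true;
        case PQ_PENDING:     /* already pending: coalesce the event */
        case PQ_QUEUED:
            *pq = PQ_QUEUED;
            return false;
        default:             /* PQ_OFF: source is off, drop it */
            return false;
        }
    }
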
>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h
>> index 155c2dcd6066..8e755aa88a14 100644
>> --- a/hw/intc/xive-internal.h
>> +++ b/hw/intc/xive-internal.h
>> @@ -11,6 +11,89 @@
>>
>>  #include <hw/sysbus.h>
>>
>> +/* Utilities to manipulate these (originally from OPAL) */
>> +#define MASK_TO_LSH(m)          (__builtin_ffsl(m) - 1)
>> +#define GETFIELD(m, v)          (((v) & (m)) >> MASK_TO_LSH(m))
>> +#define SETFIELD(m, v, val)     \
>> +        (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
>> +
>> +#define PPC_BIT(bit)            (0x8000000000000000UL >> (bit))
>> +#define PPC_BIT32(bit)          (0x80000000UL >> (bit))
>> +#define PPC_BIT8(bit)           (0x80UL >> (bit))
>> +#define PPC_BITMASK(bs, be)     ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
>> +#define PPC_BITMASK32(bs, be)   ((PPC_BIT32(bs) - PPC_BIT32(be)) | \
>> +                                 PPC_BIT32(bs))
>> +
>> +/* IVE/EAS
>> + *
>> + * One per interrupt source. Targets the interrupt to a given EQ
>> + * and provides the corresponding logical interrupt number (EQ data).
>> + *
>> + * We also map this structure to the escalation descriptor inside
>> + * an EQ, though in that case the valid and masked bits are not used.
>> + */
>> +typedef struct XiveIVE {
>> +    /* Use a single 64-bit definition to make it easier to
>> +     * perform atomic updates
>> +     */
>> +    uint64_t        w;
>> +#define IVE_VALID       PPC_BIT(0)
>> +#define IVE_EQ_BLOCK    PPC_BITMASK(4, 7)   /* Destination EQ block# */
>> +#define IVE_EQ_INDEX    PPC_BITMASK(8, 31)  /* Destination EQ index */
>> +#define IVE_MASKED      PPC_BIT(32)         /* Masked */
>> +#define IVE_EQ_DATA     PPC_BITMASK(33, 63) /* Data written to the EQ */
>> +} XiveIVE;
>> +
>> +/* EQ */
>> +typedef struct XiveEQ {
>> +    uint32_t        w0;
>> +#define EQ_W0_VALID             PPC_BIT32(0)
>> +#define EQ_W0_ENQUEUE           PPC_BIT32(1)
>> +#define EQ_W0_UCOND_NOTIFY      PPC_BIT32(2)
>> +#define EQ_W0_BACKLOG           PPC_BIT32(3)
>> +#define EQ_W0_PRECL_ESC_CTL     PPC_BIT32(4)
>> +#define EQ_W0_ESCALATE_CTL      PPC_BIT32(5)
>> +#define EQ_W0_END_OF_INTR       PPC_BIT32(6)
>> +#define EQ_W0_QSIZE             PPC_BITMASK32(12, 15)
>> +#define EQ_W0_SW0               PPC_BIT32(16)
>> +#define EQ_W0_FIRMWARE          EQ_W0_SW0 /* Owned by FW */
>> +#define EQ_QSIZE_4K             0
>> +#define EQ_QSIZE_64K            4
>> +#define EQ_W0_HWDEP             PPC_BITMASK32(24, 31)
>> +    uint32_t        w1;
>> +#define EQ_W1_ESn               PPC_BITMASK32(0, 1)
>> +#define EQ_W1_ESn_P             PPC_BIT32(0)
>> +#define EQ_W1_ESn_Q             PPC_BIT32(1)
>> +#define EQ_W1_ESe               PPC_BITMASK32(2, 3)
>> +#define EQ_W1_ESe_P             PPC_BIT32(2)
>> +#define EQ_W1_ESe_Q             PPC_BIT32(3)
>> +#define EQ_W1_GENERATION        PPC_BIT32(9)
>> +#define EQ_W1_PAGE_OFF          PPC_BITMASK32(10, 31)
>> +    uint32_t        w2;
>> +#define EQ_W2_MIGRATION_REG     PPC_BITMASK32(0, 3)
>> +#define EQ_W2_OP_DESC_HI        PPC_BITMASK32(4, 31)
>> +    uint32_t        w3;
>> +#define EQ_W3_OP_DESC_LO        PPC_BITMASK32(0, 31)
>> +    uint32_t        w4;
>> +#define EQ_W4_ESC_EQ_BLOCK      PPC_BITMASK32(4, 7)
>> +#define EQ_W4_ESC_EQ_INDEX      PPC_BITMASK32(8, 31)
>> +    uint32_t        w5;
>> +#define EQ_W5_ESC_EQ_DATA       PPC_BITMASK32(1, 31)
>> +    uint32_t        w6;
>> +#define EQ_W6_FORMAT_BIT        PPC_BIT32(8)
>> +#define EQ_W6_NVT_BLOCK         PPC_BITMASK32(9, 12)
>> +#define EQ_W6_NVT_INDEX         PPC_BITMASK32(13, 31)
>> +    uint32_t        w7;
>> +#define EQ_W7_F0_IGNORE         PPC_BIT32(0)
>> +#define EQ_W7_F0_BLK_GROUPING   PPC_BIT32(1)
>> +#define EQ_W7_F0_PRIORITY       PPC_BITMASK32(8, 15)
>> +#define EQ_W7_F1_WAKEZ          PPC_BIT32(0)
>> +#define EQ_W7_F1_LOG_SERVER_ID  PPC_BITMASK32(1, 31)
>> +} XiveEQ;
>> +
>> +#define XIVE_EQ_PRIORITY_COUNT  8
>> +#define XIVE_PRIORITY_MAX       (XIVE_EQ_PRIORITY_COUNT - 1)
>> +
>>  struct XIVE {
>>      SysBusDevice parent;
>>
>> @@ -23,6 +106,18 @@ struct XIVE {
>>      uint32_t     int_max;     /* Max index */
>>      uint32_t     int_hw_bot;  /* Bottom index of HW IRQ allocator */
>>      uint32_t     int_ipi_top; /* Highest IPI index handed out so far + 1 */
>> +
>> +    /* XIVE internal tables */
>> +    void         *sbe;
>> +    XiveIVE      *ivt;
>> +    XiveEQ       *eqdt;
>>  };
>>
>> +void xive_reset(void *dev);
>> +XiveIVE *xive_get_ive(XIVE *x, uint32_t isn);
>> +XiveEQ *xive_get_eq(XIVE *x, uint32_t idx);
>> +
>> +bool xive_eq_for_target(XIVE *x, uint32_t target, uint8_t prio,
>> +                        uint32_t *out_eq_idx);
>> +
>>  #endif /* _INTC_XIVE_INTERNAL_H */
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 5b4ea915d87c..5b14d8155317 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -35,6 +35,27 @@
>>   */
>>  #define MAX_HW_IRQS_ENTRIES (8 * 1024)
>>
>> +
>> +void xive_reset(void *dev)
>> +{
>> +    XIVE *x = XIVE(dev);
>> +    int i;
>> +
>> +    /* SBEs are initialized to 0b01 which corresponds to "ints off" */
>> +    memset(x->sbe, 0x55, x->int_count / 4);
>
> I think strictly this should be a DIV_ROUND_UP to handle the case of
> int_count not a multiple of 4.

ok.
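
Something like this, I guess (just a sketch; DIV_ROUND_UP is the
helper from include/qemu/osdep.h, quoted here for clarity):

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    /* 4 SBEs per byte; rounding up covers a trailing partial byte */
    x->sbe = g_malloc0(DIV_ROUND_UP(x->int_count, 4));    /* realize */
    memset(x->sbe, 0x55, DIV_ROUND_UP(x->int_count, 4));  /* reset   */

with the same expression used in both places so that the allocation
and the reset always agree on the size.
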
>> +
>> +    /* Clear and mask all valid IVEs */
>> +    for (i = x->int_base; i < x->int_max; i++) {
>> +        XiveIVE *ive = &x->ivt[i];
>> +        if (ive->w & IVE_VALID) {
>> +            ive->w = IVE_VALID | IVE_MASKED;
>> +        }
>> +    }
>> +
>> +    /* clear all EQs */
>> +    memset(x->eqdt, 0, x->nr_targets * XIVE_EQ_PRIORITY_COUNT *
>> +           sizeof(XiveEQ));
>> +}
>> +
>>  static void xive_init(Object *obj)
>>  {
>>      ;
>> @@ -62,6 +83,19 @@ static void xive_realize(DeviceState *dev, Error **errp)
>>      if (x->int_ipi_top < 0x10) {
>>          x->int_ipi_top = 0x10;
>>      }
>> +
>> +    /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */
>> +    x->sbe = g_malloc0(x->int_count / 4);
>
> And here as well.

yes.

>> +
>> +    /* Allocate the IVT (Interrupt Virtualization Table) */
>> +    x->ivt = g_malloc0(x->int_count * sizeof(XiveIVE));
>> +
>> +    /* Allocate the EQDT (Event Queue Descriptor Table), 8 priorities
>> +     * for each thread in the system */
>> +    x->eqdt = g_malloc0(x->nr_targets * XIVE_EQ_PRIORITY_COUNT *
>> +                        sizeof(XiveEQ));
>> +
>> +    qemu_register_reset(xive_reset, dev);
>>  }
>>
>>  static Property xive_properties[] = {
>> @@ -92,3 +126,41 @@ static void xive_register_types(void)
>>  }
>>
>>  type_init(xive_register_types)
>> +
>> +XiveIVE *xive_get_ive(XIVE *x, uint32_t lisn)
>> +{
>> +    uint32_t idx = lisn;
>> +
>> +    if (idx < x->int_base || idx >= x->int_max) {
>> +        return NULL;
>> +    }
>> +
>> +    return &x->ivt[idx];
>
> Should be idx - int_base, no?

no, not in the allocator model I have chosen. The IRQ numbers are
exposed to the guest with their offset. But this is another discussion
which I would rather continue in another thread.

>> +}
>> +
>> +XiveEQ *xive_get_eq(XIVE *x, uint32_t idx)
>> +{
>> +    if (idx >= x->nr_targets * XIVE_EQ_PRIORITY_COUNT) {
>> +        return NULL;
>> +    }
>> +
>> +    return &x->eqdt[idx];
>> +}
>> +
>> +/* TODO: improve EQ indexing. This is very simple and relies on the
>> + * fact that target (CPU) numbers start at 0 and are contiguous. It
>> + * should be OK for sPAPR.
>> + */
>> +bool xive_eq_for_target(XIVE *x, uint32_t target, uint8_t priority,
>> +                        uint32_t *out_eq_idx)
>> +{
>> +    if (priority > XIVE_PRIORITY_MAX || target >= x->nr_targets) {
>> +        return false;
>> +    }
>> +
>> +    if (out_eq_idx) {
>> +        *out_eq_idx = target * XIVE_EQ_PRIORITY_COUNT + priority;
>> +    }
>> +
>> +    return true;
>
> Seems a clunky interface. Why not return a XiveEQ *, NULL if the
> inputs aren't valid.

Yes. This interface is inherited from OPAL and it is not consistent
with the other xive_get_*() routines. But we are missing a XIVE
internal table for the VPs, which explains the difference. I need to
look at the support for the OS_REPORTING_LINE hcalls before
simplifying.

Thanks,

C.
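
P.S.: for reference, the XiveEQ * variant suggested above would look
something like this. It is only a sketch on top of this patch (the
name is made up), reusing the same bounds checks and the simple EQDT
indexing:

    /* Return the EQ for (target, priority), or NULL when the inputs
     * are out of range. Replaces the bool + out-parameter variant. */
    XiveEQ *xive_get_eq_for_target(XIVE *x, uint32_t target,
                                   uint8_t priority)
    {
        if (priority > XIVE_PRIORITY_MAX || target >= x->nr_targets) {
            return NULL;
        }

        return &x->eqdt[target * XIVE_EQ_PRIORITY_COUNT + priority];
    }

Callers would then test for NULL instead of a bool, which also makes
the routine consistent with xive_get_ive() and xive_get_eq().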