On 07/25/2017 04:16 AM, David Gibson wrote: > On Mon, Jul 24, 2017 at 02:52:29PM +0200, Cédric Le Goater wrote: >> On 07/19/2017 05:24 AM, David Gibson wrote: >>> On Wed, Jul 05, 2017 at 07:13:18PM +0200, Cédric Le Goater wrote: >>>> The XIVE interrupt controller of the POWER9 uses a set of tables to >>>> redirect exceptions from event sources to CPU threads. Among which we >>>> choose to model: >>>> >>>> - the State Bit Entries (SBE), also known as Event State Buffer >>>> (ESB). This is a two bit state machine for each event source which >>>> is used to trigger events. The bits are named "P" (pending) and "Q" >>>> (queued) and can be controlled by MMIO. >>>> >>>> - the Interrupt Virtualization Entry (IVE) table, also known as Event >>>> Assignment Structure (EAS). This table is indexed by the IRQ number >>>> and is looked up to find the Event Queue associated with a >>>> triggered event. >>>> >>>> - the Event Queue Descriptor (EQD) table, also known as Event >>>> Notification Descriptor (END). The EQD contains fields that specify >>>> the Event Queue on which event data is posted (and later pulled by >>>> the OS) and also a target (or VPD) to notify. >>>> >>>> An additional table was not modeled but we might need to, to support >>>> the H_INT_SET_OS_REPORTING_LINE hcall: >>>> >>>> - the Virtual Processor Descriptor (VPD) table, also known as >>>> Notification Virtual Target (NVT). >>>> >>>> The XIVE object is expanded with the tables described above. The size >>>> of each table depends on the number of provisioned IRQ and the maximum >>>> number of CPUs in the system. The indexing is very basic and might >>>> need to be improved for the EQs. 
>>>> >>>> Signed-off-by: Cédric Le Goater <c...@kaod.org> >>>> --- >>>> hw/intc/xive-internal.h | 95 >>>> +++++++++++++++++++++++++++++++++++++++++++++++++ >>>> hw/intc/xive.c | 72 +++++++++++++++++++++++++++++++++++++ >>>> 2 files changed, 167 insertions(+) >>>> >>>> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h >>>> index 155c2dcd6066..8e755aa88a14 100644 >>>> --- a/hw/intc/xive-internal.h >>>> +++ b/hw/intc/xive-internal.h >>>> @@ -11,6 +11,89 @@ >>>> >>>> #include <hw/sysbus.h> >>>> >>>> +/* Utilities to manipulate these (originaly from OPAL) */ >>>> +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) >>>> +#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m)) >>>> +#define SETFIELD(m, v, val) \ >>>> + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) >>>> + >>>> +#define PPC_BIT(bit) (0x8000000000000000UL >> (bit)) >>>> +#define PPC_BIT32(bit) (0x80000000UL >> (bit)) >>>> +#define PPC_BIT8(bit) (0x80UL >> (bit)) >>>> +#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | >>>> PPC_BIT(bs)) >>>> +#define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be)) | \ >>>> + PPC_BIT32(bs)) >>>> + >>>> +/* IVE/EAS >>>> + * >>>> + * One per interrupt source. Targets that interrupt to a given EQ >>>> + * and provides the corresponding logical interrupt number (EQ data) >>>> + * >>>> + * We also map this structure to the escalation descriptor inside >>>> + * an EQ, though in that case the valid and masked bits are not used. 
>>>> + */ >>>> +typedef struct XiveIVE { >>>> + /* Use a single 64-bit definition to make it easier to >>>> + * perform atomic updates >>>> + */ >>>> + uint64_t w; >>>> +#define IVE_VALID PPC_BIT(0) >>>> +#define IVE_EQ_BLOCK PPC_BITMASK(4, 7) /* Destination EQ block# >>>> */ >>>> +#define IVE_EQ_INDEX PPC_BITMASK(8, 31) /* Destination EQ index >>>> */ >>>> +#define IVE_MASKED PPC_BIT(32) /* Masked */ >>>> +#define IVE_EQ_DATA PPC_BITMASK(33, 63) /* Data written to the >>>> EQ */ >>>> +} XiveIVE; >>>> + >>>> +/* EQ */ >>>> +typedef struct XiveEQ { >>>> + uint32_t w0; >>>> +#define EQ_W0_VALID PPC_BIT32(0) >>>> +#define EQ_W0_ENQUEUE PPC_BIT32(1) >>>> +#define EQ_W0_UCOND_NOTIFY PPC_BIT32(2) >>>> +#define EQ_W0_BACKLOG PPC_BIT32(3) >>>> +#define EQ_W0_PRECL_ESC_CTL PPC_BIT32(4) >>>> +#define EQ_W0_ESCALATE_CTL PPC_BIT32(5) >>>> +#define EQ_W0_END_OF_INTR PPC_BIT32(6) >>>> +#define EQ_W0_QSIZE PPC_BITMASK32(12, 15) >>>> +#define EQ_W0_SW0 PPC_BIT32(16) >>>> +#define EQ_W0_FIRMWARE EQ_W0_SW0 /* Owned by FW */ >>>> +#define EQ_QSIZE_4K 0 >>>> +#define EQ_QSIZE_64K 4 >>>> +#define EQ_W0_HWDEP PPC_BITMASK32(24, 31) >>>> + uint32_t w1; >>>> +#define EQ_W1_ESn PPC_BITMASK32(0, 1) >>>> +#define EQ_W1_ESn_P PPC_BIT32(0) >>>> +#define EQ_W1_ESn_Q PPC_BIT32(1) >>>> +#define EQ_W1_ESe PPC_BITMASK32(2, 3) >>>> +#define EQ_W1_ESe_P PPC_BIT32(2) >>>> +#define EQ_W1_ESe_Q PPC_BIT32(3) >>>> +#define EQ_W1_GENERATION PPC_BIT32(9) >>>> +#define EQ_W1_PAGE_OFF PPC_BITMASK32(10, 31) >>>> + uint32_t w2; >>>> +#define EQ_W2_MIGRATION_REG PPC_BITMASK32(0, 3) >>>> +#define EQ_W2_OP_DESC_HI PPC_BITMASK32(4, 31) >>>> + uint32_t w3; >>>> +#define EQ_W3_OP_DESC_LO PPC_BITMASK32(0, 31) >>>> + uint32_t w4; >>>> +#define EQ_W4_ESC_EQ_BLOCK PPC_BITMASK32(4, 7) >>>> +#define EQ_W4_ESC_EQ_INDEX PPC_BITMASK32(8, 31) >>>> + uint32_t w5; >>>> +#define EQ_W5_ESC_EQ_DATA PPC_BITMASK32(1, 31) >>>> + uint32_t w6; >>>> +#define EQ_W6_FORMAT_BIT PPC_BIT32(8) >>>> +#define EQ_W6_NVT_BLOCK PPC_BITMASK32(9, 12) 
>>>> +#define EQ_W6_NVT_INDEX PPC_BITMASK32(13, 31) >>>> + uint32_t w7; >>>> +#define EQ_W7_F0_IGNORE PPC_BIT32(0) >>>> +#define EQ_W7_F0_BLK_GROUPING PPC_BIT32(1) >>>> +#define EQ_W7_F0_PRIORITY PPC_BITMASK32(8, 15) >>>> +#define EQ_W7_F1_WAKEZ PPC_BIT32(0) >>>> +#define EQ_W7_F1_LOG_SERVER_ID PPC_BITMASK32(1, 31) >>>> +} XiveEQ; >>>> + >>>> +#define XIVE_EQ_PRIORITY_COUNT 8 >>>> +#define XIVE_PRIORITY_MAX (XIVE_EQ_PRIORITY_COUNT - 1) >>>> + >>>> struct XIVE { >>>> SysBusDevice parent; >>>> >>>> @@ -23,6 +106,18 @@ struct XIVE { >>>> uint32_t int_max; /* Max index */ >>>> uint32_t int_hw_bot; /* Bottom index of HW IRQ allocator */ >>>> uint32_t int_ipi_top; /* Highest IPI index handed out so far + >>>> 1 */ >>>> + >>>> + /* XIVE internal tables */ >>>> + void *sbe; >>>> + XiveIVE *ivt; >>>> + XiveEQ *eqdt; >>>> }; >>>> >>>> +void xive_reset(void *dev); >>>> +XiveIVE *xive_get_ive(XIVE *x, uint32_t isn); >>>> +XiveEQ *xive_get_eq(XIVE *x, uint32_t idx); >>>> + >>>> +bool xive_eq_for_target(XIVE *x, uint32_t target, uint8_t prio, >>>> + uint32_t *out_eq_idx); >>>> + >>>> #endif /* _INTC_XIVE_INTERNAL_H */ >>>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c >>>> index 5b4ea915d87c..5b14d8155317 100644 >>>> --- a/hw/intc/xive.c >>>> +++ b/hw/intc/xive.c >>>> @@ -35,6 +35,27 @@ >>>> */ >>>> #define MAX_HW_IRQS_ENTRIES (8 * 1024) >>>> >>>> + >>>> +void xive_reset(void *dev) >>>> +{ >>>> + XIVE *x = XIVE(dev); >>>> + int i; >>>> + >>>> + /* SBEs are initialized to 0b01 which corresponds to "ints off" */ >>>> + memset(x->sbe, 0x55, x->int_count / 4); >>> >>> I think strictly this should be a DIV_ROUND_UP to handle the case of >>> int_count not a multiple of 4. >> >> ok. 
>> >>>> + >>>> + /* Clear and mask all valid IVEs */ >>>> + for (i = x->int_base; i < x->int_max; i++) { >>>> + XiveIVE *ive = &x->ivt[i]; >>>> + if (ive->w & IVE_VALID) { >>>> + ive->w = IVE_VALID | IVE_MASKED; >>>> + } >>>> + } >>>> + >>>> + /* clear all EQs */ >>>> + memset(x->eqdt, 0, x->nr_targets * XIVE_EQ_PRIORITY_COUNT * >>>> sizeof(XiveEQ)); >>>> +} >>>> + >>>> static void xive_init(Object *obj) >>>> { >>>> ; >>>> @@ -62,6 +83,19 @@ static void xive_realize(DeviceState *dev, Error **errp) >>>> if (x->int_ipi_top < 0x10) { >>>> x->int_ipi_top = 0x10; >>>> } >>>> + >>>> + /* Allocate SBEs (State Bit Entry). 2 bits, so 4 entries per byte */ >>>> + x->sbe = g_malloc0(x->int_count / 4); >>> >>> And here as well. >> >> yes. >> >>>> + >>>> + /* Allocate the IVT (Interrupt Virtualization Table) */ >>>> + x->ivt = g_malloc0(x->int_count * sizeof(XiveIVE)); >>>> + >>>> + /* Allocate the EQDT (Event Queue Descriptor Table), 8 priorities >>>> + * for each thread in the system */ >>>> + x->eqdt = g_malloc0(x->nr_targets * XIVE_EQ_PRIORITY_COUNT * >>>> + sizeof(XiveEQ)); >>>> + >>>> + qemu_register_reset(xive_reset, dev); >>>> } >>>> >>>> static Property xive_properties[] = { >>>> @@ -92,3 +126,41 @@ static void xive_register_types(void) >>>> } >>>> >>>> type_init(xive_register_types) >>>> + >>>> +XiveIVE *xive_get_ive(XIVE *x, uint32_t lisn) >>>> +{ >>>> + uint32_t idx = lisn; >>>> + >>>> + if (idx < x->int_base || idx >= x->int_max) { >>>> + return NULL; >>>> + } >>>> + >>>> + return &x->ivt[idx]; >>> >>> Should be idx - int_base, no? >> >> no, not in the allocator model I have chosen. The IRQ numbers >> are exposed to the guest with their offset. But this is another >> discussion which I would rather continue in another thread. > > Uh.. but you're using idx to index IVT directly, after verifying that > it lies between int_base and int_max. AFAICT IVT is only allocated > with int_max - int_base entries, so without an offset here you'll > overrun it, won't you?
ah yes, you are right. I got confused because the idx used to be calculated in a different way. Luckily, 'int_base' is zero for the moment. Anyway I need to rework the allocator and the indexing of these tables, it's too complex for sPAPR. Thanks, C. >>>> +} >>>> + >>>> +XiveEQ *xive_get_eq(XIVE *x, uint32_t idx) >>>> +{ >>>> + if (idx >= x->nr_targets * XIVE_EQ_PRIORITY_COUNT) { >>>> + return NULL; >>>> + } >>>> + >>>> + return &x->eqdt[idx]; >>>> +} >>>> + >>>> +/* TODO: improve EQ indexing. This is very simple and relies on the >>>> + * fact that target (CPU) numbers start at 0 and are contiguous. It >>>> + * should be OK for sPAPR. >>>> + */ >>>> +bool xive_eq_for_target(XIVE *x, uint32_t target, uint8_t priority, >>>> + uint32_t *out_eq_idx) >>>> +{ >>>> + if (priority > XIVE_PRIORITY_MAX || target >= x->nr_targets) { >>>> + return false; >>>> + } >>>> + >>>> + if (out_eq_idx) { >>>> + *out_eq_idx = target + priority; >>>> + } >>>> + >>>> + return true; >>> >>> Seems a clunky interface. Why not return a XiveEQ *, NULL if the >>> inputs aren't valud. >> >> Yes. This interface is inherited from OPAL and it's not consistent >> with the other xive_get_*() routines. But we are missing a XIVE >> internal table for VPs which explains the difference. I need to look >> at the support of the OS_REPORTING_LINE hcalls before simplifying. >> >> Thanks, >> >> C. >> >> >