On 11/29/2017 06:11 AM, David Gibson wrote: > On Thu, Nov 23, 2017 at 02:29:42PM +0100, Cédric Le Goater wrote: >> The XIVE interrupt presenter exposes a set of rings, also called >> Thread Interrupt Management Areas (TIMA), to handle priority >> management and interrupt acknowledgment among other things. There is >> one ring per level of privilege, four in all. The one we are >> interested in for the sPAPR machine is the OS ring. >> >> The TIMA is mapped at the same address for each CPU. 'current_cpu' is >> used to retrieve the targeted interrupt presenter object holding the >> cache data of the registers the model use. >> >> Signed-off-by: Cédric Le Goater <c...@kaod.org> >> --- >> hw/intc/spapr_xive.c | 271 >> ++++++++++++++++++++++++++++++++++++++++++++ >> hw/intc/xive-internal.h | 89 +++++++++++++++ >> include/hw/ppc/spapr_xive.h | 11 ++ >> 3 files changed, 371 insertions(+) >> >> diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c >> index b1e3f8710cff..554b25e0884c 100644 >> --- a/hw/intc/spapr_xive.c >> +++ b/hw/intc/spapr_xive.c >> @@ -23,9 +23,166 @@ >> #include "sysemu/dma.h" >> #include "monitor/monitor.h" >> #include "hw/ppc/spapr_xive.h" >> +#include "hw/ppc/xics.h" >> >> #include "xive-internal.h" >> >> +struct sPAPRXiveICP { > > I'd really prefer to avoid calling anything in xive "icp" to avoid > confusion with xics.
OK. The specs refers to the whole as an IVPE : Interrupt Virtualization Presentation Engine. In our model, we use the TIMA cached values of the OS ring and the qemu_irq for the CPU line. Would 'sPAPRXivePresenter' be fine ? >> + DeviceState parent_obj; >> + >> + CPUState *cs; >> + uint8_t tima[TM_RING_COUNT * 0x10]; > > What does the 0x10 represent? #define for clarity, maybe. yes. > Do we need to model the whole range as memory, or just the relevant > pieces with read/write meaning? Yes. we could limit the TIMA and MMIO region to what sPAPR only needs : the OS ring. > >> + uint8_t *tima_os; >> + qemu_irq output; >> +}; >> + >> +static uint64_t spapr_xive_icp_accept(sPAPRXiveICP *icp) >> +{ >> + return 0; >> +} >> + >> +static void spapr_xive_icp_set_cppr(sPAPRXiveICP *icp, uint8_t cppr) >> +{ >> + if (cppr > XIVE_PRIORITY_MAX) { >> + cppr = 0xff; >> + } >> + >> + icp->tima_os[TM_CPPR] = cppr; >> +} >> + >> +/* >> + * Thread Interrupt Management Area MMIO >> + */ >> +static uint64_t spapr_xive_tm_read_special(sPAPRXiveICP *icp, hwaddr offset, >> + unsigned size) >> +{ >> + uint64_t ret = -1; >> + >> + if (offset == TM_SPC_ACK_OS_REG && size == 2) { >> + ret = spapr_xive_icp_accept(icp); >> + } else { >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA read @%" >> + HWADDR_PRIx" size %d\n", offset, size); >> + } >> + >> + return ret; >> +} >> + >> +static uint64_t spapr_xive_tm_read(void *opaque, hwaddr offset, unsigned >> size) >> +{ >> + PowerPCCPU *cpu = POWERPC_CPU(current_cpu); > > So, strictly speaking this could be handled by setting each of the > CPUs address spaces separately, to something with their own TIMA > superimposed on address_space_memory. Ah. I didn't know we could do that. > What you have might be more practical though. well, you will see at the end of the patchset how cpu->intc is assigned. >> + sPAPRXiveICP *icp = SPAPR_XIVE_ICP(cpu->intc); >> + uint64_t ret = -1; >> + int i; >> + >> + if (offset >= TM_SPC_ACK_EBB) { >> + return spapr_xive_tm_read_special(icp, offset, size); >> + } >> + >> + if ((offset & 0xf0) == TM_QW1_OS) { >> + switch (size) { >> + case 1: >> + case 2: >> + case 4: >> + case 8: >> + if (QEMU_IS_ALIGNED(offset, size)) { > > Hm, the MR subsystem doesn't already split unaligned accesses? euh. yes, I might be doing a little too much. >> + ret = 0; >> + for (i = 0; i < size; i++) { >> + ret |= icp->tima[offset + i] << (8 * i); >> + } >> + } else { >> + qemu_log_mask(LOG_GUEST_ERROR, >> + "XIVE: invalid TIMA read alignment @%" >> + HWADDR_PRIx" size %d\n", offset, size); >> + } >> + break; >> + default: >> + g_assert_not_reached(); >> + } >> + } else { >> + qemu_log_mask(LOG_UNIMP, "XIVE: does handle non-OS TIMA ring @%" >> + HWADDR_PRIx"\n", offset); >> + } >> + >> + return ret; >> +} >> + >> +static bool spapr_xive_tm_is_readonly(uint8_t offset) >> +{ >> + /* Let's be optimistic and prepare ground for HV mode support */ >> + switch (offset) { >> + case TM_QW1_OS + TM_CPPR: >> + return false; >> + default: >> + return true; >> + } >> +} >> + >> +static void spapr_xive_tm_write_special(sPAPRXiveICP *icp, hwaddr offset, >> + uint64_t value, unsigned size) >> +{ >> + /* TODO: support TM_SPC_SET_OS_PENDING */ >> + >> + /* TODO: support TM_SPC_ACK_OS_EL */ >> +} >> + >> +static void spapr_xive_tm_write(void *opaque, hwaddr offset, >> + uint64_t value, unsigned size) >> +{ >> + PowerPCCPU *cpu = POWERPC_CPU(current_cpu); >> + sPAPRXiveICP *icp = SPAPR_XIVE_ICP(cpu->intc); >> + int i; >> + >> + if (offset >= TM_SPC_ACK_EBB) { >> + spapr_xive_tm_write_special(icp, offset, value, size); >> + return; >> + } >> + >> + if ((offset & 0xf0) == TM_QW1_OS) { >> + switch (size) { >> + case 1: >> + if (offset == TM_QW1_OS + TM_CPPR) { >> + spapr_xive_icp_set_cppr(icp, value & 0xff); >> + } >> + break; >> + case 4: >> + case 8: >> + if (QEMU_IS_ALIGNED(offset, size)) { >> + for (i = 0; i < size; i++) { >> + if (!spapr_xive_tm_is_readonly(offset + i)) { >> + icp->tima[offset + i] = (value >> (8 * i)) & 0xff; >> + } >> + } >> + } else { >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA write @%" >> + HWADDR_PRIx" size %d\n", offset, size); >> + } >> + break; >> + default: >> + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: invalid TIMA write @%" >> + HWADDR_PRIx" size %d\n", offset, size); >> + } >> + } else { >> + qemu_log_mask(LOG_UNIMP, "XIVE: does handle non-OS TIMA ring @%" >> + HWADDR_PRIx"\n", offset); > > The many qemu_log()s worry me a little. They're not ratelimited, so > the guest could in principle chew through the host's log space. > > IIUC these are very unlikely to be hit in practice, so maybe > tracepoints would be more suitable. ok. >> + } >> +} >> + >> + >> +static const MemoryRegionOps spapr_xive_tm_ops = { >> + .read = spapr_xive_tm_read, >> + .write = spapr_xive_tm_write, >> + .endianness = DEVICE_BIG_ENDIAN, >> + .valid = { >> + .min_access_size = 1, >> + .max_access_size = 8, >> + }, >> + .impl = { >> + .min_access_size = 1, >> + .max_access_size = 8, >> + }, >> +}; >> + >> static void spapr_xive_irq(sPAPRXive *xive, int lisn) >> { >> >> @@ -287,6 +444,11 @@ static void spapr_xive_source_set_irq(void *opaque, int >> lisn, int val) >> #define VC_BAR_SIZE 0x08000000000ull >> #define ESB_SHIFT 16 /* One 64k page. OPAL has two */ >> >> +/* Thread Interrupt Management Area MMIO */ >> +#define TM_BAR_DEFAULT 0x30203180000ull >> +#define TM_SHIFT 16 >> +#define TM_BAR_SIZE (TM_RING_COUNT * (1 << TM_SHIFT)) >> + >> static uint64_t spapr_xive_esb_default_read(void *p, hwaddr offset, >> unsigned size) >> { >> @@ -392,6 +554,14 @@ static void spapr_xive_realize(DeviceState *dev, Error >> **errp) >> (1ull << xive->esb_shift) * xive->nr_irqs); >> memory_region_add_subregion(&xive->esb_mr, 0, &xive->esb_iomem); >> >> + /* TM BAR. Same address for each chip */ >> + xive->tm_base = (P9_MMIO_BASE | TM_BAR_DEFAULT); >> + xive->tm_shift = TM_SHIFT; > > Any reason for this to be a variable? no, we could just use TM_SHIFT. I will look into it. >> + >> + memory_region_init_io(&xive->tm_iomem, OBJECT(xive), &spapr_xive_tm_ops, >> + xive, "xive.tm", TM_BAR_SIZE); >> + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &xive->tm_iomem); >> + >> qemu_register_reset(spapr_xive_reset, dev); >> } >> >> @@ -448,9 +618,110 @@ static const TypeInfo spapr_xive_info = { >> .class_init = spapr_xive_class_init, >> }; >> >> +void spapr_xive_icp_pic_print_info(sPAPRXiveICP *xicp, Monitor *mon) >> +{ >> + int cpu_index = xicp->cs ? xicp->cs->cpu_index : -1; >> + >> + monitor_printf(mon, "CPU %d CPPR=%02x IPB=%02x PIPR=%02x NSR=%02x\n", >> + cpu_index, xicp->tima_os[TM_CPPR], xicp->tima_os[TM_IPB], >> + xicp->tima_os[TM_PIPR], xicp->tima_os[TM_NSR]); >> +} >> + >> +static void spapr_xive_icp_reset(void *dev) >> +{ >> + sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev); >> + >> + memset(xicp->tima, 0, sizeof(xicp->tima)); >> +} >> + >> +static void spapr_xive_icp_realize(DeviceState *dev, Error **errp) >> +{ >> + sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(dev); >> + PowerPCCPU *cpu; >> + CPUPPCState *env; >> + Object *obj; >> + Error *err = NULL; >> + >> + obj = object_property_get_link(OBJECT(dev), ICP_PROP_CPU, &err); >> + if (!obj) { >> + error_propagate(errp, err); >> + error_prepend(errp, "required link '" ICP_PROP_CPU "' not found: "); >> + return; >> + } >> + >> + cpu = POWERPC_CPU(obj); >> + xicp->cs = CPU(obj); >> + >> + env = &cpu->env; >> + switch (PPC_INPUT(env)) { >> + case PPC_FLAGS_INPUT_POWER7: >> + xicp->output = env->irq_inputs[POWER7_INPUT_INT]; >> + break; >> + >> + case PPC_FLAGS_INPUT_970: >> + xicp->output = env->irq_inputs[PPC970_INPUT_INT]; >> + break; > > I really don't think we need to implement XIVE for 970. Indeed. This is a left over from a copy/paste of the ICPState realize routine. >> + >> + default: >> + error_setg(errp, "XIVE interrupt controller does not support " >> + "this CPU bus model"); >> + return; >> + } >> + >> + qemu_register_reset(spapr_xive_icp_reset, dev); >> +} >> + >> +static void spapr_xive_icp_unrealize(DeviceState *dev, Error **errp) >> +{ >> + qemu_unregister_reset(spapr_xive_icp_reset, dev); >> +} >> + >> +static void spapr_xive_icp_init(Object *obj) >> +{ >> + sPAPRXiveICP *xicp = SPAPR_XIVE_ICP(obj); >> + >> + xicp->tima_os = &xicp->tima[TM_QW1_OS]; > > This is a fixed offset, so why store it as a pointer. For the PAPR > guest case, do we even need to model the other rings? No we don't. I will simplify. Thanks, C. > >> +} >> + >> +static bool vmstate_spapr_xive_icp_needed(void *opaque) >> +{ >> + /* TODO check machine XIVE support */ >> + return true; >> +} >> + >> +static const VMStateDescription vmstate_spapr_xive_icp = { >> + .name = TYPE_SPAPR_XIVE_ICP, >> + .version_id = 1, >> + .minimum_version_id = 1, >> + .needed = vmstate_spapr_xive_icp_needed, >> + .fields = (VMStateField[]) { >> + VMSTATE_BUFFER(tima, sPAPRXiveICP), >> + VMSTATE_END_OF_LIST() >> + }, >> +}; >> + >> +static void spapr_xive_icp_class_init(ObjectClass *klass, void *data) >> +{ >> + DeviceClass *dc = DEVICE_CLASS(klass); >> + >> + dc->realize = spapr_xive_icp_realize; >> + dc->unrealize = spapr_xive_icp_unrealize; >> + dc->desc = "sPAPR XIVE Interrupt Presenter"; >> + dc->vmsd = &vmstate_spapr_xive_icp; >> +} >> + >> +static const TypeInfo xive_icp_info = { >> + .name = TYPE_SPAPR_XIVE_ICP, >> + .parent = TYPE_DEVICE, >> + .instance_size = sizeof(sPAPRXiveICP), >> + .instance_init = spapr_xive_icp_init, >> + .class_init = spapr_xive_icp_class_init, >> +}; >> + >> static void spapr_xive_register_types(void) >> { >> type_register_static(&spapr_xive_info); >> + type_register_static(&xive_icp_info); >> } >> >> type_init(spapr_xive_register_types) >> diff --git a/hw/intc/xive-internal.h b/hw/intc/xive-internal.h >> index bea88d82992c..7d329f203a9b 100644 >> --- a/hw/intc/xive-internal.h >> +++ b/hw/intc/xive-internal.h >> @@ -24,6 +24,93 @@ >> #define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be)) | \ >> PPC_BIT32(bs)) >> >> +/* >> + * Thread Management (aka "TM") registers > > Because "TM" didn't stand for enough things already :/. > >> + */ >> + >> +/* Number of Thread Management Interrupt Areas */ >> +#define TM_RING_COUNT 4 >> + >> +/* TM register offsets */ >> +#define TM_QW0_USER 0x000 /* All rings */ >> +#define TM_QW1_OS 0x010 /* Ring 0..2 */ >> +#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */ >> +#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */ >> + >> +/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */ >> +#define TM_NSR 0x0 /* + + - + */ >> +#define TM_CPPR 0x1 /* - + - + */ >> +#define TM_IPB 0x2 /* - + + + */ >> +#define TM_LSMFB 0x3 /* - + + + */ >> +#define TM_ACK_CNT 0x4 /* - + - - */ >> +#define TM_INC 0x5 /* - + - + */ >> +#define TM_AGE 0x6 /* - + - + */ >> +#define TM_PIPR 0x7 /* - + - + */ >> + >> +#define TM_WORD0 0x0 >> +#define TM_WORD1 0x4 >> + >> +/* >> + * QW word 2 contains the valid bit at the top and other fields >> + * depending on the QW. >> + */ >> +#define TM_WORD2 0x8 >> +#define TM_QW0W2_VU PPC_BIT32(0) >> +#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1, 31) /* XX 2,31 ? */ >> +#define TM_QW1W2_VO PPC_BIT32(0) >> +#define TM_QW1W2_OS_CAM PPC_BITMASK32(8, 31) >> +#define TM_QW2W2_VP PPC_BIT32(0) >> +#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8, 31) >> +#define TM_QW3W2_VT PPC_BIT32(0) >> +#define TM_QW3W2_LP PPC_BIT32(6) >> +#define TM_QW3W2_LE PPC_BIT32(7) >> +#define TM_QW3W2_T PPC_BIT32(31) >> + >> +/* >> + * In addition to normal loads to "peek" and writes (only when invalid) >> + * using 4 and 8 bytes accesses, the above registers support these >> + * "special" byte operations: >> + * >> + * - Byte load from QW0[NSR] - User level NSR (EBB) >> + * - Byte store to QW0[NSR] - User level NSR (EBB) >> + * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access >> + * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0 >> + * otherwise VT||0000000 >> + * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present) >> + * >> + * Then we have all these "special" CI ops at these offset that trigger >> + * all sorts of side effects: >> + */ >> +#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/ >> +#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */ >> +#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user >> context */ >> +#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user >> + * context */ >> +#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */ >> +#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS >> + * context to reg */ >> +#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate >> Pool >> + * context to reg*/ >> +#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */ >> +#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd >> + * line */ >> +#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */ >> +#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even >> + * line */ >> +#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */ >> +/* XXX more... */ >> + >> +/* NSR fields for the various QW ack types */ >> +#define TM_QW0_NSR_EB PPC_BIT8(0) >> +#define TM_QW1_NSR_EO PPC_BIT8(0) >> +#define TM_QW3_NSR_HE PPC_BITMASK8(0, 1) >> +#define TM_QW3_NSR_HE_NONE 0 >> +#define TM_QW3_NSR_HE_POOL 1 >> +#define TM_QW3_NSR_HE_PHYS 2 >> +#define TM_QW3_NSR_HE_LSI 3 >> +#define TM_QW3_NSR_I PPC_BIT8(2) >> +#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3, 7) >> + >> /* IVE/EAS >> * >> * One per interrupt source. Targets that interrupt to a given EQ >> @@ -44,6 +131,8 @@ typedef struct XiveIVE { >> #define IVE_EQ_DATA PPC_BITMASK(33, 63) /* Data written to the EQ >> */ >> } XiveIVE; >> >> +#define XIVE_PRIORITY_MAX 7 >> + >> void spapr_xive_reset(void *dev); >> XiveIVE *spapr_xive_get_ive(sPAPRXive *xive, uint32_t lisn); >> >> diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h >> index 7a308fb4db2b..6e8a189e723f 100644 >> --- a/include/hw/ppc/spapr_xive.h >> +++ b/include/hw/ppc/spapr_xive.h >> @@ -23,10 +23,15 @@ >> >> typedef struct sPAPRXive sPAPRXive; >> typedef struct XiveIVE XiveIVE; >> +typedef struct sPAPRXiveICP sPAPRXiveICP; >> >> #define TYPE_SPAPR_XIVE "spapr-xive" >> #define SPAPR_XIVE(obj) OBJECT_CHECK(sPAPRXive, (obj), TYPE_SPAPR_XIVE) >> >> +#define TYPE_SPAPR_XIVE_ICP "spapr-xive-icp" >> +#define SPAPR_XIVE_ICP(obj) \ >> + OBJECT_CHECK(sPAPRXiveICP, (obj), TYPE_SPAPR_XIVE_ICP) >> + >> struct sPAPRXive { >> SysBusDevice parent; >> >> @@ -57,6 +62,11 @@ struct sPAPRXive { >> hwaddr esb_base; >> MemoryRegion esb_mr; >> MemoryRegion esb_iomem; >> + >> + /* TIMA memory region */ >> + uint32_t tm_shift; >> + hwaddr tm_base; >> + MemoryRegion tm_iomem; >> }; >> >> static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, int lisn) >> @@ -67,5 +77,6 @@ static inline bool spapr_xive_irq_is_lsi(sPAPRXive *xive, >> int lisn) >> bool spapr_xive_irq_set(sPAPRXive *xive, uint32_t lisn, bool lsi); >> bool spapr_xive_irq_unset(sPAPRXive *xive, uint32_t lisn); >> void spapr_xive_pic_print_info(sPAPRXive *xive, Monitor *mon); >> +void spapr_xive_icp_pic_print_info(sPAPRXiveICP *xicp, Monitor *mon); >> >> #endif /* PPC_SPAPR_XIVE_H */ >