On 14/10/2016 01:29, David Gibson wrote: > From 98f07c09c6d72218800d6cfbf44b973a88ece2aa Mon Sep 17 00:00:00 2001 > From: David Gibson <da...@gibson.dropbear.id.au> > Date: Fri, 14 Oct 2016 10:21:00 +1100 > Subject: [PATCH] spapr: Improved placement of PCI host bridges in guest memory > map > > Currently, the MMIO space for accessing PCI on pseries guests begins at > 1 TiB in guest address space. Each PCI host bridge (PHB) has a 64 GiB > chunk of address space in which it places its outbound PIO and 32-bit and > 64-bit MMIO windows. > > This scheme as several problems: > - It limits guest RAM to 1 TiB (though we have a limited fix for this > now) > - It limits the total MMIO window to 64 GiB. This is not always enough > for some of the large nVidia GPGPU cards > - Putting all the windows into a single 64 GiB area means that naturally > aligning things within there will waste more address space. > In addition there was a miscalculation in some of the defaults, which meant > that the MMIO windows for each PHB actually slightly overran the 64 GiB > region for that PHB. We got away without nasty consequences because > the overrun fit within an unused area at the beginning of the next PHB's > region, but it's not pretty. > > This patch implements a new scheme which addresses those problems, and is > also closer to what bare metal hardware and pHyp guests generally use. > > Because some guest versions (including most current distro kernels) can't > access PCI MMIO above 64 TiB, we put all the PCI windows between 32 TiB and > 64 TiB. This is broken into 1 TiB chunks. The 1 TiB contains the PIO > (64 kiB) and 32-bit MMIO (2 GiB) windows for all of the PHBs. Each > subsequent TiB chunk contains a naturally aligned 64-bit MMIO window for > one PHB each. > > This reduces the number of allowed PHBs (without full manual configuration > of all the windows) from 256 to 31, but this should still be plenty in > practice. > > We also change some of the default window sizes for manually configured > PHBs to saner values. > > Finally we adjust some tests and libqos so that it correctly uses the new > default locations. Ideally it would parse the device tree given to the > guest, but that's a more complex problem for another time. > > Signed-off-by: David Gibson <da...@gibson.dropbear.id.au>
I really like this new version. Reviewed-by: Laurent Vivier <lviv...@redhat.com> Laurent > --- > hw/ppc/spapr.c | 121 > +++++++++++++++++++++++++++++++++----------- > hw/ppc/spapr_pci.c | 5 +- > include/hw/pci-host/spapr.h | 8 ++- > tests/endianness-test.c | 3 +- > tests/libqos/pci-spapr.c | 9 ++-- > tests/spapr-phb-test.c | 2 +- > 6 files changed, 108 insertions(+), 40 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 8db3657..4bdf15b 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -2375,31 +2375,37 @@ static void spapr_phb_placement(sPAPRMachineState > *spapr, uint32_t index, > hwaddr *mmio32, hwaddr *mmio64, > unsigned n_dma, uint32_t *liobns, Error > **errp) > { > + /* > + * New-style PHB window placement. > + * > + * Goals: Gives large (1TiB), naturally aligned 64-bit MMIO window > + * for each PHB, in addition to 2GiB 32-bit MMIO and 64kiB PIO > + * windows. > + * > + * Some guest kernels can't work with MMIO windows above 1<<46 > + * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB > + * > + * 32TiB..33TiB contains the PIO and 32-bit MMIO windows for all > + * PHBs. 33..34TiB has the 64-bit MMIO window for PHB0, 34..35 > + * has the 64-bit window for PHB1 and so forth. > + */ > const uint64_t base_buid = 0x800000020000000ULL; > - const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */ > - const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */ > - const hwaddr pio_offset = 0x80000000; /* 2 GiB */ > - const uint32_t max_index = 255; > - const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */ > - > - uint64_t ram_top = MACHINE(spapr)->ram_size; > - hwaddr phb0_base, phb_base; > + const int max_phbs = > + (SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / SPAPR_PCI_MEM64_WIN_SIZE - 1; > int i; > > - /* Do we have hotpluggable memory? */ > - if (MACHINE(spapr)->maxram_size > ram_top) { > - /* Can't just use maxram_size, because there may be an > - * alignment gap between normal and hotpluggable memory > - * regions */ > - ram_top = spapr->hotplug_memory.base + > - memory_region_size(&spapr->hotplug_memory.mr); > - } > - > - phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment); > + /* Sanity check natural alignments */ > + QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0); > + QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0); > + QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) > != 0); > + QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != > 0); > + /* Sanity check bounds */ > + QEMU_BUILD_BUG_ON((max_phbs * SPAPR_PCI_IO_WIN_SIZE) > > SPAPR_PCI_MEM32_WIN_SIZE); > + QEMU_BUILD_BUG_ON((max_phbs * SPAPR_PCI_MEM32_WIN_SIZE) > > SPAPR_PCI_MEM64_WIN_SIZE); > > - if (index > max_index) { > + if (index >= max_phbs) { > error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)", > - max_index); > + max_phbs - 1); > return; > } > > @@ -2408,14 +2414,9 @@ static void spapr_phb_placement(sPAPRMachineState > *spapr, uint32_t index, > liobns[i] = SPAPR_PCI_LIOBN(index, i); > } > > - phb_base = phb0_base + index * phb_spacing; > - *pio = phb_base + pio_offset; > - *mmio32 = phb_base + mmio_offset; > - /* > - * We don't set the 64-bit MMIO window, relying on the PHB's > - * fallback behaviour of automatically splitting a large "32-bit" > - * window into contiguous 32-bit and 64-bit windows > - */ > + *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE; > + *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE; > + *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE; > } > > static void spapr_machine_class_init(ObjectClass *oc, void *data) > @@ -2519,8 +2520,67 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", true); > /* > * pseries-2.7 > */ > -#define SPAPR_COMPAT_2_7 \ > - HW_COMPAT_2_7 \ > +#define SPAPR_COMPAT_2_7 \ > + HW_COMPAT_2_7 \ > + { \ > + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE, \ > + .property = "mem_win_size", \ > + .value = stringify(SPAPR_PCI_2_7_MMIO_WIN_SIZE),\ > + }, \ > + { \ > + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE, \ > + .property = "mem64_win_size", \ > + .value = "0", \ > + }, > + > +static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index, > + uint64_t *buid, hwaddr *pio, > + hwaddr *mmio32, hwaddr *mmio64, > + unsigned n_dma, uint32_t *liobns, Error **errp) > +{ > + /* Legacy PHB placement for pseries-2.7 and earlier machine types */ > + const uint64_t base_buid = 0x800000020000000ULL; > + const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */ > + const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */ > + const hwaddr pio_offset = 0x80000000; /* 2 GiB */ > + const uint32_t max_index = 255; > + const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */ > + > + uint64_t ram_top = MACHINE(spapr)->ram_size; > + hwaddr phb0_base, phb_base; > + int i; > + > + /* Do we have hotpluggable memory? */ > + if (MACHINE(spapr)->maxram_size > ram_top) { > + /* Can't just use maxram_size, because there may be an > + * alignment gap between normal and hotpluggable memory > + * regions */ > + ram_top = spapr->hotplug_memory.base + > + memory_region_size(&spapr->hotplug_memory.mr); > + } > + > + phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment); > + > + if (index > max_index) { > + error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)", > + max_index); > + return; > + } > + > + *buid = base_buid + index; > + for (i = 0; i < n_dma; ++i) { > + liobns[i] = SPAPR_PCI_LIOBN(index, i); > + } > + > + phb_base = phb0_base + index * phb_spacing; > + *pio = phb_base + pio_offset; > + *mmio32 = phb_base + mmio_offset; > + /* > + * We don't set the 64-bit MMIO window, relying on the PHB's > + * fallback behaviour of automatically splitting a large "32-bit" > + * window into contiguous 32-bit and 64-bit windows > + */ > +} > > static void spapr_machine_2_7_instance_options(MachineState *machine) > { > @@ -2533,6 +2593,7 @@ static void > spapr_machine_2_7_class_options(MachineClass *mc) > spapr_machine_2_8_class_options(mc); > smc->tcg_default_cpu = "POWER7"; > SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_7); > + smc->phb_placement = phb_placement_2_7; > } > > DEFINE_SPAPR_MACHINE(2_7, "2.7", false); > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 31ca6fa..3ef6a26 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -1565,9 +1565,10 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1), > DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1), > DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size, > - SPAPR_PCI_MMIO_WIN_SIZE), > + SPAPR_PCI_MEM32_WIN_SIZE), > DEFINE_PROP_UINT64("mem64_win_addr", sPAPRPHBState, mem64_win_addr, -1), > - DEFINE_PROP_UINT64("mem64_win_size", sPAPRPHBState, mem64_win_size, 0), > + DEFINE_PROP_UINT64("mem64_win_size", sPAPRPHBState, mem64_win_size, > + SPAPR_PCI_MEM64_WIN_SIZE), > DEFINE_PROP_UINT64("mem64_win_pciaddr", sPAPRPHBState, mem64_win_pciaddr, > -1), > DEFINE_PROP_UINT64("io_win_addr", sPAPRPHBState, io_win_addr, -1), > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 23dfb09..b92c1b5 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -84,8 +84,14 @@ struct sPAPRPHBState { > #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL > #define SPAPR_PCI_MEM32_WIN_SIZE \ > ((1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET) > +#define SPAPR_PCI_MEM64_WIN_SIZE 0x10000000000ULL /* 1 TiB */ > > -#define SPAPR_PCI_MMIO_WIN_SIZE 0xf80000000 > +/* Without manual configuration, all PCI outbound windows will be > + * within this range */ > +#define SPAPR_PCI_BASE (1ULL << 45) /* 32 TiB */ > +#define SPAPR_PCI_LIMIT (1ULL << 46) /* 64 TiB */ > + > +#define SPAPR_PCI_2_7_MMIO_WIN_SIZE 0xf80000000 > #define SPAPR_PCI_IO_WIN_SIZE 0x10000 > > #define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL > diff --git a/tests/endianness-test.c b/tests/endianness-test.c > index b7a120e..cf8d41b 100644 > --- a/tests/endianness-test.c > +++ b/tests/endianness-test.c > @@ -38,7 +38,8 @@ static const TestCase test_cases[] = { > { "ppc", "prep", 0x80000000, .bswap = true }, > { "ppc", "bamboo", 0xe8000000, .bswap = true, .superio = "i82378" }, > { "ppc64", "mac99", 0xf2000000, .bswap = true, .superio = "i82378" }, > - { "ppc64", "pseries", 0x10080000000ULL, > + { "ppc64", "pseries", (1ULL << 45), .bswap = true, .superio = "i82378" }, > + { "ppc64", "pseries-2.7", 0x10080000000ULL, > .bswap = true, .superio = "i82378" }, > { "sh4", "r2d", 0xfe240000, .superio = "i82378" }, > { "sh4eb", "r2d", 0xfe240000, .bswap = true, .superio = "i82378" }, > diff --git a/tests/libqos/pci-spapr.c b/tests/libqos/pci-spapr.c > index 558dfc3..2eaaf91 100644 > --- a/tests/libqos/pci-spapr.c > +++ b/tests/libqos/pci-spapr.c > @@ -235,10 +235,9 @@ static void qpci_spapr_iounmap(QPCIBus *bus, void *data) > /* FIXME */ > } > > -#define SPAPR_PCI_WINDOW_BASE 0x10000000000ULL > -#define SPAPR_PCI_MMIO32_WIN_OFF 0xA0000000 > +#define SPAPR_PCI_BASE (1ULL << 45) > + > #define SPAPR_PCI_MMIO32_WIN_SIZE 0x80000000 /* 2 GiB */ > -#define SPAPR_PCI_IO_WIN_OFF 0x80000000 > #define SPAPR_PCI_IO_WIN_SIZE 0x10000 > > QPCIBus *qpci_init_spapr(QGuestAllocator *alloc) > @@ -273,12 +272,12 @@ QPCIBus *qpci_init_spapr(QGuestAllocator *alloc) > * get the window locations */ > ret->buid = 0x800000020000000ULL; > > - ret->pio_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_IO_WIN_OFF; > + ret->pio_cpu_base = SPAPR_PCI_BASE; > ret->pio.pci_base = 0; > ret->pio.size = SPAPR_PCI_IO_WIN_SIZE; > > /* 32-bit portion of the MMIO window is at PCI address 2..4 GiB */ > - ret->mmio32_cpu_base = SPAPR_PCI_WINDOW_BASE + SPAPR_PCI_MMIO32_WIN_OFF; > + ret->mmio32_cpu_base = SPAPR_PCI_BASE + SPAPR_PCI_MMIO32_WIN_SIZE; > ret->mmio32.pci_base = 0x80000000; /* 2 GiB */ > ret->mmio32.size = SPAPR_PCI_MMIO32_WIN_SIZE; > > diff --git a/tests/spapr-phb-test.c b/tests/spapr-phb-test.c > index 21004a7..d3522ea 100644 > --- a/tests/spapr-phb-test.c > +++ b/tests/spapr-phb-test.c > @@ -25,7 +25,7 @@ int main(int argc, char **argv) > g_test_init(&argc, &argv, NULL); > qtest_add_func("/spapr-phb/device", test_phb_device); > > - qtest_start("-device " TYPE_SPAPR_PCI_HOST_BRIDGE ",index=100"); > + qtest_start("-device " TYPE_SPAPR_PCI_HOST_BRIDGE ",index=30"); > > ret = g_test_run(); > >