On Mon, 7 Feb 2022 20:24:20 +0000 Joao Martins <joao.m.mart...@oracle.com> wrote:
> It is assumed that the whole GPA space is available to be DMA > addressable, within a given address space limit, except for a > tiny region before the 4G. Since Linux v5.4, VFIO validates > whether the selected GPA is indeed valid i.e. not reserved by > IOMMU on behalf of some specific devices or platform-defined > restrictions, and thus failing the ioctl(VFIO_DMA_MAP) with > -EINVAL. > > AMD systems with an IOMMU are examples of such platforms and > particularly may only have these ranges as allowed: > > 0000000000000000 - 00000000fedfffff (0 .. 3.982G) > 00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G) > 0000010000000000 - ffffffffffffffff (1Tb .. 16Pb[*]) > > We already account for the 4G hole, albeit if the guest is big > enough we will fail to allocate a guest with >1010G due to the > ~12G hole at the 1Tb boundary, reserved for HyperTransport (HT). > > [*] there is another reserved region unrelated to HT that exists > in the 256T boundary in Fam 17h according to Errata #1286, > documented also in "Open-Source Register Reference for AMD Family > 17h Processors (PUB)" > > When creating the region above 4G, take into account that on AMD > platforms the HyperTransport range is reserved and hence it > cannot be used either as GPAs. On those cases rather than > establishing the start of ram-above-4g to be 4G, relocate instead > to 1Tb. See AMD IOMMU spec, section 2.1.2 "IOMMU Logical > Topology", for more information on the underlying restriction of > IOVAs. > > After accounting for the 1Tb hole on AMD hosts, mtree should > look like: > > 0000000000000000-000000007fffffff (prio 0, i/o): > alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff > 0000010000000000-000001ff7fffffff (prio 0, i/o): > alias ram-above-4g @pc.ram 0000000080000000-000000ffffffffff > > If the relocation is done, we also add the reserved HT > e820 range as reserved. 
> > Suggested-by: Igor Mammedov <imamm...@redhat.com> > Signed-off-by: Joao Martins <joao.m.mart...@oracle.com> > --- > hw/i386/pc.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ > target/i386/cpu.h | 4 +++ > 2 files changed, 70 insertions(+) > > diff --git a/hw/i386/pc.c b/hw/i386/pc.c > index 7de0e87f4a3f..b060aedd38f3 100644 > --- a/hw/i386/pc.c > +++ b/hw/i386/pc.c > @@ -802,6 +802,65 @@ void xen_load_linux(PCMachineState *pcms) > #define PC_ROM_ALIGN 0x800 > #define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) > > +/* > + * AMD systems with an IOMMU have an additional hole close to the > + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending > + * on kernel version, VFIO may or may not let you DMA map those ranges. > + * Starting Linux v5.4 we validate it, and can't create guests on AMD > machines > + * with certain memory sizes. It's also wrong to use those IOVA ranges > + * in detriment of leading to IOMMU INVALID_DEVICE_REQUEST or worse. > + * The ranges reserved for Hyper-Transport are: > + * > + * FD_0000_0000h - FF_FFFF_FFFFh > + * > + * The ranges represent the following: > + * > + * Base Address Top Address Use > + * > + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space > + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl > + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK > + * FD_F910_0000h FD_F91F_FFFFh System Management > + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables > + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation > + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space > + * FD_FE00_0000h FD_FFFF_FFFFh Configuration > + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages > + * FE_2000_0000h FF_FFFF_FFFFh Reserved > + * > + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", > + * Table 3: Special Address Controls (GPA) for more information. 
> + */ > +#define AMD_HT_START 0xfd00000000UL > +#define AMD_HT_END 0xffffffffffUL > +#define AMD_ABOVE_1TB_START (AMD_HT_END + 1) > +#define AMD_HT_SIZE (AMD_ABOVE_1TB_START - AMD_HT_START) > + > +static void relocate_4g(MachineState *machine, PCMachineState *pcms) Perhaps rename it to x86_update_above_4g_mem_start()? > +{ > + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); > + X86MachineState *x86ms = X86_MACHINE(pcms); > + ram_addr_t device_mem_size = 0; > + uint32_t eax, vendor[3]; > + > + host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]); > + if (!IS_AMD_VENDOR(vendor)) { > + return; > + } > + > + if (pcmc->has_reserved_memory && > + (machine->ram_size < machine->maxram_size)) { > + device_mem_size = machine->maxram_size - machine->ram_size; > + } > + > + if ((x86ms->above_4g_mem_start + x86ms->above_4g_mem_size + > + device_mem_size) < AMD_HT_START) { Should it account for SGX as well? What if the above sum ends up just below AMD_HT_START, so the function returns without adjusting above_4g_mem_start, but the pci64 hole eventually falls into the HT range? Is that the expected behaviour? > + return; > + } > + > + x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; > +} > + > void pc_memory_init(PCMachineState *pcms, > MemoryRegion *system_memory, > MemoryRegion *rom_memory, > @@ -821,6 +880,8 @@ void pc_memory_init(PCMachineState *pcms, > > linux_boot = (machine->kernel_filename != NULL); > > + relocate_4g(machine, pcms); > + > /* > * Split single memory region and use aliases to address portions of it, > * done for backwards compatibility with older qemus. > @@ -831,6 +892,11 @@ void pc_memory_init(PCMachineState *pcms, > 0, x86ms->below_4g_mem_size); > memory_region_add_subregion(system_memory, 0, ram_below_4g); > e820_add_entry(0, x86ms->below_4g_mem_size, E820_RAM); > + > + if (x86ms->above_4g_mem_start == AMD_ABOVE_1TB_START) { > + e820_add_entry(AMD_HT_START, AMD_HT_SIZE, E820_RESERVED); > + } BTW: do we have to add a reservation record for the HT zone? If so, why? 
> if (x86ms->above_4g_mem_size > 0) { > ram_above_4g = g_malloc(sizeof(*ram_above_4g)); > memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index 9911d7c8711b..1acebc569b02 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -906,6 +906,10 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; > #define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \ > (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \ > (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3) > +#define IS_AMD_VENDOR(vendor) ((vendor[0]) == CPUID_VENDOR_AMD_1 && \ > + (vendor[1]) == CPUID_VENDOR_AMD_2 && \ > + (vendor[2]) == CPUID_VENDOR_AMD_3) > + > > #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */ > #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */