> assert(piecetwosize <= holesize); > > piecetwosize = MIN(above_4g_mem_size, piecetwosize); > if ((above_4g_mem_size - piecetwosize) > 0) { > memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", > ram, 0x100000000ULL, > above_4g_mem_size - piecetwosize); > memory_region_add_subregion(system_memory, 0x100000000ULL, > ram_above_4g); > } else { > g_free(ram_above_4g); > } > memory_region_init_alias(ram_above_4g_piecetwo, NULL, > "ram-above-4g-piecetwo", ram, > 0x100000000ULL - holesize, piecetwosize); > memory_region_add_subregion(system_memory, > 0x100000000ULL + > above_4g_mem_size - piecetwosize, > ram_above_4g_piecetwo);
There is still a small problem in that the 2MB rounding must not be done for old machine types. I did a really careful review of the code and everything else looks okay to me. However, it grew by accretion from v1, and it took me a really long time to figure it out... I adjusted it a bit, and the result seems easier for me to understand. Here's the hw/i386/pc.c part of the patch (the patch from v6 is unreadable): diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 12c436e..f2fd138 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1156,8 +1156,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, { int linux_boot, i; MemoryRegion *ram, *option_rom_mr; - MemoryRegion *ram_below_4g, *ram_above_4g; + MemoryRegion *ram_below_4g, *ram_above_4g_pieceone, *ram_above_4g_piecetwo; FWCfgState *fw_cfg; + uint64_t holesize, pieceonesize, piecetwosize; + uint64_t memsize, align_offset; linux_boot = (kernel_filename != NULL); @@ -1165,26 +1167,74 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, * aliases to address portions of it, mostly for backwards compatibility * with older qemus that used qemu_ram_alloc(). */ + memsize = below_4g_mem_size + above_4g_mem_size; + holesize = 0x100000000ULL - below_4g_mem_size; + + /* If 1GB hugepages are used to back guest RAM, we want the + * physical address 4GB to map to 4GB in the RAM, so that + * memory beyond 4GB is aligned on a 1GB boundary, at the + * host physical address space. Thus, the ram block range + * [holestart, 4GB] is mapped to the last holesize bytes of RAM: + * + * 0 h 4G memsize-holesize + * + * contiguous-ram-block [xxxxxx][yyy][zzzzz] + * '-----------. + * guest-addr-space [xxxxxx] [zzzzz][yyy] + * + * This is only done in new-enough machine types, and of course + * it is only necessary if the [zzzzz] block exists at all. + */ + if (guest_info->gb_align && above_4g_mem_size > holesize) { + /* Round the allocation up to 2 MB to use more hugepages. 
+ * Remove the slack from the [yyy] piece so that pieceonesize + * (and thus the start of piecetwo) remains aligned. + */ + align_offset = ROUND_UP(memsize, 1UL << 21) - memsize; + piecetwosize = holesize - align_offset; + } else { + /* There's no "piece one", all memory above 4G starts + * at below_4g_mem_size in the RAM block. Also no need + * to align anything. + */ + align_offset = 0; + piecetwosize = above_4g_mem_size; + } + ram = g_malloc(sizeof(*ram)); - memory_region_init_ram(ram, NULL, "pc.ram", - below_4g_mem_size + above_4g_mem_size); + memory_region_init_ram(ram, NULL, "pc.ram", memsize + align_offset); vmstate_register_ram_global(ram); *ram_memory = ram; + ram_below_4g = g_malloc(sizeof(*ram_below_4g)); memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, 0, below_4g_mem_size); memory_region_add_subregion(system_memory, 0, ram_below_4g); + + pieceonesize = above_4g_mem_size - piecetwosize; + if (pieceonesize) { + ram_above_4g_pieceone = g_malloc(sizeof(*ram_above_4g_pieceone)); + memory_region_init_alias(ram_above_4g_pieceone, NULL, + "ram-above-4g-pieceone", ram, + 0x100000000ULL, pieceonesize); + memory_region_add_subregion(system_memory, 0x100000000ULL, + ram_above_4g_pieceone); + } + if (piecetwosize) { + ram_above_4g_piecetwo = g_malloc(sizeof(*ram_above_4g_piecetwo)); + memory_region_init_alias(ram_above_4g_piecetwo, NULL, + "ram-above-4g-piecetwo", ram, + below_4g_mem_size, piecetwosize); + memory_region_add_subregion(system_memory, + 0x100000000ULL + pieceonesize, + ram_above_4g_piecetwo); + } + e820_add_entry(0, below_4g_mem_size, E820_RAM); if (above_4g_mem_size > 0) { - ram_above_4g = g_malloc(sizeof(*ram_above_4g)); - memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram, - below_4g_mem_size, above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, - ram_above_4g); e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM); } - /* Initialize PC system firmware */ 
pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw); Tests: -m 3585 (without gb_align) 0000000100000000-00000001000fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000e00fffff -m 4096 (without gb_align) 0000000100000000-000000011fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff -m 4097 (without gb_align) 0000000100000000-00000001200fffff (prio 0, RW): alias ram-above-4g @pc.ram 00000000e0000000-00000001000fffff -m 4097 (with gb_align, both high regions 2MB aligned, unused MB of RAM at 0xfff00000) 0000000100000000-00000001001fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001001fffff 0000000100200000-00000001200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffefffff -m 8192 (without gb_align) 0000000100000000-000000021fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000001ffffffff -m 8192 (with gb_align) 0000000100000000-00000001ffffffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000001ffffffff 0000000200000000-000000021fffffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffffffff -m 8193 (without gb_align) 0000000100000000-00000002200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000002000fffff -m 8193 (with gb_align) 0000000100000000-00000002001fffff (prio 0, RW): alias ram-above-4g @pc.ram 0000000100000000-00000002001fffff 0000000200200000-00000002200fffff (prio 0, RW): alias ram-above-4g-piecetwo @pc.ram 00000000e0000000-00000000ffefffff Ok to apply this version? Paolo