When allocating guest memory for an HVM domain, libxc keeps the P2M mapping for the entirety of the guest memory alive for the duration of domain creation, as xc_dom_image->p2m_host. For guests with large memory (e.g. 3904 GiB), this p2m_host allocation takes more than 7.5 GiB of space, leaving xl susceptible to being OOM-killed on guest creation.
Convert the p2m_host table lookups to an arch-specific function that returns the mapping on-the-fly for x86 HVM guests to avoid this allocation, bringing down xl's memory usage from > 8GiB to < 70Mib for such launches. Signed-off-by: Varad Gautam <v...@amazon.de> --- Applies to stable-4.11+. tools/libxc/include/xc_dom.h | 11 +++- tools/libxc/xc_dom_arm.c | 2 + tools/libxc/xc_dom_core.c | 4 +- tools/libxc/xc_dom_x86.c | 126 ++++++++++++++++++++++++++++--------------- 4 files changed, 99 insertions(+), 44 deletions(-) diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h index 8a66889..43abc0d 100644 --- a/tools/libxc/include/xc_dom.h +++ b/tools/libxc/include/xc_dom.h @@ -131,6 +131,9 @@ struct xc_dom_image { * a hybrid guest this means that it maps GPFNs to GPFNS. * * Note that the input is offset by rambase. + * + * This is not populated for guests that provide an arch-specific + * lookup hook in arch_hooks. */ xen_pfn_t *p2m_host; void *p2m_guest; @@ -274,6 +277,10 @@ struct xc_dom_arch { int arch_private_size; struct xc_dom_arch *next; + + /* arch-specific p2m table lookup to get rid of the p2m_host array stored in + * xc_dom_image. */ + xen_pfn_t (*p2m_host) (struct xc_dom_image *dom, unsigned long idx); }; void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks); @@ -437,7 +444,9 @@ static inline xen_pfn_t xc_dom_p2m(struct xc_dom_image *dom, xen_pfn_t pfn) return pfn; if (pfn < dom->rambase_pfn || pfn >= dom->rambase_pfn + dom->total_pages) return INVALID_MFN; - return dom->p2m_host[pfn - dom->rambase_pfn]; + return dom->arch_hooks->p2m_host ? 
+ dom->arch_hooks->p2m_host(dom, pfn - dom->rambase_pfn) + : dom->p2m_host[pfn - dom->rambase_pfn]; } #endif /* _XC_DOM_H */ diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c index 5b9eca6..b15c6d2 100644 --- a/tools/libxc/xc_dom_arm.c +++ b/tools/libxc/xc_dom_arm.c @@ -547,6 +547,7 @@ static struct xc_dom_arch xc_dom_32 = { .meminit = meminit, .bootearly = bootearly, .bootlate = bootlate, + .p2m_host = NULL, }; static struct xc_dom_arch xc_dom_64 = { @@ -563,6 +564,7 @@ static struct xc_dom_arch xc_dom_64 = { .meminit = meminit, .bootearly = bootearly, .bootlate = bootlate, + .p2m_host = NULL, }; static void __init register_arch_hooks(void) diff --git a/tools/libxc/xc_dom_core.c b/tools/libxc/xc_dom_core.c index 9bd04cb..f3eaae3 100644 --- a/tools/libxc/xc_dom_core.c +++ b/tools/libxc/xc_dom_core.c @@ -985,7 +985,9 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom) __FUNCTION__, dom->p2m_size); p2m_32 = dom->p2m_guest; for ( i = 0; i < dom->p2m_size; i++ ) - if ( dom->p2m_host[i] != INVALID_PFN ) + if ( dom->arch_hooks->p2m_host ) + p2m_32[i] = dom->arch_hooks->p2m_host(dom, i); + else if ( dom->p2m_host[i] != INVALID_PFN ) p2m_32[i] = dom->p2m_host[i]; else p2m_32[i] = (uint32_t) - 1; diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c index 3ab918c..58f9894 100644 --- a/tools/libxc/xc_dom_x86.c +++ b/tools/libxc/xc_dom_x86.c @@ -101,6 +101,10 @@ struct xc_dom_image_x86 { #define MAPPING_MAX 2 struct xc_dom_x86_mapping maps[MAPPING_MAX]; struct xc_dom_params *params; + + /* Used to fake vmemrange information in case vNUMA information was not provided. 
*/ + xen_vmemrange_t dummy_vmemrange[2]; + unsigned int nr_dummy_vmemranges; }; /* get guest IO ABI protocol */ @@ -1252,13 +1256,13 @@ static int meminit_hvm(struct xc_dom_image *dom) unsigned int memflags = 0; int claim_enabled = dom->claim_enabled; uint64_t total_pages; - xen_vmemrange_t dummy_vmemrange[2]; unsigned int dummy_vnode_to_pnode[1]; xen_vmemrange_t *vmemranges; unsigned int *vnode_to_pnode; unsigned int nr_vmemranges, nr_vnodes; xc_interface *xch = dom->xch; uint32_t domid = dom->guest_domid; + struct xc_dom_image_x86 *domx86 = dom->arch_private; if ( nr_pages > target_pages ) memflags |= XENMEMF_populate_on_demand; @@ -1274,25 +1278,26 @@ static int meminit_hvm(struct xc_dom_image *dom) * has no effect on the actual result. */ - dummy_vmemrange[0].start = 0; - dummy_vmemrange[0].end = dom->lowmem_end; - dummy_vmemrange[0].flags = 0; - dummy_vmemrange[0].nid = 0; - nr_vmemranges = 1; + domx86->dummy_vmemrange[0].start = 0; + domx86->dummy_vmemrange[0].end = dom->lowmem_end; + domx86->dummy_vmemrange[0].flags = 0; + domx86->dummy_vmemrange[0].nid = 0; + domx86->nr_dummy_vmemranges = 1; if ( dom->highmem_end > (1ULL << 32) ) { - dummy_vmemrange[1].start = 1ULL << 32; - dummy_vmemrange[1].end = dom->highmem_end; - dummy_vmemrange[1].flags = 0; - dummy_vmemrange[1].nid = 0; + domx86->dummy_vmemrange[1].start = 1ULL << 32; + domx86->dummy_vmemrange[1].end = dom->highmem_end; + domx86->dummy_vmemrange[1].flags = 0; + domx86->dummy_vmemrange[1].nid = 0; - nr_vmemranges++; + domx86->nr_dummy_vmemranges++; } dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE; nr_vnodes = 1; - vmemranges = dummy_vmemrange; + vmemranges = domx86->dummy_vmemrange; + nr_vmemranges = domx86->nr_dummy_vmemranges; vnode_to_pnode = dummy_vnode_to_pnode; } else @@ -1329,25 +1334,6 @@ static int meminit_hvm(struct xc_dom_image *dom) } dom->p2m_size = p2m_size; - dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * - dom->p2m_size); - if ( dom->p2m_host == NULL ) - { - DOMPRINTF("Could not 
allocate p2m"); - goto error_out; - } - - for ( i = 0; i < p2m_size; i++ ) - dom->p2m_host[i] = ((xen_pfn_t)-1); - for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ ) - { - uint64_t pfn; - - for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT; - pfn < vmemranges[vmemid].end >> PAGE_SHIFT; - pfn++ ) - dom->p2m_host[pfn] = pfn; - } /* * Try to claim pages for early warning of insufficient memory available. @@ -1395,8 +1381,12 @@ static int meminit_hvm(struct xc_dom_image *dom) */ if ( dom->device_model ) { + xen_pfn_t pfn_batch[0xa0]; + for ( i = 0; i < 0xa0; i++ ) + pfn_batch[i] = dom->arch_hooks->p2m_host(dom, i); + rc = xc_domain_populate_physmap_exact( - xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]); + xch, domid, 0xa0, 0, memflags, &pfn_batch[0x00]); if ( rc != 0 ) { DOMPRINTF("Could not populate low memory (< 0xA0).\n"); @@ -1439,7 +1429,7 @@ static int meminit_hvm(struct xc_dom_image *dom) if ( count > max_pages ) count = max_pages; - cur_pfn = dom->p2m_host[cur_pages]; + cur_pfn = dom->arch_hooks->p2m_host(dom, cur_pages); /* Take care the corner cases of super page tails */ if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) && @@ -1465,8 +1455,7 @@ static int meminit_hvm(struct xc_dom_image *dom) xen_pfn_t sp_extents[nr_extents]; for ( i = 0; i < nr_extents; i++ ) - sp_extents[i] = - dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)]; + sp_extents[i] = dom->arch_hooks->p2m_host(dom, cur_pages+(i<<SUPERPAGE_1GB_SHIFT)); done = xc_domain_populate_physmap(xch, domid, nr_extents, SUPERPAGE_1GB_SHIFT, @@ -1505,8 +1494,7 @@ static int meminit_hvm(struct xc_dom_image *dom) xen_pfn_t sp_extents[nr_extents]; for ( i = 0; i < nr_extents; i++ ) - sp_extents[i] = - dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)]; + sp_extents[i] = dom->arch_hooks->p2m_host(dom, cur_pages+(i<<SUPERPAGE_2MB_SHIFT)); done = xc_domain_populate_physmap(xch, domid, nr_extents, SUPERPAGE_2MB_SHIFT, @@ -1521,14 +1509,39 @@ static int meminit_hvm(struct xc_dom_image *dom) } } } - /* Fall 
back to 4kB extents. */ if ( count != 0 ) { - rc = xc_domain_populate_physmap_exact( - xch, domid, count, 0, new_memflags, &dom->p2m_host[cur_pages]); - cur_pages += count; - stat_normal_pages += count; + unsigned long nr_extents; + xen_pfn_t *pfn_batch; + + pfn_batch = calloc(SUPERPAGE_1GB_NR_PFNS, sizeof(*pfn_batch)); + if ( !pfn_batch ) { + DOMPRINTF("Could not allocate memory to construct physmap batch."); + rc = -1; + goto error_out; + } + + while ( count > 0 ) { + for ( i = 0; i < count && i < SUPERPAGE_1GB_NR_PFNS; i++) + pfn_batch[i] = dom->arch_hooks->p2m_host(dom, cur_pages+i); + + nr_extents = count > SUPERPAGE_1GB_NR_PFNS ? SUPERPAGE_1GB_NR_PFNS : count; + rc = xc_domain_populate_physmap_exact(xch, domid, nr_extents, + 0, new_memflags, &pfn_batch[0]); + if ( rc != 0 ) { + DOMPRINTF("Could not populate physmap batch."); + free(pfn_batch); + rc = -1; + goto error_out; + } + + stat_normal_pages += nr_extents; + cur_pages += nr_extents; + count -= nr_extents; + } + + free(pfn_batch); } } @@ -1780,6 +1793,31 @@ static int bootlate_hvm(struct xc_dom_image *dom) return 0; } +static xen_pfn_t p2m_host_hvm(struct xc_dom_image *dom, unsigned long idx) +{ + struct xc_dom_image_x86 *domx86 = dom->arch_private; + xen_vmemrange_t *vmemranges; + unsigned int nr_vmemranges; + int vmemid; + + if ( dom->nr_vmemranges ) { + vmemranges = dom->vmemranges; + nr_vmemranges = dom->nr_vmemranges; + } else { + vmemranges = domx86->dummy_vmemrange; + nr_vmemranges = domx86->nr_dummy_vmemranges; + } + + for ( vmemid = 0; vmemid < nr_vmemranges ; vmemid++ ) { + if ( idx >= (vmemranges[vmemid].start >> XC_DOM_PAGE_SHIFT(dom)) + && idx < (vmemranges[vmemid].end >> XC_DOM_PAGE_SHIFT(dom)) ) { + return idx; + } + } + + return ((xen_pfn_t)-1); +} + bool xc_dom_translated(const struct xc_dom_image *dom) { /* HVM guests are translated. PV guests are not. 
*/ @@ -1805,6 +1843,7 @@ static struct xc_dom_arch xc_dom_32_pae = { .meminit = meminit_pv, .bootearly = bootearly, .bootlate = bootlate_pv, + .p2m_host = NULL, }; static struct xc_dom_arch xc_dom_64 = { @@ -1824,6 +1863,7 @@ static struct xc_dom_arch xc_dom_64 = { .meminit = meminit_pv, .bootearly = bootearly, .bootlate = bootlate_pv, + .p2m_host = NULL, }; static struct xc_dom_arch xc_hvm_32 = { @@ -1831,6 +1871,7 @@ static struct xc_dom_arch xc_hvm_32 = { .native_protocol = XEN_IO_PROTO_ABI_X86_32, .page_shift = PAGE_SHIFT_X86, .sizeof_pfn = 4, + .arch_private_size = sizeof(struct xc_dom_image_x86), .alloc_magic_pages = alloc_magic_pages_hvm, .alloc_pgtables = alloc_pgtables_hvm, .setup_pgtables = NULL, @@ -1840,6 +1881,7 @@ static struct xc_dom_arch xc_hvm_32 = { .meminit = meminit_hvm, .bootearly = bootearly, .bootlate = bootlate_hvm, + .p2m_host = p2m_host_hvm, }; static void __init register_arch_hooks(void) -- 2.7.4 Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel