When allocating the guest memory for an HVM domain, libxc keeps the P2M
mapping for the entirety of the guest memory around for the time of the
launch as xc_dom_image->p2m_host. For guests that have a large memory
(3904 GiB), the p2m_host allocation takes more than 7.5 GiB of space, and
leaves xl susceptible to getting OOM-killed on guest creation.

Convert the p2m_host table lookups to an arch-specific function that
returns the mapping on-the-fly for x86 HVM guests to avoid this
allocation, bringing down xl's memory usage from > 8GiB to < 70Mib for
such launches.

Signed-off-by: Varad Gautam <v...@amazon.de>

---

Applies to stable-4.11+.

 tools/libxc/include/xc_dom.h |  11 +++-
 tools/libxc/xc_dom_arm.c     |   2 +
 tools/libxc/xc_dom_core.c    |   4 +-
 tools/libxc/xc_dom_x86.c     | 126 ++++++++++++++++++++++++++++---------------
 4 files changed, 99 insertions(+), 44 deletions(-)

diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index 8a66889..43abc0d 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -131,6 +131,9 @@ struct xc_dom_image {
      * a hybrid guest this means that it maps GPFNs to GPFNS.
      *
      * Note that the input is offset by rambase.
+     *
+     * This is not populated for guests that provide an arch-specific
+     * lookup hook in arch_hooks.
      */
     xen_pfn_t *p2m_host;
     void *p2m_guest;
@@ -274,6 +277,10 @@ struct xc_dom_arch {
     int arch_private_size;
 
     struct xc_dom_arch *next;
+
+    /* arch-specific p2m table lookup to get rid of the p2m_host array stored 
in
+     * xc_dom_image. */
+    xen_pfn_t (*p2m_host) (struct xc_dom_image *dom, unsigned long idx);
 };
 void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks);
 
@@ -437,7 +444,9 @@ static inline xen_pfn_t xc_dom_p2m(struct xc_dom_image 
*dom, xen_pfn_t pfn)
         return pfn;
     if (pfn < dom->rambase_pfn || pfn >= dom->rambase_pfn + dom->total_pages)
         return INVALID_MFN;
-    return dom->p2m_host[pfn - dom->rambase_pfn];
+    return dom->arch_hooks->p2m_host ?
+            dom->arch_hooks->p2m_host(dom, pfn - dom->rambase_pfn)
+            : dom->p2m_host[pfn - dom->rambase_pfn];
 }
 
 #endif /* _XC_DOM_H */
diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 5b9eca6..b15c6d2 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -547,6 +547,7 @@ static struct xc_dom_arch xc_dom_32 = {
     .meminit = meminit,
     .bootearly = bootearly,
     .bootlate = bootlate,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_dom_64 = {
@@ -563,6 +564,7 @@ static struct xc_dom_arch xc_dom_64 = {
     .meminit = meminit,
     .bootearly = bootearly,
     .bootlate = bootlate,
+    .p2m_host = NULL,
 };
 
 static void __init register_arch_hooks(void)
diff --git a/tools/libxc/xc_dom_core.c b/tools/libxc/xc_dom_core.c
index 9bd04cb..f3eaae3 100644
--- a/tools/libxc/xc_dom_core.c
+++ b/tools/libxc/xc_dom_core.c
@@ -985,7 +985,9 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom)
                   __FUNCTION__, dom->p2m_size);
         p2m_32 = dom->p2m_guest;
         for ( i = 0; i < dom->p2m_size; i++ )
-            if ( dom->p2m_host[i] != INVALID_PFN )
+            if ( dom->arch_hooks->p2m_host )
+                p2m_32[i] = dom->arch_hooks->p2m_host(dom, i);
+            else if ( dom->p2m_host[i] != INVALID_PFN )
                 p2m_32[i] = dom->p2m_host[i];
             else
                 p2m_32[i] = (uint32_t) - 1;
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index 3ab918c..58f9894 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -101,6 +101,10 @@ struct xc_dom_image_x86 {
 #define MAPPING_MAX 2
     struct xc_dom_x86_mapping maps[MAPPING_MAX];
     struct xc_dom_params *params;
+
+    /* Used to fake vmemrange information in case vNUMA information was not 
provided. */
+    xen_vmemrange_t dummy_vmemrange[2];
+    unsigned int nr_dummy_vmemranges;
 };
 
 /* get guest IO ABI protocol */
@@ -1252,13 +1256,13 @@ static int meminit_hvm(struct xc_dom_image *dom)
     unsigned int memflags = 0;
     int claim_enabled = dom->claim_enabled;
     uint64_t total_pages;
-    xen_vmemrange_t dummy_vmemrange[2];
     unsigned int dummy_vnode_to_pnode[1];
     xen_vmemrange_t *vmemranges;
     unsigned int *vnode_to_pnode;
     unsigned int nr_vmemranges, nr_vnodes;
     xc_interface *xch = dom->xch;
     uint32_t domid = dom->guest_domid;
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
 
     if ( nr_pages > target_pages )
         memflags |= XENMEMF_populate_on_demand;
@@ -1274,25 +1278,26 @@ static int meminit_hvm(struct xc_dom_image *dom)
          * has no effect on the actual result.
          */
 
-        dummy_vmemrange[0].start = 0;
-        dummy_vmemrange[0].end   = dom->lowmem_end;
-        dummy_vmemrange[0].flags = 0;
-        dummy_vmemrange[0].nid   = 0;
-        nr_vmemranges = 1;
+        domx86->dummy_vmemrange[0].start = 0;
+        domx86->dummy_vmemrange[0].end   = dom->lowmem_end;
+        domx86->dummy_vmemrange[0].flags = 0;
+        domx86->dummy_vmemrange[0].nid   = 0;
+        domx86->nr_dummy_vmemranges = 1;
 
         if ( dom->highmem_end > (1ULL << 32) )
         {
-            dummy_vmemrange[1].start = 1ULL << 32;
-            dummy_vmemrange[1].end   = dom->highmem_end;
-            dummy_vmemrange[1].flags = 0;
-            dummy_vmemrange[1].nid   = 0;
+            domx86->dummy_vmemrange[1].start = 1ULL << 32;
+            domx86->dummy_vmemrange[1].end   = dom->highmem_end;
+            domx86->dummy_vmemrange[1].flags = 0;
+            domx86->dummy_vmemrange[1].nid   = 0;
 
-            nr_vmemranges++;
+            domx86->nr_dummy_vmemranges++;
         }
 
         dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
         nr_vnodes = 1;
-        vmemranges = dummy_vmemrange;
+        vmemranges = domx86->dummy_vmemrange;
+        nr_vmemranges = domx86->nr_dummy_vmemranges;
         vnode_to_pnode = dummy_vnode_to_pnode;
     }
     else
@@ -1329,25 +1334,6 @@ static int meminit_hvm(struct xc_dom_image *dom)
     }
 
     dom->p2m_size = p2m_size;
-    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
-                                      dom->p2m_size);
-    if ( dom->p2m_host == NULL )
-    {
-        DOMPRINTF("Could not allocate p2m");
-        goto error_out;
-    }
-
-    for ( i = 0; i < p2m_size; i++ )
-        dom->p2m_host[i] = ((xen_pfn_t)-1);
-    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
-    {
-        uint64_t pfn;
-
-        for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
-              pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
-              pfn++ )
-            dom->p2m_host[pfn] = pfn;
-    }
 
     /*
      * Try to claim pages for early warning of insufficient memory available.
@@ -1395,8 +1381,12 @@ static int meminit_hvm(struct xc_dom_image *dom)
      */
     if ( dom->device_model )
     {
+        xen_pfn_t pfn_batch[0xa0];
+        for ( i = 0; i < 0xa0; i++ )
+            pfn_batch[i] = dom->arch_hooks->p2m_host(dom, i);
+
         rc = xc_domain_populate_physmap_exact(
-            xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]);
+            xch, domid, 0xa0, 0, memflags, &pfn_batch[0x00]);
         if ( rc != 0 )
         {
             DOMPRINTF("Could not populate low memory (< 0xA0).\n");
@@ -1439,7 +1429,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
             if ( count > max_pages )
                 count = max_pages;
 
-            cur_pfn = dom->p2m_host[cur_pages];
+            cur_pfn = dom->arch_hooks->p2m_host(dom, cur_pages);
 
             /* Take care the corner cases of super page tails */
             if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
@@ -1465,8 +1455,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
                 xen_pfn_t sp_extents[nr_extents];
 
                 for ( i = 0; i < nr_extents; i++ )
-                    sp_extents[i] =
-                        dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
+                    sp_extents[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+(i<<SUPERPAGE_1GB_SHIFT));
 
                 done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                   SUPERPAGE_1GB_SHIFT,
@@ -1505,8 +1494,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
                     xen_pfn_t sp_extents[nr_extents];
 
                     for ( i = 0; i < nr_extents; i++ )
-                        sp_extents[i] =
-                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+                        sp_extents[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+(i<<SUPERPAGE_2MB_SHIFT));
 
                     done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                       SUPERPAGE_2MB_SHIFT,
@@ -1521,14 +1509,39 @@ static int meminit_hvm(struct xc_dom_image *dom)
                     }
                 }
             }
-
             /* Fall back to 4kB extents. */
             if ( count != 0 )
             {
-                rc = xc_domain_populate_physmap_exact(
-                    xch, domid, count, 0, new_memflags, 
&dom->p2m_host[cur_pages]);
-                cur_pages += count;
-                stat_normal_pages += count;
+                unsigned long nr_extents;
+                xen_pfn_t *pfn_batch;
+
+                pfn_batch = calloc(SUPERPAGE_1GB_NR_PFNS, sizeof(*pfn_batch));
+                if  ( !pfn_batch ) {
+                    DOMPRINTF("Could not allocate memory to construct physmap 
batch.");
+                    rc = -1;
+                    goto error_out;
+                }
+
+                while ( count > 0 ) {
+                    for ( i = 0; i < count && i < SUPERPAGE_1GB_NR_PFNS; i++)
+                        pfn_batch[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+i);
+
+                    nr_extents = count > SUPERPAGE_1GB_NR_PFNS ? 
SUPERPAGE_1GB_NR_PFNS : count;
+                    rc = xc_domain_populate_physmap_exact(xch, domid, 
nr_extents,
+                                                      0, new_memflags, 
&pfn_batch[0]);
+                    if ( rc != 0 ) {
+                        DOMPRINTF("Could not populate physmap batch.");
+                        free(pfn_batch);
+                        rc = -1;
+                        goto error_out;
+                    }
+
+                    stat_normal_pages += nr_extents;
+                    cur_pages += nr_extents;
+                    count -= nr_extents;
+                }
+
+                free(pfn_batch);
             }
         }
 
@@ -1780,6 +1793,31 @@ static int bootlate_hvm(struct xc_dom_image *dom)
     return 0;
 }
 
+static xen_pfn_t p2m_host_hvm(struct xc_dom_image *dom, unsigned long idx)
+{
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    xen_vmemrange_t *vmemranges;
+    unsigned int nr_vmemranges;
+    int vmemid;
+
+    if ( dom->nr_vmemranges ) {
+        vmemranges = dom->vmemranges;
+        nr_vmemranges = dom->nr_vmemranges;
+    } else {
+        vmemranges = domx86->dummy_vmemrange;
+        nr_vmemranges = domx86->nr_dummy_vmemranges;
+    }
+
+    for ( vmemid = 0; vmemid < nr_vmemranges ; vmemid++ ) {
+        if ( idx >= (vmemranges[vmemid].start >> XC_DOM_PAGE_SHIFT(dom))
+             && idx < (vmemranges[vmemid].end >> XC_DOM_PAGE_SHIFT(dom)) ) {
+            return idx;
+        }
+    }
+
+    return ((xen_pfn_t)-1);
+}
+
 bool xc_dom_translated(const struct xc_dom_image *dom)
 {
     /* HVM guests are translated.  PV guests are not. */
@@ -1805,6 +1843,7 @@ static struct xc_dom_arch xc_dom_32_pae = {
     .meminit = meminit_pv,
     .bootearly = bootearly,
     .bootlate = bootlate_pv,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_dom_64 = {
@@ -1824,6 +1863,7 @@ static struct xc_dom_arch xc_dom_64 = {
     .meminit = meminit_pv,
     .bootearly = bootearly,
     .bootlate = bootlate_pv,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_hvm_32 = {
@@ -1831,6 +1871,7 @@ static struct xc_dom_arch xc_hvm_32 = {
     .native_protocol = XEN_IO_PROTO_ABI_X86_32,
     .page_shift = PAGE_SHIFT_X86,
     .sizeof_pfn = 4,
+    .arch_private_size = sizeof(struct xc_dom_image_x86),
     .alloc_magic_pages = alloc_magic_pages_hvm,
     .alloc_pgtables = alloc_pgtables_hvm,
     .setup_pgtables = NULL,
@@ -1840,6 +1881,7 @@ static struct xc_dom_arch xc_hvm_32 = {
     .meminit = meminit_hvm,
     .bootearly = bootearly,
     .bootlate = bootlate_hvm,
+    .p2m_host = p2m_host_hvm,
 };
 
 static void __init register_arch_hooks(void)
-- 
2.7.4




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Reply via email to