If HugeTLB is requested at guest_memfd creation time via the
KVM_GUEST_MEMFD_HUGETLB flag, HugeTLB pages will be used to back the
guest_memfd.
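
For example, userspace can request HugeTLB backing along these lines
(a sketch, assuming only the KVM_CREATE_GUEST_MEMFD ioctl and the
KVM_GUEST_MEMFD_HUGETLB flag introduced earlier in this series; vm_fd
is the VM file descriptor from KVM_CREATE_VM):

	struct kvm_create_guest_memfd gmem = {
		.size  = 1UL << 30,
		.flags = KVM_GUEST_MEMFD_HUGETLB,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);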

Signed-off-by: Ackerley Tng <ackerley...@google.com>
---
 virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 239 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 31e1115273e1..2e6f12e2bac8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -8,6 +8,8 @@
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
 #include <linux/anon_inodes.h>
+#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
 
 #include "kvm_mm.h"
 
@@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
        return inode->i_mapping->i_private_data;
 }
 
+static bool is_kvm_gmem_hugetlb(struct inode *inode)
+{
+       u64 flags = (u64)inode->i_private;
+
+       return flags & KVM_GUEST_MEMFD_HUGETLB;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
        return 0;
 }
 
+/*
+ * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
+ */
 static inline void kvm_gmem_mark_prepared(struct folio *folio)
 {
        folio_mark_uptodate(folio);
@@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
 {
-       unsigned long nr_pages, i;
        pgoff_t index;
        int r;
 
-       nr_pages = folio_nr_pages(folio);
-       for (i = 0; i < nr_pages; i++)
-               clear_highpage(folio_page(folio, i));
+       if (folio_test_hugetlb(folio)) {
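+               /*
+                * There is no user mapping here; use the file offset as the
+                * address hint for folio_zero_user().
+                */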
+               folio_zero_user(folio, folio->index << PAGE_SHIFT);
+       } else {
+               unsigned long nr_pages, i;
+
+               nr_pages = folio_nr_pages(folio);
+               for (i = 0; i < nr_pages; i++)
+                       clear_highpage(folio_page(folio, i));
+       }
 
        /*
         * Preparing huge folios should always be safe, since it should
@@ -103,6 +120,174 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
        return r;
 }
 
+static int kvm_gmem_get_mpol_node_nodemask(gfp_t gfp_mask,
+                                          struct mempolicy **mpol,
+                                          nodemask_t **nodemask)
+{
+       /*
+        * TODO: the mempolicy should probably be stored on the inode; use the
+        * task policy for now.
+        */
+       *mpol = get_task_policy(current);
+
+       /* TODO: ignore interleaving (set ilx to 0) for now. */
+       return policy_node_nodemask(*mpol, gfp_mask, 0, nodemask);
+}
+
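+/*
+ * Allocate a hugetlb folio for guest_memfd: charge the memcg, take a page
+ * from the guest_memfd's subpool (reserved against the global hstate when
+ * the guest_memfd was created), then allocate from the hstate using, for
+ * now, the task mempolicy.
+ */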
+static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h,
+                                                 struct hugepage_subpool *spool)
+{
+       bool memcg_charge_was_prepared;
+       struct mem_cgroup *memcg;
+       struct mempolicy *mpol;
+       nodemask_t *nodemask;
+       struct folio *folio;
+       gfp_t gfp_mask;
+       int ret;
+       int nid;
+
+       gfp_mask = htlb_alloc_mask(h);
+
+       memcg = get_mem_cgroup_from_current();
+       ret = mem_cgroup_hugetlb_try_charge(memcg,
+                                           gfp_mask | __GFP_RETRY_MAYFAIL,
+                                           pages_per_huge_page(h));
+       if (ret == -ENOMEM)
+               goto err;
+
+       memcg_charge_was_prepared = ret != -EOPNOTSUPP;
+
+       /* Pages are only to be taken from guest_memfd subpool and nowhere else. */
+       if (hugepage_subpool_get_pages(spool, 1))
+               goto err_cancel_charge;
+
+       nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol,
+                                             &nodemask);
+       /*
+        * charge_cgroup_reservation is false because we didn't make any cgroup
+        * reservations when creating the guest_memfd subpool.
+        *
+        * use_hstate_resv is true because we reserved from global hstate when
+        * creating the guest_memfd subpool.
+        */
+       folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true);
+       mpol_cond_put(mpol);
+
+       if (!folio)
+               goto err_put_pages;
+
+       hugetlb_set_folio_subpool(folio, spool);
+
+       if (memcg_charge_was_prepared)
+               mem_cgroup_commit_charge(folio, memcg);
+
+out:
+       mem_cgroup_put(memcg);
+
+       return folio;
+
+err_put_pages:
+       hugepage_subpool_put_pages(spool, 1);
+
+err_cancel_charge:
+       if (memcg_charge_was_prepared)
+               mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h));
+
+err:
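+       /* Jump back through the out label to drop the memcg reference. */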
+       folio = ERR_PTR(-ENOMEM);
+       goto out;
+}
+
+static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping,
+                                             struct folio *folio, pgoff_t index,
+                                             gfp_t gfp)
+{
+       int ret;
+
+       __folio_set_locked(folio);
+       ret = __filemap_add_folio(mapping, folio, index, gfp, NULL);
+       if (unlikely(ret)) {
+               __folio_clear_locked(folio);
+               return ret;
+       }
+
+       /*
+        * In hugetlb_add_to_page_cache(), there is a call to
+        * folio_clear_hugetlb_restore_reserve(). This is handled when the pages
+        * are removed from the page cache in unmap_hugepage_range() ->
+        * __unmap_hugepage_range() by conditionally calling
+        * folio_set_hugetlb_restore_reserve(). In kvm_gmem_hugetlb's usage of
+        * hugetlb, there are no VMAs involved, and pages are never taken from
+        * the surplus, so when pages are freed, the hstate reserve must be
+        * restored. Hence, this function makes no call to
+        * folio_clear_hugetlb_restore_reserve().
+        */
+
+       /* Mark the folio dirty so that it is not removed from the cache/inode. */
+       folio_mark_dirty(folio);
+
+       return 0;
+}
+
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
+                                                           pgoff_t index)
+{
+       struct kvm_gmem_hugetlb *hgmem;
+       struct folio *folio;
+       int ret;
+
+       hgmem = kvm_gmem_hgmem(inode);
+       folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
+       if (IS_ERR(folio))
+               return folio;
+
+       /* TODO: Fix index here to be aligned to huge page size. */
+       ret = kvm_gmem_hugetlb_filemap_add_folio(
+               inode->i_mapping, folio, index, htlb_alloc_mask(hgmem->h));
+       if (ret) {
+               folio_put(folio);
+               return ERR_PTR(ret);
+       }
+
+       spin_lock(&inode->i_lock);
+       inode->i_blocks += blocks_per_huge_page(hgmem->h);
+       spin_unlock(&inode->i_lock);
+
+       return folio;
+}
+
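+/*
+ * Return the hugetlb folio backing @index in @inode's filemap, allocating
+ * and adding one to the filemap if none is present. The folio is returned
+ * locked; hugetlb_fault_mutex_table serializes allocations of the same huge
+ * page.
+ */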
+static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode,
+                                               pgoff_t index)
+{
+       struct address_space *mapping;
+       struct folio *folio;
+       struct hstate *h;
+       pgoff_t hindex;
+       u32 hash;
+
+       h = kvm_gmem_hgmem(inode)->h;
+       hindex = index >> huge_page_order(h);
+       mapping = inode->i_mapping;
+
+       /* To lock, we calculate the hash using the hindex and not index. */
+       hash = hugetlb_fault_mutex_hash(mapping, hindex);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       /*
+        * The filemap is indexed with index and not hindex. Take the folio
+        * lock to align with kvm_gmem_get_regular_folio().
+        */
+       folio = filemap_lock_folio(mapping, index);
+       if (!IS_ERR(folio))
+               goto out;
+
+       folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
+out:
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+       return folio;
+}
+
 /*
  * Returns a locked folio on success.  The caller is responsible for
  * setting the up-to-date flag before the memory is mapped into the guest.
@@ -114,8 +299,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
  */
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
-       /* TODO: Support huge pages. */
-       return filemap_grab_folio(inode->i_mapping, index);
+       if (is_kvm_gmem_hugetlb(inode))
+               return kvm_gmem_get_hugetlb_folio(inode, index);
+       else
+               return filemap_grab_folio(inode->i_mapping, index);
 }
 
 static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -240,6 +427,35 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
        spin_unlock(&inode->i_lock);
 }
 
+static void kvm_gmem_hugetlb_truncate_range(struct inode *inode, loff_t lstart,
+                                           loff_t lend)
+{
+       loff_t full_hpage_start;
+       loff_t full_hpage_end;
+       unsigned long hsize;
+       struct hstate *h;
+
+       h = kvm_gmem_hgmem(inode)->h;
+       hsize = huge_page_size(h);
+
+       full_hpage_start = round_up(lstart, hsize);
+       full_hpage_end = round_down(lend, hsize);
+
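+       /*
+        * Zero, rather than remove, any partial huge page at the head or
+        * tail of the range, since the rest of such a huge page may still be
+        * in use; only fully covered huge pages are truncated and freed.
+        */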
+       if (lstart < full_hpage_start) {
+               hugetlb_zero_partial_page(h, inode->i_mapping, lstart,
+                                         full_hpage_start);
+       }
+
+       if (full_hpage_end > full_hpage_start) {
+               kvm_gmem_hugetlb_truncate_folios_range(inode, full_hpage_start,
+                                                      full_hpage_end);
+       }
+
+       if (lend > full_hpage_end) {
+               hugetlb_zero_partial_page(h, inode->i_mapping, full_hpage_end,
+                                         lend);
+       }
+}
 
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 {
@@ -257,7 +473,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_begin(gmem, start, end);
 
-       truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+       if (is_kvm_gmem_hugetlb(inode)) {
+               kvm_gmem_hugetlb_truncate_range(inode, offset, offset + len);
+       } else {
+               truncate_inode_pages_range(inode->i_mapping, offset,
+                                          offset + len - 1);
+       }
 
        list_for_each_entry(gmem, gmem_list, entry)
                kvm_gmem_invalidate_end(gmem, start, end);
@@ -279,8 +500,15 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
 
        filemap_invalidate_lock_shared(mapping);
 
-       start = offset >> PAGE_SHIFT;
-       end = (offset + len) >> PAGE_SHIFT;
+       if (is_kvm_gmem_hugetlb(inode)) {
+               unsigned long hsize = huge_page_size(kvm_gmem_hgmem(inode)->h);
+
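+               /*
+                * Allocation is performed in whole huge pages: round both
+                * ends of the range down to huge page boundaries.
+                */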
+               start = round_down(offset, hsize) >> PAGE_SHIFT;
+               end = round_down(offset + len, hsize) >> PAGE_SHIFT;
+       } else {
+               start = offset >> PAGE_SHIFT;
+               end = (offset + len) >> PAGE_SHIFT;
+       }
 
        r = 0;
        for (index = start; index < end; ) {
@@ -408,9 +636,7 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode)
 
 static void kvm_gmem_evict_inode(struct inode *inode)
 {
-       u64 flags = (u64)inode->i_private;
-
-       if (flags & KVM_GUEST_MEMFD_HUGETLB)
+       if (is_kvm_gmem_hugetlb(inode))
                kvm_gmem_hugetlb_teardown(inode);
        else
                truncate_inode_pages_final(inode->i_mapping);
@@ -827,7 +1053,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
 
        *pfn = folio_file_pfn(folio, index);
        if (max_order)
-               *max_order = 0;
+               *max_order = folio_order(folio);
 
        *is_prepared = folio_test_uptodate(folio);
        return folio;
-- 
2.46.0.598.g6f2099f65c-goog

