Introduce kvm_mmu_notifier_test_and_clear_dirty(),
kvm_mmu_notifier_dirty_update() and their mmu_notifier interfaces to support
KSM dirty bit tracking, which brings a significant performance gain to
volatile page scanning in KSM.

Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if Intel EPT
is enabled, to indicate that the dirty bits of the underlying sptes are not
updated by hardware.

Signed-off-by: Nai Xia <[email protected]>
Acked-by: Izik Eidus <[email protected]>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h              |    3 +-
 arch/x86/kvm/vmx.c              |    1 +
 include/linux/kvm_host.h        |    2 +-
 include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
 virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
 8 files changed, 149 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2..f0d7aa0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -848,6 +848,7 @@ extern bool kvm_rebooting;
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee3862..a5a0c51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -979,6 +979,37 @@ out:
        return young;
 }
 
+/*
+ * Caller is supposed to SetPageDirty(), it's not done inside this.
+ */
+static
+int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                                  unsigned long data)
+{
+       u64 *spte;
+       int dirty = 0;
+
+       if (!shadow_dirty_mask) {
+               WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
+               goto out;
+       }
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               int _dirty;
+               u64 _spte = *spte;
+               BUG_ON(!(_spte & PT_PRESENT_MASK));
+               _dirty = _spte & PT_DIRTY_MASK;
+               if (_dirty) {
+                       dirty = 1;
+                       clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+out:
+       return dirty;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1004,6 +1035,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
        return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
 }
 
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..b8d01c3 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,7 +18,8 @@
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
 #define PT_PAGE_SIZE_MASK (1ULL << 7)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60..b407a69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                VMX_EPT_EXECUTABLE_MASK);
                kvm_enable_tdp();
+               kvm_dirty_update = 0;
        } else
                kvm_disable_tdp();
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..2036bae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -53,7 +53,7 @@
 struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-
+extern int kvm_dirty_update;
 /*
  * It would be nice to use something smarter than a linear search, TBD...
  * Thankfully we dont expect many devices to register (famous last words :),
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1d1b1e1..bd6ba2d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -24,6 +24,9 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+       int (*dirty_update)(struct mmu_notifier *mn,
+                            struct mm_struct *mm);
+
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
@@ -72,6 +75,16 @@ struct mmu_notifier_ops {
                          unsigned long address);
 
        /*
+        * test_and_clear_dirty is called after the VM has
+        * tested and cleared the dirty/modified bit flag in the
+        * pte. This way the VM will provide proper volatile page
+        * testing to ksm.
+        */
+       int (*test_and_clear_dirty)(struct mmu_notifier *mn,
+                                   struct mm_struct *mm,
+                                   unsigned long address);
+
+       /*
         * change_pte is called in cases that pte mapping to page is changed:
         * for example, when ksm remaps pte to point to a new shared page.
         */
@@ -170,11 +183,14 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
                                    struct mm_struct *mm);
 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
+extern int __mmu_notifier_dirty_update(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
+extern int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+                                              unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
                                      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -184,6 +200,19 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                                  unsigned long start, unsigned long end);
 
+/*
+ * Before ksm makes use of dirty bits, it must make sure that the dirty bits
+ * in sptes really carry the dirty information. Currently only Intel EPT
+ * does not, so ksm dirty bit tracking cannot be used with it.
+ */
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_dirty_update(mm);
+
+       return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
        if (mm_has_notifiers(mm))
@@ -206,6 +235,14 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+                                                   unsigned long address)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_test_and_clear_dirty(mm, address);
+       return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
 {
@@ -323,6 +360,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+       return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 }
@@ -339,6 +381,12 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+                                                   unsigned long address)
+{
+       return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
                                           unsigned long address, pte_t pte)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de..a4a1467 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -18,6 +18,32 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+/*
+ * Report whether the dirty bits kept by the secondary MMUs attached to @mm
+ * can be relied upon. Returns 1 only if every notifier on the list implements
+ * ->dirty_update() and returns nonzero from it; returns 0 otherwise.
+ *
+ * A notifier that lacks the callback, or whose callback returns 0, must veto
+ * the result: OR-ing the answers would let one notifier that tracks dirty
+ * bits hide another that does not.
+ */
+int __mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+       struct mmu_notifier *mn;
+       struct hlist_node *n;
+       int dirty_update = 1;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+               if (!mn->ops->dirty_update ||
+                   !mn->ops->dirty_update(mn, mm))
+                       dirty_update = 0;
+       }
+       rcu_read_unlock();
+
+       return dirty_update;
+}
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -120,6 +136,23 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
        return young;
 }
 
+int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+                                       unsigned long address)
+{
+       struct mmu_notifier *mn;
+       struct hlist_node *n;
+       int dirty = 0;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->test_and_clear_dirty)
+                       dirty |= mn->ops->test_and_clear_dirty(mn, mm, address);
+       }
+       rcu_read_unlock();
+
+       return dirty;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                               pte_t pte)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ebc06..22967c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -78,6 +78,14 @@ static atomic_t hardware_enable_failed;
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 
+/*
+ * Value reported by kvm_mmu_notifier_dirty_update(). Defaults to 1;
+ * vmx_init() sets it to 0 on the path that enables EPT. Exported, like
+ * kvm_vcpu_cache above, so that vmx.c can reach it when built as a module.
+ */
+int kvm_dirty_update = 1;
+EXPORT_SYMBOL_GPL(kvm_dirty_update);
+
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
@@ -398,6 +400,23 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
        return young;
 }
 
+/* Caller should SetPageDirty(), no need to flush tlb */
+static int kvm_mmu_notifier_test_and_clear_dirty(struct mmu_notifier *mn,
+                                                struct mm_struct *mm,
+                                                unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int dirty, idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       dirty = kvm_test_and_clear_dirty_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return dirty;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
 {
@@ -409,14 +428,22 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
+static int kvm_mmu_notifier_dirty_update(struct mmu_notifier *mn,
+                                        struct mm_struct *mm)
+{
+       return kvm_dirty_update;
+}
+
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_page        = kvm_mmu_notifier_invalidate_page,
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
        .test_young             = kvm_mmu_notifier_test_young,
+       .test_and_clear_dirty   = kvm_mmu_notifier_test_and_clear_dirty,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
+       .dirty_update           = kvm_mmu_notifier_dirty_update,
 };
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to