This patch documents and implements ROE_MPROTECT_CHUNK, a part of the ROE
hypercall designed to protect regions of a memory page with byte
granularity. This feature provides a key primitive needed to protect
against attacks involving page remapping; the remapping attack itself will
be addressed in future patches.
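
For illustration, a guest could invoke the new hypercall roughly as
follows. This is a minimal sketch, not part of the patch itself; it
assumes the ROE hypercall number is exposed to guests as KVM_HC_ROE by an
earlier patch in this series:

        #include <linux/kvm_para.h>  /* kvm_hypercall3(), ROE_MPROTECT_CHUNK */

        /*
         * Ask the host to make 'len' bytes starting at 'addr' read-only.
         * Returns the number of bytes protected or a negative error code.
         */
        static long roe_protect_bytes(void *addr, unsigned long len)
        {
                return kvm_hypercall3(KVM_HC_ROE, ROE_MPROTECT_CHUNK,
                                      (unsigned long)addr, len);
        }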

Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x...@gmail.com>
---
 Documentation/virtual/kvm/hypercalls.txt |   9 ++
 arch/x86/kvm/mmu.c                       |   6 +-
 arch/x86/kvm/x86.c                       | 156 +++++++++++++++++++++--
 include/linux/kvm_host.h                 |  26 ++++
 include/uapi/linux/kvm_para.h            |   1 +
 virt/kvm/kvm_main.c                      |  88 +++++++++++--
 6 files changed, 266 insertions(+), 20 deletions(-)

diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index 8af64d826f03..a31f316ce6e6 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -164,6 +164,15 @@ This configuration lets a guest kernel have part of its read/write memory
 converted into read-only.  This action is irreversible.
 Upon successful run, the number of pages protected is returned.
 
+Usage 3:
+     a0: ROE_MPROTECT_CHUNK    (requires version >= 2)
+     a1: Start address of the memory range to be protected.
+     a2: Number of bytes to be protected.
+This configuration lets a guest kernel have part of its read/write memory
+converted into read-only with byte granularity. ROE_MPROTECT_CHUNK is
+relatively slow compared to ROE_MPROTECT. This action is irreversible.
+Upon successful run, the number of bytes protected is returned.
+
 Error codes:
        -KVM_ENOSYS: system call being triggered from ring 3 or it is not
        implemented.
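
As a concrete guest-side illustration of the byte-granularity interface
documented above, the following sketch protects a small, not necessarily
page-aligned, object and checks the returned byte count. The structure,
the wrapper and the KVM_HC_ROE hypercall number are illustrative
assumptions, not defined by this patch:

        #include <linux/errno.h>
        #include <linux/kvm_para.h>
        #include <linux/types.h>

        static struct {
                u8 key[32];
                u8 iv[16];
        } secrets;      /* typically not page aligned */

        static int seal_secrets(void)
        {
                long ret;

                ret = kvm_hypercall3(KVM_HC_ROE, ROE_MPROTECT_CHUNK,
                                     (unsigned long)&secrets, sizeof(secrets));
                if (ret < 0)
                        return ret;     /* one of the error codes above */
                return (unsigned long)ret == sizeof(secrets) ? 0 : -EIO;
        }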
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7d9b63ddbb81..becb95b5f76e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1513,9 +1513,11 @@ static bool __rmap_write_protect_roe(struct kvm *kvm,
        struct rmap_iterator iter;
        bool prot;
        bool flush = false;
+       void *full_bmp = d->memslot->roe_bitmap;
+       void *part_bmp = d->memslot->partial_roe_bitmap;
 
        for_each_rmap_spte(rmap_head, &iter, sptep) {
-               prot = !test_bit(d->i, d->memslot->roe_bitmap) && pt_protect;
+               prot = !(test_bit(d->i, full_bmp) || test_bit(d->i, part_bmp));
+               prot = prot && pt_protect;
                flush |= spte_write_protect(sptep, prot);
                d->i++;
        }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ce798b30b69a..581bd18910df 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6930,17 +6930,23 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 
 #ifdef CONFIG_KVM_ROE
 static void kvm_roe_protect_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
-                               gfn_t gfn, u64 npages)
+                               gfn_t gfn, u64 npages, bool partial)
 {
        int i;
+       void *bitmap;
 
+       if (partial)
+               bitmap = slot->partial_roe_bitmap;
+       else
+               bitmap = slot->roe_bitmap;
        for (i = gfn - slot->base_gfn; i < gfn + npages - slot->base_gfn; i++)
-               set_bit(i, slot->roe_bitmap);
+               set_bit(i, bitmap);
        kvm_mmu_slot_apply_write_access(kvm, slot);
        kvm_arch_flush_shadow_memslot(kvm, slot);
 }
 
-static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages,
+                               bool partial)
 {
        struct kvm_memory_slot *slot;
        gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -6956,12 +6962,12 @@ static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
                if (gfn + npages > slot->base_gfn + slot->npages) {
                        u64 _npages = slot->base_gfn + slot->npages - gfn;
 
-                       kvm_roe_protect_slot(kvm, slot, gfn, _npages);
+                       kvm_roe_protect_slot(kvm, slot, gfn, _npages, partial);
                        gfn += _npages;
                        count += _npages;
                        npages -= _npages;
                } else {
-                       kvm_roe_protect_slot(kvm, slot, gfn, npages);
+                       kvm_roe_protect_slot(kvm, slot, gfn, npages, partial);
                        count += npages;
                        npages = 0;
                }
@@ -6971,12 +6977,13 @@ static int __kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
        return count;
 }
 
-static int kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+static int kvm_roe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages,
+               bool partial)
 {
        int r;
 
        mutex_lock(&kvm->slots_lock);
-       r = __kvm_roe_protect_range(kvm, gpa, npages);
+       r = __kvm_roe_protect_range(kvm, gpa, npages, partial);
        mutex_unlock(&kvm->slots_lock);
        return r;
 }
@@ -7025,7 +7032,7 @@ static int kvm_roe_full_protect_range(struct kvm_vcpu *vcpu, u64 gva,
                        continue;
                if (!access_ok(VERIFY_WRITE, hva, 1 << PAGE_SHIFT))
                        continue;
-               status =  kvm_roe_protect_range(vcpu->kvm, gpa, 1);
+               status =  kvm_roe_protect_range(vcpu->kvm, gpa, 1, false);
                if (status > 0)
                        count += status;
        }
@@ -7033,7 +7040,135 @@ static int kvm_roe_full_protect_range(struct kvm_vcpu *vcpu, u64 gva,
                return -EINVAL;
        return count;
 }
+
+static int kvm_roe_insert_chunk_next(struct list_head *pos, u64 gpa, u64 size)
+{
+       struct protected_chunk *chunk;
+
+       chunk = kvzalloc(sizeof(struct protected_chunk), GFP_KERNEL);
+       if (!chunk)
+               return -ENOMEM;
+       chunk->gpa = gpa;
+       chunk->size = size;
+       INIT_LIST_HEAD(&chunk->list);
+       list_add(&chunk->list, pos);
+       return size;
+}
+
+static int kvm_roe_expand_chunk(struct protected_chunk *pos, u64 gpa, u64 size)
+{
+       u64 old_ptr = pos->gpa;
+       u64 old_size = pos->size;
+
+       if (gpa < old_ptr)
+               pos->gpa = gpa;
+       if (gpa + size > old_ptr + old_size)
+               pos->size = gpa + size - pos->gpa;
+       return size;
+}
+
+static bool kvm_roe_merge_chunks(struct protected_chunk *chunk)
+{
+       /* try to merge a chunk with the one that follows it */
+       struct protected_chunk *next = list_next_entry(chunk, list);
+
+       if (!kvm_roe_range_overlap(chunk, next->gpa, next->size))
+               return false;
+       kvm_roe_expand_chunk(chunk, next->gpa, next->size);
+       list_del(&next->list);
+       kvfree(next);
+       return true;
+}
+
+static int __kvm_roe_insert_chunk(struct kvm_memory_slot *slot, u64 gpa,
+               u64 size)
+{
+       /* kvm->slots_lock must be held */
+       struct protected_chunk *pos;
+       struct list_head *head = slot->prot_list;
+
+       if (list_empty(head))
+               return kvm_roe_insert_chunk_next(head, gpa, size);
+       /*
+        * pos itself is never deleted inside this loop, but the entry
+        * after it may be, so even list_for_each_entry_safe (which only
+        * caches the next entry) would not make this iteration safe.
+        */
+       list_for_each_entry(pos, head, list) {
+               if (kvm_roe_range_overlap(pos, gpa, size)) {
+                       int ret = kvm_roe_expand_chunk(pos, gpa, size);
+
+                       while (head != pos->list.next)
+                               if (!kvm_roe_merge_chunks(pos))
+                                       break;
+                       return ret;
+               }
+               if (pos->gpa > gpa) {
+                       struct protected_chunk *prev;
 
+                       prev = list_prev_entry(pos, list);
+                       return kvm_roe_insert_chunk_next(&prev->list, gpa,
+                                       size);
+               }
+       }
+       pos = list_last_entry(head, struct protected_chunk, list);
+
+       return kvm_roe_insert_chunk_next(&pos->list, gpa, size);
+}
+
+static int kvm_roe_insert_chunk(struct kvm *kvm, u64 gpa, u64 size)
+{
+       struct kvm_memory_slot *slot;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int ret;
+
+       mutex_lock(&kvm->slots_lock);
+       slot = gfn_to_memslot(kvm, gfn);
+       if (!slot) {
+               mutex_unlock(&kvm->slots_lock);
+               return -EINVAL;
+       }
+       ret = __kvm_roe_insert_chunk(slot, gpa, size);
+       mutex_unlock(&kvm->slots_lock);
+       return ret;
+}
+
+static int kvm_roe_partial_page_protect(struct kvm_vcpu *vcpu, u64 gva,
+               u64 size)
+{
+       gpa_t gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+       if (gpa == UNMAPPED_GVA)
+               return -EINVAL;
+       kvm_roe_protect_range(vcpu->kvm, gpa, 1, true);
+       return kvm_roe_insert_chunk(vcpu->kvm, gpa, size);
+}
+
+static int kvm_roe_partial_protect(struct kvm_vcpu *vcpu, u64 gva, u64 size)
+{
+       u64 gva_start = gva;
+       u64 gva_end = gva + size;
+       u64 gpn_start = gva_start >> PAGE_SHIFT;
+       u64 gpn_end = gva_end >> PAGE_SHIFT;
+       u64 _size;
+       int count = 0;
+
+       /* make sure there is no overflow and that the size is not zero */
+       if (gva_end <= gva_start)
+               return -EINVAL;
+
+       /* protect the partial page at the start */
+       if (gpn_end > gpn_start)
+               _size = PAGE_SIZE - (gva_start & ~PAGE_MASK);
+       else
+               _size = size;
+       size -= _size;
+       count += kvm_roe_partial_page_protect(vcpu, gva_start, _size);
+       /* fully protect the whole pages in the middle */
+       if (gpn_end - gpn_start > 1) {
+               int ret;
+               u64 _gva = (gpn_start + 1) << PAGE_SHIFT;
+               u64 npages = gpn_end - gpn_start - 1;
+
+               size -= npages << PAGE_SHIFT;
+               ret = kvm_roe_full_protect_range(vcpu, _gva, npages);
+               if (ret > 0)
+                       count += ret << PAGE_SHIFT;
+       }
+       /* protect the partial page at the end */
+       if (size != 0)
+               count += kvm_roe_partial_page_protect(vcpu,
+                               gpn_end << PAGE_SHIFT, size);
+       if (count == 0)
+               return -EINVAL;
+       return count;
+}
+
 static int kvm_roe(struct kvm_vcpu *vcpu, u64 a0, u64 a1, u64 a2, u64 a3)
 {
        int ret;
@@ -7045,11 +7180,14 @@ static int kvm_roe(struct kvm_vcpu *vcpu, u64 a0, u64 a1, u64 a2, u64 a3)
                return -KVM_ENOSYS;
        switch (a0) {
        case ROE_VERSION:
-               ret = 1; //current version
+               ret = 2; //current version
                break;
        case ROE_MPROTECT:
                ret = kvm_roe_full_protect_range(vcpu, a1, a2);
                break;
+       case ROE_MPROTECT_CHUNK:
+               ret = kvm_roe_partial_protect(vcpu, a1, a2);
+               break;
        default:
                ret = -EINVAL;
        }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index be6885bc28bc..a6749a52386b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -294,11 +294,37 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  */
 #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
 
+#ifdef CONFIG_KVM_ROE
+/*
+ * This structure is used to hold a memory area that is to be protected
+ * inside a guest page that has mixed (partially read-only) permissions.
+ */
+struct protected_chunk {
+       gpa_t gpa;
+       u64 size;
+       struct list_head list;
+};
+
+static inline bool kvm_roe_range_overlap(struct protected_chunk *chunk,
+               gpa_t gpa, int len)
+{
+       /*
+        * Two ranges overlap iff each of them starts before the other ends:
+        * https://stackoverflow.com/questions/325933/
+        * determine-whether-two-date-ranges-overlap
+        */
+       return (gpa <= chunk->gpa + chunk->size - 1) &&
+               (gpa + len - 1 >= chunk->gpa);
+}
+#endif
+
 struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
 #ifdef CONFIG_KVM_ROE
        unsigned long *roe_bitmap;
+       unsigned long *partial_roe_bitmap;
+       struct list_head *prot_list;
 #endif
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index e6004e0750fd..4a84f974bc58 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -33,6 +33,7 @@
 /* ROE Functionality parameters */
 #define ROE_VERSION                    0
 #define ROE_MPROTECT                   1
+#define ROE_MPROTECT_CHUNK             2
 /*
  * hypercalls use architecture specific
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f9382a839361..2d3011e8490e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -553,10 +553,19 @@ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                              struct kvm_memory_slot *dont)
 {
 #ifdef CONFIG_KVM_ROE
-       if (!dont)
+       if (!dont) {
+               /* TODO: this might still leak memory */
+               struct protected_chunk *pos, *n;
+               struct list_head *head = free->prot_list;
+
                kvfree(free->roe_bitmap);
+               kvfree(free->partial_roe_bitmap);
+               list_for_each_entry_safe(pos, n, head, list) {
+                       list_del(&pos->list);
+                       kvfree(pos);
+               }
+               kvfree(free->prot_list);
+       }
 #endif
 
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                kvm_destroy_dirty_bitmap(free);
 
@@ -803,13 +812,22 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
        return 0;
 }
 
-static int kvm_init_roe_bitmap(struct kvm_memory_slot *slot)
+static int kvm_init_roe(struct kvm_memory_slot *slot)
 {
 #ifdef CONFIG_KVM_ROE
        slot->roe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
        sizeof(unsigned long), GFP_KERNEL);
        if (!slot->roe_bitmap)
                return -ENOMEM;
+       slot->partial_roe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+       sizeof(unsigned long), GFP_KERNEL);
+       if (!slot->partial_roe_bitmap) {
+               kvfree(slot->roe_bitmap);
+               return -ENOMEM;
+       }
+       slot->prot_list = kvzalloc(sizeof(struct list_head), GFP_KERNEL);
+       if (!slot->prot_list) {
+               kvfree(slot->roe_bitmap);
+               kvfree(slot->partial_roe_bitmap);
+               return -ENOMEM;
+       }
+       INIT_LIST_HEAD(slot->prot_list);
 #endif
        return 0;
 }
@@ -1036,7 +1054,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                if (kvm_create_dirty_bitmap(&new) < 0)
                        goto out_free;
        }
-       if (kvm_init_roe_bitmap(&new) < 0)
+       if (kvm_init_roe(&new) < 0)
                goto out_free;
 
        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -1290,26 +1308,37 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 {
        return slot->flags & KVM_MEM_READONLY;
 }
+
+#ifdef CONFIG_KVM_ROE
+static bool gfn_is_partially_protected(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       return test_bit(gfn - slot->base_gfn, slot->partial_roe_bitmap);
+}
 
+static bool gfn_is_fully_protected(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+       return test_bit(gfn - slot->base_gfn, slot->roe_bitmap);
+}
+#endif
+
 static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
 {
 #ifdef CONFIG_KVM_ROE
-       return test_bit(gfn - slot->base_gfn, slot->roe_bitmap) ||
-               memslot_is_readonly(slot);
+       return gfn_is_fully_protected(slot, gfn) ||
+              gfn_is_partially_protected(slot, gfn) ||
+              memslot_is_readonly(slot);
 #else
        return memslot_is_readonly(slot);
 #endif
 }
 
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                       gfn_t *nr_pages, bool write)
 {
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return KVM_HVA_ERR_BAD;
 
        if (gfn_is_readonly(slot, gfn) && write)
                return KVM_HVA_ERR_RO_BAD;
 
        if (nr_pages)
                *nr_pages = slot->npages - (gfn - slot->base_gfn);
 
@@ -1871,14 +1900,55 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
        return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+
+#ifdef CONFIG_KVM_ROE
+static bool kvm_roe_protected_range(struct kvm_memory_slot *slot, gpa_t gpa,
+               int len)
+{
+       struct list_head *pos;
+       struct protected_chunk *cur_chunk;
+
+       list_for_each(pos, slot->prot_list) {
+               cur_chunk = list_entry(pos, struct protected_chunk, list);
+               if (kvm_roe_range_overlap(cur_chunk, gpa, len))
+                       return true;
+       }
+       return false;
+}
+
+static bool kvm_roe_check_range(struct kvm_memory_slot *slot,
+               gfn_t gfn, int offset, int len)
+{
+       gpa_t gpa = (gfn << PAGE_SHIFT) + offset;
+
+       if (!gfn_is_partially_protected(slot, gfn))
+               return false;
+       return kvm_roe_protected_range(slot, gpa, len);
+}
+#endif
+
+static unsigned long roe_gfn_to_hva(struct kvm_memory_slot *slot, gfn_t gfn,
+               int offset, int len)
+{
+       unsigned long addr;
+
+#ifdef CONFIG_KVM_ROE
+       if (kvm_roe_check_range(slot, gfn, offset, len))
+               return KVM_HVA_ERR_RO_BAD;
+       if (memslot_is_readonly(slot))
+               return KVM_HVA_ERR_RO_BAD;
+       if (gfn_is_fully_protected(slot, gfn))
+               return KVM_HVA_ERR_RO_BAD;
+       addr = __gfn_to_hva_many(slot, gfn, NULL, false);
+#else
+       addr = gfn_to_hva_memslot(slot, gfn);
+#endif
+       return addr;
+}
+
 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
                                  const void *data, int offset, int len)
 {
        int r;
        unsigned long addr;
 
-       addr = gfn_to_hva_memslot(memslot, gfn);
+       addr = roe_gfn_to_hva(memslot, gfn, offset, len);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = __copy_to_user((void __user *)addr + offset, data, len);
-- 
2.18.1
