Handles the guest faults in KVM by mapping in corresponding user pages
in the 2nd stage page tables.

Introduces a new ARM-specific kernel memory type, PAGE_KVM_GUEST, and the
pgprot_guest variable, both used to map 2nd stage memory for KVM guests.

Leverages MMU notifiers on KVM/ARM by supporting the kvm_unmap_hva() and
kvm_set_spte_hva() operations.  All other KVM MMU notifier hooks are NOPs.

Signed-off-by: Marc Zyngier <marc.zyng...@arm.com>
Signed-off-by: Christoffer Dall <c.d...@virtualopensystems.com>
---
 arch/arm/include/asm/kvm_arm.h        |    9 +
 arch/arm/include/asm/kvm_asm.h        |    3 
 arch/arm/include/asm/kvm_host.h       |   16 ++
 arch/arm/include/asm/pgtable-3level.h |    9 +
 arch/arm/include/asm/pgtable.h        |    4 +
 arch/arm/kvm/Kconfig                  |    1 
 arch/arm/kvm/exports.c                |    1 
 arch/arm/kvm/interrupts.S             |   37 ++++++
 arch/arm/kvm/mmu.c                    |  218 +++++++++++++++++++++++++++++++++
 arch/arm/mm/mmu.c                     |    3 
 10 files changed, 300 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 0d1e895..7f6cad4 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -149,6 +149,15 @@
 #define HSR_ISS                (HSR_IL - 1)
 #define HSR_ISV_SHIFT  (24)
 #define HSR_ISV                (1U << HSR_ISV_SHIFT)
+#define HSR_FSC                (0x3f)
+#define HSR_FSC_TYPE   (0x3c)
+#define HSR_WNR                (1 << 6)
+
+#define FSC_FAULT      (0x04)
+#define FSC_PERM       (0x0c)
+
+/* Hyp IPA Fault Address Register (HPFAR) */
+#define HPFAR_MASK     (~0xf)
 
 #define HSR_EC_UNKNOWN (0x00)
 #define HSR_EC_WFI     (0x01)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 58d51e3..e01dfab 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -34,6 +34,7 @@
 #define SMCHYP_HVBAR_W 0xfffffff0
 
 #ifndef __ASSEMBLY__
+struct kvm;
 struct kvm_vcpu;
 
 extern char __kvm_hyp_init[];
@@ -47,6 +48,8 @@ extern char __kvm_hyp_vector[];
 extern char __kvm_hyp_code_start[];
 extern char __kvm_hyp_code_end[];
 
+extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
+
 extern void __kvm_flush_vm_context(void);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index c58865b..0c7e782 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -140,4 +140,20 @@ struct kvm_vcpu_stat {
        u32 halt_wakeup;
 };
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+struct kvm;
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+/* We have no shadow page tables, so the two *_age_hva hooks are empty stubs */
+static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return 0;       /* stub: does nothing and always reports 0 */
+}
+
+static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return 0;       /* stub: does nothing and always reports 0 */
+}
+
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 1169a8a..7351eee 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -102,6 +102,15 @@
  */
 #define L_PGD_SWAPPER          (_AT(pgdval_t, 1) << 55)        /* swapper_pg_dir entry */
 
+/*
+ * 2-nd stage PTE definitions for LPAE.
+ */
+#define L_PTE2_SHARED          L_PTE_SHARED
+#define L_PTE2_READ            (_AT(pteval_t, 1) << 6) /* HAP[0] */
+#define L_PTE2_WRITE           (_AT(pteval_t, 1) << 7) /* HAP[1] */
+#define L_PTE2_NORM_WB         (_AT(pteval_t, 3) << 4) /* MemAttr[3:2] */
+#define L_PTE2_INNER_WB                (_AT(pteval_t, 3) << 2) /* MemAttr[1:0] */
+
 #ifndef __ASSEMBLY__
 
 #define pud_none(pud)          (!pud_val(pud))
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index bc83540..a31d0e9 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -70,6 +70,7 @@ extern void __pgd_error(const char *file, int line, pgd_t);
 
 extern pgprot_t                pgprot_user;
 extern pgprot_t                pgprot_kernel;
+extern pgprot_t                pgprot_guest;
 
 #define _MOD_PROT(p, b)        __pgprot(pgprot_val(p) | (b))
 
@@ -83,6 +84,9 @@ extern pgprot_t               pgprot_kernel;
 #define PAGE_KERNEL            _MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC       pgprot_kernel
 #define PAGE_HYP               _MOD_PROT(pgprot_kernel, L_PTE_USER)
+#define PAGE_KVM_GUEST         _MOD_PROT(pgprot_guest, L_PTE2_READ | \
+                                         L_PTE2_NORM_WB | L_PTE2_INNER_WB | \
+                                         L_PTE2_SHARED)
 
 #define __PAGE_NONE            __pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN)
 #define __PAGE_SHARED          __pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 83abbe0..7fa50d3 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM_ARM_HOST
        depends on KVM
        depends on MMU
        depends on CPU_V7 && ARM_VIRT_EXT
+       select  MMU_NOTIFIER
        ---help---
          Provides host support for ARM processors.
 
diff --git a/arch/arm/kvm/exports.c b/arch/arm/kvm/exports.c
index 8ebdf07..f39f823 100644
--- a/arch/arm/kvm/exports.c
+++ b/arch/arm/kvm/exports.c
@@ -33,5 +33,6 @@ EXPORT_SYMBOL_GPL(__kvm_hyp_code_end);
 EXPORT_SYMBOL_GPL(__kvm_vcpu_run);
 
 EXPORT_SYMBOL_GPL(__kvm_flush_vm_context);
+EXPORT_SYMBOL_GPL(__kvm_tlb_flush_vmid);
 
 EXPORT_SYMBOL_GPL(smp_send_reschedule);
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index a0e370b..fd7331c 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -36,9 +36,46 @@ __kvm_hyp_code_start:
        .globl __kvm_hyp_code_start
 
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@  Flush per-VMID TLBs
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+/*
+ * void __kvm_tlb_flush_vmid(struct kvm *kvm);
+ *
+ * We rely on the hardware to broadcast the TLB invalidation to all CPUs
+ * inside the inner-shareable domain (which is the case for all v7
+ * implementations).  If we come across a non-IS SMP implementation, we'll
+ * have to use an IPI based mechanism. Until then, we stick to the simple
+ * hardware assisted version.
+ */
+ENTRY(__kvm_tlb_flush_vmid)
+       hvc     #0                      @ Switch to Hyp mode
+       push    {r2, r3}                @ Preserve r2/r3, used as scratch below
+
+       add     r0, r0, #KVM_VTTBR      @ r0 = kvm + KVM_VTTBR offset
+       ldrd    r2, r3, [r0]            @ r2:r3 = 64-bit value stored there
+       mcrr    p15, 6, r2, r3, c2      @ Write VTTBR
+       isb                             @ Synchronise the VTTBR write
+       mcr     p15, 0, r0, c8, c3, 0   @ TLBIALLIS (rt ignored)
+       dsb                             @ Wait for the invalidation to finish
+       isb
+       mov     r2, #0                  @ r2:r3 = 0
+       mov     r3, #0
+       mcrr    p15, 6, r2, r3, c2      @ Back to VMID #0
+       isb                             @ Synchronise the VTTBR write
+
+       pop     {r2, r3}                @ Restore the scratch registers
+       hvc     #0                      @ Back to SVC
+       bx      lr                      @ Return to the caller
+ENDPROC(__kvm_tlb_flush_vmid)
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 @  Flush TLBs and instruction caches of current CPU for all VMIDs
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
+/*
+ * void __kvm_flush_vm_context(void);
+ */
 ENTRY(__kvm_flush_vm_context)
        hvc     #0                      @ switch to hyp-mode
 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index ddfb3df..f3b0048 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -22,6 +22,7 @@
 #include <asm/pgalloc.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
 
 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
 
@@ -169,6 +170,9 @@ out:
  * Allocates the 1st level table only of size defined by PGD2_ORDER (can
  * support either full 40-bit input addresses or limited to 32-bit input
  * addresses). Clears the allocated pages.
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
  */
 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 {
@@ -229,6 +233,9 @@ static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
  * underlying level-2 and level-3 tables before freeing the actual level-1 table
  * and setting the struct pointer to NULL.
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * destroyed, which can only be done once.
  */
 void kvm_free_stage2_pgd(struct kvm *kvm)
 {
@@ -264,7 +271,216 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
        kvm->arch.pgd = NULL;
 }
 
+static const pte_t null_pte;
+
+static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr,
+                         const pte_t *new_pte)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Level 1: allocate a pmd table, unless we are only clearing */
+       pgd = kvm->arch.pgd + pgd_index(addr);
+       pud = pud_offset(pgd, addr);
+       if (pud_none(*pud)) {
+               if (new_pte == &null_pte)
+                       return 0;       /* never mapped: nothing to clear */
+               pmd = pmd_alloc_one(NULL, addr);
+               if (!pmd) {
+                       kvm_err("Cannot allocate 2nd stage pmd\n");
+                       return -ENOMEM;
+               }
+               pud_populate(NULL, pud, pmd);
+       }
+       pmd = pmd_offset(pud, addr);
+
+       /* Level 2: allocate a pte table, unless we are only clearing */
+       if (pmd_none(*pmd)) {
+               if (new_pte == &null_pte)
+                       return 0;       /* never mapped: nothing to clear */
+               pte = pte_alloc_one_kernel(NULL, addr);
+               if (!pte) {
+                       kvm_err("Cannot allocate 2nd stage pte\n");
+                       return -ENOMEM;
+               }
+               pmd_populate_kernel(NULL, pmd, pte);
+       }
+       pte = pte_offset_kernel(pmd, addr);
+
+       /* Level 3: install the entry; passing &null_pte clears the mapping */
+       set_pte_ext(pte, *new_pte, 0);
+
+       return 0;
+}
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+                         gfn_t gfn, struct kvm_memory_slot *memslot,
+                         bool is_iabt)
+{
+       pte_t new_pte;
+       pfn_t pfn;
+       int ret;
+       bool write_fault, writable;
+
+       /* TODO: decode instr. for non-ISV aborts; until then assume a write */
+       if (is_iabt)
+               write_fault = false;    /* instruction fetches never write */
+       else if ((vcpu->arch.hsr & HSR_ISV) && !(vcpu->arch.hsr & HSR_WNR))
+               write_fault = false;    /* valid syndrome says it is a read */
+       else
+               write_fault = true;
+
+       if ((vcpu->arch.hsr & HSR_FSC_TYPE) == FSC_PERM && !write_fault) {
+               kvm_err("Unexpected L2 read permission error\n");
+               return -EFAULT;
+       }
+
+       /* preemption disabled for handle_exit, gfn_to_pfn may sleep */
+       preempt_enable();
+       pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
+       preempt_disable();
+
+       if (is_error_pfn(pfn)) {
+               put_page(pfn_to_page(pfn));
+               kvm_err("No host mapping: gfn %llu (0x%08llx)\n",
+                       (unsigned long long)gfn,
+                       (unsigned long long)gfn << PAGE_SHIFT);
+               return -EFAULT;
+       }
+
+       mutex_lock(&vcpu->kvm->arch.pgd_mutex);
+       new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
+       if (writable)
+               pte_val(new_pte) |= L_PTE2_WRITE; /* ok even if pte_t is a struct */
+       ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
+       if (ret)
+               put_page(pfn_to_page(pfn));     /* not mapped: drop the page */
+       mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
+
+       return ret;
+}
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu:      the VCPU pointer
+ * @run:       the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean that either the
+ * guest simply needs more memory and we must allocate an appropriate page or it
+ * can mean that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       return -EINVAL;
+       unsigned long hsr_ec;
+       unsigned long fault_status;
+       phys_addr_t fault_ipa;
+       struct kvm_memory_slot *memslot = NULL;
+       bool is_iabt;
+       gfn_t gfn;
+
+       hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
+       is_iabt = (hsr_ec == HSR_EC_IABT);
+
+       /* Only stage-2 translation and permission faults are handled */
+       fault_status = (vcpu->arch.hsr & HSR_FSC_TYPE);
+       if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
+               kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n",
+                       hsr_ec, fault_status);
+               return -EFAULT;
+       }
+
+       fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
+
+       gfn = fault_ipa >> PAGE_SHIFT;
+       if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+               if (is_iabt) {
+                       kvm_err("Inst. abort on I/O address %08llx\n",
+                               (unsigned long long)fault_ipa);
+                       return -EFAULT;
+               }
+
+               kvm_pr_unimpl("I/O address abort...\n");
+               return 0;
+       }
+
+       memslot = gfn_to_memslot(vcpu->kvm, gfn);
+       if (!memslot->user_alloc) {
+               kvm_err("non user-alloc memslots not supported\n");
+               return -EINVAL;
+       }
+
+       return user_mem_abort(vcpu, fault_ipa, gfn, memslot, is_iabt);
+}
+
+static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       bool found = false;     /* stays false if no memslot covers hva */
+
+       mutex_lock(&kvm->slots_lock);   /* may sleep - TODO confirm OK here */
+       slots = kvm_memslots(kvm);
+
+       /* Find the memslot whose userspace range [start, end) contains hva */
+       kvm_for_each_memslot(memslot, slots) {
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gpa_t gpa_offset = hva - start;
+                       *gpa = (memslot->base_gfn << PAGE_SHIFT) + gpa_offset;
+                       found = true;
+                       /* memslots do not overlap, so stop at the first hit */
+                       break;
+               }
+       }
+
+       mutex_unlock(&kvm->slots_lock);
+       return found;   /* *gpa is only written when this is true */
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       bool found;
+       gpa_t gpa;
+
+       if (!kvm->arch.pgd)
+               return 0;       /* no stage-2 pgd: nothing to unmap */
+
+       mutex_lock(&kvm->arch.pgd_mutex);
+       found = hva_to_gpa(kvm, hva, &gpa);
+       if (found) {
+               stage2_set_pte(kvm, gpa, &null_pte);    /* clear the entry */
+               __kvm_tlb_flush_vmid(kvm);      /* then flush this VM's TLBs */
+       }
+       mutex_unlock(&kvm->arch.pgd_mutex);
+       return 0;       /* always 0, whether or not hva was in a memslot */
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       gpa_t gpa;
+       bool found;
+
+       if (!kvm->arch.pgd)
+               return;         /* no stage-2 pgd: nothing to update */
+
+       mutex_lock(&kvm->arch.pgd_mutex);
+       found = hva_to_gpa(kvm, hva, &gpa);
+       if (found) {
+               stage2_set_pte(kvm, gpa, &pte);
+               /*
+                * Ignore return code from stage2_set_pte, since -ENOMEM would
+                * indicate this IPA is not mapped and there is no harm
+                * that the PTE changed.
+                */
+               __kvm_tlb_flush_vmid(kvm);
+       }
+       mutex_unlock(&kvm->arch.pgd_mutex);
 }
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f7439e7..7dd4b54 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -56,9 +56,11 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
 static unsigned int ecc_mask __initdata = 0;
 pgprot_t pgprot_user;
 pgprot_t pgprot_kernel;
+pgprot_t pgprot_guest;
 
 EXPORT_SYMBOL(pgprot_user);
 EXPORT_SYMBOL(pgprot_kernel);
+EXPORT_SYMBOL(pgprot_guest);
 
 struct cachepolicy {
        const char      policy[16];
@@ -520,6 +522,7 @@ static void __init build_mem_type_table(void)
        pgprot_user   = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot);
        pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
                                 L_PTE_DIRTY | kern_pgprot);
+       pgprot_guest  = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG);
 
        mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
        mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to