Future Intel CPUs will extend the maximum physical address width to
52 bits. To support the new physical address width, EPT is extended
to support a 5-level page table.

This patch adds 5-level EPT support and extends the shadow paging
code to support guests that use 5-level paging. As an RFC, this patch
enables 5-level EPT whenever the hardware supports it. That is not a
good choice, because a 5-level EPT walk requires more memory accesses
than a 4-level one; the right thing is to use 5-level EPT only when
it is actually needed, which will be changed in a future version.
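
For background, a rough sketch of the arithmetic involved (an
illustration only, not code from this patch; the helper names
va_bits() and eptp_gaw() are made up): with 4KB pages, each level of
the walk resolves 9 bits of virtual address on top of the 12-bit page
offset, so a 4-level walk covers 9 * 4 + 12 = 48 bits and a 5-level
walk covers 9 * 5 + 12 = 57 bits (hence "LA57"). The EPTP encodes the
walk length minus one in its guest-address-width (GAW) field, which
is what get_ept_level() - 1 computes in construct_eptp() below.

	/* Illustration only: VA bits covered by an N-level 4KB walk. */
	static inline unsigned int va_bits(unsigned int levels)
	{
		return levels * 9 + 12;	/* 4 -> 48, 5 -> 57 */
	}

	/* Illustration only: GAW field value for an N-level EPT walk. */
	static inline unsigned int eptp_gaw(unsigned int levels)
	{
		return levels - 1;	/* 4 -> 3, 5 -> 4 */
	}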

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Dave Hansen <dave.han...@linux.intel.com>
Cc: Xiao Guangrong <guangrong.x...@linux.intel.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: "Radim Kr??m????" <rkrc...@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   3 +-
 arch/x86/include/asm/vmx.h      |   1 +
 arch/x86/kvm/cpuid.h            |   8 ++
 arch/x86/kvm/mmu.c              | 167 +++++++++++++++++++++++++++++++---------
 arch/x86/kvm/mmu_audit.c        |   5 +-
 arch/x86/kvm/paging_tmpl.h      |  19 ++++-
 arch/x86/kvm/vmx.c              |  19 +++--
 arch/x86/kvm/x86.h              |  10 +++
 8 files changed, 184 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc..e505dac 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -124,6 +124,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 #define KVM_NR_VAR_MTRR 8
 
 #define ASYNC_PF_PER_VCPU 64
+#define PT64_ROOT_5LEVEL 5
 
 enum kvm_reg {
        VCPU_REGS_RAX = 0,
@@ -310,7 +311,7 @@ struct kvm_pio_request {
 };
 
 struct rsvd_bits_validate {
-       u64 rsvd_bits_mask[2][4];
+       u64 rsvd_bits_mask[2][PT64_ROOT_5LEVEL];
        u64 bad_mt_xwr;
 };
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4..bf2f178 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -442,6 +442,7 @@ enum vmcs_field {
 
 #define VMX_EPT_EXECUTE_ONLY_BIT               (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT                        (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT                        (1ull << 7)
 #define VMX_EPTP_UC_BIT                                (1ull << 8)
 #define VMX_EPTP_WB_BIT                                (1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2..4bdf3dc 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
        return best && (best->ecx & bit(X86_FEATURE_PKU));
 }
 
+static inline bool guest_cpuid_has_la57(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       return best && (best->ecx & bit(X86_FEATURE_LA57));
+}
+
 static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c40273..0a56f27 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1986,8 +1986,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 struct mmu_page_path {
-       struct kvm_mmu_page *parent[PT64_ROOT_4LEVEL];
-       unsigned int idx[PT64_ROOT_4LEVEL];
+       struct kvm_mmu_page *parent[PT64_ROOT_5LEVEL];
+       unsigned int idx[PT64_ROOT_5LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)                      \
@@ -2198,6 +2198,11 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
            !vcpu->arch.mmu.direct_map)
                --iterator->level;
 
+       if (iterator->level == PT64_ROOT_5LEVEL &&
+           vcpu->arch.mmu.root_level < PT64_ROOT_5LEVEL &&
+           !vcpu->arch.mmu.direct_map)
+               iterator->level -= 2;
+
        if (iterator->level == PT32E_ROOT_LEVEL) {
                iterator->shadow_addr
                        = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -3061,9 +3066,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
 
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
-           (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
-            vcpu->arch.mmu.direct_map)) {
+       if ((vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL &&
+            (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+             vcpu->arch.mmu.direct_map)) ||
+           (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL &&
+            (vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL ||
+             vcpu->arch.mmu.direct_map))) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
                spin_lock(&vcpu->kvm->mmu_lock);
@@ -3114,10 +3122,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
        struct kvm_mmu_page *sp;
        unsigned i;
 
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
                spin_lock(&vcpu->kvm->mmu_lock);
                make_mmu_pages_available(vcpu);
-               sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_4LEVEL, 1, ACC_ALL);
+               sp = kvm_mmu_get_page(vcpu, 0, 0,
+                               vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
                vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3158,15 +3168,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
                MMU_WARN_ON(VALID_PAGE(root));
 
                spin_lock(&vcpu->kvm->mmu_lock);
                make_mmu_pages_available(vcpu);
-               sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_4LEVEL,
-                                     0, ACC_ALL);
+               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+                               vcpu->arch.mmu.root_level, 0, ACC_ALL);
                root = __pa(sp->spt);
                ++sp->root_count;
                spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3180,7 +3191,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * the shadow page table may be a PAE or a long mode page table.
         */
        pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL)
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
        for (i = 0; i < 4; ++i) {
@@ -3213,7 +3225,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         * If we shadow a 32 bit page table with a long mode page
         * table we enter this path.
         */
-       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.shadow_root_level == PT64_ROOT_5LEVEL) {
                if (vcpu->arch.mmu.lm_root == NULL) {
                        /*
                         * The additional page necessary for this is only
@@ -3257,8 +3270,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
                return;
 
        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-       kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
-       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
                sp = page_header(root);
                mmu_sync_children(vcpu, sp);
@@ -3334,7 +3347,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
        struct kvm_shadow_walk_iterator iterator;
-       u64 sptes[PT64_ROOT_4LEVEL], spte = 0ull;
+       u64 sptes[PT64_ROOT_5LEVEL], spte = 0ull;
        int root, leaf;
        bool reserved = false;
 
@@ -3655,10 +3668,16 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 }
 
 #define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE_LA57 57
+
 #define PTTYPE PTTYPE_EPT
 #include "paging_tmpl.h"
 #undef PTTYPE
 
+#define PTTYPE PTTYPE_LA57
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -3747,6 +3766,26 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
                rsvd_check->rsvd_bits_mask[1][0] =
                        rsvd_check->rsvd_bits_mask[0][0];
                break;
+       case PT64_ROOT_5LEVEL:
+               rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+                       nonleaf_bit8_rsvd | rsvd_bits(7, 7);
+               rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+                       nonleaf_bit8_rsvd | rsvd_bits(7, 7);
+               rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+                       nonleaf_bit8_rsvd | gbpages_bit_rsvd;
+               rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd;
+               rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd;
+               rsvd_check->rsvd_bits_mask[1][4] =
+                       rsvd_check->rsvd_bits_mask[0][4];
+               rsvd_check->rsvd_bits_mask[1][3] =
+                       rsvd_check->rsvd_bits_mask[0][3];
+               rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+                       gbpages_bit_rsvd | rsvd_bits(13, 29);
+               rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+                       rsvd_bits(13, 20);              /* large page */
+               rsvd_check->rsvd_bits_mask[1][0] =
+                       rsvd_check->rsvd_bits_mask[0][0];
+               break;
        }
 }
 
@@ -3761,25 +3800,43 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 
 static void
 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
-                           int maxphyaddr, bool execonly)
+                           int maxphyaddr, bool execonly, int ept_level)
 {
        u64 bad_mt_xwr;
 
-       rsvd_check->rsvd_bits_mask[0][3] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
-       rsvd_check->rsvd_bits_mask[0][2] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
-       rsvd_check->rsvd_bits_mask[0][1] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
-       rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
-
-       /* large page */
-       rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
-       rsvd_check->rsvd_bits_mask[1][2] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
-       rsvd_check->rsvd_bits_mask[1][1] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
-       rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+       if (ept_level == PT64_ROOT_5LEVEL) {
+               rsvd_check->rsvd_bits_mask[0][4] = rsvd_bits(3, 7);
+               rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(3, 7);
+               rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(3, 6);
+               rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(3, 6);
+               rsvd_check->rsvd_bits_mask[0][0] = 0;
+
+               /* large page */
+               rsvd_check->rsvd_bits_mask[1][4] =
+                        rsvd_check->rsvd_bits_mask[0][4];
+               rsvd_check->rsvd_bits_mask[1][3] =
+                        rsvd_check->rsvd_bits_mask[0][3];
+               rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(12, 29);
+               rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(12, 20);
+               rsvd_check->rsvd_bits_mask[1][0] = 0;
+       } else {
+               rsvd_check->rsvd_bits_mask[0][3] =
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+               rsvd_check->rsvd_bits_mask[0][2] =
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+               rsvd_check->rsvd_bits_mask[0][1] =
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+               rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+               /* large page */
+               rsvd_check->rsvd_bits_mask[1][3] =
+                        rsvd_check->rsvd_bits_mask[0][3];
+               rsvd_check->rsvd_bits_mask[1][2] =
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+               rsvd_check->rsvd_bits_mask[1][1] =
+                       rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+               rsvd_check->rsvd_bits_mask[1][0] =
+                        rsvd_check->rsvd_bits_mask[0][0];
+       }
 
        bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
        bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
@@ -3794,10 +3851,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 }
 
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
-               struct kvm_mmu *context, bool execonly)
+               struct kvm_mmu *context, bool execonly, int ept_level)
 {
        __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
-                                   cpuid_maxphyaddr(vcpu), execonly);
+                       cpuid_maxphyaddr(vcpu), execonly, ept_level);
 }
 
 /*
@@ -3844,8 +3901,8 @@ static inline bool boot_cpu_is_amd(void)
                                        true, true);
        else
                __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
-                                           boot_cpu_data.x86_phys_bits,
-                                           false);
+                                           boot_cpu_data.x86_phys_bits, false,
+                                           context->shadow_root_level);
 
 }
 
@@ -3858,7 +3915,8 @@ static inline bool boot_cpu_is_amd(void)
                                struct kvm_mmu *context, bool execonly)
 {
        __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
-                                   boot_cpu_data.x86_phys_bits, execonly);
+                                   boot_cpu_data.x86_phys_bits, execonly,
+                                   context->shadow_root_level);
 }
 
 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
@@ -4037,6 +4095,28 @@ static void paging64_init_context(struct kvm_vcpu *vcpu,
        paging64_init_context_common(vcpu, context, PT64_ROOT_4LEVEL);
 }
 
+static void paging_la57_init_context(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu *context)
+{
+       context->nx = is_nx(vcpu);
+       context->root_level = PT64_ROOT_5LEVEL;
+
+       reset_rsvds_bits_mask(vcpu, context);
+       update_permission_bitmask(vcpu, context, false);
+       update_pkru_bitmask(vcpu, context, false);
+       update_last_nonleaf_level(vcpu, context);
+
+       MMU_WARN_ON(!is_pae(vcpu));
+       context->page_fault = paging_la57_page_fault;
+       context->gva_to_gpa = paging_la57_gva_to_gpa;
+       context->sync_page = paging_la57_sync_page;
+       context->invlpg = paging_la57_invlpg;
+       context->update_pte = paging_la57_update_pte;
+       context->shadow_root_level = PT64_ROOT_5LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       context->direct_map = false;
+}
+
 static void paging32_init_context(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context)
 {
@@ -4086,6 +4166,11 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
                context->nx = false;
                context->gva_to_gpa = nonpaging_gva_to_gpa;
                context->root_level = 0;
+       } else if (is_la57_mode(vcpu)) {
+               context->nx = is_nx(vcpu);
+               context->root_level = PT64_ROOT_5LEVEL;
+               reset_rsvds_bits_mask(vcpu, context);
+               context->gva_to_gpa = paging_la57_gva_to_gpa;
        } else if (is_long_mode(vcpu)) {
                context->nx = is_nx(vcpu);
                context->root_level = PT64_ROOT_4LEVEL;
@@ -4119,6 +4204,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 
        if (!is_paging(vcpu))
                nonpaging_init_context(vcpu, context);
+       else if (is_la57_mode(vcpu))
+               paging_la57_init_context(vcpu, context);
        else if (is_long_mode(vcpu))
                paging64_init_context(vcpu, context);
        else if (is_pae(vcpu))
@@ -4158,7 +4245,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 
        update_permission_bitmask(vcpu, context, true);
        update_pkru_bitmask(vcpu, context, true);
-       reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+       reset_rsvds_bits_mask_ept(vcpu, context, execonly,
+                                 context->shadow_root_level);
        reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -4194,6 +4282,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
                g_context->nx = false;
                g_context->root_level = 0;
                g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+       } else if (is_la57_mode(vcpu)) {
+               g_context->nx = is_nx(vcpu);
+               g_context->root_level = PT64_ROOT_5LEVEL;
+               reset_rsvds_bits_mask(vcpu, g_context);
+               g_context->gva_to_gpa = paging_la57_gva_to_gpa_nested;
        } else if (is_long_mode(vcpu)) {
                g_context->nx = is_nx(vcpu);
                g_context->root_level = PT64_ROOT_4LEVEL;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2e6996d..bb40094 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -62,11 +62,12 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;
 
-       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL) {
+       if (vcpu->arch.mmu.root_level == PT64_ROOT_4LEVEL ||
+           vcpu->arch.mmu.root_level == PT64_ROOT_5LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
                sp = page_header(root);
-               __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_4LEVEL);
+               __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
                return;
        }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a011054..c126cd3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,21 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
        #define CMPXCHG cmpxchg64
        #define PT_MAX_FULL_LEVELS 2
        #endif
+#elif PTTYPE == PTTYPE_LA57
+       #define pt_element_t u64
+       #define guest_walker guest_walker_la57
+       #define FNAME(name) paging_la57_##name
+       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+       #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+       #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+       #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+       #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+       #define PT_MAX_FULL_LEVELS 5
+       #define CMPXCHG cmpxchg
 #elif PTTYPE == 32
        #define pt_element_t u32
        #define guest_walker guest_walker32
@@ -266,7 +281,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
 {
        unsigned pkeys = 0;
-#if PTTYPE == 64
+#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
        pte_t pte = {.pte = gpte};
 
        pkeys = pte_flags_pkey(pte_flags(pte));
@@ -300,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        walker->level = mmu->root_level;
        pte           = mmu->get_cr3(vcpu);
 
-#if PTTYPE == 64
+#if PTTYPE == 64 || PTTYPE == PTTYPE_LA57
        if (walker->level == PT32E_ROOT_LEVEL) {
                pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 24db5fb..bfc9f0a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1220,6 +1220,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
        return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
 }
 
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+       return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
 static inline bool cpu_has_vmx_ept_ad_bits(void)
 {
        return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -4249,13 +4254,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmx->emulation_required = emulation_required(vcpu);
 }
 
+static int get_ept_level(void)
+{
+       if (cpu_has_vmx_ept_5levels())
+               return VMX_EPT_MAX_GAW + 1;
+       return VMX_EPT_DEFAULT_GAW + 1;
+}
+
 static u64 construct_eptp(unsigned long root_hpa)
 {
        u64 eptp;
 
        /* TODO write the value reading from MSR */
        eptp = VMX_EPT_DEFAULT_MT |
-               VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+               (get_ept_level() - 1) << VMX_EPT_GAW_EPTP_SHIFT;
        if (enable_ept_ad_bits)
                eptp |= VMX_EPT_AD_ENABLE_BIT;
        eptp |= (root_hpa & PAGE_MASK);
@@ -9356,11 +9368,6 @@ static void __init vmx_check_processor_compat(void *rtn)
        }
 }
 
-static int get_ept_level(void)
-{
-       return VMX_EPT_DEFAULT_GAW + 1;
-}
-
 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
        u8 cache;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e8ff3e4..26627df 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -60,6 +60,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
        return cs_l;
 }
 
+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       return (vcpu->arch.efer & EFER_LMA) &&
+                kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+       return false;
+#endif
+}
+
 static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
-- 
1.9.1
