On 08/01/2012 10:37 PM, Nadav Har'El wrote:
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions
> (prefixed with "EPT_") which correctly read and write EPT tables.
> 

Now paging_tmpl.h becomes really untidy and hard to read; maybe we need
to abstract the operations that depend on PTTYPE.
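
Something like the following, say (the PT_GUEST_* names are invented here
for illustration, they are not existing macros): define the format-specific
bits once per PTTYPE, so the function bodies can stay free of #ifdefs:

	#if PTTYPE == PTTYPE_EPT
		#define PT_GUEST_PRESENT_MASK	(VMX_EPT_READABLE_MASK | \
						 VMX_EPT_WRITABLE_MASK | \
						 VMX_EPT_EXECUTABLE_MASK)
		#define PT_GUEST_ACCESSED_MASK	0	/* no A/D exposed to L1 */
		#define PT_GUEST_DIRTY_MASK	0
	#else
		#define PT_GUEST_PRESENT_MASK	PT_PRESENT_MASK
		#define PT_GUEST_ACCESSED_MASK	PT_ACCESSED_MASK
		#define PT_GUEST_DIRTY_MASK	PT_DIRTY_MASK
	#endif

Then the A/D-related code can test the masks for zero at compile time
instead of using #if PTTYPE != PTTYPE_EPT everywhere.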

> Signed-off-by: Nadav Har'El <n...@il.ibm.com>
> ---
>  arch/x86/kvm/mmu.c         |   14 +----
>  arch/x86/kvm/paging_tmpl.h |   98 ++++++++++++++++++++++++++++++++---
>  2 files changed, 96 insertions(+), 16 deletions(-)
> 
> --- .before/arch/x86/kvm/mmu.c        2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/mmu.c 2012-08-01 17:22:46.000000000 +0300
> @@ -1971,15 +1971,6 @@ static void shadow_walk_next(struct kvm_
>       return __shadow_walk_next(iterator, *iterator->sptep);
>  }
> 
> -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
> -{
> -     u64 spte;
> -
> -     spte = __pa(sp->spt)
> -             | PT_PRESENT_MASK | PT_ACCESSED_MASK
> -             | PT_WRITABLE_MASK | PT_USER_MASK;
> -     mmu_spte_set(sptep, spte);
> -}
> 
>  static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>                                  unsigned direct_access)
> @@ -3427,6 +3418,11 @@ static bool sync_mmio_spte(u64 *sptep, g
>       return false;
>  }
> 
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> --- .before/arch/x86/kvm/paging_tmpl.h        2012-08-01 17:22:46.000000000 +0300
> +++ .after/arch/x86/kvm/paging_tmpl.h 2012-08-01 17:22:46.000000000 +0300
> @@ -50,6 +50,22 @@
>       #define PT_LEVEL_BITS PT32_LEVEL_BITS
>       #define PT_MAX_FULL_LEVELS 2
>       #define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +     #define pt_element_t u64
> +     #define guest_walker guest_walkerEPT
> +     #define FNAME(name) EPT_##name
> +     #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +     #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +     #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +     #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +     #define PT_LEVEL_BITS PT64_LEVEL_BITS
> +     #ifdef CONFIG_X86_64
> +     #define PT_MAX_FULL_LEVELS 4
> +     #define CMPXCHG cmpxchg
> +     #else
> +     #define CMPXCHG cmpxchg64
> +     #define PT_MAX_FULL_LEVELS 2
> +     #endif

Is the PT_MAX_FULL_LEVELS == 3 case missing? Ah, you mentioned it as the
PAE case in PATCH 0.

>  #else
>       #error Invalid PTTYPE value
>  #endif
> @@ -78,6 +94,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_
>       return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>                              pt_element_t __user *ptep_user, unsigned index,
>                              pt_element_t orig_pte, pt_element_t new_pte)
> @@ -100,15 +117,22 @@ static int FNAME(cmpxchg_gpte)(struct kv
> 
>       return (ret != orig_pte);
>  }
> +#endif
> 

Note that EPT A/D bits are supported on newer Intel CPUs, so this function
should be reworked for nested EPT. I know you do not expose this feature
to the guest yet, but we could reduce the difference between nested EPT
and the other MMU models if A/D bits were supported.
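
For instance, the accessed/dirty update could stay common and simply
compile away for formats without A/D bits. A rough sketch, reusing the
invented PT_GUEST_* masks from above (the function name is made up too):

	static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
						     struct kvm_mmu *mmu,
						     struct guest_walker *walker,
						     int write_fault)
	{
		/*
		 * Compile-time 0 for formats without A/D bits (EPT, until
		 * the EPT A/D feature is exposed), so the compiler can drop
		 * the whole body.
		 */
		if (!PT_GUEST_ACCESSED_MASK)
			return 0;
		/*
		 * ... otherwise the same mark-accessed/mark-dirty logic as
		 * the x86 walker today, going through FNAME(cmpxchg_gpte)
		 * with PT_GUEST_ACCESSED_MASK and PT_GUEST_DIRTY_MASK.
		 */
		return 0;
	}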

>  static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
>                                  bool last)
>  {
>       unsigned access;
> 
> +#if PTTYPE == PTTYPE_EPT
> +     /* We rely here on ACC_WRITE_MASK == VMX_EPT_WRITABLE_MASK */
> +     access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +                     ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> +#else
>       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>       if (last && !is_dirty_gpte(gpte))
>               access &= ~ACC_WRITE_MASK;
> +#endif
> 

Maybe we can introduce PT_xxx_MASK definitions to abstract the access bits.
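
For example (just a sketch; PT_GUEST_COPIED_ACC_MASK, PT_GUEST_IMPLIED_ACC
and PT_GUEST_EXEC_MASK are invented names, and this relies on
ACC_WRITE_MASK == VMX_EPT_WRITABLE_MASK just as the patch already does):

	#if PTTYPE == PTTYPE_EPT
		#define PT_GUEST_COPIED_ACC_MASK	VMX_EPT_WRITABLE_MASK
		#define PT_GUEST_IMPLIED_ACC		ACC_USER_MASK
		#define PT_GUEST_EXEC_MASK		VMX_EPT_EXECUTABLE_MASK
	#else
		#define PT_GUEST_COPIED_ACC_MASK	(PT_WRITABLE_MASK | PT_USER_MASK)
		#define PT_GUEST_IMPLIED_ACC		ACC_EXEC_MASK
		#define PT_GUEST_EXEC_MASK		0
	#endif

	static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu,
					   pt_element_t gpte, bool last)
	{
		unsigned access;

		access = (gpte & PT_GUEST_COPIED_ACC_MASK) | PT_GUEST_IMPLIED_ACC |
			 ((gpte & PT_GUEST_EXEC_MASK) ? ACC_EXEC_MASK : 0);
		/* folds away for EPT, where PT_GUEST_DIRTY_MASK is 0 */
		if (PT_GUEST_DIRTY_MASK && last && !(gpte & PT_GUEST_DIRTY_MASK))
			access &= ~ACC_WRITE_MASK;
	#if PTTYPE == 64
		/* NX has no EPT analogue; EPT exec is handled above */
		if (vcpu->arch.mmu.nx)
			access &= ~(gpte >> PT64_NX_SHIFT);
	#endif
		return access;
	}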

>  #if PTTYPE == 64
>       if (vcpu->arch.mmu.nx)
> @@ -135,6 +159,30 @@ static bool FNAME(is_last_gpte)(struct g
>       return false;
>  }
> 
> +static inline int FNAME(is_present_gpte)(unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +     return pte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
> +                     VMX_EPT_EXECUTABLE_MASK);
> +#else
> +     return is_present_gpte(pte);
> +#endif
> +}
> +

Introducing something like PT_PRESENT_BITS (or the PT_GUEST_PRESENT_MASK
from the sketch above) could eliminate this #if, and we also need to
rework is_present_gpte(), since it is used outside of paging_tmpl.h.
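
With a per-format present mask the templated helper becomes #if-free
(sketch, using the invented mask from above):

	static inline int FNAME(is_present_gpte)(pt_element_t pte)
	{
		return !!(pte & PT_GUEST_PRESENT_MASK);
	}

and the generic is_present_gpte() in mmu.h can stay as it is for the
callers outside the template.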

> +static inline int FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
> +                                        bool write_fault, bool user_fault,
> +                                        unsigned long pte)
> +{
> +#if PTTYPE == PTTYPE_EPT
> +     if (unlikely(write_fault && !(pte & VMX_EPT_WRITABLE_MASK)
> +           && (user_fault || is_write_protection(vcpu))))
> +             return false;
> +     return true;
> +#else
> +     return check_write_user_access(vcpu, write_fault, user_fault, pte);
> +#endif
> +}
> +

Ditto; check_write_user_access() needs the same rework.
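
Since PT_WRITABLE_MASK and VMX_EPT_WRITABLE_MASK are the same bit (the
patch already relies on ACC_WRITE_MASK == VMX_EPT_WRITABLE_MASK), one body
could probably serve both formats, e.g. with an invented
PT_GUEST_WRITABLE_MASK defined per PTTYPE:

	static inline bool FNAME(check_write_user_access)(struct kvm_vcpu *vcpu,
							  bool write_fault,
							  bool user_fault,
							  pt_element_t pte)
	{
		/* PT_GUEST_WRITABLE_MASK: VMX_EPT_WRITABLE_MASK for EPT,
		   PT_WRITABLE_MASK otherwise (the same bit either way). */
		if (unlikely(write_fault && !(pte & PT_GUEST_WRITABLE_MASK) &&
			     (user_fault || is_write_protection(vcpu))))
			return false;
		return true;
	}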

>  /*
>   * Fetch a guest pte for a guest virtual address
>   */
> @@ -155,7 +203,9 @@ static int FNAME(walk_addr_generic)(stru
>       u16 errcode = 0;
> 
>       trace_kvm_mmu_pagetable_walk(addr, access);
> +#if PTTYPE != PTTYPE_EPT
>  retry_walk:
> +#endif
>       eperm = false;
>       walker->level = mmu->root_level;
>       pte           = mmu->get_cr3(vcpu);
> @@ -202,7 +252,7 @@ retry_walk:
> 
>               trace_kvm_mmu_paging_element(pte, walker->level);
> 
> -             if (unlikely(!is_present_gpte(pte)))
> +             if (unlikely(!FNAME(is_present_gpte)(pte)))
>                       goto error;
> 
>               if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
> @@ -211,13 +261,16 @@ retry_walk:
>                       goto error;
>               }
> 
> -             if (!check_write_user_access(vcpu, write_fault, user_fault,
> -                                       pte))
> +             if (!FNAME(check_write_user_access)(vcpu, write_fault,
> +                                       user_fault, pte))
>                       eperm = true;
> 
>  #if PTTYPE == 64
>               if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
>                       eperm = true;
> +#elif PTTYPE == PTTYPE_EPT
> +             if (unlikely(fetch_fault && !(pte & VMX_EPT_EXECUTABLE_MASK)))
> +                     eperm = true;
>  #endif
> 
>               last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
> @@ -225,12 +278,15 @@ retry_walk:
>                       pte_access = pt_access &
>                                    FNAME(gpte_access)(vcpu, pte, true);
>                       /* check if the kernel is fetching from user page */
> +#if PTTYPE != PTTYPE_EPT
>                       if (unlikely(pte_access & PT_USER_MASK) &&
>                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
>                               if (fetch_fault && !user_fault)
>                                       eperm = true;
> +#endif
>               }
> 
> +#if PTTYPE != PTTYPE_EPT
>               if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
>                       int ret;
>                       trace_kvm_mmu_set_accessed_bit(table_gfn, index,
> @@ -245,6 +301,7 @@ retry_walk:
>                       mark_page_dirty(vcpu->kvm, table_gfn);
>                       pte |= PT_ACCESSED_MASK;
>               }
> +#endif

If A/D bits were supported, could these differences be removed?

> 
>               walker->ptes[walker->level - 1] = pte;
> 
> @@ -283,6 +340,7 @@ retry_walk:
>               goto error;
>       }
> 
> +#if PTTYPE != PTTYPE_EPT
>       if (write_fault && unlikely(!is_dirty_gpte(pte))) {
>               int ret;
> 
> @@ -298,6 +356,7 @@ retry_walk:
>               pte |= PT_DIRTY_MASK;
>               walker->ptes[walker->level - 1] = pte;
>       }
> +#endif
> 
>       walker->pt_access = pt_access;
>       walker->pte_access = pte_access;
> @@ -328,6 +387,7 @@ static int FNAME(walk_addr)(struct guest
>                                       access);
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>                                  struct kvm_vcpu *vcpu, gva_t addr,
>                                  u32 access)
> @@ -335,6 +395,7 @@ static int FNAME(walk_addr_nested)(struc
>       return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>                                       addr, access);
>  }
> +#endif
> 

Hmm, so you do not need the nested walking function for the EPT case?

>  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>                                   struct kvm_mmu_page *sp, u64 *spte,
> @@ -343,11 +404,13 @@ static bool FNAME(prefetch_invalid_gpte)
>       if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
>               goto no_present;
> 
> -     if (!is_present_gpte(gpte))
> +     if (!FNAME(is_present_gpte)(gpte))
>               goto no_present;
> 
> +#if PTTYPE != PTTYPE_EPT
>       if (!(gpte & PT_ACCESSED_MASK))
>               goto no_present;
> +#endif
> 
>       return false;
> 
> @@ -458,6 +521,20 @@ static void FNAME(pte_prefetch)(struct k
>                            pfn, true, true);
>       }
>  }
> +static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
> +{
> +     u64 spte;
> +
> +     spte = __pa(sp->spt)
> +#if PTTYPE == PTTYPE_EPT
> +             | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK
> +             | VMX_EPT_EXECUTABLE_MASK;
> +#else
> +             | PT_PRESENT_MASK | PT_ACCESSED_MASK
> +             | PT_WRITABLE_MASK | PT_USER_MASK;
> +#endif
> +     mmu_spte_set(sptep, spte);
> +}
> 
>  /*
>   * Fetch a shadow pte for a specific level in the paging hierarchy.
> @@ -474,7 +551,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>       unsigned direct_access;
>       struct kvm_shadow_walk_iterator it;
> 
> -     if (!is_present_gpte(gw->ptes[gw->level - 1]))
> +     if (!FNAME(is_present_gpte)(gw->ptes[gw->level - 1]))
>               return NULL;
> 
>       direct_access = gw->pte_access;
> @@ -514,7 +591,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>                       goto out_gpte_changed;
> 
>               if (sp)
> -                     link_shadow_page(it.sptep, sp);
> +                     FNAME(link_shadow_page)(it.sptep, sp);
>       }
> 
>       for (;
> @@ -534,10 +611,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> 
>               sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
>                                     true, direct_access, it.sptep);
> -             link_shadow_page(it.sptep, sp);
> +             FNAME(link_shadow_page)(it.sptep, sp);
>       }
> 
>       clear_sp_write_flooding_count(it.sptep);
> +     /* TODO: Consider if everything that set_spte() does is correct when
> +        the shadow page table is actually EPT. Most is fine (for direct_map)
> +        but it appears there may be a few wrong corner cases with
> +        PT_USER_MASK, PT64_NX_MASK, etc., and I need to review everything
> +      */

Maybe it is OK, but you need to take care of the A/D bits (currently you
do not expose A/D bits to the guest, however they may be supported on L0).

>       mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
>                    user_fault, write_fault, emulate, it.level,
>                    gw->gfn, pfn, prefault, map_writable);
> @@ -733,6 +815,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
>       return gpa;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>                                     u32 access,
>                                     struct x86_exception *exception)
> @@ -751,6 +834,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(st
> 
>       return gpa;
>  }
> +#endif

Why is it not needed?


