On Wed, Jun 18, 2014 at 08:12:05PM -0300, mtosa...@redhat.com wrote:
> Allow vcpus to pin spte translations by:
> 
> 1) Creating a per-vcpu list of pinned ranges.
What if the memory slot containing a pinned range is going away?
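
One possible answer, purely as a sketch (the unpin-on-slot-removal policy and
the helper name are hypothetical, not part of this patch): when the slot is
torn down, e.g. from kvm_arch_flush_shadow_memslot(), walk each vcpu's pinned
ranges, drop the overlapping ones and force an MMU reload:

static void kvm_unpin_ranges_in_slot(struct kvm *kvm,
				     struct kvm_memory_slot *slot)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		struct kvm_pinned_page_range *p, *tmp;

		mutex_lock(&vcpu->arch.pinned_mmu_mutex);
		list_for_each_entry_safe(p, tmp,
					 &vcpu->arch.pinned_mmu_pages, link) {
			/* drop ranges that overlap the vanishing slot */
			if (p->base_gfn < slot->base_gfn + slot->npages &&
			    slot->base_gfn < p->base_gfn + p->npages) {
				list_del(&p->link);
				vcpu->arch.nr_pinned_ranges--;
				kfree(p);
			}
		}
		mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
		/* kick the vcpu so nothing keeps using the stale pin */
		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
	}
}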

> 2) On mmu reload request:
>       - Fault ranges.
>       - Mark sptes with a pinned bit.
Should also be marked "dirty", as per the SDM:
 "The three DS save area sections should be allocated from a non-paged pool,
  and marked accessed and dirty."
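
For illustration, a minimal sketch of how the pin path could set those bits on
the leaf spte as well (shadow_accessed_mask/shadow_dirty_mask and
mmu_spte_update() already exist in mmu.c; the helper name here is made up):

static void mark_pinned_spte_accessed_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	/* pin and satisfy the SDM's accessed/dirty requirement */
	spte |= SPTE_PINNED;
	spte |= shadow_accessed_mask | shadow_dirty_mask;
	mmu_spte_update(sptep, spte);
}

direct_pin_sptes() could then call this for the last-level spte instead of the
bare set_bit().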

Some comments below.

>       - Mark shadow pages as pinned.
> 
> 3) Then modify the following actions:
>       - Page age => skip spte flush.
>       - MMU notifiers => force mmu reload request (which kicks cpu out of
>                               guest mode).
>       - GET_DIRTY_LOG => force mmu reload request.
>       - SLAB shrinker => skip shadow page deletion.
> 
> TDP-only.
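
(For clarity, a hypothetical usage sketch of the new interface; the real
caller, presumably the PEBS/DS patch, is not part of this series, and
ds_base_gfn/ds_npages are made-up names:)

	/* hypothetical usage, e.g. when the guest enables the DS area: */
	r = kvm_mmu_register_pinned_range(vcpu, ds_base_gfn, ds_npages);
	if (r)
		return r;

	/* and on teardown: */
	kvm_mmu_unregister_pinned_range(vcpu, ds_base_gfn, ds_npages);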
> 
> Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
> 
> ---
>  arch/x86/include/asm/kvm_host.h |   14 ++
>  arch/x86/kvm/mmu.c              |  202 ++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu.h              |    5 
>  arch/x86/kvm/mmutrace.h         |   23 ++++
>  arch/x86/kvm/paging_tmpl.h      |    2 
>  arch/x86/kvm/x86.c              |    4 
>  6 files changed, 241 insertions(+), 9 deletions(-)
> 
> Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h     2014-06-18 17:28:17.549456614 -0300
> +++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h  2014-06-18 17:28:24.338435658 -0300
> @@ -221,6 +221,8 @@
>       /* hold the gfn of each spte inside spt */
>       gfn_t *gfns;
>       bool unsync;
> +     bool pinned;
> +
>       int root_count;          /* Currently serving as active root */
>       unsigned int unsync_children;
>       unsigned long parent_ptes;      /* Reverse mapping for parent_pte */
> @@ -337,6 +339,14 @@
>       KVM_DEBUGREG_WONT_EXIT = 2,
>  };
>  
> +struct kvm_pinned_page_range {
> +     gfn_t base_gfn;
> +     unsigned long npages;
> +     struct list_head link;
> +};
> +
> +#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
> +
>  struct kvm_vcpu_arch {
>       /*
>        * rip and regs accesses must go through
> @@ -392,6 +402,10 @@
>       struct kvm_mmu_memory_cache mmu_page_cache;
>       struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +     struct list_head pinned_mmu_pages;
> +     struct mutex pinned_mmu_mutex;
> +     unsigned int nr_pinned_ranges;
> +
>       struct fpu guest_fpu;
>       u64 xcr0;
>       u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c  2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c       2014-06-18 17:28:24.339435654 -0300
> @@ -148,6 +148,9 @@
>  
>  #define SPTE_HOST_WRITEABLE  (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_MMU_WRITEABLE   (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED          (1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
>  
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>  
> @@ -327,6 +330,11 @@
>       return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
>  }
>  
> +static int is_pinned_spte(u64 spte)
> +{
> +     return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
>  static int is_large_pte(u64 pte)
>  {
>       return pte & PT_PAGE_SIZE_MASK;
> @@ -2818,7 +2826,7 @@
>   * - false: let the real page fault path to fix it.
>   */
>  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> -                         u32 error_code)
> +                         u32 error_code, bool pin)
>  {
>       struct kvm_shadow_walk_iterator iterator;
>       struct kvm_mmu_page *sp;
> @@ -2828,6 +2836,9 @@
>       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>               return false;
>  
> +     if (pin)
> +             return false;
> +
>       if (!page_fault_can_be_fast(error_code))
>               return false;
>  
> @@ -2895,9 +2906,55 @@
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -                      gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +                      gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +                      bool pin);
>  static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
>  
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +{
> +     struct kvm_shadow_walk_iterator iterator;
> +     int nr_sptes = 0;
> +
> +     if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> +             return nr_sptes;
> +
> +     for_each_shadow_entry(vcpu, addr, iterator) {
> +             sptes[iterator.level-1] = iterator.sptep;
> +             nr_sptes++;
> +             if (!is_shadow_present_pte(*iterator.sptep))
> +                     break;
> +     }
> +
> +     return nr_sptes;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +     u64 *sptes[4];
> +     int r, i, level;
> +
> +     r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> +     if (!r)
> +             return false;
> +
> +     level = 5 - r;
> +     if (!is_last_spte(*sptes[r-1], level))
> +             return false;
> +     if (!is_shadow_present_pte(*sptes[r-1]))
> +             return false;
> +
> +     for (i = 0; i < r; i++) {
> +             u64 *sptep = sptes[i];
> +             struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> +             sp->pinned = true;
> +             set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +     }
> +
> +     return true;
> +}
> +
>  static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
>                        gfn_t gfn, bool prefault, bool pin, bool *pinned)
>  {
> @@ -2923,13 +2980,14 @@
>       } else
>               level = PT_PAGE_TABLE_LEVEL;
>  
> -     if (fast_page_fault(vcpu, v, level, error_code))
> +     if (fast_page_fault(vcpu, v, level, error_code, pin))
>               return 0;
>  
>       mmu_seq = vcpu->kvm->mmu_notifier_seq;
>       smp_rmb();
>  
> -     if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
> +     if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
> +                      pin))
>               return 0;
>  
>       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
> @@ -2943,6 +3001,8 @@
>               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
>       r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
>                        prefault);
> +     if (pin)
> +             *pinned = direct_pin_sptes(vcpu, gfn);
>       spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  
> @@ -3349,7 +3409,8 @@
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -                      gva_t gva, pfn_t *pfn, bool write, bool *writable)
> +                      gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +                      bool pin)
>  {
>       bool async;
>  
> @@ -3358,7 +3419,7 @@
>       if (!async)
>               return false; /* *pfn has correct page already */
>  
> -     if (!prefault && can_do_async_pf(vcpu)) {
> +     if (!prefault && !pin && can_do_async_pf(vcpu)) {
>               trace_kvm_try_async_get_page(gva, gfn);
>               if (kvm_find_async_pf_gfn(vcpu, gfn)) {
>                       trace_kvm_async_pf_doublefault(gva, gfn);
> @@ -3406,13 +3467,14 @@
>       } else
>               level = PT_PAGE_TABLE_LEVEL;
>  
> -     if (fast_page_fault(vcpu, gpa, level, error_code))
> +     if (fast_page_fault(vcpu, gpa, level, error_code, pin))
>               return 0;
>  
>       mmu_seq = vcpu->kvm->mmu_notifier_seq;
>       smp_rmb();
>  
> -     if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
> +     if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
> +                      pin))
>               return 0;
>  
>       if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
> @@ -3426,6 +3488,8 @@
>               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
>       r = __direct_map(vcpu, gpa, write, map_writable,
>                        level, gfn, pfn, prefault);
> +     if (pin)
> +             *pinned = direct_pin_sptes(vcpu, gfn);
>       spin_unlock(&vcpu->kvm->mmu_lock);
>  
>       return r;
> @@ -3903,6 +3967,127 @@
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
>  
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> +                               gfn_t base_gfn, unsigned long npages)
> +{
> +     struct kvm_pinned_page_range *p;
> +
> +     mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +     list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +             if (p->base_gfn == base_gfn && p->npages == npages) {
> +                     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +                     return -EEXIST;
> +             }
> +     }
> +     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +
> +     if (vcpu->arch.nr_pinned_ranges >=
> +         KVM_MAX_PER_VCPU_PINNED_RANGE)
> +             return -ENOSPC;
Shouldn't we refuse to register a pinned range if !TDP?
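
For instance (a sketch; tdp_enabled is the existing flag in mmu.c and -EINVAL
is just one plausible error code), at the top of
kvm_mmu_register_pinned_range():

	/* pinning is only implemented for direct (TDP) maps */
	if (!tdp_enabled)
		return -EINVAL;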

> +
> +     p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
> +     if (!p)
> +             return -ENOMEM;
> +
> +     vcpu->arch.nr_pinned_ranges++;
> +
> +     trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
> +
> +     INIT_LIST_HEAD(&p->link);
> +     p->base_gfn = base_gfn;
> +     p->npages = npages;
> +     mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +     list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
> +     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +     kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> +
> +     return 0;
> +}
> +
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> +                                 gfn_t base_gfn, unsigned long npages)
> +{
> +     struct kvm_pinned_page_range *p;
> +
> +     mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +     list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +             if (p->base_gfn == base_gfn && p->npages == npages) {
> +                     list_del(&p->link);
> +                     vcpu->arch.nr_pinned_ranges--;
> +                     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +                     kfree(p);
> +                     return 0;
> +             }
> +     }
> +
> +     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +     return -ENOENT;
> +}
> +
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
> +{
> +     struct kvm_pinned_page_range *p, *p2;
> +
> +     mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +     list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
> +             list_del(&p->link);
> +             kfree(p);
> +     }
> +     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
> +/*
> + * Pin KVM MMU page translations. This guarantees, for valid
> + * addresses registered by kvm_mmu_register_pinned_range (valid address
> + * meaning an address which possesses sufficient information for the fault to
> + * be resolved), valid translations exist while in guest mode and
> + * therefore no VM-exits due to faults will occur.
> + *
> + * Failure to instantiate pages will abort guest entry.
> + *
> + * Page frames should be pinned with get_page in advance.
> + *
> + * Pinning is not guaranteed while executing as L2 guest.
> + *
> + */
> +
> +static void kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
> +{
> +     struct kvm_pinned_page_range *p;
> +
> +     if (is_guest_mode(vcpu))
> +             return;
> +
> +     if (!vcpu->arch.mmu.direct_map)
> +             return;
> +
> +     ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> +     mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +     list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +             gfn_t gfn_offset;
> +
> +             for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
> +                     gfn_t gfn = p->base_gfn + gfn_offset;
> +                     int r;
> +                     bool pinned = false;
> +
> +                     r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
> +                                                  PFERR_WRITE_MASK, false,
> +                                                  true, &pinned);
> +                     /* MMU notifier sequence window: retry */
> +                     if (!r && !pinned)
> +                             kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> +                     if (r) {
> +                             kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
I do not think a triple fault is appropriate here. The reasons for a triple
fault are documented in the SDM and this is not one of them. What about an
error exit to user space?
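
A sketch of what that could look like (assuming kvm_mmu_pin_pages() is made to
return an error that kvm_mmu_load() passes on, so vcpu_enter_guest() fails and
KVM_RUN returns the error to user space):

static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
{
	struct kvm_pinned_page_range *p;
	int ret = 0;

	if (is_guest_mode(vcpu) || !vcpu->arch.mmu.direct_map)
		return 0;

	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
		gfn_t gfn_offset;

		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
			gfn_t gfn = p->base_gfn + gfn_offset;
			bool pinned = false;
			int r;

			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
						      PFERR_WRITE_MASK, false,
						      true, &pinned);
			/* MMU notifier sequence window: retry */
			if (!r && !pinned)
				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
			if (r) {
				/*
				 * Normalize to a negative errno: positive
				 * values mean "keep running" to the x86
				 * run loop.
				 */
				ret = r < 0 ? r : -EFAULT;
				goto out;
			}
		}
	}
out:
	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
	return ret;
}

kvm_mmu_load() would then propagate it ("r = kvm_mmu_pin_pages(vcpu); if (r)
goto out;"), so the KVM_RUN ioctl fails with that error instead of the guest
being shut down.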

> +                             break;
> +                     }
> +
> +             }
> +     }
> +     mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
>  int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>       int r;
> @@ -3916,6 +4101,7 @@
>               goto out;
>       /* set_cr3() should ensure TLB has been flushed */
>       vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
> +     kvm_mmu_pin_pages(vcpu);
>  out:
>       return r;
>  }
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h  2014-06-18 17:27:47.582549238 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.h       2014-06-18 17:28:24.339435654 -0300
> @@ -178,4 +178,9 @@
>  }
>  
>  void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> +                              gfn_t base_gfn, unsigned long npages);
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> +                              gfn_t base_gfn, unsigned long npages);
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
>  #endif
> Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c  2014-06-18 17:28:17.552456605 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/x86.c       2014-06-18 17:28:24.340435651 -0300
> @@ -7049,6 +7049,8 @@
>  
>       kvm_async_pf_hash_reset(vcpu);
>       kvm_pmu_init(vcpu);
> +     INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
> +     mutex_init(&vcpu->arch.pinned_mmu_mutex);
>  
>       return 0;
>  fail_free_wbinvd_dirty_mask:
> @@ -7069,6 +7071,7 @@
>  {
>       int idx;
>  
> +     kvm_mmu_free_pinned_ranges(vcpu);
>       kvm_pmu_destroy(vcpu);
>       kfree(vcpu->arch.mce_banks);
>       kvm_free_lapic(vcpu);
> @@ -7113,6 +7116,7 @@
>       int r;
>       r = vcpu_load(vcpu);
>       BUG_ON(r);
> +     kvm_mmu_free_pinned_ranges(vcpu);
>       kvm_mmu_unload(vcpu);
>       vcpu_put(vcpu);
>  }
> Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h  2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h       2014-06-18 17:28:24.340435651 -0300
> @@ -747,7 +747,7 @@
>       smp_rmb();
>  
>       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
> -                      &map_writable))
> +                      &map_writable, false))
>               return 0;
>  
>       if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
> Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h     2014-06-18 17:27:47.583549234 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h  2014-06-18 17:28:24.340435651 -0300
> @@ -322,6 +322,29 @@
>                 __entry->kvm_gen == __entry->spte_gen
>       )
>  );
> +
> +TRACE_EVENT(
> +     kvm_mmu_register_pinned_range,
> +     TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
> +     TP_ARGS(vcpu_id, gfn, npages),
> +
> +     TP_STRUCT__entry(
> +             __field(        unsigned int,   vcpu_id )
> +             __field(        gfn_t,          gfn     )
> +             __field(        unsigned long,  npages  )
> +     ),
> +
> +     TP_fast_assign(
> +             __entry->vcpu_id                = vcpu_id;
> +             __entry->gfn                    = gfn;
> +             __entry->npages                 = npages;
> +     ),
> +
> +     TP_printk("vcpu_id %u gfn %llx npages %lx",
> +               __entry->vcpu_id,
> +               __entry->gfn,
> +               __entry->npages)
> +);
>  #endif /* _TRACE_KVMMMU_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 
> 

--
                        Gleb.