Andrea Arcangeli wrote:
Hello,
this is an update of the patch to test kvm on mmu notifier v18. I'll
post the mmu notifier v18 tomorrow after some more review but I can
post the kvm side in the meantime (which works with the previous v17
as well if anyone wants to test).
This has a relevant fix for kvm_unmap_rmapp: rmap_remove while
deleting the current spte from the desc array, can overwrite the
deleted current spte with the last spte in the desc array in turn
reodering it. So if we restart rmap_next from the sptes after the
deleted current spte, we may miss the later sptes that have been moved
in the slot of the current spte. We've to teardown the whole desc
array so the fix was to simply pick from the first entry and wait the
others to come down.
I also wonder if the update_pte done outside the mmu_lock is safe
without mmu notifiers, or if the below changes are required regardless
(I think they are). I cleaned up the fix but I probably need to
extract it from this patch.
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = !!_young;
young = 1?
+ set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
This can theoretically lose the dirty bit. We don't track it now, so
that's okay.
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int young = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
One day we want to sort the slots according to size. We'll need better
locking then (rcu, likely).
@@ -1235,9 +1340,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
spin_lock(&vcpu->kvm->mmu_lock);
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto out;
vcpu-> stuff is protected by the vcpu mutex.
@@ -1626,18 +1744,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu
*vcpu)
return !!(spte && (*spte & shadow_accessed_mask));
}
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ gfn_t *_gfn, pfn_t *_pfn,
+ int *_mmu_seq, int *_largepage)
{
gfn_t gfn;
int r;
u64 gpte = 0;
pfn_t pfn;
-
- vcpu->arch.update_pte.largepage = 0;
+ int mmu_seq;
+ int largepage;
if (bytes != 4 && bytes != 8)
- return;
+ return 0;
/*
* Assume that the pte write on a page table of the same type
@@ -1650,7 +1770,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu
*vcpu, gpa_t gpa,
if ((bytes == 4) && (gpa % 4 == 0)) {
r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
if (r)
- return;
+ return 0;
memcpy((void *)&gpte + (gpa % 8), new, 4);
} else if ((bytes == 8) && (gpa % 8 == 0)) {
memcpy((void *)&gpte, new, 8);
@@ -1660,23 +1780,30 @@ static void mmu_guess_page_from_pte_write(struct
kvm_vcpu *vcpu, gpa_t gpa,
memcpy((void *)&gpte, new, 4);
}
if (!is_present_pte(gpte))
- return;
+ return 0;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ largepage = 0;
down_read(¤t->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
- vcpu->arch.update_pte.largepage = 1;
+ largepage = 1;
}
+ mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(¤t->mm->mmap_sem);
- if (is_error_pfn(pfn)) {
+ if (unlikely(is_error_pfn(pfn))) {
kvm_release_pfn_clean(pfn);
- return;
+ return 0;
}
- vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.pfn = pfn;
+
+ *_gfn = gfn;
+ *_pfn = pfn;
+ *_mmu_seq = mmu_seq;
+ *_largepage = largepage;
+ return 1;
}
Alternatively, we can replace this with follow_page() in update_pte().
Probably best to defer it, though.
+ /*
+ * When ->invalidate_page runs, the linux pte has been zapped
+ * already but the page is still allocated until
+ * ->invalidate_page returns. So if we increase the sequence
+ * here the kvm page fault will notice if the spte can't be
+ * established because the page is going to be freed. If
+ * instead the kvm page fault establishes the spte before
+ * ->invalidate_page runs, kvm_unmap_hva will release it
+ * before returning.
This too...
Please move the registration to virt/kvm/kvm_main.c, and provide stubs
for non-x86. This is definitely something that we want to do cross-arch
(except s390 which have it in hardware).
--
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html