Avi Kivity wrote:
There are (at least) three options available:
- detect and special-case this scenario
- change the flood detector to be per page table instead of per vcpu
- change the flood detector to look at a list of recently used page
tables instead of the last page table
I'm having a hard time trying to pick between the second and third
options.
The answer turns out to be "yes" — both at once — so here's a patch that adds a pte
access history table for each shadowed guest page-table (per page table, and a list
rather than just the last entry). Let me know if
it helps. Benchmarking a variety of workloads on all guests supported
by kvm is left as an exercise for the reader, but I suspect the patch
will either improve things all around, or can be modified to do so.
--
error compiling committee.c: too many arguments to function
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 154727d..1a3d01a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1130,7 +1130,8 @@ unshadowed:
if (speculative) {
vcpu->arch.last_pte_updated = shadow_pte;
vcpu->arch.last_pte_gfn = gfn;
- }
+ } else
+ page_header(__pa(shadow_pte))->pte_history_len = 0;
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1616,13 +1617,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
kvm_mmu_flush_tlb(vcpu);
}
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- return !!(spte && (*spte & shadow_accessed_mask));
-}
-
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
@@ -1679,13 +1673,49 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u64 *spte = vcpu->arch.last_pte_updated;
+ struct kvm_mmu_page *page;
+
+ if (spte && vcpu->arch.last_pte_gfn == gfn) {
+ page = page_header(__pa(spte));
+ page->pte_history_len = 0;
+ pgprintk("clearing page history, gfn %x ent %lx\n",
+ page->gfn, spte - page->spt);
+ }
+}
+
+static bool kvm_mmu_page_flooded(struct kvm_mmu_page *page)
+{
+ int i, j, ent, len;
- if (spte
- && vcpu->arch.last_pte_gfn == gfn
- && shadow_accessed_mask
- && !(*spte & shadow_accessed_mask)
- && is_shadow_present_pte(*spte))
- set_bit(PT_ACCESSED_SHIFT, spte);
+ len = page->pte_history_len;
+ for (i = len; i != 0; --i) {
+ ent = page->pte_history[i - 1];
+ if (test_bit(PT_ACCESSED_SHIFT, &page->spt[ent])) {
+ for (j = i; j < len; ++j)
+ page->pte_history[j-i] = page->pte_history[j];
+ page->pte_history_len = len - i;
+ return false;
+ }
+ }
+ if (page->pte_history_len < KVM_MAX_PTE_HISTORY)
+ return false;
+ return true;
+}
+
+static void kvm_mmu_log_pte_history(struct kvm_mmu_page *page, u64 *spte)
+{
+ int i;
+ unsigned ent = spte - page->spt;
+
+ if (page->pte_history_len > 0
+ && page->pte_history[page->pte_history_len - 1] == ent)
+ return;
+ if (page->pte_history_len == KVM_MAX_PTE_HISTORY) {
+ for (i = 1; i < KVM_MAX_PTE_HISTORY; ++i)
+ page->pte_history[i-1] = page->pte_history[i];
+ --page->pte_history_len;
+ }
+ page->pte_history[page->pte_history_len++] = ent;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1704,7 +1734,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
unsigned misaligned;
unsigned quadrant;
int level;
- int flooded = 0;
int npte;
int r;
@@ -1715,16 +1744,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
- }
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
@@ -1733,7 +1752,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
- if (misaligned || flooded) {
+ if (misaligned || kvm_mmu_page_flooded(sp)) {
/*
* Misaligned accesses are too much trouble to fix
* up; also, they usually indicate a page is not used
@@ -1785,6 +1804,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_pte_write_zap_pte(vcpu, sp, spte);
if (new)
mmu_pte_write_new_pte(vcpu, sp, spte, new);
+ kvm_mmu_log_pte_history(sp, spte);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index a71f3aa..cbe550e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -78,6 +78,7 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_PTE_HISTORY 4
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
@@ -189,6 +190,9 @@ struct kvm_mmu_page {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
};
+
+ u16 pte_history_len;
+ u16 pte_history[KVM_MAX_PTE_HISTORY];
};
/*