[PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.

The problem being addressed by the patch above was that some ARM code
based the memory mapping attributes of a pfn on the return value of
kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
be mapped as device memory.

However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
and the existing non-ARM users were already using it in a way which
suggests that its name should probably have been 'kvm_is_reserved_pfn'
from the beginning, e.g., whether or not to call get_page/put_page on
it etc. This means that returning false for the zero page is a mistake
and the patch above should be reverted.

Signed-off-by: Ard Biesheuvel
---
 arch/ia64/kvm/kvm-ia64.c |  2 +-
 arch/x86/kvm/mmu.c       |  6 +++---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 16 ++++++++--------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ec6b9acb6bea..dbe46f43884d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,

 	for (i = 0; i < npages; i++) {
 		pfn = gfn_to_pfn(kvm, base_gfn + i);
-		if (!kvm_is_mmio_pfn(pfn)) {
+		if (!kvm_is_reserved_pfn(pfn)) {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
 					pfn << PAGE_SHIFT,
 				_PAGE_AR_RWX | _PAGE_MA_WB);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac1c4de3a484..978f402006ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 * kvm mmu, before reclaiming the page, we should
 	 * unmap it from mmu first.
 	 */
-	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
@@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_reserved_pfn(pfn));

 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea53b04993f2..a6059bdf7b03 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

-bool kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_reserved_pfn(pfn_t pfn);

 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25ffac9e947d..3cee7b167052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);

 static bool largepages_enabled = true;

-bool kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_reserved_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return PageReserved(pfn_to_page(pfn));

 	return true;
 }
@@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	else if ((vma->vm_flags & VM_PFNMAP)) {
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
+		BUG_ON(!kvm_is_reserved_pfn(pfn));
 	} else {
 		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
@@ -1427,7 +1427,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;

-	if (kvm_is_mmio_pfn(pfn)) {
+	if (kvm_is_reserved_pfn(pfn)) {
 		WARN_ON(1);
 		return KVM_ERR_PTR_BAD_PAGE;
 	}
@@ -1456,7 +1456,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);

 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
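The refcounting point in the commit message is easiest to see in a caller.
A minimal sketch, assuming the virt/kvm/kvm_main.c of this era (an
illustration, not part of the patch): a guest read fault on a never-written
anonymous page can resolve to the shared zero page, which is pfn_valid()
and PageReserved(). With 85c8555ff0 applied, kvm_is_mmio_pfn() returned
false for it, so code like the following would take a reference on a
reserved, globally shared kernel page:

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))	/* false for the zero page */
		get_page(pfn_to_page(pfn));
}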
[PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
Instead of using kvm_is_mmio_pfn() to decide whether a host region
should be stage 2 mapped with device attributes, add a new static
function kvm_is_device_pfn() that disregards RAM pages with the
reserved bit set, as those should usually not be mapped as device
memory.

Signed-off-by: Ard Biesheuvel
---
 arch/arm/kvm/mmu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 57a403a5c22b..b007438242e2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_dabt_iswrite(vcpu);
 }

+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;

-	if (kvm_is_mmio_pfn(pfn))
+	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;

 	spin_lock(&kvm->mmu_lock);
--
1.8.3.2
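Side by side, the two predicates that exist once both patches of this
series are applied (bodies copied from the diffs above) make the split
in semantics explicit:

/* virt/kvm/kvm_main.c (patch 2/2): "this pfn is not an ordinary,
 * refcounted struct page" -- true for MMIO and for reserved RAM,
 * including the zero page and memreserve'd regions. */
bool kvm_is_reserved_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

/* arch/arm/kvm/mmu.c (patch 1/2): "map this pfn with device
 * attributes at stage 2" -- true only when there is no struct page
 * backing the pfn at all. */
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}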
KVM call for agenda for 2014-11-11
Hi

Please, send any topic that you are interested in covering.

Thanks, Juan.

Call details:

15:00 CEST
13:00 UTC
09:00 EDT

Every two weeks

By popular demand, a google calendar public entry with it

https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ

(Let me know if you have any problems with the calendar entry)

If you need phone number details, contact me privately

Thanks, Juan.
Re: Seeking a KVM benchmark
On 09/11/2014 17:36, Andy Lutomirski wrote:
>> The purpose of vmexit test is to show us various overheads, so why not
>> measure EFER switch overhead by having two tests one with equal EFER
>> another with different EFER, instead of hiding it.
>
> I'll try this.  We might need three tests, though: NX different, NX
> same but SCE different, and all flags the same.

The test actually explicitly enables NX in order to put itself in the
"common case":

    commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
    Author: Marcelo Tosatti
    Date:   Tue Jun 8 15:33:29 2010 -0300

        test: vmexit: enable NX

        Enable NX to disable MSR autoload/save. This is the common case anyway.

        Signed-off-by: Marcelo Tosatti
        Signed-off-by: Avi Kivity

(this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
just a bug.  The results on my Xeon Sandy Bridge are very interesting:

    NX different            ~11.5k  (load/save EFER path)
    NX same, SCE different  ~19.5k  (user return notifier path)
    all flags the same      ~10.2k

The inl_from_kernel results show absolutely no change, usually at most
5 cycles of difference.  This could be because I've added the SCE=1
variant directly to vmexit.c, so I'm running the tests one next to the
other.

I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
SYSCALL_MASK) the same between guest and host, so that the user return
notifier has nothing to do.  That saves about 400-500 cycles on
inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
test fares, but it really looks like your patch will be in 3.19.

Paolo
[PATCH kvm-unit-tests] vmexit: add tests with EFER.SCE=1
These activate a faster path that does not set EFER in the user return
notifier.  Time the difference between the two.

Signed-off-by: Paolo Bonzini
---
 x86/vmexit.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/x86/vmexit.c b/x86/vmexit.c
index 3bd0c81..a0380ab 100644
--- a/x86/vmexit.c
+++ b/x86/vmexit.c
@@ -3,6 +3,7 @@
 #include "processor.h"
 #include "atomic.h"
 #include "x86/vm.h"
+#include "x86/msr.h"
 #include "x86/desc.h"
 #include "x86/pci.h"

@@ -10,6 +11,7 @@ struct test {
 	void (*func)(void);
 	const char *name;
 	int (*valid)(void);
+	void (*after)(void);
 	int parallel;
 	bool (*next)(struct test *);
 };
@@ -119,6 +121,17 @@ static void inl_nop_kernel(void)
 	inb(0x4d0);
 }

+static int set_sce(void)
+{
+	wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SCE);
+	return true;
+}
+
+static void clear_sce(void)
+{
+	wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SCE);
+}
+
 static void outl_elcr_kernel(void)
 {
 	outb(0x4d0, 0);
@@ -315,7 +328,11 @@ static struct test tests[] = {
 #endif
 	{ inl_pmtimer, "inl_from_pmtimer", .parallel = 1, },
 	{ inl_nop_qemu, "inl_from_qemu", .parallel = 1 },
+	{ inl_nop_qemu, "inl_from_qemu (EFER guest = EFER host)", .parallel = 1,
+	  .valid = set_sce, .after = clear_sce },
 	{ inl_nop_kernel, "inl_from_kernel", .parallel = 1 },
+	{ inl_nop_kernel, "inl_from_kernel (EFER guest = EFER host)", .parallel = 1,
+	  .valid = set_sce, .after = clear_sce },
 	{ outl_elcr_kernel, "outl_to_kernel", .parallel = 1 },
 	{ mov_dr, "mov_dr", .parallel = 1 },
 	{ ipi, "ipi", is_smp, .parallel = 0, },
@@ -381,6 +398,9 @@ static bool do_test(struct test *test)
 		t2 = rdtsc();
 	} while ((t2 - t1) < GOAL);
 	printf("%s %d\n", test->name, (int)((t2 - t1) / iterations));
+	if (test->after) {
+		test->after();
+	}
 	return test->next;
 }
--
1.8.3.1
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 11:03:35AM +0100, Paolo Bonzini wrote:
>
> On 09/11/2014 17:36, Andy Lutomirski wrote:
> >> The purpose of vmexit test is to show us various overheads, so why not
> >> measure EFER switch overhead by having two tests one with equal EFER
> >> another with different EFER, instead of hiding it.
> >
> > I'll try this.  We might need three tests, though: NX different, NX
> > same but SCE different, and all flags the same.
>
> The test actually explicitly enables NX in order to put itself in the
> "common case":
>
>     commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
>     Author: Marcelo Tosatti
>     Date:   Tue Jun 8 15:33:29 2010 -0300
>
>         test: vmexit: enable NX
>
>         Enable NX to disable MSR autoload/save. This is the common case anyway.
>
>         Signed-off-by: Marcelo Tosatti
>         Signed-off-by: Avi Kivity
>
> (this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
> just a bug.  The results on my Xeon Sandy Bridge are very interesting:
>
>     NX different            ~11.5k  (load/save EFER path)
>     NX same, SCE different  ~19.5k  (user return notifier path)
>     all flags the same      ~10.2k
>
> The inl_from_kernel results show absolutely no change, usually at most
> 5 cycles of difference.  This could be because I've added the SCE=1
> variant directly to vmexit.c, so I'm running the tests one next to the
> other.
>
> I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
> SYSCALL_MASK) the same between guest and host, so that the user return
> notifier has nothing to do.  That saves about 400-500 cycles on
> inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
> test fares, but it really looks like your patch will be in 3.19.
>

Please test on wide variety of HW before final decision. Also it would
be nice to ask Intel what is expected overhead. It is awesome if they
manage to add EFER switching with non-measurable overhead, but also
hard to believe :)

Also Andy had an idea to disable switching in case host and guest EFERs
are the same, but IIRC his patch does not include it yet.

--
			Gleb.
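Gleb's last point - skipping the switch entirely when the two EFER
values match - can be modelled with a small self-contained sketch
(illustrative only: the function and its dispatch are hypothetical,
not Andy's actual patch; the per-exit cycle counts are Paolo's
measurements above):

#include <stdbool.h>
#include <stdint.h>

#define EFER_NX  (1ULL << 11)   /* no-execute enable bit of IA32_EFER */

enum efer_strategy { EFER_NO_SWITCH, EFER_VMCS_AUTOLOAD, EFER_LAZY_URN };

/* Pick the cheapest EFER handling strategy for a guest/host pair. */
static enum efer_strategy pick_efer_strategy(uint64_t guest_efer,
                                             uint64_t host_efer)
{
	if (guest_efer == host_efer)
		return EFER_NO_SWITCH;      /* "all flags the same": ~10.2k */

	if ((guest_efer ^ host_efer) & EFER_NX)
		return EFER_VMCS_AUTOLOAD;  /* "NX different": ~11.5k */

	/* only SCE etc. differ: restore lazily when returning to
	 * userspace via the user return notifier */
	return EFER_LAZY_URN;               /* "SCE different": ~19.5k */
}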
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
Hi Ard,

On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>
> The problem being addressed by the patch above was that some ARM code
> based the memory mapping attributes of a pfn on the return value of
> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
> be mapped as device memory.
>
> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
> and the existing non-ARM users were already using it in a way which
> suggests that its name should probably have been 'kvm_is_reserved_pfn'
> from the beginning, e.g., whether or not to call get_page/put_page on
> it etc. This means that returning false for the zero page is a mistake
> and the patch above should be reverted.
>
> Signed-off-by: Ard Biesheuvel
> ---
>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>  arch/x86/kvm/mmu.c       |  6 +++---
>  include/linux/kvm_host.h |  2 +-
>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>  4 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
> index ec6b9acb6bea..dbe46f43884d 100644
> --- a/arch/ia64/kvm/kvm-ia64.c
> +++ b/arch/ia64/kvm/kvm-ia64.c
> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>
>  	for (i = 0; i < npages; i++) {
>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
> -		if (!kvm_is_mmio_pfn(pfn)) {
> +		if (!kvm_is_reserved_pfn(pfn)) {
>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>  					pfn << PAGE_SHIFT,
>  				_PAGE_AR_RWX | _PAGE_MA_WB);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index ac1c4de3a484..978f402006ee 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	 * kvm mmu, before reclaiming the page, we should
>  	 * unmap it from mmu first.
>  	 */
> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>
>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>  		kvm_set_pfn_accessed(pfn);
> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  		spte |= PT_PAGE_SIZE_MASK;
>  	if (tdp_enabled)
>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
> -			kvm_is_mmio_pfn(pfn));
> +			kvm_is_reserved_pfn(pfn));
>
>  	if (host_writable)
>  		spte |= SPTE_HOST_WRITEABLE;
> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>  	 * here.
>  	 */
> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>  	    level == PT_PAGE_TABLE_LEVEL &&
>  	    PageTransCompound(pfn_to_page(pfn)) &&
>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea53b04993f2..a6059bdf7b03 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>
> -bool kvm_is_mmio_pfn(pfn_t pfn);
> +bool kvm_is_reserved_pfn(pfn_t pfn);
>
>  struct kvm_irq_ack_notifier {
>  	struct hlist_node link;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 25ffac9e947d..3cee7b167052 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>
>  static bool largepages_enabled = true;
>
> -bool kvm_is_mmio_pfn(pfn_t pfn)
> +bool kvm_is_reserved_pfn(pfn_t pfn)
>  {
>  	if (pfn_valid(pfn))
> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
> +		return PageReserved(pfn_to_page(pfn));

so we return true for !pfn_valid(pfn), is this still semantically
correct with the rename?

>
>  	return true;
>  }
> @@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>  	else if ((vma->vm_flags & VM_PFNMAP)) {
>  		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
>  			vma->vm_pgoff;
> -		BUG_ON(!kvm_is_mmio_pfn(pfn));
> +		BUG_ON(!kvm_is_reserved_pfn(pfn));
>  	} else {
>  		if (async && vma_is_valid(vma, write_fault))
>  			*async = true;
> @@ -1427,7 +1427,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
>  	if (is_error_noslot_pfn(pfn))
>  		return KVM_ERR_PTR_BAD_PAGE;
>
> -	if (kvm_is_mmio_pfn(pfn)) {
> +	if (kvm_is_reserved_pfn(pfn)) {
>  		WARN_ON(1);
>  		return KVM_ERR_PTR_BAD_PAGE;
>  	}
Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
> Instead of using kvm_is_mmio_pfn() to decide whether a host region
> should be stage 2 mapped with device attributes, add a new static
> function kvm_is_device_pfn() that disregards RAM pages with the
> reserved bit set, as those should usually not be mapped as device
> memory.
>
> Signed-off-by: Ard Biesheuvel
> ---
>  arch/arm/kvm/mmu.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 57a403a5c22b..b007438242e2 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>  	return kvm_vcpu_dabt_iswrite(vcpu);
>  }
>
> +static bool kvm_is_device_pfn(unsigned long pfn)
> +{
> +	return !pfn_valid(pfn);
> +}

So this works for Magnus' use case, because a device tree memreserve
results in reserved, but valid, existing pages being backed by a struct
page?

> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>
> -	if (kvm_is_mmio_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn))
>  		mem_type = PAGE_S2_DEVICE;
>
>  	spin_lock(&kvm->mmu_lock);
> --
> 1.8.3.2
>

If my understanding above is correct, then:

Acked-by: Christoffer Dall
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
On 10 November 2014 11:53, Christoffer Dall wrote:
> Hi Ard,
>
> On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
>> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
>> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>>
>> The problem being addressed by the patch above was that some ARM code
>> based the memory mapping attributes of a pfn on the return value of
>> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
>> be mapped as device memory.
>>
>> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
>> and the existing non-ARM users were already using it in a way which
>> suggests that its name should probably have been 'kvm_is_reserved_pfn'
>> from the beginning, e.g., whether or not to call get_page/put_page on
>> it etc. This means that returning false for the zero page is a mistake
>> and the patch above should be reverted.
>>
>> Signed-off-by: Ard Biesheuvel
>> ---
>>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>>  arch/x86/kvm/mmu.c       |  6 +++---
>>  include/linux/kvm_host.h |  2 +-
>>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>>  4 files changed, 13 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>> index ec6b9acb6bea..dbe46f43884d 100644
>> --- a/arch/ia64/kvm/kvm-ia64.c
>> +++ b/arch/ia64/kvm/kvm-ia64.c
>> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>
>>  	for (i = 0; i < npages; i++) {
>>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
>> -		if (!kvm_is_mmio_pfn(pfn)) {
>> +		if (!kvm_is_reserved_pfn(pfn)) {
>>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>>  					pfn << PAGE_SHIFT,
>>  				_PAGE_AR_RWX | _PAGE_MA_WB);
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index ac1c4de3a484..978f402006ee 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>>  	 * kvm mmu, before reclaiming the page, we should
>>  	 * unmap it from mmu first.
>>  	 */
>> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>
>>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>>  		kvm_set_pfn_accessed(pfn);
>> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>  		spte |= PT_PAGE_SIZE_MASK;
>>  	if (tdp_enabled)
>>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
>> -			kvm_is_mmio_pfn(pfn));
>> +			kvm_is_reserved_pfn(pfn));
>>
>>  	if (host_writable)
>>  		spte |= SPTE_HOST_WRITEABLE;
>> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>>  	 * here.
>>  	 */
>> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
>> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>>  	    level == PT_PAGE_TABLE_LEVEL &&
>>  	    PageTransCompound(pfn_to_page(pfn)) &&
>>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index ea53b04993f2..a6059bdf7b03 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>>
>> -bool kvm_is_mmio_pfn(pfn_t pfn);
>> +bool kvm_is_reserved_pfn(pfn_t pfn);
>>
>>  struct kvm_irq_ack_notifier {
>>  	struct hlist_node link;
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 25ffac9e947d..3cee7b167052 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>>
>>  static bool largepages_enabled = true;
>>
>> -bool kvm_is_mmio_pfn(pfn_t pfn)
>> +bool kvm_is_reserved_pfn(pfn_t pfn)
>>  {
>>  	if (pfn_valid(pfn))
>> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
>> +		return PageReserved(pfn_to_page(pfn));
>
> so we return true for !pfn_valid(pfn), is this still semantically
> correct with the rename?
>

I guess it is still debatable, but it is arguably more correct than
'kvm_is_mmio_pfn'.

I was reluctant to choose something like 'kvm_is_special_pfn' because
'special' is not very discriminating here, and at least 'reserved' has
a very clear meaning wrt pages, and treating pfns that are not backed
by a struct page as implicitly reserved is not counter-intuitive imo.

>>
>>  	return true;
>>  }
>> @@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>>  	else if ((vma->vm_flags & VM_PFNMAP)) {
>>  		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
>>  			vma->vm_pgoff;
>> -		BUG_ON(!kvm_is_mmio_pfn(pfn));
>> +		BUG_ON(!kvm_is_reserved_pfn(pfn));
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
On Mon, Nov 10, 2014 at 12:05 PM, Ard Biesheuvel wrote:
> On 10 November 2014 11:53, Christoffer Dall wrote:
>> Hi Ard,
>>
>> On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
>>> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
>>> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>>>
>>> The problem being addressed by the patch above was that some ARM code
>>> based the memory mapping attributes of a pfn on the return value of
>>> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
>>> be mapped as device memory.
>>>
>>> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
>>> and the existing non-ARM users were already using it in a way which
>>> suggests that its name should probably have been 'kvm_is_reserved_pfn'
>>> from the beginning, e.g., whether or not to call get_page/put_page on
>>> it etc. This means that returning false for the zero page is a mistake
>>> and the patch above should be reverted.
>>>
>>> Signed-off-by: Ard Biesheuvel
>>> ---
>>>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>>>  arch/x86/kvm/mmu.c       |  6 +++---
>>>  include/linux/kvm_host.h |  2 +-
>>>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>>>  4 files changed, 13 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>>> index ec6b9acb6bea..dbe46f43884d 100644
>>> --- a/arch/ia64/kvm/kvm-ia64.c
>>> +++ b/arch/ia64/kvm/kvm-ia64.c
>>> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>
>>>  	for (i = 0; i < npages; i++) {
>>>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
>>> -		if (!kvm_is_mmio_pfn(pfn)) {
>>> +		if (!kvm_is_reserved_pfn(pfn)) {
>>>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>>>  					pfn << PAGE_SHIFT,
>>>  				_PAGE_AR_RWX | _PAGE_MA_WB);
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index ac1c4de3a484..978f402006ee 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>>>  	 * kvm mmu, before reclaiming the page, we should
>>>  	 * unmap it from mmu first.
>>>  	 */
>>> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>>
>>>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>>>  		kvm_set_pfn_accessed(pfn);
>>> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>>  		spte |= PT_PAGE_SIZE_MASK;
>>>  	if (tdp_enabled)
>>>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
>>> -			kvm_is_mmio_pfn(pfn));
>>> +			kvm_is_reserved_pfn(pfn));
>>>
>>>  	if (host_writable)
>>>  		spte |= SPTE_HOST_WRITEABLE;
>>> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>>>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>>>  	 * here.
>>>  	 */
>>> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
>>> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>>>  	    level == PT_PAGE_TABLE_LEVEL &&
>>>  	    PageTransCompound(pfn_to_page(pfn)) &&
>>>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>> index ea53b04993f2..a6059bdf7b03 100644
>>> --- a/include/linux/kvm_host.h
>>> +++ b/include/linux/kvm_host.h
>>> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>>>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>>>
>>> -bool kvm_is_mmio_pfn(pfn_t pfn);
>>> +bool kvm_is_reserved_pfn(pfn_t pfn);
>>>
>>>  struct kvm_irq_ack_notifier {
>>>  	struct hlist_node link;
>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>> index 25ffac9e947d..3cee7b167052 100644
>>> --- a/virt/kvm/kvm_main.c
>>> +++ b/virt/kvm/kvm_main.c
>>> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>>>
>>>  static bool largepages_enabled = true;
>>>
>>> -bool kvm_is_mmio_pfn(pfn_t pfn)
>>> +bool kvm_is_reserved_pfn(pfn_t pfn)
>>>  {
>>>  	if (pfn_valid(pfn))
>>> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
>>> +		return PageReserved(pfn_to_page(pfn));
>>
>> so we return true for !pfn_valid(pfn), is this still semantically
>> correct with the rename?
>>
>
> I guess it is still debatable, but it is arguably more correct than
> 'kvm_is_mmio_pfn'.
>

agreed

> I was reluctant to choose something like 'kvm_is_special_pfn' because
> 'special' is not very discriminating here, and at least 'reserved' has
> a very clear meaning wrt pages, and treating pfns that are not backed
> by a struct page as implicitly reserved is not counter-intuitive imo.
>

just wanted to make sure.
Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
On 10 November 2014 11:57, Christoffer Dall wrote:
> On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
>> Instead of using kvm_is_mmio_pfn() to decide whether a host region
>> should be stage 2 mapped with device attributes, add a new static
>> function kvm_is_device_pfn() that disregards RAM pages with the
>> reserved bit set, as those should usually not be mapped as device
>> memory.
>>
>> Signed-off-by: Ard Biesheuvel
>> ---
>>  arch/arm/kvm/mmu.c | 7 ++++++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 57a403a5c22b..b007438242e2 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>>  	return kvm_vcpu_dabt_iswrite(vcpu);
>>  }
>>
>> +static bool kvm_is_device_pfn(unsigned long pfn)
>> +{
>> +	return !pfn_valid(pfn);
>> +}
>
> So this works for Magnus' use case, because a device tree memreserve
> results in reserved, but valid, existing pages being backed by a struct
> page?
>

That is the idea, yes, but it would be good if he could confirm that it
works as expected.

Also, there may be some corner cases where pfn_valid returns false for
regions that are in fact mapped as MT_NORMAL by the host kernel (UEFI
configuration tables, for instance), so this test may require further
refinement.  But it at least eliminates the false positives for plain
memreserve regions.

>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>  			  unsigned long fault_status)
>> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (is_error_pfn(pfn))
>>  		return -EFAULT;
>>
>> -	if (kvm_is_mmio_pfn(pfn))
>> +	if (kvm_is_device_pfn(pfn))
>>  		mem_type = PAGE_S2_DEVICE;
>>
>>  	spin_lock(&kvm->mmu_lock);
>> --
>> 1.8.3.2
>>
>
> If my understanding above is correct, then:
>
> Acked-by: Christoffer Dall
Re: Seeking a KVM benchmark
On 10/11/2014 11:45, Gleb Natapov wrote:
>> I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
>> SYSCALL_MASK) the same between guest and host, so that the user return
>> notifier has nothing to do.  That saves about 400-500 cycles on
>> inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
>> test fares, but it really looks like your patch will be in 3.19.
>
> Please test on wide variety of HW before final decision.

Yes, definitely.

> Also it would
> be nice to ask Intel what is expected overhead. It is awesome if they
> manage to add EFER switching with non-measurable overhead, but also
> hard to believe :)

So let's see what happens.  Sneak preview: the result is definitely
worth asking Intel about.

I ran these benchmarks with a stock 3.16.6 KVM.  Instead I patched
kvm-unit-tests to set EFER.SCE in enable_nx.  This makes it much
simpler for others to reproduce the results.  I only ran the
inl_from_qemu test.

Perf stat reports that the processor goes from 0.46 to 0.66
instructions per cycle, which is consistent with the improvement from
19k to 12k cycles per iteration.

Unpatched KVM-unit-tests:

     3,385,586,563 cycles                    #  3.189 GHz                     [83.25%]
     2,475,979,685 stalled-cycles-frontend   # 73.13% frontend cycles idle    [83.37%]
     2,083,556,270 stalled-cycles-backend    # 61.54% backend  cycles idle    [66.71%]
     1,573,854,041 instructions              #  0.46  insns per cycle
                                             #  1.57  stalled cycles per insn [83.20%]

       1.108486526 seconds time elapsed

Patched KVM-unit-tests:

     3,252,297,378 cycles                    #  3.147 GHz                     [83.32%]
     2,010,266,184 stalled-cycles-frontend   # 61.81% frontend cycles idle    [83.36%]
     1,560,371,769 stalled-cycles-backend    # 47.98% backend  cycles idle    [66.51%]
     2,133,698,018 instructions              #  0.66  insns per cycle
                                             #  0.94  stalled cycles per insn [83.45%]

       1.072395697 seconds time elapsed

Playing with other events shows that the unpatched benchmark has an
awful load of TLB misses:

Unpatched:

            30,311 iTLB-loads
       464,641,844 dTLB-loads
        10,813,839 dTLB-load-misses          #     2.33% of all dTLB cache hits
        20,436,027 iTLB-load-misses          # 67421.16% of all iTLB cache hits

Patched:

         1,440,033 iTLB-loads
       640,970,836 dTLB-loads
         2,345,112 dTLB-load-misses          #     0.37% of all dTLB cache hits
           270,884 iTLB-load-misses          #    18.81% of all iTLB cache hits

This is 100% reproducible.  The meaning of the numbers is clearer if
you look up the raw event numbers in the Intel manuals:

- iTLB-loads is 85h/10h aka "perf -e r1085": "Number of cache load STLB
  [second-level TLB] hits. No page walk."

- iTLB-load-misses is 85h/01h aka r185: "Misses in all ITLB levels that
  cause page walks."

So for example event 85h/04h aka r485 ("Cycle PMH is busy with a walk.")
and friends show that the unpatched KVM wastes about 0.1 seconds more
than the patched KVM on page walks:

Unpatched:

        24,430,676 r449   (cycles on dTLB store miss page walks)
       196,017,693 r408   (cycles on dTLB load miss page walks)
       213,266,243 r485   (cycles on iTLB miss page walks)
       -----------
       433,714,612 total

Patched:

        22,583,440 r449   (cycles on dTLB store miss page walks)
        40,452,018 r408   (cycles on dTLB load miss page walks)
         2,115,981 r485   (cycles on iTLB miss page walks)
       -----------
        65,151,439 total

These 0.1 seconds probably are all on instructions that would have been
fast, since the slow instructions responsible for the low IPC are the
microcoded instructions, including VMX and other privileged stuff.

Similarly, BDh/20h counts STLB flushes, which are 260k in unpatched KVM
and 3k in patched KVM.  Let's see where they come from:

Unpatched:

+  98.97%  qemu-kvm  [kernel.kallsyms]  [k] native_write_msr_safe
+   0.70%  qemu-kvm  [kernel.kallsyms]  [k] page_fault

It's expected that most TLB misses happen just before a page fault
(there are also events to count how many TLB misses do result in a page
fault, if you care about that), and thus are accounted to the first
instruction of the exception handler.  We do not know what causes
second-level TLB _flushes_, but it's quite expected that you'll have a
TLB miss after them and possibly a page fault.  And anyway 98.97% of
them coming from native_write_msr_safe is totally anomalous.  A patched
benchmark shows no second-level TLB flush occurs after a WRMSR:

Patched:

+  72.41%  qemu-kvm  [kernel.kallsyms]  [k] page_fault
+   9.07%  qemu-kvm  [kvm_intel]        [k] vmx_flush_tlb
+   6.60%  qemu-kvm  [kernel.kallsyms]  [k] set_pte_vaddr_pud
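The raw-event counts above can also be reproduced without perf(1) by
programming the PMU directly through the perf_event_open(2) syscall.
A minimal self-contained example (event 0x1085 is the 85h/10h
iTLB-loads/STLB-hit event referenced above; everything else here is
boilerplate of mine, not from the thread):

#include <linux/perf_event.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* glibc provides no wrapper for perf_event_open */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0x1085;   /* umask 10h, event 85h: ITLB STLB hits */
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this process */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* ... workload under test would run here ... */

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("r1085: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}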
Re: [PATCH] kvm: Fix memory slot page alignment logic
On Fri, 7 Nov 2014 22:18:45 +0100
Alexander Graf wrote:

> Memory slots have to be page aligned to get entered into KVM. There
> is existing logic that tries to ensure that we pad memory slots that
> are not page aligned to the biggest region that would still fit in the
> alignment requirements.
>
> Unfortunately, that logic is broken. It tries to calculate the start
> offset based on the region size.
>
> Fix up the logic to do the thing it was intended to do and document it
> properly in the comment above it.
>
> With this patch applied, I can successfully run an e500 guest with more
> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>
> Cc: qemu-sta...@nongnu.org
> Signed-off-by: Alexander Graf
> ---
>  kvm-all.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index 44a5e72..596e7ce 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>          unsigned delta;
>
>          /* kvm works in page size chunks, but the function may be called
> -           with sub-page size and unaligned start address. */
> -        delta = TARGET_PAGE_ALIGN(size) - size;
> +           with sub-page size and unaligned start address. Pad the start
> +           address to next and truncate size to previous page boundary. */

I'm a bit confused how it works at all.

Let's assume there are no mapped pages that include start_addr: if
start_addr were padded to the next page, KVM would map it from there,
but the rest of QEMU would still use the unaligned start_addr for a
MemoryRegion that isn't even mapped.

It would seem that instead of padding up to the next page, start_addr
should be moved to the start of the page that includes it, to make the
page with the original start_addr available to the guest.

> +        delta = (TARGET_PAGE_SIZE - (start_addr & ~TARGET_PAGE_MASK));
> +        delta &= ~TARGET_PAGE_MASK;
>          if (delta > size) {
>              return;
>          }
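The bug is easiest to see with concrete numbers.  A small standalone
program (mine, not from the thread, using the 0x1000 page size from
Alex's example later in the thread) shows what the old and new delta
computations produce for an unaligned slot:

#include <stdio.h>
#include <stdint.h>

#define TARGET_PAGE_SIZE     0x1000ULL
#define TARGET_PAGE_MASK     (~(TARGET_PAGE_SIZE - 1))
#define TARGET_PAGE_ALIGN(x) (((x) + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK)

int main(void)
{
	uint64_t start_addr = 0x10100;  /* unaligned slot start */
	uint64_t size = 0xff00;         /* slot runs to 0x20000 */

	/* old (broken): derives the start padding from the size alone */
	uint64_t old_delta = TARGET_PAGE_ALIGN(size) - size;

	/* new: pad start_addr up to the next page boundary */
	uint64_t new_delta = (TARGET_PAGE_SIZE - (start_addr & ~TARGET_PAGE_MASK))
	                     & ~TARGET_PAGE_MASK;

	printf("old: slot starts at 0x%llx (still unaligned)\n",
	       (unsigned long long)(start_addr + old_delta));   /* 0x10200 */
	printf("new: slot starts at 0x%llx (page aligned)\n",
	       (unsigned long long)(start_addr + new_delta));   /* 0x11000 */
	return 0;
}

The old code aligns based on size alone, so an unaligned start_addr
stays unaligned; the new code derives the padding from start_addr
itself, and reduces to zero when start_addr is already aligned.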
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 13:31, Igor Mammedov wrote:
> On Fri, 7 Nov 2014 22:18:45 +0100
> Alexander Graf wrote:
>
>> Memory slots have to be page aligned to get entered into KVM. There
>> is existing logic that tries to ensure that we pad memory slots that
>> are not page aligned to the biggest region that would still fit in the
>> alignment requirements.
>>
>> Unfortunately, that logic is broken. It tries to calculate the start
>> offset based on the region size.
>>
>> Fix up the logic to do the thing it was intended to do and document it
>> properly in the comment above it.
>>
>> With this patch applied, I can successfully run an e500 guest with more
>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>
>> Cc: qemu-sta...@nongnu.org
>> Signed-off-by: Alexander Graf
>> ---
>>  kvm-all.c | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 44a5e72..596e7ce 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>          unsigned delta;
>>
>>          /* kvm works in page size chunks, but the function may be called
>> -           with sub-page size and unaligned start address. */
>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>> +           with sub-page size and unaligned start address. Pad the start
>> +           address to next and truncate size to previous page boundary. */
> I'm a bit confused how it works at all.
> Let's assume there are no mapped pages that include start_addr: if
> start_addr were padded to the next page, KVM would map it from there,
> but the rest of QEMU would still use the unaligned start_addr for a
> MemoryRegion that isn't even mapped.

Sorry, I don't understand this paragraph. Memory slots in general are
accelerations for memory access - for MMIO (RAM is usually aligned), KVM
can always exit to QEMU and just do a manual MMIO exit.

> It would seem that instead of padding up to the next page, start_addr
> should be moved to the start of the page that includes it, to make the
> page with the original start_addr available to the guest.

No, because in that case you would map something as RAM that really
isn't RAM.

Imagine you have the following memory layout:

0x1000 page size

1) 0x0     - 0x10000 RAM
2) 0x10000 - 0x10100 MMIO
3) 0x10100 - 0x20000 RAM

Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
memory slot.

You can't map the page from 0x10000 - 0x11000 as memory slot, because
part of it is MMIO.


Alex
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Mon, Nov 03, 2014 at 11:56:24PM +0800, Amos Kong wrote:
>
> @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
>
>  	if (rng->cleanup)
>  		rng->cleanup(rng);

You need a compiler barrier here to prevent reordering.

> +	rng->cleanup_done = true;

Thanks,
--
Email: Herbert Xu
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
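A sketch of what Herbert is asking for, laid against the structure of
the patch under review (this assumes the series gives struct hwrng a
kref named 'ref' and a 'cleanup_done' flag; whether a plain barrier()
suffices or an smp_* barrier is needed depends on who reads
cleanup_done, which this excerpt doesn't show):

static inline void cleanup_rng(struct kref *kref)
{
	struct hwrng *rng = container_of(kref, struct hwrng, ref);

	if (rng->cleanup)
		rng->cleanup(rng);

	/* keep the compiler from hoisting the flag update above the
	 * cleanup call; otherwise a waiter could observe
	 * cleanup_done == true while cleanup is still running */
	barrier();

	rng->cleanup_done = true;
}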
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10/11/2014 14:16, Alexander Graf wrote:
> No, because in that case you would map something as RAM that really
> isn't RAM.
>
> Imagine you have the following memory layout:
>
> 0x1000 page size
>
> 1) 0x0     - 0x10000 RAM
> 2) 0x10000 - 0x10100 MMIO
> 3) 0x10100 - 0x20000 RAM
>
> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
> memory slot.
>
> You can't map the page from 0x10000 - 0x11000 as memory slot, because
> part of it is MMIO.

Right.  The partial RAM page remains marked as MMIO as far as KVM is
concerned, so accesses are slow and you cannot run code from it.
However, it is fundamental that MMIO areas are not marked as RAM.

Paolo
Re: [Qemu-devel] [PATCH] kvm: Fix memory slot page alignment logic
On 10 November 2014 13:16, Alexander Graf wrote:
> Sorry, I don't understand this paragraph. Memory slots in general are
> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
> can always exit to QEMU and just do a manual MMIO exit.

...you're a bit stuck if you were hoping to execute code from
that RAM, though, so they're not *purely* acceleration, right?

-- PMM
Re: [PATCH] kvm: Fix memory slot page alignment logic
On Mon, 10 Nov 2014 14:16:58 +0100
Alexander Graf wrote:

> On 10.11.14 13:31, Igor Mammedov wrote:
>> On Fri, 7 Nov 2014 22:18:45 +0100
>> Alexander Graf wrote:
>>
>>> Memory slots have to be page aligned to get entered into KVM. There
>>> is existing logic that tries to ensure that we pad memory slots that
>>> are not page aligned to the biggest region that would still fit in the
>>> alignment requirements.
>>>
>>> Unfortunately, that logic is broken. It tries to calculate the start
>>> offset based on the region size.
>>>
>>> Fix up the logic to do the thing it was intended to do and document it
>>> properly in the comment above it.
>>>
>>> With this patch applied, I can successfully run an e500 guest with more
>>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>>
>>> Cc: qemu-sta...@nongnu.org
>>> Signed-off-by: Alexander Graf
>>> ---
>>>  kvm-all.c | 6 ++++--
>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 44a5e72..596e7ce 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>>          unsigned delta;
>>>
>>>          /* kvm works in page size chunks, but the function may be called
>>> -           with sub-page size and unaligned start address. */
>>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>>> +           with sub-page size and unaligned start address. Pad the start
>>> +           address to next and truncate size to previous page boundary. */
>> I'm a bit confused how it works at all.
>> Let's assume there are no mapped pages that include start_addr: if
>> start_addr were padded to the next page, KVM would map it from there,
>> but the rest of QEMU would still use the unaligned start_addr for a
>> MemoryRegion that isn't even mapped.
>
> Sorry, I don't understand this paragraph. Memory slots in general are
> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
> can always exit to QEMU and just do a manual MMIO exit.
>
>> It would seem that instead of padding up to the next page, start_addr
>> should be moved to the start of the page that includes it, to make the
>> page with the original start_addr available to the guest.
>
> No, because in that case you would map something as RAM that really
> isn't RAM.
>
> Imagine you have the following memory layout:
>
> 0x1000 page size
>
> 1) 0x0     - 0x10000 RAM
> 2) 0x10000 - 0x10100 MMIO
> 3) 0x10100 - 0x20000 RAM
>
> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
> memory slot.

So every access to RAM in 0x10100-0x11000, which is not represented as
a memory slot, would cause a VMEXIT?

> You can't map the page from 0x10000 - 0x11000 as memory slot, because
> part of it is MMIO.
>
>
> Alex
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 14:55, Igor Mammedov wrote:
> On Mon, 10 Nov 2014 14:16:58 +0100
> Alexander Graf wrote:
>>
>> On 10.11.14 13:31, Igor Mammedov wrote:
>>> On Fri, 7 Nov 2014 22:18:45 +0100
>>> Alexander Graf wrote:
>>>
>>>> Memory slots have to be page aligned to get entered into KVM. There
>>>> is existing logic that tries to ensure that we pad memory slots that
>>>> are not page aligned to the biggest region that would still fit in the
>>>> alignment requirements.
>>>>
>>>> Unfortunately, that logic is broken. It tries to calculate the start
>>>> offset based on the region size.
>>>>
>>>> Fix up the logic to do the thing it was intended to do and document it
>>>> properly in the comment above it.
>>>>
>>>> With this patch applied, I can successfully run an e500 guest with more
>>>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>>>
>>>> Cc: qemu-sta...@nongnu.org
>>>> Signed-off-by: Alexander Graf
>>>> ---
>>>>  kvm-all.c | 6 ++++--
>>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>> index 44a5e72..596e7ce 100644
>>>> --- a/kvm-all.c
>>>> +++ b/kvm-all.c
>>>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>>>          unsigned delta;
>>>>
>>>>          /* kvm works in page size chunks, but the function may be called
>>>> -           with sub-page size and unaligned start address. */
>>>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>>>> +           with sub-page size and unaligned start address. Pad the start
>>>> +           address to next and truncate size to previous page boundary. */
>>> I'm a bit confused how it works at all.
>>> Let's assume there are no mapped pages that include start_addr: if
>>> start_addr were padded to the next page, KVM would map it from there,
>>> but the rest of QEMU would still use the unaligned start_addr for a
>>> MemoryRegion that isn't even mapped.
>>
>> Sorry, I don't understand this paragraph. Memory slots in general are
>> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
>> can always exit to QEMU and just do a manual MMIO exit.
>>
>>> It would seem that instead of padding up to the next page, start_addr
>>> should be moved to the start of the page that includes it, to make the
>>> page with the original start_addr available to the guest.
>>
>> No, because in that case you would map something as RAM that really
>> isn't RAM.
>>
>> Imagine you have the following memory layout:
>>
>> 0x1000 page size
>>
>> 1) 0x0     - 0x10000 RAM
>> 2) 0x10000 - 0x10100 MMIO
>> 3) 0x10100 - 0x20000 RAM
>>
>> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
>> memory slot.
> So every access to RAM in 0x10100-0x11000, which is not represented as
> a memory slot, would cause a VMEXIT?

Yes, there's no other way.  Otherwise we wouldn't be able to trap on
accesses to 0x10000 - 0x10100.  Hardware only gives us page
granularity.

Usually this isn't an issue because overlapping MMIO regions are pretty
large chunks of power-of-2 size - if you see any overlapping at all. On
e500 this bites us though, because we end up with small MSI-X windows
inside our address space (which in turn might also be a bug, but that
doesn't mean that the slot mapping logic should be left as broken as it
is).

>
>> You can't map the page from 0x10000 - 0x11000 as memory slot, because
>> part of it is MMIO.


Alex
Re: [Qemu-devel] [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 14:55, Peter Maydell wrote:
> On 10 November 2014 13:16, Alexander Graf wrote:
>> Sorry, I don't understand this paragraph. Memory slots in general are
>> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
>> can always exit to QEMU and just do a manual MMIO exit.
>
> ...you're a bit stuck if you were hoping to execute code from
> that RAM, though, so they're not *purely* acceleration, right?

Yes and no.  Technically, there's no reason KVM couldn't do an MMIO
exit dance to fetch the next instruction.  From user space this should
be indistinguishable.  Today, I don't think it's implemented though :).

Alex
Re: [RFC PATCH 1/4] ARM: KVM: on unhandled IO mem abort, route the call to the KVM MMIO bus
Hello,

On Fri, Mar 28, 2014 at 9:09 PM, Christoffer Dall wrote:
>
> On Thu, Mar 13, 2014 at 04:57:26PM +0100, Antonios Motakis wrote:
> > On an unhandled IO memory abort, use the kvm_io_bus_* API in order to
> > handle the MMIO access through any registered read/write callbacks. This
> > is a dependency for eventfd support (ioeventfd and irqfd).
> >
> > However, accesses to the VGIC are still left implemented independently,
> > since the kvm_io_bus_* API doesn't pass the VCPU pointer doing the access.
> >
> > Signed-off-by: Antonios Motakis
> > Signed-off-by: Nikolay Nikolaev
> > ---
> >  arch/arm/kvm/mmio.c | 32 ++++++++++++++++++++++++++++++++
> >  virt/kvm/arm/vgic.c |  5 ++++-
> >  2 files changed, 36 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
> > index 4cb5a93..1d17831 100644
> > --- a/arch/arm/kvm/mmio.c
> > +++ b/arch/arm/kvm/mmio.c
> > @@ -162,6 +162,35 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  	return 0;
> >  }
> >
> > +/**
> > + * kvm_handle_mmio - handle an in-kernel MMIO access
> > + * @vcpu:	pointer to the vcpu performing the access
> > + * @run:	pointer to the kvm_run structure
> > + * @mmio:	pointer to the data describing the access
> > + *
> > + * returns true if the MMIO access has been performed in kernel space,
> > + * and false if it needs to be emulated in user space.
> > + */
> > +static bool handle_kernel_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> > +		struct kvm_exit_mmio *mmio)
> > +{
> > +	int ret;
> > +	if (mmio->is_write) {
> > +		ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, mmio->phys_addr,
> > +				mmio->len, &mmio->data);
> > +
> > +	} else {
> > +		ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, mmio->phys_addr,
> > +				mmio->len, &mmio->data);
> > +	}
> > +	if (!ret) {
> > +		kvm_prepare_mmio(run, mmio);
> > +		kvm_handle_mmio_return(vcpu, run);
> > +	}
> > +
> > +	return !ret;
> > +}
> > +
> >  int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  		 phys_addr_t fault_ipa)
> >  {
> > @@ -200,6 +229,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  	if (vgic_handle_mmio(vcpu, run, &mmio))
> >  		return 1;
> >
> > +	if (handle_kernel_mmio(vcpu, run, &mmio))
> > +		return 1;
> > +

We're reconsidering the ioeventfds patch series and we tried to
evaluate what you suggested here.

>
> this special-casing of the vgic is now really terrible.  Is there
> anything holding you back from doing the necessary restructure of the
> kvm_bus_io_*() API instead?

Restructuring the kvm_io_bus_ API is not a big thing (we actually did
it), but it is not directly related to these patches.  Of course it can
be justified if we do it in the context of removing vgic_handle_mmio
and leaving only handle_kernel_mmio.

>
> That would allow us to get rid of the ugly
> Fix it! in the vgic driver as well.

Going through vgic_handle_mmio we see that it will require large
refactoring:

 - there are 15 MMIO ranges for the vgic now - each should be
   registered as a separate device
 - the handler of each range should be split into read and write
 - all handlers take 'struct kvm_exit_mmio', and pass it to
   'vgic_reg_access', 'mmio_data_read' and 'mmio_data_write'

To sum up: if we do this refactoring of the vgic's MMIO handling plus
the kvm_io_bus_ API getting a 'vcpu' argument, we'll get much cleaner
vgic code and, as a bonus, 'ioeventfd' capabilities.

We have 3 questions:

 - is the kvm_io_bus_ API getting a 'vcpu' argument acceptable for the
   other architectures too?
 - is this huge vgic MMIO handling redesign acceptable/desired (it
   touches a lot of code)?
 - is there a way that ioeventfd is accepted leaving vgic.c in its
   current state?

regards,
Nikolay Nikolaev
Virtual Open Systems

>
> -Christoffer
>
> >  	kvm_prepare_mmio(run, &mmio);
> >  	return 0;
> >  }
> > diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> > index 8ca405c..afdecc3 100644
> > --- a/virt/kvm/arm/vgic.c
> > +++ b/virt/kvm/arm/vgic.c
> > @@ -849,13 +849,16 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges,
> >  }
> >
> >  /**
> > - * vgic_handle_mmio - handle an in-kernel MMIO access
> > + * vgic_handle_mmio - handle an in-kernel vgic MMIO access
> >   * @vcpu:	pointer to the vcpu performing the access
> >   * @run:	pointer to the kvm_run structure
> >   * @mmio:	pointer to the data describing the access
> >   *
> >   * returns true if the MMIO access has been performed in kernel space,
> >   * and false if it needs to be emulated in user space.
> > + *
> > + * This is handled outside of kvm_handle_mmio because the kvm_io_bus only
> > + * passes the VM pointer, while we need the VCPU performing the access.
> >   */
> >  bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  		      struct kvm_exit_mmio *mmio)
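For concreteness, the kvm_io_bus_ restructuring Nikolay describes
amounts to roughly this signature change against the
include/linux/kvm_host.h of the time (a sketch of the direction
discussed in this thread, not a merged patch):

-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val);
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		    int len, void *val);
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		     int len, const void *val);
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		    int len, void *val);

handle_kernel_mmio() in the patch above would then pass vcpu rather
than vcpu->kvm, and the stated reason for keeping vgic_handle_mmio()
outside the MMIO bus would disappear.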
Re: [RFC PATCH 1/4] ARM: KVM: on unhandled IO mem abort, route the call to the KVM MMIO bus
On Mon, Nov 10, 2014 at 05:09:07PM +0200, Nikolay Nikolaev wrote: > Hello, > > On Fri, Mar 28, 2014 at 9:09 PM, Christoffer Dall > wrote: > > > > On Thu, Mar 13, 2014 at 04:57:26PM +0100, Antonios Motakis wrote: > > > On an unhandled IO memory abort, use the kvm_io_bus_* API in order to > > > handle the MMIO access through any registered read/write callbacks. This > > > is a dependency for eventfd support (ioeventfd and irqfd). > > > > > > However, accesses to the VGIC are still left implemented independently, > > > since the kvm_io_bus_* API doesn't pass the VCPU pointer doing the access. > > > > > > Signed-off-by: Antonios Motakis > > > Signed-off-by: Nikolay Nikolaev > > > --- > > > arch/arm/kvm/mmio.c | 32 > > > virt/kvm/arm/vgic.c | 5 - > > > 2 files changed, 36 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c > > > index 4cb5a93..1d17831 100644 > > > --- a/arch/arm/kvm/mmio.c > > > +++ b/arch/arm/kvm/mmio.c > > > @@ -162,6 +162,35 @@ static int decode_hsr(struct kvm_vcpu *vcpu, > > > phys_addr_t fault_ipa, > > > return 0; > > > } > > > > > > +/** > > > + * kvm_handle_mmio - handle an in-kernel MMIO access > > > + * @vcpu:pointer to the vcpu performing the access > > > + * @run: pointer to the kvm_run structure > > > + * @mmio:pointer to the data describing the access > > > + * > > > + * returns true if the MMIO access has been performed in kernel space, > > > + * and false if it needs to be emulated in user space. > > > + */ > > > +static bool handle_kernel_mmio(struct kvm_vcpu *vcpu, struct kvm_run > > > *run, > > > + struct kvm_exit_mmio *mmio) > > > +{ > > > + int ret; > > > + if (mmio->is_write) { > > > + ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, > > > mmio->phys_addr, > > > + mmio->len, &mmio->data); > > > + > > > + } else { > > > + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, > > > mmio->phys_addr, > > > + mmio->len, &mmio->data); > > > + } > > > + if (!ret) { > > > + kvm_prepare_mmio(run, mmio); > > > + kvm_handle_mmio_return(vcpu, run); > > > + } > > > + > > > + return !ret; > > > +} > > > + > > > int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, > > >phys_addr_t fault_ipa) > > > { > > > @@ -200,6 +229,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct > > > kvm_run *run, > > > if (vgic_handle_mmio(vcpu, run, &mmio)) > > > return 1; > > > > > > + if (handle_kernel_mmio(vcpu, run, &mmio)) > > > + return 1; > > > + > > > We're reconsidering ioeventfds patchseries and we tried to evaluate > what you suggested here. > > > > > this special-casing of the vgic is now really terrible. Is there > > anything holding you back from doing the necessary restructure of the > > kvm_bus_io_*() API instead? > > Restructuring the kvm_io_bus_ API is not a big thing (we actually did > it), but is not directly related to the these patches. > Of course it can be justified if we do it in the context of removing > vgic_handle_mmio and leaving only handle_kernel_mmio. > > > > > That would allow us to get rid of the ugly > > Fix it! in the vgic driver as well. 
>
> Going through vgic_handle_mmio we see that it will require large
> refactoring:
>  - there are 15 MMIO ranges for the vgic now - each should be
>    registered as a separate device
>  - the handler of each range should be split into read and write
>  - all handlers take 'struct kvm_exit_mmio', and pass it to
>    'vgic_reg_access', 'mmio_data_read' and 'mmio_data_write'
>
> To sum up - if we do this refactoring of the vgic's MMIO handling, plus
> the kvm_io_bus_ API getting a 'vcpu' argument, we'll get much cleaner
> vgic code and as a bonus we'll get ioeventfd capabilities.
>
> We have 3 questions:
>  - is the kvm_io_bus_ API getting a 'vcpu' argument acceptable for the
>    other architectures too?
>  - is this huge vgic MMIO handling redesign acceptable/desired (it
>    touches a lot of code)?
>  - is there a way that ioeventfd is accepted leaving vgic.c in its
>    current state?
>

Not sure how the latter question is relevant to this, but check with
Andre who recently looked at this as well and decided that for GICv3
the only sane thing was to remove that comment for the gic.

I don't recall the details of what you were trying to accomplish here
(it's been 8 months or so), but surely the vgic handling code should
*somehow* be integrated into handle_kernel_mmio (like Paolo suggested),
unless you come back and tell me that that would involve a complete
rewrite of the vgic code.

-Christoffer
Re: Seeking a KVM benchmark
On 10/11/2014 15:23, Avi Kivity wrote:
> It's not surprising [1]. Since the meaning of some PTE bits change [2],
> the TLB has to be flushed. In VMX we have VPIDs, so we only need to flush
> if EFER changed between two invocations of the same VPID, which isn't the
> case.
>
> [1] after the fact
> [2] although those bits were reserved with NXE=0, so they shouldn't have
> any TLB footprint

You're right that this is not that surprising after the fact, and that
both Sandy Bridge and Ivy Bridge have VPIDs (even the non-Xeon ones).
This is also why I'm curious about the Nehalem.

However note that even toggling the SCE bit is flushing the TLB. The
NXE bit is not being toggled here! That's the more surprising part.

Paolo
Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs
On 06/11/2014 17:45, Radim Krčmář wrote:
> -		} else if (kvm_apic_hw_enabled(apic)) {
> +		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
>  			if (kvm_apic_get_reg(apic, APIC_DFR) ==
>  			    APIC_DFR_CLUSTER) {
>  				new->cid_shift = 4;

Why are you removing the kvm_apic_hw_enabled(apic)? We do not reset the
APIC to the power-up values (the SDM says that "may" happen) when the
enabled bit is turned off in the APIC_BASE MSR.

Paolo
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 06:28:25PM +0100, Paolo Bonzini wrote:
> On 10/11/2014 15:23, Avi Kivity wrote:
> > It's not surprising [1]. Since the meaning of some PTE bits change [2],
> > the TLB has to be flushed. In VMX we have VPIDs, so we only need to flush
> > if EFER changed between two invocations of the same VPID, which isn't the
> > case.
> >
> > [1] after the fact
> > [2] although those bits were reserved with NXE=0, so they shouldn't have
> > any TLB footprint
>
> You're right that this is not that surprising after the fact, and that
> both Sandy Bridge and Ivy Bridge have VPIDs (even the non-Xeon ones).
> This is also why I'm curious about the Nehalem.
>
> However note that even toggling the SCE bit is flushing the TLB. The
> NXE bit is not being toggled here! That's the more surprising part.
>
Just a guess, but maybe because writing EFER is not something that
happens often in regular OSes, it is not optimized to handle different
bits differently.

--
	Gleb.
Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs
2014-11-10 18:35+0100, Paolo Bonzini:
> On 06/11/2014 17:45, Radim Krčmář wrote:
> > -		} else if (kvm_apic_hw_enabled(apic)) {
> > +		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
> >  			if (kvm_apic_get_reg(apic, APIC_DFR) ==
> >  			    APIC_DFR_CLUSTER) {
> >  				new->cid_shift = 4;
>
> Why are you removing the kvm_apic_hw_enabled(apic)? We do not reset the
> APIC to the power-up values (the SDM says that "may" happen) when the
> enabled bit is turned off in the APIC_BASE MSR.

We checked for that before, in kvm_apic_present().
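For context, a sketch of the check Radim refers to, reconstructed from
memory of the lapic helpers of that era; treat the exact definition as
approximate:

    /* kvm_apic_present() already folds in the hardware-enable test,
     * so re-testing kvm_apic_hw_enabled() in a branch guarded by it
     * would be redundant. */
    static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
    {
        return kvm_vcpu_has_lapic(vcpu) &&
               kvm_apic_hw_enabled(vcpu->arch.apic);
    }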
Re: several messages
On Mon, 10 Nov 2014, Feng Wu wrote:
> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> With VT-d Posted-Interrupts enabled, external interrupts from
> direct-assigned devices can be delivered to guests without VMM
> intervention when guest is running in non-root mode.

Can you please talk to Jiang and synchronize your work with his
refactoring of the x86 interrupt handling subsystem.

I want this stuff cleaned up first before we add new stuff to it.

Thanks,

	tglx
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 2:45 AM, Gleb Natapov wrote:
> On Mon, Nov 10, 2014 at 11:03:35AM +0100, Paolo Bonzini wrote:
>>
>> On 09/11/2014 17:36, Andy Lutomirski wrote:
>> >> The purpose of vmexit test is to show us various overheads, so why not
>> >> measure EFER switch overhead by having two tests one with equal EFER
>> >> another with different EFER, instead of hiding it.
>> >
>> > I'll try this. We might need three tests, though: NX different, NX
>> > same but SCE different, and all flags the same.
>>
>> The test actually explicitly enables NX in order to put itself in the
>> "common case":
>>
>> commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
>> Author: Marcelo Tosatti
>> Date:   Tue Jun 8 15:33:29 2010 -0300
>>
>>     test: vmexit: enable NX
>>
>>     Enable NX to disable MSR autoload/save. This is the common case anyway.
>>
>>     Signed-off-by: Marcelo Tosatti
>>     Signed-off-by: Avi Kivity
>>
>> (this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
>> just a bug. The results on my Xeon Sandy Bridge are very interesting:
>>
>>     NX different            ~11.5k  (load/save EFER path)
>>     NX same, SCE different  ~19.5k  (urn path)
>>     all flags the same      ~10.2k
>>
>> The inl_from_kernel results have absolutely no change, usually at most 5
>> cycles difference. This could be because I've added the SCE=1 variant
>> directly to vmexit.c, so I'm running the tests one next to the other.
>>
>> I tried making also the other shared MSRs the same between guest and
>> host (STAR, LSTAR, CSTAR, SYSCALL_MASK), so that the user return notifier
>> has nothing to do. That saves about 400-500 cycles on inl_from_qemu. I
>> do want to dig out my old Core 2 and see how the new test fares, but it
>> really looks like your patch will be in 3.19.
>>
> Please test on a wide variety of HW before the final decision. Also it
> would be nice to ask Intel what the expected overhead is. It is awesome
> if they manage to add EFER switching with non-measurable overhead, but
> also hard to believe :) Also Andy had an idea to disable switching in
> case host and guest EFERs are the same, but IIRC his patch does not
> include it yet.

I'll send that patch as a followup in a sec. It doesn't seem to make a
difference, which reinforces my hypothesis that microcode is fiddling
with EFER on entry and exit to handle LME and LMA anyway, so adjusting
the other bits doesn't affect performance.

--Andy

>
> --
> 	Gleb.

--
Andy Lutomirski
AMA Capital Management, LLC
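A minimal sketch of what the SCE=1 variant amounts to, assuming the
usual kvm-unit-tests rdmsr/wrmsr helpers; the constants are the
architectural ones, but the exact vmexit.c change Paolo made is not
shown here:

    #define MSR_EFER  0xc0000080
    #define EFER_SCE  (1ull << 0)   /* syscall enable */
    #define EFER_NX   (1ull << 11)  /* no-execute enable */

    /* Make guest EFER match a typical host: NX and SCE both set, so
     * no EFER bits differ across VM entry/exit ("all flags the same"
     * case in the numbers above). */
    static void enable_nx_and_sce(void)
    {
        wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NX | EFER_SCE);
    }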
[PATCH] x86, kvm, vmx: Don't set LOAD_IA32_EFER when host and guest match
There's nothing to switch if the host and guest values are the same.

I am unable to find evidence that this makes any difference whatsoever.

Signed-off-by: Andy Lutomirski
---
 arch/x86/kvm/vmx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e72b9660e51c..c430b01b012b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1670,7 +1670,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 		guest_efer = vmx->vcpu.arch.efer;
 		if (!(guest_efer & EFER_LMA))
 			guest_efer &= ~EFER_LME;
-		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+		if (guest_efer != host_efer)
+			add_atomic_switch_msr(vmx, MSR_EFER,
+					      guest_efer, host_efer);
 		return false;
 	}
--
1.9.3
Re: [PATCH] x86, kvm, vmx: Always use LOAD_IA32_EFER if available
On Sun, Nov 9, 2014 at 8:33 PM, Wanpeng Li wrote:
> Hi Andy,
> On Fri, Nov 07, 2014 at 06:25:18PM -0800, Andy Lutomirski wrote:
>> At least on Sandy Bridge, letting the CPU switch IA32_EFER is much
>> faster than switching it manually.
>>
>> I benchmarked this using the vmexit kvm-unit-test (single run, but
>> GOAL multiplied by 5 to do more iterations):
>>
>> Test                Before    After    Change
>> cpuid                 2000     1932    -3.40%
>> vmcall                1914     1817    -5.07%
>> mov_from_cr8            13       13     0.00%
>> mov_to_cr8              19       19     0.00%
>> inl_from_pmtimer     19164    10619   -44.59%
>> inl_from_qemu        15662    10302   -34.22%
>
> What's the difference of IA32_EFER between guest and host in your config?
>
> IIUC,
> - NX is not consistent
>   IA32_EFER will be auto-loaded w/ and w/o the patch.
> - SCE is not consistent
>   IA32_EFER will be switched through wrmsr(urn) w/o the patch, and
>   auto-loaded w/ the patch.

This is with kvm-unit-test as is, so NX is consistent but SCE is
different.

--Andy
compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
Linus,

Last week I sent the below patch to Paolo for kvm/next. Heiko Carstens
pointed out that the kernel often does not work around compiler bugs;
instead we blacklist the compiler, add a build bug, or similar when we
can't be sure to catch all broken cases, e.g. Heiko mentioned the x86
specific test cases for stack protector.

Now: I can reproduce the below miscompile on gcc 4.6 and gcc 4.7.
gcc 4.5 seems ok, gcc 4.8 is fixed. This makes blacklisting a bit hard,
especially since it is not limited to s390, but covers all
architectures. In essence, ACCESS_ONCE will not work reliably on
aggregate types with gcc 4.6 and gcc 4.7. In Linux we seem to use
ACCESS_ONCE mostly on scalar types; the below code is an example where
we don't - and break.

Linus, what is your take on workarounds for compiler bugs? The barrier
solution below will fix that particular case, but are there others?
Maybe we can come up with a clever trick of creating a build-bug for
ACCESS_ONCE on non-scalar types? A testcase is not trivial, since we
have to rely on other optimization steps to actually do the wrong
thing, and the gcc testcase will dump the internal tree and check that
- something that does not seem to be ok for the kernel.

Christian

-------- Forwarded Message --------
Subject: [GIT PULL 1/4] KVM: s390: Fix ipte locking
Date: Fri, 7 Nov 2014 12:45:13 +0100
From: Christian Borntraeger
To: Paolo Bonzini
CC: KVM, Gleb Natapov, Alexander Graf, Cornelia Huck, Jens Freimann,
    linux-s390, Christian Borntraeger, sta...@vger.kernel.org

ipte_unlock_siif uses cmpxchg to replace the in-memory data of the ipte
lock together with ACCESS_ONCE for the initial read.

union ipte_control {
	unsigned long val;
	struct {
		unsigned long k  : 1;
		unsigned long kh : 31;
		unsigned long kg : 32;
	};
};
[...]
static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		new = old = ACCESS_ONCE(*ic);
		new.kh--;
		if (!new.kh)
			new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
	if (!new.kh)
		wake_up(&vcpu->kvm->arch.ipte_wq);
}

The new value is loaded twice from memory with gcc 4.7.2 of fedora 18,
despite the ACCESS_ONCE:

--->	l	%r4,0(%r3)	<--- load first 32 bit of lock (k and kh) in r4
	alfi	%r4,2147483647	<--- add -1 to r4
	llgtr	%r4,%r4		<--- zero out the sign bit of r4
	lg	%r1,0(%r3)	<--- load all 64 bit of lock into new
	lgr	%r2,%r1		<--- load the same into old
	risbg	%r1,%r4,1,31,32	<--- shift and insert r4 into the bits 1-31 of new
	llihf	%r4,2147483647
	ngrk	%r4,%r1,%r4
	jne	aa0
	nihh	%r1,32767
	lgr	%r4,%r2
	csg	%r4,%r1,0(%r3)
	cgr	%r2,%r4
	jne	a70

If the memory value changes between the first load (l) and the second
load (lg) we are broken. If that happens, VCPU threads will hang
(unkillable) in handle_ipte_interlock.

Andreas Krebbel analyzed this and tracked it down to a compiler bug in
that version:

"while it is not that obvious the C99 standard basically forbids
duplicating the memory access also in that case. For an argumentation
of a similar case please see:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22278#c43

For the implementation-defined cases regarding volatile there are some
GCC-specific clarifications which can be found here:
https://gcc.gnu.org/onlinedocs/gcc/Volatiles.html#Volatiles

I've tracked down the problem with a reduced testcase. The problem was
that during a tree level optimization (SRA - scalar replacement of
aggregates) the volatile marker is lost.
And an RTL level optimizer (CSE - common subexpression elimination)
then propagated the memory read into its second use, introducing
another access to the memory location. So indeed Christian's suspicion
that the union access has something to do with it is correct (since it
triggered the SRA optimization).

This issue has been reported and fixed in the GCC 4.8 development
cycle:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145"

This patch replaces the ACCESS_ONCE scheme with a barrier() based
scheme that should work for all supported compilers.

Signed-off-by: Christian Borntraeger
Cc: sta...@vger.kernel.org # v3.16+
---
 arch/s390/kvm/gaccess.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 0f961a1..6dc0ad9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -229,10 +229,12 @@ static void ipte_lock_simple(struct kvm_vcpu *vcpu)
 		goto out;
 	ic = &vcpu->kvm->arch.sca->ipte_control;
 	do {
-		old = ACCESS_ONCE(*ic);
+		old = *ic;
+		barrier();
 		while (old.k) {
 			cond_resched();
-
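The patch text is cut off above. A minimal sketch of the pattern it
describes, as it would apply to the ipte_unlock_siif loop from the
forwarded mail (reconstructed from the description, not the literal
hunk):

	/* Plain load plus barrier(): 'old' is a local copy, and the
	 * compiler clobber in barrier() forbids re-deriving it from
	 * *ic later, so there is exactly one read before the cmpxchg
	 * and SRA cannot split the access. */
	do {
		old = *ic;
		barrier();
		new = old;
		new.kh--;
		if (!new.kh)
			new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);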
Re: compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
On Mon, Nov 10, 2014 at 12:18 PM, Christian Borntraeger wrote:
>
> Now: I can reproduce the below miscompile on gcc 4.6 and gcc 4.7.
> gcc 4.5 seems ok, gcc 4.8 is fixed. This makes blacklisting
> a bit hard, especially since it is not limited to s390, but
> covers all architectures.
> In essence, ACCESS_ONCE will not work reliably on aggregate
> types with gcc 4.6 and gcc 4.7.
> In Linux we seem to use ACCESS_ONCE mostly on scalar types;
> the below code is an example where we don't - and break.

Hmm. I think we should see how painful it would be to make it a rule
that ACCESS_ONCE() only works on scalar types.

Even in the actual code you show as an example, the "fix" is really to
use the "unsigned long val" member of the union for the ACCESS_ONCE().
And that seems to be true in many other cases too.

So before blacklisting any compilers, let's first see if

 (a) we can actually make it a real rule that we only use ACCESS_ONCE
     on scalars

 (b) we can somehow enforce this with a compiler warning/error for
     mis-uses

For example, the attached patch works for some cases, but shows how we
use ACCESS_ONCE() on pointers to pte_t's etc, so it doesn't come even
close to compiling the whole kernel. But I wonder how painful that
would be to change.. The places where it complains are actually
somewhat debatable to begin with, like:

 - handle_pte_fault(.. pte_t *pte ..):

        entry = ACCESS_ONCE(*pte);

and the thing is, "pte" is actually possibly an 8-byte entity on
x86-32, and that ACCESS_ONCE() fundamentally will be two 32-bit reads.

So there is a very valid argument for saying "well, you shouldn't do
that, then", and that we might be better off cleaning up our
ACCESS_ONCE() uses, than to just blindly blacklist compilers.

NOTE! I'm not at all advocating the attached patch. I'm sending it out
white-space damaged on purpose, it's more of a "hey, something like
this might be the direction we want to go in", with the spinlock.h
part of the patch also acting as an example of the kind of changes the
"ACCESS_ONCE() only works on scalars" rule would require.

So I do agree with Heiko that we generally don't want to work around
compiler bugs if we can avoid it. But sometimes the compiler bugs do
end up saying "you're doing something very fragile". Maybe we should
try to be less fragile here.

And in your example, the whole

    old = ACCESS_ONCE(*ic);

*could* just be a

    old.val = ACCESS_ONCE(ic->val);

the same way the x86 spinlock.h changes below.

I did *not* try to see how many other cases we have. It's possible
that your "don't use ACCESS_ONCE, use a barrier() instead" ends up
being a valid workaround. For the pte case, that may well be the
solution, for example (because what we really care about is not so
much "it's an atomic access" but "it's still the same that we
originally assumed"). Sometimes we want ACCESS_ONCE() because we
really want an atomic value (and we just don't care if it's old or
new), but sometimes it's really because we don't want the compiler to
re-load it and possibly see two different values - one that we check,
and one that we actually use (and then a barrier() would generally be
perfectly sufficient).

Adding some more people to the discussion just to see if anybody else
has comments about ACCESS_ONCE() on aggregate types.

(Btw, it's not just aggregate types, even non-aggregate types like
"long long" are not necessarily safe, to give the same 64-bit on
x86-32 example. So adding an assert that it's smaller or equal in size
to a "long" might also not be unreasonable.)

            Linus

---
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d5ad7b1118fc..63e82f1dfc1a 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -378,7 +378,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  * use is to mediate communication between process-level code and irq/NMI
  * handlers, all running on the same CPU.
  */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+#define get_scalar_volatile_pointer(x) ({ \
+	typeof(x) *__p = &(x); \
+	volatile typeof(x) *__vp = __p; \
+	(void)(long)*__p; __vp; })
+#define ACCESS_ONCE(x) (*get_scalar_volatile_pointer(x))

 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 9295016485c9..b7e6825af5e3 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -105,7 +105,7 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	arch_spinlock_t old, new;

-	old.tickets = ACCESS_ONCE(lock->tickets);
+	old.head_tail = ACCESS_ONCE(lock->head_tail);
 	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
 		return 0;

@@ -162,16 +162,14 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 static inline int arch_spin_i
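The patch is cut off above. One way to bolt the size assert Linus
mentions onto his sketch, keeping ACCESS_ONCE() usable as an lvalue;
this is illustrative only (BUILD_BUG_ON is the stock kernel helper, the
rest follows the sketch above):

	#define get_scalar_volatile_pointer(x) ({ \
		typeof(x) *__p = &(x); \
		volatile typeof(x) *__vp = __p; \
		/* rejects "long long" on 32-bit, per the Btw above */ \
		BUILD_BUG_ON(sizeof(x) > sizeof(long)); \
		/* the cast still rejects aggregates at compile time */ \
		(void)(long)*__p; __vp; })
	#define ACCESS_ONCE(x) (*get_scalar_volatile_pointer(x))

Because the statement expression yields a pointer that is dereferenced
outside the braces, ACCESS_ONCE(x) = val continues to work.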
Re: [PATCH 05/13] KVM: Update IRTE according to guest interrupt configuration changes
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote: > When guest changes its interrupt configuration (such as, vector, etc.) > for direct-assigned devices, we need to update the associated IRTE > with the new guest vector, so external interrupts from the assigned > devices can be injected to guests without VM-Exit. > > The current method of handling guest lowest priority interrtups > is to use a counter 'apic_arb_prio' for each VCPU, we choose the > VCPU with smallest 'apic_arb_prio' and then increase it by 1. > However, for VT-d PI, we cannot re-use this, since we no longer > have control to 'apic_arb_prio' with posted interrupt direct > delivery by Hardware. > > Here, we introduce a similiar way with 'apic_arb_prio' to handle > guest lowest priority interrtups when VT-d PI is used. Here is the > ideas: > - Each VCPU has a counter 'round_robin_counter'. > - When guests sets an interrupts to lowest priority, we choose > the VCPU with smallest 'round_robin_counter' as the destination, > then increase it. > > Signed-off-by: Feng Wu > --- > arch/x86/include/asm/irq_remapping.h |6 ++ > arch/x86/include/asm/kvm_host.h |2 + > arch/x86/kvm/vmx.c | 12 +++ > arch/x86/kvm/x86.c | 11 +++ > drivers/iommu/amd_iommu.c|6 ++ > drivers/iommu/intel_irq_remapping.c | 28 +++ > drivers/iommu/irq_remapping.c|9 ++ > drivers/iommu/irq_remapping.h|3 + > include/linux/dmar.h | 26 ++ > include/linux/kvm_host.h | 22 + > include/uapi/linux/kvm.h |1 + > virt/kvm/assigned-dev.c | 141 > ++ > virt/kvm/irq_comm.c |4 +- > virt/kvm/irqchip.c | 11 --- > 14 files changed, 269 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/include/asm/irq_remapping.h > b/arch/x86/include/asm/irq_remapping.h > index a3cc437..32d6cc4 100644 > --- a/arch/x86/include/asm/irq_remapping.h > +++ b/arch/x86/include/asm/irq_remapping.h > @@ -51,6 +51,7 @@ extern void compose_remapped_msi_msg(struct pci_dev *pdev, >unsigned int irq, unsigned int dest, >struct msi_msg *msg, u8 hpet_id); > extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); > +extern int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector); > extern void panic_if_irq_remap(const char *msg); > extern bool setup_remapped_irq(int irq, > struct irq_cfg *cfg, > @@ -88,6 +89,11 @@ static inline int setup_hpet_msi_remapped(unsigned int > irq, unsigned int id) > return -ENODEV; > } > > +static inline int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 > vector) > +{ > + return -ENODEV; > +} > + > static inline void panic_if_irq_remap(const char *msg) > { > } > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 6ed0c30..0630161 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { > struct kvm_lapic *apic;/* kernel irqchip context */ > unsigned long apic_attention; > int32_t apic_arb_prio; > + int32_t round_robin_counter; > int mp_state; > u64 ia32_misc_enable_msr; > bool tpr_access_reporting; > @@ -771,6 +772,7 @@ struct kvm_x86_ops { > int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); > > void (*sched_in)(struct kvm_vcpu *kvm, int cpu); > + u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu); > }; > > struct kvm_arch_async_pf { > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index a4670d3..ae91b72 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -544,6 +544,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu > *vcpu) > return container_of(vcpu, struct vcpu_vmx, vcpu); > } > > +struct pi_desc 
*vcpu_to_pi_desc(struct kvm_vcpu *vcpu) > +{ > + return &(to_vmx(vcpu)->pi_desc); > +} > + > #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) > #define FIELD(number, name) [number] = VMCS12_OFFSET(name) > #define FIELD64(number, name)[number] = VMCS12_OFFSET(name), \ > @@ -4280,6 +4285,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu > *vcpu) > return; > } > > +static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu) > +{ > + return __pa((u64)vcpu_to_pi_desc(vcpu)); > +} > + > /* > * Set up the vmcs's constant host-state fields, i.e., host-state fields that > * will not change in the lifetime of the guest. > @@ -9232,6 +9242,8 @@ static struct kvm_x86_ops vmx_x86_ops = { > .check_nested_events = vmx_check_nested_events, > > .sched_in = vmx_sched_in, > + > + .get_pi_desc_addr = vmx_get_pi_desc_addr, > }; > > static int __init vmx_init(void) > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > inde
Re: [PATCH 02/13] KVM: Initialize VT-d Posted-Interrtups Descriptor
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote: > This patch initialize the VT-d Posted-interrupt Descritpor. > > Signed-off-by: Feng Wu > --- > arch/x86/include/asm/irq_remapping.h |1 + > arch/x86/kernel/apic/apic.c |1 + > arch/x86/kvm/vmx.c | 56 - > 3 files changed, 56 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/irq_remapping.h > b/arch/x86/include/asm/irq_remapping.h > index b7747c4..a3cc437 100644 > --- a/arch/x86/include/asm/irq_remapping.h > +++ b/arch/x86/include/asm/irq_remapping.h > @@ -57,6 +57,7 @@ extern bool setup_remapped_irq(int irq, > struct irq_chip *chip); > > void irq_remap_modify_chip_defaults(struct irq_chip *chip); > +extern int irq_post_enabled; > > #else /* CONFIG_IRQ_REMAP */ > > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c > index ba6cc04..987408d 100644 > --- a/arch/x86/kernel/apic/apic.c > +++ b/arch/x86/kernel/apic/apic.c > @@ -162,6 +162,7 @@ __setup("apicpmtimer", setup_apicpmtimer); > #endif > > int x2apic_mode; > +EXPORT_SYMBOL_GPL(x2apic_mode); > #ifdef CONFIG_X86_X2APIC > /* x2apic enabled before OS handover */ > int x2apic_preenabled; > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 3e556c6..a4670d3 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -45,6 +45,7 @@ > #include > #include > #include > +#include > > #include "trace.h" > > @@ -408,13 +409,32 @@ struct nested_vmx { > }; > > #define POSTED_INTR_ON 0 > +#define POSTED_INTR_SN 1 > + > /* Posted-Interrupt Descriptor */ > struct pi_desc { > u32 pir[8]; /* Posted interrupt requested */ > - u32 control;/* bit 0 of control is outstanding notification bit */ > - u32 rsvd[7]; > + union { > + struct { > + u64 on : 1, > + sn : 1, > + rsvd_1 : 13, > + ndm : 1, > + nv : 8, > + rsvd_2 : 8, > + ndst: 32; > + }; > + u64 control; > + }; > + u32 rsvd[6]; > } __aligned(64); > > +static void pi_clear_sn(struct pi_desc *pi_desc) > +{ > + return clear_bit(POSTED_INTR_SN, > + (unsigned long *)&pi_desc->control); > +} > + > static bool pi_test_and_set_on(struct pi_desc *pi_desc) > { > return test_and_set_bit(POSTED_INTR_ON, > @@ -4396,6 +4416,33 @@ static void ept_set_mmio_spte_mask(void) > kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); > } > > +static bool pi_desc_init(struct vcpu_vmx *vmx) > +{ > + unsigned int dest; > + > + if (irq_post_enabled == 0) > + return true; > + > + /* > + * Initialize Posted-Interrupt Descriptor > + */ > + > + pi_clear_sn(&vmx->pi_desc); > + vmx->pi_desc.nv = POSTED_INTR_VECTOR; > + > + /* Physical mode for Notificaiton Event */ > + vmx->pi_desc.ndm = 0; > + dest = cpu_physical_id(vmx->vcpu.cpu); > + > + if (x2apic_mode) > + vmx->pi_desc.ndst = dest; > + else > + vmx->pi_desc.ndst = (dest << 8) & 0xFF00; > + > + return true; Why does this bother to return anything since it can only return true? > +} > + > + > /* > * Sets up the vmcs for emulated real mode. > */ > @@ -4439,6 +4486,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > > vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); > vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); > + > + if (!pi_desc_init(vmx)) { And therefore this cannot happen. 
> +		printk(KERN_ERR "Initialize PI descriptor error!\n");
> +		return 1;

This is the wrong error anyway, vmx_create_vcpu() returns ERR_PTR(1)
which fails the reverse IS_ERR().

Thanks,
Alex

> +	}
>  }
>
>  	if (ple_gap) {
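For context on why ERR_PTR(1) misbehaves, a sketch based on the stock
include/linux/err.h helpers:

	/* IS_ERR() only treats the top 4095 values of the address
	 * space as errors: */
	#define MAX_ERRNO	4095
	#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

	/* ERR_PTR(1) is just (void *)1, so IS_ERR(ERR_PTR(1)) is false
	 * and the caller would dereference a bogus "vcpu" pointer.
	 * Returning ERR_PTR(-EINVAL), or propagating a real -errno, is
	 * what the IS_ERR() convention expects. */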
Re: [PATCH] x86: Update VT-d Posted-Interrupts related information
On Mon, 2014-11-10 at 14:31 +0800, Feng Wu wrote: > VT-d Posted-Interrupts(PI) is an enhancement to CPU side Posted-Interrupt. > With VT-d Posted-Interrupts enabled, external interrupts from > direct-assigned devices can be delivered to guests without VMM > involvement when guest is running in non-root mode. > > If VT-d PI is supported by KVM, we need to update the IRTE with > the new guest interrtup configuration. > > Signed-off-by: Feng Wu > --- > hw/i386/kvm/pci-assign.c | 24 > linux-headers/linux/kvm.h |2 ++ > target-i386/kvm.c |5 + > target-i386/kvm_i386.h|1 + > 4 files changed, 32 insertions(+), 0 deletions(-) > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > index bb206da..e55a99b 100644 > --- a/hw/i386/kvm/pci-assign.c > +++ b/hw/i386/kvm/pci-assign.c > @@ -1005,6 +1005,12 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev) > assigned_dev->intx_route.mode = PCI_INTX_DISABLED; > assigned_dev->intx_route.irq = -1; > assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI; > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < 0) { > +perror("assigned_dev_update_msi: kvm_device_pi_update"); > +} > +} I don't really understand this ioctl, we need to call into KVM to set the new IRQ routing anyway, why can't KVM figure out that it needs to update the posted interrupt config at the same time? Also, we'll need to support this through vfio-pci. Thanks, Alex > } else { > Error *local_err = NULL; > > @@ -1029,6 +1035,12 @@ static void assigned_dev_update_msi_msg(PCIDevice > *pci_dev) > > kvm_irqchip_update_msi_route(kvm_state, assigned_dev->msi_virq[0], > msi_get_message(pci_dev, 0)); > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < 0) { > +perror("assigned_dev_update_msi_msg: kvm_device_pi_update"); > +} > +} > } > > static bool assigned_dev_msix_masked(MSIXTableEntry *entry) > @@ -1149,6 +1161,12 @@ static void assigned_dev_update_msix(PCIDevice > *pci_dev) > perror("assigned_dev_enable_msix: assign irq"); > return; > } > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < > 0) { > +perror("assigned_dev_update_msix: kvm_device_pi_update"); > +} > +} > } > assigned_dev->intx_route.mode = PCI_INTX_DISABLED; > assigned_dev->intx_route.irq = -1; > @@ -1618,6 +1636,12 @@ static void assigned_dev_msix_mmio_write(void *opaque, > hwaddr addr, > if (ret) { > error_report("Error updating irq routing entry (%d)", > ret); > } > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, adev->dev_id) < 0) { > +perror("assigned_dev_update_msi_msg: " > +"kvm_device_pi_update"); > +} > +} > } > } > } > diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h > index 2669938..b34f3c4 100644 > --- a/linux-headers/linux/kvm.h > +++ b/linux-headers/linux/kvm.h > @@ -765,6 +765,7 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_PPC_FIXUP_HCALL 103 > #define KVM_CAP_PPC_ENABLE_HCALL 104 > #define KVM_CAP_CHECK_EXTENSION_VM 105 > +#define KVM_CAP_PI 106 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1020,6 +1021,7 @@ struct kvm_s390_ucas_mapping { > #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct > kvm_xen_hvm_config) > #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) > #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) > +#define KVM_ASSIGN_DEV_PI_UPDATE _IOR(KVMIO, 0x7d, __u32) > /* Available with KVM_CAP_PIT_STATE2 */ > #define 
KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) > #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > index ccf36e8..3dd8e5e 100644 > --- a/target-i386/kvm.c > +++ b/target-i386/kvm.c > @@ -2660,6 +2660,11 @@ int kvm_device_msi_assign(KVMState *s, uint32_t > dev_id, int virq) >KVM_DEV_IRQ_GUEST_MSI, virq); > } > > +int kvm_device_pi_update(KVMState *s, uint32_t dev_id) > +{ > +return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_PI_UPDATE, &dev_id); > +} > + > int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id) > { > return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI | > diff --git a/target-i386/kv
Re: [PATCH 13/13] iommu/vt-d: Add a command line parameter for VT-d posted-interrupts
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote:
> Enable VT-d Posted-Interrupts and add a command line
> parameter for it.
>
> Signed-off-by: Feng Wu
> ---
>  drivers/iommu/irq_remapping.c | 9 ++++++++-
>  1 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
> index 0e36860..3cb9429 100644
> --- a/drivers/iommu/irq_remapping.c
> +++ b/drivers/iommu/irq_remapping.c
> @@ -23,7 +23,7 @@ int irq_remap_broken;
>  int disable_sourceid_checking;
>  int no_x2apic_optout;
>
> -int disable_irq_post = 1;
> +int disable_irq_post = 0;
>  int irq_post_enabled = 0;
>  EXPORT_SYMBOL_GPL(irq_post_enabled);
>
> @@ -206,6 +206,13 @@ static __init int setup_irqremap(char *str)
>  }
>  early_param("intremap", setup_irqremap);
>
> +static __init int setup_nointpost(char *str)
> +{
> +	disable_irq_post = 1;
> +	return 0;
> +}
> +early_param("nointpost", setup_nointpost);
> +

Documentation/kernel-parameters.txt?

Thanks,
Alex

>  void __init setup_irq_remapping_ops(void)
>  {
>  	remap_ops = &intel_irq_remap_ops;
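What Alex is asking for would be an entry along these lines; a sketch
only, not wording that was merged anywhere:

	nointpost	[X86,Intel-IOMMU]
			Disable VT-d posted-interrupt handling of
			interrupts from direct-assigned devices, even
			when the hardware supports it.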
Hotplugging usb devices on guest boot
Hello list,

The short version is: how do I best use an external daemon to hotplug
all the usb devices for each guest when the guest boots? The solution
needs to work for guests started through virt-manager (as folks with
minimal knowledge of the system may need to start VMs).

I thought there might be a way to get signals about which guests are
being started from libvirtd, but I haven't been able to find any
information about that. Another option might be using the hooks at
/etc/libvirt/hooks/qemu (http://www.libvirt.org/hooks.html#names).
Since libvirt functions should not be called from within this script,
I should have it open a socket and send a signal to my custom daemon,
which would wait a short amount of time before invoking "virsh
attach-device"?

Background: I recently got a multiseat system working using KVM (one
computer serving guest OS's to monitors through VGA passthrough, one
GPU per guest). It works great! One issue is that the input devices
need to be hard-coded into the XML files for each seat. Sometimes, on
system reboot, the device numbers assigned to the various mice and
keyboards get changed, and I have to check lsusb and fix the XML.
Also, hotplugging is obviously not automatic.

To make hotplugging work the way I want, I have written a python
daemon to notice new usb devices through udev. It recognizes the port,
and can thus send new devices to guests based on usb ports (each guest
is associated with a hub plugged into a port). It would be best if
this daemon could also let each guest know about the usb devices that
are already plugged in to its hub on guest boot. That way, there would
be no usb devices hard-coded into the XML. What is the best way to do
this?

I am using Ubuntu 14.04 as the host OS.

Thanks,
Elliot
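For reference, the attach step from such a daemon generally amounts to
feeding virsh a hostdev snippet. A sketch with made-up guest name and
bus/device numbers (note that the qemu hook is invoked with the guest
name and an operation such as "started" as its leading arguments, which
is enough for the hook to poke a daemon over a socket):

	<!-- seat1-kbd.xml: bus/device as reported by lsusb; these are
	     the unstable numbers, which is why the daemon should look
	     them up by port at attach time -->
	<hostdev mode='subsystem' type='usb' managed='yes'>
	  <source>
	    <address bus='3' device='7'/>
	  </source>
	</hostdev>

	# virsh attach-device seat1 seat1-kbd.xml --live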
[PATCH] KVM: PPC: BookE: Improve irq inject tracepoint
When injecting an IRQ, we only document which IRQ priority (which
translates to IRQ type) gets injected. However, when reading traces you
don't necessarily have all the numbers in your head to know which IRQ
really is meant.

This patch converts the IRQ number field to a symbolic name that is in
sync with the respective define. That way it's a lot easier for readers
to figure out what interrupt gets injected.

Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/trace_booke.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
index f7537cf..5734c0a 100644
--- a/arch/powerpc/kvm/trace_booke.h
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -151,6 +151,47 @@ TRACE_EVENT(kvm_booke206_ref_release,
 		__entry->pfn, __entry->flags)
 );

+#ifdef CONFIG_SPE_POSSIBLE
+#define kvm_trace_symbol_irqprio_spe \
+	{BOOKE_IRQPRIO_SPE_UNAVAIL, "SPE_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SPE_FP_DATA, "SPE_FP_DATA"}, \
+	{BOOKE_IRQPRIO_SPE_FP_ROUND, "SPE_FP_ROUND"},
+#else
+#define kvm_trace_symbol_irqprio_spe
+#endif
+
+#ifdef CONFIG_PPC_E500MC
+#define kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_ALTIVEC_UNAVAIL, "ALTIVEC_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_ALTIVEC_ASSIST, "ALTIVEC_ASSIST"},
+#else
+#define kvm_trace_symbol_irqprio_e500mc
+#endif
+
+#define kvm_trace_symbol_irqprio \
+	kvm_trace_symbol_irqprio_spe \
+	kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_DATA_STORAGE, "DATA_STORAGE"}, \
+	{BOOKE_IRQPRIO_INST_STORAGE, "INST_STORAGE"}, \
+	{BOOKE_IRQPRIO_ALIGNMENT, "ALIGNMENT"}, \
+	{BOOKE_IRQPRIO_PROGRAM, "PROGRAM"}, \
+	{BOOKE_IRQPRIO_FP_UNAVAIL, "FP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SYSCALL, "SYSCALL"}, \
+	{BOOKE_IRQPRIO_AP_UNAVAIL, "AP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_DTLB_MISS, "DTLB_MISS"}, \
+	{BOOKE_IRQPRIO_ITLB_MISS, "ITLB_MISS"}, \
+	{BOOKE_IRQPRIO_MACHINE_CHECK, "MACHINE_CHECK"}, \
+	{BOOKE_IRQPRIO_DEBUG, "DEBUG"}, \
+	{BOOKE_IRQPRIO_CRITICAL, "CRITICAL"}, \
+	{BOOKE_IRQPRIO_WATCHDOG, "WATCHDOG"}, \
+	{BOOKE_IRQPRIO_EXTERNAL, "EXTERNAL"}, \
+	{BOOKE_IRQPRIO_FIT, "FIT"}, \
+	{BOOKE_IRQPRIO_DECREMENTER, "DECREMENTER"}, \
+	{BOOKE_IRQPRIO_PERFORMANCE_MONITOR, "PERFORMANCE_MONITOR"}, \
+	{BOOKE_IRQPRIO_EXTERNAL_LEVEL, "EXTERNAL_LEVEL"}, \
+	{BOOKE_IRQPRIO_DBELL, "DBELL"}, \
+	{BOOKE_IRQPRIO_DBELL_CRIT, "DBELL_CRIT"} \
+
 TRACE_EVENT(kvm_booke_queue_irqprio,
 	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
 	TP_ARGS(vcpu, priority),
@@ -167,8 +208,11 @@ TRACE_EVENT(kvm_booke_queue_irqprio,
 		__entry->pending	= vcpu->arch.pending_exceptions;
 	),

-	TP_printk("vcpu=%x prio=%x pending=%lx",
-		  __entry->cpu_nr, __entry->priority, __entry->pending)
+	TP_printk("vcpu=%x prio=%s pending=%lx",
+		  __entry->cpu_nr,
+		  __print_symbolic(__entry->priority, kvm_trace_symbol_irqprio),
+		  __entry->pending)
+);
 );

 #endif
--
1.8.1.4
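For readers unfamiliar with the helper used above: __print_symbolic()
renders a numeric trace field through a {value, "name"} table in the
formatted trace output. A generic illustration with made-up values,
unrelated to the patch:

	/* a raw __entry->state of 2 is printed as "BLOCKED" */
	TP_printk("state=%s",
		  __print_symbolic(__entry->state,
				   { 1, "RUNNING" },
				   { 2, "BLOCKED" }))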
Re: compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
On Mon, Nov 10, 2014 at 01:07:33PM -0800, Linus Torvalds wrote: > On Mon, Nov 10, 2014 at 12:18 PM, Christian Borntraeger > wrote: > > > > Now: I can reproduces belows miscompile on gcc46 and gcc 47 > > gcc 45 seems ok, gcc 48 is fixed. This makes blacklisting > > a bit hard, especially since it is not limited to s390, but > > covers all architectures. > > In essence ACCESS_ONCE will not work reliably on aggregate > > types with gcc 4.6 and gcc 4.7. > > In Linux we seem to use ACCESS_ONCE mostly on scalar types, > > below code is an example were we dont - and break. > > Hmm. I think we should see how painful it would be to make it a rule > that ACCESS_ONCE() only works on scalar types. For whatever it is worth, I have been assuming that ACCESS_ONCE() was only ever supposed to work on scalar types. And one of the reasons that I have been giving the pre-EV56 Alpha guys a hard time is because I would like us to be able to continue using ACCESS_ONCE() on 8-bit and 16-bit quantities as well. > Even in the actual code you show as an example, the "fix" is really to > use the "unsigned long val" member of the union for the ACCESS_ONCE(). > And that seems to be true in many other cases too. > > So before blacklisting any compilers, let's first see if > > (a) we can actually make it a real rule that we only use ACCESS_ONCE on > scalars > (b) we can somehow enforce this with a compiler warning/error for mis-uses > > For example, the attached patch works for some cases, but shows how we > use ACCESS_ONCE() on pointers to pte_t's etc, so it doesn't come even > close to compiling the whole kernel. But I wonder how painful that > would be to change.. The places where it complains are actually > somewhat debatable to begin with, like: > > - handle_pte_fault(.. pte_t *pte ..): > > entry = ACCESS_ONCE(*pte); > > and the thing is, "pte" is actually possibly an 8-byte entity on > x86-32, and that ACCESS_ONCE() fundamentally will be two 32-byte > reads. > > So there is a very valid argument for saying "well, you shouldn't do > that, then", and that we might be better off cleaning up our > ACCESS_ONCE() uses, than to just blindly blacklist compilers. > > NOTE! I'm not at all advocating the attached patch. I'm sending it out > white-space damaged on purpose, it's more of a "hey, something like > this might be the direction we want to go in", with the spinlock.h > part of the patch also acting as an example of the kind of changes the > "ACCESS_ONCE() only works on scalars" rule would require. > > So I do agree with Heiko that we generally don't want to work around > compiler bugs if we can avoid it. But sometimes the compiler bugs do > end up saying "your'e doing something very fragile". Maybe we should > try to be less fragile here. > > And in your example, the whole > > old = ACCESS_ONCE(*ic); > > *could* just be a > > old->val = ACCESS_ONCE(ic->val); > > the same way the x86 spinlock.h changes below. > > I did *not* try to see how many other cases we have. It's possible > that your "don't use ACCESS_ONCE, use a barrier() instead" ends up > being a valid workaround. For the pte case, that may well be the > solution, for example (because what we really care about is not so > much "it's an atomic access" but "it's still the same that we > originally assumed"). 
Sometimes we want ACCESS_ONCE() because we > really want an atomic value (and we just don't care if it's old or > new), but sometimes it's really because we don't want the compiler to > re-load it and possibly see two different values - one that we check, > and one that we actually use (and then a barrier() would generally be > perfectly sufficient) > > Adding some more people to the discussion just to see if anybody else > has comments about ACCESS_ONCE() on aggregate types. > > (Btw, it's not just aggregate types, even non-aggregate types like > "long long" are not necessarily safe, to give the same 64-bit on > x86-32 example. So adding an assert that it's smaller or equal in size > to a "long" might also not be unreasonable) Good point on "long long" on 32-bit systems. Another reason to avoid trying to do anything that even smells atomic on non-machine-sized/aligned variables is that the compiler guys' current reaction to this sort of situation is "No problem!!! The compiler can just invent a lock to guard all such accesses!" I don't think that we want to go there. >Linus > > --- > diff --git a/include/linux/compiler.h b/include/linux/compiler.h > index d5ad7b1118fc..63e82f1dfc1a 100644 > --- a/include/linux/compiler.h > +++ b/include/linux/compiler.h > @@ -378,7 +378,11 @@ void ftrace_likely_update(struct > ftrace_branch_data *f, int val, int expect); > * use is to mediate communication between process-level code and irq/NMI > * handlers, all running on the same CPU. > */ > -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) > +#define get_scalar_volatile_pointer(x) ({ \ > + type
Re: [PATCH v4 10/25] virtio: add API to enable VQs early
On 10/13/2014 12:50 AM, Michael S. Tsirkin wrote: virtio spec 0.9.X requires DRIVER_OK to be set before VQs are used, but some drivers use VQs before probe function returns. Since DRIVER_OK is set after probe, this violates the spec. Even though under virtio 1.0 transitional devices support this behaviour, we want to make it possible for those early callers to become spec compliant and eventually support non-transitional devices. Add API for drivers to call before using VQs. Sets DRIVER_OK internally. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/linux/virtio_config.h | 17 + 1 file changed, 17 insertions(+) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index e8f8f71..e36403b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,6 +109,23 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, return vq; } +/** + * virtio_device_ready - enable vq use in probe function + * @vdev: the device + * + * Driver must call this to use vqs in the probe function. + * + * Note: vqs are enabled automatically after probe returns. + */ +static inline +void virtio_device_ready(struct virtio_device *dev) +{ + unsigned status = dev->config->get_status(dev); + + BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); + dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); +} Getting a BUG when booting via KVM, host Fedora 20, guest Fedora 20. my config is at: https://fedorapeople.org/~grover/config-20141110 [0.828494] [ cut here ] [0.829039] kernel BUG at /home/agrover/git/kernel/include/linux/virtio_config.h:125! [0.831266] invalid opcode: [#1] SMP DEBUG_PAGEALLOC [0.831266] Modules linked in: [0.831266] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 3.18.0-rc4 #120 [0.831266] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [0.831266] Workqueue: events control_work_handler [0.831266] task: 88003cd98000 ti: 88003cd94000 task.ti: 88003cd94000 [0.831266] RIP: 0010:[] [] add_port+0x264/0x410 [0.831266] RSP: :88003cd97c78 EFLAGS: 00010202 [0.831266] RAX: 0007 RBX: 88003c58c400 RCX: 0001 [0.831266] RDX: c132 RSI: 81a955e9 RDI: 0001c132 [0.831266] RBP: 88003cd97cc8 R08: R09: [0.831266] R10: 0001 R11: R12: 88003c58be00 [0.831266] R13: 0001 R14: 8800395ca800 R15: 88003c58c420 [0.831266] FS: () GS:88003fa0() knlGS: [0.831266] CS: 0010 DS: ES: CR0: 8005003b [0.831266] CR2: CR3: 01c11000 CR4: 06e0 [0.831266] DR0: DR1: DR2: [0.831266] DR3: DR6: fffe0ff0 DR7: 0400 [0.831266] Stack: [0.831266] 8801 0292 0001 [0.831266] 88003cd97cc8 88003dfa8a20 88003c58beb8 88003c58be10 [0.831266] 8800395a2000 88003cd97d38 8144531a [0.831266] Call Trace: [0.831266] [] control_work_handler+0x16a/0x3c0 [0.831266] [] ? process_one_work+0x208/0x500 [0.831266] [] process_one_work+0x2ac/0x500 [0.831266] [] ? process_one_work+0x208/0x500 [0.831266] [] worker_thread+0x2ce/0x4e0 [0.831266] [] ? process_one_work+0x500/0x500 [0.831266] [] kthread+0xf8/0x100 [0.831266] [] ? trace_hardirqs_on+0xd/0x10 [0.831266] [] ? kthread_stop+0x140/0x140 [0.831266] [] ret_from_fork+0x7c/0xb0 [0.831266] [] ? 
kthread_stop+0x140/0x140
[0.831266] Code: c7 c2 48 31 01 83 48 c7 c6 e9 55 a9 81 e8 55 b4 c6 ff 4d 8b b4 24 58 01 00 00 49 8b 86 e8 04 00 00 4c 89 f7 ff 50 10 a8 04 74 0c <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 49 8b 96 e8 04 00 00 83 c8
[0.831266] RIP  [] add_port+0x264/0x410
[0.831266]  RSP
[0.878202] ---[ end trace f98fbb172cc7bbf4 ]---

Thanks -- Andy
Re: [PATCH] kvm: x86: add trace event for pvclock updates
On Wed, Nov 05, 2014 at 11:46:42AM -0800, David Matlack wrote:
> The new trace event records:
>   * the id of vcpu being updated
>   * the pvclock_vcpu_time_info struct being written to guest memory
>
> This is useful for debugging pvclock bugs, such as the bug fixed by
> "[PATCH] kvm: x86: Fix kvm clock versioning.".
>
> Signed-off-by: David Matlack

So you actually hit that bug in practice? Can you describe the
scenario?
Re: [PATCH] virtio_console: move early VQ enablement
On 21 October 2014 03:09, Josh Boyer wrote:
> On Mon, Oct 20, 2014 at 10:05 AM, Michael S. Tsirkin wrote:
>> On Mon, Oct 20, 2014 at 03:58:49PM +0200, Cornelia Huck wrote:
>>> Commit f5866db6 (virtio_console: enable VQs early) tried to make
>>> sure that DRIVER_OK was set when virtio_console started using its
>>> virtqueues. Doing this in add_port(), however, means that we try
>>> to set DRIVER_OK again when a port is dynamically added after
>>> the probe function is done.
>>>
>>> Let's move virtio_device_ready() to the probe function just before
>>> trying to use the virtqueues instead. This is fine as nothing can
>>> fail in between.
>>>
>>> Reported-by: Thomas Graf
>>> Reviewed-by: Michael S. Tsirkin
>>> Signed-off-by: Cornelia Huck
>>
>> Thanks!
>>
>> Acked-by: Michael S. Tsirkin
>> Tested-by: Michael S. Tsirkin
>
> This fixed my KVM guest boot issue with 3.18-rc1. Thanks for such a
> quick fix.
>
> Tested-by: Josh Boyer

ping

So who's merging this? Rusty? It still happens in -rc4.

Dave.
Re: several messages
On 2014/11/11 2:15, Thomas Gleixner wrote:
> On Mon, 10 Nov 2014, Feng Wu wrote:
>
>> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
>> With VT-d Posted-Interrupts enabled, external interrupts from
>> direct-assigned devices can be delivered to guests without VMM
>> intervention when guest is running in non-root mode.
>
> Can you please talk to Jiang and synchronize your work with his
> refactoring of the x86 interrupt handling subsystem.
>
> I want this stuff cleaned up first before we add new stuff to it.

Hi Thomas,

Just talked with Feng; we will focus on the refactoring first and then
add posted interrupt support.

Regards!
Gerry

> Thanks,
>
> 	tglx
Re: [PATCH v4] KVM: x86: fix access memslots w/o hold srcu read lock
Hi Wanpeng,

I think I have totally missed this thread.

I enabled lockdep and RCU debugging, and tried on 3.18-rc1, but I
didn't get the warning. My steps are:

1. Use numactl to bind a qemu process to node1.
2. Offline all node1 memory. And the qemu process is still running.

Would you please tell me how you reproduced it?

Thanks.

On 11/02/2014 03:07 PM, Wanpeng Li wrote:

The srcu read lock must be held while accessing memslots (e.g. when
using gfn_to_* functions), however, commit c24ae0dcd3e8 ("kvm: x86:
Unpin and remove kvm_arch->apic_access_page") calls gfn_to_page() in
kvm_vcpu_reload_apic_access_page() w/o holding it in the
vmx_vcpu_reset() path, which leads to a suspicious
rcu_dereference_check() usage warning. This patch fixes it by holding
the srcu read lock in all kvm_vcpu_reset() call paths.

[ INFO: suspicious RCU usage. ]
3.18.0-rc2-test2+ #70 Not tainted
-------------------------------
include/linux/kvm_host.h:474 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
1 lock held by qemu-system-x86/2371:
 #0: (&vcpu->mutex){+.+...}, at: [] vcpu_load+0x20/0xd0 [kvm]

stack backtrace:
CPU: 4 PID: 2371 Comm: qemu-system-x86 Not tainted 3.18.0-rc2-test2+ #70
Hardware name: Dell Inc. OptiPlex 9010/0M9KCM, BIOS A12 01/10/2013
 0001 880209983ca8 816f514f 8802099b8990
 880209983cd8 810bd687 000fee00 880208a2c000
 880208a1 88020ef50040 880209983d08
Call Trace:
 [] dump_stack+0x4e/0x71
 [] lockdep_rcu_suspicious+0xe7/0x120
 [] gfn_to_memslot+0xd5/0xe0 [kvm]
 [] __gfn_to_pfn+0x33/0x60 [kvm]
 [] gfn_to_page+0x25/0x90 [kvm]
 [] kvm_vcpu_reload_apic_access_page+0x3c/0x80 [kvm]
 [] vmx_vcpu_reset+0x20c/0x460 [kvm_intel]
 [] kvm_vcpu_reset+0x15e/0x1b0 [kvm]
 [] kvm_arch_vcpu_setup+0x2c/0x50 [kvm]
 [] kvm_vm_ioctl+0x1d0/0x780 [kvm]
 [] ? __lock_is_held+0x54/0x80
 [] do_vfs_ioctl+0x300/0x520
 [] ? __fget+0x5/0x250
 [] ? __fget_light+0x2a/0xe0
 [] SyS_ioctl+0x81/0xa0
 [] system_call_fastpath+0x16/0x1b

Reported-by: Takashi Iwai
Reported-by: Alexei Starovoitov
Suggested-by: Paolo Bonzini
Signed-off-by: Wanpeng Li
---
v3 -> v4:
 * bypass the problem altogether by kvm_make_request
v2 -> v3:
 * take care of all vmx_vcpu_reset call paths
v1 -> v2:
 * just fix holding the srcu read lock in the vmx_vcpu_reset path

 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a0f78db..3e556c6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4579,7 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write32(TPR_THRESHOLD, 0);
 	}

-	kvm_vcpu_reload_apic_access_page(vcpu);
+	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

 	if (vmx_vm_has_apicv(vcpu->kvm))
 		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
Re: [PATCH v4 10/25] virtio: add API to enable VQs early
On Mon, Nov 10, 2014 at 04:45:09PM -0800, Andy Grover wrote: > On 10/13/2014 12:50 AM, Michael S. Tsirkin wrote: > >virtio spec 0.9.X requires DRIVER_OK to be set before > >VQs are used, but some drivers use VQs before probe > >function returns. > >Since DRIVER_OK is set after probe, this violates the spec. > > > >Even though under virtio 1.0 transitional devices support this > >behaviour, we want to make it possible for those early callers to become > >spec compliant and eventually support non-transitional devices. > > > >Add API for drivers to call before using VQs. > > > >Sets DRIVER_OK internally. > > > >Signed-off-by: Michael S. Tsirkin > >Reviewed-by: Cornelia Huck > >--- > > include/linux/virtio_config.h | 17 + > > 1 file changed, 17 insertions(+) > > > >diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h > >index e8f8f71..e36403b 100644 > >--- a/include/linux/virtio_config.h > >+++ b/include/linux/virtio_config.h > >@@ -109,6 +109,23 @@ struct virtqueue *virtio_find_single_vq(struct > >virtio_device *vdev, > > return vq; > > } > > > >+/** > >+ * virtio_device_ready - enable vq use in probe function > >+ * @vdev: the device > >+ * > >+ * Driver must call this to use vqs in the probe function. > >+ * > >+ * Note: vqs are enabled automatically after probe returns. > >+ */ > >+static inline > >+void virtio_device_ready(struct virtio_device *dev) > >+{ > >+unsigned status = dev->config->get_status(dev); > >+ > >+BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); > >+dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); > >+} > > Getting a BUG when booting via KVM, host Fedora 20, guest Fedora 20. > > my config is at: > > https://fedorapeople.org/~grover/config-20141110 > The fix is here: http://article.gmane.org/gmane.linux.kernel.virtualization/23324/raw I'm surprised it's not merged yet. Rusty, could you pick it up please? > > [0.828494] [ cut here ] > [0.829039] kernel BUG at > /home/agrover/git/kernel/include/linux/virtio_config.h:125! > [0.831266] invalid opcode: [#1] SMP DEBUG_PAGEALLOC > [0.831266] Modules linked in: > [0.831266] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 3.18.0-rc4 #120 > [0.831266] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [0.831266] Workqueue: events control_work_handler > [0.831266] task: 88003cd98000 ti: 88003cd94000 task.ti: > 88003cd94000 > [0.831266] RIP: 0010:[] [] > add_port+0x264/0x410 > [0.831266] RSP: :88003cd97c78 EFLAGS: 00010202 > [0.831266] RAX: 0007 RBX: 88003c58c400 RCX: > 0001 > [0.831266] RDX: c132 RSI: 81a955e9 RDI: > 0001c132 > [0.831266] RBP: 88003cd97cc8 R08: R09: > > [0.831266] R10: 0001 R11: R12: > 88003c58be00 > [0.831266] R13: 0001 R14: 8800395ca800 R15: > 88003c58c420 > [0.831266] FS: () GS:88003fa0() > knlGS: > [0.831266] CS: 0010 DS: ES: CR0: 8005003b > [0.831266] CR2: CR3: 01c11000 CR4: > 06e0 > [0.831266] DR0: DR1: DR2: > > [0.831266] DR3: DR6: fffe0ff0 DR7: > 0400 > [0.831266] Stack: > [0.831266] 8801 0292 > 0001 > [0.831266] 88003cd97cc8 88003dfa8a20 88003c58beb8 > 88003c58be10 > [0.831266] 8800395a2000 88003cd97d38 > 8144531a > [0.831266] Call Trace: > [0.831266] [] control_work_handler+0x16a/0x3c0 > [0.831266] [] ? process_one_work+0x208/0x500 > [0.831266] [] process_one_work+0x2ac/0x500 > [0.831266] [] ? process_one_work+0x208/0x500 > [0.831266] [] worker_thread+0x2ce/0x4e0 > [0.831266] [] ? process_one_work+0x500/0x500 > [0.831266] [] kthread+0xf8/0x100 > [0.831266] [] ? trace_hardirqs_on+0xd/0x10 > [0.831266] [] ? 
kthread_stop+0x140/0x140
> [0.831266]  [] ret_from_fork+0x7c/0xb0
> [0.831266]  [] ? kthread_stop+0x140/0x140
> [0.831266] Code: c7 c2 48 31 01 83 48 c7 c6 e9 55 a9 81 e8 55 b4 c6 ff 4d 8b b4 24 58 01 00 00 49 8b 86 e8 04 00 00 4c 89 f7 ff 50 10 a8 04 74 0c <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 49 8b 96 e8 04 00 00 83 c8
> [0.831266] RIP  [] add_port+0x264/0x410
> [0.831266]  RSP
> [0.878202] ---[ end trace f98fbb172cc7bbf4 ]---
>
> Thanks -- Andy
RE: several messages
> -----Original Message-----
> From: Jiang Liu [mailto:jiang@linux.intel.com]
> Sent: Tuesday, November 11, 2014 10:29 AM
> To: Thomas Gleixner; Wu, Feng
> Cc: g...@kernel.org; pbonz...@redhat.com; David Woodhouse;
> j...@8bytes.org; mi...@redhat.com; H. Peter Anvin; x...@kernel.org;
> kvm@vger.kernel.org; io...@lists.linux-foundation.org; LKML
> Subject: Re: several messages
>
> On 2014/11/11 2:15, Thomas Gleixner wrote:
> > On Mon, 10 Nov 2014, Feng Wu wrote:
> >
> >> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> >> With VT-d Posted-Interrupts enabled, external interrupts from
> >> direct-assigned devices can be delivered to guests without VMM
> >> intervention when guest is running in non-root mode.
> >
> > Can you please talk to Jiang and synchronize your work with his
> > refactoring of the x86 interrupt handling subsystem.
> >
> > I want this stuff cleaned up first before we add new stuff to it.
>
> Hi Thomas,
> 	Just talked with Feng; we will focus on the refactoring first and
> then add posted interrupt support.
> Regards!
> Gerry

No problem!

Thanks,
Feng

> >
> > Thanks,
> >
> > tglx