[PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.

The problem being addressed by the patch above was that some ARM code
based the memory mapping attributes of a pfn on the return value of
kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
be mapped as device memory.

However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
and the existing non-ARM users were already using it in a way which
suggests that its name should probably have been 'kvm_is_reserved_pfn'
from the beginning, e.g., whether or not to call get_page/put_page on
it etc. This means that returning false for the zero page is a mistake
and the patch above should be reverted.

Signed-off-by: Ard Biesheuvel
---
 arch/ia64/kvm/kvm-ia64.c |  2 +-
 arch/x86/kvm/mmu.c       |  6 +++---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 16 ++++++++--------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ec6b9acb6bea..dbe46f43884d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,

 	for (i = 0; i < npages; i++) {
 		pfn = gfn_to_pfn(kvm, base_gfn + i);
-		if (!kvm_is_mmio_pfn(pfn)) {
+		if (!kvm_is_reserved_pfn(pfn)) {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
 					pfn << PAGE_SHIFT,
 				_PAGE_AR_RWX | _PAGE_MA_WB);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac1c4de3a484..978f402006ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 * kvm mmu, before reclaiming the page, we should
 	 * unmap it from mmu first.
 	 */
-	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
@@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_reserved_pfn(pfn));

 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea53b04993f2..a6059bdf7b03 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

-bool kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_reserved_pfn(pfn_t pfn);

 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25ffac9e947d..3cee7b167052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);

 static bool largepages_enabled = true;

-bool kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_reserved_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return PageReserved(pfn_to_page(pfn));

 	return true;
 }
@@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	else if ((vma->vm_flags & VM_PFNMAP)) {
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
+		BUG_ON(!kvm_is_reserved_pfn(pfn));
 	} else {
 		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
@@ -1427,7 +1427,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;

-	if (kvm_is_mmio_pfn(pfn)) {
+	if (kvm_is_reserved_pfn(pfn)) {
 		WARN_ON(1);
 		return KVM_ERR_PTR_BAD_PAGE;
 	}
@@ -1456,7 +1456,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);

 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
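The refcounting point in the commit message is easiest to see in a caller.
A minimal sketch, assuming the virt/kvm/kvm_main.c of this era (an
illustration, not part of the patch): a guest read fault on a never-written
anonymous page can resolve to the shared zero page, which is pfn_valid()
and PageReserved(). With 85c8555ff0 applied, kvm_is_mmio_pfn() returned
false for it, so code like the following would take a reference on a
reserved, globally shared kernel page:

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))	/* false for the zero page */
		get_page(pfn_to_page(pfn));
}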
[PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
Instead of using kvm_is_mmio_pfn() to decide whether a host region
should be stage 2 mapped with device attributes, add a new static
function kvm_is_device_pfn() that disregards RAM pages with the
reserved bit set, as those should usually not be mapped as device
memory.

Signed-off-by: Ard Biesheuvel
---
 arch/arm/kvm/mmu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 57a403a5c22b..b007438242e2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_dabt_iswrite(vcpu);
 }

+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;

-	if (kvm_is_mmio_pfn(pfn))
+	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;

 	spin_lock(&kvm->mmu_lock);
--
1.8.3.2
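Side by side, the two predicates that exist once both patches of this
series are applied (bodies copied from the diffs above) make the split
in semantics explicit:

/* virt/kvm/kvm_main.c (patch 2/2): "this pfn is not an ordinary,
 * refcounted struct page" -- true for MMIO and for reserved RAM,
 * including the zero page and memreserve'd regions. */
bool kvm_is_reserved_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

/* arch/arm/kvm/mmu.c (patch 1/2): "map this pfn with device
 * attributes at stage 2" -- true only when there is no struct page
 * backing the pfn at all. */
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}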
KVM call for agenda for 2014-11-11
Hi

Please, send any topic that you are interested in covering.

Thanks, Juan.

Call details:

15:00 CEST
13:00 UTC
09:00 EDT

Every two weeks

By popular demand, a google calendar public entry with it

https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ

(Let me know if you have any problems with the calendar entry)

If you need phone number details, contact me privately

Thanks, Juan.
Re: Seeking a KVM benchmark
On 09/11/2014 17:36, Andy Lutomirski wrote:
>> The purpose of vmexit test is to show us various overheads, so why not
>> measure EFER switch overhead by having two tests one with equal EFER
>> another with different EFER, instead of hiding it.
>
> I'll try this.  We might need three tests, though: NX different, NX
> same but SCE different, and all flags the same.

The test actually explicitly enables NX in order to put itself in the
"common case":

    commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
    Author: Marcelo Tosatti
    Date:   Tue Jun 8 15:33:29 2010 -0300

        test: vmexit: enable NX

        Enable NX to disable MSR autoload/save. This is the common case anyway.

        Signed-off-by: Marcelo Tosatti
        Signed-off-by: Avi Kivity

(this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
just a bug.  The results on my Xeon Sandy Bridge are very interesting:

    NX different            ~11.5k  (load/save EFER path)
    NX same, SCE different  ~19.5k  (user return notifier path)
    all flags the same      ~10.2k

The inl_from_kernel results show absolutely no change, usually at most
5 cycles of difference.  This could be because I've added the SCE=1
variant directly to vmexit.c, so I'm running the tests one next to the
other.

I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
SYSCALL_MASK) the same between guest and host, so that the user return
notifier has nothing to do.  That saves about 400-500 cycles on
inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
test fares, but it really looks like your patch will be in 3.19.

Paolo
[PATCH kvm-unit-tests] vmexit: add tests with EFER.SCE=1
These activate a faster path that does not set EFER in the user return
notifier.  Time the difference between the two.

Signed-off-by: Paolo Bonzini
---
 x86/vmexit.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/x86/vmexit.c b/x86/vmexit.c
index 3bd0c81..a0380ab 100644
--- a/x86/vmexit.c
+++ b/x86/vmexit.c
@@ -3,6 +3,7 @@
 #include "processor.h"
 #include "atomic.h"
 #include "x86/vm.h"
+#include "x86/msr.h"
 #include "x86/desc.h"
 #include "x86/pci.h"

@@ -10,6 +11,7 @@ struct test {
 	void (*func)(void);
 	const char *name;
 	int (*valid)(void);
+	void (*after)(void);
 	int parallel;
 	bool (*next)(struct test *);
 };
@@ -119,6 +121,17 @@ static void inl_nop_kernel(void)
 	inb(0x4d0);
 }

+static int set_sce(void)
+{
+	wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SCE);
+	return true;
+}
+
+static void clear_sce(void)
+{
+	wrmsr(MSR_EFER, rdmsr(MSR_EFER) & ~EFER_SCE);
+}
+
 static void outl_elcr_kernel(void)
 {
 	outb(0x4d0, 0);
@@ -315,7 +328,11 @@ static struct test tests[] = {
 #endif
 	{ inl_pmtimer, "inl_from_pmtimer", .parallel = 1, },
 	{ inl_nop_qemu, "inl_from_qemu", .parallel = 1 },
+	{ inl_nop_qemu, "inl_from_qemu (EFER guest = EFER host)", .parallel = 1,
+	  .valid = set_sce, .after = clear_sce },
 	{ inl_nop_kernel, "inl_from_kernel", .parallel = 1 },
+	{ inl_nop_kernel, "inl_from_kernel (EFER guest = EFER host)", .parallel = 1,
+	  .valid = set_sce, .after = clear_sce },
 	{ outl_elcr_kernel, "outl_to_kernel", .parallel = 1 },
 	{ mov_dr, "mov_dr", .parallel = 1 },
 	{ ipi, "ipi", is_smp, .parallel = 0, },
@@ -381,6 +398,9 @@ static bool do_test(struct test *test)
 		t2 = rdtsc();
 	} while ((t2 - t1) < GOAL);
 	printf("%s %d\n", test->name, (int)((t2 - t1) / iterations));
+	if (test->after) {
+		test->after();
+	}
 	return test->next;
 }
--
1.8.3.1
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 11:03:35AM +0100, Paolo Bonzini wrote:
>
> On 09/11/2014 17:36, Andy Lutomirski wrote:
> >> The purpose of vmexit test is to show us various overheads, so why not
> >> measure EFER switch overhead by having two tests one with equal EFER
> >> another with different EFER, instead of hiding it.
> >
> > I'll try this.  We might need three tests, though: NX different, NX
> > same but SCE different, and all flags the same.
>
> The test actually explicitly enables NX in order to put itself in the
> "common case":
>
>     commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
>     Author: Marcelo Tosatti
>     Date:   Tue Jun 8 15:33:29 2010 -0300
>
>         test: vmexit: enable NX
>
>         Enable NX to disable MSR autoload/save. This is the common case anyway.
>
>         Signed-off-by: Marcelo Tosatti
>         Signed-off-by: Avi Kivity
>
> (this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
> just a bug.  The results on my Xeon Sandy Bridge are very interesting:
>
>     NX different            ~11.5k  (load/save EFER path)
>     NX same, SCE different  ~19.5k  (user return notifier path)
>     all flags the same      ~10.2k
>
> The inl_from_kernel results show absolutely no change, usually at most
> 5 cycles of difference.  This could be because I've added the SCE=1
> variant directly to vmexit.c, so I'm running the tests one next to the
> other.
>
> I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
> SYSCALL_MASK) the same between guest and host, so that the user return
> notifier has nothing to do.  That saves about 400-500 cycles on
> inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
> test fares, but it really looks like your patch will be in 3.19.
>

Please test on wide variety of HW before final decision. Also it would
be nice to ask Intel what is expected overhead. It is awesome if they
manage to add EFER switching with non-measurable overhead, but also
hard to believe :)

Also Andy had an idea to disable switching in case host and guest EFERs
are the same, but IIRC his patch does not include it yet.

--
			Gleb.
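Gleb's last point - skipping the switch entirely when the two EFER
values match - can be modelled with a small self-contained sketch
(illustrative only: the function and its dispatch are hypothetical,
not Andy's actual patch; the per-exit cycle counts are Paolo's
measurements above):

#include <stdbool.h>
#include <stdint.h>

#define EFER_NX  (1ULL << 11)   /* no-execute enable bit of IA32_EFER */

enum efer_strategy { EFER_NO_SWITCH, EFER_VMCS_AUTOLOAD, EFER_LAZY_URN };

/* Pick the cheapest EFER handling strategy for a guest/host pair. */
static enum efer_strategy pick_efer_strategy(uint64_t guest_efer,
                                             uint64_t host_efer)
{
	if (guest_efer == host_efer)
		return EFER_NO_SWITCH;      /* "all flags the same": ~10.2k */

	if ((guest_efer ^ host_efer) & EFER_NX)
		return EFER_VMCS_AUTOLOAD;  /* "NX different": ~11.5k */

	/* only SCE etc. differ: restore lazily when returning to
	 * userspace via the user return notifier */
	return EFER_LAZY_URN;               /* "SCE different": ~19.5k */
}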
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
Hi Ard,

On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>
> The problem being addressed by the patch above was that some ARM code
> based the memory mapping attributes of a pfn on the return value of
> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
> be mapped as device memory.
>
> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
> and the existing non-ARM users were already using it in a way which
> suggests that its name should probably have been 'kvm_is_reserved_pfn'
> from the beginning, e.g., whether or not to call get_page/put_page on
> it etc. This means that returning false for the zero page is a mistake
> and the patch above should be reverted.
>
> Signed-off-by: Ard Biesheuvel
> ---
>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>  arch/x86/kvm/mmu.c       |  6 +++---
>  include/linux/kvm_host.h |  2 +-
>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>  4 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
> index ec6b9acb6bea..dbe46f43884d 100644
> --- a/arch/ia64/kvm/kvm-ia64.c
> +++ b/arch/ia64/kvm/kvm-ia64.c
> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>
>  	for (i = 0; i < npages; i++) {
>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
> -		if (!kvm_is_mmio_pfn(pfn)) {
> +		if (!kvm_is_reserved_pfn(pfn)) {
>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>  					pfn << PAGE_SHIFT,
>  				_PAGE_AR_RWX | _PAGE_MA_WB);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index ac1c4de3a484..978f402006ee 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	 * kvm mmu, before reclaiming the page, we should
>  	 * unmap it from mmu first.
>  	 */
> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>
>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>  		kvm_set_pfn_accessed(pfn);
> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  		spte |= PT_PAGE_SIZE_MASK;
>  	if (tdp_enabled)
>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
> -			kvm_is_mmio_pfn(pfn));
> +			kvm_is_reserved_pfn(pfn));
>
>  	if (host_writable)
>  		spte |= SPTE_HOST_WRITEABLE;
> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>  	 * here.
>  	 */
> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>  	    level == PT_PAGE_TABLE_LEVEL &&
>  	    PageTransCompound(pfn_to_page(pfn)) &&
>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea53b04993f2..a6059bdf7b03 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>
> -bool kvm_is_mmio_pfn(pfn_t pfn);
> +bool kvm_is_reserved_pfn(pfn_t pfn);
>
>  struct kvm_irq_ack_notifier {
>  	struct hlist_node link;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 25ffac9e947d..3cee7b167052 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>
>  static bool largepages_enabled = true;
>
> -bool kvm_is_mmio_pfn(pfn_t pfn)
> +bool kvm_is_reserved_pfn(pfn_t pfn)
>  {
>  	if (pfn_valid(pfn))
> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
> +		return PageReserved(pfn_to_page(pfn));

so we return true for !pfn_valid(pfn), is this still semantically
correct with the rename?

>
>  	return true;
>  }
> @@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>  	else if ((vma->vm_flags & VM_PFNMAP)) {
>  		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
>  			vma->vm_pgoff;
> -		BUG_ON(!kvm_is_mmio_pfn(pfn));
> +		BUG_ON(!kvm_is_reserved_pfn(pfn));
>  	} else {
>  		if (async && vma_is_valid(vma, write_fault))
>  			*async = true;
> @@ -1427,7 +1427,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
>  	if (is_error_noslot_pfn(pfn))
>  		return KVM_ERR_PTR_BAD_PAGE;
>
> -	if (kvm_is_mmio_pfn(pfn)) {
> +	if (kvm_is_reserved_pfn(pfn)) {
>  		WARN_ON(1);
>  		return KVM_ERR_PTR_BAD_PAGE;
>  	}
Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
> Instead of using kvm_is_mmio_pfn() to decide whether a host region
> should be stage 2 mapped with device attributes, add a new static
> function kvm_is_device_pfn() that disregards RAM pages with the
> reserved bit set, as those should usually not be mapped as device
> memory.
>
> Signed-off-by: Ard Biesheuvel
> ---
>  arch/arm/kvm/mmu.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 57a403a5c22b..b007438242e2 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>  	return kvm_vcpu_dabt_iswrite(vcpu);
>  }
>
> +static bool kvm_is_device_pfn(unsigned long pfn)
> +{
> +	return !pfn_valid(pfn);
> +}

So this works for Magnus' use case, because a device tree memreserve
results in reserved, but valid, existing pages being backed by a struct
page?

> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	if (is_error_pfn(pfn))
>  		return -EFAULT;
>
> -	if (kvm_is_mmio_pfn(pfn))
> +	if (kvm_is_device_pfn(pfn))
>  		mem_type = PAGE_S2_DEVICE;
>
>  	spin_lock(&kvm->mmu_lock);
> --
> 1.8.3.2
>

If my understanding above is correct, then:

Acked-by: Christoffer Dall
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
On 10 November 2014 11:53, Christoffer Dall wrote:
> Hi Ard,
>
> On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
>> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
>> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>>
>> The problem being addressed by the patch above was that some ARM code
>> based the memory mapping attributes of a pfn on the return value of
>> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
>> be mapped as device memory.
>>
>> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
>> and the existing non-ARM users were already using it in a way which
>> suggests that its name should probably have been 'kvm_is_reserved_pfn'
>> from the beginning, e.g., whether or not to call get_page/put_page on
>> it etc. This means that returning false for the zero page is a mistake
>> and the patch above should be reverted.
>>
>> Signed-off-by: Ard Biesheuvel
>> ---
>>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>>  arch/x86/kvm/mmu.c       |  6 +++---
>>  include/linux/kvm_host.h |  2 +-
>>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>>  4 files changed, 13 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>> index ec6b9acb6bea..dbe46f43884d 100644
>> --- a/arch/ia64/kvm/kvm-ia64.c
>> +++ b/arch/ia64/kvm/kvm-ia64.c
>> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>
>>  	for (i = 0; i < npages; i++) {
>>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
>> -		if (!kvm_is_mmio_pfn(pfn)) {
>> +		if (!kvm_is_reserved_pfn(pfn)) {
>>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>>  					pfn << PAGE_SHIFT,
>>  				_PAGE_AR_RWX | _PAGE_MA_WB);
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index ac1c4de3a484..978f402006ee 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>>  	 * kvm mmu, before reclaiming the page, we should
>>  	 * unmap it from mmu first.
>>  	 */
>> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>
>>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>>  		kvm_set_pfn_accessed(pfn);
>> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>  		spte |= PT_PAGE_SIZE_MASK;
>>  	if (tdp_enabled)
>>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
>> -			kvm_is_mmio_pfn(pfn));
>> +			kvm_is_reserved_pfn(pfn));
>>
>>  	if (host_writable)
>>  		spte |= SPTE_HOST_WRITEABLE;
>> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>>  	 * here.
>>  	 */
>> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
>> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>>  	    level == PT_PAGE_TABLE_LEVEL &&
>>  	    PageTransCompound(pfn_to_page(pfn)) &&
>>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index ea53b04993f2..a6059bdf7b03 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>>
>> -bool kvm_is_mmio_pfn(pfn_t pfn);
>> +bool kvm_is_reserved_pfn(pfn_t pfn);
>>
>>  struct kvm_irq_ack_notifier {
>>  	struct hlist_node link;
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 25ffac9e947d..3cee7b167052 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>>
>>  static bool largepages_enabled = true;
>>
>> -bool kvm_is_mmio_pfn(pfn_t pfn)
>> +bool kvm_is_reserved_pfn(pfn_t pfn)
>>  {
>>  	if (pfn_valid(pfn))
>> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
>> +		return PageReserved(pfn_to_page(pfn));
>
> so we return true for !pfn_valid(pfn), is this still semantically
> correct with the rename?
>

I guess it is still debatable, but it is arguably more correct than
'kvm_is_mmio_pfn'.

I was reluctant to choose something like 'kvm_is_special_pfn' because
'special' is not very discriminating here, and at least 'reserved' has
a very clear meaning wrt pages, and treating pfns that are not backed
by a struct page as implicitly reserved is not counter-intuitive imo.

>>
>>  	return true;
>>  }
>> @@ -1321,7 +1321,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>>  	else if ((vma->vm_flags & VM_PFNMAP)) {
>>  		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
>>  			vma->vm_pgoff;
>> -		BUG_ON(!kvm_is_mmio_pfn(pfn));
>> +		BUG_ON(!kvm_is_reserved_pfn(pfn));
Re: [PATCH 2/2] kvm: fix kvm_is_mmio_pfn() and rename to kvm_is_reserved_pfn()
On Mon, Nov 10, 2014 at 12:05 PM, Ard Biesheuvel wrote:
> On 10 November 2014 11:53, Christoffer Dall wrote:
>> Hi Ard,
>>
>> On Mon, Nov 10, 2014 at 09:33:56AM +0100, Ard Biesheuvel wrote:
>>> This reverts commit 85c8555ff0 ("KVM: check for !is_zero_pfn() in
>>> kvm_is_mmio_pfn()") and renames the function to kvm_is_reserved_pfn.
>>>
>>> The problem being addressed by the patch above was that some ARM code
>>> based the memory mapping attributes of a pfn on the return value of
>>> kvm_is_mmio_pfn(), whose name indeed suggests that such pfns should
>>> be mapped as device memory.
>>>
>>> However, kvm_is_mmio_pfn() doesn't do quite what it says on the tin,
>>> and the existing non-ARM users were already using it in a way which
>>> suggests that its name should probably have been 'kvm_is_reserved_pfn'
>>> from the beginning, e.g., whether or not to call get_page/put_page on
>>> it etc. This means that returning false for the zero page is a mistake
>>> and the patch above should be reverted.
>>>
>>> Signed-off-by: Ard Biesheuvel
>>> ---
>>>  arch/ia64/kvm/kvm-ia64.c |  2 +-
>>>  arch/x86/kvm/mmu.c       |  6 +++---
>>>  include/linux/kvm_host.h |  2 +-
>>>  virt/kvm/kvm_main.c      | 16 ++++++++--------
>>>  4 files changed, 13 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
>>> index ec6b9acb6bea..dbe46f43884d 100644
>>> --- a/arch/ia64/kvm/kvm-ia64.c
>>> +++ b/arch/ia64/kvm/kvm-ia64.c
>>> @@ -1563,7 +1563,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>>
>>>  	for (i = 0; i < npages; i++) {
>>>  		pfn = gfn_to_pfn(kvm, base_gfn + i);
>>> -		if (!kvm_is_mmio_pfn(pfn)) {
>>> +		if (!kvm_is_reserved_pfn(pfn)) {
>>>  			kvm_set_pmt_entry(kvm, base_gfn + i,
>>>  					pfn << PAGE_SHIFT,
>>>  				_PAGE_AR_RWX | _PAGE_MA_WB);
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index ac1c4de3a484..978f402006ee 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -630,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>>>  	 * kvm mmu, before reclaiming the page, we should
>>>  	 * unmap it from mmu first.
>>>  	 */
>>> -	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>> +	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
>>>
>>>  	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
>>>  		kvm_set_pfn_accessed(pfn);
>>> @@ -2461,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>>  		spte |= PT_PAGE_SIZE_MASK;
>>>  	if (tdp_enabled)
>>>  		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
>>> -			kvm_is_mmio_pfn(pfn));
>>> +			kvm_is_reserved_pfn(pfn));
>>>
>>>  	if (host_writable)
>>>  		spte |= SPTE_HOST_WRITEABLE;
>>> @@ -2737,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
>>>  	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
>>>  	 * here.
>>>  	 */
>>> -	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
>>> +	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
>>>  	    level == PT_PAGE_TABLE_LEVEL &&
>>>  	    PageTransCompound(pfn_to_page(pfn)) &&
>>>  	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>> index ea53b04993f2..a6059bdf7b03 100644
>>> --- a/include/linux/kvm_host.h
>>> +++ b/include/linux/kvm_host.h
>>> @@ -703,7 +703,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
>>>  int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>>>  void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
>>>
>>> -bool kvm_is_mmio_pfn(pfn_t pfn);
>>> +bool kvm_is_reserved_pfn(pfn_t pfn);
>>>
>>>  struct kvm_irq_ack_notifier {
>>>  	struct hlist_node link;
>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>> index 25ffac9e947d..3cee7b167052 100644
>>> --- a/virt/kvm/kvm_main.c
>>> +++ b/virt/kvm/kvm_main.c
>>> @@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
>>>
>>>  static bool largepages_enabled = true;
>>>
>>> -bool kvm_is_mmio_pfn(pfn_t pfn)
>>> +bool kvm_is_reserved_pfn(pfn_t pfn)
>>>  {
>>>  	if (pfn_valid(pfn))
>>> -		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
>>> +		return PageReserved(pfn_to_page(pfn));
>>
>> so we return true for !pfn_valid(pfn), is this still semantically
>> correct with the rename?
>>
>
> I guess it is still debatable, but it is arguably more correct than
> 'kvm_is_mmio_pfn'.
>

agreed

> I was reluctant to choose something like 'kvm_is_special_pfn' because
> 'special' is not very discriminating here, and at least 'reserved' has
> a very clear meaning wrt pages, and treating pfns that are not backed
> by a struct page as implicitly reserved is not counter-intuitive imo.
>

just wanted to make sure.
Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()
On 10 November 2014 11:57, Christoffer Dall wrote:
> On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
>> Instead of using kvm_is_mmio_pfn() to decide whether a host region
>> should be stage 2 mapped with device attributes, add a new static
>> function kvm_is_device_pfn() that disregards RAM pages with the
>> reserved bit set, as those should usually not be mapped as device
>> memory.
>>
>> Signed-off-by: Ard Biesheuvel
>> ---
>>  arch/arm/kvm/mmu.c | 7 ++++++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 57a403a5c22b..b007438242e2 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>>  	return kvm_vcpu_dabt_iswrite(vcpu);
>>  }
>>
>> +static bool kvm_is_device_pfn(unsigned long pfn)
>> +{
>> +	return !pfn_valid(pfn);
>> +}
>
> So this works for Magnus' use case, because a device tree memreserve
> results in reserved, but valid, existing pages being backed by a struct
> page?
>

That is the idea, yes, but it would be good if he could confirm that it
works as expected.

Also, there may be some corner cases where pfn_valid returns false for
regions that are in fact mapped as MT_NORMAL by the host kernel (UEFI
configuration tables, for instance), so this test may require further
refinement.  But it at least eliminates the false positives for plain
memreserve regions.

>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>>  			  unsigned long fault_status)
>> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>  	if (is_error_pfn(pfn))
>>  		return -EFAULT;
>>
>> -	if (kvm_is_mmio_pfn(pfn))
>> +	if (kvm_is_device_pfn(pfn))
>>  		mem_type = PAGE_S2_DEVICE;
>>
>>  	spin_lock(&kvm->mmu_lock);
>> --
>> 1.8.3.2
>>
>
> If my understanding above is correct, then:
>
> Acked-by: Christoffer Dall
Re: Seeking a KVM benchmark
On 10/11/2014 11:45, Gleb Natapov wrote:
>> I tried making also the other shared MSRs (STAR, LSTAR, CSTAR,
>> SYSCALL_MASK) the same between guest and host, so that the user return
>> notifier has nothing to do.  That saves about 400-500 cycles on
>> inl_from_qemu.  I do want to dig out my old Core 2 and see how the new
>> test fares, but it really looks like your patch will be in 3.19.
>
> Please test on wide variety of HW before final decision.

Yes, definitely.

> Also it would
> be nice to ask Intel what is expected overhead. It is awesome if they
> manage to add EFER switching with non-measurable overhead, but also
> hard to believe :)

So let's see what happens.  Sneak preview: the result is definitely
worth asking Intel about.

I ran these benchmarks with a stock 3.16.6 KVM.  Instead I patched
kvm-unit-tests to set EFER.SCE in enable_nx.  This makes it much
simpler for others to reproduce the results.  I only ran the
inl_from_qemu test.

Perf stat reports that the processor goes from 0.46 to 0.66
instructions per cycle, which is consistent with the improvement from
19k to 12k cycles per iteration.

Unpatched KVM-unit-tests:

     3,385,586,563 cycles                    #  3.189 GHz                     [83.25%]
     2,475,979,685 stalled-cycles-frontend   # 73.13% frontend cycles idle    [83.37%]
     2,083,556,270 stalled-cycles-backend    # 61.54% backend  cycles idle    [66.71%]
     1,573,854,041 instructions              #  0.46  insns per cycle
                                             #  1.57  stalled cycles per insn [83.20%]

       1.108486526 seconds time elapsed

Patched KVM-unit-tests:

     3,252,297,378 cycles                    #  3.147 GHz                     [83.32%]
     2,010,266,184 stalled-cycles-frontend   # 61.81% frontend cycles idle    [83.36%]
     1,560,371,769 stalled-cycles-backend    # 47.98% backend  cycles idle    [66.51%]
     2,133,698,018 instructions              #  0.66  insns per cycle
                                             #  0.94  stalled cycles per insn [83.45%]

       1.072395697 seconds time elapsed

Playing with other events shows that the unpatched benchmark has an
awful load of TLB misses:

Unpatched:

            30,311 iTLB-loads
       464,641,844 dTLB-loads
        10,813,839 dTLB-load-misses          #     2.33% of all dTLB cache hits
        20,436,027 iTLB-load-misses          # 67421.16% of all iTLB cache hits

Patched:

         1,440,033 iTLB-loads
       640,970,836 dTLB-loads
         2,345,112 dTLB-load-misses          #     0.37% of all dTLB cache hits
           270,884 iTLB-load-misses          #    18.81% of all iTLB cache hits

This is 100% reproducible.  The meaning of the numbers is clearer if
you look up the raw event numbers in the Intel manuals:

- iTLB-loads is 85h/10h aka "perf -e r1085": "Number of cache load STLB
  [second-level TLB] hits. No page walk."

- iTLB-load-misses is 85h/01h aka r185: "Misses in all ITLB levels that
  cause page walks."

So for example event 85h/04h aka r485 ("Cycle PMH is busy with a walk.")
and friends show that the unpatched KVM wastes about 0.1 seconds more
than the patched KVM on page walks:

Unpatched:

        24,430,676 r449   (cycles on dTLB store miss page walks)
       196,017,693 r408   (cycles on dTLB load miss page walks)
       213,266,243 r485   (cycles on iTLB miss page walks)
       -----------
       433,714,612 total

Patched:

        22,583,440 r449   (cycles on dTLB store miss page walks)
        40,452,018 r408   (cycles on dTLB load miss page walks)
         2,115,981 r485   (cycles on iTLB miss page walks)
       -----------
        65,151,439 total

These 0.1 seconds probably are all on instructions that would have been
fast, since the slow instructions responsible for the low IPC are the
microcoded instructions, including VMX and other privileged stuff.

Similarly, BDh/20h counts STLB flushes, which are 260k in unpatched KVM
and 3k in patched KVM.  Let's see where they come from:

Unpatched:

+  98.97%  qemu-kvm  [kernel.kallsyms]  [k] native_write_msr_safe
+   0.70%  qemu-kvm  [kernel.kallsyms]  [k] page_fault

It's expected that most TLB misses happen just before a page fault
(there are also events to count how many TLB misses do result in a page
fault, if you care about that), and thus are accounted to the first
instruction of the exception handler.  We do not know what causes
second-level TLB _flushes_, but it's quite expected that you'll have a
TLB miss after them and possibly a page fault.  And anyway 98.97% of
them coming from native_write_msr_safe is totally anomalous.  A patched
benchmark shows no second-level TLB flush occurs after a WRMSR:

Patched:

+  72.41%  qemu-kvm  [kernel.kallsyms]  [k] page_fault
+   9.07%  qemu-kvm  [kvm_intel]        [k] vmx_flush_tlb
+   6.60%  qemu-kvm  [kernel.kallsyms]  [k] set_pte_vaddr_pud
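The raw-event counts above can also be reproduced without perf(1) by
programming the PMU directly through the perf_event_open(2) syscall.
A minimal self-contained example (event 0x1085 is the 85h/10h
iTLB-loads/STLB-hit event referenced above; everything else here is
boilerplate of mine, not from the thread):

#include <linux/perf_event.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* glibc provides no wrapper for perf_event_open */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	attr.config = 0x1085;   /* umask 10h, event 85h: ITLB STLB hits */
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this process */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/* ... workload under test would run here ... */

	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("r1085: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}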
Re: [PATCH] kvm: Fix memory slot page alignment logic
On Fri, 7 Nov 2014 22:18:45 +0100
Alexander Graf wrote:

> Memory slots have to be page aligned to get entered into KVM. There
> is existing logic that tries to ensure that we pad memory slots that
> are not page aligned to the biggest region that would still fit in the
> alignment requirements.
>
> Unfortunately, that logic is broken. It tries to calculate the start
> offset based on the region size.
>
> Fix up the logic to do the thing it was intended to do and document it
> properly in the comment above it.
>
> With this patch applied, I can successfully run an e500 guest with more
> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>
> Cc: qemu-sta...@nongnu.org
> Signed-off-by: Alexander Graf
> ---
>  kvm-all.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index 44a5e72..596e7ce 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>          unsigned delta;
>
>          /* kvm works in page size chunks, but the function may be called
> -           with sub-page size and unaligned start address. */
> -        delta = TARGET_PAGE_ALIGN(size) - size;
> +           with sub-page size and unaligned start address. Pad the start
> +           address to next and truncate size to previous page boundary. */

I'm a bit confused how it works at all.

Let's assume there are no mapped pages that include start_addr: if
start_addr were padded to the next page, KVM would map it from there,
but the rest of QEMU would still use the unaligned start_addr for a
MemoryRegion that isn't even mapped.

It would seem that instead of padding up to the next page, start_addr
should be moved to the start of the page that includes it, to make the
page with the original start_addr available to the guest.

> +        delta = (TARGET_PAGE_SIZE - (start_addr & ~TARGET_PAGE_MASK));
> +        delta &= ~TARGET_PAGE_MASK;
>          if (delta > size) {
>              return;
>          }
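The bug is easiest to see with concrete numbers.  A small standalone
program (mine, not from the thread, using the 0x1000 page size from
Alex's example later in the thread) shows what the old and new delta
computations produce for an unaligned slot:

#include <stdio.h>
#include <stdint.h>

#define TARGET_PAGE_SIZE     0x1000ULL
#define TARGET_PAGE_MASK     (~(TARGET_PAGE_SIZE - 1))
#define TARGET_PAGE_ALIGN(x) (((x) + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK)

int main(void)
{
	uint64_t start_addr = 0x10100;  /* unaligned slot start */
	uint64_t size = 0xff00;         /* slot runs to 0x20000 */

	/* old (broken): derives the start padding from the size alone */
	uint64_t old_delta = TARGET_PAGE_ALIGN(size) - size;

	/* new: pad start_addr up to the next page boundary */
	uint64_t new_delta = (TARGET_PAGE_SIZE - (start_addr & ~TARGET_PAGE_MASK))
	                     & ~TARGET_PAGE_MASK;

	printf("old: slot starts at 0x%llx (still unaligned)\n",
	       (unsigned long long)(start_addr + old_delta));   /* 0x10200 */
	printf("new: slot starts at 0x%llx (page aligned)\n",
	       (unsigned long long)(start_addr + new_delta));   /* 0x11000 */
	return 0;
}

The old code aligns based on size alone, so an unaligned start_addr
stays unaligned; the new code derives the padding from start_addr
itself, and reduces to zero when start_addr is already aligned.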
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 13:31, Igor Mammedov wrote:
> On Fri, 7 Nov 2014 22:18:45 +0100
> Alexander Graf wrote:
>
>> Memory slots have to be page aligned to get entered into KVM. There
>> is existing logic that tries to ensure that we pad memory slots that
>> are not page aligned to the biggest region that would still fit in the
>> alignment requirements.
>>
>> Unfortunately, that logic is broken. It tries to calculate the start
>> offset based on the region size.
>>
>> Fix up the logic to do the thing it was intended to do and document it
>> properly in the comment above it.
>>
>> With this patch applied, I can successfully run an e500 guest with more
>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>
>> Cc: qemu-sta...@nongnu.org
>> Signed-off-by: Alexander Graf
>> ---
>>  kvm-all.c | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 44a5e72..596e7ce 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>          unsigned delta;
>>
>>          /* kvm works in page size chunks, but the function may be called
>> -           with sub-page size and unaligned start address. */
>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>> +           with sub-page size and unaligned start address. Pad the start
>> +           address to next and truncate size to previous page boundary. */
> I'm a bit confused how it works at all.
> Let's assume there are no mapped pages that include start_addr: if
> start_addr were padded to the next page, KVM would map it from there,
> but the rest of QEMU would still use the unaligned start_addr for a
> MemoryRegion that isn't even mapped.

Sorry, I don't understand this paragraph. Memory slots in general are
accelerations for memory access - for MMIO (RAM is usually aligned), KVM
can always exit to QEMU and just do a manual MMIO exit.

> It would seem that instead of padding up to the next page, start_addr
> should be moved to the start of the page that includes it, to make the
> page with the original start_addr available to the guest.

No, because in that case you would map something as RAM that really
isn't RAM.

Imagine you have the following memory layout:

0x1000 page size

1) 0x0     - 0x10000 RAM
2) 0x10000 - 0x10100 MMIO
3) 0x10100 - 0x20000 RAM

Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
memory slot.

You can't map the page from 0x10000 - 0x11000 as memory slot, because
part of it is MMIO.


Alex
Re: [PATCH v4 4/6] hw_random: fix unregister race.
On Mon, Nov 03, 2014 at 11:56:24PM +0800, Amos Kong wrote:
>
> @@ -98,6 +99,8 @@ static inline void cleanup_rng(struct kref *kref)
>
>  	if (rng->cleanup)
>  		rng->cleanup(rng);

You need a compiler barrier here to prevent reordering.

> +	rng->cleanup_done = true;

Thanks,
--
Email: Herbert Xu
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
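A sketch of what Herbert is asking for, laid against the structure of
the patch under review (this assumes the series gives struct hwrng a
kref named 'ref' and a 'cleanup_done' flag; whether a plain barrier()
suffices or an smp_* barrier is needed depends on who reads
cleanup_done, which this excerpt doesn't show):

static inline void cleanup_rng(struct kref *kref)
{
	struct hwrng *rng = container_of(kref, struct hwrng, ref);

	if (rng->cleanup)
		rng->cleanup(rng);

	/* keep the compiler from hoisting the flag update above the
	 * cleanup call; otherwise a waiter could observe
	 * cleanup_done == true while cleanup is still running */
	barrier();

	rng->cleanup_done = true;
}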
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10/11/2014 14:16, Alexander Graf wrote:
> No, because in that case you would map something as RAM that really
> isn't RAM.
>
> Imagine you have the following memory layout:
>
> 0x1000 page size
>
> 1) 0x0     - 0x10000 RAM
> 2) 0x10000 - 0x10100 MMIO
> 3) 0x10100 - 0x20000 RAM
>
> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
> memory slot.
>
> You can't map the page from 0x10000 - 0x11000 as memory slot, because
> part of it is MMIO.

Right.  The partial RAM page remains marked as MMIO as far as KVM is
concerned, so accesses are slow and you cannot run code from it.
However, it is fundamental that MMIO areas are not marked as RAM.

Paolo
Re: [Qemu-devel] [PATCH] kvm: Fix memory slot page alignment logic
On 10 November 2014 13:16, Alexander Graf wrote:
> Sorry, I don't understand this paragraph. Memory slots in general are
> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
> can always exit to QEMU and just do a manual MMIO exit.

...you're a bit stuck if you were hoping to execute code from
that RAM, though, so they're not *purely* acceleration, right?

-- PMM
Re: [PATCH] kvm: Fix memory slot page alignment logic
On Mon, 10 Nov 2014 14:16:58 +0100
Alexander Graf wrote:

> On 10.11.14 13:31, Igor Mammedov wrote:
>> On Fri, 7 Nov 2014 22:18:45 +0100
>> Alexander Graf wrote:
>>
>>> Memory slots have to be page aligned to get entered into KVM. There
>>> is existing logic that tries to ensure that we pad memory slots that
>>> are not page aligned to the biggest region that would still fit in the
>>> alignment requirements.
>>>
>>> Unfortunately, that logic is broken. It tries to calculate the start
>>> offset based on the region size.
>>>
>>> Fix up the logic to do the thing it was intended to do and document it
>>> properly in the comment above it.
>>>
>>> With this patch applied, I can successfully run an e500 guest with more
>>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>>
>>> Cc: qemu-sta...@nongnu.org
>>> Signed-off-by: Alexander Graf
>>> ---
>>>  kvm-all.c | 6 ++++--
>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 44a5e72..596e7ce 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>>          unsigned delta;
>>>
>>>          /* kvm works in page size chunks, but the function may be called
>>> -           with sub-page size and unaligned start address. */
>>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>>> +           with sub-page size and unaligned start address. Pad the start
>>> +           address to next and truncate size to previous page boundary. */
>> I'm a bit confused how it works at all.
>> Let's assume there are no mapped pages that include start_addr: if
>> start_addr were padded to the next page, KVM would map it from there,
>> but the rest of QEMU would still use the unaligned start_addr for a
>> MemoryRegion that isn't even mapped.
>
> Sorry, I don't understand this paragraph. Memory slots in general are
> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
> can always exit to QEMU and just do a manual MMIO exit.
>
>> It would seem that instead of padding up to the next page, start_addr
>> should be moved to the start of the page that includes it, to make the
>> page with the original start_addr available to the guest.
>
> No, because in that case you would map something as RAM that really
> isn't RAM.
>
> Imagine you have the following memory layout:
>
> 0x1000 page size
>
> 1) 0x0     - 0x10000 RAM
> 2) 0x10000 - 0x10100 MMIO
> 3) 0x10100 - 0x20000 RAM
>
> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
> memory slot.

So every access to RAM in 0x10100-0x11000, which is not represented as
a memory slot, would cause a VMEXIT?

> You can't map the page from 0x10000 - 0x11000 as memory slot, because
> part of it is MMIO.
>
>
> Alex
Re: [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 14:55, Igor Mammedov wrote:
> On Mon, 10 Nov 2014 14:16:58 +0100
> Alexander Graf wrote:
>>
>> On 10.11.14 13:31, Igor Mammedov wrote:
>>> On Fri, 7 Nov 2014 22:18:45 +0100
>>> Alexander Graf wrote:
>>>
>>>> Memory slots have to be page aligned to get entered into KVM. There
>>>> is existing logic that tries to ensure that we pad memory slots that
>>>> are not page aligned to the biggest region that would still fit in the
>>>> alignment requirements.
>>>>
>>>> Unfortunately, that logic is broken. It tries to calculate the start
>>>> offset based on the region size.
>>>>
>>>> Fix up the logic to do the thing it was intended to do and document it
>>>> properly in the comment above it.
>>>>
>>>> With this patch applied, I can successfully run an e500 guest with more
>>>> than 3GB RAM (at which point RAM starts overlapping subpage memory regions).
>>>>
>>>> Cc: qemu-sta...@nongnu.org
>>>> Signed-off-by: Alexander Graf
>>>> ---
>>>>  kvm-all.c | 6 ++++--
>>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>> index 44a5e72..596e7ce 100644
>>>> --- a/kvm-all.c
>>>> +++ b/kvm-all.c
>>>> @@ -634,8 +634,10 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
>>>>          unsigned delta;
>>>>
>>>>          /* kvm works in page size chunks, but the function may be called
>>>> -           with sub-page size and unaligned start address. */
>>>> -        delta = TARGET_PAGE_ALIGN(size) - size;
>>>> +           with sub-page size and unaligned start address. Pad the start
>>>> +           address to next and truncate size to previous page boundary. */
>>> I'm a bit confused how it works at all.
>>> Let's assume there are no mapped pages that include start_addr: if
>>> start_addr were padded to the next page, KVM would map it from there,
>>> but the rest of QEMU would still use the unaligned start_addr for a
>>> MemoryRegion that isn't even mapped.
>>
>> Sorry, I don't understand this paragraph. Memory slots in general are
>> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
>> can always exit to QEMU and just do a manual MMIO exit.
>>
>>> It would seem that instead of padding up to the next page, start_addr
>>> should be moved to the start of the page that includes it, to make the
>>> page with the original start_addr available to the guest.
>>
>> No, because in that case you would map something as RAM that really
>> isn't RAM.
>>
>> Imagine you have the following memory layout:
>>
>> 0x1000 page size
>>
>> 1) 0x0     - 0x10000 RAM
>> 2) 0x10000 - 0x10100 MMIO
>> 3) 0x10100 - 0x20000 RAM
>>
>> Then you want to map 1) as memory slot and 3) from 0x11000 onwards as
>> memory slot.
> So every access to RAM in 0x10100-0x11000, which is not represented as
> a memory slot, would cause a VMEXIT?

Yes, there's no other way.  Otherwise we wouldn't be able to trap on
accesses to 0x10000 - 0x10100.  Hardware only gives us page
granularity.

Usually this isn't an issue because overlapping MMIO regions are pretty
large chunks of power-of-2 size - if you see any overlapping at all. On
e500 this bites us though, because we end up with small MSI-X windows
inside our address space (which in turn might also be a bug, but that
doesn't mean that the slot mapping logic should be left as broken as it
is).

>
>> You can't map the page from 0x10000 - 0x11000 as memory slot, because
>> part of it is MMIO.


Alex
Re: [Qemu-devel] [PATCH] kvm: Fix memory slot page alignment logic
On 10.11.14 14:55, Peter Maydell wrote:
> On 10 November 2014 13:16, Alexander Graf wrote:
>> Sorry, I don't understand this paragraph. Memory slots in general are
>> accelerations for memory access - for MMIO (RAM is usually aligned), KVM
>> can always exit to QEMU and just do a manual MMIO exit.
>
> ...you're a bit stuck if you were hoping to execute code from
> that RAM, though, so they're not *purely* acceleration, right?

Yes and no.  Technically, there's no reason KVM couldn't do an MMIO
exit dance to fetch the next instruction.  From user space this should
be indistinguishable.  Today, I don't think it's implemented though :).

Alex
Re: [RFC PATCH 1/4] ARM: KVM: on unhandled IO mem abort, route the call to the KVM MMIO bus
Hello,

On Fri, Mar 28, 2014 at 9:09 PM, Christoffer Dall wrote:
>
> On Thu, Mar 13, 2014 at 04:57:26PM +0100, Antonios Motakis wrote:
> > On an unhandled IO memory abort, use the kvm_io_bus_* API in order to
> > handle the MMIO access through any registered read/write callbacks. This
> > is a dependency for eventfd support (ioeventfd and irqfd).
> >
> > However, accesses to the VGIC are still left implemented independently,
> > since the kvm_io_bus_* API doesn't pass the VCPU pointer doing the access.
> >
> > Signed-off-by: Antonios Motakis
> > Signed-off-by: Nikolay Nikolaev
> > ---
> >  arch/arm/kvm/mmio.c | 32 ++++++++++++++++++++++++++++++++
> >  virt/kvm/arm/vgic.c |  5 ++++-
> >  2 files changed, 36 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
> > index 4cb5a93..1d17831 100644
> > --- a/arch/arm/kvm/mmio.c
> > +++ b/arch/arm/kvm/mmio.c
> > @@ -162,6 +162,35 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  	return 0;
> >  }
> >
> > +/**
> > + * kvm_handle_mmio - handle an in-kernel MMIO access
> > + * @vcpu:	pointer to the vcpu performing the access
> > + * @run:	pointer to the kvm_run structure
> > + * @mmio:	pointer to the data describing the access
> > + *
> > + * returns true if the MMIO access has been performed in kernel space,
> > + * and false if it needs to be emulated in user space.
> > + */
> > +static bool handle_kernel_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> > +		struct kvm_exit_mmio *mmio)
> > +{
> > +	int ret;
> > +	if (mmio->is_write) {
> > +		ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, mmio->phys_addr,
> > +				mmio->len, &mmio->data);
> > +
> > +	} else {
> > +		ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, mmio->phys_addr,
> > +				mmio->len, &mmio->data);
> > +	}
> > +	if (!ret) {
> > +		kvm_prepare_mmio(run, mmio);
> > +		kvm_handle_mmio_return(vcpu, run);
> > +	}
> > +
> > +	return !ret;
> > +}
> > +
> >  int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  		 phys_addr_t fault_ipa)
> >  {
> > @@ -200,6 +229,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  	if (vgic_handle_mmio(vcpu, run, &mmio))
> >  		return 1;
> >
> > +	if (handle_kernel_mmio(vcpu, run, &mmio))
> > +		return 1;
> > +

We're reconsidering the ioeventfds patch series and we tried to
evaluate what you suggested here.

>
> this special-casing of the vgic is now really terrible.  Is there
> anything holding you back from doing the necessary restructure of the
> kvm_bus_io_*() API instead?

Restructuring the kvm_io_bus_ API is not a big thing (we actually did
it), but it is not directly related to these patches.  Of course it can
be justified if we do it in the context of removing vgic_handle_mmio
and leaving only handle_kernel_mmio.

>
> That would allow us to get rid of the ugly
> Fix it! in the vgic driver as well.

Going through vgic_handle_mmio we see that it will require large
refactoring:

 - there are 15 MMIO ranges for the vgic now - each should be
   registered as a separate device
 - the handler of each range should be split into read and write
 - all handlers take 'struct kvm_exit_mmio', and pass it to
   'vgic_reg_access', 'mmio_data_read' and 'mmio_data_write'

To sum up: if we do this refactoring of the vgic's MMIO handling plus
the kvm_io_bus_ API getting a 'vcpu' argument, we'll get much cleaner
vgic code and, as a bonus, 'ioeventfd' capabilities.

We have 3 questions:

 - is the kvm_io_bus_ API getting a 'vcpu' argument acceptable for the
   other architectures too?
 - is this huge vgic MMIO handling redesign acceptable/desired (it
   touches a lot of code)?
 - is there a way that ioeventfd is accepted leaving vgic.c in its
   current state?

regards,
Nikolay Nikolaev
Virtual Open Systems

>
> -Christoffer
>
> >  	kvm_prepare_mmio(run, &mmio);
> >  	return 0;
> >  }
> > diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> > index 8ca405c..afdecc3 100644
> > --- a/virt/kvm/arm/vgic.c
> > +++ b/virt/kvm/arm/vgic.c
> > @@ -849,13 +849,16 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges,
> >  }
> >
> >  /**
> > - * vgic_handle_mmio - handle an in-kernel MMIO access
> > + * vgic_handle_mmio - handle an in-kernel vgic MMIO access
> >   * @vcpu:	pointer to the vcpu performing the access
> >   * @run:	pointer to the kvm_run structure
> >   * @mmio:	pointer to the data describing the access
> >   *
> >   * returns true if the MMIO access has been performed in kernel space,
> >   * and false if it needs to be emulated in user space.
> > + *
> > + * This is handled outside of kvm_handle_mmio because the kvm_io_bus only
> > + * passes the VM pointer, while we need the VCPU performing the access.
> >   */
> >  bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> >  		      struct kvm_exit_mmio *mmio)
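For concreteness, the kvm_io_bus_ restructuring Nikolay describes
amounts to roughly this signature change against the
include/linux/kvm_host.h of the time (a sketch of the direction
discussed in this thread, not a merged patch):

-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val);
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		    int len, void *val);
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		     int len, const void *val);
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		    int len, void *val);

handle_kernel_mmio() in the patch above would then pass vcpu rather
than vcpu->kvm, and the stated reason for keeping vgic_handle_mmio()
outside the MMIO bus would disappear.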
Re: [RFC PATCH 1/4] ARM: KVM: on unhandled IO mem abort, route the call to the KVM MMIO bus
On Mon, Nov 10, 2014 at 05:09:07PM +0200, Nikolay Nikolaev wrote: > Hello, > > On Fri, Mar 28, 2014 at 9:09 PM, Christoffer Dall > wrote: > > > > On Thu, Mar 13, 2014 at 04:57:26PM +0100, Antonios Motakis wrote: > > > On an unhandled IO memory abort, use the kvm_io_bus_* API in order to > > > handle the MMIO access through any registered read/write callbacks. This > > > is a dependency for eventfd support (ioeventfd and irqfd). > > > > > > However, accesses to the VGIC are still left implemented independently, > > > since the kvm_io_bus_* API doesn't pass the VCPU pointer doing the access. > > > > > > Signed-off-by: Antonios Motakis > > > Signed-off-by: Nikolay Nikolaev > > > --- > > > arch/arm/kvm/mmio.c | 32 > > > virt/kvm/arm/vgic.c | 5 - > > > 2 files changed, 36 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c > > > index 4cb5a93..1d17831 100644 > > > --- a/arch/arm/kvm/mmio.c > > > +++ b/arch/arm/kvm/mmio.c > > > @@ -162,6 +162,35 @@ static int decode_hsr(struct kvm_vcpu *vcpu, > > > phys_addr_t fault_ipa, > > > return 0; > > > } > > > > > > +/** > > > + * kvm_handle_mmio - handle an in-kernel MMIO access > > > + * @vcpu:pointer to the vcpu performing the access > > > + * @run: pointer to the kvm_run structure > > > + * @mmio:pointer to the data describing the access > > > + * > > > + * returns true if the MMIO access has been performed in kernel space, > > > + * and false if it needs to be emulated in user space. > > > + */ > > > +static bool handle_kernel_mmio(struct kvm_vcpu *vcpu, struct kvm_run > > > *run, > > > + struct kvm_exit_mmio *mmio) > > > +{ > > > + int ret; > > > + if (mmio->is_write) { > > > + ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, > > > mmio->phys_addr, > > > + mmio->len, &mmio->data); > > > + > > > + } else { > > > + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, > > > mmio->phys_addr, > > > + mmio->len, &mmio->data); > > > + } > > > + if (!ret) { > > > + kvm_prepare_mmio(run, mmio); > > > + kvm_handle_mmio_return(vcpu, run); > > > + } > > > + > > > + return !ret; > > > +} > > > + > > > int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, > > >phys_addr_t fault_ipa) > > > { > > > @@ -200,6 +229,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct > > > kvm_run *run, > > > if (vgic_handle_mmio(vcpu, run, &mmio)) > > > return 1; > > > > > > + if (handle_kernel_mmio(vcpu, run, &mmio)) > > > + return 1; > > > + > > > We're reconsidering ioeventfds patchseries and we tried to evaluate > what you suggested here. > > > > > this special-casing of the vgic is now really terrible. Is there > > anything holding you back from doing the necessary restructure of the > > kvm_bus_io_*() API instead? > > Restructuring the kvm_io_bus_ API is not a big thing (we actually did > it), but is not directly related to the these patches. > Of course it can be justified if we do it in the context of removing > vgic_handle_mmio and leaving only handle_kernel_mmio. > > > > > That would allow us to get rid of the ugly > > Fix it! in the vgic driver as well. 
>
> Going through vgic_handle_mmio we see that it will require large
> refactoring:
>  - there are 15 MMIO ranges for the vgic now - each should be
>    registered as a separate device
>  - the handler of each range should be split into read and write
>  - all handlers take 'struct kvm_exit_mmio', and pass it to
>    'vgic_reg_access', 'mmio_data_read' and 'mmio_data_write'
>
> To sum up - if we do this refactoring of the vgic's MMIO handling, plus
> the kvm_io_bus_ API getting a 'vcpu' argument, we'll get much cleaner
> vgic code and as a bonus we'll get ioeventfd capabilities.
>
> We have 3 questions:
>  - is the kvm_io_bus_ API getting a 'vcpu' argument acceptable for the
>    other architectures too?
>  - is this huge vgic MMIO handling redesign acceptable/desired (it
>    touches a lot of code)?
>  - is there a way that ioeventfd is accepted leaving vgic.c in its
>    current state?
>

Not sure how the latter question is relevant to this, but check with
Andre who recently looked at this as well and decided that for GICv3
the only sane thing was to remove that comment for the gic.

I don't recall the details of what you were trying to accomplish here
(it's been 8 months or so), but surely the vgic handling code should
*somehow* be integrated into handle_kernel_mmio (like Paolo suggested),
unless you come back and tell me that that would involve a complete
rewrite of the vgic code.

-Christoffer
Re: Seeking a KVM benchmark
On 10/11/2014 15:23, Avi Kivity wrote:
> It's not surprising [1]. Since the meaning of some PTE bits change [2],
> the TLB has to be flushed. In VMX we have VPIDs, so we only need to flush
> if EFER changed between two invocations of the same VPID, which isn't the
> case.
>
> [1] after the fact
> [2] although those bits were reserved with NXE=0, so they shouldn't have
> any TLB footprint

You're right that this is not that surprising after the fact, and that
both Sandy Bridge and Ivy Bridge have VPIDs (even the non-Xeon ones).
This is also why I'm curious about the Nehalem.

However note that even toggling the SCE bit is flushing the TLB. The
NXE bit is not being toggled here! That's the more surprising part.

Paolo
Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs
On 06/11/2014 17:45, Radim Krčmář wrote:
> -		} else if (kvm_apic_hw_enabled(apic)) {
> +		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
>  			if (kvm_apic_get_reg(apic, APIC_DFR) ==
>  			    APIC_DFR_CLUSTER) {
>  				new->cid_shift = 4;

Why are you removing the kvm_apic_hw_enabled(apic)? We do not reset the
APIC to the power-up values (the SDM says that "may" happen) when the
enabled bit is turned off in the APIC_BASE MSR.

Paolo
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 06:28:25PM +0100, Paolo Bonzini wrote:
> On 10/11/2014 15:23, Avi Kivity wrote:
> > It's not surprising [1]. Since the meaning of some PTE bits change [2],
> > the TLB has to be flushed. In VMX we have VPIDs, so we only need to flush
> > if EFER changed between two invocations of the same VPID, which isn't the
> > case.
> >
> > [1] after the fact
> > [2] although those bits were reserved with NXE=0, so they shouldn't have
> > any TLB footprint
>
> You're right that this is not that surprising after the fact, and that
> both Sandy Bridge and Ivy Bridge have VPIDs (even the non-Xeon ones).
> This is also why I'm curious about the Nehalem.
>
> However note that even toggling the SCE bit is flushing the TLB. The
> NXE bit is not being toggled here! That's the more surprising part.
>
Just a guess, but maybe because writing EFER is not something that
happens often in regular OSes, it is not optimized to handle different
bits differently.

--
	Gleb.
Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs
2014-11-10 18:35+0100, Paolo Bonzini:
> On 06/11/2014 17:45, Radim Krčmář wrote:
> > -		} else if (kvm_apic_hw_enabled(apic)) {
> > +		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
> >  			if (kvm_apic_get_reg(apic, APIC_DFR) ==
> >  			    APIC_DFR_CLUSTER) {
> >  				new->cid_shift = 4;
>
> Why are you removing the kvm_apic_hw_enabled(apic)? We do not reset the
> APIC to the power-up values (the SDM says that "may" happen) when the
> enabled bit is turned off in the APIC_BASE MSR.

We checked for that before, in kvm_apic_present().
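For context, a sketch of the check Radim refers to, reconstructed from
memory of the lapic helpers of that era; treat the exact definition as
approximate:

    /* kvm_apic_present() already folds in the hardware-enable test,
     * so re-testing kvm_apic_hw_enabled() in a branch guarded by it
     * would be redundant. */
    static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
    {
        return kvm_vcpu_has_lapic(vcpu) &&
               kvm_apic_hw_enabled(vcpu->arch.apic);
    }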
Re: several messages
On Mon, 10 Nov 2014, Feng Wu wrote:
> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> With VT-d Posted-Interrupts enabled, external interrupts from
> direct-assigned devices can be delivered to guests without VMM
> intervention when guest is running in non-root mode.

Can you please talk to Jiang and synchronize your work with his
refactoring of the x86 interrupt handling subsystem.

I want this stuff cleaned up first before we add new stuff to it.

Thanks,

	tglx
Re: Seeking a KVM benchmark
On Mon, Nov 10, 2014 at 2:45 AM, Gleb Natapov wrote:
> On Mon, Nov 10, 2014 at 11:03:35AM +0100, Paolo Bonzini wrote:
>>
>> On 09/11/2014 17:36, Andy Lutomirski wrote:
>> >> The purpose of vmexit test is to show us various overheads, so why not
>> >> measure EFER switch overhead by having two tests one with equal EFER
>> >> another with different EFER, instead of hiding it.
>> >
>> > I'll try this. We might need three tests, though: NX different, NX
>> > same but SCE different, and all flags the same.
>>
>> The test actually explicitly enables NX in order to put itself in the
>> "common case":
>>
>> commit 82d4ccb9daf67885a0316b1d763ce5ace57cff36
>> Author: Marcelo Tosatti
>> Date:   Tue Jun 8 15:33:29 2010 -0300
>>
>>     test: vmexit: enable NX
>>
>>     Enable NX to disable MSR autoload/save. This is the common case anyway.
>>
>>     Signed-off-by: Marcelo Tosatti
>>     Signed-off-by: Avi Kivity
>>
>> (this commit is in qemu-kvm.git), so I guess forgetting to set SCE is
>> just a bug. The results on my Xeon Sandy Bridge are very interesting:
>>
>>     NX different            ~11.5k  (load/save EFER path)
>>     NX same, SCE different  ~19.5k  (urn path)
>>     all flags the same      ~10.2k
>>
>> The inl_from_kernel results have absolutely no change, usually at most 5
>> cycles difference. This could be because I've added the SCE=1 variant
>> directly to vmexit.c, so I'm running the tests one next to the other.
>>
>> I tried making also the other shared MSRs the same between guest and
>> host (STAR, LSTAR, CSTAR, SYSCALL_MASK), so that the user return notifier
>> has nothing to do. That saves about 400-500 cycles on inl_from_qemu. I
>> do want to dig out my old Core 2 and see how the new test fares, but it
>> really looks like your patch will be in 3.19.
>>
> Please test on a wide variety of HW before the final decision. Also it
> would be nice to ask Intel what the expected overhead is. It is awesome
> if they manage to add EFER switching with non-measurable overhead, but
> also hard to believe :) Also Andy had an idea to disable switching in
> case host and guest EFERs are the same, but IIRC his patch does not
> include it yet.

I'll send that patch as a followup in a sec. It doesn't seem to make a
difference, which reinforces my hypothesis that microcode is fiddling
with EFER on entry and exit to handle LME and LMA anyway, so adjusting
the other bits doesn't affect performance.

--Andy

>
> --
> 	Gleb.

--
Andy Lutomirski
AMA Capital Management, LLC
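A minimal sketch of what the SCE=1 variant amounts to, assuming the
usual kvm-unit-tests rdmsr/wrmsr helpers; the constants are the
architectural ones, but the exact vmexit.c change Paolo made is not
shown here:

    #define MSR_EFER  0xc0000080
    #define EFER_SCE  (1ull << 0)   /* syscall enable */
    #define EFER_NX   (1ull << 11)  /* no-execute enable */

    /* Make guest EFER match a typical host: NX and SCE both set, so
     * no EFER bits differ across VM entry/exit ("all flags the same"
     * case in the numbers above). */
    static void enable_nx_and_sce(void)
    {
        wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NX | EFER_SCE);
    }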
[PATCH] x86, kvm, vmx: Don't set LOAD_IA32_EFER when host and guest match
There's nothing to switch if the host and guest values are the same.

I am unable to find evidence that this makes any difference whatsoever.

Signed-off-by: Andy Lutomirski
---
 arch/x86/kvm/vmx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e72b9660e51c..c430b01b012b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1670,7 +1670,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 		guest_efer = vmx->vcpu.arch.efer;
 		if (!(guest_efer & EFER_LMA))
 			guest_efer &= ~EFER_LME;
-		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+		if (guest_efer != host_efer)
+			add_atomic_switch_msr(vmx, MSR_EFER,
+					      guest_efer, host_efer);
 		return false;
 	}
--
1.9.3
Re: [PATCH] x86, kvm, vmx: Always use LOAD_IA32_EFER if available
On Sun, Nov 9, 2014 at 8:33 PM, Wanpeng Li wrote:
> Hi Andy,
> On Fri, Nov 07, 2014 at 06:25:18PM -0800, Andy Lutomirski wrote:
>> At least on Sandy Bridge, letting the CPU switch IA32_EFER is much
>> faster than switching it manually.
>>
>> I benchmarked this using the vmexit kvm-unit-test (single run, but
>> GOAL multiplied by 5 to do more iterations):
>>
>> Test                Before    After    Change
>> cpuid                 2000     1932    -3.40%
>> vmcall                1914     1817    -5.07%
>> mov_from_cr8            13       13     0.00%
>> mov_to_cr8              19       19     0.00%
>> inl_from_pmtimer     19164    10619   -44.59%
>> inl_from_qemu        15662    10302   -34.22%
>
> What's the difference of IA32_EFER between guest and host in your config?
>
> IIUC,
> - NX is not consistent
>   IA32_EFER will be auto-loaded w/ and w/o the patch.
> - SCE is not consistent
>   IA32_EFER will be switched through wrmsr(urn) w/o the patch, and
>   auto-loaded w/ the patch.

This is with kvm-unit-test as is, so NX is consistent but SCE is
different.

--Andy
compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
Linus,

Last week I sent the below patch to Paolo for kvm/next. Heiko Carstens
pointed out that the kernel often does not work around compiler bugs;
instead we blacklist the compiler, add a build bug, or similar when we
can't be sure to catch all broken cases, e.g. Heiko mentioned the x86
specific test cases for stack protector.

Now: I can reproduce the below miscompile on gcc 4.6 and gcc 4.7.
gcc 4.5 seems ok, gcc 4.8 is fixed. This makes blacklisting a bit hard,
especially since it is not limited to s390, but covers all
architectures. In essence, ACCESS_ONCE will not work reliably on
aggregate types with gcc 4.6 and gcc 4.7. In Linux we seem to use
ACCESS_ONCE mostly on scalar types; the below code is an example where
we don't - and break.

Linus, what is your take on workarounds for compiler bugs? The barrier
solution below will fix that particular case, but are there others?
Maybe we can come up with a clever trick of creating a build-bug for
ACCESS_ONCE on non-scalar types? A testcase is not trivial, since we
have to rely on other optimization steps to actually do the wrong
thing, and the gcc testcase will dump the internal tree and check that
- something that does not seem to be ok for the kernel.

Christian

-------- Forwarded Message --------
Subject: [GIT PULL 1/4] KVM: s390: Fix ipte locking
Date: Fri, 7 Nov 2014 12:45:13 +0100
From: Christian Borntraeger
To: Paolo Bonzini
CC: KVM, Gleb Natapov, Alexander Graf, Cornelia Huck, Jens Freimann,
    linux-s390, Christian Borntraeger, sta...@vger.kernel.org

ipte_unlock_siif uses cmpxchg to replace the in-memory data of the ipte
lock together with ACCESS_ONCE for the initial read.

union ipte_control {
	unsigned long val;
	struct {
		unsigned long k  : 1;
		unsigned long kh : 31;
		unsigned long kg : 32;
	};
};
[...]
static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
{
	union ipte_control old, new, *ic;

	ic = &vcpu->kvm->arch.sca->ipte_control;
	do {
		new = old = ACCESS_ONCE(*ic);
		new.kh--;
		if (!new.kh)
			new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
	if (!new.kh)
		wake_up(&vcpu->kvm->arch.ipte_wq);
}

The new value is loaded twice from memory with gcc 4.7.2 of fedora 18,
despite the ACCESS_ONCE:

--->	l	%r4,0(%r3)	<--- load first 32 bit of lock (k and kh) in r4
	alfi	%r4,2147483647	<--- add -1 to r4
	llgtr	%r4,%r4		<--- zero out the sign bit of r4
	lg	%r1,0(%r3)	<--- load all 64 bit of lock into new
	lgr	%r2,%r1		<--- load the same into old
	risbg	%r1,%r4,1,31,32	<--- shift and insert r4 into the bits 1-31 of new
	llihf	%r4,2147483647
	ngrk	%r4,%r1,%r4
	jne	aa0
	nihh	%r1,32767
	lgr	%r4,%r2
	csg	%r4,%r1,0(%r3)
	cgr	%r2,%r4
	jne	a70

If the memory value changes between the first load (l) and the second
load (lg) we are broken. If that happens, VCPU threads will hang
(unkillable) in handle_ipte_interlock.

Andreas Krebbel analyzed this and tracked it down to a compiler bug in
that version:

"while it is not that obvious the C99 standard basically forbids
duplicating the memory access also in that case. For an argumentation
of a similar case please see:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22278#c43

For the implementation-defined cases regarding volatile there are some
GCC-specific clarifications which can be found here:
https://gcc.gnu.org/onlinedocs/gcc/Volatiles.html#Volatiles

I've tracked down the problem with a reduced testcase. The problem was
that during a tree level optimization (SRA - scalar replacement of
aggregates) the volatile marker is lost.
And an RTL level optimizer (CSE - common subexpression elimination)
then propagated the memory read into its second use, introducing
another access to the memory location. So indeed Christian's suspicion
that the union access has something to do with it is correct (since it
triggered the SRA optimization).

This issue has been reported and fixed in the GCC 4.8 development
cycle:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145"

This patch replaces the ACCESS_ONCE scheme with a barrier() based
scheme that should work for all supported compilers.

Signed-off-by: Christian Borntraeger
Cc: sta...@vger.kernel.org # v3.16+
---
 arch/s390/kvm/gaccess.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 0f961a1..6dc0ad9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -229,10 +229,12 @@ static void ipte_lock_simple(struct kvm_vcpu *vcpu)
 		goto out;
 	ic = &vcpu->kvm->arch.sca->ipte_control;
 	do {
-		old = ACCESS_ONCE(*ic);
+		old = *ic;
+		barrier();
 		while (old.k) {
 			cond_resched();
-
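The patch text is cut off above. A minimal sketch of the pattern it
describes, as it would apply to the ipte_unlock_siif loop from the
forwarded mail (reconstructed from the description, not the literal
hunk):

	/* Plain load plus barrier(): 'old' is a local copy, and the
	 * compiler clobber in barrier() forbids re-deriving it from
	 * *ic later, so there is exactly one read before the cmpxchg
	 * and SRA cannot split the access. */
	do {
		old = *ic;
		barrier();
		new = old;
		new.kh--;
		if (!new.kh)
			new.k = 0;
	} while (cmpxchg(&ic->val, old.val, new.val) != old.val);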
Re: compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
On Mon, Nov 10, 2014 at 12:18 PM, Christian Borntraeger wrote:
>
> Now: I can reproduce the below miscompile on gcc 4.6 and gcc 4.7.
> gcc 4.5 seems ok, gcc 4.8 is fixed. This makes blacklisting
> a bit hard, especially since it is not limited to s390, but
> covers all architectures.
> In essence, ACCESS_ONCE will not work reliably on aggregate
> types with gcc 4.6 and gcc 4.7.
> In Linux we seem to use ACCESS_ONCE mostly on scalar types;
> the below code is an example where we don't - and break.

Hmm. I think we should see how painful it would be to make it a rule
that ACCESS_ONCE() only works on scalar types.

Even in the actual code you show as an example, the "fix" is really to
use the "unsigned long val" member of the union for the ACCESS_ONCE().
And that seems to be true in many other cases too.

So before blacklisting any compilers, let's first see if

 (a) we can actually make it a real rule that we only use ACCESS_ONCE
     on scalars

 (b) we can somehow enforce this with a compiler warning/error for
     mis-uses

For example, the attached patch works for some cases, but shows how we
use ACCESS_ONCE() on pointers to pte_t's etc, so it doesn't come even
close to compiling the whole kernel. But I wonder how painful that
would be to change.. The places where it complains are actually
somewhat debatable to begin with, like:

 - handle_pte_fault(.. pte_t *pte ..):

        entry = ACCESS_ONCE(*pte);

and the thing is, "pte" is actually possibly an 8-byte entity on
x86-32, and that ACCESS_ONCE() fundamentally will be two 32-bit reads.

So there is a very valid argument for saying "well, you shouldn't do
that, then", and that we might be better off cleaning up our
ACCESS_ONCE() uses, than to just blindly blacklist compilers.

NOTE! I'm not at all advocating the attached patch. I'm sending it out
white-space damaged on purpose, it's more of a "hey, something like
this might be the direction we want to go in", with the spinlock.h
part of the patch also acting as an example of the kind of changes the
"ACCESS_ONCE() only works on scalars" rule would require.

So I do agree with Heiko that we generally don't want to work around
compiler bugs if we can avoid it. But sometimes the compiler bugs do
end up saying "you're doing something very fragile". Maybe we should
try to be less fragile here.

And in your example, the whole

    old = ACCESS_ONCE(*ic);

*could* just be a

    old.val = ACCESS_ONCE(ic->val);

the same way the x86 spinlock.h changes below.

I did *not* try to see how many other cases we have. It's possible
that your "don't use ACCESS_ONCE, use a barrier() instead" ends up
being a valid workaround. For the pte case, that may well be the
solution, for example (because what we really care about is not so
much "it's an atomic access" but "it's still the same that we
originally assumed"). Sometimes we want ACCESS_ONCE() because we
really want an atomic value (and we just don't care if it's old or
new), but sometimes it's really because we don't want the compiler to
re-load it and possibly see two different values - one that we check,
and one that we actually use (and then a barrier() would generally be
perfectly sufficient).

Adding some more people to the discussion just to see if anybody else
has comments about ACCESS_ONCE() on aggregate types.

(Btw, it's not just aggregate types, even non-aggregate types like
"long long" are not necessarily safe, to give the same 64-bit on
x86-32 example. So adding an assert that it's smaller or equal in size
to a "long" might also not be unreasonable.)

            Linus

---
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d5ad7b1118fc..63e82f1dfc1a 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -378,7 +378,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  * use is to mediate communication between process-level code and irq/NMI
  * handlers, all running on the same CPU.
  */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+#define get_scalar_volatile_pointer(x) ({ \
+	typeof(x) *__p = &(x); \
+	volatile typeof(x) *__vp = __p; \
+	(void)(long)*__p; __vp; })
+#define ACCESS_ONCE(x) (*get_scalar_volatile_pointer(x))

 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 9295016485c9..b7e6825af5e3 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -105,7 +105,7 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	arch_spinlock_t old, new;

-	old.tickets = ACCESS_ONCE(lock->tickets);
+	old.head_tail = ACCESS_ONCE(lock->head_tail);
 	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
 		return 0;

@@ -162,16 +162,14 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 static inline int arch_spin_i
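The patch is cut off above. One way to bolt the size assert Linus
mentions onto his sketch, keeping ACCESS_ONCE() usable as an lvalue;
this is illustrative only (BUILD_BUG_ON is the stock kernel helper, the
rest follows the sketch above):

	#define get_scalar_volatile_pointer(x) ({ \
		typeof(x) *__p = &(x); \
		volatile typeof(x) *__vp = __p; \
		/* rejects "long long" on 32-bit, per the Btw above */ \
		BUILD_BUG_ON(sizeof(x) > sizeof(long)); \
		/* the cast still rejects aggregates at compile time */ \
		(void)(long)*__p; __vp; })
	#define ACCESS_ONCE(x) (*get_scalar_volatile_pointer(x))

Because the statement expression yields a pointer that is dereferenced
outside the braces, ACCESS_ONCE(x) = val continues to work.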
Re: [PATCH 05/13] KVM: Update IRTE according to guest interrupt configuration changes
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote: > When guest changes its interrupt configuration (such as, vector, etc.) > for direct-assigned devices, we need to update the associated IRTE > with the new guest vector, so external interrupts from the assigned > devices can be injected to guests without VM-Exit. > > The current method of handling guest lowest priority interrtups > is to use a counter 'apic_arb_prio' for each VCPU, we choose the > VCPU with smallest 'apic_arb_prio' and then increase it by 1. > However, for VT-d PI, we cannot re-use this, since we no longer > have control to 'apic_arb_prio' with posted interrupt direct > delivery by Hardware. > > Here, we introduce a similiar way with 'apic_arb_prio' to handle > guest lowest priority interrtups when VT-d PI is used. Here is the > ideas: > - Each VCPU has a counter 'round_robin_counter'. > - When guests sets an interrupts to lowest priority, we choose > the VCPU with smallest 'round_robin_counter' as the destination, > then increase it. > > Signed-off-by: Feng Wu > --- > arch/x86/include/asm/irq_remapping.h |6 ++ > arch/x86/include/asm/kvm_host.h |2 + > arch/x86/kvm/vmx.c | 12 +++ > arch/x86/kvm/x86.c | 11 +++ > drivers/iommu/amd_iommu.c|6 ++ > drivers/iommu/intel_irq_remapping.c | 28 +++ > drivers/iommu/irq_remapping.c|9 ++ > drivers/iommu/irq_remapping.h|3 + > include/linux/dmar.h | 26 ++ > include/linux/kvm_host.h | 22 + > include/uapi/linux/kvm.h |1 + > virt/kvm/assigned-dev.c | 141 > ++ > virt/kvm/irq_comm.c |4 +- > virt/kvm/irqchip.c | 11 --- > 14 files changed, 269 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/include/asm/irq_remapping.h > b/arch/x86/include/asm/irq_remapping.h > index a3cc437..32d6cc4 100644 > --- a/arch/x86/include/asm/irq_remapping.h > +++ b/arch/x86/include/asm/irq_remapping.h > @@ -51,6 +51,7 @@ extern void compose_remapped_msi_msg(struct pci_dev *pdev, >unsigned int irq, unsigned int dest, >struct msi_msg *msg, u8 hpet_id); > extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); > +extern int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector); > extern void panic_if_irq_remap(const char *msg); > extern bool setup_remapped_irq(int irq, > struct irq_cfg *cfg, > @@ -88,6 +89,11 @@ static inline int setup_hpet_msi_remapped(unsigned int > irq, unsigned int id) > return -ENODEV; > } > > +static inline int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 > vector) > +{ > + return -ENODEV; > +} > + > static inline void panic_if_irq_remap(const char *msg) > { > } > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 6ed0c30..0630161 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -358,6 +358,7 @@ struct kvm_vcpu_arch { > struct kvm_lapic *apic;/* kernel irqchip context */ > unsigned long apic_attention; > int32_t apic_arb_prio; > + int32_t round_robin_counter; > int mp_state; > u64 ia32_misc_enable_msr; > bool tpr_access_reporting; > @@ -771,6 +772,7 @@ struct kvm_x86_ops { > int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); > > void (*sched_in)(struct kvm_vcpu *kvm, int cpu); > + u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu); > }; > > struct kvm_arch_async_pf { > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index a4670d3..ae91b72 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -544,6 +544,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu > *vcpu) > return container_of(vcpu, struct vcpu_vmx, vcpu); > } > > +struct pi_desc 
*vcpu_to_pi_desc(struct kvm_vcpu *vcpu) > +{ > + return &(to_vmx(vcpu)->pi_desc); > +} > + > #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) > #define FIELD(number, name) [number] = VMCS12_OFFSET(name) > #define FIELD64(number, name)[number] = VMCS12_OFFSET(name), \ > @@ -4280,6 +4285,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu > *vcpu) > return; > } > > +static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu) > +{ > + return __pa((u64)vcpu_to_pi_desc(vcpu)); > +} > + > /* > * Set up the vmcs's constant host-state fields, i.e., host-state fields that > * will not change in the lifetime of the guest. > @@ -9232,6 +9242,8 @@ static struct kvm_x86_ops vmx_x86_ops = { > .check_nested_events = vmx_check_nested_events, > > .sched_in = vmx_sched_in, > + > + .get_pi_desc_addr = vmx_get_pi_desc_addr, > }; > > static int __init vmx_init(void) > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > inde
Re: [PATCH 02/13] KVM: Initialize VT-d Posted-Interrtups Descriptor
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote: > This patch initialize the VT-d Posted-interrupt Descritpor. > > Signed-off-by: Feng Wu > --- > arch/x86/include/asm/irq_remapping.h |1 + > arch/x86/kernel/apic/apic.c |1 + > arch/x86/kvm/vmx.c | 56 - > 3 files changed, 56 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/include/asm/irq_remapping.h > b/arch/x86/include/asm/irq_remapping.h > index b7747c4..a3cc437 100644 > --- a/arch/x86/include/asm/irq_remapping.h > +++ b/arch/x86/include/asm/irq_remapping.h > @@ -57,6 +57,7 @@ extern bool setup_remapped_irq(int irq, > struct irq_chip *chip); > > void irq_remap_modify_chip_defaults(struct irq_chip *chip); > +extern int irq_post_enabled; > > #else /* CONFIG_IRQ_REMAP */ > > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c > index ba6cc04..987408d 100644 > --- a/arch/x86/kernel/apic/apic.c > +++ b/arch/x86/kernel/apic/apic.c > @@ -162,6 +162,7 @@ __setup("apicpmtimer", setup_apicpmtimer); > #endif > > int x2apic_mode; > +EXPORT_SYMBOL_GPL(x2apic_mode); > #ifdef CONFIG_X86_X2APIC > /* x2apic enabled before OS handover */ > int x2apic_preenabled; > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 3e556c6..a4670d3 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -45,6 +45,7 @@ > #include > #include > #include > +#include > > #include "trace.h" > > @@ -408,13 +409,32 @@ struct nested_vmx { > }; > > #define POSTED_INTR_ON 0 > +#define POSTED_INTR_SN 1 > + > /* Posted-Interrupt Descriptor */ > struct pi_desc { > u32 pir[8]; /* Posted interrupt requested */ > - u32 control;/* bit 0 of control is outstanding notification bit */ > - u32 rsvd[7]; > + union { > + struct { > + u64 on : 1, > + sn : 1, > + rsvd_1 : 13, > + ndm : 1, > + nv : 8, > + rsvd_2 : 8, > + ndst: 32; > + }; > + u64 control; > + }; > + u32 rsvd[6]; > } __aligned(64); > > +static void pi_clear_sn(struct pi_desc *pi_desc) > +{ > + return clear_bit(POSTED_INTR_SN, > + (unsigned long *)&pi_desc->control); > +} > + > static bool pi_test_and_set_on(struct pi_desc *pi_desc) > { > return test_and_set_bit(POSTED_INTR_ON, > @@ -4396,6 +4416,33 @@ static void ept_set_mmio_spte_mask(void) > kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); > } > > +static bool pi_desc_init(struct vcpu_vmx *vmx) > +{ > + unsigned int dest; > + > + if (irq_post_enabled == 0) > + return true; > + > + /* > + * Initialize Posted-Interrupt Descriptor > + */ > + > + pi_clear_sn(&vmx->pi_desc); > + vmx->pi_desc.nv = POSTED_INTR_VECTOR; > + > + /* Physical mode for Notificaiton Event */ > + vmx->pi_desc.ndm = 0; > + dest = cpu_physical_id(vmx->vcpu.cpu); > + > + if (x2apic_mode) > + vmx->pi_desc.ndst = dest; > + else > + vmx->pi_desc.ndst = (dest << 8) & 0xFF00; > + > + return true; Why does this bother to return anything since it can only return true? > +} > + > + > /* > * Sets up the vmcs for emulated real mode. > */ > @@ -4439,6 +4486,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) > > vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); > vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); > + > + if (!pi_desc_init(vmx)) { And therefore this cannot happen. 
> +		printk(KERN_ERR "Initialize PI descriptor error!\n");
> +		return 1;

This is the wrong error anyway, vmx_create_vcpu() returns ERR_PTR(1)
which fails the reverse IS_ERR().

Thanks,
Alex

> +	}
>  }
>
>  	if (ple_gap) {
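For context on why ERR_PTR(1) misbehaves, a sketch based on the stock
include/linux/err.h helpers:

	/* IS_ERR() only treats the top 4095 values of the address
	 * space as errors: */
	#define MAX_ERRNO	4095
	#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

	/* ERR_PTR(1) is just (void *)1, so IS_ERR(ERR_PTR(1)) is false
	 * and the caller would dereference a bogus "vcpu" pointer.
	 * Returning ERR_PTR(-EINVAL), or propagating a real -errno, is
	 * what the IS_ERR() convention expects. */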
Re: [PATCH] x86: Update VT-d Posted-Interrupts related information
On Mon, 2014-11-10 at 14:31 +0800, Feng Wu wrote: > VT-d Posted-Interrupts(PI) is an enhancement to CPU side Posted-Interrupt. > With VT-d Posted-Interrupts enabled, external interrupts from > direct-assigned devices can be delivered to guests without VMM > involvement when guest is running in non-root mode. > > If VT-d PI is supported by KVM, we need to update the IRTE with > the new guest interrtup configuration. > > Signed-off-by: Feng Wu > --- > hw/i386/kvm/pci-assign.c | 24 > linux-headers/linux/kvm.h |2 ++ > target-i386/kvm.c |5 + > target-i386/kvm_i386.h|1 + > 4 files changed, 32 insertions(+), 0 deletions(-) > > diff --git a/hw/i386/kvm/pci-assign.c b/hw/i386/kvm/pci-assign.c > index bb206da..e55a99b 100644 > --- a/hw/i386/kvm/pci-assign.c > +++ b/hw/i386/kvm/pci-assign.c > @@ -1005,6 +1005,12 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev) > assigned_dev->intx_route.mode = PCI_INTX_DISABLED; > assigned_dev->intx_route.irq = -1; > assigned_dev->assigned_irq_type = ASSIGNED_IRQ_MSI; > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < 0) { > +perror("assigned_dev_update_msi: kvm_device_pi_update"); > +} > +} I don't really understand this ioctl, we need to call into KVM to set the new IRQ routing anyway, why can't KVM figure out that it needs to update the posted interrupt config at the same time? Also, we'll need to support this through vfio-pci. Thanks, Alex > } else { > Error *local_err = NULL; > > @@ -1029,6 +1035,12 @@ static void assigned_dev_update_msi_msg(PCIDevice > *pci_dev) > > kvm_irqchip_update_msi_route(kvm_state, assigned_dev->msi_virq[0], > msi_get_message(pci_dev, 0)); > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < 0) { > +perror("assigned_dev_update_msi_msg: kvm_device_pi_update"); > +} > +} > } > > static bool assigned_dev_msix_masked(MSIXTableEntry *entry) > @@ -1149,6 +1161,12 @@ static void assigned_dev_update_msix(PCIDevice > *pci_dev) > perror("assigned_dev_enable_msix: assign irq"); > return; > } > + > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, assigned_dev->dev_id) < > 0) { > +perror("assigned_dev_update_msix: kvm_device_pi_update"); > +} > +} > } > assigned_dev->intx_route.mode = PCI_INTX_DISABLED; > assigned_dev->intx_route.irq = -1; > @@ -1618,6 +1636,12 @@ static void assigned_dev_msix_mmio_write(void *opaque, > hwaddr addr, > if (ret) { > error_report("Error updating irq routing entry (%d)", > ret); > } > +if (kvm_check_extension(kvm_state, KVM_CAP_PI)) { > +if (kvm_device_pi_update(kvm_state, adev->dev_id) < 0) { > +perror("assigned_dev_update_msi_msg: " > +"kvm_device_pi_update"); > +} > +} > } > } > } > diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h > index 2669938..b34f3c4 100644 > --- a/linux-headers/linux/kvm.h > +++ b/linux-headers/linux/kvm.h > @@ -765,6 +765,7 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_PPC_FIXUP_HCALL 103 > #define KVM_CAP_PPC_ENABLE_HCALL 104 > #define KVM_CAP_CHECK_EXTENSION_VM 105 > +#define KVM_CAP_PI 106 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1020,6 +1021,7 @@ struct kvm_s390_ucas_mapping { > #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct > kvm_xen_hvm_config) > #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) > #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) > +#define KVM_ASSIGN_DEV_PI_UPDATE _IOR(KVMIO, 0x7d, __u32) > /* Available with KVM_CAP_PIT_STATE2 */ > #define 
KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) > #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > index ccf36e8..3dd8e5e 100644 > --- a/target-i386/kvm.c > +++ b/target-i386/kvm.c > @@ -2660,6 +2660,11 @@ int kvm_device_msi_assign(KVMState *s, uint32_t > dev_id, int virq) >KVM_DEV_IRQ_GUEST_MSI, virq); > } > > +int kvm_device_pi_update(KVMState *s, uint32_t dev_id) > +{ > +return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_PI_UPDATE, &dev_id); > +} > + > int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id) > { > return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI | > diff --git a/target-i386/kv
Re: [PATCH 13/13] iommu/vt-d: Add a command line parameter for VT-d posted-interrupts
On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote:
> Enable VT-d Posted-Interrupts and add a command line
> parameter for it.
>
> Signed-off-by: Feng Wu
> ---
>  drivers/iommu/irq_remapping.c | 9 ++++++++-
>  1 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
> index 0e36860..3cb9429 100644
> --- a/drivers/iommu/irq_remapping.c
> +++ b/drivers/iommu/irq_remapping.c
> @@ -23,7 +23,7 @@ int irq_remap_broken;
>  int disable_sourceid_checking;
>  int no_x2apic_optout;
>
> -int disable_irq_post = 1;
> +int disable_irq_post = 0;
>  int irq_post_enabled = 0;
>  EXPORT_SYMBOL_GPL(irq_post_enabled);
>
> @@ -206,6 +206,13 @@ static __init int setup_irqremap(char *str)
>  }
>  early_param("intremap", setup_irqremap);
>
> +static __init int setup_nointpost(char *str)
> +{
> +	disable_irq_post = 1;
> +	return 0;
> +}
> +early_param("nointpost", setup_nointpost);
> +

Documentation/kernel-parameters.txt?

Thanks,
Alex

>  void __init setup_irq_remapping_ops(void)
>  {
>  	remap_ops = &intel_irq_remap_ops;
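What Alex is asking for would be an entry along these lines; a sketch
only, not wording that was merged anywhere:

	nointpost	[X86,Intel-IOMMU]
			Disable VT-d posted-interrupt handling of
			interrupts from direct-assigned devices, even
			when the hardware supports it.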
Hotplugging usb devices on guest boot
Hello list,

The short version is: how do I best use an external daemon to hotplug
all the usb devices for each guest when the guest boots? The solution
needs to work for guests started through virt-manager (as folks with
minimal knowledge of the system may need to start VMs).

I thought there might be a way to get signals about which guests are
being started from libvirtd, but I haven't been able to find any
information about that. Another option might be using the hooks at
/etc/libvirt/hooks/qemu (http://www.libvirt.org/hooks.html#names).
Since libvirt functions should not be called from within this script,
I should have it open a socket and send a signal to my custom daemon,
which would wait a short amount of time before invoking "virsh
attach-device"?

Background: I recently got a multiseat system working using KVM (one
computer serving guest OS's to monitors through VGA passthrough, one
GPU per guest). It works great! One issue is that the input devices
need to be hard-coded into the XML files for each seat. Sometimes, on
system reboot, the device numbers assigned to the various mice and
keyboards get changed, and I have to check lsusb and fix the XML.
Also, hotplugging is obviously not automatic.

To make hotplugging work the way I want, I have written a python
daemon to notice new usb devices through udev. It recognizes the port,
and can thus send new devices to guests based on usb ports (each guest
is associated with a hub plugged into a port). It would be best if
this daemon could also let each guest know about the usb devices that
are already plugged in to its hub on guest boot. That way, there would
be no usb devices hard-coded into the XML. What is the best way to do
this?

I am using Ubuntu 14.04 as the host OS.

Thanks,
Elliot
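For reference, the attach step from such a daemon generally amounts to
feeding virsh a hostdev snippet. A sketch with made-up guest name and
bus/device numbers (note that the qemu hook is invoked with the guest
name and an operation such as "started" as its leading arguments, which
is enough for the hook to poke a daemon over a socket):

	<!-- seat1-kbd.xml: bus/device as reported by lsusb; these are
	     the unstable numbers, which is why the daemon should look
	     them up by port at attach time -->
	<hostdev mode='subsystem' type='usb' managed='yes'>
	  <source>
	    <address bus='3' device='7'/>
	  </source>
	</hostdev>

	# virsh attach-device seat1 seat1-kbd.xml --live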
[PATCH] KVM: PPC: BookE: Improve irq inject tracepoint
When injecting an IRQ, we only document which IRQ priority (which
translates to IRQ type) gets injected. However, when reading traces you
don't necessarily have all the numbers in your head to know which IRQ
really is meant.

This patch converts the IRQ number field to a symbolic name that is in
sync with the respective define. That way it's a lot easier for readers
to figure out what interrupt gets injected.

Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/trace_booke.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
index f7537cf..5734c0a 100644
--- a/arch/powerpc/kvm/trace_booke.h
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -151,6 +151,47 @@ TRACE_EVENT(kvm_booke206_ref_release,
 		__entry->pfn, __entry->flags)
 );

+#ifdef CONFIG_SPE_POSSIBLE
+#define kvm_trace_symbol_irqprio_spe \
+	{BOOKE_IRQPRIO_SPE_UNAVAIL, "SPE_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SPE_FP_DATA, "SPE_FP_DATA"}, \
+	{BOOKE_IRQPRIO_SPE_FP_ROUND, "SPE_FP_ROUND"},
+#else
+#define kvm_trace_symbol_irqprio_spe
+#endif
+
+#ifdef CONFIG_PPC_E500MC
+#define kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_ALTIVEC_UNAVAIL, "ALTIVEC_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_ALTIVEC_ASSIST, "ALTIVEC_ASSIST"},
+#else
+#define kvm_trace_symbol_irqprio_e500mc
+#endif
+
+#define kvm_trace_symbol_irqprio \
+	kvm_trace_symbol_irqprio_spe \
+	kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_DATA_STORAGE, "DATA_STORAGE"}, \
+	{BOOKE_IRQPRIO_INST_STORAGE, "INST_STORAGE"}, \
+	{BOOKE_IRQPRIO_ALIGNMENT, "ALIGNMENT"}, \
+	{BOOKE_IRQPRIO_PROGRAM, "PROGRAM"}, \
+	{BOOKE_IRQPRIO_FP_UNAVAIL, "FP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SYSCALL, "SYSCALL"}, \
+	{BOOKE_IRQPRIO_AP_UNAVAIL, "AP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_DTLB_MISS, "DTLB_MISS"}, \
+	{BOOKE_IRQPRIO_ITLB_MISS, "ITLB_MISS"}, \
+	{BOOKE_IRQPRIO_MACHINE_CHECK, "MACHINE_CHECK"}, \
+	{BOOKE_IRQPRIO_DEBUG, "DEBUG"}, \
+	{BOOKE_IRQPRIO_CRITICAL, "CRITICAL"}, \
+	{BOOKE_IRQPRIO_WATCHDOG, "WATCHDOG"}, \
+	{BOOKE_IRQPRIO_EXTERNAL, "EXTERNAL"}, \
+	{BOOKE_IRQPRIO_FIT, "FIT"}, \
+	{BOOKE_IRQPRIO_DECREMENTER, "DECREMENTER"}, \
+	{BOOKE_IRQPRIO_PERFORMANCE_MONITOR, "PERFORMANCE_MONITOR"}, \
+	{BOOKE_IRQPRIO_EXTERNAL_LEVEL, "EXTERNAL_LEVEL"}, \
+	{BOOKE_IRQPRIO_DBELL, "DBELL"}, \
+	{BOOKE_IRQPRIO_DBELL_CRIT, "DBELL_CRIT"} \
+
 TRACE_EVENT(kvm_booke_queue_irqprio,
 	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
 	TP_ARGS(vcpu, priority),
@@ -167,8 +208,11 @@ TRACE_EVENT(kvm_booke_queue_irqprio,
 		__entry->pending	= vcpu->arch.pending_exceptions;
 	),

-	TP_printk("vcpu=%x prio=%x pending=%lx",
-		  __entry->cpu_nr, __entry->priority, __entry->pending)
+	TP_printk("vcpu=%x prio=%s pending=%lx",
+		  __entry->cpu_nr,
+		  __print_symbolic(__entry->priority, kvm_trace_symbol_irqprio),
+		  __entry->pending)
+);
 );

 #endif
--
1.8.1.4
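For readers unfamiliar with the helper used above: __print_symbolic()
renders a numeric trace field through a {value, "name"} table in the
formatted trace output. A generic illustration with made-up values,
unrelated to the patch:

	/* a raw __entry->state of 2 is printed as "BLOCKED" */
	TP_printk("state=%s",
		  __print_symbolic(__entry->state,
				   { 1, "RUNNING" },
				   { 2, "BLOCKED" }))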
Re: compiler bug gcc4.6/4.7 with ACCESS_ONCE and workarounds
On Mon, Nov 10, 2014 at 01:07:33PM -0800, Linus Torvalds wrote: > On Mon, Nov 10, 2014 at 12:18 PM, Christian Borntraeger > wrote: > > > > Now: I can reproduces belows miscompile on gcc46 and gcc 47 > > gcc 45 seems ok, gcc 48 is fixed. This makes blacklisting > > a bit hard, especially since it is not limited to s390, but > > covers all architectures. > > In essence ACCESS_ONCE will not work reliably on aggregate > > types with gcc 4.6 and gcc 4.7. > > In Linux we seem to use ACCESS_ONCE mostly on scalar types, > > below code is an example were we dont - and break. > > Hmm. I think we should see how painful it would be to make it a rule > that ACCESS_ONCE() only works on scalar types. For whatever it is worth, I have been assuming that ACCESS_ONCE() was only ever supposed to work on scalar types. And one of the reasons that I have been giving the pre-EV56 Alpha guys a hard time is because I would like us to be able to continue using ACCESS_ONCE() on 8-bit and 16-bit quantities as well. > Even in the actual code you show as an example, the "fix" is really to > use the "unsigned long val" member of the union for the ACCESS_ONCE(). > And that seems to be true in many other cases too. > > So before blacklisting any compilers, let's first see if > > (a) we can actually make it a real rule that we only use ACCESS_ONCE on > scalars > (b) we can somehow enforce this with a compiler warning/error for mis-uses > > For example, the attached patch works for some cases, but shows how we > use ACCESS_ONCE() on pointers to pte_t's etc, so it doesn't come even > close to compiling the whole kernel. But I wonder how painful that > would be to change.. The places where it complains are actually > somewhat debatable to begin with, like: > > - handle_pte_fault(.. pte_t *pte ..): > > entry = ACCESS_ONCE(*pte); > > and the thing is, "pte" is actually possibly an 8-byte entity on > x86-32, and that ACCESS_ONCE() fundamentally will be two 32-byte > reads. > > So there is a very valid argument for saying "well, you shouldn't do > that, then", and that we might be better off cleaning up our > ACCESS_ONCE() uses, than to just blindly blacklist compilers. > > NOTE! I'm not at all advocating the attached patch. I'm sending it out > white-space damaged on purpose, it's more of a "hey, something like > this might be the direction we want to go in", with the spinlock.h > part of the patch also acting as an example of the kind of changes the > "ACCESS_ONCE() only works on scalars" rule would require. > > So I do agree with Heiko that we generally don't want to work around > compiler bugs if we can avoid it. But sometimes the compiler bugs do > end up saying "your'e doing something very fragile". Maybe we should > try to be less fragile here. > > And in your example, the whole > > old = ACCESS_ONCE(*ic); > > *could* just be a > > old->val = ACCESS_ONCE(ic->val); > > the same way the x86 spinlock.h changes below. > > I did *not* try to see how many other cases we have. It's possible > that your "don't use ACCESS_ONCE, use a barrier() instead" ends up > being a valid workaround. For the pte case, that may well be the > solution, for example (because what we really care about is not so > much "it's an atomic access" but "it's still the same that we > originally assumed"). 
Sometimes we want ACCESS_ONCE() because we > really want an atomic value (and we just don't care if it's old or > new), but sometimes it's really because we don't want the compiler to > re-load it and possibly see two different values - one that we check, > and one that we actually use (and then a barrier() would generally be > perfectly sufficient) > > Adding some more people to the discussion just to see if anybody else > has comments about ACCESS_ONCE() on aggregate types. > > (Btw, it's not just aggregate types, even non-aggregate types like > "long long" are not necessarily safe, to give the same 64-bit on > x86-32 example. So adding an assert that it's smaller or equal in size > to a "long" might also not be unreasonable) Good point on "long long" on 32-bit systems. Another reason to avoid trying to do anything that even smells atomic on non-machine-sized/aligned variables is that the compiler guys' current reaction to this sort of situation is "No problem!!! The compiler can just invent a lock to guard all such accesses!" I don't think that we want to go there. >Linus > > --- > diff --git a/include/linux/compiler.h b/include/linux/compiler.h > index d5ad7b1118fc..63e82f1dfc1a 100644 > --- a/include/linux/compiler.h > +++ b/include/linux/compiler.h > @@ -378,7 +378,11 @@ void ftrace_likely_update(struct > ftrace_branch_data *f, int val, int expect); > * use is to mediate communication between process-level code and irq/NMI > * handlers, all running on the same CPU. > */ > -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) > +#define get_scalar_volatile_pointer(x) ({ \ > + type
Re: [PATCH v4 10/25] virtio: add API to enable VQs early
On 10/13/2014 12:50 AM, Michael S. Tsirkin wrote: virtio spec 0.9.X requires DRIVER_OK to be set before VQs are used, but some drivers use VQs before probe function returns. Since DRIVER_OK is set after probe, this violates the spec. Even though under virtio 1.0 transitional devices support this behaviour, we want to make it possible for those early callers to become spec compliant and eventually support non-transitional devices. Add API for drivers to call before using VQs. Sets DRIVER_OK internally. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/linux/virtio_config.h | 17 + 1 file changed, 17 insertions(+) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index e8f8f71..e36403b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,6 +109,23 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, return vq; } +/** + * virtio_device_ready - enable vq use in probe function + * @vdev: the device + * + * Driver must call this to use vqs in the probe function. + * + * Note: vqs are enabled automatically after probe returns. + */ +static inline +void virtio_device_ready(struct virtio_device *dev) +{ + unsigned status = dev->config->get_status(dev); + + BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); + dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); +} Getting a BUG when booting via KVM, host Fedora 20, guest Fedora 20. my config is at: https://fedorapeople.org/~grover/config-20141110 [0.828494] [ cut here ] [0.829039] kernel BUG at /home/agrover/git/kernel/include/linux/virtio_config.h:125! [0.831266] invalid opcode: [#1] SMP DEBUG_PAGEALLOC [0.831266] Modules linked in: [0.831266] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 3.18.0-rc4 #120 [0.831266] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [0.831266] Workqueue: events control_work_handler [0.831266] task: 88003cd98000 ti: 88003cd94000 task.ti: 88003cd94000 [0.831266] RIP: 0010:[] [] add_port+0x264/0x410 [0.831266] RSP: :88003cd97c78 EFLAGS: 00010202 [0.831266] RAX: 0007 RBX: 88003c58c400 RCX: 0001 [0.831266] RDX: c132 RSI: 81a955e9 RDI: 0001c132 [0.831266] RBP: 88003cd97cc8 R08: R09: [0.831266] R10: 0001 R11: R12: 88003c58be00 [0.831266] R13: 0001 R14: 8800395ca800 R15: 88003c58c420 [0.831266] FS: () GS:88003fa0() knlGS: [0.831266] CS: 0010 DS: ES: CR0: 8005003b [0.831266] CR2: CR3: 01c11000 CR4: 06e0 [0.831266] DR0: DR1: DR2: [0.831266] DR3: DR6: fffe0ff0 DR7: 0400 [0.831266] Stack: [0.831266] 8801 0292 0001 [0.831266] 88003cd97cc8 88003dfa8a20 88003c58beb8 88003c58be10 [0.831266] 8800395a2000 88003cd97d38 8144531a [0.831266] Call Trace: [0.831266] [] control_work_handler+0x16a/0x3c0 [0.831266] [] ? process_one_work+0x208/0x500 [0.831266] [] process_one_work+0x2ac/0x500 [0.831266] [] ? process_one_work+0x208/0x500 [0.831266] [] worker_thread+0x2ce/0x4e0 [0.831266] [] ? process_one_work+0x500/0x500 [0.831266] [] kthread+0xf8/0x100 [0.831266] [] ? trace_hardirqs_on+0xd/0x10 [0.831266] [] ? kthread_stop+0x140/0x140 [0.831266] [] ret_from_fork+0x7c/0xb0 [0.831266] [] ? 
kthread_stop+0x140/0x140
[0.831266] Code: c7 c2 48 31 01 83 48 c7 c6 e9 55 a9 81 e8 55 b4 c6 ff 4d 8b b4 24 58 01 00 00 49 8b 86 e8 04 00 00 4c 89 f7 ff 50 10 a8 04 74 0c <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 49 8b 96 e8 04 00 00 83 c8
[0.831266] RIP  [] add_port+0x264/0x410
[0.831266]  RSP
[0.878202] ---[ end trace f98fbb172cc7bbf4 ]---

Thanks -- Andy
Re: [PATCH] kvm: x86: add trace event for pvclock updates
On Wed, Nov 05, 2014 at 11:46:42AM -0800, David Matlack wrote:
> The new trace event records:
>   * the id of vcpu being updated
>   * the pvclock_vcpu_time_info struct being written to guest memory
>
> This is useful for debugging pvclock bugs, such as the bug fixed by
> "[PATCH] kvm: x86: Fix kvm clock versioning.".
>
> Signed-off-by: David Matlack

So you actually hit that bug in practice? Can you describe the
scenario?
Re: [PATCH] virtio_console: move early VQ enablement
On 21 October 2014 03:09, Josh Boyer wrote:
> On Mon, Oct 20, 2014 at 10:05 AM, Michael S. Tsirkin wrote:
>> On Mon, Oct 20, 2014 at 03:58:49PM +0200, Cornelia Huck wrote:
>>> Commit f5866db6 (virtio_console: enable VQs early) tried to make
>>> sure that DRIVER_OK was set when virtio_console started using its
>>> virtqueues. Doing this in add_port(), however, means that we try
>>> to set DRIVER_OK again when a port is dynamically added after
>>> the probe function is done.
>>>
>>> Let's move virtio_device_ready() to the probe function just before
>>> trying to use the virtqueues instead. This is fine as nothing can
>>> fail in between.
>>>
>>> Reported-by: Thomas Graf
>>> Reviewed-by: Michael S. Tsirkin
>>> Signed-off-by: Cornelia Huck
>>
>> Thanks!
>>
>> Acked-by: Michael S. Tsirkin
>> Tested-by: Michael S. Tsirkin
>
> This fixed my KVM guest boot issue with 3.18-rc1. Thanks for such a
> quick fix.
>
> Tested-by: Josh Boyer

ping

So who's merging this? Rusty? It still happens in -rc4.

Dave.
Re: several messages
On 2014/11/11 2:15, Thomas Gleixner wrote:
> On Mon, 10 Nov 2014, Feng Wu wrote:
>
>> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
>> With VT-d Posted-Interrupts enabled, external interrupts from
>> direct-assigned devices can be delivered to guests without VMM
>> intervention when guest is running in non-root mode.
>
> Can you please talk to Jiang and synchronize your work with his
> refactoring of the x86 interrupt handling subsystem.
>
> I want this stuff cleaned up first before we add new stuff to it.

Hi Thomas,

Just talked with Feng; we will focus on the refactoring first and then
add posted interrupt support.

Regards!
Gerry

> Thanks,
>
> 	tglx
Re: [PATCH v4] KVM: x86: fix access memslots w/o hold srcu read lock
Hi Wanpeng,

I think I have totally missed this thread.

I enabled lockdep and RCU debugging, and tried on 3.18-rc1, but I
didn't get the warning. My steps are:

1. Use numactl to bind a qemu process to node1.
2. Offline all node1 memory. And the qemu process is still running.

Would you please tell me how you reproduced it?

Thanks.

On 11/02/2014 03:07 PM, Wanpeng Li wrote:

The srcu read lock must be held while accessing memslots (e.g. when
using gfn_to_* functions), however, commit c24ae0dcd3e8 ("kvm: x86:
Unpin and remove kvm_arch->apic_access_page") calls gfn_to_page() in
kvm_vcpu_reload_apic_access_page() w/o holding it in the
vmx_vcpu_reset() path, which leads to a suspicious
rcu_dereference_check() usage warning. This patch fixes it by holding
the srcu read lock in all kvm_vcpu_reset() call paths.

[ INFO: suspicious RCU usage. ]
3.18.0-rc2-test2+ #70 Not tainted
-------------------------------
include/linux/kvm_host.h:474 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
1 lock held by qemu-system-x86/2371:
 #0: (&vcpu->mutex){+.+...}, at: [] vcpu_load+0x20/0xd0 [kvm]

stack backtrace:
CPU: 4 PID: 2371 Comm: qemu-system-x86 Not tainted 3.18.0-rc2-test2+ #70
Hardware name: Dell Inc. OptiPlex 9010/0M9KCM, BIOS A12 01/10/2013
 0001 880209983ca8 816f514f 8802099b8990
 880209983cd8 810bd687 000fee00 880208a2c000
 880208a1 88020ef50040 880209983d08
Call Trace:
 [] dump_stack+0x4e/0x71
 [] lockdep_rcu_suspicious+0xe7/0x120
 [] gfn_to_memslot+0xd5/0xe0 [kvm]
 [] __gfn_to_pfn+0x33/0x60 [kvm]
 [] gfn_to_page+0x25/0x90 [kvm]
 [] kvm_vcpu_reload_apic_access_page+0x3c/0x80 [kvm]
 [] vmx_vcpu_reset+0x20c/0x460 [kvm_intel]
 [] kvm_vcpu_reset+0x15e/0x1b0 [kvm]
 [] kvm_arch_vcpu_setup+0x2c/0x50 [kvm]
 [] kvm_vm_ioctl+0x1d0/0x780 [kvm]
 [] ? __lock_is_held+0x54/0x80
 [] do_vfs_ioctl+0x300/0x520
 [] ? __fget+0x5/0x250
 [] ? __fget_light+0x2a/0xe0
 [] SyS_ioctl+0x81/0xa0
 [] system_call_fastpath+0x16/0x1b

Reported-by: Takashi Iwai
Reported-by: Alexei Starovoitov
Suggested-by: Paolo Bonzini
Signed-off-by: Wanpeng Li
---
v3 -> v4:
 * bypass the problem altogether by kvm_make_request
v2 -> v3:
 * take care of all vmx_vcpu_reset call paths
v1 -> v2:
 * just fix holding the srcu read lock in the vmx_vcpu_reset path

 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a0f78db..3e556c6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4579,7 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write32(TPR_THRESHOLD, 0);
 	}

-	kvm_vcpu_reload_apic_access_page(vcpu);
+	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

 	if (vmx_vm_has_apicv(vcpu->kvm))
 		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
Re: [PATCH v4 10/25] virtio: add API to enable VQs early
On Mon, Nov 10, 2014 at 04:45:09PM -0800, Andy Grover wrote: > On 10/13/2014 12:50 AM, Michael S. Tsirkin wrote: > >virtio spec 0.9.X requires DRIVER_OK to be set before > >VQs are used, but some drivers use VQs before probe > >function returns. > >Since DRIVER_OK is set after probe, this violates the spec. > > > >Even though under virtio 1.0 transitional devices support this > >behaviour, we want to make it possible for those early callers to become > >spec compliant and eventually support non-transitional devices. > > > >Add API for drivers to call before using VQs. > > > >Sets DRIVER_OK internally. > > > >Signed-off-by: Michael S. Tsirkin > >Reviewed-by: Cornelia Huck > >--- > > include/linux/virtio_config.h | 17 + > > 1 file changed, 17 insertions(+) > > > >diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h > >index e8f8f71..e36403b 100644 > >--- a/include/linux/virtio_config.h > >+++ b/include/linux/virtio_config.h > >@@ -109,6 +109,23 @@ struct virtqueue *virtio_find_single_vq(struct > >virtio_device *vdev, > > return vq; > > } > > > >+/** > >+ * virtio_device_ready - enable vq use in probe function > >+ * @vdev: the device > >+ * > >+ * Driver must call this to use vqs in the probe function. > >+ * > >+ * Note: vqs are enabled automatically after probe returns. > >+ */ > >+static inline > >+void virtio_device_ready(struct virtio_device *dev) > >+{ > >+unsigned status = dev->config->get_status(dev); > >+ > >+BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); > >+dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); > >+} > > Getting a BUG when booting via KVM, host Fedora 20, guest Fedora 20. > > my config is at: > > https://fedorapeople.org/~grover/config-20141110 > The fix is here: http://article.gmane.org/gmane.linux.kernel.virtualization/23324/raw I'm surprised it's not merged yet. Rusty, could you pick it up please? > > [0.828494] [ cut here ] > [0.829039] kernel BUG at > /home/agrover/git/kernel/include/linux/virtio_config.h:125! > [0.831266] invalid opcode: [#1] SMP DEBUG_PAGEALLOC > [0.831266] Modules linked in: > [0.831266] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 3.18.0-rc4 #120 > [0.831266] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [0.831266] Workqueue: events control_work_handler > [0.831266] task: 88003cd98000 ti: 88003cd94000 task.ti: > 88003cd94000 > [0.831266] RIP: 0010:[] [] > add_port+0x264/0x410 > [0.831266] RSP: :88003cd97c78 EFLAGS: 00010202 > [0.831266] RAX: 0007 RBX: 88003c58c400 RCX: > 0001 > [0.831266] RDX: c132 RSI: 81a955e9 RDI: > 0001c132 > [0.831266] RBP: 88003cd97cc8 R08: R09: > > [0.831266] R10: 0001 R11: R12: > 88003c58be00 > [0.831266] R13: 0001 R14: 8800395ca800 R15: > 88003c58c420 > [0.831266] FS: () GS:88003fa0() > knlGS: > [0.831266] CS: 0010 DS: ES: CR0: 8005003b > [0.831266] CR2: CR3: 01c11000 CR4: > 06e0 > [0.831266] DR0: DR1: DR2: > > [0.831266] DR3: DR6: fffe0ff0 DR7: > 0400 > [0.831266] Stack: > [0.831266] 8801 0292 > 0001 > [0.831266] 88003cd97cc8 88003dfa8a20 88003c58beb8 > 88003c58be10 > [0.831266] 8800395a2000 88003cd97d38 > 8144531a > [0.831266] Call Trace: > [0.831266] [] control_work_handler+0x16a/0x3c0 > [0.831266] [] ? process_one_work+0x208/0x500 > [0.831266] [] process_one_work+0x2ac/0x500 > [0.831266] [] ? process_one_work+0x208/0x500 > [0.831266] [] worker_thread+0x2ce/0x4e0 > [0.831266] [] ? process_one_work+0x500/0x500 > [0.831266] [] kthread+0xf8/0x100 > [0.831266] [] ? trace_hardirqs_on+0xd/0x10 > [0.831266] [] ? 
kthread_stop+0x140/0x140
> [0.831266]  [] ret_from_fork+0x7c/0xb0
> [0.831266]  [] ? kthread_stop+0x140/0x140
> [0.831266] Code: c7 c2 48 31 01 83 48 c7 c6 e9 55 a9 81 e8 55 b4 c6 ff 4d 8b b4 24 58 01 00 00 49 8b 86 e8 04 00 00 4c 89 f7 ff 50 10 a8 04 74 0c <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 49 8b 96 e8 04 00 00 83 c8
> [0.831266] RIP  [] add_port+0x264/0x410
> [0.831266]  RSP
> [0.878202] ---[ end trace f98fbb172cc7bbf4 ]---
>
> Thanks -- Andy
RE: several messages
> -----Original Message-----
> From: Jiang Liu [mailto:jiang@linux.intel.com]
> Sent: Tuesday, November 11, 2014 10:29 AM
> To: Thomas Gleixner; Wu, Feng
> Cc: g...@kernel.org; pbonz...@redhat.com; David Woodhouse;
> j...@8bytes.org; mi...@redhat.com; H. Peter Anvin; x...@kernel.org;
> kvm@vger.kernel.org; io...@lists.linux-foundation.org; LKML
> Subject: Re: several messages
>
> On 2014/11/11 2:15, Thomas Gleixner wrote:
> > On Mon, 10 Nov 2014, Feng Wu wrote:
> >
> >> VT-d Posted-Interrupts is an enhancement to CPU side Posted-Interrupt.
> >> With VT-d Posted-Interrupts enabled, external interrupts from
> >> direct-assigned devices can be delivered to guests without VMM
> >> intervention when guest is running in non-root mode.
> >
> > Can you please talk to Jiang and synchronize your work with his
> > refactoring of the x86 interrupt handling subsystem.
> >
> > I want this stuff cleaned up first before we add new stuff to it.
>
> Hi Thomas,
> 	Just talked with Feng; we will focus on the refactoring first and
> then add posted interrupt support.
> Regards!
> Gerry

No problem!

Thanks,
Feng

> >
> > Thanks,
> >
> > tglx