Re: [PATCH 0/5] kvm: memslots lookup optimization

2014-12-02 Thread Igor Mammedov
On Mon, 01 Dec 2014 18:38:34 +0100
Paolo Bonzini  wrote:

> 
> 
> On 01/12/2014 18:29, Igor Mammedov wrote:
> > Series speed-ups GFN to memslot lookup time by:
> >  * introducing LRU cache, which improves lookup time for
> >same slot workload (typically boot time of Windows and Linux guest)
> >  * switching to binary search for GFN to memslot lookup,
> >improving lookup time with large amount of memory slots
> > 
> > Igor Mammedov (5):
> >   kvm: update_memslots: drop not needed check for the same number of
> > pages
> >   kvm: update_memslots: drop not needed check for the same slot
> >   kvm: search_memslots: add simple LRU memslot caching
> >   kvm: change memslot sorting rule from size to GFN
> >   kvm: optimize GFN to memslot lookup with large slots amount
> > 
> >  include/linux/kvm_host.h | 28 +++-
> >  virt/kvm/kvm_main.c  | 46 
> > ++
> >  2 files changed, 49 insertions(+), 25 deletions(-)
> > 
> 
> Applied patches 1-3 for now, I'm not in the mood for proving that the
> binary search is correct. :)
Following write up could help with improving a proving mood :)
http://community.topcoder.com/tc?module=Static&d1=tutorials&d2=binarySearch


> 
> Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] kvm: vmx: enable intel xsaves for guest

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 07:14, Wanpeng Li wrote:
> Expose intel xsaves feature to guest.
> 
> Signed-off-by: Wanpeng Li 
> ---
> v1 -> v2:
>  *auto switch msr ia32_xss if this msr is present
> 
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/include/asm/vmx.h  |  3 +++
>  arch/x86/include/uapi/asm/vmx.h |  6 +-
>  arch/x86/kvm/vmx.c  | 35 ++-
>  4 files changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2896dbc..95dde42 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
>   int mp_state;
>   u64 ia32_misc_enable_msr;
>   bool tpr_access_reporting;
> + u64 ia32_xss;

The patch is not getting/setting ia32_xss when the guest does
RDMSR/WRMSR.  You also need a QEMU patch to migrate XSS.

>   /*
>* Paging state of the vcpu
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index bcbfade..bdb79ef 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -69,6 +69,7 @@
>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING0x0400
>  #define SECONDARY_EXEC_ENABLE_INVPCID0x1000
>  #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
> +#define SECONDARY_EXEC_XSAVES0x0010
>  
>  
>  #define PIN_BASED_EXT_INTR_MASK 0x0001
> @@ -159,6 +160,8 @@ enum vmcs_field {
>   EOI_EXIT_BITMAP3_HIGH   = 0x2023,
>   VMREAD_BITMAP   = 0x2026,
>   VMWRITE_BITMAP  = 0x2028,
> + XSS_EXIT_BIMTAP = 0x202C,
> + XSS_EXIT_BIMTAP_HIGH= 0x202D,

s/BIMTAP/BITMAP/

>   GUEST_PHYSICAL_ADDRESS  = 0x2400,
>   GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
>   VMCS_LINK_POINTER   = 0x2800,
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 990a2fe..b813bf9 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -72,6 +72,8 @@
>  #define EXIT_REASON_XSETBV  55
>  #define EXIT_REASON_APIC_WRITE  56
>  #define EXIT_REASON_INVPCID 58
> +#define EXIT_REASON_XSAVES  63
> +#define EXIT_REASON_XRSTORS 64
>  
>  #define VMX_EXIT_REASONS \
>   { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
> @@ -116,6 +118,8 @@
>   { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
>   { EXIT_REASON_INVD,  "INVD" }, \
>   { EXIT_REASON_INVVPID,   "INVVPID" }, \
> - { EXIT_REASON_INVPCID,   "INVPCID" }
> + { EXIT_REASON_INVPCID,   "INVPCID" }, \
> + { EXIT_REASON_XSAVES,"XSAVES" }, \
> + { EXIT_REASON_XRSTORS,   "XRSTORS" }
>  
>  #endif /* _UAPIVMX_H */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a951d8..b87b5b8 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1045,6 +1045,12 @@ static inline bool cpu_has_vmx_invpcid(void)
>   SECONDARY_EXEC_ENABLE_INVPCID;
>  }
>  
> +static inline bool cpu_has_xss_exit_bitmap(void)
> +{
> + return vmcs_config.cpu_based_2nd_exec_ctrl &
> + SECONDARY_EXEC_XSAVES;
> +}
> +
>  static inline bool cpu_has_virtual_nmis(void)
>  {
>   return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
> @@ -1773,6 +1779,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>   kvm_set_shared_msr(vmx->guest_msrs[i].index,
>  vmx->guest_msrs[i].data,
>  vmx->guest_msrs[i].mask);
> +
> + if (cpu_has_xsaves) {
> + u64 host_xss;
> +
> + rdmsrl(MSR_IA32_XSS, host_xss);

Is this host value fixed?  If so, please load it just once in
setup_vmcs_config.

> + add_atomic_switch_msr(vmx, MSR_IA32_XSS,
> + vcpu->arch.ia32_xss, host_xss);

Also, if host_xss is fixed you can do this add_atomic_switch_msr at
WRMSR time rather than here, and only if vcpu->arch.ia32_xss !=
host_xss.  If the two XSS values match, do clear_atomic_switch_msr instead.

> + }
>  }
>  
>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
> @@ -2895,7 +2909,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
> *vmcs_conf)
>   SECONDARY_EXEC_ENABLE_INVPCID |
>   SECONDARY_EXEC_APIC_REGISTER_VIRT |
>   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
> - SECONDARY_EXEC_SHADOW_VMCS;
> + SECONDARY_EXEC_SHADOW_VMCS |
> + SECONDARY_EXEC_XSAVES;
>   if (adjust_vmx_controls(min2, opt2,
>   MSR_IA32_VMX_PROCBASED_CTLS2,
>  

Re: [PATCH v2 1/2] kvm: x86: revert mask out xsaves

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 07:14, Wanpeng Li wrote:
> xsaves will be exported to guest in the next patch, so revert the
> mask out xsaves patch.
> 
> Signed-off-by: Wanpeng Li 
> ---
>  arch/x86/kvm/cpuid.c | 10 +-
>  1 file changed, 1 insertion(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index a4f5ac4..7af07571 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -320,10 +320,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
>   F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
>   F(AVX512CD);
>  
> - /* cpuid 0xD.1.eax */
> - const u32 kvm_supported_word10_x86_features =
> - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);

Every single bit needs to be whitelisted independently of the others, so
a full revert is not possible.  We need to be careful about post-Skylake
processors introducing new bits in this cpuid leaf.

Also, you cannot just add F(XSAVES) here; you must only do it if XSAVES
is actually supported.  This makes sure that nested virtualization will
_not_ present XSAVES to the guests until specific support is introduced
for XSAVES and XRSTORS exits (including the XSS exit bitmap).

In order to do this, you have to introduce a new member in kvm_x86_ops,
modeling what was done for MPX.  The second patch can then implement
this new member.

Thanks,

Paolo

>   /* all calls to cpuid_count() should be made on the same cpu */
>   get_cpu();
>  
> @@ -460,8 +456,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
>   entry->eax &= supported;
>   entry->edx &= supported >> 32;
>   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
> - if (!supported)
> - break;
>  
>   for (idx = 1, i = 1; idx < 64; ++idx) {
>   u64 mask = ((u64)1 << idx);
> @@ -469,9 +463,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
>   goto out;
>  
>   do_cpuid_1_ent(&entry[i], function, idx);
> - if (idx == 1)
> - entry[i].eax &= 
> kvm_supported_word10_x86_features;
> - else if (entry[i].eax == 0 || !(supported & mask))
> + if (entry[i].eax == 0 || !(supported & mask))
>   continue;
>   entry[i].flags |=
>  KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] kvm: vmx: enable intel xsaves for guest

2014-12-02 Thread Wanpeng Li
Hi Paolo,
On Tue, Dec 02, 2014 at 09:51:22AM +0100, Paolo Bonzini wrote:
>
>
>On 02/12/2014 07:14, Wanpeng Li wrote:
>> Expose intel xsaves feature to guest.
>> 
>> Signed-off-by: Wanpeng Li 
>> ---
>> v1 -> v2:
>>  *auto switch msr ia32_xss if this msr is present
>> 
>>  arch/x86/include/asm/kvm_host.h |  1 +
>>  arch/x86/include/asm/vmx.h  |  3 +++
>>  arch/x86/include/uapi/asm/vmx.h |  6 +-
>>  arch/x86/kvm/vmx.c  | 35 ++-
>>  4 files changed, 43 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 2896dbc..95dde42 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
>>  int mp_state;
>>  u64 ia32_misc_enable_msr;
>>  bool tpr_access_reporting;
>> +u64 ia32_xss;
>
>The patch is not getting/setting ia32_xss when the guest does
>RDMSR/WRMSR.  You also need a QEMU patch to migrate XSS.

Will do.

>
>>  /*
>>   * Paging state of the vcpu
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index bcbfade..bdb79ef 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -69,6 +69,7 @@
>>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING   0x0400
>>  #define SECONDARY_EXEC_ENABLE_INVPCID   0x1000
>>  #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
>> +#define SECONDARY_EXEC_XSAVES   0x0010
>>  
>>  
>>  #define PIN_BASED_EXT_INTR_MASK 0x0001
>> @@ -159,6 +160,8 @@ enum vmcs_field {
>>  EOI_EXIT_BITMAP3_HIGH   = 0x2023,
>>  VMREAD_BITMAP   = 0x2026,
>>  VMWRITE_BITMAP  = 0x2028,
>> +XSS_EXIT_BIMTAP = 0x202C,
>> +XSS_EXIT_BIMTAP_HIGH= 0x202D,
>
>s/BIMTAP/BITMAP/

Ok.

>
>>  GUEST_PHYSICAL_ADDRESS  = 0x2400,
>>  GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
>>  VMCS_LINK_POINTER   = 0x2800,
>> diff --git a/arch/x86/include/uapi/asm/vmx.h 
>> b/arch/x86/include/uapi/asm/vmx.h
>> index 990a2fe..b813bf9 100644
>> --- a/arch/x86/include/uapi/asm/vmx.h
>> +++ b/arch/x86/include/uapi/asm/vmx.h
>> @@ -72,6 +72,8 @@
>>  #define EXIT_REASON_XSETBV  55
>>  #define EXIT_REASON_APIC_WRITE  56
>>  #define EXIT_REASON_INVPCID 58
>> +#define EXIT_REASON_XSAVES  63
>> +#define EXIT_REASON_XRSTORS 64
>>  
>>  #define VMX_EXIT_REASONS \
>>  { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
>> @@ -116,6 +118,8 @@
>>  { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
>>  { EXIT_REASON_INVD,  "INVD" }, \
>>  { EXIT_REASON_INVVPID,   "INVVPID" }, \
>> -{ EXIT_REASON_INVPCID,   "INVPCID" }
>> +{ EXIT_REASON_INVPCID,   "INVPCID" }, \
>> +{ EXIT_REASON_XSAVES,"XSAVES" }, \
>> +{ EXIT_REASON_XRSTORS,   "XRSTORS" }
>>  
>>  #endif /* _UAPIVMX_H */
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 6a951d8..b87b5b8 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -1045,6 +1045,12 @@ static inline bool cpu_has_vmx_invpcid(void)
>>  SECONDARY_EXEC_ENABLE_INVPCID;
>>  }
>>  
>> +static inline bool cpu_has_xss_exit_bitmap(void)
>> +{
>> +return vmcs_config.cpu_based_2nd_exec_ctrl &
>> +SECONDARY_EXEC_XSAVES;
>> +}
>> +
>>  static inline bool cpu_has_virtual_nmis(void)
>>  {
>>  return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
>> @@ -1773,6 +1779,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
>>  kvm_set_shared_msr(vmx->guest_msrs[i].index,
>> vmx->guest_msrs[i].data,
>> vmx->guest_msrs[i].mask);
>> +
>> +if (cpu_has_xsaves) {
>> +u64 host_xss;
>> +
>> +rdmsrl(MSR_IA32_XSS, host_xss);
>
>Is this host value fixed?  If so, please load it just once in
>setup_vmcs_config.

Will do.

>
>> +add_atomic_switch_msr(vmx, MSR_IA32_XSS,
>> +vcpu->arch.ia32_xss, host_xss);
>
>Also, if host_xss is fixed you can do this add_atomic_switch_msr at
>WRMSR time rather than here, and only if vcpu->arch.ia32_xss !=
>host_xss.  If the two XSS values match, do clear_atomic_switch_msr instead.

Agreed.

>
>> +}
>>  }
>>  
>>  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
>> @@ -2895,7 +2909,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
>> *vmcs_conf)
>>  SECONDARY_EXEC_ENABLE_INVPCID |
>>  SECONDARY_EXEC_APIC_REGISTER_VIRT |
>>  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
>> -SECONDARY_EXEC_SHADOW_VMCS;
>> +SECONDARY_EXEC_SHADOW_VMCS |
>> +  

Re: [PATCH v2 1/2] kvm: x86: revert mask out xsaves

2014-12-02 Thread Wanpeng Li
On Tue, Dec 02, 2014 at 09:55:09AM +0100, Paolo Bonzini wrote:
>
>
>On 02/12/2014 07:14, Wanpeng Li wrote:
>> xsaves will be exported to guest in the next patch, so revert the
>> mask out xsaves patch.
>> 
>> Signed-off-by: Wanpeng Li 
>> ---
>>  arch/x86/kvm/cpuid.c | 10 +-
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index a4f5ac4..7af07571 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -320,10 +320,6 @@ static inline int __do_cpuid_ent(struct 
>> kvm_cpuid_entry2 *entry, u32 function,
>>  F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
>>  F(AVX512CD);
>>  
>> -/* cpuid 0xD.1.eax */
>> -const u32 kvm_supported_word10_x86_features =
>> -F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);
>
>Every single bit needs to be whitelisted independently of the others, so
>a full revert is not possible.  We need to be careful about post-Skylake
>processors introducing new bits in this cpuid leaf.
>
>Also, you cannot just add F(XSAVES) here; you must only do it if XSAVES
>is actually supported.  This makes sure that nested virtualization will
>_not_ present XSAVES to the guests until specific support is introduced
>for XSAVES and XRSTORS exits (including the XSS exit bitmap).
>
>In order to do this, you have to introduce a new member in kvm_x86_ops,
>modeling what was done for MPX.  The second patch can then implement
>this new member.

Will do, thanks for your review.

Regards,
Wanpeng Li 

>
>Thanks,
>
>Paolo
>
>>  /* all calls to cpuid_count() should be made on the same cpu */
>>  get_cpu();
>>  
>> @@ -460,8 +456,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
>> *entry, u32 function,
>>  entry->eax &= supported;
>>  entry->edx &= supported >> 32;
>>  entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
>> -if (!supported)
>> -break;
>>  
>>  for (idx = 1, i = 1; idx < 64; ++idx) {
>>  u64 mask = ((u64)1 << idx);
>> @@ -469,9 +463,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
>> *entry, u32 function,
>>  goto out;
>>  
>>  do_cpuid_1_ent(&entry[i], function, idx);
>> -if (idx == 1)
>> -entry[i].eax &= 
>> kvm_supported_word10_x86_features;
>> -else if (entry[i].eax == 0 || !(supported & mask))
>> +if (entry[i].eax == 0 || !(supported & mask))
>>  continue;
>>  entry[i].flags |=
>> KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
>> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[qemu] virtio drivers implementation details

2014-12-02 Thread Vasile Catalin-B50542

 Hi,

I'm trying to develop a custom virtio driver for myself.
Can I ask here virtio driver implementation details?
I've tried the kvm and qemu irc channel, but had no luck.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-02 Thread David Vrabel
On 01/12/14 22:36, Luis R. Rodriguez wrote:
> 
> Then I do agree its a fair analogy (and find this obviously odd that how
> widespread cond_resched() is), we just don't have an equivalent for IRQ
> context, why not avoid the special check then and use this all the time in the
> middle of a hypercall on the return from an interrupt (e.g., the timer
> interrupt)?

http://lists.xen.org/archives/html/xen-devel/2014-02/msg01101.html

David
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/3] kvm: vmx: enable xsaves for kvm

2014-12-02 Thread Wanpeng Li
This patchset is to enable xsaves for kvm part, the patch for 
qemu part will be sent out later. 

The patchset is tested on skylake-client.

v2 -> v3:
 * add kvm_get/set for ia32_xss
 * fix the type XSS_EXIT_BITMAP
 * load host_xss just once in setup_vmcs_config
 * add/clear auto switch ia32_xss msr in kvm_get/clear
 * add VMX_XSS_EXIT_BITMAP macro
 * add WARN() in handle_xsaves/xrstors
 * export xsaves if related vmcs field is set 

v1 -> v2: 
 * auto switch ia32_xss msr just if this msr is present 

Wanpeng Li (3):
  kvm: x86: Intel XSAVES vmx and msr handle
  kvm: vmx: add kvm_get/set logic to xsaves
  kvm: x86: Enable Intel XSAVES for guest

 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 -
 arch/x86/kvm/cpuid.c|  3 ++-
 arch/x86/kvm/vmx.c  | 51 -
 5 files changed, 62 insertions(+), 3 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/3] kvm: vmx: add kvm_get/set logic to xsaves

2014-12-02 Thread Wanpeng Li
Add kvm_get/set logic to xsaves. 

Signed-off-by: Wanpeng Li 
---
 arch/x86/kvm/vmx.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 12915f1..66d1e3d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -99,6 +99,8 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, 
bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+static u64 __read_mostly host_xss;
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON   \
@@ -2570,6 +2572,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+   case MSR_IA32_XSS:
+   if (!vmx_xsaves_supported())
+   return 1;
+   data = vcpu->arch.ia32_xss;
+   break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
@@ -2661,6 +2668,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return 1; /* they are read-only */
+   case MSR_IA32_XSS:
+   if (!vmx_xsaves_supported())
+   return 1;
+   vcpu->arch.ia32_xss = data;
+   if (vcpu->arch.ia32_xss != host_xss)
+   add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+   vcpu->arch.ia32_xss, host_xss);
+   else
+   clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+   break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
return 1;
@@ -3020,6 +3037,9 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
}
}
 
+   if (cpu_has_xsaves)
+   rdmsrl(MSR_IA32_XSS, host_xss);
+
return 0;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 1/3] kvm: x86: Intel XSAVES vmx and msr handle

2014-12-02 Thread Wanpeng Li
Intel xsaves vmx and msr handle.

Signed-off-by: Wanpeng Li 
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 +-
 arch/x86/kvm/vmx.c  | 31 ++-
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2896dbc..0c4c88c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+   u64 ia32_xss;
 
/*
 * Paging state of the vcpu
@@ -771,6 +772,7 @@ struct kvm_x86_ops {
   enum x86_intercept_stage stage);
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
bool (*mpx_supported)(void);
+   bool (*xsaves_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade..45afaee 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
+#define SECONDARY_EXEC_XSAVES  0x0010
 
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH   = 0x2023,
VMREAD_BITMAP   = 0x2026,
VMWRITE_BITMAP  = 0x2028,
+   XSS_EXIT_BITMAP = 0x202C,
+   XSS_EXIT_BITMAP_HIGH= 0x202D,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..b813bf9 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES  63
+#define EXIT_REASON_XRSTORS 64
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD,  "INVD" }, \
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
-   { EXIT_REASON_INVPCID,   "INVPCID" }
+   { EXIT_REASON_INVPCID,   "INVPCID" }, \
+   { EXIT_REASON_XSAVES,"XSAVES" }, \
+   { EXIT_REASON_XRSTORS,   "XRSTORS" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..12915f1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -761,6 +761,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -2895,7 +2896,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-   SECONDARY_EXEC_SHADOW_VMCS;
+   SECONDARY_EXEC_SHADOW_VMCS |
+   SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4337,6 +4339,7 @@ static void ept_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
+#define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -4446,6 +4449,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
 
+   if (vmx_xsaves_supported())
+   vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
return 0;
 }
 
@@ -5334,6 +5340,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
return 1;
 }
 
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+   skip_emulated_instruction(vcpu);
+   WARN(1, "this should never happen\n");
+   return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+   skip_emulated_instruction(vcpu);
+   WARN(1, "this sho

[PATCH v3 3/3] kvm: x86: Enable Intel XSAVES for guest

2014-12-02 Thread Wanpeng Li
Expose intel xsaves feature to guest.

Signed-off-by: Wanpeng Li 
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a4f5ac4..0d919bc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -267,6 +267,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+   unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
 
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -322,7 +323,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
-   F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);
+   F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [qemu] virtio drivers implementation details

2014-12-02 Thread Vasile Catalin-B50542

This is the link to the presentation:
http://www.slideshare.net/zenixls2/052-virtio-introduction-17191942
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [qemu] virtio drivers implementation details

2014-12-02 Thread Vasile Catalin-B50542

I'm trying to make a custom virtio driver that mostly interacts with memory.
I've come upon a presentation which shows which files need to be edited 
on qemu

and kvm in order to add a new virtio driver.
One of the files in qemu is include/hw/pci/pci.h,
but that was an example specific to virtio pci.
virtio-mmio is better mapped on my situation, but I can't seem to find a 
similar header for it.
In fact I can't find anything related to virtio mmio throughout the 
header files

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


usb audio device troubles

2014-12-02 Thread Eric S. Johansson
I got win7 installed, virtio devices working, and it took forever to trickle 
in updates because of a W7 update manager bug that takes up all CPU 
resources.  Now I have DNS 13 installed but I'm getting no audio.


I pass through the usb audio device (logitech h800 USB 046d:0a29) and 
it is seen as a device in windows.  then I hear the headset sync-up 
beeps and the device vanishes from windows.  pointers as to what I 
should look at next?


--- eric

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] target-i386: Intel xsaves

2014-12-02 Thread Wanpeng Li
Add xsaves related definition, it also add corresponding part to 
kvm_get/put, and vmstate.

Signed-off-by: Wanpeng Li 
---
 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 15 +++
 target-i386/machine.c |  3 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 015f5b5..cff7433 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -389,6 +389,7 @@
 #define MSR_VM_HSAVE_PA 0xc0010117
 
 #define MSR_IA32_BNDCFGS0x0d90
+#define MSR_IA32_XSS0x0da0
 
 #define XSTATE_FP   (1ULL << 0)
 #define XSTATE_SSE  (1ULL << 1)
@@ -1019,6 +1020,7 @@ typedef struct CPUX86State {
 uint64_t xstate_bv;
 
 uint64_t xcr0;
+uint64_t xss;
 
 TPRAccess tpr_access_type;
 } CPUX86State;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ccf36e8..c6fc417 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -80,6 +80,7 @@ static bool has_msr_hv_hypercall;
 static bool has_msr_hv_vapic;
 static bool has_msr_hv_tsc;
 static bool has_msr_mtrr;
+static bool has_msr_xss;
 
 static bool has_msr_architectural_pmu;
 static uint32_t num_architectural_pmu_counters;
@@ -826,6 +827,10 @@ static int kvm_get_supported_msrs(KVMState *s)
 has_msr_bndcfgs = true;
 continue;
 }
+if (kvm_msr_list->indices[i] == MSR_IA32_XSS) {
+has_msr_xss = true;
+continue;
+}
 }
 }
 
@@ -1224,6 +1229,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
 if (has_msr_bndcfgs) {
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
 }
+if (has_msr_xss) {
+kvm_msr_entry_set(&msrs[n++], MSR_IA32_XSS, env->xss);
+}
 #ifdef TARGET_X86_64
 if (lm_capable_kernel) {
 kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1570,6 +1578,10 @@ static int kvm_get_msrs(X86CPU *cpu)
 if (has_msr_bndcfgs) {
 msrs[n++].index = MSR_IA32_BNDCFGS;
 }
+if (has_msr_xss) {
+msrs[n++].index = MSR_IA32_XSS;
+}
+
 
 if (!env->tsc_valid) {
 msrs[n++].index = MSR_IA32_TSC;
@@ -1717,6 +1729,9 @@ static int kvm_get_msrs(X86CPU *cpu)
 case MSR_IA32_BNDCFGS:
 env->msr_bndcfgs = msrs[i].data;
 break;
+case MSR_IA32_XSS:
+env->xss = msrs[i].data;
+break;
 default:
 if (msrs[i].index >= MSR_MC0_CTL &&
 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 1c13b14..43af33f 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -689,7 +689,7 @@ static const VMStateDescription vmstate_avx512 = {
 
 VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
-.version_id = 12,
+.version_id = 13,
 .minimum_version_id = 3,
 .pre_save = cpu_pre_save,
 .post_load = cpu_post_load,
@@ -786,6 +786,7 @@ VMStateDescription vmstate_x86_cpu = {
 VMSTATE_UINT64_V(env.xcr0, X86CPU, 12),
 VMSTATE_UINT64_V(env.xstate_bv, X86CPU, 12),
 VMSTATE_YMMH_REGS_VARS(env.ymmh_regs, X86CPU, CPU_NB_REGS, 12),
+VMSTATE_UINT64_V(env.xss, X86CPU, 13),
 VMSTATE_END_OF_LIST()
 /* The above list is not sorted /wrt version numbers, watch out! */
 },
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] target-i386: Intel xsaves

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 13:19, Wanpeng Li wrote:
> Add xsaves related definition, it also add corresponding part to 
> kvm_get/put, and vmstate.
> 
> Signed-off-by: Wanpeng Li 
> ---
>  target-i386/cpu.h |  2 ++
>  target-i386/kvm.c | 15 +++
>  target-i386/machine.c |  3 ++-
>  3 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index 015f5b5..cff7433 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -389,6 +389,7 @@
>  #define MSR_VM_HSAVE_PA 0xc0010117
>  
>  #define MSR_IA32_BNDCFGS0x0d90
> +#define MSR_IA32_XSS0x0da0
>  
>  #define XSTATE_FP   (1ULL << 0)
>  #define XSTATE_SSE  (1ULL << 1)
> @@ -1019,6 +1020,7 @@ typedef struct CPUX86State {
>  uint64_t xstate_bv;
>  
>  uint64_t xcr0;
> +uint64_t xss;
>  
>  TPRAccess tpr_access_type;
>  } CPUX86State;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index ccf36e8..c6fc417 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -80,6 +80,7 @@ static bool has_msr_hv_hypercall;
>  static bool has_msr_hv_vapic;
>  static bool has_msr_hv_tsc;
>  static bool has_msr_mtrr;
> +static bool has_msr_xss;
>  
>  static bool has_msr_architectural_pmu;
>  static uint32_t num_architectural_pmu_counters;
> @@ -826,6 +827,10 @@ static int kvm_get_supported_msrs(KVMState *s)
>  has_msr_bndcfgs = true;
>  continue;
>  }
> +if (kvm_msr_list->indices[i] == MSR_IA32_XSS) {
> +has_msr_xss = true;
> +continue;
> +}
>  }
>  }
>  
> @@ -1224,6 +1229,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
>  if (has_msr_bndcfgs) {
>  kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
>  }
> +if (has_msr_xss) {
> +kvm_msr_entry_set(&msrs[n++], MSR_IA32_XSS, env->xss);
> +}
>  #ifdef TARGET_X86_64
>  if (lm_capable_kernel) {
>  kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
> @@ -1570,6 +1578,10 @@ static int kvm_get_msrs(X86CPU *cpu)
>  if (has_msr_bndcfgs) {
>  msrs[n++].index = MSR_IA32_BNDCFGS;
>  }
> +if (has_msr_xss) {
> +msrs[n++].index = MSR_IA32_XSS;
> +}
> +
>  
>  if (!env->tsc_valid) {
>  msrs[n++].index = MSR_IA32_TSC;
> @@ -1717,6 +1729,9 @@ static int kvm_get_msrs(X86CPU *cpu)
>  case MSR_IA32_BNDCFGS:
>  env->msr_bndcfgs = msrs[i].data;
>  break;
> +case MSR_IA32_XSS:
> +env->xss = msrs[i].data;
> +break;
>  default:
>  if (msrs[i].index >= MSR_MC0_CTL &&
>  msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
> diff --git a/target-i386/machine.c b/target-i386/machine.c
> index 1c13b14..43af33f 100644
> --- a/target-i386/machine.c
> +++ b/target-i386/machine.c
> @@ -689,7 +689,7 @@ static const VMStateDescription vmstate_avx512 = {
>  
>  VMStateDescription vmstate_x86_cpu = {
>  .name = "cpu",
> -.version_id = 12,
> +.version_id = 13,
>  .minimum_version_id = 3,
>  .pre_save = cpu_pre_save,
>  .post_load = cpu_post_load,
> @@ -786,6 +786,7 @@ VMStateDescription vmstate_x86_cpu = {
>  VMSTATE_UINT64_V(env.xcr0, X86CPU, 12),
>  VMSTATE_UINT64_V(env.xstate_bv, X86CPU, 12),
>  VMSTATE_YMMH_REGS_VARS(env.ymmh_regs, X86CPU, CPU_NB_REGS, 12),
> +VMSTATE_UINT64_V(env.xss, X86CPU, 13),
>  VMSTATE_END_OF_LIST()
>  /* The above list is not sorted /wrt version numbers, watch out! */
>  },
> 

Please use a subsection instead of bumping the version number.
Otherwise looks good.

Thanks!

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: usb audio device troubles

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 13:16, Eric S. Johansson wrote:
> I got win7 installed, virtio devices working and took forever to trickle
> in updates because of a w7 bug update manager bug that take up all cpu
> resources.  now I got DNS 13 installed but I'm getting no audio.
> 
> I pass throught the usb audio device (logitech h800 USB 046d:0a29) and
> it is seen as a device in windows.  then I hear the headset sync-up
> beeps and the device vanishes from windows.  pointers as to what I
> should look at next?

Adding back Hans and Gerd...

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 00/19] qemu: towards virtio-1 host support

2014-12-02 Thread Cornelia Huck
Another iteration of virtio-1 patches for qemu, as always available on
git://github.com/cohuck/qemu virtio-1

This one seems to work together with the current vhost-next patches
(well, I can ping :)

Changes from v4:
- add helpers for feature bit manipulation and checking
- use 64 bit feature bits instead of 32 bit arrays
- infrastructure to allow devices to offer different sets of feature
  bits for legacy and standard devices
- several fixes (mainly regarding, you guessed it, feature bits)

Cornelia Huck (16):
  virtio: cull virtio_bus_set_vdev_features
  virtio: feature bit manipulation helpers
  virtio: add feature checking helpers
  virtio: support more feature bits
  virtio: endianness checks for virtio 1.0 devices
  virtio: allow virtio-1 queue layout
  dataplane: allow virtio-1 devices
  s390x/virtio-ccw: support virtio-1 set_vq format
  virtio: disallow late feature changes for virtio-1
  virtio: allow to fail setting status
  s390x/virtio-ccw: enable virtio 1.0
  virtio-net: no writeable mac for virtio-1
  virtio-net: support longer header
  virtio-net: enable virtio 1.0
  virtio: support revision-specific features
  virtio-blk: revision specific feature bits

Thomas Huth (3):
  linux-headers/virtio_config: Update with VIRTIO_F_VERSION_1
  s390x/css: Add a callback for when subchannel gets disabled
  s390x/virtio-ccw: add virtio set-revision call

 hw/9pfs/virtio-9p-device.c|4 +-
 hw/block/dataplane/virtio-blk.c   |4 +-
 hw/block/virtio-blk.c |   44 +++--
 hw/char/virtio-serial-bus.c   |6 +-
 hw/net/virtio-net.c   |  100 ++-
 hw/s390x/css.c|   12 ++
 hw/s390x/css.h|1 +
 hw/s390x/s390-virtio-bus.c|3 +-
 hw/s390x/s390-virtio-bus.h|2 +-
 hw/s390x/virtio-ccw.c |  235 ++---
 hw/s390x/virtio-ccw.h |8 +-
 hw/scsi/vhost-scsi.c  |3 +-
 hw/scsi/virtio-scsi-dataplane.c   |2 +-
 hw/scsi/virtio-scsi.c |   12 +-
 hw/virtio/Makefile.objs   |2 +-
 hw/virtio/dataplane/Makefile.objs |2 +-
 hw/virtio/dataplane/vring.c   |   96 +-
 hw/virtio/virtio-balloon.c|4 +-
 hw/virtio/virtio-bus.c|   24 ++-
 hw/virtio/virtio-mmio.c   |6 +-
 hw/virtio/virtio-pci.c|7 +-
 hw/virtio/virtio-pci.h|2 +-
 hw/virtio/virtio-rng.c|2 +-
 hw/virtio/virtio.c|   83 +++--
 include/hw/qdev-properties.h  |   11 ++
 include/hw/virtio/dataplane/vring-accessors.h |   75 
 include/hw/virtio/dataplane/vring.h   |   14 +-
 include/hw/virtio/virtio-access.h |4 +
 include/hw/virtio/virtio-bus.h|   14 +-
 include/hw/virtio/virtio-net.h|   46 ++---
 include/hw/virtio/virtio-scsi.h   |6 +-
 include/hw/virtio/virtio.h|   61 +--
 linux-headers/linux/virtio_config.h   |3 +
 33 files changed, 625 insertions(+), 273 deletions(-)
 create mode 100644 include/hw/virtio/dataplane/vring-accessors.h

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 16/19] virtio-net: support longer header

2014-12-02 Thread Cornelia Huck
virtio-1 devices always use num_buffers in the header, even if
mergeable rx buffers have not been negotiated.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |   21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index ebbea60..7ee2bd6 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -373,15 +373,21 @@ static int peer_has_ufo(VirtIONet *n)
 return n->has_ufo;
 }
 
-static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs)
+static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
+   int version_1)
 {
 int i;
 NetClientState *nc;
 
 n->mergeable_rx_bufs = mergeable_rx_bufs;
 
-n->guest_hdr_len = n->mergeable_rx_bufs ?
-sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct 
virtio_net_hdr);
+if (version_1) {
+n->guest_hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+} else {
+n->guest_hdr_len = n->mergeable_rx_bufs ?
+sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+sizeof(struct virtio_net_hdr);
+}
 
 for (i = 0; i < n->max_queues; i++) {
 nc = qemu_get_subqueue(n->nic, i);
@@ -525,7 +531,9 @@ static void virtio_net_set_features(VirtIODevice *vdev, 
uint64_t features)
 
 virtio_net_set_mrg_rx_bufs(n,
__virtio_has_feature(features,
-VIRTIO_NET_F_MRG_RXBUF));
+VIRTIO_NET_F_MRG_RXBUF),
+   __virtio_has_feature(features,
+VIRTIO_F_VERSION_1));
 
 if (n->has_vnet_hdr) {
 n->curr_guest_offloads =
@@ -1407,7 +1415,8 @@ static int virtio_net_load_device(VirtIODevice *vdev, 
QEMUFile *f,
 qemu_get_buffer(f, n->mac, ETH_ALEN);
 n->vqs[0].tx_waiting = qemu_get_be32(f);
 
-virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f));
+virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f),
+   virtio_has_feature(vdev, VIRTIO_F_VERSION_1));
 
 if (version_id >= 3)
 n->status = qemu_get_be16(f);
@@ -1653,7 +1662,7 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 
 n->vqs[0].tx_waiting = 0;
 n->tx_burst = n->net_conf.txburst;
-virtio_net_set_mrg_rx_bufs(n, 0);
+virtio_net_set_mrg_rx_bufs(n, 0, 0);
 n->promisc = 1; /* for compatibility */
 
 n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 17/19] virtio-net: enable virtio 1.0

2014-12-02 Thread Cornelia Huck
virtio-net (non-vhost) now should have everything in place to support
virtio 1.0: let's enable the feature bit for it.

Note that VIRTIO_F_VERSION_1 is technically a transport feature; once
every device is ready for virtio 1.0, we can move setting this
feature bit out of the individual devices.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7ee2bd6..b5dd356 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -473,6 +473,7 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, 
uint64_t features)
 }
 
 if (!get_vhost_net(nc->peer)) {
+virtio_add_feature(&features, VIRTIO_F_VERSION_1);
 return features;
 }
 return vhost_net_get_features(get_vhost_net(nc->peer), features);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 19/19] virtio-blk: revision specific feature bits

2014-12-02 Thread Cornelia Huck
Wire up virtio-blk to provide different feature bit sets depending
on whether legacy or v1.0 has been requested.

Note that VERSION_1 is still disabled due to missing ANY_LAYOUT support.

Signed-off-by: Cornelia Huck 
---
 hw/block/virtio-blk.c |   19 +++
 1 file changed, 19 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 9cfae66..fdc236a 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -587,6 +587,24 @@ static uint64_t virtio_blk_get_features(VirtIODevice 
*vdev, uint64_t features)
 return features;
 }
 
+static uint64_t virtio_blk_get_features_rev(VirtIODevice *vdev,
+uint64_t features,
+unsigned int revision)
+{
+if (revision == 0) {
+/* legacy */
+virtio_clear_feature(&features, VIRTIO_F_VERSION_1);
+return virtio_blk_get_features(vdev, features);
+}
+/* virtio 1.0 or later */
+virtio_clear_feature(&features, VIRTIO_BLK_F_SCSI);
+virtio_clear_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
+virtio_clear_feature(&features, VIRTIO_BLK_F_WCE);
+/* we're still missing ANY_LAYOUT */
+/* virtio_add_feature(&features, VIRTIO_F_VERSION_1); */
+return features;
+}
+
 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
@@ -821,6 +839,7 @@ static void virtio_blk_class_init(ObjectClass *klass, void 
*data)
 vdc->get_config = virtio_blk_update_config;
 vdc->set_config = virtio_blk_set_config;
 vdc->get_features = virtio_blk_get_features;
+vdc->get_features_rev = virtio_blk_get_features_rev;
 vdc->set_status = virtio_blk_set_status;
 vdc->reset = virtio_blk_reset;
 vdc->save = virtio_blk_save_device;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 18/19] virtio: support revision-specific features

2014-12-02 Thread Cornelia Huck
Devices may support different sets of feature bits depending on which
revision they're operating at. Let's give the transport a way to
re-query the device about its features when the revision has been
changed.

Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  |   12 ++--
 hw/virtio/virtio-bus.c |   14 --
 include/hw/virtio/virtio-bus.h |3 +++
 include/hw/virtio/virtio.h |3 +++
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index ec492b8..3826074 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -699,6 +699,10 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 }
 ret = 0;
 dev->revision = revinfo.revision;
+/* Re-evaluate which features the device wants to offer. */
+dev->host_features =
+virtio_bus_get_vdev_features_rev(&dev->bus, dev->host_features,
+ dev->revision >= 1 ? 1 : 0);
 break;
 default:
 ret = -ENOSYS;
@@ -712,6 +716,9 @@ static void virtio_sch_disable_cb(SubchDev *sch)
 VirtioCcwDevice *dev = sch->driver_data;
 
 dev->revision = -1;
+/* Reset the device's features to legacy. */
+dev->host_features =
+virtio_bus_get_vdev_features_rev(&dev->bus, dev->host_features, 0);
 }
 
 static int virtio_ccw_device_init(VirtioCcwDevice *dev, VirtIODevice *vdev)
@@ -854,8 +861,9 @@ static int virtio_ccw_device_init(VirtioCcwDevice *dev, 
VirtIODevice *vdev)
 virtio_add_feature(&dev->host_features, VIRTIO_F_NOTIFY_ON_EMPTY);
 virtio_add_feature(&dev->host_features, VIRTIO_F_BAD_FEATURE);
 
-dev->host_features = virtio_bus_get_vdev_features(&dev->bus,
-  dev->host_features);
+/* All devices start in legacy mode. */
+dev->host_features =
+virtio_bus_get_vdev_features_rev(&dev->bus, dev->host_features, 0);
 
 css_generate_sch_crws(sch->cssid, sch->ssid, sch->schid,
   parent->hotplugged, 1);
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index 32e3fab..a30826c 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -97,18 +97,28 @@ size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus)
 }
 
 /* Get the features of the plugged device. */
-uint64_t virtio_bus_get_vdev_features(VirtioBusState *bus,
-  uint64_t requested_features)
+uint64_t virtio_bus_get_vdev_features_rev(VirtioBusState *bus,
+  uint64_t requested_features,
+  unsigned int revision)
 {
 VirtIODevice *vdev = virtio_bus_get_device(bus);
 VirtioDeviceClass *k;
 
 assert(vdev != NULL);
 k = VIRTIO_DEVICE_GET_CLASS(vdev);
+if (revision > 0 && k->get_features_rev) {
+return k->get_features_rev(vdev, requested_features, revision);
+}
 assert(k->get_features != NULL);
 return k->get_features(vdev, requested_features);
 }
 
+uint64_t virtio_bus_get_vdev_features(VirtioBusState *bus,
+  uint64_t requested_features)
+{
+return virtio_bus_get_vdev_features_rev(bus, requested_features, 0);
+}
+
 /* Get bad features of the plugged device. */
 uint64_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus)
 {
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index 0a4dde1..f0916ef 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -84,6 +84,9 @@ size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus);
 /* Get the features of the plugged device. */
 uint64_t virtio_bus_get_vdev_features(VirtioBusState *bus,
   uint64_t requested_features);
+uint64_t virtio_bus_get_vdev_features_rev(VirtioBusState *bus,
+  uint64_t requested_features,
+  unsigned int revision);
 /* Get bad features of the plugged device. */
 uint64_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus);
 /* Get config of the plugged device. */
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index e7bedd1..f31e3df 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -147,6 +147,9 @@ typedef struct VirtioDeviceClass {
 DeviceRealize realize;
 DeviceUnrealize unrealize;
 uint64_t (*get_features)(VirtIODevice *vdev, uint64_t requested_features);
+uint64_t (*get_features_rev)(VirtIODevice *vdev,
+ uint64_t requested_features,
+ unsigned int revision);
 uint64_t (*bad_features)(VirtIODevice *vdev);
 void (*set_features)(VirtIODevice *vdev, uint64_t val);
 int (*validate_features)(VirtIODevice *vdev);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of 

[PATCH RFC v5 06/19] virtio: endianness checks for virtio 1.0 devices

2014-12-02 Thread Cornelia Huck
Add code that checks for the VERSION_1 feature bit in order to make
decisions about the device's endianness. This allows us to support
transitional devices.

Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c|6 +-
 include/hw/virtio/virtio-access.h |4 
 include/hw/virtio/virtio.h|8 ++--
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 7f74ae5..8f69ffa 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -881,7 +881,11 @@ static bool virtio_device_endian_needed(void *opaque)
 VirtIODevice *vdev = opaque;
 
 assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
-return vdev->device_endian != virtio_default_endian();
+if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+return vdev->device_endian != virtio_default_endian();
+}
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return vdev->device_endian != VIRTIO_DEVICE_ENDIAN_LITTLE;
 }
 
 static const VMStateDescription vmstate_virtio_device_endian = {
diff --git a/include/hw/virtio/virtio-access.h 
b/include/hw/virtio/virtio-access.h
index 46456fd..ee28c21 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -19,6 +19,10 @@
 
 static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
 {
+if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return false;
+}
 #if defined(TARGET_IS_BIENDIAN)
 return virtio_is_big_endian(vdev);
 #elif defined(TARGET_WORDS_BIGENDIAN)
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 08141c7..68c40db 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -297,7 +297,11 @@ static inline bool virtio_has_feature(VirtIODevice *vdev, 
unsigned int fbit)
 
 static inline bool virtio_is_big_endian(VirtIODevice *vdev)
 {
-assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
-return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
+return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+}
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return false;
 }
 #endif
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 07/19] virtio: allow virtio-1 queue layout

2014-12-02 Thread Cornelia Huck
For virtio-1 devices, we allow a more complex queue layout that doesn't
require descriptor table and rings on a physically-contiguous memory area:
add virtio_queue_set_rings() to allow transports to set this up.

Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c |   16 
 include/hw/virtio/virtio.h |2 ++
 2 files changed, 18 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 8f69ffa..508dccf 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -96,6 +96,13 @@ static void virtqueue_init(VirtQueue *vq)
 {
 hwaddr pa = vq->pa;
 
+if (pa == -1ULL) {
+/*
+ * This is a virtio-1 style vq that has already been setup
+ * in virtio_queue_set.
+ */
+return;
+}
 vq->vring.desc = pa;
 vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
 vq->vring.used = vring_align(vq->vring.avail +
@@ -717,6 +724,15 @@ hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
 return vdev->vq[n].pa;
 }
 
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+hwaddr avail, hwaddr used)
+{
+vdev->vq[n].pa = -1ULL;
+vdev->vq[n].vring.desc = desc;
+vdev->vq[n].vring.avail = avail;
+vdev->vq[n].vring.used = used;
+}
+
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
 {
 /* Don't allow guest to flip queue between existent and
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 68c40db..80ee313 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -224,6 +224,8 @@ void virtio_queue_set_addr(VirtIODevice *vdev, int n, 
hwaddr addr);
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
 int virtio_queue_get_num(VirtIODevice *vdev, int n);
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+hwaddr avail, hwaddr used);
 void virtio_queue_set_align(VirtIODevice *vdev, int n, int align);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 12/19] virtio: disallow late feature changes for virtio-1

2014-12-02 Thread Cornelia Huck
For virtio-1 devices, the driver must not attempt to set feature bits
after it set FEATURES_OK in the device status. Simply reject it in
that case.

Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c |   16 ++--
 include/hw/virtio/virtio.h |2 ++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 508dccf..4f2dc48 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -980,7 +980,7 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
 vmstate_save_state(f, &vmstate_virtio, vdev);
 }
 
-int virtio_set_features(VirtIODevice *vdev, uint64_t val)
+static int __virtio_set_features(VirtIODevice *vdev, uint64_t val)
 {
 BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
 VirtioBusClass *vbusk = VIRTIO_BUS_GET_CLASS(qbus);
@@ -996,6 +996,18 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val)
 return bad ? -1 : 0;
 }
 
+int virtio_set_features(VirtIODevice *vdev, uint64_t val)
+{
+   /*
+ * The driver must not attempt to set features after feature negotiation
+ * has finished.
+ */
+if (vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) {
+return -EINVAL;
+}
+return __virtio_set_features(vdev, val);
+}
+
 int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
 {
 int i, ret;
@@ -1028,7 +1040,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int 
version_id)
 qemu_get_be32s(f, &features);
 
 /* XXX features >= 32 */
-if (virtio_set_features(vdev, features) < 0) {
+if (__virtio_set_features(vdev, features) < 0) {
 supported_features = k->get_features(qbus->parent);
 error_report("Features 0x%x unsupported. Allowed features: 0x%lx",
  features, supported_features);
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 80ee313..9a984c2 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -32,6 +32,8 @@
 #define VIRTIO_CONFIG_S_DRIVER  2
 /* Driver has used its parts of the config, and is happy */
 #define VIRTIO_CONFIG_S_DRIVER_OK   4
+/* Driver has finished configuring features */
+#define VIRTIO_CONFIG_S_FEATURES_OK 8
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED  0x80
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 09/19] s390x/css: Add a callback for when subchannel gets disabled

2014-12-02 Thread Cornelia Huck
From: Thomas Huth 

We need a possibility to run code when a subchannel gets disabled.
This patch adds the necessary infrastructure.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/css.c |   12 
 hw/s390x/css.h |1 +
 2 files changed, 13 insertions(+)

diff --git a/hw/s390x/css.c b/hw/s390x/css.c
index b67c039..735ec55 100644
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -588,6 +588,7 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 {
 SCSW *s = &sch->curr_status.scsw;
 PMCW *p = &sch->curr_status.pmcw;
+uint16_t oldflags;
 int ret;
 SCHIB schib;
 
@@ -610,6 +611,7 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 copy_schib_from_guest(&schib, orig_schib);
 /* Only update the program-modifiable fields. */
 p->intparm = schib.pmcw.intparm;
+oldflags = p->flags;
 p->flags &= ~(PMCW_FLAGS_MASK_ISC | PMCW_FLAGS_MASK_ENA |
   PMCW_FLAGS_MASK_LM | PMCW_FLAGS_MASK_MME |
   PMCW_FLAGS_MASK_MP);
@@ -625,6 +627,12 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 (PMCW_CHARS_MASK_MBFC | PMCW_CHARS_MASK_CSENSE);
 sch->curr_status.mba = schib.mba;
 
+/* Has the channel been disabled? */
+if (sch->disable_cb && (oldflags & PMCW_FLAGS_MASK_ENA) != 0
+&& (p->flags & PMCW_FLAGS_MASK_ENA) == 0) {
+sch->disable_cb(sch);
+}
+
 ret = 0;
 
 out:
@@ -1443,6 +1451,10 @@ void css_reset_sch(SubchDev *sch)
 {
 PMCW *p = &sch->curr_status.pmcw;
 
+if ((p->flags & PMCW_FLAGS_MASK_ENA) != 0 && sch->disable_cb) {
+sch->disable_cb(sch);
+}
+
 p->intparm = 0;
 p->flags &= ~(PMCW_FLAGS_MASK_ISC | PMCW_FLAGS_MASK_ENA |
   PMCW_FLAGS_MASK_LM | PMCW_FLAGS_MASK_MME |
diff --git a/hw/s390x/css.h b/hw/s390x/css.h
index 33104ac..7fa807b 100644
--- a/hw/s390x/css.h
+++ b/hw/s390x/css.h
@@ -81,6 +81,7 @@ struct SubchDev {
 uint8_t ccw_no_data_cnt;
 /* transport-provided data: */
 int (*ccw_cb) (SubchDev *, CCW1);
+void (*disable_cb)(SubchDev *);
 SenseId id;
 void *driver_data;
 };
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 13/19] virtio: allow to fail setting status

2014-12-02 Thread Cornelia Huck
virtio-1 allows setting of the FEATURES_OK status bit to fail if
the negotiated feature bits are inconsistent: let's fail
virtio_set_status() in that case and update virtio-ccw to post an
error to the guest.

Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  |   20 
 hw/virtio/virtio.c |   24 +++-
 include/hw/virtio/virtio.h |3 ++-
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 75c9ff9..ec492b8 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -555,15 +555,19 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
 virtio_ccw_stop_ioeventfd(dev);
 }
-virtio_set_status(vdev, status);
-if (vdev->status == 0) {
-virtio_reset(vdev);
-}
-if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
-virtio_ccw_start_ioeventfd(dev);
+if (virtio_set_status(vdev, status) == 0) {
+if (vdev->status == 0) {
+virtio_reset(vdev);
+}
+if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+virtio_ccw_start_ioeventfd(dev);
+}
+sch->curr_status.scsw.count = ccw.count - sizeof(status);
+ret = 0;
+} else {
+/* Trigger a command reject. */
+ret = -ENOSYS;
 }
-sch->curr_status.scsw.count = ccw.count - sizeof(status);
-ret = 0;
 }
 break;
 case CCW_CMD_SET_IND:
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 4f2dc48..be128f7 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -548,15 +548,37 @@ void virtio_update_irq(VirtIODevice *vdev)
 virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
 }
 
-void virtio_set_status(VirtIODevice *vdev, uint8_t val)
+static int virtio_validate_features(VirtIODevice *vdev)
+{
+VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+if (k->validate_features) {
+return k->validate_features(vdev);
+} else {
+return 0;
+}
+}
+
+int virtio_set_status(VirtIODevice *vdev, uint8_t val)
 {
 VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
 trace_virtio_set_status(vdev, val);
 
+if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
+val & VIRTIO_CONFIG_S_FEATURES_OK) {
+int ret = virtio_validate_features(vdev);
+
+if (ret) {
+return ret;
+}
+}
+}
 if (k->set_status) {
 k->set_status(vdev, val);
 }
 vdev->status = val;
+return 0;
 }
 
 bool target_words_bigendian(void);
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 9a984c2..e7bedd1 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -149,6 +149,7 @@ typedef struct VirtioDeviceClass {
 uint64_t (*get_features)(VirtIODevice *vdev, uint64_t requested_features);
 uint64_t (*bad_features)(VirtIODevice *vdev);
 void (*set_features)(VirtIODevice *vdev, uint64_t val);
+int (*validate_features)(VirtIODevice *vdev);
 void (*get_config)(VirtIODevice *vdev, uint8_t *config);
 void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
 void (*reset)(VirtIODevice *vdev);
@@ -232,7 +233,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int 
align);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
 void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
-void virtio_set_status(VirtIODevice *vdev, uint8_t val);
+int virtio_set_status(VirtIODevice *vdev, uint8_t val);
 void virtio_reset(void *opaque);
 void virtio_update_irq(VirtIODevice *vdev);
 int virtio_set_features(VirtIODevice *vdev, uint64_t val);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 01/19] linux-headers/virtio_config: Update with VIRTIO_F_VERSION_1

2014-12-02 Thread Cornelia Huck
From: Thomas Huth 

Add the new VIRTIO_F_VERSION_1 definition to the virtio_config.h
linux header.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 linux-headers/linux/virtio_config.h |3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-headers/linux/virtio_config.h 
b/linux-headers/linux/virtio_config.h
index 75dc20b..16aa289 100644
--- a/linux-headers/linux/virtio_config.h
+++ b/linux-headers/linux/virtio_config.h
@@ -54,4 +54,7 @@
 /* Can the device handle any descriptor layout? */
 #define VIRTIO_F_ANY_LAYOUT27
 
+/* v1.0 compliant. */
+#define VIRTIO_F_VERSION_1 32
+
 #endif /* _LINUX_VIRTIO_CONFIG_H */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 15/19] virtio-net: no writeable mac for virtio-1

2014-12-02 Thread Cornelia Huck
Devices operating as virtio 1.0 may not allow writes to the mac
address in config space.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d6d1b98..ebbea60 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -87,6 +87,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const 
uint8_t *config)
 memcpy(&netcfg, config, n->config_size);
 
 if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
+!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
 memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 memcpy(n->mac, netcfg.mac, ETH_ALEN);
 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 10/19] s390x/virtio-ccw: add virtio set-revision call

2014-12-02 Thread Cornelia Huck
From: Thomas Huth 

Handle the virtio-ccw revision according to what the guest sets.
When revision 1 is selected, we have a virtio-1 standard device
with byteswapping for the virtio rings.

When a channel gets disabled, we have to revert to the legacy behavior
in case the next user of the device does not negotiate the revision 1
anymore (e.g. the boot firmware uses revision 1, but the operating
system only uses the legacy mode).

Note that revisions > 0 are still disabled.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c |   52 +
 hw/s390x/virtio-ccw.h |5 +
 2 files changed, 57 insertions(+)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index e434718..5311d9f 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -20,9 +20,11 @@
 #include "hw/virtio/virtio-net.h"
 #include "hw/sysbus.h"
 #include "qemu/bitops.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/s390x/adapter.h"
 #include "hw/s390x/s390_flic.h"
+#include "linux/virtio_config.h"
 
 #include "ioinst.h"
 #include "css.h"
@@ -260,6 +262,12 @@ typedef struct VirtioThinintInfo {
 uint8_t isc;
 } QEMU_PACKED VirtioThinintInfo;
 
+typedef struct VirtioRevInfo {
+uint16_t revision;
+uint16_t length;
+uint8_t data[0];
+} QEMU_PACKED VirtioRevInfo;
+
 /* Specify where the virtqueues for the subchannel are in guest memory. */
 static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t addr, uint32_t align,
   uint16_t index, uint16_t num)
@@ -299,6 +307,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 {
 int ret;
 VqInfoBlock info;
+VirtioRevInfo revinfo;
 uint8_t status;
 VirtioFeatDesc features;
 void *config;
@@ -375,6 +384,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 features.features = (uint32_t)dev->host_features;
 } else if (features.index == 1) {
 features.features = (uint32_t)(dev->host_features >> 32);
+/*
+ * Don't offer version 1 to the guest if it did not
+ * negotiate at least revision 1.
+ */
+if (dev->revision <= 0) {
+features.features &= ~(1 << (VIRTIO_F_VERSION_1 - 32));
+}
 } else {
 /* Return zeroes if the guest supports more feature bits. */
 features.features = 0;
@@ -406,6 +422,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 (vdev->guest_features & 
0x) |
 features.features);
 } else if (features.index == 1) {
+/*
+ * The guest should not set version 1 if it didn't
+ * negotiate a revision >= 1.
+ */
+if (dev->revision <= 0) {
+features.features &= ~(1 << (VIRTIO_F_VERSION_1 - 32));
+}
 virtio_set_features(vdev,
 (vdev->guest_features & 
0x) |
 ((uint64_t)features.features << 32));
@@ -608,6 +631,25 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 }
 }
 break;
+case CCW_CMD_SET_VIRTIO_REV:
+len = sizeof(revinfo);
+if (ccw.count < len || (check_len && ccw.count > len)) {
+ret = -EINVAL;
+break;
+}
+if (!ccw.cda) {
+ret = -EFAULT;
+break;
+}
+cpu_physical_memory_read(ccw.cda, &revinfo, len);
+if (dev->revision >= 0 ||
+revinfo.revision > VIRTIO_CCW_REV_MAX) {
+ret = -ENOSYS;
+break;
+}
+ret = 0;
+dev->revision = revinfo.revision;
+break;
 default:
 ret = -ENOSYS;
 break;
@@ -615,6 +657,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 return ret;
 }
 
+static void virtio_sch_disable_cb(SubchDev *sch)
+{
+VirtioCcwDevice *dev = sch->driver_data;
+
+dev->revision = -1;
+}
+
 static int virtio_ccw_device_init(VirtioCcwDevice *dev, VirtIODevice *vdev)
 {
 unsigned int cssid = 0;
@@ -740,6 +789,7 @@ static int virtio_ccw_device_init(VirtioCcwDevice *dev, 
VirtIODevice *vdev)
 css_sch_build_virtual_schib(sch, 0, VIRTIO_CCW_CHPID_TYPE);
 
 sch->ccw_cb = virtio_ccw_cb;
+sch->disable_cb = virtio_sch_disable_cb;
 
 /* Build senseid data. */
 memset(&sch->id, 0, sizeof(SenseId));
@@ -747,6 +797,8 @@ static int virtio_ccw_device_init(VirtioCcwDevice *dev, 
VirtIODevice *vdev)
 sch->id.cu_type = VIRTIO_CCW_CU_TYPE;
 sch->id.cu_model = vdev->device_id;
 
+dev->revision = -1;
+
 /* Set default feature bits that are offered by the host. */
 dev->host_features = 0;
 virtio_add_feature(&dev->host_featur

[PATCH RFC v5 11/19] s390x/virtio-ccw: support virtio-1 set_vq format

2014-12-02 Thread Cornelia Huck
Support the new CCW_CMD_SET_VQ format for virtio-1 devices.

While we're at it, refactor the code a bit and enforce big endian
fields (which had always been required, even for legacy).

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c |  114 ++---
 1 file changed, 80 insertions(+), 34 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 5311d9f..75c9ff9 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -238,11 +238,20 @@ VirtualCssBus *virtual_css_bus_init(void)
 }
 
 /* Communication blocks used by several channel commands. */
-typedef struct VqInfoBlock {
+typedef struct VqInfoBlockLegacy {
 uint64_t queue;
 uint32_t align;
 uint16_t index;
 uint16_t num;
+} QEMU_PACKED VqInfoBlockLegacy;
+
+typedef struct VqInfoBlock {
+uint64_t desc;
+uint32_t res0;
+uint16_t index;
+uint16_t num;
+uint64_t avail;
+uint64_t used;
 } QEMU_PACKED VqInfoBlock;
 
 typedef struct VqConfigBlock {
@@ -269,17 +278,20 @@ typedef struct VirtioRevInfo {
 } QEMU_PACKED VirtioRevInfo;
 
 /* Specify where the virtqueues for the subchannel are in guest memory. */
-static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t addr, uint32_t align,
-  uint16_t index, uint16_t num)
+static int virtio_ccw_set_vqs(SubchDev *sch, VqInfoBlock *info,
+  VqInfoBlockLegacy *linfo)
 {
 VirtIODevice *vdev = virtio_ccw_get_vdev(sch);
+uint16_t index = info ? info->index : linfo->index;
+uint16_t num = info ? info->num : linfo->num;
+uint64_t desc = info ? info->desc : linfo->queue;
 
 if (index > VIRTIO_PCI_QUEUE_MAX) {
 return -EINVAL;
 }
 
 /* Current code in virtio.c relies on 4K alignment. */
-if (addr && (align != 4096)) {
+if (linfo && desc && (linfo->align != 4096)) {
 return -EINVAL;
 }
 
@@ -287,8 +299,12 @@ static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t 
addr, uint32_t align,
 return -EINVAL;
 }
 
-virtio_queue_set_addr(vdev, index, addr);
-if (!addr) {
+if (info) {
+virtio_queue_set_rings(vdev, index, desc, info->avail, info->used);
+} else {
+virtio_queue_set_addr(vdev, index, desc);
+}
+if (!desc) {
 virtio_queue_set_vector(vdev, index, 0);
 } else {
 /* Fail if we don't have a big enough queue. */
@@ -303,10 +319,66 @@ static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t 
addr, uint32_t align,
 return 0;
 }
 
-static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
+static int virtio_ccw_handle_set_vq(SubchDev *sch, CCW1 ccw, bool check_len,
+bool is_legacy)
 {
 int ret;
 VqInfoBlock info;
+VqInfoBlockLegacy linfo;
+size_t info_len = is_legacy ? sizeof(linfo) : sizeof(info);
+
+if (check_len) {
+if (ccw.count != info_len) {
+return -EINVAL;
+}
+} else if (ccw.count < info_len) {
+/* Can't execute command. */
+return -EINVAL;
+}
+if (!ccw.cda) {
+return -EFAULT;
+}
+if (is_legacy) {
+linfo.queue = ldq_be_phys(&address_space_memory, ccw.cda);
+linfo.align = ldl_be_phys(&address_space_memory,
+  ccw.cda + sizeof(linfo.queue));
+linfo.index = lduw_be_phys(&address_space_memory,
+   ccw.cda + sizeof(linfo.queue)
+   + sizeof(linfo.align));
+linfo.num = lduw_be_phys(&address_space_memory,
+ ccw.cda + sizeof(linfo.queue)
+ + sizeof(linfo.align)
+ + sizeof(linfo.index));
+ret = virtio_ccw_set_vqs(sch, NULL, &linfo);
+} else {
+info.desc = ldq_be_phys(&address_space_memory, ccw.cda);
+info.index = lduw_be_phys(&address_space_memory,
+  ccw.cda + sizeof(info.desc)
+  + sizeof(info.res0));
+info.num = lduw_be_phys(&address_space_memory,
+ccw.cda + sizeof(info.desc)
+  + sizeof(info.res0)
+  + sizeof(info.index));
+info.avail = ldq_be_phys(&address_space_memory,
+ ccw.cda + sizeof(info.desc)
+ + sizeof(info.res0)
+ + sizeof(info.index)
+ + sizeof(info.num));
+info.used = ldq_be_phys(&address_space_memory,
+ccw.cda + sizeof(info.desc)
++ sizeof(info.res0)
++ sizeof(info.index)
++ sizeof(info.num)
++ sizeof(info.avail));
+ret = virtio_ccw_set_vqs(sch, &info, NULL);
+}
+  

[PATCH RFC v5 03/19] virtio: feature bit manipulation helpers

2014-12-02 Thread Cornelia Huck
Add virtio_{add,clear}_feature helper functions for manipulating a
feature bits variable. This has some benefits over open coding:
- add check that the bit is in a sane range
- make it obvious at a glance what is going on
- have a central point to change when we want to extend feature bits

Convert existing code manipulating features to use the new helpers.

Signed-off-by: Cornelia Huck 
---
 hw/9pfs/virtio-9p-device.c  |2 +-
 hw/block/virtio-blk.c   |   16 
 hw/char/virtio-serial-bus.c |2 +-
 hw/net/virtio-net.c |   34 +-
 hw/s390x/virtio-ccw.c   |4 ++--
 hw/virtio/virtio-mmio.c |2 +-
 hw/virtio/virtio-pci.c  |4 ++--
 include/hw/virtio/virtio.h  |   12 
 8 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 2572747..30492ec 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -23,7 +23,7 @@
 
 static uint32_t virtio_9p_get_features(VirtIODevice *vdev, uint32_t features)
 {
-features |= 1 << VIRTIO_9P_MOUNT_TAG;
+virtio_add_feature(&features, VIRTIO_9P_MOUNT_TAG);
 return features;
 }
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index b19b102..3f76e2a 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -568,20 +568,20 @@ static uint32_t virtio_blk_get_features(VirtIODevice 
*vdev, uint32_t features)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
 
-features |= (1 << VIRTIO_BLK_F_SEG_MAX);
-features |= (1 << VIRTIO_BLK_F_GEOMETRY);
-features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
-features |= (1 << VIRTIO_BLK_F_BLK_SIZE);
-features |= (1 << VIRTIO_BLK_F_SCSI);
+virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
+virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
+virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
+virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
+virtio_add_feature(&features, VIRTIO_BLK_F_SCSI);
 
 if (s->conf.config_wce) {
-features |= (1 << VIRTIO_BLK_F_CONFIG_WCE);
+virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
 }
 if (blk_enable_write_cache(s->blk)) {
-features |= (1 << VIRTIO_BLK_F_WCE);
+virtio_add_feature(&features, VIRTIO_BLK_F_WCE);
 }
 if (blk_is_read_only(s->blk)) {
-features |= 1 << VIRTIO_BLK_F_RO;
+virtio_add_feature(&features, VIRTIO_BLK_F_RO);
 }
 
 return features;
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index a7b1b68..0f637db 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -474,7 +474,7 @@ static uint32_t get_features(VirtIODevice *vdev, uint32_t 
features)
 vser = VIRTIO_SERIAL(vdev);
 
 if (vser->bus.max_nr_ports > 1) {
-features |= (1 << VIRTIO_CONSOLE_F_MULTIPORT);
+virtio_add_feature(&features, VIRTIO_CONSOLE_F_MULTIPORT);
 }
 return features;
 }
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e574bd4..f1aa100 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -446,23 +446,23 @@ static uint32_t virtio_net_get_features(VirtIODevice 
*vdev, uint32_t features)
 VirtIONet *n = VIRTIO_NET(vdev);
 NetClientState *nc = qemu_get_queue(n->nic);
 
-features |= (1 << VIRTIO_NET_F_MAC);
+virtio_add_feature(&features, VIRTIO_NET_F_MAC);
 
 if (!peer_has_vnet_hdr(n)) {
-features &= ~(0x1 << VIRTIO_NET_F_CSUM);
-features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4);
-features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6);
-features &= ~(0x1 << VIRTIO_NET_F_HOST_ECN);
+virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);
 
-features &= ~(0x1 << VIRTIO_NET_F_GUEST_CSUM);
-features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO4);
-features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO6);
-features &= ~(0x1 << VIRTIO_NET_F_GUEST_ECN);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
 }
 
 if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
-features &= ~(0x1 << VIRTIO_NET_F_GUEST_UFO);
-features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
 }
 
 if (!get_vhost_net(nc->peer)) {
@@ -477,11 +477,11 @@ static uint32_t virtio_net_bad_features(VirtIODevice 
*vdev)
 
 /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
  * but also these: */
-features |= (1 << VIRTIO_NET_F_MAC);
-feat

[PATCH RFC v5 04/19] virtio: add feature checking helpers

2014-12-02 Thread Cornelia Huck
Add a helper function for checking whether a bit is set in the guest
features for a vdev as well as one that works on a feature bit set.

Convert code that open-coded this: It cleans up the code and makes it
easier to extend the guest feature bits.

Signed-off-by: Cornelia Huck 
---
 hw/block/virtio-blk.c   |7 ++-
 hw/char/virtio-serial-bus.c |2 +-
 hw/net/virtio-net.c |   23 +--
 hw/scsi/virtio-scsi.c   |8 
 hw/virtio/dataplane/vring.c |   10 +-
 hw/virtio/virtio-balloon.c  |2 +-
 hw/virtio/virtio.c  |   10 +-
 include/hw/virtio/virtio.h  |   11 +++
 8 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 3f76e2a..27f263a 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -590,7 +590,6 @@ static uint32_t virtio_blk_get_features(VirtIODevice *vdev, 
uint32_t features)
 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
-uint32_t features;
 
 if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER |
 VIRTIO_CONFIG_S_DRIVER_OK))) {
@@ -601,8 +600,6 @@ static void virtio_blk_set_status(VirtIODevice *vdev, 
uint8_t status)
 return;
 }
 
-features = vdev->guest_features;
-
 /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
  * cache flushes.  Thus, the "auto writethrough" behavior is never
  * necessary for guests that support the VIRTIO_BLK_F_CONFIG_WCE feature.
@@ -618,10 +615,10 @@ static void virtio_blk_set_status(VirtIODevice *vdev, 
uint8_t status)
  *
  * s->blk would erroneously be placed in writethrough mode.
  */
-if (!(features & (1 << VIRTIO_BLK_F_CONFIG_WCE))) {
+if (!virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) {
 aio_context_acquire(blk_get_aio_context(s->blk));
 blk_set_enable_write_cache(s->blk,
-   !!(features & (1 << VIRTIO_BLK_F_WCE)));
+   virtio_has_feature(vdev, VIRTIO_BLK_F_WCE));
 aio_context_release(blk_get_aio_context(s->blk));
 }
 }
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 0f637db..d49883f 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -75,7 +75,7 @@ static VirtIOSerialPort *find_port_by_name(char *name)
 static bool use_multiport(VirtIOSerial *vser)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(vser);
-return vdev->guest_features & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
+return virtio_has_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT);
 }
 
 static size_t write_to_port(VirtIOSerialPort *port,
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index f1aa100..9f3c58a 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -86,7 +86,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const 
uint8_t *config)
 
 memcpy(&netcfg, config, n->config_size);
 
-if (!(vdev->guest_features >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) &&
+if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
 memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 memcpy(n->mac, netcfg.mac, ETH_ALEN);
 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
@@ -305,7 +305,7 @@ static RxFilterInfo 
*virtio_net_query_rxfilter(NetClientState *nc)
 info->multicast_table = str_list;
 info->vlan_table = get_vlan_table(n);
 
-if (!((1 << VIRTIO_NET_F_CTRL_VLAN) & vdev->guest_features)) {
+if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
 info->vlan = RX_STATE_ALL;
 } else if (!info->vlan_table) {
 info->vlan = RX_STATE_NONE;
@@ -519,9 +519,12 @@ static void virtio_net_set_features(VirtIODevice *vdev, 
uint32_t features)
 VirtIONet *n = VIRTIO_NET(vdev);
 int i;
 
-virtio_net_set_multiqueue(n, !!(features & (1 << VIRTIO_NET_F_MQ)));
+virtio_net_set_multiqueue(n,
+  __virtio_has_feature(features, VIRTIO_NET_F_MQ));
 
-virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << 
VIRTIO_NET_F_MRG_RXBUF)));
+virtio_net_set_mrg_rx_bufs(n,
+   __virtio_has_feature(features,
+VIRTIO_NET_F_MRG_RXBUF));
 
 if (n->has_vnet_hdr) {
 n->curr_guest_offloads =
@@ -538,7 +541,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, 
uint32_t features)
 vhost_net_ack_features(get_vhost_net(nc->peer), features);
 }
 
-if ((1 << VIRTIO_NET_F_CTRL_VLAN) & features) {
+if (__virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
 memset(n->vlans, 0, MAX_VLAN >> 3);
 } else {
 memset(n->vlans, 0xff, MAX_VLAN >> 3);
@@ -585,7 +588,7 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t 
cmd,
 uint64_t offloads;
 size_t s;
 
-if (!((1 << VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &

[PATCH RFC v5 14/19] s390x/virtio-ccw: enable virtio 1.0

2014-12-02 Thread Cornelia Huck
virtio-ccw should now have everything in place to operate virtio 1.0
devices, so let's enable revision 1.

Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/s390x/virtio-ccw.h b/hw/s390x/virtio-ccw.h
index fe5c782..d40e3be 100644
--- a/hw/s390x/virtio-ccw.h
+++ b/hw/s390x/virtio-ccw.h
@@ -70,7 +70,7 @@ typedef struct VirtIOCCWDeviceClass {
 } VirtIOCCWDeviceClass;
 
 /* The maximum virtio revision we support. */
-#define VIRTIO_CCW_REV_MAX 0
+#define VIRTIO_CCW_REV_MAX 1
 
 /* Performance improves when virtqueue kick processing is decoupled from the
  * vcpu thread using ioeventfd for some devices. */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 02/19] virtio: cull virtio_bus_set_vdev_features

2014-12-02 Thread Cornelia Huck
The only user of this function was virtio-ccw, and it should use
virtio_set_features() like everybody else: We need to make sure
that bad features are masked out properly, which this function did
not do.

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  |3 +--
 hw/virtio/virtio-bus.c |   14 --
 include/hw/virtio/virtio-bus.h |3 ---
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index ea236c9..84f17bc 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -400,8 +400,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
ccw.cda + sizeof(features.features));
 features.features = ldl_le_phys(&address_space_memory, ccw.cda);
 if (features.index < ARRAY_SIZE(dev->host_features)) {
-virtio_bus_set_vdev_features(&dev->bus, features.features);
-vdev->guest_features = features.features;
+virtio_set_features(vdev, features.features);
 } else {
 /*
  * If the guest supports more feature bits, assert that it
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index eb77019..a8ffa07 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -109,20 +109,6 @@ uint32_t virtio_bus_get_vdev_features(VirtioBusState *bus,
 return k->get_features(vdev, requested_features);
 }
 
-/* Set the features of the plugged device. */
-void virtio_bus_set_vdev_features(VirtioBusState *bus,
-  uint32_t requested_features)
-{
-VirtIODevice *vdev = virtio_bus_get_device(bus);
-VirtioDeviceClass *k;
-
-assert(vdev != NULL);
-k = VIRTIO_DEVICE_GET_CLASS(vdev);
-if (k->set_features != NULL) {
-k->set_features(vdev, requested_features);
-}
-}
-
 /* Get bad features of the plugged device. */
 uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus)
 {
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index 0756545..0d2e7b4 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -84,9 +84,6 @@ size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus);
 /* Get the features of the plugged device. */
 uint32_t virtio_bus_get_vdev_features(VirtioBusState *bus,
 uint32_t requested_features);
-/* Set the features of the plugged device. */
-void virtio_bus_set_vdev_features(VirtioBusState *bus,
-  uint32_t requested_features);
 /* Get bad features of the plugged device. */
 uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus);
 /* Get config of the plugged device. */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC v5 05/19] virtio: support more feature bits

2014-12-02 Thread Cornelia Huck
With virtio-1, we support more than 32 feature bits. Let's extend both
host and guest features to 64, which should suffice for a while.

vhost and migration have been ignored for now.

Signed-off-by: Cornelia Huck 
---
 hw/9pfs/virtio-9p-device.c  |2 +-
 hw/block/virtio-blk.c   |2 +-
 hw/char/virtio-serial-bus.c |2 +-
 hw/net/virtio-net.c |   22 +--
 hw/s390x/s390-virtio-bus.c  |3 ++-
 hw/s390x/s390-virtio-bus.h  |2 +-
 hw/s390x/virtio-ccw.c   |   40 --
 hw/s390x/virtio-ccw.h   |5 +
 hw/scsi/vhost-scsi.c|3 +--
 hw/scsi/virtio-scsi.c   |4 ++--
 hw/virtio/virtio-balloon.c  |2 +-
 hw/virtio/virtio-bus.c  |6 ++---
 hw/virtio/virtio-mmio.c |4 ++--
 hw/virtio/virtio-pci.c  |3 ++-
 hw/virtio/virtio-pci.h  |2 +-
 hw/virtio/virtio-rng.c  |2 +-
 hw/virtio/virtio.c  |   13 ++-
 include/hw/qdev-properties.h|   11 ++
 include/hw/virtio/virtio-bus.h  |8 +++
 include/hw/virtio/virtio-net.h  |   46 +++
 include/hw/virtio/virtio-scsi.h |6 ++---
 include/hw/virtio/virtio.h  |   38 ++--
 22 files changed, 126 insertions(+), 100 deletions(-)

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 30492ec..60f9ff9 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -21,7 +21,7 @@
 #include "virtio-9p-coth.h"
 #include "hw/virtio/virtio-access.h"
 
-static uint32_t virtio_9p_get_features(VirtIODevice *vdev, uint32_t features)
+static uint64_t virtio_9p_get_features(VirtIODevice *vdev, uint64_t features)
 {
 virtio_add_feature(&features, VIRTIO_9P_MOUNT_TAG);
 return features;
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 27f263a..9cfae66 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -564,7 +564,7 @@ static void virtio_blk_set_config(VirtIODevice *vdev, const 
uint8_t *config)
 aio_context_release(blk_get_aio_context(s->blk));
 }
 
-static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
+static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
 
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index d49883f..2d2ed9c 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -467,7 +467,7 @@ static void handle_input(VirtIODevice *vdev, VirtQueue *vq)
 {
 }
 
-static uint32_t get_features(VirtIODevice *vdev, uint32_t features)
+static uint64_t get_features(VirtIODevice *vdev, uint64_t features)
 {
 VirtIOSerial *vser;
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 9f3c58a..d6d1b98 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -38,16 +38,16 @@
 (offsetof(container, field) + sizeof(((container *)0)->field))
 
 typedef struct VirtIOFeature {
-uint32_t flags;
+uint64_t flags;
 size_t end;
 } VirtIOFeature;
 
 static VirtIOFeature feature_sizes[] = {
-{.flags = 1 << VIRTIO_NET_F_MAC,
+{.flags = 1ULL << VIRTIO_NET_F_MAC,
  .end = endof(struct virtio_net_config, mac)},
-{.flags = 1 << VIRTIO_NET_F_STATUS,
+{.flags = 1ULL << VIRTIO_NET_F_STATUS,
  .end = endof(struct virtio_net_config, status)},
-{.flags = 1 << VIRTIO_NET_F_MQ,
+{.flags = 1ULL << VIRTIO_NET_F_MQ,
  .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
 {}
 };
@@ -441,7 +441,7 @@ static void virtio_net_set_queues(VirtIONet *n)
 
 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
 
-static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features)
+static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features)
 {
 VirtIONet *n = VIRTIO_NET(vdev);
 NetClientState *nc = qemu_get_queue(n->nic);
@@ -471,9 +471,9 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, 
uint32_t features)
 return vhost_net_get_features(get_vhost_net(nc->peer), features);
 }
 
-static uint32_t virtio_net_bad_features(VirtIODevice *vdev)
+static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
 {
-uint32_t features = 0;
+uint64_t features = 0;
 
 /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
  * but also these: */
@@ -496,7 +496,7 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
 !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
 }
 
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
 {
 static const uint64_t guest_offloads_mask =
 (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
@@ -514,7 +514,7 @@ static inline uint64_t 
virtio_net_supported_guest_offloads(VirtIONet *n)
 return virtio_net_guest_offloads_by_feat

[PATCH RFC v5 08/19] dataplane: allow virtio-1 devices

2014-12-02 Thread Cornelia Huck
Handle endianness conversion for virtio-1 virtqueues correctly.

Note that dataplane now needs to be built per-target.

Signed-off-by: Cornelia Huck 
---
 hw/block/dataplane/virtio-blk.c   |4 +-
 hw/scsi/virtio-scsi-dataplane.c   |2 +-
 hw/virtio/Makefile.objs   |2 +-
 hw/virtio/dataplane/Makefile.objs |2 +-
 hw/virtio/dataplane/vring.c   |   86 ++---
 include/hw/virtio/dataplane/vring-accessors.h |   75 +
 include/hw/virtio/dataplane/vring.h   |   14 +---
 7 files changed, 131 insertions(+), 54 deletions(-)
 create mode 100644 include/hw/virtio/dataplane/vring-accessors.h

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1222a37..2d8cc15 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -16,7 +16,9 @@
 #include "qemu/iov.h"
 #include "qemu/thread.h"
 #include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/dataplane/vring.h"
+#include "hw/virtio/dataplane/vring-accessors.h"
 #include "sysemu/block-backend.h"
 #include "hw/virtio/virtio-blk.h"
 #include "virtio-blk.h"
@@ -75,7 +77,7 @@ static void complete_request_vring(VirtIOBlockReq *req, 
unsigned char status)
 VirtIOBlockDataPlane *s = req->dev->dataplane;
 stb_p(&req->in->status, status);
 
-vring_push(&req->dev->dataplane->vring, &req->elem,
+vring_push(s->vdev, &req->dev->dataplane->vring, &req->elem,
req->qiov.size + sizeof(*req->in));
 
 /* Suppress notification to guest by BH and its scheduled
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 03a1e8c..418d73b 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -94,7 +94,7 @@ void virtio_scsi_vring_push_notify(VirtIOSCSIReq *req)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(req->vring->parent);
 
-vring_push(&req->vring->vring, &req->elem,
+vring_push(vdev, &req->vring->vring, &req->elem,
req->qsgl.size + req->resp_iov.size);
 
 if (vring_should_notify(vdev, &req->vring->vring)) {
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index d21c397..19b224a 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-y += virtio-rng.o
 common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 common-obj-y += virtio-bus.o
 common-obj-y += virtio-mmio.o
-common-obj-$(CONFIG_VIRTIO) += dataplane/
+obj-$(CONFIG_VIRTIO) += dataplane/
 
 obj-y += virtio.o virtio-balloon.o 
 obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
diff --git a/hw/virtio/dataplane/Makefile.objs 
b/hw/virtio/dataplane/Makefile.objs
index 9a8cfc0..753a9ca 100644
--- a/hw/virtio/dataplane/Makefile.objs
+++ b/hw/virtio/dataplane/Makefile.objs
@@ -1 +1 @@
-common-obj-y += vring.o
+obj-y += vring.o
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 6e283fc..a44c8c8 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -18,7 +18,9 @@
 #include "hw/hw.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/dataplane/vring.h"
+#include "hw/virtio/dataplane/vring-accessors.h"
 #include "qemu/error-report.h"
 
 /* vring_map can be coupled with vring_unmap or (if you still have the
@@ -83,7 +85,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
 
 vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
-vring->last_used_idx = vring->vr.used->idx;
+vring->last_used_idx = vring_get_used_idx(vdev, vring);
 vring->signalled_used = 0;
 vring->signalled_used_valid = false;
 
@@ -104,7 +106,7 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
 {
 if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
-vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
+vring_set_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
 }
 }
 
@@ -117,10 +119,10 @@ bool vring_enable_notification(VirtIODevice *vdev, Vring 
*vring)
 if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
 vring_avail_event(&vring->vr) = vring->vr.avail->idx;
 } else {
-vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+vring_clear_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
 }
 smp_mb(); /* ensure update is seen before reading avail_idx */
-return !vring_more_avail(vring);
+return !vring_more_avail(vdev, vring);
 }
 
 /* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
@@ -134,12 +136,13 @@ bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
 smp_mb();
 
 if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
-unlikely(vring->vr.avail->idx == vring->last_av

Re: [PATCH v3 0/3] kvm: vmx: enable xsaves for kvm

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 12:14, Wanpeng Li wrote:
> This patchset is to enable xsaves for kvm part, the patch for 
> qemu part will be sent out later. 
> 
> The patchset is tested on skylake-client.
> 
> v2 -> v3:
>  * add kvm_get/set for ia32_xss
>  * fix the type XSS_EXIT_BITMAP
>  * load host_xss just once in setup_vmcs_config
>  * add/clear auto switch ia32_xss msr in kvm_get/clear
>  * add VMX_XSS_EXIT_BITMAP macro
>  * add WARN() in handle_xsaves/xrstors
>  * export xsaves if related vmcs field is set 
> 
> v1 -> v2: 
>  * auto switch ia32_xss msr just if this msr is present 
> 
> Wanpeng Li (3):
>   kvm: x86: Intel XSAVES vmx and msr handle
>   kvm: vmx: add kvm_get/set logic to xsaves
>   kvm: x86: Enable Intel XSAVES for guest
> 
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/include/asm/vmx.h  |  3 +++
>  arch/x86/include/uapi/asm/vmx.h |  6 -
>  arch/x86/kvm/cpuid.c|  3 ++-
>  arch/x86/kvm/vmx.c  | 51 
> -
>  5 files changed, 62 insertions(+), 3 deletions(-)
> 

We need to return false from an svm_xsaves_supported function too, and 
we need to prevent setting MSR_IA32_XSS to any non-zero value because 
we do not support getting/setting a guest's Trace Packet Configuration 
State.

I will squash this hunk in, and send a new patch to return zero for
CPUID(0xd,i).ECX and CPUID(0xd,i).EDX.

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 66d1e3d0195e..6e3a4486749c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2671,6 +2671,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
+   /* The only supported bit as of Skylake is bit 8, but
+* it is not supported on KVM.
+*/
+   if (data != 0)
+   return 1;
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: cpuid: mask more bits in leaf 0xd and subleaves

2014-12-02 Thread Paolo Bonzini
- EAX=0Dh, ECX=1: output registers EBX/ECX/EDX are reserved.

- EAX=0Dh, ECX>1: output register ECX is zero for all the CPUID leaves
we support, because variable "supported" comes from XCR0 and not XSS.
However, only bits above 0 are reserved.  Output register EDX is reserved.

Source: Intel Architecture Instruction Set Extensions Programming
Reference, ref. number 319433-022

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/cpuid.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0d919bc33b02..b1366743a728 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -470,10 +470,17 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
goto out;
 
do_cpuid_1_ent(&entry[i], function, idx);
-   if (idx == 1)
+   if (idx == 1) {
entry[i].eax &= 
kvm_supported_word10_x86_features;
-   else if (entry[i].eax == 0 || !(supported & mask))
-   continue;
+   entry[i].ebx = 0;
+   entry[i].ecx = 0;
+   } else {
+   if (entry[i].eax == 0 || !(supported & mask))
+   continue;
+   WARN_ON_ONCE(entry[i].ecx & 1);
+   entry[i].ecx &= 1;
+   }
+   entry[i].edx = 0;
entry[i].flags |=
   KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM: arm64: guest debug, add support for single-step

2014-12-02 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 11:50:14AM +, Alex Bennée wrote:
> 
> Christoffer Dall  writes:
> 
> > On Tue, Nov 25, 2014 at 04:10:03PM +, Alex Bennée wrote:
> >> This adds support for single-stepping the guest. As userspace can and
> >> will manipulate guest registers before restarting any tweaking of the
> >> registers has to occur just before control is passed back to the guest.
> >> Furthermore while guest debugging is in effect we need to squash the
> >> ability of the guest to single-step itself as we have no easy way of
> >> re-entering the guest after the exception has been delivered to the
> >> hypervisor.
> >
> > Admittedly this is a corner case, but wouldn't the only really nasty bit
> > of this be to emulate the guest debug exception?
> 
> Well yes - currently this is all squashed by ignoring the guest's wishes
> while we are debugging (save for SW breakpoints).
> 
> >
> >> 
> >> Signed-off-by: Alex Bennée 
> >> 
> >> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> >> index 48d26bb..a76daae 100644
> >> --- a/arch/arm/kvm/arm.c
> >> +++ b/arch/arm/kvm/arm.c
> >> @@ -38,6 +38,7 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >>  #include 
> >>  #include 
> >>  #include 
> >> @@ -300,6 +301,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >>kvm_arm_set_running_vcpu(NULL);
> >>  }
> >>  
> >> +/**
> >> + * kvm_arch_vcpu_ioctl_set_guest_debug - Setup guest debugging
> >> + * @kvm:  pointer to the KVM struct
> >> + * @kvm_guest_debug: the ioctl data buffer
> >> + *
> >> + * This sets up the VM for guest debugging. Care has to be taken when
> >> + * manipulating guest registers as these will be set/cleared by the
> >> + * hyper-visor controller, typically before each kvm_run event. As a
> >
> > hypervisor
> >
> >> + * result modification of the guest registers needs to take place
> >> + * after they have been restored in the hyp.S trampoline code.
> >
> > I don't understand this??
> 
> We can't use GET/SET one reg to manipulate the registers we want as
> these are the guest visible versions and subject to modification by
> userspace. This is why the debugging code makes its changes after the
> guest state has been restored.
> 

eh, once you're in the KVM_RUN ioctl, user space can't fiddle your VCPU
regs because you're holding the vcpu mutex, so doing stuff in some
callout from kvm_arch_vcpu_ioctl_run() seems every bit as valid for this
case as doing it in EL2.  In fact, the only reason why we're doing
anything in EL2 is when you're accessing state only accessible in EL2,
when you need to write the whole thing in assembly (like the context
switch of GP registers) etc.

If it doesn't have huge performance costs, we should use C-code in EL1
to the furthest extent possible.

> >
> >> + */
> >>  int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
> >>struct kvm_guest_debug *dbg)
> >>  {
> >> @@ -317,8 +329,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct 
> >> kvm_vcpu *vcpu,
> >>  
> >>/* Single Step */
> >>if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
> >> -  kvm_info("SS requested, not yet implemented\n");
> >> -  return -EINVAL;
> >> +  kvm_info("SS requested\n");
> >> +  route_el2 = true;
> >>}
> >>  
> >>/* Software Break Points */
> >> diff --git a/arch/arm64/kernel/asm-offsets.c 
> >> b/arch/arm64/kernel/asm-offsets.c
> >> index 8da1043..78e5ae1 100644
> >> --- a/arch/arm64/kernel/asm-offsets.c
> >> +++ b/arch/arm64/kernel/asm-offsets.c
> >> @@ -121,6 +121,7 @@ int main(void)
> >>DEFINE(VCPU_FAR_EL2,offsetof(struct kvm_vcpu, 
> >> arch.fault.far_el2));
> >>DEFINE(VCPU_HPFAR_EL2,  offsetof(struct kvm_vcpu, 
> >> arch.fault.hpfar_el2));
> >>DEFINE(VCPU_DEBUG_FLAGS,offsetof(struct kvm_vcpu, 
> >> arch.debug_flags));
> >> +  DEFINE(GUEST_DEBUG, offsetof(struct kvm_vcpu, guest_debug));
> >>DEFINE(VCPU_HCR_EL2,offsetof(struct kvm_vcpu, 
> >> arch.hcr_el2));
> >>DEFINE(VCPU_MDCR_EL2,   offsetof(struct kvm_vcpu, arch.mdcr_el2));
> >>DEFINE(VCPU_IRQ_LINES,  offsetof(struct kvm_vcpu, arch.irq_lines));
> >> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> >> index 28dc92b..6def054 100644
> >> --- a/arch/arm64/kvm/handle_exit.c
> >> +++ b/arch/arm64/kvm/handle_exit.c
> >> @@ -91,6 +91,25 @@ static int kvm_handle_bkpt(struct kvm_vcpu *vcpu, 
> >> struct kvm_run *run)
> >>return 0;
> >>  }
> >>  
> >> +/**
> >> + * kvm_handle_ss - handle single step exceptions
> >> + *
> >> + * @vcpu: the vcpu pointer
> >> + *
> >> + * See: ARM ARM D2.12 for the details. While the host is routing debug
> >> + * exceptions to it's handlers we have to suppress the ability of the
> >
> > its handlers
> >
> >> + * guest to trigger exceptions.
> >
> > not really sure why this comment is here?  Does it really help anyone
> > reading this specific function or does it just confuse people more?
> >
> 

Re: [PATCH 6/7] KVM: arm64: re-factor hyp.S debug register code

2014-12-02 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 11:52:44AM +, Alex Bennée wrote:
> 
> Christoffer Dall  writes:
> 
> > On Tue, Nov 25, 2014 at 04:10:04PM +, Alex Bennée wrote:
> >> This is a pre-cursor to sharing the code with the guest debug support.
> >> This replaces the big macro that fishes data out of a fixed location
> >> with a more general helper macro to restore a set of debug registers. It
> >> uses macro substitution so it can be re-used for debug control and value
> >> registers. It does however rely on the debug registers being 64 bit
> >> aligned (as they happen to be in the hyp ABI).
> >
> > can you enforce that somewhere?
> 
> There is a comment in kvm_asm.h:
> 
> /*
>  * 0 is reserved as an invalid value.
>  * Order *must* be kept in sync with the hyp switch code.
>  */
> 
> But I'm not sure how to enforce it in assembly. Is there a #pragma or
> something I can use?
> 

You can add a BUG_ON somewhere at runtime, but I wouldn't bother, you
can stick a note in that existing comment just so people don't change
the declaration of the registers to be 32-bit aligned or something else.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] kvm: memslots lookup optimization

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 08:57, Igor Mammedov wrote:
>> On 01/12/2014 18:29, Igor Mammedov wrote:
>>> Series speed-ups GFN to memslot lookup time by:
>>>  * introducing LRU cache, which improves looukup time for
>>>same slot workload (typically boot time of Windows and Linux guest)
>>>  * switching to binary search for GFN to memslot lookup,
>>>improving lookup time with large amount of memory slots
>>>
>>> Igor Mammedov (5):
>>>   kvm: update_memslots: drop not needed check for the same number of
>>> pages
>>>   kvm: update_memslots: drop not needed check for the same slot
>>>   kvm: search_memslots: add simple LRU memslot caching
>>>   kvm: change memslot sorting rule from size to GFN
>>>   kvm: optimize GFN to memslot lookup with large slots amount
>>>
>>>  include/linux/kvm_host.h | 28 +++-
>>>  virt/kvm/kvm_main.c  | 46 
>>> ++
>>>  2 files changed, 49 insertions(+), 25 deletions(-)
>>>
>>
>> Applied patches 1-3 for now, I'm not in the mood for proving that the
>> binary search is correct. :)

Looks good, thanks.  Gleb, any objections?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on sytem exit events

2014-12-02 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 05:57:53PM +, Peter Maydell wrote:
> On 27 November 2014 at 23:10, Peter Maydell  wrote:
> > It seems odd to have this unmap happen on attempted system reset/powerdown,
> > not on cpu init/start.
> 
> Here's a concrete case that I think requires the unmap to be
> done on cpu init:
>  * start a VM and run it for a bit
>  * from the QEMU monitor, use "loadvm" to load a VM snapshot
> 
> This will cause QEMU to do a system reset (including calling
> VCPU_INIT to reset the CPUs), load the contents of guest
> RAM from the snapshot, set guest CPU registers with a pile
> of SET_ONE_REG calls, and then KVM_RUN to start the VM.
> 
> If we don't unmap stage2 on vcpu init,  then what in this
> sequence causes the icaches to be flushed so we execute
> the newly loaded ram contents rather than stale data
> from the first VM run?
> 

You're absolutely right that it makes more sense to stick it in
vcpu_init.  I put it only in the shutdown event handler for debugging
and forgot that was what I was doing :)

The only down-side is that we'll be trying to free memory that was never
mapped on initial startup, but it's not in the critical path and we
could add an explicit check to early-out if the vcpu has never been run,
which may increase code readability too (we already have that flag I
believe).

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Allocating dedicated RAM to host that guest can not use

2014-12-02 Thread Paolo Bonzini


On 27/11/2014 13:16, mad Engineer wrote:
> a random thought can we set qemu user/group rss to a particular hard
> limit in limits.conf

Those apply per process, not per user/group.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/5] arm/arm64: KVM: Correct KVM_ARM_VCPU_INIT power off option

2014-12-02 Thread Christoffer Dall
On Thu, Nov 27, 2014 at 10:44:29PM +, Peter Maydell wrote:
> On 27 November 2014 at 18:40, Christoffer Dall
>  wrote:
> > The implementation of KVM_ARM_VCPU_INIT is currently not doing what
> > userspace expects, namely making sure that a vcpu which may have been
> > turned off using PSCI is returned to its initial state, which would be
> > powered on if userspace does not set the KVM_ARM_VCPU_POWER_OFF flag.
> >
> > Implment the expected functionality and clarify the ABI.
> 
> ("Implement", if you have to respin.)
> 
> > diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> > index 9e193c8..4dcc8c2 100644
> > --- a/arch/arm/kvm/arm.c
> > +++ b/arch/arm/kvm/arm.c
> > @@ -663,6 +663,8 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct 
> > kvm_vcpu *vcpu,
> >  */
> > if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, 
> > vcpu->arch.features))
> > vcpu->arch.pause = true;
> > +   else
> > +   vcpu->arch.pause = false;
> 
> Out of curiosity, why do we have to test-and-clear the bit rather than
> just testing it?
> 
No reason, I think we used to do this when we were always testing the
flag directly instead of through the pause flag.

I'll add a change of this.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-02 Thread Boris Ostrovsky

On 12/01/2014 05:37 PM, Borislav Petkov wrote:

On Mon, Dec 01, 2014 at 05:31:56PM -0500, Boris Ostrovsky wrote:

I think so. The problem we have now is __pa() macro that we only use
on 32-bit. I'll queue this for overnight tests to make sure and if it
indeed works then 3.19 should be fine.

Cool, thanks.


All tests passed.


I'd still take your patch for 3.19 though because I'm fixing the 32-bit
reloading path properly and will remove the ifdef afterwards.

And even then, I'd like to prevent loading the module on a paravirt
guest if it is totally unneeded there.



I wonder whether we should prevent all guests (not just paravirt) from 
loading microcode driver (and from doing early microcode loading).


-boris
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v5 07/19] virtio: allow virtio-1 queue layout

2014-12-02 Thread Michael S. Tsirkin
On Tue, Dec 02, 2014 at 02:00:15PM +0100, Cornelia Huck wrote:
> For virtio-1 devices, we allow a more complex queue layout that doesn't
> require descriptor table and rings on a physically-contiguous memory area:
> add virtio_queue_set_rings() to allow transports to set this up.
> 
> Signed-off-by: Cornelia Huck 
> ---
>  hw/virtio/virtio.c |   16 
>  include/hw/virtio/virtio.h |2 ++
>  2 files changed, 18 insertions(+)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 8f69ffa..508dccf 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -96,6 +96,13 @@ static void virtqueue_init(VirtQueue *vq)
>  {
>  hwaddr pa = vq->pa;
>  
> +if (pa == -1ULL) {
> +/*
> + * This is a virtio-1 style vq that has already been setup
> + * in virtio_queue_set.
> + */
> +return;
> +}
>  vq->vring.desc = pa;
>  vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
>  vq->vring.used = vring_align(vq->vring.avail +
> @@ -717,6 +724,15 @@ hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
>  return vdev->vq[n].pa;
>  }
>  
> +void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
> +hwaddr avail, hwaddr used)
> +{
> +vdev->vq[n].pa = -1ULL;
> +vdev->vq[n].vring.desc = desc;
> +vdev->vq[n].vring.avail = avail;
> +vdev->vq[n].vring.used = used;
> +}
> +
>  void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
>  {
>  /* Don't allow guest to flip queue between existent and

pa == -1ULL tricks look quite ugly.
Can't we set desc/avail/used unconditionally, and drop
the pa value?

> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 68c40db..80ee313 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -224,6 +224,8 @@ void virtio_queue_set_addr(VirtIODevice *vdev, int n, 
> hwaddr addr);
>  hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
>  void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
>  int virtio_queue_get_num(VirtIODevice *vdev, int n);
> +void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
> +hwaddr avail, hwaddr used);
>  void virtio_queue_set_align(VirtIODevice *vdev, int n, int align);
>  void virtio_queue_notify(VirtIODevice *vdev, int n);
>  uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
> -- 
> 1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/5] arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI

2014-12-02 Thread Christoffer Dall
On Thu, Nov 27, 2014 at 10:53:50PM +, Peter Maydell wrote:
> On 27 November 2014 at 18:40, Christoffer Dall
>  wrote:
> > It is not clear that this ioctl can be called multiple times for a given
> > vcpu.  Userspace already does this, so clarify the ABI.
> >
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virtual/kvm/api.txt | 3 +++
> >  1 file changed, 3 insertions(+)
> >
> > diff --git a/Documentation/virtual/kvm/api.txt 
> > b/Documentation/virtual/kvm/api.txt
> > index bb82a90..fc12b4f 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2453,6 +2453,9 @@ return ENOEXEC for that vcpu.
> >  Note that because some registers reflect machine topology, all vcpus
> >  should be created before this ioctl is invoked.
> >
> > +Userspace can call this function multiple times for a given VCPU, which 
> > will
> > +reset the VCPU to its initial states.
> 
> How about being a little bit more explicit here with something like:
> 
> "Userspace can call this function multiple times for a given VCPU, including
> after the VCPU has been run. This will reset the VCPU to its initial
> state."

yeah, better.

> 
> (I notice that api.txt is inconsistent about using "vcpu" or "VCPU"
> or "vCPU"... do we have a preference for new text?)
> 

I generally try to match whatever the context is, but I clearly failed
here.  I don't think there's a preference, no.

> > +
> >  Possible features:
> > - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
> >   Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be powered 
> > on
> 
> Do you have to use the same set of feature flags for second and
> subsequent VCPU_INIT calls, or can they be different each time?
> 
That's a good question.  Do you have any opinion on the matter?

It seems weird to change the target of a Vcpu from some core to another
core, but there is not reason why you shouldn't be able to set a vCpU to
be powered off when run, just because it wasn't earlier on, is
there?

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Improve PSCI system events and fix reboot bugs

2014-12-02 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 02:34:12PM +0100, Andrew Jones wrote:
> On Thu, Nov 27, 2014 at 07:40:55PM +0100, Christoffer Dall wrote:
> > Several people have reported problems with rebooting ARM VMs, especially
> > on 32-bit ARM.  This is mainly due to the same reason we were seeing
> > boot errors in the past, namely that the ram, dcache, and icache weren't
> > coherent on guest boot with the guest (stage-1) MMU disabled.  We solved
> > this by ensuring coherency when we fault in pages, but since most memory
> > is already mapped after a reboot, we don't do anything.
> > 
> > The solution is to unmap the regular RAM on system events, but we must
> > take care to not unmap the GIC or other IO regions, hence the somewhat
> > complicated solution.
> > 
> > As part of figuring this out, it became clear that some semantics around
> > the KVM_ARM_VCPU_INIT ABI and system event ABI was unclear (what is
> > userspace expected to do when it receives a system event).  This series
> > also clarifies the ABI and changes the kernel functionality to do what
> > userspace expects (turn off VCPUs on a system shutdown event).
> > 
> > The code is available here as well:
> > http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
> > vcpu_init_fixes
> > 
> > There is an alternative version with more code reuse for what is patch 4
> > in this series available here:
> > http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
> > vcpu_init_fixes-alternative
> > 
> > See patch 4 for more info on this one.
> > 
> > Testing
> > ---
> > This has been tested on CubieBoard, Arndale, TC2, and Juno.  On Arndale
> > and TC2 it was extremely easy to reproduce the setup (just start a VM
> > that runs reboot from /etc/rc.local or similar) and this series clearly
> > fixes the behavior.
> 
> We've also seen reboots leading to a stuck vcpu. It appeared to be 100%
> reproducible on a freshly installed guest (first reboot after running
> the installer), and then intermittently afterwards. I've just tested
> this patch series, and it appears to resolve the issue. No stuck vcpu
> after install, and a reboot loop has been running for a while now. I'm
> testing on a mustang. If you like, feel free to add a
> 
> Tested-by: Andrew Jones 
> 
Thanks!
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v5 07/19] virtio: allow virtio-1 queue layout

2014-12-02 Thread Cornelia Huck
On Tue, 2 Dec 2014 16:46:28 +0200
"Michael S. Tsirkin"  wrote:

> On Tue, Dec 02, 2014 at 02:00:15PM +0100, Cornelia Huck wrote:
> > For virtio-1 devices, we allow a more complex queue layout that doesn't
> > require descriptor table and rings on a physically-contiguous memory area:
> > add virtio_queue_set_rings() to allow transports to set this up.
> > 
> > Signed-off-by: Cornelia Huck 
> > ---
> >  hw/virtio/virtio.c |   16 
> >  include/hw/virtio/virtio.h |2 ++
> >  2 files changed, 18 insertions(+)
> > 
> > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > index 8f69ffa..508dccf 100644
> > --- a/hw/virtio/virtio.c
> > +++ b/hw/virtio/virtio.c
> > @@ -96,6 +96,13 @@ static void virtqueue_init(VirtQueue *vq)
> >  {
> >  hwaddr pa = vq->pa;
> >  
> > +if (pa == -1ULL) {
> > +/*
> > + * This is a virtio-1 style vq that has already been setup
> > + * in virtio_queue_set.
> > + */
> > +return;
> > +}
> >  vq->vring.desc = pa;
> >  vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
> >  vq->vring.used = vring_align(vq->vring.avail +
> > @@ -717,6 +724,15 @@ hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
> >  return vdev->vq[n].pa;
> >  }
> >  
> > +void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
> > +hwaddr avail, hwaddr used)
> > +{
> > +vdev->vq[n].pa = -1ULL;
> > +vdev->vq[n].vring.desc = desc;
> > +vdev->vq[n].vring.avail = avail;
> > +vdev->vq[n].vring.used = used;
> > +}
> > +
> >  void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
> >  {
> >  /* Don't allow guest to flip queue between existent and
> 
> pa == -1ULL tricks look quite ugly.
> Can't we set desc/avail/used unconditionally, and drop
> the pa value?

And have virtio_queue_get_addr() return desc? Let me see if I can come
up with a patch.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-02 Thread Borislav Petkov
On Tue, Dec 02, 2014 at 09:36:40AM -0500, Boris Ostrovsky wrote:
> All tests passed.

Thanks!

> I wonder whether we should prevent all guests (not just paravirt) from
> loading microcode driver (and from doing early microcode loading).

I don't think the unmodified ones need to. At least I haven't seen any
issues so far.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on sytem exit events

2014-12-02 Thread Christoffer Dall
On Thu, Nov 27, 2014 at 11:10:14PM +, Peter Maydell wrote:
> On 27 November 2014 at 18:41, Christoffer Dall
>  wrote:
> > When a vcpu calls SYSTEM_OFF or SYSTEM_RESET with PSCI v0.2, the vcpus
> > should really be turned off for the VM adhering to the suggestions in
> > the PSCI spec, and it's the sane thing to do.
> >
> > Also, to ensure a coherent icache/dcache/ram situation when restarting
> > with the guest MMU off, flush all stage-2 page table entries so we start
> > taking aborts when the guest reboots, and flush/invalidate the necessary
> > cache lines.
> >
> > Clarify the behavior and expectations for arm/arm64 in the
> > KVM_EXIT_SYSTEM_EVENT case.
> >
> > Signed-off-by: Christoffer Dall 
> > ---
> >  Documentation/virtual/kvm/api.txt |  4 
> >  arch/arm/kvm/psci.c   | 18 ++
> >  arch/arm64/include/asm/kvm_host.h |  1 +
> >  3 files changed, 23 insertions(+)
> >
> > diff --git a/Documentation/virtual/kvm/api.txt 
> > b/Documentation/virtual/kvm/api.txt
> > index fc12b4f..c67e4956 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -2955,6 +2955,10 @@ HVC instruction based PSCI call from the vcpu. The 
> > 'type' field describes
> >  the system-level event type. The 'flags' field describes architecture
> >  specific flags for the system-level event.
> >
> > +In the case of ARM/ARM64, all vcpus will be powered off when requesting 
> > shutdown
> > +or reset, and it is the responsibility of userspace to reinitialize the 
> > vcpus
> > +using KVM_ARM_VCPU_INIT.
> 
> Heh, we're not even consistent within this patchseries about the 
> capitalisation
> of "vcpu" :-)
> 
> What happens if you try to KVM_RUN a CPU the kernel thinks is powered down?
> Does the kernel just say "ok, doing nothing"?

yes, it blocks the vcpu execution by putting the thread on a wait-queue.
That's exactly what happens for the secondary vcpus in an SMP guest
using PSCI.

> 
> Also, the clarification we want here should not I think be architecture
> specific -- the handling of the exit system event in QEMU is in common
> code. What you want to say is something like:
> 
> "Valid values for 'type' are:
>   KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
>VM. Userspace is not obliged to honour this, and if it does honour
>this does not need to destroy the VM synchronously (ie it may call
>KVM_RUN again before shutdown finally occurs).
>   KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM.
>As with SHUTDOWN, userspace is permitted to ignore the request, or
>to schedule the reset to occur in the future and may call KVM_RUN again."

ok, this is pretty good, but do we need to say that userspace is
permitted to do this or that?  The kernel never relies on user space for
correct functionality, so do you mean 'for the run a vm semantics to
still otherwise be functional'?

> 
> The corollary is that it's the kernel's job to deal with any impedance
> mismatch between this and whatever ABI like PSCI it's implementing, but
> that's fairly obvious so doesn't really need mentioning in the docs.

I didn't find it obvious (which is why I thought we'd spell it out), but
I agree that not mentioning it makes this arch-generic and we can put
the other stuff into a comment in arch/arm/kvm/psci.c.

> 
> (I'd like to claim that "the vcpus are powered off when requesting shutdown"
> is an implementation detail of this, not part of the API. I think we can
> get away with that...)
> 

ok

> > +
> > /* Fix the size of the union. */
> > char padding[256];
> > };
> > diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
> > index 09cf377..b4ab613 100644
> > --- a/arch/arm/kvm/psci.c
> > +++ b/arch/arm/kvm/psci.c
> > @@ -15,11 +15,13 @@
> >   * along with this program.  If not, see .
> >   */
> >
> > +#include 
> >  #include 
> >  #include 
> >
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >
> >  /*
> > @@ -166,6 +168,22 @@ static unsigned long 
> > kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
> >
> >  static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
> >  {
> > +   int i;
> > +   struct kvm_vcpu *tmp;
> > +
> > +   /* Stop all vcpus */
> > +   kvm_for_each_vcpu(i, tmp, vcpu->kvm)
> > +   tmp->arch.pause = true;
> > +   preempt_disable();
> > +   force_vm_exit(cpu_all_mask);
> > +   preempt_enable();
> > +
> > +   /*
> > +* Ensure a rebooted VM will fault in RAM pages and detect if the
> > +* guest MMU is turned off and flush the caches as needed.
> > +*/
> > +   stage2_unmap_vm(vcpu->kvm);
> 
> It seems odd to have this unmap happen on attempted system reset/powerdown,
> not on cpu init/start. (I seem to remember having this conversation on
> IRC, so maybe I've just forgotten why it has to be this way...)
> 

no, as I said in the other 

Re: [PATCH 3/5] arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI

2014-12-02 Thread Peter Maydell
On 2 December 2014 at 14:47, Christoffer Dall
 wrote:
> On Thu, Nov 27, 2014 at 10:53:50PM +, Peter Maydell wrote:
>> On 27 November 2014 at 18:40, Christoffer Dall
>>  wrote:
>> >  Possible features:
>> > - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
>> >   Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be 
>> > powered on
>>
>> Do you have to use the same set of feature flags for second and
>> subsequent VCPU_INIT calls, or can they be different each time?
>>
> That's a good question.  Do you have any opinion on the matter?

QEMU always will, so I'd be happy if we said it has to be the same
set of flags each time. I guess I'd go for "say they have to match";
we can always relax later if we need to.

> It seems weird to change the target of a Vcpu from some core to another
> core, but there is not reason why you shouldn't be able to set a vCpU to
> be powered off when run, just because it wasn't earlier on, is
> there?

We need an API for get/set of PSCI power state for migration
anyhow, so it's not inherently required to be able to flip
this bit on reset.

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v5 07/19] virtio: allow virtio-1 queue layout

2014-12-02 Thread Cornelia Huck
On Tue, 2 Dec 2014 15:54:44 +0100
Cornelia Huck  wrote:

> On Tue, 2 Dec 2014 16:46:28 +0200
> "Michael S. Tsirkin"  wrote:

> > pa == -1ULL tricks look quite ugly.
> > Can't we set desc/avail/used unconditionally, and drop
> > the pa value?
> 
> And have virtio_queue_get_addr() return desc? Let me see if I can come
> up with a patch.

I came up with the following (untested) patch, which should hopefully
suit mmio as well. I haven't cared about migration yet.

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 8f69ffa..ac3c615 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -69,7 +69,6 @@ typedef struct VRing
 struct VirtQueue
 {
 VRing vring;
-hwaddr pa;
 uint16_t last_avail_idx;
 /* Last used index value we have signalled on */
 uint16_t signalled_used;
@@ -92,12 +91,13 @@ struct VirtQueue
 };
 
 /* virt queue functions */
-static void virtqueue_init(VirtQueue *vq)
+static void virtqueue_update_rings(VirtQueue *vq)
 {
-hwaddr pa = vq->pa;
-
-vq->vring.desc = pa;
-vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
+if (!vq->vring.desc) {
+/* not yet setup -> nothing to do */
+return;
+}
+vq->vring.avail = vq->vring.desc + vq->vring.num * sizeof(VRingDesc);
 vq->vring.used = vring_align(vq->vring.avail +
  offsetof(VRingAvail, ring[vq->vring.num]),
  vq->vring.align);
@@ -605,7 +605,6 @@ void virtio_reset(void *opaque)
 vdev->vq[i].vring.avail = 0;
 vdev->vq[i].vring.used = 0;
 vdev->vq[i].last_avail_idx = 0;
-vdev->vq[i].pa = 0;
 vdev->vq[i].vector = VIRTIO_NO_VECTOR;
 vdev->vq[i].signalled_used = 0;
 vdev->vq[i].signalled_used_valid = false;
@@ -708,17 +707,34 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t 
addr, uint32_t data)
 
 void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
 {
-vdev->vq[n].pa = addr;
-virtqueue_init(&vdev->vq[n]);
+vdev->vq[n].vring.desc = addr;
+virtqueue_update_rings(&vdev->vq[n]);
 }
 
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
 {
-return vdev->vq[n].pa;
+return vdev->vq[n].vring.desc;
+}
+
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+hwaddr avail, hwaddr used)
+{
+vdev->vq[n].vring.desc = desc;
+vdev->vq[n].vring.avail = avail;
+vdev->vq[n].vring.used = used;
 }
 
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
 {
+/*
+ * For virtio-1 devices, the number of buffers may only be
+ * updated if the ring addresses have not yet been set up.
+ */
+if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
+vdev->vq[n].vring.desc) {
+error_report("tried to modify buffer num for virtio-1 device");
+return;
+}
 /* Don't allow guest to flip queue between existent and
  * nonexistent states, or to set it to an invalid size.
  */
@@ -728,7 +744,7 @@ void virtio_queue_set_num(VirtIODevice *vdev, int n, int 
num)
 return;
 }
 vdev->vq[n].vring.num = num;
-virtqueue_init(&vdev->vq[n]);
+virtqueue_update_rings(&vdev->vq[n]);
 }
 
 int virtio_queue_get_num(VirtIODevice *vdev, int n)
@@ -748,6 +764,11 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int 
align)
 BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 
+/* virtio-1 compliant devices cannot change the aligment */
+if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+error_report("tried to modify queue alignment for virtio-1 device");
+return;
+}
 /* Check that the transport told us it was going to do this
  * (so a buggy transport will immediately assert rather than
  * silently failing to migrate this state)
@@ -755,7 +776,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int 
align)
 assert(k->has_variable_vring_alignment);
 
 vdev->vq[n].vring.align = align;
-virtqueue_init(&vdev->vq[n]);
+virtqueue_update_rings(&vdev->vq[n]);
 }
 
 void virtio_queue_notify_vq(VirtQueue *vq)
@@ -949,7 +970,8 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
 if (k->has_variable_vring_alignment) {
 qemu_put_be32(f, vdev->vq[i].vring.align);
 }
-qemu_put_be64(f, vdev->vq[i].pa);
+/* XXX virtio-1 devices */
+qemu_put_be64(f, vdev->vq[i].vring.desc);
 qemu_put_be16s(f, &vdev->vq[i].last_avail_idx);
 if (k->save_queue) {
 k->save_queue(qbus->parent, i, f);
@@ -1044,13 +1066,14 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int 
version_id)
 if (k->has_variable_vring_alignment) {
 vdev->vq[i].vring.align = qemu_get_be32(f);
 }
-vdev->vq[i].pa = qemu_get_be64(f);
+vdev->vq[i].vring.desc = qemu_get_be64(f);
 qemu_get_be16s(f, &vdev->vq[i].last_avail_idx);
   

Re: [PATCH 5/5] arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on sytem exit events

2014-12-02 Thread Peter Maydell
On 2 December 2014 at 15:01, Christoffer Dall
 wrote:
> On Thu, Nov 27, 2014 at 11:10:14PM +, Peter Maydell wrote:
>> Also, the clarification we want here should not I think be architecture
>> specific -- the handling of the exit system event in QEMU is in common
>> code. What you want to say is something like:
>>
>> "Valid values for 'type' are:
>>   KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
>>VM. Userspace is not obliged to honour this, and if it does honour
>>this does not need to destroy the VM synchronously (ie it may call
>>KVM_RUN again before shutdown finally occurs).
>>   KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM.
>>As with SHUTDOWN, userspace is permitted to ignore the request, or
>>to schedule the reset to occur in the future and may call KVM_RUN again."
>
> ok, this is pretty good, but do we need to say that userspace is
> permitted to do this or that?  The kernel never relies on user space for
> correct functionality, so do you mean 'for the run a vm semantics to
> still otherwise be functional'?

I meant "permitted" in the sense of "the kernel won't kill the VM,
return errnos to subsequent KVM_RUN requests or otherwise treat
this userspace behaviour as buggy". If you want to rephrase it
somehow I don't object, as long as the docs make it clear that
it's a valid implementation strategy for userspace to do that.

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-02 Thread Alex Williamson
On Tue, 2014-12-02 at 08:52 +0100, Eric Auger wrote:
> On 12/02/2014 05:48 AM, Alex Williamson wrote:
> > On Tue, 2014-12-02 at 02:08 +, Wu, Feng wrote:
> >>
> >>> -Original Message-
> >>> From: Eric Auger [mailto:eric.au...@linaro.org]
> >>> Sent: Monday, December 01, 2014 6:10 PM
> >>> To: Alex Williamson
> >>> Cc: Wu, Feng; pbonz...@redhat.com; g...@kernel.org; kvm@vger.kernel.org
> >>> Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> >>> Posted-Interrupts
> >>>
> >>> On 11/25/2014 05:10 PM, Alex Williamson wrote:
>  On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
> > On 11/25/2014 01:23 PM, Feng Wu wrote:
> >> This patch adds and documents a new attribute
> >> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> >>> group.
> >> This new attribute is used for VT-d Posted-Interrupts.
> >>
> >> When guest OS changes the interrupt configuration for an
> >> assigned device, such as, MSI/MSIx data/address fields,
> >> QEMU will use this IRQ attribute to tell KVM to update the
> >> related IRTE according the VT-d Posted-Interrrupts Specification,
> >> such as, the guest vector should be updated in the related IRTE.
> >>
> >> Signed-off-by: Feng Wu 
> >> ---
> >>  Documentation/virtual/kvm/devices/vfio.txt |9 +
> >>  include/uapi/linux/kvm.h   |   10 ++
> >>  2 files changed, 19 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> >>> b/Documentation/virtual/kvm/devices/vfio.txt
> >> index f7aff29..39dee86 100644
> >> --- a/Documentation/virtual/kvm/devices/vfio.txt
> >> +++ b/Documentation/virtual/kvm/devices/vfio.txt
> >> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> >>> called to trigger the IRQ
> >>  or associate an eventfd to it. Unforwarding can only be called while 
> >> the
> >>  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> >>> condition is
> >>  not satisfied, the command returns an -EBUSY.
> >> +
> >> +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> >>> mechanism to post
> >> +   the IRQ to guests.
> >> +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr
> >>> struct.
> >> +
> >> +When guest OS changes the interrupt configuration for an assigned
> >>> device,
> >> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ 
> >> attribute
> >> +to tell KVM to update the related IRTE according the VT-d
> >>> Posted-Interrrupts
> >> +Specification, such as, the guest vector should be updated in the 
> >> related
> >>> IRTE.
> >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> index a269a42..e5f86ad 100644
> >> --- a/include/uapi/linux/kvm.h
> >> +++ b/include/uapi/linux/kvm.h
> >> @@ -949,6 +949,7 @@ struct kvm_device_attr {
> >>  #define  KVM_DEV_VFIO_DEVICE  2
> >>  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
> >>  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ   2
> >> +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3
> >>
> >>  enum kvm_device_type {
> >>KVM_DEV_TYPE_FSL_MPIC_20= 1,
> >> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> >>__u32 gsi; /* gsi, ie. virtual IRQ number */
> >>  };
> >>
> >>> Hi Feng, Alex,
> >>> I am currently reworking my code to use something closer to this struct.
> >>> Would you agree with following changes?
> >> +struct kvm_posted_intr {
> >>> kvm_posted_irq
> >>
> >> Hi Alex,
> >>
> >> Do you mean changing the structure name to "kvm_posted_irq"? I am okay
> >> If you think this name is also suitable for ARM forwarded irq. Or we can 
> >> find
> >> a more common name, such as "struct kvm_accel_irq", what is your opinion, 
> >> Alex?
> > 
> > I'd think something like struct kvm_vfio_dev_irq describes it fairly
> > well.
> ok for that name
> > 
> >> +  __u32   argsz;
> >> +  __u32   fd; /* file descriptor of the VFIO device */
> >> +  __u32   index;  /* VFIO device IRQ index */
> >> +  __u32   start;
> >> +  __u32   count;
> >> +  int virq[0];/* gsi, ie. virtual IRQ number */
> >>> __u32 gsi[];
> >>
> >> I think this change is okay to me. If Alex also agree, I will follow this 
> >> in the
> >> next post. 
> >>
> >> +};
> > Hi Feng,
> >
> > This struct could be used by arm code too. If Alex agrees I could use
> > that one instead. We just need to find a common sensible name
> 
>  Yep, the interface might as well support batch setup.  The vfio code
>  uses -1 for teardown if we want to avoid FORWARD vs UNFORWARD we could
>  let the data in the structure define which operation to do.
> >>>
> >>> In case we

Re: [PATCH] KVM: Introduce dynamically registered hypercall capability

2014-12-02 Thread Radim Krčmář
(tl;dr version at the bottom)

2014-12-01 15:43-0800, Phil White:
> On Mon, Dec 1, 2014 at 5:47 AM, Radim Krčmář  wrote:
> > 2014-11-28 17:29-0800, Phil White:
> >> Good questions.
> >>
> >> One thing that prompted this code is the presence and proliferation of
> >> architecture specific hypercalls in include/uapi/linux/kvm_para.h.
> >> I'm not sure why the tree has continued on for as long as it has with
> >> a list of reserved hypercall indices -- most of which are unused on
> >> any given architecture.  Is there a reason that I'm unaware of?
> >
> > Name-space won't be exhausted, so nothing forced them to separate and
> > centralization easily avoids conflicts with generic hypercalls.
> 
> Consider: All the arch specific defines were defined in asm/kvm_para.h.
> Each asm/kvm_para.h defines KVM_HC_ARCH_MAX as a relative
> index from which generic hypercalls ought to be applied (e.g.
> KVM_HC_ARCH_MAX+1 rather than 11).

Every hypercall is still hardcoded on host's side, so this is a harmful
complication.  (When compared with current state and pre-defined arch
space, like -1,-2,...)

The guest can deal with it quite easily, a special hypercall that
returns the first generic one, but there is no real reason to.

> This would at least organize the hypercalls and avoid a situation in which a
> number of hypercalls which are not applicable pollute the namespace.  I'll
> grant that the grounds here may be largely aesthetic.

(It is a commendable pursuit, working code definitely isn't enough.)

> The other worry is that institutionalization of this method will lead to a
> hesitance to associate a specific hypercall index with anything other than the
> function which it has been assigned in past kernel revisions.

Yes, we'd have keep "legacy" hypercalls.
Breaking old guests isn't what we want.

> In addition, this leads to a maintenance problem for anyone seeking to add a
> hypercall in the future in which their hypercalls will need to be
> updated in order
> to avoid collisions with the community as well as any other sources they may
> be dealing with.

(I think that this wouldn't increase the load on maintainers by much.)

> These are all minor headaches, but they can be avoided.  A registration
> method like this -- albeit somewhat more refined -- could be used to eliminate
> all of those headaches in my opinion.

What worries me is the hypercall negotiation ...
If we added truly dynamic hypercall numbers, then the guest would have
to agree with host on function/position of hypercalls.

This has a major drawback:  host and guest have no common definition for
hypercalls => they do not know what the other is talking about.
This can be solved by introducing a "hypercall protocol", which it is
just a more round-about way of having hardcoded ids ...

(You did that by having shared memory that exposed a structure that was
 decoded by your guest.)

> > I'd say that a virtio device is the way to go if you want to stay in the
> > kernel, but outside of kvm modules.  In which ways is virtio lacking?
> 
> Virtio has several limitations.  It implies a situation in which the system 
> has
> already booted.  Secondly, there's no easy way to access the kvm structure.
> Thirdly, it cannot be used effectively to implement an optimization for
> virtualization on a platform.  Fourthly, I believe it would require changes to
> qemu command lines -- and any associated tools which might be used to
> cobble together a qemu command line.

True, I misunderstood the scope of your modification.  I think it would
be "easier" to merge the paravirtualization into KVM+Linux ...

Calling your code by live-patching the hypercall handler could be
mentioned as an easy solution, but it has its problems ...
(A continued use of forked kernel is definitely the easiest.)

> A simple way of putting it, using the existing in-kernel code: I don't see how
> you could use virtio to map the powerpc magic page at bootup.

Agreed, and this code dwells in KVM modules because of that.

(I wasn't talking about existing hypercalls, just foreign modules.)

> >> It does occur to me that in the absence of the setup which I had
> >> available, one could simply treat hc_nr as a 4 character ID rather
> >> than a particular digit.
> >
> > (This would probably solve the situation in practice, but the conflict
> >  is still there, so design hasn't improved.)
> 
> I'm not sure which conflict you mean.  I presume you mean the possibility
> that two separate modules may attempt to claim the same hypercall index?

Yes, integer -> char[4] just switched from sequential assignment to a
"random" one and shrank the space.  (If people used a random generator
for new hypercall numbers, it would have a similar effect.)

> Presuming you do -- and I may be arguing a straw man here -- I'm not sure
> that's classifiable as a design flaw as no method occurs to me by which
> one could add the capability of dynamically registering a hypercall and have
> access to the capabilities I mentio

[PATCH] KVM: arm/arm64: vgic: add init entry to VGIC KVM device

2014-12-02 Thread Eric Auger
Since the advent of dynamic initialization of VGIC, this latter is
initialized very late, on the first vcpu run. This initialization
could be initiated much earlier by the user, as soon as it has
provided the requested dimensioning parameters:
- number of IRQs and number of vCPUs,
- DIST and CPU interface base address.

One motivation behind being able to initialize the VGIC sooner is
related to the setup of IRQ injection in VFIO use case. The VFIO
signaling, especially when used along with irqfd must be set *after*
vgic initialization to prevent any virtual IRQ injection before
VGIC initialization. If virtual IRQ injection occurs before the VGIC
init, the IRQ cannot be injected and subsequent injection is blocked
due to VFIO completion mechanism (unmask/mask or forward/unforward).

This patch adds a new entry to the VGIC KVM device that allows
the user to manually request the VGIC init:
- a new KVM_DEV_ARM_VGIC_GRP_CTRL group is introduced.
- Its first attribute is KVM_DEV_ARM_VGIC_CTRL_INIT

The rationale behind introducing a group is to be able to add other
controls later on, if needed.

Obviously, as soon as the init is done, the dimensioning parameters
cannot be changed.

Signed-off-by: Eric Auger 
---
 Documentation/virtual/kvm/devices/arm-vgic.txt | 11 +++
 arch/arm/include/uapi/asm/kvm.h|  2 ++
 arch/arm64/include/uapi/asm/kvm.h  |  2 ++
 virt/kvm/arm/vgic.c| 14 +-
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
b/Documentation/virtual/kvm/devices/arm-vgic.txt
index df8b0c7..80db43f 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -81,3 +81,14 @@ Groups:
 -EINVAL: Value set is out of the expected range
 -EBUSY: Value has already be set, or GIC has already been initialized
 with default values.
+
+  KVM_DEV_ARM_VGIC_GRP_CTRL
+  Attributes:
+KVM_DEV_ARM_VGIC_CTRL_INIT
+  request the initialization of the VGIC, no additional parameter in
+  kvm_device_attr.addr.
+  Errors:
+-ENXIO: distributor or CPU interface base address were not set prior
+to that call
+-EINVAL: number of vcpus is not known
+-ENOMEM: memory shortage when allocating vgic internal data
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 77547bb..2499867 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -175,6 +175,8 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xULL << 
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
 #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL   4
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT0
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 1ed4417..b35c95a 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -161,6 +161,8 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xULL << 
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
 #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
+#define KVM_DEV_ARM_VGIC_GRP_CTRL  4
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT   0
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index b76c38c..2fe5bdb 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -2474,7 +2474,14 @@ static int vgic_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
 
return ret;
}
-
+   case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+   switch (attr->attr) {
+   case KVM_DEV_ARM_VGIC_CTRL_INIT:
+   r = kvm_vgic_init(dev->kvm);
+   return r;
+   }
+   break;
+   }
}
 
return -ENXIO;
@@ -2553,6 +2560,11 @@ static int vgic_has_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
return vgic_has_attr_regs(vgic_cpu_ranges, offset);
case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
return 0;
+   case KVM_DEV_ARM_VGIC_GRP_CTRL:
+   switch (attr->attr) {
+   case KVM_DEV_ARM_VGIC_CTRL_INIT:
+   return 0;
+   }
}
return -ENXIO;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] kvm: optimize GFN to memslot lookup with large slots amount

2014-12-02 Thread Radim Krčmář
2014-12-01 17:29+, Igor Mammedov:
> Current linear search doesn't scale well when
> large amount of memslots is used and looked up slot
> is not in the beginning memslots array.
> Taking in account that memslots don't overlap, it's
> possible to switch sorting order of memslots array from
> 'npages' to 'base_gfn' and use binary search for
> memslot lookup by GFN.
> 
> As result of switching to binary search lookup times
> are reduced with large amount of memslots.
> 
> Following is a table of search_memslot() cycles
> during WS2008R2 guest boot.
> 
>  boot,  boot + ~10 min
>  mostly sameof using it,
>  slot lookuprandomized lookup
> max  averageaverage
> cycles   cycles cycles
> 
> 13 slots  : 1450   28   30
> 
> 13 slots  : 1400   30   40
> binary search
> 
> 117 slots : 13000  30   460
> 
> 117 slots : 2000   35   180
> binary search
> 
> Signed-off-by: Igor Mammedov 
> ---

Fast ... it looks like we don't even want to transform the list-in-array
into a tree-in-array to have multiplication instead of division.

Reviewed-by: Radim Krčmář 
(Actually, all patches.)

>  include/linux/kvm_host.h | 34 ++
>  virt/kvm/kvm_main.c  |  8 +++-
>  2 files changed, 29 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1a37144..193bca6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -354,6 +354,7 @@ struct kvm_memslots {
>   /* The mapping table from slot id to the index in memslots[]. */
>   short id_to_index[KVM_MEM_SLOTS_NUM];
>   atomic_t lru_slot;
> + int used_slots;
>  };
>  
>  struct kvm {
> @@ -791,19 +792,28 @@ static inline void kvm_guest_exit(void)
>  static inline struct kvm_memory_slot *
>  search_memslots(struct kvm_memslots *slots, gfn_t gfn)
>  {
> + int start = 0, end = slots->used_slots;
>   int slot = atomic_read(&slots->lru_slot);
> - struct kvm_memory_slot *memslot = &slots->memslots[slot];
> -
> - if (gfn >= memslot->base_gfn &&
> - gfn < memslot->base_gfn + memslot->npages)
> - return memslot;
> -
> - kvm_for_each_memslot(memslot, slots)
> - if (gfn >= memslot->base_gfn &&
> -   gfn < memslot->base_gfn + memslot->npages) {
> - atomic_set(&slots->lru_slot, memslot - slots->memslots);
> - return memslot;
> - }
> + struct kvm_memory_slot *memslots = slots->memslots;
> +
> + if (gfn >= memslots[slot].base_gfn &&
> + gfn < memslots[slot].base_gfn + memslots[slot].npages)
> + return &memslots[slot];
> +
> + while (start < end) {
> + slot = start + (end - start) / 2;
> +
> + if (gfn >= memslots[slot].base_gfn)

(Even thought division is costly, I think that checking here if 'slot'
 is the one we want wouldn't help very much.)

> + end = slot;
> + else
> + start = slot + 1;
> + }
> +
> + if (gfn >= memslots[start].base_gfn &&
> + gfn < memslots[start].base_gfn + memslots[start].npages) {
> + atomic_set(&slots->lru_slot, start);
> + return &memslots[start];
> + }
>  
>   return NULL;
>  }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 162817f..759af659 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -679,8 +679,14 @@ static void update_memslots(struct kvm_memslots *slots,
>   struct kvm_memory_slot *mslots = slots->memslots;
>  
>   WARN_ON(mslots[i].id != id);
> - if (!new->npages)
> + if (!new->npages) {
>   new->base_gfn = 0;
> + if (mslots[i].npages)
> + slots->used_slots--;
> + } else {
> + if (!mslots[i].npages)
> + slots->used_slots++;
> + }
>  
>   while (i < KVM_MEM_SLOTS_NUM - 1 &&
>  new->base_gfn <= mslots[i + 1].base_gfn) {
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] Revert "drivers/net: Disable UFO through virtio" in macvtap and tun

2014-12-02 Thread Vlad Yasevich
On 11/11/2014 12:12 PM, Ben Hutchings wrote:
> This reverts commit 88e0e0e5aa722b193c8758c8b45d041de5316924 for
> the tap drivers, but leaves UFO disabled in virtio_net.
> 
> libvirt at least assumes that tap features will never be dropped
> in new kernel versions, and doing so prevents migration of VMs to
> the newer kernel version while they are running with virtio net
> devices.
> 
> Fixes: 88e0e0e5aa7a ("drivers/net: Disable UFO through virtio")
> Signed-off-by: Ben Hutchings 
> ---
> Compile-tested only.

I ran some migrations tests of different guests between the hosts
with 3.17 and a newly patched kernel and they all worked for me.

Tested-by: Vladislav Yasevich 

-vlad

> 
> Ben.
> 
>  drivers/net/macvtap.c | 13 -
>  drivers/net/tun.c | 19 ---
>  2 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 6f226de..aeaeb6d 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -66,7 +66,7 @@ static struct cdev macvtap_cdev;
>  static const struct proto_ops macvtap_socket_ops;
>  
>  #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
> -   NETIF_F_TSO6)
> +   NETIF_F_TSO6 | NETIF_F_UFO)
>  #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
>  #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG)
>  
> @@ -570,8 +570,6 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
>   gso_type = SKB_GSO_TCPV6;
>   break;
>   case VIRTIO_NET_HDR_GSO_UDP:
> - pr_warn_once("macvtap: %s: using disabled UFO feature; 
> please fix this program\n",
> -  current->comm);
>   gso_type = SKB_GSO_UDP;
>   if (skb->protocol == htons(ETH_P_IPV6))
>   ipv6_proxy_select_ident(skb);
> @@ -619,6 +617,8 @@ static void macvtap_skb_to_vnet_hdr(const struct sk_buff 
> *skb,
>   vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
>   else if (sinfo->gso_type & SKB_GSO_TCPV6)
>   vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> + else if (sinfo->gso_type & SKB_GSO_UDP)
> + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
>   else
>   BUG();
>   if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> @@ -953,6 +953,9 @@ static int set_offload(struct macvtap_queue *q, unsigned 
> long arg)
>   if (arg & TUN_F_TSO6)
>   feature_mask |= NETIF_F_TSO6;
>   }
> +
> + if (arg & TUN_F_UFO)
> + feature_mask |= NETIF_F_UFO;
>   }
>  
>   /* tun/tap driver inverts the usage for TSO offloads, where
> @@ -963,7 +966,7 @@ static int set_offload(struct macvtap_queue *q, unsigned 
> long arg)
>* When user space turns off TSO, we turn off GSO/LRO so that
>* user-space will not receive TSO frames.
>*/
> - if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
> + if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
>   features |= RX_OFFLOADS;
>   else
>   features &= ~RX_OFFLOADS;
> @@ -1064,7 +1067,7 @@ static long macvtap_ioctl(struct file *file, unsigned 
> int cmd,
>   case TUNSETOFFLOAD:
>   /* let the user check for future flags */
>   if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> - TUN_F_TSO_ECN))
> + TUN_F_TSO_ECN | TUN_F_UFO))
>   return -EINVAL;
>  
>   rtnl_lock();
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 7302398..a0987d1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,7 +175,7 @@ struct tun_struct {
>   struct net_device   *dev;
>   netdev_features_t   set_features;
>  #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
> -   NETIF_F_TSO6)
> +   NETIF_F_TSO6|NETIF_F_UFO)
>  
>   int vnet_hdr_sz;
>   int sndbuf;
> @@ -1152,20 +1152,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
> struct tun_file *tfile,
>   skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
>   break;
>   case VIRTIO_NET_HDR_GSO_UDP:
> - {
> - static bool warned;
> -
> - if (!warned) {
> - warned = true;
> - netdev_warn(tun->dev,
> - "%s: using disabled UFO feature; 
> please fix this program\n",
> - current->comm);
> - }
>   skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
>   if (skb->protocol == htons(ETH_P_IPV

Re: [PATCH] KVM: arm/arm64: vgic: add init entry to VGIC KVM device

2014-12-02 Thread Peter Maydell
On 2 December 2014 at 17:27, Eric Auger  wrote:
> Since the advent of dynamic initialization of VGIC, this latter is
> initialized very late, on the first vcpu run. This initialization
> could be initiated much earlier by the user, as soon as it has
> provided the requested dimensioning parameters:
> - number of IRQs and number of vCPUs,
> - DIST and CPU interface base address.
>
> One motivation behind being able to initialize the VGIC sooner is
> related to the setup of IRQ injection in VFIO use case. The VFIO
> signaling, especially when used along with irqfd must be set *after*
> vgic initialization to prevent any virtual IRQ injection before
> VGIC initialization. If virtual IRQ injection occurs before the VGIC
> init, the IRQ cannot be injected and subsequent injection is blocked
> due to VFIO completion mechanism (unmask/mask or forward/unforward).

This implies that you're potentially injecting virtual IRQs
(and changing the state of the VGIC) before we actually
start running the VM (ie before userspace calls KVM_RUN).
Is that right? It seems odd, but maybe vfio works that way?

-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: arm/arm64: vgic: add init entry to VGIC KVM device

2014-12-02 Thread Eric Auger
On 12/02/2014 06:50 PM, Peter Maydell wrote:
> On 2 December 2014 at 17:27, Eric Auger  wrote:
>> Since the advent of dynamic initialization of VGIC, this latter is
>> initialized very late, on the first vcpu run. This initialization
>> could be initiated much earlier by the user, as soon as it has
>> provided the requested dimensioning parameters:
>> - number of IRQs and number of vCPUs,
>> - DIST and CPU interface base address.
>>
>> One motivation behind being able to initialize the VGIC sooner is
>> related to the setup of IRQ injection in VFIO use case. The VFIO
>> signaling, especially when used along with irqfd must be set *after*
>> vgic initialization to prevent any virtual IRQ injection before
>> VGIC initialization. If virtual IRQ injection occurs before the VGIC
>> init, the IRQ cannot be injected and subsequent injection is blocked
>> due to VFIO completion mechanism (unmask/mask or forward/unforward).
> 
> This implies that you're potentially injecting virtual IRQs
> (and changing the state of the VGIC) before we actually
> start running the VM (ie before userspace calls KVM_RUN).
> Is that right? It seems odd, but maybe vfio works that way?

Hi Peter,

as soon as VFIO signaling is set up (the device IRQ index is linked to
an eventfd, the physical IRQ VFIO handler is installed and the physical
IRQ is enabled at interrupt controller level), virtual IRQs are likely
to be injected. With current QEMU code, we setup this VFIO signaling
*before* the vgic readiness (either on machine init done or reset
notifier) and we face that issue of early injection. QEMU related
patches to follow ...

Best Regards

Eric
> 
> -- PMM
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-02 Thread Eric Auger
On 12/02/2014 05:02 PM, Alex Williamson wrote:
> On Tue, 2014-12-02 at 08:52 +0100, Eric Auger wrote:
>> On 12/02/2014 05:48 AM, Alex Williamson wrote:
>>> On Tue, 2014-12-02 at 02:08 +, Wu, Feng wrote:

> -Original Message-
> From: Eric Auger [mailto:eric.au...@linaro.org]
> Sent: Monday, December 01, 2014 6:10 PM
> To: Alex Williamson
> Cc: Wu, Feng; pbonz...@redhat.com; g...@kernel.org; kvm@vger.kernel.org
> Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> Posted-Interrupts
>
> On 11/25/2014 05:10 PM, Alex Williamson wrote:
>> On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
>>> On 11/25/2014 01:23 PM, Feng Wu wrote:
 This patch adds and documents a new attribute
 KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> group.
 This new attribute is used for VT-d Posted-Interrupts.

 When guest OS changes the interrupt configuration for an
 assigned device, such as, MSI/MSIx data/address fields,
 QEMU will use this IRQ attribute to tell KVM to update the
 related IRTE according the VT-d Posted-Interrrupts Specification,
 such as, the guest vector should be updated in the related IRTE.

 Signed-off-by: Feng Wu 
 ---
  Documentation/virtual/kvm/devices/vfio.txt |9 +
  include/uapi/linux/kvm.h   |   10 ++
  2 files changed, 19 insertions(+), 0 deletions(-)

 diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> b/Documentation/virtual/kvm/devices/vfio.txt
 index f7aff29..39dee86 100644
 --- a/Documentation/virtual/kvm/devices/vfio.txt
 +++ b/Documentation/virtual/kvm/devices/vfio.txt
 @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> called to trigger the IRQ
  or associate an eventfd to it. Unforwarding can only be called while 
 the
  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> condition is
  not satisfied, the command returns an -EBUSY.
 +
 +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrupts
> mechanism to post
 +   the IRQ to guests.
 +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr
> struct.
 +
 +When guest OS changes the interrupt configuration for an assigned
> device,
 +such as, MSI/MSIx data/address fields, QEMU will use this IRQ 
 attribute
 +to tell KVM to update the related IRTE according to the VT-d
> Posted-Interrupts
 +Specification, such as, the guest vector should be updated in the 
 related
> IRTE.
 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
 index a269a42..e5f86ad 100644
 --- a/include/uapi/linux/kvm.h
 +++ b/include/uapi/linux/kvm.h
 @@ -949,6 +949,7 @@ struct kvm_device_attr {
  #define  KVM_DEV_VFIO_DEVICE  2
  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ 1
  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ   2
 +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ 3

  enum kvm_device_type {
KVM_DEV_TYPE_FSL_MPIC_20= 1,
 @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
__u32 gsi; /* gsi, ie. virtual IRQ number */
  };

> Hi Feng, Alex,
> I am currently reworking my code to use something closer to this struct.
> Would you agree with following changes?
 +struct kvm_posted_intr {
> kvm_posted_irq

 Hi Alex,

 Do you mean changing the structure name to "kvm_posted_irq"? I am okay
 If you think this name is also suitable for ARM forwarded irq. Or we can 
 find
 a more common name, such as "struct kvm_accel_irq", what is your opinion, 
 Alex?
>>>
>>> I'd think something like struct kvm_vfio_dev_irq describes it fairly
>>> well.
>> ok for that name
>>>
 +  __u32   argsz;
 +  __u32   fd; /* file descriptor of the VFIO device */
 +  __u32   index;  /* VFIO device IRQ index */
 +  __u32   start;
 +  __u32   count;
 +  int virq[0];/* gsi, ie. virtual IRQ number */
> __u32 gsi[];

 I think this change is okay to me. If Alex also agree, I will follow this 
 in the
 next post. 

 +};
>>> Hi Feng,
>>>
>>> This struct could be used by arm code too. If Alex agrees I could use
>>> that one instead. We just need to find a common sensible name
>>
>> Yep, the interface might as well support batch setup.  The vfio code
>> uses -1 for teardown if we want to avoid FORWARD vs UNFORWARD we could
>> let the data in the structure def

Re: [PATCH 5/5] kvm: optimize GFN to memslot lookup with large slots amount

2014-12-02 Thread Paolo Bonzini


On 02/12/2014 18:33, Radim Krčmář wrote:
>> > +  while (start < end) {
>> > +  slot = start + (end - start) / 2;
>> > +
>> > +  if (gfn >= memslots[slot].base_gfn)
> (Even thought division is costly, I think that checking here if 'slot'
>  is the one we want wouldn't help very much.)
> 

Division by an unsigned is just a right shift.  Division by signed
integer is a right shift + conditional move.  We can change / 2 to
explicit >> 1, or change start and end to unsigned, or both.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/5] arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI

2014-12-02 Thread Christoffer Dall
On Tue, Dec 02, 2014 at 03:39:05PM +, Peter Maydell wrote:
> On 2 December 2014 at 14:47, Christoffer Dall
>  wrote:
> > On Thu, Nov 27, 2014 at 10:53:50PM +, Peter Maydell wrote:
> >> On 27 November 2014 at 18:40, Christoffer Dall
> >>  wrote:
> >> >  Possible features:
> >> > - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
> >> >   Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be 
> >> > powered on
> >>
> >> Do you have to use the same set of feature flags for second and
> >> subsequent VCPU_INIT calls, or can they be different each time?
> >>
> > That's a good question.  Do you have any opinion on the matter?
> 
> QEMU always will, so I'd be happy if we said it has to be the same
> set of flags each time. I guess I'd go for "say they have to match";
> we can always relax later if we need to.
> 
> > It seems weird to change the target of a Vcpu from some core to another
> > core, but there is not reason why you shouldn't be able to set a vCpU to
> > be powered off when run, just because it wasn't earlier on, is
> > there?
> 
> We need an API for get/set of PSCI power state for migration
> anyhow, so it's not inherently required to be able to flip
> this bit on reset.
> 
Actually I think the current migration patches rely on being able to
call the init ioctl to turn off a vcpu, but I guess you could use the
KVM_SET_MP_STATE for that.

Alex, any thoughts?

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v5 07/19] virtio: allow virtio-1 queue layout

2014-12-02 Thread Michael S. Tsirkin
On Tue, Dec 02, 2014 at 04:41:36PM +0100, Cornelia Huck wrote:
>  void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
>  {
> +/*
> + * For virtio-1 devices, the number of buffers may only be
> + * updated if the ring addresses have not yet been set up.

Where does it say that?

> + */
> +if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
> +vdev->vq[n].vring.desc) {
> +error_report("tried to modify buffer num for virtio-1 device");
> +return;
> +}
>  /* Don't allow guest to flip queue between existent and
>   * nonexistent states, or to set it to an invalid size.
>   */
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v13 10/11] pvqspinlock, x86: Enable PV qspinlock for KVM

2014-12-02 Thread Konrad Rzeszutek Wilk
On Wed, Oct 29, 2014 at 04:19:10PM -0400, Waiman Long wrote:
> This patch adds the necessary KVM specific code to allow KVM to
> support the CPU halting and kicking operations needed by the queue
> spinlock PV code.
> 
> Two KVM guests of 20 CPU cores (2 nodes) were created for performance
> testing in one of the following three configurations:
>  1) Only 1 VM is active
>  2) Both VMs are active and they share the same 20 physical CPUs
> (200% overcommit)
> 
> The tests run included the disk workload of the AIM7 benchmark on
> both ext4 and xfs RAM disks at 3000 users on a 3.17 based kernel. The
> "ebizzy -m" test and futextest was also run and its performance
> data were recorded.  With two VMs running, the "idle=poll" kernel
> option was added to simulate a busy guest. If PV qspinlock is not
> enabled, unfairlock will be used automatically in a guest.

What is the unfairlock? Isn't it just using a bytelock at this point?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM call for agenda for 2014-12-08

2014-12-02 Thread Juan Quintela

Hi

Please, send any topic that you are interested in covering.

Thanks, Juan.

By popular demand, a google calendar public entry with it

 
https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ

 (Let me know if you have any problems with the calendar entry)

If you need phone number details,  contact me privately

Thanks, Juan.

PD.  Use the google calendar entry to know the time, I gave up at getting
 three timezones right.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] kvm: optimize GFN to memslot lookup with large slots amount

2014-12-02 Thread Radim Krčmář
2014-12-02 19:45+0100, Paolo Bonzini:
> On 02/12/2014 18:33, Radim Krčmář wrote:
> >> > +while (start < end) {
> >> > +slot = start + (end - start) / 2;
> >> > +
> >> > +if (gfn >= memslots[slot].base_gfn)
> > (Even thought division is costly, I think that checking here if 'slot'
> >  is the one we want wouldn't help very much.)
> > 
> 
> Division by an unsigned is just a right shift.  Division by signed
> integer is a right shift + conditional move.  We can change / 2 to
> explicit >> 1, or change start and end to unsigned, or both.

My bad, no respectable optimizer would miss that.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[kvm-unit-tests PATCH] x86: emulator: Fix h_mem usage in tests_smsw

2014-12-02 Thread Chris J Arges
In emulator.c/tests_smsw, smsw (3) fails because h_mem isn't being set correctly
before smsw is called. By declaring the h_mem function parameter as volatile,
the compiler no longer optimizes out the assignment before smsw.

Signed-off-by: Chris J Arges 
---
 x86/emulator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x86/emulator.c b/x86/emulator.c
index 5aa4dbf..570628f 100644
--- a/x86/emulator.c
+++ b/x86/emulator.c
@@ -337,7 +337,7 @@ void test_incdecnotneg(void *mem)
 report("lock notb", *mb == vb);
 }
 
-void test_smsw(uint64_t *h_mem)
+void test_smsw(volatile uint64_t *h_mem)
 {
char mem[16];
unsigned short msw, msw_orig, *pmsw;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: cpuid: mask more bits in leaf 0xd and subleaves

2014-12-02 Thread Radim Krčmář
2014-12-02 14:09+0100, Paolo Bonzini:
> - EAX=0Dh, ECX=1: output registers EBX/ECX/EDX are reserved.

(As good as reserved without XSAVES/IA32_XSS.)

> - EAX=0Dh, ECX>1: output register ECX is zero for all the CPUID leaves
> we support, because variable "supported" comes from XCR0 and not XSS.
> However, only bits above 0 are reserved.  Output register EDX is reserved.

(Yes.  Well, EDX is 0 when the sub-leaf is invalid.)

> Source: Intel Architecture Instruction Set Extensions Programming
> Reference, ref. number 319433-022
> 
> Signed-off-by: Paolo Bonzini 
> --
>  arch/x86/kvm/cpuid.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 0d919bc33b02..b1366743a728 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -470,10 +470,17 @@ static inline int __do_cpuid_ent(struct 
> kvm_cpuid_entry2 *entry, u32 function,
>   goto out;
>  
>   do_cpuid_1_ent(&entry[i], function, idx);
> - if (idx == 1)
> + if (idx == 1) {
>   entry[i].eax &= 
> kvm_supported_word10_x86_features;
> - else if (entry[i].eax == 0 || !(supported & mask))
> - continue;
> + entry[i].ebx = 0;
> + entry[i].ecx = 0;
> + } else {
> + if (entry[i].eax == 0 || !(supported & mask))
> + continue;
> + WARN_ON_ONCE(entry[i].ecx & 1);
> + entry[i].ecx &= 1;

 ECX  Bit 0 is set if the sub-leaf index, n, maps to a valid bit in the
  IA32_XSS MSR and bit 0 is clear if n maps to a valid bit in XCR0.

ECX should be set to 0 instead, we definitely don't map to a valid bit
in IA32_XSS now.
(Having only one part of cpuid ready for it is weird ...)

> + }
> + entry[i].edx = 0;
>   entry[i].flags |=
>  KVM_CPUID_FLAG_SIGNIFCANT_INDEX;

(Unrelated, I have yet to understand how this flag translates
 * If ECX contains an invalid sub-leaf index, EAX/EBX/ECX/EDX return 0.)

>   ++*nent;

Forcing a change of the XSAVES implementation is a likely purpose of
this patch and it is correct after changing the ecx handling, so then,

Reviewed-by: Radim Krčmář 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 0/3] kvm: vmx: enable xsaves for kvm

2014-12-02 Thread Wanpeng Li
On Tue, Dec 02, 2014 at 02:00:24PM +0100, Paolo Bonzini wrote:
>
>
>On 02/12/2014 12:14, Wanpeng Li wrote:
>> This patchset is to enable xsaves for kvm part, the patch for 
>> qemu part will be sent out later. 
>> 
>> The patchset is tested on skylake-client.
>> 
>> v2 -> v3:
>>  * add kvm_get/set for ia32_xss
>>  * fix the type XSS_EXIT_BITMAP
>>  * load host_xss just once in setup_vmcs_config
>>  * add/clear auto switch ia32_xss msr in kvm_get/clear
>>  * add VMX_XSS_EXIT_BITMAP macro
>>  * add WARN() in handle_xsaves/xrstors
>>  * export xsaves if related vmcs field is set 
>> 
>> v1 -> v2: 
>>  * auto switch ia32_xss msr just if this msr is present 
>> 
>> Wanpeng Li (3):
>>   kvm: x86: Intel XSAVES vmx and msr handle
>>   kvm: vmx: add kvm_get/set logic to xsaves
>>   kvm: x86: Enable Intel XSAVES for guest
>> 
>>  arch/x86/include/asm/kvm_host.h |  2 ++
>>  arch/x86/include/asm/vmx.h  |  3 +++
>>  arch/x86/include/uapi/asm/vmx.h |  6 -
>>  arch/x86/kvm/cpuid.c|  3 ++-
>>  arch/x86/kvm/vmx.c  | 51 
>> -
>>  5 files changed, 62 insertions(+), 3 deletions(-)
>> 
>
>We need to return false from an svm_xsaves_supported function too, and 
>we need to prevent setting MSR_IA32_XSS to any non-zero value because 
>we do not support getting/setting a guest's Trace Packet Configuration 
>State.
>

Agreed, I will fix them in next version.

>I will squash this hunk in, and send a new patch to return zero for
>CPUID(0xd,i).ECX and CPUID(0xd,i).EDX.

Cool, I will test it.

Regards,
Wanpeng Li 

>
>diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>index 66d1e3d0195e..6e3a4486749c 100644
>--- a/arch/x86/kvm/vmx.c
>+++ b/arch/x86/kvm/vmx.c
>@@ -2671,6 +2671,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
>msr_data *msr_info)
>   case MSR_IA32_XSS:
>   if (!vmx_xsaves_supported())
>   return 1;
>+  /* The only supported bit as of Skylake is bit 8, but
>+   * it is not supported on KVM.
>+   */
>+  if (data != 0)
>+  return 1;
>   vcpu->arch.ia32_xss = data;
>   if (vcpu->arch.ia32_xss != host_xss)
>   add_atomic_switch_msr(vmx, MSR_IA32_XSS,
>
>Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v13 10/11] pvqspinlock, x86: Enable PV qspinlock for KVM

2014-12-02 Thread Thomas Gleixner
On Wed, 29 Oct 2014, Waiman Long wrote:
> AIM7 XFS Disk Test (no overcommit)
>   kernel JPMReal Time   Sys TimeUsr Time
>   -  ----   
>   PV ticketlock 25423737.08   98.95   5.44
>   PV qspinlock  25495757.06   98.63   5.40
>   unfairlock  26162796.91   97.05   5.42
> 
> AIM7 XFS Disk Test (200% overcommit)
>   kernel JPMReal Time   Sys TimeUsr Time
>   -  ----   
>   PV ticketlock 64446827.93  415.22   6.33
>   PV qspinlock  64562427.88  419.84   0.39

That number is made up by what? 

>   unfairlock  69551825.88  377.40   4.09
> 
> AIM7 EXT4 Disk Test (no overcommit)
>   kernel JPMReal Time   Sys TimeUsr Time
>   -  ----   
>   PV ticketlock 19955659.02  103.67   5.76
>   PV qspinlock  20111738.95  102.15   5.40
>   unfairlock  20665908.71   98.13   5.46
> 
> AIM7 EXT4 Disk Test (200% overcommit)
>   kernel JPMReal Time   Sys TimeUsr Time
>   -  ----   
>   PV ticketlock 47834137.63  495.81  30.78
>   PV qspinlock  47405837.97  475.74  30.95
>   unfairlock  56022432.13  398.43  26.27
> 
> For the AIM7 disk workload, both PV ticketlock and qspinlock have
> about the same performance. The unfairlock performs slightly better
> than the PV lock.

Slightly?

Taking the PV locks, which are basically the same for the existing
ticket locks and your new fangled qlocks as a reference then the so
called 'unfair locks' which are just the native locks w/o the PV
nonsense are fundamentally better up to a whopping 18% in the
ext4/200% overcommit case. See below.
 
> EBIZZY-m Test (no overcommit)
>   kernelRec/s   Real Time   Sys TimeUsr Time
>   - -   -   
>   PV ticketlock 3255  10.00   60.65   3.62
>   PV qspinlock  3318  10.00   54.27   3.60
>   unfairlock  2833  10.00   26.66   3.09
> 
> EBIZZY-m Test (200% overcommit)
>   kernelRec/s   Real Time   Sys TimeUsr Time
>   - -   -   
>   PV ticketlock  841  10.00   71.03   2.37
>   PV qspinlock   834  10.00   68.27   2.39
>   unfairlock   865  10.00   27.08   1.51
> 
>   futextest (no overcommit)
>   kernel   kops/s
>   ---
>   PV ticketlock11523
>   PV qspinlock 12328
>   unfairlock  9478
> 
>   futextest (200% overcommit)
>   kernel   kops/s
>   ---
>   PV ticketlock 7276
>   PV qspinlock  7095
>   unfairlock  5614
> 
> The ebizzy and futextest have much higher spinlock contention than
> the AIM7 disk workload. In this case, the unfairlock performs worse
> than both the PV ticketlock and qspinlock. The performance of the 2
> PV locks are comparable.

While I can see that the PV lock stuff performs 13% better for the
ebizzy no overcommit case, what about the very interresting numbers
for the same test with 200% overcommit?

The regular lock has a slightly better performance, but significantly
less sys/usr time. How do you explain that?

'Lies, damned lies and statistics' comes to my mind.

Thanks,

tglx
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 3/3] kvm: x86: Enable Intel XSAVES for guest

2014-12-02 Thread Wanpeng Li
Expose intel xsaves feature to guest.

Signed-off-by: Wanpeng Li 
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a4f5ac4..0d919bc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -267,6 +267,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+   unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
 
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -322,7 +323,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
-   F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);
+   F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 2/3] kvm: vmx: add kvm_get/set logic to xsaves

2014-12-02 Thread Wanpeng Li
Add kvm_get/set logic to xsaves.

Signed-off-by: Wanpeng Li 
---
 arch/x86/kvm/vmx.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 12915f1..ad1153a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -99,6 +99,8 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, 
bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+static u64 __read_mostly host_xss;
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON   \
@@ -2570,6 +2572,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+   case MSR_IA32_XSS:
+   if (!vmx_xsaves_supported())
+   return 1;
+   data = vcpu->arch.ia32_xss;
+   break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
@@ -2661,6 +2668,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return 1; /* they are read-only */
+   case MSR_IA32_XSS:
+   if (!vmx_xsaves_supported())
+   return 1;
+   /*
+* The only supported bit as of Skylake is bit 8, but
+* it is not supported on KVM.
+*/
+   if (data != 0)
+   return 1;
+   vcpu->arch.ia32_xss = data;
+   if (vcpu->arch.ia32_xss != host_xss)
+   add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+   vcpu->arch.ia32_xss, host_xss);
+   else
+   clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+   break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
return 1;
@@ -3020,6 +3043,9 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
}
}
 
+   if (cpu_has_xsaves)
+   rdmsrl(MSR_IA32_XSS, host_xss);
+
return 0;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/3] kvm: vmx: enable xsaves for kvm

2014-12-02 Thread Wanpeng Li
This patchset is to enable xsaves for kvm part and tested on skylake-client.

v3 -> v4:
 * return false from an svm_xsaves_supported function
 * prevent setting MSR_IA32_XSS to any non-zero value

v2 -> v3:
 * add kvm_get/set for ia32_xss
 * fix the type XSS_EXIT_BITMAP
 * load host_xss just once in setup_vmcs_config
 * add/clear auto switch ia32_xss msr in kvm_get/clear
 * add VMX_XSS_EXIT_BITMAP macro
 * add WARN() in handle_xsaves/xrstors
 * export xsaves if related vmcs field is set

v1 -> v2:
 * auto switch ia32_xss msr just if this msr is present

Wanpeng Li (3):
  kvm: vmx: Intel XSAVES vmx and msr handle
  kvm: vmx: add kvm_get/set logic to xsaves
  kvm: x86: Enable Intel XSAVES for guest

 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 -
 arch/x86/kvm/cpuid.c|  3 ++-
 arch/x86/kvm/svm.c  |  6 +
 arch/x86/kvm/vmx.c  | 57 -
 6 files changed, 74 insertions(+), 3 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 1/3] kvm: vmx: Intel XSAVES vmx and msr handle

2014-12-02 Thread Wanpeng Li
Intel xsaves vmx and msr handle.

Signed-off-by: Wanpeng Li 
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 +-
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx.c  | 31 ++-
 5 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2896dbc..0c4c88c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+   u64 ia32_xss;
 
/*
 * Paging state of the vcpu
@@ -771,6 +772,7 @@ struct kvm_x86_ops {
   enum x86_intercept_stage stage);
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
bool (*mpx_supported)(void);
+   bool (*xsaves_supported)(void);
 
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade..45afaee 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
+#define SECONDARY_EXEC_XSAVES  0x0010
 
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH   = 0x2023,
VMREAD_BITMAP   = 0x2026,
VMWRITE_BITMAP  = 0x2028,
+   XSS_EXIT_BITMAP = 0x202C,
+   XSS_EXIT_BITMAP_HIGH= 0x202D,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..b813bf9 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES  63
+#define EXIT_REASON_XRSTORS 64
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD,  "INVD" }, \
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
-   { EXIT_REASON_INVPCID,   "INVPCID" }
+   { EXIT_REASON_INVPCID,   "INVPCID" }, \
+   { EXIT_REASON_XSAVES,"XSAVES" }, \
+   { EXIT_REASON_XRSTORS,   "XRSTORS" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b411ad..41dd038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4127,6 +4127,11 @@ static bool svm_mpx_supported(void)
return false;
 }
 
+static bool svm_xsaves_supported(void)
+{
+   return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
return true;
@@ -4414,6 +4419,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.invpcid_supported = svm_invpcid_supported,
.mpx_supported = svm_mpx_supported,
+   .xsaves_supported = svm_xsaves_supported,
 
.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..12915f1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -761,6 +761,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -2895,7 +2896,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-   SECONDARY_EXEC_SHADOW_VMCS;
+   SECONDARY_EXEC_SHADOW_VMCS |
+   SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4337,6 +4339,7 @@ static void ept_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
+#define VMX_XSS_EXIT_BITMAP 0

Re: [PATCH v2 1/2] kvm: x86: revert mask out xsaves

2014-12-02 Thread Wanpeng Li
On Tue, Dec 02, 2014 at 09:55:09AM +0100, Paolo Bonzini wrote:
>
>
>On 02/12/2014 07:14, Wanpeng Li wrote:
>> xsaves will be exported to guest in the next patch, so revert the
>> mask out xsaves patch.
>> 
>> Signed-off-by: Wanpeng Li 
>> ---
>>  arch/x86/kvm/cpuid.c | 10 +-
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
>> index a4f5ac4..7af07571 100644
>> --- a/arch/x86/kvm/cpuid.c
>> +++ b/arch/x86/kvm/cpuid.c
>> @@ -320,10 +320,6 @@ static inline int __do_cpuid_ent(struct 
>> kvm_cpuid_entry2 *entry, u32 function,
>>  F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
>>  F(AVX512CD);
>>  
>> -/* cpuid 0xD.1.eax */
>> -const u32 kvm_supported_word10_x86_features =
>> -F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);
>
>Every single bit needs to be whitelisted independently of the others, so
>a full revert is not possible.  We need to be careful about post-Skylake
>processors introducing new bits in this cpuid leaf.
>
>Also, you cannot just add F(XSAVES) here; you must only do it if XSAVES
>is actually supported.  This makes sure that nested virtualization will
>_not_ present XSAVES to the guests until specific support is introduced
>for XSAVES and XRSTORS exits (including the XSS exit bitmap).

I will try to enable nested xsaves.

Regards,
Wanpeng Li 

>
>In order to do this, you have to introduce a new member in kvm_x86_ops,
>modeling what was done for MPX.  The second patch can then implement
>this new member.
>
>Thanks,
>
>Paolo
>
>>  /* all calls to cpuid_count() should be made on the same cpu */
>>  get_cpu();
>>  
>> @@ -460,8 +456,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
>> *entry, u32 function,
>>  entry->eax &= supported;
>>  entry->edx &= supported >> 32;
>>  entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
>> -if (!supported)
>> -break;
>>  
>>  for (idx = 1, i = 1; idx < 64; ++idx) {
>>  u64 mask = ((u64)1 << idx);
>> @@ -469,9 +463,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
>> *entry, u32 function,
>>  goto out;
>>  
>>  do_cpuid_1_ent(&entry[i], function, idx);
>> -if (idx == 1)
>> -entry[i].eax &= 
>> kvm_supported_word10_x86_features;
>> -else if (entry[i].eax == 0 || !(supported & mask))
>> +if (entry[i].eax == 0 || !(supported & mask))
>>  continue;
>>  entry[i].flags |=
>> KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
>> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] target-i386: Intel xsaves

2014-12-02 Thread Wanpeng Li
Add xsaves related definition, it also adds corresponding part 
to kvm_get/put, and vmstate.

Signed-off-by: Wanpeng Li 
---
v1 -> v2:
 * use a subsection instead of bumping the version number.

 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 15 +++
 target-i386/machine.c | 21 +
 3 files changed, 38 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 015f5b5..cff7433 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -389,6 +389,7 @@
 #define MSR_VM_HSAVE_PA 0xc0010117
 
 #define MSR_IA32_BNDCFGS0x0d90
+#define MSR_IA32_XSS0x0da0
 
 #define XSTATE_FP   (1ULL << 0)
 #define XSTATE_SSE  (1ULL << 1)
@@ -1019,6 +1020,7 @@ typedef struct CPUX86State {
 uint64_t xstate_bv;
 
 uint64_t xcr0;
+uint64_t xss;
 
 TPRAccess tpr_access_type;
 } CPUX86State;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ccf36e8..c6fc417 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -80,6 +80,7 @@ static bool has_msr_hv_hypercall;
 static bool has_msr_hv_vapic;
 static bool has_msr_hv_tsc;
 static bool has_msr_mtrr;
+static bool has_msr_xss;
 
 static bool has_msr_architectural_pmu;
 static uint32_t num_architectural_pmu_counters;
@@ -826,6 +827,10 @@ static int kvm_get_supported_msrs(KVMState *s)
 has_msr_bndcfgs = true;
 continue;
 }
+if (kvm_msr_list->indices[i] == MSR_IA32_XSS) {
+has_msr_xss = true;
+continue;
+}
 }
 }
 
@@ -1224,6 +1229,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
 if (has_msr_bndcfgs) {
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
 }
+if (has_msr_xss) {
+kvm_msr_entry_set(&msrs[n++], MSR_IA32_XSS, env->xss);
+}
 #ifdef TARGET_X86_64
 if (lm_capable_kernel) {
 kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1570,6 +1578,10 @@ static int kvm_get_msrs(X86CPU *cpu)
 if (has_msr_bndcfgs) {
 msrs[n++].index = MSR_IA32_BNDCFGS;
 }
+if (has_msr_xss) {
+msrs[n++].index = MSR_IA32_XSS;
+}
+
 
 if (!env->tsc_valid) {
 msrs[n++].index = MSR_IA32_TSC;
@@ -1717,6 +1729,9 @@ static int kvm_get_msrs(X86CPU *cpu)
 case MSR_IA32_BNDCFGS:
 env->msr_bndcfgs = msrs[i].data;
 break;
+case MSR_IA32_XSS:
+env->xss = msrs[i].data;
+break;
 default:
 if (msrs[i].index >= MSR_MC0_CTL &&
 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 1c13b14..722d62e 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -687,6 +687,24 @@ static const VMStateDescription vmstate_avx512 = {
 }
 };
 
+static bool xss_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = &cpu->env;
+
+return env->xss != 0;
+}
+
+static const VMStateDescription vmstate_xss = {
+.name = "cpu/xss",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(env.xss, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+
 VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
 .version_id = 12,
@@ -832,6 +850,9 @@ VMStateDescription vmstate_x86_cpu = {
 }, {
 .vmsd = &vmstate_avx512,
 .needed = avx512_needed,
+ }, {
+.vmsd = &vmstate_xss,
+.needed = xss_needed,
 } , {
 /* empty */
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-02 Thread Luis R. Rodriguez
On Tue, Dec 02, 2014 at 11:11:18AM +, David Vrabel wrote:
> On 01/12/14 22:36, Luis R. Rodriguez wrote:
> > 
> > Then I do agree its a fair analogy (and find this obviously odd that how
> > widespread cond_resched() is), we just don't have an equivalent for IRQ
> > context, why not avoid the special check then and use this all the time in 
> > the
> > middle of a hypercall on the return from an interrupt (e.g., the timer
> > interrupt)?
> 
> http://lists.xen.org/archives/html/xen-devel/2014-02/msg01101.html

OK thanks! That explains why we need some asm code but in that submission you
still also had used is_preemptible_hypercall(regs) and in the new
implementation you use a CPU variable xen_in_preemptible_hcall prior to calling
preempt_schedule_irq(). I believe you added the CPU variable because
preempt_schedule_irq() will preempt first without any checks if it should, I'm
asking why not do something like cond_resched_irq() where we check with
should_resched() prior to preempting and that way we can avoid having to use
the CPU variable?

  Luis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5] Fixes and improvements for HV KVM on PPC

2014-12-02 Thread Paul Mackerras
This series of patches is based on Alex Graf's kvm-ppc-queue branch
and is intended for the 3.19 merge window.  It starts by removing the
code to support HV KVM on PPC970 processors.  This code is hardly used
now since there are not many HV-capable PPC970 machines (Apple G5
machines are not HV-capable) and POWER8 systems capable of running HV
KVM are generally available now.

Then there is a fix for a potential endianness problem, an improvement
for the existing H_CONFER implementation, a real-mode H_RANDOM
implementation, and a small Kconfig change.  None of these should be
controversial with the possible exception of H_RANDOM - but now that
userspace has full control over whether the H_RANDOM handler is active
or not (via the KVM_CAP_PPC_ENABLE_HCALL capability) it will hopefully
be controversial no longer.

Thanks,
Paul.

 Documentation/virtual/kvm/api.txt|  17 ++
 arch/powerpc/include/asm/archrandom.h|  11 +-
 arch/powerpc/include/asm/kvm_book3s.h|   2 -
 arch/powerpc/include/asm/kvm_book3s_64.h |   1 -
 arch/powerpc/include/asm/kvm_host.h  |  17 +-
 arch/powerpc/include/asm/kvm_ppc.h   |   4 +-
 arch/powerpc/kernel/asm-offsets.c|   2 +-
 arch/powerpc/kvm/Kconfig |   1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c  | 200 ++---
 arch/powerpc/kvm/book3s_hv.c | 337 ++--
 arch/powerpc/kvm/book3s_hv_builtin.c | 151 +
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  39 +---
 arch/powerpc/kvm/book3s_hv_ras.c |   5 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 110 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 366 +++
 arch/powerpc/kvm/powerpc.c   |  13 +-
 arch/powerpc/platforms/powernv/rng.c |  25 +++
 include/uapi/linux/kvm.h |   1 +
 18 files changed, 341 insertions(+), 961 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] KVM: PPC: Book3S HV: Improve H_CONFER implementation

2014-12-02 Thread Paul Mackerras
From: Sam Bobroff 

Currently the H_CONFER hcall is implemented in kernel virtual mode,
meaning that whenever a guest thread does an H_CONFER, all the threads
in that virtual core have to exit the guest.  This is bad for
performance because it interrupts the other threads even if they
are doing useful work.

The H_CONFER hcall is called by a guest VCPU when it is spinning on a
spinlock and it detects that the spinlock is held by a guest VCPU that
is currently not running on a physical CPU.  The idea is to give this
VCPU's time slice to the holder VCPU so that it can make progress
towards releasing the lock.

To avoid having the other threads exit the guest unnecessarily,
we add a real-mode implementation of H_CONFER that checks whether
the other threads are doing anything.  If all the other threads
are idle (i.e. in H_CEDE) or trying to confer (i.e. in H_CONFER),
it returns H_TOO_HARD which causes a guest exit and allows the
H_CONFER to be handled in virtual mode.

Otherwise it spins for a short time (up to 10 microseconds) to give
other threads the chance to observe that this thread is trying to
confer.  The spin loop also terminates when any thread exits the guest
or when all other threads are idle or trying to confer.  If the
timeout is reached, the H_CONFER returns H_SUCCESS.  In this case the
guest VCPU will recheck the spinlock word and most likely call
H_CONFER again.

This also improves the implementation of the H_CONFER virtual mode
handler.  If the VCPU is part of a virtual core (vcore) which is
runnable, there will be a 'runner' VCPU which has taken responsibility
for running the vcore.  In this case we yield to the runner VCPU
rather than the target VCPU.

We also introduce a check on the target VCPU's yield count: if it
differs from the yield count passed to H_CONFER, the target VCPU
has run since H_CONFER was called and may have already released
the lock.  This check is required by PAPR.

Signed-off-by: Sam Bobroff 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/book3s_hv.c| 41 -
 arch/powerpc/kvm/book3s_hv_builtin.c| 32 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  2 +-
 4 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 771988d..19ff9ee 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,6 +289,7 @@ struct kvmppc_vcore {
ulong dpdes;/* doorbell state (POWER8) */
void *mpp_buffer; /* Micro Partition Prefetch buffer */
bool mpp_buffer_is_valid;
+   ulong conferring_threads;
 };
 
 #define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2d119ce..b404cc6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -604,10 +604,45 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, 
unsigned long mflags,
}
 }
 
+static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
+{
+   struct kvmppc_vcore *vcore = target->arch.vcore;
+
+   /*
+* We expect to have been called by the real mode handler
+* (kvmppc_rm_h_confer()) which would have directly returned
+* H_SUCCESS if the source vcore wasn't idle (e.g. if it may
+* have useful work to do and should not confer) so we don't
+* recheck that here.
+*/
+
+   spin_lock(&vcore->lock);
+   if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
+   vcore->vcore_state != VCORE_INACTIVE)
+   target = vcore->runner;
+   spin_unlock(&vcore->lock);
+
+   return kvm_vcpu_yield_to(target);
+}
+
+static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
+{
+   int yield_count = 0;
+   struct lppaca *lppaca;
+
+   spin_lock(&vcpu->arch.vpa_update_lock);
+   lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
+   if (lppaca)
+   yield_count = lppaca->yield_count;
+   spin_unlock(&vcpu->arch.vpa_update_lock);
+   return yield_count;
+}
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
unsigned long req = kvmppc_get_gpr(vcpu, 3);
unsigned long target, ret = H_SUCCESS;
+   int yield_count;
struct kvm_vcpu *tvcpu;
int idx, rc;
 
@@ -643,7 +678,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
ret = H_PARAMETER;
break;
}
-   kvm_vcpu_yield_to(tvcpu);
+   yield_count = kvmppc_get_gpr(vcpu, 5);
+   if (kvmppc_get_yield_count(tvcpu) != yield_count)
+   break;
+   kvm_arch_vcpu_yield_to(tvcpu);
break;
case H_REGISTER_VPA:
ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
@@ -1692,6 +1730,7 @@ static void kvmppc_

[PATCH 1/5] KVM: PPC: Book3S HV: Remove code for PPC970 processors

2014-12-02 Thread Paul Mackerras
This removes the code that was added to enable HV KVM to work
on PPC970 processors.  The PPC970 is an old CPU that doesn't
support virtualizing guest memory.  Removing PPC970 support also
lets us remove the code for allocating and managing contiguous
real-mode areas, the code for the !kvm->arch.using_mmu_notifiers
case, the code for pinning pages of guest memory when first
accessed and keeping track of which pages have been pinned, and
the code for handling H_ENTER hypercalls in virtual mode.

Book3S HV KVM is now supported only on POWER7 and POWER8 processors.
The KVM_CAP_PPC_RMA capability now always returns 0.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h|   2 -
 arch/powerpc/include/asm/kvm_book3s_64.h |   1 -
 arch/powerpc/include/asm/kvm_host.h  |  14 --
 arch/powerpc/include/asm/kvm_ppc.h   |   2 -
 arch/powerpc/kernel/asm-offsets.c|   1 -
 arch/powerpc/kvm/book3s_64_mmu_hv.c  | 200 ++---
 arch/powerpc/kvm/book3s_hv.c | 292 +++
 arch/powerpc/kvm/book3s_hv_builtin.c | 104 +--
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  39 +
 arch/powerpc/kvm/book3s_hv_ras.c |   5 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 110 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 245 +-
 arch/powerpc/kvm/powerpc.c   |  10 +-
 13 files changed, 70 insertions(+), 955 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 6acf0c2..942c7b1 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -170,8 +170,6 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, 
unsigned long addr,
unsigned long *nb_ret);
 extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
unsigned long gpa, bool dirty);
-extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
-   long pte_index, unsigned long pteh, unsigned long ptel);
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel,
pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index a37f1a4..2d81e20 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,7 +37,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu 
*svcpu)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
-extern unsigned long kvm_rma_pages;
 #endif
 
 #define VRMA_VSID  0x1ffUL /* 1TB VSID reserved for VRMA */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d243240..c567df6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -175,11 +175,6 @@ struct kvmppc_spapr_tce_table {
struct page *pages[0];
 };
 
-struct kvm_rma_info {
-   atomic_t use_count;
-   unsigned long base_pfn;
-};
-
 /* XICS components, defined in book3s_xics.c */
 struct kvmppc_xics;
 struct kvmppc_icp;
@@ -209,16 +204,9 @@ struct revmap_entry {
 #define KVMPPC_RMAP_PRESENT0x1ul
 #define KVMPPC_RMAP_INDEX  0xul
 
-/* Low-order bits in memslot->arch.slot_phys[] */
-#define KVMPPC_PAGE_ORDER_MASK 0x1f
-#define KVMPPC_PAGE_NO_CACHE   HPTE_R_I/* 0x20 */
-#define KVMPPC_PAGE_WRITETHRU  HPTE_R_W/* 0x40 */
-#define KVMPPC_GOT_PAGE0x80
-
 struct kvm_arch_memory_slot {
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
unsigned long *rmap;
-   unsigned long *slot_phys;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 };
 
@@ -237,14 +225,12 @@ struct kvm_arch {
struct kvm_rma_info *rma;
unsigned long vrma_slb_v;
int rma_setup_done;
-   int using_mmu_notifiers;
u32 hpt_order;
atomic_t vcpus_running;
u32 online_vcores;
unsigned long hpt_npte;
unsigned long hpt_mask;
atomic_t hpte_mod_interest;
-   spinlock_t slot_phys_lock;
cpumask_t need_tlb_flush;
int hpt_cma_alloc;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index dbd160f..6d15f49 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -170,8 +170,6 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, 
unsigned long liobn,
 unsigned long ioba, unsigned long tce);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba);
-extern struct kvm_rma_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvm_rma_info *ri);
 extern stru

[PATCH 5/5] KVM: PPC: Book3S: Enable in-kernel XICS emulation by default

2014-12-02 Thread Paul Mackerras
From: Anton Blanchard 

The in-kernel XICS emulation is faster than doing it all in QEMU
and it has got a lot of testing, so enable it by default.

Signed-off-by: Anton Blanchard 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 602eb51..f5769f1 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -172,6 +172,7 @@ config KVM_XICS
depends on KVM_BOOK3S_64 && !KVM_MPIC
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
+   default y
---help---
  Include support for the XICS (eXternal Interrupt Controller
  Specification) interrupt controller architecture used on
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation.

2014-12-02 Thread Paul Mackerras
From: Michael Ellerman 

Some PowerNV systems include a hardware random-number generator.
This HWRNG is present on POWER7+ and POWER8 chips and is capable of
generating one 64-bit random number every microsecond.  The random
numbers are produced by sampling a set of 64 unstable high-frequency
oscillators and are almost completely entropic.

PAPR defines an H_RANDOM hypercall which guests can use to obtain one
64-bit random sample from the HWRNG.  This adds a real-mode
implementation of the H_RANDOM hypercall.  This hypercall was
implemented in real mode because the latency of reading the HWRNG is
generally small compared to the latency of a guest exit and entry for
all the threads in the same virtual core.

Userspace can detect the presence of the HWRNG and the H_RANDOM
implementation by querying the KVM_CAP_PPC_HWRNG capability.  The
H_RANDOM hypercall implementation will only be invoked when the guest
does an H_RANDOM hypercall if userspace first enables the in-kernel
H_RANDOM implementation using the KVM_CAP_PPC_ENABLE_HCALL capability.

Signed-off-by: Michael Ellerman 
Signed-off-by: Paul Mackerras 
---
 Documentation/virtual/kvm/api.txt   |  17 +
 arch/powerpc/include/asm/archrandom.h   |  11 ++-
 arch/powerpc/include/asm/kvm_ppc.h  |   2 +
 arch/powerpc/kvm/book3s_hv_builtin.c|  15 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 115 
 arch/powerpc/kvm/powerpc.c  |   3 +
 arch/powerpc/platforms/powernv/rng.c|  25 +++
 include/uapi/linux/kvm.h|   1 +
 8 files changed, 187 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 7610eaa..87590fc 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3185,3 +3185,20 @@ userspace from doing that.
 If the hcall number specified is not one that has an in-kernel
 implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL
 error.
+
+
+8. Other capabilities.
+--
+
+This section lists capabilities that give information about other
+features of the KVM implementation.
+
+8.1 KVM_CAP_PPC_HWRNG
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that that the kernel has an implementation of the
+H_RANDOM hypercall backed by a hardware random-number generator.
+If present, the kernel H_RANDOM handler can be enabled for guest use
+with the KVM_CAP_PPC_ENABLE_HCALL capability.
diff --git a/arch/powerpc/include/asm/archrandom.h 
b/arch/powerpc/include/asm/archrandom.h
index bde5311..0cc6eed 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -30,8 +30,6 @@ static inline int arch_has_random(void)
return !!ppc_md.get_random_long;
 }
 
-int powernv_get_random_long(unsigned long *v);
-
 static inline int arch_get_random_seed_long(unsigned long *v)
 {
return 0;
@@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void)
 
 #endif /* CONFIG_ARCH_RANDOM */
 
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 6d15f49..1673cdd 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
return kvm->arch.kvm_ops == kvmppc_hv_ops;
 }
 
+extern int kvmppc_hwrng_present(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index 3e43f81..9811e21 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define KVM_CMA_CHUNK_ORDER18
 
@@ -170,3 +171,17 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
return 0;
 }
 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+   return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+   if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+   return H_SUCCESS;
+
+   return H_HARDWARE;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 0a2d64f..7230e11 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1819,6 +1819,121 @@ hcall_real_table:
.long   0   /* 0x12c */
.lo

[PATCH 2/5] KVM: PPC: Book3S HV: Fix endianness of instruction obtained from HEIR register

2014-12-02 Thread Paul Mackerras
There are two ways in which a guest instruction can be obtained from
the guest in the guest exit code in book3s_hv_rmhandlers.S.  If the
exit was caused by a Hypervisor Emulation interrupt (i.e. an illegal
instruction), the offending instruction is in the HEIR register
(Hypervisor Emulation Instruction Register).  If the exit was caused
by a load or store to an emulated MMIO device, we load the instruction
from the guest by turning data relocation on and loading the instruction
with an lwz instruction.

Unfortunately, in the case where the guest has opposite endianness to
the host, these two methods give results of different endianness, but
both get put into vcpu->arch.last_inst.  The HEIR value has been loaded
using guest endianness, whereas the lwz will load the instruction using
host endianness.  The rest of the code that uses vcpu->arch.last_inst
assumes it was loaded using host endianness.

To fix this, we define a new vcpu field to store the HEIR value.  Then,
in kvmppc_handle_exit_hv(), we transfer the value from this new field to
vcpu->arch.last_inst, doing a byte-swap if the guest and host endianness
differ.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h | 2 ++
 arch/powerpc/kernel/asm-offsets.c   | 1 +
 arch/powerpc/kvm/book3s_hv.c| 4 
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index c567df6..771988d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -645,6 +645,8 @@ struct kvm_vcpu_arch {
spinlock_t tbacct_lock;
u64 busy_stolen;
u64 busy_preempt;
+
+   u32 emul_inst;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 815212e..b14716b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -498,6 +498,7 @@ int main(void)
DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
+   DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a6d00a0..2d119ce 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -828,6 +828,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
 * Accordingly return to Guest or Host.
 */
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+   if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
+   vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
+   swab32(vcpu->arch.emul_inst) :
+   vcpu->arch.emul_inst;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
r = kvmppc_emulate_debug_inst(run, vcpu);
} else {
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c0f9e68..26a5b8d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -983,13 +983,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
stw r12,VCPU_TRAP(r9)
 
-   /* Save HEIR (HV emulation assist reg) in last_inst
+   /* Save HEIR (HV emulation assist reg) in emul_inst
   if this is an HEI (HV emulation interrupt, e40) */
li  r3,KVM_INST_FETCH_FAILED
cmpwi   r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
bne 11f
mfspr   r3,SPRN_HEIR
-11:stw r3,VCPU_LAST_INST(r9)
+11:stw r3,VCPU_HEIR(r9)
 
/* these are volatile across C function calls */
mfctr   r3
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND v2] target-i386: Intel xsaves

2014-12-02 Thread Wanpeng Li
Add xsaves-related definitions; also add the corresponding parts
to kvm_get/put and vmstate.

Signed-off-by: Wanpeng Li 
---
v1 -> v2:
 * use a subsection instead of bumping the version number.

 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 15 +++
 target-i386/machine.c | 21 +
 3 files changed, 38 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 015f5b5..cff7433 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -389,6 +389,7 @@
 #define MSR_VM_HSAVE_PA 0xc0010117
 
 #define MSR_IA32_BNDCFGS0x0d90
+#define MSR_IA32_XSS0x0da0
 
 #define XSTATE_FP   (1ULL << 0)
 #define XSTATE_SSE  (1ULL << 1)
@@ -1019,6 +1020,7 @@ typedef struct CPUX86State {
 uint64_t xstate_bv;
 
 uint64_t xcr0;
+uint64_t xss;
 
 TPRAccess tpr_access_type;
 } CPUX86State;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ccf36e8..c6fc417 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -80,6 +80,7 @@ static bool has_msr_hv_hypercall;
 static bool has_msr_hv_vapic;
 static bool has_msr_hv_tsc;
 static bool has_msr_mtrr;
+static bool has_msr_xss;
 
 static bool has_msr_architectural_pmu;
 static uint32_t num_architectural_pmu_counters;
@@ -826,6 +827,10 @@ static int kvm_get_supported_msrs(KVMState *s)
 has_msr_bndcfgs = true;
 continue;
 }
+if (kvm_msr_list->indices[i] == MSR_IA32_XSS) {
+has_msr_xss = true;
+continue;
+}
 }
 }
 
@@ -1224,6 +1229,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
 if (has_msr_bndcfgs) {
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
 }
+if (has_msr_xss) {
+kvm_msr_entry_set(&msrs[n++], MSR_IA32_XSS, env->xss);
+}
 #ifdef TARGET_X86_64
 if (lm_capable_kernel) {
 kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1570,6 +1578,10 @@ static int kvm_get_msrs(X86CPU *cpu)
 if (has_msr_bndcfgs) {
 msrs[n++].index = MSR_IA32_BNDCFGS;
 }
+if (has_msr_xss) {
+msrs[n++].index = MSR_IA32_XSS;
+}
+
 
 if (!env->tsc_valid) {
 msrs[n++].index = MSR_IA32_TSC;
@@ -1717,6 +1729,9 @@ static int kvm_get_msrs(X86CPU *cpu)
 case MSR_IA32_BNDCFGS:
 env->msr_bndcfgs = msrs[i].data;
 break;
+case MSR_IA32_XSS:
+env->xss = msrs[i].data;
+break;
 default:
 if (msrs[i].index >= MSR_MC0_CTL &&
 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 1c13b14..722d62e 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -687,6 +687,24 @@ static const VMStateDescription vmstate_avx512 = {
 }
 };
 
+static bool xss_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = &cpu->env;
+
+return env->xss != 0;
+}
+
+static const VMStateDescription vmstate_xss = {
+.name = "cpu/xss",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(env.xss, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+
 VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
 .version_id = 12,
@@ -832,6 +850,9 @@ VMStateDescription vmstate_x86_cpu = {
 }, {
 .vmsd = &vmstate_avx512,
 .needed = avx512_needed,
+ }, {
+.vmsd = &vmstate_xss,
+.needed = xss_needed,
 } , {
 /* empty */
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] powerpc: powernv: Return to cpu offline loop when finished in KVM guest

2014-12-02 Thread Paul Mackerras
When a secondary hardware thread has finished running a KVM guest, we
currently put that thread into nap mode using a nap instruction in
the KVM code.  This changes the code so that instead of doing a nap
instruction directly, we instead cause the call to power7_nap() that
put the thread into nap mode to return.  The reason for doing this is
to avoid having the KVM code having to know what low-power mode to
put the thread into.

In the case of a secondary thread used to run a KVM guest, the thread
will be offline from the point of view of the host kernel, and the
relevant power7_nap() call is the one in pnv_smp_cpu_disable().
In this case we don't want to clear pending IPIs in the offline loop
in that function, since that might cause us to miss the wakeup for
the next time the thread needs to run a guest.  To tell whether or
not to clear the interrupt, we use the SRR1 value returned from
power7_nap(), and check if it indicates an external interrupt.  We
arrange that the return from power7_nap() when we have finished running
a guest returns 0, so pending interrupts don't get flushed in that
case.

Note that it is important that a secondary thread that has finished
executing in the guest, or that didn't have a guest to run, should
not return to power7_nap's caller while the kvm_hstate.hwthread_req
flag in the PACA is non-zero, because the return from power7_nap
will reenable the MMU, and the MMU might still be in guest context.
In this situation we spin at low priority in real mode waiting for
hwthread_req to become zero.

Signed-off-by: Paul Mackerras 
---
I think this would be best going through the powerpc tree.  Alex,
if you can give me an acked-by for this that would be appreciated.

 arch/powerpc/include/asm/processor.h|  2 +-
 arch/powerpc/kernel/exceptions-64s.S|  2 ++
 arch/powerpc/kernel/idle_power7.S   | 12 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 54 ++---
 arch/powerpc/platforms/powernv/smp.c| 23 +++---
 5 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index dda7ac4..29c3798 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -451,7 +451,7 @@ extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
-extern void power7_nap(int check_irq);
+extern unsigned long power7_nap(int check_irq);
 extern void power7_sleep(void);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index a1d45c1..7f29c5f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -131,6 +131,8 @@ BEGIN_FTR_SECTION
 1:
 #endif
 
+   /* Return SRR1 from power7_nap() */
+   mfspr   r3,SPRN_SRR1
beq cr1,2f
b   power7_wakeup_noloss
 2: b   power7_wakeup_loss
diff --git a/arch/powerpc/kernel/idle_power7.S 
b/arch/powerpc/kernel/idle_power7.S
index c0754bb..18c0687 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -212,6 +212,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
mtspr   SPRN_SRR0,r5
rfid
 
+/*
+ * R3 here contains the value that will be returned to the caller
+ * of power7_nap.
+ */
 _GLOBAL(power7_wakeup_loss)
ld  r1,PACAR1(r13)
 BEGIN_FTR_SECTION
@@ -219,15 +223,19 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
REST_NVGPRS(r1)
REST_GPR(2, r1)
-   ld  r3,_CCR(r1)
+   ld  r6,_CCR(r1)
ld  r4,_MSR(r1)
ld  r5,_NIP(r1)
addir1,r1,INT_FRAME_SIZE
-   mtcrr3
+   mtcrr6
mtspr   SPRN_SRR1,r4
mtspr   SPRN_SRR0,r5
rfid
 
+/*
+ * R3 here contains the value that will be returned to the caller
+ * of power7_nap.
+ */
 _GLOBAL(power7_wakeup_noloss)
lbz r0,PACA_NAPSTATELOST(r13)
cmpwi   r0,0
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index edb2ccd..65c105b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -201,8 +201,6 @@ kvmppc_primary_no_guest:
bge kvm_novcpu_exit /* another thread already exiting */
li  r3, NAPPING_NOVCPU
stb r3, HSTATE_NAPPING(r13)
-   li  r3, 1
-   stb r3, HSTATE_HWTHREAD_REQ(r13)
 
b   kvm_do_nap
 
@@ -293,6 +291,8 @@ kvm_start_guest:
/* if we have no vcpu to run, go back to sleep */
beq kvm_no_guest
 
+kvm_secondary_got_guest:
+
/* Set HSTATE_DSCR(r13) to something sensible */
ld  r6, PACA_DSCR(r13)
std r6, HSTATE_DSCR(r13)
@@ -318,27 +318,46 @@ kvm_start_guest:
stwcx.  r3, 0, r4

Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-02 Thread Juergen Gross

On 12/03/2014 03:28 AM, Luis R. Rodriguez wrote:

On Tue, Dec 02, 2014 at 11:11:18AM +, David Vrabel wrote:

On 01/12/14 22:36, Luis R. Rodriguez wrote:


Then I do agree its a fair analogy (and find this obviously odd that how
widespread cond_resched() is), we just don't have an equivalent for IRQ
context, why not avoid the special check then and use this all the time in the
middle of a hypercall on the return from an interrupt (e.g., the timer
interrupt)?


http://lists.xen.org/archives/html/xen-devel/2014-02/msg01101.html


OK thanks! That explains why we need some asm code but in that submission you
still also had used is_preemptible_hypercall(regs) and in the new
implementation you use a CPU variable xen_in_preemptible_hcall prior to calling
preempt_schedule_irq(). I believe you added the CPU variable because
preempt_schedule_irq() will preempt first without any checks if it should, I'm
asking why not do something like cond_resched_irq() where we check with
should_resched() prior to preempting and that way we can avoid having to use
the CPU variable?


Because that could preempt at any asynchronous interrupt making the
no-preempt kernel fully preemptive. How would you know you are just
doing a critical hypercall which should be preempted?

Juergen

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 16/25] KVM: make kvm_set_msi_irq() public

2014-12-02 Thread Feng Wu
Make kvm_set_msi_irq() public, we can use this function outside.

Signed-off-by: Feng Wu 
---
 include/linux/kvm_host.h |2 ++
 virt/kvm/irq_comm.c  |2 +-
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cfa85ac..5cd4420 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -785,6 +785,8 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
   struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+struct kvm_lapic_irq *irq);
 
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index f3c5d69..231671a 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -106,7 +106,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct 
kvm_lapic *src,
return r;
 }
 
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
   struct kvm_lapic_irq *irq)
 {
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 25/25] iommu/vt-d: Add a command line parameter for VT-d posted-interrupts

2014-12-02 Thread Feng Wu
Enable VT-d Posted-Interrupts and add a command line
parameter for it.

Signed-off-by: Feng Wu 
---
 Documentation/kernel-parameters.txt |1 +
 drivers/iommu/irq_remapping.c   |   12 
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 838f377..324b790 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1453,6 +1453,7 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
nosid   disable Source ID checking
no_x2apic_optout
BIOS x2APIC opt-out request will be ignored
+   nopost  disable Interrupt Posting
 
iomem=  Disable strict checking of access to MMIO memory
strict  regions from userspace.
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index b008663..aa3cd23 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -24,7 +24,7 @@ int irq_remap_broken;
 int disable_sourceid_checking;
 int no_x2apic_optout;
 
-int disable_irq_post = 1;
+int disable_irq_post = 0;
 
 static struct irq_remap_ops *remap_ops;
 
@@ -59,14 +59,18 @@ static __init int setup_irqremap(char *str)
return -EINVAL;
 
while (*str) {
-   if (!strncmp(str, "on", 2))
+   if (!strncmp(str, "on", 2)) {
disable_irq_remap = 0;
-   else if (!strncmp(str, "off", 3))
+   disable_irq_post = 0;
+   } else if (!strncmp(str, "off", 3)) {
disable_irq_remap = 1;
-   else if (!strncmp(str, "nosid", 5))
+   disable_irq_post = 1;
+   } else if (!strncmp(str, "nosid", 5))
disable_sourceid_checking = 1;
else if (!strncmp(str, "no_x2apic_optout", 16))
no_x2apic_optout = 1;
+   else if (!strncmp(str, "nopost", 6))
+   disable_irq_post = 1;
 
str += strcspn(str, ",");
while (*str == ',')
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 23/25] KVM: Add the handler for Wake-up Vector

2014-12-02 Thread Feng Wu
When a vCPU is blocked and an external interrupt from an assigned
device is delivered to it, the VT-d Posted-Interrupts mechanism
will deliver an interrupt to the associated physical CPU with the
Wake-up Vector. In its handler, we find the destination vCPU
and wake it up.

Signed-off-by: Feng Wu 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/vmx.c  |   52 +++
 arch/x86/kvm/x86.c  |   22 +++-
 include/linux/kvm_host.h|3 ++
 virt/kvm/kvm_main.c |3 ++
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2fd85a5..76fc32d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -101,6 +101,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, 
int level)
 
 #define ASYNC_PF_PER_VCPU 64
 
+extern void (*wakeup_handler_callback)(void);
+
 enum kvm_reg {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e71bf3b..dc6fd84 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -822,6 +822,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
+/*
+ * We maintian a per-CPU linked-list of VCPU, so in wakeup_handler() we
+ * can find which VCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
@@ -2813,6 +2820,8 @@ static int hardware_enable(void)
return -EBUSY;
 
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+   INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+   spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
/*
 * Now we can enable the vmclear operation in kdump
@@ -9177,6 +9186,7 @@ static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old;
struct pi_desc new;
+   unsigned long flags;
 
if (!irq_remapping_cap(IRQ_POSTING_CAP))
return 0;
@@ -9216,9 +9226,22 @@ static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
 
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
+
+   /*
+* We should save physical cpu id here, vcpu->cpu may
+* be changed due to preemption, in that case, this
+* do-while loop will run again.
+*/
+   vcpu->wakeup_cpu = vcpu->cpu;
} while (cmpxchg(&pi_desc->control, old.control, new.control)
!= old.control);
 
+   spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+   vcpu->wakeup_cpu), flags);
+   list_add_tail(&vcpu->blocked_vcpu_list,
+   &per_cpu(blocked_vcpu_on_cpu, vcpu->wakeup_cpu));
+   spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+   vcpu->wakeup_cpu), flags);
return 0;
 }
 
@@ -9228,6 +9251,7 @@ static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
struct pi_desc old;
struct pi_desc new;
unsigned int dest = 0;
+   unsigned long flags;
 
if (!irq_remapping_cap(IRQ_POSTING_CAP))
return;
@@ -9249,6 +9273,13 @@ static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
} while (cmpxchg(&pi_desc->control, old.control, new.control)
!= old.control);
 
+   spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+   vcpu->wakeup_cpu), flags);
+   list_del(&vcpu->blocked_vcpu_list);
+   spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+   vcpu->wakeup_cpu), flags);
+   vcpu->wakeup_cpu = -1;
+
pi_clear_sn(pi_desc);
 }
 
@@ -9366,6 +9397,25 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_post_block = vmx_vcpu_post_block,
 };
 
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void wakeup_handler(void)
+{
+   struct kvm_vcpu *vcpu;
+   int cpu = smp_processor_id();
+
+   spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+   list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+   blocked_vcpu_list) {
+   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+   if (pi_test_on(pi_desc) == 1)
+   kvm_vcpu_kick(vcpu);
+   }
+   spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
 static int __init vmx_init(void)
 {
int r, i, msr;
@@ -9480,6 +9530,8 @@ static int __init vmx_init(void)
 
update_ple_window_actual_max();
 
+   wakeup_ha

[v2 24/25] KVM: Suppress posted-interrupt when 'SN' is set

2014-12-02 Thread Feng Wu
Currently, we don't support urgent interrupts; all interrupts
are recognized as non-urgent interrupts, so we cannot send
a posted-interrupt when 'SN' is set.

Signed-off-by: Feng Wu 
---
 arch/x86/kvm/vmx.c |   11 +--
 1 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index dc6fd84..6b2f3e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4306,15 +4306,22 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
 static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   int r;
+   int r, sn;
 
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
 
+   /*
+* Currently, we don't support urgent interrupt, all interrupts
+* are recognized as non-urgent interrupt, so we cannot send
+* posted-interrupt when 'SN' is set.
+*/
+   sn = pi_test_sn(&vmx->pi_desc);
+
r = pi_test_and_set_on(&vmx->pi_desc);
kvm_make_request(KVM_REQ_EVENT, vcpu);
 #ifdef CONFIG_SMP
-   if (!r && (vcpu->mode == IN_GUEST_MODE))
+   if (!r && !sn && (vcpu->mode == IN_GUEST_MODE))
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
else
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 21/25] KVM: Update Posted-Interrupts descriptor during vCPU scheduling

2014-12-02 Thread Feng Wu
Update Posted-Interrupts descriptor according to the
following rules:
- Before vCPU block, set 'NV' to POSTED_INTR_WAKEUP_VECTOR
- After vCPU block, set 'NV' back to POSTED_INTR_VECTOR

Signed-off-by: Feng Wu 
---
 arch/x86/include/asm/kvm_host.h |5 ++
 arch/x86/kvm/vmx.c  |   83 +++
 arch/x86/kvm/x86.c  |   16 +++
 virt/kvm/kvm_main.c |   11 +
 4 files changed, 115 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6878429..2fd85a5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -775,6 +775,8 @@ struct kvm_x86_ops {
 
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu);
+   int (*vcpu_pre_block)(struct kvm_vcpu *vcpu);
+   void (*vcpu_post_block)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1100,4 +1102,7 @@ void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
 bool kvm_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
 
+int kvm_arch_vcpu_pre_block(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_post_block(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81f239b..a1966b9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9147,6 +9147,86 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
 }
 
+static int vmx_vcpu_pre_block(struct kvm_vcpu *vcpu)
+{
+   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+   struct pi_desc old;
+   struct pi_desc new;
+
+   if (!irq_remapping_cap(IRQ_POSTING_CAP))
+   return 0;
+
+   memset(&old, 0, sizeof(old));
+   memset(&new, 0, sizeof(new));
+
+   do {
+   old.control = new.control = pi_desc->control;
+
+   /*
+* A posted-interrupt happened in the one of the
+* following two cases:
+* 1. After the latest pir-to-virr sync operation
+* in kvm_arch_vcpu_runnable() function
+* 2. In this do-while() loop, a posted-interrupt
+* occurs.
+*
+* For either of above cases, we should not block
+* the VCPU.
+*/
+   if (pi_test_on(pi_desc) == 1) {
+   /*
+* Need to set this flag, then the inject will
+* be synced from PIR to vIRR before VM-ENTRY.
+* In fact, for guest IPI case, in function
+* vmx_deliver_posted_interrupt(), this flags
+* has already been set, but if the interrupt
+* is injected by VT-d PI hardware, we need
+* to set this.
+*/
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
+   return 1;
+   }
+
+   pi_clear_sn(&new);
+
+   /* set 'NV' to 'wakeup vector' */
+   new.nv = POSTED_INTR_WAKEUP_VECTOR;
+   } while (cmpxchg(&pi_desc->control, old.control, new.control)
+   != old.control);
+
+   return 0;
+}
+
+static void vmx_vcpu_post_block(struct kvm_vcpu *vcpu)
+{
+   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+   struct pi_desc old;
+   struct pi_desc new;
+   unsigned int dest = 0;
+
+   if (!irq_remapping_cap(IRQ_POSTING_CAP))
+   return;
+
+   pi_set_sn(pi_desc);
+
+   do {
+   old.control = new.control = pi_desc->control;
+
+   dest = cpu_physical_id(vcpu->cpu);
+
+   if (x2apic_enabled())
+   new.ndst = dest;
+   else
+   new.ndst = (dest << 8) & 0xFF00;
+
+   /* set 'NV' to 'notification vector' */
+   new.nv = POSTED_INTR_VECTOR;
+   } while (cmpxchg(&pi_desc->control, old.control, new.control)
+   != old.control);
+
+   pi_clear_sn(pi_desc);
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -9256,6 +9336,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.sched_in = vmx_sched_in,
 
.get_pi_desc_addr = vmx_get_pi_desc_addr,
+
+   .vcpu_pre_block = vmx_vcpu_pre_block,
+   .vcpu_post_block = vmx_vcpu_post_block,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0033df3..9706984 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7731,6 +7731,22 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 
+int kvm_arch_vcpu_pre_block(struct kvm_vcpu *vcpu)
+{
+   i

[v2 22/25] KVM: Change NDST field after vCPU scheduling

2014-12-02 Thread Feng Wu
This patch changes the NDST field of the Posted-Interrupts
Descriptor after the vCPU is scheduled to another physical
CPU.

Signed-off-by: Feng Wu 
---
 arch/x86/kvm/vmx.c |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a1966b9..e71bf3b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1906,6 +1906,31 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu = cpu;
}
+
+   if (irq_remapping_cap(IRQ_POSTING_CAP) && (vcpu->cpu != cpu)) {
+   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+   struct pi_desc old, new;
+   unsigned int dest;
+
+   memset(&old, 0, sizeof(old));
+   memset(&new, 0, sizeof(new));
+
+   pi_set_sn(pi_desc);
+
+   do {
+   old.control = new.control = pi_desc->control;
+
+   dest = cpu_physical_id(cpu);
+
+   if (x2apic_enabled())
+   new.ndst = dest;
+   else
+   new.ndst = (dest << 8) & 0xFF00;
+
+   } while (cmpxchg(&pi_desc->control, old.control,
+   new.control) != old.control);
+   pi_clear_sn(pi_desc);
+   }
 }
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 19/25] KVM: x86: kvm-vfio: VT-d posted-interrupts setup

2014-12-02 Thread Feng Wu
This patch defines macro __KVM_HAVE_ARCH_KVM_VFIO_POSTING and
implement kvm_arch_vfio_update_pi_irte for x86 architecture.

Signed-off-by: Feng Wu 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/Makefile   |2 +-
 arch/x86/kvm/kvm_vfio_x86.c |   68 +++
 3 files changed, 71 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/kvm/kvm_vfio_x86.c

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9b45b78..6878429 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -82,6 +82,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, 
int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
+#define __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
 
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 25d22b2..8809d58 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)   += 
$(KVM)/assigned-dev.o $(KVM)/iommu.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 
 kvm-y  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-  i8254.o cpuid.o pmu.o
+  i8254.o cpuid.o pmu.o kvm_vfio_x86.o
 kvm-intel-y+= vmx.o
 kvm-amd-y  += svm.o
 
diff --git a/arch/x86/kvm/kvm_vfio_x86.c b/arch/x86/kvm/kvm_vfio_x86.c
new file mode 100644
index 000..c59a31a
--- /dev/null
+++ b/arch/x86/kvm/kvm_vfio_x86.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Intel Corporation.
+ * Authors: Feng Wu 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+
+/*
+ * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * returns 0 on success, < 0 on failure
+ */
+int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+uint32_t guest_irq)
+{
+   struct kvm_kernel_irq_routing_entry *e;
+   struct kvm_irq_routing_table *irq_rt;
+   struct kvm_lapic_irq irq;
+   struct kvm_vcpu *vcpu;
+   struct vcpu_data vcpu_info;
+   int idx, ret = -EINVAL;
+
+   idx = srcu_read_lock(&kvm->irq_srcu);
+   irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+   BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+   hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+   if (e->type != KVM_IRQ_ROUTING_MSI)
+   continue;
+   /*
+* VT-d PI cannot support posting multicast/broadcast
+* interrupts to a VCPU, we still use interrupt remapping
+* for these kind of interrupts.
+*/
+
+   kvm_set_msi_irq(e, &irq);
+   if (!kvm_find_dest_vcpu(kvm, &irq, &vcpu))
+   continue;
+
+   vcpu_info.pi_desc_addr = kvm_x86_ops->get_pi_desc_addr(vcpu);
+   vcpu_info.vector = irq.vector;
+
+   if (irq_set_vcpu_affinity(host_irq, &vcpu_info) < 0) {
+   printk(KERN_INFO "%s: failed to update PI IRTE\n",
+   __func__);
+   ret = -EINVAL;
+   goto out;
+   }
+   }
+
+   ret = 0;
+out:
+   srcu_read_unlock(&kvm->irq_srcu, idx);
+   return ret;
+}
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 17/25] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-02 Thread Feng Wu
This patch adds and documents a new attribute
KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
This new attribute is used for VT-d Posted-Interrupts.

When the guest OS changes the interrupt configuration for an
assigned device, such as the MSI/MSI-x data/address fields,
QEMU will use this IRQ attribute to tell KVM to update the
related IRTE according to the VT-d Posted-Interrupts Specification;
for example, the guest vector should be updated in the related IRTE.

Signed-off-by: Feng Wu 
---
 Documentation/virtual/kvm/devices/vfio.txt |9 +
 include/uapi/linux/kvm.h   |   10 ++
 2 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
b/Documentation/virtual/kvm/devices/vfio.txt
index f7aff29..41e12b7 100644
--- a/Documentation/virtual/kvm/devices/vfio.txt
+++ b/Documentation/virtual/kvm/devices/vfio.txt
@@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been called to 
trigger the IRQ
 or associate an eventfd to it. Unforwarding can only be called while the
 signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition is
 not satisfied, the command returns an -EBUSY.
+
+  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups mechanism to post
+   the IRQ to guests.
+For this attribute, kvm_device_attr.addr points to a kvm_vfio_dev_irq struct.
+
+When guest OS changes the interrupt configuration for an assigned device,
+such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
+to tell KVM to update the related IRTE according the VT-d Posted-Interrrupts
+Specification, such as, the guest vector should be updated in the related IRTE.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a269a42..7d98650 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@ struct kvm_device_attr {
 #define  KVM_DEV_VFIO_DEVICE   2
 #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ  1
 #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ2
+#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ  3
 
 enum kvm_device_type {
KVM_DEV_TYPE_FSL_MPIC_20= 1,
@@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
__u32 gsi; /* gsi, ie. virtual IRQ number */
 };
 
+struct kvm_vfio_dev_irq {
+   __u32   argsz;
+   __u32   fd; /* file descriptor of the VFIO device */
+   __u32   index;  /* VFIO device IRQ index */
+   __u32   start;
+   __u32   count;
+   __u32   gsi[];  /* gsi, ie. virtual IRQ number */
+};
+
 /*
  * ioctls for VM fds
  */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[v2 20/25] x86, irq: Define a global vector for VT-d Posted-Interrupts

2014-12-02 Thread Feng Wu
Currently, we use a global vector as the Posted-Interrupts
Notification Event for all the vCPUs in the system. We need
to introduce another global vector for VT-d Posted-Interrupts,
which will be used to wake up the sleeping vCPU when an external
interrupt from a directly-assigned device arrives for that vCPU.

Signed-off-by: Feng Wu 
---
 arch/x86/include/asm/entry_arch.h  |2 ++
 arch/x86/include/asm/hardirq.h |1 +
 arch/x86/include/asm/hw_irq.h  |2 ++
 arch/x86/include/asm/irq_vectors.h |1 +
 arch/x86/kernel/entry_64.S |2 ++
 arch/x86/kernel/irq.c  |   27 +++
 arch/x86/kernel/irqinit.c  |2 ++
 7 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/entry_arch.h 
b/arch/x86/include/asm/entry_arch.h
index dc5fa66..27ca0af 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 #ifdef CONFIG_HAVE_KVM
 BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
 smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+smp_kvm_posted_intr_wakeup_ipi)
 #endif
 
 /*
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f5fb6b..9866065 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_HAVE_KVM
unsigned int kvm_posted_intr_ipis;
+   unsigned int kvm_posted_intr_wakeup_ipis;
 #endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e7ae6eb..38fac9b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,7 @@
 extern asmlinkage void apic_timer_interrupt(void);
 extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
 extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 
@@ -92,6 +93,7 @@ extern void trace_call_function_single_interrupt(void);
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
 #endif /* CONFIG_TRACING */
 
 struct irq_domain;
diff --git a/arch/x86/include/asm/irq_vectors.h 
b/arch/x86/include/asm/irq_vectors.h
index b26cb12..dca94f2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,7 @@
 /* Vector for KVM to deliver posted interrupt IPI */
 #ifdef CONFIG_HAVE_KVM
 #define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR  0xf1
 #endif
 
 /*
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e61c14a..a598447 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -960,6 +960,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
 #ifdef CONFIG_HAVE_KVM
 apicinterrupt3 POSTED_INTR_VECTOR \
kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
+   kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
 #endif
 
 #ifdef CONFIG_X86_MCE_THRESHOLD
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 922d285..47408c3 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -237,6 +237,9 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HAVE_KVM
+void (*wakeup_handler_callback)(void) = NULL;
+EXPORT_SYMBOL_GPL(wakeup_handler_callback);
+
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
  */
@@ -256,6 +259,30 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs 
*regs)
 
set_irq_regs(old_regs);
 }
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+   struct pt_regs *old_regs = set_irq_regs(regs);
+
+   ack_APIC_irq();
+
+   irq_enter();
+
+   exit_idle();
+
+   inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+
+   if (wakeup_handler_callback)
+   wakeup_handler_callback();
+
+   irq_exit();
+
+   set_irq_regs(old_regs);
+}
+
 #endif
 
 __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181e..844673c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -144,6 +144,8 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_HAVE_KVM
/* IPI for KVM to deliver posted interrupt */
alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+   /* IPI for KVM to deliver interrupt to wake up tasks */
+   alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, 

[v2 18/25] KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts

2014-12-02 Thread Feng Wu
This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
When a guest updates MSI/MSI-x information for an assigned device,
QEMU will use the KVM_DEV_VFIO_DEVICE_POSTING_IRQ attribute to set up
the IRTE for VT-d PI. This patch implements this IRQ attribute.
Signed-off-by: Feng Wu 
---
 include/linux/kvm_host.h |   19 
 virt/kvm/vfio.c  |  103 ++
 2 files changed, 122 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5cd4420..8d06678 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1134,6 +1134,25 @@ static inline int kvm_arch_vfio_set_forward(struct 
kvm_fwd_irq *fwd_irq,
 }
 #endif
 
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+/*
+ * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * returns 0 on success, < 0 on failure
+ */
+int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+uint32_t guest_irq);
+#else
+static int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+   uint32_t guest_irq)
+{
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 6bc7001..5e5515f 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -446,6 +446,99 @@ out:
return ret;
 }
 
+static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
+{
+   if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+   u8 pin;
+
+   pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
+   if (pin)
+   return 1;
+   } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX)
+   return pci_msi_vec_count(pdev);
+   else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
+   return pci_msix_vec_count(pdev);
+
+   return 0;
+}
+
+static int kvm_vfio_set_pi(struct kvm_device *kdev, int32_t __user *argp)
+{
+   struct kvm_vfio_dev_irq pi_info;
+   uint32_t *gsi;
+   unsigned long minsz;
+   struct vfio_device *vdev;
+   struct msi_desc *entry;
+   struct device *dev;
+   struct pci_dev *pdev;
+   int i, max, ret;
+
+   minsz = offsetofend(struct kvm_vfio_dev_irq, count);
+
+   if (copy_from_user(&pi_info, (void __user *)argp, minsz))
+   return -EFAULT;
+
+   if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
+   return -EINVAL;
+
+   vdev = kvm_vfio_get_vfio_device(pi_info.fd);
+   if (IS_ERR(vdev))
+   return PTR_ERR(vdev);
+
+   dev = kvm_vfio_external_base_device(vdev);
+   if (!dev || !dev_is_pci(dev)) {
+   ret = -EFAULT;
+   goto put_vfio_device;
+   }
+
+   pdev = to_pci_dev(dev);
+
+   max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
+   if (max <= 0) {
+   ret = -EFAULT;
+   goto put_vfio_device;
+   }
+
+   if (pi_info.argsz - minsz < pi_info.count * sizeof(int) ||
+   pi_info.start >= max || pi_info.start + pi_info.count > max) {
+   ret = -EINVAL;
+   goto put_vfio_device;
+   }
+
+   gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
+  pi_info.count * sizeof(int));
+   if (IS_ERR(gsi)) {
+   ret = PTR_ERR(gsi);
+   goto put_vfio_device;
+   }
+
+#ifdef CONFIG_PCI_MSI
+   for (i = 0; i < pi_info.count; i++) {
+   list_for_each_entry(entry, &pdev->msi_list, list) {
+   if (entry->msi_attrib.entry_nr != pi_info.start+i)
+   continue;
+
+   ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
+  entry->irq,
+  gsi[i]);
+   if (ret) {
+   ret = -EFAULT;
+   goto free_gsi;
+   }
+   }
+   }
+#endif
+
+   ret = 0;
+
+free_gsi:
+   kfree(gsi);
+
+put_vfio_device:
+   kvm_vfio_put_vfio_device(vdev);
+   return ret;
+}
+
 static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
 {
int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
@@ -456,6 +549,11 @@ static int kvm_vfio_set_device(struct kvm_device *kdev, 
long attr, u64 arg)
case KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ:
ret = kvm_vfio_control_irq_forward(kdev, attr, argp);
break;
+#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POSTING
+   case KVM_DEV_VFIO_DEVICE_POSTING_IRQ:
+   ret = kvm_vfio_set_pi(kdev, argp);
+   break;
+#en

[v2 11/25] KVM: Add some helper functions for Posted-Interrupts

2014-12-02 Thread Feng Wu
This patch adds some helper functions to manipulate the
Posted-Interrupts Descriptor.

Signed-off-by: Feng Wu 
---
 arch/x86/kvm/vmx.c |   26 ++
 1 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index abdb84f..0b1383e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -408,6 +408,8 @@ struct nested_vmx {
 };
 
 #define POSTED_INTR_ON  0
+#define POSTED_INTR_SN  1
+
 /* Posted-Interrupt Descriptor */
 struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
@@ -443,6 +445,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc 
*pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 }
 
+static void pi_clear_sn(struct pi_desc *pi_desc)
+{
+   return clear_bit(POSTED_INTR_SN,
+   (unsigned long *)&pi_desc->control);
+}
+
+static void pi_set_sn(struct pi_desc *pi_desc)
+{
+   return set_bit(POSTED_INTR_SN,
+   (unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_on(struct pi_desc *pi_desc)
+{
+   return test_bit(POSTED_INTR_ON,
+   (unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_sn(struct pi_desc *pi_desc)
+{
+   return test_bit(POSTED_INTR_SN,
+   (unsigned long *)&pi_desc->control);
+}
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
unsigned long host_rsp;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >