Re: [GIT PULL 05/11] KVM: s390: refactor interrupt injection code

2014-12-01 Thread Christian Borntraeger
Am 28.11.2014 um 18:16 schrieb Paolo Bonzini:
> 
> 
> On 28/11/2014 14:25, Christian Borntraeger wrote:
>>  
>> +static int __inject_prog_irq(struct kvm_vcpu *vcpu,
>> + struct kvm_s390_interrupt_info *inti)
>> +{
> 
> Why the __s? :)

In this case because it's called with a lock already held (from
kvm_s390_inject_vcpu).
I usually extend the usage of the __ prefix to mean "hey, pay attention: you
are calling this function, but maybe you are supposed to call something else".
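
As a rough illustration of the convention (simplified names and locking, a
sketch rather than the exact kvm code):

static int __inject_prog_irq(struct kvm_vcpu *vcpu,
                             struct kvm_s390_interrupt_info *inti)
{
        /* caller must already hold the local interrupt lock */
        list_add(&inti->list, &vcpu->arch.local_int.list);
        return 0;
}

int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                         struct kvm_s390_interrupt_info *inti)
{
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        int rc;

        spin_lock(&li->lock);                   /* take the lock ... */
        rc = __inject_prog_irq(vcpu, inti);     /* ... then call the __ helper */
        spin_unlock(&li->lock);
        return rc;
}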

Christian



Re: [GIT PULL 09/11] KVM: s390: add bitmap for handling cpu-local interrupts

2014-12-01 Thread Jens Freimann
Excerpts from Paolo Bonzini's message of 2014-11-28 18:18:53 +0100:
> 
> On 28/11/2014 14:25, Christian Borntraeger wrote:
> >  
> > +struct kvm_s390_irq_payload {
> > +struct kvm_s390_io_info io;
> > +struct kvm_s390_ext_info ext;
> > +struct kvm_s390_pgm_info pgm;
> > +struct kvm_s390_emerg_info emerg;
> > +struct kvm_s390_extcall_info extcall;
> > +struct kvm_s390_prefix_info prefix;
> > +struct kvm_s390_mchk_info mchk;
> > +};
> > +
> 
> struct or union?

This really should be a struct because we use it to keep payload data for
several parallel pending interrupts and don't want to write over other data. 
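
For example (a hypothetical pair of back-to-back injections, using the field
names from the hunks later in this series), a program interrupt and an
external interrupt can be pending at the same time and each needs its own
payload to survive:

        li->irq.pgm = irq->u.pgm;          /* payload for IRQ_PEND_PROG */
        __set_bit(IRQ_PEND_PROG, &li->pending_irqs);

        li->irq.ext = irq2->u.ext;         /* payload for the external irq */
        __set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);

With a union the second assignment would clobber the program interrupt's
payload; with a struct both payloads are kept until delivery.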

Jens

> 
> Paolo



Re: [GIT PULL 10/11] KVM: s390: handle pending local interrupts via bitmap

2014-12-01 Thread Heiko Carstens
On Fri, Nov 28, 2014 at 02:25:38PM +0100, Christian Borntraeger wrote:
> +static int __must_check __deliver_mchk_floating(struct kvm_vcpu *vcpu,
> +struct kvm_s390_interrupt_info *inti)
> +{
> + struct kvm_s390_mchk_info *mchk = &inti->mchk;
> + int rc;
> +
> + VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
> +mchk->mcic);
> + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK,
> +  mchk->cr14, mchk->mcic);
> +
> + rc  = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED);
> + rc |= put_guest_lc(vcpu, mchk->mcic,
> + (u64 __user *) __LC_MCCK_CODE);
> + rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
> + (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
> + rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
> +  &mchk->fixed_logout, sizeof(mchk->fixed_logout));
> + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
> +  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
> + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
> + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
> + return rc;
> +}

FWIW, rc handling seems to be a bit fragile.
The usual return statement for such a pattern is
return rc ? -EWHATEVER : 0;
so we don't get random or'ed return values.
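
Applied to the function above, a minimal sketch of that pattern (-EFAULT is
used here purely as an example of one well-defined error code):

        rc  = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED);
        rc |= put_guest_lc(vcpu, mchk->mcic, (u64 __user *) __LC_MCCK_CODE);
        /* ... remaining lowcore stores ... */
        rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
                            &vcpu->arch.sie_block->gpsw, sizeof(psw_t));

        /* collapse the OR'ed partial results into a single error code */
        return rc ? -EFAULT : 0;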

> -static int __inject_prog_irq(struct kvm_vcpu *vcpu,
> -  struct kvm_s390_interrupt_info *inti)
> +static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
>  {
>   struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
>  
> - list_add(&inti->list, &li->list);
> - atomic_set(&li->active, 1);
> + li->irq.pgm = irq->u.pgm;
> + __set_bit(IRQ_PEND_PROG, &li->pending_irqs);

^

> +static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq 
> *irq)
>  {
>   struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
>  
> - inti->ext.ext_params2 = s390int->parm64;
> - list_add_tail(&inti->list, &li->list);
> - atomic_set(&li->active, 1);
> + VCPU_EVENT(vcpu, 3, "inject: external irq params:%x, params2:%llx",
> +irq->u.ext.ext_params, irq->u.ext.ext_params2);
> + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT,
> +irq->u.ext.ext_params,
> +irq->u.ext.ext_params2, 2);
> +
> + li->irq.ext = irq->u.ext;
> + set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);

^^^

You're using atomic and non-atomic bitops all over the place on the same
object(s). It would be very good to have some consistency here.
And as far as I remember the non-atomic variant is good enough anyway.
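
The mix being pointed out is visible in the two hunks above, which touch the
same li->pending_irqs word with different primitives:

        __set_bit(IRQ_PEND_PROG, &li->pending_irqs);          /* non-atomic */
        set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);     /* atomic */

If every writer of pending_irqs already runs under the local-interrupt lock,
the non-atomic __set_bit() should be sufficient in both places.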



Re: [GIT PULL 05/11] KVM: s390: refactor interrupt injection code

2014-12-01 Thread Jens Freimann
Excerpts from Paolo Bonzini's message of 2014-11-28 18:16:03 +0100:
> 
> On 28/11/2014 14:25, Christian Borntraeger wrote:
> >  
> > +static int __inject_prog_irq(struct kvm_vcpu *vcpu,
> > + struct kvm_s390_interrupt_info *inti)
> > +{
> 
> Why the __s? :)

I used the __s here to say "careful, this is usually not called directly". 

Jens

> 
> Paolo



Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()

2014-12-01 Thread Ard Biesheuvel
On 21 November 2014 at 12:24, Christoffer Dall
 wrote:
> On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
>> Instead of using kvm_is_mmio_pfn() to decide whether a host region
>> should be stage 2 mapped with device attributes, add a new static
>> function kvm_is_device_pfn() that disregards RAM pages with the
>> reserved bit set, as those should usually not be mapped as device
>> memory.
>>
>> Signed-off-by: Ard Biesheuvel 
>> ---
>>  arch/arm/kvm/mmu.c | 7 ++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 57a403a5c22b..b007438242e2 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>>   return kvm_vcpu_dabt_iswrite(vcpu);
>>  }
>>
>> +static bool kvm_is_device_pfn(unsigned long pfn)
>> +{
>> + return !pfn_valid(pfn);
>> +}
>> +
>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>> struct kvm_memory_slot *memslot, unsigned long hva,
>> unsigned long fault_status)
>> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
>> phys_addr_t fault_ipa,
>>   if (is_error_pfn(pfn))
>>   return -EFAULT;
>>
>> - if (kvm_is_mmio_pfn(pfn))
>> + if (kvm_is_device_pfn(pfn))
>>   mem_type = PAGE_S2_DEVICE;
>>
>>   spin_lock(&kvm->mmu_lock);
>> --
>> 1.8.3.2
>>
> Acked-by: Christoffer Dall 

These 2 patches are now in 3.18-rc7, so they can be dropped from the
kvmarm queue/next/etc branches

Thanks,
Ard.


Re: [PATCH net-next] vhost: remove unnecessary forward declarations in vhost.h

2014-12-01 Thread Michael S. Tsirkin
On Thu, Nov 27, 2014 at 02:41:21PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang 

Applied, thanks.

> ---
>  drivers/vhost/vhost.h | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 3eda654..7d039ef 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -12,8 +12,6 @@
>  #include 
>  #include 
>  
> -struct vhost_device;
> -
>  struct vhost_work;
>  typedef void (*vhost_work_fn_t)(struct vhost_work *work);
>  
> @@ -54,8 +52,6 @@ struct vhost_log {
>   u64 len;
>  };
>  
> -struct vhost_virtqueue;
> -
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>   struct vhost_dev *dev;
> -- 
> 1.9.1


Re: [PATCH v7 28/46] vhost: make features 64 bit

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 04:12:37AM +, Ben Hutchings wrote:
> On Sun, 2014-11-30 at 18:44 +0300, Sergei Shtylyov wrote:
> > Hello.
> > 
> > On 11/30/2014 6:11 PM, Michael S. Tsirkin wrote:
> > 
> > > We need to use bit 32 for virtio 1.0
> > 
> > > Signed-off-by: Michael S. Tsirkin 
> > > Reviewed-by: Jason Wang 
> > > ---
> > >   drivers/vhost/vhost.h | 4 ++--
> > >   1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index 3eda654..c624b09 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -106,7 +106,7 @@ struct vhost_virtqueue {
> > >   /* Protected by virtqueue mutex. */
> > >   struct vhost_memory *memory;
> > >   void *private_data;
> > > - unsigned acked_features;
> > > + u64 acked_features;
> > >   /* Log write descriptors */
> > >   void __user *log_base;
> > >   struct vhost_log *log;
> > > @@ -174,6 +174,6 @@ enum {
> > >
> > >   static inline int vhost_has_feature(struct vhost_virtqueue *vq, int bit)
> > >   {
> > > - return vq->acked_features & (1 << bit);
> > > + return vq->acked_features & (1ULL << bit);
> > 
> > Erm, wouldn't the high word be just dropped when returning *int*? I 
> > think 
> > you need !!(vq->acked_features & (1ULL << bit)).
> 
> Or change the return type to bool.
> 
> Ben.
> 
> -- 
> Ben Hutchings
> The first rule of tautology club is the first rule of tautology club.



Yes, I'll do that I think.
Thanks!
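
For reference, a sketch of the variant Ben suggests (assuming the fix is
simply a bool return type; the final patch may differ):

static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
{
        /* the bool conversion avoids truncating feature bits >= 32 to int */
        return vq->acked_features & (1ULL << bit);
}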



[PATCH] kvm: x86: remove vmx_vm_has_apicv() outside of hwapic_isr_update()

2014-12-01 Thread Tiejun Chen
In most cases, when calling hwapic_isr_update(), we have already checked
that kvm_apic_vid_enabled() == 1, and kvm_apic_vid_enabled() resolves to
kvm_apic_vid_enabled()
-> kvm_x86_ops->vm_has_apicv()
-> vmx_vm_has_apicv(), or '0' in the svm case

So it's unnecessary to repeat this check inside hwapic_isr_update(); just
move the vmx_vm_has_apicv() check out of it and follow the other call sites.

Signed-off-by: Tiejun Chen 
---
 arch/x86/kvm/lapic.c | 3 ++-
 arch/x86/kvm/vmx.c   | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e0e5642..2ddc426 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1739,7 +1739,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
-   kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+   if (kvm_apic_vid_enabled(vcpu->kvm))
+   kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 
apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..f0c16a9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7406,9 +7406,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int 
isr)
u16 status;
u8 old;
 
-   if (!vmx_vm_has_apicv(kvm))
-   return;
-
if (isr == -1)
isr = 0;
 
-- 
1.9.1



Re: [PATCH net] Revert "drivers/net: Disable UFO through virtio" in macvtap and tun

2014-12-01 Thread Michael S. Tsirkin
On Tue, Nov 11, 2014 at 05:12:58PM +, Ben Hutchings wrote:
> This reverts commit 88e0e0e5aa722b193c8758c8b45d041de5316924 for
> the tap drivers, but leaves UFO disabled in virtio_net.
> 
> libvirt at least assumes that tap features will never be dropped
> in new kernel versions, and doing so prevents migration of VMs to
> the newer kernel version while they are running with virtio net
> devices.
> 
> Fixes: 88e0e0e5aa7a ("drivers/net: Disable UFO through virtio")
> Signed-off-by: Ben Hutchings 

I did some light testing with IPv4, and this seems to migrate fine now.
So let's apply, please

Acked-by: Michael S. Tsirkin 


> ---
> Compile-tested only.
> 
> Ben.
> 
>  drivers/net/macvtap.c | 13 -
>  drivers/net/tun.c | 19 ---
>  2 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 6f226de..aeaeb6d 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -66,7 +66,7 @@ static struct cdev macvtap_cdev;
>  static const struct proto_ops macvtap_socket_ops;
>  
>  #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
> -   NETIF_F_TSO6)
> +   NETIF_F_TSO6 | NETIF_F_UFO)
>  #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
>  #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG)
>  
> @@ -570,8 +570,6 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
>   gso_type = SKB_GSO_TCPV6;
>   break;
>   case VIRTIO_NET_HDR_GSO_UDP:
> - pr_warn_once("macvtap: %s: using disabled UFO feature; 
> please fix this program\n",
> -  current->comm);
>   gso_type = SKB_GSO_UDP;
>   if (skb->protocol == htons(ETH_P_IPV6))
>   ipv6_proxy_select_ident(skb);
> @@ -619,6 +617,8 @@ static void macvtap_skb_to_vnet_hdr(const struct sk_buff 
> *skb,
>   vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
>   else if (sinfo->gso_type & SKB_GSO_TCPV6)
>   vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> + else if (sinfo->gso_type & SKB_GSO_UDP)
> + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
>   else
>   BUG();
>   if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> @@ -953,6 +953,9 @@ static int set_offload(struct macvtap_queue *q, unsigned 
> long arg)
>   if (arg & TUN_F_TSO6)
>   feature_mask |= NETIF_F_TSO6;
>   }
> +
> + if (arg & TUN_F_UFO)
> + feature_mask |= NETIF_F_UFO;
>   }
>  
>   /* tun/tap driver inverts the usage for TSO offloads, where
> @@ -963,7 +966,7 @@ static int set_offload(struct macvtap_queue *q, unsigned 
> long arg)
>* When user space turns off TSO, we turn off GSO/LRO so that
>* user-space will not receive TSO frames.
>*/
> - if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6))
> + if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
>   features |= RX_OFFLOADS;
>   else
>   features &= ~RX_OFFLOADS;
> @@ -1064,7 +1067,7 @@ static long macvtap_ioctl(struct file *file, unsigned 
> int cmd,
>   case TUNSETOFFLOAD:
>   /* let the user check for future flags */
>   if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
> - TUN_F_TSO_ECN))
> + TUN_F_TSO_ECN | TUN_F_UFO))
>   return -EINVAL;
>  
>   rtnl_lock();
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 7302398..a0987d1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,7 +175,7 @@ struct tun_struct {
>   struct net_device   *dev;
>   netdev_features_t   set_features;
>  #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
> -   NETIF_F_TSO6)
> +   NETIF_F_TSO6|NETIF_F_UFO)
>  
>   int vnet_hdr_sz;
>   int sndbuf;
> @@ -1152,20 +1152,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
> struct tun_file *tfile,
>   skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
>   break;
>   case VIRTIO_NET_HDR_GSO_UDP:
> - {
> - static bool warned;
> -
> - if (!warned) {
> - warned = true;
> - netdev_warn(tun->dev,
> - "%s: using disabled UFO feature; 
> please fix this program\n",
> - current->comm);
> - }
>   skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
>   if (skb->protocol == htons(ETH_P_IPV6))
>

Re: [GIT PULL 10/11] KVM: s390: handle pending local interrupts via bitmap

2014-12-01 Thread Jens Freimann
Excerpts from Heiko Carstens's message of 2014-12-01 09:19:16 +0100:
> On Fri, Nov 28, 2014 at 02:25:38PM +0100, Christian Borntraeger wrote:
> > +static int __must_check __deliver_mchk_floating(struct kvm_vcpu *vcpu,
> > +   struct kvm_s390_interrupt_info *inti)
> > +{
> > +struct kvm_s390_mchk_info *mchk = &inti->mchk;
> > +int rc;
> > +
> > +VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
> > +   mchk->mcic);
> > +trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK,
> > + mchk->cr14, mchk->mcic);
> > +
> > +rc  = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED);
> > +rc |= put_guest_lc(vcpu, mchk->mcic,
> > +(u64 __user *) __LC_MCCK_CODE);
> > +rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
> > +(u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
> > +rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
> > + &mchk->fixed_logout, sizeof(mchk->fixed_logout));
> > +rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
> > + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
> > +rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
> > +&vcpu->arch.sie_block->gpsw, sizeof(psw_t));
> > +return rc;
> > +}
> 
> FWIW, rc handling seems to be a bit fragile.
> The usual return statement for such a pattern is
> return rc ? -EWHATEVER : 0;
> so we don't get random or'ed return values.

Ok, I'll change this to return -EFAULT (need to double check) if rc is set.

> 
> > -static int __inject_prog_irq(struct kvm_vcpu *vcpu,
> > - struct kvm_s390_interrupt_info *inti)
> > +static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
> >  {
> >  struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
> >  
> > -list_add(&inti->list, &li->list);
> > -atomic_set(&li->active, 1);
> > +li->irq.pgm = irq->u.pgm;
> > +__set_bit(IRQ_PEND_PROG, &li->pending_irqs);
> 
> ^
> 
> > +static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq 
> > *irq)
> >  {
> >  struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
> >  
> > -inti->ext.ext_params2 = s390int->parm64;
> > -list_add_tail(&inti->list, &li->list);
> > -atomic_set(&li->active, 1);
> > +VCPU_EVENT(vcpu, 3, "inject: external irq params:%x, params2:%llx",
> > +   irq->u.ext.ext_params, irq->u.ext.ext_params2);
> > +trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT,
> > +   irq->u.ext.ext_params,
> > +   irq->u.ext.ext_params2, 2);
> > +
> > +li->irq.ext = irq->u.ext;
> > +set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);
> 
> ^^^
> 
> You're using atomic and non-atomic bitops all over the place on the same
> object(s). It would be very good to have some consistency here.
> And as far as I remember the non-atomic variant is good enough anyway.

I think you are right. The non-atomic bitops are sufficient here. I'll fix this.

Paolo, is a follow-up patch good enough or should we fix this one?


regards
Jens



Re: [PATCH RFC 1/2] KVM: don't check for PF_VCPU when yielding

2014-12-01 Thread David Hildenbrand
> On 11/28/2014 04:28 PM, Christian Borntraeger wrote:
> > Am 28.11.2014 um 11:08 schrieb Raghavendra KT:
> >> Was able to test the patch, here is the result: I have not tested with
> >> bigger VMs though. Results make it difficult to talk about any side
> >> effect of
> >> patch if any.
> >
> > Thanks a lot.
> >
> > If our assumption is correct, then this patch should have no side effect on 
> > x86. Do you have any confidence guess if the numbers below mean: no-change 
> > vs. regression vs improvement?
> >
> 
> I am seeing very small improvement in <= 1x overcommit cases
> and for >1x overcommit, a very slight regression. But considering the
> test environment noises, I do not see much effect from the
> patch.
> 
> But I admit, I have not explored deeply about,
> 1. assumption of preempted approximately equals PF_VCPU case logic,

PF_VCPU is only a hint that the target vcpu is executing guest code.
If preemption is off, or we are not on s390, PF_VCPU means that the target
vcpu is running and can't be preempted.

With preemption on and on s390, however, this statement is false, so the
check is not always right.
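
For context, the check under discussion looks roughly like this (paraphrased
from the yield path; the exact code touched by the patch may differ):

        /* kvm_vcpu_yield_to(), simplified */
        if (task->flags & PF_VCPU) {
                /* assume the target is busy in guest context: don't yield */
                put_task_struct(task);
                return 0;
        }

On s390 with preemption enabled the target can be preempted while PF_VCPU is
still set, so skipping the yield here can be the wrong decision.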

> 2. whether it helps for any future usages of yield_to against current
> sole usage of virtualization.
> 
> 
> 
Thanks for your test!



Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()

2014-12-01 Thread Paolo Bonzini


On 01/12/2014 10:16, Ard Biesheuvel wrote:
> On 21 November 2014 at 12:24, Christoffer Dall
>  wrote:
>> On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
>>> Instead of using kvm_is_mmio_pfn() to decide whether a host region
>>> should be stage 2 mapped with device attributes, add a new static
>>> function kvm_is_device_pfn() that disregards RAM pages with the
>>> reserved bit set, as those should usually not be mapped as device
>>> memory.
>>>
>>> Signed-off-by: Ard Biesheuvel 
>>> ---
>>>  arch/arm/kvm/mmu.c | 7 ++-
>>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 57a403a5c22b..b007438242e2 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
>>>   return kvm_vcpu_dabt_iswrite(vcpu);
>>>  }
>>>
>>> +static bool kvm_is_device_pfn(unsigned long pfn)
>>> +{
>>> + return !pfn_valid(pfn);
>>> +}
>>> +
>>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>> struct kvm_memory_slot *memslot, unsigned long hva,
>>> unsigned long fault_status)
>>> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
>>> phys_addr_t fault_ipa,
>>>   if (is_error_pfn(pfn))
>>>   return -EFAULT;
>>>
>>> - if (kvm_is_mmio_pfn(pfn))
>>> + if (kvm_is_device_pfn(pfn))
>>>   mem_type = PAGE_S2_DEVICE;
>>>
>>>   spin_lock(&kvm->mmu_lock);
>>> --
>>> 1.8.3.2
>>>
>> Acked-by: Christoffer Dall 
> 
> These 2 patches are now in 3.18-rc7, so they can be dropped from the
> kvmarm queue/next/etc branches

If they are in queue, they can be dropped.  If they are in next, please
leave them in as the next branch should not be rebased.  Duplicate
commits are generally harmless.

Paolo


Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-01 Thread Eric Auger
On 11/25/2014 05:10 PM, Alex Williamson wrote:
> On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
>> On 11/25/2014 01:23 PM, Feng Wu wrote:
>>> This patch adds and documents a new attribute
>>> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE group.
>>> This new attribute is used for VT-d Posted-Interrupts.
>>>
>>> When guest OS changes the interrupt configuration for an
>>> assigned device, such as, MSI/MSIx data/address fields,
>>> QEMU will use this IRQ attribute to tell KVM to update the
>>> related IRTE according the VT-d Posted-Interrrupts Specification,
>>> such as, the guest vector should be updated in the related IRTE.
>>>
>>> Signed-off-by: Feng Wu 
>>> ---
>>>  Documentation/virtual/kvm/devices/vfio.txt |9 +
>>>  include/uapi/linux/kvm.h   |   10 ++
>>>  2 files changed, 19 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
>>> b/Documentation/virtual/kvm/devices/vfio.txt
>>> index f7aff29..39dee86 100644
>>> --- a/Documentation/virtual/kvm/devices/vfio.txt
>>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
>>> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been called to 
>>> trigger the IRQ
>>>  or associate an eventfd to it. Unforwarding can only be called while the
>>>  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition is
>>>  not satisfied, the command returns an -EBUSY.
>>> +
>>> +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups mechanism to post
>>> +   the IRQ to guests.
>>> +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr 
>>> struct.
>>> +
>>> +When guest OS changes the interrupt configuration for an assigned device,
>>> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
>>> +to tell KVM to update the related IRTE according the VT-d 
>>> Posted-Interrrupts
>>> +Specification, such as, the guest vector should be updated in the related 
>>> IRTE.
>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>> index a269a42..e5f86ad 100644
>>> --- a/include/uapi/linux/kvm.h
>>> +++ b/include/uapi/linux/kvm.h
>>> @@ -949,6 +949,7 @@ struct kvm_device_attr {
>>>  #define  KVM_DEV_VFIO_DEVICE   2
>>>  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ  1
>>>  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ2
>>> +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ  3
>>>  
>>>  enum kvm_device_type {
>>> KVM_DEV_TYPE_FSL_MPIC_20= 1,
>>> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
>>> __u32 gsi; /* gsi, ie. virtual IRQ number */
>>>  };
>>>  
Hi Feng, Alex,
I am currently reworking my code to use something closer to this struct.
Would you agree with the following changes?
>>> +struct kvm_posted_intr {
kvm_posted_irq
>>> +   __u32   argsz;
>>> +   __u32   fd; /* file descriptor of the VFIO device */
>>> +   __u32   index;  /* VFIO device IRQ index */
>>> +   __u32   start;
>>> +   __u32   count;
>>> +   int virq[0];/* gsi, ie. virtual IRQ number */
__u32 gsi[];

>>> +};
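
Putting those two suggestions together, the proposed layout would be roughly
(a sketch of the suggestion, not an agreed-upon uapi):

struct kvm_posted_irq {
        __u32   argsz;
        __u32   fd;     /* file descriptor of the VFIO device */
        __u32   index;  /* VFIO device IRQ index */
        __u32   start;
        __u32   count;
        __u32   gsi[];  /* gsi, ie. virtual IRQ number */
};
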
>> Hi Feng,
>>
>> This struct could be used by arm code too. If Alex agrees I could use
>> that one instead. We just need to find a common sensible name
> 
> Yep, the interface might as well support batch setup.  The vfio code
> uses -1 for teardown; if we want to avoid FORWARD vs UNFORWARD we could
> let the data in the structure define which operation to do. 

In case we remove the unforward and use fd=-1 to tear down, the virq=gsi
must uniquely identify the struct. For ARM I think this is true: we
cannot have several physical IRQs forwarded to the same GSI. I don't know
about posted irqs or other archs.

Best Regards

Eric
> Ideally the
> code in virt/kvm/vfio.c would be almost entirely shared and just make
> different arch_foo() callouts.  The PCI smarts in 2/2 here should
> probably be moved out to that same arch_ code.  Thanks,
> 
> Alex
> 



Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread David Vrabel
On 28/11/14 04:49, Juergen Gross wrote:
> On 11/27/2014 07:50 PM, Andrew Cooper wrote:
>> 
>> XenServer uses
>>
>> https://github.com/xenserver/linux-3.x.pg/blob/master/master/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch
>>
>>
>> to deal with these issues.  That patch is based on 3.10.
> 
> Clever. :-)
> 
>>
>> I can't remember whether this has been submitted upstream before (and
>> there were outstanding issues), or whether it fell at an inconvenient
>> time with our development cycles.
> 
> I found
> 
> http://lists.xen.org/archives/html/xen-devel/2014-02/msg02540.html
> 
> and nothing else.

I dropped it because it copy-and-pastes a bunch of otherwise generic x86
assembler and looked unlikely to get an x86 maintainer ack.  If you
think otherwise, feel free to pick it up and run with it.

David


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread David Vrabel
On 27/11/14 18:36, Luis R. Rodriguez wrote:
> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
>> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>>> From: "Luis R. Rodriguez" 
>>>
>>> Some folks had reported that some xen hypercalls take a long time
>>> to complete when issued from the userspace private ioctl mechanism,
>>> this can happen for instance with some hypercalls that have many
>>> sub-operations, this can happen for instance on hypercalls that use
[...]
>>> --- a/drivers/xen/privcmd.c
>>> +++ b/drivers/xen/privcmd.c
>>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
>>>hypercall.arg[0], hypercall.arg[1],
>>>hypercall.arg[2], hypercall.arg[3],
>>>hypercall.arg[4]);
>>> +#ifndef CONFIG_PREEMPT
>>> +   schedule();
>>> +#endif

As Juergen points out, this does nothing.  You need to schedule while in
the middle of the hypercall.

Remember that Xen's hypercall preemption only preempts the hypercall to
run interrupts in the guest.

>>>
>>> return ret;
>>>   }
>>>
>>
>> Sorry, I don't think this will solve anything. You're calling schedule()
>> right after the long running hypercall just nanoseconds before returning
>> to the user.
> 
> Yeah, well that is what [1] tried as well only it tried using
> preempt_schedule_irq() on the hypercall callback...

No.  My patch added a schedule point in the middle of a hypercall on the
return from an interrupt (e.g., the timer interrupt).

David


Re: [PATCH 2/7] KVM: arm: guest debug, define API headers

2014-12-01 Thread Alex Bennée

Christoffer Dall  writes:

> On Wed, Nov 26, 2014 at 03:04:10PM +, Alex Bennée wrote:
>> 
>> Peter Maydell  writes:
>> 
>> > On 25 November 2014 at 16:10, Alex Bennée  wrote:
>> >> +/* Exit types which define why we did a debug exit */
>> >> +#define KVM_DEBUG_EXIT_ERROR   0x0
>> >> +#define KVM_DEBUG_EXIT_SINGLE_STEP 0x1
>> >> +#define KVM_DEBUG_EXIT_SW_BKPT 0x2
>> >> +#define KVM_DEBUG_EXIT_HW_BKPT 0x3
>> >> +#define KVM_DEBUG_EXIT_HW_WTPT 0x4
>> >
>> > The names of these imply that they're generic, but they're
>> > defined in an arch-specific header file...
>> 
>> Yeah, I think these will die and I'll just export the syndrome
>> information directly to QEMU.
>
> huh?

Rather than mapping syndrome to a specific exit value we might as well
export syndrome information to QEMU and let it define the reason.


>
> -Christoffer

-- 
Alex Bennée


Re: [PATCH 4/7] KVM: arm64: guest debug, add SW break point support

2014-12-01 Thread Alex Bennée

Christoffer Dall  writes:

> On Tue, Nov 25, 2014 at 04:10:02PM +, Alex Bennée wrote:
>> This adds support for SW breakpoints inserted by userspace.
>> 
>> First we need to trap all BKPT exceptions in the hypervisor (ELS). This
>> in controlled through the MDCR_EL2 register. I've added a new field to
>
> this is ?
>
>> the vcpu structure to hold this value. There should be scope to
>> rationlise this with the VCPU_DEBUG_FLAGS/KVM_ARM64_DEBUG_DIRTY_SHIFT
>> manipulation in later patches.
>
> I don't understand what you mean with rationalising something in later
> patches.

I was pointing to potential future improvements. I'm wary of touching
the lazy register syncing code in this patch set as it greatly increases
the scope of testing. However I guess I'll have to bite the bullet and
merge it in a sensible way.

>
>> 
>> Once the exception arrives we triggers an exit from the main hypervisor
>> loop to the hypervisor controller. The kvm_debug_exit_arch carries the
>
> hypervisor controller is userspace?

Yeah sorry that one sneaked through.

>
>> address of the exception. If the controller doesn't know of breakpoint
>
> the breakint ?
>
>> then we have a guest inserted breakpoint and the hypervisor needs to
>> start again and deliver the exception to guest.
>
> to the guest
>
>> 
>> Signed-off-by: Alex Bennée 
>> 
>> diff --git a/Documentation/virtual/kvm/api.txt 
>> b/Documentation/virtual/kvm/api.txt
>> index 2c6386e..9383359 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2592,7 +2592,7 @@ when running. Common control bits are:
>>  The top 16 bits of the control field are architecture specific control
>>  flags which can include the following:
>>  
>> -  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86]
>> +  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64]
>>- KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
>>- KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
>>- KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index a0ff410..48d26bb 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -303,6 +303,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
>>  struct kvm_guest_debug *dbg)
>>  {
>> +/* Route debug traps to el2? */
>> +bool route_el2 = false;
>> +
>>  /* If it's not enabled clear all flags */
>>  if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
>>  vcpu->guest_debug = 0;
>> @@ -320,8 +323,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
>> *vcpu,
>>  
>>  /* Software Break Points */
>>  if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
>> -kvm_info("SW BP support requested, not yet implemented\n");
>> -return -EINVAL;
>> +kvm_info("SW BP support requested\n");
>> +route_el2 = true;
>>  }
>>  
>>  /* Hardware assisted Break and Watch points */
>> @@ -330,6 +333,20 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
>> *vcpu,
>>  return -EINVAL;
>>  }
>>  
>> +/* If we are going to handle any debug exceptions we need to
>
> wings on the comments
>
>> + * set MDCE_EL2.TDE to ensure they are routed to the
>> + * hypervisor. If userspace determines the exception was
>> + * actually internal to the guest it needs to handle
>> + * re-injecting the exception.
>> + */
>> +if (route_el2) {
>> +kvm_info("routing debug exceptions");
>> +vcpu->arch.mdcr_el2 = MDCR_EL2_TDE;
>> +} else {
>> +kvm_info("not routing debug exceptions");
>> +vcpu->arch.mdcr_el2 = 0;
>> +}
>> +
>>  return 0;
>>  }
>>  
>> diff --git a/arch/arm64/include/asm/kvm_host.h 
>> b/arch/arm64/include/asm/kvm_host.h
>> index 2012c4b..38b0f07 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -98,6 +98,7 @@ struct kvm_vcpu_arch {
>>  
>>  /* HYP configuration */
>>  u64 hcr_el2;
>> +u32 mdcr_el2;
>>  
>>  /* Exception Information */
>>  struct kvm_vcpu_fault_info fault;
>> diff --git a/arch/arm64/kernel/asm-offsets.c 
>> b/arch/arm64/kernel/asm-offsets.c
>> index 9a9fce0..8da1043 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -122,6 +122,7 @@ int main(void)
>>DEFINE(VCPU_HPFAR_EL2,offsetof(struct kvm_vcpu, 
>> arch.fault.hpfar_el2));
>>DEFINE(VCPU_DEBUG_FLAGS,  offsetof(struct kvm_vcpu, arch.debug_flags));
>>DEFINE(VCPU_HCR_EL2,  offsetof(struct kvm_vcpu, 
>> arch.hcr_el2));
>> +  DEFINE(VCPU_MDCR_EL2, offsetof(struct kvm_vcpu, arch.mdcr_el2));
>>DEFINE(VCPU_IRQ_LINES,offsetof(struct kvm_vcpu, arch.irq_lines));
>>DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, 

Re: [PATCH] kvm: x86: remove vmx_vm_has_apicv() outside of hwapic_isr_update()

2014-12-01 Thread Paolo Bonzini


On 01/12/2014 10:28, Tiejun Chen wrote:
> In most cases, when calling hwapic_isr_update(), we have already checked
> that kvm_apic_vid_enabled() == 1, and kvm_apic_vid_enabled() resolves to
> kvm_apic_vid_enabled()
> -> kvm_x86_ops->vm_has_apicv()
> -> vmx_vm_has_apicv(), or '0' in the svm case
> 
> So it's unnecessary to repeat this check inside hwapic_isr_update(); just
> move the vmx_vm_has_apicv() check out of it and follow the other call sites.

If you want to do this, please NULL out the function pointer instead, as
KVM already does for hwapic_irr_update.
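
Concretely, that would mirror the hwapic_irr_update handling already visible
in the hunk below (a sketch of the suggestion, not the final patch):

        /* only invoke the callback if the backend installed one */
        if (kvm_x86_ops->hwapic_isr_update)
                kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
                                               apic_find_highest_isr(apic));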

Paolo

> Signed-off-by: Tiejun Chen 
> ---
>  arch/x86/kvm/lapic.c | 3 ++-
>  arch/x86/kvm/vmx.c   | 3 ---
>  2 files changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index e0e5642..2ddc426 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1739,7 +1739,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
>   if (kvm_x86_ops->hwapic_irr_update)
>   kvm_x86_ops->hwapic_irr_update(vcpu,
>   apic_find_highest_irr(apic));
> - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
> + if (kvm_apic_vid_enabled(vcpu->kvm))
> + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, 
> apic_find_highest_isr(apic));
>   kvm_make_request(KVM_REQ_EVENT, vcpu);
>   kvm_rtc_eoi_tracking_restore_one(vcpu);
>  }
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a951d8..f0c16a9 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -7406,9 +7406,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int 
> isr)
>   u16 status;
>   u8 old;
>  
> - if (!vmx_vm_has_apicv(kvm))
> - return;
> -
>   if (isr == -1)
>   isr = 0;
>  
> 


Re: [PATCH 5/7] KVM: arm64: guest debug, add support for single-step

2014-12-01 Thread Alex Bennée

Christoffer Dall  writes:

> On Tue, Nov 25, 2014 at 04:10:03PM +, Alex Bennée wrote:
>> This adds support for single-stepping the guest. As userspace can and
>> will manipulate guest registers before restarting any tweaking of the
>> registers has to occur just before control is passed back to the guest.
>> Furthermore while guest debugging is in effect we need to squash the
>> ability of the guest to single-step itself as we have no easy way of
>> re-entering the guest after the exception has been delivered to the
>> hypervisor.
>
> Admittedly this is a corner case, but wouldn't the only really nasty bit
> of this be to emulate the guest debug exception?

Well yes - currently this is all squashed by ignoring the guest's wishes
while we are debugging (save for SW breakpoints).

>
>> 
>> Signed-off-by: Alex Bennée 
>> 
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 48d26bb..a76daae 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -38,6 +38,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -300,6 +301,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  kvm_arm_set_running_vcpu(NULL);
>>  }
>>  
>> +/**
>> + * kvm_arch_vcpu_ioctl_set_guest_debug - Setup guest debugging
>> + * @kvm:pointer to the KVM struct
>> + * @kvm_guest_debug: the ioctl data buffer
>> + *
>> + * This sets up the VM for guest debugging. Care has to be taken when
>> + * manipulating guest registers as these will be set/cleared by the
>> + * hyper-visor controller, typically before each kvm_run event. As a
>
> hypervisor
>
>> + * result modification of the guest registers needs to take place
>> + * after they have been restored in the hyp.S trampoline code.
>
> I don't understand this??

We can't use GET/SET one reg to manipulate the registers we want as
these are the guest visible versions and subject to modification by
userspace. This is why the debugging code makes its changes after the
guest state has been restored.

>
>> + */
>>  int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
>>  struct kvm_guest_debug *dbg)
>>  {
>> @@ -317,8 +329,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
>> *vcpu,
>>  
>>  /* Single Step */
>>  if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
>> -kvm_info("SS requested, not yet implemented\n");
>> -return -EINVAL;
>> +kvm_info("SS requested\n");
>> +route_el2 = true;
>>  }
>>  
>>  /* Software Break Points */
>> diff --git a/arch/arm64/kernel/asm-offsets.c 
>> b/arch/arm64/kernel/asm-offsets.c
>> index 8da1043..78e5ae1 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -121,6 +121,7 @@ int main(void)
>>DEFINE(VCPU_FAR_EL2,  offsetof(struct kvm_vcpu, 
>> arch.fault.far_el2));
>>DEFINE(VCPU_HPFAR_EL2,offsetof(struct kvm_vcpu, 
>> arch.fault.hpfar_el2));
>>DEFINE(VCPU_DEBUG_FLAGS,  offsetof(struct kvm_vcpu, arch.debug_flags));
>> +  DEFINE(GUEST_DEBUG,   offsetof(struct kvm_vcpu, guest_debug));
>>DEFINE(VCPU_HCR_EL2,  offsetof(struct kvm_vcpu, 
>> arch.hcr_el2));
>>DEFINE(VCPU_MDCR_EL2, offsetof(struct kvm_vcpu, arch.mdcr_el2));
>>DEFINE(VCPU_IRQ_LINES,offsetof(struct kvm_vcpu, arch.irq_lines));
>> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
>> index 28dc92b..6def054 100644
>> --- a/arch/arm64/kvm/handle_exit.c
>> +++ b/arch/arm64/kvm/handle_exit.c
>> @@ -91,6 +91,25 @@ static int kvm_handle_bkpt(struct kvm_vcpu *vcpu, struct 
>> kvm_run *run)
>>  return 0;
>>  }
>>  
>> +/**
>> + * kvm_handle_ss - handle single step exceptions
>> + *
>> + * @vcpu:   the vcpu pointer
>> + *
>> + * See: ARM ARM D2.12 for the details. While the host is routing debug
>> + * exceptions to it's handlers we have to suppress the ability of the
>
> its handlers
>
>> + * guest to trigger exceptions.
>
> not really sure why this comment is here?  Does it really help anyone
> reading this specific function or does it just confuse people more?
>
>> + */
>> +static int kvm_handle_ss(struct kvm_vcpu *vcpu, struct kvm_run *run)
>> +{
>> +WARN_ON(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP));
>
> is this something that can actually happen or should it be a BUG_ON() -
> which may even go away once you're doing hacking on this?

It shouldn't happen. I was treating it more like an assert, failure of
which would indicate something has gone wrong somewhere, although it's
generally not worth bringing the kernel down for.

>
>> +
>> +run->exit_reason = KVM_EXIT_DEBUG;
>> +run->debug.arch.exit_type = KVM_DEBUG_EXIT_SINGLE_STEP;
>> +run->debug.arch.address = *vcpu_pc(vcpu);
>> +return 0;
>> +}
>> +
>>  static exit_handle_fn arm_exit_handlers[] = {
>>  [ESR_EL2_EC_WFI]= kvm_handle_wfx,
>>  [ESR_EL2_EC_CP15_32]= kvm_handle_cp15_32,
>> @

Re: [PATCH 6/7] KVM: arm64: re-factor hyp.S debug register code

2014-12-01 Thread Alex Bennée

Christoffer Dall  writes:

> On Tue, Nov 25, 2014 at 04:10:04PM +, Alex Bennée wrote:
>> This is a pre-cursor to sharing the code with the guest debug support.
>> This replaces the big macro that fishes data out of a fixed location
>> with a more general helper macro to restore a set of debug registers. It
>> uses macro substitution so it can be re-used for debug control and value
>> registers. It does however rely on the debug registers being 64 bit
>> aligned (as they happen to be in the hyp ABI).
>
> can you enforce that somewhere?

There is a comment in kvm_asm.h:

/*
 * 0 is reserved as an invalid value.
 * Order *must* be kept in sync with the hyp switch code.
 */

But I'm not sure how to enforce it in assembly. Is there a #pragma or
something I can use?

>
>> 
>> Signed-off-by: Alex Bennée 
>> 
>> diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
>> index c0bc218..b38ce3d 100644
>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -490,65 +490,12 @@
>>  msr mdscr_el1,  x25
>>  .endm
>>  
>> -.macro restore_debug
>> -// x2: base address for cpu context
>> -// x3: tmp register
>> -
>> -mrs x26, id_aa64dfr0_el1
>> -ubfxx24, x26, #12, #4   // Extract BRPs
>> -ubfxx25, x26, #20, #4   // Extract WRPs
>> -mov w26, #15
>> -sub w24, w26, w24   // How many BPs to skip
>> -sub w25, w26, w25   // How many WPs to skip
>> -
>> -add x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
>> -
>> -adr x26, 1f
>> -add x26, x26, x24, lsl #2
>> -br  x26
>> -1:
>> -ldr x20, [x3, #(15 * 8)]
>> -ldr x19, [x3, #(14 * 8)]
>> -ldr x18, [x3, #(13 * 8)]
>> -ldr x17, [x3, #(12 * 8)]
>> -ldr x16, [x3, #(11 * 8)]
>> -ldr x15, [x3, #(10 * 8)]
>> -ldr x14, [x3, #(9 * 8)]
>> -ldr x13, [x3, #(8 * 8)]
>> -ldr x12, [x3, #(7 * 8)]
>> -ldr x11, [x3, #(6 * 8)]
>> -ldr x10, [x3, #(5 * 8)]
>> -ldr x9, [x3, #(4 * 8)]
>> -ldr x8, [x3, #(3 * 8)]
>> -ldr x7, [x3, #(2 * 8)]
>> -ldr x6, [x3, #(1 * 8)]
>> -ldr x5, [x3, #(0 * 8)]
>> -
>> +.macro setup_debug_registers type
>> +// x3: pointer to register set
>> +// x4: number of registers to copy
>> +// x5..x20/x26 trashed
>
> does this mean x5 through x20, and x26, get trashed or that potentially
> x5 thgough x26 get trashed?  I'm assuming the first, so replace the
> slash with 'and' would be more clear.
>
>>  adr x26, 1f
>> -add x26, x26, x24, lsl #2
>> -br  x26
>> -1:
>> -msr dbgbcr15_el1, x20
>> -msr dbgbcr14_el1, x19
>> -msr dbgbcr13_el1, x18
>> -msr dbgbcr12_el1, x17
>> -msr dbgbcr11_el1, x16
>> -msr dbgbcr10_el1, x15
>> -msr dbgbcr9_el1, x14
>> -msr dbgbcr8_el1, x13
>> -msr dbgbcr7_el1, x12
>> -msr dbgbcr6_el1, x11
>> -msr dbgbcr5_el1, x10
>> -msr dbgbcr4_el1, x9
>> -msr dbgbcr3_el1, x8
>> -msr dbgbcr2_el1, x7
>> -msr dbgbcr1_el1, x6
>> -msr dbgbcr0_el1, x5
>> -
>> -add x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
>> -
>> -adr x26, 1f
>> -add x26, x26, x24, lsl #2
>> +add x26, x26, x4, lsl #2
>>  br  x26
>>  1:
>>  ldr x20, [x3, #(15 * 8)]
>> @@ -569,116 +516,25 @@
>>  ldr x5, [x3, #(0 * 8)]
>>  
>>  adr x26, 1f
>> -add x26, x26, x24, lsl #2
>> -br  x26
>> -1:
>> -msr dbgbvr15_el1, x20
>> -msr dbgbvr14_el1, x19
>> -msr dbgbvr13_el1, x18
>> -msr dbgbvr12_el1, x17
>> -msr dbgbvr11_el1, x16
>> -msr dbgbvr10_el1, x15
>> -msr dbgbvr9_el1, x14
>> -msr dbgbvr8_el1, x13
>> -msr dbgbvr7_el1, x12
>> -msr dbgbvr6_el1, x11
>> -msr dbgbvr5_el1, x10
>> -msr dbgbvr4_el1, x9
>> -msr dbgbvr3_el1, x8
>> -msr dbgbvr2_el1, x7
>> -msr dbgbvr1_el1, x6
>> -msr dbgbvr0_el1, x5
>> -
>> -add x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
>> -
>> -adr x26, 1f
>> -add x26, x26, x25, lsl #2
>> +add x26, x26, x4, lsl #2
>>  br  x26
>>  1:
>> -ldr x20, [x3, #(15 * 8)]
>> -ldr x19, [x3, #(14 * 8)]
>> -ldr x18, [x3, #(13 * 8)]
>> -ldr x17, [x3, #(12 * 8)]
>> -ldr x16, [x3, #(11 * 8)]
>> -ldr x15, [x3, #(10 * 8)]
>> -ldr x14, [x3, #(9 * 8)]
>> -ldr x13, [x3, #(8 * 8)]
>> -ldr x12, [x3, #(7 * 8)]
>> -ldr x11, [x3, #(6 * 8)]
>> -ldr x10, [x3, #(5 * 8)]
>> -ldr x9, [x3, #(4 * 8)]
>> -ldr x8, [x3, #(3 * 8)]
>> -ldr x7, [x3, #(2 * 8)]
>> -ldr x6, [x3, #(1 * 8)]
>> -ldr x5, [x3, #(0 * 8)]
>> -
>> -adr x26, 1f
>> -add x26, x26, x25, lsl #2
>> -br  x26
>> -1:
>> -msr dbgwcr15_el1, x20
>> -msr dbgwcr14_el1, x1

Re: [PATCH 7/7] KVM: arm64: guest debug, HW assisted debug support

2014-12-01 Thread Alex Bennée

Christoffer Dall  writes:

> On Tue, Nov 25, 2014 at 04:10:05PM +, Alex Bennée wrote:

>> --- a/arch/arm64/kvm/hyp.S
>> +++ b/arch/arm64/kvm/hyp.S
>> @@ -18,6 +18,7 @@
>>  #include 
>>  #include 
>>  
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -174,6 +175,7 @@
>>  ldr x3, [x0, #GUEST_DEBUG]
>>  tbz x3, #KVM_GUESTDBG_ENABLE_SHIFT, 2f  // No guest debug
>>  
>> +// Both Step and HW BP/WP ops need to modify spsr_el2 and mdscr_el1
>>  // x0 - preserved as VCPU ptr
>>  // x1 - spsr
>>  // x2 - mdscr
>> @@ -191,6 +193,11 @@
>>  eor x1, x1, #DBG_SPSR_SS
>>  eor x2, x2, #DBG_MDSCR_SS
>>  1:
>> +// If we are doing HW BP/WP - set MDSCR_EL1.KDE/MDE
>> +tbz x3, #KVM_GUESTDBG_USE_HW_BP_SHIFT, 3f
>> +orr x2, x2, #DBG_MDSCR_KDE
>> +orr x2, x2, #DBG_MDSCR_MDE
>> +3:
>>  msr spsr_el2, x1
>>  msr mdscr_el1, x2
>>  2:
>> @@ -815,6 +822,33 @@ __restore_debug:
>>  
>>  ret
>>  
>> +/* Setup debug state for debug of guest */
>> +__setup_debug:
>> +// x0: vcpu base address
>> +// x3: ptr to guest registers passed to setup_debug_registers
>> +// x5..x20/x26: trashed
>> +
>> +mrs x26, id_aa64dfr0_el1
>> +ubfxx24, x26, #12, #4   // Extract BRPs
>> +ubfxx25, x26, #20, #4   // Extract WRPs
>> +mov w26, #15
>> +sub w24, w26, w24   // How many BPs to skip
>> +sub w25, w26, w25   // How many WPs to skip
>> +
>> +mov x4, x24
>> +add x3, x0, #GUEST_DEBUG_BCR
>> +setup_debug_registers dbgbcr
>> +add x3, x0, #GUEST_DEBUG_BVR
>> +setup_debug_registers dbgbvr
>> +
>> +mov x4, x25
>> +add x3, x0, #GUEST_DEBUG_WCR
>> +setup_debug_registers dbgwcr
>> +add x3, x0, #GUEST_DEBUG_WVR
>> +setup_debug_registers dbgwvr
>> +
>> +ret
>> +
>>  __save_fpsimd:
>>  save_fpsimd
>>  ret
>> @@ -861,6 +895,13 @@ ENTRY(__kvm_vcpu_run)
>>  bl __restore_sysregs
>>  bl __restore_fpsimd
>>  
>> +// Now is the time to set-up the debug registers if we
>> +// are debugging the guest
>> +ldr x3, [x0, #GUEST_DEBUG]
>> +tbz x3, #KVM_GUESTDBG_USE_HW_BP_SHIFT, 2f
>> +bl  __setup_debug
>> +b   1f
>> +2:
>>  skip_debug_state x3, 1f
>>  bl  __restore_debug
>>  1:
>> @@ -881,6 +922,11 @@ __kvm_vcpu_return:
>>  bl __save_fpsimd
>>  bl __save_sysregs
>>  
>> +// If we are debugging the guest don't save debug registers
>> +// otherwise we'll be trashing are only good copy we have.
>> +ldr x3, [x0, #GUEST_DEBUG]
>> +tbnzx3, #KVM_GUESTDBG_USE_HW_BP_SHIFT, 1f
>> +
>
> we're introducing an awful lot of conditionals in the assembly code with
> these patches, can you re-consider if there's a cleaner abstraction that
> allows us to deal with some of this stuff in C-code?

See previous mail. It would be good but we need a place to do it before
we enter hyp.S on a KVM_RUN ioctl. I'm open to suggestions.

>
> -Christoffer

-- 
Alex Bennée


Re: [PATCH v7 29/46] vhost: add memory access wrappers

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:11:39 +0200
"Michael S. Tsirkin"  wrote:

> Add guest memory access wrappers to handle virtio endianness
> conversions.
> 
> Signed-off-by: Michael S. Tsirkin 
> Reviewed-by: Jason Wang 
> ---
>  drivers/vhost/vhost.h | 31 +++
>  1 file changed, 31 insertions(+)
> 
Reviewed-by: Cornelia Huck 



Re: [PATCH v7 30/46] vhost/net: force len for TX to host endian

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:11:44 +0200
"Michael S. Tsirkin"  wrote:

> vhost/net keeps a copy of some used ring but (ab)uses length
> field for internal house-keeping. This works because
> for tx used length is always 0.
> Suppress sparse errors: we use native endian-ness internally but never
> expose it to guest.

I admit that I find this patch description hard to read :)


"vhost/net keeps a copy of the used ring in host memory but (ab)uses
the length field for internal house-keeping. This works because the
length in the used ring for tx is always 0. In order to suppress sparse
errors, we need to force native endianness."

?

> 
> Signed-off-by: Michael S. Tsirkin 
> Reviewed-by: Jason Wang 
> ---
>  drivers/vhost/net.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 8dae2f7..dce5c58 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -48,15 +48,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy 
> TX;"
>   * status internally; used for zerocopy tx only.
>   */
>  /* Lower device DMA failed */
> -#define VHOST_DMA_FAILED_LEN 3
> +#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3)
>  /* Lower device DMA done */
> -#define VHOST_DMA_DONE_LEN   2
> +#define VHOST_DMA_DONE_LEN   ((__force __virtio32)2)
>  /* Lower device DMA in progress */
> -#define VHOST_DMA_IN_PROGRESS1
> +#define VHOST_DMA_IN_PROGRESS((__force __virtio32)1)
>  /* Buffer unused */
> -#define VHOST_DMA_CLEAR_LEN  0
> +#define VHOST_DMA_CLEAR_LEN  ((__force __virtio32)0)
> 
> -#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
> +#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force 
> u32)VHOST_DMA_DONE_LEN)
> 
>  enum {
>   VHOST_NET_FEATURES = VHOST_FEATURES |



Re: [PATCH 1/2] arm/arm64: kvm: drop inappropriate use of kvm_is_mmio_pfn()

2014-12-01 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 10:54:44AM +0100, Paolo Bonzini wrote:
> 
> 
> On 01/12/2014 10:16, Ard Biesheuvel wrote:
> > On 21 November 2014 at 12:24, Christoffer Dall
> >  wrote:
> >> On Mon, Nov 10, 2014 at 09:33:55AM +0100, Ard Biesheuvel wrote:
> >>> Instead of using kvm_is_mmio_pfn() to decide whether a host region
> >>> should be stage 2 mapped with device attributes, add a new static
> >>> function kvm_is_device_pfn() that disregards RAM pages with the
> >>> reserved bit set, as those should usually not be mapped as device
> >>> memory.
> >>>
> >>> Signed-off-by: Ard Biesheuvel 
> >>> ---
> >>>  arch/arm/kvm/mmu.c | 7 ++-
> >>>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> >>> index 57a403a5c22b..b007438242e2 100644
> >>> --- a/arch/arm/kvm/mmu.c
> >>> +++ b/arch/arm/kvm/mmu.c
> >>> @@ -834,6 +834,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
> >>>   return kvm_vcpu_dabt_iswrite(vcpu);
> >>>  }
> >>>
> >>> +static bool kvm_is_device_pfn(unsigned long pfn)
> >>> +{
> >>> + return !pfn_valid(pfn);
> >>> +}
> >>> +
> >>>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >>> struct kvm_memory_slot *memslot, unsigned long 
> >>> hva,
> >>> unsigned long fault_status)
> >>> @@ -904,7 +909,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> >>> phys_addr_t fault_ipa,
> >>>   if (is_error_pfn(pfn))
> >>>   return -EFAULT;
> >>>
> >>> - if (kvm_is_mmio_pfn(pfn))
> >>> + if (kvm_is_device_pfn(pfn))
> >>>   mem_type = PAGE_S2_DEVICE;
> >>>
> >>>   spin_lock(&kvm->mmu_lock);
> >>> --
> >>> 1.8.3.2
> >>>
> >> Acked-by: Christoffer Dall 
> > 
> > These 2 patches are now in 3.18-rc7, so they can be dropped from the
> > kvmarm queue/next/etc branches
> 
> If they are in queue, they can be dropped.  If they are in next, please
> leave them in as the next branch should not be rebased.  Duplicate
> commits are generally harmless.
> 
They are in kvmarm/next and will stay there ;)

-Christoffer


Re: [PATCH v7 30/46] vhost/net: force len for TX to host endian

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 01:20:51PM +0100, Cornelia Huck wrote:
> On Sun, 30 Nov 2014 17:11:44 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > vhost/net keeps a copy of some used ring but (ab)uses length
> > field for internal house-keeping. This works because
> > for tx used length is always 0.
> > Suppress sparse errors: we use native endian-ness internally but never
> > expose it to guest.
> 
> I admit that I find this patch description hard to read :)
> 
> 
> "vhost/net keeps a copy of the used ring in host memory but (ab)uses
> the length field for internal house-keeping. This works because the
> length in the used ring for tx is always 0. In order to suppress sparse
> errors, we need to force native endianness."
> 
> ?

Yes. Add to this "These values are never exposed to guest."

> > 
> > Signed-off-by: Michael S. Tsirkin 
> > Reviewed-by: Jason Wang 
> > ---
> >  drivers/vhost/net.c | 10 +-
> >  1 file changed, 5 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 8dae2f7..dce5c58 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -48,15 +48,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero 
> > Copy TX;"
> >   * status internally; used for zerocopy tx only.
> >   */
> >  /* Lower device DMA failed */
> > -#define VHOST_DMA_FAILED_LEN   3
> > +#define VHOST_DMA_FAILED_LEN   ((__force __virtio32)3)
> >  /* Lower device DMA done */
> > -#define VHOST_DMA_DONE_LEN 2
> > +#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
> >  /* Lower device DMA in progress */
> > -#define VHOST_DMA_IN_PROGRESS  1
> > +#define VHOST_DMA_IN_PROGRESS  ((__force __virtio32)1)
> >  /* Buffer unused */
> > -#define VHOST_DMA_CLEAR_LEN0
> > +#define VHOST_DMA_CLEAR_LEN((__force __virtio32)0)
> > 
> > -#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
> > +#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force 
> > u32)VHOST_DMA_DONE_LEN)
> > 
> >  enum {
> > VHOST_NET_FEATURES = VHOST_FEATURES |


Re: [PATCH v7 31/46] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:11:49 +0200
"Michael S. Tsirkin"  wrote:

> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/vhost/vhost.c | 93 
> +++
>  1 file changed, 56 insertions(+), 37 deletions(-)
> 

> @@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
>  {
>   struct vring_desc desc;
>   unsigned int i = 0, count, found = 0;
> + u32 len = vhost32_to_cpu(vq, indirect->len);
>   int ret;
> 
>   /* Sanity check */
> - if (unlikely(indirect->len % sizeof desc)) {
> + if (unlikely(len % sizeof desc)) {
>   vq_err(vq, "Invalid length in indirect descriptor: "
>  "len 0x%llx not multiple of 0x%zx\n",
> -(unsigned long long)indirect->len,
> +(unsigned long long)vhost32_to_cpu(vq, indirect->len),

Can't you use len here?

>  sizeof desc);
>   return -EINVAL;
>   }
> 
> - ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
> + ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
> vq->indirect,
>UIO_MAXIOV);
>   if (unlikely(ret < 0)) {
>   vq_err(vq, "Translation failure %d in indirect.\n", ret);


> @@ -1404,7 +1422,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct 
> vring_used_elem *heads,
> 
>   /* Make sure buffer is written before we update index. */
>   smp_wmb();
> - if (put_user(vq->last_used_idx, &vq->used->idx)) {
> + if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {

Why s/put_user/__put_user/ - I don't see how endianness conversions
should have an influence there?

>   vq_err(vq, "Failed to increment used idx");
>   return -EFAULT;
>   }

> @@ -1449,11 +1468,11 @@ static bool vhost_notify(struct vhost_dev *dev, 
> struct vhost_virtqueue *vq)
>   if (unlikely(!v))
>   return true;
> 
> - if (get_user(event, vhost_used_event(vq))) {
> + if (__get_user(event, vhost_used_event(vq))) {

Dito: why the change?

>   vq_err(vq, "Failed to get used event idx");
>   return true;
>   }
> - return vring_need_event(event, new, old);
> + return vring_need_event(vhost16_to_cpu(vq, event), new, old);
>  }
> 
>  /* This actually signals the guest, using eventfd. */



Re: [PATCH v7 32/46] vhost/net: virtio 1.0 byte swap

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:11:54 +0200
"Michael S. Tsirkin"  wrote:

> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/vhost/net.c | 12 +++-
>  1 file changed, 7 insertions(+), 5 deletions(-)
> 
Reviewed-by: Cornelia Huck 



Re: [PATCH v7 33/46] vhost/net: larger header for virtio 1.0

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:11:59 +0200
"Michael S. Tsirkin"  wrote:

> Signed-off-by: Michael S. Tsirkin 
> Reviewed-by: Jason Wang 
> ---
>  drivers/vhost/net.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 

Reviewed-by: Cornelia Huck 



Re: [PATCH v7 31/46] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 01:33:53PM +0100, Cornelia Huck wrote:
> On Sun, 30 Nov 2014 17:11:49 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> >  drivers/vhost/vhost.c | 93 
> > +++
> >  1 file changed, 56 insertions(+), 37 deletions(-)
> > 
> 
> > @@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
> >  {
> > struct vring_desc desc;
> > unsigned int i = 0, count, found = 0;
> > +   u32 len = vhost32_to_cpu(vq, indirect->len);
> > int ret;
> > 
> > /* Sanity check */
> > -   if (unlikely(indirect->len % sizeof desc)) {
> > +   if (unlikely(len % sizeof desc)) {
> > vq_err(vq, "Invalid length in indirect descriptor: "
> >"len 0x%llx not multiple of 0x%zx\n",
> > -  (unsigned long long)indirect->len,
> > +  (unsigned long long)vhost32_to_cpu(vq, indirect->len),
> 
> Can't you use len here?

Not if I want the error message to be readable.

> >sizeof desc);
> > return -EINVAL;
> > }
> > 
> > -   ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
> > +   ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
> > vq->indirect,
> >  UIO_MAXIOV);
> > if (unlikely(ret < 0)) {
> > vq_err(vq, "Translation failure %d in indirect.\n", ret);
> 
> 
> > @@ -1404,7 +1422,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, 
> > struct vring_used_elem *heads,
> > 
> > /* Make sure buffer is written before we update index. */
> > smp_wmb();
> > -   if (put_user(vq->last_used_idx, &vq->used->idx)) {
> > +   if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
> 
> Why s/put_user/__put_user/ - I don't see how endianness conversions
> should have an influence there?

We should generally switch to the __ variants since addresses are pre-validated.
But I agree - it should be a separate patch.

> 
> > vq_err(vq, "Failed to increment used idx");
> > return -EFAULT;
> > }
> 
> > @@ -1449,11 +1468,11 @@ static bool vhost_notify(struct vhost_dev *dev, 
> > struct vhost_virtqueue *vq)
> > if (unlikely(!v))
> > return true;
> > 
> > -   if (get_user(event, vhost_used_event(vq))) {
> > +   if (__get_user(event, vhost_used_event(vq))) {
> 
> Dito: why the change?

Same. Will split this out, it's unrelated to virtio 1.0.

> > vq_err(vq, "Failed to get used event idx");
> > return true;
> > }
> > -   return vring_need_event(event, new, old);
> > +   return vring_need_event(vhost16_to_cpu(vq, event), new, old);
> >  }
> > 
> >  /* This actually signals the guest, using eventfd. */


Re: [PATCH v7 36/46] vhost/net: suppress compiler warning

2014-12-01 Thread Cornelia Huck
On Sun, 30 Nov 2014 17:12:13 +0200
"Michael S. Tsirkin"  wrote:

> len is always initialized since function is called with size > 0.
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/vhost/net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 984242e..54ffbb0 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>   int headcount = 0;
>   unsigned d;
>   int r, nlogs = 0;
> - u32 len;
> + u32 uninitialized_var(len);
> 
>   while (datalen > 0 && headcount < quota) {
>   if (unlikely(seg >= UIO_MAXIOV)) {

Want to merge this with the patch introducing the variable and add a
comment there?
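
For context, uninitialized_var() only silences a false-positive "may be used
uninitialized" warning; for gcc it was defined at the time roughly as a
self-assignment, which generates no code:

/* From include/linux/compiler-gcc.h (approximately): */
#define uninitialized_var(x) x = x

So the merge question above is only about whether the intermediate commit in a
bisect builds warning-free, not about runtime behaviour.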



freebsd 10 under linux with kvm and apicv

2014-12-01 Thread Vasiliy Tolstov
Hello. I found some issues with enable_apicv=Y and FreeBSD; has this
problem been solved or not?
I have the latest Linux 3.10.x.


-- 
Vasiliy Tolstov,
e-mail: v.tols...@selfip.ru
jabber: v...@selfip.ru


Re: [PATCH v7 31/46] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Cornelia Huck
On Mon, 1 Dec 2014 14:37:01 +0200
"Michael S. Tsirkin"  wrote:

> On Mon, Dec 01, 2014 at 01:33:53PM +0100, Cornelia Huck wrote:
> > On Sun, 30 Nov 2014 17:11:49 +0200
> > "Michael S. Tsirkin"  wrote:
> > 
> > > Signed-off-by: Michael S. Tsirkin 
> > > ---
> > >  drivers/vhost/vhost.c | 93 
> > > +++
> > >  1 file changed, 56 insertions(+), 37 deletions(-)
> > > 
> > 
> > > @@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue 
> > > *vq,
> > >  {
> > >   struct vring_desc desc;
> > >   unsigned int i = 0, count, found = 0;
> > > + u32 len = vhost32_to_cpu(vq, indirect->len);
> > >   int ret;
> > > 
> > >   /* Sanity check */
> > > - if (unlikely(indirect->len % sizeof desc)) {
> > > + if (unlikely(len % sizeof desc)) {
> > >   vq_err(vq, "Invalid length in indirect descriptor: "
> > >  "len 0x%llx not multiple of 0x%zx\n",
> > > -(unsigned long long)indirect->len,
> > > +(unsigned long long)vhost32_to_cpu(vq, indirect->len),
> > 
> > Can't you use len here?
> 
> Not if I want error message to be readable.

Huh? Both have the same value.



Re: [PATCH v7 31/46] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 01:42:47PM +0100, Cornelia Huck wrote:
> On Mon, 1 Dec 2014 14:37:01 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Mon, Dec 01, 2014 at 01:33:53PM +0100, Cornelia Huck wrote:
> > > On Sun, 30 Nov 2014 17:11:49 +0200
> > > "Michael S. Tsirkin"  wrote:
> > > 
> > > > Signed-off-by: Michael S. Tsirkin 
> > > > ---
> > > >  drivers/vhost/vhost.c | 93 
> > > > +++
> > > >  1 file changed, 56 insertions(+), 37 deletions(-)
> > > > 
> > > 
> > > > @@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue 
> > > > *vq,
> > > >  {
> > > > struct vring_desc desc;
> > > > unsigned int i = 0, count, found = 0;
> > > > +   u32 len = vhost32_to_cpu(vq, indirect->len);
> > > > int ret;
> > > > 
> > > > /* Sanity check */
> > > > -   if (unlikely(indirect->len % sizeof desc)) {
> > > > +   if (unlikely(len % sizeof desc)) {
> > > > vq_err(vq, "Invalid length in indirect descriptor: "
> > > >"len 0x%llx not multiple of 0x%zx\n",
> > > > -  (unsigned long long)indirect->len,
> > > > +  (unsigned long long)vhost32_to_cpu(vq, 
> > > > indirect->len),
> > > 
> > > Can't you use len here?
> > 
> > Not if I want error message to be readable.
> 
> Huh? Both have the same value.

Ah, good point.
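
The resulting hunk would then presumably read as follows (a sketch of the
agreed-upon change, not the final committed patch):

        /* Sanity check */
        if (unlikely(len % sizeof desc)) {
                vq_err(vq, "Invalid length in indirect descriptor: "
                       "len 0x%llx not multiple of 0x%zx\n",
                       (unsigned long long)len, sizeof desc);
                return -EINVAL;
        }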


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 11:01:18AM +, David Vrabel wrote:
> On 28/11/14 04:49, Juergen Gross wrote:
> > On 11/27/2014 07:50 PM, Andrew Cooper wrote:
> >> 
> >> XenServer uses
> >>
> >> https://github.com/xenserver/linux-3.x.pg/blob/master/master/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch
> >>
> >>
> >> to deal with these issues.  That patch is based on 3.10.
> > 
> > Clever. :-)
> > 
> >>
> >> I can't remember whether this has been submitted upstream before (and
> >> there were outstanding issues), or whether it fell at an inconvenient
> >> time with our development cycles.
> > 
> > I found
> > 
> > http://lists.xen.org/archives/html/xen-devel/2014-02/msg02540.html
> > 
> > and nothing else.
> 
> I dropped it because it copy-and-paste a bunch of otherwise generic x86
> assembler and looked unlikely to get an x86 maintainer ack.  If you
> think otherwise, feel free to pick it up and run with it.

I was trying to run with it, but my biggest gripe with this was
the use of preempt_schedule_irq(), but we can review that on the
other thread.

  Luis


Re: [PATCH 0/5] Improve PSCI system events and fix reboot bugs

2014-12-01 Thread Andrew Jones
On Thu, Nov 27, 2014 at 07:40:55PM +0100, Christoffer Dall wrote:
> Several people have reported problems with rebooting ARM VMs, especially
> on 32-bit ARM.  This is mainly due to the same reason we were seeing
> boot errors in the past, namely that the ram, dcache, and icache weren't
> coherent on guest boot with the guest (stage-1) MMU disabled.  We solved
> this by ensuring coherency when we fault in pages, but since most memory
> is already mapped after a reboot, we don't do anything.
> 
> The solution is to unmap the regular RAM on system events, but we must
> take care to not unmap the GIC or other IO regions, hence the somewhat
> complicated solution.
> 
> As part of figuring this out, it became clear that some semantics around
> the KVM_ARM_VCPU_INIT ABI and system event ABI were unclear (what is
> userspace expected to do when it receives a system event).  This series
> also clarifies the ABI and changes the kernel functionality to do what
> userspace expects (turn off VCPUs on a system shutdown event).
> 
> The code is avaliable here as well:
> http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
> vcpu_init_fixes
> 
> There is an alternative version with more code reuse for what is patch 4
> in this series available here:
> http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
> vcpu_init_fixes-alternative
> 
> See patch 4 for more info on this one.
> 
> Testing
> ---
> This has been tested on CubieBoard, Arndale, TC2, and Juno.  On Arndale
> and TC2 it was extremely easy to reproduce the setup (just start a VM
> that runs reboot from /etc/rc.local or similar) and this series clearly
> fixes the behavior.

We've also seen reboots leading to a stuck vcpu. It appeared to be 100%
reproducible on a freshly installed guest (first reboot after running
the installer), and then intermittently afterwards. I've just tested
this patch series, and it appears to resolve the issue. No stuck vcpu
after install, and a reboot loop has been running for a while now. I'm
testing on a mustang. If you like, feel free to add a

Tested-by: Andrew Jones 

drew

> 
> On Juno we occasionally see lockups of reboot, but I see this both with
> and without this series.  I have run a VM in a loop where the guest
> shuts itself down (same code path) a couple of hundred times without
> seeing any issues, so I think it's safe to merge this and further
> investigate the Juno reboot issue.
> 
> 
> Christoffer Dall (5):
>   arm/arm64: KVM: Correct KVM_ARM_VCPU_INIT power off option
>   arm/arm64: KVM: Reset the HCR on each vcpu when resetting the vcpu
>   arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI
>   arm/arm64: KVM: Introduce stage2_unmap_vm
>   arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on sytem
> exit events
> 
>  Documentation/virtual/kvm/api.txt| 10 +-
>  arch/arm/include/asm/kvm_emulate.h   |  5 +++
>  arch/arm/include/asm/kvm_mmu.h   |  1 +
>  arch/arm/kvm/arm.c   |  4 +++
>  arch/arm/kvm/guest.c |  1 -
>  arch/arm/kvm/mmu.c   | 65 
> 
>  arch/arm/kvm/psci.c  | 18 ++
>  arch/arm64/include/asm/kvm_emulate.h |  5 +++
>  arch/arm64/include/asm/kvm_host.h|  1 +
>  arch/arm64/include/asm/kvm_mmu.h |  1 +
>  arch/arm64/kvm/guest.c   |  1 -
>  11 files changed, 109 insertions(+), 3 deletions(-)
> 
> -- 
> 2.1.2.330.g565301e.dirty
> 
> ___
> kvmarm mailing list
> kvm...@lists.cs.columbia.edu
> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
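
The core of the fix described in the cover letter above is to drop the stage-2
mappings of guest RAM on a reset/system event, so that the existing fault-in
path (which performs the required dcache/icache maintenance) runs again on the
next guest access. Roughly, and with the caveat that this is a sketch based on
the patch titles rather than the actual code:

/* Sketch: unmap only memslot-backed RAM, leaving GIC/IO mappings alone. */
static void stage2_unmap_vm_sketch(struct kvm *kvm)
{
        struct kvm_memory_slot *memslot;
        struct kvm_memslots *slots = kvm_memslots(kvm);

        spin_lock(&kvm->mmu_lock);
        kvm_for_each_memslot(memslot, slots) {
                phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
                phys_addr_t size = memslot->npages << PAGE_SHIFT;

                /*
                 * The stage-2 PTEs go away; user_mem_abort() rebuilds them
                 * with proper cache maintenance when the guest touches the
                 * memory again after the reboot.
                 */
                unmap_stage2_range(kvm, addr, size);
        }
        spin_unlock(&kvm->mmu_lock);
}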


Re: freebsd 10 under linux with kvm and apicv

2014-12-01 Thread Vasiliy Tolstov
2014-12-01 15:38 GMT+03:00 Vasiliy Tolstov :
> Hello. I found some issues with enable_apicv=Y and freebsd, does this
> problem solved or no?
> I'm have latest linux 3.10.x.


Also, I successfully ran FreeBSD 10 with 1 GB of memory, but it failed to boot
with 4 GB of memory.
But in this case I think that is a FreeBSD problem.

-- 
Vasiliy Tolstov,
e-mail: v.tols...@selfip.ru
jabber: v...@selfip.ru


Re: [PATCH] KVM: Introduce dynamically registered hypercall capability

2014-12-01 Thread Radim Krčmář
2014-11-28 17:29-0800, Phil White:
> Good questions.
> 
> One thing that prompted this code is the presence and proliferation of
> architecture specific hypercalls in include/uapi/linux/kvm_para.h.
> I'm not sure why the tree has continued on for as long as it has with
> a list of reserved hypercall indices -- most of which are unused on
> any given architecture.  Is there a reason that I'm unaware of?

The namespace won't be exhausted, so nothing forced them to be separated,
and centralization easily avoids conflicts with generic hypercalls.

> This was written because I had a module which was meant to be loaded
> for paravirtualized VMs and it needed a hook to receive a hypercall.
> We originally reserved an index, but that struck me as sloppy for the
> same reason I mentioned before -- not every system is going to need
> that hypercall number reserved.

Makes sense;  out-of-tree modules are in an especially bad position to
get their hypercalls into the kernel.  (Is a hypercall necessary?)

> I didn't have a problem communicating the hypercall number to the
> guest incidentally -- it had a bit of shared memory where the request
> structure was placed.  That said, it could just as easily be used in a
> static arrangement where each request is made to claim a particular
> ID.

My main trouble is that we would export a very specific/limited feature
for an unknown problem -- we can't even tell if it is appropriate, which
strongly favors rejecting it.

I'd say that a virtio device is the way to go if you want to stay in the
kernel, but outside of kvm modules.  In which ways is virtio lacking?

> It does occur to me that in the absence of the setup which I had
> available, one could simply treat hc_nr as a 4 character ID rather
> than a particular digit.

(This would probably solve the situation in practice, but the conflict
 is still there, so design hasn't improved.)

> "The generation of random numbers is too important to be left to
> chance." -Robert R. Coveyou, Oak Ridge National Laboratory, 1969

:)  (Exactly.)

> On Thu, Nov 27, 2014 at 7:31 AM, Radim Krčmář  wrote:
> > 2014-11-27 05:30-0800, Phil White:
> >> This introduces a list of entries which associate a function pointer of
> >> kvm_hc_type to a hypercall number and allows the ability to register and
> >> unregister entries.  In addition, it also allows the ability to retrieve a
> >> function pointer of kvm_hc_type for a given hypercall number which is meant
> >> to be called from the arch-specific section.
> >>
> >> The main intent is to allow modules to register hypercalls which they own
> >> rather than requiring the addition of a stub of some sort.  It will also
> >> allow each arch to maintain separate lists of hypercalls rather than having
> >> to respect changes in include/uapi/linux/kvm_para.h
> >>
> >> Signed-off-by: Phil White 
> >> ---
> >
> > Apart from other problems,
> > how are guests going to use these hypercalls?
> >
> > (If hc_nr is dynamic, a guest doesn't know its number and even if it is
> >  static, someone could have registered it beforehand => this needs some
> >  kind of synchronization with host modules.  A hardcoded reservation?)
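
For reference, the interface being proposed (and argued against) is roughly of
the following shape. The names and the handler signature are taken from the
description above and are assumptions, not the actual posted patch:

#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Handler signature assumed for illustration. */
typedef int (*kvm_hc_type)(struct kvm_vcpu *vcpu, unsigned long a0,
                           unsigned long a1, unsigned long a2);

struct kvm_hc_entry {
        struct list_head list;
        unsigned long nr;               /* hypercall number claimed */
        kvm_hc_type fn;
};

static LIST_HEAD(hc_list);
static DEFINE_SPINLOCK(hc_lock);

int kvm_register_hypercall(unsigned long nr, kvm_hc_type fn)
{
        struct kvm_hc_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

        if (!e)
                return -ENOMEM;
        e->nr = nr;
        e->fn = fn;
        spin_lock(&hc_lock);
        list_add_tail(&e->list, &hc_list);
        spin_unlock(&hc_lock);
        return 0;
}

/* Arch code would look the handler up on a hypercall exit. */
kvm_hc_type kvm_find_hypercall(unsigned long nr)
{
        struct kvm_hc_entry *e;
        kvm_hc_type fn = NULL;

        spin_lock(&hc_lock);
        list_for_each_entry(e, &hc_list, list) {
                if (e->nr == nr) {
                        fn = e->fn;
                        break;
                }
        }
        spin_unlock(&hc_lock);
        return fn;
}

The conflict Radim points out is visible here: nothing stops two modules from
claiming the same nr, and the guest has no portable way to discover which
number was actually registered.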


Re: [PATCH v7 36/46] vhost/net: suppress compiler warning

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 01:37:40PM +0100, Cornelia Huck wrote:
> On Sun, 30 Nov 2014 17:12:13 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > len is always initialized since function is called with size > 0.
> > 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> >  drivers/vhost/net.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 984242e..54ffbb0 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > int headcount = 0;
> > unsigned d;
> > int r, nlogs = 0;
> > -   u32 len;
> > +   u32 uninitialized_var(len);
> > 
> > while (datalen > 0 && headcount < quota) {
> > if (unlikely(seg >= UIO_MAXIOV)) {
> 
> Want to merge this with the patch introducing the variable and add a
> comment there?

Not really. Warnings in bisect are fine, I think.

-- 
MST


Re: [PATCH v7 36/46] vhost/net: suppress compiler warning

2014-12-01 Thread Cornelia Huck
On Mon, 1 Dec 2014 15:48:39 +0200
"Michael S. Tsirkin"  wrote:

> On Mon, Dec 01, 2014 at 01:37:40PM +0100, Cornelia Huck wrote:
> > On Sun, 30 Nov 2014 17:12:13 +0200
> > "Michael S. Tsirkin"  wrote:
> > 
> > > len is always initialized since function is called with size > 0.
> > > 
> > > Signed-off-by: Michael S. Tsirkin 
> > > ---
> > >  drivers/vhost/net.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 984242e..54ffbb0 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > >   int headcount = 0;
> > >   unsigned d;
> > >   int r, nlogs = 0;
> > > - u32 len;
> > > + u32 uninitialized_var(len);
> > > 
> > >   while (datalen > 0 && headcount < quota) {
> > >   if (unlikely(seg >= UIO_MAXIOV)) {
> > 
> > Want to merge this with the patch introducing the variable and add a
> > comment there?
> 
> Not really. Warnings in bisect are fine  I think.

I'm not sure what a separate patch buys us, though, as it should be
trivial to merge.



Re: [PATCH v3 3/8] VFIO: platform: forwarded state tested when selecting IRQ handler

2014-12-01 Thread Eric Auger
Hi Christoffer,
On 11/30/2014 01:47 PM, Christoffer Dall wrote:
> The subject reads strangely, perhaps just:
> 
> VFIO: platform: test forward state when selecting IRQ handler
OK
> 
> On Sun, Nov 23, 2014 at 07:35:55PM +0100, Eric Auger wrote:
>> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
>> need to disable the IRQ anymore.
>>
>> When setting the IRQ handler we now also test the forwarded state. In
>> case the IRQ is forwarded we select the edge handler (no automaske).
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v2 -> v3:
>> - forwarded state was tested in the handler. Now the forwarded state
>>   is tested before setting the handler. This definitively limits
>>   the dynamics of forwarded state changes but I don't think there is
>>   a use case where we need to be able to change the state at any time.
> 
> user space can change this by calling the VFIO_IRQ_SET_ACTION_TRIGGER
> whenever it wants, right?
Yes, the user can set/unset the VFIO signaling (and request_irq/free_irq)
using VFIO_IRQ_SET_ACTION_TRIGGER. In this new version I do not allow
changing the forwarded state while the handler is attached (request_irq).

Does that answer your question?

Best Regards

Eric

> 
>> ---
>>  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
>> b/drivers/vfio/platform/vfio_platform_irq.c
>> index 08d400e..61a2920 100644
>> --- a/drivers/vfio/platform/vfio_platform_irq.c
>> +++ b/drivers/vfio/platform/vfio_platform_irq.c
>> @@ -230,8 +230,13 @@ static int vfio_platform_set_irq_trigger(struct 
>> vfio_platform_device *vdev,
>>  {
>>  struct vfio_platform_irq *irq = &vdev->irqs[index];
>>  irq_handler_t handler;
>> +struct irq_data *d;
>> +bool is_forwarded;
>>  
>> -if (vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE)
>> +d = irq_get_irq_data(irq->hwirq);
>> +is_forwarded = irqd_irq_forwarded(d);
>> +
>> +if ((vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE) && !is_forwarded)
>>  handler = vfio_maskable_irq_handler;
>>  else
>>  handler = vfio_irq_handler;
>> -- 
>> 1.9.1
>>
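
For context on why the forwarded case can use the plain handler: the "maskable"
handler has to mask a level-triggered line before signalling userspace (which
unmasks it again after servicing the device), whereas a forwarded IRQ is
deactivated by the guest itself through the GIC. A simplified sketch of the
two flavours, with the trigger eventfd field name assumed (this is not the
actual driver code):

static irqreturn_t vfio_maskable_irq_handler(int irq, void *dev_id)
{
        struct vfio_platform_irq *irq_ctx = dev_id;

        /* Level IRQ completed by userspace: mask until it is unmasked. */
        disable_irq_nosync(irq);
        eventfd_signal(irq_ctx->trigger, 1);
        return IRQ_HANDLED;
}

static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
{
        struct vfio_platform_irq *irq_ctx = dev_id;

        /* Forwarded (or edge) IRQ: the guest completes the physical IRQ. */
        eventfd_signal(irq_ctx->trigger, 1);
        return IRQ_HANDLED;
}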



Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Juergen Gross

On 12/01/2014 02:32 PM, Luis R. Rodriguez wrote:

On Mon, Dec 01, 2014 at 11:01:18AM +, David Vrabel wrote:

On 28/11/14 04:49, Juergen Gross wrote:

On 11/27/2014 07:50 PM, Andrew Cooper wrote:


XenServer uses

https://github.com/xenserver/linux-3.x.pg/blob/master/master/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch


to deal with these issues.  That patch is based on 3.10.


Clever. :-)



I can't remember whether this has been submitted upstream before (and
there were outstanding issues), or whether it fell at an inconvenient
time with our development cycles.


I found

http://lists.xen.org/archives/html/xen-devel/2014-02/msg02540.html

and nothing else.


I dropped it because it copy-and-paste a bunch of otherwise generic x86
assembler and looked unlikely to get an x86 maintainer ack.  If you
think otherwise, feel free to pick it up and run with it.


I was trying to run with it, but my biggest gripe with this was
the use of preempt_schedule_irq(), but we can review that on the
other thread.


I think this can be handled completely inside xen_evtchn_do_upcall().

xen_preemptible_hcall_begin() (possibly with another name) could take
a function pointer as a parameter, to be used as the continuation
point after an asynchronous interrupt in the critical section.
The parameter for that function would be the exception frame of the
original interrupt, so that it can continue at the interrupted position
after e.g. calling schedule().


Juergen
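
One way to get "reschedule only while inside a privcmd hypercall" without
touching the generic interrupt assembly is a per-cpu flag that the
upcall/interrupt-return path checks. A rough sketch with illustrative names
(this is not the posted XenServer patch, which does the equivalent at the
assembly level):

#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);

static inline void xen_preemptible_hcall_begin(void)
{
        __this_cpu_write(xen_in_preemptible_hcall, true);
}

static inline void xen_preemptible_hcall_end(void)
{
        __this_cpu_write(xen_in_preemptible_hcall, false);
}

/*
 * Called on the interrupt return path on CONFIG_PREEMPT=n: if we interrupted
 * a preemptible privcmd hypercall and a reschedule is due, allow it before
 * resuming the (continued) hypercall.
 */
void xen_maybe_preempt_hcall(void)
{
        if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) &&
                     need_resched())) {
                __this_cpu_write(xen_in_preemptible_hcall, false);
                _cond_resched();
                __this_cpu_write(xen_in_preemptible_hcall, true);
        }
}

privcmd_ioctl_hypercall() would then wrap the privcmd_call() itself in
begin()/end(), instead of calling schedule() after the hypercall has already
returned.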


Re: [PATCH v3 4/8] KVM: kvm-vfio: User API for IRQ forwarding

2014-12-01 Thread Eric Auger
On 11/30/2014 01:53 PM, Christoffer Dall wrote:
> On Sun, Nov 23, 2014 at 07:35:56PM +0100, Eric Auger wrote:
>> This patch adds and document a new KVM_DEV_VFIO_DEVICE group
> 
> documents
OK
> 
>> and 2 device attributes: KVM_DEV_VFIO_DEVICE_FORWARD_IRQ,
>> KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ. The purpose is to be able
>> to set a VFIO device IRQ as forwarded or not forwarded.
>> the command takes as argument a handle to a new struct named
>> kvm_arch_forwarded_irq.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v2 -> v3:
>> - rework vfio kvm device documentation
>> - reword commit message and title
>> - add subindex in kvm_arch_forwarded_irq to be closer to VFIO API
>> - forwarding state can only be changed with VFIO IRQ signaling is off
>>
>> v1 -> v2:
>> - struct kvm_arch_forwarded_irq moved from arch/arm/include/uapi/asm/kvm.h
>>   to include/uapi/linux/kvm.h
>>   also irq_index renamed into index and guest_irq renamed into gsi
>> - ASSIGN/DEASSIGN renamed into FORWARD/UNFORWARD
>> ---
>>  Documentation/virtual/kvm/devices/vfio.txt | 34 
>> --
>>  include/uapi/linux/kvm.h   | 10 +
>>  2 files changed, 38 insertions(+), 6 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
>> b/Documentation/virtual/kvm/devices/vfio.txt
>> index ef51740..f7aff29 100644
>> --- a/Documentation/virtual/kvm/devices/vfio.txt
>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
>> @@ -4,15 +4,24 @@ VFIO virtual device
>>  Device types supported:
>>KVM_DEV_TYPE_VFIO
>>  
>> -Only one VFIO instance may be created per VM.  The created device
>> -tracks VFIO groups in use by the VM and features of those groups
>> -important to the correctness and acceleration of the VM.  As groups
>> -are enabled and disabled for use by the VM, KVM should be updated
>> -about their presence.  When registered with KVM, a reference to the
>> -VFIO-group is held by KVM.
>> +Only one VFIO instance may be created per VM.
>> +
>> +The created device tracks VFIO groups in use by the VM and features
>> +of those groups important to the correctness and acceleration of
>> +the VM.  As groups are enabled and disabled for use by the VM, KVM
>> +should be updated about their presence.  When registered with KVM,
>> +a reference to the VFIO-group is held by KVM.
>> +
>> +The device also tracks & enable VFIO device forwarded IRQs, if any.
> 
> s/tracks & enable/tracks and enables/
ok
> 
>> +A physical forwarded IRQ is directly completed by the guest. This
>> +requires HW support in the interrupt controller which must be able
>> +to automatically complete the physical IRQ when it detects the guest
>> +has completed the corresponding virtual IRQ. The modality sometimes
>> +is named direct EOI.
> 
> this last sentence is a bit floating out there; perhaps you want to be
> more specific and say what it is on ARM.
Well, this is a name some people used at the KVM Forum. I thought it
was something known on other arches, but I can drop this comment.
> 
>>  
>>  Groups:
>>KVM_DEV_VFIO_GROUP
>> +  KVM_DEV_VFIO_DEVICE
>>  
>>  KVM_DEV_VFIO_GROUP attributes:
>>KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
>> @@ -20,3 +29,16 @@ KVM_DEV_VFIO_GROUP attributes:
>>  
>>  For each, kvm_device_attr.addr points to an int32_t file descriptor
>>  for the VFIO group.
>> +
>> +KVM_DEV_VFIO_DEVICE attributes:
>> +  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ: set a VFIO device IRQ as forwarded
>> +  KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ: set a VFIO device IRQ as not forwarded
>> +
>> +For each, kvm_device_attr.addr points to a kvm_arch_forwarded_irq struct.
>> +
>> +The forwarded state can only be changed when the VFIO signaling mechanism
>> +for this physical IRQ is not set. In other words, forwarding must be
>> +activated before VFIO_DEVICE_SET_IRQS has been called to trigger the IRQ
>> +or associate an eventfd to it. Unforwarding can only be called while the
>> +signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this condition is
>> +not satisfied, the command returns an -EBUSY.
> 
> s/command/ioctl/
ok
> 
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6076882..a269a42 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -946,6 +946,9 @@ struct kvm_device_attr {
>>  #define  KVM_DEV_VFIO_GROUP 1
>>  #define   KVM_DEV_VFIO_GROUP_ADD1
>>  #define   KVM_DEV_VFIO_GROUP_DEL2
>> +#define  KVM_DEV_VFIO_DEVICE2
>> +#define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ   1
>> +#define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ 2
>>  
>>  enum kvm_device_type {
>>  KVM_DEV_TYPE_FSL_MPIC_20= 1,
>> @@ -963,6 +966,13 @@ enum kvm_device_type {
>>  KVM_DEV_TYPE_MAX,
>>  };
>>  
>> +struct kvm_arch_forwarded_irq {
>> +__u32 fd; /* file desciptor of the VFIO device */
>> +__u32 index; /* VFIO device IRQ index */
>> +__u32 subi
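
As a usage illustration of the attributes documented above: userspace would
mark a VFIO device IRQ as forwarded roughly as follows. The struct fields
beyond fd and index are inferred from the changelog (subindex, gsi) and, like
the whole snippet, assume the headers from this proposed series:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical userspace helper; field names are not authoritative. */
static int forward_vfio_irq(int kvm_vfio_dev_fd, int vfio_device_fd,
                            unsigned int irq_index, unsigned int guest_irq)
{
        struct kvm_arch_forwarded_irq fwd = {
                .fd = vfio_device_fd,   /* VFIO device file descriptor */
                .index = irq_index,     /* VFIO device IRQ index */
                .subindex = 0,
                .gsi = guest_irq,       /* IRQ number seen by the guest */
        };
        struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_DEVICE,
                .attr = KVM_DEV_VFIO_DEVICE_FORWARD_IRQ,
                .addr = (__u64)(unsigned long)&fwd,
        };

        /*
         * Per the documentation above, this must happen before
         * VFIO_DEVICE_SET_IRQS enables signaling, otherwise -EBUSY.
         */
        return ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
}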

Re: [PATCH v3 2/8] KVM: arm64: Enable the KVM-VFIO device

2014-12-01 Thread Eric Auger
On 11/30/2014 01:14 PM, Christoffer Dall wrote:
> On Sun, Nov 23, 2014 at 07:35:54PM +0100, Eric Auger wrote:
>> Used by KVM-enabled VFIO-based device passthrough support in QEMU.
>>
>> Signed-off-by: Joel Schopp 
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> Extracted from [RFC PATCH] arm64: KVM: add irqfd support
>> http://www.spinics.net/lists/kvm-arm/msg10798.html
>> ---
>>  arch/arm64/kvm/Kconfig  | 1 +
>>  arch/arm64/kvm/Makefile | 2 +-
>>  2 files changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
>> index 09c25c2..2edf926 100644
>> --- a/arch/arm64/kvm/Kconfig
>> +++ b/arch/arm64/kvm/Kconfig
>> @@ -26,6 +26,7 @@ config KVM
>>  select KVM_ARM_HOST
>>  select KVM_ARM_VGIC
>>  select KVM_ARM_TIMER
>> +select KVM_VFIO
>>  select HAVE_KVM_EVENTFD
>>  ---help---
>>Support hosting virtualized guest machines.
>> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
>> index 2e6b827..81ed091 100644
>> --- a/arch/arm64/kvm/Makefile
>> +++ b/arch/arm64/kvm/Makefile
>> @@ -11,7 +11,7 @@ ARM=../../../arch/arm/kvm
>>  
>>  obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
>>  
>> -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o 
>> $(KVM)/eventfd.o
>> +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o 
>> $(KVM)/eventfd.o $(KVM)/vfio.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
>>  kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
>>  
>> -- 
>> 1.9.1
>>
> 
> Should these patches not be squashed into one?
Yes, I can do that.
> 
> Also, what do they enable at this point?  Should they be queued by the
> end of the series instead?
Well, to me this patch should be moved even outside of this series. The
KVM-VFIO device is loaded when the QEMU VFIO device is instantiated.
This is used to record the VFIO groups in use. In the VFIO platform case, if
the KVM-VFIO device does not exist, this is not fatal, but we get a
warning in QEMU.

The KVM-VFIO device, however, is mandatory to enable the forwarded IRQ feature.

Best Regards

Eric
> 
> -Christoffer
> 



Re: [GIT PULL 09/11] KVM: s390: add bitmap for handling cpu-local interrupts

2014-12-01 Thread Jens Freimann
Excerpts from Christian Borntraeger's message of 2014-11-29 22:05:10 +0100:
> Am 28.11.2014 um 18:18 schrieb Paolo Bonzini:
> > 
> > 
> > On 28/11/2014 14:25, Christian Borntraeger wrote:
> >>  
> >> +struct kvm_s390_irq_payload {
> >> +struct kvm_s390_io_info io;
> >> +struct kvm_s390_ext_info ext;
> >> +struct kvm_s390_pgm_info pgm;
> >> +struct kvm_s390_emerg_info emerg;
> >> +struct kvm_s390_extcall_info extcall;
> >> +struct kvm_s390_prefix_info prefix;
> >> +struct kvm_s390_mchk_info mchk;
> >> +};
> >> +
> > 
> > struct or union?
> 
> struct. This is used for keeping the payload of the interrupts. Multiple 
> different interrupts can be pending and most of them have payload - we want 
> to keep everything.
> 
> Now, looking at that code again, as I/O is floating and emergency is also 
> handled via a separate bitmap we could get rid of these two in a follow-up 
> patch. Jens, can you have a look and prepare a followup-cleanup if 
> appropriate?

I/O is floating and currently not used in that struct, but I have a patch
series in the works to change floating interrupts as well, and I will need it then.

I will check if I can get rid of the emerg_info and send a follow-up patch.

Jens

> Christian



Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
> On 27/11/14 18:36, Luis R. Rodriguez wrote:
> > On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
> >> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
> >>> From: "Luis R. Rodriguez" 
> >>>
> >>> Some folks had reported that some xen hypercalls take a long time
> >>> to complete when issued from the userspace private ioctl mechanism,
> >>> this can happen for instance with some hypercalls that have many
> >>> sub-operations, this can happen for instance on hypercalls that use
> [...]
> >>> --- a/drivers/xen/privcmd.c
> >>> +++ b/drivers/xen/privcmd.c
> >>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
> >>>  hypercall.arg[0], hypercall.arg[1],
> >>>  hypercall.arg[2], hypercall.arg[3],
> >>>  hypercall.arg[4]);
> >>> +#ifndef CONFIG_PREEMPT
> >>> + schedule();
> >>> +#endif
> 
> As Juergen points out, this does nothing.  You need to schedule while in
> the middle of the hypercall.
> 
> Remember that Xen's hypercall preemption only preempts the hypercall to
> run interrupts in the guest.

How is it ensured that, when the kernel preempts on this code path on a
CONFIG_PREEMPT=n kernel, only interrupts in the guest are run?

> >>>
> >>>   return ret;
> >>>   }
> >>>
> >>
> >> Sorry, I don't think this will solve anything. You're calling schedule()
> >> right after the long running hypercall just nanoseconds before returning
> >> to the user.
> > 
> > Yeah, well that is what [1] tried as well only it tried using
> > preempt_schedule_irq() on the hypercall callback...
> 
> No.  My patch added a schedule point in the middle of a hypercall on the
> return from an interrupt (e.g., the timer interrupt).

OK, that provides much better context, and given that, I do see the above hunk as
pointless. I was completely misrepresenting what the callback was for. Now,
just to address my issues with the use of preempt_schedule_irq(): if the above
is addressed, I think that should cover most of my concerns. If we can figure
out a way to not make it arch-specific, that would be neat, and if we
could avoid having to ifdef around things, even better.

  Luis


Re: [PATCH v7 36/46] vhost/net: suppress compiler warning

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 03:21:03PM +0100, Cornelia Huck wrote:
> On Mon, 1 Dec 2014 15:48:39 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Mon, Dec 01, 2014 at 01:37:40PM +0100, Cornelia Huck wrote:
> > > On Sun, 30 Nov 2014 17:12:13 +0200
> > > "Michael S. Tsirkin"  wrote:
> > > 
> > > > len is always initialized since function is called with size > 0.
> > > > 
> > > > Signed-off-by: Michael S. Tsirkin 
> > > > ---
> > > >  drivers/vhost/net.c | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > 
> > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > index 984242e..54ffbb0 100644
> > > > --- a/drivers/vhost/net.c
> > > > +++ b/drivers/vhost/net.c
> > > > @@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > > > int headcount = 0;
> > > > unsigned d;
> > > > int r, nlogs = 0;
> > > > -   u32 len;
> > > > +   u32 uninitialized_var(len);
> > > > 
> > > > while (datalen > 0 && headcount < quota) {
> > > > if (unlikely(seg >= UIO_MAXIOV)) {
> > > 
> > > Want to merge this with the patch introducing the variable and add a
> > > comment there?
> > 
> > Not really. Warnings in bisect are fine  I think.
> 
> I'm not sure what a separate patch buys us, though, as it should be
> trivial to merge.

Easier to document the reason if it's split out.


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread David Vrabel
On 01/12/14 15:05, Luis R. Rodriguez wrote:
> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
>> On 27/11/14 18:36, Luis R. Rodriguez wrote:
>>> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
 On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
> From: "Luis R. Rodriguez" 
>
> Some folks had reported that some xen hypercalls take a long time
> to complete when issued from the userspace private ioctl mechanism,
> this can happen for instance with some hypercalls that have many
> sub-operations, this can happen for instance on hypercalls that use
>> [...]
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
>  hypercall.arg[0], hypercall.arg[1],
>  hypercall.arg[2], hypercall.arg[3],
>  hypercall.arg[4]);
> +#ifndef CONFIG_PREEMPT
> + schedule();
> +#endif
>>
>> As Juergen points out, this does nothing.  You need to schedule while in
>> the middle of the hypercall.
>>
>> Remember that Xen's hypercall preemption only preempts the hypercall to
>> run interrupts in the guest.
> 
> How is it ensured that when the kernel preempts on this code path on
> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?

Sorry, I really didn't describe this very well.

If a hypercall needs a continuation, Xen returns to the guest with the
IP set to the hypercall instruction, and on the way back to the guest
Xen may schedule a different VCPU or it will do any upcalls (as per normal).

The guest is free to return from the upcall to the original task
(continuing the hypercall) or to a different one.

David


Re: [PATCH v7 36/46] vhost/net: suppress compiler warning

2014-12-01 Thread Cornelia Huck
On Mon, 1 Dec 2014 17:12:08 +0200
"Michael S. Tsirkin"  wrote:

> On Mon, Dec 01, 2014 at 03:21:03PM +0100, Cornelia Huck wrote:
> > On Mon, 1 Dec 2014 15:48:39 +0200
> > "Michael S. Tsirkin"  wrote:
> > 
> > > On Mon, Dec 01, 2014 at 01:37:40PM +0100, Cornelia Huck wrote:
> > > > On Sun, 30 Nov 2014 17:12:13 +0200
> > > > "Michael S. Tsirkin"  wrote:
> > > > 
> > > > > len is always initialized since function is called with size > 0.
> > > > > 
> > > > > Signed-off-by: Michael S. Tsirkin 
> > > > > ---
> > > > >  drivers/vhost/net.c | 2 +-
> > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > > index 984242e..54ffbb0 100644
> > > > > --- a/drivers/vhost/net.c
> > > > > +++ b/drivers/vhost/net.c
> > > > > @@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
> > > > >   int headcount = 0;
> > > > >   unsigned d;
> > > > >   int r, nlogs = 0;
> > > > > - u32 len;
> > > > > + u32 uninitialized_var(len);
> > > > > 
> > > > >   while (datalen > 0 && headcount < quota) {
> > > > >   if (unlikely(seg >= UIO_MAXIOV)) {
> > > > 
> > > > Want to merge this with the patch introducing the variable and add a
> > > > comment there?
> > > 
> > > Not really. Warnings in bisect are fine  I think.
> > 
> > I'm not sure what a separate patch buys us, though, as it should be
> > trivial to merge.
> 
> Easier to document the reason if it's split out.
> 

That's why I suggested a comment ;)



Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  wrote:
> On 01/12/14 15:05, Luis R. Rodriguez wrote:
>> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
>>> On 27/11/14 18:36, Luis R. Rodriguez wrote:
 On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>> From: "Luis R. Rodriguez" 
>>
>> Some folks had reported that some xen hypercalls take a long time
>> to complete when issued from the userspace private ioctl mechanism,
>> this can happen for instance with some hypercalls that have many
>> sub-operations, this can happen for instance on hypercalls that use
>>> [...]
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
>>  hypercall.arg[0], hypercall.arg[1],
>>  hypercall.arg[2], hypercall.arg[3],
>>  hypercall.arg[4]);
>> +#ifndef CONFIG_PREEMPT
>> + schedule();
>> +#endif
>>>
>>> As Juergen points out, this does nothing.  You need to schedule while in
>>> the middle of the hypercall.
>>>
>>> Remember that Xen's hypercall preemption only preempts the hypercall to
>>> run interrupts in the guest.
>>
>> How is it ensured that when the kernel preempts on this code path on
>> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?
>
> Sorry, I really didn't describe this very well.
>
> If a hypercall needs a continuation, Xen returns to the guest with the
> IP set to the hypercall instruction, and on the way back to the guest
> Xen may schedule a different VCPU or it will do any upcalls (as per normal).
>
> The guest is free to return from the upcall to the original task
> (continuing the hypercall) or to a different one.

OK, so that addresses what Xen will do when using continuations and
hypercall preemption. My concern here was that using
preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
hypercall, on the return from an interrupt (e.g., the timer interrupt),
would still let the kernel preempt to tasks other than those related
to Xen.

My gripe was that if this were being done, it would be a bit abusive of the
API even if it's safe.

  Luis


Re: [PATCH v7 31/46] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Michael S. Tsirkin
On Mon, Dec 01, 2014 at 01:33:53PM +0100, Cornelia Huck wrote:
> On Sun, 30 Nov 2014 17:11:49 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> >  drivers/vhost/vhost.c | 93 
> > +++
> >  1 file changed, 56 insertions(+), 37 deletions(-)
> > 
> 
> > @@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
> >  {
> > struct vring_desc desc;
> > unsigned int i = 0, count, found = 0;
> > +   u32 len = vhost32_to_cpu(vq, indirect->len);
> > int ret;
> > 
> > /* Sanity check */
> > -   if (unlikely(indirect->len % sizeof desc)) {
> > +   if (unlikely(len % sizeof desc)) {
> > vq_err(vq, "Invalid length in indirect descriptor: "
> >"len 0x%llx not multiple of 0x%zx\n",
> > -  (unsigned long long)indirect->len,
> > +  (unsigned long long)vhost32_to_cpu(vq, indirect->len),
> 
> Can't you use len here?
> 
> >sizeof desc);
> > return -EINVAL;
> > }
> > 
> > -   ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
> > +   ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
> > vq->indirect,
> >  UIO_MAXIOV);
> > if (unlikely(ret < 0)) {
> > vq_err(vq, "Translation failure %d in indirect.\n", ret);
> 
> 
> > @@ -1404,7 +1422,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, 
> > struct vring_used_elem *heads,
> > 
> > /* Make sure buffer is written before we update index. */
> > smp_wmb();
> > -   if (put_user(vq->last_used_idx, &vq->used->idx)) {
> > +   if (__put_user(cpu_to_vhost16(vq, vq->last_used_idx), &vq->used->idx)) {
> 
> Why s/put_user/__put_user/ - I don't see how endianness conversions
> should have an influence there?
> 
> > vq_err(vq, "Failed to increment used idx");
> > return -EFAULT;
> > }
> 
> > @@ -1449,11 +1468,11 @@ static bool vhost_notify(struct vhost_dev *dev, 
> > struct vhost_virtqueue *vq)
> > if (unlikely(!v))
> > return true;
> > 
> > -   if (get_user(event, vhost_used_event(vq))) {
> > +   if (__get_user(event, vhost_used_event(vq))) {
> 
> Dito: why the change?

I remember now.
The put_user/get_user macros don't work well
with __virtio tags.

As the __ variants are a good idea anyway, I just switched to them
everywhere.



> > vq_err(vq, "Failed to get used event idx");
> > return true;
> > }
> > -   return vring_need_event(event, new, old);
> > +   return vring_need_event(vhost16_to_cpu(vq, event), new, old);
> >  }
> > 
> >  /* This actually signals the guest, using eventfd. */
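
For readers following the endian-ness discussion: the vhost16/32/64_to_cpu()
and cpu_to_vhost*() helpers used throughout this series boil down to "byte-swap
only when the ring is little-endian and the host is not", keyed off whether the
guest acked VIRTIO_F_VERSION_1. Roughly (a sketch of the idea, not the exact
header):

/* Legacy virtio rings are native-endian; virtio 1.0 rings are always LE. */
static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
{
        return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
}

static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
{
        return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), val);
}

The 32- and 64-bit variants follow the same pattern, which is why the
__put_user()/__get_user() questions above are orthogonal to the conversions.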


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 03:42:33PM +0100, Juergen Gross wrote:
> On 12/01/2014 02:32 PM, Luis R. Rodriguez wrote:
>> On Mon, Dec 01, 2014 at 11:01:18AM +, David Vrabel wrote:
>>> On 28/11/14 04:49, Juergen Gross wrote:
 On 11/27/2014 07:50 PM, Andrew Cooper wrote:
>
> XenServer uses
>
> https://github.com/xenserver/linux-3.x.pg/blob/master/master/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch
>
>
> to deal with these issues.  That patch is based on 3.10.

 Clever. :-)

>
> I can't remember whether this has been submitted upstream before (and
> there were outstanding issues), or whether it fell at an inconvenient
> time with our development cycles.

 I found

 http://lists.xen.org/archives/html/xen-devel/2014-02/msg02540.html

 and nothing else.
>>>
>>> I dropped it because it copy-and-paste a bunch of otherwise generic x86
>>> assembler and looked unlikely to get an x86 maintainer ack.  If you
>>> think otherwise, feel free to pick it up and run with it.
>>
>> I was trying to run with it, but my biggest gripe with this was
>> the use of preempt_schedule_irq(), but we can review that on the
>> other thread.

So much for the other thread :)

> I think this can be handled completely inside xen_evtchn_do_upcall().
>
> xen_preemptible_hcall_begin() (possibly with another name) could take
> the pointer of a function as parameter which is used as continuation
> point after an asynchronous interrupt in the critical section.

Oh so we'd only preempt to one specific task?

> Parameter for that function would be the exception frame of the
> original interrupt to be able to continue at the interrupted position
> after e.g. calling schedule().

Interesting... if reasonable, I wonder if this should be generalized.
Are there other CONFIG_PREEMPT=n use cases for such exemptions?

  Luis


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread David Vrabel
On 01/12/14 15:44, Luis R. Rodriguez wrote:
> On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  wrote:
>> On 01/12/14 15:05, Luis R. Rodriguez wrote:
>>> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
 On 27/11/14 18:36, Luis R. Rodriguez wrote:
> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
>> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>>> From: "Luis R. Rodriguez" 
>>>
>>> Some folks had reported that some xen hypercalls take a long time
>>> to complete when issued from the userspace private ioctl mechanism,
>>> this can happen for instance with some hypercalls that have many
>>> sub-operations, this can happen for instance on hypercalls that use
 [...]
>>> --- a/drivers/xen/privcmd.c
>>> +++ b/drivers/xen/privcmd.c
>>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user 
>>> *udata)
>>>  hypercall.arg[0], hypercall.arg[1],
>>>  hypercall.arg[2], hypercall.arg[3],
>>>  hypercall.arg[4]);
>>> +#ifndef CONFIG_PREEMPT
>>> + schedule();
>>> +#endif

 As Juergen points out, this does nothing.  You need to schedule while in
 the middle of the hypercall.

 Remember that Xen's hypercall preemption only preempts the hypercall to
 run interrupts in the guest.
>>>
>>> How is it ensured that when the kernel preempts on this code path on
>>> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?
>>
>> Sorry, I really didn't describe this very well.
>>
>> If a hypercall needs a continuation, Xen returns to the guest with the
>> IP set to the hypercall instruction, and on the way back to the guest
>> Xen may schedule a different VCPU or it will do any upcalls (as per normal).
>>
>> The guest is free to return from the upcall to the original task
>> (continuing the hypercall) or to a different one.
> 
> OK so that addresses what Xen will do when using continuation and
> hypercall preemption, my concern here was that using
> preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
> hypercall on the return from an interrupt (e.g., the timer interrupt)
> would still let the kernel preempt to tasks other than those related
> to Xen.

Um.  Why would that be a problem?  We do want to switch to any task the
Linux scheduler thinks is best.

David


[PATCH v8 36/50] vhost/net: enable virtio 1.0

2014-12-01 Thread Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8ff4a6d..a935c25 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
-(1ULL << VIRTIO_NET_F_MRG_RXBUF),
+(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+(1ULL << VIRTIO_F_VERSION_1),
 };
 
 enum {
-- 
MST



[PATCH v8 32/50] vhost: switch to __get/__put_user exclusively

2014-12-01 Thread Michael S. Tsirkin
Most places in vhost can use __get/__put_user rather than
get/put_user since addresses are pre-validated.
This should be good for performance, but it also
helps make the code sparse-clean: the get/put_user macros
don't play well with __virtioXX bitwise tags.
Switch get/put_user to the __ variants everywhere in vhost.
There's one exception - for consistency, switch that
as well, and add an explicit access_ok check.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c90f437..6a40837 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1038,6 +1038,7 @@ static int vhost_update_avail_event(struct 
vhost_virtqueue *vq, u16 avail_event)
 
 int vhost_init_used(struct vhost_virtqueue *vq)
 {
+   u16 last_used_idx;
int r;
if (!vq->private_data)
return 0;
@@ -1046,7 +1047,13 @@ int vhost_init_used(struct vhost_virtqueue *vq)
if (r)
return r;
vq->signalled_used_valid = false;
-   return get_user(vq->last_used_idx, &vq->used->idx);
+   if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
+   return -EFAULT;
+   r = __get_user(last_used_idx, &vq->used->idx);
+   if (r)
+   return r;
+   vq->last_used_idx = last_used_idx;
+   return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
@@ -1404,7 +1411,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct 
vring_used_elem *heads,
 
/* Make sure buffer is written before we update index. */
smp_wmb();
-   if (put_user(vq->last_used_idx, &vq->used->idx)) {
+   if (__put_user(vq->last_used_idx, &vq->used->idx)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
@@ -1449,7 +1456,7 @@ static bool vhost_notify(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
if (unlikely(!v))
return true;
 
-   if (get_user(event, vhost_used_event(vq))) {
+   if (__get_user(event, vhost_used_event(vq))) {
vq_err(vq, "Failed to get used event idx");
return true;
}
-- 
MST
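
The performance argument in the commit message rests on __get_user()/__put_user()
skipping the access_ok() range check that the non-underscore variants perform on
every call, so the check has to be done once up front instead. Schematically
(the real implementations are arch-specific):

/* Schematic only. */
#define get_user_sketch(x, ptr)                                 \
        (access_ok(VERIFY_READ, (ptr), sizeof(*(ptr))) ?        \
                __get_user((x), (ptr)) : -EFAULT)

/*
 * __get_user() does the same fetch without the range check, so the caller
 * must have validated the pointer already - which vhost does when the ring
 * addresses are configured, plus the explicit access_ok() added in
 * vhost_init_used() above.
 */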



[PATCH v8 45/50] vhost/scsi: partial virtio 1.0 support

2014-12-01 Thread Michael S. Tsirkin
Include all endian conversions as required by virtio 1.0.
Don't set virtio 1.0 yet, since that requires ANY_LAYOUT
which we don't yet support.

Signed-off-by: Michael S. Tsirkin 
Acked-by: Paolo Bonzini 
---
 drivers/vhost/scsi.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a17f118..01c01cb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -168,6 +168,7 @@ enum {
VHOST_SCSI_VQ_IO = 2,
 };
 
+/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */
 enum {
VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
   (1ULL << VIRTIO_SCSI_F_T10_PI)
@@ -577,8 +578,8 @@ tcm_vhost_allocate_evt(struct vhost_scsi *vs,
return NULL;
}
 
-   evt->event.event = event;
-   evt->event.reason = reason;
+   evt->event.event = cpu_to_vhost32(vq, event);
+   evt->event.reason = cpu_to_vhost32(vq, reason);
vs->vs_events_nr++;
 
return evt;
@@ -636,7 +637,7 @@ again:
}
 
if (vs->vs_events_missed) {
-   event->event |= VIRTIO_SCSI_T_EVENTS_MISSED;
+   event->event |= cpu_to_vhost32(vq, VIRTIO_SCSI_T_EVENTS_MISSED);
vs->vs_events_missed = false;
}
 
@@ -695,12 +696,13 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
cmd, se_cmd->residual_count, se_cmd->scsi_status);
 
memset(&v_rsp, 0, sizeof(v_rsp));
-   v_rsp.resid = se_cmd->residual_count;
+   v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, 
se_cmd->residual_count);
/* TODO is status_qualifier field needed? */
v_rsp.status = se_cmd->scsi_status;
-   v_rsp.sense_len = se_cmd->scsi_sense_length;
+   v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+se_cmd->scsi_sense_length);
memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-  v_rsp.sense_len);
+  se_cmd->scsi_sense_length);
ret = copy_to_user(cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
if (likely(ret == 0)) {
struct vhost_scsi_virtqueue *q;
@@ -1095,14 +1097,14 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesout;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesout);
} else if (v_req_pi.pi_bytesin) {
if (data_direction != DMA_FROM_DEVICE) {
vq_err(vq, "Received non zero 
di_pi_niov"
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesin;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesin);
}
if (prot_bytes) {
int tmp = 0;
@@ -1117,12 +1119,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
data_first += prot_niov;
data_niov = data_num - prot_niov;
}
-   tag = v_req_pi.tag;
+   tag = vhost64_to_cpu(vq, v_req_pi.tag);
task_attr = v_req_pi.task_attr;
cdb = &v_req_pi.cdb[0];
lun = ((v_req_pi.lun[2] << 8) | v_req_pi.lun[3]) & 
0x3FFF;
} else {
-   tag = v_req.tag;
+   tag = vhost64_to_cpu(vq, v_req.tag);
task_attr = v_req.task_attr;
cdb = &v_req.cdb[0];
lun = ((v_req.lun[2] << 8) | v_req.lun[3]) & 0x3FFF;
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 29/50] vhost: make features 64 bit

2014-12-01 Thread Michael S. Tsirkin
We need to use bit 32 for virtio 1.0.
Make vhost_has_feature bool to avoid discarding high bits.

Cc: Sergei Shtylyov 
Cc: Ben Hutchings 
Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Jason Wang 
---
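Two details matter once feature bits go past 31 (VIRTIO_F_VERSION_1 is
bit 32): the shift must be 1ULL rather than a plain 1, and the return
type must not truncate the 64-bit result. A small stand-alone demo of
the truncation problem (illustrative userspace code, not the kernel
function):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int has_feature_int(uint64_t features, int bit)
{
	/* the 64-bit result is converted to int: bit 32 is silently lost */
	return features & (1ULL << bit);
}

static bool has_feature_bool(uint64_t features, int bit)
{
	/* bool conversion tests the whole 64-bit value for non-zero */
	return features & (1ULL << bit);
}

int main(void)
{
	uint64_t features = 1ULL << 32;		/* only a high bit set */

	printf("int: %d  bool: %d\n",
	       has_feature_int(features, 32),	/* prints 0 */
	       has_feature_bool(features, 32));	/* prints 1 */
	return 0;
}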
 drivers/vhost/vhost.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3eda654..55a95c9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
/* Protected by virtqueue mutex. */
struct vhost_memory *memory;
void *private_data;
-   unsigned acked_features;
+   u64 acked_features;
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
@@ -172,8 +172,8 @@ enum {
 (1ULL << VHOST_F_LOG_ALL),
 };
 
-static inline int vhost_has_feature(struct vhost_virtqueue *vq, int bit)
+static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 {
-   return vq->acked_features & (1 << bit);
+   return vq->acked_features & (1ULL << bit);
 }
 #endif
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 31/50] vhost/net: force len for TX to host endian

2014-12-01 Thread Michael S. Tsirkin
vhost/net keeps a copy of the used ring in host memory but (ab)uses
the length field for internal house-keeping. This works because the
length in the used ring for tx is always 0. In order to suppress sparse
warnings, we force native endianness here.
Note that these values are never exposed to guests.

Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Jason Wang 
---
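For readers unfamiliar with the sparse side of this: __virtio32 is a
__bitwise type, so storing a plain integer into it draws a sparse
warning, and the __force cast is the usual way of saying "this raw
store is intentional, no endian conversion wanted". A stand-alone
sketch that mimics the kernel's annotations (illustrative only, not
kernel code):

#ifdef __CHECKER__
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int __bitwise __virtio32;

void mark_done(__virtio32 *len)
{
	/* *len = 2;  <- sparse: "incorrect type in assignment" */
	*len = (__force __virtio32)2;	/* sparse-clean, stored as-is */
}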
 drivers/vhost/net.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f7..dce5c58 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -48,15 +48,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy 
TX;"
  * status internally; used for zerocopy tx only.
  */
 /* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN   3
+#define VHOST_DMA_FAILED_LEN   ((__force __virtio32)3)
 /* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN 2
+#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
 /* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS  1
+#define VHOST_DMA_IN_PROGRESS  ((__force __virtio32)1)
 /* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN0
+#define VHOST_DMA_CLEAR_LEN((__force __virtio32)0)
 
-#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
+#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force 
u32)VHOST_DMA_DONE_LEN)
 
 enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 30/50] vhost: add memory access wrappers

2014-12-01 Thread Michael S. Tsirkin
Add guest memory access wrappers to handle virtio endianness
conversions.

Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Jason Wang 
Reviewed-by: Cornelia Huck 
---
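Roughly how these wrappers get used later in the series (sketch only,
simplified, invented function name; assumes the __virtio16 ring field
types introduced elsewhere in this patchset):

static int read_used_idx(struct vhost_virtqueue *vq, u16 *idx)
{
	__virtio16 raw;

	if (__get_user(raw, &vq->used->idx))
		return -EFAULT;

	/* swaps bytes only when VIRTIO_F_VERSION_1 was negotiated on a
	 * big-endian host; legacy devices stay in native endian-ness */
	*idx = vhost16_to_cpu(vq, raw);
	return 0;
}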
 drivers/vhost/vhost.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 55a95c9..a0a6c98 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -176,4 +176,35 @@ static inline bool vhost_has_feature(struct 
vhost_virtqueue *vq, int bit)
 {
return vq->acked_features & (1ULL << bit);
 }
+
+/* Memory accessors */
+static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
+{
+   return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
+{
+   return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val)
+{
+   return __virtio32_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val)
+{
+   return __cpu_to_virtio32(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val)
+{
+   return __virtio64_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val)
+{
+   return __cpu_to_virtio64(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
 #endif
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 03:54:24PM +, David Vrabel wrote:
> On 01/12/14 15:44, Luis R. Rodriguez wrote:
> > On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  
> > wrote:
> >> On 01/12/14 15:05, Luis R. Rodriguez wrote:
> >>> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
>  On 27/11/14 18:36, Luis R. Rodriguez wrote:
> > On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
> >> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
> >>> From: "Luis R. Rodriguez" 
> >>>
> >>> Some folks had reported that some xen hypercalls take a long time
> >>> to complete when issued from the userspace private ioctl mechanism,
> >>> this can happen for instance with some hypercalls that have many
> >>> sub-operations, this can happen for instance on hypercalls that use
>  [...]
> >>> --- a/drivers/xen/privcmd.c
> >>> +++ b/drivers/xen/privcmd.c
> >>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user 
> >>> *udata)
> >>>  hypercall.arg[0], hypercall.arg[1],
> >>>  hypercall.arg[2], hypercall.arg[3],
> >>>  hypercall.arg[4]);
> >>> +#ifndef CONFIG_PREEMPT
> >>> + schedule();
> >>> +#endif
> 
>  As Juergen points out, this does nothing.  You need to schedule while in
>  the middle of the hypercall.
> 
>  Remember that Xen's hypercall preemption only preempts the hypercall to
>  run interrupts in the guest.
> >>>
> >>> How is it ensured that when the kernel preempts on this code path on
> >>> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?
> >>
> >> Sorry, I really didn't describe this very well.
> >>
> >> If a hypercall needs a continuation, Xen returns to the guest with the
> >> IP set to the hypercall instruction, and on the way back to the guest
> >> Xen may schedule a different VCPU or it will do any upcalls (as per 
> >> normal).
> >>
> >> The guest is free to return from the upcall to the original task
> >> (continuing the hypercall) or to a different one.
> > 
> > OK so that addresses what Xen will do when using continuation and
> > hypercall preemption, my concern here was that using
> > preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
> > hypercall on the return from an interrupt (e.g., the timer interrupt)
> > would still let the kernel preempt to tasks other than those related
> > to Xen.
> 
> Um.  Why would that be a problem?  We do want to switch to any task the
> Linux scheduler thinks is best.

It's safe but -- it technically is doing kernel preemption, unless we want
to adjust the definition of CONFIG_PREEMPT=n to exclude hypercalls. This
was my original concern with the use of preempt_schedule_irq() to do this.
I am afraid of setting precedents without being clear or wider review and
acceptance.

  Luis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 33/50] vhost: virtio 1.0 endian-ness support

2014-12-01 Thread Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.c | 86 +--
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6a40837..ed71b53 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -33,8 +33,8 @@ enum {
VHOST_MEMORY_F_LOG = 0x1,
 };
 
-#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
-#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
+#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
void __user *used;
-   if (__put_user(vq->used_flags, &vq->used->flags) < 0)
+   if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 
0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1019,7 +1019,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
-   if (__put_user(vq->avail_idx, vhost_avail_event(vq)))
+   if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), 
vhost_avail_event(vq)))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -1038,7 +1038,7 @@ static int vhost_update_avail_event(struct 
vhost_virtqueue *vq, u16 avail_event)
 
 int vhost_init_used(struct vhost_virtqueue *vq)
 {
-   u16 last_used_idx;
+   __virtio16 last_used_idx;
int r;
if (!vq->private_data)
return 0;
@@ -1052,7 +1052,7 @@ int vhost_init_used(struct vhost_virtqueue *vq)
r = __get_user(last_used_idx, &vq->used->idx);
if (r)
return r;
-   vq->last_used_idx = last_used_idx;
+   vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
@@ -1094,16 +1094,16 @@ static int translate_desc(struct vhost_virtqueue *vq, 
u64 addr, u32 len,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain,
  * or -1U if we're at the end. */
-static unsigned next_desc(struct vring_desc *desc)
+static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
 {
unsigned int next;
 
/* If this descriptor says it doesn't chain, we're done. */
-   if (!(desc->flags & VRING_DESC_F_NEXT))
+   if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
return -1U;
 
/* Check they're not leading us off end of descriptors. */
-   next = desc->next;
+   next = vhost16_to_cpu(vq, desc->next);
/* Make sure compiler knows to grab that: we don't want it changing! */
/* We will use the result as an index in an array, so most
 * architectures only need a compiler barrier here. */
@@ -1120,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i = 0, count, found = 0;
+   u32 len = vhost32_to_cpu(vq, indirect->len);
int ret;
 
/* Sanity check */
-   if (unlikely(indirect->len % sizeof desc)) {
+   if (unlikely(len % sizeof desc)) {
vq_err(vq, "Invalid length in indirect descriptor: "
   "len 0x%llx not multiple of 0x%zx\n",
-  (unsigned long long)indirect->len,
+  (unsigned long long)len,
   sizeof desc);
return -EINVAL;
}
 
-   ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
+   ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
vq->indirect,
 UIO_MAXIOV);
if (unlikely(ret < 0)) {
vq_err(vq, "Translation failure %d in indirect.\n", ret);
@@ -1142,7 +1143,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
 * architectures only need a compiler barrier here. */
read_barrier_depends();
 
-   count = indirect->len / sizeof desc;
+   count = len / sizeof desc;
/* Buffers are chained via a 16 bit next field, so
 * we can have at most 2^16 of these. */
if (unlikely(count > USHRT_MAX + 1)) {
@@ -1162,16 +1163,17 @@ static int get_indirect(struct vhost_virtqueue *vq,
if (unlikely(memcpy_fromiovec((unsigned char *)&desc,
  vq->indirect, sizeof desc))) {
vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
-  i, (siz

Re: [PATCH 0/4] KVM: x86: APIC fixes

2014-12-01 Thread Paolo Bonzini


On 27/11/2014 20:03, Radim Krčmář wrote:
> The interesting one is [3/4], which improves upon a previous CVE fix;
> we also handle logical destination wrapping in it, so [2/4] does the
> same for physical;  and to make it nicer, [1/4] removes a condition.
> [4/4] makes our fast path return true when the message was handled.
> 
> Radim Krčmář (4):
>   KVM: x86: deliver phys lowest-prio
>   KVM: x86: fix APIC physical destination wrapping
>   KVM: x86: allow 256 logical x2APICs again
>   KVM: x86: don't retry hopeless APIC delivery
> 
>  arch/x86/kvm/lapic.c | 20 +++-
>  arch/x86/kvm/lapic.h |  2 --
>  2 files changed, 11 insertions(+), 11 deletions(-)
> 

So the order should be 1/2/5/3/4, right?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: x86: vmx: add checks on guest RIP

2014-12-01 Thread Paolo Bonzini


On 29/11/2014 16:27, Eugene Korenevsky wrote:
> Signed-off-by: Eugene Korenevsky 
> ---
> 
> Notes:
> This patch adds checks on Guest RIP specified in Intel Software Developer 
> Manual.
> 
> The following checks are performed on processors that support Intel 64 
> architecture:
> - Bits 63:32 must be 0 if the "IA-32e mode guest" VM-entry control is 0 
> or if
> the L bit (bit 13) in the access-rights field for CS is 0.
> - If the processor supports N < 64 linear-address bits, bits 63:N must be 
> identical
> if the "IA-32e mode guest" VM-entry control is 1 and the L bit in the 
> access-rights
> field for CS is 1. (No check applies if the processor supports 64 
> linear-address
> bits.)
> 
>  arch/x86/kvm/vmx.c | 27 ++-
>  1 file changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a951d8..e2da83b 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3828,6 +3828,28 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
>(ss.selector & SELECTOR_RPL_MASK));
>  }
>  
> +#ifdef CONFIG_X86_64
> +static bool rip_valid(struct kvm_vcpu *vcpu)
> +{
> + unsigned long rip;
> + struct kvm_segment cs;
> + bool longmode;
> +
> + /* RIP must be canonical in long mode
> +  * Bits 63:32 of RIP must be zero in other processor modes */
> + longmode = false;
> + if (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE) {
> + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> + longmode = (cs.l != 0);
> + }
> + rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
> + if (longmode)
> + return !is_noncanonical_address(rip);

This check is off by one.  It is checking bits 63:47 instead of bits
63:48 (this quirk is intentionally part of the specification, so that
you can reenter a guest at 0x8000 after e.g. a VMCALL vmexit and
cause a general protection fault).
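To make the off-by-one concrete, a hypothetical example (value invented),
assuming 48 implemented linear-address bits:

    RIP = 0x0000800000000000

    bits 63:48 are all zero             -> the SDM VM-entry check passes
    bit 47 is 1, so bits 63:47 are not
    all identical                       -> non-canonical, so the patch's
                                           is_noncanonical_address() check
                                           rejects it

That RIP can legally be loaded by VM entry even though it is
non-canonical; the #GP then comes from the instruction fetch, not from
the entry checks.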

However, I am not sure how this can occur.  A #GP should have been
injected as part of the instruction that caused RIP to become invalid.
Perhaps you should check in nested_vmx_run instead?

Paolo

> + else
> + return (rip >> 32) == 0;
> +}
> +#endif
> +
>  /*
>   * Check if guest state is valid. Returns true if valid, false if
>   * not.
> @@ -3873,8 +3895,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
>   if (!ldtr_valid(vcpu))
>   return false;
>   }
> +#ifdef CONFIG_X86_64
> + if (!rip_valid(vcpu))
> + return false;
> +#endif
>   /* TODO:
> -  * - Add checks on RIP
>* - Add checks on RFLAGS
>*/
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs

2014-12-01 Thread Paolo Bonzini


On 27/11/2014 23:26, Radim Krčmář wrote:
> We can't deliver xAPIC and x2APIC broadcasts/logical messages at the
> same time with current KVM and this patch just switches the working case
> in favour of x2APIC, which is why I didn't think it was necessary ...
> (And I didn't understand why prefer disabled x2APIC to enabled xAPIC.)

Indeed.  A possible thing to do perhaps would be to build two logical
maps, one for x2APIC and one for xAPIC, and consult both...  That would
slow down the common case when one map is empty, though.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 35/50] vhost/net: larger header for virtio 1.0

2014-12-01 Thread Michael S. Tsirkin
Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Jason Wang 
Reviewed-by: Cornelia Huck 
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c218188..8ff4a6d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1030,7 +1030,8 @@ static int vhost_net_set_features(struct vhost_net *n, 
u64 features)
size_t vhost_hlen, sock_hlen, hdr_len;
int i;
 
-   hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+   hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+  (1ULL << VIRTIO_F_VERSION_1))) ?
sizeof(struct virtio_net_hdr_mrg_rxbuf) :
sizeof(struct virtio_net_hdr);
if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 34/50] vhost/net: virtio 1.0 byte swap

2014-12-01 Thread Michael S. Tsirkin
I had to add an explicit tag to suppress a compiler warning:
gcc isn't smart enough to notice that
len is always initialized, since the function is only called with size > 0.

Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Cornelia Huck 
---
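For reference, the shape of code that trips this warning, as a
stand-alone illustration (not the vhost code; whether gcc actually warns
depends on version and optimization level):

#include <stdio.h>

static int consume(int datalen)
{
	int len;	/* gcc may report "len may be used uninitialized" */

	while (datalen > 0) {
		len = datalen > 4 ? 4 : datalen;
		datalen -= len;
	}
	return len;	/* fine in practice: only called with datalen > 0 */
}

int main(void)
{
	printf("%d\n", consume(10));	/* prints 2 */
	return 0;
}

The uninitialized_var() annotation used in the patch simply expanded to
"len = len" in kernels of that era, which silences the warning without
initializing the variable to a real value.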
 drivers/vhost/net.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index dce5c58..c218188 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -416,7 +416,7 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id = head;
+   vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
@@ -500,6 +500,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
int headcount = 0;
unsigned d;
int r, nlogs = 0;
+   /* len is always initialized before use since we are always called with
+* datalen > 0.
+*/
+   u32 uninitialized_var(len);
 
while (datalen > 0 && headcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
@@ -527,13 +531,14 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
-   heads[headcount].id = d;
-   heads[headcount].len = iov_length(vq->iov + seg, in);
-   datalen -= heads[headcount].len;
+   heads[headcount].id = cpu_to_vhost32(vq, d);
+   len = iov_length(vq->iov + seg, in);
+   heads[headcount].len = cpu_to_vhost32(vq, len);
+   datalen -= len;
++headcount;
seg += in;
}
-   heads[headcount - 1].len += datalen;
+   heads[headcount - 1].len = cpu_to_vhost32(vq, len - datalen);
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Juergen Gross

On 12/01/2014 05:19 PM, Luis R. Rodriguez wrote:

On Mon, Dec 01, 2014 at 03:54:24PM +, David Vrabel wrote:

On 01/12/14 15:44, Luis R. Rodriguez wrote:

On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  wrote:

On 01/12/14 15:05, Luis R. Rodriguez wrote:

On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:

On 27/11/14 18:36, Luis R. Rodriguez wrote:

On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:

On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:

From: "Luis R. Rodriguez" 

Some folks had reported that some xen hypercalls take a long time
to complete when issued from the userspace private ioctl mechanism,
this can happen for instance with some hypercalls that have many
sub-operations, this can happen for instance on hypercalls that use

[...]

--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
  hypercall.arg[0], hypercall.arg[1],
  hypercall.arg[2], hypercall.arg[3],
  hypercall.arg[4]);
+#ifndef CONFIG_PREEMPT
+ schedule();
+#endif


As Juergen points out, this does nothing.  You need to schedule while in
the middle of the hypercall.

Remember that Xen's hypercall preemption only preempts the hypercall to
run interrupts in the guest.


How is it ensured that when the kernel preempts on this code path on
CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?


Sorry, I really didn't describe this very well.

If a hypercall needs a continuation, Xen returns to the guest with the
IP set to the hypercall instruction, and on the way back to the guest
Xen may schedule a different VCPU or it will do any upcalls (as per normal).

The guest is free to return from the upcall to the original task
(continuing the hypercall) or to a different one.


OK so that addresses what Xen will do when using continuation and
hypercall preemption, my concern here was that using
preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
hypercall on the return from an interrupt (e.g., the timer interrupt)
would still let the kernel preempt to tasks other than those related
to Xen.


Um.  Why would that be a problem?  We do want to switch to any task the
Linux scheduler thinks is best.


It's safe but -- it technically is doing kernel preemption, unless we want
to adjust the definition of CONFIG_PREEMPT=n to exclude hypercalls. This
was my original concern with the use of preempt_schedule_irq() to do this.
I am afraid of setting precedents without being clear or wider review and
acceptance.


I wonder whether it would be more acceptable to add (or completely
switch to) another preemption model: PREEMPT_SWITCHABLE. This would be
similar to CONFIG_PREEMPT, but the "normal" value of __preempt_count
would be settable via kernel parameter (default 2):

0: preempt
1: preempt_voluntary
2: preempt_none

The kernel would run with preemption enabled. cond_sched() would
reschedule if __preempt_count <= 1. And in case of long running kernel
activities (like the hypercall case or other stuff requiring schedule()
calls to avoid hangups) we would just set __preempt_count to 0 during
these periods and restore the old value afterwards.
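A very rough sketch of that idea, to make the proposal concrete
(hypothetical names, and heavily simplified: the real __preempt_count is
a per-task/per-CPU value, not a single global):

static int preempt_default = 2;	/* 0: preempt, 1: voluntary, 2: none */

static inline void sketch_cond_resched(void)
{
	if (preempt_default <= 1)
		schedule();
}

static long sketch_privcmd_hypercall(void)
{
	int old = preempt_default;
	long ret;

	preempt_default = 0;	/* fully preemptible for this window */
	ret = do_long_running_hypercall();	/* hypothetical */
	preempt_default = old;
	return ret;
}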

This would be a rather intrusive but clean change IMO.

Any thoughts?


Juergen
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] kvm: optimize GFN to memslot lookup with large slots amount

2014-12-01 Thread Igor Mammedov
The current linear search doesn't scale well when a
large number of memslots is used and the looked-up slot
is not at the beginning of the memslots array.
Taking into account that memslots don't overlap, it's
possible to switch the sorting order of the memslots array from
'npages' to 'base_gfn' and use binary search for
memslot lookup by GFN.

As a result of switching to binary search, lookup times
are reduced with a large number of memslots.

Following is a table of search_memslot() cycles
during WS2008R2 guest boot.

                           boot,        boot + ~10 min
                           mostly same  of using it,
                           slot lookup  randomized lookup
                   max     average      average
                   cycles  cycles       cycles

13 slots         : 1450    28           30

13 slots         : 1400    30           40
binary search

117 slots        : 13000   30           460

117 slots        : 2000    35           180
binary search

Signed-off-by: Igor Mammedov 
---
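A small stand-alone model of the lookup scheme, for anyone who wants to
convince themselves of the search (illustrative only, simplified from
the patch; the slots are kept sorted by base_gfn in descending order, as
the previous patch now arranges):

#include <stdio.h>

struct slot { unsigned long base_gfn, npages; };

/* sorted by base_gfn, descending */
static struct slot slots[] = {
	{ 0x100000, 0x800 },
	{ 0x001000, 0x200 },
	{ 0x000000, 0x100 },
};

#define NR_SLOTS (int)(sizeof(slots) / sizeof(slots[0]))

static struct slot *lookup(unsigned long gfn)
{
	int start = 0, end = NR_SLOTS;

	/* find the leftmost slot whose base_gfn is <= gfn, i.e. the slot
	 * with the largest base_gfn not above gfn */
	while (start < end) {
		int mid = start + (end - start) / 2;

		if (gfn >= slots[mid].base_gfn)
			end = mid;
		else
			start = mid + 1;
	}

	if (start < NR_SLOTS &&
	    gfn >= slots[start].base_gfn &&
	    gfn < slots[start].base_gfn + slots[start].npages)
		return &slots[start];
	return NULL;	/* gfn falls into a hole between slots */
}

int main(void)
{
	printf("0x1010 -> slot %d\n", (int)(lookup(0x1010) - slots)); /* 1 */
	printf("0x0500 -> %s\n", lookup(0x500) ? "found" : "hole");   /* hole */
	return 0;
}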
 include/linux/kvm_host.h | 34 ++
 virt/kvm/kvm_main.c  |  8 +++-
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a37144..193bca6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -354,6 +354,7 @@ struct kvm_memslots {
/* The mapping table from slot id to the index in memslots[]. */
short id_to_index[KVM_MEM_SLOTS_NUM];
atomic_t lru_slot;
+   int used_slots;
 };
 
 struct kvm {
@@ -791,19 +792,28 @@ static inline void kvm_guest_exit(void)
 static inline struct kvm_memory_slot *
 search_memslots(struct kvm_memslots *slots, gfn_t gfn)
 {
+   int start = 0, end = slots->used_slots;
int slot = atomic_read(&slots->lru_slot);
-   struct kvm_memory_slot *memslot = &slots->memslots[slot];
-
-   if (gfn >= memslot->base_gfn &&
-   gfn < memslot->base_gfn + memslot->npages)
-   return memslot;
-
-   kvm_for_each_memslot(memslot, slots)
-   if (gfn >= memslot->base_gfn &&
- gfn < memslot->base_gfn + memslot->npages) {
-   atomic_set(&slots->lru_slot, memslot - slots->memslots);
-   return memslot;
-   }
+   struct kvm_memory_slot *memslots = slots->memslots;
+
+   if (gfn >= memslots[slot].base_gfn &&
+   gfn < memslots[slot].base_gfn + memslots[slot].npages)
+   return &memslots[slot];
+
+   while (start < end) {
+   slot = start + (end - start) / 2;
+
+   if (gfn >= memslots[slot].base_gfn)
+   end = slot;
+   else
+   start = slot + 1;
+   }
+
+   if (gfn >= memslots[start].base_gfn &&
+   gfn < memslots[start].base_gfn + memslots[start].npages) {
+   atomic_set(&slots->lru_slot, start);
+   return &memslots[start];
+   }
 
return NULL;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 162817f..759af659 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,8 +679,14 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
 
WARN_ON(mslots[i].id != id);
-   if (!new->npages)
+   if (!new->npages) {
new->base_gfn = 0;
+   if (mslots[i].npages)
+   slots->used_slots--;
+   } else {
+   if (!mslots[i].npages)
+   slots->used_slots++;
+   }
 
while (i < KVM_MEM_SLOTS_NUM - 1 &&
   new->base_gfn <= mslots[i + 1].base_gfn) {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] kvm: change memslot sorting rule from size to GFN

2014-12-01 Thread Igor Mammedov
It will allow using binary search for GFN -> memslot
lookups, reducing lookup cost with a large number of slots.

Signed-off-by: Igor Mammedov 
---
 virt/kvm/kvm_main.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 278ed65..162817f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -666,10 +666,10 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot 
*memslot)
 }
 
 /*
- * Insert memslot and re-sort memslots based on their size,
- * so the larger slots will get better fit. Sorting algorithm
- * takes advantage of having initially sorted array and
- * known changed memslot position.
+ * Insert memslot and re-sort memslots based on their GFN,
+ * so binary search could be used to lookup GFN.
+ * Sorting algorithm takes advantage of having initially
+ * sorted array and known changed memslot position.
  */
 static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *new)
@@ -679,14 +679,19 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
 
WARN_ON(mslots[i].id != id);
+   if (!new->npages)
+   new->base_gfn = 0;
+
while (i < KVM_MEM_SLOTS_NUM - 1 &&
-  new->npages < mslots[i + 1].npages) {
+  new->base_gfn <= mslots[i + 1].base_gfn) {
+   if (!mslots[i + 1].npages)
+   break;
mslots[i] = mslots[i + 1];
slots->id_to_index[mslots[i].id] = i;
i++;
}
while (i > 0 &&
-  new->npages > mslots[i - 1].npages) {
+  new->base_gfn > mslots[i - 1].base_gfn) {
mslots[i] = mslots[i - 1];
slots->id_to_index[mslots[i].id] = i;
i--;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] kvm: search_memslots: add simple LRU memslot caching

2014-12-01 Thread Igor Mammedov
In a typical guest boot workload only 2-3 memslots are used
extensively, and at that it's mostly the same memslot
being looked up over and over.

Adding an LRU cache improves the average lookup time from
46 to 28 cycles (~40%) for this workload.

Signed-off-by: Igor Mammedov 
---
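A stand-alone sketch of the caching idea (illustrative only; the real
patch keeps the cached index in an atomic_t because lookups run
concurrently, which the sketch ignores):

#include <stddef.h>

struct slot { unsigned long base_gfn, npages; };

struct slots {
	struct slot slot[8];
	int nr;
	int lru;	/* index of the most recently hit slot */
};

static struct slot *lookup(struct slots *s, unsigned long gfn)
{
	struct slot *m = &s->slot[s->lru];
	int i;

	/* fast path: same slot as last time (the common case during boot) */
	if (gfn >= m->base_gfn && gfn < m->base_gfn + m->npages)
		return m;

	for (i = 0; i < s->nr; i++) {
		m = &s->slot[i];
		if (gfn >= m->base_gfn && gfn < m->base_gfn + m->npages) {
			s->lru = i;	/* remember the hit for next time */
			return m;
		}
	}
	return NULL;
}

int main(void)
{
	struct slots s = {
		.slot = { { 0x0, 0x100 }, { 0x1000, 0x200 } },
		.nr = 2,
	};

	/* the first lookup walks the array and caches index 1, the
	 * second one is served from the cache */
	return (lookup(&s, 0x1010) == &s.slot[1] &&
		lookup(&s, 0x1020) == &s.slot[1]) ? 0 : 1;
}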
 include/linux/kvm_host.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 231dd94..1a37144 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -353,6 +353,7 @@ struct kvm_memslots {
struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
/* The mapping table from slot id to the index in memslots[]. */
short id_to_index[KVM_MEM_SLOTS_NUM];
+   atomic_t lru_slot;
 };
 
 struct kvm {
@@ -790,12 +791,19 @@ static inline void kvm_guest_exit(void)
 static inline struct kvm_memory_slot *
 search_memslots(struct kvm_memslots *slots, gfn_t gfn)
 {
-   struct kvm_memory_slot *memslot;
+   int slot = atomic_read(&slots->lru_slot);
+   struct kvm_memory_slot *memslot = &slots->memslots[slot];
+
+   if (gfn >= memslot->base_gfn &&
+   gfn < memslot->base_gfn + memslot->npages)
+   return memslot;
 
kvm_for_each_memslot(memslot, slots)
if (gfn >= memslot->base_gfn &&
- gfn < memslot->base_gfn + memslot->npages)
+ gfn < memslot->base_gfn + memslot->npages) {
+   atomic_set(&slots->lru_slot, memslot - slots->memslots);
return memslot;
+   }
 
return NULL;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] kvm: update_memslots: drop not needed check for the same number of pages

2014-12-01 Thread Igor Mammedov
If the number of pages hasn't changed, the sorting algorithm
will do nothing, so there is no need for an extra check
to avoid entering the sorting logic.

Signed-off-by: Igor Mammedov 
---
 virt/kvm/kvm_main.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b45330..407277b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,21 +679,19 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
 
WARN_ON(mslots[i].id != id);
-   if (new->npages != mslots[i].npages) {
-   if (new->npages < mslots[i].npages) {
-   while (i < KVM_MEM_SLOTS_NUM - 1 &&
-  new->npages < mslots[i + 1].npages) {
-   mslots[i] = mslots[i + 1];
-   slots->id_to_index[mslots[i].id] = i;
-   i++;
-   }
-   } else {
-   while (i > 0 &&
-  new->npages > mslots[i - 1].npages) {
-   mslots[i] = mslots[i - 1];
-   slots->id_to_index[mslots[i].id] = i;
-   i--;
-   }
+   if (new->npages < mslots[i].npages) {
+   while (i < KVM_MEM_SLOTS_NUM - 1 &&
+  new->npages < mslots[i + 1].npages) {
+   mslots[i] = mslots[i + 1];
+   slots->id_to_index[mslots[i].id] = i;
+   i++;
+   }
+   } else {
+   while (i > 0 &&
+  new->npages > mslots[i - 1].npages) {
+   mslots[i] = mslots[i - 1];
+   slots->id_to_index[mslots[i].id] = i;
+   i--;
}
}
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5] kvm: memslots lookup optimization

2014-12-01 Thread Igor Mammedov
This series speeds up GFN to memslot lookup time by:
 * introducing an LRU cache, which improves lookup time for
   a same-slot workload (typically boot time of Windows and Linux guests)
 * switching to binary search for GFN to memslot lookup,
   improving lookup time with a large number of memory slots

Igor Mammedov (5):
  kvm: update_memslots: drop not needed check for the same number of
pages
  kvm: update_memslots: drop not needed check for the same slot
  kvm: search_memslots: add simple LRU memslot caching
  kvm: change memslot sorting rule from size to GFN
  kvm: optimize GFN to memslot lookup with large slots amount

 include/linux/kvm_host.h | 28 +++-
 virt/kvm/kvm_main.c  | 46 ++
 2 files changed, 49 insertions(+), 25 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] kvm: update_memslots: drop not needed check for the same slot

2014-12-01 Thread Igor Mammedov
The UP/DOWN shift loops will shift the array in the needed
direction and stop at the place where the new slot should
be placed, regardless of the old slot size.

Signed-off-by: Igor Mammedov 
---
 virt/kvm/kvm_main.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 407277b..278ed65 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -679,20 +679,17 @@ static void update_memslots(struct kvm_memslots *slots,
struct kvm_memory_slot *mslots = slots->memslots;
 
WARN_ON(mslots[i].id != id);
-   if (new->npages < mslots[i].npages) {
-   while (i < KVM_MEM_SLOTS_NUM - 1 &&
-  new->npages < mslots[i + 1].npages) {
-   mslots[i] = mslots[i + 1];
-   slots->id_to_index[mslots[i].id] = i;
-   i++;
-   }
-   } else {
-   while (i > 0 &&
-  new->npages > mslots[i - 1].npages) {
-   mslots[i] = mslots[i - 1];
-   slots->id_to_index[mslots[i].id] = i;
-   i--;
-   }
+   while (i < KVM_MEM_SLOTS_NUM - 1 &&
+  new->npages < mslots[i + 1].npages) {
+   mslots[i] = mslots[i + 1];
+   slots->id_to_index[mslots[i].id] = i;
+   i++;
+   }
+   while (i > 0 &&
+  new->npages > mslots[i - 1].npages) {
+   mslots[i] = mslots[i - 1];
+   slots->id_to_index[mslots[i].id] = i;
+   i--;
}
 
mslots[i] = *new;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] kvm: memslots lookup optimization

2014-12-01 Thread Paolo Bonzini


On 01/12/2014 18:29, Igor Mammedov wrote:
> This series speeds up GFN to memslot lookup time by:
>  * introducing an LRU cache, which improves lookup time for
>    a same-slot workload (typically boot time of Windows and Linux guests)
>  * switching to binary search for GFN to memslot lookup,
>    improving lookup time with a large number of memory slots
> 
> Igor Mammedov (5):
>   kvm: update_memslots: drop not needed check for the same number of
> pages
>   kvm: update_memslots: drop not needed check for the same slot
>   kvm: search_memslots: add simple LRU memslot caching
>   kvm: change memslot sorting rule from size to GFN
>   kvm: optimize GFN to memslot lookup with large slots amount
> 
>  include/linux/kvm_host.h | 28 +++-
>  virt/kvm/kvm_main.c  | 46 ++
>  2 files changed, 49 insertions(+), 25 deletions(-)
> 

Applied patches 1-3 for now, I'm not in the mood for proving that the
binary search is correct. :)

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs

2014-12-01 Thread Radim Krčmář
2014-12-01 17:30+0100, Paolo Bonzini:
> On 27/11/2014 23:26, Radim Krčmář wrote:
> > We can't deliver xAPIC and x2APIC broadcasts/logical messages at the
> > same time with current KVM and this patch just switches the working case
> > in favour of x2APIC, which is why I didn't think it was necessary ...
> > (And I didn't understand why prefer disabled x2APIC to enabled xAPIC.)
> 
> Indeed.  A possible thing to do perhaps would be to build two logical
> maps, one for x2APIC and one for xAPIC, and consult both...  That would
> slow down the common case when one map is empty, though.

Yeah, that seems like the cleanest solution ... I'll prepare the patch.
(so we can then throw it away :)

We'll need to take a look at the source (IO)APIC now.
Some design points I'm not sure about now:
 - Does xAPIC ignore reserved bytes in destination?
   (= Is x2APIC broadcast also delivered to xAPICs?)
 - we'll probably still have to hack it to deliver IOAPIC requests to
   low x2APIC logical addresses for compatibility reasons
 - to gain speed, we could try to encode them into the same cache-line
 struct {kvm_apic_t *xapic; kvm_apic_t *x2apic;} logical_map[16][16];
   (and waste some space instead)


---
If we wanted to be super correct, with respect to sw-disabled xAPIC,
there should be three: x2APIC, flat xAPIC, and cluster xAPIC :)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 06:07:48PM +0100, Juergen Gross wrote:
> On 12/01/2014 05:19 PM, Luis R. Rodriguez wrote:
>> On Mon, Dec 01, 2014 at 03:54:24PM +, David Vrabel wrote:
>>> On 01/12/14 15:44, Luis R. Rodriguez wrote:
 On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  
 wrote:
> On 01/12/14 15:05, Luis R. Rodriguez wrote:
>> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
>>> On 27/11/14 18:36, Luis R. Rodriguez wrote:
 On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>> From: "Luis R. Rodriguez" 
>>
>> Some folks had reported that some xen hypercalls take a long time
>> to complete when issued from the userspace private ioctl mechanism,
>> this can happen for instance with some hypercalls that have many
>> sub-operations, this can happen for instance on hypercalls that use
>>> [...]
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user 
>> *udata)
>>   hypercall.arg[0], hypercall.arg[1],
>>   hypercall.arg[2], hypercall.arg[3],
>>   hypercall.arg[4]);
>> +#ifndef CONFIG_PREEMPT
>> + schedule();
>> +#endif
>>>
>>> As Juergen points out, this does nothing.  You need to schedule while in
>>> the middle of the hypercall.
>>>
>>> Remember that Xen's hypercall preemption only preempts the hypercall to
>>> run interrupts in the guest.
>>
>> How is it ensured that when the kernel preempts on this code path on
>> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?
>
> Sorry, I really didn't describe this very well.
>
> If a hypercall needs a continuation, Xen returns to the guest with the
> IP set to the hypercall instruction, and on the way back to the guest
> Xen may schedule a different VCPU or it will do any upcalls (as per 
> normal).
>
> The guest is free to return from the upcall to the original task
> (continuing the hypercall) or to a different one.

 OK so that addresses what Xen will do when using continuation and
 hypercall preemption, my concern here was that using
 preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
 hypercall on the return from an interrupt (e.g., the timer interrupt)
 would still let the kernel preempt to tasks other than those related
 to Xen.
>>>
>>> Um.  Why would that be a problem?  We do want to switch to any task the
>>> Linux scheduler thinks is best.
>>
>> It's safe but -- it technically is doing kernel preemption, unless we want
>> to adjust the definition of CONFIG_PREEMPT=n to exclude hypercalls. This
>> was my original concern with the use of preempt_schedule_irq() to do this.
>> I am afraid of setting precedents without being clear or wider review and
>> acceptance.
>
> I wonder whether it would be more acceptable to add (or completely
> switch to) another preemption model: PREEMPT_SWITCHABLE. This would be
> similar to CONFIG_PREEMPT, but the "normal" value of __preempt_count
> would be settable via kernel parameter (default 2):
>
> 0: preempt
> 1: preempt_voluntary
> 2: preempt_none
>
> The kernel would run with preemption enabled. cond_sched() would
> reschedule if __preempt_count <= 1. And in case of long running kernel
> activities (like the hypercall case or other stuff requiring schedule()
> calls to avoid hangups) we would just set __preempt_count to 0 during
> these periods and restore the old value afterwards.
>
> This would be a rather intrusive but clean change IMO.
>
> Any thoughts?

I like the idea of dynamically changing at run time the preemption model and
personally find this reasonable, however I am not certain if this would
introduce a series of issues hard to address. Thoughts by others who linger
deep in the cold lonely scheduler caves ?

  Luis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/4] KVM: x86: APIC fixes

2014-12-01 Thread Radim Krčmář
2014-12-01 17:22+0100, Paolo Bonzini:
> On 27/11/2014 20:03, Radim Krčmář wrote:
> > The interesting one is [3/4], which improves upon a previous CVE fix;
> > we also handle logical destination wrapping in it, so [2/4] does the
> > same for physical;  and to make it nicer, [1/4] removes a condition.
> > [4/4] makes our fast path return true when the message was handled.
> > 
> > Radim Krčmář (4):
> >   KVM: x86: deliver phys lowest-prio
> >   KVM: x86: fix APIC physical destination wrapping
> >   KVM: x86: allow 256 logical x2APICs again
> >   KVM: x86: don't retry hopeless APIC delivery
> > 
> >  arch/x86/kvm/lapic.c | 20 +++-
> >  arch/x86/kvm/lapic.h |  2 --
> >  2 files changed, 11 insertions(+), 11 deletions(-)
> > 
> 
> So the order should be 1/2/5/3/4, right?

It would be safer, thank you.

(And when I look at it now, [4/4] would be better as 1st.)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/4] KVM: x86: APIC fixes

2014-12-01 Thread Paolo Bonzini


On 01/12/2014 18:55, Radim Krčmář wrote:
> 2014-12-01 17:22+0100, Paolo Bonzini:
>> On 27/11/2014 20:03, Radim Krčmář wrote:
>>> The interesting one is [3/4], which improves upon a previous CVE fix;
>>> we also handle logical destination wrapping in it, so [2/4] does the
>>> same for physical;  and to make it nicer, [1/4] removes a condition.
>>> [4/4] makes our fast path return true when the message was handled.
>>>
>>> Radim Krčmář (4):
>>>   KVM: x86: deliver phys lowest-prio
>>>   KVM: x86: fix APIC physical destination wrapping
>>>   KVM: x86: allow 256 logical x2APICs again
>>>   KVM: x86: don't retry hopeless APIC delivery
>>>
>>>  arch/x86/kvm/lapic.c | 20 +++-
>>>  arch/x86/kvm/lapic.h |  2 --
>>>  2 files changed, 11 insertions(+), 11 deletions(-)
>>>
>>
>> So the order should be 1/2/5/3/4, right?
> 
> It would be safer, thank you.
> 
> (And when I look at it now, [4/4] would be better as 1st.)

Ok, applying 4/1/2/5/3.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/5] arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on sytem exit events

2014-12-01 Thread Peter Maydell
On 27 November 2014 at 23:10, Peter Maydell  wrote:
> It seems odd to have this unmap happen on attempted system reset/powerdown,
> not on cpu init/start.

Here's a concrete case that I think requires the unmap to be
done on cpu init:
 * start a VM and run it for a bit
 * from the QEMU monitor, use "loadvm" to load a VM snapshot

This will cause QEMU to do a system reset (including calling
VCPU_INIT to reset the CPUs), load the contents of guest
RAM from the snapshot, set guest CPU registers with a pile
of SET_ONE_REG calls, and then KVM_RUN to start the VM.

If we don't unmap stage2 on vcpu init,  then what in this
sequence causes the icaches to be flushed so we execute
the newly loaded ram contents rather than stale data
from the first VM run?

thanks
-- PMM
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread David Vrabel
On 01/12/14 16:19, Luis R. Rodriguez wrote:
> On Mon, Dec 01, 2014 at 03:54:24PM +, David Vrabel wrote:
>> On 01/12/14 15:44, Luis R. Rodriguez wrote:
>>> On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  
>>> wrote:
 On 01/12/14 15:05, Luis R. Rodriguez wrote:
> On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
>> On 27/11/14 18:36, Luis R. Rodriguez wrote:
>>> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
 On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
> From: "Luis R. Rodriguez" 
>
> Some folks had reported that some xen hypercalls take a long time
> to complete when issued from the userspace private ioctl mechanism,
> this can happen for instance with some hypercalls that have many
> sub-operations, this can happen for instance on hypercalls that use
>> [...]
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user 
> *udata)
>  hypercall.arg[0], hypercall.arg[1],
>  hypercall.arg[2], hypercall.arg[3],
>  hypercall.arg[4]);
> +#ifndef CONFIG_PREEMPT
> + schedule();
> +#endif
>>
>> As Juergen points out, this does nothing.  You need to schedule while in
>> the middle of the hypercall.
>>
>> Remember that Xen's hypercall preemption only preempts the hypercall to
>> run interrupts in the guest.
>
> How is it ensured that when the kernel preempts on this code path on
> CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?

 Sorry, I really didn't describe this very well.

 If a hypercall needs a continuation, Xen returns to the guest with the
 IP set to the hypercall instruction, and on the way back to the guest
 Xen may schedule a different VCPU or it will do any upcalls (as per 
 normal).

 The guest is free to return from the upcall to the original task
 (continuing the hypercall) or to a different one.
>>>
>>> OK so that addresses what Xen will do when using continuation and
>>> hypercall preemption, my concern here was that using
>>> preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
>>> hypercall on the return from an interrupt (e.g., the timer interrupt)
>>> would still let the kernel preempt to tasks other than those related
>>> to Xen.
>>
>> Um.  Why would that be a problem?  We do want to switch to any task the
>> Linux scheduler thinks is best.
> 
> It's safe but -- it technically is doing kernel preemption, unless we want
> to adjust the definition of CONFIG_PREEMPT=n to exclude hypercalls. This
> was my original concern with the use of preempt_schedule_irq() to do this.
> I am afraid of setting precedents without being clear or wider review and
> acceptance.

It's voluntary preemption at a well defined point.  It's no different to
a cond_resched() call.

Note that we're not trying to fix this for the non-voluntary-preempt
kernels.

David
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 3/8] VFIO: platform: forwarded state tested when selecting IRQ handler

2014-12-01 Thread Christoffer Dall
On Mon, Dec 01, 2014 at 03:39:24PM +0100, Eric Auger wrote:
> Hi Christoffer,
> On 11/30/2014 01:47 PM, Christoffer Dall wrote:
> > The subject reads strangely, perhaps just:
> > 
> > VFIO: platform: test forward state when selecting IRQ handler
> OK
> > 
> > On Sun, Nov 23, 2014 at 07:35:55PM +0100, Eric Auger wrote:
> >> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
> >> need to disable the IRQ anymore.
> >>
> >> When setting the IRQ handler we now also test the forwarded state. In
> >> case the IRQ is forwarded we select the edge handler (no automaske).
> >>
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >>
> >> v2 -> v3:
> >> - forwarded state was tested in the handler. Now the forwarded state
> >>   is tested before setting the handler. This definitively limits
> >>   the dynamics of forwarded state changes but I don't think there is
> >>   a use case where we need to be able to change the state at any time.
> > 
> > user space can change this by calling the VFIO_IRQ_SET_ACTION_TRIGGER
> > whenever it wants, right?
> yes the user can set/unset the VFIO signaling (and request_irq/free_irq)
> using VFIO_IRQ_SET_ACTION_TRIGGER. In this new version I do not allow
> changing the forwarded state when the handler is attached (request_irq).
> 
> Does it answer your interrogation?
> 
interrogation?  hopefully it wasn't that bad ;)

Yes, that answers my question.

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3] virt: kvm: arm: vgic: Process the failure case when kvm_register_device_ops() fails

2014-12-01 Thread Chen Gang
kvm_register_device_ops() may fail, so we need to handle the related
failure case.

Since it is related to neither vgic_arch_setup() nor on_each_cpu(), we
can move it before them, which then simplifies the related failure
handling code.

Signed-off-by: Chen Gang 
---
 virt/kvm/arm/vgic.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index cf23c13..814cae2 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -2471,13 +2471,20 @@ int kvm_vgic_hyp_init(void)
goto out_free_irq;
}
 
+   ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V2);
+   if (ret) {
+   __unregister_cpu_notifier(&vgic_cpu_nb);
+   kvm_err("Cannot register vgic device ops\n");
+   goto out_free_irq;
+   }
+
/* Callback into for arch code for setup */
vgic_arch_setup(vgic);
 
on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
 
-   return kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-  KVM_DEV_TYPE_ARM_VGIC_V2);
+   return 0;
 
 out_free_irq:
free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus());
-- 
1.9.3
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 3/8] VFIO: platform: forwarded state tested when selecting IRQ handler

2014-12-01 Thread Eric Auger
On 12/01/2014 09:10 PM, Christoffer Dall wrote:
> On Mon, Dec 01, 2014 at 03:39:24PM +0100, Eric Auger wrote:
>> Hi Christoffer,
>> On 11/30/2014 01:47 PM, Christoffer Dall wrote:
>>> The subject reads strangely, perhaps just:
>>>
>>> VFIO: platform: test forward state when selecting IRQ handler
>> OK
>>>
>>> On Sun, Nov 23, 2014 at 07:35:55PM +0100, Eric Auger wrote:
 In case the IRQ is forwarded, the VFIO platform IRQ handler does not
 need to disable the IRQ anymore.

 When setting the IRQ handler we now also test the forwarded state. In
 case the IRQ is forwarded we select the edge handler (no automaske).

 Signed-off-by: Eric Auger 

 ---

 v2 -> v3:
 - forwarded state was tested in the handler. Now the forwarded state
   is tested before setting the handler. This definitively limits
   the dynamics of forwarded state changes but I don't think there is
   a use case where we need to be able to change the state at any time.
>>>
>>> user space can change this by calling the VFIO_IRQ_SET_ACTION_TRIGGER
>>> whenever it wants, right?
>> yes the user can set/unset the VFIO signaling (and request_irq/free_irq)
>> using VFIO_IRQ_SET_ACTION_TRIGGER. In this new version I do not allow
>> changing the forwarded state when the handler is attached (request_irq).
>>
>> Does it answer your interrogation?
>>
> interrogation?  hopefully it wasn't that bad ;)
Oops, sorry, a "faux ami" from French. I wanted to say question but wanted to
say it smarter. Another time ;-)
> 
> Yes, that answers my question.
> 
> Thanks,
> -Christoffer
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-12-01 Thread Konrad Rzeszutek Wilk
On Tue, Nov 25, 2014 at 07:33:58PM -0500, Waiman Long wrote:
> On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote:
> >On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:
> >>
> >>My concern is that spin_unlock() can be called in many places, including
> >>loadable kernel modules. Can the paravirt_patch_ident_32() function able to
> >>patch all of them in reasonable time? How about a kernel module loaded later
> >>at run time?
> >It has too. When the modules are loaded the .paravirt symbols are exposed
> >and the module loader patches that.
> >
> >And during bootup time (before modules are loaded) it also patches everything
> >- when it only runs on one CPU.
> >
> 
> I have been changing the patching code to patch the unlock call sites and it
> seems to be working now. However, when I manually inserted a kernel module
> using insmod and run the code in the newly inserted module, I got memory
> access violation as follows:
> 
> BUG: unable to handle kernel NULL pointer dereference at   (null)
> IP: [<  (null)>]   (null)
> PGD 18d62f3067 PUD 18d476f067 PMD 0
> Oops: 0010 [#1] SMP
> Modules linked in: locktest(OE) ebtable_nat ebtables xt_CHECKSUM
> iptable_mangle bridge autofs4 8021q garp stp llc ipt_REJECT
> nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT
> nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter
> ip6_tables ipv6 vhost_net macvtap macvlan vhost tun uinput ppdev parport_pc
> parport sg microcode pcspkr virtio_balloon snd_hda_codec_generic
> virtio_console snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep
> snd_seq snd_seq_device snd_pcm snd_timer snd soundcore virtio_net i2c_piix4
> i2c_core ext4(E) jbd2(E) mbcache(E) floppy(E) virtio_blk(E) sr_mod(E)
> cdrom(E) virtio_pci(E) virtio_ring(E) virtio(E) pata_acpi(E) ata_generic(E)
> ata_piix(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last
> unloaded: speedstep_lib]
> CPU: 1 PID: 3907 Comm: run-locktest Tainted: GW  OE  3.17.0-pvqlock
> #3
> Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
> task: 8818cc5baf90 ti: 8818b7094000 task.ti: 8818b7094000
> RIP: 0010:[<>]  [<  (null)>]   (null)
> RSP: 0018:8818b7097db0  EFLAGS: 00010246
> RAX:  RBX: 004c4b40 RCX: 
> RDX: 0001 RSI:  RDI: 8818d3f052c0
> RBP: 8818b7097dd8 R08: 80522014 R09: 
> R10: 1000 R11: 0001 R12: 0001
> R13:  R14: 0001 R15: 8818b7097ea0
> FS:  7fb828ece700() GS:88193ec2() knlGS:
> CS:  0010 DS:  ES:  CR0: 8005003b
> CR2:  CR3: 0018cc7e9000 CR4: 06e0
> Stack:
>  a06ff395 8818d465e000 8164bec0 0001
>  0050 8818b7097e18 a06ff785 8818b7097e38
>  0246 54755e3a 39f8ba72 8818c174f000
> Call Trace:
>  [] ? test_spinlock+0x65/0x90 [locktest]
>  [] etime_show+0xd5/0x120 [locktest]
>  [] kobj_attr_show+0x16/0x20
>  [] sysfs_kf_seq_show+0xca/0x1b0
>  [] kernfs_seq_show+0x23/0x30
>  [] seq_read+0xbb/0x400
>  [] kernfs_fop_read+0x35/0x40
>  [] vfs_read+0xa3/0x110
>  [] SyS_read+0x56/0xd0
>  [] ? __audit_syscall_exit+0x216/0x2c0
>  [] system_call_fastpath+0x16/0x1b
> Code:  Bad RIP value.
>  RSP 
> CR2: 
> ---[ end trace 69d0e259c9ec632f ]---
> 
> It seems like call site patching isn't properly done or the kernel module
> that I built was missing some critical information necessary for the proper

Did readelf give you the paravirt note section?
> linking. Anyway, I will include the unlock call patching code as a separate
> patch as it seems there may be problem under certain circumstance.

One way to troubleshoot this is to enable the paravirt patching code to
actually print where it is patching the code. That way, when you load the
module, you can confirm it has done its job.

Then you can verify that the address where the code is called:

a06ff395

is indeed patched. You might as well also do a hexdump in the module loading
code to confirm that the patching has been done correctly.
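
As a rough illustration of that kind of debug output, a minimal sketch; the
helper and where you would call it from (e.g. the x86 paravirt patching loop
in arch/x86/kernel/alternative.c) are made up, only pr_debug() and
print_hex_dump() are existing kernel APIs:

#include <linux/kernel.h>
#include <linux/printk.h>

static void pv_debug_dump_site(void *addr, unsigned int len)
{
	pr_debug("paravirt: patched %u bytes at %p\n", len, addr);
	/* Dump the final bytes so a call site such as a06ff395 can be
	 * compared against an objdump of the loaded module. */
	print_hex_dump(KERN_DEBUG, "pv-patch: ", DUMP_PREFIX_ADDRESS,
		       16, 1, addr, len, false);
}
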
> 
> BTW, the kernel panic problem that your team reported had been fixed. The
> fix will be in the next version of the patch.
> 
> -Longman


Re: [PATCH v13 09/11] pvqspinlock, x86: Add para-virtualization support

2014-12-01 Thread Konrad Rzeszutek Wilk
On Wed, Oct 29, 2014 at 04:19:09PM -0400, Waiman Long wrote:
> This patch adds para-virtualization support to the queue spinlock
> code base with minimal impact to the native case. There are some
> minor code changes in the generic qspinlock.c file which should be
> usable in other architectures. The other code changes are specific
> to x86 processors and so are all put under the arch/x86 directory.
> 
> On the lock side, the slowpath code is split into 2 separate functions
> generated from the same code - one for bare metal and one for PV guest.
> The switching is done in the _raw_spin_lock* functions. This makes
> sure that the performance impact to the bare metal case is minimal,
> just a few NOPs in the _raw_spin_lock* functions. In the PV slowpath
> code, there are 2 paravirt callee saved calls that minimize register
> pressure.
> 
> On the unlock side, however, the disabling of unlock function inlining
> does have some slight impact on bare metal performance.
> 
> The actual paravirt code comes in 5 parts;
> 
>  - init_node; this initializes the extra data members required for PV
>state. PV state data is kept 1 cacheline ahead of the regular data.
> 
>  - link_and_wait_node; this replaces the regular MCS queuing code. CPU
>halting can happen if the wait is too long.
> 
>  - wait_head; this waits until the lock is available and the CPU will
>be halted if the wait is too long.
> 
>  - wait_check; this is called after acquiring the lock to see if the
>next queue head CPU is halted. If this is the case, the lock bit is
>changed to indicate the queue head will have to be kicked on unlock.
> 
>  - queue_unlock;  this routine has a jump label to check if paravirt
>is enabled. If yes, it has to do an atomic cmpxchg to clear the lock
>bit or call the slowpath function to kick the queue head cpu.
> 
> Tracking the head is done in two parts, firstly the pv_wait_head will
> store its cpu number in whichever node is pointed to by the tail part
> of the lock word. Secondly, pv_link_and_wait_node() will propagate the
> existing head from the old to the new tail node.
> 
> Signed-off-by: Waiman Long 
> ---
>  arch/x86/include/asm/paravirt.h   |   19 ++
>  arch/x86/include/asm/paravirt_types.h |   20 ++
>  arch/x86/include/asm/pvqspinlock.h|  411 
> +
>  arch/x86/include/asm/qspinlock.h  |   71 ++-
>  arch/x86/kernel/paravirt-spinlocks.c  |6 +
>  include/asm-generic/qspinlock.h   |2 +
>  kernel/locking/qspinlock.c|   69 +-
>  7 files changed, 591 insertions(+), 7 deletions(-)
>  create mode 100644 arch/x86/include/asm/pvqspinlock.h
> 
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index cd6e161..7e296e6 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -712,6 +712,24 @@ static inline void __set_fixmap(unsigned /* enum 
> fixed_addresses */ idx,
>  
>  #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
>  
> +#ifdef CONFIG_QUEUE_SPINLOCK
> +
> +static __always_inline void pv_kick_cpu(int cpu)
> +{
> + PVOP_VCALLEE1(pv_lock_ops.kick_cpu, cpu);
> +}
> +
> +static __always_inline void pv_lockwait(u8 *lockbyte)
> +{
> + PVOP_VCALLEE1(pv_lock_ops.lockwait, lockbyte);
> +}
> +
> +static __always_inline void pv_lockstat(enum pv_lock_stats type)
> +{
> + PVOP_VCALLEE1(pv_lock_ops.lockstat, type);
> +}
> +
> +#else
>  static __always_inline void __ticket_lock_spinning(struct arch_spinlock 
> *lock,
>   __ticket_t ticket)
>  {
> @@ -723,6 +741,7 @@ static __always_inline void __ticket_unlock_kick(struct 
> arch_spinlock *lock,
>  {
>   PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
>  }
> +#endif
>  
>  #endif
>  
> diff --git a/arch/x86/include/asm/paravirt_types.h 
> b/arch/x86/include/asm/paravirt_types.h
> index 7549b8b..49e4b76 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -326,6 +326,9 @@ struct pv_mmu_ops {
>  phys_addr_t phys, pgprot_t flags);
>  };
>  
> +struct mcs_spinlock;
> +struct qspinlock;
> +
>  struct arch_spinlock;
>  #ifdef CONFIG_SMP
>  #include 
> @@ -333,9 +336,26 @@ struct arch_spinlock;
>  typedef u16 __ticket_t;
>  #endif
>  
> +#ifdef CONFIG_QUEUE_SPINLOCK
> +enum pv_lock_stats {
> + PV_HALT_QHEAD,  /* Queue head halting   */
> + PV_HALT_QNODE,  /* Other queue node halting */
> + PV_HALT_ABORT,  /* Halting aborted  */
> + PV_WAKE_KICKED, /* Wakeup by kicking*/
> + PV_WAKE_SPURIOUS,   /* Spurious wakeup  */
> + PV_KICK_NOHALT  /* Kick but CPU not halted  */
> +};
> +#endif
> +
>  struct pv_lock_ops {
> +#ifdef CONFIG_QUEUE_SPINLOCK
> + struct paravirt_callee_save kick_cpu;
> + struct paravirt_callee_save lockstat;
> + struct paravirt_callee_save lockwait;

Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-01 Thread Konrad Rzeszutek Wilk
On Mon, Dec 01, 2014 at 04:27:44PM -0500, Boris Ostrovsky wrote:
> Paravirtual guests are not expected to load microcode into processors
> and therefore it is not necessary to initialize microcode loading
> logic.

CC-ing the KVM folks since they use the paravirt interface too.
> 
> In fact, under certain circumstances initializing this logic may cause
> the guest to crash. Specifically, 32-bit kernels use __pa_nodebug()
> macro which does not work in Xen (the code path that leads to this macro
> happens during resume when we call mc_bp_resume()->load_ucode_ap()
> ->check_loader_disabled_ap())
> 
> Signed-off-by: Boris Ostrovsky 
> ---
>  arch/x86/kernel/cpu/microcode/core.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/microcode/core.c 
> b/arch/x86/kernel/cpu/microcode/core.c
> index 2ce9051..ebd232d 100644
> --- a/arch/x86/kernel/cpu/microcode/core.c
> +++ b/arch/x86/kernel/cpu/microcode/core.c
> @@ -557,7 +557,7 @@ static int __init microcode_init(void)
>   struct cpuinfo_x86 *c = &cpu_data(0);
>   int error;
>  
> - if (dis_ucode_ldr)
> + if (paravirt_enabled() || dis_ucode_ldr)
>   return 0;
>  
>   if (c->x86_vendor == X86_VENDOR_INTEL)
> -- 
> 1.7.1
> 


Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-01 Thread Paolo Bonzini


On 01/12/2014 22:57, Konrad Rzeszutek Wilk wrote:
> On Mon, Dec 01, 2014 at 04:27:44PM -0500, Boris Ostrovsky wrote:
>> Paravirtual guests are not expected to load microcode into processors
>> and therefore it is not necessary to initialize microcode loading
>> logic.
> 
> CC-ing the KVM folks since they use the paravirt interface too.

We also do not want to load microcode. :)  Thanks for the heads-up.

Acked-by: Paolo Bonzini 

Paolo

>> In fact, under certain circumstances initializing this logic may cause
>> the guest to crash. Specifically, 32-bit kernels use __pa_nodebug()
>> macro which does not work in Xen (the code path that leads to this macro
>> happens during resume when we call mc_bp_resume()->load_ucode_ap()
>> ->check_loader_disabled_ap())
>>
>> Signed-off-by: Boris Ostrovsky 
>> ---
>>  arch/x86/kernel/cpu/microcode/core.c |2 +-
>>  1 files changed, 1 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/microcode/core.c 
>> b/arch/x86/kernel/cpu/microcode/core.c
>> index 2ce9051..ebd232d 100644
>> --- a/arch/x86/kernel/cpu/microcode/core.c
>> +++ b/arch/x86/kernel/cpu/microcode/core.c
>> @@ -557,7 +557,7 @@ static int __init microcode_init(void)
>>  struct cpuinfo_x86 *c = &cpu_data(0);
>>  int error;
>>  
>> -if (dis_ucode_ldr)
>> +if (paravirt_enabled() || dis_ucode_ldr)
>>  return 0;
>>  
>>  if (c->x86_vendor == X86_VENDOR_INTEL)
>> -- 
>> 1.7.1
>>


Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-01 Thread Boris Ostrovsky

On 12/01/2014 05:00 PM, Borislav Petkov wrote:

On Mon, Dec 01, 2014 at 04:27:44PM -0500, Boris Ostrovsky wrote:

Paravirtual guests are not expected to load microcode into processors
and therefore it is not necessary to initialize microcode loading
logic.

In fact, under certain circumstances initializing this logic may cause
the guest to crash. Specifically, 32-bit kernels use __pa_nodebug()
macro which does not work in Xen (the code path that leads to this macro
happens during resume when we call mc_bp_resume()->load_ucode_ap()
->check_loader_disabled_ap())

Signed-off-by: Boris Ostrovsky 
---
  arch/x86/kernel/cpu/microcode/core.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/core.c 
b/arch/x86/kernel/cpu/microcode/core.c
index 2ce9051..ebd232d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -557,7 +557,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
  
-	if (dis_ucode_ldr)

+   if (paravirt_enabled() || dis_ucode_ldr)

Ok, let me make sure I understand this correctly: The early path doesn't
get executed on paravirt, i.e. the path along load_ucode_intel_ap()?


(+KVM folks here as well)

Xen PV doesn't start with startup_32()/startup_32_smp() so for Xen this 
is true. Don't know about KVM (or lguest).




And you want to avoid loading of the microcode driver because the only
path we come to load_ucode_ap() on paravirt is the hotplug notifier?


That's my understanding, yes.



Btw, we've applied another fix today for 3.18 final which limits the
microcode reloading to 64-bit only:

http://git.kernel.org/tip/02ecc41abcea4ff9291d548f6f846b29b354ddd2

which should accidentally fix the paravirt issue too, no?


I think so. The problem we have now is the __pa() macro, which we only use on
32-bit. I'll queue this for overnight tests to make sure, and if it
indeed works then 3.19 should be fine.


Thanks.
-boris



Because if so, I'd like to delay your patch for the 3.19 merge window
unless absolutely necessary.

Thanks.





Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-01 Thread Borislav Petkov
On Mon, Dec 01, 2014 at 11:12:38PM +0100, Paolo Bonzini wrote:
> We also do not want to load microcode. :)  Thanks for the heads-up.

You don't need to :P

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.


Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-12-01 Thread Luis R. Rodriguez
On Mon, Dec 01, 2014 at 06:16:28PM +, David Vrabel wrote:
> On 01/12/14 16:19, Luis R. Rodriguez wrote:
> > On Mon, Dec 01, 2014 at 03:54:24PM +, David Vrabel wrote:
> >> On 01/12/14 15:44, Luis R. Rodriguez wrote:
> >>> On Mon, Dec 1, 2014 at 10:18 AM, David Vrabel  
> >>> wrote:
>  On 01/12/14 15:05, Luis R. Rodriguez wrote:
> > On Mon, Dec 01, 2014 at 11:11:43AM +, David Vrabel wrote:
> >> On 27/11/14 18:36, Luis R. Rodriguez wrote:
> >>> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
>  On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
> > From: "Luis R. Rodriguez" 
> >
> > Some folks had reported that some xen hypercalls take a long time
> > to complete when issued from the userspace private ioctl mechanism,
> > this can happen for instance with some hypercalls that have many
> > sub-operations, this can happen for instance on hypercalls that use
> >> [...]
> > --- a/drivers/xen/privcmd.c
> > +++ b/drivers/xen/privcmd.c
> > @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user 
> > *udata)
> >  hypercall.arg[0], hypercall.arg[1],
> >  hypercall.arg[2], hypercall.arg[3],
> >  hypercall.arg[4]);
> > +#ifndef CONFIG_PREEMPT
> > + schedule();
> > +#endif
> >>
> >> As Juergen points out, this does nothing.  You need to schedule while 
> >> in
> >> the middle of the hypercall.
> >>
> >> Remember that Xen's hypercall preemption only preempts the hypercall to
> >> run interrupts in the guest.
> >
> > How is it ensured that when the kernel preempts on this code path on
> > CONFIG_PREEMPT=n kernel that only interrupts in the guest are run?
> 
>  Sorry, I really didn't describe this very well.
> 
>  If a hypercall needs a continuation, Xen returns to the guest with the
>  IP set to the hypercall instruction, and on the way back to the guest
>  Xen may schedule a different VCPU or it will do any upcalls (as per 
>  normal).
> 
>  The guest is free to return from the upcall to the original task
>  (continuing the hypercall) or to a different one.
> >>>
> >>> OK so that addresses what Xen will do when using continuation and
> >>> hypercall preemption, my concern here was that using
> >>> preempt_schedule_irq() on CONFIG_PREEMPT=n kernels in the middle of a
> >>> hypercall on the return from an interrupt (e.g., the timer interrupt)
> >>> would still let the kernel preempt to tasks other than those related
> >>> to Xen.
> >>
> >> Um.  Why would that be a problem?  We do want to switch to any task the
> >> Linux scheduler thinks is best.
> > 
> > It's safe but -- it technically is doing kernel preemption, unless we want
> > to adjust the definition of CONFIG_PREEMPT=n to exclude hypercalls. This
> > was my original concern with the use of preempt_schedule_irq() to do this.
> > I am afraid of setting precedents without being clear about it or getting
> > wider review and acceptance.
> 
> It's voluntary preemption at a well defined point. 

It's voluntarily preempting the kernel even for CONFIG_PREEMPT=n kernels... 

> It's no different to a cond_resched() call.

Then I do agree it's a fair analogy (and I find it obviously odd how
widespread cond_resched() is); we just don't have an equivalent for IRQ
context. Why not avoid the special check then and use this all the time in the
middle of a hypercall on the return from an interrupt (e.g., the timer
interrupt)?

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e344bb..e60b5a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2759,6 +2759,12 @@ static inline int signal_pending_state(long state, 
struct task_struct *p)
  */
 extern int _cond_resched(void);
 
+/*
+ * Voluntarily preempting the kernel even for CONFIG_PREEMPT=n kernels
+ * under very special circumstances.
+ */
+extern int cond_resched_irq(void);
+
 #define cond_resched() ({  \
__might_sleep(__FILE__, __LINE__, 0);   \
_cond_resched();\
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 240157c..1c4d443 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4264,6 +4264,16 @@ int __sched _cond_resched(void)
 }
 EXPORT_SYMBOL(_cond_resched);
 
+int __sched cond_resched_irq(void)
+{
+   if (should_resched()) {
+   preempt_schedule_irq();
+   return 1;
+   }
+   return 0;
+}
+EXPORT_SYMBOL_GPL(cond_resched_irq);
+
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
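
To make the cond_resched() analogy concrete, a minimal sketch of the usual
voluntary-preemption pattern in plain process context (process_one_item() is
a made-up stand-in for per-iteration work); cond_resched_irq() above is the
same idea for the one case where we are on an interrupt-return path and have
to use preempt_schedule_irq() instead of schedule():

#include <linux/sched.h>

/* Hypothetical per-iteration work, declared only for this sketch. */
extern void process_one_item(unsigned long i);

static int long_running_op(unsigned long nr_items)
{
	unsigned long i;

	for (i = 0; i < nr_items; i++) {
		process_one_item(i);
		cond_resched();	/* yield here if a reschedule is pending */
	}
	return 0;
}
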
  Luis

Re: [PATCH] x86, microcode: Don't initialize microcode code on paravirt

2014-12-01 Thread Borislav Petkov
On Mon, Dec 01, 2014 at 05:31:56PM -0500, Boris Ostrovsky wrote:
> I think so. The problem we have now is __pa() macro that we only use
> on 32-bit. I'll queue this for overnight tests to make sure and if it
> indeed works then 3.19 should be fine.

Cool, thanks.

I'd still take your patch for 3.19 though because I'm fixing the 32-bit
reloading path properly and will remove the ifdef afterwards.

And even then, I'd like to prevent loading the module on a paravirt
guest if it is totally unneeded there.

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.


Re: [PATCH] KVM: Introduce dynamically registered hypercall capability

2014-12-01 Thread Phil White
On Mon, Dec 1, 2014 at 5:47 AM, Radim Krčmář  wrote:
> 2014-11-28 17:29-0800, Phil White:
>> Good questions.
>>
>> One thing that prompted this code is the presence and proliferation of
>> architecture specific hypercalls in include/uapi/linux/kvm_para.h.
>> I'm not sure why the tree has continued on for as long as it has with
>> a list of reserved hypercall indices -- most of which are unused on
>> any given architecture.  Is there a reason that I'm unaware of?
>
> Name-space won't be exhausted, so nothing forced them to separate and
> centralization easily avoids conflicts with generic hypercalls.

Consider: all the arch-specific defines are kept in asm/kvm_para.h, and
each asm/kvm_para.h defines KVM_HC_ARCH_MAX as a base
index from which the generic hypercalls are numbered (e.g.
KVM_HC_ARCH_MAX+1 rather than 11).

This would at least organize the hypercalls and avoid a situation in which a
number of hypercalls which are not applicable pollute the namespace.  I'll
grant that the grounds here may be largely aesthetic.
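
A hypothetical sketch of that layout (the feature names, values and the
generic define names are illustrative only, not the current kvm_para.h):

/* arch/<arch>/include/uapi/asm/kvm_para.h (sketch) */
#define KVM_HC_ARCH_FEATURE_A	1
#define KVM_HC_ARCH_FEATURE_B	2
#define KVM_HC_ARCH_MAX		2	/* highest arch-specific index */

/* include/uapi/linux/kvm_para.h (sketch) */
#define KVM_HC_GENERIC_BASE	(KVM_HC_ARCH_MAX + 1)
#define KVM_HC_EXAMPLE_GENERIC	(KVM_HC_GENERIC_BASE + 0)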

The other worry is that institutionalization of this method will lead to a
hesitance to associate a specific hypercall index with anything other than the
function which it has been assigned in past kernel revisions.

In addition, this creates a maintenance problem for anyone seeking to add a
hypercall in the future: their hypercall numbers will need to be
updated in order
to avoid collisions with the community as well as any other sources they may
be dealing with.

These are all minor headaches, but they can be avoided.  A registration
method like this -- albeit somewhat more refined -- could be used to eliminate
all of those headaches in my opinion.

> I'd say that a virtio device is the way to go if you want to stay in the
> kernel, but outside of kvm modules.  In which ways is virtio lacking?

Virtio has several limitations. Firstly, it implies a situation in which the system has
already booted. Secondly, there's no easy way to access the kvm structure.
Thirdly, it cannot be used effectively to implement an optimization for
virtualization on a platform. Fourthly, I believe it would require changes to
qemu command lines -- and any associated tools which might be used to
cobble together a qemu command line.

A simple way of putting it, using the existing in-kernel code: I don't see how
you could use virtio to map the powerpc magic page at bootup.

>> It does occur to me that in the absence of the setup which I had
>> available, one could simply treat hc_nr as a 4 character ID rather
>> than a particular digit.
>
> (This would probably solve the situation in practice, but the conflict
>  is still there, so design hasn't improved.)

I'm not sure which conflict you mean.  I presume you mean the possibility
that two separate modules may attempt to claim the same hypercall index?

Presuming you do -- and I may be arguing a straw man here -- I'm not sure
that's classifiable as a design flaw, as no method occurs to me by which
one could add the capability of dynamically registering a hypercall, retain
the capabilities I mentioned above, and avoid that conflict.

However, I can be trivially proven wrong by the suggestion of a design by
which an out-of-tree module could dynamically register a hypercall, have
access to the kvm structure (at a minimum), and avoid the possibility that
some other user may race to claim that hypercall.
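
For the sake of discussion, a sketch of what such a registration interface
could look like (the names and types below are illustrative, not the actual
patch or an existing KVM API):

#include <linux/list.h>
#include <linux/types.h>

struct kvm_vcpu;

typedef int (*kvm_hc_fn_t)(struct kvm_vcpu *vcpu, unsigned long a0,
			   unsigned long a1, unsigned long a2,
			   unsigned long a3);

struct kvm_hc_entry {
	struct list_head list;
	unsigned long nr;	/* hypercall number being claimed */
	kvm_hc_fn_t fn;
};

/* 0 on success, -EEXIST if the number is already claimed by someone else. */
int kvm_register_hypercall(unsigned long nr, kvm_hc_fn_t fn);
void kvm_unregister_hypercall(unsigned long nr);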

>> "The generation of random numbers is too important to be left to
>> chance." -Robert R. Coveyou, Oak Ridge National Laboratory, 1969
>
> :)  (Exactly.)
>
>> On Thu, Nov 27, 2014 at 7:31 AM, Radim Krčmář  wrote:
>> > 2014-11-27 05:30-0800, Phil White:
>> >> This introduces a list of entries which associate a function pointer of
>> >> kvm_hc_type to a hypercall number and allows the ability to register and
>> >> unregister entries.  In addition, it also allows the ability to retrieve a
>> >> function pointer of kvm_hc_type for a given hypercall number which is 
>> >> meant
>> >> to be called from the arch-specific section.
>> >>
>> >> The main intent is to allow modules to register hypercalls which they own
>> >> rather than requiring the addition of a stub of some sort.  It will also
>> >> allow each arch to maintain separate lists of hypercalls rather than 
>> >> having
>> >> to respect changes in include/uapi/linux/kvm_para.h
>> >>
>> >> Signed-off-by: Phil White 
>> >> ---
>> >
>> > Apart from other problems,
>> > how are guests going to use these hypercalls?
>> >
>> > (If hc_nr is dynamic, a guest doesn't know its number and even if it is
>> >  static, someone could have registered it beforehand => this needs some
>> >  kind of synchronization with host modules.  A hardcoded reservation?)


Re: [PATCH] KVM: PPC: Book3S HV: Tracepoints for KVM HV guest interactions

2014-12-01 Thread Suresh E. Warrier


On 11/20/2014 08:01 AM, Steven Rostedt wrote:
> On Thu, 20 Nov 2014 13:10:12 +0100
> Alexander Graf  wrote:
> 
>>
>>
>> On 20.11.14 11:40, Aneesh Kumar K.V wrote:
>>> "Suresh E. Warrier"  writes:
>>>
 This patch adds trace points in the guest entry and exit code and also
 for exceptions handled by the host in kernel mode - hypercalls and page
 faults. The new events are added to /sys/kernel/debug/tracing/events
 under a new subsystem called kvm_hv.
>>>
>>> 
>>>
/* Set this explicitly in case thread 0 doesn't have a vcpu */
 @@ -1687,6 +1691,9 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
  
vc->vcore_state = VCORE_RUNNING;
preempt_disable();
 +
 +  trace_kvmppc_run_core(vc, 0);
 +
spin_unlock(&vc->lock);
>>>
>>> Do we really want to call a tracepoint with a spinlock held? Is that a good
>>> thing to do?
>>
>> I thought it was safe to call tracepoints inside of spin lock regions?
>> Steve?
>>
> 
> There's tracepoints in the guts of the scheduler where rq lock is held.
> Don't worry about it. The tracing system is lockless.
> 

Thanks for confirming.

-suresh
 
> -- Steve
> 



RE: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-01 Thread Wu, Feng


> -Original Message-
> From: Eric Auger [mailto:eric.au...@linaro.org]
> Sent: Monday, December 01, 2014 6:10 PM
> To: Alex Williamson
> Cc: Wu, Feng; pbonz...@redhat.com; g...@kernel.org; kvm@vger.kernel.org
> Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> Posted-Interrupts
> 
> On 11/25/2014 05:10 PM, Alex Williamson wrote:
> > On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
> >> On 11/25/2014 01:23 PM, Feng Wu wrote:
> >>> This patch adds and documents a new attribute
> >>> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> group.
> >>> This new attribute is used for VT-d Posted-Interrupts.
> >>>
> >>> When guest OS changes the interrupt configuration for an
> >>> assigned device, such as, MSI/MSIx data/address fields,
> >>> QEMU will use this IRQ attribute to tell KVM to update the
> >>> related IRTE according to the VT-d Posted-Interrupts Specification,
> >>> such as, the guest vector should be updated in the related IRTE.
> >>>
> >>> Signed-off-by: Feng Wu 
> >>> ---
> >>>  Documentation/virtual/kvm/devices/vfio.txt |9 +
> >>>  include/uapi/linux/kvm.h   |   10 ++
> >>>  2 files changed, 19 insertions(+), 0 deletions(-)
> >>>
> >>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> b/Documentation/virtual/kvm/devices/vfio.txt
> >>> index f7aff29..39dee86 100644
> >>> --- a/Documentation/virtual/kvm/devices/vfio.txt
> >>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
> >>> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> called to trigger the IRQ
> >>>  or associate an eventfd to it. Unforwarding can only be called while the
> >>>  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> condition is
> >>>  not satisfied, the command returns an -EBUSY.
> >>> +
> >>> +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrupts
> mechanism to post
> >>> +   the IRQ to guests.
> >>> +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr
> struct.
> >>> +
> >>> +When guest OS changes the interrupt configuration for an assigned
> device,
> >>> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> >>> +to tell KVM to update the related IRTE according to the VT-d
> Posted-Interrupts
> >>> +Specification, such as, the guest vector should be updated in the related
> IRTE.
> >>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >>> index a269a42..e5f86ad 100644
> >>> --- a/include/uapi/linux/kvm.h
> >>> +++ b/include/uapi/linux/kvm.h
> >>> @@ -949,6 +949,7 @@ struct kvm_device_attr {
> >>>  #define  KVM_DEV_VFIO_DEVICE 2
> >>>  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ1
> >>>  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ  2
> >>> +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ3
> >>>
> >>>  enum kvm_device_type {
> >>>   KVM_DEV_TYPE_FSL_MPIC_20= 1,
> >>> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> >>>   __u32 gsi; /* gsi, ie. virtual IRQ number */
> >>>  };
> >>>
> Hi Feng, Alex,
> I am currently reworking my code to use something closer to this struct.
> Would you agree with following changes?
> >>> +struct kvm_posted_intr {
> kvm_posted_irq

Hi Alex,

Do you mean changing the structure name to "kvm_posted_irq"? I am okay with it
if you think this name is also suitable for ARM forwarded IRQs. Or we can find
a more common name, such as "struct kvm_accel_irq"; what is your opinion, Alex?

> >>> + __u32   argsz;
> >>> + __u32   fd; /* file descriptor of the VFIO device */
> >>> + __u32   index;  /* VFIO device IRQ index */
> >>> + __u32   start;
> >>> + __u32   count;
> >>> + int virq[0];/* gsi, ie. virtual IRQ number */
> __u32 gsi[];

This change is okay with me. If Alex also agrees, I will follow this in the
next post.

Thanks,
Feng


> >>> +};
> >> Hi Feng,
> >>
> >> This struct could be used by arm code too. If Alex agrees I could use
> >> that one instead. We just need to find a common sensible name
> >
> > Yep, the interface might as well support batch setup.  The vfio code
> > uses -1 for teardown if we want to avoid FORWARD vs UNFORWARD we could
> > let the data in the structure define which operation to do.
> 
> In case we remove the unforward and use fd=1 to tear down, the virq=gsi
> must uniquely identify the struct. For ARM I think this is true, we
> cannot have several physical IRQ forwarded to the same GSI. I don't know
> about posted irqs or other archs.
> 
> Best Regards
> 
> Eric
>  Ideally the
> > code in virt/kvm/vfio.c would be almost entirely shared and just make
> > different arch_foo() callouts.  The PCI smarts in 2/2 here should
> > probably be moved out to that same arch_ code.  Thanks,
> >
> > Alex
> >



Re: Question about physical page allocation to the guest

2014-12-01 Thread Steven
On Wed, Oct 22, 2014 at 12:39 PM, Xiao Guangrong
 wrote:
>
>
> On 10/20/14 2:09 AM, Steven wrote:
>>
>> Hi, Eric,
>> I am trying to understand how KVM allocates physical pages to the
>> guest and your slides clarify a lot of questions.
>>
>> (https://events.linuxfoundation.org/slides/2011/linuxcon-japan/lcj2011_guangrong.pdf)
>>
>> However, I still have some difficulty in figuring out what happens in
>> the kvm code.
>>
>> The host kernel is 3.2.14 and EPT is disabled.
>>
>> In the guest VM, I run a micro-benchmark program that touches 1024
>> pages (as an integer array). So in the guest VM, I can trace 1024
>> mm_page_alloc events. However, in the hypervisor I can only trace about
>> 45 (sometimes < 45) kvm_page_fault events, which means that most page
>> faults in the guest are not exposed to the hypervisor. My questions
>> are
>>
>> (1) why such kind of page faults are not exposed to the hypervisor as
>> EPT is disabled? (My doubt is that it is related to non-presetn PTE as
>> you discussed in the slides. But could you give some more details?)
>
>
> Two cases: one is that the page you accessed has already been mapped into the
> guest (the Present bit in the SPTE (shadow page table entry) is set).
> The other is that we can do page prefetch in KVM, which can fill
> more non-present SPTEs in one vm-exit.

Hi, Eric,
Thanks for these insightful comments. I have done some tracing and
found that there are very few vm_try_async_get_page events (as called from
try_async_pf). So I think the major reason for the few page allocations
I observed is that the pages being accessed are already mapped into the
guest (the Present bit in the SPTE (shadow page table entry) is set).
My questions are

(1) Is this related to unsync shadow pages as in page 15 of your slides?
https://events.linuxfoundation.org/slides/2011/linuxcon-japan/lcj2011_guangrong.pdf

(2) If this is true, then there will be no vmexit to the hypervisor on
the page fault in guest user space. When will the shadow page table be
synched with the guest page table? Thanks.
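
For reference, a minimal sketch of the kind of micro-benchmark described
above, assuming 4 KiB pages and one integer write per page:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NR_PAGES 1024

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	int *buf = malloc((size_t)NR_PAGES * page_size);
	int i;

	if (!buf)
		return 1;
	/* One write per page: each touch should fault in a fresh page
	 * (mm_page_alloc inside the guest). */
	for (i = 0; i < NR_PAGES; i++)
		buf[i * page_size / sizeof(int)] = i;
	printf("touched %d pages\n", NR_PAGES);
	free(buf);
	return 0;
}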


>
>>
>> (2) In such case, when the physical page are allocated to the VM to
>> backup the virtual pages? Could you give some hint about which piece
>> of KVM code calling the get get_user_pages()?
>
>
> In the case if the hva of the gpa you are accessing is not mapped in
> the hypervisor's address space. Please follow the code of try_async_pf
>
>>
>> Thanks in advance.
>
>
> Enjoy it. :)
>


Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-01 Thread Alex Williamson
On Tue, 2014-12-02 at 02:08 +, Wu, Feng wrote:
> 
> > -Original Message-
> > From: Eric Auger [mailto:eric.au...@linaro.org]
> > Sent: Monday, December 01, 2014 6:10 PM
> > To: Alex Williamson
> > Cc: Wu, Feng; pbonz...@redhat.com; g...@kernel.org; kvm@vger.kernel.org
> > Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> > Posted-Interrupts
> > 
> > On 11/25/2014 05:10 PM, Alex Williamson wrote:
> > > On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
> > >> On 11/25/2014 01:23 PM, Feng Wu wrote:
> > >>> This patch adds and documents a new attribute
> > >>> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> > group.
> > >>> This new attribute is used for VT-d Posted-Interrupts.
> > >>>
> > >>> When guest OS changes the interrupt configuration for an
> > >>> assigned device, such as, MSI/MSIx data/address fields,
> > >>> QEMU will use this IRQ attribute to tell KVM to update the
> > >>> related IRTE according the VT-d Posted-Interrrupts Specification,
> > >>> such as, the guest vector should be updated in the related IRTE.
> > >>>
> > >>> Signed-off-by: Feng Wu 
> > >>> ---
> > >>>  Documentation/virtual/kvm/devices/vfio.txt |9 +
> > >>>  include/uapi/linux/kvm.h   |   10 ++
> > >>>  2 files changed, 19 insertions(+), 0 deletions(-)
> > >>>
> > >>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > b/Documentation/virtual/kvm/devices/vfio.txt
> > >>> index f7aff29..39dee86 100644
> > >>> --- a/Documentation/virtual/kvm/devices/vfio.txt
> > >>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > >>> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has been
> > called to trigger the IRQ
> > >>>  or associate an eventfd to it. Unforwarding can only be called while 
> > >>> the
> > >>>  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> > condition is
> > >>>  not satisfied, the command returns an -EBUSY.
> > >>> +
> > >>> +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > mechanism to post
> > >>> +   the IRQ to guests.
> > >>> +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr
> > struct.
> > >>> +
> > >>> +When guest OS changes the interrupt configuration for an assigned
> > device,
> > >>> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ attribute
> > >>> +to tell KVM to update the related IRTE according the VT-d
> > Posted-Interrrupts
> > >>> +Specification, such as, the guest vector should be updated in the 
> > >>> related
> > IRTE.
> > >>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > >>> index a269a42..e5f86ad 100644
> > >>> --- a/include/uapi/linux/kvm.h
> > >>> +++ b/include/uapi/linux/kvm.h
> > >>> @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > >>>  #define  KVM_DEV_VFIO_DEVICE   2
> > >>>  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ  1
> > >>>  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ2
> > >>> +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ  3
> > >>>
> > >>>  enum kvm_device_type {
> > >>> KVM_DEV_TYPE_FSL_MPIC_20= 1,
> > >>> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > >>> __u32 gsi; /* gsi, ie. virtual IRQ number */
> > >>>  };
> > >>>
> > Hi Feng, Alex,
> > I am currently reworking my code to use something closer to this struct.
> > Would you agree with following changes?
> > >>> +struct kvm_posted_intr {
> > kvm_posted_irq
> 
> Hi Alex,
> 
> Do you mean changing the structure name to "kvm_posted_irq"? I am okay
> If you think this name is also suitable for ARM forwarded irq. Or we can find
> a more common name, such as "struct kvm_accel_irq", what is your opinion, 
> Alex?

I'd think something like struct kvm_vfio_dev_irq describes it fairly
well.

> > >>> +   __u32   argsz;
> > >>> +   __u32   fd; /* file descriptor of the VFIO device */
> > >>> +   __u32   index;  /* VFIO device IRQ index */
> > >>> +   __u32   start;
> > >>> +   __u32   count;
> > >>> +   int virq[0];/* gsi, ie. virtual IRQ number */
> > __u32 gsi[];
> 
> I think this change is okay to me. If Alex also agree, I will follow this in 
> the
> next post. 
> 
> > >>> +};
> > >> Hi Feng,
> > >>
> > >> This struct could be used by arm code too. If Alex agrees I could use
> > >> that one instead. We just need to find a common sensible name
> > >
> > > Yep, the interface might as well support batch setup.  The vfio code
> > > uses -1 for teardown if we want to avoid FORWARD vs UNFORWARD we could
> > > let the data in the structure define which operation to do.
> > 
> > In case we remove the unforward and use fd=1 to tear down, the virq=gsi
> > must uniquely identify the struct. For ARM I think this is true, we
> > cannot have several physical IRQ forwarded to the same GSI. I don't know
> > about posted irqs or other archs.

It makes more sense to me that fd is the real 

RE: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d Posted-Interrupts

2014-12-01 Thread Wu, Feng


> -Original Message-
> From: Alex Williamson [mailto:alex.william...@redhat.com]
> Sent: Tuesday, December 02, 2014 12:48 PM
> To: Wu, Feng
> Cc: Eric Auger; pbonz...@redhat.com; g...@kernel.org; kvm@vger.kernel.org
> Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> Posted-Interrupts
> 
> On Tue, 2014-12-02 at 02:08 +, Wu, Feng wrote:
> >
> > > -Original Message-
> > > From: Eric Auger [mailto:eric.au...@linaro.org]
> > > Sent: Monday, December 01, 2014 6:10 PM
> > > To: Alex Williamson
> > > Cc: Wu, Feng; pbonz...@redhat.com; g...@kernel.org;
> kvm@vger.kernel.org
> > > Subject: Re: [RFC PATCH v2 1/2] KVM: kvm-vfio: User API for VT-d
> > > Posted-Interrupts
> > >
> > > On 11/25/2014 05:10 PM, Alex Williamson wrote:
> > > > On Tue, 2014-11-25 at 16:01 +0100, Eric Auger wrote:
> > > >> On 11/25/2014 01:23 PM, Feng Wu wrote:
> > > >>> This patch adds and documents a new attribute
> > > >>> KVM_DEV_VFIO_DEVICE_POSTING_IRQ in KVM_DEV_VFIO_DEVICE
> > > group.
> > > >>> This new attribute is used for VT-d Posted-Interrupts.
> > > >>>
> > > >>> When guest OS changes the interrupt configuration for an
> > > >>> assigned device, such as, MSI/MSIx data/address fields,
> > > >>> QEMU will use this IRQ attribute to tell KVM to update the
> > > >>> related IRTE according the VT-d Posted-Interrrupts Specification,
> > > >>> such as, the guest vector should be updated in the related IRTE.
> > > >>>
> > > >>> Signed-off-by: Feng Wu 
> > > >>> ---
> > > >>>  Documentation/virtual/kvm/devices/vfio.txt |9 +
> > > >>>  include/uapi/linux/kvm.h   |   10 ++
> > > >>>  2 files changed, 19 insertions(+), 0 deletions(-)
> > > >>>
> > > >>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt
> > > b/Documentation/virtual/kvm/devices/vfio.txt
> > > >>> index f7aff29..39dee86 100644
> > > >>> --- a/Documentation/virtual/kvm/devices/vfio.txt
> > > >>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
> > > >>> @@ -42,3 +42,12 @@ activated before VFIO_DEVICE_SET_IRQS has
> been
> > > called to trigger the IRQ
> > > >>>  or associate an eventfd to it. Unforwarding can only be called while
> the
> > > >>>  signaling has been disabled with VFIO_DEVICE_SET_IRQS. If this
> > > condition is
> > > >>>  not satisfied, the command returns an -EBUSY.
> > > >>> +
> > > >>> +  KVM_DEV_VFIO_DEVICE_POSTING_IRQ: Use posted interrtups
> > > mechanism to post
> > > >>> +   the IRQ to guests.
> > > >>> +For this attribute, kvm_device_attr.addr points to a kvm_posted_intr
> > > struct.
> > > >>> +
> > > >>> +When guest OS changes the interrupt configuration for an assigned
> > > device,
> > > >>> +such as, MSI/MSIx data/address fields, QEMU will use this IRQ
> attribute
> > > >>> +to tell KVM to update the related IRTE according the VT-d
> > > Posted-Interrrupts
> > > >>> +Specification, such as, the guest vector should be updated in the
> related
> > > IRTE.
> > > >>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > >>> index a269a42..e5f86ad 100644
> > > >>> --- a/include/uapi/linux/kvm.h
> > > >>> +++ b/include/uapi/linux/kvm.h
> > > >>> @@ -949,6 +949,7 @@ struct kvm_device_attr {
> > > >>>  #define  KVM_DEV_VFIO_DEVICE 2
> > > >>>  #define   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ1
> > > >>>  #define   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ
>   2
> > > >>> +#define   KVM_DEV_VFIO_DEVICE_POSTING_IRQ3
> > > >>>
> > > >>>  enum kvm_device_type {
> > > >>>   KVM_DEV_TYPE_FSL_MPIC_20= 1,
> > > >>> @@ -973,6 +974,15 @@ struct kvm_arch_forwarded_irq {
> > > >>>   __u32 gsi; /* gsi, ie. virtual IRQ number */
> > > >>>  };
> > > >>>
> > > Hi Feng, Alex,
> > > I am currently reworking my code to use something closer to this struct.
> > > Would you agree with following changes?
> > > >>> +struct kvm_posted_intr {
> > > kvm_posted_irq
> >
> > Hi Alex,
> >
> > Do you mean changing the structure name to "kvm_posted_irq"? I am okay
> > If you think this name is also suitable for ARM forwarded irq. Or we can 
> > find
> > a more common name, such as "struct kvm_accel_irq", what is your opinion,
> Alex?
> 
> I'd think something like struct kvm_vfio_dev_irq describes it fairly
> well.

No problem! I will follow this in the next post.
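
Putting the two agreed changes together, the structure under discussion would
then look roughly like this (still only the proposal from this thread, not an
existing uapi definition):

#include <linux/types.h>

struct kvm_vfio_dev_irq {
	__u32	argsz;
	__u32	fd;	/* file descriptor of the VFIO device */
	__u32	index;	/* VFIO device IRQ index */
	__u32	start;
	__u32	count;
	__u32	gsi[];	/* gsi, ie. virtual IRQ number */
};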

> 
> > > >>> + __u32   argsz;
> > > >>> + __u32   fd; /* file descriptor of the VFIO device */
> > > >>> + __u32   index;  /* VFIO device IRQ index */
> > > >>> + __u32   start;
> > > >>> + __u32   count;
> > > >>> + int virq[0];/* gsi, ie. virtual IRQ number */
> > > __u32 gsi[];
> >
> > I think this change is okay to me. If Alex also agree, I will follow this 
> > in the
> > next post.
> >
> > > >>> +};
> > > >> Hi Feng,
> > > >>
> > > >> This struct could be used by arm code too. If Alex agrees I could use
> > > >> that one instead. We just need to find a common sensible name
> > > >
> 

[PATCH 2/2] KVM: vmx: Enable Intel xsaves for guest

2014-12-01 Thread Wanpeng Li
Expose Intel xsaves feature to guest.

Signed-off-by: Wanpeng Li 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 +-
 arch/x86/kvm/vmx.c  | 30 +-
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2896dbc..95dde42 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+   u64 ia32_xss;
 
/*
 * Paging state of the vcpu
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade..bdb79ef 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
+#define SECONDARY_EXEC_XSAVES  0x0010
 
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH   = 0x2023,
VMREAD_BITMAP   = 0x2026,
VMWRITE_BITMAP  = 0x2028,
+   XSS_EXIT_BIMTAP = 0x202C,
+   XSS_EXIT_BIMTAP_HIGH= 0x202D,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..b813bf9 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES  63
+#define EXIT_REASON_XRSTORS 64
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD,  "INVD" }, \
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
-   { EXIT_REASON_INVPCID,   "INVPCID" }
+   { EXIT_REASON_INVPCID,   "INVPCID" }, \
+   { EXIT_REASON_XSAVES,"XSAVES" }, \
+   { EXIT_REASON_XRSTORS,   "XRSTORS" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..383150e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,12 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
 }
 
+static inline bool cpu_has_xss_exit_bitmap(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_XSAVES;
+}
+
 static inline bool cpu_has_virtual_nmis(void)
 {
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1722,6 +1728,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
int i;
+   u64 host_xss;
 
if (vmx->host_state.loaded)
return;
@@ -1773,6 +1780,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
   vmx->guest_msrs[i].data,
   vmx->guest_msrs[i].mask);
+   rdmsrl(MSR_IA32_XSS, host_xss);
+   add_atomic_switch_msr(vmx, MSR_IA32_XSS, vcpu->arch.ia32_xss, host_xss);
 }
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2895,7 +2904,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-   SECONDARY_EXEC_SHADOW_VMCS;
+   SECONDARY_EXEC_SHADOW_VMCS |
+   SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4346,6 +4356,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
unsigned long a;
 #endif
int i;
+   u64 xss = 0;
 
/* I/O */
vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -4446,6 +4457,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
 
+   if (cpu_has_xss_exit_bitmap())
+   vmcs_write64(XSS_EXIT_BIMTAP, xss);
+
return 0;
 }
 
@@ -5334,6 +5348,18 @@ static int handle_xsetb

[PATCH 1/2] kvm: x86: revert mask out xsaves

2014-12-01 Thread Wanpeng Li
xsaves will be exported to the guest in the next patch, so revert the
patch that masked out xsaves.

Signed-off-by: Wanpeng Li 
---
 arch/x86/kvm/cpuid.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a4f5ac4..7af07571 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,10 +320,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
F(AVX512CD);
 
-   /* cpuid 0xD.1.eax */
-   const u32 kvm_supported_word10_x86_features =
-   F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1);
-
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
 
@@ -460,8 +456,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
entry->eax &= supported;
entry->edx &= supported >> 32;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-   if (!supported)
-   break;
 
for (idx = 1, i = 1; idx < 64; ++idx) {
u64 mask = ((u64)1 << idx);
@@ -469,9 +463,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
goto out;
 
do_cpuid_1_ent(&entry[i], function, idx);
-   if (idx == 1)
-   entry[i].eax &= 
kvm_supported_word10_x86_features;
-   else if (entry[i].eax == 0 || !(supported & mask))
+   if (entry[i].eax == 0 || !(supported & mask))
continue;
entry[i].flags |=
   KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-- 
1.9.1



Re: [CFT PATCH v2 0/2] KVM: support XSAVES usage in the host

2014-12-01 Thread Wanpeng Li
Hi Paolo,
On Tue, Nov 25, 2014 at 04:50:06PM +0200, Nadav Amit wrote:
[...]
>I am just worried that Wanpeng reported it fails, while I report it works...

I just got a real Skylake-client machine on hand; both of the two patches from 
you run normally, so it seems that the silly emulator which I used is buggy. 
Sorry for the false report. In addition, I sent out the xsaves enable patch 
for kvm; it works correctly with your patch 2/2.

Regards,
Wanpeng Li 


[PATCH v2 2/2] kvm: vmx: enable intel xsaves for guest

2014-12-01 Thread Wanpeng Li
Expose intel xsaves feature to guest.

Signed-off-by: Wanpeng Li 
---
v1 -> v2:
 *auto switch msr ia32_xss if this msr is present

 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/include/asm/vmx.h  |  3 +++
 arch/x86/include/uapi/asm/vmx.h |  6 +-
 arch/x86/kvm/vmx.c  | 35 ++-
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2896dbc..95dde42 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+   u64 ia32_xss;
 
/*
 * Paging state of the vcpu
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade..bdb79ef 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x0400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x1000
 #define SECONDARY_EXEC_SHADOW_VMCS  0x4000
+#define SECONDARY_EXEC_XSAVES  0x0010
 
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH   = 0x2023,
VMREAD_BITMAP   = 0x2026,
VMWRITE_BITMAP  = 0x2028,
+   XSS_EXIT_BIMTAP = 0x202C,
+   XSS_EXIT_BIMTAP_HIGH= 0x202D,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe..b813bf9 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES  63
+#define EXIT_REASON_XRSTORS 64
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD,  "INVD" }, \
{ EXIT_REASON_INVVPID,   "INVVPID" }, \
-   { EXIT_REASON_INVPCID,   "INVPCID" }
+   { EXIT_REASON_INVPCID,   "INVPCID" }, \
+   { EXIT_REASON_XSAVES,"XSAVES" }, \
+   { EXIT_REASON_XRSTORS,   "XRSTORS" }
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a951d8..b87b5b8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,12 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
 }
 
+static inline bool cpu_has_xss_exit_bitmap(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_XSAVES;
+}
+
 static inline bool cpu_has_virtual_nmis(void)
 {
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1773,6 +1779,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
   vmx->guest_msrs[i].data,
   vmx->guest_msrs[i].mask);
+
+   if (cpu_has_xsaves) {
+   u64 host_xss;
+
+   rdmsrl(MSR_IA32_XSS, host_xss);
+   add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+   vcpu->arch.ia32_xss, host_xss);
+   }
 }
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2895,7 +2909,8 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-   SECONDARY_EXEC_SHADOW_VMCS;
+   SECONDARY_EXEC_SHADOW_VMCS |
+   SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4346,6 +4361,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
unsigned long a;
 #endif
int i;
+   u64 xss = 0;
 
/* I/O */
vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -4446,6 +4462,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
 
+   if (cpu_has_xss_exit_bitmap())
+   vmcs_write64(XSS_EXIT_BIMTAP, xss);
+
return 0;
 }
 
@@ -5334,6 +5353,18 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
return 1;
