date:20140911

Re: [PATCH] KVM: EVENTFD: remove inclusion of irq.h

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 05:09, Christoffer Dall ha scritto:
> On Mon, Sep 01, 2014 at 12:11:19PM +0200, Paolo Bonzini wrote:
>> Il 01/09/2014 10:36, Eric Auger ha scritto:
>>> No more needed. irq.h would be void on ARM.
>>>
>>> Signed-off-by: Eric Auger 
>>>
>>> ---
>>>
>>> I don't think irq.h is needed anymore since Paul Mackerras' work. However
>>> I did not compile for all architectures.
>>> ---
>>>  virt/kvm/eventfd.c | 1 -
>>>  1 file changed, 1 deletion(-)
>>>
>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>> index 3c5981c..0c712a7 100644
>>> --- a/virt/kvm/eventfd.c
>>> +++ b/virt/kvm/eventfd.c
>>> @@ -36,7 +36,6 @@
>>>  #include 
>>>  #include 
>>>  
>>> -#include "irq.h"
>>>  #include "iodev.h"
>>>  
>>>  #ifdef CONFIG_HAVE_KVM_IRQFD
>>>
>>
>> Acked-by: Paolo Bonzini 
>>
>> Christoffer, please include this via the ARM tree, together with ARM
>> irqfd support.  Thanks,
>>
> Marc is dealing with the tree this week and the next so he can apply it
> to kvmarm/queue.  Do you want it to wait and go with the irqfd patch
> (which has dependencies not yet resolved) or should we just queue it?

As you prefer.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast

2014-09-11 Thread Zhang Haoyu

>> Currently, we call ioapic_service() immediately when we find the irq is still
>> active during eoi broadcast. But for real hardware, there's some delay 
>> between
>> the EOI writing and irq delivery (system bus latency?). So we need to emulate
>> this behavior. Otherwise, for a guest who haven't register a proper irq 
>> handler
>> , it would stay in the interrupt routine as this irq would be re-injected
>> immediately after guest enables interrupt. This would lead guest can't move
>> forward and may miss the possibility to get proper irq handler registered 
>> (one
>> example is windows guest resuming from hibernation).
>> 
>> As there's no way to differ the unhandled irq from new raised ones, this 
>> patch
>> solve this problems by scheduling a delayed work when the count of irq 
>> injected
>> during eoi broadcast exceeds a threshold value. After this patch, the guest 
>> can
>> move a little forward when there's no suitable irq handler in case it may
>> register one very soon and for guest who has a bad irq detection routine ( 
>> such
>> as note_interrupt() in linux ), this bad irq would be recognized soon as in 
>> the
>> past.
>> 
>> Cc: Michael S. Tsirkin 
>> Signed-off-by: Jason Wang 
>> Signed-off-by: Zhang Haoyu 
>> ---
>>  include/trace/events/kvm.h | 20 ++
>>  virt/kvm/ioapic.c  | 51 
>> --
>>  virt/kvm/ioapic.h  |  6 ++
>>  3 files changed, 75 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
>> index 908925a..b05f688 100644
>> --- a/include/trace/events/kvm.h
>> +++ b/include/trace/events/kvm.h
>> @@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
>>__entry->coalesced ? " (coalesced)" : "")
>>  );
>>  
>> +TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
>> +TP_PROTO(__u64 e),
>> +TP_ARGS(e),
>> +
>> +TP_STRUCT__entry(
>> +__field(__u64,  e   )
>> +),
>> +
>> +TP_fast_assign(
>> +__entry->e  = e;
>> +),
>> +
>> +TP_printk("dst %x vec=%u (%s|%s|%s%s)",
>> +  (u8)(__entry->e >> 56), (u8)__entry->e,
>> +  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
>> +  (__entry->e & (1<<11)) ? "logical" : "physical",
>> +  (__entry->e & (1<<15)) ? "level" : "edge",
>> +  (__entry->e & (1<<16)) ? "|masked" : "")
>> +);
>> +
>>  TRACE_EVENT(kvm_msi_set_irq,
>>  TP_PROTO(__u64 address, __u64 data),
>>  TP_ARGS(address, data),
>> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
>> index e8ce34c..a36c4c4 100644
>> --- a/virt/kvm/ioapic.c
>> +++ b/virt/kvm/ioapic.c
>> @@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, 
>> int irq_source_id)
>>  spin_unlock(&ioapic->lock);
>>  }
>>  
>> +static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
>> +{
>> +int i;
>> +struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
>> + eoi_inject.work);
>> +spin_lock(&ioapic->lock);
>> +for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>> +union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
>> +
>> +if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
>> +continue;
>> +
>> +if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
>> +ioapic_service(ioapic, i, false);
>> +}
>> +spin_unlock(&ioapic->lock);
>> +}
>> +
>>  static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
>>  struct kvm_ioapic *ioapic, int vector, int trigger_mode)
>>  {
>> @@ -435,8 +453,33 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu 
>> *vcpu,
>>  
>>  ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
>>  ent->fields.remote_irr = 0;
>> -if (ioapic->irr & (1 << i))
>> -ioapic_service(ioapic, i, false);
>> +if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
>
>The mask check is new - why now? You don't check it in the work handler
>as well.
>
The mask check avoids incrementing ioapic->irq_eoi[i] while this irq is 
masked; in that case the count should be zeroed.
There is no need to repeat the check in the work handler, because the check 
will be performed in ioapic_service().

>> +++ioapic->irq_eoi[i];
>> +if (ioapic->irq_eoi[i] == 
>> IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
>> +/*
>> + * Real hardware does not deliver the irq so
>> + * immediately during eoi broadcast, so we need
>> + * to emulate this behavior. Otherwise, for
>> + * guests who has not registered handler of a
>> + * level irq, this irq would be injected
>> + * immediately aft

Re: [PATCH v3] ARM: KVM: add irqfd support

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 10:53:04AM +0200, Eric Auger wrote:
>> This patch enables irqfd on ARM.
>>
>> irqfd framework enables to inject a virtual IRQ into a guest upon an
>> eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
>> a kvm_irqfd struct that associates a VM, an eventfd, a virtual IRQ number
>> (aka. the gsi). When an actor signals the eventfd (typically a VFIO
>> platform driver), the kvm irqfd subsystem injects the provided virtual
>> IRQ into the guest.
>>
>> Resamplefd also is supported for level sensitive interrupts, ie. the
>> user can provide another eventfd that is triggered when the completion
>> of the virtual IRQ (gsi) is detected by the GIC.
>>
>> The gsi must correspond to a shared peripheral interrupt (SPI), ie the
>> GIC interrupt ID is gsi+32.
>>
>> this patch enables CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD.
>> CONFIG_HAVE_KVM_IRQCHIP is removed. No IRQ routing table is used
>> (irqchip.c and irqcomm.c are not used).
>>
>> Both KVM_CAP_IRQFD & KVM_CAP_IRQFD_RESAMPLE capabilities are exposed
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> This patch series deprecates the previous series featuring GSI routing
>> (https://patches.linaro.org/32261/)
>>
>> The patch series has the following dependencies:
>> - arm/arm64: KVM: Various VGIC cleanups and improvements
>>   https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/009979.html
>> - "KVM: EVENTFD: remove inclusion of irq.h"
>>
>> All pieces can be found on git://git.linaro.org/people/eric.auger/linux.git
>> branch irqfd_norouting_integ_v3
>>
>> This work was tested with Calxeda Midway xgmac main interrupt with
>> qemu-system-arm and QEMU VFIO platform device.
>>
>> v2 -> v3:
>> - removal of irq.h from eventfd.c put in a separate patch to increase
>>   visibility
>> - properly expose KVM_CAP_IRQFD capability in arm.c
>> - remove CONFIG_HAVE_KVM_IRQCHIP meaningful only if irq_comm.c is used
>>
>> v1 -> v2:
>> - rebase on 3.17rc1
>> - move of the dist unlock in process_maintenance
>> - remove of dist lock in __kvm_vgic_sync_hwstate
>> - rewording of the commit message (add resamplefd reference)
>> - remove irq.h
>> ---
>>  Documentation/virtual/kvm/api.txt |  5 +++-
>>  arch/arm/include/uapi/asm/kvm.h   |  3 +++
>>  arch/arm/kvm/Kconfig  |  4 +--
>>  arch/arm/kvm/Makefile |  2 +-
>>  arch/arm/kvm/arm.c|  3 +++
>>  virt/kvm/arm/vgic.c   | 56 
>> ---
>>  6 files changed, 65 insertions(+), 8 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt 
>> b/Documentation/virtual/kvm/api.txt
>> index beae3fd..8118b12 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2204,7 +2204,7 @@ into the hash PTE second double word).
>>  4.75 KVM_IRQFD
>>  
>>  Capability: KVM_CAP_IRQFD
>> -Architectures: x86 s390
>> +Architectures: x86 s390 arm
>>  Type: vm ioctl
>>  Parameters: struct kvm_irqfd (in)
>>  Returns: 0 on success, -1 on error
>> @@ -2230,6 +2230,9 @@ Note that closing the resamplefd is not sufficient to 
>> disable the
>>  irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
>>  and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
>>  
>> +On ARM/arm64 the injected must be a shared peripheral interrupt (SPI).
>> +This means the programmed GIC interrupt ID is gsi+32.
>> +
> 
> See above comment.
Hi Christoffer,

sorry which comment do you refer to?  wrt your last comment do you
consider PPI injection support is a mandated feature for this patch to
be upstreamable?
> 
>>  4.76 KVM_PPC_ALLOCATE_HTAB
>>  
>>  Capability: KVM_CAP_PPC_ALLOC_HTAB
>> diff --git a/arch/arm/include/uapi/asm/kvm.h 
>> b/arch/arm/include/uapi/asm/kvm.h
>> index e6ebdd3..3034c66 100644
>> --- a/arch/arm/include/uapi/asm/kvm.h
>> +++ b/arch/arm/include/uapi/asm/kvm.h
>> @@ -194,6 +194,9 @@ struct kvm_arch_memory_slot {
>>  /* Highest supported SPI, from VGIC_NR_IRQS */
>>  #define KVM_ARM_IRQ_GIC_MAX 127
>>  
>> +/* One single KVM irqchip, ie. the VGIC */
>> +#define KVM_NR_IRQCHIPS  1
>> +
>>  /* PSCI interface */
>>  #define KVM_PSCI_FN_BASE0x95c1ba5e
>>  #define KVM_PSCI_FN(n)  (KVM_PSCI_FN_BASE + (n))
>> diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
>> index 466bd29..e519a40 100644
>> --- a/arch/arm/kvm/Kconfig
>> +++ b/arch/arm/kvm/Kconfig
>> @@ -24,6 +24,7 @@ config KVM
>>  select KVM_MMIO
>>  select KVM_ARM_HOST
>>  depends on ARM_VIRT_EXT && ARM_LPAE
>> +select HAVE_KVM_EVENTFD
>>  ---help---
>>Support hosting virtualized guest machines. You will also
>>need to select one or more of the processor modules below.
>> @@ -55,7 +56,7 @@ config KVM_ARM_MAX_VCPUS
>>  config KVM_ARM_VGIC
>>  bool "KVM support for Virtual GIC"
>>  depends on KVM_ARM_HOST && OF
>> -select HAVE_KVM_IRQCHIP
>> +select HAVE_KVM_IRQFD
>>

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
>> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
>> need to disable the IRQ anymore. In that mode, when the handler completes
> 
> add a comma after completes
Hi Christoffer,
ok
> 
>> the IRQ is not deactivated but only its priority is lowered.
>>
>> Some other actor (typically a guest) is supposed to deactivate the IRQ,
>> allowing at that time a new physical IRQ to hit.
>>
>> In virtualization use case, the physical IRQ is automatically completed
>> by the interrupt controller when the guest completes the corresponding
>> virtual IRQ.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
>> b/drivers/vfio/platform/vfio_platform_irq.c
>> index 6768508..1f851b2 100644
>> --- a/drivers/vfio/platform/vfio_platform_irq.c
>> +++ b/drivers/vfio/platform/vfio_platform_irq.c
>> @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
>> *dev_id)
>>  struct vfio_platform_irq *irq_ctx = dev_id;
>>  unsigned long flags;
>>  int ret = IRQ_NONE;
>> +struct irq_data *d;
>> +bool is_forwarded;
>>  
>>  spin_lock_irqsave(&irq_ctx->lock, flags);
>>  
>>  if (!irq_ctx->masked) {
>>  ret = IRQ_HANDLED;
>> +d = irq_get_irq_data(irq_ctx->hwirq);
>> +is_forwarded = irqd_irq_forwarded(d);
>>  
>> -if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED) {
>> +if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED &&
>> +!is_forwarded) {
>>  disable_irq_nosync(irq_ctx->hwirq);
>>  irq_ctx->masked = true;
>>  }
>> -- 
>> 1.9.1
>>
> It makes sense that these needs to be all controlled in the kernel, but
> I'm wondering if it would be cleaner / more correct to clear the
> AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
> this flag as long as the irq is forwarded?

If I am not wrong, even if the user sets AUTOMASKED, this info never is
exploited by the vfio platform driver. AUTOMASKED only is set internally
to the driver, on init, for level sensitive IRQs.

It seems to be the same on PCI (for INTx). I do not see anywhere the
user flag curectly copied into a local storage. But I prefer to be
careful ;-)

If confirmed, although the flag value is exposed in the user API, the
user set value never is exploited so this removes the need to check.

The forwarded IRQ modality being fully dynamic currently, I would then
need to update the irq_ctx->flags on each vfio_irq_handler call. I don't
know if it's better?

Best Regards

Eric


> 
> -Christoffer
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 5/9] KVM: KVM-VFIO: update user API to program forwarded IRQ

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:44PM +0200, Eric Auger wrote:
>> add new device group commands:
>> - KVM_DEV_VFIO_DEVICE_FORWARD_IRQ and
>>   KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ
>>
>> which enable to turn forwarded IRQ mode on/off.
>>
>> the kvm_arch_forwarded_irq struct embodies a forwarded IRQ
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v1 -> v2:
>> - struct kvm_arch_forwarded_irq moved from arch/arm/include/uapi/asm/kvm.h
>>   to include/uapi/linux/kvm.h
>>   also irq_index renamed into index and guest_irq renamed into gsi
>> - ASSIGN/DEASSIGN renamed into FORWARD/UNFORWARD
>> ---
>>  Documentation/virtual/kvm/devices/vfio.txt | 26 ++
>>  include/uapi/linux/kvm.h   |  9 +
>>  2 files changed, 35 insertions(+)
>>
>> diff --git a/Documentation/virtual/kvm/devices/vfio.txt 
>> b/Documentation/virtual/kvm/devices/vfio.txt
>> index ef51740..048baa0 100644
>> --- a/Documentation/virtual/kvm/devices/vfio.txt
>> +++ b/Documentation/virtual/kvm/devices/vfio.txt
>> @@ -13,6 +13,7 @@ VFIO-group is held by KVM.
>>  
>>  Groups:
>>KVM_DEV_VFIO_GROUP
>> +  KVM_DEV_VFIO_DEVICE
>>  
>>  KVM_DEV_VFIO_GROUP attributes:
>>KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking
>> @@ -20,3 +21,28 @@ KVM_DEV_VFIO_GROUP attributes:
>>  
>>  For each, kvm_device_attr.addr points to an int32_t file descriptor
>>  for the VFIO group.
>> +
>> +KVM_DEV_VFIO_DEVICE attributes:
>> +  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ
>> +  KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ
>> +
>> +For each, kvm_device_attr.addr points to a kvm_arch_forwarded_irq struct.
>> +This user API makes possible to create a special IRQ handling mode,
> 
>   KVM_DEV_VFIO_DEVICE_FORWARD_IRQ enables a special IRQ handling mode on
>   hardware that supports it,
OK
> 
>> +where KVM and a VFIO platform driver collaborate to improve IRQ
>> +handling performance.
>> +
>> +'fd represents the file descriptor of a valid VFIO device whose physical
> 
> fd is described out of context here.  Can you copy the struct definition
> into this document, perhaps right after the "For each, ..." line above.
yes sure
> 
>> +IRQ, referenced by its index, is injected into the VM guest irq (gsi).
>  as a virtual IRQ (specified
>by the gsi field) into the
>VM.
> 
>> +
>> +On FORWARD_IRQ, KVM-VFIO device programs:
>When setting the  KVM_DEV_VFIO_DEVICE_FORWARD_IRQ attribute, the
>KVM-VFIO device tells the host (or VFIO?) to not complete the
>physical IRQ, and instead ensures that KVM (or the VM) completes the
>physical IRQ.
> 
>> +- the host, to not complete the physical IRQ itself.
>> +- the GIC, to automatically complete the physical IRQ when the guest
>> +  completes the virtual IRQ.
> 
> and drop this bullet form.
ok
> 
>> +This avoids trapping the end-of-interrupt for level sensitive IRQ.
> 
> avoid this last line, it's specific to ARM.
ok
> 
>> +
>> +On UNFORWARD_IRQ, one returns to the mode where the host completes the
>When setting the KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ attribute, the
>host (VFIO?) will again complete the physical IRQ and KVM will not...
>  
>> +physical IRQ and the guest completes the virtual IRQ.
>> +
>> +It is up to the caller of this API to make sure the IRQ is not
>> +outstanding when the FORWARD/UNFORWARD is called. This could lead to
> 
> outstanding? can you be specific?
active? and I should add *physical* IRQ
> 
> don't refer to FORWARD/UNFORWARD, either refer to these attributes by
> their full name or use a clear reference in proper English.
ok
> 
>> +some inconsistency on who is going to complete the IRQ.
> 
> This sounds like the whole thing is fragile and if userspace doesn't do
> things right, IRQ handling of a piece of hardware is going to be
> inconsistent?  Is this the case?  If so, we need some stronger
> semantics.  If not, this should be rephrased.
Actually the KVM-VFIO device rejects any attempt to change the
forwarding mode if the physical IRQ is active. So I hope this is robust
and will change the explanation.

Thanks

Eric
> 
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index cf3a2ff..8cd7b0e 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -947,6 +947,12 @@ struct kvm_device_attr {
>>  __u64   addr;   /* userspace address of attr data */
>>  };
>>  
>> +struct kvm_arch_forwarded_irq {
>> +__u32 fd; /* file descriptor of the VFIO device */
>> +__u32 index; /* VFIO device IRQ index */
>> +__u32 gsi; /* gsi, ie. virtual IRQ number */
>> +};
>> +
>>  #define KVM_DEV_TYPE_FSL_MPIC_201
>>  #define KVM_DEV_TYPE_FSL_MPIC_422
>>  #define KVM_DEV_TYPE_XICS   3
>> @@ -954,6 +960,9 @@ struct kvm_device_attr {
>>  #define  KVM_DEV_VFIO_GROUP 1
>>  #define   KVM_DEV_VFIO_GROUP_ADD

Re: [RFC v2 6/9] VFIO: Extend external user API

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:45PM +0200, Eric Auger wrote:
>> New functions are added to be called from ARM KVM-VFIO device.
> 
> This commit message seems somewhat random.  This patch doesn't deal with
> anything ARM specific, it introduces some generic functions that allows
> users external to vfio itself to retrieve information about a vfio
> platform device.

Yes you're right.
> 
>>
>> - vfio_device_get_external_user enables to get a vfio device from
>>   its fd
>> - vfio_device_put_external_user puts the vfio device
>> - vfio_external_base_device returns the struct device*,
>>   useful to access the platform_device
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v1 -> v2:
>>
>> - vfio_external_get_base_device renamed into vfio_external_base_device
>> - vfio_external_get_type removed
>> ---
>>  drivers/vfio/vfio.c  | 24 
>>  include/linux/vfio.h |  3 +++
>>  2 files changed, 27 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
>> index 8e84471..282814e 100644
>> --- a/drivers/vfio/vfio.c
>> +++ b/drivers/vfio/vfio.c
>> @@ -1401,6 +1401,30 @@ void vfio_group_put_external_user(struct vfio_group 
>> *group)
>>  }
>>  EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
>>  
>> +struct vfio_device *vfio_device_get_external_user(struct file *filep)
>> +{
>> +struct vfio_device *vdev = filep->private_data;
>> +
>> +if (filep->f_op != &vfio_device_fops)
>> +return ERR_PTR(-EINVAL);
>> +
>> +vfio_device_get(vdev);
>> +return vdev;
>> +}
>> +EXPORT_SYMBOL_GPL(vfio_device_get_external_user);
>> +
>> +void vfio_device_put_external_user(struct vfio_device *vdev)
>> +{
>> +vfio_device_put(vdev);
>> +}
>> +EXPORT_SYMBOL_GPL(vfio_device_put_external_user);
>> +
>> +struct device *vfio_external_base_device(struct vfio_device *vdev)
>> +{
>> +return vdev->dev;
>> +}
>> +EXPORT_SYMBOL_GPL(vfio_external_base_device);
>> +
>>  int vfio_external_user_iommu_id(struct vfio_group *group)
>>  {
>>  return iommu_group_id(group->iommu_group);
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index ffe04ed..bd4b6cb 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,6 +99,9 @@ extern void vfio_group_put_external_user(struct vfio_group 
>> *group);
>>  extern int vfio_external_user_iommu_id(struct vfio_group *group);
>>  extern long vfio_external_check_extension(struct vfio_group *group,
>>unsigned long arg);
>> +extern struct vfio_device *vfio_device_get_external_user(struct file 
>> *filep);
>> +extern void vfio_device_put_external_user(struct vfio_device *vdev);
>> +extern struct device *vfio_external_base_device(struct vfio_device *vdev);
>>  
>>  struct pci_dev;
>>  #ifdef CONFIG_EEH
>> -- 
>> 1.9.1
>>
> Looks good to me,
> -Christoffer
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 7/9] KVM: KVM-VFIO: add new VFIO external API hooks

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:46PM +0200, Eric Auger wrote:
>> add functions that implement the gateway to the extended
> 
> Capital letter when beginning a new sentence.  Also the reference to
> 'the extended VFIO API' feels a bit weird.  Can't you make your commit
> message a little more descriptive of this patch, something along the
> lines of:
> 
> Provide wrapper functions that allows KVM-VFIO device code to get an
> external handle on a struct vfio_device based on a vfio device file
> descriptor.  We provide this through three new functions:
> 
> (assuming I got this right).
sure
> 
> 
> 
>> external VFIO API:
>> - kvm_vfio_device_get_external_user
>> - kvm_vfio_device_put_external_user
>> - kvm_vfio_external_base_device
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v1 -> v2:
>> - kvm_vfio_external_get_base_device renamed into
>>   kvm_vfio_external_base_device
>> - kvm_vfio_external_get_type removed
>> ---
>>  arch/arm/include/asm/kvm_host.h |  5 +
>>  virt/kvm/vfio.c | 45 
>> +
>>  2 files changed, 50 insertions(+)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h 
>> b/arch/arm/include/asm/kvm_host.h
>> index 6dfb404..1aee6bb 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -171,6 +171,11 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long 
>> hva, pte_t pte);
>>  unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
>>  int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
>>  
>> +struct vfio_device;
>> +struct vfio_device *kvm_vfio_device_get_external_user(struct file *filep);
>> +void kvm_vfio_device_put_external_user(struct vfio_device *vdev);
>> +struct device *kvm_vfio_external_base_device(struct vfio_device *vdev);
>> +
>>  /* We do not have shadow page tables, hence the empty hooks */
>>  static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva)
>>  {
>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
>> index ba1a93f..76dc7a1 100644
>> --- a/virt/kvm/vfio.c
>> +++ b/virt/kvm/vfio.c
>> @@ -59,6 +59,51 @@ static void kvm_vfio_group_put_external_user(struct 
>> vfio_group *vfio_group)
>>  symbol_put(vfio_group_put_external_user);
>>  }
>>  
>> +struct vfio_device *kvm_vfio_device_get_external_user(struct file *filep)
>> +{
>> +struct vfio_device *vdev;
>> +struct vfio_device *(*fn)(struct file *);
>> +
>> +fn = symbol_get(vfio_device_get_external_user);
>> +if (!fn)
>> +return ERR_PTR(-EINVAL);
>> +
>> +vdev = fn(filep);
>> +
>> +symbol_put(vfio_device_get_external_user);
>> +
>> +return vdev;
>> +}
>> +
>> +void kvm_vfio_device_put_external_user(struct vfio_device *vdev)
>> +{
>> +void (*fn)(struct vfio_device *);
>> +
>> +fn = symbol_get(vfio_device_put_external_user);
>> +if (!fn)
>> +return;
>> +
>> +fn(vdev);
>> +
>> +symbol_put(vfio_device_put_external_user);
>> +}
>> +
>> +struct device *kvm_vfio_external_base_device(struct vfio_device *vdev)
>> +{
>> +struct device *(*fn)(struct vfio_device *);
>> +struct device *dev;
>> +
>> +fn = symbol_get(vfio_external_base_device);
>> +if (!fn)
>> +return NULL;
>> +
>> +dev = fn(vdev);
>> +
>> +symbol_put(vfio_external_base_device);
>> +
>> +return dev;
>> +}
>> +
>>  static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
>>  {
>>  long (*fn)(struct vfio_group *, unsigned long);
>> -- 
>> 1.9.1
>>
> 
> otherwise looks good to me!
> -Christoffer
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast

2014-09-11 Thread Zhang Haoyu

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI writing and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, a guest that hasn't registered a proper irq handler
would stay in the interrupt routine, as this irq would be re-injected
immediately after guest enables interrupt. This would lead guest can't move
forward and may miss the possibility to get proper irq handler registered (one
example is windows guest resuming from hibernation).

As there's no way to differ the unhandled irq from new raised ones, this patch
solve this problems by scheduling a delayed work when the count of irq injected
during eoi broadcast exceeds a threshold value. After this patch, the guest can
move a little forward when there's no suitable irq handler in case it may
register one very soon and for guest who has a bad irq detection routine ( such
as note_interrupt() in linux ), this bad irq would be recognized soon as in the
past.

Cc: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
Signed-off-by: Zhang Haoyu 
---
 include/trace/events/kvm.h | 20 +++
 virt/kvm/ioapic.c  | 50 --
 virt/kvm/ioapic.h  |  6 ++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..ab679c3 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+   TP_PROTO(__u64 e),
+   TP_ARGS(e),
+
+   TP_STRUCT__entry(
+   __field(__u64,  e   )
+   ),
+
+   TP_fast_assign(
+   __entry->e  = e;
+   ),
+
+   TP_printk("dst %x vec=%u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..8e1dc67 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
irq_source_id)
spin_unlock(&ioapic->lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+   int i;
+   struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+eoi_inject.work);
+   spin_lock(&ioapic->lock);
+   for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+   union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+   if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+   continue;
+
+   if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+   ioapic_service(ioapic, i, false);
+   }
+   spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,32 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
-   if (ioapic->irr & (1 << i))
-   ioapic_service(ioapic, i, false);
+   if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+   ++ioapic->irq_eoi[i];
+   if (ioapic->irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+   /*
+* Real hardware does not deliver the irq so
+* immediately during eoi broadcast, so we need
+* to emulate this behavior. Otherwise, for
+* guests who has not registered handler of a
+* level irq, this irq would be injected
+* immediately after guest enables interrupt
+* (which happens usually at the end of the
+* common interrupt routine). This would lead
+* guest can't move forward and may miss the
+* possibility to get proper irq handler
+* registered. So we need to give some breath to
+* guest.
+*/
+

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 08:01, Jan Kiszka ha scritto:
> That issue is limited to the dead ia64 architecture, no other builds
> ioapic.c (besides x86, of course). Maybe finally remove KVM support for
> that arch instead of bloating otherwise correct code?

Yes, we should.  I've just been too busy to do it.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 1/7] kvm: Use APIC_DEFAULT_PHYS_BASE macro as the apic access page address.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> We have APIC_DEFAULT_PHYS_BASE defined as 0xfee00000, which is also the
> address of the apic access page. So use this macro.
> 
> Signed-off-by: Tang Chen 
> Reviewed-by: Gleb Natapov 
> ---
>  arch/x86/kvm/svm.c | 3 ++-
>  arch/x86/kvm/vmx.c | 6 +++---
>  2 files changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ddf7427..1d941ad 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm 
> *kvm, unsigned int id)
>   svm->asid_generation = 0;
>   init_vmcb(svm);
>  
> - svm->vcpu.arch.apic_base = 0xfee0 | MSR_IA32_APICBASE_ENABLE;
> + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
> +MSR_IA32_APICBASE_ENABLE;
>   if (kvm_vcpu_is_bsp(&svm->vcpu))
>   svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index bfe11cf..4b80ead 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3999,13 +3999,13 @@ static int alloc_apic_access_page(struct kvm *kvm)
>   goto out;
>   kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
>   kvm_userspace_mem.flags = 0;
> - kvm_userspace_mem.guest_phys_addr = 0xfee0ULL;
> + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
>   kvm_userspace_mem.memory_size = PAGE_SIZE;
>   r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
>   if (r)
>   goto out;
>  
> - page = gfn_to_page(kvm, 0xfee00);
> + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
>   if (is_error_page(page)) {
>   r = -EFAULT;
>   goto out;
> @@ -4477,7 +4477,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
>  
>   vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
>   kvm_set_cr8(&vmx->vcpu, 0);
> - apic_base_msr.data = 0xfee0 | MSR_IA32_APICBASE_ENABLE;
> + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
>   if (kvm_vcpu_is_bsp(&vmx->vcpu))
>   apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
>   apic_base_msr.host_initiated = true;
> 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [GIT PULL 0/7] KVM: s390: Fixes and features for next (3.18)

2014-09-11 Thread Paolo Bonzini

Il 10/09/2014 15:53, Christian Borntraeger ha scritto:
> Paolo,
> 
> please have a look at the next bunch of s390 patches and consider to apply:
> 
> The following changes since commit fd2752352bbc98850d83b5448a288d8991590317:
> 
>   KVM: x86: use guest maxphyaddr to check MTRR values (2014-08-29 18:56:24 
> +0200)
> 
> are available in the git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git  
> tags/kvm-s390-next-20140910
> 
> for you to fetch changes up to bfac1f59a1afb13a3cf225bffd04be99a49c51a6:
> 
>   KVM: s390/interrupt: remove double assignment (2014-09-10 12:19:45 +0200)
> 
> 
> KVM: s390: Fixes and features for next (3.18)
> 
> 1. Crypto/CPACF support: To enable the MSA4 instructions we have to
>provide a common control structure for each SIE control block
> 2. Two cleanups found by a static code checker: one redundant assignment
>and one useless if
> 3. Fix the page handling of the diag10 ballooning interface. If the
>guest freed the pages at absolute 0 some checks and frees were
>incorrect
> 4. Limit guests to 16TB
> 5. Add __must_check to interrupt injection code
> 
> 
> Christian Borntraeger (6):
>   KVM: s390: add __must_check to interrupt deliver functions
>   KVM: s390: Limit guest size to 16TB
>   KVM: s390: unintended fallthrough for external call
>   KVM: s390: get rid of constant condition in ipte_unlock_simple
>   KVM: s390/cmm: Fix prefix handling for diag 10 balloon
>   KVM: s390/interrupt: remove double assignment
> 
> Tony Krowiak (1):
>   KVM: CPACF: Enable MSA4 instructions for kvm guest
> 
>  arch/s390/include/asm/kvm_host.h | 14 +-
>  arch/s390/kvm/diag.c | 26 ++
>  arch/s390/kvm/gaccess.c  |  3 +--
>  arch/s390/kvm/interrupt.c| 14 +++---
>  arch/s390/kvm/kvm-s390.c | 35 ++-
>  arch/s390/kvm/kvm-s390.h |  2 +-
>  6 files changed, 74 insertions(+), 20 deletions(-)
> 

Thanks, applying to kvm/queue.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 2/7] kvm: Remove ept_identity_pagetable from struct kvm_arch.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> kvm_arch->ept_identity_pagetable holds the ept identity pagetable page. But
> it is never used to refer to the page at all.
> 
> In vcpu initialization, it indicates two things:
> 1. indicates if ept page is allocated
> 2. indicates if a memory slot for identity page is initialized
> 
> Actually, kvm_arch->ept_identity_pagetable_done is enough to tell if the ept
> identity pagetable is initialized. So we can remove ept_identity_pagetable.
> 
> NOTE: In the original code, ept identity pagetable page is pinned in memory.
>   As a result, it cannot be migrated/hot-removed. After this patch, since
>   kvm_arch->ept_identity_pagetable is removed, ept identity pagetable page
>   is no longer pinned in memory. And it can be migrated/hot-removed.
> 
> Signed-off-by: Tang Chen 
> Reviewed-by: Gleb Natapov 
> ---
>  arch/x86/include/asm/kvm_host.h |  1 -
>  arch/x86/kvm/vmx.c  | 50 
> -
>  arch/x86/kvm/x86.c  |  2 --
>  3 files changed, 25 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7c492ed..35171c7 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -580,7 +580,6 @@ struct kvm_arch {
>  
>   gpa_t wall_clock;
>  
> - struct page *ept_identity_pagetable;
>   bool ept_identity_pagetable_done;
>   gpa_t ept_identity_map_addr;
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 4b80ead..953d529 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -743,6 +743,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment 
> *var);
>  static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
>  static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
>  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
> +static int alloc_identity_pagetable(struct kvm *kvm);
>  
>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> @@ -3938,21 +3939,27 @@ out:
>  
>  static int init_rmode_identity_map(struct kvm *kvm)
>  {
> - int i, idx, r, ret;
> + int i, idx, r, ret = 0;
>   pfn_t identity_map_pfn;
>   u32 tmp;
>  
>   if (!enable_ept)
>   return 1;
> - if (unlikely(!kvm->arch.ept_identity_pagetable)) {
> - printk(KERN_ERR "EPT: identity-mapping pagetable "
> - "haven't been allocated!\n");
> - return 0;
> +
> + /* Protect kvm->arch.ept_identity_pagetable_done. */
> + mutex_lock(&kvm->slots_lock);
> +
> + if (likely(kvm->arch.ept_identity_pagetable_done)) {
> + ret = 1;
> + goto out2;
>   }
> - if (likely(kvm->arch.ept_identity_pagetable_done))
> - return 1;
> - ret = 0;
> +
>   identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
> +
> + r = alloc_identity_pagetable(kvm);
> + if (r)
> + goto out2;
> +
>   idx = srcu_read_lock(&kvm->srcu);
>   r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
>   if (r < 0)
> @@ -3970,6 +3977,9 @@ static int init_rmode_identity_map(struct kvm *kvm)
>   ret = 1;
>  out:
>   srcu_read_unlock(&kvm->srcu, idx);
> +
> +out2:
> + mutex_unlock(&kvm->slots_lock);
>   return ret;
>  }
>  
> @@ -4019,31 +4029,23 @@ out:
>  
>  static int alloc_identity_pagetable(struct kvm *kvm)
>  {
> - struct page *page;
> + /*
> +  * In init_rmode_identity_map(), kvm->arch.ept_identity_pagetable_done
> +  * is checked before calling this function and set to true after the
> +  * calling. The access to kvm->arch.ept_identity_pagetable_done should
> +  * be protected by kvm->slots_lock.
> +  */

Not just the access, also the call to this function must be protected by
the lock.

So replace this comment with just

/* Called with kvm->slots_lock held.  */

and add a BUG_ON that ept_identity_pagetable_done is false.

Paolo

>   struct kvm_userspace_memory_region kvm_userspace_mem;
>   int r = 0;
>  
> - mutex_lock(&kvm->slots_lock);
> - if (kvm->arch.ept_identity_pagetable)
> - goto out;
>   kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
>   kvm_userspace_mem.flags = 0;
>   kvm_userspace_mem.guest_phys_addr =
>   kvm->arch.ept_identity_map_addr;
>   kvm_userspace_mem.memory_size = PAGE_SIZE;
>   r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
> - if (r)
> - goto out;
>  
> - page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
> - if (is_error_page(page)) {
> - r = -EFAULT;
> - goto out;
> - }
> -
> - kvm->arch.ept_identity_pagetable = page;
> -out:
> - mutex_unlock(&kvm->slots_lock);
>   return r;
>  }
>  
> @@ -7643,8 +7645,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm 
> *kvm, un

Re: [PATCH v5 3/7] kvm: Make init_rmode_identity_map() return 0 on success.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> In init_rmode_identity_map(), there are two variables indicating the return
> value, r and ret, and it returns 0 on error, 1 on success. The function
> is only called by vmx_create_vcpu(), and r is redundant.
> 
> This patch removes the redundant variable r, and makes
> init_rmode_identity_map() return 0 on success, -errno on failure.
> 
> Signed-off-by: Tang Chen 
> ---
>  arch/x86/kvm/vmx.c | 25 +++--
>  1 file changed, 11 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 953d529..63c4c3e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3939,45 +3939,42 @@ out:
>  
>  static int init_rmode_identity_map(struct kvm *kvm)
>  {
> - int i, idx, r, ret = 0;
> + int i, idx, ret = 0;
>   pfn_t identity_map_pfn;
>   u32 tmp;
>  
>   if (!enable_ept)
> - return 1;
> + return 0;
>  
>   /* Protect kvm->arch.ept_identity_pagetable_done. */
>   mutex_lock(&kvm->slots_lock);
>  
> - if (likely(kvm->arch.ept_identity_pagetable_done)) {
> - ret = 1;
> + if (likely(kvm->arch.ept_identity_pagetable_done))
>   goto out2;
> - }
>  
>   identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
>  
> - r = alloc_identity_pagetable(kvm);
> - if (r)
> + ret = alloc_identity_pagetable(kvm);
> + if (ret)
>   goto out2;
>  
>   idx = srcu_read_lock(&kvm->srcu);
> - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
> - if (r < 0)
> + ret = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
> + if (ret)
>   goto out;
>   /* Set up identity-mapping pagetable for EPT in real mode */
>   for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
>   tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
>   _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
> - r = kvm_write_guest_page(kvm, identity_map_pfn,
> + ret = kvm_write_guest_page(kvm, identity_map_pfn,
>   &tmp, i * sizeof(tmp), sizeof(tmp));
> - if (r < 0)
> + if (ret)
>   goto out;
>   }
>   kvm->arch.ept_identity_pagetable_done = true;
> - ret = 1;
> +
>  out:
>   srcu_read_unlock(&kvm->srcu, idx);
> -
>  out2:
>   mutex_unlock(&kvm->slots_lock);
>   return ret;
> @@ -7645,7 +7642,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm 
> *kvm, unsigned int id)
>   kvm->arch.ept_identity_map_addr =
>   VMX_EPT_IDENTITY_PAGETABLE_ADDR;
>   err = -ENOMEM;
> - if (!init_rmode_identity_map(kvm))
> + if (init_rmode_identity_map(kvm))

Please add "< 0" here.  I would also consider setting err to the return
value of init_rmode_identity_map, and initializing it to -ENOMEM only
after the "if".

Paolo

>   goto free_vmcs;

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Jan Kiszka

On 2014-09-11 11:08, Paolo Bonzini wrote:
> Il 11/09/2014 08:01, Jan Kiszka ha scritto:
>> That issue is limited to the dead ia64 architecture, no other builds
>> ioapic.c (besides x86, of course). Maybe finally remove KVM support for
>> that arch instead of bloating otherwise correct code?
> 
> Yes, we should.  I've just been too busy to do it.

As I enjoy creating large negative diffs :), I would almost do it - if I
had a test environment for IA64...

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> apic access page is pinned in memory. As a result, it cannot be 
> migrated/hot-removed.
> Actually, it is not necessary to be pinned.
> 
> The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When
> the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the
> corresponding ept entry. This patch introduces a new vcpu request named
> KVM_REQ_APIC_PAGE_RELOAD, makes this request to all the vcpus at this time,
> and forces all the vcpus to exit the guest; they re-enter the guest only
> after updating the VMCS APIC_ACCESS_ADDR pointer to the new apic access page
> address and updating kvm->arch.apic_access_page to the new page.
> 
> Signed-off-by: Tang Chen 
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/svm.c  |  6 ++
>  arch/x86/kvm/vmx.c  |  6 ++
>  arch/x86/kvm/x86.c  | 15 +++
>  include/linux/kvm_host.h|  2 ++
>  virt/kvm/kvm_main.c | 12 
>  6 files changed, 42 insertions(+)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 35171c7..514183e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -739,6 +739,7 @@ struct kvm_x86_ops {
>   void (*hwapic_isr_update)(struct kvm *kvm, int isr);
>   void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>   void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
> + void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa);
>   void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
>   void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 1d941ad..f2eacc4 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3619,6 +3619,11 @@ static void svm_set_virtual_x2apic_mode(struct 
> kvm_vcpu *vcpu, bool set)
>   return;
>  }
>  
> +static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
> +{
> + return;
> +}
> +
>  static int svm_vm_has_apicv(struct kvm *kvm)
>  {
>   return 0;
> @@ -4373,6 +4378,7 @@ static struct kvm_x86_ops svm_x86_ops = {
>   .enable_irq_window = enable_irq_window,
>   .update_cr8_intercept = update_cr8_intercept,
>   .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
> + .set_apic_access_page_addr = svm_set_apic_access_page_addr,
>   .vm_has_apicv = svm_vm_has_apicv,
>   .load_eoi_exitmap = svm_load_eoi_exitmap,
>   .hwapic_isr_update = svm_hwapic_isr_update,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 63c4c3e..da6d55d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
> kvm_vcpu *vcpu, bool set)
>   vmx_set_msr_bitmap(vcpu);
>  }
>  
> +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
> +{
> + vmcs_write64(APIC_ACCESS_ADDR, hpa);

This has to be guarded by "if (!is_guest_mode(vcpu))".

> +}
> +
>  static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
>  {
>   u16 status;
> @@ -8910,6 +8915,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
>   .enable_irq_window = enable_irq_window,
>   .update_cr8_intercept = update_cr8_intercept,
>   .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
> + .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
>   .vm_has_apicv = vmx_vm_has_apicv,
>   .load_eoi_exitmap = vmx_load_eoi_exitmap,
>   .hwapic_irr_update = vmx_hwapic_irr_update,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e05bd58..96f4188 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5989,6 +5989,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
>   kvm_apic_update_tmr(vcpu, tmr);
>  }
>  
> +static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
> +{
> + /*
> +  * apic access page could be migrated. When the page is being migrated,
> +  * GUP will wait till the migrate entry is replaced with the new pte
> +  * entry pointing to the new page.
> +  */
> + vcpu->kvm->arch.apic_access_page = gfn_to_page(vcpu->kvm,
> + APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> + kvm_x86_ops->set_apic_access_page_addr(vcpu->kvm,
> + page_to_phys(vcpu->kvm->arch.apic_access_page));
> +}
> +
>  /*
>   * Returns 1 to let __vcpu_run() continue the guest execution loop without
>   * exiting to the userspace.  Otherwise, the value will be returned to the
> @@ -6049,6 +6062,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   kvm_deliver_pmi(vcpu);
>   if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
>   vcpu_scan_ioapic(vcpu);
> + if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))

Re: [PATCH v5 5/7] kvm, mem-hotplug: Reload L1's apic access page on migration when L2 is running.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> This patch only handles the "L1 and L2 vm share one apic access page" situation.
> 
> When L1 vm is running, if the shared apic access page is migrated, 
> mmu_notifier will
> request all vcpus to exit to L0, and reload apic access page physical address 
> for
> all the vcpus' vmcs (which is done by patch 5/6). And when it enters L2 vm, 
> L2's vmcs
> will be updated in prepare_vmcs02() called by nested_vm_run(). So we need to 
> do
> nothing.
> 
> When L2 vm is running, if the shared apic access page is migrated, 
> mmu_notifier will
> request all vcpus to exit to L0, and reload apic access page physical address 
> for
> all L2 vmcs. And this patch requests apic access page reload in L2->L1 vmexit.
> 
> Signed-off-by: Tang Chen 
> ---
>  arch/x86/kvm/vmx.c  | 7 +++
>  virt/kvm/kvm_main.c | 1 +
>  2 files changed, 8 insertions(+)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index da6d55d..e7704b2 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -8796,6 +8796,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
> u32 exit_reason,
>   }
>  
>   /*
> +  * Do not call kvm_reload_apic_access_page() because we are now
> +  * running, mmu_notifier will force to reload the page's hpa for L2
> +  * vmcs. Need to reload it for L1 before entering L1.
> +  */
> + kvm_reload_apic_access_page(vcpu->kvm);

That would kill performance for large L1 hosts.
vcpu_reload_apic_access_page should be enough (please rename it to
kvm_vcpu_reload_apic_access_page in patch 4, and export it).

Paolo

> + /*
>* Exiting from L2 to L1, we're now back to L1 which thinks it just
>* finished a VMLAUNCH or VMRESUME instruction, so we need to set the
>* success or failure flag accordingly.
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d8280de..784127e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -214,6 +214,7 @@ void kvm_reload_apic_access_page(struct kvm *kvm)
>  {
>   make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
>  }
> +EXPORT_SYMBOL_GPL(kvm_reload_apic_access_page);
>  
>  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  {
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Gonglei (Arei)

> Subject: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery
> duringeoi broadcast
> 
> Currently, we call ioapic_service() immediately when we find the irq is still
> active during eoi broadcast. But for real hardware, there's some delay between
> the EOI writing and irq delivery (system bus latency?). So we need to emulate
> this behavior. Otherwise, for a guest who haven't register a proper irq 
> handler
> , it would stay in the interrupt routine as this irq would be re-injected
> immediately after guest enables interrupt. This would lead guest can't move
> forward and may miss the possibility to get proper irq handler registered (one
> example is windows guest resuming from hibernation).
> 
> As there's no way to differ the unhandled irq from new raised ones, this patch
> solve this problems by scheduling a delayed work when the count of irq 
> injected
> during eoi broadcast exceeds a threshold value. After this patch, the guest 
> can
> move a little forward when there's no suitable irq handler in case it may
> register one very soon and for guest who has a bad irq detection routine ( 
> such
> as note_interrupt() in linux ), this bad irq would be recognized soon as in 
> the
> past.
> 
> Cc: Michael S. Tsirkin 
> Signed-off-by: Jason Wang 
> Signed-off-by: Zhang Haoyu 
> ---
>  include/trace/events/kvm.h | 20 +++
>  virt/kvm/ioapic.c  | 50
> --
>  virt/kvm/ioapic.h  |  6 ++
>  3 files changed, 74 insertions(+), 2 deletions(-)
> 
If this is a new version, please add a v2/v3 suffix to the subject and
describe what changed between the versions.

You can get more information from:
http://wiki.qemu.org/Contribute/SubmitAPatch

Best regards,
-Gonglei
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 7/7] kvm, mem-hotplug: Unpin and remove nested_vmx->apic_access_page.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> Just like we removed kvm_arch->apic_access_page, nested_vmx->apic_access_page
> becomes useless for the same reason. This patch removes 
> nested_vmx->apic_access_page,
> and use gfn_to_page() to pin it in memory when we need it, and unpin it after 
> then.
> 
> Signed-off-by: Tang Chen 
> ---
>  arch/x86/kvm/vmx.c | 31 +--
>  1 file changed, 9 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 058c373..4aa73cb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -374,11 +374,6 @@ struct nested_vmx {
>   u64 vmcs01_tsc_offset;
>   /* L2 must run next, and mustn't decide to exit to L1. */
>   bool nested_run_pending;
> - /*
> -  * Guest pages referred to in vmcs02 with host-physical pointers, so
> -  * we must keep them pinned while L2 runs.
> -  */
> - struct page *apic_access_page;
>   u64 msr_ia32_feature_control;
>  
>   struct hrtimer preemption_timer;
> @@ -6154,11 +6149,6 @@ static void free_nested(struct vcpu_vmx *vmx)
>   nested_release_vmcs12(vmx);
>   if (enable_shadow_vmcs)
>   free_vmcs(vmx->nested.current_shadow_vmcs);
> - /* Unpin physical memory we referred to in current vmcs02 */
> - if (vmx->nested.apic_access_page) {
> - nested_release_page(vmx->nested.apic_access_page);
> - vmx->nested.apic_access_page = 0;
> - }
>  
>   nested_free_all_saved_vmcss(vmx);
>  }
> @@ -7983,28 +7973,31 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
> struct vmcs12 *vmcs12)
>   exec_control |= vmcs12->secondary_vm_exec_control;
>  
>   if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
> + struct page *page;
>   /*
>* Translate L1 physical address to host physical
>* address for vmcs02. Keep the page pinned, so this
>* physical address remains valid. We keep a reference
>* to it so we can release it later.
>*/
> - if (vmx->nested.apic_access_page) /* shouldn't happen */
> - 
> nested_release_page(vmx->nested.apic_access_page);
> - vmx->nested.apic_access_page =
> - nested_get_page(vcpu, vmcs12->apic_access_addr);
> + page = nested_get_page(vcpu, vmcs12->apic_access_addr);
>   /*
>* If translation failed, no matter: This feature asks
>* to exit when accessing the given address, and if it
>* can never be accessed, this feature won't do
>* anything anyway.
>*/
> - if (!vmx->nested.apic_access_page)
> + if (!page)
>   exec_control &=
> ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>   else
>   vmcs_write64(APIC_ACCESS_ADDR,
> -   page_to_phys(vmx->nested.apic_access_page));
> +  page_to_phys(page));
> + /*
> +  * Do not pin nested vm's apic access page in memory so
> +  * that memory hotplug process is able to migrate it.
> +  */
> + put_page(page);
>   } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
>   struct page *page = gfn_to_page(vmx->vcpu.kvm,
>   APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> @@ -8807,12 +8800,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
> u32 exit_reason,
>   /* This is needed for same reason as it was needed in prepare_vmcs02 */
>   vmx->host_rsp = 0;
>  
> - /* Unpin physical memory we referred to in vmcs02 */
> - if (vmx->nested.apic_access_page) {
> - nested_release_page(vmx->nested.apic_access_page);
> - vmx->nested.apic_access_page = 0;
> - }
> -
>   /*
>* Do not call kvm_reload_apic_access_page() because we are now
>* running, mmu_notifier will force to reload the page's hpa for L2
> 

This patch is not against the latest KVM tree.  The call to
nested_get_page is now in nested_get_vmcs12_pages, and you have to
handle virtual_apic_page in a similar manner.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 6/7] kvm, mem-hotplug: Unpin and remove kvm_arch->apic_access_page.

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 07:38, Tang Chen ha scritto:
> + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
> + struct page *page = gfn_to_page(vmx->vcpu.kvm,
> + APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
> + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
> + /*
> +  * Do not pin apic access page in memory so that memory
> +  * hotplug process is able to migrate it.
> +  */
> + put_page(page);
> + }

Please reuse vcpu_reload_apic_access_page here, too.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: ioapic: add check for RTC_GSI

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 11:19, Jan Kiszka ha scritto:
> On 2014-09-11 11:08, Paolo Bonzini wrote:
>> Il 11/09/2014 08:01, Jan Kiszka ha scritto:
>>> That issue is limited to the dead ia64 architecture, no other builds
>>> ioapic.c (besides x86, of course). Maybe finally remove KVM support for
>>> that arch instead of bloating otherwise correct code?
>>
>> Yes, we should.  I've just been too busy to do it.
> 
> As I enjoy creating large negative diffs :), I would almost do it - if I
> had a test environment for IA64...

I have one. :)

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Eric Auger

On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
>> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
>>
>> This is a new control channel which enables KVM to cooperate with
>> viable VFIO devices.
>>
>> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
>> in addition to a list of groups (kvm_vfio_group). The new
>> infrastructure enables to check the validity of the VFIO device
>> file descriptor, get and hold a reference to it.
>>
>> The first concrete implemented command is IRQ forward control:
>> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
>>
>> It consists in programming the VFIO driver and KVM in a consistent manner
>> so that an optimized IRQ injection/completion is set up. Each
>> kvm_vfio_device holds a list of forwarded IRQ. When putting a
>> kvm_vfio_device, the implementation makes sure the forwarded IRQs
>> are set again in the normal handling state (non forwarded).
> 
> 'putting a kvm_vfio_device' sounds to me like you're golf'ing :)
> 
> When a kvm_vfio_device is released?
sure
> 
>>
>> The forwarding programming is architecture specific, embodied by the
>> kvm_arch_set_fwd_state function. Its implementation is given in a
>> separate patch file.
> 
> I would drop the last sentence and instead indicate that this is handled
> properly when the architecture does not support such a feature.
ok
> 
>>
>> The forwarding control modality is enabled by the
>> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v1 -> v2:
>> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
>> - original patch file separated into 2 parts: generic part moved in vfio.c
>>   and ARM specific part(kvm_arch_set_fwd_state)
>> ---
>>  include/linux/kvm_host.h |  27 +++
>>  virt/kvm/vfio.c  | 452 
>> ++-
>>  2 files changed, 477 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index a4c33b3..24350dc 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
>>unsigned long arg);
>>  };
>>  
>> +enum kvm_fwd_irq_action {
>> +KVM_VFIO_IRQ_SET_FORWARD,
>> +KVM_VFIO_IRQ_SET_NORMAL,
>> +KVM_VFIO_IRQ_CLEANUP,
> 
> This is KVM internal API, so it would probably be good to document this.
> Especially the CLEANUP bit worries me, see below.
I will document it
> 
>> +};
>> +
>> +/* internal structure describing a forwarded IRQ */
>> +struct kvm_fwd_irq {
>> +struct list_head link;
> 
> this list entry is local to the kvm vfio device, right? that means you
> probably want a struct with just the below fields, and then have a
> containing struct in the generic device file, private to its logic.
I will introduce 2 separate structs
> 
>> +__u32 index; /* platform device irq index */
>> +__u32 hwirq; /*physical IRQ */
>> +__u32 gsi; /* virtual IRQ */
>> +struct kvm_vcpu *vcpu; /* vcpu to inject into*/
>> +};
>> +
>>  void kvm_device_get(struct kvm_device *dev);
>>  void kvm_device_put(struct kvm_device *dev);
>>  struct kvm_device *kvm_device_from_filp(struct file *filp);
>> @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
>>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>>  extern struct kvm_device_ops kvm_flic_ops;
>>  
>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
>> +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> 
> what's the 'p' in pfwd?
will rename
> 
>> +   enum kvm_fwd_irq_action action);
>> +
>> +#else
>> +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
>> + enum kvm_fwd_irq_action action)
>> +{
>> +return 0;
>> +}
>> +#endif
>> +
>>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>  
>>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
>> val)
>> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
>> index 76dc7a1..e4a81c4 100644
>> --- a/virt/kvm/vfio.c
>> +++ b/virt/kvm/vfio.c
>> @@ -18,14 +18,24 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  struct kvm_vfio_group {
>>  struct list_head node;
>>  struct vfio_group *vfio_group;
>>  };
>>  
>> +struct kvm_vfio_device {
>> +struct list_head node;
>> +struct vfio_device *vfio_device;
>> +/* list of forwarded IRQs for that VFIO device */
>> +struct list_head fwd_irq_list;
>> +int fd;
>> +};
>> +
>>  struct kvm_vfio {
>>  struct list_head group_list;
>> +struct list_head device_list;
>>  struct mutex lock;
>>  bool noncoherent;
>>  };
>> @@ -246,12 +256,441 @@ static int kvm_vfio_set_group(struct kvm_device *dev, 
>> long attr, u64 arg)
>>  return -ENXIO;
>>  }
>>  
>> +/**
>> + * get_vfio_device - returns the vfio-device corresponding to this fd
>> + * @fd:fd of the vfio platform device
>> + *
>> + * checks it

Re: [PATCH v5 7/7] kvm, mem-hotplug: Unpin and remove nested_vmx->apic_access_page.

2014-09-11 Thread tangchen



On 09/11/2014 05:33 PM, Paolo Bonzini wrote:

This patch is not against the latest KVM tree.  The call to
nested_get_page is now in nested_get_vmcs12_pages, and you have to
handle virtual_apic_page in a similar manner.

Hi Paolo,

Thanks for the review.

This patch-set is against Linux v3.17-rc4.
Will make it against the latest KVM tree, and resend a patch set 
following your comments.


Thanks.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [BUG] Guest kernel divide error in kvm_unlock_kick

2014-09-11 Thread Paolo Bonzini

Il 08/09/2014 15:28, Chris Webb ha scritto:
> divide error:  [#1] PREEMPT SMP 
> Modules linked in:
> CPU: 0 PID: 743 Comm: syslogd Not tainted 3.16.2-guest #2
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
> task: 88007c972580 ti: 88007cb7c000 task.ti: 88007cb7c000
> RIP: 0010:[]  [] kvm_unlock_kick+0x72/0x80
> RSP: :88007fc03ec8  EFLAGS: 00010046
> RAX: 0005 RBX:  RCX: 0003
> RDX: 0003 RSI: 81a466a0 RDI: 
> RBP: 81a466a0 R08: 81b98940 R09: 0246
> R10: 0400 R11:  R12: 00ea
> R13: 0009 R14: 0002 R15: 88007fc0d300
> FS:  7f2a6473e700() GS:88007fc0() knlGS:
> CS:  0010 DS:  ES:  CR0: 8005003b
> CR2: 004a8240 CR3: 7ac75000 CR4: 000406f0
> Stack:
>  81a46400 0246 0001 8168979d
>  0282 81110d97 0007 88007cb7ffd8
>  88007c972580 4b0782e8 0002 81a0b0c8
> Call Trace:
>   
>  [] ? _raw_spin_unlock_irqrestore+0x5d/0x80
>  [] ? rcu_process_callbacks+0x337/0x4f0
>  [] ? __do_softirq+0xfd/0x210
>  [] ? irq_exit+0x7e/0xa0
>  [] ? smp_apic_timer_interrupt+0x3b/0x50
>  [] ? apic_timer_interrupt+0x6d/0x80
>   
>  [] ? filemap_map_pages+0x17b/0x240
>  [] ? filemap_map_pages+0x230/0x240
>  [] ? do_read_fault.isra.70+0x2a2/0x320
>  [] ? handle_mm_fault+0x37c/0xd00
>  [] ? __do_page_fault+0x185/0x4c0
>  [] ? async_page_fault+0x28/0x30
>  [] ? __put_user_4+0x20/0x30
>  [] ? async_page_fault+0x28/0x30
> Code: c0 ca a7 81 48 8d 04 0b 48 8b 30 48 39 ee 75 c9 0f b6 40 08 44 38 e0 75 
> c0 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 <0f> 01 c1 0f 1f 00 
> 5b 5d 41 5c c3 0f 1f 00 48 c7 c0 10 cf 00 00 

Hi Chris,

sorry for not following up on your previous patch.

This is a hypercall that should have kicked VCPU 3 (see rcx).

Can you please apply this patch and gather a trace of the host
(using "trace-cmd -e kvm qemu-kvm ")?

Thanks,

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fb919c574e23..25ed29f68419 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -709,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
 
+   trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector, false);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
@@ -730,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-   trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, false);
break;
 
case APIC_DM_REMRD:



Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: x86: make apic_accept_irq tracepoint more generic

2014-09-11 Thread Paolo Bonzini

Initially the tracepoint was added only to the APIC_DM_FIXED case,
also because it reported coalesced interrupts that only made sense
for that case.  However, the coalesced argument is not used anymore
and tracing other delivery modes is useful, so hoist the call out
of the switch statement.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/lapic.c |  4 ++--
 arch/x86/kvm/trace.h | 11 ---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fb919c574e23..b8345dd41b25 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -709,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
 
+   trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
@@ -730,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
-   trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, false);
break;
 
case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4c2868f36808..6b06ab8748dd 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -415,15 +415,14 @@ TRACE_EVENT(kvm_apic_ipi,
 );
 
 TRACE_EVENT(kvm_apic_accept_irq,
-   TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
-   TP_ARGS(apicid, dm, tm, vec, coalesced),
+   TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+   TP_ARGS(apicid, dm, tm, vec),
 
TP_STRUCT__entry(
__field(__u32,  apicid  )
__field(__u16,  dm  )
__field(__u8,   tm  )
__field(__u8,   vec )
-   __field(bool,   coalesced   )
),
 
TP_fast_assign(
@@ -431,14 +430,12 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry->dm = dm;
__entry->tm = tm;
__entry->vec= vec;
-   __entry->coalesced  = coalesced;
),
 
-   TP_printk("apicid %x vec %u (%s|%s)%s",
+   TP_printk("apicid %x vec %u (%s|%s)",
  __entry->apicid, __entry->vec,
  __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
- __entry->tm ? "level" : "edge",
- __entry->coalesced ? " (coalesced)" : "")
+ __entry->tm ? "level" : "edge")
 );
 
 TRACE_EVENT(kvm_eoi,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 07:38, Tang Chen ha scritto:
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 63c4c3e..da6d55d 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
> > kvm_vcpu *vcpu, bool set)
> > vmx_set_msr_bitmap(vcpu);
> >  }
> >  
> > +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
> > +{
> > +   vmcs_write64(APIC_ACCESS_ADDR, hpa);
> 
> This has to be guarded by "if (!is_guest_mode(vcpu))".
> 
We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
it otherwise, no?

> > +}
> > +

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen



On 09/11/2014 05:21 PM, Paolo Bonzini wrote:

Il 11/09/2014 07:38, Tang Chen ha scritto:

apic access page is pinned in memory. As a result, it cannot be 
migrated/hot-removed.
Actually, it is not necessary to be pinned.

The hpa of apic access page is stored in VMCS APIC_ACCESS_ADDR pointer. When
the page is migrated, kvm_mmu_notifier_invalidate_page() will invalidate the
corresponding ept entry. This patch introduces a new vcpu request named
KVM_REQ_APIC_PAGE_RELOAD, makes this request to all the vcpus at this time,
and forces all the vcpus to exit the guest; before re-entering the guest,
each vcpu updates the VMCS APIC_ACCESS_ADDR pointer to the new apic access
page address and updates kvm->arch.apic_access_page to the new page.

Signed-off-by: Tang Chen 
---
  arch/x86/include/asm/kvm_host.h |  1 +
  arch/x86/kvm/svm.c  |  6 ++
  arch/x86/kvm/vmx.c  |  6 ++
  arch/x86/kvm/x86.c  | 15 +++
  include/linux/kvm_host.h|  2 ++
  virt/kvm/kvm_main.c | 12 
  6 files changed, 42 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35171c7..514183e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -739,6 +739,7 @@ struct kvm_x86_ops {
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+   void (*set_apic_access_page_addr)(struct kvm *kvm, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d941ad..f2eacc4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3619,6 +3619,11 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu 
*vcpu, bool set)
return;
  }
  
+static void svm_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)

+{
+   return;
+}
+
  static int svm_vm_has_apicv(struct kvm *kvm)
  {
return 0;
@@ -4373,6 +4378,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+   .set_apic_access_page_addr = svm_set_apic_access_page_addr,
.vm_has_apicv = svm_vm_has_apicv,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_isr_update = svm_hwapic_isr_update,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 63c4c3e..da6d55d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu 
*vcpu, bool set)
vmx_set_msr_bitmap(vcpu);
  }
  
+static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)

+{
+   vmcs_write64(APIC_ACCESS_ADDR, hpa);

This has to be guarded by "if (!is_guest_mode(vcpu))".


Since we cannot get vcpu through kvm, I'd like to move this check to
vcpu_reload_apic_access_page() when 
kvm_x86_ops->set_apic_access_page_addr()

is called.

Thanks.



+}
+
  static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
  {
u16 status;
@@ -8910,6 +8915,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+   .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.vm_has_apicv = vmx_vm_has_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e05bd58..96f4188 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5989,6 +5989,19 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_apic_update_tmr(vcpu, tmr);
  }
  
+static void vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)

+{
+   /*
+* apic access page could be migrated. When the page is being migrated,
+* GUP will wait till the migrate entry is replaced with the new pte
+* entry pointing to the new page.
+*/
+   vcpu->kvm->arch.apic_access_page = gfn_to_page(vcpu->kvm,
+   APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+   kvm_x86_ops->set_apic_access_page_addr(vcpu->kvm,
+   page_to_phys(vcpu->kvm->arch.apic_access_page));
+}
+
  /*
   * Returns 1 to let __vcpu_run() continue the guest execution loop without
   * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -6049,6 +6062,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))

Re: [PATCH v2] ARM/arm64: KVM: fix use of WnR bit in kvm_is_write_fault()

2014-09-11 Thread Marc Zyngier

On 11/09/14 04:12, Christoffer Dall wrote:
> On Tue, Sep 09, 2014 at 12:02:59PM +0100, Marc Zyngier wrote:
>> [resending, as ARM email server seems to be in some mood]
>>
>> On 09/09/14 11:27, Ard Biesheuvel wrote:
>>> The ISS encoding for an exception from a Data Abort has a WnR
>>> bit[6] that indicates whether the Data Abort was caused by a
>>> read or a write instruction. While there are several fields
>>> in the encoding that are only valid if the ISV bit[24] is set,
>>> WnR is not one of them, so we can read it unconditionally.
>>>
>>> Instead of fixing both implementations of kvm_is_write_fault()
>>> in place, reimplement it just once using kvm_vcpu_dabt_iswrite(),
>>> which already does the right thing with respect to the WnR bit.
>>> Also fix up the callers to pass 'vcpu'
>>>
>>> Acked-by: Laszlo Ersek 
>>> Signed-off-by: Ard Biesheuvel 
>>
>> Because I like that kind of diffstat:
>> Acked-by: Marc Zyngier 
>>
>> Christoffer, if you too are happy with that, I'll queue it right away.
>>
> Extremely happy:
> 
> Acked-by: Christoffer Dall 

Added to kvmarm/queue.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 3/7] kvm: Make init_rmode_identity_map() return 0 on success.

2014-09-11 Thread tangchen



On 09/11/2014 05:17 PM, Paolo Bonzini wrote:

..
@@ -7645,7 +7642,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, 
unsigned int id)
kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = -ENOMEM;
-   if (!init_rmode_identity_map(kvm))
+   if (init_rmode_identity_map(kvm))
Please add "< 0" here.  I would also consider setting err to the return
value of init_rmode_identity_map, and initializing it to -ENOMEM only
after the "if".


I'd like to move err = -ENOMEM to the following place:

vmx_create_vcpu()
{
..
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
goto free_vcpu;

err = -ENOMEM;  -- move it here

vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);

vmx->loaded_vmcs->vmcs = alloc_vmcs();

}

So that it can be used to handle the next two memory allocation errors.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: EVENTFD: remove inclusion of irq.h

2014-09-11 Thread Marc Zyngier

On 11/09/14 08:11, Paolo Bonzini wrote:
> Il 11/09/2014 05:09, Christoffer Dall ha scritto:
>> On Mon, Sep 01, 2014 at 12:11:19PM +0200, Paolo Bonzini wrote:
>>> Il 01/09/2014 10:36, Eric Auger ha scritto:
>>>> No more needed. irq.h would be void on ARM.
>>>>
>>>> Signed-off-by: Eric Auger 
>>>>
>>>> ---
>>>>
>>>> I don't think irq.h is needed anymore since Paul Mackerras' work. However
>>>> I did not compile for all architectures.
>>>> ---
>>>>  virt/kvm/eventfd.c | 1 -
>>>>  1 file changed, 1 deletion(-)
>>>>
>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>> index 3c5981c..0c712a7 100644
>>>> --- a/virt/kvm/eventfd.c
>>>> +++ b/virt/kvm/eventfd.c
>>>> @@ -36,7 +36,6 @@
>>>>  #include 
>>>>  #include 
>>>>  
>>>> -#include "irq.h"
>>>>  #include "iodev.h"
>>>>  
>>>>  #ifdef CONFIG_HAVE_KVM_IRQFD
>>>>
>>>
>>> Acked-by: Paolo Bonzini 
>>>
>>> Christoffer, please include this via the ARM tree, together with ARM
>>> irqfd support.  Thanks,
>>>
>> Marc is dealing with the tree this week and the next so he can apply it
>> to kvmarm/queue.  Do you want it to wait and go with the irqfd patch
>> (which has dependencies not yet resolved) or should we just queue it?
> 
> As you prefer.

Added to kvmarm/queue.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

blk-mq crash under KVM in multiqueue block code (with virtio-blk and ext4)

2014-09-11 Thread Christian Borntraeger

Folks,

we have seen the following bug with 3.16 as a KVM guest. I suspect the blk-mq 
rework that happened between 3.15 and 3.16, but it can be something completely 
different.


[   65.992022] Unable to handle kernel pointer dereference in virtual kernel 
address space
[   65.992187] failing address: d000 TEID: d803
[   65.992363] Fault in home space mode while using kernel ASCE.
[   65.992365] AS:00a7c007 R3:0024 
[   65.993754] Oops: 0038 [#1] SMP 
[   65.993923] Modules linked in: iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi virtio_balloon vhost_net vhost macvtap macvlan kvm 
dm_multipath virtio_net virtio_blk sunrpc
[   65.994274] CPU: 0 PID: 44 Comm: kworker/u6:2 Not tainted 
3.16.0-20140814.0.c66c84c.fc18-s390xfrob #1
[   65.996043] Workqueue: writeback bdi_writeback_workfn (flush-251:32)
[   65.996222] task: 0225 ti: 02258000 task.ti: 
02258000
[   65.996228] Krnl PSW : 0704f0018000 003ed114 
(blk_mq_tag_to_rq+0x20/0x38)
[   65.997299]R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 
EA:3
   Krnl GPRS: 0040  01619000 
004e
[   65.997301]004e  0001 
00a0de18
[   65.997302]77ffbe18 77ffbd50 6d72d620 
004f
[   65.997304]01a99400 0080 003eddee 
77ffbc28
[   65.997864] Krnl Code: 003ed106: e3102034lg  
%r1,48(%r2)
  003ed10c: 91082044tm  
68(%r2),8
 #003ed110: a7840009brc 8,3ed122
 >003ed114: e34016880004lg  
%r4,1672(%r1)
  003ed11a: 59304100c   
%r3,256(%r4)
  003ed11e: a7840003brc 8,3ed124
  003ed122: 07febcr 15,%r14
  003ed124: b9040024lgr %r2,%r4
[   65.998221] Call Trace:
[   65.998224] ([<0001>] 0x1)
[   65.998227]  [<003f17b6>] blk_mq_tag_busy_iter+0x7a/0xc4
[   65.998228]  [<003edcd6>] blk_mq_rq_timer+0x96/0x13c
[   65.999226]  [<0013ee60>] call_timer_fn+0x40/0x110
[   65.999230]  [<0013f642>] run_timer_softirq+0x2de/0x3d0
[   65.999238]  [<00135b70>] __do_softirq+0x124/0x2ac
[   65.999241]  [<00136000>] irq_exit+0xc4/0xe4
[   65.999435]  [<0010bc08>] do_IRQ+0x64/0x84
[   66.437533]  [<0067ccd8>] ext_skip+0x42/0x46
[   66.437541]  [<003ed7b4>] __blk_mq_alloc_request+0x58/0x1e8
[   66.437544] ([<003ed788>] __blk_mq_alloc_request+0x2c/0x1e8)
[   66.437547]  [<003eef82>] blk_mq_map_request+0xc2/0x208
[   66.437549]  [<003ef860>] blk_sq_make_request+0xac/0x350
[   66.437721]  [<003e2d6c>] generic_make_request+0xc4/0xfc
[   66.437723]  [<003e2e56>] submit_bio+0xb2/0x1a8
[   66.438373]  [<0031e8aa>] ext4_io_submit+0x52/0x80
[   66.438375]  [<0031ccfa>] ext4_writepages+0x7c6/0xd0c
[   66.438378]  [<002aea20>] __writeback_single_inode+0x54/0x274
[   66.438379]  [<002b0134>] writeback_sb_inodes+0x28c/0x4ec
[   66.438380]  [<002b042e>] __writeback_inodes_wb+0x9a/0xe4
[   66.438382]  [<002b06a2>] wb_writeback+0x22a/0x358
[   66.438383]  [<002b0cd0>] bdi_writeback_workfn+0x354/0x538
[   66.438618]  [<0014e3aa>] process_one_work+0x1aa/0x418
[   66.438621]  [<0014ef94>] worker_thread+0x48/0x524
[   66.438625]  [<001560ca>] kthread+0xee/0x108
[   66.438627]  [<0067c76e>] kernel_thread_starter+0x6/0xc
[   66.438628]  [<0067c768>] kernel_thread_starter+0x0/0xc
[   66.438629] Last Breaking-Event-Address:
[   66.438631]  [<003edde8>] blk_mq_timeout_check+0x6c/0xb8

I looked into the dump, and the full function is  (annotated by me to match the 
source code)
r2= tags
r3= tag (4e)
Dump of assembler code for function blk_mq_tag_to_rq:
   0x003ed0f4 <+0>: lg  %r1,96(%r2) # r1 
has now tags->rqs
   0x003ed0fa <+6>: sllg%r2,%r3,3   # r2 
has tag*8
   0x003ed100 <+12>:lg  %r2,0(%r2,%r1)  # r2 
now has rq (=tags->rqs[tag])
   0x003ed106 <+18>:lg  %r1,48(%r2) # r1 
now has rq->q
   0x003ed10c <+24>:tm  68(%r2),8   # test 
for rq->cmd_flags & REQ_FLUSH_SEQ
   0x003ed110 <+28>:je  0x3ed122   #  if 
not goto 3ed122
   0x003ed114 <+32>:lg  %r4,1672(%r1)   # r4 = 
rq->q->flush_rq  < CRASHES as rq->q points to 
   0x003ed11a <+38>:c   %r3,256(%r4)# 
compare

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 12:20, tangchen ha scritto:
>>>
>>> +vmcs_write64(APIC_ACCESS_ADDR, hpa);
>> This has to be guarded by "if (!is_guest_mode(vcpu))".
> 
> Since we cannot get vcpu through kvm, I'd like to move this check to
> vcpu_reload_apic_access_page() when
> kvm_x86_ops->set_apic_access_page_addr()
> is called.

Good idea!  Though passing the vcpu to vmx_set_apic_access_page_addr
would also work.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 12:12, Gleb Natapov ha scritto:
> On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
>> Il 11/09/2014 07:38, Tang Chen ha scritto:
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 63c4c3e..da6d55d 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
>>> kvm_vcpu *vcpu, bool set)
>>> vmx_set_msr_bitmap(vcpu);
>>>  }
>>>  
>>> +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
>>> +{
>>> +   vmcs_write64(APIC_ACCESS_ADDR, hpa);
>>
>> This has to be guarded by "if (!is_guest_mode(vcpu))".
>>
> We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
> it otherwise, no?

Yes, but this would be handled by patch 6:

} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+   struct page *page = gfn_to_page(vmx->vcpu.kvm,
+   APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-   vmcs_write64(APIC_ACCESS_ADDR,
-   page_to_phys(vcpu->kvm->arch.apic_access_page));
+   vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+   /*
+* Do not pin apic access page in memory so that memory
+* hotplug process is able to migrate it.
+*/
+   put_page(page);
}

However, this is also useless code duplication because the above snippet could
reuse vcpu_reload_apic_access_page too.

So I think you cannot do the is_guest_mode check in
kvm_vcpu_reload_apic_access_page and also not in
vmx_reload_apic_access_page.  But you could do something like

kvm_vcpu_reload_apic_access_page(...)
{
...
kvm_x86_ops->reload_apic_access_page(...);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);

/* used in vcpu_enter_guest only */
vcpu_reload_apic_access_page(...)
{
if (!is_guest_mode(vcpu))
kvm_vcpu_reload_apic_access_page(...)
}

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 2/8] arm/arm64: KVM: vgic: switch to dynamic allocation

2014-09-11 Thread Marc Zyngier

So far, all the VGIC data structures are statically defined by the
*maximum* number of vcpus and interrupts it supports. It means that
we always have to oversize it to cater for the worst case.

Start by changing the data structures to be dynamically sizeable,
and allocate them at runtime.

The sizes are still very static though.

Signed-off-by: Marc Zyngier 
---
 arch/arm/kvm/arm.c |   3 +
 include/kvm/arm_vgic.h |  76 
 virt/kvm/arm/vgic.c| 237 ++---
 3 files changed, 267 insertions(+), 49 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a99e0cd..923a01d 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -172,6 +172,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm->vcpus[i] = NULL;
}
}
+
+   kvm_vgic_destroy(kvm);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -253,6 +255,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
kvm_mmu_free_memory_caches(vcpu);
kvm_timer_vcpu_terminate(vcpu);
+   kvm_vgic_vcpu_destroy(vcpu);
kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index f074539..bdaac57 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -54,19 +54,33 @@
  * - a bunch of shared interrupts (SPI)
  */
 struct vgic_bitmap {
-   union {
-   u32 reg[VGIC_NR_PRIVATE_IRQS / 32];
-   DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS);
-   } percpu[VGIC_MAX_CPUS];
-   union {
-   u32 reg[VGIC_NR_SHARED_IRQS / 32];
-   DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS);
-   } shared;
+   /*
+* - One UL per VCPU for private interrupts (assumes UL is at
+*   least 32 bits)
+* - As many UL as necessary for shared interrupts.
+*
+* The private interrupts are accessed via the "private"
+* field, one UL per vcpu (the state for vcpu n is in
+* private[n]). The shared interrupts are accessed via the
+* "shared" pointer (IRQn state is at bit n-32 in the bitmap).
+*/
+   unsigned long *private;
+   unsigned long *shared;
 };
 
 struct vgic_bytemap {
-   u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4];
-   u32 shared[VGIC_NR_SHARED_IRQS  / 4];
+   /*
+* - 8 u32 per VCPU for private interrupts
+* - As many u32 as necessary for shared interrupts.
+*
+* The private interrupts are accessed via the "private"
+* field, (the state for vcpu n is in private[n*8] to
+* private[n*8 + 7]). The shared interrupts are accessed via
+* the "shared" pointer (IRQn state is at byte (n-32)%4 of the
+* shared[(n-32)/4] word).
+*/
+   u32 *private;
+   u32 *shared;
 };
 
 struct kvm_vcpu;
@@ -127,6 +141,9 @@ struct vgic_dist {
boolin_kernel;
boolready;
 
+   int nr_cpus;
+   int nr_irqs;
+
/* Virtual control interface mapping */
void __iomem*vctrl_base;
 
@@ -166,15 +183,36 @@ struct vgic_dist {
/* Level/edge triggered */
struct vgic_bitmap  irq_cfg;
 
-   /* Source CPU per SGI and target CPU */
-   u8  irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS];
+   /*
+* Source CPU per SGI and target CPU:
+*
+* Each byte represents an SGI observable on a VCPU, each bit of
+* this byte indicating if the corresponding VCPU has
+* generated this interrupt. This is a GICv2 feature only.
+*
+* For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
+* the SGIs observable on VCPUn.
+*/
+   u8  *irq_sgi_sources;
 
-   /* Target CPU for each IRQ */
-   u8  irq_spi_cpu[VGIC_NR_SHARED_IRQS];
-   struct vgic_bitmap  irq_spi_target[VGIC_MAX_CPUS];
+   /*
+* Target CPU for each SPI:
+*
+* Array of available SPI, each byte indicating the target
+* VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
+*/
+   u8  *irq_spi_cpu;
+
+   /*
+* Reverse lookup of irq_spi_cpu for faster compute pending:
+*
+* Array of bitmaps, one per VCPU, describing if IRQn is
+* routed to a particular VCPU.
+*/
+   struct vgic_bitmap  *irq_spi_target;
 
/* Bitmap indicating which CPU has something pending */
-   unsigned long   irq_pending_on_cpu;
+   unsigned long   *irq_pending_on_cpu;
 #endif
 };
 
@@ -204,11 +242,11 @@ struct vgic_v3_cpu_if {
 struct vgic_cpu {
 #ifdef CONFIG_KVM_ARM_VGIC
/* per IRQ to LR mapping */
-   u8  vgic_irq_lr_map[VGIC_NR_IRQS];
+   u8  *vgic_irq_lr_map;

[PATCH v4 0/8] arm/arm64: KVM: dynamic VGIC sizing

2014-09-11 Thread Marc Zyngier

So far, the VGIC data structures have been statically sized, meaning
that we always have to support more interrupts than we actually want,
and more CPU interfaces than we should. This is a waste of resources,
and is the kind of thing that should be tuneable.

This series addresses that issue by changing the data structures to be
dynamically allocated, and adds a new configuration attribute to
allocate the number of interrupts. When the attribute is not used, we
fallback to the old behaviour of allocating a fixed number of
interrupts.

This series is also the base for Andre Przywara's GICv3 distributor
emulation code (which can support far more than 8 vcpus and 1020
interrupts).

This has been tested on both ARM (TC2, A20) and arm64 (model and Juno).

The code is available from my kvm-arm64/kvmtool-vgic-dyn branch,
together with the corresponding kvmtool code.

* From v3 [3]
  - Number of comments added to the data structures, making slightly more
obvious the various mappings
  - Dropped the nr_irqs field from bitmap and bytemap structures, as
it was a leftover from the initial revision that only had a single
pointer
  - Small cleanups all over the place
  - Dropped the "sub-page offset" patch for now, as this needs some
serious reworking
  - Rebased on top of Christoffer "vgic cleanup" series, with 3.17-rc4
thrown in for good measure

* From v2 [2]
  - Fixed bug that broke QEMU (register access can trigger allocation)
  - irq_pending_on_cpu is now dynamic (needed for more than 32 or 64 vcpus)
  - Rebased on top of Victor's BE patches

* From v1 [1]
  - Rebased on top of 3.16-rc1
  - Lots of cleanup

[1]: https://lists.cs.columbia.edu/pipermail/kvmarm/2013-October/005879.html
[2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/010050.html
[3]: https://lists.cs.columbia.edu/pipermail/kvmarm/2014-July/010383.html

Marc Zyngier (8):
  KVM: ARM: vgic: plug irq injection race
  arm/arm64: KVM: vgic: switch to dynamic allocation
  arm/arm64: KVM: vgic: Parametrize VGIC_NR_SHARED_IRQS
  arm/arm64: KVM: vgic: kill VGIC_MAX_CPUS
  arm/arm64: KVM: vgic: handle out-of-range MMIO accesses
  arm/arm64: KVM: vgic: kill VGIC_NR_IRQS
  arm/arm64: KVM: vgic: delay vgic allocation until init time
  arm/arm64: KVM: vgic: make number of irqs a configurable attribute

 Documentation/virtual/kvm/devices/arm-vgic.txt |  10 +
 arch/arm/include/uapi/asm/kvm.h|   1 +
 arch/arm/kvm/arm.c |  10 +-
 arch/arm64/include/uapi/asm/kvm.h  |   1 +
 include/kvm/arm_vgic.h |  88 --
 virt/kvm/arm/vgic.c| 396 +
 6 files changed, 413 insertions(+), 93 deletions(-)

-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 3/8] arm/arm64: KVM: vgic: Parametrize VGIC_NR_SHARED_IRQS

2014-09-11 Thread Marc Zyngier

Having a dynamic number of supported interrupts means that we
cannot rely on VGIC_NR_SHARED_IRQS being fixed anymore.

Instead, make it take the distributor structure as a parameter,
so it can return the right value.

Reviewed-by: Christoffer Dall 
Signed-off-by: Marc Zyngier 
---
 include/kvm/arm_vgic.h |  1 -
 virt/kvm/arm/vgic.c| 16 +++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bdaac57..bdeb451 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -29,7 +29,6 @@
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_NR_SHARED_IRQS(VGIC_NR_IRQS - VGIC_NR_PRIVATE_IRQS)
 #define VGIC_MAX_CPUS  KVM_MAX_VCPUS
 
 #define VGIC_V2_MAX_LRS(1 << 6)
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 92c086e..93fe73b 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1083,11 +1083,17 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, 
u32 reg)
}
 }
 
+static int vgic_nr_shared_irqs(struct vgic_dist *dist)
+{
+   return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
+}
+
 static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
 {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
unsigned long pending_private, pending_shared;
+   int nr_shared = vgic_nr_shared_irqs(dist);
int vcpu_id;
 
vcpu_id = vcpu->vcpu_id;
@@ -1100,15 +1106,15 @@ static int compute_pending_for_cpu(struct kvm_vcpu 
*vcpu)
 
pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
-   bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS);
+   bitmap_and(pend_shared, pending, enabled, nr_shared);
bitmap_and(pend_shared, pend_shared,
   vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
-  VGIC_NR_SHARED_IRQS);
+  nr_shared);
 
pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
-   pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS);
+   pending_shared = find_first_bit(pend_shared, nr_shared);
return (pending_private < VGIC_NR_PRIVATE_IRQS ||
-   pending_shared < VGIC_NR_SHARED_IRQS);
+   pending_shared < vgic_nr_shared_irqs(dist));
 }
 
 /*
@@ -1365,7 +1371,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu 
*vcpu)
}
 
/* SPIs */
-   for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) {
+   for_each_set_bit(i, vgic_cpu->pending_shared, 
vgic_nr_shared_irqs(dist)) {
if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
overflow = 1;
}
-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 1/8] KVM: ARM: vgic: plug irq injection race

2014-09-11 Thread Marc Zyngier

As it stands, nothing prevents userspace from injecting an interrupt
before the guest's GIC is actually initialized.

This goes unnoticed so far (as everything is pretty much statically
allocated), but ends up exploding in a spectacular way once we switch
to a more dynamic allocation (the GIC data structure isn't there yet).

The fix is to test for the "ready" flag in the VGIC distributor before
trying to inject the interrupt. Note that in order to avoid breaking
userspace, we have to ignore what is essentially an error.

Signed-off-by: Marc Zyngier 
Acked-by: Christoffer Dall 
---
 virt/kvm/arm/vgic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index f7ab1ca..d3299d4 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1584,7 +1584,8 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
bool level)
 {
-   if (vgic_update_irq_pending(kvm, cpuid, irq_num, level))
+   if (likely(vgic_initialized(kvm)) &&
+   vgic_update_irq_pending(kvm, cpuid, irq_num, level))
vgic_kick_vcpus(kvm);
 
return 0;
-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 5/8] arm/arm64: KVM: vgic: handle out-of-range MMIO accesses

2014-09-11 Thread Marc Zyngier

Now that we can (almost) dynamically size the number of interrupts,
we're facing an interesting issue:

We have to evaluate at runtime whether or not an access hits a valid
register, based on the sizing of this particular instance of the
distributor. Furthermore, the GIC spec says that accessing a reserved
register is RAZ/WI.

For this, add a new field to our range structure, indicating the number
of bits a single interrupt uses. That allows us to find out whether or
not the access is in range.

Reviewed-by: Christoffer Dall 
Signed-off-by: Marc Zyngier 
---
 include/kvm/arm_vgic.h |  3 ++-
 virt/kvm/arm/vgic.c| 56 --
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3900e31..97f5f57 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -32,6 +32,7 @@
 
 #define VGIC_V2_MAX_LRS(1 << 6)
 #define VGIC_V3_MAX_LRS16
+#define VGIC_MAX_IRQS  1024
 
 /* Sanity checks... */
 #if (KVM_MAX_VCPUS > 8)
@@ -42,7 +43,7 @@
 #error "VGIC_NR_IRQS must be a multiple of 32"
 #endif
 
-#if (VGIC_NR_IRQS > 1024)
+#if (VGIC_NR_IRQS > VGIC_MAX_IRQS)
 #error "VGIC_NR_IRQS must be <= 1024"
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 7e6e64d..ab01cab 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -892,6 +892,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
 struct mmio_range {
phys_addr_t base;
unsigned long len;
+   int bits_per_irq;
bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
phys_addr_t offset);
 };
@@ -900,56 +901,67 @@ static const struct mmio_range vgic_dist_ranges[] = {
{
.base   = GIC_DIST_CTRL,
.len= 12,
+   .bits_per_irq   = 0,
.handle_mmio= handle_mmio_misc,
},
{
.base   = GIC_DIST_IGROUP,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_ENABLE_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_set_enable_reg,
},
{
.base   = GIC_DIST_ENABLE_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_clear_enable_reg,
},
{
.base   = GIC_DIST_PENDING_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_set_pending_reg,
},
{
.base   = GIC_DIST_PENDING_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_clear_pending_reg,
},
{
.base   = GIC_DIST_ACTIVE_SET,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_ACTIVE_CLEAR,
-   .len= VGIC_NR_IRQS / 8,
+   .len= VGIC_MAX_IRQS / 8,
+   .bits_per_irq   = 1,
.handle_mmio= handle_mmio_raz_wi,
},
{
.base   = GIC_DIST_PRI,
-   .len= VGIC_NR_IRQS,
+   .len= VGIC_MAX_IRQS,
+   .bits_per_irq   = 8,
.handle_mmio= handle_mmio_priority_reg,
},
{
.base   = GIC_DIST_TARGET,
-   .len= VGIC_NR_IRQS,
+   .len= VGIC_MAX_IRQS,
+   .bits_per_irq   = 8,
.handle_mmio= handle_mmio_target_reg,
},
{
.base   = GIC_DIST_CONFIG,
-   .len= VGIC_NR_IRQS / 4,
+   .len= VGIC_MAX_IRQS / 4,
+   .bits_per_irq   = 2,
.handle_mmio= handle_mmio_cfg_reg,
},
{
@@ -987,6 +999,22 @@ struct mmio_range *find_matching_range(const struct 
mmio_range *ranges,
return NULL;
 }
 
+static bool vgic_validate_access(const struct vgic_dist *dist,
+const struct mmio_range *range,
+

[PATCH v4 8/8] arm/arm64: KVM: vgic: make number of irqs a configurable attribute

2014-09-11 Thread Marc Zyngier

In order to make the number of interrupts configurable, use the new
fancy device management API to add KVM_DEV_ARM_VGIC_GRP_NR_IRQS as
a VGIC configurable attribute.

Userspace can now specify the exact size of the GIC (by increments
of 32 interrupts).

Signed-off-by: Marc Zyngier 
---
 Documentation/virtual/kvm/devices/arm-vgic.txt | 10 +++
 arch/arm/include/uapi/asm/kvm.h|  1 +
 arch/arm64/include/uapi/asm/kvm.h  |  1 +
 virt/kvm/arm/vgic.c| 37 ++
 4 files changed, 49 insertions(+)

diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 7f4e91b..df8b0c7 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -71,3 +71,13 @@ Groups:
   Errors:
 -ENODEV: Getting or setting this register is not yet supported
 -EBUSY: One or more VCPUs are running
+
+  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
+  Attributes:
+A value describing the number of interrupts (SGI, PPI and SPI) for
+this GIC instance, ranging from 64 to 1024, in increments of 32.
+
+  Errors:
+-EINVAL: Value set is out of the expected range
+-EBUSY: Value has already been set, or GIC has already been initialized
+with default values.
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index e6ebdd3..8b51c1a 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -173,6 +173,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL << 
KVM_DEV_ARM_VGIC_CPUID_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << 
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index e633ff8..b5cd6ed 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL << 
KVM_DEV_ARM_VGIC_CPUID_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << 
KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 9180823..744388d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -2331,6 +2331,36 @@ static int vgic_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
 
return vgic_attr_regs_access(dev, attr, &reg, true);
}
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+   u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+   u32 val;
+   int ret = 0;
+
+   if (get_user(val, uaddr))
+   return -EFAULT;
+
+   /*
+* We require:
+* - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+* - at most 1024 interrupts
+* - a multiple of 32 interrupts
+*/
+   if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+   val > VGIC_MAX_IRQS ||
+   (val & 31))
+   return -EINVAL;
+
+   mutex_lock(&dev->kvm->lock);
+
+   if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
+   ret = -EBUSY;
+   else
+   dev->kvm->arch.vgic.nr_irqs = val;
+
+   mutex_unlock(&dev->kvm->lock);
+
+   return ret;
+   }
 
}
 
@@ -2367,6 +2397,11 @@ static int vgic_get_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
r = put_user(reg, uaddr);
break;
}
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+   u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+   r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
+   break;
+   }
 
}
 
@@ -2403,6 +2438,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
return vgic_has_attr_regs(vgic_cpu_ranges, offset);
+   case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+   return 0;
}
return -ENXIO;
 }
-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 6/8] arm/arm64: KVM: vgic: kill VGIC_NR_IRQS

2014-09-11 Thread Marc Zyngier

Nuke VGIC_NR_IRQS entirely, now that the distributor instance
contains the number of IRQ allocated to this GIC.

Also add VGIC_NR_IRQS_LEGACY to preserve the current API.

Signed-off-by: Marc Zyngier 
---
 include/kvm/arm_vgic.h |  6 +++---
 virt/kvm/arm/vgic.c| 17 +++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 97f5f57..0a27564 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -25,7 +25,7 @@
 #include 
 #include 
 
-#define VGIC_NR_IRQS   256
+#define VGIC_NR_IRQS_LEGACY256
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
@@ -39,11 +39,11 @@
 #error Invalid number of CPU interfaces
 #endif
 
-#if (VGIC_NR_IRQS & 31)
+#if (VGIC_NR_IRQS_LEGACY & 31)
 #error "VGIC_NR_IRQS must be a multiple of 32"
 #endif
 
-#if (VGIC_NR_IRQS > VGIC_MAX_IRQS)
+#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
 #error "VGIC_NR_IRQS must be <= 1024"
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index ab01cab..dfa6430 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -436,7 +436,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
 
case 4: /* GICD_TYPER */
reg  = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-   reg |= (VGIC_NR_IRQS >> 5) - 1;
+   reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
vgic_reg_access(mmio, &reg, word_offset,
ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
break;
@@ -1274,13 +1274,14 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu 
*vcpu)
 static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
 {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+   struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
struct vgic_lr vlr;
int lr;
 
/* Sanitize the input... */
BUG_ON(sgi_source_id & ~7);
BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
-   BUG_ON(irq >= VGIC_NR_IRQS);
+   BUG_ON(irq >= dist->nr_irqs);
 
kvm_debug("Queue IRQ%d\n", irq);
 
@@ -1516,7 +1517,7 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 
vlr = vgic_get_lr(vcpu, lr);
 
-   BUG_ON(vlr.irq >= VGIC_NR_IRQS);
+   BUG_ON(vlr.irq >= dist->nr_irqs);
vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
}
 
@@ -1738,7 +1739,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
if (vcpu->vcpu_id >= dist->nr_cpus)
return -EBUSY;
 
-   for (i = 0; i < VGIC_NR_IRQS; i++) {
+   for (i = 0; i < dist->nr_irqs; i++) {
if (i < VGIC_NR_PPIS)
vgic_bitmap_set_irq_val(&dist->irq_enabled,
vcpu->vcpu_id, i, 1);
@@ -1880,7 +1881,11 @@ static int vgic_init_maps(struct kvm *kvm)
int ret, i;
 
nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS;
-   nr_irqs = dist->nr_irqs = VGIC_NR_IRQS;
+
+   if (!dist->nr_irqs)
+   dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
+
+   nr_irqs = dist->nr_irqs;
 
ret  = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
@@ -1964,7 +1969,7 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
-   for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4)
+   for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
vgic_set_target_reg(kvm, 0, i);
 
kvm->arch.vgic.ready = true;
-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 7/8] arm/arm64: KVM: vgic: delay vgic allocation until init time

2014-09-11 Thread Marc Zyngier

It is now quite easy to delay the allocation of the vgic tables
until we actually require it to be up and running (when the first
vcpu is kicking around, or someone tries to access the GIC registers).

This allows us to allocate memory for the exact number of CPUs we
have. As nobody configures the number of interrupts just yet,
use a fallback to VGIC_NR_IRQS_LEGACY.

Reviewed-by: Christoffer Dall 
Signed-off-by: Marc Zyngier 
---
 arch/arm/kvm/arm.c |  7 ---
 include/kvm/arm_vgic.h |  1 -
 virt/kvm/arm/vgic.c| 42 +-
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 923a01d..7d5065e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -271,16 +271,9 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
-   int ret;
-
/* Force users to call KVM_ARM_VCPU_INIT */
vcpu->arch.target = -1;
 
-   /* Set up VGIC */
-   ret = kvm_vgic_vcpu_init(vcpu);
-   if (ret)
-   return ret;
-
/* Set up the timer */
kvm_timer_vcpu_init(vcpu);
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 0a27564..73cbb61 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -277,7 +277,6 @@ int kvm_vgic_hyp_init(void);
 int kvm_vgic_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm);
 void kvm_vgic_destroy(struct kvm *kvm);
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index dfa6430..9180823 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1730,15 +1730,12 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, 
int nr_irqs)
  * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
  * this vcpu and enable the VGIC for this VCPU
  */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
 {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
int i;
 
-   if (vcpu->vcpu_id >= dist->nr_cpus)
-   return -EBUSY;
-
for (i = 0; i < dist->nr_irqs; i++) {
if (i < VGIC_NR_PPIS)
vgic_bitmap_set_irq_val(&dist->irq_enabled,
@@ -1758,8 +1755,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
vgic_cpu->nr_lr = vgic->nr_lr;
 
vgic_enable(vcpu);
-
-   return 0;
 }
 
 static void vgic_init_maintenance_interrupt(void *info)
@@ -1880,8 +1875,17 @@ static int vgic_init_maps(struct kvm *kvm)
int nr_cpus, nr_irqs;
int ret, i;
 
-   nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS;
+   if (dist->nr_cpus)  /* Already allocated */
+   return 0;
+
+   nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
+   if (!nr_cpus)   /* No vcpus? Can't be good... */
+   return -EINVAL;
 
+   /*
+* If nobody configured the number of interrupts, use the
+* legacy one.
+*/
if (!dist->nr_irqs)
dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
 
@@ -1927,6 +1931,9 @@ static int vgic_init_maps(struct kvm *kvm)
}
}
 
+   for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4)
+   vgic_set_target_reg(kvm, 0, i);
+
 out:
if (ret)
kvm_vgic_destroy(kvm);
@@ -1945,6 +1952,7 @@ out:
  */
 int kvm_vgic_init(struct kvm *kvm)
 {
+   struct kvm_vcpu *vcpu;
int ret = 0, i;
 
if (!irqchip_in_kernel(kvm))
@@ -1962,6 +1970,12 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
+   ret = vgic_init_maps(kvm);
+   if (ret) {
+   kvm_err("Unable to allocate maps\n");
+   goto out;
+   }
+
ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE);
if (ret) {
@@ -1969,11 +1983,13 @@ int kvm_vgic_init(struct kvm *kvm)
goto out;
}
 
-   for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
-   vgic_set_target_reg(kvm, 0, i);
+   kvm_for_each_vcpu(i, vcpu, kvm)
+   kvm_vgic_vcpu_init(vcpu);
 
kvm->arch.vgic.ready = true;
 out:
+   if (ret)
+   kvm_vgic_destroy(kvm);
mutex_unlock(&kvm->lock);
return ret;
 }
@@ -2014,10 +2030,6 @@ int kvm_vgic_create(struct kvm *kvm)
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
 
-   ret = vgic_init_maps(kvm);
-   if (ret)
-   kvm_err("Unable to allocate maps\n");
-
 out_unlock:
for (; vcpu_lock_idx >= 0; vcpu_lock_idx--)

[PATCH v4 4/8] arm/arm64: KVM: vgic: kill VGIC_MAX_CPUS

2014-09-11 Thread Marc Zyngier

We now have the information about the number of CPU interfaces in
the distributor itself. Let's get rid of VGIC_MAX_CPUS, and just
rely on KVM_MAX_VCPUS where we don't have the choice. Yet.

Reviewed-by: Christoffer Dall 
Signed-off-by: Marc Zyngier 
---
 include/kvm/arm_vgic.h | 3 +--
 virt/kvm/arm/vgic.c| 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bdeb451..3900e31 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -29,13 +29,12 @@
 #define VGIC_NR_SGIS   16
 #define VGIC_NR_PPIS   16
 #define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_CPUS  KVM_MAX_VCPUS
 
 #define VGIC_V2_MAX_LRS(1 << 6)
 #define VGIC_V3_MAX_LRS16
 
 /* Sanity checks... */
-#if (VGIC_MAX_CPUS > 8)
+#if (KVM_MAX_VCPUS > 8)
 #error Invalid number of CPU interfaces
 #endif
 
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 93fe73b..7e6e64d 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1294,7 +1294,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
 
sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
 
-   for_each_set_bit(c, &sources, VGIC_MAX_CPUS) {
+   for_each_set_bit(c, &sources, dist->nr_cpus) {
if (vgic_queue_irq(vcpu, c, irq))
clear_bit(c, &sources);
}
@@ -1701,7 +1701,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
int i;
 
-   if (vcpu->vcpu_id >= VGIC_MAX_CPUS)
+   if (vcpu->vcpu_id >= dist->nr_cpus)
return -EBUSY;
 
for (i = 0; i < VGIC_NR_IRQS; i++) {
@@ -1845,7 +1845,7 @@ static int vgic_init_maps(struct kvm *kvm)
int nr_cpus, nr_irqs;
int ret, i;
 
-   nr_cpus = dist->nr_cpus = VGIC_MAX_CPUS;
+   nr_cpus = dist->nr_cpus = KVM_MAX_VCPUS;
nr_irqs = dist->nr_irqs = VGIC_NR_IRQS;
 
ret  = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
-- 
2.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 12:47:16PM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 12:12, Gleb Natapov ha scritto:
> > On Thu, Sep 11, 2014 at 11:21:49AM +0200, Paolo Bonzini wrote:
> >> Il 11/09/2014 07:38, Tang Chen ha scritto:
> >>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> >>> index 63c4c3e..da6d55d 100644
> >>> --- a/arch/x86/kvm/vmx.c
> >>> +++ b/arch/x86/kvm/vmx.c
> >>> @@ -7093,6 +7093,11 @@ static void vmx_set_virtual_x2apic_mode(struct 
> >>> kvm_vcpu *vcpu, bool set)
> >>>   vmx_set_msr_bitmap(vcpu);
> >>>  }
> >>>  
> >>> +static void vmx_set_apic_access_page_addr(struct kvm *kvm, hpa_t hpa)
> >>> +{
> >>> + vmcs_write64(APIC_ACCESS_ADDR, hpa);
> >>
> >> This has to be guarded by "if (!is_guest_mode(vcpu))".
> >>
> > We do need to write it if L1 and L2 share APIC_ACCESS_ADDR and skip
> > it otherwise, no?
> 
> Yes, but this would be handled by patch 6:
> 
>   } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
> + struct page *page = gfn_to_page(vmx->vcpu.kvm,
> + APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
>   exec_control |=
>   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> - vmcs_write64(APIC_ACCESS_ADDR,
> - page_to_phys(vcpu->kvm->arch.apic_access_page));
> + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
> + /*
> +  * Do not pin apic access page in memory so that memory
> +  * hotplug process is able to migrate it.
> +  */
> + put_page(page);
>   }
This code is in prepare_vmcs02() and is executed during L1->L2 vmentry. What
happens when the apic access page is migrated while L2 is running? It needs to
be updated somewhere.

> 
> However, this is also useless code duplication because the above snippet could
> reuse vcpu_reload_apic_access_page too.
> 
> So I think you cannot do the is_guest_mode check in
> kvm_vcpu_reload_apic_access_page and also not in
> vmx_reload_apic_access_page.  But you could do something like
> 
> kvm_vcpu_reload_apic_access_page(...)
> {
>   ...
>   kvm_x86_ops->reload_apic_access_page(...);
> }
> EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
> 
> /* used in vcpu_enter_guest only */
> vcpu_reload_apic_access_page(...)
> {
>   if (!is_guest_mode(vcpu))
>   kvm_vcpu_reload_apic_access_page(...)
> }
> 
> Paolo

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/2] virtio-rng: fix stuck in catting hwrng attributes

2014-09-11 Thread Rusty Russell

Amos Kong  writes:
> When I check hwrng attributes in sysfs, cat process always gets
> stuck if guest has only 1 vcpu and uses a slow rng backend.
>
> Currently we check if there is any tasks waiting to be run on
> current cpu in rng_dev_read() by need_resched(). But need_resched()
> doesn't work because rng_dev_read() is executing in user context.

I don't understand this explanation?  I'd expect the sysfs process to be
woken by the mutex_unlock().

If we're really high priority (vs. the sysfs process) then I can see why
we'd need schedule_timeout_interruptible() instead of just schedule(),
and in that case, need_resched() would be false too.

You could argue that's intended behaviour, but I can't see how it
happens in the normal case anyway.

What am I missing?

Thanks,
Rusty.

> This patch removed need_resched() and increase delay to 10 jiffies,
> then other tasks can have chance to execute protected code.
> Delaying 1 jiffy also works, but 10 jiffies is safer.
>
> Signed-off-by: Amos Kong 
> ---
>  drivers/char/hw_random/core.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
> index c591d7e..b5d1b6f 100644
> --- a/drivers/char/hw_random/core.c
> +++ b/drivers/char/hw_random/core.c
> @@ -195,8 +195,7 @@ static ssize_t rng_dev_read(struct file *filp, char 
> __user *buf,
>  
>   mutex_unlock(&rng_mutex);
>  
> - if (need_resched())
> - schedule_timeout_interruptible(1);
> + schedule_timeout_interruptible(10);
>  
>   if (signal_pending(current)) {
>   err = -ERESTARTSYS;
> -- 
> 1.9.3
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Howto connect to a terminal in an emulated linux-livecd?

2014-09-11 Thread Oliver Rath

Hi list,

after hours of searching in google-world, I didn't find anything appropriate
for this problem:

I want to boot a live-cd (i.e. ubuntu 14.04.1-desktop) in qemu, which
starts with an graphical interface, done i.e. by

qemu-system-x86_64 -m 3G -smp 2 -drive
file=ubuntu-14.04.1-desktop-i386.iso,media=cdrom,if=virtio --enable-kvm

Now i want to access to the console of the ubuntu-livecd. At the moment
i can do this over changing to text mode via

sendkey ctrl-alt-f1

in qemu-console (Alt-2), then switching back to qemu-window (alt-1). Now
i have access to tty1 of my livecd.

But IMHO there should be a more simple way to access to such a console
with qemu, i.e. through a pipe, a serial console etc., but i didnt found
anything working. The best i got was with -chardev pty,id=myid, which
resulted in a "char device redirected to /dev/pts/0 (label myid)".  But
with a "screen /dev/pts/0" i wasnt able to see any input or output.

ssh is unfortunatly not available at this time on the livecd (so i could
connect i.e. via -net user,hostfwd:tcp:10022-:22)

Any hints to connect directly to a console in an emulated linux?

Tfh!
Oliver


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Eric Auger

On 09/11/2014 07:05 AM, Alex Williamson wrote:
> On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
>> On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
>>> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
>>>
>>> This is a new control channel which enables KVM to cooperate with
>>> viable VFIO devices.
>>>
>>> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
>>> in addition to a list of groups (kvm_vfio_group). The new
>>> infrastructure enables to check the validity of the VFIO device
>>> file descriptor, get and hold a reference to it.
>>>
>>> The first concrete implemented command is IRQ forward control:
>>> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
>>>
>>> It consists in programming the VFIO driver and KVM in a consistent manner
>>> so that an optimized IRQ injection/completion is set up. Each
>>> kvm_vfio_device holds a list of forwarded IRQ. When putting a
>>> kvm_vfio_device, the implementation makes sure the forwarded IRQs
>>> are set again in the normal handling state (non forwarded).
>>
>> 'putting a kvm_vfio_device' sounds to me like you're golf'ing :)
>>
>> When a kvm_vfio_device is released?
>>
>>>
>>> The forwarding programming is architecture specific, embodied by the
>>> kvm_arch_set_fwd_state function. Its implementation is given in a
>>> separate patch file.
>>
>> I would drop the last sentence and instead indicate that this is handled
>> properly when the architecture does not support such a feature.
>>
>>>
>>> The forwarding control modality is enabled by the
>>> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
>>>
>>> Signed-off-by: Eric Auger 
>>>
>>> ---
>>>
>>> v1 -> v2:
>>> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
>>> - original patch file separated into 2 parts: generic part moved in vfio.c
>>>   and ARM specific part(kvm_arch_set_fwd_state)
>>> ---
>>>  include/linux/kvm_host.h |  27 +++
>>>  virt/kvm/vfio.c  | 452 
>>> ++-
>>>  2 files changed, 477 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>>> index a4c33b3..24350dc 100644
>>> --- a/include/linux/kvm_host.h
>>> +++ b/include/linux/kvm_host.h
>>> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
>>>   unsigned long arg);
>>>  };
>>>  
>>> +enum kvm_fwd_irq_action {
>>> +   KVM_VFIO_IRQ_SET_FORWARD,
>>> +   KVM_VFIO_IRQ_SET_NORMAL,
>>> +   KVM_VFIO_IRQ_CLEANUP,
>>
>> This is KVM internal API, so it would probably be good to document this.
>> Especially the CLEANUP bit worries me, see below.
> 
> This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
Hi Alex,

will change that.
> Extra states worry me too.

I tried to explain the two motivations behind them. Please let me know if it
makes sense.
> 
>>> +};
>>> +
>>> +/* internal structure describing a forwarded IRQ */
>>> +struct kvm_fwd_irq {
>>> +   struct list_head link;
>>
>> this list entry is local to the kvm vfio device, right? that means you
>> probably want a struct with just the below fields, and then have a
>> containing struct in the generic device file, private to it's logic.
> 
> Yes, this is part of the abstraction problem.
OK will fix that.
> 
>>> +   __u32 index; /* platform device irq index */
> 
> This is a vfio_device irq_index, but vfio_devices support indexes and
> sub-indexes.  At this level the API should match vfio, not the specifics
> of platform devices not supporting sub-index.
I will add sub-indexes then.
> 
>>> +   __u32 hwirq; /*physical IRQ */
>>> +   __u32 gsi; /* virtual IRQ */
>>> +   struct kvm_vcpu *vcpu; /* vcpu to inject into*/
> 
> Not sure I understand why vcpu is necessary.
vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
virtual GIC. I can remove it from and add a vcpu struct * param to
kvm_arch_set_fwd_state if you prefer.

  Also I see a 'get' in the code below, but not a 'put'.
Sorry I do not understand your comment here? What 'get' do you mention?
> 
>>> +};
>>> +
>>>  void kvm_device_get(struct kvm_device *dev);
>>>  void kvm_device_put(struct kvm_device *dev);
>>>  struct kvm_device *kvm_device_from_filp(struct file *filp);
>>> @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
>>>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>>>  extern struct kvm_device_ops kvm_flic_ops;
>>>  
>>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
>>> +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
>>
>> what's the 'p' in pfwd?
> 
> p is for pointer?
yes it was ;-)
> 
>>> +  enum kvm_fwd_irq_action action);
>>> +
>>> +#else
>>> +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
>>> +enum kvm_fwd_irq_action action)
>>> +{
>>> +   return 0;
>>> +}
>>> +#endif
>>> +
>>>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>>>  
>>>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
>>> val)
>>> dif

Re: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery during eoi broadcast

2014-09-11 Thread Zhang Haoyu

>> Subject: [Qemu-devel] [PATCH] kvm: ioapic: conditionally delay irq delivery
>> duringeoi broadcast
>> 
>> Currently, we call ioapic_service() immediately when we find the irq is still
>> active during eoi broadcast. But for real hardware, there's some dealy 
>> between
>> the EOI writing and irq delivery (system bus latency?). So we need to emulate
>> this behavior. Otherwise, for a guest who haven't register a proper irq 
>> handler
>> , it would stay in the interrupt routine as this irq would be re-injected
>> immediately after guest enables interrupt. This would lead guest can't move
>> forward and may miss the possibility to get proper irq handler registered 
>> (one
>> example is windows guest resuming from hibernation).
>> 
>> As there's no way to differ the unhandled irq from new raised ones, this 
>> patch
>> solve this problems by scheduling a delayed work when the count of irq 
>> injected
>> during eoi broadcast exceeds a threshold value. After this patch, the guest 
>> can
>> move a little forward when there's no suitable irq handler in case it may
>> register one very soon and for guest who has a bad irq detection routine ( 
>> such
>> as note_interrupt() in linux ), this bad irq would be recognized soon as in 
>> the
>> past.
>> 
>> Cc: Michael S. Tsirkin 
>> Signed-off-by: Jason Wang 
>> Signed-off-by: Zhang Haoyu 
>> ---
>>  include/trace/events/kvm.h | 20 +++
>>  virt/kvm/ioapic.c  | 50
>> --
>>  virt/kvm/ioapic.h  |  6 ++
>>  3 files changed, 74 insertions(+), 2 deletions(-)
>> 
>If this is a new version, please add a v2/v3 suffix and describe the changes
>between those versions.
>
>You can get more information from:
>http://wiki.qemu.org/Contribute/SubmitAPatch
>
Okay, thanks.

>Best regards,
>-Gonglei

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 13:30, Gleb Natapov ha scritto:
>> > +  vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
>> > +  /*
>> > +   * Do not pin apic access page in memory so that memory
>> > +   * hotplug process is able to migrate it.
>> > +   */
>> > +  put_page(page);
>> >}
> This code is in prepare_vmcs02() and is executed during L1->L2 vmentry. What
> happens when the apic access page is migrated while L2 is running? It needs
> to be updated somewhere.

Before it is migrated, the MMU notifier is called and will force a
vmexit on all CPUs.  The reload code will call GUP again on the page
and swap it in.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: PPC: Convert openpic lock to raw_spinlock

2014-09-11 Thread Bogdan Purcareata

This patch enables running intensive I/O workloads, e.g. netperf, in a guest
deployed on a RT host. No change for !RT kernels.

The openpic spinlock becomes a sleeping mutex on a RT system. This no longer
guarantees that EPR is atomic with exception delivery. The guest VCPU thread
fails due to a BUG_ON(preemptible()) when running netperf.

In order to make the kvmppc_mpic_set_epr() call safe on RT from non-atomic
context, convert the openpic lock to a raw_spinlock. A similar approach can
be seen for x86 platforms in the following commit [1].

Here are some comparative cyclictest measurements run inside a high priority RT
guest run on a RT host. The guest has 1 VCPU and the test has been run for 15
minutes. The guest runs ~750 hackbench processes as background stress.

                  spinlock  raw_spinlock
Min latency (us)  4         4
Avg latency (us)  15        19
Max latency (us)  70        62

[1] https://lkml.org/lkml/2010/1/11/289

Signed-off-by: Bogdan Purcareata 
---
 arch/powerpc/kvm/mpic.c |   44 ++--
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 2861ae9..309036c 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -194,7 +194,7 @@ struct openpic {
int num_mmio_regions;
 
gpa_t reg_base;
-   spinlock_t lock;
+   raw_spinlock_t lock;
 
/* Behavior control */
struct fsl_mpic_info *fsl;
@@ -1105,9 +1105,9 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t 
addr,
mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
}
 
-   spin_unlock(&opp->lock);
+   raw_spin_unlock(&opp->lock);
kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
-   spin_lock(&opp->lock);
+   raw_spin_lock(&opp->lock);
 
break;
}
@@ -1182,12 +1182,12 @@ void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
int cpu = vcpu->arch.irq_cpu_id;
unsigned long flags;
 
-   spin_lock_irqsave(&opp->lock, flags);
+   raw_spin_lock_irqsave(&opp->lock, flags);
 
if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
 
-   spin_unlock_irqrestore(&opp->lock, flags);
+   raw_spin_unlock_irqrestore(&opp->lock, flags);
 }
 
 static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
@@ -1387,9 +1387,9 @@ static int kvm_mpic_read(struct kvm_io_device *this, 
gpa_t addr,
return -EINVAL;
}
 
-   spin_lock_irq(&opp->lock);
+   raw_spin_lock_irq(&opp->lock);
ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
-   spin_unlock_irq(&opp->lock);
+   raw_spin_unlock_irq(&opp->lock);
 
/*
 * Technically only 32-bit accesses are allowed, but be nice to
@@ -1427,10 +1427,10 @@ static int kvm_mpic_write(struct kvm_io_device *this, 
gpa_t addr,
return -EOPNOTSUPP;
}
 
-   spin_lock_irq(&opp->lock);
+   raw_spin_lock_irq(&opp->lock);
ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
  *(const u32 *)ptr);
-   spin_unlock_irq(&opp->lock);
+   raw_spin_unlock_irq(&opp->lock);
 
pr_debug("%s: addr %llx ret %d val %x\n",
 __func__, addr, ret, *(const u32 *)ptr);
@@ -1501,14 +1501,14 @@ static int access_reg(struct openpic *opp, gpa_t addr, 
u32 *val, int type)
if (addr & 3)
return -ENXIO;
 
-   spin_lock_irq(&opp->lock);
+   raw_spin_lock_irq(&opp->lock);
 
if (type == ATTR_SET)
ret = kvm_mpic_write_internal(opp, addr, *val);
else
ret = kvm_mpic_read_internal(opp, addr, val);
 
-   spin_unlock_irq(&opp->lock);
+   raw_spin_unlock_irq(&opp->lock);
 
pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
 
@@ -1545,9 +1545,9 @@ static int mpic_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
if (attr32 != 0 && attr32 != 1)
return -EINVAL;
 
-   spin_lock_irq(&opp->lock);
+   raw_spin_lock_irq(&opp->lock);
openpic_set_irq(opp, attr->attr, attr32);
-   spin_unlock_irq(&opp->lock);
+   raw_spin_unlock_irq(&opp->lock);
return 0;
}
 
@@ -1592,9 +1592,9 @@ static int mpic_get_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
if (attr->attr > MAX_SRC)
return -EINVAL;
 
-   spin_lock_irq(&opp->lock);
+   raw_spin_lock_irq(&opp->lock);
attr32 = opp->src[attr->attr].pending;
-   spin_unlock_irq(&opp->lock);
+   raw_spin_unlock_irq(&opp->lock);
 
if (put_user(attr32, (u32 __user *)(long)attr->addr))

[PATCH v2] kvm: ioapic: conditionally delay irq delivery duringeoi broadcast

2014-09-11 Thread Zhang Haoyu

Currently, we call ioapic_service() immediately when we find the irq is still
active during eoi broadcast. But for real hardware, there's some delay between
the EOI writing and irq delivery (system bus latency?). So we need to emulate
this behavior. Otherwise, a guest that hasn't registered a proper irq handler
would stay in the interrupt routine, as this irq would be re-injected
immediately after the guest enables interrupts. This would leave the guest
unable to move forward, and it may miss the chance to get a proper irq handler
registered (one example is a windows guest resuming from hibernation).

As there's no way to distinguish the unhandled irq from newly raised ones, this
patch solves this problem by scheduling a delayed work when the count of irqs
injected during eoi broadcast exceeds a threshold value. After this patch, the
guest can move a little forward when there's no suitable irq handler, in case
it registers one very soon; and for a guest that has a bad irq detection
routine (such as note_interrupt() in linux), this bad irq would still be
recognized soon, as in the past.

v1 -> v2:
  - delete the TODO comment
  - adjust the coding style to kernel style
  - add the missing "}" for if (ioapic->irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {

Cc: Michael S. Tsirkin 
Cc: Jan Kiszka 
Signed-off-by: Jason Wang 
Signed-off-by: Zhang Haoyu 
---
 include/trace/events/kvm.h | 20 +++
 virt/kvm/ioapic.c  | 50 --
 virt/kvm/ioapic.h  |  6 ++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 908925a..ab679c3 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -95,6 +95,26 @@ TRACE_EVENT(kvm_ioapic_set_irq,
  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+   TP_PROTO(__u64 e),
+   TP_ARGS(e),
+
+   TP_STRUCT__entry(
+   __field(__u64,  e   )
+   ),
+
+   TP_fast_assign(
+   __entry->e  = e;
+   ),
+
+   TP_printk("dst %x vec=%u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
 TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index e8ce34c..8e1dc67 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -405,6 +405,24 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int 
irq_source_id)
spin_unlock(&ioapic->lock);
 }
 
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+   int i;
+   struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+eoi_inject.work);
+   spin_lock(&ioapic->lock);
+   for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+   union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+   if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+   continue;
+
+   if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+   ioapic_service(ioapic, i, false);
+   }
+   spin_unlock(&ioapic->lock);
+}
+
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
@@ -435,8 +453,32 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
-   if (ioapic->irr & (1 << i))
-   ioapic_service(ioapic, i, false);
+   if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+   ++ioapic->irq_eoi[i];
+   if (ioapic->irq_eoi[i] == 
IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+   /*
+* Real hardware does not deliver the irq so
+* immediately during eoi broadcast, so we need
+* to emulate this behavior. Otherwise, for
+* guests who has not registered handler of a
+* level irq, this irq would be injected
+* immediately after guest enables interrupt
+* (which happens usually at the end of the
+* common interrupt routine). This would lead
+* guest can't move forward and may miss the
+* possibility to get proper irq h

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Erik Rull

> On August 6, 2014 at 1:19 PM Erik Rull  wrote:
>
>
> Hi all,
>
> I did already several tests and I'm not completely sure what's going wrong,
> but
> here my scenario:
>
> When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
> kernel
> 3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
> error.
> When I dump the CPU registers via "info registers", nothing changes, that
> means
> the system really stalled. Same happens with QEMU 2.0.0.
>
> But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0 on
> the host side it works on the Core2Duo. Also the system above but just with an
> i3 or i5 CPU it works, too.
>
> I already disabled networking and USB for the guest and changed the graphics
> card - no effect. I assume that some mean bits and bytes have to be set up
> properly to get the thing running.
>
> Any hint what to change / test would be really appreciated.
>
> Thanks in advance,
>
> Best regards,
>
> Erik
>

Hi all,

I opened a qemu bug report on that and Jan helped me create a kvm trace. I
attached it to the bug report.
https://bugs.launchpad.net/qemu/+bug/1366836

If you have further questions, please let me know.

Thanks.

Best regards,

Erik
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Jan Kiszka

On 2014-09-11 15:25, Erik Rull wrote:
>> On August 6, 2014 at 1:19 PM Erik Rull  wrote:
>>
>>
>> Hi all,
>>
>> I did already several tests and I'm not completely sure what's going wrong,
>> but
>> here my scenario:
>>
>> When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
>> kernel
>> 3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
>> error.
>> When I dump the CPU registers via "info registers", nothing changes, that
>> means
>> the system really stalled. Same happens with QEMU 2.0.0.
>>
>> But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0 on
>> the host side it works on the Core2Duo. Also the system above but just with 
>> an
>> i3 or i5 CPU it works, too.
>>
>> I already disabled networking and USB for the guest and changed the graphics
>> card - no effect. I assume that some mean bits and bytes have to be set up
>> properly to get the thing running.
>>
>> Any hint what to change / test would be really appreciated.
>>
>> Thanks in advance,
>>
>> Best regards,
>>
>> Erik
>>
> 
> Hi all,
> 
> I opened a qemu bug report on that and Jan helped me creating a kvm trace. I
> attached it to the bug report.
> https://bugs.launchpad.net/qemu/+bug/1366836
> 
> If you have further questions, please let me know.

"File possibly truncated. Need at least 346583040, but file size is
133414912."

Does "trace-cmd report" work for you? Is your file larger?

Again, please also validate the behavior on latest next branch from kvm.git.

Jan

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 03:05:05PM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 13:30, Gleb Natapov ha scritto:
> >> > +vmcs_write64(APIC_ACCESS_ADDR, 
> >> > page_to_phys(page));
> >> > +/*
> >> > + * Do not pin apic access page in memory so 
> >> > that memory
> >> > + * hotplug process is able to migrate it.
> >> > + */
> >> > +put_page(page);
> >> >  }
> > This code is in prepare_vmcs02() and is executed during L1->L2 vmentry. 
> > What happens
> > when apic access page is migrated while L2 is running? It needs to be 
> > update somewhere.
> 
> Before it is migrated, the MMU notifier is called and will force a
> vmexit on all CPUs.  The reload code will call GUP again on the page
> again and swap it in.
> 
This is how it will work without "if (!is_guest_mode(vcpu))". But,
unless I am missing something, with this check it will not work while
vcpu is in L2.

Suppose vmcs01->APIC_ACCESS_ADDR = 0xf000. During L2 entry
vmcs02->APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
but vcpu is in a guest mode so vmcs02->APIC_ACCESS_ADDR is never updated
to 0x8000 because of "if (!is_guest_mode(vcpu))" check. So what am I
missing here?

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 15:59, Gleb Natapov ha scritto:
> 
> Suppose vmcs01->APIC_ACCESS_ADDR = 0xf000. During L2 entry
> vmcs02->APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
> 0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
> but vcpu is in a guest mode so vmcs02->APIC_ACCESS_ADDR is never updated
> to 0x8000 because of "if (!is_guest_mode(vcpu))" check. So what am I
> missing here?

Right, guest mode isn't left as soon as you execute nested_vmx_vmexit,
because this isn't an L2->L1 exit.  So we need an "else" for that "if
(!is_guest_mode(vcpu))", in which case the hpa is ignored and
vmcs12->apic_access_addr is used instead?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] QEMU with KVM does not start Win8 on kernel 3.4.67 and core2duo

2014-09-11 Thread Erik Rull

> On September 11, 2014 at 3:32 PM Jan Kiszka  wrote:
>
>
> On 2014-09-11 15:25, Erik Rull wrote:
> >> On August 6, 2014 at 1:19 PM Erik Rull  wrote:
> >>
> >>
> >> Hi all,
> >>
> >> I did already several tests and I'm not completely sure what's going wrong,
> >> but
> >> here my scenario:
> >>
> >> When I start up QEMU w/ KVM 1.7.0 on a Core2Duo machine running a vanilla
> >> kernel
> >> 3.4.67 to run a Windows 8.0 guest, the guest freezes at boot without any
> >> error.
> >> When I dump the CPU registers via "info registers", nothing changes, that
> >> means
> >> the system really stalled. Same happens with QEMU 2.0.0.
> >>
> >> But - when I run the very same guest using Kernel 2.6.32.12 and QEMU 1.7.0
> >> on
> >> the host side it works on the Core2Duo. Also the system above but just with
> >> an
> >> i3 or i5 CPU it works, too.
> >>
> >> I already disabled networking and USB for the guest and changed the
> >> graphics
> >> card - no effect. I assume that some mean bits and bytes have to be set up
> >> properly to get the thing running.
> >>
> >> Any hint what to change / test would be really appreciated.
> >>
> >> Thanks in advance,
> >>
> >> Best regards,
> >>
> >> Erik
> >>
> >
> > Hi all,
> >
> > I opened a qemu bug report on that and Jan helped me creating a kvm trace. I
> > attached it to the bug report.
> > https://bugs.launchpad.net/qemu/+bug/1366836
> >
> > If you have further questions, please let me know.
>
> "File possibly truncated. Need at least 346583040, but file size is
> 133414912."
>
> Does "trace-cmd report" work for you? Is your file larger?
>
> Again, please also validate the behavior on latest next branch from kvm.git.
>
> Jan
>

Sorry for the corrupt file, it didn't work on my side either. I re-captured the
trace (trace-cmd report worked!) and posted it on the bugtracker. The kvm.git
test is in progress, my build system is already busy.

Best regards,

Erik
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 16:21, Gleb Natapov ha scritto:
> As far as I can tell the if that is needed there is:
> 
> if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control & 
> ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> write(PIC_ACCESS_ADDR)
> 
> In other words if L2 shares L1 apic access page then reload, otherwise do 
> nothing.

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
need to "do something".

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:06:58PM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 15:59, Gleb Natapov ha scritto:
> > 
> > Suppose vmcs01->APIC_ACCESS_ADDR = 0xf000. During L2 entry
> > vmcs02->APIC_ACCESS_ADDR is set to 0xf000 too (by prepare_vmcs02). Now
> > 0xf000 is migrated to 0x8000, mmu notifier is called, it forces vmexit,
> > but vcpu is in a guest mode so vmcs02->APIC_ACCESS_ADDR is never updated
> > to 0x8000 because of "if (!is_guest_mode(vcpu))" check. So what am I
> > missing here?
> 
> Right, guest mode isn't left as soon as you execute nested_vmx_vmexit,
> because this isn't an L2->L1 exit.  So we need an "else" for that "if
> (!is_guest_mode(vcpu))", in which case the hpa is ignored and
> vmcs12->apic_access_addr is used instead?
> 
As far as I can tell the if that is needed there is:

if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control &
                          SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
        write(APIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:24:04PM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 16:21, Gleb Natapov ha scritto:
> > As far as I can tell the if that is needed there is:
> > 
> > if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control & 
> > ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> > write(PIC_ACCESS_ADDR)
> > 
> > In other words if L2 shares L1 apic access page then reload, otherwise do 
> > nothing.
> 
> What if the page being swapped out is L1's APIC access page?  We don't
> run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
> need to "do something".
We will do something on L2->L1 exit. We will call kvm_reload_apic_access_page().
That is what patch 5 of this series is doing.

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Paolo Bonzini

Il 11/09/2014 16:31, Gleb Natapov ha scritto:
>> > What if the page being swapped out is L1's APIC access page?  We don't
>> > run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
>> > need to "do something".
> We will do something on L2->L1 exit. We will call 
> kvm_reload_apic_access_page().
> That is what patch 5 of this series is doing.

Sorry, I meant "the APIC access page prepared by L1" for L2's execution.

You wrote:

> if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control & 
> ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> write(PIC_ACCESS_ADDR)
> 
> In other words if L2 shares L1 apic access page then reload, otherwise do 
> nothing.

but in that case you have to redo nested_get_page, so "do nothing"
doesn't work.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread Gleb Natapov

On Thu, Sep 11, 2014 at 04:37:39PM +0200, Paolo Bonzini wrote:
> Il 11/09/2014 16:31, Gleb Natapov ha scritto:
> >> > What if the page being swapped out is L1's APIC access page?  We don't
> >> > run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
> >> > need to "do something".
> > We will do something on L2->L1 exit. We will call 
> > kvm_reload_apic_access_page().
> > That is what patch 5 of this series is doing.
> 
> Sorry, I meant "the APIC access page prepared by L1" for L2's execution.
> 
> You wrote:
> 
> > if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control & 
> > ECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> > write(PIC_ACCESS_ADDR)
> > 
> > In other words if L2 shares L1 apic access page then reload, otherwise do 
> > nothing.
> 
> but in that case you have to redo nested_get_page, so "do nothing"
> doesn't work.
> 
Ah, 7/7 is new in this submission. Before that this page was still
pinned.  Looking at 7/7 now I do not see how it can work since it has no
code for mmu notifier to detect that it deals with such page and call
kvm_reload_apic_access_page().  I said to Tang previously that nested
kvm has a bunch of pinned pages that are hard to deal with and suggested
ironing out the non-nested case first :(

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 1/4] kvmtool: ARM: Use KVM_ARM_PREFERRED_TARGET vm ioctl to determine target cpu

2014-09-11 Thread Andre Przywara

Hi Anup,

On 08/09/14 09:17, Anup Patel wrote:
> Instead of trying out each and every target type we should
> use KVM_ARM_PREFERRED_TARGET vm ioctl to determine target type
> for KVM ARM/ARM64.
> 
> If KVM_ARM_PREFERRED_TARGET vm ioctl fails then we fallback to
> old method of trying all known target types.
> 
> If KVM_ARM_PREFERRED_TARGET vm ioctl succeeds but the returned
> target type is not known to KVMTOOL then we forcefully init
> VCPU with target type returned by KVM_ARM_PREFERRED_TARGET vm ioctl.
> 
> Signed-off-by: Pranavkumar Sawargaonkar 
> Signed-off-by: Anup Patel 
> ---
>  tools/kvm/arm/kvm-cpu.c |   52 
> +--
>  1 file changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c
> index aeaa4cf..ba7a762 100644
> --- a/tools/kvm/arm/kvm-cpu.c
> +++ b/tools/kvm/arm/kvm-cpu.c
> @@ -33,7 +33,8 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
> unsigned long cpu_id)
>   struct kvm_arm_target *target;
>   struct kvm_cpu *vcpu;
>   int coalesced_offset, mmap_size, err = -1;
> - unsigned int i;
> + unsigned int i, target_type;
> + struct kvm_vcpu_init preferred_init;
>   struct kvm_vcpu_init vcpu_init = {
>   .features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id)
>   };
> @@ -55,19 +56,47 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
> unsigned long cpu_id)
>   if (vcpu->kvm_run == MAP_FAILED)
>   die("unable to mmap vcpu fd");
>  
> - /* Find an appropriate target CPU type. */
> - for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
> - if (!kvm_arm_targets[i])
> - continue;
> - target = kvm_arm_targets[i];
> - vcpu_init.target = target->id;
> - err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);
> - if (!err)
> - break;
> + /*
> +  * If preferred target ioctl successful then use preferred target
> +  * else try each and every target type.
> +  */
> + err = ioctl(kvm->vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init);
> + if (!err) {
> + /* Match preferred target CPU type. */
> + target = NULL;
> + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
> + if (!kvm_arm_targets[i])
> + continue;
> + if (kvm_arm_targets[i]->id == preferred_init.target) {
> + target = kvm_arm_targets[i];
> + target_type = kvm_arm_targets[i]->id;
> + break;
> + }
> + }
> + if (!target) {
> + target = kvm_arm_targets[0];

I think you missed the part of the patch which adds the now magic zero
member of kvm_arm_targets[]. A simple static initializer should work.

> + target_type = preferred_init.target;

Can't you move that out of the loop, in front of it actually? Then you
can get rid of the line above setting the target_type also, since you
always use the same value now, regardless whether you found that CPU in
the list or not.

> + }
> + } else {
> + /* Find an appropriate target CPU type. */
> + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
> + if (!kvm_arm_targets[i])
> + continue;
> + target = kvm_arm_targets[i];
> + target_type = target->id;
> + vcpu_init.target = target_type;
> + err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, 
> &vcpu_init);
> + if (!err)
> + break;
> + }
> + if (err)
> + die("Unable to find matching target");
>   }
>  
> + vcpu_init.target = target_type;
> + err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);

You should do this only in the if-branch above, since you (try to) call
KVM_ARM_VCPU_INIT already in the else branch before. Otherwise in the
latter case you would do it twice.

Regards,
Andre.

>   if (err || target->init(vcpu))
> - die("Unable to initialise ARM vcpu");
> + die("Unable to initialise vcpu");
>  
>   coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION,
>KVM_CAP_COALESCED_MMIO);
> @@ -81,6 +110,7 @@ struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, 
> unsigned long cpu_id)
>   vcpu->cpu_type  = target->id;
>   vcpu->cpu_compatible= target->compatible;
>   vcpu->is_running= true;
> +
>   return vcpu;
>  }
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 14:04 +0200, Eric Auger wrote:
> On 09/11/2014 07:05 AM, Alex Williamson wrote:
> > On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> >> On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> >>> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
> >>>
> >>> This is a new control channel which enables KVM to cooperate with
> >>> viable VFIO devices.
> >>>
> >>> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
> >>> in addition to a list of groups (kvm_vfio_group). The new
> >>> infrastructure enables to check the validity of the VFIO device
> >>> file descriptor, get and hold a reference to it.
> >>>
> >>> The first concrete implemented command is IRQ forward control:
> >>> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
> >>>
> >>> It consists in programing the VFIO driver and KVM in a consistent manner
> >>> so that an optimized IRQ injection/completion is set up. Each
> >>> kvm_vfio_device holds a list of forwarded IRQ. When putting a
> >>> kvm_vfio_device, the implementation makes sure the forwarded IRQs
> >>> are set again in the normal handling state (non forwarded).
> >>
> >> 'putting a kvm_vfio_device' sounds to me like you're golf'ing :)
> >>
> >> When a kvm_vfio_device is released?
> >>
> >>>
> >>> The forwarding programming is architecture specific, embodied by the
> >>> kvm_arch_set_fwd_state function. Its implementation is given in a
> >>> separate patch file.
> >>
> >> I would drop the last sentence and instead indicate that this is handled
> >> properly when the architecture does not support such a feature.
> >>
> >>>
> >>> The forwarding control modality is enabled by the
> >>> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
> >>>
> >>> Signed-off-by: Eric Auger 
> >>>
> >>> ---
> >>>
> >>> v1 -> v2:
> >>> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> >>> - original patch file separated into 2 parts: generic part moved in vfio.c
> >>>   and ARM specific part(kvm_arch_set_fwd_state)
> >>> ---
> >>>  include/linux/kvm_host.h |  27 +++
> >>>  virt/kvm/vfio.c  | 452 
> >>> ++-
> >>>  2 files changed, 477 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >>> index a4c33b3..24350dc 100644
> >>> --- a/include/linux/kvm_host.h
> >>> +++ b/include/linux/kvm_host.h
> >>> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
> >>> unsigned long arg);
> >>>  };
> >>>  
> >>> +enum kvm_fwd_irq_action {
> >>> + KVM_VFIO_IRQ_SET_FORWARD,
> >>> + KVM_VFIO_IRQ_SET_NORMAL,
> >>> + KVM_VFIO_IRQ_CLEANUP,
> >>
> >> This is KVM internal API, so it would probably be good to document this.
> >> Especially the CLEANUP bit worries me, see below.
> > 
> > This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
> Hi Alex,
> 
> will change that.
> > Extra states worry me too.
> 
> I tried to explain the 2 motivations behind it. Please let me know if it
> makes sense.

Not really.  It seems like it's just a leak of arch specific handling
out into common code.

> >>> +};
> >>> +
> >>> +/* internal structure describing a forwarded IRQ */
> >>> +struct kvm_fwd_irq {
> >>> + struct list_head link;
> >>
> >> this list entry is local to the kvm vfio device, right? that means you
> >> probably want a struct with just the below fields, and then have a
> >> containing struct in the generic device file, private to its logic.
> > 
> > Yes, this is part of the abstraction problem.
> OK will fix that.
> > 
> >>> + __u32 index; /* platform device irq index */
> > 
> > This is a vfio_device irq_index, but vfio_devices support indexes and
> > sub-indexes.  At this level the API should match vfio, not the specifics
> > of platform devices not supporting sub-index.
> I will add sub-indexes then.
> > 
> >>> + __u32 hwirq; /*physical IRQ */
> >>> + __u32 gsi; /* virtual IRQ */
> >>> + struct kvm_vcpu *vcpu; /* vcpu to inject into*/
> > 
> > Not sure I understand why vcpu is necessary.
> vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
> virtual GIC. I can remove it from the struct and add a vcpu struct * param to
> kvm_arch_set_fwd_state if you prefer.

The kvm-vfio API for this interface doesn't allow the user to indicate
which vcpu to inject to.  On x86, it would be the programming of the
interrupt controller that would decide that.  In the code here we
arbitrarily pick vcpu0.  It feels both architecture specific and a bit
unspecified.

> 
>   Also I see a 'get' in the code below, but not a 'put'.
> Sorry I do not understand your comment here? What 'get' do you mention?

I suppose vcpus don't subscribe to the get/put philosophy, I was
expecting a reference count, but there is none.  How do we know that
vcpu pointer is still valid later?

> > 
> >>> +};
> >>> +
> >>>  void kvm_device_get(struct kvm_device *dev);
> >>>  void kvm_device_put(struct kvm_device *dev);
> >>>  struct kvm_device *

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 11:35 +0200, Eric Auger wrote:
> On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> >> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
> >>
> >> This is a new control channel which enables KVM to cooperate with
> >> viable VFIO devices.
> >>
> >> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
> >> in addition to a list of groups (kvm_vfio_group). The new
> >> infrastructure enables to check the validity of the VFIO device
> >> file descriptor, get and hold a reference to it.
> >>
> >> The first concrete implemented command is IRQ forward control:
> >> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
> >>
> >> It consists in programing the VFIO driver and KVM in a consistent manner
> >> so that an optimized IRQ injection/completion is set up. Each
> >> kvm_vfio_device holds a list of forwarded IRQ. When putting a
> >> kvm_vfio_device, the implementation makes sure the forwarded IRQs
> >> are set again in the normal handling state (non forwarded).
> > 
> > 'putting a kvm_vfio_device' sounds to me like you're golf'ing :)
> > 
> > When a kvm_vfio_device is released?
> sure
> > 
> >>
> >> The forwarding programming is architecture specific, embodied by the
> >> kvm_arch_set_fwd_state function. Its implementation is given in a
> >> separate patch file.
> > 
> > I would drop the last sentence and instead indicate that this is handled
> > properly when the architecture does not support such a feature.
> ok
> > 
> >>
> >> The forwarding control modality is enabled by the
> >> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
> >>
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >>
> >> v1 -> v2:
> >> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> >> - original patch file separated into 2 parts: generic part moved in vfio.c
> >>   and ARM specific part(kvm_arch_set_fwd_state)
> >> ---
> >>  include/linux/kvm_host.h |  27 +++
> >>  virt/kvm/vfio.c  | 452 
> >> ++-
> >>  2 files changed, 477 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index a4c33b3..24350dc 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
> >>  unsigned long arg);
> >>  };
> >>  
> >> +enum kvm_fwd_irq_action {
> >> +  KVM_VFIO_IRQ_SET_FORWARD,
> >> +  KVM_VFIO_IRQ_SET_NORMAL,
> >> +  KVM_VFIO_IRQ_CLEANUP,
> > 
> > This is KVM internal API, so it would probably be good to document this.
> > Especially the CLEANUP bit worries me, see below.
> I will document it
> > 
> >> +};
> >> +
> >> +/* internal structure describing a forwarded IRQ */
> >> +struct kvm_fwd_irq {
> >> +  struct list_head link;
> > 
> > this list entry is local to the kvm vfio device, right? that means you
> > probably want a struct with just the below fields, and then have a
> > containing struct in the generic device file, private to its logic.
> I will introduce 2 separate structs
> > 
> >> +  __u32 index; /* platform device irq index */
> >> +  __u32 hwirq; /*physical IRQ */
> >> +  __u32 gsi; /* virtual IRQ */
> >> +  struct kvm_vcpu *vcpu; /* vcpu to inject into*/
> >> +};
> >> +
> >>  void kvm_device_get(struct kvm_device *dev);
> >>  void kvm_device_put(struct kvm_device *dev);
> >>  struct kvm_device *kvm_device_from_filp(struct file *filp);
> >> @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
> >>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> >>  extern struct kvm_device_ops kvm_flic_ops;
> >>  
> >> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> >> +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> > 
> > what's the 'p' in pfwd?
> will rename
> > 
> >> + enum kvm_fwd_irq_action action);
> >> +
> >> +#else
> >> +static inline int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> >> +   enum kvm_fwd_irq_action action)
> >> +{
> >> +  return 0;
> >> +}
> >> +#endif
> >> +
> >>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >>  
> >>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool 
> >> val)
> >> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> >> index 76dc7a1..e4a81c4 100644
> >> --- a/virt/kvm/vfio.c
> >> +++ b/virt/kvm/vfio.c
> >> @@ -18,14 +18,24 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >>  
> >>  struct kvm_vfio_group {
> >>struct list_head node;
> >>struct vfio_group *vfio_group;
> >>  };
> >>  
> >> +struct kvm_vfio_device {
> >> +  struct list_head node;
> >> +  struct vfio_device *vfio_device;
> >> +  /* list of forwarded IRQs for that VFIO device */
> >> +  struct list_head fwd_irq_list;
> >> +  int fd;
> >> +};
> >> +
> >>  struct kvm_vfio {
> >>struct list_head group_list;
> >> +  struct list_head device_list;
> >>struct mutex lock;
> >>bool noncoherent;
>

Re: [PATCH v3 2/4] kvmtool: ARM64: Add target type potenza for aarch64

2014-09-11 Thread Andre Przywara

Anup,

On 08/09/14 09:17, Anup Patel wrote:
> The VCPU target type KVM_ARM_TARGET_XGENE_POTENZA is available
> in latest Linux-3.16-rcX or higher hence register aarch64 target
> type for it.
>
> This patch enables us to run KVMTOOL on X-Gene Potenza host.

Why do you need this still if the previous patch got rid of the need for
naming each and every CPU in kvmtool?
Do you care about kernels older than 3.12? I wouldn't bother so much
since you'd need a much newer kvmtool anyway.

Can you consider dropping this patch then?
I'd rather avoid adding CPUs to this list needlessly from now on.

Regards,
Andre.

> 
> Signed-off-by: Pranavkumar Sawargaonkar 
> Signed-off-by: Anup Patel 
> ---
>  tools/kvm/arm/aarch64/arm-cpu.c |9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/kvm/arm/aarch64/arm-cpu.c b/tools/kvm/arm/aarch64/arm-cpu.c
> index ce5ea2f..ce526e3 100644
> --- a/tools/kvm/arm/aarch64/arm-cpu.c
> +++ b/tools/kvm/arm/aarch64/arm-cpu.c
> @@ -41,10 +41,17 @@ static struct kvm_arm_target target_cortex_a57 = {
>   .init   = arm_cpu__vcpu_init,
>  };
>  
> +static struct kvm_arm_target target_potenza = {
> + .id = KVM_ARM_TARGET_XGENE_POTENZA,
> + .compatible = "arm,arm-v8",
> + .init   = arm_cpu__vcpu_init,
> +};
> +
>  static int arm_cpu__core_init(struct kvm *kvm)
>  {
>   return (kvm_cpu__register_kvm_arm_target(&target_aem_v8) ||
>   kvm_cpu__register_kvm_arm_target(&target_foundation_v8) ||
> - kvm_cpu__register_kvm_arm_target(&target_cortex_a57));
> + kvm_cpu__register_kvm_arm_target(&target_cortex_a57) ||
> + kvm_cpu__register_kvm_arm_target(&target_potenza));
>  }
>  core_init(arm_cpu__core_init);
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 3/4] kvmtool: Handle exit reason KVM_EXIT_SYSTEM_EVENT

2014-09-11 Thread Andre Przywara


On 08/09/14 09:17, Anup Patel wrote:
> The KVM_EXIT_SYSTEM_EVENT exit reason was added to define
> architecture independent system-wide events for a Guest.
> 
> Currently, it is used by in-kernel PSCI-0.2 emulation of
> KVM ARM/ARM64 to inform user space about PSCI SYSTEM_OFF
> or PSCI SYSTEM_RESET request.
> 
> For now, we simply treat all system-wide guest events as
> shutdown request in KVMTOOL.

Is that really a good idea to default to exit_kvm?
I find a shutdown a rather drastic default.
Also I'd like to see RESET not easily mapped to shutdown. If the user
resets the box explicitly, it's probably expected to come up again (to
load an updated kernel or proceed with an install).
So what about a more explicit message like: "... please restart the VM"
until we gain proper reboot support in kvmtool?

Regards,
Andre.

> Signed-off-by: Pranavkumar Sawargaonkar 
> Signed-off-by: Anup Patel 
> ---
>  tools/kvm/kvm-cpu.c |   19 +++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
> index ee0a8ec..6d01192 100644
> --- a/tools/kvm/kvm-cpu.c
> +++ b/tools/kvm/kvm-cpu.c
> @@ -160,6 +160,25 @@ int kvm_cpu__start(struct kvm_cpu *cpu)
>   goto exit_kvm;
>   case KVM_EXIT_SHUTDOWN:
>   goto exit_kvm;
> + case KVM_EXIT_SYSTEM_EVENT:
> + /*
> +  * Print the type of system event and
> +  * treat all system events as shutdown request.
> +  */
> + switch (cpu->kvm_run->system_event.type) {
> + case KVM_SYSTEM_EVENT_SHUTDOWN:
> + printf("  # Info: shutdown system event\n");
> + break;
> + case KVM_SYSTEM_EVENT_RESET:
> + printf("  # Info: reset system event\n");
> + break;
> + default:
> + printf("  # Warning: unknown system event 
> type=%d\n",
> +cpu->kvm_run->system_event.type);
> + break;
> + };
> + printf("  # Info: exiting KVMTOOL\n");
> + goto exit_kvm;
>   default: {
>   bool ret;
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3] ARM: KVM: add irqfd support

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:14:13AM +0200, Eric Auger wrote:
> On 09/11/2014 05:09 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 10:53:04AM +0200, Eric Auger wrote:
> >> This patch enables irqfd on ARM.
> >>
> >> irqfd framework enables to inject a virtual IRQ into a guest upon an
> >> eventfd trigger. User-side uses KVM_IRQFD VM ioctl to provide KVM with
> >> a kvm_irqfd struct that associates a VM, an eventfd, a virtual IRQ number
> >> (aka. the gsi). When an actor signals the eventfd (typically a VFIO
> >> platform driver), the kvm irqfd subsystem injects the provided virtual
> >> IRQ into the guest.
> >>
> >> Resamplefd also is supported for level sensitive interrupts, ie. the
> >> user can provide another eventfd that is triggered when the completion
> >> of the virtual IRQ (gsi) is detected by the GIC.
> >>
> >> The gsi must correspond to a shared peripheral interrupt (SPI), ie the
> >> GIC interrupt ID is gsi+32.
> >>
> >> this patch enables CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD.
> >> CONFIG_HAVE_KVM_IRQCHIP is removed. No IRQ routing table is used
> >> (irqchip.c and irqcomm.c are not used).
> >>
> >> Both KVM_CAP_IRQFD & KVM_CAP_IRQFD_RESAMPLE capabilities are exposed
> >>
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >>
> >> This patch serie deprecates the previous serie featuring GSI routing
> >> (https://patches.linaro.org/32261/)
> >>
> >> The patch serie has the following dependencies:
> >> - arm/arm64: KVM: Various VGIC cleanups and improvements
> >>   https://lists.cs.columbia.edu/pipermail/kvmarm/2014-June/009979.html
> >> - "KVM: EVENTFD: remove inclusion of irq.h"
> >>
> >> All pieces can be found on git://git.linaro.org/people/eric.auger/linux.git
> >> branch irqfd_norouting_integ_v3
> >>
> >> This work was tested with Calxeda Midway xgmac main interrupt with
> >> qemu-system-arm and QEMU VFIO platform device.
> >>
> >> v2 -> v3:
> >> - removal of irq.h from eventfd.c put in a separate patch to increase
> >>   visibility
> >> - properly expose KVM_CAP_IRQFD capability in arm.c
> >> - remove CONFIG_HAVE_KVM_IRQCHIP meaningfull only if irq_comm.c is used
> >>
> >> v1 -> v2:
> >> - rebase on 3.17rc1
> >> - move of the dist unlock in process_maintenance
> >> - remove of dist lock in __kvm_vgic_sync_hwstate
> >> - rewording of the commit message (add resamplefd reference)
> >> - remove irq.h
> >> ---
> >>  Documentation/virtual/kvm/api.txt |  5 +++-
> >>  arch/arm/include/uapi/asm/kvm.h   |  3 +++
> >>  arch/arm/kvm/Kconfig  |  4 +--
> >>  arch/arm/kvm/Makefile |  2 +-
> >>  arch/arm/kvm/arm.c|  3 +++
> >>  virt/kvm/arm/vgic.c   | 56 
> >> ---
> >>  6 files changed, 65 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/Documentation/virtual/kvm/api.txt 
> >> b/Documentation/virtual/kvm/api.txt
> >> index beae3fd..8118b12 100644
> >> --- a/Documentation/virtual/kvm/api.txt
> >> +++ b/Documentation/virtual/kvm/api.txt
> >> @@ -2204,7 +2204,7 @@ into the hash PTE second double word).
> >>  4.75 KVM_IRQFD
> >>  
> >>  Capability: KVM_CAP_IRQFD
> >> -Architectures: x86 s390
> >> +Architectures: x86 s390 arm
> >>  Type: vm ioctl
> >>  Parameters: struct kvm_irqfd (in)
> >>  Returns: 0 on success, -1 on error
> >> @@ -2230,6 +2230,9 @@ Note that closing the resamplefd is not sufficient 
> >> to disable the
> >>  irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
> >>  and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
> >>  
> >> +On ARM/arm64 the injected must be a shared peripheral interrupt (SPI).
> >> +This means the programmed GIC interrupt ID is gsi+32.
> >> +
> > 
> > See above comment.
> Hi Christoffer,
> 
> sorry which comment do you refer to?

good question, I thought I had a comment above, just disregard.

> wrt your last comment do you
> consider PPI injection support is a mandated feature for this patch to
> be upstreamable?

well, right now, the only reason it's not supported is "we didn't bother
thinking about it or doing it" and I haven't heard a valid reason for
why we should be designing a new user space API etc. without supporting
PPIs.

So yes, either argue why it's better to not include PPI support in the
first round, why we never need to, or just support it ;)

> > 
> >>  4.76 KVM_PPC_ALLOCATE_HTAB
> >>  
> >>  Capability: KVM_CAP_PPC_ALLOC_HTAB
> >> diff --git a/arch/arm/include/uapi/asm/kvm.h 
> >> b/arch/arm/include/uapi/asm/kvm.h
> >> index e6ebdd3..3034c66 100644
> >> --- a/arch/arm/include/uapi/asm/kvm.h
> >> +++ b/arch/arm/include/uapi/asm/kvm.h
> >> @@ -194,6 +194,9 @@ struct kvm_arch_memory_slot {
> >>  /* Highest supported SPI, from VGIC_NR_IRQS */
> >>  #define KVM_ARM_IRQ_GIC_MAX   127
> >>  
> >> +/* One single KVM irqchip, ie. the VGIC */
> >> +#define KVM_NR_IRQCHIPS  1
> >> +
> >>  /* PSCI interface */
> >>  #define KVM_PSCI_FN_BASE  0x95c1ba5e
> >>  #define KVM_PSCI_FN(n)(KVM_PSCI

Re: [BUG] Guest kernel divide error in kvm_unlock_kick

2014-09-11 Thread Chris Webb

Paolo Bonzini  wrote:

> This is a hypercall that should have kicked VCPU 3 (see rcx).
> 
> Can you please apply this patch and gather a trace of the host
> (using "trace-cmd -e kvm qemu-kvm ")?

Sure, no problem. I've built the trace-cmd tool against udis86 (I hope) and
have put the resulting trace.dat at

  http://cdw.me.uk/tmp/trace.dat

This is actually for a -smp 2 qemu (failing to kick VCPU 1?) as I was having
trouble persuading the -smp 4 qemu to crash as reliably under tracing.
(Something timing related?) Otherwise the qemu-system-x86 command line is
exactly as before.

The guest kernel crash message which corresponds to this trace was:

divide error:  [#1] PREEMPT SMP 
Modules linked in:
CPU: 0 PID: 618 Comm: mkdir Not tainted 3.16.2-guest #2
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
task: 88007c997080 ti: 88007c614000 task.ti: 88007c614000
RIP: 0010:[]  [] kvm_unlock_kick+0x72/0x80
RSP: 0018:88007c617d40  EFLAGS: 00010046
RAX: 0005 RBX:  RCX: 0001
RDX: 0001 RSI: 88007fd11c40 RDI: 
RBP: 88007fd11c40 R08: 81b98940 R09: 0001
R10:  R11: 0007 R12: 00f6
R13: 0001 R14: 0001 R15: 00011c40
FS:  7f43eb1ed700() GS:88007fc0() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 7f43eace0a30 CR3: 01a12000 CR4: 000406f0
Stack:
 88007c994380 88007c9949aa 0046 81689715
 810f3174 0001 ea0001f16320 ea0001f17860
  88007c99e1e8 88007c997080 0001
Call Trace:
 [] ? _raw_spin_unlock+0x45/0x70
 [] ? try_to_wake_up+0x2a4/0x330
 [] ? __wake_up_common+0x4c/0x80
 [] ? __wake_up_sync_key+0x38/0x60
 [] ? do_notify_parent+0x19a/0x280
 [] ? sched_move_task+0xb6/0x190
 [] ? do_exit+0xa1c/0xab0
 [] ? do_group_exit+0x34/0xb0
 [] ? SyS_exit_group+0xb/0x10
 [] ? system_call_fastpath+0x1a/0x1f
Code: c0 ca a7 81 48 8d 04 0b 48 8b 30 48 39 ee 75 c9 0f b6 40 08 44 38 e0 75 
c0 48 c7 c0 22 b0 00 00 31 db 0f b7 0c 08 b8 05 00 00 00 <0f> 01 c1 0f 1f 00 5b 
5d 41 5c c3 0f 1f 00 48 c7 c0 10 cf 00 00 
RIP  [] kvm_unlock_kick+0x72/0x80
 RSP 
---[ end trace bf5a4445f9decdbb ]---
Fixing recursive fault but reboot is needed!
BUG: scheduling while atomic: mkdir/618/0x0006
Modules linked in:
CPU: 0 PID: 618 Comm: mkdir Tainted: G  D   3.16.2-guest #2
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
  c022d302 81684029 
 810ee956 81686266 00011c40 88007c617fd8
 00011c40 88007c997080 0006 0046
Call Trace:
 [] ? dump_stack+0x49/0x6a
 [] ? __schedule_bug+0x46/0x60
 [] ? __schedule+0x5a6/0x7c0
 [] ? printk+0x59/0x75
 [] ? do_exit+0x85b/0xab0
 [] ? printk+0x59/0x75
 [] ? oops_end+0x7a/0x100
 [] ? do_error_trap+0x85/0x110
 [] ? kvm_unlock_kick+0x72/0x80
 [] ? __alloc_pages_nodemask+0x108/0xa60
 [] ? divide_error+0x1e/0x30
 [] ? kvm_unlock_kick+0x72/0x80
 [] ? _raw_spin_unlock+0x45/0x70
 [] ? try_to_wake_up+0x2a4/0x330
 [] ? __wake_up_common+0x4c/0x80
 [] ? __wake_up_sync_key+0x38/0x60
 [] ? do_notify_parent+0x19a/0x280
 [] ? sched_move_task+0xb6/0x190
 [] ? do_exit+0xa1c/0xab0
 [] ? do_group_exit+0x34/0xb0
 [] ? SyS_exit_group+0xb/0x10
 [] ? system_call_fastpath+0x1a/0x1f

Best wishes,

Chris.--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:44:02AM +0200, Eric Auger wrote:
> On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
> >> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
> >> need to disable the IRQ anymore. In that mode, when the handler completes
> > 
> > add a comma after completes
> Hi Christoffer,
> ok
> > 
> >> the IRQ is not deactivated but only its priority is lowered.
> >>
> >> Some other actor (typically a guest) is supposed to deactivate the IRQ,
> >> allowing at that time a new physical IRQ to hit.
> >>
> >> In virtualization use case, the physical IRQ is automatically completed
> >> by the interrupt controller when the guest completes the corresponding
> >> virtual IRQ.
> >>
> >> Signed-off-by: Eric Auger 
> >> ---
> >>  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
> >>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> >> b/drivers/vfio/platform/vfio_platform_irq.c
> >> index 6768508..1f851b2 100644
> >> --- a/drivers/vfio/platform/vfio_platform_irq.c
> >> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> >> @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
> >> *dev_id)
> >>struct vfio_platform_irq *irq_ctx = dev_id;
> >>unsigned long flags;
> >>int ret = IRQ_NONE;
> >> +  struct irq_data *d;
> >> +  bool is_forwarded;
> >>  
> >>spin_lock_irqsave(&irq_ctx->lock, flags);
> >>  
> >>if (!irq_ctx->masked) {
> >>ret = IRQ_HANDLED;
> >> +  d = irq_get_irq_data(irq_ctx->hwirq);
> >> +  is_forwarded = irqd_irq_forwarded(d);
> >>  
> >> -  if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED) {
> >> +  if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED &&
> >> +  !is_forwarded) {
> >>disable_irq_nosync(irq_ctx->hwirq);
> >>irq_ctx->masked = true;
> >>}
> >> -- 
> >> 1.9.1
> >>
> > > It makes sense that this all needs to be controlled in the kernel, but
> > I'm wondering if it would be cleaner / more correct to clear the
> > AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
> > this flag as long as the irq is forwarded?
> 
> If I am not wrong, even if the user sets AUTOMASKED, this info never is
> exploited by the vfio platform driver. AUTOMASKED only is set internally
> to the driver, on init, for level sensitive IRQs.
> 
> It seems to be the same on PCI (for INTx). I do not see anywhere the
> user flag curectly copied into a local storage. But I prefer to be
> careful ;-)
> 
> If confirmed, although the flag value is exposed in the user API, the
> user set value never is exploited so this removes the need to check.
> 
> the forwarded IRQ modality being fully dynamic currently, then I would
> need to update the irq_ctx->flags on each vfio_irq_handler call. I don't
> know if it's better?
> 
I'm not an expert on vfio, so I'll leave that to Alex Williamson to
answer, but I'm just worried that we need to special-case the forwarded
IRQ here, and if that may get lost elsewhere in the vfio code.  If the
AUTOMASKED flag covers specifically this behavior, then why don't we
simply clear/set that flag when forwarding/unforwarding the specific
IRQ?

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 5/9] KVM: KVM-VFIO: update user API to program forwarded IRQ

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 10:49:08AM +0200, Eric Auger wrote:
> On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:44PM +0200, Eric Auger wrote:

[...]

> >> +
> >> +It is up to the caller of this API to make sure the IRQ is not
> >> +outstanding when the FORWARD/UNFORWARD is called. This could lead to
> > 
> > outstanding? can you be specific?
> active? and I should add *physical* IRQ
> > 
> > don't refer to FOWARD/UNFORWARD, either refer to these attributes by
> > their full name or use a clear reference in proper English.
> ok
> > 
> >> +some inconsistency on who is going to complete the IRQ.
> > 
> > This sounds like the whole thing is fragile and if userspace doesn't do
> > things right, IRQ handling of a piece of hardware is going to be
> > inconsistent?  Is this the case?  If so, we need some stronger
> > semantics.  If not, this should be rephrased.
> Actually the KVM-VFIO device rejects any attempt to change the
> forwarding mode if the physical IRQ is active. So I hope this is robust
> and will change the explanation.
> 
ok, so what is the proposed method if the IRQ is indeed active, should
user space loop around and try or can user space make sure somehow?  If
user space should simply retry a number of times, we should probably
return a proper error code for this case, e.g. -EINTR?

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Antonios Motakis

On Thu, Sep 11, 2014 at 10:44 AM, Eric Auger  wrote:
>
> On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
> >> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
> >> need to disable the IRQ anymore. In that mode, when the handler completes
> >
> > add a comma after completes
> Hi Christoffer,
> ok
> >
> >> the IRQ is not deactivated but only its priority is lowered.
> >>
> >> Some other actor (typically a guest) is supposed to deactivate the IRQ,
> >> allowing at that time a new physical IRQ to hit.
> >>
> >> In virtualization use case, the physical IRQ is automatically completed
> >> by the interrupt controller when the guest completes the corresponding
> >> virtual IRQ.
> >>
> >> Signed-off-by: Eric Auger 
> >> ---
> >>  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
> >>  1 file changed, 6 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> >> b/drivers/vfio/platform/vfio_platform_irq.c
> >> index 6768508..1f851b2 100644
> >> --- a/drivers/vfio/platform/vfio_platform_irq.c
> >> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> >> @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
> >> *dev_id)
> >>  struct vfio_platform_irq *irq_ctx = dev_id;
> >>  unsigned long flags;
> >>  int ret = IRQ_NONE;
> >> +struct irq_data *d;
> >> +bool is_forwarded;
> >>
> >>  spin_lock_irqsave(&irq_ctx->lock, flags);
> >>
> >>  if (!irq_ctx->masked) {
> >>  ret = IRQ_HANDLED;
> >> +d = irq_get_irq_data(irq_ctx->hwirq);
> >> +is_forwarded = irqd_irq_forwarded(d);
> >>
> >> -if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED) {
> >> +if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED &&
> >> +!is_forwarded) {
> >>  disable_irq_nosync(irq_ctx->hwirq);
> >>  irq_ctx->masked = true;
> >>  }
> >> --
> >> 1.9.1
> >>
> > It makes sense that this all needs to be controlled in the kernel, but
> > I'm wondering if it would be cleaner / more correct to clear the
> > AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
> > this flag as long as the irq is forwarded?
>
> If I am not wrong, even if the user sets AUTOMASKED, this info never is
> exploited by the vfio platform driver. AUTOMASKED only is set internally
> to the driver, on init, for level sensitive IRQs.
>
> It seems to be the same on PCI (for INTx). I do not see anywhere the
> user flag curectly copied into a local storage. But I prefer to be
> careful ;-)
>
> If confirmed, although the flag value is exposed in the user API, the
> user set value never is exploited so this removes the need to check.
>

Yeah, the way the API is right now the AUTOMASKED flag is only to be
communicated by the kernel to the user, never the other way around.

IMHO there shouldn't be a need to change that. The flag is there just
to inform the user of the kernel behavior for non-forwarded IRQs (and
it's still true if the user unforwards the IRQ later). The user
decides the mode of operation, but it might still be a bit of
information he wants to know.

> the forwarded IRQ modality being fully dynamic currently, then I would
> need to update the irq_ctx->flags on each vfio_irq_handler call. I don't
> know if it's better?
>
> Best Regards
>
> Eric
>
>
> >
> > -Christoffer
> >
>



-- 
Antonios Motakis
Virtual Open Systems
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
> On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:

[...]

> > >  
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> > > +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> > 
> > what's the 'p' in pfwd?
> 
> p is for pointer?
> 

shouldn't the type declaration spell out quite clearly to me that I'm
dealing with a pointer?

[...]

> > 
> > > need some spacing here, also, I would turn this around, first check if
> > > the strcmp fails, and then error out, then do your next check etc., to
> > avoid so many nested statements.
> > 
> > > + /* is a ref to this device already owned by the KVM-VFIO device? */
> > 
> > this comment is not particularly helpful in its current form, it would
> > be helpful if you specified that we're checking whether that particular
> > device/irq combo is already registered.
> > 
> > > + *kvm_vdev = kvm_vfio_find_device(kv, vdev);
> > > + if (*kvm_vdev) {
> > > + if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq->index)) {
> > > + kvm_err("%s irq %d already forwarded\n",
> > > + __func__, *hwirq);
> 
> Why didn't we do this first?
> 
huh?

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 0/4] Make kvm_device_ops registration dynamic

2014-09-11 Thread Will Deacon

Hi Paolo,

On Tue, Sep 02, 2014 at 10:27:32AM +0100, Will Deacon wrote:
> Hi all,
> 
> This is version 3 of the patches originally posted here:
> 
>   v1: http://www.spinics.net/lists/kvm-arm/msg10219.html
>   v2: http://www.spinics.net/lists/kvm/msg105197.html
> 
> Changes since v2 include:
> 
>   - Rebased onto 3.17-rc* (the vgic code changed a lot!)
>   - Added relevant acks
> 
> The mpic, flic and xics are still not ported over, as I don't want to
> risk breaking those devices (it's not clear at which point they need to
> be registered).

Any further comments on this lot?

Will
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH v6 15/20] vfio/platform: support for maskable and automasked interrupts

2014-09-11 Thread Antonios Motakis

On Wed, Sep 10, 2014 at 12:13 PM, Christoffer Dall
 wrote:
> On Tue, Sep 02, 2014 at 06:06:17PM +0200, Antonios Motakis wrote:
>> On Sun, Jun 8, 2014 at 12:17 PM, Christoffer Dall
>>  wrote:
>> > On Thu, Jun 05, 2014 at 07:03:23PM +0200, Antonios Motakis wrote:
>> >> Adds support to mask interrupts, and also for automasked interrupts.
>> >> Level sensitive interrupts are exposed as automasked interrupts and
>> >> are masked and disabled automatically when they fire.
>> >>
>> >> Signed-off-by: Antonios Motakis 
>> >> ---
>> >>  drivers/vfio/platform/vfio_platform_irq.c | 112 
>> >> --
>> >>  drivers/vfio/platform/vfio_platform_private.h |   2 +
>> >>  2 files changed, 109 insertions(+), 5 deletions(-)
>> >>
>> >> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
>> >> b/drivers/vfio/platform/vfio_platform_irq.c
>> >> index d79f5af..10dfbf0 100644
>> >> --- a/drivers/vfio/platform/vfio_platform_irq.c
>> >> +++ b/drivers/vfio/platform/vfio_platform_irq.c
>> >> @@ -51,9 +51,17 @@ int vfio_platform_irq_init(struct vfio_platform_device 
>> >> *vdev)
>> >>   if (hwirq < 0)
>> >>   goto err;
>> >>
>> >> - vdev->irq[i].flags = VFIO_IRQ_INFO_EVENTFD;
>> >> + spin_lock_init(&vdev->irq[i].lock);
>> >> +
>> >> + vdev->irq[i].flags = VFIO_IRQ_INFO_EVENTFD
>> >> + | VFIO_IRQ_INFO_MASKABLE;
>> >> +
>> >> + if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
>> >> + vdev->irq[i].flags |= VFIO_IRQ_INFO_AUTOMASKED;
>> >
>> > This seems to rely on the fact that you had actually loaded a driver for
>> > your device to set the right type.  Is this assumption always correct?
>> >
>> > It seems to me that this configuration bit should now be up to your user
>> > space driver, which is the best candidate to know details about your device
>> > at this point?
>> >
>>
>> Hm, I see this type being set usually either in a device tree source,
>> or in the support code for a specific platform. Are there any
>> situations where this is actually set by the driver? If I understand
>> right this is not the case for the PL330, but if it is possible that
>> it is the case for another device then I need to rethink this. Though
>> as far as I understand this should not be the case.
>>
>
> Wow, it has been an incredibly long time since I looked at this code, so
> I'm not sure if I remember my original reasoning anymore, however,
>
> while device properties are set in the DT, they would only be available
> to this code if you actually loaded a device driver for that device,
> right?  I'm just not sure that assumption always holds for devices used
> by VFIO, but I'm really not sure anymore.  Maybe I'm rambling.

The device I'm testing with, the PL330 DMAC, is one of the devices
that exposes level sensitive interrupts, and therefore for it to
properly work VFIO needs to be able to expose it as automasked.

I just tested the code on a kernel that doesn't include the native
PL330 DMA driver. It seems that even so, the unmasked property is
properly detected and exposed by VFIO. So for this scenario at least
the assumptions are true...

I'm afraid I have to admit that if there are any edge cases where this
might not be true, I don't know which they are :(

>
>> >> +
>> >>   vdev->irq[i].count = 1;
>> >>   vdev->irq[i].hwirq = hwirq;
>> >> + vdev->irq[i].masked = false;
>> >>   }
>> >>
>> >>   vdev->num_irqs = cnt;
>> >> @@ -77,11 +85,27 @@ void vfio_platform_irq_cleanup(struct 
>> >> vfio_platform_device *vdev)
>> >>
>> >>  static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
>> >>  {
>> >> - struct eventfd_ctx *trigger = dev_id;
>> >> + struct vfio_platform_irq *irq_ctx = dev_id;
>> >> + unsigned long flags;
>> >> + int ret = IRQ_NONE;
>> >> +
>> >> + spin_lock_irqsave(&irq_ctx->lock, flags);
>> >> +
>> >> + if (!irq_ctx->masked) {
>> >> + ret = IRQ_HANDLED;
>> >> +
>> >> + if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED) {
>> >> + disable_irq_nosync(irq_ctx->hwirq);
>> >> + irq_ctx->masked = true;
>> >> + }
>> >> + }
>> >>
>> >> - eventfd_signal(trigger, 1);
>> >> + spin_unlock_irqrestore(&irq_ctx->lock, flags);
>> >>
>> >> - return IRQ_HANDLED;
>> >> + if (ret == IRQ_HANDLED)
>> >> + eventfd_signal(irq_ctx->trigger, 1);
>> >> +
>> >> + return ret;
>> >>  }
>> >>
>> >>  static int vfio_set_trigger(struct vfio_platform_device *vdev,
>> >> @@ -162,6 +186,82 @@ static int vfio_platform_set_irq_trigger(struct 
>> >> vfio_platform_device *vdev,
>> >>   return -EFAULT;
>> >>  }
>> >>
>> >> +static int vfio_platform_set_irq_unmask(struct vfio_platform_device 
>> >> *vdev,
>> >> + unsigned index, unsigned start,
>> >> + unsigned count, uint32_t flags,

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 02:04:39PM +0200, Eric Auger wrote:
> On 09/11/2014 07:05 AM, Alex Williamson wrote:
> > On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> >> On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> >>> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
> >>>
> >>> This is a new control channel which enables KVM to cooperate with
> >>> viable VFIO devices.
> >>>
> >>> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
> >>> in addition to a list of groups (kvm_vfio_group). The new
> >>> infrastructure enables to check the validity of the VFIO device
> >>> file descriptor, get and hold a reference to it.
> >>>
> >>> The first concrete implemented command is IRQ forward control:
> >>> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
> >>>
> >>> It consists in programing the VFIO driver and KVM in a consistent manner
> >>> so that an optimized IRQ injection/completion is set up. Each
> >>> kvm_vfio_device holds a list of forwarded IRQ. When putting a
> >>> kvm_vfio_device, the implementation makes sure the forwarded IRQs
> >>> are set again in the normal handling state (non forwarded).
> >>
> >> 'putting a kvm_vfio_device' sounds to like you're golf'ing :)
> >>
> >> When a kvm_vfio_device is released?
> >>
> >>>
> >>> The forwarding programmming is architecture specific, embodied by the
> >>> kvm_arch_set_fwd_state function. Its implementation is given in a
> >>> separate patch file.
> >>
> >> I would drop the last sentence and instead indicate that this is handled
> >> properly when the architecture does not support such a feature.
> >>
> >>>
> >>> The forwarding control modality is enabled by the
> >>> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
> >>>
> >>> Signed-off-by: Eric Auger 
> >>>
> >>> ---
> >>>
> >>> v1 -> v2:
> >>> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> >>> - original patch file separated into 2 parts: generic part moved in vfio.c
> >>>   and ARM specific part(kvm_arch_set_fwd_state)
> >>> ---
> >>>  include/linux/kvm_host.h |  27 +++
> >>>  virt/kvm/vfio.c  | 452 
> >>> ++-
> >>>  2 files changed, 477 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >>> index a4c33b3..24350dc 100644
> >>> --- a/include/linux/kvm_host.h
> >>> +++ b/include/linux/kvm_host.h
> >>> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
> >>> unsigned long arg);
> >>>  };
> >>>  
> >>> +enum kvm_fwd_irq_action {
> >>> + KVM_VFIO_IRQ_SET_FORWARD,
> >>> + KVM_VFIO_IRQ_SET_NORMAL,
> >>> + KVM_VFIO_IRQ_CLEANUP,
> >>
> >> This is KVM internal API, so it would probably be good to document this.
> >> Especially the CLEANUP bit worries me, see below.
> > 
> > This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
> Hi Alex,
> 
> will change that.
> > Extra states worry me too.
> 
> I tried to explained the 2 motivations behind. Please let me know if it
> makes sense.
> > 
> >>> +};
> >>> +
> >>> +/* internal structure describing a forwarded IRQ */
> >>> +struct kvm_fwd_irq {
> >>> + struct list_head link;
> >>
> >> this list entry is local to the kvm vfio device, right? that means you
> >> probably want a struct with just the below fields, and then have a
> >> containing struct in the generic device file, private to it's logic.
> > 
> > Yes, this is part of the abstraction problem.
> OK will fix that.
> > 
> >>> + __u32 index; /* platform device irq index */
> > 
> > This is a vfio_device irq_index, but vfio_devices support indexes and
> > sub-indexes.  At this level the API should match vfio, not the specifics
> > of platform devices not supporting sub-index.
> I will add sub-indexes then.
> > 
> >>> + __u32 hwirq; /*physical IRQ */
> >>> + __u32 gsi; /* virtual IRQ */
> >>> + struct kvm_vcpu *vcpu; /* vcpu to inject into*/
> > 
> > Not sure I understand why vcpu is necessary.
> vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
> virtual GIC. I can remove it from and add a vcpu struct * param to
> kvm_arch_set_fwd_state if you prefer.
> 
>   Also I see a 'get' in the code below, but not a 'put'.
> Sorry I do not understand your comment here? What 'get' do you mention?

he means kvm_get_vcpu(), but you are ok on that one, the kvm naming of
this function is unfortunate, because it doesn't increment any refcounts
but just resolves to an entry in the array.

> > 
> >>> +};
> >>> +
> >>>  void kvm_device_get(struct kvm_device *dev);
> >>>  void kvm_device_put(struct kvm_device *dev);
> >>>  struct kvm_device *kvm_device_from_filp(struct file *filp);
> >>> @@ -1075,6 +1090,18 @@ extern struct kvm_device_ops kvm_vfio_ops;
> >>>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> >>>  extern struct kvm_device_ops kvm_flic_ops;
> >>>  
> >>> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> >>> +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> >>
> >> what's the

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 09:59:24AM -0600, Alex Williamson wrote:
> On Thu, 2014-09-11 at 14:04 +0200, Eric Auger wrote:
> > On 09/11/2014 07:05 AM, Alex Williamson wrote:
> > > On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> > >> On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> > >>> This patch introduces a new KVM_DEV_VFIO_DEVICE attribute.
> > >>>
> > >>> This is a new control channel which enables KVM to cooperate with
> > >>> viable VFIO devices.
> > >>>
> > >>> The kvm-vfio device now holds a list of devices (kvm_vfio_device)
> > >>> in addition to a list of groups (kvm_vfio_group). The new
> > >>> infrastructure enables to check the validity of the VFIO device
> > >>> file descriptor, get and hold a reference to it.
> > >>>
> > >>> The first concrete implemented command is IRQ forward control:
> > >>> KVM_DEV_VFIO_DEVICE_FORWARD_IRQ, KVM_DEV_VFIO_DEVICE_UNFORWARD_IRQ.
> > >>>
> > >>> It consists in programing the VFIO driver and KVM in a consistent manner
> > >>> so that an optimized IRQ injection/completion is set up. Each
> > >>> kvm_vfio_device holds a list of forwarded IRQ. When putting a
> > >>> kvm_vfio_device, the implementation makes sure the forwarded IRQs
> > >>> are set again in the normal handling state (non forwarded).
> > >>
> > >> 'putting a kvm_vfio_device' sounds to like you're golf'ing :)
> > >>
> > >> When a kvm_vfio_device is released?
> > >>
> > >>>
> > >>> The forwarding programmming is architecture specific, embodied by the
> > >>> kvm_arch_set_fwd_state function. Its implementation is given in a
> > >>> separate patch file.
> > >>
> > >> I would drop the last sentence and instead indicate that this is handled
> > >> properly when the architecture does not support such a feature.
> > >>
> > >>>
> > >>> The forwarding control modality is enabled by the
> > >>> __KVM_HAVE_ARCH_KVM_VFIO_FORWARD define.
> > >>>
> > >>> Signed-off-by: Eric Auger 
> > >>>
> > >>> ---
> > >>>
> > >>> v1 -> v2:
> > >>> - __KVM_HAVE_ARCH_KVM_VFIO renamed into __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> > >>> - original patch file separated into 2 parts: generic part moved in 
> > >>> vfio.c
> > >>>   and ARM specific part(kvm_arch_set_fwd_state)
> > >>> ---
> > >>>  include/linux/kvm_host.h |  27 +++
> > >>>  virt/kvm/vfio.c  | 452 
> > >>> ++-
> > >>>  2 files changed, 477 insertions(+), 2 deletions(-)
> > >>>
> > >>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > >>> index a4c33b3..24350dc 100644
> > >>> --- a/include/linux/kvm_host.h
> > >>> +++ b/include/linux/kvm_host.h
> > >>> @@ -1065,6 +1065,21 @@ struct kvm_device_ops {
> > >>>   unsigned long arg);
> > >>>  };
> > >>>  
> > >>> +enum kvm_fwd_irq_action {
> > >>> +   KVM_VFIO_IRQ_SET_FORWARD,
> > >>> +   KVM_VFIO_IRQ_SET_NORMAL,
> > >>> +   KVM_VFIO_IRQ_CLEANUP,
> > >>
> > >> This is KVM internal API, so it would probably be good to document this.
> > >> Especially the CLEANUP bit worries me, see below.
> > > 
> > > This also doesn't match the user API, which is simply FORWARD/UNFORWARD.
> > Hi Alex,
> > 
> > will change that.
> > > Extra states worry me too.
> > 
> > I tried to explained the 2 motivations behind. Please let me know if it
> > makes sense.
> 
> Not really.  It seems like it's just a leak of arch specific handling
> out into common code.
> 
> > >>> +};
> > >>> +
> > >>> +/* internal structure describing a forwarded IRQ */
> > >>> +struct kvm_fwd_irq {
> > >>> +   struct list_head link;
> > >>
> > >> this list entry is local to the kvm vfio device, right? that means you
> > >> probably want a struct with just the below fields, and then have a
> > >> containing struct in the generic device file, private to it's logic.
> > > 
> > > Yes, this is part of the abstraction problem.
> > OK will fix that.
> > > 
> > >>> +   __u32 index; /* platform device irq index */
> > > 
> > > This is a vfio_device irq_index, but vfio_devices support indexes and
> > > sub-indexes.  At this level the API should match vfio, not the specifics
> > > of platform devices not supporting sub-index.
> > I will add sub-indexes then.
> > > 
> > >>> +   __u32 hwirq; /*physical IRQ */
> > >>> +   __u32 gsi; /* virtual IRQ */
> > >>> +   struct kvm_vcpu *vcpu; /* vcpu to inject into*/
> > > 
> > > Not sure I understand why vcpu is necessary.
> > vcpu is used when providing the physical IRQ/virtual IRQ mapping to the
> > virtual GIC. I can remove it from and add a vcpu struct * param to
> > kvm_arch_set_fwd_state if you prefer.
> 
> The kvm-vfio API for this interface doesn't allow the user to indicate
> which vcpu to inject to.  On x86, it would be the programming of the
> interrupt controller that would decide that.  In the code here we
> arbitrarily pick vcpu0.  It feels both architecture specific and a bit
> unspecified.
> 
> > 
> >   Also I see a 'get' in the code below, but not a 'put'.
> > Sorry I do not understand your

Re: [RFC v2 2/9] KVM: ARM: VGIC: add forwarded irq rbtree lock

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:41PM +0200, Eric Auger wrote:
>> add a lock related to the rb tree manipulation. The rb tree can be
> 
> Ok, I can't hold myself back any longer. Please begin sentences with a
> capital letter. You don't do this in French? :)
Hi Christoffer,


Yep, that's understood ;-) We definitely do. I am just discovering that
it is common in commits and comments too ;-)
> 
>> searched in one thread (irqfd handler for instance) and map/unmap
>> happen in another.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  include/kvm/arm_vgic.h |  1 +
>>  virt/kvm/arm/vgic.c| 46 +-
>>  2 files changed, 38 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
>> index 743020f..3da244f 100644
>> --- a/include/kvm/arm_vgic.h
>> +++ b/include/kvm/arm_vgic.h
>> @@ -177,6 +177,7 @@ struct vgic_dist {
>>  unsigned long   irq_pending_on_cpu;
>>  
>>  struct rb_root  irq_phys_map;
>> +spinlock_t  rb_tree_lock;
>>  #endif
>>  };
>>  
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index 8ef495b..dbc2a5a 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1630,9 +1630,15 @@ static struct rb_root *vgic_get_irq_phys_map(struct 
>> kvm_vcpu *vcpu,
>>  
>>  int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
>>  {
>> -struct rb_root *root = vgic_get_irq_phys_map(vcpu, virt_irq);
>> -struct rb_node **new = &root->rb_node, *parent = NULL;
>> +struct rb_root *root;
>> +struct rb_node **new, *parent = NULL;
>>  struct irq_phys_map *new_map;
>> +struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +
>> +spin_lock(&dist->rb_tree_lock);
>> +
>> +root = vgic_get_irq_phys_map(vcpu, virt_irq);
>> +new = &root->rb_node;
>>  
>>  /* Boilerplate rb_tree code */
>>  while (*new) {
>> @@ -1644,13 +1650,17 @@ int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int 
>> virt_irq, int phys_irq)
>>  new = &(*new)->rb_left;
>>  else if (this->virt_irq > virt_irq)
>>  new = &(*new)->rb_right;
>> -else
>> +else {
>> +spin_unlock(&dist->rb_tree_lock);
>>  return -EEXIST;
>> +}
> 
> can you initialize a ret variable to -EEXIST in the beginning of this
> function, and add an out label above the unlock below, replace this
> multi-line statement with a goto out, and set ret = 0 after the while
> loop?
sure
> 
>>  }
>>  
>>  new_map = kzalloc(sizeof(*new_map), GFP_KERNEL);
>> -if (!new_map)
>> +if (!new_map) {
>> +spin_unlock(&dist->rb_tree_lock);
>>  return -ENOMEM;
> 
> then this becomes ret = -ENOMEM; goto out;
OK
> 
>> +}
>>  
>>  new_map->virt_irq = virt_irq;
>>  new_map->phys_irq = phys_irq;
>> @@ -1658,6 +1668,8 @@ int vgic_map_phys_irq(struct kvm_vcpu *vcpu, int 
>> virt_irq, int phys_irq)
>>  rb_link_node(&new_map->node, parent, new);
>>  rb_insert_color(&new_map->node, root);
>>  
>> +spin_unlock(&dist->rb_tree_lock);
>> +
> 
> aren't you allocating memory with GFP_KERNEL while holding a spinlock
> here?
Oops. Thanks for noticing. I will move the lock.
> 
>>  return 0;
>>  }
>>  
>> @@ -1685,24 +1697,39 @@ static struct irq_phys_map 
>> *vgic_irq_map_search(struct kvm_vcpu *vcpu,
>>  
>>  int vgic_get_phys_irq(struct kvm_vcpu *vcpu, int virt_irq)
>>  {
>> -struct irq_phys_map *map = vgic_irq_map_search(vcpu, virt_irq);
>> +struct irq_phys_map *map;
>> +struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +int ret;
>> +
>> +spin_lock(&dist->rb_tree_lock);
>> +map = vgic_irq_map_search(vcpu, virt_irq);
>>  
>>  if (map)
>> -return map->phys_irq;
>> +ret = map->phys_irq;
>> +else
>> +ret =  -ENOENT;
> 
> initialize ret to -ENOENT and avoid the else statement.
ok
> 
>> +
>> +spin_unlock(&dist->rb_tree_lock);
>> +return ret;
>>  
>> -return -ENOENT;
>>  }
>>  
>>  int vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
>>  {
>> -struct irq_phys_map *map = vgic_irq_map_search(vcpu, virt_irq);
>> +struct irq_phys_map *map;
>> +struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +
>> +spin_lock(&dist->rb_tree_lock);
>> +
>> +map = vgic_irq_map_search(vcpu, virt_irq);
>>  
>>  if (map && map->phys_irq == phys_irq) {
>>  rb_erase(&map->node, vgic_get_irq_phys_map(vcpu, virt_irq));
>>  kfree(map);
>> +spin_unlock(&dist->rb_tree_lock);
> 
> can kfree sleep?  I don't remember.  In any case, you can unlock before
> calling kfree.
no it can't but I will move anyway.
> 
>>  return 0;
>>  }
>> -
>> +spin_unlock(&dist->rb_tree_lock);
>>  return -ENOENT;
> 
> an out label and single unlock location would be pre

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 11:35:56AM +0200, Eric Auger wrote:
> On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:

[...]

> >> +  if (!pfwd)
> >> +  return -ENOMEM;
> >> +  pfwd->index = fwd_irq->index;
> >> +  pfwd->gsi = fwd_irq->gsi;
> >> +  pfwd->hwirq = hwirq;
> >> +  pfwd->vcpu = kvm_get_vcpu(kdev->kvm, 0);
> >> +  ret = kvm_arch_set_fwd_state(pfwd, KVM_VFIO_IRQ_SET_FORWARD);
> >> +  if (ret < 0) {
> >> +  kvm_arch_set_fwd_state(pfwd, KVM_VFIO_IRQ_CLEANUP);
> > 
> > this whole thing feels incredibly broken to me.  Setting a forward
> > should either work or not work, not something in between that leaves
> > something to be cleaned up.  Why this two-stage thingy here?
> I wanted to exploit the return value of vgic_map_phys_irq which is
> likely to fail if the phys/virt mapping exists at VGIC level.

then just have kvm_arch_set_fwd_state return -EEXIST; it is
the responsibility of that function itself to clean up after whatever it
was doing, not to rely on its caller to call a cleanup function.

> 
> I already validated the injection from a KVM_VFIO_DEVICE point of view
> (the device/irq is not known internally). But what if another external
> component - which does not exist yet - maps the IRQ at VGIC level? Maybe
> I need to replace the existing validation check by querying the VGIC at
> low level instead of checking KVM-VFIO local variables.

No need to over-complicate this, in this case, the
kvm_arch_set_fwd_state() will simply fail (gracefully), as I said above,
and you just return to the user, "sorry, couldn't do what you asked me
because of this error code".


[...]

> >> + *
> >> + * When this function is called, the vcpu already are destroyed. No
> > the VCPUs are already destroyed.
> >> + * vgic manipulation can happen hence the KVM_VFIO_IRQ_CLEANUP
> >> + * kvm_arch_set_fwd_state action
> > 
> > this last bit didn't make any sense to me.  Also, why are we referring
> > to the vgic in generic code?
> doesn't make sense anymore indeed. I wanted to emphasize the fact that
> VGIC KVM device is destroyed before the KVM VFIO device and this
> explains why I need a special CLEANUP cmd (besides the fact I need to
> call chip->irq_eoi(d) for the forwarded IRQs);
> 

I don't think it explains why you need a special CLEANUP cmd.  When the
vgic is going away it must clean up its state.  When the kvm vfio device
goes away, it must unforward any forwarded IRQs, and the architecture
specific implementation MUST correctly unforward such IRQs - as a single
operation!

Hope this helps.
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

hardware problem + travelling

2014-09-11 Thread Paolo Bonzini

An OOPS has just bricked my main machine (or so it seems---firmware
doesn't show up at all, not even with disks removed), so I'll likely
spend part of tomorrow reviving an older one which I've luckily kept.

Plus, I'll be travelling most of next week.

For this reason I might be slow answering the mailing list and, for the
rest of the 3.18 series, I might be applying fewer x86 patches due to
limited testing ability.

Thanks, and sorry for the inconvenience.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 4/9] VFIO: platform: handler tests whether the IRQ is forwarded

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 19:05 +0200, Christoffer Dall wrote:
> On Thu, Sep 11, 2014 at 10:44:02AM +0200, Eric Auger wrote:
> > On 09/11/2014 05:10 AM, Christoffer Dall wrote:
> > > On Mon, Sep 01, 2014 at 02:52:43PM +0200, Eric Auger wrote:
> > >> In case the IRQ is forwarded, the VFIO platform IRQ handler does not
> > >> need to disable the IRQ anymore. In that mode, when the handler completes
> > > 
> > > add a comma after completes
> > Hi Christoffer,
> > ok
> > > 
> > >> the IRQ is not deactivated but only its priority is lowered.
> > >>
> > >> Some other actor (typically a guest) is supposed to deactivate the IRQ,
> > >> allowing at that time a new physical IRQ to hit.
> > >>
> > >> In virtualization use case, the physical IRQ is automatically completed
> > >> by the interrupt controller when the guest completes the corresponding
> > >> virtual IRQ.
> > >>
> > >> Signed-off-by: Eric Auger 
> > >> ---
> > >>  drivers/vfio/platform/vfio_platform_irq.c | 7 ++-
> > >>  1 file changed, 6 insertions(+), 1 deletion(-)
> > >>
> > >> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> > >> b/drivers/vfio/platform/vfio_platform_irq.c
> > >> index 6768508..1f851b2 100644
> > >> --- a/drivers/vfio/platform/vfio_platform_irq.c
> > >> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> > >> @@ -88,13 +88,18 @@ static irqreturn_t vfio_irq_handler(int irq, void 
> > >> *dev_id)
> > >>  struct vfio_platform_irq *irq_ctx = dev_id;
> > >>  unsigned long flags;
> > >>  int ret = IRQ_NONE;
> > >> +struct irq_data *d;
> > >> +bool is_forwarded;
> > >>  
> > >>  spin_lock_irqsave(&irq_ctx->lock, flags);
> > >>  
> > >>  if (!irq_ctx->masked) {
> > >>  ret = IRQ_HANDLED;
> > >> +d = irq_get_irq_data(irq_ctx->hwirq);
> > >> +is_forwarded = irqd_irq_forwarded(d);
> > >>  
> > >> -if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED) {
> > >> +if (irq_ctx->flags & VFIO_IRQ_INFO_AUTOMASKED &&
> > >> +!is_forwarded) {
> > >>  disable_irq_nosync(irq_ctx->hwirq);
> > >>  irq_ctx->masked = true;
> > >>  }
> > >> -- 
> > >> 1.9.1
> > >>
> > > It makes sense that these need to be all controlled in the kernel, but
> > > I'm wondering if it would be cleaner / more correct to clear the
> > > AUTOMASKED flag when the IRQ is forwarded and have vfio refuse setting
> > > this flag as long as the irq is forwarded?
> > 
> > If I am not wrong, even if the user sets AUTOMASKED, this info never is
> > exploited by the vfio platform driver. AUTOMASKED only is set internally
> > to the driver, on init, for level sensitive IRQs.
> > 
> > It seems to be the same on PCI (for INTx). I do not see anywhere the
> > user flag curectly copied into a local storage. But I prefer to be
> > careful ;-)
> > 
> > If confirmed, although the flag value is exposed in the user API, the
> > user set value never is exploited so this removes the need to check.
> > 
> > the forwarded IRQ modality being fully dynamic currently, then I would
> > need to update the irq_ctx->flags on each vfio_irq_handler call. I don't
> > know if its better?
> > 
> I'm not an expert on vfio, so I'll leave that to Alex Williamson to
> answer, but I'm just worried that we need to special-case the forwarded
> IRQ here, and if that may get lost elsewhere in the vfio code.  If the
> AUTOMASKED flag covers specifically this behavior, then why don't we
> simply clear/set that flag when forwarding/unforwarding the specific
> IRQ?

The way that VFIO_IRQ_INFO_AUTOMASKED is being used here is unique to
the platform device vfio backend.  In the rest of VFIO,
VFIO_IRQ_INFO_AUTOMASKED is simply a flag bit exposed via
VFIO_DEVICE_GET_IRQ_INFO.  The flags field of struct vfio_irq_info is
output-only.  vfio-pci knows by the IRQ index whether it is edge or
level.  I do agree though that changing the flag bit, or better yet a
bool, rather than adding extra tests that need to be handled as each
usage seems less error prone.

Things could get confusing for userspace though if suddenly
VFIO_DEVICE_GET_IRQ_INFO starts calling the index edge triggered once
forwarding mode is enabled.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Alex Williamson

On Thu, 2014-09-11 at 19:10 +0200, Christoffer Dall wrote:
> On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
> > On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> > > On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> 
> [...]
> 
> > > >  
> > > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> > > > +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> > > 
> > > what's the 'p' in pfwd?
> > 
> > p is for pointer?
> > 
> 
> shouldn't the type declaration spell out quite clearly to me that I'm
> dealing with a pointer?

Sure.  In the cases where I've done similar things it's more a matter of
not needing to come up with another variable, for instance if I need
both a struct and a struct* I might call them foo and pfoo if I can't
come up with anything more meaningful.


> [...]
> 
> > > 
> > > need some spacing here, also, I would turn this around, first check if
> > > the strcmp fails, and then error out, then do your next check etc., to
> > > avoid so many nested statements.
> > > 
> > > > +   /* is a ref to this device already owned by the KVM-VFIO 
> > > > device? */
> > > 
> > > this comment is not particularly helpful in its current form, it would
> > > be helpful if you specified that we're checking whether that particular
> > > device/irq combo is already registered.
> > > 
> > > > +   *kvm_vdev = kvm_vfio_find_device(kv, vdev);
> > > > +   if (*kvm_vdev) {
> > > > +   if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq->index)) {
> > > > +   kvm_err("%s irq %d already forwarded\n",
> > > > +   __func__, *hwirq);
> > 
> > Why didn't we do this first?
> > 
> huh?

The code is doing:

1. can the arch forward this irq
2. are we already forwarding this irq

It's backwards, test for duplicates locally before calling out into arch
code.  Besides, I think the arch code here should go away and just be
another error condition for the call-out.  Thanks,

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 1/9] KVM: ARM: VGIC: fix multiple injection of level sensitive forwarded IRQ

2014-09-11 Thread Eric Auger

On 09/11/2014 05:09 AM, Christoffer Dall wrote:
> On Mon, Sep 01, 2014 at 02:52:40PM +0200, Eric Auger wrote:
>> Fix multiple injection of level sensitive forwarded IRQs.
>> With current code, the second injection fails since the state bitmaps
>> are not reset (process_maintenance is not called anymore).
>> New implementation consists in fully bypassing the vgic state
>> management for forwarded IRQ (checks are ignored in
>> vgic_update_irq_pending). This obviously assumes the forwarded IRQ is
>> injected from kernel side.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> It was attempted to reset the states in __kvm_vgic_sync_hwstate, checking
>> the emptied LR of forwarded IRQ. However surprisingly this solution does
>> not seem to work. Some times, a new forwarded IRQ injection is observed
>> while the LR of the previous instance was not observed as empty.
> 
> hmmm, concerning.  It would probably have been helpful overall if you
> could start by describing the problem with the current implementation in
> the commit message, and then explain the fix...
> 
>>
>> v1 -> v2:
>> - fix vgic state bypass in vgic_queue_hwirq
>> ---
>>  virt/kvm/arm/vgic.c | 13 ++---
>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index 0007300..8ef495b 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -1259,7 +1259,9 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int 
>> irq)
>>  
>>  static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
>>  {
>> -if (vgic_irq_is_queued(vcpu, irq))
>> +bool is_forwarded =  (vgic_get_phys_irq(vcpu, irq) > 0);
> 
> can you create a static function to factor this vgic_get_phys_irq check out, 
> please?
yes sure
> 
>> +
>> +if (vgic_irq_is_queued(vcpu, irq) && !is_forwarded)
>>  return true; /* level interrupt, already queued */
> 
> so essentially if an IRQ is already on a LR so we shouldn't resample the
> line, then we still resample the line if the IRQ is forwarded?
> 
> I think you need to explain this, both to me here, and also in the code
> by moving the comment following the return statement above the check and
> comment this clearly.
Well, I admit it may look a bit pushy! When we discussed this issue with
Marc, the outcome was that the vgic states were not accurate with
forwarded IRQs and that the VGIC state may be fully bypassed. Since the
first injection still sets the state - and I did not want to modify this -
the 2nd one would fail due to that check and to validate_injection. It may
be cleaner not to update the states when injecting the fwd irq either.

> 
>>  
>>  if (vgic_queue_irq(vcpu, 0, irq)) {
>> @@ -1517,14 +1519,18 @@ static bool vgic_update_irq_pending(struct kvm *kvm, 
>> int cpuid,
>>  int edge_triggered, level_triggered;
>>  int enabled;
>>  bool ret = true;
>> +bool is_forwarded;
>>  
>>  spin_lock(&dist->lock);
>>  
>>  vcpu = kvm_get_vcpu(kvm, cpuid);
>> +is_forwarded = (vgic_get_phys_irq(vcpu, irq_num) > 0);
> 
> use your new function here as well.
ok
> 
>> +
>>  edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
>>  level_triggered = !edge_triggered;
>>  
>> -if (!vgic_validate_injection(vcpu, irq_num, level)) {
>> +if (!is_forwarded &&
>> +!vgic_validate_injection(vcpu, irq_num, level)) {
> 
> I don't see the rationale here either.  If an IRQ is forwarded, why do
> you need to do anything if the condition of the line hasn't changed for
> a level-triggered IRQ or if you have a falling edge on an edge-triggered
> IRQ (assuming active-HIGH)?
To me this cannot even happen. A second fwd irq can only hit if
the same virtual IRQ was completed and completed the corresponding phys
IRQ. Still the problem is that on the 1st injection we updated the VGIC
state. I acknowledge this is a hack to work around the 1st injection
updating the state and nothing resetting it. So on subsequent injections -
and even on the 1st one - I never check the state.
> 
>>  ret = false;
>>  goto out;
>>  }
>> @@ -1557,7 +1563,8 @@ static bool vgic_update_irq_pending(struct kvm *kvm, 
>> int cpuid,
>>  goto out;
>>  }
>>  
>> -if (level_triggered && vgic_irq_is_queued(vcpu, irq_num)) {
>> +if (!is_forwarded &&
>> +level_triggered && vgic_irq_is_queued(vcpu, irq_num)) {
> 
> So here it's making sense for SPIs since you can have an EOIed interrupt
> on a CPU that didn't exit the VM yet, and thus it's still queued, but
> you still need to resample the line to respect other CPUs.  Only, we
> ever only target a single CPU for SPIs IIRC (the first in the target
> list register) so we have to wait for that CPU to exit the VM anyhow.
> 
> This leads me to believe that, given a forwarded irq, you can only have
> XXX situations at this point:
> 
> (1) is_queued && target_vcpu_in_vm:
> The vcpu should resample this line when it exits the VM, because we
> check the LRs

Re: [PATCH] KVM: PPC: Convert openpic lock to raw_spinlock

2014-09-11 Thread Scott Wood

On Thu, 2014-09-11 at 15:25 -0400, Bogdan Purcareata wrote:
> This patch enables running intensive I/O workloads, e.g. netperf, in a guest
> deployed on a RT host. No change for !RT kernels.
> 
> The openpic spinlock becomes a sleeping mutex on a RT system. This no longer
> guarantees that EPR is atomic with exception delivery. The guest VCPU thread
> fails due to a BUG_ON(preemptible()) when running netperf.
> 
> In order to make the kvmppc_mpic_set_epr() call safe on RT from non-atomic
> context, convert the openpic lock to a raw_spinlock. A similar approach can
> be seen for x86 platforms in the following commit [1].
> 
> Here are some comparative cyclictest measurements run inside a high priority RT
> guest run on a RT host. The guest has 1 VCPU and the test has been run for 15
> minutes. The guest runs ~750 hackbench processes as background stress.

Does hackbench involve triggering interrupts that would go through the
MPIC?  You may want to try an I/O-heavy benchmark to stress the MPIC
code (the more interrupt sources are active at once, the "better").
Also try a guest with many vcpus.

-Scott


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Recommended Kernel and KVM version

2014-09-11 Thread Flávio Ramalho

Hi,

I am running an OpenStack infrastructure and some compute nodes are
frequently having kernel panic, as far as I see, the kernel panics are
related with KVM.

Do you guys have any recommendation about the kernel and KVM version
to be used in
a production environment?

Thanks,

Flávio
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 8/9] KVM: KVM-VFIO: generic KVM_DEV_VFIO_DEVICE command and IRQ forwarding control

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:14:10PM -0600, Alex Williamson wrote:
> On Thu, 2014-09-11 at 19:10 +0200, Christoffer Dall wrote:
> > On Wed, Sep 10, 2014 at 11:05:49PM -0600, Alex Williamson wrote:
> > > On Thu, 2014-09-11 at 05:10 +0200, Christoffer Dall wrote:
> > > > On Mon, Sep 01, 2014 at 02:52:47PM +0200, Eric Auger wrote:
> > 
> > [...]
> > 
> > > > >  
> > > > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_FORWARD
> > > > > +int kvm_arch_set_fwd_state(struct kvm_fwd_irq *pfwd,
> > > > 
> > > > what's the 'p' in pfwd?
> > > 
> > > p is for pointer?
> > > 
> > 
> > shouldn't the type declaration spell out quite clearly to me that I'm
> > dealing with a pointer?
> 
> Sure.  In the cases where I've done similar things it's more a matter of
> not needing to come up with another variable, for instance if I need
> both a struct and a struct* I might call them foo and pfoo if I can't
> come up with anything more meaningful.
> 
> 
> > [...]
> > 
> > > > 
> > > > need some spacing here, also, I would turn this around, first check if
> > > > the strcmp fails, and then error out, then do your next check etc., to
> > > > avoid so many nested statements.
> > > > 
> > > > > + /* is a ref to this device already owned by the KVM-VFIO 
> > > > > device? */
> > > > 
> > > > this comment is not particularly helpful in its current form, it would
> > > > be helpful if you specified that we're checking whether that particular
> > > > device/irq combo is already registered.
> > > > 
> > > > > + *kvm_vdev = kvm_vfio_find_device(kv, vdev);
> > > > > + if (*kvm_vdev) {
> > > > > + if (kvm_vfio_find_irq(*kvm_vdev, fwd_irq->index)) {
> > > > > + kvm_err("%s irq %d already forwarded\n",
> > > > > + __func__, *hwirq);
> > > 
> > > Why didn't we do this first?
> > > 
> > huh?
> 
> The code is doing:
> 
> 1. can the arch forward this irq
> 2. are we already forwarding this irq
> 
> It's backwards, test for duplicates locally before calling out into arch
> code.  Besides, I think the arch code here should go away and just be
> another error condition for the call-out.  Thanks,
> 
Ah, right, you meant for the whole check.  I agree completely.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC v2 1/9] KVM: ARM: VGIC: fix multiple injection of level sensitive forwarded IRQ

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 08:17:49PM +0200, Eric Auger wrote:
> On 09/11/2014 05:09 AM, Christoffer Dall wrote:
> > On Mon, Sep 01, 2014 at 02:52:40PM +0200, Eric Auger wrote:
> >> Fix multiple injection of level sensitive forwarded IRQs.
> >> With current code, the second injection fails since the state bitmaps
> >> are not reset (process_maintenance is not called anymore).
> >> New implementation consists in fully bypassing the vgic state
> >> management for forwarded IRQ (checks are ignored in
> >> vgic_update_irq_pending). This obviously assumes the forwarded IRQ is
> >> injected from kernel side.
> >>
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >>
> >> It was attempted to reset the states in __kvm_vgic_sync_hwstate, checking
> >> the emptied LR of forwarded IRQ. However surprisingly this solution does
> >> not seem to work. Some times, a new forwarded IRQ injection is observed
> >> while the LR of the previous instance was not observed as empty.
> > 
> > hmmm, concerning.  It would probably have been helpful overall if you
> > could start by describing the problem with the current implementation in
> > the commit message, and then explain the fix...
> > 
> >>
> >> v1 -> v2:
> >> - fix vgic state bypass in vgic_queue_hwirq
> >> ---
> >>  virt/kvm/arm/vgic.c | 13 ++---
> >>  1 file changed, 10 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> >> index 0007300..8ef495b 100644
> >> --- a/virt/kvm/arm/vgic.c
> >> +++ b/virt/kvm/arm/vgic.c
> >> @@ -1259,7 +1259,9 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, 
> >> int irq)
> >>  
> >>  static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
> >>  {
> >> -  if (vgic_irq_is_queued(vcpu, irq))
> >> +  bool is_forwarded =  (vgic_get_phys_irq(vcpu, irq) > 0);
> > 
> > can you create a static function to factor this vgic_get_phys_irq check 
> > out, please?
> yes sure
> > 
> >> +
> >> +  if (vgic_irq_is_queued(vcpu, irq) && !is_forwarded)
> >>return true; /* level interrupt, already queued */
> > 
> > so essentially if an IRQ is already on a LR so we shouldn't resample the
> > line, then we still resample the line if the IRQ is forwarded?
> > 
> > I think you need to explain this, both to me here, and also in the code
> > by moving the comment following the return statement above the check and
> > comment this clearly.
> Well, I admit it may look a bit pushy! When we discussed this issue with
> Marc, the outcome was that the vgic states were not accurate with
> forwarded IRQs and VGIC state may be fully bypassed.

Can you explain this in more details?  Perhaps with a concrete example?

> Since the first
> injection still sets the state - and I did not want to modify this - the
> 2d one would fail due to that check, and the validate_injection. May be
> cleaner to not update the states when injecting the fwd irq too.

Hmmm, I don't think I understand you here.  I think you need to think
about the whole flow of things here and understand any possible sequence
of events combined with any potential state you may have.  Perhaps this
is better deferred to a face-to-face discussion.

> 
> > 
> >>  
> >>if (vgic_queue_irq(vcpu, 0, irq)) {
> >> @@ -1517,14 +1519,18 @@ static bool vgic_update_irq_pending(struct kvm 
> >> *kvm, int cpuid,
> >>int edge_triggered, level_triggered;
> >>int enabled;
> >>bool ret = true;
> >> +  bool is_forwarded;
> >>  
> >>spin_lock(&dist->lock);
> >>  
> >>vcpu = kvm_get_vcpu(kvm, cpuid);
> >> +  is_forwarded = (vgic_get_phys_irq(vcpu, irq_num) > 0);
> > 
> > use your new function here as well.
> ok
> > 
> >> +
> >>edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
> >>level_triggered = !edge_triggered;
> >>  
> >> -  if (!vgic_validate_injection(vcpu, irq_num, level)) {
> >> +  if (!is_forwarded &&
> >> +  !vgic_validate_injection(vcpu, irq_num, level)) {
> > 
> > I don't see the rationale here either.  If an IRQ is forwarded, why do
> > you need to do anything if the condition of the line hasn't changed for
> > a level-triggered IRQ or if you have a falling edge on an edge-triggered
> > IRQ (assuming active-HIGH)?
> To me this cannot even happen. A second fwd irq can only hit if
> the same virtual IRQ was completed and completed the corresponding phys
> IRQ. Still the problem is that on the 1st injection we updated the VGIC
> state. 

Updated the VGIC state?  Be more specific!

> I acknowledge this is a hack to work around the 1st injection
> update the state and nothing reset them. So on subsequent injections, -
> and even on the 1st one-  I never check the state.

Is the case here that you propagate the line state onto the vcpu pending
state when somebody calls this inject function, so you use this as a
chance to resample the line?

If so, we need to document this clearly (and you need to convince me
that this is in fact the right thing we're doing overall), and we may
have to reword and refactor some of this to not

Re: [PATCH v4 2/8] arm/arm64: KVM: vgic: switch to dynamic allocation

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:09PM +0100, Marc Zyngier wrote:
> So far, all the VGIC data structures are statically defined by the
> *maximum* number of vcpus and interrupts it supports. It means that
> we always have to oversize it to cater for the worse case.
> 
> Start by changing the data structures to be dynamically sizeable,
> and allocate them at runtime.
> 
> The sizes are still very static though.
> 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm/kvm/arm.c |   3 +
>  include/kvm/arm_vgic.h |  76 
>  virt/kvm/arm/vgic.c| 237 
> ++---
>  3 files changed, 267 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index a99e0cd..923a01d 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -172,6 +172,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>   kvm->vcpus[i] = NULL;
>   }
>   }
> +
> + kvm_vgic_destroy(kvm);
>  }
>  
>  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> @@ -253,6 +255,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
>  {
>   kvm_mmu_free_memory_caches(vcpu);
>   kvm_timer_vcpu_terminate(vcpu);
> + kvm_vgic_vcpu_destroy(vcpu);
>   kmem_cache_free(kvm_vcpu_cache, vcpu);
>  }
>  
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index f074539..bdaac57 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -54,19 +54,33 @@
>   * - a bunch of shared interrupts (SPI)
>   */
>  struct vgic_bitmap {
> - union {
> - u32 reg[VGIC_NR_PRIVATE_IRQS / 32];
> - DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS);
> - } percpu[VGIC_MAX_CPUS];
> - union {
> - u32 reg[VGIC_NR_SHARED_IRQS / 32];
> - DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS);
> - } shared;
> + /*
> +  * - One UL per VCPU for private interrupts (assumes UL is at
> +  *   least 32 bits)
> +  * - As many UL as necessary for shared interrupts.
> +  *
> +  * The private interrupts are accessed via the "private"
> +  * field, one UL per vcpu (the state for vcpu n is in
> +  * private[n]). The shared interrupts are accessed via the
> +  * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
> +  */
> + unsigned long *private;
> + unsigned long *shared;

the comment above the define for REG_OFFSET_SWIZZLE still talks about
the unions in struct vgic_bitmap, which is no longer true.  Mind
updating that comment?

>  };
>  
>  struct vgic_bytemap {
> - u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4];
> - u32 shared[VGIC_NR_SHARED_IRQS  / 4];
> + /*
> +  * - 8 u32 per VCPU for private interrupts
> +  * - As many u32 as necessary for shared interrupts.
> +  *
> +  * The private interrupts are accessed via the "private"
> +  * field, (the state for vcpu n is in private[n*8] to
> +  * private[n*8 + 7]). The shared interrupts are accessed via
> +  * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
> +  * shared[(n-32)/4] word).
> +  */
> + u32 *private;
> + u32 *shared;
>  };
>  
>  struct kvm_vcpu;
> @@ -127,6 +141,9 @@ struct vgic_dist {
>   boolin_kernel;
>   boolready;
>  
> + int nr_cpus;
> + int nr_irqs;
> +
>   /* Virtual control interface mapping */
>   void __iomem*vctrl_base;
>  
> @@ -166,15 +183,36 @@ struct vgic_dist {
>   /* Level/edge triggered */
>   struct vgic_bitmap  irq_cfg;
>  
> - /* Source CPU per SGI and target CPU */
> - u8  irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS];
> + /*
> +  * Source CPU per SGI and target CPU:
> +  *
> +  * Each byte represent a SGI observable on a VCPU, each bit of
> +  * this byte indicating if the corresponding VCPU has
> +  * generated this interrupt. This is a GICv2 feature only.
> +  *
> +  * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
> +  * the SGIs observable on VCPUn.
> +  */
> + u8  *irq_sgi_sources;
>  
> - /* Target CPU for each IRQ */
> - u8  irq_spi_cpu[VGIC_NR_SHARED_IRQS];
> - struct vgic_bitmap  irq_spi_target[VGIC_MAX_CPUS];
> + /*
> +  * Target CPU for each SPI:
> +  *
> +  * Array of available SPI, each byte indicating the target
> +  * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
> +  */
> + u8  *irq_spi_cpu;
> +
> + /*
> +  * Reverse lookup of irq_spi_cpu for faster compute pending:
> +  *
> +  * Array of bitmaps, one per VCPU, describing is IRQn is

ah, describing *if* ?

> +  * routed to a particular VCPU.
> +  */
> + struct vgic_bitmap  *irq_spi_target;
>  
>   /* Bitmap indicating which CPU has something pending */

Re: [PATCH v4 6/8] arm/arm64: KVM: vgic: kill VGIC_NR_IRQS

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:13PM +0100, Marc Zyngier wrote:
> Nuke VGIC_NR_IRQS entirely, now that the distributor instance
> contains the number of IRQ allocated to this GIC.
> 
> Also add VGIC_NR_IRQS_LEGACY to preserve the current API.
> 
> Signed-off-by: Marc Zyngier 

Did anything dramatically change in this patch since last time around?

If not, I'll re-affirm my tag:

Reviewed-by: Christoffer Dall 

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 8/8] arm/arm64: KVM: vgic: make number of irqs a configurable attribute

2014-09-11 Thread Christoffer Dall

On Thu, Sep 11, 2014 at 12:09:15PM +0100, Marc Zyngier wrote:
> In order to make the number of interrupts configurable, use the new
> fancy device management API to add KVM_DEV_ARM_VGIC_GRP_NR_IRQS as
> a VGIC configurable attribute.
> 
> Userspace can now specify the exact size of the GIC (by increments
> of 32 interrupts).
> 
> Signed-off-by: Marc Zyngier 
> ---
>  Documentation/virtual/kvm/devices/arm-vgic.txt | 10 +++
>  arch/arm/include/uapi/asm/kvm.h|  1 +
>  arch/arm64/include/uapi/asm/kvm.h  |  1 +
>  virt/kvm/arm/vgic.c| 37 
> ++
>  4 files changed, 49 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt 
> b/Documentation/virtual/kvm/devices/arm-vgic.txt
> index 7f4e91b..df8b0c7 100644
> --- a/Documentation/virtual/kvm/devices/arm-vgic.txt
> +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
> @@ -71,3 +71,13 @@ Groups:
>Errors:
>  -ENODEV: Getting or setting this register is not yet supported
>  -EBUSY: One or more VCPUs are running
> +
> +  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
> +  Attributes:
> +A value describing the number of interrupts (SGI, PPI and SPI) for
> +this GIC instance, ranging from 64 to 1024, in increments of 32.
> +
> +  Errors:
> +-EINVAL: Value set is out of the expected range
> +-EBUSY: Value has already be set, or GIC has already been initialized
> +with default values.
> diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
> index e6ebdd3..8b51c1a 100644
> --- a/arch/arm/include/uapi/asm/kvm.h
> +++ b/arch/arm/include/uapi/asm/kvm.h
> @@ -173,6 +173,7 @@ struct kvm_arch_memory_slot {
>  #define   KVM_DEV_ARM_VGIC_CPUID_MASK(0xffULL << 
> KVM_DEV_ARM_VGIC_CPUID_SHIFT)
>  #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT  0
>  #define   KVM_DEV_ARM_VGIC_OFFSET_MASK   (0xffffffffULL << 
> KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
> +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
>  
>  /* KVM_IRQ_LINE irq field index values */
>  #define KVM_ARM_IRQ_TYPE_SHIFT   24
> diff --git a/arch/arm64/include/uapi/asm/kvm.h 
> b/arch/arm64/include/uapi/asm/kvm.h
> index e633ff8..b5cd6ed 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -159,6 +159,7 @@ struct kvm_arch_memory_slot {
>  #define   KVM_DEV_ARM_VGIC_CPUID_MASK(0xffULL << 
> KVM_DEV_ARM_VGIC_CPUID_SHIFT)
>  #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT  0
>  #define   KVM_DEV_ARM_VGIC_OFFSET_MASK   (0xffffffffULL << 
> KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
> +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
>  
>  /* KVM_IRQ_LINE irq field index values */
>  #define KVM_ARM_IRQ_TYPE_SHIFT   24
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 9180823..744388d 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -2331,6 +2331,36 @@ static int vgic_set_attr(struct kvm_device *dev, 
> struct kvm_device_attr *attr)
>  
>   return vgic_attr_regs_access(dev, attr, &reg, true);
>   }
> + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
> + u32 __user *uaddr = (u32 __user *)(long)attr->addr;
> + u32 val;
> + int ret = 0;
> +
> + if (get_user(val, uaddr))
> + return -EFAULT;
> +
> + /*
> +  * We require:
> +  * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
> +  * - at most 1024 interrupts
> +  * - a multiple of 32 interrupts
> +  */
> + if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
> + val > VGIC_MAX_IRQS ||
> + (val & 31))
> + return -EINVAL;
> +
> + mutex_lock(&dev->kvm->lock);
> +
> + if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
> + ret = -EBUSY;
> + else
> + dev->kvm->arch.vgic.nr_irqs = val;
> +
> + mutex_unlock(&dev->kvm->lock);
> +
> + return ret;
> + }
>  
>   }
>  
> @@ -2367,6 +2397,11 @@ static int vgic_get_attr(struct kvm_device *dev, 
> struct kvm_device_attr *attr)
>   r = put_user(reg, uaddr);
>   break;
>   }
> + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
> + u32 __user *uaddr = (u32 __user *)(long)attr->addr;
> + r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
> + break;
> + }
>  
>   }
>  
> @@ -2403,6 +2438,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct 
> kvm_device_attr *attr)
>   case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
>   offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
>   return vgic_has_attr_regs(vgic_cpu_ranges, offset);
> + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
> + return 0;
>   }
>   return -ENXIO;
>  }
> -- 
> 2.0.4
> 

Reviewed-by: Christoffer Dall 
--
To unsubscribe from this list: send the line

Re: [PATCH 1/2] KVM: document KVM_SET_GUEST_DEBUG api

2014-09-11 Thread Christoffer Dall

On Tue, Sep 09, 2014 at 05:27:18PM +0100, Alex Bennée wrote:
> In preparation for working on the ARM implementation I noticed the debug
> interface was missing from the API document. I've pieced together the
> expected behaviour from the code and commit messages written it up as
> best I can.
> 
> Signed-off-by: Alex Bennée 
> 
> diff --git a/Documentation/virtual/kvm/api.txt 
> b/Documentation/virtual/kvm/api.txt
> index d3dde61..723d3f3 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2575,6 +2575,50 @@ associated with the service will be forgotten, and 
> subsequent RTAS
>  calls by the guest for that service will be passed to userspace to be
>  handled.
>  
> +4.87 KVM_SET_GUEST_DEBUG
> +
> +Capability: KVM_CAP_SET_GUEST_DEBUG
> +Architectures: x86, s390, ppc
> +Type: vcpu ioctl
> +Parameters: struct kvm_guest_debug (in)
> +Returns: 0 on success; -1 on error

Any specific error codes that need explaining here?

> +
> +struct kvm_guest_debug {
> +   __u32 control;
> +   __u32 pad;
> +   struct kvm_guest_debug_arch arch;
> +};
> +
> +Set up the processor specific debug registers and configure vcpu for

configure *the* vcpu?

> +handling guest debug events. There are two parts to the structure, the

"handling guest debug events": does this mean whatever user space needs to
configure so that the guest can debug things, or is this for userspace to
debug the guest execution?  We could probably be more specific.

> +first a control bitfield indicates the type of debug events to handle
> +when running. Common control bits are:

'when running'? 'when running the vcpu'?

> +
> +  - KVM_GUESTDBG_ENABLE:guest debugging is enabled
> +  - KVM_GUESTDBG_SINGLESTEP:the next run should single-step
> +
> +The top 16 bits of the control field are architecture specific control
> +flags which can include the following:
> +
> +  - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86]
> +  - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
> +  - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
> +  - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
> +  - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
> +
> +For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
> +are enabled in memory so we need to ensure breakpoint exceptions are
> +correctly trapped and the KVM run loop exits at the breakpoint and not
> +running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP

I didn't quite understand this bit, can you clarify slightly?  For
example, I don't know what it means that a software breakpoint is
'enabled in memory' and I'm not quite sure what the goal you are arguing
for here is; is this about if this ioctl is used to set a specific
breakpoint then we want to make sure that the breakpoint exception goes
to KVM and not to the guest?

also, are you not missing a 'that' before 'the KVM run loop...'

> +we need to ensure the guest vCPUs architecture specific registers are

please be consistent about the use of vcpu, VCPUs, vCPUs, etc.  The
document seems to prefer lowercase vcpus most places.

> +updated to the correct (supplied) values.
> +
> +The second part of the structure is architecture specific and
> +typically contains a set of debug registers.
> +
> +When debug events exit the main run loop with the reason

I think you should just talk about when the VCPU exits the guest and not
be specific about whether we have a main run loop or not.

> +KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
> +structure containing architecture specific debug information.

I feel like this sentence leaves me hanging; When they exit with the
debug information, then what?  Or did you mean to say that when they
exit, then the arch struct is filled with the debug info?

>  
>  5. The kvm_run structure
>  
> -- 
> 2.1.0
> 

Thanks for taking care of this!
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/2] KVM API documentation patches

2014-09-11 Thread Christoffer Dall

On Wed, Sep 10, 2014 at 11:34:53AM +0200, Paolo Bonzini wrote:
> Il 09/09/2014 18:27, Alex Bennée ha scritto:
> > Hi,
> > 
> > I'm preparing to add ARM KVM GDB support and I went to read the API
> > documentation and found it surprisingly mute on the subject ;-)
> > 
> > The first patch documents the "new" KVM_SET_GUEST_DEBUG ioctl based on
> > reviewing the code. I've included a long CC list of people who've
> > actually done the various implementations who I hope can sanity check
> > the write-up. The second is a trivial formatting fix for what looks
> > like a minor merge trip-up.
> > 
> > Alex Bennée (2):
> >   KVM: document KVM_SET_GUEST_DEBUG api
> >   KVM: fix api documentation of KVM_GET_EMULATED_CPUID
> > 
> >  Documentation/virtual/kvm/api.txt | 184 
> > +++---
> >  1 file changed, 114 insertions(+), 70 deletions(-)
> > 
> 
> Thanks, applying both!
> 
ha, read this after reviewing his text.  Was I too nit-picky? ;)

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: arm64: add gic-400 compatible

2014-09-11 Thread Christoffer Dall

On Mon, Sep 08, 2014 at 10:21:44PM +, Joel Schopp wrote:
> Add a one liner to identify the gic-400.  It's gicv2 with optional MSI 
> extensions.
> 
> Cc: Christoffer Dall 
> Signed-off-by: Joel Schopp 
> ---
>  virt/kvm/arm/vgic.c |1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 73eba79..e81444e 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -1550,6 +1550,7 @@ static struct notifier_block vgic_cpu_nb = {
>  
>  static const struct of_device_id vgic_ids[] = {
>   { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, },
> + { .compatible = "arm,gic-400", .data = vgic_v2_probe, },
>   { .compatible = "arm,gic-v3", .data = vgic_v3_probe, },
>   {},
>  };
> 
Acked-by: Christoffer Dall 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 1/2] virtio-rng: fix stuck of hot-unplugging busy device

2014-09-11 Thread Rusty Russell

Amit Shah  writes:
> On (Wed) 10 Sep 2014 [14:11:36], Amos Kong wrote:
>> When we try to hot-remove a busy virtio-rng device from QEMU monitor,
>> the device can't be hot-removed. Because virtio-rng driver hangs at
>> wait_for_completion_killable().
>> 
>> This patch exits the waiting by completing have_data completion before
>> unregistering, resets data_avail to avoid the hwrng core use wrong
>> buffer bytes.
>> 
>> Signed-off-by: Amos Kong 
>> Cc: sta...@vger.kernel.org
>
> Reviewed-by: Amit Shah 

Thanks, applied.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen


Hi Gleb, Paolo,

On 09/11/2014 10:47 PM, Gleb Natapov wrote:

On Thu, Sep 11, 2014 at 04:37:39PM +0200, Paolo Bonzini wrote:

Il 11/09/2014 16:31, Gleb Natapov ha scritto:

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
need to "do something".

We will do something on L2->L1 exit. We will call kvm_reload_apic_access_page().
That is what patch 5 of this series is doing.

Sorry, I meant "the APIC access page prepared by L1" for L2's execution.

You wrote:


if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control & 
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(APIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

but in that case you have to redo nested_get_page, so "do nothing"
doesn't work.


Ah, 7/7 is new in this submission. Before that this page was still
pinned.  Looking at 7/7 now I do not see how it can work since it has no
code for mmu notifier to detect that it deals with such page and call
kvm_reload_apic_access_page().


Since L1 and L2 share one apic page, if the page is unmapped, 
mmu_notifier will

be called, and :

 - if vcpu is in L1, a L1->L0 exit is raised. apic page's pa will be 
updated in the next

   L0->L1 entry by making vcpu request.

 - if vcpu is in L2 (is_guest_mode, right?), a L2->L0 exit is raised. 
nested_vmx_vmexit()
   will not be called since it is called in L2->L1 exit. It returns 
from vmx_vcpu_run()
   directly, right ? So we should update apic page in L0->L2 entry. 
This is also done

   by making vcpu request, right ?.

   prepare_vmcs02() is called in L1->L2 entry, and nested_vmx_vmexit() 
is called in
   L2->L1 exit. So we also need to update L1's vmcs in 
nested_vmx_vmexit() in patch 5/7.


IIUC, I think patch 1~6 has done such things.

And yes, the is_guest_mode() check is not needed.


I said to Tang previously that nested
kvm has a bunch of pinned pages that are hard to deal with and suggested
to iron out the non-nested case first :(


Yes, and maybe adding patch 7 is not a good idea for now.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v5 4/7] kvm, mem-hotplug: Reload L1' apic access page on migration in vcpu_enter_guest().

2014-09-11 Thread tangchen


Hi Paolo,

On 09/11/2014 10:24 PM, Paolo Bonzini wrote:

Il 11/09/2014 16:21, Gleb Natapov ha scritto:

As far as I can tell the if that is needed there is:

if (!is_guest_mode() || !(vmcs12->secondary_vm_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 write(APIC_ACCESS_ADDR)

In other words if L2 shares L1 apic access page then reload, otherwise do 
nothing.

What if the page being swapped out is L1's APIC access page?  We don't
run prepare_vmcs12 in that case because it's an L2->L0->L2 entry, so we
need to "do something".


Are you talking about the case that L1 and L2 have different apic pages ?
I think I didn't deal with this situation in this patch set.

Sorry I didn't say it clearly. Here, I assume L1 and L2 share the same
apic page.
If we are in L2, and the page is migrated, we update L2's vmcs by making
a vcpu request. And of course, we should also update L1's vmcs. This is
done by patch 5.

We make the vcpu request again in nested_vmx_vmexit().

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [question] virtio-blk performance degradation happened with virtio-serial

2014-09-11 Thread Zhang Haoyu

>>> > > If virtio-blk and virtio-serial share an IRQ, the guest operating 
>>> > > system has to check each virtqueue for activity. Maybe there is some 
>>> > > inefficiency doing that.
>>> > > AFAIK virtio-serial registers 64 virtqueues (on 31 ports + console) 
>>> > > even if everything is unused.
>>> > 
>>> > That could be the case if MSI is disabled.
>>> 
>>> Do the windows virtio drivers enable MSIs, in their inf file?
>>
>>It depends on the version of the drivers, but it is a reasonable guess
>>at what differs between Linux and Windows.  Haoyu, can you give us the
>>output of lspci from a Linux guest?
>>
>I made a test with fio on rhel-6.5 guest, the same degradation happened too,  
>this degradation can be reproduced on rhel6.5 guest 100%.
>virtio_console module installed:
>64K-write-sequence: 285 MBPS, 4380 IOPS
>virtio_console module uninstalled:
>64K-write-sequence: 370 MBPS, 5670 IOPS
>
I used "top -d 1 -H -p QEMU_PID" to monitor the cpu usage, and found that,
virtio_console module installed:
qemu main thread cpu usage: 98%
virtio_console module uninstalled:
qemu main thread cpu usage: 60%

"perf top -p QEMU_PID" result,
virtio_console module installed:
   PerfTop:9868 irqs/sec  kernel:76.4%  exact:  0.0% [4000Hz cycles],  
(target_pid: 88381)
--

11.80%  [kernel] [k] _raw_spin_lock_irqsave
 8.42%  [kernel] [k] _raw_spin_unlock_irqrestore
 7.33%  [kernel] [k] fget_light
 6.28%  [kernel] [k] fput
 3.61%  [kernel] [k] do_sys_poll
 3.30%  qemu-system-x86_64   [.] qcow2_check_metadata_overlap
 3.10%  [kernel] [k] __pollwait
 2.15%  qemu-system-x86_64   [.] qemu_iohandler_poll
 1.44%  libglib-2.0.so.0.3200.4  [.] g_array_append_vals
 1.36%  libc-2.13.so [.] 0x0011fc2a
 1.31%  libpthread-2.13.so   [.] pthread_mutex_lock
 1.24%  libglib-2.0.so.0.3200.4  [.] 0x0001f961
 1.20%  libpthread-2.13.so   [.] __pthread_mutex_unlock_usercnt
 0.99%  [kernel] [k] eventfd_poll
 0.98%  [vdso]   [.] 0x0771
 0.97%  [kernel] [k] remove_wait_queue
 0.96%  qemu-system-x86_64   [.] qemu_iohandler_fill
 0.95%  [kernel] [k] add_wait_queue
 0.69%  [kernel] [k] __srcu_read_lock
 0.58%  [kernel] [k] poll_freewait
 0.57%  [kernel] [k] _raw_spin_lock_irq
 0.54%  [kernel] [k] __srcu_read_unlock
 0.47%  [kernel] [k] copy_user_enhanced_fast_string
 0.46%  [kvm_intel]  [k] vmx_vcpu_run
 0.46%  [kvm][k] vcpu_enter_guest
 0.42%  [kernel] [k] tcp_poll
 0.41%  [kernel] [k] system_call_after_swapgs
 0.40%  libglib-2.0.so.0.3200.4  [.] g_slice_alloc
 0.40%  [kernel] [k] system_call
 0.38%  libpthread-2.13.so   [.] 0xe18d
 0.38%  libglib-2.0.so.0.3200.4  [.] g_slice_free1
 0.38%  qemu-system-x86_64   [.] address_space_translate_internal
 0.38%  [kernel] [k] _raw_spin_lock
 0.37%  qemu-system-x86_64   [.] phys_page_find
 0.36%  [kernel] [k] get_page_from_freelist
 0.35%  [kernel] [k] sock_poll
 0.34%  [kernel] [k] fsnotify
 0.31%  libglib-2.0.so.0.3200.4  [.] g_main_context_check
 0.30%  [kernel] [k] do_direct_IO
 0.29%  libpthread-2.13.so   [.] pthread_getspecific

virtio_console module uninstalled:
   PerfTop:9138 irqs/sec  kernel:71.7%  exact:  0.0% [4000Hz cycles],  
(target_pid: 88381)
--

 5.72%  qemu-system-x86_64   [.] qcow2_check_metadata_overlap
 4.51%  [kernel] [k] fget_light
 3.98%  [kernel] [k] _raw_spin_lock_irqsave
 2.55%  [kernel] [k] fput
 2.48%  libpthread-2.13.so   [.] pthread_mutex_lock
 2.46%  [kernel] [k] _raw_spin_unlock_irqrestore
 2.21%  libpthread-2.13.so   [.] __pthread_mutex_unlock_usercnt
 1.71%  [vdso]   [.] 0x060c
 1.68%  libc-2.13.so [.] 0x000e751f
 1.64%  libglib-2.0.so.0.3200.4  [.] 0x0004fca0
 1.20%  [kernel] [k] __srcu_read_lock
 1.14%  [kernel] [k] do_sys_poll
 0.96%  [kernel] [k] _raw_spin_lock_irq
 0.95%  [kernel] [k] __pollwait
 0.91%  [kernel] [k] __srcu_read_unlock
 0.78%  [kernel] [k] tcp_poll
 0.74%  [

[PATCH] KVM: Refactor making request to makes it meaningful

2014-09-11 Thread guohliu

This patch replaces the open-coded set_bit calls with kvm_make_request
to make the code more readable and consistent.

Signed-off-by: Guo Hui Liu 
---
 arch/x86/kvm/x86.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 916e895..5fed2de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1518,7 +1518,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
pvclock_update_vm_gtod_copy(kvm);
 
kvm_for_each_vcpu(i, vcpu, kvm)
-   set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -1661,7 +1661,7 @@ static void kvmclock_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
 
kvm_for_each_vcpu(i, vcpu, kvm) {
-   set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_vcpu_kick(vcpu);
}
 }
@@ -1670,7 +1670,7 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
 {
struct kvm *kvm = v->kvm;
 
-   set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
schedule_delayed_work(&kvm->arch.kvmclock_update_work,
KVMCLOCK_UPDATE_DELAY);
 }
@@ -2846,7 +2846,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
vcpu->arch.tsc_offset_adjustment = 0;
-   set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
 
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
@@ -5600,7 +5600,7 @@ static void pvclock_gtod_update_fn(struct work_struct 
*work)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
-   set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+   kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
spin_unlock(&kvm_lock);
 }
@@ -6978,7 +6978,7 @@ int kvm_arch_hardware_enable(void)
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!stable && vcpu->cpu == smp_processor_id())
-   set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+   kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (stable && vcpu->arch.last_host_tsc > local_tsc) {
backwards_tsc = true;
if (vcpu->arch.last_host_tsc > max_tsc)
@@ -7032,8 +7032,7 @@ int kvm_arch_hardware_enable(void)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
vcpu->arch.last_host_tsc = local_tsc;
-   set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
-   &vcpu->requests);
+   kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, 
vcpu);
}
 
/*
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 11/17] COLO ctl: implement colo checkpoint protocol

2014-09-11 Thread Hongyang Yang




在 08/01/2014 11:03 PM, Dr. David Alan Gilbert 写道:

* Yang Hongyang (yan...@cn.fujitsu.com) wrote:

implement colo checkpoint protocol.

Checkpoint synchronizing points.

   Primary Secondary
   NEW @
   Suspend
   SUSPENDED   @
   Suspend&Save state
   SEND@
   Send state  Receive state
   RECEIVED@
   Flush network   Load state
   LOADED  @
   Resume  Resume

   Start Comparing
NOTE:
  1) '@' who sends the message
  2) Every sync-point is synchronized by two sides with only
 one handshake(single direction) for low-latency.
 If stricter synchronization is required, an opposite-direction
 sync-point should be added.
  3) Since sync-points are single direction, the remote side may
 go forward a lot when this side just receives the sync-point.

Signed-off-by: Yang Hongyang 
---
  migration-colo.c | 268 +--
  1 file changed, 262 insertions(+), 6 deletions(-)

diff --git a/migration-colo.c b/migration-colo.c
index 2699e77..a708872 100644
--- a/migration-colo.c
+++ b/migration-colo.c
@@ -24,6 +24,41 @@
   */
  #define CHKPOINT_TIMER 1

+enum {
+COLO_READY = 0x46,
+
+/*
+ * Checkpoint synchronzing points.
+ *
+ *  Primary Secondary
+ *  NEW @
+ *  Suspend
+ *  SUSPENDED   @
+ *  Suspend&Save state
+ *  SEND@
+ *  Send state  Receive state
+ *  RECEIVED@
+ *  Flush network   Load state
+ *  LOADED  @
+ *  Resume  Resume
+ *
+ *  Start Comparing
+ * NOTE:
+ * 1) '@' who sends the message
+ * 2) Every sync-point is synchronized by two sides with only
+ *one handshake(single direction) for low-latency.
+ *If more strict synchronization is required, a opposite direction
+ *sync-point should be added.
+ * 3) Since sync-points are single direction, the remote side may
+ *go forward a lot when this side just receives the sync-point.
+ */
+COLO_CHECKPOINT_NEW,
+COLO_CHECKPOINT_SUSPENDED,
+COLO_CHECKPOINT_SEND,
+COLO_CHECKPOINT_RECEIVED,
+COLO_CHECKPOINT_LOADED,
+};
+
  static QEMUBH *colo_bh;

  bool colo_supported(void)
@@ -185,30 +220,161 @@ static const QEMUFileOps colo_read_ops = {
  .close = colo_close,
  };

+/* colo checkpoint control helper */
+static bool is_master(void);
+static bool is_slave(void);
+
+static void ctl_error_handler(void *opaque, int err)
+{
+if (is_slave()) {
+/* TODO: determine whether we need to failover */
+/* FIXME: we will not failover currently, just kill slave */
+error_report("error: colo transmission failed!\n");
+exit(1);
+} else if (is_master()) {
+/* Master still alive, do not failover */
+error_report("error: colo transmission failed!\n");
+return;
+} else {
+error_report("COLO: Unexpected error happend!\n");
+exit(EXIT_FAILURE);
+}
+}
+
+static int colo_ctl_put(QEMUFile *f, uint64_t request)
+{
+int ret = 0;
+
+qemu_put_be64(f, request);
+qemu_fflush(f);
+
+ret = qemu_file_get_error(f);
+if (ret < 0) {
+ctl_error_handler(f, ret);
+return 1;
+}
+
+return ret;
+}
+
+static int colo_ctl_get_value(QEMUFile *f, uint64_t *value)
+{
+int ret = 0;
+uint64_t temp;
+
+temp = qemu_get_be64(f);
+
+ret = qemu_file_get_error(f);
+if (ret < 0) {
+ctl_error_handler(f, ret);
+return 1;
+}
+
+*value = temp;
+return 0;
+}
+
+static int colo_ctl_get(QEMUFile *f, uint64_t require)
+{
+int ret;
+uint64_t value;
+
+ret = colo_ctl_get_value(f, &value);
+if (ret) {
+return ret;
+}
+
+if (value != require) {
+error_report("unexpected state received!\n");


I find it useful to print the expected/received state to
be able to figure out what went wrong.


Good idea!




+exit(1);
+}
+
+return ret;
+}
+
  /* save */

-static __attribute__((unused)) bool is_master(void)
+static bool is_master(void)
  {
  MigrationState *s = migrate_get_current();
  return (s->state == MIG_STATE_COLO);
  }

+static int do_colo_transaction(MigrationState *s, QEMUFile *control,
+   QEMUFile *trans)
+{
+int ret;
+
+ret = colo_ctl_put(s->file, COLO_CHECKPOINT_NEW);
+if (ret) {
+goto out;
+}
+
+ret = colo_ctl_get(control, COLO_C

Re: [RFC PATCH 16/17] COLO ram cache: implement colo ram cache on slaver

2014-09-11 Thread Hongyang Yang




在 08/01/2014 11:10 PM, Dr. David Alan Gilbert 写道:

* Yang Hongyang (yan...@cn.fujitsu.com) wrote:

The ram cache is initially the same as the PVM's memory. At each
checkpoint, we cache the dirty memory of the PVM into the ram cache
(so that the ram cache is always the same as the PVM's memory at every
checkpoint), and flush the cached memory to the SVM after we have received
all of the PVM's dirty memory (we only need to flush memory that was
dirtied on either the PVM or the SVM since the last checkpoint).


(Typo: 'r' on the end of the title)

I think I understand the need for the cache, to be able to restore pages
that the SVM has modified that the PVM hadn't; however, if I understand
the change here, (to host_from_stream_offset) the SVM will load the
snapshot into the ram_cache rather than directly into host memory - why
is this necessary?  If the SVMs CPU is stopped at this point couldn't
it load snapshot pages directly into host memory, clearing pages in the SVMs
bitmap, so that the only pages that then get copied in flush_cache are
the pages that the SVM modified but the PVM *didn't* include in the snapshot?
I can see that you would need to do it the way you've done it if the
snapshot-load could fail (at the same time the PVM failed) and thus the old SVM
state would be the surviving state, but how could it fail at this point
given the whole stream is in the colo-buffer?


I can see your confusion. Yes, you are right, we can do as you said, but
in the end we still need to copy the dirty pages into the ram cache as well
(because the ram cache is a snapshot and we need to keep it updated). So the
question is whether we load the dirty pages into the snapshot first or into
host memory first. I think both methods can work and make no difference...





+static void ram_flush_cache(void);
  static int ram_load(QEMUFile *f, void *opaque, int version_id)
  {
  ram_addr_t addr;
  int flags, ret = 0;
  static uint64_t seq_iter;
+bool need_flush = false;


Probably better as 'ram_cache_needs_flush'

Dave
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
.



--
Thanks,
Yang.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 04/17] COLO info: use colo info to tell migration target colo is enabled

2014-09-11 Thread Hongyang Yang




在 08/01/2014 10:43 PM, Dr. David Alan Gilbert 写道:

* Yang Hongyang (yan...@cn.fujitsu.com) wrote:

migrate colo info to migration target to tell the target colo is
enabled.


If I understand this correctly this means that you send a 'colo info' device
information for migrations that don't have COLO enabled; that's bad because
it breaks migration unless the destination has it; I guess it's OK if you
were to guard it with a thing so it didn't do it for old machine-types.

You could use the QEMU_VM_COMMAND sections I've created for postcopy;
( http://lists.gnu.org/archive/html/qemu-devel/2014-07/msg00889.html ) and
add a QEMU_VM_CMD_COLO to indicate you want the destination to become an SVM,
   then check the capability near the start of migration and send the command.


Thank you for the reference. I've read part of your Postcopy patches, but
haven't looked into the detailed implementation yet. I will use
QEMUSizedBuffer/QEMUFile in the next version. For the QEMU_VM_COMMAND
sections, can you separate them out so that I can make use of them? Do you
have a public git tree or something?



Or perhaps there's a way to add the colo-info device on the command line so it's
not always there.

Dave


Signed-off-by: Yang Hongyang 
---
  Makefile.objs  |  1 +
  include/migration/migration-colo.h |  3 ++
  migration-colo-comm.c  | 68 ++
  vl.c   |  4 +++
  4 files changed, 76 insertions(+)
  create mode 100644 migration-colo-comm.c

diff --git a/Makefile.objs b/Makefile.objs
index cab5824..1836a68 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -50,6 +50,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
  common-obj-$(CONFIG_LINUX) += fsdev/

  common-obj-y += migration.o migration-tcp.o
+common-obj-y += migration-colo-comm.o
  common-obj-$(CONFIG_COLO) += migration-colo.o
  common-obj-y += vmstate.o
  common-obj-y += qemu-file.o
diff --git a/include/migration/migration-colo.h 
b/include/migration/migration-colo.h
index 35b384c..e3735d8 100644
--- a/include/migration/migration-colo.h
+++ b/include/migration/migration-colo.h
@@ -12,6 +12,9 @@
  #define QEMU_MIGRATION_COLO_H

  #include "qemu-common.h"
+#include "migration/migration.h"
+
+void colo_info_mig_init(void);

  bool colo_supported(void);

diff --git a/migration-colo-comm.c b/migration-colo-comm.c
new file mode 100644
index 000..ccbc246
--- /dev/null
+++ b/migration-colo-comm.c
@@ -0,0 +1,68 @@
+/*
+ *  COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ *  (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ *  Copyright (C) 2014 FUJITSU LIMITED
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ *
+ */
+
+#include 
+
+#define DEBUG_COLO
+
+#ifdef DEBUG_COLO
+#define DPRINTF(fmt, ...) \
+do { fprintf(stdout, "COLO: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+static bool colo_requested;
+
+/* save */
+
+static bool migrate_use_colo(void)
+{
+MigrationState *s = migrate_get_current();
+return s->enabled_capabilities[MIGRATION_CAPABILITY_COLO];
+}
+
+static void colo_info_save(QEMUFile *f, void *opaque)
+{
+qemu_put_byte(f, migrate_use_colo());
+}
+
+/* restore */
+
+static int colo_info_load(QEMUFile *f, void *opaque, int version_id)
+{
+int value = qemu_get_byte(f);
+
+if (value && !colo_supported()) {
+fprintf(stderr, "COLO is not supported\n");
+return -EINVAL;
+}
+
+if (value && !colo_requested) {
+DPRINTF("COLO requested!\n");
+}
+
+colo_requested = value;
+
+return 0;
+}
+
+static SaveVMHandlers savevm_colo_info_handlers = {
+.save_state = colo_info_save,
+.load_state = colo_info_load,
+};
+
+void colo_info_mig_init(void)
+{
+register_savevm_live(NULL, "colo info", -1, 1,
+ &savevm_colo_info_handlers, NULL);
+}
diff --git a/vl.c b/vl.c
index fe451aa..1a282d8 100644
--- a/vl.c
+++ b/vl.c
@@ -89,6 +89,7 @@ int main(int argc, char **argv)
  #include "sysemu/dma.h"
  #include "audio/audio.h"
  #include "migration/migration.h"
+#include "migration/migration-colo.h"
  #include "sysemu/kvm.h"
  #include "qapi/qmp/qjson.h"
  #include "qemu/option.h"
@@ -4339,6 +4340,9 @@ int main(int argc, char **argv, char **envp)

  blk_mig_init();
  ram_mig_init();
+if (colo_supported()) {
+colo_info_mig_init();
+}

  /* open the virtual block devices */
  if (snapshot)
--
1.9.1


--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
.



--
Thanks,
Yang.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

98 matches

Mail list logo