date:20141127

RE: [question] lots of interrupts injected to vm when pressing some key w/o releasing

2014-11-27 Thread Zhang Haoyu

 I tested win-server-2008 with "-cpu
 core2duo,hv_spinlocks=0x,hv_relaxed,hv_time",
 this problem still happened: about 200,000 vmexits per second,
 giving a very bad experience, as if the VM were stuck.
>>> 
>>> Please upload a full trace somewhere, or at least the "perf report"
>>> output.
>>> 
>> 
>> And, if I revert commit 0bc830b0, the problem disappears.
> 
> Please send the full trace file.  If you compress it, it should be small.
> 
 Please see attachment 1.
 
> Paolo
>>> 
>>> Can you try the following draft patch to see whether it solves your
>>> problem? This patch is based on commit 0bc830b0.
>>> 
>> After applying this patch, VM got stuck with black-screen at boot
>
>It works well on my side, and this patch does fix the issue on my box.
>I am using Windows 2008 R2 for testing; here is the qemu cmdline:
>qemu-system-x86_64 -enable-kvm -m 4G -smp 4 -net nic,macaddr=00:12:41:41:13:41 
>-net tap,script=/etc/kvm/qemu-ifup win2k8.qcow
>
>Can you provide your configuration for me to test?
>
/usr/bin/kvm -id 2284096534876 -chardev 
socket,id=qmp,path=/var/run/qemu-server/2284096534876.qmp,server,nowait -mon 
chardev=qmp,mode=control -vnc :0,websocket,to=200,x509,password -pidfile 
/var/run/qemu-server/2284096534876.pid -daemonize -name win2008-x64 -smp 
sockets=1,cores=2 -cpu core2duo,hv_spinlocks=0x,hv_relaxed -nodefaults -vga 
cirrus -no-hpet -k en-us -boot menu=on,splash-time=8000 -m 4096 -usb -drive 
if=none,id=drive-ide0,media=cdrom,aio=threads -device 
ide-cd,bus=ide.0,unit=0,drive=drive-ide0,id=ide0,bootindex=100 -drive 
file=/sf/data/local/images/host-4061863144dc/d23a20df36a5/win2008-x64.vm/vm-disk-1.qcow2,if=none,id=drive-ide1,cache=none,aio=threads
 -device ide-hd,bus=ide.0,unit=1,drive=drive-ide1,id=ide1 -netdev 
type=tap,id=net0,ifname=228409653487600,script=/sf/etc/kvm/vtp-bridge,vhost=on,vhostforce=on
 -device 
virtio-net-pci,romfile=,mac=FE:FC:FE:99:4C:2E,netdev=net0,bus=pci.0,addr=0x12,id=net0
 -rtc driftfix=slew,clock=rt,base=localtime -global 
kvm-pit.lost_tick_policy=discard -global PIIX4_PM.disable_s3=1 -global 
PIIX4_PM.disable_s4=1 -chardev 
socket,path=/var/run/qemu-server/2284096534876.virtser,server,nowait,id=channelser
 -device virtio-serial,max_ports=2,ioeventfd=off -device 
virtserialport,chardev=channelser,name=channelser.virtserial0.0 -bios 
/sf/share/kvm/bios.bin.1

You can change "-id" option to "-uuid".

Thanks,
Zhang Haoyu
>> stage, # trace-cmd report:
>> version = 6
>> CPU 0 is empty
>> CPU 1 is empty
>> CPU 2 is empty
>> CPU 3 is empty
>> CPU 5 is empty
>> CPU 7 is empty
>> cpus=8
>>  kvm-1266  [004] 14399.834397: kvm_set_irq:  gsi 9
>>  level 1 source 0 kvm-1266  [004] 14399.834403:
>>  kvm_pic_set_irq:  chip 1 pin 1 (edge|masked) kvm-1266 
>>  [004] 14399.834411: kvm_apic_accept_irq: apicid 0 vec 177
>>  (LowPrio|level) kvm-1266  [004] 14399.834412:
>>  kvm_ioapic_set_irq:   pin 9 dst 3 vec=177
>>  (LowPrio|logical|level) kvm-1266  [004] 14402.180013:
>>  kvm_set_irq:  gsi 9 level 1 source 0 kvm-1266 
>>  [004] 14402.180019: kvm_pic_set_irq:  chip 1 pin 1
>>  (edge|masked) kvm-1266  [004] 14402.180028:
>>  kvm_apic_accept_irq: apicid 1 vec 177 (LowPrio|level)
>>  kvm-1266  [004] 14402.180029: kvm_ioapic_set_irq:   pin 9
>>  dst 3 vec=177 (LowPrio|logical|level) kvm-1266  [004]
>>  14404.525627: kvm_set_irq:  gsi 9 level 1 source 0
>>  kvm-1266  [004] 14404.525634: kvm_pic_set_irq:  chip 1
>>  pin 1 (edge|masked) kvm-1266  [004] 14404.525641:
>>  kvm_apic_accept_irq: apicid 0 vec 177 (LowPrio|level)
>>  kvm-1266  [004] 14404.525642: kvm_ioapic_set_irq:   pin 9
>>  dst 3 vec=177 (LowPrio|logical|level) kvm-1266  [004]
>>  14406.871238: kvm_set_irq:  gsi 9 level 1 source 0
>>  kvm-1266  [004] 14406.871245: kvm_pic_set_irq:  chip 1
>>  pin 1 (edge|masked) kvm-1266  [004] 14406.871254:
>>  kvm_apic_accept_irq: apicid 1 vec 177 (LowPrio|level)
>>  kvm-1266  [004] 14406.871256: kvm_ioapic_set_irq:   pin 9
>>  dst 3 vec=177 (LowPrio|logical|level) kvm-1266  [006]
>>  14409.216849: kvm_set_irq:  gsi 9 level 1 source 0
>>  kvm-1266  [006] 14409.216855: kvm_pic_set_irq:  chip 1
>>  pin 1 (edge|masked) kvm-1266  [006] 14409.216862:
>>  kvm_apic_accept_irq: apicid 0 vec 177 (LowPrio|level)
>>  kvm-1266  [006] 14409.216863: kvm_ioapic_set_irq:   pin 9
>>  dst 3 vec=177 (LowPrio|logical|level) kvm-1266  [006]
>>  14411.562475: kvm_set_irq:  gsi 9 level 1 source 0
>>  kvm-1266  [006] 14411.562481: kvm_pic_set_irq:  chip 1
>>

Re: [RFC PATCH 2/5] ARM: on IO mem abort - route the call to KVM MMIO bus

2014-11-27 Thread Eric Auger

On 11/24/2014 10:26 PM, Nikolay Nikolaev wrote:
> On IO memory abort, try to handle the MMIO access thorugh the KVM
typo
> registered read/write callbacks. This is done by invoking the relevant
> kvm_io_bus_* API.
> 
> Signed-off-by: Nikolay Nikolaev 
> ---
>  arch/arm/kvm/mmio.c |   33 +
>  1 file changed, 33 insertions(+)
> 
> diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
> index 4cb5a93..81230da 100644
> --- a/arch/arm/kvm/mmio.c
> +++ b/arch/arm/kvm/mmio.c
> @@ -162,6 +162,36 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t 
> fault_ipa,
>   return 0;
>  }
>  
> +/**
> + * kvm_handle_mmio - handle an in-kernel MMIO access
handle_kernel_mmio
> + * @vcpu:pointer to the vcpu performing the access
> + * @run: pointer to the kvm_run structure
> + * @mmio:pointer to the data describing the access
> + *
> + * returns true if the MMIO access has been performed in kernel space,
> + * and false if it needs to be emulated in user space.
> + */
> +static bool handle_kernel_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> + struct kvm_exit_mmio *mmio)
> +{
> + int ret;
> +
> + if (mmio->is_write) {
> + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, mmio->phys_addr,
> + mmio->len, &mmio->data);
> +
> + } else {
> + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, mmio->phys_addr,
> + mmio->len, &mmio->data);

> + }
> + if (!ret) {
> + kvm_prepare_mmio(run, mmio);
> + kvm_handle_mmio_return(vcpu, run);
Shouldn't the comment associated with kvm_handle_mmio_return, saying
the function "should only be called after returning from userspace for
MMIO load emulation", be updated? Here we did kernel emulation and we
call it. It is also called in vgic. I must acknowledge I do not fully
understand what the .rt is.
> + }
> +
> + return !ret;
what if kvm_io_bus_read returned -EOPNOTSUPP?
> +}
> +
>  int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
>phys_addr_t fault_ipa)
>  {
> @@ -200,6 +230,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run 
> *run,
>   if (vgic_handle_mmio(vcpu, run, &mmio))
>   return 1;
>  
> + if (handle_kernel_mmio(vcpu, run, &mmio))
> + return 1;
> +

>   kvm_prepare_mmio(run, &mmio);
>   return 0;
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 0/6] ARM64: KVM: PMU infrastructure support

2014-11-27 Thread Anup Patel

On Tue, Nov 25, 2014 at 7:12 PM, Christoffer Dall
 wrote:
> On Tue, Nov 25, 2014 at 06:17:03PM +0530, Anup Patel wrote:
>> Hi Christoffer,
>>
>> On Mon, Nov 24, 2014 at 8:07 PM, Christoffer Dall
>>  wrote:
>> > On Mon, Nov 24, 2014 at 02:14:48PM +0530, Anup Patel wrote:
>> >> On Fri, Nov 21, 2014 at 5:19 PM, Christoffer Dall
>> >>  wrote:
>> >> > On Fri, Nov 21, 2014 at 04:06:05PM +0530, Anup Patel wrote:
>> >> >> Hi Christoffer,
>> >> >>
>> >> >> On Fri, Nov 21, 2014 at 3:29 PM, Christoffer Dall
>> >> >>  wrote:
>> >> >> > On Thu, Nov 20, 2014 at 08:17:32PM +0530, Anup Patel wrote:
>> >> >> >> On Wed, Nov 19, 2014 at 8:59 PM, Christoffer Dall
>> >> >> >>  wrote:
>> >> >> >> > On Tue, Nov 11, 2014 at 02:48:25PM +0530, Anup Patel wrote:
>> >> >> >> >> Hi All,
>> >> >> >> >>
>> >> >> >> >> I have second thoughts about rebasing KVM PMU patches
>> >> >> >> >> to Marc's irq-forwarding patches.
>> >> >> >> >>
>> >> >> >> >> The PMU IRQs (when virtualized by KVM) are not exactly
>> >> >> >> >> forwarded IRQs because they are shared between Host
>> >> >> >> >> and Guest.
>> >> >> >> >>
>> >> >> >> >> Scenario1
>> >> >> >> >> -
>> >> >> >> >>
>> >> >> >> >> We might have perf running on Host and no KVM guest
>> >> >> >> >> running. In this scenario, we wont get interrupts on Host
>> >> >> >> >> because the kvm_pmu_hyp_init() (similar to the function
>> >> >> >> >> kvm_timer_hyp_init() of Marc's IRQ-forwarding
>> >> >> >> >> implementation) has put all host PMU IRQs in forwarding
>> >> >> >> >> mode.
>> >> >> >> >>
>> >> >> >> >> The only way solve this problem is to not set forwarding
>> >> >> >> >> mode for PMU IRQs in kvm_pmu_hyp_init() and instead
>> >> >> >> >> have special routines to turn on and turn off the forwarding
>> >> >> >> >> mode of PMU IRQs. These routines will be called from
>> >> >> >> >> kvm_arch_vcpu_ioctl_run() for toggling the PMU IRQ
>> >> >> >> >> forwarding state.
>> >> >> >> >>
>> >> >> >> >> Scenario2
>> >> >> >> >> -
>> >> >> >> >>
>> >> >> >> >> We might have perf running on Host and Guest simultaneously
>> >> >> >> >> which means it is quite likely that PMU HW trigger IRQ meant
>> >> >> >> >> for Host between "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);"
>> >> >> >> >> and "kvm_pmu_sync_hwstate(vcpu);" (similar to timer sync routine
>> >> >> >> >> of Marc's patchset which is called before local_irq_enable()).
>> >> >> >> >>
>> >> >> >> >> In this scenario, the updated kvm_pmu_sync_hwstate(vcpu)
>> >> >> >> >> will accidentally forward IRQ meant for Host to Guest unless
>> >> >> >> >> we put additional checks to inspect VCPU PMU state.
>> >> >> >> >>
>> >> >> >> >> Am I missing any detail about IRQ forwarding for above
>> >> >> >> >> scenarios?
>> >> >> >> >>
>> >> >> >> > Hi Anup,
>> >> >> >>
>> >> >> >> Hi Christoffer,
>> >> >> >>
>> >> >> >> >
>> >> >> >> > I briefly discussed this with Marc.  What I don't understand is 
>> >> >> >> > how it
>> >> >> >> > would be possible to get an interrupt for the host while running 
>> >> >> >> > the
>> >> >> >> > guest?
>> >> >> >> >
>> >> >> >> > The rationale behind my question is that whenever you're running 
>> >> >> >> > the
>> >> >> >> > guest, the PMU should be programmed exclusively with guest state, 
>> >> >> >> > and
>> >> >> >> > since the PMU is per core, any interrupts should be for the 
>> >> >> >> > guest, where
>> >> >> >> > it would always be pending.
>> >> >> >>
>> >> >> >> Yes, thats right PMU is programmed exclusively for guest when
>> >> >> >> guest is running and for host when host is running.
>> >> >> >>
>> >> >> >> Let us assume a situation (Scenario2 mentioned previously)
>> >> >> >> where both host and guest are using PMU. When the guest is
>> >> >> >> running we come back to host mode due to variety of reasons
>> >> >> >> (stage2 fault, guest IO, regular host interrupt, host interrupt
>> >> >> >> meant for guest, ) which means we will return from the
>> >> >> >> "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);" statement in the
>> >> >> >> kvm_arch_vcpu_ioctl_run() function with local IRQs disabled.
>> >> >> >> At this point we would have restored back host PMU context and
>> >> >> >> any PMU counter used by host can trigger PMU overflow interrup
>> >> >> >> for host. Now we will be having "kvm_pmu_sync_hwstate(vcpu);"
>> >> >> >> in the kvm_arch_vcpu_ioctl_run() function (similar to the
>> >> >> >> kvm_timer_sync_hwstate() of Marc's IRQ forwarding patchset)
>> >> >> >> which will try to detect PMU irq forwarding state in GIC hence it
>> >> >> >> can accidentally discover PMU irq pending for guest while this
>> >> >> >> PMU irq is actually meant for host.
>> >> >> >>
>> >> >> >> This above mentioned situation does not happen for timer
>> >> >> >> because virtual timer interrupts are exclusively used for guest.
>> >> >> >> The exclusive use of virtual timer interrupt for guest ensures that
>> >> >> >> the function kvm_timer_sync_hwstate() will always see correct
>> >> >> >> state of virtual timer IRQ

Re: [PATCH RFC v3 11/12] virtio-net/virtio-blk: enable virtio 1.0

2014-11-27 Thread Cornelia Huck

On Wed, 26 Nov 2014 20:50:05 +0200
"Michael S. Tsirkin"  wrote:

> On Wed, Nov 26, 2014 at 06:28:42PM +0100, Cornelia Huck wrote:
> > virtio-net (non-vhost) and virtio-blk have everything in place to support
> > virtio 1.0: let's enable the feature bit for them.
> 
> Hmm I doubt that.
> At least not without more patches.
> 
> For block, scsi, wce and config-wce must be off, and
> it must support ANY_LAYOUT (which might be a bit more code).
> 
> For net, header size is different when mergeable bufs are off,
> and mac is read-only.

Yeah, I'll need to walk through the spec again, this was mainly to test
my changes.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH] KVM: Introduce dynamically registered hypercall capability

2014-11-27 Thread Phil White

Disregard please.  Part of this patch was embarrassingly munged.  I'll
repost it presently.

-Phil
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 0/6] ARM64: KVM: PMU infrastructure support

2014-11-27 Thread Marc Zyngier

On 27/11/14 10:22, Anup Patel wrote:
> On Tue, Nov 25, 2014 at 7:12 PM, Christoffer Dall
>  wrote:
>> On Tue, Nov 25, 2014 at 06:17:03PM +0530, Anup Patel wrote:
>>> Hi Christoffer,
>>>
>>> On Mon, Nov 24, 2014 at 8:07 PM, Christoffer Dall
>>>  wrote:
 On Mon, Nov 24, 2014 at 02:14:48PM +0530, Anup Patel wrote:
> On Fri, Nov 21, 2014 at 5:19 PM, Christoffer Dall
>  wrote:
>> On Fri, Nov 21, 2014 at 04:06:05PM +0530, Anup Patel wrote:
>>> Hi Christoffer,
>>>
>>> On Fri, Nov 21, 2014 at 3:29 PM, Christoffer Dall
>>>  wrote:
 On Thu, Nov 20, 2014 at 08:17:32PM +0530, Anup Patel wrote:
> On Wed, Nov 19, 2014 at 8:59 PM, Christoffer Dall
>  wrote:
>> On Tue, Nov 11, 2014 at 02:48:25PM +0530, Anup Patel wrote:
>>> Hi All,
>>>
>>> I have second thoughts about rebasing KVM PMU patches
>>> to Marc's irq-forwarding patches.
>>>
>>> The PMU IRQs (when virtualized by KVM) are not exactly
>>> forwarded IRQs because they are shared between Host
>>> and Guest.
>>>
>>> Scenario1
>>> -
>>>
>>> We might have perf running on Host and no KVM guest
>>> running. In this scenario, we wont get interrupts on Host
>>> because the kvm_pmu_hyp_init() (similar to the function
>>> kvm_timer_hyp_init() of Marc's IRQ-forwarding
>>> implementation) has put all host PMU IRQs in forwarding
>>> mode.
>>>
>>> The only way solve this problem is to not set forwarding
>>> mode for PMU IRQs in kvm_pmu_hyp_init() and instead
>>> have special routines to turn on and turn off the forwarding
>>> mode of PMU IRQs. These routines will be called from
>>> kvm_arch_vcpu_ioctl_run() for toggling the PMU IRQ
>>> forwarding state.
>>>
>>> Scenario2
>>> -
>>>
>>> We might have perf running on Host and Guest simultaneously
>>> which means it is quite likely that PMU HW trigger IRQ meant
>>> for Host between "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);"
>>> and "kvm_pmu_sync_hwstate(vcpu);" (similar to timer sync routine
>>> of Marc's patchset which is called before local_irq_enable()).
>>>
>>> In this scenario, the updated kvm_pmu_sync_hwstate(vcpu)
>>> will accidentally forward IRQ meant for Host to Guest unless
>>> we put additional checks to inspect VCPU PMU state.
>>>
>>> Am I missing any detail about IRQ forwarding for above
>>> scenarios?
>>>
>> Hi Anup,
>
> Hi Christoffer,
>
>>
>> I briefly discussed this with Marc.  What I don't understand is how 
>> it
>> would be possible to get an interrupt for the host while running the
>> guest?
>>
>> The rationale behind my question is that whenever you're running the
>> guest, the PMU should be programmed exclusively with guest state, and
>> since the PMU is per core, any interrupts should be for the guest, 
>> where
>> it would always be pending.
>
> Yes, thats right PMU is programmed exclusively for guest when
> guest is running and for host when host is running.
>
> Let us assume a situation (Scenario2 mentioned previously)
> where both host and guest are using PMU. When the guest is
> running we come back to host mode due to variety of reasons
> (stage2 fault, guest IO, regular host interrupt, host interrupt
> meant for guest, ) which means we will return from the
> "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);" statement in the
> kvm_arch_vcpu_ioctl_run() function with local IRQs disabled.
> At this point we would have restored back host PMU context and
> any PMU counter used by host can trigger PMU overflow interrup
> for host. Now we will be having "kvm_pmu_sync_hwstate(vcpu);"
> in the kvm_arch_vcpu_ioctl_run() function (similar to the
> kvm_timer_sync_hwstate() of Marc's IRQ forwarding patchset)
> which will try to detect PMU irq forwarding state in GIC hence it
> can accidentally discover PMU irq pending for guest while this
> PMU irq is actually meant for host.
>
> This above mentioned situation does not happen for timer
> because virtual timer interrupts are exclusively used for guest.
> The exclusive use of virtual timer interrupt for guest ensures that
> the function kvm_timer_sync_hwstate() will always see correct
> state of virtual timer IRQ from GIC.
>
 I'm not quite following.

 When you call kvm_pmu_sync_hwstate(vcpu) in the non-preemtible section,
 you would (1) capture the active state of the IRQ pertaining to the

Re: [RFC PATCH 3/5] KVM: ARM VGIC add kvm_io_bus_ frontend

2014-11-27 Thread Eric Auger

On 11/24/2014 10:26 PM, Nikolay Nikolaev wrote:
> In io_mem_abort remove the call to vgic_handle_mmio. The target is to have
> a single MMIO handling path - that is through the kvm_io_bus_ API.
> 
> Register a kvm_io_device in kvm_vgic_init on the whole vGIC MMIO region.
> Both read and write calls are redirected to vgic_io_dev_access where
> kvm_exit_mmio is composed to pass it to vm_ops.handle_mmio.
> 
> 
> Signed-off-by: Nikolay Nikolaev 
> ---
>  arch/arm/kvm/mmio.c|3 --
>  include/kvm/arm_vgic.h |3 +-
>  virt/kvm/arm/vgic.c|   88 
> 
>  3 files changed, 74 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
> index 81230da..1c44a2b 100644
> --- a/arch/arm/kvm/mmio.c
> +++ b/arch/arm/kvm/mmio.c
> @@ -227,9 +227,6 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run 
> *run,
>   if (mmio.is_write)
>   mmio_write_buf(mmio.data, mmio.len, data);
>  
> - if (vgic_handle_mmio(vcpu, run, &mmio))
> - return 1;
> -
>   if (handle_kernel_mmio(vcpu, run, &mmio))
>   return 1;
>  
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index e452ef7..d9b7d2a 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -233,6 +233,7 @@ struct vgic_dist {
>   unsigned long   *irq_pending_on_cpu;
>  
>   struct vgic_vm_ops  vm_ops;
> + struct kvm_io_device*io_dev;
>  #endif
>  };
>  
> @@ -307,8 +308,6 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, 
> unsigned int irq_num,
>   bool level);
>  void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
>  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
> -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> -   struct kvm_exit_mmio *mmio);
>  
>  #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
>  #define vgic_initialized(k)  ((k)->arch.vgic.ready)
> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
> index 1213da5..3da1115 100644
> --- a/virt/kvm/arm/vgic.c
> +++ b/virt/kvm/arm/vgic.c
> @@ -31,6 +31,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +
> +#include "iodev.h"
>  
>  /*
>   * How the whole thing works (courtesy of Christoffer Dall):
> @@ -775,28 +778,81 @@ bool vgic_handle_mmio_range(struct kvm_vcpu *vcpu, 
> struct kvm_run *run,
>   return true;
>  }
>  
> -/**
> - * vgic_handle_mmio - handle an in-kernel MMIO access for the GIC emulation
> - * @vcpu:  pointer to the vcpu performing the access
> - * @run:   pointer to the kvm_run structure
> - * @mmio:  pointer to the data describing the access
> - *
> - * returns true if the MMIO access has been performed in kernel space,
> - * and false if it needs to be emulated in user space.
> - * Calls the actual handling routine for the selected VGIC model.
> - */
> -bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
> -   struct kvm_exit_mmio *mmio)
It may be worth restoring some comments.
> +static int vgic_io_dev_access(struct kvm_vcpu *vcpu, struct kvm_io_device 
> *this,
> + gpa_t addr, int len, void *val, bool is_write)
>  {
> - if (!irqchip_in_kernel(vcpu->kvm))
> - return false;
> + struct kvm_exit_mmio mmio;
> + bool ret;
> +
> + mmio = (struct kvm_exit_mmio) {
> + .phys_addr = addr,
> + .len = len,
> + .is_write = is_write,
> + };
> +
> + if (is_write)
> + memcpy(mmio.data, val, len);
>  
>   /*
>* This will currently call either vgic_v2_handle_mmio() or
>* vgic_v3_handle_mmio(), which in turn will call
>* vgic_handle_mmio_range() defined above.
>*/
> - return vcpu->kvm->arch.vgic.vm_ops.handle_mmio(vcpu, run, mmio);
> + ret = vcpu->kvm->arch.vgic.vm_ops.handle_mmio(vcpu, vcpu->run, &mmio);
> +
> + if (!is_write)
> + memcpy(val, mmio.data, len);
> +
> + return ret ? 0 : 1;
> +}
> +
> +static int vgic_io_dev_read(struct kvm_vcpu *vcpu, struct kvm_io_device 
> *this,
> +   gpa_t addr, int len, void *val)
> +{
> + return vgic_io_dev_access(vcpu, this, addr, len, val, false);
> +}
> +
> +static int vgic_io_dev_write(struct kvm_vcpu *vcpu, struct kvm_io_device 
> *this,
> +gpa_t addr, int len, const void *val)
> +{
> + return vgic_io_dev_access(vcpu, this, addr, len, (void *)val, true);
> +}
> +
> +static const struct kvm_io_device_ops vgic_io_dev_ops = {
> + .read   = vgic_io_dev_read,
> + .write  = vgic_io_dev_write,
> +};
> +
> +static int vgic_register_kvm_io_dev(struct kvm *kvm)
> +{
> + struct kvm_io_device *dev;
> + int ret;
> +
> + struct vgic_dist *dist = &kvm->arch.vgic;
> + unsigned long base = dist->vgic_dist_base;
Nikolay,

are you sure dist->vgic_dist_base was set by the guest?
in vgic_init/vm_ops.vgic_init (vg

Re: [RFC PATCH 0/6] ARM64: KVM: PMU infrastructure support

2014-11-27 Thread Anup Patel

On Thu, Nov 27, 2014 at 4:10 PM, Marc Zyngier  wrote:
> On 27/11/14 10:22, Anup Patel wrote:
>> On Tue, Nov 25, 2014 at 7:12 PM, Christoffer Dall
>>  wrote:
>>> On Tue, Nov 25, 2014 at 06:17:03PM +0530, Anup Patel wrote:
 Hi Christoffer,

 On Mon, Nov 24, 2014 at 8:07 PM, Christoffer Dall
  wrote:
> On Mon, Nov 24, 2014 at 02:14:48PM +0530, Anup Patel wrote:
>> On Fri, Nov 21, 2014 at 5:19 PM, Christoffer Dall
>>  wrote:
>>> On Fri, Nov 21, 2014 at 04:06:05PM +0530, Anup Patel wrote:
 Hi Christoffer,

 On Fri, Nov 21, 2014 at 3:29 PM, Christoffer Dall
  wrote:
> On Thu, Nov 20, 2014 at 08:17:32PM +0530, Anup Patel wrote:
>> On Wed, Nov 19, 2014 at 8:59 PM, Christoffer Dall
>>  wrote:
>>> On Tue, Nov 11, 2014 at 02:48:25PM +0530, Anup Patel wrote:
 Hi All,

 I have second thoughts about rebasing KVM PMU patches
 to Marc's irq-forwarding patches.

 The PMU IRQs (when virtualized by KVM) are not exactly
 forwarded IRQs because they are shared between Host
 and Guest.

 Scenario1
 -

 We might have perf running on Host and no KVM guest
 running. In this scenario, we wont get interrupts on Host
 because the kvm_pmu_hyp_init() (similar to the function
 kvm_timer_hyp_init() of Marc's IRQ-forwarding
 implementation) has put all host PMU IRQs in forwarding
 mode.

 The only way solve this problem is to not set forwarding
 mode for PMU IRQs in kvm_pmu_hyp_init() and instead
 have special routines to turn on and turn off the forwarding
 mode of PMU IRQs. These routines will be called from
 kvm_arch_vcpu_ioctl_run() for toggling the PMU IRQ
 forwarding state.

 Scenario2
 -

 We might have perf running on Host and Guest simultaneously
 which means it is quite likely that PMU HW trigger IRQ meant
 for Host between "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);"
 and "kvm_pmu_sync_hwstate(vcpu);" (similar to timer sync routine
 of Marc's patchset which is called before local_irq_enable()).

 In this scenario, the updated kvm_pmu_sync_hwstate(vcpu)
 will accidentally forward IRQ meant for Host to Guest unless
 we put additional checks to inspect VCPU PMU state.

 Am I missing any detail about IRQ forwarding for above
 scenarios?

>>> Hi Anup,
>>
>> Hi Christoffer,
>>
>>>
>>> I briefly discussed this with Marc.  What I don't understand is how 
>>> it
>>> would be possible to get an interrupt for the host while running the
>>> guest?
>>>
>>> The rationale behind my question is that whenever you're running the
>>> guest, the PMU should be programmed exclusively with guest state, 
>>> and
>>> since the PMU is per core, any interrupts should be for the guest, 
>>> where
>>> it would always be pending.
>>
>> Yes, thats right PMU is programmed exclusively for guest when
>> guest is running and for host when host is running.
>>
>> Let us assume a situation (Scenario2 mentioned previously)
>> where both host and guest are using PMU. When the guest is
>> running we come back to host mode due to variety of reasons
>> (stage2 fault, guest IO, regular host interrupt, host interrupt
>> meant for guest, ) which means we will return from the
>> "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);" statement in the
>> kvm_arch_vcpu_ioctl_run() function with local IRQs disabled.
>> At this point we would have restored back host PMU context and
>> any PMU counter used by host can trigger PMU overflow interrup
>> for host. Now we will be having "kvm_pmu_sync_hwstate(vcpu);"
>> in the kvm_arch_vcpu_ioctl_run() function (similar to the
>> kvm_timer_sync_hwstate() of Marc's IRQ forwarding patchset)
>> which will try to detect PMU irq forwarding state in GIC hence it
>> can accidentally discover PMU irq pending for guest while this
>> PMU irq is actually meant for host.
>>
>> This above mentioned situation does not happen for timer
>> because virtual timer interrupts are exclusively used for guest.
>> The exclusive use of virtual timer interrupt for guest ensures that
>> the function kvm_timer_sync_hwstate() will always see correct
>> state of virtual timer IRQ from GIC.
>>
> I'm not quite following.

Re: [RFC PATCH 0/5] ARM: KVM: Enable the ioeventfd capability of KVM on ARM

2014-11-27 Thread Eric Auger

On 11/24/2014 10:26 PM, Nikolay Nikolaev wrote:
> The IOEVENTFD KVM capability is a prerequisite for vhost support,
> and is also used to implement improved interrupt handling in VFIO drivers.
Hi Nikolay

As far as I am aware, irqfd currently is used in vfio context, not
ioeventfd, although they both are implemented in eventfd.c. Could you
elaborate?
> 
> This series enables the ioeventfd KVM capability on ARM.
> 
> The implementation routes MMIO access in the IO abort handler to the KVM IO 
> bus.
> If there is already a registered ioeventfd handler for this address, the file
> descriptor will be triggered.
> 
> We extended the KVM IO bus API to expose the VCPU struct pointer. Now the VGIC
> MMIO access is done through this API. For this to operate the VGIC registers a
> kvm_io_device which reprresents the whole MMIO region.
typo & the whole dist MMIO region?

I think the transformation makes sense and does not bring upheavals in
the vgic code which is good.

Best Regards

Eric
> 
> The patches are implemented on top of the latest Andre's vGICv3 work from 
> here:
> http://www.linux-arm.org/git?p=linux-ap.git;a=shortlog;h=refs/heads/kvm-gicv3/v4
> 
> The code was tested on Dual Cortex-A15 Exynos5250 (ARM Chromebook).
> 
> ---
> 
> Nikolay Nikolaev (5):
>   KVM: redesing kvm_io_bus_ API to pass VCPU structure to the callbacks.
>   ARM: on IO mem abort - route the call to KVM MMIO bus
>   KVM: ARM VGIC add kvm_io_bus_ frontend
>   ARM: enable linking against eventfd and irqchip
>   ARM: enable KVM_CAP_IOEVENTFD
> 
> 
>  arch/arm/kvm/Kconfig   |1 +
>  arch/arm/kvm/Makefile  |2 +
>  arch/arm/kvm/arm.c |3 ++
>  arch/arm/kvm/mmio.c|   32 
>  arch/ia64/kvm/kvm-ia64.c   |4 +-
>  arch/powerpc/kvm/powerpc.c |4 +-
>  arch/s390/kvm/diag.c   |2 +
>  arch/x86/kvm/vmx.c |2 +
>  arch/x86/kvm/x86.c |   11 +++---
>  include/kvm/arm_vgic.h |3 +-
>  include/linux/kvm_host.h   |   10 +++--
>  virt/kvm/arm/vgic.c|   88 
> +---
>  virt/kvm/coalesced_mmio.c  |5 ++-
>  virt/kvm/eventfd.c |4 +-
>  virt/kvm/iodev.h   |   23 
>  virt/kvm/kvm_main.c|   32 
>  16 files changed, 163 insertions(+), 63 deletions(-)
> 
> --
> Signature
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 0/6] ARM64: KVM: PMU infrastructure support

2014-11-27 Thread Marc Zyngier

On 27/11/14 10:54, Anup Patel wrote:
> On Thu, Nov 27, 2014 at 4:10 PM, Marc Zyngier  wrote:
>> On 27/11/14 10:22, Anup Patel wrote:
>>> On Tue, Nov 25, 2014 at 7:12 PM, Christoffer Dall
>>>  wrote:
 On Tue, Nov 25, 2014 at 06:17:03PM +0530, Anup Patel wrote:
> Hi Christoffer,
>
> On Mon, Nov 24, 2014 at 8:07 PM, Christoffer Dall
>  wrote:
>> On Mon, Nov 24, 2014 at 02:14:48PM +0530, Anup Patel wrote:
>>> On Fri, Nov 21, 2014 at 5:19 PM, Christoffer Dall
>>>  wrote:
 On Fri, Nov 21, 2014 at 04:06:05PM +0530, Anup Patel wrote:
> Hi Christoffer,
>
> On Fri, Nov 21, 2014 at 3:29 PM, Christoffer Dall
>  wrote:
>> On Thu, Nov 20, 2014 at 08:17:32PM +0530, Anup Patel wrote:
>>> On Wed, Nov 19, 2014 at 8:59 PM, Christoffer Dall
>>>  wrote:
 On Tue, Nov 11, 2014 at 02:48:25PM +0530, Anup Patel wrote:
> Hi All,
>
> I have second thoughts about rebasing KVM PMU patches
> to Marc's irq-forwarding patches.
>
> The PMU IRQs (when virtualized by KVM) are not exactly
> forwarded IRQs because they are shared between Host
> and Guest.
>
> Scenario1
> -
>
> We might have perf running on Host and no KVM guest
> running. In this scenario, we wont get interrupts on Host
> because the kvm_pmu_hyp_init() (similar to the function
> kvm_timer_hyp_init() of Marc's IRQ-forwarding
> implementation) has put all host PMU IRQs in forwarding
> mode.
>
> The only way solve this problem is to not set forwarding
> mode for PMU IRQs in kvm_pmu_hyp_init() and instead
> have special routines to turn on and turn off the forwarding
> mode of PMU IRQs. These routines will be called from
> kvm_arch_vcpu_ioctl_run() for toggling the PMU IRQ
> forwarding state.
>
> Scenario2
> -
>
> We might have perf running on Host and Guest simultaneously
> which means it is quite likely that PMU HW trigger IRQ meant
> for Host between "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);"
> and "kvm_pmu_sync_hwstate(vcpu);" (similar to timer sync routine
> of Marc's patchset which is called before local_irq_enable()).
>
> In this scenario, the updated kvm_pmu_sync_hwstate(vcpu)
> will accidentally forward IRQ meant for Host to Guest unless
> we put additional checks to inspect VCPU PMU state.
>
> Am I missing any detail about IRQ forwarding for above
> scenarios?
>
 Hi Anup,
>>>
>>> Hi Christoffer,
>>>

 I briefly discussed this with Marc.  What I don't understand is 
 how it
 would be possible to get an interrupt for the host while running 
 the
 guest?

 The rationale behind my question is that whenever you're running 
 the
 guest, the PMU should be programmed exclusively with guest state, 
 and
 since the PMU is per core, any interrupts should be for the guest, 
 where
 it would always be pending.
>>>
>>> Yes, thats right PMU is programmed exclusively for guest when
>>> guest is running and for host when host is running.
>>>
>>> Let us assume a situation (Scenario2 mentioned previously)
>>> where both host and guest are using PMU. When the guest is
>>> running we come back to host mode due to variety of reasons
>>> (stage2 fault, guest IO, regular host interrupt, host interrupt
>>> meant for guest, ) which means we will return from the
>>> "ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);" statement in the
>>> kvm_arch_vcpu_ioctl_run() function with local IRQs disabled.
>>> At this point we would have restored back host PMU context and
>>> any PMU counter used by host can trigger PMU overflow interrupt
>>> for host. Now we will be having "kvm_pmu_sync_hwstate(vcpu);"
>>> in the kvm_arch_vcpu_ioctl_run() function (similar to the
>>> kvm_timer_sync_hwstate() of Marc's IRQ forwarding patchset)
>>> which will try to detect PMU irq forwarding state in GIC hence it
>>> can accidentally discover PMU irq pending for guest while this
>>> PMU irq is actually meant for host.
>>>
>>> This above mentioned situation does not happen for timer
>>> because virtual timer interrupts are exclusively used for guest.
>>> The exclusive use of virtual timer interrupt for guest ensures that
>>>

Allocating dedicated RAM to host that guest can not use

2014-11-27 Thread mad Engineer

Hi,
Is there any way to set some RAM dedicated to host that guest can
not access?
Similar to setting RAM to Dom0 in Xen.

I am over committing RAM for the instances but don't want host to swap.

I understand that virtual machines are processes, but can we achieve this?

Thanks
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Allocating dedicated RAM to host that guest can not use

2014-11-27 Thread Wanpeng Li

On Thu, Nov 27, 2014 at 05:12:52PM +0530, mad Engineer wrote:
>Hi,
>Is there any way to set some RAM dedicated to host that guest can
>not access?
>Similar to setting RAM to Dom0 in Xen.
>
>I am over committing RAM for the instances but don't want host to swap.
>
>i understand that virtual machines are process,but can we achieve this

How about limiting the memory that the guest can access through a memory cgroup?

Regards,
Wanpeng Li 

>
>Thanks
>--
>To unsubscribe from this list: send the line "unsubscribe kvm" in
>the body of a message to majord...@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Allocating dedicated RAM to host that guest can not use

2014-11-27 Thread mad Engineer

Never tried that.
Can we do that transparently, i.e. without setting cgroups for each
virtual machine?
A global group such that the combined RAM utilization of all virtual
machines stays within a specific value?

On Thu, Nov 27, 2014 at 4:59 PM, Wanpeng Li  wrote:
> On Thu, Nov 27, 2014 at 05:12:52PM +0530, mad Engineer wrote:
>>Hi,
>>Is there any way to set some RAM dedicated to host that guest can
>>not access?
>>Similar to setting RAM to Dom0 in Xen.
>>
>>I am over committing RAM for the instances but don't want host to swap.
>>
>>i understand that virtual machines are process,but can we achieve this
>
> How about limit the memory of which guest can access through memory cgroup?
>
> Regards,
> Wanpeng Li
>
>>
>>Thanks
>>--
>>To unsubscribe from this list: send the line "unsubscribe kvm" in
>>the body of a message to majord...@vger.kernel.org
>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Allocating dedicated RAM to host that guest can not use

2014-11-27 Thread mad Engineer

A random thought: can we set the qemu user/group RSS to a particular hard
limit in limits.conf?

Can this work?

On Thu, Nov 27, 2014 at 5:39 PM, mad Engineer  wrote:
> never tried that.
> can we do that transparently ie with out setting cgroups for each
> virtul machines?.
> A global group such that all combined virtual machines RAM utilization
> to be with in a specific value ?
>
> On Thu, Nov 27, 2014 at 4:59 PM, Wanpeng Li  
> wrote:
>> On Thu, Nov 27, 2014 at 05:12:52PM +0530, mad Engineer wrote:
>>>Hi,
>>>Is there any way to set some RAM dedicated to host that guest can
>>>not access?
>>>Similar to setting RAM to Dom0 in Xen.
>>>
>>>I am over committing RAM for the instances but don't want host to swap.
>>>
>>>i understand that virtual machines are process,but can we achieve this
>>
>> How about limit the memory of which guest can access through memory cgroup?
>>
>> Regards,
>> Wanpeng Li
>>
>>>
>>>Thanks
>>>--
>>>To unsubscribe from this list: send the line "unsubscribe kvm" in
>>>the body of a message to majord...@vger.kernel.org
>>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 35/45] vhost/net: suppress compiler warning

2014-11-27 Thread Michael S. Tsirkin

len is always initialized since function is called with size > 0.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 984242e..54ffbb0 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -501,7 +501,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
int headcount = 0;
unsigned d;
int r, nlogs = 0;
-   u32 len;
+   u32 uninitialized_var(len);
 
while (datalen > 0 && headcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 44/45] vhost/scsi: partial virtio 1.0 support

2014-11-27 Thread Michael S. Tsirkin

Include all endian conversions as required by virtio 1.0.
Don't set virtio 1.0 yet, since that requires ANY_LAYOUT
which we don't yet support.

Signed-off-by: Michael S. Tsirkin 
Acked-by: Paolo Bonzini 
---
 drivers/vhost/scsi.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a17f118..01c01cb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -168,6 +168,7 @@ enum {
VHOST_SCSI_VQ_IO = 2,
 };
 
+/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */
 enum {
VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
   (1ULL << VIRTIO_SCSI_F_T10_PI)
@@ -577,8 +578,8 @@ tcm_vhost_allocate_evt(struct vhost_scsi *vs,
return NULL;
}
 
-   evt->event.event = event;
-   evt->event.reason = reason;
+   evt->event.event = cpu_to_vhost32(vq, event);
+   evt->event.reason = cpu_to_vhost32(vq, reason);
vs->vs_events_nr++;
 
return evt;
@@ -636,7 +637,7 @@ again:
}
 
if (vs->vs_events_missed) {
-   event->event |= VIRTIO_SCSI_T_EVENTS_MISSED;
+   event->event |= cpu_to_vhost32(vq, VIRTIO_SCSI_T_EVENTS_MISSED);
vs->vs_events_missed = false;
}
 
@@ -695,12 +696,13 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
cmd, se_cmd->residual_count, se_cmd->scsi_status);
 
memset(&v_rsp, 0, sizeof(v_rsp));
-   v_rsp.resid = se_cmd->residual_count;
+   v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, 
se_cmd->residual_count);
/* TODO is status_qualifier field needed? */
v_rsp.status = se_cmd->scsi_status;
-   v_rsp.sense_len = se_cmd->scsi_sense_length;
+   v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+se_cmd->scsi_sense_length);
memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-  v_rsp.sense_len);
+  se_cmd->scsi_sense_length);
ret = copy_to_user(cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
if (likely(ret == 0)) {
struct vhost_scsi_virtqueue *q;
@@ -1095,14 +1097,14 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesout;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesout);
} else if (v_req_pi.pi_bytesin) {
if (data_direction != DMA_FROM_DEVICE) {
vq_err(vq, "Received non zero 
di_pi_niov"
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesin;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesin);
}
if (prot_bytes) {
int tmp = 0;
@@ -1117,12 +1119,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
data_first += prot_niov;
data_niov = data_num - prot_niov;
}
-   tag = v_req_pi.tag;
+   tag = vhost64_to_cpu(vq, v_req_pi.tag);
task_attr = v_req_pi.task_attr;
cdb = &v_req_pi.cdb[0];
lun = ((v_req_pi.lun[2] << 8) | v_req_pi.lun[3]) & 
0x3FFF;
} else {
-   tag = v_req.tag;
+   tag = vhost64_to_cpu(vq, v_req.tag);
task_attr = v_req.task_attr;
cdb = &v_req.cdb[0];
lun = ((v_req.lun[2] << 8) | v_req.lun[3]) & 0x3FFF;
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 32/45] vhost/net: larger header for virtio 1.0

2014-11-27 Thread Michael S. Tsirkin

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index cae22f9..1ac58d0 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1027,7 +1027,8 @@ static int vhost_net_set_features(struct vhost_net *n, 
u64 features)
size_t vhost_hlen, sock_hlen, hdr_len;
int i;
 
-   hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+   hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+  (1ULL << VIRTIO_F_VERSION_1))) ?
sizeof(struct virtio_net_hdr_mrg_rxbuf) :
sizeof(struct virtio_net_hdr);
if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 28/45] vhost: add memory access wrappers

2014-11-27 Thread Michael S. Tsirkin

Add guest memory access wrappers to handle virtio endianness
conversions.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.h | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index c624b09..1f321fd 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -176,4 +176,35 @@ static inline int vhost_has_feature(struct vhost_virtqueue 
*vq, int bit)
 {
return vq->acked_features & (1ULL << bit);
 }
+
+/* Memory accessors */
+static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
+{
+   return __virtio16_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val)
+{
+   return __cpu_to_virtio16(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val)
+{
+   return __virtio32_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val)
+{
+   return __cpu_to_virtio32(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val)
+{
+   return __virtio64_to_cpu(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
+
+static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val)
+{
+   return __cpu_to_virtio64(vhost_has_feature(vq, VIRTIO_F_VERSION_1), 
val);
+}
 #endif
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 30/45] vhost: virtio 1.0 endian-ness support

2014-11-27 Thread Michael S. Tsirkin

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.c | 93 +++
 1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c90f437..4d379ed 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -33,8 +33,8 @@ enum {
VHOST_MEMORY_F_LOG = 0x1,
 };
 
-#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
-#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
+#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
void __user *used;
-   if (__put_user(vq->used_flags, &vq->used->flags) < 0)
+   if (__put_user(cpu_to_vhost16(vq, vq->used_flags), &vq->used->flags) < 
0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1019,7 +1019,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
-   if (__put_user(vq->avail_idx, vhost_avail_event(vq)))
+   if (__put_user(cpu_to_vhost16(vq, vq->avail_idx), 
vhost_avail_event(vq)))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -1038,6 +1038,7 @@ static int vhost_update_avail_event(struct 
vhost_virtqueue *vq, u16 avail_event)
 
 int vhost_init_used(struct vhost_virtqueue *vq)
 {
+   __virtio16 last_used_idx;
int r;
if (!vq->private_data)
return 0;
@@ -1046,7 +1047,13 @@ int vhost_init_used(struct vhost_virtqueue *vq)
if (r)
return r;
vq->signalled_used_valid = false;
-   return get_user(vq->last_used_idx, &vq->used->idx);
+   if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx))
+   return -EFAULT;
+   r = __get_user(last_used_idx, &vq->used->idx);
+   if (r)
+   return r;
+   vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
@@ -1087,16 +1094,16 @@ static int translate_desc(struct vhost_virtqueue *vq, 
u64 addr, u32 len,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain,
  * or -1U if we're at the end. */
-static unsigned next_desc(struct vring_desc *desc)
+static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
 {
unsigned int next;
 
/* If this descriptor says it doesn't chain, we're done. */
-   if (!(desc->flags & VRING_DESC_F_NEXT))
+   if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
return -1U;
 
/* Check they're not leading us off end of descriptors. */
-   next = desc->next;
+   next = vhost16_to_cpu(vq, desc->next);
/* Make sure compiler knows to grab that: we don't want it changing! */
/* We will use the result as an index in an array, so most
 * architectures only need a compiler barrier here. */
@@ -1113,18 +1120,19 @@ static int get_indirect(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i = 0, count, found = 0;
+   u32 len = vhost32_to_cpu(vq, indirect->len);
int ret;
 
/* Sanity check */
-   if (unlikely(indirect->len % sizeof desc)) {
+   if (unlikely(len % sizeof desc)) {
vq_err(vq, "Invalid length in indirect descriptor: "
   "len 0x%llx not multiple of 0x%zx\n",
-  (unsigned long long)indirect->len,
+  (unsigned long long)vhost32_to_cpu(vq, indirect->len),
   sizeof desc);
return -EINVAL;
}
 
-   ret = translate_desc(vq, indirect->addr, indirect->len, vq->indirect,
+   ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, 
vq->indirect,
 UIO_MAXIOV);
if (unlikely(ret < 0)) {
vq_err(vq, "Translation failure %d in indirect.\n", ret);
@@ -1135,7 +1143,7 @@ static int get_indirect(struct vhost_virtqueue *vq,
 * architectures only need a compiler barrier here. */
read_barrier_depends();
 
-   count = indirect->len / sizeof desc;
+   count = len / sizeof desc;
/* Buffers are chained via a 16 bit next field, so
 * we can have at most 2^16 of these. */
if (unlikely(count > USHRT_MAX + 1)) {
@@ -1155,16 +1163,17 @@ static int get_indirect(struct vhost_virtqueue *vq,
if (unlikely(memcpy_fromiovec((unsig

[PATCH v5 34/45] vhost/net: enable virtio 1.0

2014-11-27 Thread Michael S. Tsirkin

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1ac58d0..984242e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -61,7 +61,8 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
-(1ULL << VIRTIO_NET_F_MRG_RXBUF),
+(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+(1ULL << VIRTIO_F_VERSION_1),
 };
 
 enum {
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 31/45] vhost/net: virtio 1.0 byte swap

2014-11-27 Thread Michael S. Tsirkin

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index dce5c58..cae22f9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -416,7 +416,7 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id = head;
+   vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
@@ -500,6 +500,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
int headcount = 0;
unsigned d;
int r, nlogs = 0;
+   u32 len;
 
while (datalen > 0 && headcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
@@ -527,13 +528,14 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
-   heads[headcount].id = d;
-   heads[headcount].len = iov_length(vq->iov + seg, in);
-   datalen -= heads[headcount].len;
+   heads[headcount].id = cpu_to_vhost32(vq, d);
+   len = iov_length(vq->iov + seg, in);
+   heads[headcount].len = cpu_to_vhost32(vq, len);
+   datalen -= len;
++headcount;
seg += in;
}
-   heads[headcount - 1].len += datalen;
+   heads[headcount - 1].len = cpu_to_vhost32(vq, len - datalen);
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 29/45] vhost/net: force len for TX to host endian

2014-11-27 Thread Michael S. Tsirkin

vhost/net keeps a copy of some used ring but (ab)uses length
field for internal house-keeping. This works because
for tx used length is always 0.
Suppress sparse errors: we use native endian-ness internally but never
expose it to guest.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/net.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f7..dce5c58 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -48,15 +48,15 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy 
TX;"
  * status internally; used for zerocopy tx only.
  */
 /* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN   3
+#define VHOST_DMA_FAILED_LEN   ((__force __virtio32)3)
 /* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN 2
+#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
 /* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS  1
+#define VHOST_DMA_IN_PROGRESS  ((__force __virtio32)1)
 /* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN0
+#define VHOST_DMA_CLEAR_LEN((__force __virtio32)0)
 
-#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
+#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force 
u32)VHOST_DMA_DONE_LEN)
 
 enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v5 27/45] vhost: make features 64 bit

2014-11-27 Thread Michael S. Tsirkin

We need to use bit 32 for virtio 1.0

Signed-off-by: Michael S. Tsirkin 
---
 drivers/vhost/vhost.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3eda654..c624b09 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
/* Protected by virtqueue mutex. */
struct vhost_memory *memory;
void *private_data;
-   unsigned acked_features;
+   u64 acked_features;
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
@@ -174,6 +174,6 @@ enum {
 
 static inline int vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 {
-   return vq->acked_features & (1 << bit);
+   return vq->acked_features & (1ULL << bit);
 }
 #endif
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: Introduce dynamically registered hypercall capability

2014-11-27 Thread Phil White

This introduces a list of entries which associate a function pointer of
kvm_hc_type to a hypercall number and allows the ability to register and
unregister entries.  In addition, it also allows the ability to retrieve a
function pointer of kvm_hc_type for a given hypercall number which is meant
to be called from the arch-specific section.

The main intent is to allow modules to register hypercalls which they own
rather than requiring the addition of a stub of some sort.  It will also
allow each arch to maintain separate lists of hypercalls rather than having
to respect changes in include/uapi/linux/kvm_para.h

Signed-off-by: Phil White 
---
 arch/arm/kvm/Makefile   |   2 +-
 arch/arm64/kvm/Makefile |   1 +
 arch/ia64/kvm/Makefile  |   2 +-
 arch/mips/include/asm/kvm_para.h    |   6 ++
 arch/mips/kvm/Makefile  |   3 +-
 arch/powerpc/include/asm/kvm_para.h |   7 ++
 arch/powerpc/kvm/Makefile   |   2 +-
 arch/powerpc/kvm/powerpc.c  |   5 ++
 arch/s390/include/asm/kvm_para.h|   7 +-
 arch/s390/kvm/Makefile  |   2 +-
 arch/x86/include/asm/kvm_para.h |   6 ++
 arch/x86/kvm/Makefile   |   3 +-
 arch/x86/kvm/x86.c  |   6 ++
 include/uapi/linux/kvm.h|   1 +
 include/uapi/linux/kvm_para.h   |  19 +-
 virt/kvm/hypercall.c| 125 
 16 files changed, 187 insertions(+), 10 deletions(-)
 create mode 100644 virt/kvm/hypercall.c

diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index f7057ed..0f9adf9 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -15,7 +15,7 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)

 KVM := ../../../virt/kvm
-kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/hypercall.o

 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 32a0961..735ea53 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -12,6 +12,7 @@ ARM=../../../arch/arm/kvm
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o

 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/hypercall.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o

diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 18e45ec..abbde9a 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -49,7 +49,7 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 KVM := ../../../virt/kvm

-common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o $(KVM)/hypercall.o \
$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o

 ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h
index 5a9aa91..85e44d02 100644
--- a/arch/mips/include/asm/kvm_para.h
+++ b/arch/mips/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
 #define _ASM_MIPS_KVM_PARA_H

 #include 
+#include 

 #define KVM_HYPERCALL ".word 0x4228"

@@ -105,5 +106,10 @@ static inline bool kvm_para_available(void)
 }
 #endif

+struct kvm_vcpu;
+
+typedef unsigned long (*kvm_hc_type)(struct kvm_vcpu *vcpu, unsigned long a0,
+   unsigned long a1, unsigned long a2, unsigned long a3);
+

 #endif /* _ASM_MIPS_KVM_PARA_H */
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 401fe02..99afce8 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -1,7 +1,8 @@
 # Makefile for KVM support for MIPS
 #

-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
+   hypercall.o)

 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm

diff --git a/arch/powerpc/include/asm/kvm_para.h
b/arch/powerpc/include/asm/kvm_para.h
index 336a91a..2818a15 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
 #define __POWERPC_KVM_PARA_H__

 #include 
+#include 

 #ifdef CONFIG_KVM_GUEST

@@ -66,4 +67,10 @@ static inline bool kvm_check_and_clear_guest_paused(void)
return false;
 }

+struct kvm_vcpu;
+
+typedef unsigned long (*kvm_hc_type)(struct kvm_vcpu *vcpu,
+   unsigned long param1, unsigned long param2,
+   unsigned long param3, unsigned long param4);
+
 #endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 0570eef..9b5f239 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm

 common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mm

Re: [PATCH 14/21] KVM: x86: Software disabled APIC should still deliver NMIs

2014-11-27 Thread Radim Krčmář

2014-11-26 19:01+0200, Nadav Amit:
> Sorry for the late and long reply, but I got an issue with the new version
> (and my previous version as well). Indeed, the SDM states that DFR should
> be the same for enabled CPUs, and that the BIOS should get all CPUs in
> either xAPIC or x2APIC. Yet, there is nothing that says all CPUs need to be
> in xAPIC/x2APIC mode.
> 
> In my tests (which pass on bare-metal), I got a scenario in which some CPUs
> are in xAPIC mode, the BSP changed (which is currently not handled correctly
> by KVM) and the BSP has x2APIC enabled.

How many (V)CPUs were you using?
(We fail hard with logical destination x2APIC and 16+ VCPUs.)

> All the core APICs are
> software-enabled. The expected behaviour is that the CPUs with x2APIC
> enabled would work in x2APIC mode.

(Nice, I bet that made some Intel designers happy.)

There shouldn't be any message conflict when using APIC IDs <255, so it
might be possible if the x2APIC isn't programmed to issue weird
messages, like physical to nonexistent APIC ID 0xff00, which would
be also interpreted as xAPIC broadcast.

> I think such a transitory scenario is possible on real-systems as well,
> perhaps during CPU hot-plug. It appears the previous version (before all of
> our changes) handled it better. I presume the most efficient way is to start
> determining the APIC logical mode from the BSP, and if it is disabled,
> traverse the rest of the CPUs until finding the first one with APIC enabled.
> Yet, I have not finished doing and checking the BSP fix and other dependent
> INIT signal handling fixes.
> 
> In the meanwhile, would you be ok with restoring some of the previous
> behaviour - i.e., x2APIC is enabled if any CPU turned it on (regardless of
> whether APIC is software enabled), otherwise use the configuration of the
> last enabled APIC?

I don't think this patch improves anything.
(Both behaviors are wrong and I think the current one is a bit less so.)

Our x2APIC implementation is a hack that allowed faster IPI thanks to 1
MSR exit instead of 2 MMIO ones.  No OS, that doesn't know KVM's
limitations, should have enabled it because we didn't emulate interrupt
remapping, which is an architectural requirement for x2APIC ...

And for more concrete points:
- Physical x2APIC isn't affected (only broadcast, which is incorrect
  either way)

- Logical x2APIC and xAPIC don't work at the same time
  - Btw. logical x2APIC isn't supposed to work (see KVM_X2APIC_CID_BITS)
  - Logical xAPIC is shifted incorrectly in x2APIC mode, so they are all
going to be inaccessible (ldr = 0)
  - Our map isn't designed to allow x2APIC and xAPIC at the same time

- Your patch does not cover the case where sw-disabled x2APIC is
  "before" sw-enabled xAPIC, only if it is after.

> -- >8 —
> Subject: [PATCH] KVM: x86: Traverse all CPUs during recalculate_apic_map
> 
> ---
>  arch/x86/kvm/lapic.c | 20 +++-
>  1 file changed, 11 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9c90d31..6dc2be6 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -139,6 +139,7 @@ static void recalculate_apic_map(struct kvm *kvm)
>   struct kvm_apic_map *new, *old = NULL;
>   struct kvm_vcpu *vcpu;
>   int i;
> + bool any_enabled = false;
>  
>   new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
>  
> @@ -160,13 +161,21 @@ static void recalculate_apic_map(struct kvm *kvm)
>   if (!kvm_apic_present(vcpu))
>   continue;
>  
> + /*
> +  * All APICs DFRs have to be configured the same mode by an OS.
> +  * We take advatage of this while building logical id lookup
> +  * table. After reset APICs are in software disabled mode, so if
> +  * we find apic with different setting we assume this is the 
> mode
> +  * OS wants all apics to be in; build lookup table accordingly.
> +  */
>   if (apic_x2apic_mode(apic)) {
>   new->ldr_bits = 32;
>   new->cid_shift = 16;
>   new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
>   new->lid_mask = 0x;
>   new->broadcast = X2APIC_BROADCAST;
> - } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
> + break;
> + } else if (!any_enabled && kvm_apic_get_reg(apic, APIC_LDR)) {
>   if (kvm_apic_get_reg(apic, APIC_DFR) ==
>   APIC_DFR_CLUSTER) {
>   new->cid_shift = 4;
> @@ -179,15 +188,8 @@ static void recalculate_apic_map(struct kvm *kvm)
>   }
>   }
>  
> - /*
> -  * All APICs have to be configured in the same mode by an OS.
> -  * We take advatage of this while building logical id lo

Re: [PATCH] KVM: Introduce dynamically registered hypercall capability

2014-11-27 Thread Jan Kiszka

On 2014-11-27 14:30, Phil White wrote:
> This introduces a list of entries which associate a function pointer of
> kvm_hc_type to a hypercall number and allows the ability to register and
> unregister entries.  In addition, it also allows the ability to retrieve a
> function pointer of kvm_hc_type for a given hypercall number which is meant
> to be called from the arch-specific section.
> 
> The main intent is to allow modules to register hypercalls which they own
> rather than requiring the addition of a stub of some sort.  It will also
> allow each arch to maintain separate lists of hypercalls rather than having
> to respect changes in include/uapi/linux/kvm_para.h

Who is using this? The patch lacks a concrete reference to an in-tree
user or at least an open source out-of-tree use case. Will this follow?

And was EXPORT_SYMBOL only accidentally chosen over ..._GPL?

> 
> Signed-off-by: Phil White 
> ---
>  arch/arm/kvm/Makefile   |   2 +-
>  arch/arm64/kvm/Makefile |   1 +
>  arch/ia64/kvm/Makefile  |   2 +-
>  arch/mips/include/asm/kvm_
> para.h|   6 ++

Posting was somehow mangled.

At first glance, the code also lacks any locking for managing the
hypercall lists.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: x86: Generate #UD when memory operand is required

2014-11-27 Thread Radim Krčmář

2014-11-26 15:47+0200, Nadav Amit:
> Certain x86 instructions that use modrm operands only allow memory operand
> (i.e., mod012), and cause a #UD exception otherwise. KVM ignores this fact.
> Currently, the instructions that are such and are emulated by KVM are MOVBE,
> MOVNTPS, MOVNTPD and MOVNTI.  MOVBE is the most blunt example, since it may be
> emulated by the host regardless of MMIO.
> 
> The fix introduces a new group for handling such instructions, marking mod3 as
> illegal instruction.
> 
> Signed-off-by: Nadav Amit 
> ---

Reviewed-by: Radim Krčmář 

(We could remove GroupDual in the future.)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Cornelia Huck

Yet another version of the virtio-1 support patches.

This one has seen some (very) light testing with the virtio-1 guest
support patches currently on vhost-next.

Changes from v3:

- Add support for FEATURES_OK. We refuse to set features after the
  driver has set this in the status field, and we allow setting
  the status to fail if the features are inconsistent.
- Add missing virtio-1 changes for virtio-net (header size and mac).
- Dropped setting the VERSION_1 bit for virtio-blk: There's still
  some stuff missing.

For virtio-blk, we need to validate the feature bits if version 1 is
negotiated: some legacy features are not allowed in that case. I'm not
quite sure how to handle this, though. We could use the new
validate_features callback to verify that the driver negotiated a
sensible feature set, but that would require us to offer a superset
of legacy and version 1 bits, which feels wrong. Any ideas?

Cornelia Huck (13):
  virtio: cull virtio_bus_set_vdev_features
  virtio: support more feature bits
  s390x/virtio-ccw: fix check for WRITE_FEAT
  virtio: introduce legacy virtio devices
  virtio: allow virtio-1 queue layout
  dataplane: allow virtio-1 devices
  s390x/virtio-ccw: support virtio-1 set_vq format
  virtio: disallow late feature changes for virtio-1
  virtio: allow to fail setting status
  s390x/virtio-ccw: enable virtio 1.0
  virtio-net: no writeable mac for virtio-1
  virtio-net: support longer header
  virtio-net: enable virtio 1.0

Thomas Huth (3):
  linux-headers/virtio_config: Update with VIRTIO_F_VERSION_1
  s390x/css: Add a callback for when subchannel gets disabled
  s390x/virtio-ccw: add virtio set-revision call

 hw/9pfs/virtio-9p-device.c|7 +-
 hw/block/dataplane/virtio-blk.c   |4 +-
 hw/block/virtio-blk.c |9 +-
 hw/char/virtio-serial-bus.c   |9 +-
 hw/net/virtio-net.c   |   52 +--
 hw/s390x/css.c|   12 ++
 hw/s390x/css.h|1 +
 hw/s390x/s390-virtio-bus.c|9 +-
 hw/s390x/virtio-ccw.c |  206 +++--
 hw/s390x/virtio-ccw.h |7 +-
 hw/scsi/vhost-scsi.c  |7 +-
 hw/scsi/virtio-scsi-dataplane.c   |2 +-
 hw/scsi/virtio-scsi.c |   10 +-
 hw/virtio/Makefile.objs   |2 +-
 hw/virtio/dataplane/Makefile.objs |2 +-
 hw/virtio/dataplane/vring.c   |   96 ++--
 hw/virtio/virtio-balloon.c|8 +-
 hw/virtio/virtio-bus.c|   23 +--
 hw/virtio/virtio-mmio.c   |9 +-
 hw/virtio/virtio-pci.c|   13 +-
 hw/virtio/virtio-rng.c|2 +-
 hw/virtio/virtio.c|   88 +--
 include/hw/virtio/dataplane/vring-accessors.h |   75 +
 include/hw/virtio/dataplane/vring.h   |   14 +-
 include/hw/virtio/virtio-access.h |4 +
 include/hw/virtio/virtio-bus.h|   10 +-
 include/hw/virtio/virtio.h|   39 -
 linux-headers/linux/virtio_config.h   |3 +
 28 files changed, 523 insertions(+), 200 deletions(-)
 create mode 100644 include/hw/virtio/dataplane/vring-accessors.h

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 03/16] virtio: support more feature bits

2014-11-27 Thread Cornelia Huck

With virtio-1, we support more than 32 feature bits. Let's make
vdev->guest_features depend on the number of supported feature bits,
allowing us to grow the feature bits automatically.

We also need to enhance the internal functions dealing with getting
and setting features with an additional index field, so that all feature
bits may be accessed (in chunks of 32 bits).

vhost and migration have been ignored for now.

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/9pfs/virtio-9p-device.c |7 ++-
 hw/block/virtio-blk.c  |9 +++--
 hw/char/virtio-serial-bus.c|9 +++--
 hw/net/virtio-net.c|   38 ++
 hw/s390x/s390-virtio-bus.c |9 +
 hw/s390x/virtio-ccw.c  |   17 ++---
 hw/scsi/vhost-scsi.c   |7 +--
 hw/scsi/virtio-scsi.c  |   10 +-
 hw/virtio/dataplane/vring.c|   10 +-
 hw/virtio/virtio-balloon.c |8 ++--
 hw/virtio/virtio-bus.c |9 +
 hw/virtio/virtio-mmio.c|9 +
 hw/virtio/virtio-pci.c |   13 +++--
 hw/virtio/virtio-rng.c |2 +-
 hw/virtio/virtio.c |   29 +
 include/hw/virtio/virtio-bus.h |7 ---
 include/hw/virtio/virtio.h |   19 ++-
 17 files changed, 135 insertions(+), 77 deletions(-)

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 2572747..c29c8c8 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -21,8 +21,13 @@
 #include "virtio-9p-coth.h"
 #include "hw/virtio/virtio-access.h"
 
-static uint32_t virtio_9p_get_features(VirtIODevice *vdev, uint32_t features)
+static uint32_t virtio_9p_get_features(VirtIODevice *vdev, unsigned int index,
+   uint32_t features)
 {
+if (index > 0) {
+return features;
+}
+
 features |= 1 << VIRTIO_9P_MOUNT_TAG;
 return features;
 }
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index b19b102..6d86f60 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -564,10 +564,15 @@ static void virtio_blk_set_config(VirtIODevice *vdev, 
const uint8_t *config)
 aio_context_release(blk_get_aio_context(s->blk));
 }
 
-static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
+static uint32_t virtio_blk_get_features(VirtIODevice *vdev, unsigned int index,
+uint32_t features)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
 
+if (index > 0) {
+return features;
+}
+
 features |= (1 << VIRTIO_BLK_F_SEG_MAX);
 features |= (1 << VIRTIO_BLK_F_GEOMETRY);
 features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
@@ -601,7 +606,7 @@ static void virtio_blk_set_status(VirtIODevice *vdev, 
uint8_t status)
 return;
 }
 
-features = vdev->guest_features;
+features = vdev->guest_features[0];
 
 /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send
  * cache flushes.  Thus, the "auto writethrough" behavior is never
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index a7b1b68..55de504 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -75,7 +75,7 @@ static VirtIOSerialPort *find_port_by_name(char *name)
 static bool use_multiport(VirtIOSerial *vser)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(vser);
-return vdev->guest_features & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
+return vdev->guest_features[0] & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
 }
 
 static size_t write_to_port(VirtIOSerialPort *port,
@@ -467,10 +467,15 @@ static void handle_input(VirtIODevice *vdev, VirtQueue 
*vq)
 {
 }
 
-static uint32_t get_features(VirtIODevice *vdev, uint32_t features)
+static uint32_t get_features(VirtIODevice *vdev, unsigned int index,
+ uint32_t features)
 {
 VirtIOSerial *vser;
 
+if (index > 0) {
+return features;
+}
+
 vser = VIRTIO_SERIAL(vdev);
 
 if (vser->bus.max_nr_ports > 1) {
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 9b88775..1e214b5 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -86,7 +86,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const 
uint8_t *config)
 
 memcpy(&netcfg, config, n->config_size);
 
-if (!(vdev->guest_features >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) &&
+if (!(vdev->guest_features[0] >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) &&
 memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 memcpy(n->mac, netcfg.mac, ETH_ALEN);
 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
@@ -305,7 +305,7 @@ static RxFilterInfo 
*virtio_net_query_rxfilter(NetClientState *nc)
 info->multicast_table = str_list;
 info->vlan_table = get_vlan_table(n);
 
-if (!((1 << VIRTIO_NET_F_CTRL_VLAN) & vdev->guest_features)) {
+if (!((1 << VIRTIO_NET_F_CTRL_VLAN) & vdev->g

[PATCH RFC v4 13/16] s390x/virtio-ccw: enable virtio 1.0

2014-11-27 Thread Cornelia Huck

virtio-ccw should now have everything in place to operate virtio 1.0
devices, so let's enable revision 1.

Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/s390x/virtio-ccw.h b/hw/s390x/virtio-ccw.h
index 03d5955..08edd8d 100644
--- a/hw/s390x/virtio-ccw.h
+++ b/hw/s390x/virtio-ccw.h
@@ -73,7 +73,7 @@ typedef struct VirtIOCCWDeviceClass {
 #define VIRTIO_CCW_FEATURE_SIZE NR_VIRTIO_FEATURE_WORDS
 
 /* The maximum virtio revision we support. */
-#define VIRTIO_CCW_REV_MAX 0
+#define VIRTIO_CCW_REV_MAX 1
 
 /* Performance improves when virtqueue kick processing is decoupled from the
  * vcpu thread using ioeventfd for some devices. */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 15/16] virtio-net: support longer header

2014-11-27 Thread Cornelia Huck

virtio-1 devices always use num_buffers in the header, even if
mergeable rx buffers have not been negotiated.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index ad477bf..b31b3a4 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -380,8 +380,13 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int 
mergeable_rx_bufs)
 
 n->mergeable_rx_bufs = mergeable_rx_bufs;
 
-n->guest_hdr_len = n->mergeable_rx_bufs ?
-sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct 
virtio_net_hdr);
+if (n->parent_obj.guest_features[1] >> (VIRTIO_F_VERSION_1 - 32) & 1) {
+n->guest_hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+} else {
+n->guest_hdr_len = n->mergeable_rx_bufs ?
+sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+sizeof(struct virtio_net_hdr);
+}
 
 for (i = 0; i < n->max_queues; i++) {
 nc = qemu_get_subqueue(n->nic, i);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 06/16] virtio: allow virtio-1 queue layout

2014-11-27 Thread Cornelia Huck

For virtio-1 devices, we allow a more complex queue layout that doesn't
require descriptor table and rings on a physically-contiguous memory area:
add virtio_queue_set_rings() to allow transports to set this up.

Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c |   16 
 include/hw/virtio/virtio.h |2 ++
 2 files changed, 18 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 4149f45..2c6bb91 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -96,6 +96,13 @@ static void virtqueue_init(VirtQueue *vq)
 {
 hwaddr pa = vq->pa;
 
+if (pa == -1ULL) {
+/*
+ * This is a virtio-1 style vq that has already been setup
+ * in virtio_queue_set.
+ */
+return;
+}
 vq->vring.desc = pa;
 vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
 vq->vring.used = vring_align(vq->vring.avail +
@@ -719,6 +726,15 @@ hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
 return vdev->vq[n].pa;
 }
 
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+hwaddr avail, hwaddr used)
+{
+vdev->vq[n].pa = -1ULL;
+vdev->vq[n].vring.desc = desc;
+vdev->vq[n].vring.avail = avail;
+vdev->vq[n].vring.used = used;
+}
+
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
 {
 /* Don't allow guest to flip queue between existent and
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 40e567c..f840320 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -227,6 +227,8 @@ void virtio_queue_set_addr(VirtIODevice *vdev, int n, 
hwaddr addr);
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
 int virtio_queue_get_num(VirtIODevice *vdev, int n);
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+hwaddr avail, hwaddr used);
 void virtio_queue_set_align(VirtIODevice *vdev, int n, int align);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 14/16] virtio-net: no writeable mac for virtio-1

2014-11-27 Thread Cornelia Huck

Devices operating as virtio 1.0 may not allow writes to the mac
address in config space.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1e214b5..ad477bf 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -87,6 +87,7 @@ static void virtio_net_set_config(VirtIODevice *vdev, const 
uint8_t *config)
 memcpy(&netcfg, config, n->config_size);
 
 if (!(vdev->guest_features[0] >> VIRTIO_NET_F_CTRL_MAC_ADDR & 1) &&
+!(vdev->guest_features[1] >> (VIRTIO_F_VERSION_1 - 32) & 1) &&
 memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 memcpy(n->mac, netcfg.mac, ETH_ALEN);
 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 07/16] dataplane: allow virtio-1 devices

2014-11-27 Thread Cornelia Huck

Handle endianness conversion for virtio-1 virtqueues correctly.

Note that dataplane now needs to be built per-target.

Signed-off-by: Cornelia Huck 
---
 hw/block/dataplane/virtio-blk.c   |4 +-
 hw/scsi/virtio-scsi-dataplane.c   |2 +-
 hw/virtio/Makefile.objs   |2 +-
 hw/virtio/dataplane/Makefile.objs |2 +-
 hw/virtio/dataplane/vring.c   |   86 ++---
 include/hw/virtio/dataplane/vring-accessors.h |   75 +
 include/hw/virtio/dataplane/vring.h   |   14 +---
 7 files changed, 131 insertions(+), 54 deletions(-)
 create mode 100644 include/hw/virtio/dataplane/vring-accessors.h

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1222a37..2d8cc15 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -16,7 +16,9 @@
 #include "qemu/iov.h"
 #include "qemu/thread.h"
 #include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/dataplane/vring.h"
+#include "hw/virtio/dataplane/vring-accessors.h"
 #include "sysemu/block-backend.h"
 #include "hw/virtio/virtio-blk.h"
 #include "virtio-blk.h"
@@ -75,7 +77,7 @@ static void complete_request_vring(VirtIOBlockReq *req, 
unsigned char status)
 VirtIOBlockDataPlane *s = req->dev->dataplane;
 stb_p(&req->in->status, status);
 
-vring_push(&req->dev->dataplane->vring, &req->elem,
+vring_push(s->vdev, &req->dev->dataplane->vring, &req->elem,
req->qiov.size + sizeof(*req->in));
 
 /* Suppress notification to guest by BH and its scheduled
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 03a1e8c..418d73b 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -94,7 +94,7 @@ void virtio_scsi_vring_push_notify(VirtIOSCSIReq *req)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(req->vring->parent);
 
-vring_push(&req->vring->vring, &req->elem,
+vring_push(vdev, &req->vring->vring, &req->elem,
req->qsgl.size + req->resp_iov.size);
 
 if (vring_should_notify(vdev, &req->vring->vring)) {
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index d21c397..19b224a 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-y += virtio-rng.o
 common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 common-obj-y += virtio-bus.o
 common-obj-y += virtio-mmio.o
-common-obj-$(CONFIG_VIRTIO) += dataplane/
+obj-$(CONFIG_VIRTIO) += dataplane/
 
 obj-y += virtio.o virtio-balloon.o 
 obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
diff --git a/hw/virtio/dataplane/Makefile.objs 
b/hw/virtio/dataplane/Makefile.objs
index 9a8cfc0..753a9ca 100644
--- a/hw/virtio/dataplane/Makefile.objs
+++ b/hw/virtio/dataplane/Makefile.objs
@@ -1 +1 @@
-common-obj-y += vring.o
+obj-y += vring.o
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index a051775..0da8d6b 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -18,7 +18,9 @@
 #include "hw/hw.h"
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/dataplane/vring.h"
+#include "hw/virtio/dataplane/vring-accessors.h"
 #include "qemu/error-report.h"
 
 /* vring_map can be coupled with vring_unmap or (if you still have the
@@ -83,7 +85,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
 
 vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
-vring->last_used_idx = vring->vr.used->idx;
+vring->last_used_idx = vring_get_used_idx(vdev, vring);
 vring->signalled_used = 0;
 vring->signalled_used_valid = false;
 
@@ -104,7 +106,7 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
 {
 if (!(vdev->guest_features[0] & (1 << VIRTIO_RING_F_EVENT_IDX))) {
-vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
+vring_set_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
 }
 }
 
@@ -117,10 +119,10 @@ bool vring_enable_notification(VirtIODevice *vdev, Vring 
*vring)
 if (vdev->guest_features[0] & (1 << VIRTIO_RING_F_EVENT_IDX)) {
 vring_avail_event(&vring->vr) = vring->vr.avail->idx;
 } else {
-vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+vring_clear_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
 }
 smp_mb(); /* ensure update is seen before reading avail_idx */
-return !vring_more_avail(vring);
+return !vring_more_avail(vdev, vring);
 }
 
 /* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
@@ -134,12 +136,13 @@ bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
 smp_mb();
 
 if ((vdev->guest_features[0] & VIRTIO_F_NOTIFY_ON_EMPTY) &&
-unlikely(vring->vr.avail->idx

[PATCH RFC v4 01/16] linux-headers/virtio_config: Update with VIRTIO_F_VERSION_1

2014-11-27 Thread Cornelia Huck

From: Thomas Huth 

Add the new VIRTIO_F_VERSION_1 definition to the virtio_config.h
linux header.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 linux-headers/linux/virtio_config.h |3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-headers/linux/virtio_config.h 
b/linux-headers/linux/virtio_config.h
index 75dc20b..16aa289 100644
--- a/linux-headers/linux/virtio_config.h
+++ b/linux-headers/linux/virtio_config.h
@@ -54,4 +54,7 @@
 /* Can the device handle any descriptor layout? */
 #define VIRTIO_F_ANY_LAYOUT27
 
+/* v1.0 compliant. */
+#define VIRTIO_F_VERSION_1 32
+
 #endif /* _LINUX_VIRTIO_CONFIG_H */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 10/16] s390x/virtio-ccw: support virtio-1 set_vq format

2014-11-27 Thread Cornelia Huck

Support the new CCW_CMD_SET_VQ format for virtio-1 devices.

While we're at it, refactor the code a bit and enforce big endian
fields (which had always been required, even for legacy).

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c |  114 ++---
 1 file changed, 80 insertions(+), 34 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index e79f3b8..60d67a3 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -238,11 +238,20 @@ VirtualCssBus *virtual_css_bus_init(void)
 }
 
 /* Communication blocks used by several channel commands. */
-typedef struct VqInfoBlock {
+typedef struct VqInfoBlockLegacy {
 uint64_t queue;
 uint32_t align;
 uint16_t index;
 uint16_t num;
+} QEMU_PACKED VqInfoBlockLegacy;
+
+typedef struct VqInfoBlock {
+uint64_t desc;
+uint32_t res0;
+uint16_t index;
+uint16_t num;
+uint64_t avail;
+uint64_t used;
 } QEMU_PACKED VqInfoBlock;
 
 typedef struct VqConfigBlock {
@@ -269,17 +278,20 @@ typedef struct VirtioRevInfo {
 } QEMU_PACKED VirtioRevInfo;
 
 /* Specify where the virtqueues for the subchannel are in guest memory. */
-static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t addr, uint32_t align,
-  uint16_t index, uint16_t num)
+static int virtio_ccw_set_vqs(SubchDev *sch, VqInfoBlock *info,
+  VqInfoBlockLegacy *linfo)
 {
 VirtIODevice *vdev = virtio_ccw_get_vdev(sch);
+uint16_t index = info ? info->index : linfo->index;
+uint16_t num = info ? info->num : linfo->num;
+uint64_t desc = info ? info->desc : linfo->queue;
 
 if (index > VIRTIO_PCI_QUEUE_MAX) {
 return -EINVAL;
 }
 
 /* Current code in virtio.c relies on 4K alignment. */
-if (addr && (align != 4096)) {
+if (linfo && desc && (linfo->align != 4096)) {
 return -EINVAL;
 }
 
@@ -287,8 +299,12 @@ static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t 
addr, uint32_t align,
 return -EINVAL;
 }
 
-virtio_queue_set_addr(vdev, index, addr);
-if (!addr) {
+if (info) {
+virtio_queue_set_rings(vdev, index, desc, info->avail, info->used);
+} else {
+virtio_queue_set_addr(vdev, index, desc);
+}
+if (!desc) {
 virtio_queue_set_vector(vdev, index, 0);
 } else {
 /* Fail if we don't have a big enough queue. */
@@ -303,10 +319,66 @@ static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t 
addr, uint32_t align,
 return 0;
 }
 
-static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
+static int virtio_ccw_handle_set_vq(SubchDev *sch, CCW1 ccw, bool check_len,
+bool is_legacy)
 {
 int ret;
 VqInfoBlock info;
+VqInfoBlockLegacy linfo;
+size_t info_len = is_legacy ? sizeof(linfo) : sizeof(info);
+
+if (check_len) {
+if (ccw.count != info_len) {
+return -EINVAL;
+}
+} else if (ccw.count < info_len) {
+/* Can't execute command. */
+return -EINVAL;
+}
+if (!ccw.cda) {
+return -EFAULT;
+}
+if (is_legacy) {
+linfo.queue = ldq_be_phys(&address_space_memory, ccw.cda);
+linfo.align = ldl_be_phys(&address_space_memory,
+  ccw.cda + sizeof(linfo.queue));
+linfo.index = lduw_be_phys(&address_space_memory,
+   ccw.cda + sizeof(linfo.queue)
+   + sizeof(linfo.align));
+linfo.num = lduw_be_phys(&address_space_memory,
+ ccw.cda + sizeof(linfo.queue)
+ + sizeof(linfo.align)
+ + sizeof(linfo.index));
+ret = virtio_ccw_set_vqs(sch, NULL, &linfo);
+} else {
+info.desc = ldq_be_phys(&address_space_memory, ccw.cda);
+info.index = lduw_be_phys(&address_space_memory,
+  ccw.cda + sizeof(info.desc)
+  + sizeof(info.res0));
+info.num = lduw_be_phys(&address_space_memory,
+ccw.cda + sizeof(info.desc)
+  + sizeof(info.res0)
+  + sizeof(info.index));
+info.avail = ldq_be_phys(&address_space_memory,
+ ccw.cda + sizeof(info.desc)
+ + sizeof(info.res0)
+ + sizeof(info.index)
+ + sizeof(info.num));
+info.used = ldq_be_phys(&address_space_memory,
+ccw.cda + sizeof(info.desc)
++ sizeof(info.res0)
++ sizeof(info.index)
++ sizeof(info.num)
++ sizeof(info.avail));
+ret = virtio_ccw_set_vqs(sch, &info, NULL);
+}
+

[PATCH RFC v4 12/16] virtio: allow to fail setting status

2014-11-27 Thread Cornelia Huck

virtio-1 allows setting of the FEATURES_OK status bit to fail if
the negotiated feature bits are inconsistent: let's fail
virtio_set_status() in that case and update virtio-ccw to post an
error to the guest.

Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  |   20 
 hw/virtio/virtio.c |   24 +++-
 include/hw/virtio/virtio.h |3 ++-
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 60d67a3..65bef4b 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -547,15 +547,19 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
 virtio_ccw_stop_ioeventfd(dev);
 }
-virtio_set_status(vdev, status);
-if (vdev->status == 0) {
-virtio_reset(vdev);
-}
-if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
-virtio_ccw_start_ioeventfd(dev);
+if (virtio_set_status(vdev, status) == 0) {
+if (vdev->status == 0) {
+virtio_reset(vdev);
+}
+if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+virtio_ccw_start_ioeventfd(dev);
+}
+sch->curr_status.scsw.count = ccw.count - sizeof(status);
+ret = 0;
+} else {
+/* Trigger a command reject. */
+ret = -ENOSYS;
 }
-sch->curr_status.scsw.count = ccw.count - sizeof(status);
-ret = 0;
 }
 break;
 case CCW_CMD_SET_IND:
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 8cdc0cb..ab5c671 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -548,15 +548,37 @@ void virtio_update_irq(VirtIODevice *vdev)
 virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
 }
 
-void virtio_set_status(VirtIODevice *vdev, uint8_t val)
+static int virtio_validate_features(VirtIODevice *vdev)
+{
+VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+if (k->validate_features) {
+return k->validate_features(vdev);
+} else {
+return 0;
+}
+}
+
+int virtio_set_status(VirtIODevice *vdev, uint8_t val)
 {
 VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
 trace_virtio_set_status(vdev, val);
 
+if (!virtio_device_is_legacy(vdev)) {
+if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
+val & VIRTIO_CONFIG_S_FEATURES_OK) {
+int ret = virtio_validate_features(vdev);
+
+if (ret) {
+return ret;
+}
+}
+}
 if (k->set_status) {
 k->set_status(vdev, val);
 }
 vdev->status = val;
+return 0;
 }
 
 bool target_words_bigendian(void);
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index ec1be3b..601578f 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -152,6 +152,7 @@ typedef struct VirtioDeviceClass {
  uint32_t requested_features);
 uint32_t (*bad_features)(VirtIODevice *vdev, unsigned int index);
 void (*set_features)(VirtIODevice *vdev, unsigned int index, uint32_t val);
+int (*validate_features)(VirtIODevice *vdev);
 void (*get_config)(VirtIODevice *vdev, uint8_t *config);
 void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
 void (*reset)(VirtIODevice *vdev);
@@ -235,7 +236,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int 
align);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
 void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
-void virtio_set_status(VirtIODevice *vdev, uint8_t val);
+int virtio_set_status(VirtIODevice *vdev, uint8_t val);
 void virtio_reset(void *opaque);
 void virtio_update_irq(VirtIODevice *vdev);
 int virtio_set_features(VirtIODevice *vdev, unsigned int index, uint32_t val);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 11/16] virtio: disallow late feature changes for virtio-1

2014-11-27 Thread Cornelia Huck

For virtio-1 devices, the driver must not attempt to set feature bits
after it set FEATURES_OK in the device status. Simply reject it in
that case.

Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c |   17 +++--
 include/hw/virtio/virtio.h |2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 2c6bb91..8cdc0cb 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -982,7 +982,8 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
 vmstate_save_state(f, &vmstate_virtio, vdev);
 }
 
-int virtio_set_features(VirtIODevice *vdev, unsigned int index, uint32_t val)
+static int __virtio_set_features(VirtIODevice *vdev, unsigned int index,
+ uint32_t val)
 {
 BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
 VirtioBusClass *vbusk = VIRTIO_BUS_GET_CLASS(qbus);
@@ -998,6 +999,18 @@ int virtio_set_features(VirtIODevice *vdev, unsigned int 
index, uint32_t val)
 return bad ? -1 : 0;
 }
 
+int virtio_set_features(VirtIODevice *vdev, unsigned int index, uint32_t val)
+{
+   /*
+ * The driver must not attempt to set features after feature negotiation
+ * has finished.
+ */
+if (vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) {
+return -EINVAL;
+}
+return __virtio_set_features(vdev, index, val);
+}
+
 int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
 {
 int i, ret;
@@ -1030,7 +1043,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int 
version_id)
 qemu_get_be32s(f, &features);
 
 /* XXX features >= 32 */
-if (virtio_set_features(vdev, 0, features) < 0) {
+if (__virtio_set_features(vdev, 0, features) < 0) {
 supported_features = k->get_features(qbus->parent, 0);
 error_report("Features 0x%x unsupported. Allowed features: 0x%x",
  features, supported_features);
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index f840320..ec1be3b 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -32,6 +32,8 @@
 #define VIRTIO_CONFIG_S_DRIVER  2
 /* Driver has used its parts of the config, and is happy */
 #define VIRTIO_CONFIG_S_DRIVER_OK   4
+/* Driver has finished configuring features */
+#define VIRTIO_CONFIG_S_FEATURES_OK 8
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED  0x80
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 16/16] virtio-net: enable virtio 1.0

2014-11-27 Thread Cornelia Huck

virtio-net (non-vhost) now should have everything in place to support
virtio 1.0: let's enable the feature bit for it.

Note that VIRTIO_F_VERSION_1 is technically a transport feature; once
every device is ready for virtio 1.0, we can move setting this
feature bit out of the individual devices.

Signed-off-by: Cornelia Huck 
---
 hw/net/virtio-net.c |4 
 1 file changed, 4 insertions(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index b31b3a4..9ceff02 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -453,6 +453,10 @@ static uint32_t virtio_net_get_features(VirtIODevice 
*vdev, unsigned int index,
 VirtIONet *n = VIRTIO_NET(vdev);
 NetClientState *nc = qemu_get_queue(n->nic);
 
+if (index == 1 && !get_vhost_net(nc->peer)) {
+features |= (1 << (VIRTIO_F_VERSION_1 - 32));
+}
+
 if (index > 0) {
 return features;
 }
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 04/16] s390x/virtio-ccw: fix check for WRITE_FEAT

2014-11-27 Thread Cornelia Huck

We need to check guest feature size, not host feature size to find
out whether we should call virtio_set_features(). This check is
possible now that vdev->guest_features is an array.

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 2f52b82..62ec852 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -399,7 +399,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 features.index = ldub_phys(&address_space_memory,
ccw.cda + sizeof(features.features));
 features.features = ldl_le_phys(&address_space_memory, ccw.cda);
-if (features.index < ARRAY_SIZE(dev->host_features)) {
+if (features.index < ARRAY_SIZE(vdev->guest_features)) {
 virtio_set_features(vdev, features.index, features.features);
 } else {
 /*
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 09/16] s390x/virtio-ccw: add virtio set-revision call

2014-11-27 Thread Cornelia Huck

From: Thomas Huth 

Handle the virtio-ccw revision according to what the guest sets.
When revision 1 is selected, we have a virtio-1 standard device
with byteswapping for the virtio rings.

When a channel gets disabled, we have to revert to the legacy behavior
in case the next user of the device does not negotiate the revision 1
anymore (e.g. the boot firmware uses revision 1, but the operating
system only uses the legacy mode).

Note that revisions > 0 are still disabled; but we still extend the
feature bit size to be able to handle the VERSION_1 bit.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c |   52 +
 hw/s390x/virtio-ccw.h |7 ++-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 62ec852..e79f3b8 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -20,9 +20,11 @@
 #include "hw/virtio/virtio-net.h"
 #include "hw/sysbus.h"
 #include "qemu/bitops.h"
+#include "hw/virtio/virtio-access.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/s390x/adapter.h"
 #include "hw/s390x/s390_flic.h"
+#include "linux/virtio_config.h"
 
 #include "ioinst.h"
 #include "css.h"
@@ -260,6 +262,12 @@ typedef struct VirtioThinintInfo {
 uint8_t isc;
 } QEMU_PACKED VirtioThinintInfo;
 
+typedef struct VirtioRevInfo {
+uint16_t revision;
+uint16_t length;
+uint8_t data[0];
+} QEMU_PACKED VirtioRevInfo;
+
 /* Specify where the virtqueues for the subchannel are in guest memory. */
 static int virtio_ccw_set_vqs(SubchDev *sch, uint64_t addr, uint32_t align,
   uint16_t index, uint16_t num)
@@ -299,6 +307,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 {
 int ret;
 VqInfoBlock info;
+VirtioRevInfo revinfo;
 uint8_t status;
 VirtioFeatDesc features;
 void *config;
@@ -373,6 +382,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
ccw.cda + sizeof(features.features));
 if (features.index < ARRAY_SIZE(dev->host_features)) {
 features.features = dev->host_features[features.index];
+/*
+ * Don't offer version 1 to the guest if it did not
+ * negotiate at least revision 1.
+ */
+if (features.index == 1 && dev->revision <= 0) {
+features.features &= ~(1 << (VIRTIO_F_VERSION_1 - 32));
+}
 } else {
 /* Return zeroes if the guest supports more feature bits. */
 features.features = 0;
@@ -400,6 +416,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
ccw.cda + sizeof(features.features));
 features.features = ldl_le_phys(&address_space_memory, ccw.cda);
 if (features.index < ARRAY_SIZE(vdev->guest_features)) {
+/*
+ * The guest should not set version 1 if it didn't
+ * negotiate a revision >= 1.
+ */
+if (features.index == 1 && dev->revision <= 0) {
+features.features &= ~(1 << (VIRTIO_F_VERSION_1 - 32));
+}
 virtio_set_features(vdev, features.index, features.features);
 } else {
 /*
@@ -600,6 +623,25 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 }
 }
 break;
+case CCW_CMD_SET_VIRTIO_REV:
+len = sizeof(revinfo);
+if (ccw.count < len || (check_len && ccw.count > len)) {
+ret = -EINVAL;
+break;
+}
+if (!ccw.cda) {
+ret = -EFAULT;
+break;
+}
+cpu_physical_memory_read(ccw.cda, &revinfo, len);
+if (dev->revision >= 0 ||
+revinfo.revision > VIRTIO_CCW_REV_MAX) {
+ret = -ENOSYS;
+break;
+}
+ret = 0;
+dev->revision = revinfo.revision;
+break;
 default:
 ret = -ENOSYS;
 break;
@@ -607,6 +649,13 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
 return ret;
 }
 
+static void virtio_sch_disable_cb(SubchDev *sch)
+{
+VirtioCcwDevice *dev = sch->driver_data;
+
+dev->revision = -1;
+}
+
 static int virtio_ccw_device_init(VirtioCcwDevice *dev, VirtIODevice *vdev)
 {
 unsigned int cssid = 0;
@@ -733,6 +782,7 @@ static int virtio_ccw_device_init(VirtioCcwDevice *dev, 
VirtIODevice *vdev)
 css_sch_build_virtual_schib(sch, 0, VIRTIO_CCW_CHPID_TYPE);
 
 sch->ccw_cb = virtio_ccw_cb;
+sch->disable_cb = virtio_sch_disable_cb;
 
 /* Build senseid data. */
 memset(&sch->id, 0, sizeof(SenseId));
@@ -740,6 +790,8 @@ static int virtio_ccw_device_init(VirtioCcwDevice *dev, 
VirtIODevice *vdev)
 sch->id.cu_type = VIRTIO_CCW_CU_TYPE;
 sch->id.cu_model = vdev->device_id;
 
+dev->revision = -1;
+

[PATCH RFC v4 02/16] virtio: cull virtio_bus_set_vdev_features

2014-11-27 Thread Cornelia Huck

The only user of this function was virtio-ccw, and it should use
virtio_set_features() like everybody else: We need to make sure
that bad features are masked out properly, which this function did
not do.

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  |3 +--
 hw/virtio/virtio-bus.c |   14 --
 include/hw/virtio/virtio-bus.h |3 ---
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index ea236c9..84f17bc 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -400,8 +400,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw)
ccw.cda + sizeof(features.features));
 features.features = ldl_le_phys(&address_space_memory, ccw.cda);
 if (features.index < ARRAY_SIZE(dev->host_features)) {
-virtio_bus_set_vdev_features(&dev->bus, features.features);
-vdev->guest_features = features.features;
+virtio_set_features(vdev, features.features);
 } else {
 /*
  * If the guest supports more feature bits, assert that it
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index eb77019..a8ffa07 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -109,20 +109,6 @@ uint32_t virtio_bus_get_vdev_features(VirtioBusState *bus,
 return k->get_features(vdev, requested_features);
 }
 
-/* Set the features of the plugged device. */
-void virtio_bus_set_vdev_features(VirtioBusState *bus,
-  uint32_t requested_features)
-{
-VirtIODevice *vdev = virtio_bus_get_device(bus);
-VirtioDeviceClass *k;
-
-assert(vdev != NULL);
-k = VIRTIO_DEVICE_GET_CLASS(vdev);
-if (k->set_features != NULL) {
-k->set_features(vdev, requested_features);
-}
-}
-
 /* Get bad features of the plugged device. */
 uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus)
 {
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index 0756545..0d2e7b4 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -84,9 +84,6 @@ size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus);
 /* Get the features of the plugged device. */
 uint32_t virtio_bus_get_vdev_features(VirtioBusState *bus,
 uint32_t requested_features);
-/* Set the features of the plugged device. */
-void virtio_bus_set_vdev_features(VirtioBusState *bus,
-  uint32_t requested_features);
 /* Get bad features of the plugged device. */
 uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus);
 /* Get config of the plugged device. */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 08/16] s390x/css: Add a callback for when subchannel gets disabled

2014-11-27 Thread Cornelia Huck

From: Thomas Huth 

We need a possibility to run code when a subchannel gets disabled.
This patch adds the necessary infrastructure.

Signed-off-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/css.c |   12 
 hw/s390x/css.h |1 +
 2 files changed, 13 insertions(+)

diff --git a/hw/s390x/css.c b/hw/s390x/css.c
index b67c039..735ec55 100644
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -588,6 +588,7 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 {
 SCSW *s = &sch->curr_status.scsw;
 PMCW *p = &sch->curr_status.pmcw;
+uint16_t oldflags;
 int ret;
 SCHIB schib;
 
@@ -610,6 +611,7 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 copy_schib_from_guest(&schib, orig_schib);
 /* Only update the program-modifiable fields. */
 p->intparm = schib.pmcw.intparm;
+oldflags = p->flags;
 p->flags &= ~(PMCW_FLAGS_MASK_ISC | PMCW_FLAGS_MASK_ENA |
   PMCW_FLAGS_MASK_LM | PMCW_FLAGS_MASK_MME |
   PMCW_FLAGS_MASK_MP);
@@ -625,6 +627,12 @@ int css_do_msch(SubchDev *sch, SCHIB *orig_schib)
 (PMCW_CHARS_MASK_MBFC | PMCW_CHARS_MASK_CSENSE);
 sch->curr_status.mba = schib.mba;
 
+/* Has the channel been disabled? */
+if (sch->disable_cb && (oldflags & PMCW_FLAGS_MASK_ENA) != 0
+&& (p->flags & PMCW_FLAGS_MASK_ENA) == 0) {
+sch->disable_cb(sch);
+}
+
 ret = 0;
 
 out:
@@ -1443,6 +1451,10 @@ void css_reset_sch(SubchDev *sch)
 {
 PMCW *p = &sch->curr_status.pmcw;
 
+if ((p->flags & PMCW_FLAGS_MASK_ENA) != 0 && sch->disable_cb) {
+sch->disable_cb(sch);
+}
+
 p->intparm = 0;
 p->flags &= ~(PMCW_FLAGS_MASK_ISC | PMCW_FLAGS_MASK_ENA |
   PMCW_FLAGS_MASK_LM | PMCW_FLAGS_MASK_MME |
diff --git a/hw/s390x/css.h b/hw/s390x/css.h
index 33104ac..7fa807b 100644
--- a/hw/s390x/css.h
+++ b/hw/s390x/css.h
@@ -81,6 +81,7 @@ struct SubchDev {
 uint8_t ccw_no_data_cnt;
 /* transport-provided data: */
 int (*ccw_cb) (SubchDev *, CCW1);
+void (*disable_cb)(SubchDev *);
 SenseId id;
 void *driver_data;
 };
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC v4 05/16] virtio: introduce legacy virtio devices

2014-11-27 Thread Cornelia Huck

Introduce a helper function to indicate whether a virtio device is
operating in legacy or virtio standard mode.

It may be used to make decisions about the endianness of virtio accesses
and other virtio-1 specific changes, enabling us to support transitional
devices.

Reviewed-by: Thomas Huth 
Signed-off-by: Cornelia Huck 
---
 hw/virtio/virtio.c|6 +-
 include/hw/virtio/virtio-access.h |4 
 include/hw/virtio/virtio.h|   13 +++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 2eb5d3c..4149f45 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -883,7 +883,11 @@ static bool virtio_device_endian_needed(void *opaque)
 VirtIODevice *vdev = opaque;
 
 assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
-return vdev->device_endian != virtio_default_endian();
+if (virtio_device_is_legacy(vdev)) {
+return vdev->device_endian != virtio_default_endian();
+}
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return vdev->device_endian != VIRTIO_DEVICE_ENDIAN_LITTLE;
 }
 
 static const VMStateDescription vmstate_virtio_device_endian = {
diff --git a/include/hw/virtio/virtio-access.h 
b/include/hw/virtio/virtio-access.h
index 46456fd..c123ee0 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -19,6 +19,10 @@
 
 static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
 {
+if (!virtio_device_is_legacy(vdev)) {
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return false;
+}
 #if defined(TARGET_IS_BIENDIAN)
 return virtio_is_big_endian(vdev);
 #elif defined(TARGET_WORDS_BIGENDIAN)
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index b408166..40e567c 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -275,9 +275,18 @@ void virtio_queue_set_host_notifier_fd_handler(VirtQueue 
*vq, bool assign,
 void virtio_queue_notify_vq(VirtQueue *vq);
 void virtio_irq(VirtQueue *vq);
 
+static inline bool virtio_device_is_legacy(VirtIODevice *vdev)
+{
+return !(vdev->guest_features[1] & (1 << (VIRTIO_F_VERSION_1 - 32)));
+}
+
 static inline bool virtio_is_big_endian(VirtIODevice *vdev)
 {
-assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
-return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+if (virtio_device_is_legacy(vdev)) {
+assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
+return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+}
+/* Devices conforming to VIRTIO 1.0 or later are always LE. */
+return false;
 }
 #endif
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 04:16:33PM +0100, Cornelia Huck wrote:
> Yet another version of the virtio-1 support patches.
> 
> This one has seen some (very) light testing with the virtio-1 guest
> support patches currently on vhost-next.
> 
> Changes from v3:
> 
> - Add support for FEATURES_OK. We refuse to set features after the
>   driver has set this in the status field, and we allow to fail
>   setting the status if the features are inconsistent.
> - Add missing virtio-1 changes for virtio-net (header size and mac).
> - Dropped setting the VERSION_1 bit for virtio-blk: There's still
>   some stuff missing.
> 
> For virtio-blk, we need to validate the feature bits if version 1 is
> negotiated: some legacy features are not allowed in that case. I'm not
> quite sure how to handle this, though. We could use the new
> validate_features callback to verify that the driver negotiated a
> sensible feature set, but that would require us to offer a superset
> of legacy and version 1 bits, which feels wrong. Any ideas?

No, that's violating the spec.
I think the simplest way is to have separate features and
legacy_features fields.  Present the correct one depending on which
revision was negotiated.


> Cornelia Huck (13):
>   virtio: cull virtio_bus_set_vdev_features
>   virtio: support more feature bits
>   s390x/virtio-ccw: fix check for WRITE_FEAT
>   virtio: introduce legacy virtio devices
>   virtio: allow virtio-1 queue layout
>   dataplane: allow virtio-1 devices
>   s390x/virtio-ccw: support virtio-1 set_vq format
>   virtio: disallow late feature changes for virtio-1
>   virtio: allow to fail setting status
>   s390x/virtio-ccw: enable virtio 1.0
>   virtio-net: no writeable mac for virtio-1
>   virtio-net: support longer header
>   virtio-net: enable virtio 1.0
> 
> Thomas Huth (3):
>   linux-headers/virtio_config: Update with VIRTIO_F_VERSION_1
>   s390x/css: Add a callback for when subchannel gets disabled
>   s390x/virtio-ccw: add virtio set-revision call
> 
>  hw/9pfs/virtio-9p-device.c|7 +-
>  hw/block/dataplane/virtio-blk.c   |4 +-
>  hw/block/virtio-blk.c |9 +-
>  hw/char/virtio-serial-bus.c   |9 +-
>  hw/net/virtio-net.c   |   52 +--
>  hw/s390x/css.c|   12 ++
>  hw/s390x/css.h|1 +
>  hw/s390x/s390-virtio-bus.c|9 +-
>  hw/s390x/virtio-ccw.c |  206 
> +++--
>  hw/s390x/virtio-ccw.h |7 +-
>  hw/scsi/vhost-scsi.c  |7 +-
>  hw/scsi/virtio-scsi-dataplane.c   |2 +-
>  hw/scsi/virtio-scsi.c |   10 +-
>  hw/virtio/Makefile.objs   |2 +-
>  hw/virtio/dataplane/Makefile.objs |2 +-
>  hw/virtio/dataplane/vring.c   |   96 ++--
>  hw/virtio/virtio-balloon.c|8 +-
>  hw/virtio/virtio-bus.c|   23 +--
>  hw/virtio/virtio-mmio.c   |9 +-
>  hw/virtio/virtio-pci.c|   13 +-
>  hw/virtio/virtio-rng.c|2 +-
>  hw/virtio/virtio.c|   88 +--
>  include/hw/virtio/dataplane/vring-accessors.h |   75 +
>  include/hw/virtio/dataplane/vring.h   |   14 +-
>  include/hw/virtio/virtio-access.h |4 +
>  include/hw/virtio/virtio-bus.h|   10 +-
>  include/hw/virtio/virtio.h|   39 -
>  linux-headers/linux/virtio_config.h   |3 +
>  28 files changed, 523 insertions(+), 200 deletions(-)
>  create mode 100644 include/hw/virtio/dataplane/vring-accessors.h
> 
> -- 
> 1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: Introduce dynamically registered hypercall capability

2014-11-27 Thread Radim Krčmář

2014-11-27 05:30-0800, Phil White:
> This introduces a list of entries which associate a function pointer of
> kvm_hc_type to a hypercall number and allows the ability to register and
> unregister entries.  In addition, it also allows the ability to retrieve a
> function pointer of kvm_hc_type for a given hypercall number which is meant
> to be called from the arch-specific section.
> 
> The main intent is to allow modules to register hypercalls which they own
> rather than requiring the addition of a stub of some sort.  It will also
> allow each arch to maintain separate lists of hypercalls rather than having
> to respect changes in include/uapi/linux/kvm_para.h
> 
> Signed-off-by: Phil White 
> ---

Apart from other problems,
how are guests going to use these hypercalls?

(If hc_nr is dynamic, a guest doesn't know its number and even if it is
 static, someone could have registered it beforehand => this needs some
 kind of synchronization with host modules.  A hardcoded reservation?)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Cornelia Huck

On Thu, 27 Nov 2014 17:24:22 +0200
"Michael S. Tsirkin"  wrote:

> On Thu, Nov 27, 2014 at 04:16:33PM +0100, Cornelia Huck wrote:
> > Yet another version of the virtio-1 support patches.
> > 
> > This one has seen some (very) light testing with the virtio-1 guest
> > support patches currently on vhost-next.
> > 
> > Changes from v3:
> > 
> > - Add support for FEATURES_OK. We refuse to set features after the
> >   driver has set this in the status field, and we allow to fail
> >   setting the status if the features are inconsistent.
> > - Add missing virtio-1 changes for virtio-net (header size and mac).
> > - Dropped setting the VERSION_1 bit for virtio-blk: There's still
> >   some stuff missing.
> > 
> > For virtio-blk, we need to validate the feature bits if version 1 is
> > negotiated: some legacy features are not allowed in that case. I'm not
> > quite sure how to handle this, though. We could use the new
> > validate_features callback to verify that the driver negotiated a
> > sensible feature set, but that would require us to offer a superset
> > of legacy and version 1 bits, which feels wrong. Any ideas?
> 
> No, that's violating the spec.
> I think the simplest way is to have separate features and
> legacy_features fields.  Present the correct one depending on which
> revision was negotiated.

But revisions are a virtio-ccw only thing - what can other transports
do here? The basic problem is that we decide via a feature bit that
needs to be negotiated which feature bits we want to present. pci and
mmio don't have a way to know whether the driver wants to use 1.0 or
legacy prior to feature negotiation, do they?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 03/16] virtio: support more feature bits

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 04:16:36PM +0100, Cornelia Huck wrote:
> With virtio-1, we support more than 32 feature bits. Let's make
> vdev->guest_features depend on the number of supported feature bits,
> allowing us to grow the feature bits automatically.
> 
> We also need to enhance the internal functions dealing with getting
> and setting features with an additional index field, so that all feature
> bits may be accessed (in chunks of 32 bits).
> 
> vhost and migration have been ignored for now.
> 
> Reviewed-by: Thomas Huth 
> Signed-off-by: Cornelia Huck 
> @@ -117,7 +125,7 @@ struct VirtIODevice
>  uint8_t status;
>  uint8_t isr;
>  uint16_t queue_sel;
> -uint32_t guest_features;
> +uint32_t guest_features[NR_VIRTIO_FEATURE_WORDS];
>  size_t config_len;
>  void *config;
>  uint16_t config_vector;

Ugh.

That's quite tricky to use correctly.
Why don't we just make it uint64_t?

The only real issue is that DEFINE_PROP_BIT wants
a uint32_t.

But that's easy to fix: add DEFINE_PROP64_BIT
that is the same but handles a 64 bit array.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 03/16] virtio: support more feature bits

2014-11-27 Thread Cornelia Huck

On Thu, 27 Nov 2014 17:34:19 +0200
"Michael S. Tsirkin"  wrote:

> On Thu, Nov 27, 2014 at 04:16:36PM +0100, Cornelia Huck wrote:
> > With virtio-1, we support more than 32 feature bits. Let's make
> > vdev->guest_features depend on the number of supported feature bits,
> > allowing us to grow the feature bits automatically.

^ This was one reason why I did it this way...

> > 
> > We also need to enhance the internal functions dealing with getting
> > and setting features with an additional index field, so that all feature
> > bits may be accessed (in chunks of 32 bits).
> > 
> > vhost and migration have been ignored for now.
> > 
> > Reviewed-by: Thomas Huth 
> > Signed-off-by: Cornelia Huck 
> > @@ -117,7 +125,7 @@ struct VirtIODevice
> >  uint8_t status;
> >  uint8_t isr;
> >  uint16_t queue_sel;
> > -uint32_t guest_features;
> > +uint32_t guest_features[NR_VIRTIO_FEATURE_WORDS];
> >  size_t config_len;
> >  void *config;
> >  uint16_t config_vector;
> 
> Ugh.
> 
> That's quite tricky to use correctly.
> Why don't we just make it uint64_t?

...and another one was that at least virtio-ccw reads/writes in chunks
of 32 bits anyway.
> 
> The only real issue is that DEFINE_PROP_BIT wants
> a uint32_t.
> 
> But that's easy to fix: add DEFINE_PROP64_BIT
> that is the same but handles a 64 bit array.
> 
Sure, this would not really be a problem to add. But we'll stand before
the same problem again when we want to grow past 64 bits, won't we?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 04:31:39PM +0100, Cornelia Huck wrote:
> On Thu, 27 Nov 2014 17:24:22 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Thu, Nov 27, 2014 at 04:16:33PM +0100, Cornelia Huck wrote:
> > > Yet another version of the virtio-1 support patches.
> > > 
> > > This one has seen some (very) light testing with the virtio-1 guest
> > > support patches currently on vhost-next.
> > > 
> > > Changes from v3:
> > > 
> > > - Add support for FEATURES_OK. We refuse to set features after the
> > >   driver has set this in the status field, and we allow to fail
> > >   setting the status if the features are inconsistent.
> > > - Add missing virtio-1 changes for virtio-net (header size and mac).
> > > - Dropped setting the VERSION_1 bit for virtio-blk: There's still
> > >   some stuff missing.
> > > 
> > > For virtio-blk, we need to validate the feature bits if version 1 is
> > > negotiated: some legacy features are not allowed in that case. I'm not
> > > quite sure how to handle this, though. We could use the new
> > > validate_features callback to verify that the driver negotiated a
> > > sensible feature set, but that would require us to offer a superset
> > > of legacy and version 1 bits, which feels wrong. Any ideas?
> > 
> > No, that's violating the spec.
> > I think the simplest way is to have separate features and
> > legacy_features fields.  Present the correct one depending on which
> > revision was negotiated.
> 
> But revisions are a virtio-ccw only thing - what can other transports
> do here?

Other transports have different ways to deal with this.
For example virtio pci exposes a legacy header and
a modern header. Legacy header will expose old features,
modern one - new features.

mmio simply does not support transitional devices.
So qemu user will have to specify virtio 1.0 or 0.9 for mmio.

Other transports are out of virtio 1.0 spec so
they just use legacy features.

> The basic problem is that we decide via a feature bit that
> needs to be negotiated which feature bits we want to present.

Consider wce as one example.  This is not needed for modern guests, so
we can just mask it from the modern feature mask.  Consider virtio blk scsi
commands as another example.  This feature is not supported in virtio
1.0, so we must mask it from the modern feature mask.

Seems the same handling works in all cases?


> pci and
> mmio don't have a way to know whether the driver wants to use 1.0 or
> legacy prior to feature negotiation, do they?

pci does. mmio doesn't but it does not want to support transitional
devices.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 03/16] virtio: support more feature bits

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 04:40:29PM +0100, Cornelia Huck wrote:
> On Thu, 27 Nov 2014 17:34:19 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Thu, Nov 27, 2014 at 04:16:36PM +0100, Cornelia Huck wrote:
> > > With virtio-1, we support more than 32 feature bits. Let's make
> > > vdev->guest_features depend on the number of supported feature bits,
> > > allowing us to grow the feature bits automatically.
> 
> ^ This was one reason why I did it this way...

Then use bitmap.h
But I think it's overdesign.


> > > 
> > > We also need to enhance the internal functions dealing with getting
> > > and setting features with an additional index field, so that all feature
> > > bits may be accessed (in chunks of 32 bits).
> > > 
> > > vhost and migration have been ignored for now.
> > > 
> > > Reviewed-by: Thomas Huth 
> > > Signed-off-by: Cornelia Huck 
> > > @@ -117,7 +125,7 @@ struct VirtIODevice
> > >  uint8_t status;
> > >  uint8_t isr;
> > >  uint16_t queue_sel;
> > > -uint32_t guest_features;
> > > +uint32_t guest_features[NR_VIRTIO_FEATURE_WORDS];
> > >  size_t config_len;
> > >  void *config;
> > >  uint16_t config_vector;
> > 
> > Ugh.
> > 
> > That's quite tricky to use correctly.
> > Why don't we just make it uint64_t?
> 
> ...and another one was that at least virtio-ccw reads/writes in chunks
> of 32 bits anyway.

It's quite easy to get at the high 32 bits of
64-bit integers.

> > 
> > The only real issue is that DEFINE_PROP_BIT wants
> > a uint32_t.
> > 
> > But that's easy to fix: add DEFINE_PROP64_BIT
> > that is the same but handles a 64 bit array.
> > 
> Sure, this would not really be a problem to add. But we'll stand before
> the same problem again when we want to grow past 64 bits, won't we?

It will take years till we run out of bits.
We'll handle it then, it's not like it's rocket science.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Cornelia Huck

On Thu, 27 Nov 2014 17:42:11 +0200
"Michael S. Tsirkin"  wrote:

> On Thu, Nov 27, 2014 at 04:31:39PM +0100, Cornelia Huck wrote:
> > On Thu, 27 Nov 2014 17:24:22 +0200
> > "Michael S. Tsirkin"  wrote:
> > 
> > > On Thu, Nov 27, 2014 at 04:16:33PM +0100, Cornelia Huck wrote:
> > > > Yet another version of the virtio-1 support patches.
> > > > 
> > > > This one has seen some (very) light testing with the virtio-1 guest
> > > > support patches currently on vhost-next.
> > > > 
> > > > Changes from v3:
> > > > 
> > > > - Add support for FEATURES_OK. We refuse to set features after the
> > > >   driver has set this in the status field, and we allow to fail
> > > >   setting the status if the features are inconsistent.
> > > > - Add missing virtio-1 changes for virtio-net (header size and mac).
> > > > - Dropped setting the VERSION_1 bit for virtio-blk: There's still
> > > >   some stuff missing.
> > > > 
> > > > For virtio-blk, we need to validate the feature bits if version 1 is
> > > > negotiated: some legacy features are not allowed in that case. I'm not
> > > > quite sure how to handle this, though. We could use the new
> > > > validate_features callback to verify that the driver negotiated a
> > > > sensible feature set, but that would require us to offer a superset
> > > > of legacy and version 1 bits, which feels wrong. Any ideas?
> > > 
> > > No, that's violating the spec.
> > > I think the simplest way is to have separate features and
> > > legacy_features fields.  Present the correct one depending on which
> > > revision was negotiated.
> > 
> > But revisions are a virtio-ccw only thing - what can other transports
> > do here?
> 
> Other transports have different ways to deal with this.
> For example virtio pci exposes a legacy header and
> a modern header. Legacy header will expose old features,
> modern one - new features.
> 
> mmio simply does not support transitional devices.
> So qemu user will have to specify virtio 1.0 or 0.9 for mmio.
> 
> Other transports are out of virtio 1.0 spec so
> they just use legacy features.
> 
> > The basic problem is that we decide via a feature bit that
> > needs to be negotiated which feature bits we want to present.
> 
> Consider wce as one example.  This is not needed for modern guests, so
> we can just mask it from modern feature mask.  Consider virtio blk scsi
> commands as another example.  this feature is not supported in virtio
> 1.0, so we must mask it from modern feature mask.
> 
> Seems the same handling works in all cases?

This was just what I was talking about...

> 
> 
> > pci and
> > mmio don't have a way to know whether the driver wants to use 1.0 or
> > legacy prior to feature negotiation, do they?
> 
> pci does. mmio doesn't but it does not want to support transitional
> devices.
> 

So we should have a per-device callback into the transport layer, say
check_legacy()?

For ccw, this would check for the negotiated revision; for mmio, it
could check a device property configured with the device; and for pci,
whatever the mechanism is there :)

A transport not implementing this callback is simply considered
legacy-only.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 05:06:51PM +0100, Cornelia Huck wrote:
> On Thu, 27 Nov 2014 17:42:11 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Thu, Nov 27, 2014 at 04:31:39PM +0100, Cornelia Huck wrote:
> > > On Thu, 27 Nov 2014 17:24:22 +0200
> > > "Michael S. Tsirkin"  wrote:
> > > 
> > > > On Thu, Nov 27, 2014 at 04:16:33PM +0100, Cornelia Huck wrote:
> > > > > Yet another version of the virtio-1 support patches.
> > > > > 
> > > > > This one has seen some (very) light testing with the virtio-1 guest
> > > > > support patches currently on vhost-next.
> > > > > 
> > > > > Changes from v3:
> > > > > 
> > > > > - Add support for FEATURES_OK. We refuse to set features after the
> > > > >   driver has set this in the status field, and we allow to fail
> > > > >   setting the status if the features are inconsistent.
> > > > > - Add missing virtio-1 changes for virtio-net (header size and mac).
> > > > > - Dropped setting the VERSION_1 bit for virtio-blk: There's still
> > > > >   some stuff missing.
> > > > > 
> > > > > For virtio-blk, we need to validate the feature bits if version 1 is
> > > > > negotiated: some legacy features are not allowed in that case. I'm not
> > > > > quite sure how to handle this, though. We could use the new
> > > > > validate_features callback to verify that the driver negotiated a
> > > > > sensible feature set, but that would require us to offer a superset
> > > > > of legacy and version 1 bits, which feels wrong. Any ideas?
> > > > 
> > > > No, that's violating the spec.
> > > > I think the simplest way is to have separate features and
> > > > legacy_features fields.  Present the correct one depending on which
> > > > revision was negotiated.
> > > 
> > > But revisions are a virtio-ccw only thing - what can other transports
> > > do here?
> > 
> > Other transports have different ways to deal with this.
> > For example virtio pci exposes a legacy header and
> > a modern header. Legacy header will expose old features,
> > modern one - new features.
> > 
> > mmio simply does not support transitional devices.
> > So qemu user will have to specify virtio 1.0 or 0.9 for mmio.
> > 
> > Other transports are out of virtio 1.0 spec so
> > they just use legacy features.
> > 
> > > The basic problem is that we decide via a feature bit that
> > > needs to be negotiated which feature bits we want to present.
> > 
> > Consider wce as one example.  This is not needed for modern guests, so
> > we can just mask it from modern feature mask.  Consider virtio blk scsi
> > commands as another example.  this feature is not supported in virtio
> > 1.0, so we must mask it from modern feature mask.
> > 
> > Seems the same handling works in all cases?
> 
> This was just what I was talking about...
> 
> > 
> > 
> > > pci and
> > > mmio don't have a way to know whether the driver wants to use 1.0 or
> > > legacy prior to feature negotiation, do they?
> > 
> > pci does. mmio doesn't but it does not want to support transitional
> > devices.
> > 
> 
> So we should have a per-device callback into the transport layer, say
> check_legacy()?

I would just have 2 masks: legacy_features and features.

> For ccw, this would check for the negotiated revision; for mmio, it
> could check a device property configured with the device; and for pci,
> whatever the mechanism is there :)
> 
> A transport not implementing this callback is simply considered
> legacy-only.

I dislike callbacks. Let's just give all info to core,
and have it DTRT.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Cornelia Huck

On Thu, 27 Nov 2014 18:18:25 +0200
"Michael S. Tsirkin"  wrote:

> On Thu, Nov 27, 2014 at 05:06:51PM +0100, Cornelia Huck wrote:

> > So we should have a per-device callback into the transport layer, say
> > check_legacy()?
> 
> I would just have 2 masks: legacy_features and features.

But these belong to the device type and the transport just needs to
trigger usage of the right one, right?

> 
> > For ccw, this would check for the negotiated revision; for mmio, it
> > could check a device property configured with the device; and for pci,
> > whatever the mechanism is there :)
> > 
> > A transport not implementing this callback is simply considered
> > legacy-only.
> 
> I dislike callbacks. Let's just give all info to core,
> and have it DTRT.
> 
Have an is_legacy flag in the vdev that is initialized to 1, and
transports can unset it when the revision is negotiated or during init?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 05:28:42PM +0100, Cornelia Huck wrote:
> On Thu, 27 Nov 2014 18:18:25 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Thu, Nov 27, 2014 at 05:06:51PM +0100, Cornelia Huck wrote:
> 
> > > So we should have a per-device callback into the transport layer, say
> > > check_legacy()?
> > 
> > I would just have 2 masks: legacy_features and features.
> 
> But these belong to the device type and the transport just needs to
> trigger usage of the right one, right?

Yes.

> > 
> > > For ccw, this would check for the negotiated revision; for mmio, it
> > > could check a device property configured with the device; and for pci,
> > > whatever the mechanism is there :)
> > > 
> > > A transport not implementing this callback is simply considered
> > > legacy-only.
> > 
> > I dislike callbacks. Let's just give all info to core,
> > and have it DTRT.
> > 
> Have a is_legacy flag in the vdev that is initialized to 1, and
> transports can unset it when the revision is negotiated or during init?

I would say have modern_features, legacy_features, and set host_features
correctly.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC v4 00/16] qemu: towards virtio-1 host support

2014-11-27 Thread Michael S. Tsirkin

On Thu, Nov 27, 2014 at 05:28:42PM +0100, Cornelia Huck wrote:
> On Thu, 27 Nov 2014 18:18:25 +0200
> "Michael S. Tsirkin"  wrote:
> 
> > On Thu, Nov 27, 2014 at 05:06:51PM +0100, Cornelia Huck wrote:
> 
> > > So we should have a per-device callback into the transport layer, say
> > > check_legacy()?
> > 
> > I would just have 2 masks: legacy_features and features.
> 
> But these belong to the device type and the transport just needs to
> trigger usage of the right one, right?
> 
> > 
> > > For ccw, this would check for the negotiated revision; for mmio, it
> > > could check a device property configured with the device; and for pci,
> > > whatever the mechanism is there :)
> > > 
> > > A transport not implementing this callback is simply considered
> > > legacy-only.
> > 
> > I dislike callbacks. Let's just give all info to core,
> > and have it DTRT.
> > 
> Have a is_legacy flag in the vdev that is initialized to 1, and
> transports can unset it when the revision is negotiated or during init?


Also, let's focus on one device, e.g. -net for now.
Then probably virtio scsi.
That's because blk needs to be reworked to support ANY_LAYOUT.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: x86: use MSR_ICR instead of a number

2014-11-27 Thread Paolo Bonzini



On 26/11/2014 17:07, Radim Krčmář wrote:
> 0x830 MSR is 0x300 xAPIC MMIO, which is MSR_ICR.
> 
> Signed-off-by: Radim Krčmář 
> ---
>  This applies on top of Amit's
>[PATCH v2] KVM: x86: Fix reserved x2apic registers
>  in which I noticed this minor deficit.
> 
>  arch/x86/kvm/lapic.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9c90d31..687874f 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1887,7 +1887,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 
> msr, u64 data)
>   return 1;
>  
>   /* if this is ICR write vector before command */
> - if (msr == 0x830)
> + if (reg == APIC_ICR)
>   apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
>   return apic_reg_write(apic, reg, (u32)data);
>  }
> @@ -1908,7 +1908,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, 
> u64 *data)
>  
>   if (apic_reg_read(apic, reg, 4, &low))
>   return 1;
> - if (msr == 0x830)
> + if (reg == APIC_ICR)
>   apic_reg_read(apic, APIC_ICR2, 4, &high);
>  
>   *data = (((u64)high) << 32) | low;
> 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2] KVM: x86: Fix reserved x2apic registers

2014-11-27 Thread Paolo Bonzini



On 26/11/2014 16:56, Nadav Amit wrote:
> x2APIC has no registers for DFR and ICR2 (see Intel SDM 10.12.1.2 "x2APIC
> Register Address Space"). KVM needs to cause #GP on such accesses.
> 
> Fix it (DFR and ICR2 on read, ICR2 on write, DFR already handled on writes).
> 
> Signed-off-by: Nadav Amit 
> 
> ---
> v1->v2: Cause #GP on ICR2 write access (as Radim pointed)
> ---
>  arch/x86/kvm/lapic.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index e0e5642..9c90d31 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1883,6 +1883,9 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 
> msr, u64 data)
>   if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>   return 1;
>  
> + if (reg == APIC_ICR2)
> + return 1;
> +
>   /* if this is ICR write vector before command */
>   if (msr == 0x830)
>   apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
> @@ -1897,6 +1900,12 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 
> msr, u64 *data)
>   if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
>   return 1;
>  
> + if (reg == APIC_DFR || reg == APIC_ICR2) {
> + apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n",
> +reg);
> + return 1;
> + }
> +
>   if (apic_reg_read(apic, reg, 4, &low))
>   return 1;
>   if (msr == 0x830)
> 

Applied, thanks.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 3/6] vfio: introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag

2014-11-27 Thread Antonios Motakis

We introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag to the VFIO dma map call,
and expose its availability via the capability VFIO_DMA_NOEXEC_IOMMU.
This way the user can control whether the XN flag will be set on the
requested mappings. The IOMMU_NOEXEC flag needs to be available for all
the IOMMUs of the container used.

Signed-off-by: Antonios Motakis 
---
 include/uapi/linux/vfio.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 59d516f..9ade02b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -31,6 +31,7 @@ enum vfio_iommu_cap {
   (ex. PCIe NoSnoop stripping) */
VFIO_EEH = 5,   /* Check if EEH is supported */
VFIO_TYPE1_NESTING_IOMMU = 6,   /* Two-stage IOMMU, implies v2  */
+   VFIO_DMA_NOEXEC_IOMMU = 7,
 };
 
 
@@ -396,12 +397,17 @@ struct vfio_iommu_type1_info {
  *
  * Map process virtual addresses to IO virtual addresses using the
  * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * To use the VFIO_DMA_MAP_FLAG_NOEXEC flag, the container must support the
+ * VFIO_DMA_NOEXEC_IOMMU capability. If mappings are created using this flag,
+ * any groups subsequently added to the container must support this capability.
  */
 struct vfio_iommu_type1_dma_map {
__u32   argsz;
__u32   flags;
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)/* readable from device 
*/
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)   /* writable from device */
+#define VFIO_DMA_MAP_FLAG_NOEXEC (1 << 2)  /* not executable from device */
__u64   vaddr;  /* Process virtual address */
__u64   iova;   /* IO virtual address */
__u64   size;   /* Size of mapping (bytes) */
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 1/6] vfio: implement iommu driver capabilities with an enum

2014-11-27 Thread Antonios Motakis

Currently a VFIO driver's IOMMU capabilities are encoded as a series of
numerical defines. Replace this with an enum for future maintainability.

Signed-off-by: Antonios Motakis 
---
 include/uapi/linux/vfio.h | 24 +++-
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 29715d2..59d516f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -19,22 +19,20 @@
 
 /* Kernel & User level defines for VFIO IOCTLs. */
 
-/* Extensions */
-
-#define VFIO_TYPE1_IOMMU   1
-#define VFIO_SPAPR_TCE_IOMMU   2
-#define VFIO_TYPE1v2_IOMMU 3
 /*
- * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping).  This
- * capability is subject to change as groups are added or removed.
+ * Capabilities exposed by the VFIO IOMMU driver. Some capabilities are subject
+ * to change as groups are added or removed.
  */
-#define VFIO_DMA_CC_IOMMU  4
-
-/* Check if EEH is supported */
-#define VFIO_EEH   5
+enum vfio_iommu_cap {
+   VFIO_TYPE1_IOMMU = 1,
+   VFIO_SPAPR_TCE_IOMMU = 2,
+   VFIO_TYPE1v2_IOMMU = 3,
+   VFIO_DMA_CC_IOMMU = 4,  /* IOMMU enforces DMA cache coherence
+  (ex. PCIe NoSnoop stripping) */
+   VFIO_EEH = 5,   /* Check if EEH is supported */
+   VFIO_TYPE1_NESTING_IOMMU = 6,   /* Two-stage IOMMU, implies v2  */
+};
 
-/* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU   6   /* Implies v2 */
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 6/6] vfio: type1: implement the VFIO_DMA_MAP_FLAG_NOEXEC flag

2014-11-27 Thread Antonios Motakis

Some IOMMU drivers, such as the ARM SMMU driver, make available the
IOMMU_NOEXEC flag to set the page tables for a device as XN (execute never).
This affects devices such as the ARM PL330 DMA Controller, which respects
this flag and will refuse to fetch DMA instructions from memory where the
XN flag has been set.

The flag can be used only if all IOMMU domains behind the container support
the IOMMU_NOEXEC flag. Also, if any mappings are created with the flag, any
new domains with devices will have to support it as well.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/vfio_iommu_type1.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bdf7dc9..8ba8a87 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -573,6 +573,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!prot || !size || (size | iova | vaddr) & mask)
return -EINVAL;
 
+   if (map->flags & VFIO_DMA_MAP_FLAG_NOEXEC) {
+   if (!vfio_domains_have_iommu_cap(iommu, IOMMU_CAP_NOEXEC))
+   return -EINVAL;
+   prot |= IOMMU_NOEXEC;
+   }
+
/* Don't allow IOVA or virtual address wrap */
if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
return -EINVAL;
@@ -663,6 +669,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
dma = rb_entry(n, struct vfio_dma, node);
iova = dma->iova;
 
+   /*
+* if any of the mappings to be replayed has the NOEXEC flag
+* set, then the new iommu domain must support it
+*/
+   if ((dma->prot & IOMMU_NOEXEC) &&
+   !(domain->caps & IOMMU_CAP_NOEXEC))
+   return -EINVAL;
+
while (iova < dma->iova + dma->size) {
phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
size_t size;
@@ -759,6 +773,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
domain->caps |= IOMMU_CAP_CACHE_COHERENCY;
 
+   if (iommu_capable(bus, IOMMU_CAP_NOEXEC))
+   domain->caps |= IOMMU_CAP_NOEXEC;
+
/*
 * Try to match an existing compatible domain.  We don't want to
 * preclude an IOMMU driver supporting multiple bus_types and being
@@ -920,6 +937,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return 0;
return vfio_domains_have_iommu_cap(iommu,
  IOMMU_CAP_CACHE_COHERENCY);
+   case VFIO_DMA_NOEXEC_IOMMU:
+   if (!iommu)
+   return 0;
+   return vfio_domains_have_iommu_cap(iommu,
+  IOMMU_CAP_NOEXEC);
default:
return 0;
}
@@ -943,7 +965,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
} else if (cmd == VFIO_IOMMU_MAP_DMA) {
struct vfio_iommu_type1_dma_map map;
uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
-   VFIO_DMA_MAP_FLAG_WRITE;
+   VFIO_DMA_MAP_FLAG_WRITE |
+   VFIO_DMA_MAP_FLAG_NOEXEC;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 4/6] vfio: type1: replace domain wide protection flags with supported capabilities

2014-11-27 Thread Antonios Motakis

For each domain it knows, VFIO_IOMMU_TYPE1 keeps track of a list of
protection flags that it always applies to all mappings in the domain.
This is used for domains that support IOMMU_CAP_CACHE_COHERENCY.

Refactor this slightly, by keeping track instead that a given domain
supports the capability, and applying the IOMMU_CACHE protection flag when
doing the actual DMA mappings.

This will allow us to reuse the behavior for IOMMU_CAP_NOEXEC, which we
also want to keep track of, but without applying it to all domains that
support it unless the user explicitly requests it.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/vfio_iommu_type1.c | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666..c54dab8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -65,7 +65,7 @@ struct vfio_domain {
struct iommu_domain *domain;
struct list_headnext;
struct list_headgroup_list;
-   int prot;   /* IOMMU_CACHE */
+   int caps;
 };
 
 struct vfio_dma {
@@ -486,7 +486,7 @@ static int map_try_harder(struct vfio_domain *domain, 
dma_addr_t iova,
for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
ret = iommu_map(domain->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
-   PAGE_SIZE, prot | domain->prot);
+   PAGE_SIZE, prot);
if (ret)
break;
}
@@ -504,11 +504,16 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, 
dma_addr_t iova,
int ret;
 
list_for_each_entry(d, &iommu->domain_list, next) {
+   int dprot = prot;
+
+   if (d->caps & IOMMU_CAP_CACHE_COHERENCY)
+   dprot |= IOMMU_CACHE;
+
ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
-   npage << PAGE_SHIFT, prot | d->prot);
+   npage << PAGE_SHIFT, dprot);
if (ret) {
if (ret != -EBUSY ||
-   map_try_harder(d, iova, pfn, npage, prot))
+   map_try_harder(d, iova, pfn, npage, dprot))
goto unwind;
}
}
@@ -621,6 +626,10 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
struct vfio_domain *d;
struct rb_node *n;
int ret;
+   int dprot = 0;
+
+   if (domain->caps & IOMMU_CAP_CACHE_COHERENCY)
+   dprot |= IOMMU_CACHE;
 
/* Arbitrarily pick the first domain in the list for lookups */
d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
@@ -654,7 +663,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
size += PAGE_SIZE;
 
ret = iommu_map(domain->domain, iova, phys,
-   size, dma->prot | domain->prot);
+   size, dma->prot | dprot);
if (ret)
return ret;
 
@@ -731,7 +740,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
}
 
if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
-   domain->prot |= IOMMU_CACHE;
+   domain->caps |= IOMMU_CAP_CACHE_COHERENCY;
 
/*
 * Try to match an existing compatible domain.  We don't want to
@@ -742,7 +751,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 */
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
-   d->prot == domain->prot) {
+   d->caps == domain->caps) {
iommu_detach_group(domain->domain, iommu_group);
if (!iommu_attach_group(d->domain, iommu_group)) {
list_add(&group->next, &d->group_list);
@@ -884,7 +893,7 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu 
*iommu)
 
mutex_lock(&iommu->lock);
list_for_each_entry(domain, &iommu->domain_list, next) {
-   if (!(domain->prot & IOMMU_CACHE)) {
+   if (!(domain->caps & IOMMU_CAP_CACHE_COHERENCY)) {
ret = 0;
break;
}
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 5/6] vfio: type1: replace vfio_domains_have_iommu_cache with generic function

2014-11-27 Thread Antonios Motakis

Replace the function vfio_domains_have_iommu_cache() with a more generic
function vfio_domains_have_iommu_cap(), which allows checking all domains
of a vfio_iommu structure for a given cached capability.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/vfio_iommu_type1.c | 37 +++--
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c54dab8..bdf7dc9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -81,6 +81,23 @@ struct vfio_group {
struct list_headnext;
 };
 
+static int vfio_domains_have_iommu_cap(struct vfio_iommu *iommu, int cap)
+{
+   struct vfio_domain *domain;
+   int ret = 1;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(domain, &iommu->domain_list, next) {
+   if (!(domain->caps & cap)) {
+   ret = 0;
+   break;
+   }
+   }
+   mutex_unlock(&iommu->lock);
+
+   return ret;
+}
+
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -886,23 +903,6 @@ static void vfio_iommu_type1_release(void *iommu_data)
kfree(iommu);
 }
 
-static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
-{
-   struct vfio_domain *domain;
-   int ret = 1;
-
-   mutex_lock(&iommu->lock);
-   list_for_each_entry(domain, &iommu->domain_list, next) {
-   if (!(domain->caps & IOMMU_CAP_CACHE_COHERENCY)) {
-   ret = 0;
-   break;
-   }
-   }
-   mutex_unlock(&iommu->lock);
-
-   return ret;
-}
-
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -918,7 +918,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
case VFIO_DMA_CC_IOMMU:
if (!iommu)
return 0;
-   return vfio_domains_have_iommu_cache(iommu);
+   return vfio_domains_have_iommu_cap(iommu,
+ IOMMU_CAP_CACHE_COHERENCY);
default:
return 0;
}
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 2/6] vfio: type1: support for ARM SMMUs

2014-11-27 Thread Antonios Motakis

This allows the VFIO_IOMMU_TYPE1 driver to be used with devices
behind an IOMMU on ARM platforms. The driver can then be used with
an Exynos SMMU, or ARM SMMU driver.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index d8c5763..a0abe04 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -16,7 +16,7 @@ config VFIO_SPAPR_EEH
 menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
-   select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_TYPE1 if X86 || ARM
select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES)
select ANON_INODES
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 01/20] vfio/platform: initial skeleton of VFIO support for platform devices

2014-11-27 Thread Antonios Motakis

This patch forms the common skeleton code for platform devices support
with VFIO. This will include the core functionality of VFIO_PLATFORM,
however binding to the device and discovering the device resources will
be done with the help of a separate file where any Linux platform bus
specific code will reside.

This will allow us to also implement support for discovering AMBA devices
and their resources, while still reusing a large part of the VFIO_PLATFORM
implementation.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c  | 121 ++
 drivers/vfio/platform/vfio_platform_private.h |  36 
 2 files changed, 157 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_platform_common.c
 create mode 100644 drivers/vfio/platform/vfio_platform_private.h

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
new file mode 100644
index 0000000..34d023b
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+static void vfio_platform_release(void *device_data)
+{
+   module_put(THIS_MODULE);
+}
+
+static int vfio_platform_open(void *device_data)
+{
+   if (!try_module_get(THIS_MODULE))
+   return -ENODEV;
+
+   return 0;
+}
+
+static long vfio_platform_ioctl(void *device_data,
+   unsigned int cmd, unsigned long arg)
+{
+   if (cmd == VFIO_DEVICE_GET_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_RESET)
+   return -EINVAL;
+
+   return -ENOTTY;
+}
+
+static ssize_t vfio_platform_read(void *device_data, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+   return -EINVAL;
+}
+
+static ssize_t vfio_platform_write(void *device_data, const char __user *buf,
+  size_t count, loff_t *ppos)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma)
+{
+   return -EINVAL;
+}
+
+static const struct vfio_device_ops vfio_platform_ops = {
+   .name   = "vfio-platform",
+   .open   = vfio_platform_open,
+   .release= vfio_platform_release,
+   .ioctl  = vfio_platform_ioctl,
+   .read   = vfio_platform_read,
+   .write  = vfio_platform_write,
+   .mmap   = vfio_platform_mmap,
+};
+
+int vfio_platform_probe_common(struct vfio_platform_device *vdev,
+  struct device *dev)
+{
+   struct iommu_group *group;
+   int ret;
+
+   if (!vdev)
+   return -EINVAL;
+
+   group = iommu_group_get(dev);
+   if (!group) {
+   pr_err("VFIO: No IOMMU group for device %s\n", vdev->name);
+   return -EINVAL;
+   }
+
+   ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev);
+   if (ret) {
+   iommu_group_put(group);
+   return ret;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_probe_common);
+
+struct vfio_platform_device *vfio_platform_remove_common(struct device *dev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_del_group_dev(dev);
+   if (vdev)
+   iommu_group_put(dev->iommu_group);
+
+   return vdev;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_remove_common);
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
new file mode 100644
index 000..062b92d
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR

[PATCH v10 13/20] vfio/platform: support for level sensitive interrupts

2014-11-27 Thread Antonios Motakis

Level sensitive interrupts are exposed as maskable and automasked
interrupts and are masked and disabled automatically when they fire.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_irq.c | 99 ++-
 drivers/vfio/platform/vfio_platform_private.h |  2 +
 2 files changed, 98 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index 414b4c5..1765b6c 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -23,12 +23,59 @@
 
 #include "vfio_platform_private.h"
 
+static void vfio_platform_mask(struct vfio_platform_irq *irq_ctx)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (!irq_ctx->masked) {
+   disable_irq(irq_ctx->hwirq);
+   irq_ctx->masked = true;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
+}
+
 static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
  unsigned index, unsigned start,
  unsigned count, uint32_t flags,
  void *data)
 {
-   return -EINVAL;
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   return -EINVAL; /* not implemented yet */
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   vfio_platform_mask(&vdev->irqs[index]);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t mask = *(uint8_t *)data;
+
+   if (mask)
+   vfio_platform_mask(&vdev->irqs[index]);
+   }
+
+   return 0;
+}
+
+static void vfio_platform_unmask(struct vfio_platform_irq *irq_ctx)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (irq_ctx->masked) {
+   enable_irq(irq_ctx->hwirq);
+   irq_ctx->masked = false;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
 static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
@@ -36,7 +83,50 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
unsigned count, uint32_t flags,
void *data)
 {
-   return -EINVAL;
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   return -EINVAL; /* not implemented yet */
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   vfio_platform_unmask(&vdev->irqs[index]);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t unmask = *(uint8_t *)data;
+
+   if (unmask)
+   vfio_platform_unmask(&vdev->irqs[index]);
+   }
+
+   return 0;
+}
+
+static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
+{
+   struct vfio_platform_irq *irq_ctx = dev_id;
+   unsigned long flags;
+   int ret = IRQ_NONE;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (!irq_ctx->masked) {
+   ret = IRQ_HANDLED;
+
+   /* automask maskable interrupts */
+   disable_irq_nosync(irq_ctx->hwirq);
+   irq_ctx->masked = true;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
+
+   if (ret == IRQ_HANDLED)
+   eventfd_signal(irq_ctx->trigger, 1);
+
+   return ret;
 }
 
 static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
@@ -97,7 +187,7 @@ static int vfio_platform_set_irq_trigger(struct 
vfio_platform_device *vdev,
irq_handler_t handler;
 
if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED)
-   return -EINVAL; /* not implemented */
+   handler = vfio_automasked_irq_handler;
else
handler = vfio_irq_handler;
 
@@ -169,6 +259,8 @@ int vfio_platform_irq_init(struct vfio_platform_device 
*vdev)
if (hwirq < 0)
goto err;
 
+   spin_lock_init(&vdev->irqs[i].lock);
+
vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
 
if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
@@ -177,6 +269,7 @@ int vfio_platform_irq_init(struct vfio_platform_device 
*vdev)
 
vdev->irqs[i].count = 1;
vdev->irqs[i].hwirq = hwirq;
+   vdev->irqs[i].masked = false;
}
 
vdev->num_irqs = cnt;
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index b705f17..eef6d1b 100644
--- a/drivers/vfio/platform/vfio_platfo

[PATCH v10 12/20] vfio/platform: trigger an interrupt via eventfd

2014-11-27 Thread Antonios Motakis

This patch allows setting an eventfd for a platform device's interrupt,
and also triggering the interrupt eventfd from userspace for testing.
Level-sensitive interrupts are marked as maskable and are handled in
a later patch. Edge-triggered interrupts are not advertised as maskable
and are implemented here using a simple and efficient IRQ handler.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_irq.c | 93 ++-
 drivers/vfio/platform/vfio_platform_private.h |  2 +
 2 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index df5c919..414b4c5 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -39,12 +39,91 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
return -EINVAL;
 }
 
+static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
+{
+   struct vfio_platform_irq *irq_ctx = dev_id;
+
+   eventfd_signal(irq_ctx->trigger, 1);
+
+   return IRQ_HANDLED;
+}
+
+static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
+   int fd, irq_handler_t handler)
+{
+   struct vfio_platform_irq *irq = &vdev->irqs[index];
+   struct eventfd_ctx *trigger;
+   int ret;
+
+   if (irq->trigger) {
+   free_irq(irq->hwirq, irq);
+   kfree(irq->name);
+   eventfd_ctx_put(irq->trigger);
+   irq->trigger = NULL;
+   }
+
+   if (fd < 0) /* Disable only */
+   return 0;
+
+   irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)",
+   irq->hwirq, vdev->name);
+   if (!irq->name)
+   return -ENOMEM;
+
+   trigger = eventfd_ctx_fdget(fd);
+   if (IS_ERR(trigger)) {
+   kfree(irq->name);
+   return PTR_ERR(trigger);
+   }
+
+   irq->trigger = trigger;
+
+   ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
+   if (ret) {
+   kfree(irq->name);
+   eventfd_ctx_put(trigger);
+   irq->trigger = NULL;
+   }
+
+   return ret;
+}
+
 static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
 unsigned index, unsigned start,
 unsigned count, uint32_t flags,
 void *data)
 {
-   return -EINVAL;
+   struct vfio_platform_irq *irq = &vdev->irqs[index];
+   irq_handler_t handler;
+
+   if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED)
+   return -EINVAL; /* not implemented */
+   else
+   handler = vfio_irq_handler;
+
+   if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+   return vfio_set_trigger(vdev, index, -1, handler);
+
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   return vfio_set_trigger(vdev, index, fd, handler);
+   }
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   handler(irq->hwirq, irq);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t trigger = *(uint8_t *)data;
+
+   if (trigger)
+   handler(irq->hwirq, irq);
+   }
+
+   return 0;
 }
 
 int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
@@ -90,7 +169,12 @@ int vfio_platform_irq_init(struct vfio_platform_device 
*vdev)
if (hwirq < 0)
goto err;
 
-   vdev->irqs[i].flags = 0;
+   vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
+
+   if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
+   vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE
+   | VFIO_IRQ_INFO_AUTOMASKED;
+
vdev->irqs[i].count = 1;
vdev->irqs[i].hwirq = hwirq;
}
@@ -105,6 +189,11 @@ err:
 
 void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
 {
+   int i;
+
+   for (i = 0; i < vdev->num_irqs; i++)
+   vfio_set_trigger(vdev, i, -1, NULL);
+
vdev->num_irqs = 0;
kfree(vdev->irqs);
 }
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index 61f3ed4..b705f17 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -28,6 +28,8 @@ struct vfio_platform_irq {
u32 flags;
u32 count;
int hwirq;
+   char*name;
+   struct eventfd_ctx  *trigger;
 };
 
 struct vfio_platform_region {
-- 
2.1.3

--
To unsubscribe from this list:

[PATCH v10 09/20] vfio/platform: support MMAP of MMIO regions

2014-11-27 Thread Antonios Motakis

Allow the MMIO regions of the device to be memory mapped so userspace can
directly access them. PIO regions are not being handled at this point.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c | 65 
 1 file changed, 65 insertions(+)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index fda4c30..6bf78ee 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -54,6 +54,16 @@ static int vfio_platform_regions_init(struct 
vfio_platform_device *vdev)
if (!(res->flags & IORESOURCE_READONLY))
vdev->regions[i].flags |=
VFIO_REGION_INFO_FLAG_WRITE;
+
+   /*
+* Only regions addressed with PAGE granularity may be
+* MMAPed securely.
+*/
+   if (!(vdev->regions[i].addr & ~PAGE_MASK) &&
+   !(vdev->regions[i].size & ~PAGE_MASK))
+   vdev->regions[i].flags |=
+   VFIO_REGION_INFO_FLAG_MMAP;
+
break;
case IORESOURCE_IO:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
@@ -333,8 +343,63 @@ static ssize_t vfio_platform_write(void *device_data, 
const char __user *buf,
return -EINVAL;
 }
 
+static int vfio_platform_mmap_mmio(struct vfio_platform_region region,
+  struct vm_area_struct *vma)
+{
+   u64 req_len, pgoff, req_start;
+
+   req_len = vma->vm_end - vma->vm_start;
+   pgoff = vma->vm_pgoff &
+   ((1U << (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+   req_start = pgoff << PAGE_SHIFT;
+
+   if (region.size < PAGE_SIZE || req_start + req_len > region.size)
+   return -EINVAL;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+   vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+
+   return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+  req_len, vma->vm_page_prot);
+}
+
 static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma)
 {
+   struct vfio_platform_device *vdev = device_data;
+   unsigned int index;
+
+   index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT);
+
+   if (vma->vm_end < vma->vm_start)
+   return -EINVAL;
+   if (!(vma->vm_flags & VM_SHARED))
+   return -EINVAL;
+   if (index >= vdev->num_regions)
+   return -EINVAL;
+   if (vma->vm_start & ~PAGE_MASK)
+   return -EINVAL;
+   if (vma->vm_end & ~PAGE_MASK)
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)
+   && (vma->vm_flags & VM_READ))
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)
+   && (vma->vm_flags & VM_WRITE))
+   return -EINVAL;
+
+   vma->vm_private_data = vdev;
+
+   if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+   return vfio_platform_mmap_mmio(vdev->regions[index], vma);
+
+   else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+   return -EINVAL; /* not implemented */
+
return -EINVAL;
 }
 
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 17/20] vfio: pass an opaque pointer on virqfd initialization

2014-11-27 Thread Antonios Motakis

VFIO_PCI passes the VFIO device structure *vdev via eventfd to the handler
that implements masking/unmasking of IRQs via an eventfd. We can replace
it in the virqfd infrastructure with an opaque type so we can make use
of the mechanism from other VFIO bus drivers.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 61505c1c..86bdcd6 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -31,10 +31,10 @@
  * IRQfd - generic
  */
 struct virqfd {
-   struct vfio_pci_device  *vdev;
+   void*opaque;
struct eventfd_ctx  *eventfd;
-   int (*handler)(struct vfio_pci_device *, void *);
-   void(*thread)(struct vfio_pci_device *, void *);
+   int (*handler)(void *, void *);
+   void(*thread)(void *, void *);
void*data;
struct work_struct  inject;
wait_queue_twait;
@@ -74,7 +74,7 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, 
int sync, void *key)
if (flags & POLLIN) {
/* An event has been signaled, call function */
if ((!virqfd->handler ||
-virqfd->handler(virqfd->vdev, virqfd->data)) &&
+virqfd->handler(virqfd->opaque, virqfd->data)) &&
virqfd->thread)
schedule_work(&virqfd->inject);
}
@@ -124,12 +124,12 @@ static void virqfd_inject(struct work_struct *work)
 {
struct virqfd *virqfd = container_of(work, struct virqfd, inject);
if (virqfd->thread)
-   virqfd->thread(virqfd->vdev, virqfd->data);
+   virqfd->thread(virqfd->opaque, virqfd->data);
 }
 
-int vfio_virqfd_enable(struct vfio_pci_device *vdev,
-  int (*handler)(struct vfio_pci_device *, void *),
-  void (*thread)(struct vfio_pci_device *, void *),
+int vfio_virqfd_enable(void *opaque,
+  int (*handler)(void *, void *),
+  void (*thread)(void *, void *),
   void *data, struct virqfd **pvirqfd, int fd)
 {
struct fd irqfd;
@@ -143,7 +143,7 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
return -ENOMEM;
 
virqfd->pvirqfd = pvirqfd;
-   virqfd->vdev = vdev;
+   virqfd->opaque = opaque;
virqfd->handler = handler;
virqfd->thread = thread;
virqfd->data = data;
@@ -196,7 +196,7 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
 * before we registered and trigger it as if we didn't miss it.
 */
if (events & POLLIN) {
-   if ((!handler || handler(vdev, data)) && thread)
+   if ((!handler || handler(opaque, data)) && thread)
schedule_work(&virqfd->inject);
}
 
@@ -243,8 +243,10 @@ EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
 /*
  * INTx
  */
-static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
+static void vfio_send_intx_eventfd(void *opaque, void *unused)
 {
+   struct vfio_pci_device *vdev = opaque;
+
if (likely(is_intx(vdev) && !vdev->virq_disabled))
eventfd_signal(vdev->ctx[0].trigger, 1);
 }
@@ -287,9 +289,9 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
  * a signal is necessary, which can then be handled via a work queue
  * or directly depending on the caller.
  */
-static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
-   void *unused)
+static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
 {
+   struct vfio_pci_device *vdev = opaque;
struct pci_dev *pdev = vdev->pdev;
unsigned long flags;
int ret = 0;
@@ -641,7 +643,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device 
*vdev,
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
if (fd >= 0)
-   return vfio_virqfd_enable(vdev,
+   return vfio_virqfd_enable((void *) vdev,
  vfio_pci_intx_unmask_handler,
  vfio_send_intx_eventfd, NULL,
  &vdev->ctx[0].unmask, fd);
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 20/20] vfio/platform: implement IRQ masking/unmasking via an eventfd

2014-11-27 Thread Antonios Motakis

With this patch the VFIO user will be able to set an eventfd that can be
used in order to mask and unmask IRQs of platform devices.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_irq.c | 47 ---
 drivers/vfio/platform/vfio_platform_private.h |  2 ++
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index 1765b6c..0e90365 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -37,6 +37,15 @@ static void vfio_platform_mask(struct vfio_platform_irq 
*irq_ctx)
spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
+static int vfio_platform_mask_handler(void *opaque, void *unused)
+{
+   struct vfio_platform_irq *irq_ctx = opaque;
+
+   vfio_platform_mask(irq_ctx);
+
+   return 0;
+}
+
 static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
  unsigned index, unsigned start,
  unsigned count, uint32_t flags,
@@ -48,8 +57,18 @@ static int vfio_platform_set_irq_mask(struct 
vfio_platform_device *vdev,
if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
return -EINVAL;
 
-   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
-   return -EINVAL; /* not implemented yet */
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   if (fd >= 0)
+   return vfio_virqfd_enable((void *) &vdev->irqs[index],
+ vfio_platform_mask_handler,
+ NULL, NULL,
+ &vdev->irqs[index].mask, fd);
+
+   vfio_virqfd_disable(&vdev->irqs[index].mask);
+   return 0;
+   }
 
if (flags & VFIO_IRQ_SET_DATA_NONE) {
vfio_platform_mask(&vdev->irqs[index]);
@@ -78,6 +97,15 @@ static void vfio_platform_unmask(struct vfio_platform_irq 
*irq_ctx)
spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
+static int vfio_platform_unmask_handler(void *opaque, void *unused)
+{
+   struct vfio_platform_irq *irq_ctx = opaque;
+
+   vfio_platform_unmask(irq_ctx);
+
+   return 0;
+}
+
 static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags,
@@ -89,8 +117,19 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
return -EINVAL;
 
-   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
-   return -EINVAL; /* not implemented yet */
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   if (fd >= 0)
+   return vfio_virqfd_enable((void *) &vdev->irqs[index],
+ vfio_platform_unmask_handler,
+ NULL, NULL,
+ &vdev->irqs[index].unmask,
+ fd);
+
+   vfio_virqfd_disable(&vdev->irqs[index].unmask);
+   return 0;
+   }
 
if (flags & VFIO_IRQ_SET_DATA_NONE) {
vfio_platform_unmask(&vdev->irqs[index]);
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index eef6d1b..c3d5b4b 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -32,6 +32,8 @@ struct vfio_platform_irq {
struct eventfd_ctx  *trigger;
boolmasked;
spinlock_t  lock;
+   struct virqfd   *unmask;
+   struct virqfd   *mask;
 };
 
 struct vfio_platform_region {
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 18/20] vfio: move eventfd support code for VFIO_PCI to a separate file

2014-11-27 Thread Antonios Motakis

The virqfd functionality that is used by VFIO_PCI to implement interrupt
masking and unmasking via an eventfd is generic enough to be reused
by another driver. Move it to a separate file in order to allow the code
to be shared.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/pci/Makefile   |   3 +-
 drivers/vfio/pci/vfio_pci_intrs.c   | 215 
 drivers/vfio/pci/vfio_pci_private.h |   3 -
 drivers/vfio/virqfd.c   | 213 +++
 include/linux/vfio.h|  27 +
 5 files changed, 242 insertions(+), 219 deletions(-)
 create mode 100644 drivers/vfio/virqfd.c

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 1310792..c7c8644 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,4 +1,5 @@
 
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o \
+ ../virqfd.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 86bdcd6..b56b719 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -19,228 +19,13 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
 #include 
 
 #include "vfio_pci_private.h"
 
 /*
- * IRQfd - generic
- */
-struct virqfd {
-   void*opaque;
-   struct eventfd_ctx  *eventfd;
-   int (*handler)(void *, void *);
-   void(*thread)(void *, void *);
-   void*data;
-   struct work_struct  inject;
-   wait_queue_twait;
-   poll_table  pt;
-   struct work_struct  shutdown;
-   struct virqfd   **pvirqfd;
-};
-
-static struct workqueue_struct *vfio_irqfd_cleanup_wq;
-DEFINE_SPINLOCK(virqfd_lock);
-
-int __init vfio_virqfd_init(void)
-{
-   vfio_irqfd_cleanup_wq =
-   create_singlethread_workqueue("vfio-irqfd-cleanup");
-   if (!vfio_irqfd_cleanup_wq)
-   return -ENOMEM;
-
-   return 0;
-}
-
-void vfio_virqfd_exit(void)
-{
-   destroy_workqueue(vfio_irqfd_cleanup_wq);
-}
-
-static void virqfd_deactivate(struct virqfd *virqfd)
-{
-   queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
-}
-
-static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void 
*key)
-{
-   struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
-   unsigned long flags = (unsigned long)key;
-
-   if (flags & POLLIN) {
-   /* An event has been signaled, call function */
-   if ((!virqfd->handler ||
-virqfd->handler(virqfd->opaque, virqfd->data)) &&
-   virqfd->thread)
-   schedule_work(&virqfd->inject);
-   }
-
-   if (flags & POLLHUP) {
-   unsigned long flags;
-   spin_lock_irqsave(&virqfd_lock, flags);
-
-   /*
-* The eventfd is closing, if the virqfd has not yet been
-* queued for release, as determined by testing whether the
-* virqfd pointer to it is still valid, queue it now.  As
-* with kvm irqfds, we know we won't race against the virqfd
-* going away because we hold the lock to get here.
-*/
-   if (*(virqfd->pvirqfd) == virqfd) {
-   *(virqfd->pvirqfd) = NULL;
-   virqfd_deactivate(virqfd);
-   }
-
-   spin_unlock_irqrestore(&virqfd_lock, flags);
-   }
-
-   return 0;
-}
-
-static void virqfd_ptable_queue_proc(struct file *file,
-wait_queue_head_t *wqh, poll_table *pt)
-{
-   struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
-   add_wait_queue(wqh, &virqfd->wait);
-}
-
-static void virqfd_shutdown(struct work_struct *work)
-{
-   struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
-   u64 cnt;
-
-   eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
-   flush_work(&virqfd->inject);
-   eventfd_ctx_put(virqfd->eventfd);
-
-   kfree(virqfd);
-}
-
-static void virqfd_inject(struct work_struct *work)
-{
-   struct virqfd *virqfd = container_of(work, struct virqfd, inject);
-   if (virqfd->thread)
-   virqfd->thread(virqfd->opaque, virqfd->data);
-}
-
-int vfio_virqfd_enable(void *opaque,
-  int (*handler)(void *, void *),
-  void (*thread)(void *, void *),
-  void *data, struct virqfd **pvirqfd, int fd)
-{
-   struct fd irqfd;
-   struct eventfd_ctx *ctx;
-   struct virqfd *virqfd;
-   int ret = 0;
-   unsigned int events;
-
-   virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
-

[PATCH v10 16/20] vfio: add local lock for virqfd instead of depending on VFIO PCI

2014-11-27 Thread Antonios Motakis

The Virqfd code needs to keep accesses to any struct *virqfd safe, but
this comes into play only when creating or destroying eventfds, so sharing
the same spinlock with the VFIO bus driver is not necessary.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 31 ---
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index d178496..61505c1c 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -44,6 +44,7 @@ struct virqfd {
 };
 
 static struct workqueue_struct *vfio_irqfd_cleanup_wq;
+DEFINE_SPINLOCK(virqfd_lock);
 
 int __init vfio_virqfd_init(void)
 {
@@ -80,21 +81,21 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, 
int sync, void *key)
 
if (flags & POLLHUP) {
unsigned long flags;
-   spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
+   spin_lock_irqsave(&virqfd_lock, flags);
 
/*
 * The eventfd is closing, if the virqfd has not yet been
 * queued for release, as determined by testing whether the
-* vdev pointer to it is still valid, queue it now.  As
+* virqfd pointer to it is still valid, queue it now.  As
 * with kvm irqfds, we know we won't race against the virqfd
-* going away because we hold wqh->lock to get here.
+* going away because we hold the lock to get here.
 */
if (*(virqfd->pvirqfd) == virqfd) {
*(virqfd->pvirqfd) = NULL;
virqfd_deactivate(virqfd);
}
 
-   spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
+   spin_unlock_irqrestore(&virqfd_lock, flags);
}
 
return 0;
@@ -170,16 +171,16 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
 * we update the pointer to the virqfd under lock to avoid
 * pushing multiple jobs to release the same virqfd.
 */
-   spin_lock_irq(&vdev->irqlock);
+   spin_lock_irq(&virqfd_lock);
 
if (*pvirqfd) {
-   spin_unlock_irq(&vdev->irqlock);
+   spin_unlock_irq(&virqfd_lock);
ret = -EBUSY;
goto err_busy;
}
*pvirqfd = virqfd;
 
-   spin_unlock_irq(&vdev->irqlock);
+   spin_unlock_irq(&virqfd_lock);
 
/*
 * Install our own custom wake-up handling so we are notified via
@@ -217,18 +218,18 @@ err_fd:
 }
 EXPORT_SYMBOL_GPL(vfio_virqfd_enable);
 
-void vfio_virqfd_disable(struct vfio_pci_device *vdev, struct virqfd **pvirqfd)
+void vfio_virqfd_disable(struct virqfd **pvirqfd)
 {
unsigned long flags;
 
-   spin_lock_irqsave(&vdev->irqlock, flags);
+   spin_lock_irqsave(&virqfd_lock, flags);
 
if (*pvirqfd) {
virqfd_deactivate(*pvirqfd);
*pvirqfd = NULL;
}
 
-   spin_unlock_irqrestore(&vdev->irqlock, flags);
+   spin_unlock_irqrestore(&virqfd_lock, flags);
 
/*
 * Block until we know all outstanding shutdown jobs have completed.
@@ -441,8 +442,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device 
*vdev, int fd)
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
vfio_intx_set_signal(vdev, -1);
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].mask);
+   vfio_virqfd_disable(&vdev->ctx[0].unmask);
+   vfio_virqfd_disable(&vdev->ctx[0].mask);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
@@ -606,8 +607,8 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, 
bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
 
for (i = 0; i < vdev->num_ctx; i++) {
-   vfio_virqfd_disable(vdev, &vdev->ctx[i].unmask);
-   vfio_virqfd_disable(vdev, &vdev->ctx[i].mask);
+   vfio_virqfd_disable(&vdev->ctx[i].unmask);
+   vfio_virqfd_disable(&vdev->ctx[i].mask);
}
 
if (msix) {
@@ -645,7 +646,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device 
*vdev,
  vfio_send_intx_eventfd, NULL,
  &vdev->ctx[0].unmask, fd);
 
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(&vdev->ctx[0].unmask);
}
 
return 0;
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 19/20] vfio: initialize the virqfd workqueue in VFIO generic code

2014-11-27 Thread Antonios Motakis

Now we have finally completely decoupled virqfd from VFIO_PCI. We can
initialize it from the VFIO generic code, in order to safely use it from
multiple independent VFIO bus drivers.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/Makefile   | 4 +++-
 drivers/vfio/pci/Makefile   | 3 +--
 drivers/vfio/pci/vfio_pci.c | 8 
 drivers/vfio/vfio.c | 8 
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index dadf0ca..d798b09 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,6 @@
-obj-$(CONFIG_VFIO) += vfio.o
+vfio_core-y := vfio.o virqfd.o
+
+obj-$(CONFIG_VFIO) += vfio_core.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index c7c8644..1310792 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,4 @@
 
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o \
- ../virqfd.o
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index fc4308c..8d156d7 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1012,7 +1012,6 @@ put_devs:
 static void __exit vfio_pci_cleanup(void)
 {
pci_unregister_driver(&vfio_pci_driver);
-   vfio_virqfd_exit();
vfio_pci_uninit_perm_bits();
 }
 
@@ -1025,11 +1024,6 @@ static int __init vfio_pci_init(void)
if (ret)
return ret;
 
-   /* Start the virqfd cleanup handler */
-   ret = vfio_virqfd_init();
-   if (ret)
-   goto out_virqfd;
-
/* Register and scan for devices */
ret = pci_register_driver(&vfio_pci_driver);
if (ret)
@@ -1038,8 +1032,6 @@ static int __init vfio_pci_init(void)
return 0;
 
 out_driver:
-   vfio_virqfd_exit();
-out_virqfd:
vfio_pci_uninit_perm_bits();
return ret;
 }
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f018d8d..8e84471 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1464,6 +1464,11 @@ static int __init vfio_init(void)
if (ret)
goto err_cdev_add;
 
+   /* Start the virqfd cleanup handler used by some VFIO bus drivers */
+   ret = vfio_virqfd_init();
+   if (ret)
+   goto err_virqfd;
+
pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
/*
@@ -1476,6 +1481,8 @@ static int __init vfio_init(void)
 
return 0;
 
+err_virqfd:
+   cdev_del(&vfio.group_cdev);
 err_cdev_add:
unregister_chrdev_region(vfio.group_devt, MINORMASK);
 err_alloc_chrdev:
@@ -1490,6 +1497,7 @@ static void __exit vfio_cleanup(void)
 {
WARN_ON(!list_empty(&vfio.group_list));
 
+   vfio_virqfd_exit();
idr_destroy(&vfio.group_idr);
cdev_del(&vfio.group_cdev);
unregister_chrdev_region(vfio.group_devt, MINORMASK);
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 15/20] vfio: virqfd: rename vfio_pci_virqfd_init and vfio_pci_virqfd_exit

2014-11-27 Thread Antonios Motakis

The functions vfio_pci_virqfd_init and vfio_pci_virqfd_exit are not really
PCI-specific, since we plan to reuse the virqfd code with more VFIO drivers
in addition to VFIO_PCI. Rename them to vfio_virqfd_init and vfio_virqfd_exit.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/pci/vfio_pci.c   | 6 +++---
 drivers/vfio/pci/vfio_pci_intrs.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 9558da3..fc4308c 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1012,7 +1012,7 @@ put_devs:
 static void __exit vfio_pci_cleanup(void)
 {
pci_unregister_driver(&vfio_pci_driver);
-   vfio_pci_virqfd_exit();
+   vfio_virqfd_exit();
vfio_pci_uninit_perm_bits();
 }
 
@@ -1026,7 +1026,7 @@ static int __init vfio_pci_init(void)
return ret;
 
/* Start the virqfd cleanup handler */
-   ret = vfio_pci_virqfd_init();
+   ret = vfio_virqfd_init();
if (ret)
goto out_virqfd;
 
@@ -1038,7 +1038,7 @@ static int __init vfio_pci_init(void)
return 0;
 
 out_driver:
-   vfio_pci_virqfd_exit();
+   vfio_virqfd_exit();
 out_virqfd:
vfio_pci_uninit_perm_bits();
return ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 3aaac47..d178496 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -45,7 +45,7 @@ struct virqfd {
 
 static struct workqueue_struct *vfio_irqfd_cleanup_wq;
 
-int __init vfio_pci_virqfd_init(void)
+int __init vfio_virqfd_init(void)
 {
vfio_irqfd_cleanup_wq =
create_singlethread_workqueue("vfio-irqfd-cleanup");
@@ -55,7 +55,7 @@ int __init vfio_pci_virqfd_init(void)
return 0;
 }
 
-void vfio_pci_virqfd_exit(void)
+void vfio_virqfd_exit(void)
 {
destroy_workqueue(vfio_irqfd_cleanup_wq);
 }
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 14/20] vfio: add a vfio_ prefix to virqfd_enable and virqfd_disable and export

2014-11-27 Thread Antonios Motakis

We want to reuse virqfd functionality in multiple VFIO drivers; before
moving these functions to core VFIO, add the vfio_ prefix to the
virqfd_enable and virqfd_disable functions, and export them so they can
be used from other modules.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/pci/vfio_pci_intrs.c   | 30 --
 drivers/vfio/pci/vfio_pci_private.h |  4 ++--
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 553212f..3aaac47 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -126,10 +126,10 @@ static void virqfd_inject(struct work_struct *work)
virqfd->thread(virqfd->vdev, virqfd->data);
 }
 
-static int virqfd_enable(struct vfio_pci_device *vdev,
-int (*handler)(struct vfio_pci_device *, void *),
-void (*thread)(struct vfio_pci_device *, void *),
-void *data, struct virqfd **pvirqfd, int fd)
+int vfio_virqfd_enable(struct vfio_pci_device *vdev,
+  int (*handler)(struct vfio_pci_device *, void *),
+  void (*thread)(struct vfio_pci_device *, void *),
+  void *data, struct virqfd **pvirqfd, int fd)
 {
struct fd irqfd;
struct eventfd_ctx *ctx;
@@ -215,9 +215,9 @@ err_fd:
 
return ret;
 }
+EXPORT_SYMBOL_GPL(vfio_virqfd_enable);
 
-static void virqfd_disable(struct vfio_pci_device *vdev,
-  struct virqfd **pvirqfd)
+void vfio_virqfd_disable(struct vfio_pci_device *vdev, struct virqfd **pvirqfd)
 {
unsigned long flags;
 
@@ -237,6 +237,7 @@ static void virqfd_disable(struct vfio_pci_device *vdev,
 */
flush_workqueue(vfio_irqfd_cleanup_wq);
 }
+EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
 
 /*
  * INTx
@@ -440,8 +441,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device 
*vdev, int fd)
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
vfio_intx_set_signal(vdev, -1);
-   virqfd_disable(vdev, &vdev->ctx[0].unmask);
-   virqfd_disable(vdev, &vdev->ctx[0].mask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].mask);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
@@ -605,8 +606,8 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, 
bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
 
for (i = 0; i < vdev->num_ctx; i++) {
-   virqfd_disable(vdev, &vdev->ctx[i].unmask);
-   virqfd_disable(vdev, &vdev->ctx[i].mask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[i].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[i].mask);
}
 
if (msix) {
@@ -639,11 +640,12 @@ static int vfio_pci_set_intx_unmask(struct 
vfio_pci_device *vdev,
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
if (fd >= 0)
-   return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
-vfio_send_intx_eventfd, NULL,
-&vdev->ctx[0].unmask, fd);
+   return vfio_virqfd_enable(vdev,
+ vfio_pci_intx_unmask_handler,
+ vfio_send_intx_eventfd, NULL,
+ &vdev->ctx[0].unmask, fd);
 
-   virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
}
 
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 671c17a..2e2f0ea 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -86,8 +86,8 @@ extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, 
char __user *buf,
 extern int vfio_pci_init_perm_bits(void);
 extern void vfio_pci_uninit_perm_bits(void);
 
-extern int vfio_pci_virqfd_init(void);
-extern void vfio_pci_virqfd_exit(void);
+extern int vfio_virqfd_init(void);
+extern void vfio_virqfd_exit(void);
 
 extern int vfio_config_init(struct vfio_pci_device *vdev);
 extern void vfio_config_free(struct vfio_pci_device *vdev);
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 10/20] vfio/platform: return IRQ info

2014-11-27 Thread Antonios Motakis

Return information for the interrupts exposed by the device.
This patch extends VFIO_DEVICE_GET_INFO with the number of IRQs
and enables VFIO_DEVICE_GET_IRQ_INFO.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/Makefile|  2 +-
 drivers/vfio/platform/vfio_platform_common.c  | 31 +---
 drivers/vfio/platform/vfio_platform_irq.c | 51 +++
 drivers/vfio/platform/vfio_platform_private.h | 10 ++
 4 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/platform/vfio_platform_irq.c

diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
index 1957170..81de144 100644
--- a/drivers/vfio/platform/Makefile
+++ b/drivers/vfio/platform/Makefile
@@ -1,5 +1,5 @@
 
-vfio-platform-y := vfio_platform.o vfio_platform_common.o
+vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o
 
 obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
 
diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 6bf78ee..cf7bb08 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -100,6 +100,7 @@ static void vfio_platform_release(void *device_data)
 
if (!(--vdev->refcnt)) {
vfio_platform_regions_cleanup(vdev);
+   vfio_platform_irq_cleanup(vdev);
}
 
mutex_unlock(&driver_lock);
@@ -121,6 +122,10 @@ static int vfio_platform_open(void *device_data)
ret = vfio_platform_regions_init(vdev);
if (ret)
goto err_reg;
+
+   ret = vfio_platform_irq_init(vdev);
+   if (ret)
+   goto err_irq;
}
 
vdev->refcnt++;
@@ -128,6 +133,8 @@ static int vfio_platform_open(void *device_data)
mutex_unlock(&driver_lock);
return 0;
 
+err_irq:
+   vfio_platform_regions_cleanup(vdev);
 err_reg:
mutex_unlock(&driver_lock);
module_put(THIS_MODULE);
@@ -153,7 +160,7 @@ static long vfio_platform_ioctl(void *device_data,
 
info.flags = vdev->flags;
info.num_regions = vdev->num_regions;
-   info.num_irqs = 0;
+   info.num_irqs = vdev->num_irqs;
 
return copy_to_user((void __user *)arg, &info, minsz);
 
@@ -178,10 +185,26 @@ static long vfio_platform_ioctl(void *device_data,
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+   struct vfio_irq_info info;
+
+   minsz = offsetofend(struct vfio_irq_info, count);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   if (info.index >= vdev->num_irqs)
+   return -EINVAL;
+
+   info.flags = vdev->irqs[info.index].flags;
+   info.count = vdev->irqs[info.index].count;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   } else if (cmd == VFIO_DEVICE_SET_IRQS)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_RESET)
diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
new file mode 100644
index 000..c6c3ec1
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -0,0 +1,51 @@
+/*
+ * VFIO platform devices interrupt handling
+ *
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+int vfio_platform_irq_init(struct vfio_platform_device *vdev)
+{
+   int cnt = 0, i;
+
+   while (vdev->get_irq(vdev, cnt) >= 0)
+   cnt++;
+
+   vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq), GFP_KERNEL);
+   if (!vdev->irqs)
+   return -ENOMEM;
+
+   for (i = 0; i < cnt; i++) {
+   vdev->irqs[i].flags = 0;
+   vdev->irqs[i].count = 1;
+   }
+
+   vdev->num_irqs = cnt;
+
+   return 0;
+}
+
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
+{
+   vdev->num_irqs = 0;
+   kfree(vdev->irqs);
+}
diff --git a/drivers/vfio/pl

[PATCH v10 11/20] vfio/platform: initial interrupts support code

2014-11-27 Thread Antonios Motakis

This patch is a skeleton for the VFIO_DEVICE_SET_IRQS IOCTL, around which
most IRQ functionality is implemented in VFIO.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c  | 52 +--
 drivers/vfio/platform/vfio_platform_irq.c | 59 +++
 drivers/vfio/platform/vfio_platform_private.h |  7 
 3 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index cf7bb08..a532a25 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -204,10 +204,54 @@ static long vfio_platform_ioctl(void *device_data,
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_SET_IRQS)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_SET_IRQS) {
+   struct vfio_irq_set hdr;
+   u8 *data = NULL;
+   int ret = 0;
+
+   minsz = offsetofend(struct vfio_irq_set, count);
+
+   if (copy_from_user(&hdr, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (hdr.argsz < minsz)
+   return -EINVAL;
+
+   if (hdr.index >= vdev->num_irqs)
+   return -EINVAL;
+
+   if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+ VFIO_IRQ_SET_ACTION_TYPE_MASK))
+   return -EINVAL;
 
-   else if (cmd == VFIO_DEVICE_RESET)
+   if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+   size_t size;
+
+   if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+   size = sizeof(uint8_t);
+   else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   size = sizeof(int32_t);
+   else
+   return -EINVAL;
+
+   if (hdr.argsz - minsz < size)
+   return -EINVAL;
+
+   data = memdup_user((void __user *)(arg + minsz), size);
+   if (IS_ERR(data))
+   return PTR_ERR(data);
+   }
+
+   mutex_lock(&vdev->igate);
+
+   ret = vfio_platform_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+  hdr.start, hdr.count, data);
+   mutex_unlock(&vdev->igate);
+   kfree(data);
+
+   return ret;
+
+   } else if (cmd == VFIO_DEVICE_RESET)
return -EINVAL;
 
return -ENOTTY;
@@ -457,6 +501,8 @@ int vfio_platform_probe_common(struct vfio_platform_device 
*vdev,
return ret;
}
 
+   mutex_init(&vdev->igate);
+
return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_platform_probe_common);
diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index c6c3ec1..df5c919 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -23,6 +23,56 @@
 
 #include "vfio_platform_private.h"
 
+static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags,
+ void *data)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags,
+   void *data)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
+unsigned index, unsigned start,
+unsigned count, uint32_t flags,
+void *data)
+{
+   return -EINVAL;
+}
+
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+uint32_t flags, unsigned index, unsigned start,
+unsigned count, void *data)
+{
+   int (*func)(struct vfio_platform_device *vdev, unsigned index,
+   unsigned start, unsigned count, uint32_t flags,
+   void *data) = NULL;
+
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_MASK:
+   func = vfio_platform_set_irq_mask;
+   break;
+   case VFIO_IRQ_SET_ACTION_UNMASK:
+   func = vfio_platform_set_irq_unmask;
+   break;
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   func = vfio_platform_set_irq_trigger;
+   break;
+   }
+
+   if (

[PATCH v10 06/20] vfio/platform: return info for bound device

2014-11-27 Thread Antonios Motakis

A VFIO userspace driver will start by opening the VFIO device
that corresponds to an IOMMU group, and will use the ioctl interface
to get the basic device info, such as number of memory regions and
interrupts, and their properties. This patch enables the
VFIO_DEVICE_GET_INFO ioctl call.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 34d023b..862b43b 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -38,10 +38,27 @@ static int vfio_platform_open(void *device_data)
 static long vfio_platform_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
 {
-   if (cmd == VFIO_DEVICE_GET_INFO)
-   return -EINVAL;
+   struct vfio_platform_device *vdev = device_data;
+   unsigned long minsz;
+
+   if (cmd == VFIO_DEVICE_GET_INFO) {
+   struct vfio_device_info info;
+
+   minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   info.flags = vdev->flags;
+   info.num_regions = 0;
+   info.num_irqs = 0;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 08/20] vfio/platform: read and write support for the device fd

2014-11-27 Thread Antonios Motakis

VFIO returns a file descriptor which we can use to manipulate the memory
regions of the device. Usually, the user will mmap memory regions that are
addressable on page boundaries; however, for memory regions where this is
not the case, we cannot provide mmap functionality due to security concerns.
For this reason we also allow the use of read and write functions on the file
descriptor pointing to the memory regions.

We implement this functionality only for MMIO regions of platform devices;
PIO regions are not being handled at this point.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c  | 150 ++
 drivers/vfio/platform/vfio_platform_private.h |   1 +
 2 files changed, 151 insertions(+)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 2a4613c..fda4c30 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -50,6 +50,10 @@ static int vfio_platform_regions_init(struct 
vfio_platform_device *vdev)
switch (resource_type(res)) {
case IORESOURCE_MEM:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
+   vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+   if (!(res->flags & IORESOURCE_READONLY))
+   vdev->regions[i].flags |=
+   VFIO_REGION_INFO_FLAG_WRITE;
break;
case IORESOURCE_IO:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
@@ -69,6 +73,11 @@ err:
 
 static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
 {
+   int i;
+
+   for (i = 0; i < vdev->num_regions; i++)
+   iounmap(vdev->regions[i].ioaddr);
+
vdev->num_regions = 0;
kfree(vdev->regions);
 }
@@ -171,15 +180,156 @@ static long vfio_platform_ioctl(void *device_data,
return -ENOTTY;
 }
 
+static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
+  char __user *buf, size_t count,
+  loff_t off)
+{
+   unsigned int done = 0;
+
+   if (!reg.ioaddr) {
+   reg.ioaddr =
+   ioremap_nocache(reg.addr, reg.size);
+
+   if (!reg.ioaddr)
+   return -ENOMEM;
+   }
+
+   while (count) {
+   size_t filled;
+
+   if (count >= 4 && !(off % 4)) {
+   u32 val;
+
+   val = ioread32(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 4))
+   goto err;
+
+   filled = 4;
+   } else if (count >= 2 && !(off % 2)) {
+   u16 val;
+
+   val = ioread16(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 2))
+   goto err;
+
+   filled = 2;
+   } else {
+   u8 val;
+
+   val = ioread8(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 1))
+   goto err;
+
+   filled = 1;
+   }
+
+
+   count -= filled;
+   done += filled;
+   off += filled;
+   buf += filled;
+   }
+
+   return done;
+err:
+   return -EFAULT;
+}
+
 static ssize_t vfio_platform_read(void *device_data, char __user *buf,
  size_t count, loff_t *ppos)
 {
+   struct vfio_platform_device *vdev = device_data;
+   unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
+   loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
+
+   if (index >= vdev->num_regions)
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ))
+   return -EINVAL;
+
+   if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+   return vfio_platform_read_mmio(vdev->regions[index],
+   buf, count, off);
+   else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+   return -EINVAL; /* not implemented */
+
return -EINVAL;
 }
 
+static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
+   const char __user *buf, size_t count,
+   loff_t off)
+{
+   unsigned int done = 0;
+
+   if (!reg.ioaddr) {
+   reg.ioaddr =
+   ioremap_nocache(reg.addr, reg.size);
+
+   if (!reg.ioaddr)
+   return -ENOMEM;
+   }
+
+   while (count) {
+   size_t filled;
+
+   if (count >= 4 && !(off % 4)) {
+

[PATCH v10 07/20] vfio/platform: return info for device memory mapped IO regions

2014-11-27 Thread Antonios Motakis

This patch enables the VFIO_DEVICE_GET_REGION_INFO ioctl call,
which allows the user to learn about the available MMIO resources of
a device.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform_common.c  | 106 +-
 drivers/vfio/platform/vfio_platform_private.h |  22 ++
 2 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 862b43b..2a4613c 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -22,17 +22,97 @@
 
 #include "vfio_platform_private.h"
 
+static DEFINE_MUTEX(driver_lock);
+
+static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
+{
+   int cnt = 0, i;
+
+   while (vdev->get_resource(vdev, cnt))
+   cnt++;
+
+   vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region),
+   GFP_KERNEL);
+   if (!vdev->regions)
+   return -ENOMEM;
+
+   for (i = 0; i < cnt;  i++) {
+   struct resource *res =
+   vdev->get_resource(vdev, i);
+
+   if (!res)
+   goto err;
+
+   vdev->regions[i].addr = res->start;
+   vdev->regions[i].size = resource_size(res);
+   vdev->regions[i].flags = 0;
+
+   switch (resource_type(res)) {
+   case IORESOURCE_MEM:
+   vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
+   break;
+   case IORESOURCE_IO:
+   vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
+   break;
+   default:
+   goto err;
+   }
+   }
+
+   vdev->num_regions = cnt;
+
+   return 0;
+err:
+   kfree(vdev->regions);
+   return -EINVAL;
+}
+
+static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
+{
+   vdev->num_regions = 0;
+   kfree(vdev->regions);
+}
+
 static void vfio_platform_release(void *device_data)
 {
+   struct vfio_platform_device *vdev = device_data;
+
+   mutex_lock(&driver_lock);
+
+   if (!(--vdev->refcnt)) {
+   vfio_platform_regions_cleanup(vdev);
+   }
+
+   mutex_unlock(&driver_lock);
+
module_put(THIS_MODULE);
 }
 
 static int vfio_platform_open(void *device_data)
 {
+   struct vfio_platform_device *vdev = device_data;
+   int ret;
+
if (!try_module_get(THIS_MODULE))
return -ENODEV;
 
+   mutex_lock(&driver_lock);
+
+   if (!vdev->refcnt) {
+   ret = vfio_platform_regions_init(vdev);
+   if (ret)
+   goto err_reg;
+   }
+
+   vdev->refcnt++;
+
+   mutex_unlock(&driver_lock);
return 0;
+
+err_reg:
+   mutex_unlock(&driver_lock);
+   module_put(THIS_MODULE);
+   return ret;
 }
 
 static long vfio_platform_ioctl(void *device_data,
@@ -53,15 +133,33 @@ static long vfio_platform_ioctl(void *device_data,
return -EINVAL;
 
info.flags = vdev->flags;
-   info.num_regions = 0;
+   info.num_regions = vdev->num_regions;
info.num_irqs = 0;
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+   struct vfio_region_info info;
+
+   minsz = offsetofend(struct vfio_region_info, offset);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   if (info.index >= vdev->num_regions)
+   return -EINVAL;
+
+   /* map offset to the physical address  */
+   info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
+   info.size = vdev->regions[info.index].size;
+   info.flags = vdev->regions[info.index].flags;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
+   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_SET_IRQS)
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index 062b92d..b24729f 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -15,7 +15,29 @@
 #ifndef VFIO_PLATFORM_PRIVATE_H
 #define VFIO_PLATFORM_PRIVATE_H
 
+#define VFIO_PLATFORM_OFFSET_SHIFT   40
+#define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 
1)
+
+#define VFIO_PLATFORM_OFFSET_TO_INDEX(off)

[PATCH v10 04/20] vfio: amba: VFIO support for AMBA devices

2014-11-27 Thread Antonios Motakis

Add support for discovering AMBA devices with VFIO and handle them
similarly to Linux platform devices.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_amba.c | 110 ++
 include/uapi/linux/vfio.h |   1 +
 2 files changed, 111 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_amba.c

diff --git a/drivers/vfio/platform/vfio_amba.c 
b/drivers/vfio/platform/vfio_amba.c
new file mode 100644
index 000..be33fb5
--- /dev/null
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis "
+#define DRIVER_DESC "VFIO for AMBA devices - User Level meta-driver"
+
+/* probing devices from the AMBA bus */
+
+static struct resource *get_amba_resource(struct vfio_platform_device *vdev,
+ int i)
+{
+   struct amba_device *adev = (struct amba_device *) vdev->opaque;
+
+   if (i == 0)
+   return &adev->res;
+
+   return NULL;
+}
+
+static int get_amba_irq(struct vfio_platform_device *vdev, int i)
+{
+   struct amba_device *adev = (struct amba_device *) vdev->opaque;
+   int ret = 0;
+
+   if (i < AMBA_NR_IRQS)
+   ret = adev->irq[i];
+
+   /* zero is an unset IRQ for AMBA devices */
+   return ret ? ret : -ENXIO;
+}
+
+static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id)
+{
+   struct vfio_platform_device *vdev;
+   int ret;
+
+   vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+   if (!vdev)
+   return -ENOMEM;
+
+   vdev->opaque = (void *) adev;
+   vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid);
+   vdev->flags = VFIO_DEVICE_FLAGS_AMBA;
+   vdev->get_resource = get_amba_resource;
+   vdev->get_irq = get_amba_irq;
+
+   ret = vfio_platform_probe_common(vdev, &adev->dev);
+   if (ret) {
+   kfree(vdev->name);
+   kfree(vdev);
+   }
+
+   return ret;
+}
+
+static int vfio_amba_remove(struct amba_device *adev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_platform_remove_common(&adev->dev);
+   if (vdev) {
+   kfree(vdev->name);
+   kfree(vdev);
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+static struct amba_id pl330_ids[] = {
+   { 0, 0 },
+};
+
+MODULE_DEVICE_TABLE(amba, pl330_ids);
+
+static struct amba_driver vfio_amba_driver = {
+   .probe = vfio_amba_probe,
+   .remove = vfio_amba_remove,
+   .id_table = pl330_ids,
+   .drv = {
+   .name = "vfio-amba",
+   .owner = THIS_MODULE,
+   },
+};
+
+module_amba_driver(vfio_amba_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4e93a97..544d3d8 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -160,6 +160,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_RESET(1 << 0)/* Device supports 
reset */
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
+#define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 05/20] vfio: amba: add the VFIO for AMBA devices module to Kconfig

2014-11-27 Thread Antonios Motakis

Enable building the VFIO AMBA driver. VFIO_AMBA depends on VFIO_PLATFORM,
since it is sharing a portion of the code, and it is essentially implemented
as a platform device whose resources are discovered via AMBA specific APIs
in the kernel.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/Kconfig  | 10 ++
 drivers/vfio/platform/Makefile |  4 
 2 files changed, 14 insertions(+)

diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
index c51af17..c0a3bff 100644
--- a/drivers/vfio/platform/Kconfig
+++ b/drivers/vfio/platform/Kconfig
@@ -7,3 +7,13 @@ config VFIO_PLATFORM
  framework.
 
  If you don't know what to do here, say N.
+
+config VFIO_AMBA
+   tristate "VFIO support for AMBA devices"
+   depends on VFIO_PLATFORM && ARM_AMBA
+   help
+ Support for ARM AMBA devices with VFIO. This is required to make
+ use of ARM AMBA devices present on the system using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
index 279862b..1957170 100644
--- a/drivers/vfio/platform/Makefile
+++ b/drivers/vfio/platform/Makefile
@@ -2,3 +2,7 @@
 vfio-platform-y := vfio_platform.o vfio_platform_common.o
 
 obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
+
+vfio-amba-y := vfio_amba.o
+
+obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 02/20] vfio: platform: probe to devices on the platform bus

2014-11-27 Thread Antonios Motakis

Driver to bind to Linux platform devices, and callbacks to discover their
resources to be used by the main VFIO PLATFORM code.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/platform/vfio_platform.c | 103 ++
 include/uapi/linux/vfio.h |   1 +
 2 files changed, 104 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_platform.c

diff --git a/drivers/vfio/platform/vfio_platform.c 
b/drivers/vfio/platform/vfio_platform.c
new file mode 100644
index 000..cef645c
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis "
+#define DRIVER_DESC "VFIO for platform devices - User Level meta-driver"
+
+/* probing devices from the linux platform bus */
+
+static struct resource *get_platform_resource(struct vfio_platform_device 
*vdev,
+ int num)
+{
+   struct platform_device *dev = (struct platform_device *) vdev->opaque;
+   int i;
+
+   for (i = 0; i < dev->num_resources; i++) {
+   struct resource *r = &dev->resource[i];
+
+   if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) {
+   if (!num)
+   return r;
+
+   num--;
+   }
+   }
+   return NULL;
+}
+
+static int get_platform_irq(struct vfio_platform_device *vdev, int i)
+{
+   struct platform_device *pdev = (struct platform_device *) vdev->opaque;
+
+   return platform_get_irq(pdev, i);
+}
+
+static int vfio_platform_probe(struct platform_device *pdev)
+{
+   struct vfio_platform_device *vdev;
+   int ret;
+
+   vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+   if (!vdev)
+   return -ENOMEM;
+
+   vdev->opaque = (void *) pdev;
+   vdev->name = pdev->name;
+   vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
+   vdev->get_resource = get_platform_resource;
+   vdev->get_irq = get_platform_irq;
+
+   ret = vfio_platform_probe_common(vdev, &pdev->dev);
+   if (ret)
+   kfree(vdev);
+
+   return ret;
+}
+
+static int vfio_platform_remove(struct platform_device *pdev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_platform_remove_common(&pdev->dev);
+   if (vdev) {
+   kfree(vdev);
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+static struct platform_driver vfio_platform_driver = {
+   .probe  = vfio_platform_probe,
+   .remove = vfio_platform_remove,
+   .driver = {
+   .name   = "vfio-platform",
+   .owner  = THIS_MODULE,
+   },
+};
+
+module_platform_driver(vfio_platform_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9ade02b..4e93a97 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -159,6 +159,7 @@ struct vfio_device_info {
__u32   flags;
 #define VFIO_DEVICE_FLAGS_RESET(1 << 0)/* Device supports 
reset */
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
+#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v10 03/20] vfio: platform: add the VFIO PLATFORM module to Kconfig

2014-11-27 Thread Antonios Motakis

Enable building the VFIO PLATFORM driver, which allows Linux platform
devices to be used with VFIO.

Signed-off-by: Antonios Motakis 
---
 drivers/vfio/Kconfig   | 1 +
 drivers/vfio/Makefile  | 1 +
 drivers/vfio/platform/Kconfig  | 9 +
 drivers/vfio/platform/Makefile | 4 
 4 files changed, 15 insertions(+)
 create mode 100644 drivers/vfio/platform/Kconfig
 create mode 100644 drivers/vfio/platform/Makefile

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index a0abe04..962fb80 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -27,3 +27,4 @@ menuconfig VFIO
  If you don't know what to do here, say N.
 
 source "drivers/vfio/pci/Kconfig"
+source "drivers/vfio/platform/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 0b035b1..dadf0ca 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
+obj-$(CONFIG_VFIO_PLATFORM) += platform/
diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
new file mode 100644
index 000..c51af17
--- /dev/null
+++ b/drivers/vfio/platform/Kconfig
@@ -0,0 +1,9 @@
+config VFIO_PLATFORM
+   tristate "VFIO support for platform devices"
+   depends on VFIO && EVENTFD && ARM
+   help
+ Support for platform devices with VFIO. This is required to make
+ use of platform devices present on the system using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
new file mode 100644
index 000..279862b
--- /dev/null
+++ b/drivers/vfio/platform/Makefile
@@ -0,0 +1,4 @@
+
+vfio-platform-y := vfio_platform.o vfio_platform_common.o
+
+obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-11-27 Thread Luis R. Rodriguez

On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>> From: "Luis R. Rodriguez" 
>>
>> Some folks had reported that some xen hypercalls take a long time
>> to complete when issued from the userspace private ioctl mechanism,
>> this can happen for instance with some hypercalls that have many
>> sub-operations, this can happen for instance on hypercalls that use
>> multi-call feature whereby Xen lets one hypercall batch out a series
>> of other hypercalls on the hypervisor. At times such hypercalls can
>> even end up triggering the TASK_UNINTERRUPTIBLE hanger check (default
>> 120 seconds), this a non-issue issue on preemptible kernels though as
>> the kernel may deschedule such long running tasks. Xen for instance
>> supports multicalls to be preempted as well, this is what Xen calls
>> continuation (see xen commit 42217cbc5b which introduced this [0]).
>> On systems without CONFIG_PREEMPT though -- a kernel with voluntary
>> or no preemption -- a long running hypercall will not be descheduled
>> until the hypercall is complete and the ioctl returns to user space.
>>
>> To help with this David had originally implemented support for use
>> of preempt_schedule_irq() [1] for non CONFIG_PREEMPT kernels. This
>> solution never went upstream though and upon review to help refactor
>> this I've concluded that usage of preempt_schedule_irq() would be
>> a bit abussive of existing APIs -- for a few reasons:
>>
>> 0) we want to avoid spreading its use on non CONFIG_PREEMPT kernels
>>
>> 1) we want try to consider solutions that might work for other
>> hypervisors for this same problem, and identify it its an issue
>> even present on other hypervisors or if this is a self
>> inflicted architectural issue caused by use of multicalls
>>
>> 2) there is no documentation or profiling of the exact hypercalls
>> that were causing these issues, nor do we have any context
>> to help evaluate this any further
>>
>> I at least checked with kvm folks and it seems hypercall preemption
>> is not needed there. We can survey other hypervisors...
>>
>> If 'something like preemption' is needed then CONFIG_PREEMPT
>> should just be enabled and encouraged, it seems we want to
>> encourage CONFIG_PREEMPT on xen, specially when multicalls are
>> used. In the meantime this tries to address a solution to help
>> xen on non CONFIG_PREEMPT kernels.
>>
>> One option tested and evaluated was to put private hypercalls in
>> process context, however this would introduce complexities such
>> originating hypercalls from different contexts. Current xen
>> hypercall callback handlers would need to be changed per architecture,
>> for instance, we'd also incur the cost of switching states from
>> user / kernel (this cost is also present if preempt_schedule_irq()
>> is used). There may be other issues which could be introduced with
>> this strategy as well. The simplest *shared* alternative is instead
>> to just explicitly schedule() at the end of a private hypercall on non
>> preempt kernels. This forces our private hypercall call mechanism
>> to try to be fair only on non CONFIG_PREEMPT kernels at the cost of
>> more context switch but keeps the private hypercall context intact.
>>
>> [0] 
>> http://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=42217cbc5b3e84b8c145d8cfb62dd5de0134b9e8;hp=3a0b9c57d5c9e82c55dd967c84dd06cb43c49ee9
>> [1] 
>> http://ftp.suse.com/pub/people/mcgrof/xen-preempt-hypercalls/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch
>>
>> Cc: Davidlohr Bueso 
>> Cc: Joerg Roedel 
>> Cc: Borislav Petkov 
>> Cc: Konrad Rzeszutek Wilk 
>> Cc: Jan Beulich 
>> Cc: Juergen Gross 
>> Cc: Olaf Hering 
>> Cc: David Vrabel 
>> Signed-off-by: Luis R. Rodriguez 
>> ---
>>   drivers/xen/privcmd.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
>> index 569a13b..e29edba 100644
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
>> hypercall.arg[0], hypercall.arg[1],
>> hypercall.arg[2], hypercall.arg[3],
>> hypercall.arg[4]);
>> +#ifndef CONFIG_PREEMPT
>> +schedule();
>> +#endif
>>
>>  return ret;
>>   }
>>
>
> Sorry, I don't think this will solve anything. You're calling schedule()
> right after the long running hypercall just nanoseconds before returning
> to the user.

Yeah, well that is what [1] tried as well only it tried using
preempt_schedule_irq() on the hypercall callback...

> I suppose you were mislead by the "int 0x82" in [0]. This is the
> hypercall from the kernel into the hypervisor, e.g. inside of
> privcmd_call().

Nope, you have to consider what was done in [1], I was trying to
do something similar but less complex that didn't involve mucking
with the callbacks but also not abusing APIs.

I'm afraid we don't have m

[PATCH 2/5] arm/arm64: KVM: Reset the HCR on each vcpu when resetting the vcpu

2014-11-27 Thread Christoffer Dall

When userspace resets the vcpu using KVM_ARM_VCPU_INIT, we should also
reset the HCR, because we now modify the HCR dynamically to
enable/disable trapping of guest accesses to the VM registers.

This is crucial for VM reboot to work, since otherwise we will not be
doing the necessary cache maintenance operations when faulting in pages
with the guest MMU off.

Signed-off-by: Christoffer Dall 
---
 arch/arm/include/asm/kvm_emulate.h   | 5 +
 arch/arm/kvm/arm.c   | 2 ++
 arch/arm/kvm/guest.c | 1 -
 arch/arm64/include/asm/kvm_emulate.h | 5 +
 arch/arm64/kvm/guest.c   | 1 -
 5 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h 
b/arch/arm/include/asm/kvm_emulate.h
index b9db269..66ce176 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -33,6 +33,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.hcr = HCR_GUEST_MASK;
+}
+
 static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
 {
return 1;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 4dcc8c2..a09a55b 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -658,6 +658,8 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu 
*vcpu,
if (ret)
return ret;
 
+   vcpu_reset_hcr(vcpu);
+
/*
 * Handle the "start in power-off" case by marking the VCPU as paused.
 */
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index cc0b787..8c97208 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.hcr = HCR_GUEST_MASK;
return 0;
 }
 
diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index 5674a55..8127e45 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -38,6 +38,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 7679469..84d5959 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
return 0;
 }
 
-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/5] arm/arm64: KVM: Correct KVM_ARM_VCPU_INIT power off option

2014-11-27 Thread Christoffer Dall

The implementation of KVM_ARM_VCPU_INIT is currently not doing what
userspace expects, namely making sure that a vcpu which may have been
turned off using PSCI is returned to its initial state, which would be
powered on if userspace does not set the KVM_ARM_VCPU_POWER_OFF flag.

Implement the expected functionality and clarify the ABI.

Signed-off-by: Christoffer Dall 
---
 Documentation/virtual/kvm/api.txt | 3 ++-
 arch/arm/kvm/arm.c| 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 7610eaa..bb82a90 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2455,7 +2455,8 @@ should be created before this ioctl is invoked.
 
 Possible features:
- KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
- Depends on KVM_CAP_ARM_PSCI.
+ Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be powered on
+ and execute guest code when KVM_RUN is called.
- KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode.
  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
- KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9e193c8..4dcc8c2 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -663,6 +663,8 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu 
*vcpu,
 */
if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
vcpu->arch.pause = true;
+   else
+   vcpu->arch.pause = false;
 
return 0;
 }
-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/5] arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on system exit events

2014-11-27 Thread Christoffer Dall

When a vcpu calls SYSTEM_OFF or SYSTEM_RESET with PSCI v0.2, the vcpus
should really be turned off for the VM adhering to the suggestions in
the PSCI spec, and it's the sane thing to do.

Also, to ensure a coherent icache/dcache/ram situation when restarting
with the guest MMU off, flush all stage-2 page table entries so we start
taking aborts when the guest reboots, and flush/invalidate the necessary
cache lines.

Clarify the behavior and expectations for arm/arm64 in the
KVM_EXIT_SYSTEM_EVENT case.

Signed-off-by: Christoffer Dall 
---
 Documentation/virtual/kvm/api.txt |  4 
 arch/arm/kvm/psci.c   | 18 ++
 arch/arm64/include/asm/kvm_host.h |  1 +
 3 files changed, 23 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index fc12b4f..c67e4956 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2955,6 +2955,10 @@ HVC instruction based PSCI call from the vcpu. The 
'type' field describes
 the system-level event type. The 'flags' field describes architecture
 specific flags for the system-level event.
 
+In the case of ARM/ARM64, all vcpus will be powered off when requesting 
shutdown
+or reset, and it is the responsibility of userspace to reinitialize the vcpus
+using KVM_ARM_VCPU_INIT.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 09cf377..b4ab613 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -15,11 +15,13 @@
  * along with this program.  If not, see .
  */
 
+#include 
 #include 
 #include 
 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -166,6 +168,22 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct 
kvm_vcpu *vcpu)
 
 static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 {
+   int i;
+   struct kvm_vcpu *tmp;
+
+   /* Stop all vcpus */
+   kvm_for_each_vcpu(i, tmp, vcpu->kvm)
+   tmp->arch.pause = true;
+   preempt_disable();
+   force_vm_exit(cpu_all_mask);
+   preempt_enable();
+
+   /*
+* Ensure a rebooted VM will fault in RAM pages and detect if the
+* guest MMU is turned off and flush the caches as needed.
+*/
+   stage2_unmap_vm(vcpu->kvm);
+
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
vcpu->run->system_event.type = type;
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 2012c4b..dbd3212 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -200,6 +200,7 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 u64 kvm_call_hyp(void *hypfn, ...);
+void force_vm_exit(const cpumask_t *mask);
 
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
int exception_index);
-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/5] arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI

2014-11-27 Thread Christoffer Dall

It is not clear that this ioctl can be called multiple times for a given
vcpu.  Userspace already does this, so clarify the ABI.

Signed-off-by: Christoffer Dall 
---
 Documentation/virtual/kvm/api.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index bb82a90..fc12b4f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2453,6 +2453,9 @@ return ENOEXEC for that vcpu.
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
 
+Userspace can call this function multiple times for a given VCPU, which will
+reset the VCPU to its initial states.
+
 Possible features:
- KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
  Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be powered on
-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/5] arm/arm64: KVM: Introduce stage2_unmap_vm

2014-11-27 Thread Christoffer Dall

Introduce a new function to unmap user RAM regions in the stage2 page
tables.  This is needed on reboot (or when the guest turns off the MMU)
to ensure we fault in pages again and make the dcache, RAM, and icache
coherent.

Using unmap_stage2_range for the whole guest physical range does not
work, because that unmaps IO regions (such as the GIC) which will not be
recreated or in the best case faulted in on a page-by-page basis.

Cc: Ard Biesheuvel 
Signed-off-by: Christoffer Dall 
---
There is an alternative version with more code reuse available here:
http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
vcpu_init_fixes-alternative

That version improves code reuse at the cost of reduced code readability and
increased complexity.  I didn't test the alternative version or spend huge
amounts of time thinking about potential cleaner versions of the code, but
chose to include a pointer to the version as I can't make up my mind about the
preferred approach.  Input is welcome.

 arch/arm/include/asm/kvm_mmu.h   |  1 +
 arch/arm/kvm/mmu.c   | 65 
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 3 files changed, 67 insertions(+)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index acb0d57..4654c42 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -52,6 +52,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 57a403a..b1f3c9a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -611,6 +611,71 @@ static void unmap_stage2_range(struct kvm *kvm, 
phys_addr_t start, u64 size)
unmap_range(kvm, kvm->arch.pgd, start, size);
 }
 
+static void stage2_unmap_memslot(struct kvm *kvm,
+struct kvm_memory_slot *memslot)
+{
+   hva_t hva = memslot->userspace_addr;
+   phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+   phys_addr_t size = PAGE_SIZE * memslot->npages;
+   hva_t reg_end = hva + size;
+
+   /*
+* A memory region could potentially cover multiple VMAs, and any holes
+* between them, so iterate over all of them to find out if we should
+* unmap any of them.
+*
+* ++
+* +---++   ++
+* |   : VMA 1 |  VMA 2 |   |VMA 3  :|
+* +---++   ++
+* |   memory region|
+* ++
+*/
+   do {
+   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   hva_t vm_start, vm_end;
+
+   if (!vma || vma->vm_start >= reg_end)
+   break;
+
+   /*
+* Take the intersection of this VMA with the memory region
+*/
+   vm_start = max(hva, vma->vm_start);
+   vm_end = min(reg_end, vma->vm_end);
+
+   if (!(vma->vm_flags & VM_PFNMAP)) {
+   gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+   unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+   }
+   hva = vm_end;
+   } while (hva < reg_end);
+}
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any reguler RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+   struct kvm_memslots *slots;
+   struct kvm_memory_slot *memslot;
+   int idx;
+
+   idx = srcu_read_lock(&kvm->srcu);
+   spin_lock(&kvm->mmu_lock);
+
+   slots = kvm_memslots(kvm);
+   kvm_for_each_memslot(memslot, slots)
+   stage2_unmap_memslot(kvm, memslot);
+
+   spin_unlock(&kvm->mmu_lock);
+   srcu_read_unlock(&kvm->srcu, idx);
+}
+
 /**
  * kvm_free_stage2_pgd - free all stage-2 tables
  * @kvm:   The KVM struct pointer for the VM.
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0caf7a5..061fed7 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -83,6 +83,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the l

[PATCH 0/5] Improve PSCI system events and fix reboot bugs

2014-11-27 Thread Christoffer Dall

Several people have reported problems with rebooting ARM VMs, especially
on 32-bit ARM.  This is mainly due to the same reason we were seeing
boot errors in the past, namely that the ram, dcache, and icache weren't
coherent on guest boot with the guest (stage-1) MMU disabled.  We solved
this by ensuring coherency when we fault in pages, but since most memory
is already mapped after a reboot, we don't do anything.

The solution is to unmap the regular RAM on system events, but we must
take care to not unmap the GIC or other IO regions, hence the somewhat
complicated solution.

As part of figuring this out, it became clear that some semantics around
the KVM_ARM_VCPU_INIT ABI and system event ABI were unclear (what is
userspace expected to do when it receives a system event).  This series
also clarifies the ABI and changes the kernel functionality to do what
userspace expects (turn off VCPUs on a system shutdown event).

The code is available here as well:
http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git vcpu_init_fixes

There is an alternative version with more code reuse for what is patch 4
in this series available here:
http://git.linaro.org/people/christoffer.dall/linux-kvm-arm.git 
vcpu_init_fixes-alternative

See patch 4 for more info on this one.

Testing
---
This has been tested on CubieBoard, Arndale, TC2, and Juno.  On Arndale
and TC2 it was extremely easy to reproduce the setup (just start a VM
that runs reboot from /etc/rc.local or similar) and this series clearly
fixes the behavior.

On Juno we occasionally see lockups on reboot, but I see this both with
and without this series.  I have run a VM in a loop where the guest
shuts itself down (same code path) a couple of hundred times without
seeing any issues, so I think it's safe to merge this and further
investigate the Juno reboot issue.


Christoffer Dall (5):
  arm/arm64: KVM: Correct KVM_ARM_VCPU_INIT power off option
  arm/arm64: KVM: Reset the HCR on each vcpu when resetting the vcpu
  arm/arm64: KVM: Clarify KVM_ARM_VCPU_INIT ABI
  arm/arm64: KVM: Introduce stage2_unmap_vm
  arm/arm64: KVM: Turn off vcpus and flush stage-2 pgtables on system
exit events

 Documentation/virtual/kvm/api.txt| 10 +-
 arch/arm/include/asm/kvm_emulate.h   |  5 +++
 arch/arm/include/asm/kvm_mmu.h   |  1 +
 arch/arm/kvm/arm.c   |  4 +++
 arch/arm/kvm/guest.c |  1 -
 arch/arm/kvm/mmu.c   | 65 
 arch/arm/kvm/psci.c  | 18 ++
 arch/arm64/include/asm/kvm_emulate.h |  5 +++
 arch/arm64/include/asm/kvm_host.h|  1 +
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 arch/arm64/kvm/guest.c   |  1 -
 11 files changed, 109 insertions(+), 3 deletions(-)

-- 
2.1.2.330.g565301e.dirty

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-11-27 Thread Luis R. Rodriguez

On Thu, Nov 27, 2014 at 1:36 PM, Luis R. Rodriguez  wrote:
> I'm afraid we don't have much leg room.

Let me be clear: I still think putting some hypercalls in process
context *might help*, but because of notes 1) and 2) that I highlighted, I
think this is the best we can do. With more information we should be
able to weigh the pros / cons using actual metrics from the
alternatives; without more information we're just shooting in the dark,
and the last thing I want is to see APIs abused or precedents being set.

  Luis
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Xen-devel] [PATCH] xen: privcmd: schedule() after private hypercall when non CONFIG_PREEMPT

2014-11-27 Thread Andrew Cooper

On 27/11/14 18:36, Luis R. Rodriguez wrote:
> On Thu, Nov 27, 2014 at 07:36:31AM +0100, Juergen Gross wrote:
>> On 11/26/2014 11:26 PM, Luis R. Rodriguez wrote:
>>> From: "Luis R. Rodriguez" 
>>>
>>> Some folks had reported that some xen hypercalls take a long time
>>> to complete when issued from the userspace private ioctl mechanism,
>>> this can happen for instance with some hypercalls that have many
>>> sub-operations, this can happen for instance on hypercalls that use
>>> multi-call feature whereby Xen lets one hypercall batch out a series
>>> of other hypercalls on the hypervisor. At times such hypercalls can
>>> even end up triggering the TASK_UNINTERRUPTIBLE hanger check (default
>>> 120 seconds), this a non-issue issue on preemptible kernels though as
>>> the kernel may deschedule such long running tasks. Xen for instance
>>> supports multicalls to be preempted as well, this is what Xen calls
>>> continuation (see xen commit 42217cbc5b which introduced this [0]).
>>> On systems without CONFIG_PREEMPT though -- a kernel with voluntary
>>> or no preemption -- a long running hypercall will not be descheduled
>>> until the hypercall is complete and the ioctl returns to user space.
>>>
>>> To help with this David had originally implemented support for use
>>> of preempt_schedule_irq() [1] for non CONFIG_PREEMPT kernels. This
>>> solution never went upstream though and upon review to help refactor
>>> this I've concluded that usage of preempt_schedule_irq() would be
>>> a bit abussive of existing APIs -- for a few reasons:
>>>
>>> 0) we want to avoid spreading its use on non CONFIG_PREEMPT kernels
>>>
>>> 1) we want try to consider solutions that might work for other
>>> hypervisors for this same problem, and identify it its an issue
>>> even present on other hypervisors or if this is a self
>>> inflicted architectural issue caused by use of multicalls
>>>
>>> 2) there is no documentation or profiling of the exact hypercalls
>>> that were causing these issues, nor do we have any context
>>> to help evaluate this any further
>>>
>>> I at least checked with kvm folks and it seems hypercall preemption
>>> is not needed there. We can survey other hypervisors...
>>>
>>> If 'something like preemption' is needed then CONFIG_PREEMPT
>>> should just be enabled and encouraged, it seems we want to
>>> encourage CONFIG_PREEMPT on xen, specially when multicalls are
>>> used. In the meantime this tries to address a solution to help
>>> xen on non CONFIG_PREEMPT kernels.
>>>
>>> One option tested and evaluated was to put private hypercalls in
>>> process context, however this would introduce complexities such
>>> originating hypercalls from different contexts. Current xen
>>> hypercall callback handlers would need to be changed per architecture,
>>> for instance, we'd also incur the cost of switching states from
>>> user / kernel (this cost is also present if preempt_schedule_irq()
>>> is used). There may be other issues which could be introduced with
>>> this strategy as well. The simplest *shared* alternative is instead
>>> to just explicitly schedule() at the end of a private hypercall on non
>>> preempt kernels. This forces our private hypercall call mechanism
>>> to try to be fair only on non CONFIG_PREEMPT kernels at the cost of
>>> more context switch but keeps the private hypercall context intact.
>>>
>>> [0] 
>>> http://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=42217cbc5b3e84b8c145d8cfb62dd5de0134b9e8;hp=3a0b9c57d5c9e82c55dd967c84dd06cb43c49ee9
>>> [1] 
>>> http://ftp.suse.com/pub/people/mcgrof/xen-preempt-hypercalls/0001-x86-xen-allow-privcmd-hypercalls-to-be-preempted.patch
>>>
>>> Cc: Davidlohr Bueso 
>>> Cc: Joerg Roedel 
>>> Cc: Borislav Petkov 
>>> Cc: Konrad Rzeszutek Wilk 
>>> Cc: Jan Beulich 
>>> Cc: Juergen Gross 
>>> Cc: Olaf Hering 
>>> Cc: David Vrabel 
>>> Signed-off-by: Luis R. Rodriguez 
>>> ---
>>>   drivers/xen/privcmd.c | 3 +++
>>>   1 file changed, 3 insertions(+)
>>>
>>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
>>> index 569a13b..e29edba 100644
>>> --- a/drivers/xen/privcmd.c
>>> +++ b/drivers/xen/privcmd.c
>>> @@ -60,6 +60,9 @@ static long privcmd_ioctl_hypercall(void __user *udata)
>>>hypercall.arg[0], hypercall.arg[1],
>>>hypercall.arg[2], hypercall.arg[3],
>>>hypercall.arg[4]);
>>> +#ifndef CONFIG_PREEMPT
>>> +   schedule();
>>> +#endif
>>>
>>> return ret;
>>>   }
>>>
>> Sorry, I don't think this will solve anything. You're calling schedule()
>> right after the long running hypercall just nanoseconds before returning
>> to the user.
> Yeah, well that is what [1] tried as well only it tried using
> preempt_schedule_irq() on the hypercall callback...
>
>> I suppose you were mislead by the "int 0x82" in [0]. This is the
>> hypercall from the kernel into the hypervisor, e.g. inside of
>> privcmd_call().
> Nope, you have to consider what was done in [1], I was trying to

[PATCH 3/4] KVM: x86: allow 256 logical x2APICs again

2014-11-27 Thread Radim Krčmář

While fixing an x2apic bug,
 17d68b7 KVM: x86: fix guest-initiated crash with x2apic (CVE-2013-6376)
we've made only one cluster available.  This means that the number of
logically addressable x2APICs was reduced to 16 and VCPUs kept
overwriting themselves in that region, so even the first cluster wasn't
set up correctly.

This patch extends x2APIC support back to the logical_map's limit, and
keeps the CVE fixed as messages for non-present APICs are dropped.

Signed-off-by: Radim Krčmář 
---
 arch/x86/kvm/lapic.c | 11 ++-
 arch/x86/kvm/lapic.h |  2 --
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 049d30f..f6e3369 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -132,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
-#define KVM_X2APIC_CID_BITS 0
-
 static void recalculate_apic_map(struct kvm *kvm)
 {
struct kvm_apic_map *new, *old = NULL;
@@ -163,8 +161,7 @@ static void recalculate_apic_map(struct kvm *kvm)
if (apic_x2apic_mode(apic)) {
new->ldr_bits = 32;
new->cid_shift = 16;
-   new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
-   new->lid_mask = 0xffff;
+   new->cid_mask = new->lid_mask = 0xffff;
new->broadcast = X2APIC_BROADCAST;
} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
if (kvm_apic_get_reg(apic, APIC_DFR) ==
@@ -697,8 +694,12 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct 
kvm_lapic *src,
dst = &map->phys_map[irq->dest_id];
} else {
u32 mda = irq->dest_id << (32 - map->ldr_bits);
+   u16 cid = apic_cluster_id(map, mda);
 
-   dst = map->logical_map[apic_cluster_id(map, mda)];
+   if (cid >= ARRAY_SIZE(map->logical_map))
+   goto out;
+
+   dst = map->logical_map[cid];
 
bitmap = apic_logical_id(map, mda);
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index d4365f2..c674fce 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -154,8 +154,6 @@ static inline u16 apic_cluster_id(struct kvm_apic_map *map, 
u32 ldr)
ldr >>= 32 - map->ldr_bits;
cid = (ldr >> map->cid_shift) & map->cid_mask;
 
-   BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
-
return cid;
 }
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/4] KVM: x86: don't retry hopeless APIC delivery

2014-11-27 Thread Radim Krčmář

False from kvm_irq_delivery_to_apic_fast() means that we don't handle it
in the fast path, but we still return false in cases that were perfectly
handled; fix that.

Signed-off-by: Radim Krčmář 
---
 arch/x86/kvm/lapic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f6e3369..6c2b8a5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -687,6 +687,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct 
kvm_lapic *src,
if (irq->dest_id == map->broadcast)
goto out;
 
+   ret = true;
+
if (irq->dest_mode == 0) { /* physical mode */
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
@@ -725,8 +727,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct 
kvm_lapic *src,
*r = 0;
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
-
-   ret = true;
 out:
rcu_read_unlock();
return ret;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/4] KVM: x86: fix APIC physical destination wrapping

2014-11-27 Thread Radim Krčmář

x2apic allows destinations > 0xff and we don't want them delivered to
lower APICs.  They are correctly handled by doing nothing.

Signed-off-by: Radim Krčmář 
---
 arch/x86/kvm/lapic.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e8ad09d..049d30f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -691,7 +691,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct 
kvm_lapic *src,
goto out;
 
if (irq->dest_mode == 0) { /* physical mode */
-   dst = &map->phys_map[irq->dest_id & 0xff];
+   if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+   goto out;
+
+   dst = &map->phys_map[irq->dest_id];
} else {
u32 mda = irq->dest_id << (32 - map->ldr_bits);
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/4] KVM: x86: deliver phys lowest-prio

2014-11-27 Thread Radim Krčmář

Physical mode can't address more than one APIC, but lowest-prio is
allowed, so we just reuse our paths.

SDM 10.6.2.1 Physical Destination:
  Also, for any non-broadcast IPI or I/O subsystem initiated interrupt
  with lowest priority delivery mode, software must ensure that APICs
  defined in the interrupt address are present and enabled to receive
  interrupts.

We could warn on top of that.

Signed-off-by: Radim Krčmář 
---
 arch/x86/kvm/lapic.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e0e5642..e8ad09d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -691,8 +691,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct 
kvm_lapic *src,
goto out;
 
if (irq->dest_mode == 0) { /* physical mode */
-   if (irq->delivery_mode == APIC_DM_LOWEST)
-   goto out;
dst = &map->phys_map[irq->dest_id & 0xff];
} else {
u32 mda = irq->dest_id << (32 - map->ldr_bits);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/4] KVM: x86: APIC fixes

2014-11-27 Thread Radim Krčmář

The interesting one is [3/4], which improves upon a previous CVE fix;
we also handle logical destination wrapping in it, so [2/4] does the
same for physical;  and to make it nicer, [1/4] removes a condition.
[4/4] makes our fast path return true when the message was handled.

Radim Krčmář (4):
  KVM: x86: deliver phys lowest-prio
  KVM: x86: fix APIC physical destination wrapping
  KVM: x86: allow 256 logical x2APICs again
  KVM: x86: don't retry hopeless APIC delivery

 arch/x86/kvm/lapic.c | 20 +++-
 arch/x86/kvm/lapic.h |  2 --
 2 files changed, 11 insertions(+), 11 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] KVM: x86: allow 256 logical x2APICs again

2014-11-27 Thread Nadav Amit

Radim Krčmář  wrote:

> While fixing an x2apic bug,
> 17d68b7 KVM: x86: fix guest-initiated crash with x2apic (CVE-2013-6376)
> we've made only one cluster available.  This means that the amount of
> logically addressable x2APICs was reduced to 16 and VCPUs kept
> overwriting themselves in that region, so even the first cluster wasn't
> set up correctly.
> 
> This patch extends x2APIC support back to the logical_map's limit, and
> keeps the CVE fixed as messages for non-present APICs are dropped.
> 
> Signed-off-by: Radim Krčmář 
> ---
> arch/x86/kvm/lapic.c | 11 ++-
> arch/x86/kvm/lapic.h |  2 --
> 2 files changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 049d30f..f6e3369 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -132,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
>   return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> }
> 
> -#define KVM_X2APIC_CID_BITS 0
> -
> static void recalculate_apic_map(struct kvm *kvm)
> {
>   struct kvm_apic_map *new, *old = NULL;
> @@ -163,8 +161,7 @@ static void recalculate_apic_map(struct kvm *kvm)
>   if (apic_x2apic_mode(apic)) {
>   new->ldr_bits = 32;
>   new->cid_shift = 16;
> - new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
> - new->lid_mask = 0xffff;
> + new->cid_mask = new->lid_mask = 0xffff;
You set cid_mask to 0xffff, while there are only 16 clusters. I think it is
risky (if you twist my arm, I could come up with a scenario). So why not set
cid_mask to (ARRAY_SIZE(map->logical_map) - 1)?


Nadav
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 45/46] vhost/scsi: partial virtio 1.0 support

2014-11-27 Thread Michael S. Tsirkin

Include all endian conversions as required by virtio 1.0.
Don't set virtio 1.0 yet, since that requires ANY_LAYOUT
which we don't yet support.

Signed-off-by: Michael S. Tsirkin 
Acked-by: Paolo Bonzini 
---
 drivers/vhost/scsi.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a17f118..01c01cb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -168,6 +168,7 @@ enum {
VHOST_SCSI_VQ_IO = 2,
 };
 
+/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. */
 enum {
VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
   (1ULL << VIRTIO_SCSI_F_T10_PI)
@@ -577,8 +578,8 @@ tcm_vhost_allocate_evt(struct vhost_scsi *vs,
return NULL;
}
 
-   evt->event.event = event;
-   evt->event.reason = reason;
+   evt->event.event = cpu_to_vhost32(vq, event);
+   evt->event.reason = cpu_to_vhost32(vq, reason);
vs->vs_events_nr++;
 
return evt;
@@ -636,7 +637,7 @@ again:
}
 
if (vs->vs_events_missed) {
-   event->event |= VIRTIO_SCSI_T_EVENTS_MISSED;
+   event->event |= cpu_to_vhost32(vq, VIRTIO_SCSI_T_EVENTS_MISSED);
vs->vs_events_missed = false;
}
 
@@ -695,12 +696,13 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
cmd, se_cmd->residual_count, se_cmd->scsi_status);
 
memset(&v_rsp, 0, sizeof(v_rsp));
-   v_rsp.resid = se_cmd->residual_count;
+   v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, 
se_cmd->residual_count);
/* TODO is status_qualifier field needed? */
v_rsp.status = se_cmd->scsi_status;
-   v_rsp.sense_len = se_cmd->scsi_sense_length;
+   v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+se_cmd->scsi_sense_length);
memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-  v_rsp.sense_len);
+  se_cmd->scsi_sense_length);
ret = copy_to_user(cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
if (likely(ret == 0)) {
struct vhost_scsi_virtqueue *q;
@@ -1095,14 +1097,14 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesout;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesout);
} else if (v_req_pi.pi_bytesin) {
if (data_direction != DMA_FROM_DEVICE) {
vq_err(vq, "Received non zero 
di_pi_niov"
", but wrong data_direction\n");
goto err_cmd;
}
-   prot_bytes = v_req_pi.pi_bytesin;
+   prot_bytes = vhost32_to_cpu(vq, 
v_req_pi.pi_bytesin);
}
if (prot_bytes) {
int tmp = 0;
@@ -1117,12 +1119,12 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
data_first += prot_niov;
data_niov = data_num - prot_niov;
}
-   tag = v_req_pi.tag;
+   tag = vhost64_to_cpu(vq, v_req_pi.tag);
task_attr = v_req_pi.task_attr;
cdb = &v_req_pi.cdb[0];
lun = ((v_req_pi.lun[2] << 8) | v_req_pi.lun[3]) & 
0x3FFF;
} else {
-   tag = v_req.tag;
+   tag = vhost64_to_cpu(vq, v_req.tag);
task_attr = v_req.task_attr;
cdb = &v_req.cdb[0];
lun = ((v_req.lun[2] << 8) | v_req.lun[3]) & 0x3FFF;
-- 
MST

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 >

1 - 100 of 118 matches

Mail list logo