[PATCH 0/2] cmd646: remove pci_cmd646_ide_init() function

2020-02-14 Thread Mark Cave-Ayland
The recent cmd646 discussions reminded me of this patch I've had sitting in an
old branch for a while.

The DP264 machine is the last remaining user of the deprecated
pci_cmd646_ide_init() init function. Switch it over to using qdev via 
pci_create()
and then remove the now-unused pci_cmd646_ide_init() function.

Signed-off-by: Mark Cave-Ayland 

Mark Cave-Ayland (2):
  dp264: use pci_create() to initialise the cmd646 device
  cmd646: remove unused pci_cmd646_ide_init() function

 hw/alpha/dp264.c |  8 +++-
 hw/ide/cmd646.c  | 12 
 include/hw/ide.h |  2 --
 3 files changed, 7 insertions(+), 15 deletions(-)

-- 
2.20.1




[PATCH 1/2] dp264: use pci_create() to initialise the cmd646 device

2020-02-14 Thread Mark Cave-Ayland
Remove the call to pci_cmd646_ide_init() since global device init functions
are deprecated in preference of using qdev directly.

Signed-off-by: Mark Cave-Ayland 
---
 hw/alpha/dp264.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/alpha/dp264.c b/hw/alpha/dp264.c
index a8f9a89cc4..e91989bf9a 100644
--- a/hw/alpha/dp264.c
+++ b/hw/alpha/dp264.c
@@ -16,6 +16,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/rtc/mc146818rtc.h"
 #include "hw/ide.h"
+#include "hw/ide/pci.h"
 #include "hw/timer/i8254.h"
 #include "hw/isa/superio.h"
 #include "hw/dma/i8257.h"
@@ -100,9 +101,14 @@ static void clipper_init(MachineState *machine)
 /* IDE disk setup.  */
 {
 DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
+PCIDevice *pci_dev;
+
 ide_drive_get(hd, ARRAY_SIZE(hd));
 
-pci_cmd646_ide_init(pci_bus, hd, 0);
+pci_dev = pci_create(pci_bus, -1, "cmd646-ide");
+qdev_prop_set_uint32(DEVICE(pci_dev), "secondary", 0);
+qdev_init_nofail(DEVICE(pci_dev));
+pci_ide_create_devs(pci_dev, hd);
 }
 
 /* Load PALcode.  Given that this is not "real" cpu palcode,
-- 
2.20.1




[PATCH 2/2] cmd646: remove unused pci_cmd646_ide_init() function

2020-02-14 Thread Mark Cave-Ayland
Signed-off-by: Mark Cave-Ayland 
---
 hw/ide/cmd646.c  | 12 
 include/hw/ide.h |  2 --
 2 files changed, 14 deletions(-)

diff --git a/hw/ide/cmd646.c b/hw/ide/cmd646.c
index 335c060673..d953932104 100644
--- a/hw/ide/cmd646.c
+++ b/hw/ide/cmd646.c
@@ -317,18 +317,6 @@ static void pci_cmd646_ide_exitfn(PCIDevice *dev)
 }
 }
 
-void pci_cmd646_ide_init(PCIBus *bus, DriveInfo **hd_table,
- int secondary_ide_enabled)
-{
-PCIDevice *dev;
-
-dev = pci_create(bus, -1, "cmd646-ide");
-qdev_prop_set_uint32(&dev->qdev, "secondary", secondary_ide_enabled);
-qdev_init_nofail(&dev->qdev);
-
-pci_ide_create_devs(dev, hd_table);
-}
-
 static Property cmd646_ide_properties[] = {
 DEFINE_PROP_UINT32("secondary", PCIIDEState, secondary, 0),
 DEFINE_PROP_END_OF_LIST(),
diff --git a/include/hw/ide.h b/include/hw/ide.h
index 28d8a06439..0c7080ed92 100644
--- a/include/hw/ide.h
+++ b/include/hw/ide.h
@@ -12,8 +12,6 @@ ISADevice *isa_ide_init(ISABus *bus, int iobase, int iobase2, 
int isairq,
 DriveInfo *hd0, DriveInfo *hd1);
 
 /* ide-pci.c */
-void pci_cmd646_ide_init(PCIBus *bus, DriveInfo **hd_table,
- int secondary_ide_enabled);
 PCIDevice *pci_piix3_xen_ide_init(PCIBus *bus, DriveInfo **hd_table, int 
devfn);
 PCIDevice *pci_piix3_ide_init(PCIBus *bus, DriveInfo **hd_table, int devfn);
 PCIDevice *pci_piix4_ide_init(PCIBus *bus, DriveInfo **hd_table, int devfn);
-- 
2.20.1




[PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Michal Privoznik
In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---
 hw/vfio/common.c| 2 +-
 util/vfio-helpers.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 5ca11488d6..a3a2a82d99 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -352,7 +352,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
 return 0;
 }
 
-error_report("VFIO_MAP_DMA: %d", -errno);
+error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 return -errno;
 }
 
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 813f7ec564..ddd9a96e76 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -545,7 +545,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void 
*host, size_t size,
 trace_qemu_vfio_do_mapping(s, host, size, iova);
 
 if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
-error_report("VFIO_MAP_DMA: %d", -errno);
+error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 return -errno;
 }
 return 0;
@@ -570,7 +570,7 @@ static void qemu_vfio_undo_mapping(QEMUVFIOState *s, 
IOVAMapping *mapping,
 assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
 assert(index >= 0 && index < s->nr_mappings);
 if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
-error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
+error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
 }
 memmove(mapping, &s->mappings[index + 1],
 sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
@@ -669,7 +669,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
 trace_qemu_vfio_dma_reset_temporary(s);
 qemu_mutex_lock(&s->lock);
 if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
-error_report("VFIO_UNMAP_DMA: %d", -errno);
+error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
 qemu_mutex_unlock(&s->lock);
 return -errno;
 }
-- 
2.24.1




Re: [PATCH v5 7/8] multifd: Add multifd-zstd-level parameter

2020-02-14 Thread Markus Armbruster
Juan Quintela  writes:

> Markus Armbruster  wrote:
>> "Dr. David Alan Gilbert"  writes:
>>
>>> * Juan Quintela (quint...@redhat.com) wrote:
 Signed-off-by: Juan Quintela 
 ---
  migration/migration.c | 15 +++
  monitor/hmp-cmds.c|  4 
  qapi/migration.json   | 29 ++---
  3 files changed, 45 insertions(+), 3 deletions(-)
 
 diff --git a/migration/migration.c b/migration/migration.c
 index 3b081e8147..b690500545 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
 @@ -91,6 +91,8 @@
  #define DEFAULT_MIGRATE_MULTIFD_METHOD MULTIFD_METHOD_NONE
  /*0: means nocompress, 1: best speed, ... 9: best compress ratio */
  #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
 +/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
 +#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1
  
  /* Background transfer rate for postcopy, 0 means unlimited, note
   * that page requests can still exceed this limit.
 @@ -805,6 +807,8 @@ MigrationParameters 
 *qmp_query_migrate_parameters(Error **errp)
  params->multifd_method = s->parameters.multifd_method;
  params->has_multifd_zlib_level = true;
  params->multifd_zlib_level = s->parameters.multifd_zlib_level;
 +params->has_multifd_zstd_level = true;
 +params->multifd_zstd_level = s->parameters.multifd_zstd_level;
>>>
>>> Do we really want different 'multifd__level's or just one
>>> 'multifd_compress_level' - or even just reuse the existing
>>> 'compress-level' parameter.
>>
>> compress-level,
>
> possible values: 1 to 9 deprecated
>
>> multifd-zlib-level
>
> possible values: 1 to 9, default 1
> (zlib default is -1, let the lib decide)
>
> , and multifd-zstd-level apply
>
> possible values: 1 to 19, default 1
> (zstd default is 3)
>
>> "normal" live migration compression, multifd zlib live migration
>> compression, and multifd zstd live migration compression, respectively.
>>
>> Any live migration can only use one of the three compressions.
>>
>> Correct?
>
> Yeap.
>
>>> The only tricky thing about combining them is how to handle
>>> the difference in allowed ranges;  When would the right time be
>>> to check it?
>>>
>>> Markus/Eric: Any idea?
>>
>> To have an informed opinion, I'd have to dig through the migration
>> code.
>
> Problem is _how_ to setup them.  if we setup zstd compression method,
> put the value at 19, and then setup zlib compression method, what should
> we do?
>
> Truncate to 9?
> Give one error?
> Don't allow the zlib setup?
>
> Too complicated.

The interface pretends the parameters are all independent: you get to
set them one by one.

They are in fact not independent, and this now leads to difficulties.

To avoid them, the interface should let you specify a desired
configuration all at once, and its implementation should then do what it
takes to get from here to there.

Example: current state is multifd compression method "zstd", compression
level is 19.  User wants to switch to method "zlib" level 9 the obvious
way

With old interface:
step 1: set method to "zlib"
step 2: set level to 9
Problem: after step 1, we have method "zlib" with invalid level 19.
Workaround: swap the steps.

Note that the workaround only works because the set of levels both
methods support is non-empty.  We might still come up with more
complicated parameter combinations where that is not the case.

With new interface:
set compression to "zlib" level 9

The new interface require us to specify a QAPI type capable of holding
the complete migration configuration.

The stupid way is to throw all migration parameters into a struct, and
make the ones optional that apply only when others have certain values.
Thus, the "applies only when" bits are semantical, documented in
comments, and enforced by C code.

With a bit more care, we can make "applies only when" syntactical
instead.  Examples:

@max-bandwidth and @downtime-limit always apply.  They go straight
into the struct.

@tls-creds, @tls-hostname, @tls-authz apply only when TLS is enabled
by setting @tls-creds.  Have an optional member @tls, which is a
struct with mandatory member @creds, optional members @hostname,
@authz.

@multifd-zlib-level applies when @multifd-method is "zlib", and
@multifd-zstd-level applies when it's "zstd".  Have a union
@multifd-compression, cases "none", "zlib" and "zstd", where each
case's members are the parameters applying in that case.

Please note the purpose of these examples is to show how things can be
done in the schema.  I'm not trying to tell you how these specific
things *should* be done.

The resulting type is perfectly suited as return value of a query
command.  It's awkward as argument of a "specify desired configuration"
command, because it requires the user to specify *complete*
configuration.  If we want to suppor

Re: [PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Ján Tomko

On Fri, Feb 14, 2020 at 09:47:39AM +0100, Michal Privoznik wrote:

In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---
hw/vfio/common.c| 2 +-
util/vfio-helpers.c | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)



Reviewed-by: Ján Tomko 

Jano


signature.asc
Description: PGP signature


Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand
On 13.02.20 21:56, Peter Xu wrote:
> On Thu, Feb 13, 2020 at 08:42:23PM +0100, David Hildenbrand wrote:
>> On 13.02.20 19:32, Peter Xu wrote:
>>> On Thu, Feb 13, 2020 at 06:20:16PM +0100, David Hildenbrand wrote:
 Resizing while migrating is dangerous and does not work as expected.
 The whole migration code works on the usable_length of ram blocks and does
 not expect this to change at random points in time.

 Precopy: The ram block size must not change on the source, after
 ram_save_setup(), so as long as the guest is still running on the source.

 Postcopy: The ram block size must not change on the target, after
 synchronizing the RAM block list (ram_load_precopy()).

 AFAIKS, resizing can be trigger *after* (but not during) a reset in
 ACPI code by the guest
 - hw/arm/virt-acpi-build.c:acpi_ram_update()
 - hw/i386/acpi-build.c:acpi_ram_update()
>>>
>>> What can be the pre-condition of triggering this after reset?  I'm
>>> thinking whether QEMU can be aware of this "resizing can happen"
>>> condition, then we could simply stop the migration from happening even
>>> before the resizing happens.  Thanks,
>>
>> I think the condition is not known before the guest actually tries to
>> read the relevant memory areas (which trigger the rebuilt+resize, and
>> AFAIK, the new size depends on fw config done by the guest after the
>> reset). So it's hard to "predict".
> 
> I chimmed in without much context, sorry if I'm going to ask naive
> questions. :)

I think the problem is quite involved and not obvious, so there are no
naive questions :)

> 
> What I was asking is about why the resizing can happen.  A quick read
> told me that it was majorly for easier extension of ROMs (firmware
> updates?).  With that, I'm imaging a common case for memory
> resizing...
> 
>   (1) Source QEMU runs VM on old host, with old firmware
> 
>   (2) Migrate source QEMU to destination new host, with new and bigger
>   firmware
> 
>   (3) During the migration, the ROM size on the destination will still
>   be the old, referring to ram_load_precopy(), as long as no
>   system reset
> 
>   (4) After migration finished, when the system reboots, memory
>   resizing happens with the new and bigger firmware loaded

AFAIK it could trigger

a) In precopy during the second migration.
b) In postcopy during the first migration.

> 
> And is this patch trying to fix/warn when there's a reboot during (3)
> so the new size is discovered at a wrong time?  Is my understanding
> correct?

It's trying to bail out early instead of failing at other random points
(with an unclear outcome).

>>
>> In the precopy case it would be easier to abort (although, not simple
>> AFAIKS), in the postcopy not so easy - because you're already partially
>> running on the migration target.
> 
> Prior to this patch, would a precopy still survive with such an
> accident (asked because I _feel_ like migrating a ramblock with
> smaller used_length to the same ramblock with bigger used_length seems
> to be fine?)?  Or we can stop the precopy and restart.  After this

I assume growing the region is the usual case (not shrinking). FW blobs
tend to get bigger.

Migrating while growing a ram block on the source won't work. The source
would try to send a dirt page that's outside of the used_length on the
target, making e.g., ram_load_postcopy()/ram_load_precopy() fail with
"Illegal RAM offset...".

In the postcopy case, e.g., ram_dirty_bitmap_reload() will fail in case
there is a mismatch between ram block size on source/target.

Another issue is if the used_length changes while in ram_save_setup(),
just between storing ram_bytes_total_common(true) and storing
block->used_length. A mismatch will screw up the migration stream.

But these are just the immediately visible issues. I am more concerned
about used_length changing at random points in time, resulting in more
harm. (e.g., non-obvious load-store tearing when accessing the used length)

Migration code is inherently racy when it comes to ram block resizes.
And that might become more dangerous once we want to size the migration
bitmaps smaller (used_length instead of max_length) or disallow access
to ram blocks beyond the used_length. Both are things I am working on :)

> patch, it'll crash the source VM (&error_abort specified in
> memory_region_ram_resize()), which seems a bit more harsh?

There seems to be no easy way to abort migration from outside the
migration thread. As Juan said, you actually don't want to fail
migration but instead soft-abort migration and continue running the
guest on the target on a reset. But that's not easy as well.

One could think about extending ram block notifiers to notify migration
code (before) resizes, so that migration code can work around the
resize (how is TBD). Not easy as well :)

But then, I am not sure
a) If we run into this issue in real life a lot.
b) If we actually need an elaborate solutions within QEMU to hand

Re: [PATCH v2 04/19] tests/rcutorture: mild documenting refactor of update thread

2020-02-14 Thread Paolo Bonzini
On 13/02/20 23:50, Alex Bennée wrote:
> This is mainly to help with reasoning what the test is trying to do.
> We can move rcu_stress_idx to a local variable as there is only ever
> one updater thread. I've also added an assert to catch the case where
> we end up updating the current structure to itself which is the only
> way I can see the mberror cases we are seeing on Travis.
> 
> We shall see if the rcutorture test failures go away now.
> 
> Signed-off-by: Alex Bennée 

Reviewed-by: Paolo Bonzini 

Yes, the failures are quire mysterious and I agree that they can only happen if:

1) p == cp in your patch below

2) the memory barrier can be overtaken by the store above it.

Even then, (2) would be unlikely because then the compiler would 
coalesce the two stores to p->mbtest.  However we could add a patch such 
as this to rule it out:

diff --git a/tests/rcutorture.c b/tests/rcutorture.c
index 9559f13..969a19a 100644
--- a/tests/rcutorture.c
+++ b/tests/rcutorture.c
@@ -260,7 +260,7 @@ static void *rcu_read_stress_test(void *arg)
 while (goflag == GOFLAG_RUN) {
 rcu_read_lock();
 p = atomic_rcu_read(&rcu_stress_current);
-if (p->mbtest == 0) {
+if (atomic_read(&p->mbtest) == 0) {
 n_mberror++;
 }
 rcu_read_lock();
@@ -268,7 +268,7 @@ static void *rcu_read_stress_test(void *arg)
 garbage++;
 }
 rcu_read_unlock();
-pc = p->pipe_count;
+pc = atomic_read(&p->pipe_count);
 rcu_read_unlock();
 if ((pc > RCU_STRESS_PIPE_LEN) || (pc < 0)) {
 pc = RCU_STRESS_PIPE_LEN;
@@ -319,10 +319,10 @@ static void *rcu_update_stress_test(void *arg)
 p = &rcu_stress_array[rcu_stress_idx];
 /* catching up with ourselves would be a bug */
 assert(p != cp);
-p->mbtest = 0;
+atomic_set(&p->mbtest, 0);
 smp_mb();
-p->pipe_count = 0;
-p->mbtest = 1;
+atomic_set(&p->pipe_count, 0);
+atomic_set(&p->mbtest, 1);
 atomic_rcu_set(&rcu_stress_current, p);
 cp = p;
 /*
@@ -331,7 +331,8 @@ static void *rcu_update_stress_test(void *arg)
  */
 for (i = 0; i < RCU_STRESS_PIPE_LEN; i++) {
 if (i != rcu_stress_idx) {
-rcu_stress_array[i].pipe_count++;
+atomic_set(&rcu_stress_array[i].pipe_count,
+  rcu_stress_array[i].pipe_count + 1);
 }
 }
 synchronize_rcu();


Finally, the "pipe_count" naming is a bit mysterious.  The idea behind the test
is that RCU can only ever see the current or the previous version of a
struct (in this case, the current or the previous index) and a "pipe_count"
of 3 means for example that the index is 3 updates behind the current
index.  To check that the RCU invariant is preserved, the test checks that
the reader does not observe an index that is 2 updates or more behind the
current index.

I have never changed it because I took it directly from Paul McKenney's
rcutorture and I didn't want to deviate too much in order to keep Paul's
code as a reference.  But perhaps we should rename it to something more
intuitive like "age" (which is short enough that "pc" could also be
renamed to "age").

Paolo




Re: [PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Michal Privoznik

On 2/14/20 9:47 AM, Michal Privoznik wrote:

In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---
  hw/vfio/common.c| 2 +-
  util/vfio-helpers.c | 6 +++---
  2 files changed, 4 insertions(+), 4 deletions(-)



BTW the reason I've noticed these is because I'm seeing some errors when 
assigning my NVMe disk to qemu. This is the full command line:



/home/zippy/work/qemu/qemu.git/x86_64-softmmu/qemu-system-x86_64 \
-name guest=fedora,debug-threads=on \
-S \
-object 
secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-2-fedora/master-key.aes 
\

-machine pc-i440fx-4.1,accel=kvm,usb=off,dump-guest-core=off \
-cpu host \
-m size=4194304k,slots=16,maxmem=1099511627776k \
-overcommit mem-lock=off \
-smp 4,sockets=1,dies=1,cores=2,threads=2 \
-object iothread,id=iothread1 \
-object iothread,id=iothread2 \
-object iothread,id=iothread3 \
-object iothread,id=iothread4 \
-mem-prealloc \
-mem-path /hugepages2M/libvirt/qemu/2-fedora \
-numa node,nodeid=0,cpus=0,mem=4096 \
-uuid 63840878-0deb-4095-97e6-fc444d9bc9fa \
-no-user-config \
-nodefaults \
-chardev socket,id=charmonitor,fd=31,server,nowait \
-mon chardev=charmonitor,id=monitor,mode=control \
-rtc base=utc \
-no-shutdown \
-global PIIX4_PM.disable_s3=0 \
-global PIIX4_PM.disable_s4=0 \
-boot menu=on,strict=on \
-device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
-device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x4 \
-device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \
-blockdev 
'{"driver":"file","filename":"/var/lib/libvirt/images/fedora.qcow2","node-name":"libvirt-2-storage","auto-read-only":true,"discard":"unmap"}' 
\
-blockdev 
'{"node-name":"libvirt-2-format","read-only":false,"discard":"unmap","driver":"qcow2","file":"libvirt-2-storage","backing":null}' 
\
-device 
scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,device_id=drive-scsi0-0-0-0,drive=libvirt-2-format,id=scsi0-0-0-0,bootindex=1 
\
-blockdev 
'{"driver":"nvme","device":":02:00.0","namespace":1,"node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"}' 
\
-blockdev 
'{"node-name":"libvirt-1-format","read-only":false,"driver":"raw","file":"libvirt-1-storage"}' 
\
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=libvirt-1-format,id=virtio-disk0 
\

-netdev tap,fd=33,id=hostnet0,vhost=on,vhostfd=34 \
-device 
virtio-net-pci,host_mtu=9000,netdev=hostnet0,id=net0,mac=52:54:00:a4:6f:91,bus=pci.0,addr=0x3 
\

-chardev pty,id=charserial0 \
-device isa-serial,chardev=charserial0,id=serial0 \
-chardev socket,id=charchannel0,fd=35,server,nowait \
-device 
virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=org.qemu.guest_agent.0 
\

-spice port=5900,addr=0.0.0.0,disable-ticketing,seamless-migration=on \
-device virtio-vga,id=video0,virgl=on,max_outputs=1,bus=pci.0,addr=0x2 \
-device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \
-sandbox 
on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \

-msg timestamp=on

And these are the errors I see:

2020-02-14T09:06:18.183167Z qemu-system-x86_64: VFIO_MAP_DMA failed: 
Invalid argument
2020-02-14T09:10:49.753767Z qemu-system-x86_64: VFIO_MAP_DMA failed: 
Cannot allocate memory
2020-02-14T09:11:04.530344Z qemu-system-x86_64: VFIO_MAP_DMA failed: No 
space left on device
2020-02-14T09:11:04.531087Z qemu-system-x86_64: VFIO_MAP_DMA failed: No 
space left on device
2020-02-14T09:11:04.531230Z qemu-system-x86_64: VFIO_MAP_DMA failed: No 
space left on device



I'm doing nothing with the disk inside the guest, but:

  # dd if=/dev/vda of=/dev/null status=progress

(the disk appears as /dev/vda in the guest). Surprisingly, I do not see 
these errors when I use the traditional PCI assignment (-device 
vfio-pci). My versions of kernel and qemu:


moe ~ # uname -r
5.4.15-gentoo
moe ~ # /home/zippy/work/qemu/qemu.git/x86_64-softmmu/qemu-system-x86_64 
--version

QEMU emulator version 4.2.50 (v4.2.0-1439-g5d6542bea7-dirty)
Copyright (c) 2003-2019 Fabrice Bellard and the QEMU Project developers

Do you guys want me to file a bug?

Michal




Re: [PATCH v2 01/30] configure: Allow user to specify sphinx-build binary

2020-02-14 Thread Peter Maydell
On Fri, 14 Feb 2020 at 06:33, Markus Armbruster  wrote:
>
> Does not work out of the box on my Fedora 30 build host, where
> sphinx-build gives me sphinx-build-2.  I have to specify
> --sphinx-build=/usr/bin/sphinx-build-3 to unbreak it.  Which of course
> breaks things when I try to build anything before this commit
>
> The appended patch makes it work out of the box.  Please consider
> squashing it in.
>
> diff --git a/configure b/configure
> index 14172909f0..a9d175c400 100755
> --- a/configure
> +++ b/configure
> @@ -584,7 +584,6 @@ query_pkg_config() {
>  }
>  pkg_config=query_pkg_config
>  sdl2_config="${SDL2_CONFIG-${cross_prefix}sdl2-config}"
> -sphinx_build=sphinx-build
>
>  # If the user hasn't specified ARFLAGS, default to 'rv', just as make does.
>  ARFLAGS="${ARFLAGS-rv}"
> @@ -903,6 +902,7 @@ fi
>
>  : ${make=${MAKE-make}}
>  : ${install=${INSTALL-install}}
> +
>  # We prefer python 3.x. A bare 'python' is traditionally
>  # python 2.x, but some distros have it as python 3.x, so
>  # we check that too
> @@ -915,6 +915,19 @@ do
>  break
>  fi
>  done
> +
> +set -x

I guess the set -x / set +x here are accidentally left in
debug printing?

> +sphinx_build=
> +for binary in sphinx-build-3 sphinx-build
> +do
> +if has "$binary"
> +then
> +sphinx_build=$(command -v "$binary")
> +break
> +fi
> +done
> +set +x
> +
>  : ${smbd=${SMBD-/usr/sbin/smbd}}
>
>  # Default objcc to clang if available, otherwise use CC
> @@ -4803,7 +4816,7 @@ has_sphinx_build() {
>  # sphinx-build doesn't exist at all or if it is too old.
>  mkdir -p "$TMPDIR1/sphinx"
>  touch "$TMPDIR1/sphinx/index.rst"
> -$sphinx_build -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
> +"$sphinx_build" -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
>  }

This change isn't related to trying sphinx-build-3 --
did you actually need it ?

I did think about quoting when I wrote the patch,
but looking at existing practice we are all over the
place on whether we bother to quote variables containing
program names in configure. I think I ended up following
the same thing we do for $python, which doesn't quote.

Other than that, I'm happy to squash this in, or for
you to squash it in if you are otherwise OK taking
the first chunk of the patchset via your tree now.
(Do you have a preference for whether you take these
patches via your tree or I send them in a docs pullreq?)

thanks
-- PMM



Re: [PATCH 1/2] hw/arm/xilinx_zynq: Fix USB port instantiation

2020-02-14 Thread Peter Maydell
On Fri, 14 Feb 2020 at 06:05, Guenter Roeck  wrote:
>
> USB ports must be instantiated as TYPE_CHIPIDEA to work.
> Linux expects and checks various chipidea registers, which
> do not exist with the basic ehci emulation.

Hi; I didn't see a cover letter email for this series (sorry if
I missed it). Would you mind including a cover letter email
for any more-than-one-patch set, please? It helps the
automatic tools and it also decreases the chances that
I miss the patches when I'm scanning through my email
(when I'm tagging things I might want to review I basically
look through subject lines for single patch patches
and for 00/nn cover letters and ignore middle-of-patchset
emails...)


thanks
-- PMM



Re: [PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Cornelia Huck
On Fri, 14 Feb 2020 09:47:39 +0100
Michal Privoznik  wrote:

> In a few places we report errno formatted as a negative integer.
> This is not as user friendly as it can be. Use strerror() and/or
> error_setg_errno() instead.
> 
> Signed-off-by: Michal Privoznik 
> ---
>  hw/vfio/common.c| 2 +-
>  util/vfio-helpers.c | 6 +++---
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 5ca11488d6..a3a2a82d99 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -352,7 +352,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
> iova,
>  return 0;
>  }
>  
> -error_report("VFIO_MAP_DMA: %d", -errno);
> +error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
>  return -errno;
>  }

I think you missed the one in vfio_dma_unmap().

(...)

The other file looks good.




RE: [PATCH RFC 03/14] migration/rdma: Create multiFd migration threads

2020-02-14 Thread fengzhimin
Thanks for your review. I will fix these errors in the next version(V3).

Due to migration data transfer using RDMA WRITE operation, we don't need to 
receive data in the destination.
We only need to poll the CQE in the destination, so multifd_recv_thread() can't 
be used directly.

-Original Message-
From: Juan Quintela [mailto:quint...@redhat.com] 
Sent: Thursday, February 13, 2020 6:13 PM
To: fengzhimin 
Cc: dgilb...@redhat.com; arm...@redhat.com; ebl...@redhat.com; 
qemu-devel@nongnu.org; Zhanghailiang ; 
jemmy858...@gmail.com
Subject: Re: [PATCH RFC 03/14] migration/rdma: Create multiFd migration threads

Zhimin Feng  wrote:
> Creation of the multifd send threads for RDMA migration, nothing 
> inside yet.
>
> Signed-off-by: Zhimin Feng 
> ---
>  migration/multifd.c   | 33 +---
>  migration/multifd.h   |  2 +
>  migration/qemu-file.c |  5 +++
>  migration/qemu-file.h |  1 +
>  migration/rdma.c  | 88 ++-
>  migration/rdma.h  |  3 ++
>  6 files changed, 125 insertions(+), 7 deletions(-)
>
> diff --git a/migration/multifd.c b/migration/multifd.c index 
> b3e8ae9bcc..63678d7fdd 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -424,7 +424,7 @@ void multifd_send_sync_main(QEMUFile *f)  {
>  int i;
>  
> -if (!migrate_use_multifd()) {
> +if (!migrate_use_multifd() || migrate_use_rdma()) {

You don't need sync with main channel on rdma?

> +static void rdma_send_channel_create(MultiFDSendParams *p) {
> +Error *local_err = NULL;
> +
> +if (p->quit) {
> +error_setg(&local_err, "multifd: send id %d already quit", p->id);
> +return ;
> +}
> +p->running = true;
> +
> +qemu_thread_create(&p->thread, p->name, multifd_rdma_send_thread, p,
> +   QEMU_THREAD_JOINABLE); }
> +
>  static void multifd_new_send_channel_async(QIOTask *task, gpointer 
> opaque)  {
>  MultiFDSendParams *p = opaque;
> @@ -621,7 +635,11 @@ int multifd_save_setup(Error **errp)
>  p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
>  p->packet->version = cpu_to_be32(MULTIFD_VERSION);
>  p->name = g_strdup_printf("multifdsend_%d", i);
> -socket_send_channel_create(multifd_new_send_channel_async, p);
> +if (!migrate_use_rdma()) {
> +socket_send_channel_create(multifd_new_send_channel_async, p);
> +} else {
> +rdma_send_channel_create(p);
> +}

This is what we are trying to avoid.  Just create a struct ops, where we have a

ops->create_channel(new_channel_async, p)

or whatever, and fill it differently for rdma and for tcp.


>  }
>  return 0;
>  }
> @@ -720,7 +738,7 @@ void multifd_recv_sync_main(void)  {
>  int i;
>  
> -if (!migrate_use_multifd()) {
> +if (!migrate_use_multifd() || migrate_use_rdma()) {
>  return;
>  }

Ok. you can just put an empty function for you here.

>  for (i = 0; i < migrate_multifd_channels(); i++) { @@ -890,8 
> +908,13 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
>  p->num_packets = 1;
>  
>  p->running = true;
> -qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
> -   QEMU_THREAD_JOINABLE);
> +if (!migrate_use_rdma()) {
> +qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
> +   QEMU_THREAD_JOINABLE);
> +} else {
> +qemu_thread_create(&p->thread, p->name, multifd_rdma_recv_thread, p,
> +   QEMU_THREAD_JOINABLE);
> +}

new_recv_chanel() member function.

>  atomic_inc(&multifd_recv_state->count);
>  return atomic_read(&multifd_recv_state->count) ==
> migrate_multifd_channels(); diff --git 
> a/migration/multifd.h b/migration/multifd.h index 
> d8b0205977..c9c11ad140 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -13,6 +13,8 @@
>  #ifndef QEMU_MIGRATION_MULTIFD_H
>  #define QEMU_MIGRATION_MULTIFD_H
>  
> +#include "migration/rdma.h"
> +
>  int multifd_save_setup(Error **errp);  void 
> multifd_save_cleanup(void);  int multifd_load_setup(Error **errp);

You are not exporting anything rdma related from here, are you?

> diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 
> 1c3a358a14..f0ed8f1381 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -248,6 +248,11 @@ void qemu_fflush(QEMUFile *f)
>  f->iovcnt = 0;
>  }
>  
> +void *getQIOChannel(QEMUFile *f)
> +{
> +return f->opaque;
> +}
> +

We really want this to return a void?  and not a better type?
> +static void migration_rdma_process_incoming(QEMUFile *f, Error 
> +**errp) {
> +MigrationIncomingState *mis = migration_incoming_get_current();
> +Error *local_err = NULL;
> +QIOChannel *ioc = NULL;
> +bool start_migration;
> +
> +if (!mis->from_src_file) {
> +mis->from_src_file = f;
> +qemu_file_set_blocking(f, false);
> +
> +start

Re: [PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Michal Privoznik

On 2/14/20 10:47 AM, Cornelia Huck wrote:

On Fri, 14 Feb 2020 09:47:39 +0100
Michal Privoznik  wrote:


In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---
  hw/vfio/common.c| 2 +-
  util/vfio-helpers.c | 6 +++---
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 5ca11488d6..a3a2a82d99 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -352,7 +352,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
  return 0;
  }
  
-error_report("VFIO_MAP_DMA: %d", -errno);

+error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
  return -errno;
  }


I think you missed the one in vfio_dma_unmap().



Indeed, will send v2.

Michal




[PATCH v2] Report stringified errno in VFIO related errors

2020-02-14 Thread Michal Privoznik
In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---

v1 posted here:

https://lists.nongnu.org/archive/html/qemu-devel/2020-02/msg03623.html

diff to v1:
 - Change error reporting in vfio_dma_unmap() too as I missed it in v1.

 hw/vfio/common.c| 4 ++--
 util/vfio-helpers.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 5ca11488d6..0b3593b3c0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -319,7 +319,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
 unmap.size -= 1ULL << ctz64(container->pgsizes);
 continue;
 }
-error_report("VFIO_UNMAP_DMA: %d", -errno);
+error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
 return -errno;
 }
 
@@ -352,7 +352,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
 return 0;
 }
 
-error_report("VFIO_MAP_DMA: %d", -errno);
+error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 return -errno;
 }
 
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 813f7ec564..ddd9a96e76 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -545,7 +545,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void 
*host, size_t size,
 trace_qemu_vfio_do_mapping(s, host, size, iova);
 
 if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
-error_report("VFIO_MAP_DMA: %d", -errno);
+error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
 return -errno;
 }
 return 0;
@@ -570,7 +570,7 @@ static void qemu_vfio_undo_mapping(QEMUVFIOState *s, 
IOVAMapping *mapping,
 assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
 assert(index >= 0 && index < s->nr_mappings);
 if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
-error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
+error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
 }
 memmove(mapping, &s->mappings[index + 1],
 sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
@@ -669,7 +669,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
 trace_qemu_vfio_dma_reset_temporary(s);
 qemu_mutex_lock(&s->lock);
 if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
-error_report("VFIO_UNMAP_DMA: %d", -errno);
+error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
 qemu_mutex_unlock(&s->lock);
 return -errno;
 }
-- 
2.24.1




Re: [PATCH 3/3] arm: allwinner: Wire up USB EHCI

2020-02-14 Thread Gerd Hoffmann
> ehci-platform 1c1c000.usb: new USB bus registered, assigned bus number 2
> ehci-platform 1c1c000.usb: irq 31, io mem 0x01c1c000
> ehci-platform 1c1c000.usb: USB 2.0 started, EHCI 1.00

> +for (i = 0; i < ARRAY_SIZE(s->ehci); i++) {
> +object_property_set_bool(OBJECT(&s->ehci[i]), true, "realized",
> + &err);

I suspect the ohci controllers are ehci companions, i.e. they handle a
single usb bus, with ehci handling usb2 and ohci handling usb1 devices?

If so then you should initialize ehci first, explicitly assign bus
names and set the masterbus property for the ohci controllers.

See also docs/usb2.txt

cheers,
  Gerd




Re: [PATCH v2] Report stringified errno in VFIO related errors

2020-02-14 Thread Ján Tomko

On Fri, Feb 14, 2020 at 10:55:19AM +0100, Michal Privoznik wrote:

In a few places we report errno formatted as a negative integer.
This is not as user friendly as it can be. Use strerror() and/or
error_setg_errno() instead.

Signed-off-by: Michal Privoznik 
---

v1 posted here:

https://lists.nongnu.org/archive/html/qemu-devel/2020-02/msg03623.html

diff to v1:
- Change error reporting in vfio_dma_unmap() too as I missed it in v1.

hw/vfio/common.c| 4 ++--
util/vfio-helpers.c | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)



Reviewed-by: Ján Tomko 

Jano


signature.asc
Description: PGP signature


Re: [PATCH v6 3/8] multifd: Make no compression operations into its own structure

2020-02-14 Thread Dr. David Alan Gilbert
* Juan Quintela (quint...@redhat.com) wrote:
> It will be used later.
> 
> Signed-off-by: Juan Quintela 

Reviewed-by: Dr. David Alan Gilbert 

> 
> ---
> 
> No comp value needs to be zero.
> ---
>  migration/migration.c |   9 ++
>  migration/migration.h |   1 +
>  migration/multifd.c   | 185 --
>  migration/multifd.h   |  26 ++
>  migration/ram.c   |   1 +
>  5 files changed, 214 insertions(+), 8 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index bc744d1734..eb7b0a2df0 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2245,6 +2245,15 @@ int migrate_multifd_channels(void)
>  return s->parameters.multifd_channels;
>  }
>  
> +MultiFDCompression migrate_multifd_compression(void)
> +{
> +MigrationState *s;
> +
> +s = migrate_get_current();
> +
> +return s->parameters.multifd_compression;
> +}
> +
>  int migrate_use_xbzrle(void)
>  {
>  MigrationState *s;
> diff --git a/migration/migration.h b/migration/migration.h
> index 8473ddfc88..59fea02482 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -300,6 +300,7 @@ bool migrate_auto_converge(void);
>  bool migrate_use_multifd(void);
>  bool migrate_pause_before_switchover(void);
>  int migrate_multifd_channels(void);
> +MultiFDCompression migrate_multifd_compression(void);
>  
>  int migrate_use_xbzrle(void);
>  int64_t migrate_xbzrle_cache_size(void);
> diff --git a/migration/multifd.c b/migration/multifd.c
> index b3e8ae9bcc..97433e5135 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -38,6 +38,134 @@ typedef struct {
>  uint64_t unused2[4];/* Reserved for future use */
>  } __attribute__((packed)) MultiFDInit_t;
>  
> +/* Multifd without compression */
> +
> +/**
> + * nocomp_send_setup: setup send side
> + *
> + * For no compression this function does nothing.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
> +{
> +return 0;
> +}
> +
> +/**
> + * nocomp_send_cleanup: cleanup send side
> + *
> + * For no compression this function does nothing.
> + *
> + * @p: Params for the channel that we are using
> + */
> +static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
> +{
> +return;
> +}
> +
> +/**
> + * nocomp_send_prepare: prepare date to be able to send
> + *
> + * For no compression we just have to calculate the size of the
> + * packet.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @used: number of pages used
> + * @errp: pointer to an error
> + */
> +static int nocomp_send_prepare(MultiFDSendParams *p, uint32_t used,
> +   Error **errp)
> +{
> +p->next_packet_size = used * qemu_target_page_size();
> +p->flags |= MULTIFD_FLAG_NOCOMP;
> +return 0;
> +}
> +
> +/**
> + * nocomp_send_write: do the actual write of the data
> + *
> + * For no compression we just have to write the data.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @used: number of pages used
> + * @errp: pointer to an error
> + */
> +static int nocomp_send_write(MultiFDSendParams *p, uint32_t used, Error 
> **errp)
> +{
> +return qio_channel_writev_all(p->c, p->pages->iov, used, errp);
> +}
> +
> +/**
> + * nocomp_recv_setup: setup receive side
> + *
> + * For no compression this function does nothing.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
> +{
> +return 0;
> +}
> +
> +/**
> + * nocomp_recv_cleanup: setup receive side
> + *
> + * For no compression this function does nothing.
> + *
> + * @p: Params for the channel that we are using
> + */
> +static void nocomp_recv_cleanup(MultiFDRecvParams *p)
> +{
> +}
> +
> +/**
> + * nocomp_recv_pages: read the data from the channel into actual pages
> + *
> + * For no compression we just need to read things into the correct place.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @used: number of pages used
> + * @errp: pointer to an error
> + */
> +static int nocomp_recv_pages(MultiFDRecvParams *p, uint32_t used, Error 
> **errp)
> +{
> +uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
> +
> +if (flags != MULTIFD_FLAG_NOCOMP) {
> +error_setg(errp, "multifd %d: flags received %x flags expected %x",
> +   p->id, flags, MULTIFD_FLAG_NOCOMP);
> +return -1;
> +}
> +return qio_channel_readv_all(p->c, p->pages->iov, used, errp);
> +}
> +
> +static MultiFDMethods multifd_nocomp_ops = {
> +.send_setup = nocomp_send_setup,
> +.send_clea

Re: [PATCH v12 Kernel 1/7] vfio: KABI for migration interface for device state

2020-02-14 Thread Cornelia Huck
On Sat, 8 Feb 2020 01:12:28 +0530
Kirti Wankhede  wrote:

(...)

Minor wording nits:

> +/*
> + * Structure vfio_device_migration_info is placed at 0th offset of

"...at the 0th offset..."

> + * VFIO_REGION_SUBTYPE_MIGRATION region to get/set VFIO device related 
> migration
> + * information. Field accesses from this structure are only supported at 
> their
> + * native width and alignment, otherwise the result is undefined and vendor
> + * drivers should return an error.
> + *
> + * device_state: (read/write)
> + *  - User application writes this field to inform vendor driver about 
> the

I'd probably add a definitive article before "user application",
"vendor driver", etc. Not sure if it's too much churn.

> + *device state to be transitioned to.
> + *  - Vendor driver should take necessary actions to change device state.
> + *On successful transition to given state, vendor driver should 
> return
> + *success on write(device_state, state) system call. If device state
> + *transition fails, vendor driver should return error, -EFAULT.
> + *  - On user application side, if device state transition fails, i.e. if
> + *write(device_state, state) returns error, read device_state again 
> to
> + *determine the current state of the device from vendor driver.
> + *  - Vendor driver should return previous state of the device unless 
> vendor
> + *driver has encountered an internal error, in which case vendor 
> driver
> + *may report the device_state VFIO_DEVICE_STATE_ERROR.
> + *   - User application must use the device reset ioctl in order to recover
> + * the device from VFIO_DEVICE_STATE_ERROR state. If the device is
> + * indicated in a valid device state via reading device_state, the user
> + * application may decide attempt to transition the device to any valid
> + * state reachable from the current state or terminate itself.
> + *
> + *  device_state consists of 3 bits:
> + *  - If bit 0 set, indicates _RUNNING state. When it's clear, that
> + * indicates _STOP state. When device is changed to _STOP, driver should
> + * stop device before write() returns.

"If set, bit 0 indicates _RUNNING state. If unset, it indicates _STOP
state. When the device is changed to _STOP state, the driver should
stop the device before write() returns."

?

> + *  - If bit 1 set, indicates _SAVING state. When set, that indicates 
> driver
> + *should start gathering device state information which will be 
> provided
> + *to VFIO user application to save device's state.

"If set, bit 1 indicates _SAVING state. When it is set, the driver
should start to gather the device state information that will be
provided to the VFIO user application to save the device's state."

?

> + *  - If bit 2 set, indicates _RESUMING state. When set, that indicates
> + *prepare to resume device, data provided through migration region
> + *should be used to resume device.

"If set, bit 2 indicates _RESUMING state. When it is set, the driver
should prepare to resume the device, using the data provided via the
migration region."

?

> + *  Bits 3 - 31 are reserved for future use. In order to preserve them,
> + *   user application should perform read-modify-write operation on this
> + *   field when modifying the specified bits.

"In order to preserve them, the user application should use a
read-modify-write operation on the device_state field when modifying
the state."

?


(...)




Re: [PULL 0/1] Vga 20200213 patches

2020-02-14 Thread Peter Maydell
On Thu, 13 Feb 2020 at 09:07, Gerd Hoffmann  wrote:
>
> The following changes since commit e18e5501d8ac692d32657a3e1ef545b14e72b730:
>
>   Merge remote-tracking branch 
> 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200210' into staging 
> (2020-02-10 18:09:14 +)
>
> are available in the Git repository at:
>
>   git://git.kraxel.org/qemu tags/vga-20200213-pull-request
>
> for you to fetch changes up to ed71c09ffd6fbd01c2a487d47291ae57b08671ea:
>
>   qxl: introduce hardware revision 5 (2020-02-13 08:31:40 +0100)
>
> 
> qxl: introduce hardware revision 5
>
> 
>
> Gerd Hoffmann (1):
>   qxl: introduce hardware revision 5


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0
for any user-visible changes.

-- PMM



Re: [RFC 2/2] pci-expender-bus:Add pcie-root-port to pxb-pcie under arm.

2020-02-14 Thread Daniel P . Berrangé
On Fri, Feb 14, 2020 at 07:25:43AM +, miaoyubo wrote:
> 
> > -Original Message-
> > From: Daniel P. Berrangé [mailto:berra...@redhat.com]
> > Sent: Thursday, February 13, 2020 9:52 PM
> > To: miaoyubo 
> > Cc: peter.mayd...@linaro.org; shannon.zha...@gmail.com;
> > imamm...@redhat.com; qemu-devel@nongnu.org; Xiexiangyou
> > ; m...@redhat.com
> > Subject: Re: [RFC 2/2] pci-expender-bus:Add pcie-root-port to pxb-pcie
> > under arm.
> > 
> > On Thu, Feb 13, 2020 at 03:49:52PM +0800, Yubo Miao wrote:
> > > From: miaoyubo 
> > >
> > > Since devices could not directly plugged into pxb-pcie, under arm, one
> > > pcie-root port is plugged into pxb-pcie. Due to the bus for each
> > > pxb-pcie is defined as 2 in acpi dsdt tables(one for pxb-pcie, one for
> > > pcie-root-port), only one device could be plugged into one pxb-pcie.
> > 
> > What is the cause of this arm specific requirement for pxb-pcie and more
> > importantly can be fix it so that we don't need this patch ?
> > I think it is highly undesirable to have such a per-arch difference in
> > configuration of the pxb-pcie device. It means any mgmt app which already
> > supports pxb-pcie will be broken and need to special case arm.
> > 
> 
> Thanks for your reply, Without this patch, the pxb-pcie is also useable, 
> however, one extra pcie-root-port or pci-bridge or something else need 
> to be defined by mgmt. app. This patch will could be abandoned.

That's not really answering my question. IIUC, this pxb-pcie device
works fine on x86_64, and I want to know why it doesn't work on arm ?
Requiring different setups by the mgmt apps is not at all nice because
it will inevitably lead to broken arm setups. x86_64 gets far more testing
& usage, developers won't realize arm is different.



Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread Dr. David Alan Gilbert
* David Hildenbrand (da...@redhat.com) wrote:
> Resizing while migrating is dangerous and does not work as expected.
> The whole migration code works on the usable_length of ram blocks and does
> not expect this to change at random points in time.
> 
> Precopy: The ram block size must not change on the source, after
> ram_save_setup(), so as long as the guest is still running on the source.
> 
> Postcopy: The ram block size must not change on the target, after
> synchronizing the RAM block list (ram_load_precopy()).
> 
> AFAIKS, resizing can be trigger *after* (but not during) a reset in
> ACPI code by the guest
> - hw/arm/virt-acpi-build.c:acpi_ram_update()
> - hw/i386/acpi-build.c:acpi_ram_update()
> 
> I see no easy way to work around this. Fail hard instead of failing
> somewhere in migration code due to strange other reasons. AFAIKs, the
> rebuilts will be triggered during reboot, so this should not affect
> running guests, but only guests that reboot at a very bad time and
> actually require size changes.
> 
> Let's further limit the impact by checking if an actual resize of the
> RAM (in number of pages) is required.
> 
> Don't perform the checks in qemu_ram_resize(), as that's called during
> migration when syncing the used_length. Update documentation.

Interesting; we need to do something about this - but banning resets
during migration is a bit harsh; and aborting the source VM is really
nasty - for a precopy especially we shouldn't kill the source VM,
we should just abort the migration.

The other thing that worries me is that acpi_build_update calls
   acpi_ram_update->memory_region_ram_resize
multiple times.
So, it might be that the size you end up with at the end of
acpi_build_update is actually the same size as the original - so
the net effect is the RAMBlock didn't really get resized.

Dave


> Cc: "Dr. David Alan Gilbert" 
> Cc: Eduardo Habkost 
> Cc: Paolo Bonzini 
> Cc: Igor Mammedov 
> Cc: "Michael S. Tsirkin" 
> Cc: Richard Henderson 
> Cc: Shannon Zhao 
> Cc: Alex Bennée 
> Cc: Shameerali Kolothum Thodi 
> Cc: Juan Quintela 
> Signed-off-by: David Hildenbrand 
> ---
> 
> Any idea how to avoid killing the guest? Anything obvious I am missing?
> 
> ---
>  exec.c|  6 --
>  include/exec/memory.h | 11 +++
>  memory.c  | 12 
>  3 files changed, 23 insertions(+), 6 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 67e520d18e..faa6708414 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -2116,8 +2116,10 @@ static int memory_try_enable_merging(void *addr, 
> size_t len)
>  return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
>  }
>  
> -/* Only legal before guest might have detected the memory size: e.g. on
> - * incoming migration, or right after reset.
> +/*
> + * RAM must not be resized while migration is active (except from migration
> + * code). Care has to be taken if the guest might have already detected
> + * the memory.
>   *
>   * As memory core doesn't know how is memory accessed, it is up to
>   * resize callback to update device state and/or add assertions to detect
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index e85b7de99a..1e5bfbe805 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -113,7 +113,8 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>  #define RAM_SHARED (1 << 1)
>  
>  /* Only a portion of RAM (used_length) is actually used, and migrated.
> - * This used_length size can change across reboots.
> + * RAM must not be resized while migration is active (except from migration
> + * code).
>   */
>  #define RAM_RESIZEABLE (1 << 2)
>  
> @@ -843,7 +844,8 @@ void memory_region_init_ram_shared_nomigrate(MemoryRegion 
> *mr,
>   * RAM.  Accesses into the region will
>   * modify memory directly.  Only an 
> initial
>   * portion of this RAM is actually used.
> - * The used size can change across 
> reboots.
> + * The size must not change while 
> migration
> + * is active.
>   *
>   * @mr: the #MemoryRegion to be initialized.
>   * @owner: the object that tracks the region's reference count
> @@ -1464,8 +1466,9 @@ void *memory_region_get_ram_ptr(MemoryRegion *mr);
>  
>  /* memory_region_ram_resize: Resize a RAM region.
>   *
> - * Only legal before guest might have detected the memory size: e.g. on
> - * incoming migration, or right after reset.
> + * RAM must not be resized while migration is active (except from migration
> + * code). Care has to be taken if the guest might have already detected
> + * the memory.
>   *
>   * @mr: a memory region created with @memory_region_init_resizeable_ram.
>   * @newsize: the new size the region
> diff --git a/memory.c b/memory.c
> index aeaa8dcc9e..7fa048aa3a 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -34,6 +34,7 @@
>  #incl

Re: [PATCH v5 6/8] configure: Enable test and libs for zstd

2020-02-14 Thread Daniel P . Berrangé
On Thu, Feb 13, 2020 at 10:08:59PM +0100, Juan Quintela wrote:
> Daniel P. Berrangé  wrote:
> > On Wed, Jan 29, 2020 at 12:56:53PM +0100, Juan Quintela wrote:
> >> Signed-off-by: Juan Quintela 
> >> Reviewed-by: Dr. David Alan Gilbert 
> >> ---
> >>  configure | 30 ++
> >>  1 file changed, 30 insertions(+)
> >
> > This is adding a new 3rd party library to QEMU that we've not previously
> > built and so isn't included in any of our CI platforms.
> 
> Ok.
> 
> Learning how one does that.
> 
> > This commit should be updating at least some of our CI platforms to
> > request the libzstd library installation to get CI coverage for the
> > latest patches in this series.
> 
> > Probably the docker files,
> 
> I added it in all debian/centos/fedora files that zlib-dev or xen-dev
> 
> > the VM installs for FreeBSD at least,
> 
> A fast google finds that library is called "zstd" and that it includes
> the includes (put intended)
> 
> tests/vm/freebsd
> 
> Once there, include it in fedora
> 
> > travis and
> 
> I added it to .travis.yml
> 
> > gitlab CI.
> 
> gitlab-ci.yml (just when we compile x86_64-softmmu)
> 
> I have something like this, but net real clue how to test that I haven't
> broken anything:
> 
> commit 59d8694dfcfc3d5ef36bc72a5c02bedaa3a6a6ec
> Author: Juan Quintela 
> Date:   Thu Feb 13 22:06:16 2020 +0100
> 
> Use zstd packages
> 
> Signed-off-by: Juan Quintela 
> 
> diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
> index c15e394f09..72f8b8aa51 100644
> --- a/.gitlab-ci.yml
> +++ b/.gitlab-ci.yml
> @@ -21,6 +21,7 @@ build-system2:
>   script:
>   - apt-get install -y -qq libsdl2-dev libgcrypt-dev libbrlapi-dev libaio-dev
>libfdt-dev liblzo2-dev librdmacm-dev libibverbs-dev libibumad-dev
> +  libzstd-dev
>   - mkdir build
>   - cd build
>   - ../configure --enable-werror --target-list="tricore-softmmu 
> unicore32-softmmu
> diff --git a/.travis.yml b/.travis.yml
> index 5887055951..dd17301f3b 100644
> --- a/.travis.yml
> +++ b/.travis.yml
> @@ -48,6 +48,7 @@ addons:
>- libusb-1.0-0-dev
>- libvdeplug-dev
>- libvte-2.91-dev
> +  - libzstd-dev
>- sparse
>- uuid-dev
>- gcovr
> diff --git a/tests/docker/dockerfiles/centos7.docker 
> b/tests/docker/dockerfiles/centos7.docker
> index 562d65be9e..cdd72de7eb 100644
> --- a/tests/docker/dockerfiles/centos7.docker
> +++ b/tests/docker/dockerfiles/centos7.docker
> @@ -33,6 +33,7 @@ ENV PACKAGES \
>  tar \
>  vte-devel \
>  xen-devel \
> -zlib-devel
> +zlib-devel \
> +libzstd-devel
>  RUN yum install -y $PACKAGES
>  RUN rpm -q $PACKAGES | sort > /packages.txt
> diff --git a/tests/docker/dockerfiles/fedora-i386-cross.docker 
> b/tests/docker/dockerfiles/fedora-i386-cross.docker
> index 9106cf9ebe..cd16cd1bfa 100644
> --- a/tests/docker/dockerfiles/fedora-i386-cross.docker
> +++ b/tests/docker/dockerfiles/fedora-i386-cross.docker
> @@ -7,7 +7,8 @@ ENV PACKAGES \
>  gnutls-devel.i686 \
>  nettle-devel.i686 \
>  pixman-devel.i686 \
> -zlib-devel.i686
> +zlib-devel.i686 \
> +libzstd-devel.i686
>  
>  RUN dnf install -y $PACKAGES
>  RUN rpm -q $PACKAGES | sort > /packages.txt
> diff --git a/tests/docker/dockerfiles/fedora.docker 
> b/tests/docker/dockerfiles/fedora.docker
> index 987a3c170a..a658c0 100644
> --- a/tests/docker/dockerfiles/fedora.docker
> +++ b/tests/docker/dockerfiles/fedora.docker
> @@ -92,7 +92,8 @@ ENV PACKAGES \
>  vte291-devel \
>  which \
>  xen-devel \
> -zlib-devel
> +zlib-devel \
> +libzstd-devel
>  ENV QEMU_CONFIGURE_OPTS --python=/usr/bin/python3
>  
>  RUN dnf install -y $PACKAGES
> diff --git a/tests/docker/dockerfiles/ubuntu.docker 
> b/tests/docker/dockerfiles/ubuntu.docker
> index 4177f33691..b6c7b41ddd 100644
> --- a/tests/docker/dockerfiles/ubuntu.docker
> +++ b/tests/docker/dockerfiles/ubuntu.docker
> @@ -58,6 +58,7 @@ ENV PACKAGES flex bison \
>  libvdeplug-dev \
>  libvte-2.91-dev \
>  libxen-dev \
> +libzstd-dev \
>  make \
>  python3-yaml \
>  python3-sphinx \
> diff --git a/tests/docker/dockerfiles/ubuntu1804.docker 
> b/tests/docker/dockerfiles/ubuntu1804.docker
> index 0766f94cf4..1efedeef99 100644
> --- a/tests/docker/dockerfiles/ubuntu1804.docker
> +++ b/tests/docker/dockerfiles/ubuntu1804.docker
> @@ -44,6 +44,7 @@ ENV PACKAGES flex bison \
>  libvdeplug-dev \
>  libvte-2.91-dev \
>  libxen-dev \
> +libzstd-dev \
>  make \
>  python3-yaml \
>  python3-sphinx \
> diff --git a/tests/vm/fedora b/tests/vm/fedora
> index 4d7d6049f4..4843b4175e 100755
> --- a/tests/vm/fedora
> +++ b/tests/vm/fedora
> @@ -53,7 +53,10 @@ class FedoraVM(basevm.BaseVM):
>  # libs: audio
>  '"pkgconfig(libpulse)"',
>  '"pkgconfig(alsa)"',
> -]
> +
> +# libs: migration
> +'"pkgconfig(libzstd)"',
> +]
>  
>  BUILD_SCRIPT = """
>  set -e;
> diff --git a/tests/vm/freebsd b/

Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand
On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (da...@redhat.com) wrote:
>> Resizing while migrating is dangerous and does not work as expected.
>> The whole migration code works on the usable_length of ram blocks and does
>> not expect this to change at random points in time.
>>
>> Precopy: The ram block size must not change on the source, after
>> ram_save_setup(), so as long as the guest is still running on the source.
>>
>> Postcopy: The ram block size must not change on the target, after
>> synchronizing the RAM block list (ram_load_precopy()).
>>
>> AFAIKS, resizing can be trigger *after* (but not during) a reset in
>> ACPI code by the guest
>> - hw/arm/virt-acpi-build.c:acpi_ram_update()
>> - hw/i386/acpi-build.c:acpi_ram_update()
>>
>> I see no easy way to work around this. Fail hard instead of failing
>> somewhere in migration code due to strange other reasons. AFAIKs, the
>> rebuilts will be triggered during reboot, so this should not affect
>> running guests, but only guests that reboot at a very bad time and
>> actually require size changes.
>>
>> Let's further limit the impact by checking if an actual resize of the
>> RAM (in number of pages) is required.
>>
>> Don't perform the checks in qemu_ram_resize(), as that's called during
>> migration when syncing the used_length. Update documentation.
> 
> Interesting; we need to do something about this - but banning resets
> during migration is a bit harsh; and aborting the source VM is really
> nasty - for a precopy especially we shouldn't kill the source VM,
> we should just abort the migration.

Any alternative, easy solutions to handle this? I do wonder how often
this will actually trigger in real life.

> 
> The other thing that worries me is that acpi_build_update calls
>acpi_ram_update->memory_region_ram_resize
> multiple times.

It's different memory regions, no? table_mr, rsdp_mr, linker_mr.

> So, it might be that the size you end up with at the end of
> acpi_build_update is actually the same size as the original - so
> the net effect is the RAMBlock didn't really get resized.

Are you sure?


-- 
Thanks,

David / dhildenb




Re: [PATCH v2] Report stringified errno in VFIO related errors

2020-02-14 Thread Cornelia Huck
On Fri, 14 Feb 2020 10:55:19 +0100
Michal Privoznik  wrote:

> In a few places we report errno formatted as a negative integer.
> This is not as user friendly as it can be. Use strerror() and/or
> error_setg_errno() instead.
> 
> Signed-off-by: Michal Privoznik 
> ---
> 
> v1 posted here:
> 
> https://lists.nongnu.org/archive/html/qemu-devel/2020-02/msg03623.html
> 
> diff to v1:
>  - Change error reporting in vfio_dma_unmap() too as I missed it in v1.
> 
>  hw/vfio/common.c| 4 ++--
>  util/vfio-helpers.c | 6 +++---
>  2 files changed, 5 insertions(+), 5 deletions(-)

Reviewed-by: Cornelia Huck 




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread Dr. David Alan Gilbert
* David Hildenbrand (da...@redhat.com) wrote:
> On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
> > * David Hildenbrand (da...@redhat.com) wrote:
> >> Resizing while migrating is dangerous and does not work as expected.
> >> The whole migration code works on the usable_length of ram blocks and does
> >> not expect this to change at random points in time.
> >>
> >> Precopy: The ram block size must not change on the source, after
> >> ram_save_setup(), so as long as the guest is still running on the source.
> >>
> >> Postcopy: The ram block size must not change on the target, after
> >> synchronizing the RAM block list (ram_load_precopy()).
> >>
> >> AFAIKS, resizing can be trigger *after* (but not during) a reset in
> >> ACPI code by the guest
> >> - hw/arm/virt-acpi-build.c:acpi_ram_update()
> >> - hw/i386/acpi-build.c:acpi_ram_update()
> >>
> >> I see no easy way to work around this. Fail hard instead of failing
> >> somewhere in migration code due to strange other reasons. AFAIKs, the
> >> rebuilts will be triggered during reboot, so this should not affect
> >> running guests, but only guests that reboot at a very bad time and
> >> actually require size changes.
> >>
> >> Let's further limit the impact by checking if an actual resize of the
> >> RAM (in number of pages) is required.
> >>
> >> Don't perform the checks in qemu_ram_resize(), as that's called during
> >> migration when syncing the used_length. Update documentation.
> > 
> > Interesting; we need to do something about this - but banning resets
> > during migration is a bit harsh; and aborting the source VM is really
> > nasty - for a precopy especially we shouldn't kill the source VM,
> > we should just abort the migration.
> 
> Any alternative, easy solutions to handle this? I do wonder how often
> this will actually trigger in real life.

Well it's not that hard to abort a migration (I'm not sure we've got a
convenient wrapper to do it - but it shouldn't be hard to add).

> > 
> > The other thing that worries me is that acpi_build_update calls
> >acpi_ram_update->memory_region_ram_resize
> > multiple times.
> 
> It's different memory regions, no? table_mr, rsdp_mr, linker_mr.

Oh, so it is.

> > So, it might be that the size you end up with at the end of
> > acpi_build_update is actually the same size as the original - so
> > the net effect is the RAMBlock didn't really get resized.
> 
> Are you sure?

No!
Avocado has a migration+reset test, so it's worth trying.
Certainly in a cloud setup migrations happen often and no one knows
what the guest is doing;  aborting the source isn't acceptable.

It surprises me a bit that the region sizes would change due to guest
actions - I thought they were determined by the set of virtual hardware;
not sure if a hot-unplug/plug followed by reset would trigger it or not.

Dave

> 
> -- 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand
On 14.02.20 11:42, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (da...@redhat.com) wrote:
>> On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
>>> * David Hildenbrand (da...@redhat.com) wrote:
 Resizing while migrating is dangerous and does not work as expected.
 The whole migration code works on the usable_length of ram blocks and does
 not expect this to change at random points in time.

 Precopy: The ram block size must not change on the source, after
 ram_save_setup(), so as long as the guest is still running on the source.

 Postcopy: The ram block size must not change on the target, after
 synchronizing the RAM block list (ram_load_precopy()).

 AFAIKS, resizing can be trigger *after* (but not during) a reset in
 ACPI code by the guest
 - hw/arm/virt-acpi-build.c:acpi_ram_update()
 - hw/i386/acpi-build.c:acpi_ram_update()

 I see no easy way to work around this. Fail hard instead of failing
 somewhere in migration code due to strange other reasons. AFAIKs, the
 rebuilts will be triggered during reboot, so this should not affect
 running guests, but only guests that reboot at a very bad time and
 actually require size changes.

 Let's further limit the impact by checking if an actual resize of the
 RAM (in number of pages) is required.

 Don't perform the checks in qemu_ram_resize(), as that's called during
 migration when syncing the used_length. Update documentation.
>>>
>>> Interesting; we need to do something about this - but banning resets
>>> during migration is a bit harsh; and aborting the source VM is really
>>> nasty - for a precopy especially we shouldn't kill the source VM,
>>> we should just abort the migration.
>>
>> Any alternative, easy solutions to handle this? I do wonder how often
>> this will actually trigger in real life.
> 
> Well it's not that hard to abort a migration (I'm not sure we've got a
> convenient wrapper to do it - but it shouldn't be hard to add).
> 

We do have qmp_migrate_cancel(). I hope that can be called under BQL.

Can that be called in both, precopy and postcopy case? I assume in the
precopy, it's easy.

-- 
Thanks,

David / dhildenb




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread Dr. David Alan Gilbert
* David Hildenbrand (da...@redhat.com) wrote:
> On 14.02.20 11:42, Dr. David Alan Gilbert wrote:
> > * David Hildenbrand (da...@redhat.com) wrote:
> >> On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
> >>> * David Hildenbrand (da...@redhat.com) wrote:
>  Resizing while migrating is dangerous and does not work as expected.
>  The whole migration code works on the usable_length of ram blocks and 
>  does
>  not expect this to change at random points in time.
> 
>  Precopy: The ram block size must not change on the source, after
>  ram_save_setup(), so as long as the guest is still running on the source.
> 
>  Postcopy: The ram block size must not change on the target, after
>  synchronizing the RAM block list (ram_load_precopy()).
> 
>  AFAIKS, resizing can be trigger *after* (but not during) a reset in
>  ACPI code by the guest
>  - hw/arm/virt-acpi-build.c:acpi_ram_update()
>  - hw/i386/acpi-build.c:acpi_ram_update()
> 
>  I see no easy way to work around this. Fail hard instead of failing
>  somewhere in migration code due to strange other reasons. AFAIKs, the
>  rebuilts will be triggered during reboot, so this should not affect
>  running guests, but only guests that reboot at a very bad time and
>  actually require size changes.
> 
>  Let's further limit the impact by checking if an actual resize of the
>  RAM (in number of pages) is required.
> 
>  Don't perform the checks in qemu_ram_resize(), as that's called during
>  migration when syncing the used_length. Update documentation.
> >>>
> >>> Interesting; we need to do something about this - but banning resets
> >>> during migration is a bit harsh; and aborting the source VM is really
> >>> nasty - for a precopy especially we shouldn't kill the source VM,
> >>> we should just abort the migration.
> >>
> >> Any alternative, easy solutions to handle this? I do wonder how often
> >> this will actually trigger in real life.
> > 
> > Well it's not that hard to abort a migration (I'm not sure we've got a
> > convenient wrapper to do it - but it shouldn't be hard to add).
> > 
> 
> We do have qmp_migrate_cancel(). I hope that can be called under BQL.

Well it's a monitor command so I think so; although it's not really
designed for an error - it's a user action.  Doing a
migrate_set_error(..) followed by a qemu_file_shutdown is probably a
good bet.

> Can that be called in both, precopy and postcopy case? I assume in the
> precopy, it's easy.

The problem is during postcopy you're toast when that happens because
you can't restart; however, can this happen once we're actually in
postcopy?  It's a little different - if it happens before the transition
to postcopy then it's the same as precopy; if it happens afterwards..
well it's going to happen ont he destination side and that's quite
different.

Dave

> 
> -- 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand
On 14.02.20 12:02, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (da...@redhat.com) wrote:
>> On 14.02.20 11:42, Dr. David Alan Gilbert wrote:
>>> * David Hildenbrand (da...@redhat.com) wrote:
 On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (da...@redhat.com) wrote:
>> Resizing while migrating is dangerous and does not work as expected.
>> The whole migration code works on the usable_length of ram blocks and 
>> does
>> not expect this to change at random points in time.
>>
>> Precopy: The ram block size must not change on the source, after
>> ram_save_setup(), so as long as the guest is still running on the source.
>>
>> Postcopy: The ram block size must not change on the target, after
>> synchronizing the RAM block list (ram_load_precopy()).
>>
>> AFAIKS, resizing can be trigger *after* (but not during) a reset in
>> ACPI code by the guest
>> - hw/arm/virt-acpi-build.c:acpi_ram_update()
>> - hw/i386/acpi-build.c:acpi_ram_update()
>>
>> I see no easy way to work around this. Fail hard instead of failing
>> somewhere in migration code due to strange other reasons. AFAIKs, the
>> rebuilts will be triggered during reboot, so this should not affect
>> running guests, but only guests that reboot at a very bad time and
>> actually require size changes.
>>
>> Let's further limit the impact by checking if an actual resize of the
>> RAM (in number of pages) is required.
>>
>> Don't perform the checks in qemu_ram_resize(), as that's called during
>> migration when syncing the used_length. Update documentation.
>
> Interesting; we need to do something about this - but banning resets
> during migration is a bit harsh; and aborting the source VM is really
> nasty - for a precopy especially we shouldn't kill the source VM,
> we should just abort the migration.

 Any alternative, easy solutions to handle this? I do wonder how often
 this will actually trigger in real life.
>>>
>>> Well it's not that hard to abort a migration (I'm not sure we've got a
>>> convenient wrapper to do it - but it shouldn't be hard to add).
>>>
>>
>> We do have qmp_migrate_cancel(). I hope that can be called under BQL.
> 
> Well it's a monitor command so I think so; although it's not really
> designed for an error - it's a user action.  Doing a
> migrate_set_error(..) followed by a qemu_file_shutdown is probably a
> good bet.

I'll base on "[PATCH v2 fixed 00/16] Ram blocks with resizable anonymous
allocations under POSIX", where I extend the ram block notifier with a
resize notification.

migrate/ram.c can register the notifier and react accordingly. E.g., for
precopy, abort migration. Not sure about postcopy (below).

> 
>> Can that be called in both, precopy and postcopy case? I assume in the
>> precopy, it's easy.
> 
> The problem is during postcopy you're toast when that happens because
> you can't restart; however, can this happen once we're actually in
> postcopy?  It's a little different - if it happens before the transition
> to postcopy then it's the same as precopy; if it happens afterwards..
> well it's going to happen ont he destination side and that's quite
> different.

If it happens after, we are in trouble at least with received bitmaps.
Not sure about other issues (it's a lot of code :) ). Especially
shrinking while trying to place pages will be bad and fail. It's code
that assumes used_length won't change.

ramblock_recv_bitmap_send() on the target and ram_dirty_bitmap_reload()
on the source. ram_dirty_bitmap_reload() will bail out if the sizes
don't match.

-- 
Thanks,

David / dhildenb




Re: [PATCH v2] Report stringified errno in VFIO related errors

2020-02-14 Thread Auger Eric
Hi Michal,

On 2/14/20 10:55 AM, Michal Privoznik wrote:
> In a few places we report errno formatted as a negative integer.
> This is not as user friendly as it can be. Use strerror() and/or
> error_setg_errno() instead.
> 
> Signed-off-by: Michal Privoznik 
Reviewed-by: Eric Auger 

Thanks

Eric
> ---
> 
> v1 posted here:
> 
> https://lists.nongnu.org/archive/html/qemu-devel/2020-02/msg03623.html
> 
> diff to v1:
>  - Change error reporting in vfio_dma_unmap() too as I missed it in v1.
> 
>  hw/vfio/common.c| 4 ++--
>  util/vfio-helpers.c | 6 +++---
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 5ca11488d6..0b3593b3c0 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -319,7 +319,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
>  unmap.size -= 1ULL << ctz64(container->pgsizes);
>  continue;
>  }
> -error_report("VFIO_UNMAP_DMA: %d", -errno);
> +error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
>  return -errno;
>  }
>  
> @@ -352,7 +352,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
> iova,
>  return 0;
>  }
>  
> -error_report("VFIO_MAP_DMA: %d", -errno);
> +error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
>  return -errno;
>  }
>  
> diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
> index 813f7ec564..ddd9a96e76 100644
> --- a/util/vfio-helpers.c
> +++ b/util/vfio-helpers.c
> @@ -545,7 +545,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void 
> *host, size_t size,
>  trace_qemu_vfio_do_mapping(s, host, size, iova);
>  
>  if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
> -error_report("VFIO_MAP_DMA: %d", -errno);
> +error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
>  return -errno;
>  }
>  return 0;
> @@ -570,7 +570,7 @@ static void qemu_vfio_undo_mapping(QEMUVFIOState *s, 
> IOVAMapping *mapping,
>  assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
>  assert(index >= 0 && index < s->nr_mappings);
>  if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> -error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
> +error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
>  }
>  memmove(mapping, &s->mappings[index + 1],
>  sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
> @@ -669,7 +669,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
>  trace_qemu_vfio_dma_reset_temporary(s);
>  qemu_mutex_lock(&s->lock);
>  if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> -error_report("VFIO_UNMAP_DMA: %d", -errno);
> +error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
>  qemu_mutex_unlock(&s->lock);
>  return -errno;
>  }
> 




Re: [PATCH v4 03/20] target/arm: Add isar_feature tests for PAN + ATS1E1

2020-02-14 Thread Peter Maydell
On Sat, 8 Feb 2020 at 12:58, Richard Henderson
 wrote:
>
> Include definitions for all of the bits in ID_MMFR3.
> We already have a definition for ID_AA64MMFR1.PAN.
>
> Reviewed-by: Alex Bennée 
> Reviewed-by: Peter Maydell 
> Signed-off-by: Richard Henderson 


> @@ -3443,6 +3452,16 @@ static inline bool isar_feature_aa32_vminmaxnm(const 
> ARMISARegisters *id)
>  return FIELD_EX64(id->mvfr2, MVFR2, FPMISC) >= 4;
>  }
>
> +static inline bool isar_feature_aa32_pan(const ARMISARegisters *id)
> +{
> +return FIELD_EX64(id->mvfr0, ID_MMFR3, PAN) != 0;
> +}
> +
> +static inline bool isar_feature_aa32_ats1e1(const ARMISARegisters *id)
> +{
> +return FIELD_EX64(id->mvfr0, ID_MMFR3, PAN) >= 2;
> +}

Didn't spot this before it hit master, but these feature
test functions are looking at id->mvfr0, which is MVFR0, not
MMFR3 !

Also they're using FIELD_EX64 on a 32-bit register: is there
a reason for that?

thanks
-- PMM



Re: Mapping between Host virtual address and guest physical address

2020-02-14 Thread Dr. David Alan Gilbert
* Muhui Jiang (jiangmu...@gmail.com) wrote:
> Dear All
> 
> I am recently using qemu-system-arm to boot a linux uImage.
> 
> I would like to do some dynamic instrumentation on the uncompressed kernel.
> It seems that I need to focus on two key points.
> 
> Firstly, I need to know when the kernel is uncompressed, which means the
> compression process is finished. By analyzing the vmlinux.elf  and the
> trace I can figure it out.
> 
> Secondly, I need to know where the uncompressed is, which means where the
> uncompressed linux kernel (Before translated into tcg) stores in the
> virtual address of qemu. Does anyone have ideas? Many Thanks

There's are some existing HMP monitor commands for this type of
debugging:

gpa2hpa addr -- print the host physical address corresponding to a guest 
physical address
gpa2hva addr -- print the host virtual address corresponding to a guest 
physical address
gva2gpa addr -- print the guest physical address corresponding to a guest 
virtual address

so I think you're saying you want gpa2hva

Dave

> Regards
> Muhui
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH] migration/throttle: Make throttle slower at tail stage

2020-02-14 Thread Eric Blake

On 2/13/20 9:27 PM, Keqian Zhu wrote:

At the tail stage of throttle, VM is very sensitive to
CPU percentage. We just throttle 30% of remaining CPU
when throttle is more than 80 percentage.

This doesn't conflict with cpu_throttle_increment.

This may make migration time longer, and is disabled
by default.

Signed-off-by: Keqian Zhu 
---
Cc: Juan Quintela 
Cc: "Dr. David Alan Gilbert" 
Cc: Eric Blake 
Cc: Markus Armbruster 



+++ b/qapi/migration.json
@@ -532,6 +532,12 @@
  #  auto-converge detects that migration is not making
  #  progress. The default value is 10. (Since 2.7)
  #
+# @cpu-throttle-tailslow: Make throttle slower at tail stage
+# At the tail stage of throttle, VM is very sensitive 
to
+# CPU percentage. We just throttle 30% of remaining CPU
+# when throttle is more than 80 percentage. The default
+# value is false. (Since 4.1)


The next release is 5.0, not 4.1.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 1/2] dp264: use pci_create() to initialise the cmd646 device

2020-02-14 Thread Philippe Mathieu-Daudé
Hi Mark,

On Fri, Feb 14, 2020 at 9:48 AM Mark Cave-Ayland
 wrote:
>
> Remove the call to pci_cmd646_ide_init() since global device init functions
> are deprecated in preference of using qdev directly.
>
> Signed-off-by: Mark Cave-Ayland 
> ---
>  hw/alpha/dp264.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/hw/alpha/dp264.c b/hw/alpha/dp264.c
> index a8f9a89cc4..e91989bf9a 100644
> --- a/hw/alpha/dp264.c
> +++ b/hw/alpha/dp264.c
> @@ -16,6 +16,7 @@
>  #include "sysemu/sysemu.h"
>  #include "hw/rtc/mc146818rtc.h"
>  #include "hw/ide.h"
> +#include "hw/ide/pci.h"
>  #include "hw/timer/i8254.h"
>  #include "hw/isa/superio.h"
>  #include "hw/dma/i8257.h"
> @@ -100,9 +101,14 @@ static void clipper_init(MachineState *machine)
>  /* IDE disk setup.  */
>  {
>  DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
> +PCIDevice *pci_dev;
> +
>  ide_drive_get(hd, ARRAY_SIZE(hd));
>
> -pci_cmd646_ide_init(pci_bus, hd, 0);
> +pci_dev = pci_create(pci_bus, -1, "cmd646-ide");

Not this patch problem, but it would be nice to have a TYPE_CMD646_IDE.

> +qdev_prop_set_uint32(DEVICE(pci_dev), "secondary", 0);

Secondary_ide disabled is the default in cmd646_ide_properties[], can
we avoid this call?

> +qdev_init_nofail(DEVICE(pci_dev));
> +pci_ide_create_devs(pci_dev, hd);
>  }
>
>  /* Load PALcode.  Given that this is not "real" cpu palcode,
> --
> 2.20.1
>
>




Re: [PATCH 3/3] spapr: Migrate SpaprDrc::unplug_requested

2020-02-14 Thread Greg Kurz
On Fri, 14 Feb 2020 13:29:00 +1100
David Gibson  wrote:

> On Mon, Feb 03, 2020 at 11:36:22PM +0100, Greg Kurz wrote:
> > Hot unplugging a device is an asynchronous operation. If the guest is
> > migrated after the event was sent but before it could release the
> > device with RTAS, the destination QEMU doesn't know about the pending
> > unplug operation and doesn't actually remove the device when the guest
> > finally releases it. The device
> > 
> > Migrate SpaprDrc::unplug_requested to fix the inconsistency. This is
> > done with a subsection that is only sent if an unplug request is
> > pending. This allows to preserve migration with older guests in the
> > case of a pending hotplug request. This will cause migration to fail
> > if the destination can't handle the subsection, but this is better
> > than ending with an inconsistency.
> > 
> > Signed-off-by: Greg Kurz 
> > ---
> >  hw/ppc/spapr_drc.c |   27 +--
> >  1 file changed, 25 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> > index d512ac6e1e7f..6f5cab70fc6b 100644
> > --- a/hw/ppc/spapr_drc.c
> > +++ b/hw/ppc/spapr_drc.c
> > @@ -455,6 +455,22 @@ void spapr_drc_reset(SpaprDrc *drc)
> >  }
> >  }
> >  
> > +static bool spapr_drc_unplug_requested_needed(void *opaque)
> > +{
> > +return spapr_drc_unplug_requested(opaque);
> > +}
> > +
> > +static const VMStateDescription vmstate_spapr_drc_unplug_requested = {
> > +.name = "spapr_drc/unplug_requested",
> > +.version_id = 1,
> > +.minimum_version_id = 1,
> > +.needed = spapr_drc_unplug_requested_needed,
> > +.fields  = (VMStateField []) {
> > +VMSTATE_BOOL(unplug_requested, SpaprDrc),
> > +VMSTATE_END_OF_LIST()
> > +}
> > +};
> > +
> >  static bool spapr_drc_needed(void *opaque)
> >  {
> >  SpaprDrc *drc = (SpaprDrc *)opaque;
> > @@ -467,8 +483,11 @@ static bool spapr_drc_needed(void *opaque)
> >  /*
> >   * We need to migrate the state if it's not equal to the expected
> >   * long-term state, which is the same as the coldplugged initial
> > - * state */
> > -return !spapr_drc_device_ready(drc);
> > + * state, or if an unplug request is pending.
> > + */
> > +return
> > +spapr_drc_unplug_requested_needed(drc) ||
> > +!spapr_drc_device_ready(drc);
> 
> Hrm.  You start the series by splitting spapr_drc_device_ready() from
> spapr_drc_needed().  But at this point, I'm pretty sure you've now got
> all the callers of spapr_drc_device_ready() doing equivalent logic
> about them, so they might as well be one function again.  That seems
> pretty roundabout.
> 

Yeah... I did the split because an earlier draft of this series had
a separate path at some point for the plug and unplug cases... but
I agree these should be reunited.

> I don't think the rationale for not using the drc_ready function from
> the CAS path really makes sense.  It's not just an accident that those
> use the same logic - in both cases what we're testing is "Is the DRC
> in a state other than that of a default cold-plugged device?".
>

"Is the DRC in a state other than that of a default cold-plugged device
or is an unplug request pending ?" since the DRC of the device to be
unplugged only transitions away from the "ready state" when the guest
asks to isolate the device with the "set-indicator" RTAS call.

> Changing the name might be sensible, but I still think we want a
> common function for the two cases.
> 

I'll go for that. Maybe reverse the semantics, like if "the DRC has
no attached device or it has an attached device without pending unplug
request" then it is in a steady state that doesn't require anything
special at CAS or migration time, eg. spapr_drc_steady() ?

> >  }
> >  
> >  static const VMStateDescription vmstate_spapr_drc = {
> > @@ -479,6 +498,10 @@ static const VMStateDescription vmstate_spapr_drc = {
> >  .fields  = (VMStateField []) {
> >  VMSTATE_UINT32(state, SpaprDrc),
> >  VMSTATE_END_OF_LIST()
> > +},
> > +.subsections = (const VMStateDescription * []) {
> > +&vmstate_spapr_drc_unplug_requested,
> > +NULL
> >  }
> >  };
> >  
> > 
> 



pgpnc6bQufgoa.pgp
Description: OpenPGP digital signature


Re: [PATCH] Report stringified errno in VFIO related errors

2020-02-14 Thread Philippe Mathieu-Daudé
On Fri, Feb 14, 2020 at 10:20 AM Michal Privoznik  wrote:
> On 2/14/20 9:47 AM, Michal Privoznik wrote:
> > In a few places we report errno formatted as a negative integer.
> > This is not as user friendly as it can be. Use strerror() and/or
> > error_setg_errno() instead.
> >
> > Signed-off-by: Michal Privoznik 
> > ---
> >   hw/vfio/common.c| 2 +-
> >   util/vfio-helpers.c | 6 +++---
> >   2 files changed, 4 insertions(+), 4 deletions(-)
> >
>
> BTW the reason I've noticed these is because I'm seeing some errors when
> assigning my NVMe disk to qemu. This is the full command line:
>
>
> /home/zippy/work/qemu/qemu.git/x86_64-softmmu/qemu-system-x86_64 \
> -name guest=fedora,debug-threads=on \
> -S \
> -object
> secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-2-fedora/master-key.aes
> \
> -machine pc-i440fx-4.1,accel=kvm,usb=off,dump-guest-core=off \
> -cpu host \
> -m size=4194304k,slots=16,maxmem=1099511627776k \
> -overcommit mem-lock=off \
> -smp 4,sockets=1,dies=1,cores=2,threads=2 \
> -object iothread,id=iothread1 \
> -object iothread,id=iothread2 \
> -object iothread,id=iothread3 \
> -object iothread,id=iothread4 \
> -mem-prealloc \
> -mem-path /hugepages2M/libvirt/qemu/2-fedora \
> -numa node,nodeid=0,cpus=0,mem=4096 \
> -uuid 63840878-0deb-4095-97e6-fc444d9bc9fa \
> -no-user-config \
> -nodefaults \
> -chardev socket,id=charmonitor,fd=31,server,nowait \
> -mon chardev=charmonitor,id=monitor,mode=control \
> -rtc base=utc \
> -no-shutdown \
> -global PIIX4_PM.disable_s3=0 \
> -global PIIX4_PM.disable_s4=0 \
> -boot menu=on,strict=on \
> -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \
> -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x4 \
> -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x5 \
> -blockdev
> '{"driver":"file","filename":"/var/lib/libvirt/images/fedora.qcow2","node-name":"libvirt-2-storage","auto-read-only":true,"discard":"unmap"}'
> \
> -blockdev
> '{"node-name":"libvirt-2-format","read-only":false,"discard":"unmap","driver":"qcow2","file":"libvirt-2-storage","backing":null}'
> \
> -device
> scsi-hd,bus=scsi0.0,channel=0,scsi-id=0,lun=0,device_id=drive-scsi0-0-0-0,drive=libvirt-2-format,id=scsi0-0-0-0,bootindex=1
> \
> -blockdev
> '{"driver":"nvme","device":":02:00.0","namespace":1,"node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"}'
> \
> -blockdev
> '{"node-name":"libvirt-1-format","read-only":false,"driver":"raw","file":"libvirt-1-storage"}'
> \
> -device
> virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=libvirt-1-format,id=virtio-disk0
> \
> -netdev tap,fd=33,id=hostnet0,vhost=on,vhostfd=34 \
> -device
> virtio-net-pci,host_mtu=9000,netdev=hostnet0,id=net0,mac=52:54:00:a4:6f:91,bus=pci.0,addr=0x3
> \
> -chardev pty,id=charserial0 \
> -device isa-serial,chardev=charserial0,id=serial0 \
> -chardev socket,id=charchannel0,fd=35,server,nowait \
> -device
> virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=org.qemu.guest_agent.0
> \
> -spice port=5900,addr=0.0.0.0,disable-ticketing,seamless-migration=on \
> -device virtio-vga,id=video0,virgl=on,max_outputs=1,bus=pci.0,addr=0x2 \
> -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 \
> -sandbox
> on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny \
> -msg timestamp=on
>
> And these are the errors I see:
>
> 2020-02-14T09:06:18.183167Z qemu-system-x86_64: VFIO_MAP_DMA failed:
> Invalid argument
> 2020-02-14T09:10:49.753767Z qemu-system-x86_64: VFIO_MAP_DMA failed:
> Cannot allocate memory
> 2020-02-14T09:11:04.530344Z qemu-system-x86_64: VFIO_MAP_DMA failed: No
> space left on device
> 2020-02-14T09:11:04.531087Z qemu-system-x86_64: VFIO_MAP_DMA failed: No
> space left on device
> 2020-02-14T09:11:04.531230Z qemu-system-x86_64: VFIO_MAP_DMA failed: No
> space left on device
>
>
> I'm doing nothing with the disk inside the guest, but:
>
># dd if=/dev/vda of=/dev/null status=progress
>
> (the disk appears as /dev/vda in the guest). Surprisingly, I do not see
> these errors when I use the traditional PCI assignment (-device
> vfio-pci). My versions of kernel and qemu:
>
> moe ~ # uname -r
> 5.4.15-gentoo
> moe ~ # /home/zippy/work/qemu/qemu.git/x86_64-softmmu/qemu-system-x86_64
> --version
> QEMU emulator version 4.2.50 (v4.2.0-1439-g5d6542bea7-dirty)
> Copyright (c) 2003-2019 Fabrice Bellard and the QEMU Project developers
>
> Do you guys want me to file a bug?

As you already have all the information, and it is a simple
copy/paste, I'd say "yes"




Re: [PATCH v2] Report stringified errno in VFIO related errors

2020-02-14 Thread Philippe Mathieu-Daudé
On Fri, Feb 14, 2020 at 10:56 AM Michal Privoznik  wrote:
>
> In a few places we report errno formatted as a negative integer.
> This is not as user friendly as it can be. Use strerror() and/or
> error_setg_errno() instead.
>
> Signed-off-by: Michal Privoznik 
> ---

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand
On 14.02.20 12:08, David Hildenbrand wrote:
> On 14.02.20 12:02, Dr. David Alan Gilbert wrote:
>> * David Hildenbrand (da...@redhat.com) wrote:
>>> On 14.02.20 11:42, Dr. David Alan Gilbert wrote:
 * David Hildenbrand (da...@redhat.com) wrote:
> On 14.02.20 11:25, Dr. David Alan Gilbert wrote:
>> * David Hildenbrand (da...@redhat.com) wrote:
>>> Resizing while migrating is dangerous and does not work as expected.
>>> The whole migration code works on the usable_length of ram blocks and 
>>> does
>>> not expect this to change at random points in time.
>>>
>>> Precopy: The ram block size must not change on the source, after
>>> ram_save_setup(), so as long as the guest is still running on the 
>>> source.
>>>
>>> Postcopy: The ram block size must not change on the target, after
>>> synchronizing the RAM block list (ram_load_precopy()).
>>>
>>> AFAIKS, resizing can be trigger *after* (but not during) a reset in
>>> ACPI code by the guest
>>> - hw/arm/virt-acpi-build.c:acpi_ram_update()
>>> - hw/i386/acpi-build.c:acpi_ram_update()
>>>
>>> I see no easy way to work around this. Fail hard instead of failing
>>> somewhere in migration code due to strange other reasons. AFAIKs, the
>>> rebuilts will be triggered during reboot, so this should not affect
>>> running guests, but only guests that reboot at a very bad time and
>>> actually require size changes.
>>>
>>> Let's further limit the impact by checking if an actual resize of the
>>> RAM (in number of pages) is required.
>>>
>>> Don't perform the checks in qemu_ram_resize(), as that's called during
>>> migration when syncing the used_length. Update documentation.
>>
>> Interesting; we need to do something about this - but banning resets
>> during migration is a bit harsh; and aborting the source VM is really
>> nasty - for a precopy especially we shouldn't kill the source VM,
>> we should just abort the migration.
>
> Any alternative, easy solutions to handle this? I do wonder how often
> this will actually trigger in real life.

 Well it's not that hard to abort a migration (I'm not sure we've got a
 convenient wrapper to do it - but it shouldn't be hard to add).

>>>
>>> We do have qmp_migrate_cancel(). I hope that can be called under BQL.
>>
>> Well it's a monitor command so I think so; although it's not really
>> designed for an error - it's a user action.  Doing a
>> migrate_set_error(..) followed by a qemu_file_shutdown is probably a
>> good bet.
> 
> I'll base on "[PATCH v2 fixed 00/16] Ram blocks with resizable anonymous
> allocations under POSIX", where I extend the ram block notifier with a
> resize notification.
> 
> migrate/ram.c can register the notifier and react accordingly. E.g., for
> precopy, abort migration. Not sure about postcopy (below).
> 
>>
>>> Can that be called in both, precopy and postcopy case? I assume in the
>>> precopy, it's easy.
>>
>> The problem is during postcopy you're toast when that happens because
>> you can't restart; however, can this happen once we're actually in
>> postcopy?  It's a little different - if it happens before the transition
>> to postcopy then it's the same as precopy; if it happens afterwards..
>> well it's going to happen ont he destination side and that's quite
>> different.
> 
> If it happens after, we are in trouble at least with received bitmaps.
> Not sure about other issues (it's a lot of code :) ). Especially
> shrinking while trying to place pages will be bad and fail. It's code
> that assumes used_length won't change.
> 
> ramblock_recv_bitmap_send() on the target and ram_dirty_bitmap_reload()
> on the source. ram_dirty_bitmap_reload() will bail out if the sizes
> don't match.
> 

So, with (modified) ram block notifiers it could look something like this:

>From c0049ac2e95d6756037db918852c507fb88297d9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand 
Date: Fri, 14 Feb 2020 13:01:03 +0100
Subject: [PATCH v1] tmp

Signed-off-by: David Hildenbrand 
---
 migration/migration.c |  9 +++--
 migration/migration.h |  1 +
 migration/ram.c   | 42 ++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 3a21a4686c..0e7efe2920 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -175,13 +175,18 @@ void migration_object_init(void)
 }
 }
 
+void migration_cancel(void)
+{
+migrate_fd_cancel(current_migration);
+}
+
 void migration_shutdown(void)
 {
 /*
  * Cancel the current migration - that will (eventually)
  * stop the migration using this structure
  */
-migrate_fd_cancel(current_migration);
+migration_cancel();
 object_unref(OBJECT(current_migration));
 }
 
@@ -2019,7 +2024,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 
 void qmp_migrate_cancel(Error **errp)
 {

Re: [PATCH v2 07/19] travis.yml: single-thread build-tcg stages

2020-02-14 Thread Philippe Mathieu-Daudé
On Thu, Feb 13, 2020 at 11:51 PM Alex Bennée  wrote:
>
> This still seems to be a problem for Travis.
>
> Signed-off-by: Alex Bennée 
> ---
>  .travis.yml | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/.travis.yml b/.travis.yml
> index 0612998958b..f4020dcc6c8 100644
> --- a/.travis.yml
> +++ b/.travis.yml
> @@ -400,7 +400,7 @@ jobs:
>  - name: "GCC check-tcg (some-softmmu)"
>env:
>  - CONFIG="--enable-debug-tcg 
> --target-list=xtensa-softmmu,arm-softmmu,aarch64-softmmu,alpha-softmmu"
> -- TEST_BUILD_CMD="make -j${JOBS} build-tcg"
> +- TEST_BUILD_CMD="make build-tcg"

Can you explicit -j1 instead? This is self-explanatory.

>  - TEST_CMD="make check-tcg"
>  - CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug-tcg"
>
> @@ -409,7 +409,7 @@ jobs:
>  - name: "GCC plugins check-tcg (some-softmmu)"
>env:
>  - CONFIG="--enable-plugins --enable-debug-tcg 
> --target-list=xtensa-softmmu,arm-softmmu,aarch64-softmmu,alpha-softmmu"
> -- TEST_BUILD_CMD="make -j${JOBS} build-tcg"
> +- TEST_BUILD_CMD="make build-tcg"

With -j1 in both places:
Reviewed-by: Philippe Mathieu-Daudé 

>  - TEST_CMD="make check-tcg"
>  - CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug-tcg"
>
> --
> 2.20.1
>




Re: [PATCH v2 01/30] configure: Allow user to specify sphinx-build binary

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> On Fri, 14 Feb 2020 at 06:33, Markus Armbruster  wrote:
>>
>> Does not work out of the box on my Fedora 30 build host, where
>> sphinx-build gives me sphinx-build-2.  I have to specify
>> --sphinx-build=/usr/bin/sphinx-build-3 to unbreak it.  Which of course
>> breaks things when I try to build anything before this commit
>>
>> The appended patch makes it work out of the box.  Please consider
>> squashing it in.
>>
>> diff --git a/configure b/configure
>> index 14172909f0..a9d175c400 100755
>> --- a/configure
>> +++ b/configure
>> @@ -584,7 +584,6 @@ query_pkg_config() {
>>  }
>>  pkg_config=query_pkg_config
>>  sdl2_config="${SDL2_CONFIG-${cross_prefix}sdl2-config}"
>> -sphinx_build=sphinx-build
>>
>>  # If the user hasn't specified ARFLAGS, default to 'rv', just as make does.
>>  ARFLAGS="${ARFLAGS-rv}"
>> @@ -903,6 +902,7 @@ fi
>>
>>  : ${make=${MAKE-make}}
>>  : ${install=${INSTALL-install}}
>> +
>>  # We prefer python 3.x. A bare 'python' is traditionally
>>  # python 2.x, but some distros have it as python 3.x, so
>>  # we check that too
>> @@ -915,6 +915,19 @@ do
>>  break
>>  fi
>>  done
>> +
>> +set -x
>
> I guess the set -x / set +x here are accidentally left in
> debug printing?

Mispasted.  I just double-checked these two lines are the only crap I
left in.

>> +sphinx_build=
>> +for binary in sphinx-build-3 sphinx-build
>> +do
>> +if has "$binary"
>> +then
>> +sphinx_build=$(command -v "$binary")
>> +break
>> +fi
>> +done
>> +set +x
>> +
>>  : ${smbd=${SMBD-/usr/sbin/smbd}}
>>
>>  # Default objcc to clang if available, otherwise use CC
>> @@ -4803,7 +4816,7 @@ has_sphinx_build() {
>>  # sphinx-build doesn't exist at all or if it is too old.
>>  mkdir -p "$TMPDIR1/sphinx"
>>  touch "$TMPDIR1/sphinx/index.rst"
>> -$sphinx_build -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
>> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
>> +"$sphinx_build" -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
>> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
>>  }
>
> This change isn't related to trying sphinx-build-3 --
> did you actually need it ?

If the for loop finds nothing, $sphinx_build remains empty.  Quoting the
variable seems cleaner.

Oh, and if the user passes '--sphinx-build=', $sphinx_build becomes
empty.  Precedes my fixup.  Admittedly a rather silly thing to do.

> I did think about quoting when I wrote the patch,
> but looking at existing practice we are all over the
> place on whether we bother to quote variables containing
> program names in configure. I think I ended up following
> the same thing we do for $python, which doesn't quote.

I tend to omit quotes when it's obvious the variable's value can only be
harmless.  An empty value isn't.

> Other than that, I'm happy to squash this in, or for
> you to squash it in if you are otherwise OK taking
> the first chunk of the patchset via your tree now.
> (Do you have a preference for whether you take these
> patches via your tree or I send them in a docs pullreq?)

I can do the pull request.




Re: [PATCH] migration/throttle: Make throttle slower at tail stage

2020-02-14 Thread Dr. David Alan Gilbert
* Keqian Zhu (zhukeqi...@huawei.com) wrote:
> At the tail stage of throttle, VM is very sensitive to
> CPU percentage. We just throttle 30% of remaining CPU
> when throttle is more than 80 percentage.

This is a bit unusual;  all of the rest of the throttling has no
fixed constants; all values are set by parameters.

You've got the two, the '80' and the '0.3'

I thinkt he easy solution is to replace your parameter 'tailslow' by two
new parameters; 'tailstart' and 'tailrate';  both defaulting to 100%.

Then you make it:

if (cpu_throttle_now >= pct_tailstart) {
/* Eat some scale of CPU from remaining */
cpu_throttle_inc = ceil((100 - cpu_throttle_now) * pct_tailrate);

(with percentage scaling added).

Then setting 'tailstart' to 80 and 'tailrate' to 30 is equivalent to
what you have, but means we have no magical constants in the code.

Dave


> This doesn't conflict with cpu_throttle_increment.
> 
> This may make migration time longer, and is disabled
> by default.
> 
> Signed-off-by: Keqian Zhu 
> ---
> Cc: Juan Quintela 
> Cc: "Dr. David Alan Gilbert" 
> Cc: Eric Blake 
> Cc: Markus Armbruster 
> ---
>  migration/migration.c | 13 +
>  migration/ram.c   | 18 --
>  monitor/hmp-cmds.c|  4 
>  qapi/migration.json   | 21 +
>  4 files changed, 54 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 3a21a4686c..37b569cee9 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -782,6 +782,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
> **errp)
>  params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
>  params->has_cpu_throttle_increment = true;
>  params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
> +params->has_cpu_throttle_tailslow = true;
> +params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
>  params->has_tls_creds = true;
>  params->tls_creds = g_strdup(s->parameters.tls_creds);
>  params->has_tls_hostname = true;
> @@ -1287,6 +1289,10 @@ static void 
> migrate_params_test_apply(MigrateSetParameters *params,
>  dest->cpu_throttle_increment = params->cpu_throttle_increment;
>  }
>  
> +if (params->has_cpu_throttle_tailslow) {
> +dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
> +}
> +
>  if (params->has_tls_creds) {
>  assert(params->tls_creds->type == QTYPE_QSTRING);
>  dest->tls_creds = g_strdup(params->tls_creds->u.s);
> @@ -1368,6 +1374,10 @@ static void migrate_params_apply(MigrateSetParameters 
> *params, Error **errp)
>  s->parameters.cpu_throttle_increment = 
> params->cpu_throttle_increment;
>  }
>  
> +if (params->has_cpu_throttle_tailslow) {
> +s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
> +}
> +
>  if (params->has_tls_creds) {
>  g_free(s->parameters.tls_creds);
>  assert(params->tls_creds->type == QTYPE_QSTRING);
> @@ -3504,6 +3514,8 @@ static Property migration_properties[] = {
>  DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
>parameters.cpu_throttle_increment,
>DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
> +DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
> +  parameters.cpu_throttle_tailslow, false),
>  DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
>parameters.max_bandwidth, MAX_THROTTLE),
>  DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
> @@ -3600,6 +3612,7 @@ static void migration_instance_init(Object *obj)
>  params->has_decompress_threads = true;
>  params->has_cpu_throttle_initial = true;
>  params->has_cpu_throttle_increment = true;
> +params->has_cpu_throttle_tailslow = true;
>  params->has_max_bandwidth = true;
>  params->has_downtime_limit = true;
>  params->has_x_checkpoint_delay = true;
> diff --git a/migration/ram.c b/migration/ram.c
> index ed23ed1c7c..d86413a5d1 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -29,6 +29,7 @@
>  #include "qemu/osdep.h"
>  #include "cpu.h"
>  #include 
> +#include 
>  #include "qemu/cutils.h"
>  #include "qemu/bitops.h"
>  #include "qemu/bitmap.h"
> @@ -620,15 +621,28 @@ static void mig_throttle_guest_down(void)
>  {
>  MigrationState *s = migrate_get_current();
>  uint64_t pct_initial = s->parameters.cpu_throttle_initial;
> -uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
> +uint64_t pct_increment = s->parameters.cpu_throttle_increment;
> +bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
>  int pct_max = s->parameters.max_cpu_throttle;
>  
> +const uint64_t cpu_throttle_now = cpu_throttle_get_percentage();
> +uint64_t cpu_throttle_inc;
> +
>  /* We have not started throttling yet. Let's start it. */
>  if (!cpu_throttle

Re: [PATCH v2 05/30] qga/qapi-schema.json: Fix indent level on doc comments

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> The texinfo doc generation doesn't care much about indentation

Let's spell Texinfo with a capital T.

> levels, but we would like to add a rST backend, and rST does care
> about indentation.

Nitpick: an rST *backend* should not care about the doc generator's
*input* format.  We actually intend to change the input format into a
domain-specific dialect of rST, as you state in your cover letter: "This
series switches all our QAPI doc comments over from texinfo format to
rST."

Perhaps:

  The current doc generation doesn't care much about indentation levels,
  but we would like to switch to an rST format, and rST does care about
  indentation.

> Make the doc comments more strongly consistent about indentation
> for multiline constructs like:
>
> @arg: description line 1
>   description line 2
>
> Returns: line one
>  line 2
>
> so that there is always exactly one space after the colon, and
> subsequent lines align with the first.
>
> This commit is a purely whitespace change, and it does not alter the
> generated .texi files (because the texi generation code strips away
> all the extra whitespace).  This does mean that we end up with some
> over-length lines.
>
> Note that when the documentation for an argument fits on a single
> line like this:
>
> @arg: one line only
>
> then stray extra spaces after the ':' don't affect the rST output, so
> I have not attempted to methodically fix them, though the preference
> is a single space here too.
>
> Signed-off-by: Peter Maydell 
> Reviewed-by: Markus Armbruster 

R-by stands even without the commit message improvement I suggested.




Re: [PATCH] migration/throttle: Make throttle slower at tail stage

2020-02-14 Thread Juan Quintela
Keqian Zhu  wrote:
> At the tail stage of throttle, VM is very sensitive to
> CPU percentage. We just throttle 30% of remaining CPU
> when throttle is more than 80 percentage.

Why?

If we really think that this is better that current approarch, just do
this _always_.  And throothre 30% of remaining CPU.  So we go:

30%
30% + 0.3(70%)
...

Or anything else.

My experience is:
- you really need to go to very high throothle to make migration happens
  (more than 70% or so usually).
- The way that we throotle is not completely exact.

> This doesn't conflict with cpu_throttle_increment.
>
> This may make migration time longer, and is disabled
> by default.


What do you think?
I think that it is better to change method and improve documentation
that yet adding another parameter.

Later, Juan.




Re: [PATCH v2 01/30] configure: Allow user to specify sphinx-build binary

2020-02-14 Thread Peter Maydell
On Fri, 14 Feb 2020 at 12:20, Markus Armbruster  wrote:
>
> Peter Maydell  writes:
> >>  # Default objcc to clang if available, otherwise use CC
> >> @@ -4803,7 +4816,7 @@ has_sphinx_build() {
> >>  # sphinx-build doesn't exist at all or if it is too old.
> >>  mkdir -p "$TMPDIR1/sphinx"
> >>  touch "$TMPDIR1/sphinx/index.rst"
> >> -$sphinx_build -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
> >> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
> >> +"$sphinx_build" -c "$source_path/docs" -b html "$TMPDIR1/sphinx" 
> >> "$TMPDIR1/sphinx/out" >/dev/null 2>&1
> >>  }
> >
> > This change isn't related to trying sphinx-build-3 --
> > did you actually need it ?
>
> If the for loop finds nothing, $sphinx_build remains empty.  Quoting the
> variable seems cleaner.

Oh, I see. Anyway, yes, happy to have quotes here.

thanks
-- PMM



Re: [PATCH v2 05/30] qga/qapi-schema.json: Fix indent level on doc comments

2020-02-14 Thread Peter Maydell
On Fri, 14 Feb 2020 at 12:36, Markus Armbruster  wrote:
>
> Peter Maydell  writes:
>
> > The texinfo doc generation doesn't care much about indentation
>
> Let's spell Texinfo with a capital T.
>
> > levels, but we would like to add a rST backend, and rST does care
> > about indentation.
>
> Nitpick: an rST *backend* should not care about the doc generator's
> *input* format.  We actually intend to change the input format into a
> domain-specific dialect of rST, as you state in your cover letter: "This
> series switches all our QAPI doc comments over from texinfo format to
> rST."
>
> Perhaps:
>
>   The current doc generation doesn't care much about indentation levels,
>   but we would like to switch to an rST format, and rST does care about
>   indentation.

Works for me. Let me know if you're OK just making this kind
of commit message tweak/minor fixup and sending a pull, or
if you want me to roll a v3.

thanks
-- PMM



Re: [PATCH v2 06/30] qga/qapi-schema.json: minor format fixups for rST

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> rST format requires a blank line before the start of a bulleted
> or enumerated list. Two places in qapi-schema.json were missing
> this blank line.

Relies on the previous commit message's "we would like to add a rST
backend" to establish context.  Suggest:

  We would like to switch the doc comments to rST format, and rST
  requires...

> Some places were using an indented line as a sort of single-item
> bulleted list, which in the texinfo output comes out all run
> onto a single line; use a real bulleted list instead.

The old text tempts developers to add additional items the same way, and
the all items run together.  So this makes sense on its own.

> Some places unnecessarily indented lists, which confuses rST.
>
> guest-fstrim:minimum's documentation was indented the
> right amount to share a line with @minimum, but wasn't
> actually doing so.
>
> The indent on the bulleted list in the guest-set-vcpus
> Returns section meant rST misindented it.
>
> Changes to the generated texinfo are very minor (the new
> bulleted lists, and a few extra blank lines).
>
> Signed-off-by: Peter Maydell 

I checked the plain text output, and it's fine.

Preferably with the commit message tweak I suggested
Reviewed-by: Markus Armbruster 




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread Juan Quintela
David Hildenbrand  wrote:

I agree with the removed bits.


> diff --git a/migration/ram.c b/migration/ram.c
> index ed23ed1c7c..f86f32b453 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -52,6 +52,7 @@
>  #include "migration/colo.h"
>  #include "block.h"
>  #include "sysemu/sysemu.h"
> +#include "sysemu/runstate.h"
>  #include "savevm.h"
>  #include "qemu/iov.h"
>  #include "multifd.h"
> @@ -3710,8 +3711,49 @@ static SaveVMHandlers savevm_ram_handlers = {
>  .resume_prepare = ram_resume_prepare,
>  };
>  
> +static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
> +  size_t old_size, size_t new_size)
> +{
> +/*
> + * We don't care about resizes triggered on incoming migration (when
> + * syncing ram blocks), or of course, when no migration is going on.
> + */
> +if (migration_is_idle() || !runstate_is_running()) {
> +return;
> +}
> +
> +if (!postcopy_is_running()) {
> +Error *err = NULL;
> +
> +/*
> + * Precopy code cannot deal with the size of ram blocks changing at
> + * random points in time. We're still running on the source, abort
> + * the migration and continue running here. Make sure to wait until
> + * migration was canceled.
> + */
> +error_setg(&err, "RAM resized during precopy.");
> +migrate_set_error(migrate_get_current(), err);
> +error_free(err);
> +migration_cancel();

If we can't do anything else, this is reasonable.

But as discussed before, it is still not fully clear to me _why_ are
ramblocks changing if we have disabled add/remove memory during migration.

> +} else {
> +/*
> + * Postcopy code cannot deal with the size of ram blocks changing at
> + * random points in time. We're running on the target. Fail hard.
> + *
> + * TODO: How to handle this in a better way?
> + */
> +error_report("RAM resized during postcopy.");
> +exit(-1);

Idea is good, but we also need to exit destination, not only source, no?

> +}
> +}



> +static RAMBlockNotifier ram_mig_ram_notifier = {
> +.ram_block_resized = ram_mig_ram_block_resized,
> +};
> +
>  void ram_mig_init(void)
>  {
>  qemu_mutex_init(&XBZRLE.lock);
>  register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
> +ram_block_notifier_add(&ram_mig_ram_notifier);
>  }

Shouldn't we remove the notifier when we finish the migration.

Later, Juan.




Re: [PATCH] qcow2: Fix alignment checks in encrypted images

2020-02-14 Thread Kevin Wolf
Am 13.02.2020 um 18:16 hat Alberto Garcia geschrieben:
> I/O requests to encrypted media should be aligned to the sector size
> used by the underlying encryption method, not to BDRV_SECTOR_SIZE.
> Fortunately this doesn't break anything at the moment because
> both existing QCRYPTO_BLOCK_*_SECTOR_SIZE have the same value as
> BDRV_SECTOR_SIZE.
> 
> The checks in qcow2_co_preadv_encrypted() are also unnecessary because
> they are repeated immediately afterwards in qcow2_co_encdec().
> 
> Signed-off-by: Alberto Garcia 

Thanks, applied to the block branch.

Kevin




Re: [PATCH v2 07/30] qapi/block-core.json: Use literal block for ascii art

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> The ascii-art graph in the BlockLatencyHistogramInfo documentation
> doesn't render correctly, because the whitespace is collapsed.
>
> Use the '|' format that emits a literal 'example' block so the graph
> is displayed correctly.
>
> Strictly the texinfo generated is still wrong because each line
> goes into its own @example environment, but it renders better
> than what we had before.
>
> Fixing this rendering is a necessary prerequisite for the rST
> generator, which otherwise complains about the inconsistent
> indentation in the ascii-art graph.
>
> Signed-off-by: Peter Maydell 
> ---
> v1->v2: tweaked commit message, made graph still line up
> with preceding paragraph text
> ---
>  qapi/block-core.json | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index ef94a296868..db9ca688d49 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -550,13 +550,13 @@
>  #For the example above, @bins may be something like [3, 1, 5, 2],
>  #and corresponding histogram looks like:
>  #
> -#5|   *
> -#4|   *
> -#3| * *
> -#2| * **
> -#1| ****
> -# +--
> -# 10   50   100
> +# |  5|   *
> +# |  4|   *
> +# |  3| * *
> +# |  2| * **
> +# |  1| ****
> +# |   +--
> +# |   10   50   100
>  #
>  # Since: 4.0
>  ##

Reviewed-by: Markus Armbruster 




Re: [PATCH v2 08/30] qapi: Use ':' after @argument in doc comments

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> Some qapi doc comments have forgotten the ':' after the
> @argument, like this:
>
> # @filename Filename for the new image file
> # @size Size of the virtual disk in bytes
>
> The result is that these are parsed as part of the body
> text and appear as a run-on line:
>   filename Filename for the new image file size Size of the virtual disk in 
> bytes"
> followed by
>   filename: string
> Not documented
>   size: int
> Not documented
>
> in the 'Members' section.
>
> Correct the formatting.
>
> Signed-off-by: Peter Maydell 
> Reviewed-by: Markus Armbruster 

The title sounds like it's just a tweak, but it actually fixes bugs in
the generated documentation.  Suggest to change the title to

qapi: Fix incorrect "Not documented" claims in QMP documentation

R-by stands regardless.




Re: [PATCH v2 fixed 00/16] Ram blocks with resizable anonymous allocations under POSIX

2020-02-14 Thread Dr. David Alan Gilbert
* David Hildenbrand (da...@redhat.com) wrote:
> On 12.02.20 14:42, David Hildenbrand wrote:
> > We already allow resizable ram blocks for anonymous memory, however, they
> > are not actually resized. All memory is mmaped() R/W, including the memory
> > exceeding the used_length, up to the max_length.
> > 
> > When resizing, effectively only the boundary is moved. Implement actually
> > resizable anonymous allocations and make use of them in resizable ram
> > blocks when possible. Memory exceeding the used_length will be
> > inaccessible. Especially ram block notifiers require care.
> > 
> > Having actually resizable anonymous allocations (via mmap-hackery) allows
> > to reserve a big region in virtual address space and grow the
> > accessible/usable part on demand. Even if "/proc/sys/vm/overcommit_memory"
> > is set to "never" under Linux, huge reservations will succeed. If there is
> > not enough memory when resizing (to populate parts of the reserved region),
> > trying to resize will fail. Only the actually used size is reserved in the
> > OS.
> > 
> > E.g., virtio-mem [1] wants to reserve big resizable memory regions and
> > grow the usable part on demand. I think this change is worth sending out
> > individually. Accompanied by a bunch of minor fixes and cleanups.
> > 
> > Especially, memory notifiers already handle resizing by first removing
> > the old region, and then re-adding the resized region. prealloc is
> > currently not possible with resizable ram blocks. mlock() should continue
> > to work as is. Resizing is currently rare and must only happen on the
> > start of an incoming migration, or during resets. No code path (except
> > HAX and SEV ram block notifiers) should access memory outside of the usable
> > range - and if we ever find one, that one has to be fixed (I did not
> > identify any).
> > 
> > v1 -> v2:
> > - Add "util: vfio-helpers: Fix qemu_vfio_close()"
> > - Add "util: vfio-helpers: Remove Error parameter from
> >qemu_vfio_undo_mapping()"
> > - Add "util: vfio-helpers: Factor out removal from
> >qemu_vfio_undo_mapping()"
> > - "util/mmap-alloc: ..."
> >  -- Minor changes due to review feedback (e.g., assert alignment, return
> > bool when resizing)
> > - "util: vfio-helpers: Implement ram_block_resized()"
> >  -- Reserve max_size in the IOVA address space.
> >  -- On resize, undo old mapping and do new mapping. We can later implement
> > a new ioctl to resize the mapping directly.
> > - "numa: Teach ram block notifiers about resizable ram blocks"
> >  -- Pass size/max_size to ram block notifiers, which makes things easier an
> > cleaner
> > - "exec: Ram blocks with resizable anonymous allocations under POSIX"
> >  -- Adapt to new ram block notifiers
> >  -- Shrink after notifying. Always trigger ram block notifiers on resizes
> >  -- Add a safety net that all ram block notifiers registered at runtime
> > support resizes.
> > 
> > [1] https://lore.kernel.org/kvm/20191212171137.13872-1-da...@redhat.com/
> > 
> > David Hildenbrand (16):
> >   util: vfio-helpers: Factor out and fix processing of existing ram
> > blocks
> >   util: vfio-helpers: Fix qemu_vfio_close()
> >   util: vfio-helpers: Remove Error parameter from
> > qemu_vfio_undo_mapping()
> >   util: vfio-helpers: Factor out removal from qemu_vfio_undo_mapping()
> >   exec: Factor out setting ram settings (madvise ...) into
> > qemu_ram_apply_settings()
> >   exec: Reuse qemu_ram_apply_settings() in qemu_ram_remap()
> >   exec: Drop "shared" parameter from ram_block_add()
> >   util/mmap-alloc: Factor out calculation of pagesize to mmap_pagesize()
> >   util/mmap-alloc: Factor out reserving of a memory region to
> > mmap_reserve()
> >   util/mmap-alloc: Factor out populating of memory to mmap_populate()
> >   util/mmap-alloc: Prepare for resizable mmaps
> >   util/mmap-alloc: Implement resizable mmaps
> >   numa: Teach ram block notifiers about resizable ram blocks
> >   util: vfio-helpers: Implement ram_block_resized()
> >   util: oslib: Resizable anonymous allocations under POSIX
> >   exec: Ram blocks with resizable anonymous allocations under POSIX
> > 
> >  exec.c | 104 +++
> >  hw/core/numa.c |  53 +++-
> >  hw/i386/xen/xen-mapcache.c |   7 +-
> >  include/exec/cpu-common.h  |   3 +
> >  include/exec/memory.h  |   8 ++
> >  include/exec/ramlist.h |  14 +++-
> >  include/qemu/mmap-alloc.h  |  21 +++--
> >  include/qemu/osdep.h   |   6 +-
> >  stubs/ram-block.c  |  20 -
> >  target/i386/hax-mem.c  |   5 +-
> >  target/i386/sev.c  |  18 ++--
> >  util/mmap-alloc.c  | 165 +++--
> >  util/oslib-posix.c |  37 -
> >  util/oslib-win32.c |  14 
> >  util/trace-events  |   9 +-
> >  util/vfio-helpers.c| 145 +---
> >  16 files changed, 450 insertions(+), 179 deletions(-)
> > 
>

Re: [PATCH] block/vvfat: Do not unref qcow on closing backing bdrv

2020-02-14 Thread Kevin Wolf
Am 09.02.2020 um 18:51 hat Hikaru Nishida geschrieben:
> Before this commit, BDRVVVFATState.qcow is unrefed in write_target_close
> on closing backing bdrv of vvfat. However, qcow bdrv is opend as a child
> of vvfat in enable_write_target() so it will be also unrefed on closing
> vvfat itself. This causes use-after-free of qcow on freeing vvfat which
> has backing bdrv and qcow bdrv as children in this order because
> bdrv_close(vvfat) tries to free qcow bdrv after freeing backing bdrv
> as QLIST_FOREACH_SAFE() loop keeps next pointer, but BdrvChild of qcow
> is already freed in bdrv_close(backing bdrv).
> 
> Signed-off-by: Hikaru Nishida 

Thanks, applied to the block branch.

Kevin




Re: [PATCH 06/29] qga/qapi-schema.json: minor format fixups for rST

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> On Fri, 7 Feb 2020 at 08:33, Markus Armbruster  wrote:
>>
>> Peter Maydell  writes:
>>
>> > rST format requires a blank line before the start of a bulleted
>> > or enumerated list. Two places in qapi-schema.json were missing
>> > this blank line.
>> >
>> > Some places were using an indented line as a sort of single-item
>> > bulleted list, which in the texinfo output comes out all run
>> > onto a single line; use a real bulleted list instead.
>> >
>> > Some places unnecessarily indented lists, which confuses rST.
>> >
>> > guest-fstrim:minimum's documentation was indented the
>> > right amount to share a line with @minimum, but wasn't
>> > actually doing so.
>> >
>> > The indent on the bulleted list in the guest-set-vcpus
>> > Returns section meant rST misindented it.
>> >
>> > Changes to the generated texinfo are very minor (the new
>> > bulletted lists, and a few extra blank lines).
>> >
>> > Signed-off-by: Peter Maydell 
>
>> > @@ -767,17 +771,17 @@
>> >  # Returns: The length of the initial sublist that has been successfully
>> >  #  processed. The guest agent maximizes this value. Possible 
>> > cases:
>> >  #
>> > -#  - 0:  if the @vcpus list was empty on input. Guest 
>> > state
>> > -#has not been changed. Otherwise,
>> > -#  - Error:  processing the first node of @vcpus failed 
>> > for the
>> > -#reason returned. Guest state has not been 
>> > changed.
>> > -#Otherwise,
>> > +#  - 0: if the @vcpus list was empty on input. Guest state
>> > +#has not been changed. Otherwise,
>> > +#  - Error: processing the first node of @vcpus failed for the
>> > +#reason returned. Guest state has not been changed.
>> > +#Otherwise,
>> >  #  - < length(@vcpus): more than zero initial nodes have been 
>> > processed,
>> > -#but not the entire @vcpus list. Guest state 
>> > has
>> > -#changed accordingly. To retrieve the error
>> > -#(assuming it persists), repeat the call with 
>> > the
>> > -#successfully processed initial sublist 
>> > removed.
>> > -#Otherwise,
>> > +#but not the entire @vcpus list. Guest state has
>> > +#changed accordingly. To retrieve the error
>> > +#(assuming it persists), repeat the call with the
>> > +#successfully processed initial sublist removed.
>> > +#Otherwise,
>> >  #  - length(@vcpus): call successful.
>>
>> Source readability suffers a bit here.
>>
>> Can we break the line after the colon?
>>
>>#  - 0:
>>#if the @vcpus list was empty on input. Guest state has
>>#not been changed. Otherwise,
>>
>> Or would a definition list be a better fit?
>
> A definition list does produce nicer rendering in the rST, but
> it breaks the rendering in the texinfo (which interprets the
> indent of a rST definition list as meaninglist and renders it
> all as one long run-on paragraph). For the purposes of this
> initial cleanup, I'll put in the newlines you suggest, which
> have no effect on rendering output for either texinfo or rST.

Okay.  We can switch to definition lists later.




[Bug 1863247] [NEW] AArch64 EXT instruction for V register does not clear MSB side bits

2020-02-14 Thread Kentaro Kawakami
Public bug reported:

On AArch64 CPU with SVE register, there seems to be a bug in the
operation when executing EXT instruction to V registers. Bits above the
128 bits of the SVE register must be cleared to 0, but qemu-aarch64
seems to hold the value.

Example
ext v0.16b, v1.16b v2.16b, 8

After executing above instruction, (N-1) to 128 bits of z0 register must
be 0, where N is SVE register width.

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1863247

Title:
  AArch64 EXT instruction for V register does not clear MSB side bits

Status in QEMU:
  New

Bug description:
  On AArch64 CPU with SVE register, there seems to be a bug in the
  operation when executing EXT instruction to V registers. Bits above
  the 128 bits of the SVE register must be cleared to 0, but qemu-
  aarch64 seems to hold the value.

  Example
  ext v0.16b, v1.16b v2.16b, 8

  After executing above instruction, (N-1) to 128 bits of z0 register
  must be 0, where N is SVE register width.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1863247/+subscriptions



[PATCH 0/3] Fix number of priority bits in zynq gic

2020-02-14 Thread Sai Pavan Boddu
This patch series implements the mask for un-implemented priority bits
in arm-gic. Which will return the expected number of priority bits on
read.

Sai Pavan Boddu (3):
  arm_gic: Mask the un-supported priority bits
  cpu/a9mpcore: Add num priority bits property
  arm/xilinx_zynq: Set number of priority bits

 hw/arm/xilinx_zynq.c | 1 +
 hw/cpu/a9mpcore.c| 2 ++
 hw/intc/arm_gic.c| 9 ++---
 hw/intc/arm_gic_common.c | 1 +
 include/hw/cpu/a9mpcore.h| 1 +
 include/hw/intc/arm_gic_common.h | 1 +
 6 files changed, 12 insertions(+), 3 deletions(-)

-- 
2.7.4




[PATCH 1/3] arm_gic: Mask the un-supported priority bits

2020-02-14 Thread Sai Pavan Boddu
Priority bits implemented in arm-gic can 8 to 4, un-implemented bits
are read as zeros(RAZ).

Signed-off-by: Sai Pavan Boddu 
---
 hw/intc/arm_gic.c| 9 ++---
 hw/intc/arm_gic_common.c | 1 +
 include/hw/intc/arm_gic_common.h | 1 +
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 1d7da7b..8875330 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -43,6 +43,9 @@
 }   \
 } while (0)
 
+#define UMASK(n) \
+1 << n) - 1) << (8 - n)) & 0xFF)
+
 static const uint8_t gic_id_11mpcore[] = {
 0x00, 0x00, 0x00, 0x00, 0x90, 0x13, 0x04, 0x00, 0x0d, 0xf0, 0x05, 0xb1
 };
@@ -652,9 +655,9 @@ void gic_dist_set_priority(GICState *s, int cpu, int irq, 
uint8_t val,
 }
 
 if (irq < GIC_INTERNAL) {
-s->priority1[irq][cpu] = val;
+s->priority1[irq][cpu] = val & UMASK(s->n_prio_bits) ;
 } else {
-s->priority2[(irq) - GIC_INTERNAL] = val;
+s->priority2[(irq) - GIC_INTERNAL] = val & UMASK(s->n_prio_bits);
 }
 }
 
@@ -684,7 +687,7 @@ static void gic_set_priority_mask(GICState *s, int cpu, 
uint8_t pmask,
 return;
 }
 }
-s->priority_mask[cpu] = pmask;
+s->priority_mask[cpu] = pmask & UMASK(s->n_prio_bits);
 }
 
 static uint32_t gic_get_priority_mask(GICState *s, int cpu, MemTxAttrs attrs)
diff --git a/hw/intc/arm_gic_common.c b/hw/intc/arm_gic_common.c
index e6c4fe7..e4668c7 100644
--- a/hw/intc/arm_gic_common.c
+++ b/hw/intc/arm_gic_common.c
@@ -357,6 +357,7 @@ static Property arm_gic_common_properties[] = {
 DEFINE_PROP_BOOL("has-security-extensions", GICState, security_extn, 0),
 /* True if the GIC should implement the virtualization extensions */
 DEFINE_PROP_BOOL("has-virtualization-extensions", GICState, virt_extn, 0),
+DEFINE_PROP_UINT32("num-prio-bits", GICState, n_prio_bits, 8),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/intc/arm_gic_common.h b/include/hw/intc/arm_gic_common.h
index b5585fe..6e0d6b8 100644
--- a/include/hw/intc/arm_gic_common.h
+++ b/include/hw/intc/arm_gic_common.h
@@ -96,6 +96,7 @@ typedef struct GICState {
 uint16_t priority_mask[GIC_NCPU_VCPU];
 uint16_t running_priority[GIC_NCPU_VCPU];
 uint16_t current_pending[GIC_NCPU_VCPU];
+uint32_t n_prio_bits;
 
 /* If we present the GICv2 without security extensions to a guest,
  * the guest can configure the GICC_CTLR to configure group 1 binary point
-- 
2.7.4




[PATCH 2/3] cpu/a9mpcore: Add num priority bits property

2020-02-14 Thread Sai Pavan Boddu
Set number of priority bits property of gic as guided by machine
configuration.

Signed-off-by: Sai Pavan Boddu 
---
 hw/cpu/a9mpcore.c | 2 ++
 include/hw/cpu/a9mpcore.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/hw/cpu/a9mpcore.c b/hw/cpu/a9mpcore.c
index 1f8bc8a..eb1e752 100644
--- a/hw/cpu/a9mpcore.c
+++ b/hw/cpu/a9mpcore.c
@@ -68,6 +68,7 @@ static void a9mp_priv_realize(DeviceState *dev, Error **errp)
 gicdev = DEVICE(&s->gic);
 qdev_prop_set_uint32(gicdev, "num-cpu", s->num_cpu);
 qdev_prop_set_uint32(gicdev, "num-irq", s->num_irq);
+qdev_prop_set_uint32(gicdev, "num-prio-bits", s->n_prio_bits);
 
 /* Make the GIC's TZ support match the CPUs. We assume that
  * either all the CPUs have TZ, or none do.
@@ -167,6 +168,7 @@ static Property a9mp_priv_properties[] = {
  * Other boards may differ and should set this property appropriately.
  */
 DEFINE_PROP_UINT32("num-irq", A9MPPrivState, num_irq, 96),
+DEFINE_PROP_UINT32("num-priority-bits", A9MPPrivState, n_prio_bits, 8),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/cpu/a9mpcore.h b/include/hw/cpu/a9mpcore.h
index 5d67ca2..4f146bd 100644
--- a/include/hw/cpu/a9mpcore.h
+++ b/include/hw/cpu/a9mpcore.h
@@ -28,6 +28,7 @@ typedef struct A9MPPrivState {
 uint32_t num_cpu;
 MemoryRegion container;
 uint32_t num_irq;
+uint32_t n_prio_bits;
 
 A9SCUState scu;
 GICState gic;
-- 
2.7.4




Re: [PATCH RFC 00/14] *** multifd for RDMA v2 ***

2020-02-14 Thread Dr. David Alan Gilbert
Make sure that it compiles OK with RDMA compiled out; I think this is a
windows cross build that's failing, but more generally even a Linux box
with no-RDMA libraries.

Dave

* no-re...@patchew.org (no-re...@patchew.org) wrote:
> Patchew URL: 
> https://patchew.org/QEMU/20200213093755.370-1-fengzhim...@huawei.com/
> 
> 
> 
> Hi,
> 
> This series failed the docker-mingw@fedora build test. Please find the 
> testing commands and
> their output below. If you have Docker installed, you can probably reproduce 
> it
> locally.
> 
> === TEST SCRIPT BEGIN ===
> #! /bin/bash
> export ARCH=x86_64
> make docker-image-fedora V=1 NETWORK=1
> time make docker-test-mingw@fedora J=14 NETWORK=1
> === TEST SCRIPT END ===
> 
> /tmp/qemu-test/src/migration/multifd.c:663: undefined reference to 
> `multifd_channel_rdma_connect'
> ../migration/multifd.o: In function `multifd_load_cleanup':
> /tmp/qemu-test/src/migration/multifd.c:843: undefined reference to 
> `qemu_rdma_cleanup'
> collect2: error: ld returned 1 exit status
> make[1]: *** [Makefile:206: qemu-system-x86_64w.exe] Error 1
> make: *** [Makefile:497: x86_64-softmmu/all] Error 2
> make: *** Waiting for unfinished jobs
> ../migration/multifd.o: In function `multifd_rdma_recv_thread':
> /tmp/qemu-test/src/migration/multifd.c:898: undefined reference to 
> `qemu_rdma_registration_handle'
> ---
> /tmp/qemu-test/src/migration/multifd.c:663: undefined reference to 
> `multifd_channel_rdma_connect'
> ../migration/multifd.o: In function `multifd_load_cleanup':
> /tmp/qemu-test/src/migration/multifd.c:843: undefined reference to 
> `qemu_rdma_cleanup'
> collect2: error: ld returned 1 exit status
> make[1]: *** [Makefile:206: qemu-system-aarch64w.exe] Error 1
> make: *** [Makefile:497: aarch64-softmmu/all] Error 2
> Traceback (most recent call last):
>   File "./tests/docker/docker.py", line 664, in 
> sys.exit(main())
> ---
> raise CalledProcessError(retcode, cmd)
> subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
> '--label', 'com.qemu.instance.uuid=88473d634d6543ea992045cbe9a806e1', '-u', 
> '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
> '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', 
> '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
> '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
> '/var/tmp/patchew-tester-tmp-yiq7aevf/src/docker-src.2020-02-13-05.11.35.1374:/var/tmp/qemu:z,ro',
>  'qemu:fedora', '/var/tmp/qemu/run', 'test-mingw']' returned non-zero exit 
> status 2.
> filter=--filter=label=com.qemu.instance.uuid=88473d634d6543ea992045cbe9a806e1
> make[1]: *** [docker-run] Error 1
> make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-yiq7aevf/src'
> make: *** [docker-run-test-mingw@fedora] Error 2
> 
> real2m35.791s
> user0m7.717s
> 
> 
> The full log is available at
> http://patchew.org/logs/20200213093755.370-1-fengzhim...@huawei.com/testing.docker-mingw@fedora/?type=message.
> ---
> Email generated automatically by Patchew [https://patchew.org/].
> Please send your feedback to patchew-de...@redhat.com
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




[PATCH 3/3] arm/xilinx_zynq: Set number of priority bits

2020-02-14 Thread Sai Pavan Boddu
Arm GIC in Zynq SOC implements 5 priority bits i.e bits 7..3.

Signed-off-by: Sai Pavan Boddu 
---
 hw/arm/xilinx_zynq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c
index 3a0fa5b..7aa43ad 100644
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -214,6 +214,7 @@ static void zynq_init(MachineState *machine)
 
 dev = qdev_create(NULL, TYPE_A9MPCORE_PRIV);
 qdev_prop_set_uint32(dev, "num-cpu", 1);
+qdev_prop_set_uint32(dev, "num-priority-bits", 5);
 qdev_init_nofail(dev);
 busdev = SYS_BUS_DEVICE(dev);
 sysbus_mmio_map(busdev, 0, MPCORE_PERIPHBASE);
-- 
2.7.4




Re: [PATCH 0/3] qcow2: Fix write/copy error path with data file

2020-02-14 Thread Kevin Wolf
Am 11.02.2020 um 10:48 hat Kevin Wolf geschrieben:
> Kevin Wolf (3):
>   qcow2: update_refcount(): Reset old_table_index after
> qcow2_cache_put()
>   qcow2: Fix qcow2_alloc_cluster_abort() for external data file
>   iotests: Test copy offloading with external data file

Applied to the block branch.

Kevin




Re: docs: Update vhost-user spec regarding backend program conventions

2020-02-14 Thread Boeuf, Sebastien
Hi Marc-Andre,

On Tue, 2020-02-11 at 22:24 +0100, Marc-André Lureau wrote:
> Hi
> 
> On Tue, Feb 11, 2020 at 4:24 PM Boeuf, Sebastien
>  wrote:
> > From c073d528b8cd7082832fd1825dc33dd65b305aa2 Mon Sep 17 00:00:00
> > 2001
> > From: Sebastien Boeuf 
> > Date: Tue, 11 Feb 2020 16:01:22 +0100
> > Subject: [PATCH] docs: Update vhost-user spec regarding backend
> > program
> >  conventions
> > 
> > The vhost-user specification is not clearly stating the expected
> > behavior from a backend program whenever the client disconnects.
> > 
> > This patch addresses the issue by defining the default behavior and
> > proposing an alternative through a command line option.
> > 
> > By default, a backend program will have to keep listening even if
> > the
> > client disconnects, unless told otherwise through the newly
> > introduced
> > option --exit-on-disconnect.
> > 
> > Signed-off-by: Sebastien Boeuf 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> >  docs/interop/vhost-user.rst | 10 ++
> >  1 file changed, 10 insertions(+)
> > 
> > diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-
> > user.rst
> > index 5f8b3a456b..da9a1ebc4c 100644
> > --- a/docs/interop/vhost-user.rst
> > +++ b/docs/interop/vhost-user.rst
> > @@ -1323,6 +1323,10 @@ The backend program must end (as quickly and
> > cleanly as possible) when
> >  the SIGTERM signal is received. Eventually, it may receive SIGKILL
> > by
> >  the management layer after a few seconds.
> > 
> > +By default, the backend program continues running after the client
> > +disconnects. It accepts only 1 connection at a time on each UNIX
> > domain
> > +socket.
> 
> I don't think that's the most common behaviour. libvhost-user will
> panic() on disconnect in general, unless the error/exit is handled
> gracefully by the backend.

It's not the default behavior from libvhost-user, but that's exactly
something I'd like to see changing. This should be the normal case if
you think about a standard client/server connection, where the server
would simply listen again after the client disconnects.

> 
> The most common case is to have 1-1 relation between device/qemu
> instance and backend.

Yes this part is fine, but that's not a reason why the backend should
terminates.

> 
> Why not restart the backend for another instance?

Because you need some management tool to do so. And I think that by
default it could be interesting to have the least amount of extra
management involved.

> 
> > +
> >  The following command line options have an expected behaviour.
> > They
> >  are mandatory, unless explicitly said differently:
> > 
> > @@ -1337,6 +1341,12 @@ are mandatory, unless explicitly said
> > differently:
> >vhost-user socket as file descriptor FDNUM. It is incompatible
> > with
> >--socket-path.
> > 
> > +--exit-on-disconnect
> > +
> > +  When this option is provided, the backend program must terminate
> > when
> > +  the client disconnects. This can be used to keep the backend
> > program's
> > +  lifetime synchronized with its client process.
> 
> This section list options that are mandatory. It's probably a bit
> late
> to add more mandatory options (I regret already some of them)

The spec states "They are mandatory, unless explicitly said
differently", and in this case I'm explicitely saying "When this option
is provided", which means if it's not provided it's fine and we can
ignore the fact it's not there.

> 
> Do we need to specify the behaviour on client disconnect? Can't we
> leave that to the backend and management layer to decide?

My goal here is to make the spec a bit less loose. I know libvhost-user 
is the de-facto implementation but we cannot just assume everything out
of the libvhost-user implementation, especially since there is a
dedicated spec. That's the reason why I thought it'd be nice to have
the backend behavior well defined in the spec.
The point is, relying on the current definition, there's not enough
information to make sure a VMM will properly interface with a vhost-
user backend.

Thanks,
Sebastien

> 
> 
> > +
> >  --print-capabilities
> > 
> >Output to stdout the backend capabilities in JSON format, and
> > then
> > --
> > 2.20.1
> > 
> > -
> > 
> > Intel Corporation SAS (French simplified joint stock company)
> > Registered headquarters: "Les Montalets"- 2, rue de Paris,
> > 92196 Meudon Cedex, France
> > Registration Number:  302 456 199 R.C.S. NANTERRE
> > Capital: 4,572,000 Euros
> > 
> > This e-mail and any attachments may contain confidential material
> > for
> > the sole use of the intended recipient(s). Any review or
> > distribution
> > by others is strictly prohibited. If you are not the intended
> > recipient, please contact the sender and delete all copies.
-
Intel Corporation SAS (French simplified joint stock company)
Registered headquarters: "Les Montalets"- 2, rue de Pari

[PATCH v16 00/10] VIRTIO-IOMMU device

2020-02-14 Thread Eric Auger
This series implements the QEMU virtio-iommu device.

This matches the v0.12 spec (voted) and the corresponding
virtio-iommu driver upstreamed in 5.3. All kernel dependencies
are resolved for DT integration. The virtio-iommu can be
instantiated in ARM virt using:

"-device virtio-iommu-pci".

Non DT mode is not yet supported as it has non resolved kernel
dependencies [1].

This feature targets 5.0.

Integration with vhost devices and vfio devices is not part
of this series. Please follow Bharat's respins [2].

Best Regards

Eric

This series can be found at:
https://github.com/eauger/qemu/tree/v4.2-virtio-iommu-v16

References:
[1] [RFC 00/13] virtio-iommu on non-devicetree platforms
[2] [PATCH RFC v5 0/5] virtio-iommu: VFIO integration

Testing:
- tested with guest using virtio-net-pci
  (,vhost=off,iommu_platform,disable-modern=off,disable-legacy=on)
  and virtio-blk-pci
- migration

History:

v15 -> v16:
- Collected Jean, Peter and Michael's R-bs
- last patches without R-b is the one related to hw/arm/virt.c
  + the last patch, added in this version
- Made the virtio-iommu-pci not hotpluggable (I dared to
  leave the R-b though)
- Renamed create_virtio_iommu into create_virtio_iommu_dt_bindings
- added entry in maintenance file

v14 -> v15:
- removed x-dt-binding and just kept check on hotplug_handler
- removed "tests: Add virtio-iommu test" as the check on
  hotplug_handler fails on PC machine
- destroy mappings in put_domain and remove
  g_tree_destroy(domain->mappings) in virtio_iommu_detach

v13 -> v14:
- added "virtio-iommu-pci: Introduce the x-dt-binding option"
- Removed the mappings gtree ref counting and simply delete
  the gtree when the last EP is detached from the domain


Eric Auger (10):
  virtio-iommu: Add skeleton
  virtio-iommu: Decode the command payload
  virtio-iommu: Implement attach/detach command
  virtio-iommu: Implement map/unmap
  virtio-iommu: Implement translate
  virtio-iommu: Implement fault reporting
  virtio-iommu: Support migration
  virtio-iommu-pci: Add virtio iommu pci support
  hw/arm/virt: Add the virtio-iommu device tree mappings
  MAINTAINERS: add virtio-iommu related files

 MAINTAINERS  |   6 +
 hw/arm/virt.c|  57 +-
 hw/virtio/Kconfig|   5 +
 hw/virtio/Makefile.objs  |   2 +
 hw/virtio/trace-events   |  20 +
 hw/virtio/virtio-iommu-pci.c | 104 
 hw/virtio/virtio-iommu.c | 890 +++
 include/hw/arm/virt.h|   2 +
 include/hw/pci/pci.h |   1 +
 include/hw/virtio/virtio-iommu.h |  61 +++
 qdev-monitor.c   |   1 +
 11 files changed, 1142 insertions(+), 7 deletions(-)
 create mode 100644 hw/virtio/virtio-iommu-pci.c
 create mode 100644 hw/virtio/virtio-iommu.c
 create mode 100644 include/hw/virtio/virtio-iommu.h

-- 
2.20.1




[PATCH v16 02/10] virtio-iommu: Decode the command payload

2020-02-14 Thread Eric Auger
This patch adds the command payload decoding and
introduces the functions that will do the actual
command handling. Those functions are not yet implemented.

Signed-off-by: Eric Auger 
Reviewed-by: Jean-Philippe Brucker 
Reviewed-by: Peter Xu 
Reviewed-by: Michael S. Tsirkin 

---

v11 -> v12:
- ADded Jean and Peter's R-b

v10 -> v11:
- use a macro for handle command functions

v9 -> v10:
- make virtio_iommu_handle_* more compact and
  remove get_payload_size

v7 -> v8:
- handle new domain parameter in detach
- remove reserved checks

v5 -> v6:
- change map/unmap semantics (remove size)

v4 -> v5:
- adopt new v0.5 terminology

v3 -> v4:
- no flags field anymore in struct virtio_iommu_req_unmap
- test reserved on attach/detach, change trace proto
- rebase on v2.10.0.
---
 hw/virtio/trace-events   |  4 +++
 hw/virtio/virtio-iommu.c | 76 +---
 2 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 02d93d7f63..f7141aa2f6 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -60,3 +60,7 @@ virtio_iommu_get_features(uint64_t features) "device supports 
features=0x%"PRIx6
 virtio_iommu_device_status(uint8_t status) "driver status = %d"
 virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_range=%d probe_size=0x%x"
 virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
+virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, 
uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" 
virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
+virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 30579267d5..86dcdc09a1 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -34,31 +34,83 @@
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 
-static int virtio_iommu_handle_attach(VirtIOIOMMU *s,
-  struct iovec *iov,
-  unsigned int iov_cnt)
+static int virtio_iommu_attach(VirtIOIOMMU *s,
+   struct virtio_iommu_req_attach *req)
 {
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint32_t ep_id = le32_to_cpu(req->endpoint);
+
+trace_virtio_iommu_attach(domain_id, ep_id);
+
 return VIRTIO_IOMMU_S_UNSUPP;
 }
-static int virtio_iommu_handle_detach(VirtIOIOMMU *s,
-  struct iovec *iov,
-  unsigned int iov_cnt)
+
+static int virtio_iommu_detach(VirtIOIOMMU *s,
+   struct virtio_iommu_req_detach *req)
 {
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint32_t ep_id = le32_to_cpu(req->endpoint);
+
+trace_virtio_iommu_detach(domain_id, ep_id);
+
 return VIRTIO_IOMMU_S_UNSUPP;
 }
-static int virtio_iommu_handle_map(VirtIOIOMMU *s,
-   struct iovec *iov,
-   unsigned int iov_cnt)
+
+static int virtio_iommu_map(VirtIOIOMMU *s,
+struct virtio_iommu_req_map *req)
 {
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint64_t phys_start = le64_to_cpu(req->phys_start);
+uint64_t virt_start = le64_to_cpu(req->virt_start);
+uint64_t virt_end = le64_to_cpu(req->virt_end);
+uint32_t flags = le32_to_cpu(req->flags);
+
+trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
+
 return VIRTIO_IOMMU_S_UNSUPP;
 }
-static int virtio_iommu_handle_unmap(VirtIOIOMMU *s,
- struct iovec *iov,
- unsigned int iov_cnt)
+
+static int virtio_iommu_unmap(VirtIOIOMMU *s,
+  struct virtio_iommu_req_unmap *req)
 {
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint64_t virt_start = le64_to_cpu(req->virt_start);
+uint64_t virt_end = le64_to_cpu(req->virt_end);
+
+trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
+
 return VIRTIO_IOMMU_S_UNSUPP;
 }
 
+static int virtio_iommu_iov_to_req(struct iovec *iov,
+   unsigned int iov_cnt,
+   void *req, size_t req_sz)
+{
+size_t sz, payload_sz = req_sz - sizeof(struct virtio_iommu_req_tail);
+
+sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz);
+if (unlikely(sz != payload_sz)) 

[PATCH v16 05/10] virtio-iommu: Implement translate

2020-02-14 Thread Eric Auger
This patch implements the translate callback

Signed-off-by: Eric Auger 
Reviewed-by: Jean-Philippe Brucker 
Reviewed-by: Michael S. Tsirkin 

---

v11 -> v12:
- Added Jean's R-b
- s/qemu_log_mask/error_report_once

v10 -> v11:
- take into account the new value struct and use
  g_tree_lookup_extended
- switched to error_report_once

v6 -> v7:
- implemented bypass-mode

v5 -> v6:
- replace error_report by qemu_log_mask

v4 -> v5:
- check the device domain is not NULL
- s/printf/error_report
- set flags to IOMMU_NONE in case of all translation faults
---
 hw/virtio/trace-events   |  1 +
 hw/virtio/virtio-iommu.c | 60 +++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 22162d6583..095aa8b509 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -71,3 +71,4 @@ virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
 virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
 virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
 virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
+virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t 
sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 844d34c270..59e9cd3d9a 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -473,19 +473,77 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 int iommu_idx)
 {
 IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+VirtIOIOMMUInterval interval, *mapping_key;
+VirtIOIOMMUMapping *mapping_value;
+VirtIOIOMMU *s = sdev->viommu;
+VirtIOIOMMUEndpoint *ep;
+bool bypass_allowed;
 uint32_t sid;
+bool found;
+
+interval.low = addr;
+interval.high = addr + 1;
 
 IOMMUTLBEntry entry = {
 .target_as = &address_space_memory,
 .iova = addr,
 .translated_addr = addr,
-.addr_mask = ~(hwaddr)0,
+.addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1,
 .perm = IOMMU_NONE,
 };
 
+bypass_allowed = virtio_vdev_has_feature(&s->parent_obj,
+ VIRTIO_IOMMU_F_BYPASS);
+
 sid = virtio_iommu_get_bdf(sdev);
 
 trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
+qemu_mutex_lock(&s->mutex);
+
+ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+if (!ep) {
+if (!bypass_allowed) {
+error_report_once("%s sid=%d is not known!!", __func__, sid);
+} else {
+entry.perm = flag;
+}
+goto unlock;
+}
+
+if (!ep->domain) {
+if (!bypass_allowed) {
+error_report_once("%s %02x:%02x.%01x not attached to any domain",
+  __func__, PCI_BUS_NUM(sid),
+  PCI_SLOT(sid), PCI_FUNC(sid));
+} else {
+entry.perm = flag;
+}
+goto unlock;
+}
+
+found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
+   (void **)&mapping_key,
+   (void **)&mapping_value);
+if (!found) {
+error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
+  __func__, addr, sid);
+goto unlock;
+}
+
+if (((flag & IOMMU_RO) &&
+!(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ)) ||
+((flag & IOMMU_WO) &&
+!(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE))) {
+error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
+  __func__, addr, flag, mapping_value->flags);
+goto unlock;
+}
+entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
+entry.perm = flag;
+trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
+
+unlock:
+qemu_mutex_unlock(&s->mutex);
 return entry;
 }
 
-- 
2.20.1




[PATCH v16 06/10] virtio-iommu: Implement fault reporting

2020-02-14 Thread Eric Auger
The event queue allows to report asynchronous errors.
The translate function now injects faults when relevant.

Signed-off-by: Eric Auger 
Reviewed-by: Peter Xu 
Reviewed-by: Michael S. Tsirkin 

---
v13 -> v14:
- remove loop
- add Peter's R-b

v12 -> v13:
- return on virtio_error()

v11 -> v12:
- reporting the addr associated with the fault and set the
  VIRTIO_IOMMU_FAULT_F_ADDRESS flag.
- added cpu_to_le

v10 -> v11:
- change a virtio_error into an error_report_once
  (no buffer available for output faults)
---
 hw/virtio/trace-events   |  1 +
 hw/virtio/virtio-iommu.c | 70 +---
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 095aa8b509..e83500bee9 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -72,3 +72,4 @@ virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
 virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
 virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
 virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t 
sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
+virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, 
uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 59e9cd3d9a..8509f64004 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -468,6 +468,48 @@ out:
 }
 }
 
+static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
+  int flags, uint32_t endpoint,
+  uint64_t address)
+{
+VirtIODevice *vdev = &viommu->parent_obj;
+VirtQueue *vq = viommu->event_vq;
+struct virtio_iommu_fault fault;
+VirtQueueElement *elem;
+size_t sz;
+
+memset(&fault, 0, sizeof(fault));
+fault.reason = reason;
+fault.flags = cpu_to_le32(flags);
+fault.endpoint = cpu_to_le32(endpoint);
+fault.address = cpu_to_le64(address);
+
+elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+
+if (!elem) {
+error_report_once(
+"no buffer available in event queue to report event");
+return;
+}
+
+if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
+virtio_error(vdev, "error buffer of wrong size");
+virtqueue_detach_element(vq, elem, 0);
+g_free(elem);
+return;
+}
+
+sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
+  &fault, sizeof(fault));
+assert(sz == sizeof(fault));
+
+trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
+virtqueue_push(vq, elem, sz);
+virtio_notify(vdev, vq);
+g_free(elem);
+
+}
+
 static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 IOMMUAccessFlags flag,
 int iommu_idx)
@@ -476,9 +518,10 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 VirtIOIOMMUInterval interval, *mapping_key;
 VirtIOIOMMUMapping *mapping_value;
 VirtIOIOMMU *s = sdev->viommu;
+bool read_fault, write_fault;
 VirtIOIOMMUEndpoint *ep;
+uint32_t sid, flags;
 bool bypass_allowed;
-uint32_t sid;
 bool found;
 
 interval.low = addr;
@@ -504,6 +547,9 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 if (!ep) {
 if (!bypass_allowed) {
 error_report_once("%s sid=%d is not known!!", __func__, sid);
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
+  VIRTIO_IOMMU_FAULT_F_ADDRESS,
+  sid, addr);
 } else {
 entry.perm = flag;
 }
@@ -515,6 +561,9 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 error_report_once("%s %02x:%02x.%01x not attached to any domain",
   __func__, PCI_BUS_NUM(sid),
   PCI_SLOT(sid), PCI_FUNC(sid));
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
+  VIRTIO_IOMMU_FAULT_F_ADDRESS,
+  sid, addr);
 } else {
 entry.perm = flag;
 }
@@ -527,15 +576,26 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 if (!found) {
 error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
   __func__, addr, sid);
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+  VIRTIO_IOMMU_FAULT_F_ADDRESS,
+  sid, addr);
 goto unlock;
 }
 
-if (((flag & IOMMU_RO) &&
-!(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ)) |

[PATCH v16 03/10] virtio-iommu: Implement attach/detach command

2020-02-14 Thread Eric Auger
This patch implements the endpoint attach/detach to/from
a domain.

Domain and endpoint internal datatypes are introduced.
Both are stored in RB trees. The domain owns a list of
endpoints attached to it. Also helpers to get/put
end points and domains are introduced.

As for the IOMMU memory regions, a callback is called on
PCI bus enumeration that initializes for a given device
on the bus hierarchy an IOMMU memory region. The PCI bus
hierarchy is stored locally in IOMMUPciBus and IOMMUDevice
objects.

At the time of the enumeration, the bus number may not be
computed yet.

So operations that will need to retrieve the IOMMUdevice
and its IOMMU memory region from the bus number and devfn,
once the bus number is garanteed to be frozen, use an array
of IOMMUPciBus, lazily populated.

Signed-off-by: Eric Auger 
Reviewed-by: Peter Xu 
Reviewed-by: Michael S. Tsirkin 

---
v14 -> v15:
- destroy mappings in put_domain and remove
  g_tree_destroy(domain->mappings) in virtio_iommu_detach

v13 -> v14:
- in virtio_iommu_put_endpoint, if the EP is attached to a
  domain, call virtio_iommu_detach_endpoint_from_domain()
- remove domain ref counting and simply delete the mappings
  gtree when the last EP is detached from the domain
- in virtio_iommu_detach_endpoint_from_domain(), return if the
  ep's domain is unset.

v12 -> v13:
- squashed v12 4, 5, 6 into this patch
- rename virtio_iommu_get_sid into virtio_iommu_get_bdf

v11 -> v12:
- check the device is protected by the iommu on attach
- on detach, check the domain id the device is attached to matches
  the one used in the detach command
- fix mapping ref counter and destroy the domain when no end-points
  are attached anymore
---
 hw/virtio/trace-events   |   6 +
 hw/virtio/virtio-iommu.c | 311 ++-
 include/hw/virtio/virtio-iommu.h |   3 +
 3 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index f7141aa2f6..15595f8cd7 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -64,3 +64,9 @@ virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) 
"domain=%d endpoint=%d"
 virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
 virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, 
uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" 
virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
 virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int 
flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
+virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
+virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
+virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
+virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
+virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 86dcdc09a1..d9fe83f530 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -23,6 +23,8 @@
 #include "hw/qdev-properties.h"
 #include "hw/virtio/virtio.h"
 #include "sysemu/kvm.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
 #include "trace.h"
 
 #include "standard-headers/linux/virtio_ids.h"
@@ -30,19 +32,236 @@
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "hw/virtio/virtio-iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
 
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 
+typedef struct VirtIOIOMMUDomain {
+uint32_t id;
+GTree *mappings;
+QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
+} VirtIOIOMMUDomain;
+
+typedef struct VirtIOIOMMUEndpoint {
+uint32_t id;
+VirtIOIOMMUDomain *domain;
+QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
+} VirtIOIOMMUEndpoint;
+
+typedef struct VirtIOIOMMUInterval {
+uint64_t low;
+uint64_t high;
+} VirtIOIOMMUInterval;
+
+static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
+{
+return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
+}
+
+/**
+ * The bus number is used for lookup when SID based operations occur.
+ * In that case we lazily populate the IOMMUPciBus array from the bus hash
+ * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus
+ * numbers may not be always initialized yet.
+ */
+static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
+{
+IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num];
+
+if (!iommu_pci_bus) {
+GHashTableIter iter;
+
+g_hash_table_iter_init(&iter, s->as_by_busptr);
+while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
+if (pci_bus_num(iommu_pci_bus->bus) == bus_num) {
+s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus;
+return iommu_pci_b

[PATCH v16 01/10] virtio-iommu: Add skeleton

2020-02-14 Thread Eric Auger
This patchs adds the skeleton for the virtio-iommu device.

Signed-off-by: Eric Auger 
Reviewed-by: Peter Xu 
Reviewed-by: Michael S. Tsirkin 

---

v13 -> v14:
- use device_class_set_props
- updated copyright's year

v12 -> v13
- removed IOMMU_PCI_BUS_MAX and IOMMU_PCI_DEVFN_MAX

v11 -> v12:
- remove s_by_bus_num
- drop set_features (rely on default implementation) and
  acked_features

v9 -> v10:
- mutex initialized here
- initialize tail
- included hw/qdev-properties.h
- removed g_memdup
- removed s->config.domain_range.start = 0;

v9 -> v10:
- expose VIRTIO_IOMMU_F_MMIO feature
- s/domain_bits/domain_range struct
- change error codes
- enforce unmigratable
- Kconfig

v7 -> v8:
- expose VIRTIO_IOMMU_F_BYPASS and VIRTIO_F_VERSION_1
  features
- set_config dummy implementation + tracing
- add trace in get_features
- set the features on realize() and store the acked ones
- remove inclusion of linux/virtio_iommu.h

v6 -> v7:
- removed qapi-event.h include
- add primary_bus and associated property

v4 -> v5:
- use the new v0.5 terminology (domain, endpoint)
- add the event virtqueue

v3 -> v4:
- use page_size_mask instead of page_sizes
- added set_features()
- added some traces (reset, set_status, set_features)
- empty virtio_iommu_set_config() as the driver MUST NOT
  write to device configuration fields
- add get_config trace

v2 -> v3:
- rebase on 2.10-rc0, ie. use IOMMUMemoryRegion and remove
  iommu_ops.
- advertise VIRTIO_IOMMU_F_MAP_UNMAP feature
- page_sizes set to TARGET_PAGE_SIZE

Conflicts:
hw/virtio/trace-events
---
 hw/virtio/Kconfig|   5 +
 hw/virtio/Makefile.objs  |   1 +
 hw/virtio/trace-events   |   7 +
 hw/virtio/virtio-iommu.c | 265 +++
 include/hw/virtio/virtio-iommu.h |  57 +++
 5 files changed, 335 insertions(+)
 create mode 100644 hw/virtio/virtio-iommu.c
 create mode 100644 include/hw/virtio/virtio-iommu.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index f87def27a6..d29525b36f 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -9,6 +9,11 @@ config VIRTIO_RNG
 default y
 depends on VIRTIO
 
+config VIRTIO_IOMMU
+bool
+default y
+depends on VIRTIO
+
 config VIRTIO_PCI
 bool
 default y if PCI_DEVICES
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index de0f5fc39b..2fd9da7410 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -16,6 +16,7 @@ obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) 
+= virtio-crypto-p
 obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
 common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += 
virtio-pmem-pci.o
 obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += 
vhost-user-fs-pci.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index e28ba48da6..02d93d7f63 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -53,3 +53,10 @@ virtio_mmio_write_offset(uint64_t offset, uint64_t value) 
"virtio_mmio_write off
 virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 
" shift %d"
 virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" 
PRIx64 " max %d"
 virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d"
+
+# hw/virtio/virtio-iommu.c
+virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_get_features(uint64_t features) "device supports 
features=0x%"PRIx64
+virtio_iommu_device_status(uint8_t status) "driver status = %d"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_range=%d probe_size=0x%x"
+virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint32_t domain_range, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
new file mode 100644
index 00..30579267d5
--- /dev/null
+++ b/hw/virtio/virtio-iommu.c
@@ -0,0 +1,265 @@
+/*
+ * virtio-iommu device
+ *
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see .
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qemu-commo

[PATCH v16 04/10] virtio-iommu: Implement map/unmap

2020-02-14 Thread Eric Auger
This patch implements virtio_iommu_map/unmap.

Signed-off-by: Eric Auger 
Reviewed-by: Peter Xu 
Reviewed-by: Michael S. Tsirkin 

---

v11 -> v12:
- check unmanaged managed flags on map
- removed 2 qemu_log_mask in unmap()
- fix leak

v10 -> v11:
- revisit the implementation of unmap according to Peter's suggestion
- removed virt_addr and size from viommu_mapping struct
- use g_tree_lookup_extended()
- return VIRTIO_IOMMU_S_RANGE in case a mapping were
  to be split on unmap (instead of INVAL)

v5 -> v6:
- use new v0.6 fields
- replace error_report by qemu_log_mask

v3 -> v4:
- implement unmap semantics as specified in v0.4
---
 hw/virtio/trace-events   |  1 +
 hw/virtio/virtio-iommu.c | 63 ++--
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 15595f8cd7..22162d6583 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -64,6 +64,7 @@ virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) 
"domain=%d endpoint=%d"
 virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
 virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, 
uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" 
virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
 virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t 
virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
 virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int 
flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
 virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
 virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index d9fe83f530..844d34c270 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "qemu/iov.h"
 #include "qemu-common.h"
 #include "hw/qdev-properties.h"
@@ -55,6 +56,11 @@ typedef struct VirtIOIOMMUInterval {
 uint64_t high;
 } VirtIOIOMMUInterval;
 
+typedef struct VirtIOIOMMUMapping {
+uint64_t phys_addr;
+uint32_t flags;
+} VirtIOIOMMUMapping;
+
 static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
 {
 return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
@@ -301,10 +307,39 @@ static int virtio_iommu_map(VirtIOIOMMU *s,
 uint64_t virt_start = le64_to_cpu(req->virt_start);
 uint64_t virt_end = le64_to_cpu(req->virt_end);
 uint32_t flags = le32_to_cpu(req->flags);
+VirtIOIOMMUDomain *domain;
+VirtIOIOMMUInterval *interval;
+VirtIOIOMMUMapping *mapping;
+
+if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+
+domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+if (!domain) {
+return VIRTIO_IOMMU_S_NOENT;
+}
+
+interval = g_malloc0(sizeof(*interval));
+
+interval->low = virt_start;
+interval->high = virt_end;
+
+mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
+if (mapping) {
+g_free(interval);
+return VIRTIO_IOMMU_S_INVAL;
+}
 
 trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+mapping = g_malloc0(sizeof(*mapping));
+mapping->phys_addr = phys_start;
+mapping->flags = flags;
+
+g_tree_insert(domain->mappings, interval, mapping);
+
+return VIRTIO_IOMMU_S_OK;
 }
 
 static int virtio_iommu_unmap(VirtIOIOMMU *s,
@@ -313,10 +348,34 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
 uint32_t domain_id = le32_to_cpu(req->domain);
 uint64_t virt_start = le64_to_cpu(req->virt_start);
 uint64_t virt_end = le64_to_cpu(req->virt_end);
+VirtIOIOMMUMapping *iter_val;
+VirtIOIOMMUInterval interval, *iter_key;
+VirtIOIOMMUDomain *domain;
+int ret = VIRTIO_IOMMU_S_OK;
 
 trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+if (!domain) {
+return VIRTIO_IOMMU_S_NOENT;
+}
+interval.low = virt_start;
+interval.high = virt_end;
+
+while (g_tree_lookup_extended(domain->mappings, &interval,
+  (void **)&iter_key, (void**)&iter_val)) {
+uint64_t current_low = iter_key->low;
+uint64_t current_high = iter_key->high;
+
+if (interval.low <= current_low && interval.high >= current_high) {
+g_tree_remove(domain->mappings, iter_key);
+trace_virtio_iommu_unmap_done(domain_id, current_low, 
current_high);
+} else {
+ret = VIRTIO_IOMMU_S_RANGE;
+break;
+}
+}
+return ret;
 }
 
 static int virtio_iommu_iov_to_re

Re: [PATCH v2 08/30] qapi: Use ':' after @argument in doc comments

2020-02-14 Thread Markus Armbruster
Markus Armbruster  writes:

> Peter Maydell  writes:
>
>> Some qapi doc comments have forgotten the ':' after the
>> @argument, like this:
>>
>> # @filename Filename for the new image file
>> # @size Size of the virtual disk in bytes
>>
>> The result is that these are parsed as part of the body
>> text and appear as a run-on line:
>>   filename Filename for the new image file size Size of the virtual disk in 
>> bytes"
>> followed by
>>   filename: string
>> Not documented
>>   size: int
>> Not documented
>>
>> in the 'Members' section.
>>
>> Correct the formatting.
>>
>> Signed-off-by: Peter Maydell 
>> Reviewed-by: Markus Armbruster 
>
> The title sounds like it's just a tweak, but it actually fixes bugs in
> the generated documentation.  Suggest to change the title to
>
> qapi: Fix incorrect "Not documented" claims in QMP documentation
>
> R-by stands regardless.

Forgot to mention:

[...]
>> @@ -4645,17 +4645,17 @@
>>  #
>>  # Driver specific image creation options for vhdx.
>>  #
>> -# @file Node to create the image format on
>> -# @size Size of the virtual disk in bytes
>> -# @log-size Log size in bytes, must be a multiple of 1 MB
>> -#   (default: 1 MB)
>> -# @block-size   Block size in bytes, must be a multiple of 1 MB and not
>> -#   larger than 256 MB (default: automatically choose a 
>> block
>> -#   size depending on the image size)
>> -# @subformatvhdx subformat (default: dynamic)
>> -# @block-state-zero Force use of payload blocks of type 'ZERO'. 
>> Non-standard,
>> -#   but default.  Do not set to 'off' when using 'qemu-img
>> -#   convert' with subformat=dynamic.
>> +# @file: Node to create the image format on
>> +# @size: Size of the virtual disk in bytes
>> +# @log-size: Log size in bytes, must be a multiple of 1 MB
>> +#(default: 1 MB)
>> +# @block-size: Block size in bytes, must be a multiple of 1 MB and not
>> +#  larger than 256 MB (default: automatically choose a block
>> +#  size depending on the image size)
>> +# @subformat: vhdx subformat (default: dynamic)
>> +# @block-state-zero: Force use of payload blocks of type 'ZERO'. 
>> Non-standard,
>> +#but default.  Do not set to 'off' when using 'qemu-img
>> +#convert' with subformat=dynamic.
>>  #
>>  # Since: 2.12
>>  ##

Kevin dislikes the loss of visual alignment here.  I dislike the
formatting before and after the patch.  I suggested a new style:

# @file:
# Node to create the image format on
#
# @size:
# Size of the virtual disk in bytes
#
# @log-size:
# Log size in bytes, must be a multiple of 1 MB (default: 1 MB)
#
# @block-size:
# Block size in bytes, must be a multiple of 1 MB and not larger
# than 256 MB (default: automatically choose a block size depending
# on the image size)
#
# @subformat:
# vhdx subformat (default: dynamic)
#
# @block-state-zero:
# Force use of payload blocks of type 'ZERO'.  Non-standard, but
# default.  Do not set to 'off' when using 'qemu-img convert' with
# subformat=dynamic.

Nobody disliked this style.  Peter reports his code already supports it,
but objects to converting to it in his series.  Okay; we can convert
later.  More churn, but the individual patches / series are simpler.




[PATCH v16 07/10] virtio-iommu: Support migration

2020-02-14 Thread Eric Auger
Add Migration support. We rely on recently added gtree and qlist
migration. We only migrate the domain gtree. The endpoint gtree
is re-constructed in a post-load operation.

Signed-off-by: Eric Auger 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
Reviewed-by: Michael S. Tsirkin 

---

v15 -> v16:
- added Juan's R-b

v11 -> v12:
- do not migrate the endpoint gtree but reconstruct it from the
  domain gtree (Peter's suggestion)
- add MIG_PRI_IOMMU
---
 hw/virtio/virtio-iommu.c | 109 +++
 1 file changed, 99 insertions(+), 10 deletions(-)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 8509f64004..4cee8083bc 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -643,16 +643,6 @@ static uint64_t virtio_iommu_get_features(VirtIODevice 
*vdev, uint64_t f,
 return f;
 }
 
-/*
- * Migration is not yet supported: most of the state consists
- * of balanced binary trees which are not yet ready for getting
- * migrated
- */
-static const VMStateDescription vmstate_virtio_iommu_device = {
-.name = "virtio-iommu-device",
-.unmigratable = 1,
-};
-
 static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
 {
 guint ua = GPOINTER_TO_UINT(a);
@@ -736,9 +726,108 @@ static void virtio_iommu_instance_init(Object *obj)
 {
 }
 
+#define VMSTATE_INTERVAL   \
+{  \
+.name = "interval",\
+.version_id = 1,   \
+.minimum_version_id = 1,   \
+.fields = (VMStateField[]) {   \
+VMSTATE_UINT64(low, VirtIOIOMMUInterval),  \
+VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
+VMSTATE_END_OF_LIST()  \
+}  \
+}
+
+#define VMSTATE_MAPPING   \
+{ \
+.name = "mapping",\
+.version_id = 1,  \
+.minimum_version_id = 1,  \
+.fields = (VMStateField[]) {  \
+VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
+VMSTATE_UINT32(flags, VirtIOIOMMUMapping),\
+VMSTATE_END_OF_LIST() \
+},\
+}
+
+static const VMStateDescription vmstate_interval_mapping[2] = {
+VMSTATE_MAPPING,   /* value */
+VMSTATE_INTERVAL   /* key   */
+};
+
+static int domain_preload(void *opaque)
+{
+VirtIOIOMMUDomain *domain = opaque;
+
+domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+   NULL, g_free, g_free);
+return 0;
+}
+
+static const VMStateDescription vmstate_endpoint = {
+.name = "endpoint",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static const VMStateDescription vmstate_domain = {
+.name = "domain",
+.version_id = 1,
+.minimum_version_id = 1,
+.pre_load = domain_preload,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(id, VirtIOIOMMUDomain),
+VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
+vmstate_interval_mapping,
+VirtIOIOMMUInterval, VirtIOIOMMUMapping),
+VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
+vmstate_endpoint, VirtIOIOMMUEndpoint, next),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static gboolean reconstruct_endpoints(gpointer key, gpointer value,
+  gpointer data)
+{
+VirtIOIOMMU *s = (VirtIOIOMMU *)data;
+VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value;
+VirtIOIOMMUEndpoint *iter;
+
+QLIST_FOREACH(iter, &d->endpoint_list, next) {
+iter->domain = d;
+g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter);
+}
+return false; /* continue the domain traversal */
+}
+
+static int iommu_post_load(void *opaque, int version_id)
+{
+VirtIOIOMMU *s = opaque;
+
+g_tree_foreach(s->domains, reconstruct_endpoints, s);
+return 0;
+}
+
+static const VMStateDescription vmstate_virtio_iommu_device = {
+.name = "virtio-iommu-device",
+.minimum_version_id = 1,
+.version_id = 1,
+.post_load = iommu_post_load,
+.fields = (VMStateField[]) {
+VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1,
+   &vmstate_domain, VirtIOIOMMUDomain),
+VMSTATE_END_OF_LIST()
+},
+};
+
 static const VMStateDescription vmstate_virtio_iommu = {
 .name = "virtio-iommu",
 .minimum_version_id = 1,
+.priority = MIG_PRI_IOMMU,
 .version_id = 1,
 .fields = (VMStateField[]) {
 

[PATCH v16 08/10] virtio-iommu-pci: Add virtio iommu pci support

2020-02-14 Thread Eric Auger
This patch adds virtio-iommu-pci, which is the pci proxy for
the virtio-iommu device.

Currently non DT integration is not yet supported by the kernel.
So the machine must implement a hotplug handler for the
virtio-iommu-pci device that creates the device tree iommu-map
bindings as documented in kernel documentation:

Documentation/devicetree/bindings/virtio/iommu.txt

Signed-off-by: Eric Auger 
Reviewed-by: Jean-Philippe Brucker 
Reviewed-by: Michael S. Tsirkin 

---
v15 -> v16:
- made the device not hotpluggable

v14 -> v15:
- checker whether the machine implements a hotplug handler
  for the virtio-iommu-pci

v13 -> v14:
- add device_class_set_props

v11 -> v12:
- added Jean's R-b
- remove the array of intervals. Will be introduced later?

v10 -> v11:
- add the reserved_regions array property

v9 -> v10:
- include "hw/qdev-properties.h" header

v8 -> v9:
- add the msi-bypass property
- create virtio-iommu-pci.c
---
 hw/virtio/Makefile.objs  |   1 +
 hw/virtio/virtio-iommu-pci.c | 104 +++
 include/hw/pci/pci.h |   1 +
 include/hw/virtio/virtio-iommu.h |   1 +
 qdev-monitor.c   |   1 +
 5 files changed, 108 insertions(+)
 create mode 100644 hw/virtio/virtio-iommu-pci.c

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 2fd9da7410..4e4d39a0a4 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -29,6 +29,7 @@ obj-$(CONFIG_VIRTIO_INPUT_HOST) += virtio-input-host-pci.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio-input-pci.o
 obj-$(CONFIG_VIRTIO_RNG) += virtio-rng-pci.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon-pci.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu-pci.o
 obj-$(CONFIG_VIRTIO_9P) += virtio-9p-pci.o
 obj-$(CONFIG_VIRTIO_SCSI) += virtio-scsi-pci.o
 obj-$(CONFIG_VIRTIO_BLK) += virtio-blk-pci.o
diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c
new file mode 100644
index 00..3dfbf55b47
--- /dev/null
+++ b/hw/virtio/virtio-iommu-pci.c
@@ -0,0 +1,104 @@
+/*
+ * Virtio IOMMU PCI Bindings
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ * Written by Eric Auger
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 or
+ *  (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "virtio-pci.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "hw/boards.h"
+
+typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI;
+
+/*
+ * virtio-iommu-pci: This extends VirtioPCIProxy.
+ *
+ */
+#define VIRTIO_IOMMU_PCI(obj) \
+OBJECT_CHECK(VirtIOIOMMUPCI, (obj), TYPE_VIRTIO_IOMMU_PCI)
+
+struct VirtIOIOMMUPCI {
+VirtIOPCIProxy parent_obj;
+VirtIOIOMMU vdev;
+};
+
+static Property virtio_iommu_pci_properties[] = {
+DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(&dev->vdev);
+
+if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) {
+MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
+
+error_setg(errp,
+   "%s machine fails to create iommu-map device tree bindings",
+   mc->name);
+error_append_hint(errp,
+  "Check you machine implements a hotplug handler "
+  "for the virtio-iommu-pci device\n");
+error_append_hint(errp, "Check the guest is booted without FW or with "
+  "-no-acpi\n");
+return;
+}
+qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+object_property_set_link(OBJECT(dev),
+ OBJECT(pci_get_bus(&vpci_dev->pci_dev)),
+ "primary-bus", errp);
+object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = virtio_iommu_pci_realize;
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+device_class_set_props(dc, virtio_iommu_pci_properties);
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_IOMMU;
+pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+pcidev_k->class_id = PCI_CLASS_OTHERS;
+dc->hotpluggable = false;
+}
+
+static void virtio_iommu_pci_instance_init(Object *obj)
+{
+VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VIRTIO_IOMMU);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = {
+.base_name = TYPE_VIRTIO_IOMMU_PC

[PATCH v16 10/10] MAINTAINERS: add virtio-iommu related files

2020-02-14 Thread Eric Auger
Add a new "virtio-iommu" section with the new files
related to this device.

Signed-off-by: Eric Auger 
---
 MAINTAINERS | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c7717df720..b7a7a18737 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1623,6 +1623,12 @@ F: hw/input/virtio-input*.c
 F: include/hw/virtio/virtio-input.h
 F: contrib/vhost-user-input/*
 
+virtio-iommu
+M: Eric Auger 
+S: Maintained
+F: hw/virtio/virtio-iommu*.c
+F: include/hw/virtio/virtio-iommu.h
+
 virtio-serial
 M: Laurent Vivier 
 R: Amit Shah 
-- 
2.20.1




[PATCH v16 09/10] hw/arm/virt: Add the virtio-iommu device tree mappings

2020-02-14 Thread Eric Auger
Adds the "virtio,pci-iommu" node in the host bridge node and
the RID mapping, excluding the IOMMU RID.

This is done in the virtio-iommu-pci hotplug handler which
gets called only if no firmware is loaded or if -no-acpi is
passed on the command line. As non DT integration is
not yet supported by the kernel we must make sure we
are in DT mode. This limitation will be removed as soon
as the topology description feature gets supported.

Signed-off-by: Eric Auger 

---

v15 -> v16:
- rename create_virtio_iommu into create_virtio_iommu_dt_bindings

v14 -> v15:
- only support the hotplug handler for virtio-iommu-pci if
  dt can be guaranteed

v11 -> v12:
- Added Jean's R-b

v10 -> v11:
- remove msi_bypass

v8 -> v9:
- disable msi-bypass property
- addition of the subnode is handled is the hotplug handler
  and IOMMU RID is notimposed anymore

v6 -> v7:
- align to the smmu instantiation code

v4 -> v5:
- VirtMachineClass no_iommu added in this patch
- Use object_resolve_path_type
---
 hw/arm/virt.c | 57 +--
 include/hw/arm/virt.h |  2 ++
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index f788fe27d6..cfe317922f 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -32,6 +32,7 @@
 #include "qemu-common.h"
 #include "qemu/units.h"
 #include "qemu/option.h"
+#include "monitor/qdev.h"
 #include "qapi/error.h"
 #include "hw/sysbus.h"
 #include "hw/boards.h"
@@ -54,6 +55,7 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "hw/pci-host/gpex.h"
+#include "hw/virtio/virtio-pci.h"
 #include "hw/arm/sysbus-fdt.h"
 #include "hw/platform-bus.h"
 #include "hw/qdev-properties.h"
@@ -71,6 +73,7 @@
 #include "hw/mem/pc-dimm.h"
 #include "hw/mem/nvdimm.h"
 #include "hw/acpi/generic_event_device.h"
+#include "hw/virtio/virtio-iommu.h"
 
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
 static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
@@ -1180,6 +1183,30 @@ static void create_smmu(const VirtMachineState *vms,
 g_free(node);
 }
 
+static void create_virtio_iommu_dt_bindings(VirtMachineState *vms, Error 
**errp)
+{
+const char compat[] = "virtio,pci-iommu";
+uint16_t bdf = vms->virtio_iommu_bdf;
+char *node;
+
+vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt);
+
+node = g_strdup_printf("%s/virtio_iommu@%d", vms->pciehb_nodename, bdf);
+qemu_fdt_add_subnode(vms->fdt, node);
+qemu_fdt_setprop(vms->fdt, node, "compatible", compat, sizeof(compat));
+qemu_fdt_setprop_sized_cells(vms->fdt, node, "reg",
+ 1, bdf << 8, 1, 0, 1, 0,
+ 1, 0, 1, 0);
+
+qemu_fdt_setprop_cell(vms->fdt, node, "#iommu-cells", 1);
+qemu_fdt_setprop_cell(vms->fdt, node, "phandle", vms->iommu_phandle);
+g_free(node);
+
+qemu_fdt_setprop_cells(vms->fdt, vms->pciehb_nodename, "iommu-map",
+   0x0, vms->iommu_phandle, 0x0, bdf,
+   bdf + 1, vms->iommu_phandle, bdf + 1, 0x - bdf);
+}
+
 static void create_pcie(VirtMachineState *vms)
 {
 hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base;
@@ -1258,7 +1285,7 @@ static void create_pcie(VirtMachineState *vms)
 }
 }
 
-nodename = g_strdup_printf("/pcie@%" PRIx64, base);
+nodename = vms->pciehb_nodename = g_strdup_printf("/pcie@%" PRIx64, base);
 qemu_fdt_add_subnode(vms->fdt, nodename);
 qemu_fdt_setprop_string(vms->fdt, nodename,
 "compatible", "pci-host-ecam-generic");
@@ -1301,13 +1328,16 @@ static void create_pcie(VirtMachineState *vms)
 if (vms->iommu) {
 vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt);
 
-create_smmu(vms, pci->bus);
-
-qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
-   0x0, vms->iommu_phandle, 0x0, 0x1);
+switch (vms->iommu) {
+case VIRT_IOMMU_SMMUV3:
+create_smmu(vms, pci->bus);
+qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
+   0x0, vms->iommu_phandle, 0x0, 0x1);
+break;
+default:
+g_assert_not_reached();
+}
 }
-
-g_free(nodename);
 }
 
 static void create_platform_bus(VirtMachineState *vms)
@@ -1976,6 +2006,13 @@ static void virt_machine_device_plug_cb(HotplugHandler 
*hotplug_dev,
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
 virt_memory_plug(hotplug_dev, dev, errp);
 }
+if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+PCIDevice *pdev = PCI_DEVICE(dev);
+
+vms->iommu = VIRT_IOMMU_VIRTIO;
+vms->virtio_iommu_bdf = pci_get_bdf(pdev);
+create_virtio_iommu_dt_bindings(vms, errp);
+}
 }
 
 static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
@@ -1992,7 +2029,13 @@ static HotplugHandler 
*virt_machine_get_hotplug_handler(Mach

Re: [PATCH v2 09/30] qapi: Fix indent level on doc comments in json files

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> The texinfo doc generation doesn't care much about indentation
> levels, but we would like to add a rST backend, and rST does care
> about indentation.
>
> Make the doc comments more strongly consistent about indentation
> for multiline constructs like:
>
> @arg: description line 1
>   description line 2
>
> Returns: line one
>  line 2
>
> so that there is always exactly one space after the colon, and
> subsequent lines align with the first.
>
> This commit is a purely whitespace change, and it does not alter the
> generated .texi files (because the texi generation code strips away
> all the extra whitespace).  This does mean that we end up with some
> over-length lines.

Overlong lines need to be corrected.  Not necessarily in this patch.

> Note that when the documentation for an argument fits on a single
> line like this:
>
> @arg: one line only
>
> then stray extra spaces after the ':' don't affect the rST output, so
> I have not attempted to methodically fix them, though the preference
> is a single space here too.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Markus Armbruster 




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread David Hildenbrand


>> diff --git a/migration/ram.c b/migration/ram.c
>> index ed23ed1c7c..f86f32b453 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -52,6 +52,7 @@
>>  #include "migration/colo.h"
>>  #include "block.h"
>>  #include "sysemu/sysemu.h"
>> +#include "sysemu/runstate.h"
>>  #include "savevm.h"
>>  #include "qemu/iov.h"
>>  #include "multifd.h"
>> @@ -3710,8 +3711,49 @@ static SaveVMHandlers savevm_ram_handlers = {
>>  .resume_prepare = ram_resume_prepare,
>>  };
>>  
>> +static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
>> +  size_t old_size, size_t new_size)
>> +{
>> +/*
>> + * We don't care about resizes triggered on incoming migration (when
>> + * syncing ram blocks), or of course, when no migration is going on.
>> + */
>> +if (migration_is_idle() || !runstate_is_running()) {
>> +return;
>> +}
>> +
>> +if (!postcopy_is_running()) {
>> +Error *err = NULL;
>> +
>> +/*
>> + * Precopy code cannot deal with the size of ram blocks changing at
>> + * random points in time. We're still running on the source, abort
>> + * the migration and continue running here. Make sure to wait until
>> + * migration was canceled.
>> + */
>> +error_setg(&err, "RAM resized during precopy.");
>> +migrate_set_error(migrate_get_current(), err);
>> +error_free(err);
>> +migration_cancel();
> 
> If we can't do anything else, this is reasonable.
> 
> But as discussed before, it is still not fully clear to me _why_ are
> ramblocks changing if we have disabled add/remove memory during migration.


Ramblock add/remove is ties to device add/remove, which we block.

Resize, however, it not. Here, the resize happens while the guest is
booting up. The content/size of the ram block depends also on previous
guest action AFAIK. There is no way from stopping the guest from doing
that. It's required for the guest to continue booting (with ACPI).

I'm currently working on a project which reuses resizable ram blocks in
different context. There, I can simply defer/avoid resizing when
migration is active. In the ACPI case, however, we cannot avoid it.

Hope that answers your question

> 
>> +} else {
>> +/*
>> + * Postcopy code cannot deal with the size of ram blocks changing at
>> + * random points in time. We're running on the target. Fail hard.
>> + *
>> + * TODO: How to handle this in a better way?
>> + */
>> +error_report("RAM resized during postcopy.");
>> +exit(-1);
> 
> Idea is good, but we also need to exit destination, not only source, no?

@Dave, any idea what could be the right thing to do here?

> 
>> +}
>> +}
> 
> 
> 
>> +static RAMBlockNotifier ram_mig_ram_notifier = {
>> +.ram_block_resized = ram_mig_ram_block_resized,
>> +};
>> +
>>  void ram_mig_init(void)
>>  {
>>  qemu_mutex_init(&XBZRLE.lock);
>>  register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
>> +ram_block_notifier_add(&ram_mig_ram_notifier);
>>  }
> 
> Shouldn't we remove the notifier when we finish the migration.

It's called from main() unconditionally (so not when migration starts),
so the notifier remains active the whole QEMU lifetime (which should be
fine AFAIKT).

-- 
Thanks,

David / dhildenb




Re: docs: Update vhost-user spec regarding backend program conventions

2020-02-14 Thread Marc-André Lureau
Hi

On Fri, Feb 14, 2020 at 2:24 PM Boeuf, Sebastien
 wrote:
>
> Hi Marc-Andre,
>
> On Tue, 2020-02-11 at 22:24 +0100, Marc-André Lureau wrote:
> > Hi
> >
> > On Tue, Feb 11, 2020 at 4:24 PM Boeuf, Sebastien
> >  wrote:
> > > From c073d528b8cd7082832fd1825dc33dd65b305aa2 Mon Sep 17 00:00:00
> > > 2001
> > > From: Sebastien Boeuf 
> > > Date: Tue, 11 Feb 2020 16:01:22 +0100
> > > Subject: [PATCH] docs: Update vhost-user spec regarding backend
> > > program
> > >  conventions
> > >
> > > The vhost-user specification is not clearly stating the expected
> > > behavior from a backend program whenever the client disconnects.
> > >
> > > This patch addresses the issue by defining the default behavior and
> > > proposing an alternative through a command line option.
> > >
> > > By default, a backend program will have to keep listening even if
> > > the
> > > client disconnects, unless told otherwise through the newly
> > > introduced
> > > option --exit-on-disconnect.
> > >
> > > Signed-off-by: Sebastien Boeuf 
> > > Signed-off-by: Stefan Hajnoczi 
> > > ---
> > >  docs/interop/vhost-user.rst | 10 ++
> > >  1 file changed, 10 insertions(+)
> > >
> > > diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-
> > > user.rst
> > > index 5f8b3a456b..da9a1ebc4c 100644
> > > --- a/docs/interop/vhost-user.rst
> > > +++ b/docs/interop/vhost-user.rst
> > > @@ -1323,6 +1323,10 @@ The backend program must end (as quickly and
> > > cleanly as possible) when
> > >  the SIGTERM signal is received. Eventually, it may receive SIGKILL
> > > by
> > >  the management layer after a few seconds.
> > >
> > > +By default, the backend program continues running after the client
> > > +disconnects. It accepts only 1 connection at a time on each UNIX
> > > domain
> > > +socket.
> >
> > I don't think that's the most common behaviour. libvhost-user will
> > panic() on disconnect in general, unless the error/exit is handled
> > gracefully by the backend.
>
> It's not the default behavior from libvhost-user, but that's exactly
> something I'd like to see changing. This should be the normal case if
> you think about a standard client/server connection, where the server
> would simply listen again after the client disconnects.

I disagree, a "normal" lifecycle is a single connection & instance per device.

Having the backend handle multiple connections is needlessly more
complicated. You need to correctly handle multiple states, flushed
anything private between connections etc. It should be optional.


> >
> > The most common case is to have 1-1 relation between device/qemu
> > instance and backend.
>
> Yes this part is fine, but that's not a reason why the backend should
> terminates.

It is simpler to ensure it is reset correctly.

>
> >
> > Why not restart the backend for another instance?
>
> Because you need some management tool to do so. And I think that by
> default it could be interesting to have the least amount of extra
> management involved.

The management layer should be involved if any side crashes or restart anyway.

>
> >
> > > +
> > >  The following command line options have an expected behaviour.
> > > They
> > >  are mandatory, unless explicitly said differently:
> > >
> > > @@ -1337,6 +1341,12 @@ are mandatory, unless explicitly said
> > > differently:
> > >vhost-user socket as file descriptor FDNUM. It is incompatible
> > > with
> > >--socket-path.
> > >
> > > +--exit-on-disconnect
> > > +
> > > +  When this option is provided, the backend program must terminate
> > > when
> > > +  the client disconnects. This can be used to keep the backend
> > > program's
> > > +  lifetime synchronized with its client process.
> >
> > This section list options that are mandatory. It's probably a bit
> > late
> > to add more mandatory options (I regret already some of them)
>
> The spec states "They are mandatory, unless explicitly said
> differently", and in this case I'm explicitely saying "When this option
> is provided", which means if it's not provided it's fine and we can
> ignore the fact it's not there.

Ah ok,  I think we can guard it with a capability too.

>
> >
> > Do we need to specify the behaviour on client disconnect? Can't we
> > leave that to the backend and management layer to decide?
>
> My goal here is to make the spec a bit less loose. I know libvhost-user
> is the de-facto implementation but we cannot just assume everything out
> of the libvhost-user implementation, especially since there is a
> dedicated spec. That's the reason why I thought it'd be nice to have
> the backend behavior well defined in the spec.

Sure, the goal of the spec is to have basic interoperability between
implementations, not to follow whatever libvhost-user is currently
doing.

> The point is, relying on the current definition, there's not enough
> information to make sure a VMM will properly interface with a vhost-
> user backend.

I don't know why having the backend handle multiple connections would
help with that.

Hav

Re: [PATCH v2 14/30] qapi/ui.json: Use explicit bulleted lists

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> A JSON block comment like this:
>  Returns: nothing on success
>   If @node is not a valid block device, DeviceNotFound
>   If @name is not found, GenericError with an explanation
>
> renders like this:
>
>  Returns: nothing on success If node is not a valid block device,
>  DeviceNotFound If name is not found, GenericError with an explanation
>
> because whitespace is not significant.
>
> Use an actual bulleted list, so that the formatting is correct.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Markus Armbruster 




Re: docs: Update vhost-user spec regarding backend program conventions

2020-02-14 Thread Daniel P . Berrangé
On Fri, Feb 14, 2020 at 03:00:34PM +0100, Marc-André Lureau wrote:
> Hi
> 
> On Fri, Feb 14, 2020 at 2:24 PM Boeuf, Sebastien
>  wrote:
> >
> > Hi Marc-Andre,
> >
> > On Tue, 2020-02-11 at 22:24 +0100, Marc-André Lureau wrote:
> > > Hi
> > >
> > > On Tue, Feb 11, 2020 at 4:24 PM Boeuf, Sebastien
> > >  wrote:
> > > > From c073d528b8cd7082832fd1825dc33dd65b305aa2 Mon Sep 17 00:00:00
> > > > 2001
> > > > From: Sebastien Boeuf 
> > > > Date: Tue, 11 Feb 2020 16:01:22 +0100
> > > > Subject: [PATCH] docs: Update vhost-user spec regarding backend
> > > > program
> > > >  conventions
> > > >
> > > > The vhost-user specification is not clearly stating the expected
> > > > behavior from a backend program whenever the client disconnects.
> > > >
> > > > This patch addresses the issue by defining the default behavior and
> > > > proposing an alternative through a command line option.
> > > >
> > > > By default, a backend program will have to keep listening even if
> > > > the
> > > > client disconnects, unless told otherwise through the newly
> > > > introduced
> > > > option --exit-on-disconnect.
> > > >
> > > > Signed-off-by: Sebastien Boeuf 
> > > > Signed-off-by: Stefan Hajnoczi 
> > > > ---
> > > >  docs/interop/vhost-user.rst | 10 ++
> > > >  1 file changed, 10 insertions(+)
> > > >
> > > > diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-
> > > > user.rst
> > > > index 5f8b3a456b..da9a1ebc4c 100644
> > > > --- a/docs/interop/vhost-user.rst
> > > > +++ b/docs/interop/vhost-user.rst
> > > > @@ -1323,6 +1323,10 @@ The backend program must end (as quickly and
> > > > cleanly as possible) when
> > > >  the SIGTERM signal is received. Eventually, it may receive SIGKILL
> > > > by
> > > >  the management layer after a few seconds.
> > > >
> > > > +By default, the backend program continues running after the client
> > > > +disconnects. It accepts only 1 connection at a time on each UNIX
> > > > domain
> > > > +socket.
> > >
> > > I don't think that's the most common behaviour. libvhost-user will
> > > panic() on disconnect in general, unless the error/exit is handled
> > > gracefully by the backend.
> >
> > It's not the default behavior from libvhost-user, but that's exactly
> > something I'd like to see changing. This should be the normal case if
> > you think about a standard client/server connection, where the server
> > would simply listen again after the client disconnects.
> 
> I disagree, a "normal" lifecycle is a single connection & instance per device.
> 
> Having the backend handle multiple connections is needlessly more
> complicated. You need to correctly handle multiple states, flushed
> anything private between connections etc. It should be optional.
> 
> 
> > >
> > > The most common case is to have 1-1 relation between device/qemu
> > > instance and backend.
> >
> > Yes this part is fine, but that's not a reason why the backend should
> > terminates.
> 
> It is simpler to ensure it is reset correctly.
> 
> >
> > >
> > > Why not restart the backend for another instance?
> >
> > Because you need some management tool to do so. And I think that by
> > default it could be interesting to have the least amount of extra
> > management involved.
> 
> The management layer should be involved if any side crashes or restart anyway.

Further, this vhost-user.rst spec document is explicitly describing
the contract between the vhost-user binaries and the management
layer. So it doesn't make sense to say update this doc to describe
desired semantics for usage /without/ a management layer.

So I agree, the default behaviour should be that there is one binary
spawned at time the associated device is initialization, and that
the lifetime of the binary is 1:1 associated with the lifetime of
the VM, or until the device is unplugged. 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v2 15/30] qapi/{block, misc, tmp, net}.json: Use explicit bulleted lists

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> A JSON block comment like this:
>  Returns: nothing on success
>   If @node is not a valid block device, DeviceNotFound
>   If @name is not found, GenericError with an explanation
>
> renders like this:
>
>  Returns: nothing on success If node is not a valid block device,
>  DeviceNotFound If name is not found, GenericError with an explanation
>
> because whitespace is not significant.
>
> Use an actual bulleted list, so that the formatting is correct.
>
> This commit gathers up the remaining json files which had
> places needing this fix.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Markus Armbruster 

What's the rationale for the split between

[PATCH v2 13/30] qapi/block-core.json: Use explicit bulleted lists
[PATCH v2 14/30] qapi/ui.json: Use explicit bulleted lists
[PATCH v2 15/30] qapi/{block, misc, tmp, net}.json: Use explicit bulleted 
lists

?




Re: [PATCH v2 05/30] qga/qapi-schema.json: Fix indent level on doc comments

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> On Fri, 14 Feb 2020 at 12:36, Markus Armbruster  wrote:
>>
>> Peter Maydell  writes:
>>
>> > The texinfo doc generation doesn't care much about indentation
>>
>> Let's spell Texinfo with a capital T.
>>
>> > levels, but we would like to add a rST backend, and rST does care
>> > about indentation.
>>
>> Nitpick: an rST *backend* should not care about the doc generator's
>> *input* format.  We actually intend to change the input format into a
>> domain-specific dialect of rST, as you state in your cover letter: "This
>> series switches all our QAPI doc comments over from texinfo format to
>> rST."
>>
>> Perhaps:
>>
>>   The current doc generation doesn't care much about indentation levels,
>>   but we would like to switch to an rST format, and rST does care about
>>   indentation.
>
> Works for me. Let me know if you're OK just making this kind
> of commit message tweak/minor fixup and sending a pull, or
> if you want me to roll a v3.

I'm happy to tweak commit messages myself.  As always in such cases,
I'll appreciate an eye-over before the merge.




Re: [PATCH 3/3] arm: allwinner: Wire up USB EHCI

2020-02-14 Thread Guenter Roeck

On 2/14/20 2:08 AM, Gerd Hoffmann wrote:

ehci-platform 1c1c000.usb: new USB bus registered, assigned bus number 2
ehci-platform 1c1c000.usb: irq 31, io mem 0x01c1c000
ehci-platform 1c1c000.usb: USB 2.0 started, EHCI 1.00



+for (i = 0; i < ARRAY_SIZE(s->ehci); i++) {
+object_property_set_bool(OBJECT(&s->ehci[i]), true, "realized",
+ &err);


I suspect the ohci controllers are ehci companions, i.e. they handle a
single usb bus, with ehci handling usb2 and ohci handling usb1 devices?

If so then you should initialize ehci first, explicitly assign bus
names and set the masterbus property for the ohci controllers.

See also docs/usb2.txt



Thanks for the references. Makes sense; I'll give it a try.

Guenter



Re: [PATCH v2 15/30] qapi/{block, misc, tmp, net}.json: Use explicit bulleted lists

2020-02-14 Thread Peter Maydell
On Fri, 14 Feb 2020 at 14:23, Markus Armbruster  wrote:
> What's the rationale for the split between
>
> [PATCH v2 13/30] qapi/block-core.json: Use explicit bulleted lists
> [PATCH v2 14/30] qapi/ui.json: Use explicit bulleted lists
> [PATCH v2 15/30] qapi/{block, misc, tmp, net}.json: Use explicit bulleted 
> lists
>
> ?

Just keeping down the size of individual patches for ease of
review. ui.json and block-core.json both had enough uses that
needed fixing to feel like reasonably patch-sized amounts. This
patch is all the straggler files, none of which had numerous
enough changes to seem worth splitting further.

thanks
-- PMM



[Bug 1863025] Re: Use-after-free after flush in TCG accelerator

2020-02-14 Thread Alex Bennée
** Tags added: tcg

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1863025

Title:
  Use-after-free after flush in TCG accelerator

Status in QEMU:
  Confirmed

Bug description:
  I believe I found a UAF in TCG that can lead to a guest VM escape. The
  security list informed me "This can not be treated as a security
  issue." and to post it here. I am looking at the 4.2.0 source code.
  The issue requires a race and I will try to describe it in terms of
  three concurrent threads.

  Thread A:

  A1. qemu_tcg_cpu_thread_fn runs work loop
  A2. qemu_wait_io_event => qemu_wait_io_event_common => process_queued_cpu_work
  A3. start_exclusive critical section entered
  A4. do_tb_flush is called, TB memory freed/re-allocated
  A5. end_exclusive exits critical section

  Thread B:

  B1. qemu_tcg_cpu_thread_fn runs work loop
  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc obtains a new TB

  Thread C:

  C1. qemu_tcg_cpu_thread_fn runs work loop
  C2. cpu_exec_step_atomic executes
  C3. TB obtained with tb_lookup__cpu_state or tb_gen_code
  C4. start_exclusive critical section entered
  C5. cpu_tb_exec executes the TB code
  C6. end_exclusive exits critical section

  Consider the following sequence of events:
    B2 => B3 => C3 (same TB as B2) => A3 => A4 (TB freed) => A5 => B2 =>
    B3 (re-allocates TB from B2) => C4 => C5 (freed/reused TB now executing) => 
C6

  In short, because thread C uses the TB in the critical section, there
  is no guarantee that the pointer has not been "freed" (rather the
  memory is marked as re-usable) and therefore a use-after-free occurs.

  Since the TCG generated code can be in the same memory as the TB data
  structure, it is possible for an attacker to overwrite the UAF pointer
  with code generated from TCG. This can overwrite key pointer values
  and could lead to code execution on the host outside of the TCG
  sandbox.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1863025/+subscriptions



Re: [PATCH v2 06/30] qga/qapi-schema.json: minor format fixups for rST

2020-02-14 Thread Markus Armbruster
Markus Armbruster  writes:

> Peter Maydell  writes:
>
>> rST format requires a blank line before the start of a bulleted
>> or enumerated list. Two places in qapi-schema.json were missing
>> this blank line.
>
> Relies on the previous commit message's "we would like to add a rST
> backend" to establish context.  Suggest:
>
>   We would like to switch the doc comments to rST format, and rST
>   requires...

A few more commit message could use the same treatment.  Since I have
reason to hope to find only minor things I can address in my tree, I
won't point it out every time, but just do it, then give you an
opportunity to check my tweaks.

[...]




Re: [PATCH v2 16/30] qapi: Add blank lines before bulleted lists

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> rST insists on a blank line before and after a bulleted list,
> but our texinfo doc generator did not. Add some extra blank
> lines in the doc comments so they're acceptable rST input.
>
> Signed-off-by: Peter Maydell 
> Reviewed-by: Philippe Mathieu-Daudé 

Reviewed-by: Markus Armbruster 




Re: [PATCH v2 17/30] qapi/migration.json: Replace _this_ with *this*

2020-02-14 Thread Markus Armbruster
Peter Maydell  writes:

> The MigrationInfo::setup-time documentation is the only place where
> we use _this_ inline markup for emphasis, commonly rendered in
> italics.  rST doesn't recognize that markup and emits literal
> underscores.
>
> Switch to *this* instead.  Changes markup to strong emphasis with
> Texinfo, commonly rendered as bold.  With rST, it will go right back
> to emphasis / italics.
>
> rST also uses **this** for strong (commonly rendered bold) where
> Texinfo uses *this*. We have one place in the doc comments
> which uses strong/bold markup, in qapi/introspect.json:
> Note: the QAPI schema is also used to help define *internal*
>
> When we switch to rST that will be rendered as emphasis / italics.
> Markus (who wrote that) thinks that using emphasis / italics
> there is an improvement, so we leave that markup alone.
>
> Signed-off-by: Peter Maydell 
> Reviewed-by: Markus Armbruster 
> ---
> v1->v2: updated commit message. Paras 1 and 2 are from Markus;
> paras 3 and 4 are new, and mention a non-change agreed in
> the thread off the cover letter of the v1 patch series.

I appreciate your diligent notes here.  Thanks!

> ---
>  qapi/migration.json | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 11033b7a8e6..52f34299698 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -178,8 +178,8 @@
>  # expected downtime in milliseconds for the guest in 
> last walk
>  # of the dirty bitmap. (since 1.3)
>  #
> -# @setup-time: amount of setup time in milliseconds _before_ the
> -#  iterations begin but _after_ the QMP command is issued. This 
> is designed
> +# @setup-time: amount of setup time in milliseconds *before* the
> +#  iterations begin but *after* the QMP command is issued. This 
> is designed
>  #  to provide an accounting of any activities (such as RDMA 
> pinning) which
>  #  may be expensive, but do not actually occur during the 
> iterative
>  #  migration rounds themselves. (since 1.6)

Reviewed-by: Markus Armbruster 




[Bug 1863025] Re: Use-after-free after flush in TCG accelerator

2020-02-14 Thread Alex Bennée
** Changed in: qemu
 Assignee: (unassigned) => Alex Bennée (ajbennee)

** Changed in: qemu
   Status: New => Confirmed

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1863025

Title:
  Use-after-free after flush in TCG accelerator

Status in QEMU:
  Confirmed

Bug description:
  I believe I found a UAF in TCG that can lead to a guest VM escape. The
  security list informed me "This can not be treated as a security
  issue." and to post it here. I am looking at the 4.2.0 source code.
  The issue requires a race and I will try to describe it in terms of
  three concurrent threads.

  Thread A:

  A1. qemu_tcg_cpu_thread_fn runs work loop
  A2. qemu_wait_io_event => qemu_wait_io_event_common => process_queued_cpu_work
  A3. start_exclusive critical section entered
  A4. do_tb_flush is called, TB memory freed/re-allocated
  A5. end_exclusive exits critical section

  Thread B:

  B1. qemu_tcg_cpu_thread_fn runs work loop
  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc obtains a new TB

  Thread C:

  C1. qemu_tcg_cpu_thread_fn runs work loop
  C2. cpu_exec_step_atomic executes
  C3. TB obtained with tb_lookup__cpu_state or tb_gen_code
  C4. start_exclusive critical section entered
  C5. cpu_tb_exec executes the TB code
  C6. end_exclusive exits critical section

  Consider the following sequence of events:
    B2 => B3 => C3 (same TB as B2) => A3 => A4 (TB freed) => A5 => B2 =>
    B3 (re-allocates TB from B2) => C4 => C5 (freed/reused TB now executing) => 
C6

  In short, because thread C uses the TB in the critical section, there
  is no guarantee that the pointer has not been "freed" (rather the
  memory is marked as re-usable) and therefore a use-after-free occurs.

  Since the TCG generated code can be in the same memory as the TB data
  structure, it is possible for an attacker to overwrite the UAF pointer
  with code generated from TCG. This can overwrite key pointer values
  and could lead to code execution on the host outside of the TCG
  sandbox.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1863025/+subscriptions



[PATCH] accel/tcg: fix race in cpu_exec_step_atomic (bug 1863025)

2020-02-14 Thread Alex Bennée
The bug describes a race whereby cpu_exec_step_atomic can acquire a TB
which is invalidated by a tb_flush before we execute it. This doesn't
affect the other cpu_exec modes as a tb_flush by it's nature can only
occur on a quiescent system. The race was described as:

  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc obtains a new TB

  C3. TB obtained with tb_lookup__cpu_state or tb_gen_code
  (same TB as B2)

  A3. start_exclusive critical section entered
  A4. do_tb_flush is called, TB memory freed/re-allocated
  A5. end_exclusive exits critical section

  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc reallocates TB from B2

  C4. start_exclusive critical section entered
  C5. cpu_tb_exec executes the TB code that was free in A4

The simplest fix is to widen the exclusive period to include the TB
lookup. As a result we can drop the complication of checking we are in
the exclusive region before we end it.

Signed-off-by: Alex Bennée 
Cc: Yifan 
Cc: Bug 1863025 <1863...@bugs.launchpad.net>
---
 accel/tcg/cpu-exec.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 2560c90eec7..d95c4848a47 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -240,6 +240,8 @@ void cpu_exec_step_atomic(CPUState *cpu)
 uint32_t cf_mask = cflags & CF_HASH_MASK;
 
 if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+start_exclusive();
+
 tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
 if (tb == NULL) {
 mmap_lock();
@@ -247,8 +249,6 @@ void cpu_exec_step_atomic(CPUState *cpu)
 mmap_unlock();
 }
 
-start_exclusive();
-
 /* Since we got here, we know that parallel_cpus must be true.  */
 parallel_cpus = false;
 cc->cpu_exec_enter(cpu);
@@ -271,14 +271,15 @@ void cpu_exec_step_atomic(CPUState *cpu)
 qemu_plugin_disable_mem_helpers(cpu);
 }
 
-if (cpu_in_exclusive_context(cpu)) {
-/* We might longjump out of either the codegen or the
- * execution, so must make sure we only end the exclusive
- * region if we started it.
- */
-parallel_cpus = true;
-end_exclusive();
-}
+
+/*
+ * As we start the exclusive region before codegen we must still
+ * be in the region if we longjump out of either the codegen or
+ * the execution.
+ */
+g_assert(cpu_in_exclusive_context(cpu));
+parallel_cpus = true;
+end_exclusive();
 }
 
 struct tb_desc {
-- 
2.20.1




[PATCH v1 0/4] Extension of migration tests

2020-02-14 Thread Oksana Vohchana
This series adds a new migration test through RDMA and provides new
functions to it.
The last update by mistake was not provided a full scenario to the EXEC
migration test. One of patch fixed it.

Oksana Vohchana (4):
  Acceptance test: add address as param
  Acceptance test: EXEC migration
  Acceptance test: provides new functions
  Acceptance test: provides to use RDMA transport for migration

 tests/acceptance/migration.py | 31 +--
 1 file changed, 29 insertions(+), 2 deletions(-)

-- 
2.21.1




[PATCH v1 3/4] Acceptance test: provides new functions

2020-02-14 Thread Oksana Vohchana
Adds functions to check if service RDMA is enabled and gets the interface
where it was configured

Signed-off-by: Oksana Vohchana 
---
 tests/acceptance/migration.py | 17 +
 1 file changed, 17 insertions(+)

diff --git a/tests/acceptance/migration.py b/tests/acceptance/migration.py
index 8209dcf71d..bbd88f8dda 100644
--- a/tests/acceptance/migration.py
+++ b/tests/acceptance/migration.py
@@ -11,12 +11,16 @@
 
 
 import tempfile
+import re
+import netifaces
 from avocado_qemu import Test
 from avocado import skipUnless
 
 from avocado.utils import network
 from avocado.utils import wait
 from avocado.utils.path import find_command
+from avocado.utils import service
+from avocado.utils import process
 
 
 class Migration(Test):
@@ -58,6 +62,19 @@ class Migration(Test):
 self.cancel('Failed to find a free port')
 return port
 
+def _if_rdma_enable(self):
+rdma_stat = service.ServiceManager()
+rdma = rdma_stat.status('rdma')
+return rdma
+
+def _get_ip_rdma(self):
+get_ip_rdma = process.run('rdma link show').stdout.decode()
+for line in get_ip_rdma.split('\n'):
+if re.search(r"ACTIVE", line):
+interface = line.split(" ")[-2]
+ip = 
netifaces.ifaddresses(interface)[netifaces.AF_INET][0]['addr']
+return ip
+
 
 def test_migration_with_tcp_localhost(self):
 dest_uri = 'tcp:localhost:%u' % self._get_free_port()
-- 
2.21.1




[PATCH v1 2/4] Acceptance test: EXEC migration

2020-02-14 Thread Oksana Vohchana
Improves EXEC migration to run whole test stage

Signed-off-by: Oksana Vohchana 
---
 tests/acceptance/migration.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/acceptance/migration.py b/tests/acceptance/migration.py
index e4c39b85a1..8209dcf71d 100644
--- a/tests/acceptance/migration.py
+++ b/tests/acceptance/migration.py
@@ -75,3 +75,5 @@ class Migration(Test):
 """
 free_port = self._get_free_port()
 dest_uri = 'exec:nc -l localhost %u' % free_port
+src_uri = 'exec:nc localhost %u' % free_port
+self.do_migrate(dest_uri, src_uri)
-- 
2.21.1




[PATCH v1 1/4] Acceptance test: add address as param

2020-02-14 Thread Oksana Vohchana
Provides param address in _get_free_port()
because by default it takes free port only on the localhost

Signed-off-by: Oksana Vohchana 
---
 tests/acceptance/migration.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/acceptance/migration.py b/tests/acceptance/migration.py
index a8367ca023..e4c39b85a1 100644
--- a/tests/acceptance/migration.py
+++ b/tests/acceptance/migration.py
@@ -52,8 +52,8 @@ class Migration(Test):
 source_vm.qmp('migrate', uri=src_uri)
 self.assert_migration(source_vm, dest_vm)
 
-def _get_free_port(self):
-port = network.find_free_port()
+def _get_free_port(self, address='localhost'):
+port = network.find_free_port(address=address)
 if port is None:
 self.cancel('Failed to find a free port')
 return port
-- 
2.21.1




[PATCH v1 4/4] Acceptance test: provides to use RDMA transport for migration

2020-02-14 Thread Oksana Vohchana
Adds test for RDMA migration check

Signed-off-by: Oksana Vohchana 
---
 tests/acceptance/migration.py | 8 
 1 file changed, 8 insertions(+)

diff --git a/tests/acceptance/migration.py b/tests/acceptance/migration.py
index bbd88f8dda..c0a3031e67 100644
--- a/tests/acceptance/migration.py
+++ b/tests/acceptance/migration.py
@@ -94,3 +94,11 @@ class Migration(Test):
 dest_uri = 'exec:nc -l localhost %u' % free_port
 src_uri = 'exec:nc localhost %u' % free_port
 self.do_migrate(dest_uri, src_uri)
+
+@skipUnless(_if_rdma_enable(None), "Unit rdma.service could not be found")
+@skipUnless(_get_ip_rdma(None), 'RoCE(RDMA) service or interface not 
configured')
+def test_migration_with_rdma_localhost(self):
+ip = self._get_ip_rdma()
+free_port = self._get_free_port(address=ip)
+dest_uri = 'rdma:%s:%u' % (ip, free_port)
+self.do_migrate(dest_uri)
-- 
2.21.1




[Bug 1863025] Re: Use-after-free after flush in TCG accelerator

2020-02-14 Thread Alex Bennée
I've attached a variant of the suggested patch which simply expands the
exclusive period. It's hard to test extensively as not many things use
the EXCP_ATOMIC mechanism. Can I ask how you found the bug and if you
can re-test with the suggested patch?

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1863025

Title:
  Use-after-free after flush in TCG accelerator

Status in QEMU:
  Confirmed

Bug description:
  I believe I found a UAF in TCG that can lead to a guest VM escape. The
  security list informed me "This can not be treated as a security
  issue." and to post it here. I am looking at the 4.2.0 source code.
  The issue requires a race and I will try to describe it in terms of
  three concurrent threads.

  Thread A:

  A1. qemu_tcg_cpu_thread_fn runs work loop
  A2. qemu_wait_io_event => qemu_wait_io_event_common => process_queued_cpu_work
  A3. start_exclusive critical section entered
  A4. do_tb_flush is called, TB memory freed/re-allocated
  A5. end_exclusive exits critical section

  Thread B:

  B1. qemu_tcg_cpu_thread_fn runs work loop
  B2. tcg_cpu_exec => cpu_exec => tb_find => tb_gen_code
  B3. tcg_tb_alloc obtains a new TB

  Thread C:

  C1. qemu_tcg_cpu_thread_fn runs work loop
  C2. cpu_exec_step_atomic executes
  C3. TB obtained with tb_lookup__cpu_state or tb_gen_code
  C4. start_exclusive critical section entered
  C5. cpu_tb_exec executes the TB code
  C6. end_exclusive exits critical section

  Consider the following sequence of events:
    B2 => B3 => C3 (same TB as B2) => A3 => A4 (TB freed) => A5 => B2 =>
    B3 (re-allocates TB from B2) => C4 => C5 (freed/reused TB now executing) => 
C6

  In short, because thread C uses the TB in the critical section, there
  is no guarantee that the pointer has not been "freed" (rather the
  memory is marked as re-usable) and therefore a use-after-free occurs.

  Since the TCG generated code can be in the same memory as the TB data
  structure, it is possible for an attacker to overwrite the UAF pointer
  with code generated from TCG. This can overwrite key pointer values
  and could lead to code execution on the host outside of the TCG
  sandbox.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1863025/+subscriptions



[PATCH v2 1/2] spapr: Don't use spapr_drc_needed() in CAS code

2020-02-14 Thread Greg Kurz
We currently don't support hotplug of devices between boot and CAS. If
this happens a CAS reboot is triggered. We detect this during CAS using
the spapr_drc_needed() function which is essentially a VMStateDescription
.needed callback. Even if the condition for CAS reboot happens to be the
same as for DRC migration, it looks wrong to piggyback a migration helper
for this.

Introduce a helper with slightly more explicit name and use it in both CAS
and DRC migration code. Since a subsequent patch will enhance this helper
to cover the case of hot unplug, let's go for spapr_drc_transient(). While
here convert spapr_hotplugged_dev_before_cas() to the "transient" wording as
well.

This doesn't change any behaviour.

Signed-off-by: Greg Kurz 
---
v2: - spapr_drc_transient() helper
---
 hw/ppc/spapr_drc.c |   20 ++--
 hw/ppc/spapr_hcall.c   |   14 +-
 include/hw/ppc/spapr_drc.h |4 +++-
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index fc62e049010f..4c35ce7c5c37 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -456,23 +456,31 @@ void spapr_drc_reset(SpaprDrc *drc)
 }
 }
 
-bool spapr_drc_needed(void *opaque)
+bool spapr_drc_transient(SpaprDrc *drc)
 {
-SpaprDrc *drc = (SpaprDrc *)opaque;
 SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 
-/* If no dev is plugged in there is no need to migrate the DRC state */
+/*
+ * If no dev is plugged in there is no need to migrate the DRC state
+ * nor to reset the DRC at CAS.
+ */
 if (!drc->dev) {
 return false;
 }
 
 /*
- * We need to migrate the state if it's not equal to the expected
- * long-term state, which is the same as the coldplugged initial
- * state */
+ * We need to reset the DRC at CAS or to migrate the DRC state if it's
+ * not equal to the expected long-term state, which is the same as the
+ * coldplugged initial state.
+ */
 return (drc->state != drck->ready_state);
 }
 
+static bool spapr_drc_needed(void *opaque)
+{
+return spapr_drc_transient(opaque);
+}
+
 static const VMStateDescription vmstate_spapr_drc = {
 .name = "spapr_drc",
 .version_id = 1,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index b8bb66b5c0d4..6db3dbde9c92 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1640,20 +1640,24 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, 
PowerPCCPU *cpu,
 return best_compat;
 }
 
-static bool spapr_hotplugged_dev_before_cas(void)
+static bool spapr_transient_dev_before_cas(void)
 {
-Object *drc_container, *obj;
+Object *drc_container;
 ObjectProperty *prop;
 ObjectPropertyIterator iter;
 
 drc_container = container_get(object_get_root(), "/dr-connector");
 object_property_iter_init(&iter, drc_container);
 while ((prop = object_property_iter_next(&iter))) {
+SpaprDrc *drc;
+
 if (!strstart(prop->type, "link<", NULL)) {
 continue;
 }
-obj = object_property_get_link(drc_container, prop->name, NULL);
-if (spapr_drc_needed(obj)) {
+drc = SPAPR_DR_CONNECTOR(object_property_get_link(drc_container,
+  prop->name, NULL));
+
+if (spapr_drc_transient(drc)) {
 return true;
 }
 }
@@ -1830,7 +1834,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 
 spapr_irq_update_active_intc(spapr);
 
-if (spapr_hotplugged_dev_before_cas()) {
+if (spapr_transient_dev_before_cas()) {
 spapr->cas_reboot = true;
 }
 
diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
index df3d958a66a2..21af8deac13f 100644
--- a/include/hw/ppc/spapr_drc.h
+++ b/include/hw/ppc/spapr_drc.h
@@ -278,7 +278,9 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, 
uint32_t drc_type_mask);
 
 void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp);
 void spapr_drc_detach(SpaprDrc *drc);
-bool spapr_drc_needed(void *opaque);
+
+/* Returns true if a hot plug/unplug request is pending */
+bool spapr_drc_transient(SpaprDrc *drc);
 
 static inline bool spapr_drc_unplug_requested(SpaprDrc *drc)
 {




[PATCH v2 0/2] spapr: Fix device unplug vs CAS or migration

2020-02-14 Thread Greg Kurz
While working on getting rid of CAS reboot, I realized that we currently
don't handle device hot unplug properly in the following situations:

1) if the device is unplugged between boot and CAS, SLOF doesn't handle
   the even, which is a known limitation. The device hence stays around
   forever (specifically, until some other event is emitted and the guest
   eventually completes the unplug or a reboot). Until we can teach SLOF
   to correctly process the full FDT at CAS, we should trigger a CAS reboot,
   like we already do for hotplug.

2) if the guest is migrated after the even was emitted but before the
   guest could process it, the destination is unaware of the pending
   unplug operation and doesn't remove the device when the guests
   releases it. The 'unplug_requested' field of the DRC is actually state
   that should be migrated.

Changes since v1:
   - new spapr_drc_transient() helper that covers pending plug and unplug
 situations for both CAS and migration
   - as a mechanical consequence, fix unplug for CAS an migration in the
 same patch

--
Greg

---

Greg Kurz (2):
  spapr: Don't use spapr_drc_needed() in CAS code
  spapr: Fix handling of unplugged devices during CAS and migration


 hw/ppc/spapr_drc.c |   43 ---
 hw/ppc/spapr_hcall.c   |   14 +-
 include/hw/ppc/spapr_drc.h |4 +++-
 3 files changed, 48 insertions(+), 13 deletions(-)




Re: [PATCH 1/2] hw/arm/xilinx_zynq: Fix USB port instantiation

2020-02-14 Thread Guenter Roeck

On 2/14/20 1:26 AM, Peter Maydell wrote:

On Fri, 14 Feb 2020 at 06:05, Guenter Roeck  wrote:


USB ports must be instantiated as TYPE_CHIPIDEA to work.
Linux expects and checks various chipidea registers, which
do not exist with the basic ehci emulation.


Hi; I didn't see a cover letter email for this series (sorry if
I missed it). Would you mind including a cover letter email


I didn't send a cover letter, sorry.


for any more-than-one-patch set, please? It helps the
automatic tools and it also decreases the chances that
I miss the patches when I'm scanning through my email
(when I'm tagging things I might want to review I basically
look through subject lines for single patch patches
and for 00/nn cover letters and ignore middle-of-patchset
emails...)



Sure, NP. I'll resend with a cover letter.

Guenter



[PATCH v2 2/2] spapr: Fix handling of unplugged devices during CAS and migration

2020-02-14 Thread Greg Kurz
We already detect if a device is being hot plugged before CAS to trigger
a CAS reboot and during migration to migrate the state of the associated
DRC. But hot unplugging a device is also an asynchronous operation that
requires the guest to take action. This means that if the guest is migrated
after the hot unplug event was sent but before it could release the device
with RTAS, the destination QEMU doesn't know about the pending unplug
operation and doesn't actually remove the device when the guest finally
releases it.

Similarly, if the unplug request is fired before CAS, the guest isn't
notified of the change, just like with hotplug. It ends up booting with
the device still present in the DT and configures it, just like it was
never removed. Even weirder, since the event is still queued, it will
be eventually processed when some other unrelated event is posted to
the guest.

Enhance spapr_drc_transient() to also return true if an unplug request is
pending. This fixes the issue at CAS with a CAS reboot request and
causes the DRC state to be migrated. Some extra care is still needed to
inform the destination that an unplug request is pending : migrate the
unplug_requested field of the DRC in an optional subsection. This might
break backwards migration, but this is still better than ending with
an inconsistent guest.

Signed-off-by: Greg Kurz 
---
 hw/ppc/spapr_drc.c |   25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index 4c35ce7c5c37..e373d342eb84 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -456,6 +456,22 @@ void spapr_drc_reset(SpaprDrc *drc)
 }
 }
 
+static bool spapr_drc_unplug_requested_needed(void *opaque)
+{
+return spapr_drc_unplug_requested(opaque);
+}
+
+static const VMStateDescription vmstate_spapr_drc_unplug_requested = {
+.name = "spapr_drc/unplug_requested",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_drc_unplug_requested_needed,
+.fields  = (VMStateField []) {
+VMSTATE_BOOL(unplug_requested, SpaprDrc),
+VMSTATE_END_OF_LIST()
+}
+};
+
 bool spapr_drc_transient(SpaprDrc *drc)
 {
 SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
@@ -471,9 +487,10 @@ bool spapr_drc_transient(SpaprDrc *drc)
 /*
  * We need to reset the DRC at CAS or to migrate the DRC state if it's
  * not equal to the expected long-term state, which is the same as the
- * coldplugged initial state.
+ * coldplugged initial state, or if an unplug request is pending.
  */
-return (drc->state != drck->ready_state);
+return drc->state != drck->ready_state ||
+spapr_drc_unplug_requested(drc);
 }
 
 static bool spapr_drc_needed(void *opaque)
@@ -489,6 +506,10 @@ static const VMStateDescription vmstate_spapr_drc = {
 .fields  = (VMStateField []) {
 VMSTATE_UINT32(state, SpaprDrc),
 VMSTATE_END_OF_LIST()
+},
+.subsections = (const VMStateDescription * []) {
+&vmstate_spapr_drc_unplug_requested,
+NULL
 }
 };
 




Re: [PATCH v2 18/30] qapi: Delete all the "foo: dropped in n.n" notes

2020-02-14 Thread Markus Armbruster
Markus Armbruster  writes:

> Peter Maydell  writes:
>
>> A handful of QAPI doc comments include lines like
>> "ppcemb: dropped in 3.1". The doc comment parser will just
>> put these into whatever the preceding section was; sometimes
>> that's "Notes", and sometimes it's some random other section,
>> as with "NetClientDriver" where the "'dump': dropped in 2.12"
>> line ends up in the "Since:" section.
>>
>> This tends to render wrongly, more so in the upcoming rST
>> generator, but sometimes even in the texinfo, as in the case
>> of QKeyCode:
>>ac_bookmarks
>>since 2.10 altgr, altgr_r: dropped in 2.10
>>
>> We now have a better place to tell users about deprecated
>> and deleted functionality -- qemu-deprecated.texi.
>> So just remove all these "dropped in" remarks entirely.

The first sentence makes me expect we'll move these bits to the better
place.  The second then tells me we drop them, without giving a reason.

Suggest:

   Since commit 3264ffced3 (v4.2.0), we have a better place to tell
   users about deprecated and deleted functionality --
   qemu-deprecated.texi.  These "dropped in" remarks all predate it, and
   other feature drops of that vintage are not documented anywhere, so
   moving these to qemu-deprecated.texi makes little sense.  Drop them
   instead.

With something like that
Reviewed-by: Markus Armbruster 

>>
>> Signed-off-by: Peter Maydell 
>> ---
>> Perhaps qemu-deprecated.texi should be updated -- Markus
>> said he'd look into that. So this patch is to some extent
>> a placeholder to get these broken bits of doc comment out
>> of the way.
>
> The appropriate place is appendix "Recently removed features", which
> appeared in commit 3264ffced3 "dirty-bitmaps: remove deprecated autoload
> parameter", v4.2.0.  We did not document any prior removals then.
>
> Perhaps we should systematically document all removals since v4.1.0.  I
> can look into that.
>
> I'm not sure documenting older removals now is worth our while.  If you
> think it is, let me know.
>
> All the 'dropped in' notes removed in this patch are older.  Nothing to
> do for qemu-deprecated.texi unless we choose to systematically document
> older removals.




Re: [PATCH RFC] memory: Don't allow to resize RAM while migrating

2020-02-14 Thread Dr. David Alan Gilbert
* David Hildenbrand (da...@redhat.com) wrote:
> 
> >> diff --git a/migration/ram.c b/migration/ram.c
> >> index ed23ed1c7c..f86f32b453 100644
> >> --- a/migration/ram.c
> >> +++ b/migration/ram.c
> >> @@ -52,6 +52,7 @@
> >>  #include "migration/colo.h"
> >>  #include "block.h"
> >>  #include "sysemu/sysemu.h"
> >> +#include "sysemu/runstate.h"
> >>  #include "savevm.h"
> >>  #include "qemu/iov.h"
> >>  #include "multifd.h"
> >> @@ -3710,8 +3711,49 @@ static SaveVMHandlers savevm_ram_handlers = {
> >>  .resume_prepare = ram_resume_prepare,
> >>  };
> >>  
> >> +static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
> >> +  size_t old_size, size_t new_size)
> >> +{
> >> +/*
> >> + * We don't care about resizes triggered on incoming migration (when
> >> + * syncing ram blocks), or of course, when no migration is going on.
> >> + */
> >> +if (migration_is_idle() || !runstate_is_running()) {
> >> +return;
> >> +}
> >> +
> >> +if (!postcopy_is_running()) {
> >> +Error *err = NULL;
> >> +
> >> +/*
> >> + * Precopy code cannot deal with the size of ram blocks changing 
> >> at
> >> + * random points in time. We're still running on the source, abort
> >> + * the migration and continue running here. Make sure to wait 
> >> until
> >> + * migration was canceled.
> >> + */
> >> +error_setg(&err, "RAM resized during precopy.");
> >> +migrate_set_error(migrate_get_current(), err);
> >> +error_free(err);
> >> +migration_cancel();
> > 
> > If we can't do anything else, this is reasonable.
> > 
> > But as discussed before, it is still not fully clear to me _why_ are
> > ramblocks changing if we have disabled add/remove memory during migration.
> 
> 
> Ramblock add/remove is ties to device add/remove, which we block.
> 
> Resize, however, it not. Here, the resize happens while the guest is
> booting up. The content/size of the ram block depends also on previous
> guest action AFAIK. There is no way from stopping the guest from doing
> that. It's required for the guest to continue booting (with ACPI).
> 
> I'm currently working on a project which reuses resizable ram blocks in
> different context. There, I can simply defer/avoid resizing when
> migration is active. In the ACPI case, however, we cannot avoid it.
> 
> Hope that answers your question
> 
> > 
> >> +} else {
> >> +/*
> >> + * Postcopy code cannot deal with the size of ram blocks changing 
> >> at
> >> + * random points in time. We're running on the target. Fail hard.
> >> + *
> >> + * TODO: How to handle this in a better way?
> >> + */
> >> +error_report("RAM resized during postcopy.");
> >> +exit(-1);
> > 
> > Idea is good, but we also need to exit destination, not only source, no?
> 
> @Dave, any idea what could be the right thing to do here?

I think that's OK; postcopy_is_running() will return true on the
destination (e.g. see it's use in ram_load()) and should work.

I'd really appreciate if you could print hte RAMBlock or something at
this point - when we hit this error we're going to want to try and
figure out why.

Dave

> > 
> >> +}
> >> +}
> > 
> > 
> > 
> >> +static RAMBlockNotifier ram_mig_ram_notifier = {
> >> +.ram_block_resized = ram_mig_ram_block_resized,
> >> +};
> >> +
> >>  void ram_mig_init(void)
> >>  {
> >>  qemu_mutex_init(&XBZRLE.lock);
> >>  register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
> >> +ram_block_notifier_add(&ram_mig_ram_notifier);
> >>  }
> > 
> > Shouldn't we remove the notifier when we finish the migration.
> 
> It's called from main() unconditionally (so not when migration starts),
> so the notifier remains active the whole QEMU lifetime (which should be
> fine AFAIKT).
> 
> -- 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




  1   2   3   >