date:20240408


On 4/4/24 21:13, Philippe Mathieu-Daudé wrote:

Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
so the bus and device use the same guard. Otherwise the
DMA-reentrancy protection can be bypassed.

Cc: qemu-sta...@nongnu.org
Suggested-by: Alexander Bulekov 
Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/char/virtio-serial-bus.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 016aba6374..cd0e3a11f7 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -985,8 +985,7 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp)
  return;
  }
  
-port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,

-   &dev->mem_reentrancy_guard);
+port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);


Missing:
-- >8 --
-port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);
+port->bh = virtio_bh_new_guarded(VIRTIO_DEVICE(dev),
+ flush_queued_data_bh, port);
---


  port->elem = NULL;
  }

Re: [PATCH 1/1] virtio-net: fix bug 1451 aka "assert(!virtio_net_get_subqueue(nc)->async_tx.elem); "

2024-04-08 Thread Jason Wang

On Fri, Apr 5, 2024 at 7:22 PM Alexey Dobriyan  wrote:
>
> Don't send zero length packets in virtio_net_flush_tx().
>
> Reproducer from https://gitlab.com/qemu-project/qemu/-/issues/1451
> creates small packet (1 segment, len = 10 == n->guest_hdr_len),
> destroys queue.
>
> "if (n->host_hdr_len != n->guest_hdr_len)" is triggered, if body creates
> zero length/zero segment packet, because there is nothing after guest
> header.

And in this case host_hdr_len is 0.

>
> qemu_sendv_packet_async() tries to send it.
>
> slirp discards it because it is smaller than Ethernet header,
> but returns 0.
>
> 0 length is propagated upwards and is interpreted as "packet has been sent"
> which is terrible because queue is being destroyed, nothing has been sent,
> nobody is waiting for TX to complete and the assert is triggered.
>
> Signed-off-by: Alexey Dobriyan 
> ---
>  hw/net/virtio-net.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 58014a92ad..258633f885 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -2765,18 +2765,14 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>  out_sg = elem->out_sg;
>  if (out_num < 1) {
>  virtio_error(vdev, "virtio-net header not in first element");
> -virtqueue_detach_element(q->tx_vq, elem, 0);
> -g_free(elem);
> -return -EINVAL;
> +goto detach;
>  }
>
>  if (n->has_vnet_hdr) {
>  if (iov_to_buf(out_sg, out_num, 0, &vhdr, n->guest_hdr_len) <
>  n->guest_hdr_len) {
>  virtio_error(vdev, "virtio-net header incorrect");
> -virtqueue_detach_element(q->tx_vq, elem, 0);
> -g_free(elem);
> -return -EINVAL;
> +goto detach;
>  }
>  if (n->needs_vnet_hdr_swap) {
>  virtio_net_hdr_swap(vdev, (void *) &vhdr);
> @@ -2807,6 +2803,11 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>   n->guest_hdr_len, -1);
>  out_num = sg_num;
>  out_sg = sg;
> +
> +if (iov_size(out_sg, out_num) == 0) {
> +virtio_error(vdev, "virtio-net nothing to send");
> +goto detach;
> +}

Nit, I think we can do this check before the iov_copy()?

Thanks

>  }
>
>  ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
> @@ -2827,6 +2828,11 @@ drop:
>  }
>  }
>  return num_packets;
> +
> +detach:
> +virtqueue_detach_element(q->tx_vq, elem, 0);
> +g_free(elem);
> +return -EINVAL;
>  }
>
>  static void virtio_net_tx_timer(void *opaque);
> --
> 2.34.1
>

Re: [PATCH] hw/virtio: Add support for VDPA network simulation devices

2024-04-08 Thread Jason Wang

On Mon, Mar 18, 2024 at 8:41 PM Michael S. Tsirkin  wrote:
>
> On Thu, Mar 14, 2024 at 11:24:33AM +0800, Jason Wang wrote:
> > On Thu, Mar 14, 2024 at 3:52 AM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, Mar 13, 2024 at 07:51:08PM +0100, Thomas Weißschuh wrote:
> > > > On 2024-02-21 15:38:02+0800, Hao Chen wrote:
> > > > > This patch adds support for VDPA network simulation devices.
> > > > > The device is developed based on virtio-net and tap backend,
> > > > > and supports hardware live migration function.
> > > > >
> > > > > For more details, please refer to "docs/system/devices/vdpa-net.rst"
> > > > >
> > > > > Signed-off-by: Hao Chen 
> > > > > ---
> > > > >  MAINTAINERS |   5 +
> > > > >  docs/system/device-emulation.rst|   1 +
> > > > >  docs/system/devices/vdpa-net.rst| 121 +
> > > > >  hw/net/virtio-net.c |  16 ++
> > > > >  hw/virtio/virtio-pci.c  | 189 
> > > > > +++-
> >
> > I think those modifications should belong to a separate file as it
> > might conflict with virtio features in the future.
> >
> > > > >  hw/virtio/virtio.c  |  39 
> > > > >  include/hw/virtio/virtio-pci.h  |   5 +
> > > > >  include/hw/virtio/virtio.h  |  19 ++
> > > > >  include/standard-headers/linux/virtio_pci.h |   7 +
> > > > >  9 files changed, 399 insertions(+), 3 deletions(-)
> > > > >  create mode 100644 docs/system/devices/vdpa-net.rst
> > > >
> > > > [..]
> > > >
> > > > > diff --git a/include/standard-headers/linux/virtio_pci.h 
> > > > > b/include/standard-headers/linux/virtio_pci.h
> > > > > index b7fdfd0668..fb5391cef6 100644
> > > > > --- a/include/standard-headers/linux/virtio_pci.h
> > > > > +++ b/include/standard-headers/linux/virtio_pci.h
> > > > > @@ -216,6 +216,13 @@ struct virtio_pci_cfg_cap {
> > > > >  #define VIRTIO_PCI_COMMON_Q_NDATA  56
> > > > >  #define VIRTIO_PCI_COMMON_Q_RESET  58
> > > > >
> > > > > +#define LM_LOGGING_CTRL 0
> > > > > +#define LM_BASE_ADDR_LOW4
> > > > > +#define LM_BASE_ADDR_HIGH   8
> > > > > +#define LM_END_ADDR_LOW 12
> > > > > +#define LM_END_ADDR_HIGH16
> > > > > +#define LM_VRING_STATE_OFFSET   0x20
> > > >
> > > > These changes are not in upstream Linux and will be undone by
> > > > ./scripts/update-linux-headers.sh.
> > > >
> > > > Are they intentionally in this header?
> > >
> > >
> > > Good point. Pls move.
> >
> > Right and this part, it's not a part of standard virtio.
> >
> > Thanks
>
> I'm thinking of reverting this patch unless there's a resolution
> soon, and reapplying later after the release.

I think we need to revert this and re-visit in the next release.

Thanks

>
>
> > >
> > > > > +
> > > > >  #endif /* VIRTIO_PCI_NO_MODERN */
> > > > >
> > > > >  #endif
> > >
>

[PATCH v2] vhost: don't set vring call if guest notifiers is not enabled

2024-04-08 Thread lyx634449800

When conducting performance testing using testpmd in the guest os,
it was observed that the performance was lower compared to the
scenario of direct vfio-pci usage.

In the commit 96a3d98d2cdbd897ff5ab33427aa4cfb94077665, the author
provided a good solution. However, because the guest OS's
driver(e.g., virtio-net pmd) may not enable the msix capability, the
function k->query_guest_notifiers(qbus->parent) may return false,
resulting in the expected effect not being achieved. To address this
issue, modify the conditional statement.

Signed-off-by: Yuxue Liu 
---
V2: Update commit description and title

 hw/virtio/vhost.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index f50180e60e..b972c84e67 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1266,13 +1266,15 @@ int vhost_virtqueue_start(struct vhost_dev *dev,
 vhost_virtqueue_mask(dev, vdev, idx, false);
 }
 
-if (k->query_guest_notifiers &&
-k->query_guest_notifiers(qbus->parent) &&
-virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
-file.fd = -1;
-r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
-if (r) {
-goto fail_vector;
+if (k->query_guest_notifiers) {
+if (!k->query_guest_notifiers(qbus->parent) ||
+(k->query_guest_notifiers(qbus->parent) &&
+virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR)) {
+file.fd = -1;
+r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
+if (r) {
+goto fail_vector;
+}
 }
 }
 
-- 
2.43.0

Re: [PATCH-for-9.0 0/4] hw/virtio: Protect from more DMA re-entrancy bugs

2024-04-08 Thread Mauro Matteo Cascella

Hi,

On Thu, Apr 4, 2024 at 9:13 PM Philippe Mathieu-Daudé  wrote:
>
> Gerd suggested to use the transport guard to protect the
> device from DMA re-entrancy abuses.

This was assigned CVE-2024-3446.

> Philippe Mathieu-Daudé (4):
>   hw/virtio: Introduce virtio_bh_new_guarded() helper
>   hw/display/virtio-gpu: Protect from DMA re-entrancy bugs
>   hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs
>   hw/virtio/virtio-crypto: Protect from DMA re-entrancy bugs
>
>  include/hw/virtio/virtio.h  |  7 +++
>  hw/char/virtio-serial-bus.c |  3 +--
>  hw/display/virtio-gpu.c |  6 ++
>  hw/virtio/virtio-crypto.c   |  4 ++--
>  hw/virtio/virtio.c  | 10 ++
>  5 files changed, 22 insertions(+), 8 deletions(-)
>
> --
> 2.41.0
>

Thanks,
--
Mauro Matteo Cascella
Red Hat Product Security
PGP-Key ID: BB3410B0

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Yuri Benditovich

On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  wrote:
>
> On 2024/04/08 7:09, Yuri Benditovich wrote:
> > On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  
> > wrote:
> >>
> >> The peer buffer is qualified with const and not meant to be modified.
> >
> > IMHO, this buffer is not so 'const' (although the prototype states so),
> > it is allocated in net.c
> > btw, another procedure in this file also modifies the buffer
> > (work_around_broken_dhclient)
>
> Right but it has a FIXME comment.
>
> >
> >> It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
> >> virtio-net header support.
> >
> > Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
> > for peers without
> > virtio-net header support? Where?
>
> No, but I meant that this patch fixes such a problem.

No, it does not. Such a problem does not exist in the master, the
hash_report feature
is silently dropped in such case:
https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816

>
> Regards,
> Akihiko Odaki

Re: [PATCH for-9.1 v3 08/11] contrib/vhost-user-blk: enable it on any POSIX system


On Thu, Apr 04, 2024 at 04:00:38PM +0200, Philippe Mathieu-Daudé wrote:

Hi Stefano,


Hi Phil!



On 4/4/24 14:23, Stefano Garzarella wrote:

Let's make the code more portable by using the "qemu/bswap.h" API
and adding defines from block/file-posix.c to support O_DIRECT in
other systems (e.g. macOS).

vhost-user-server.c is a dependency, let's enable it for any POSIX
system.

Signed-off-by: Stefano Garzarella 
---
 meson.build |  2 --
 contrib/vhost-user-blk/vhost-user-blk.c | 19 +--
 util/meson.build|  4 +++-
 3 files changed, 20 insertions(+), 5 deletions(-)




diff --git a/contrib/vhost-user-blk/vhost-user-blk.c 
b/contrib/vhost-user-blk/vhost-user-blk.c
index a8ab9269a2..462e584857 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -16,6 +16,7 @@
  */
 #include "qemu/osdep.h"
+#include "qemu/bswap.h"
 #include "standard-headers/linux/virtio_blk.h"
 #include "libvhost-user-glib.h"




@@ -267,13 +282,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk,
 req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
 in_num--;
-type = le32toh(req->out->type);
+type = le32_to_cpu(req->out->type);
 switch (type & ~VIRTIO_BLK_T_BARRIER) {
 case VIRTIO_BLK_T_IN:
 case VIRTIO_BLK_T_OUT: {
 ssize_t ret = 0;
 bool is_write = type & VIRTIO_BLK_T_OUT;
-req->sector_num = le64toh(req->out->sector);
+req->sector_num = le64_to_cpu(req->out->sector);
 if (is_write) {
 ret  = vub_writev(req, &elem->out_sg[1], out_num);
 } else {

Can we switch to the bswap API in a preliminary patch,


Sure, I tried to minimize the patches because it's already big,
but I can split this.


converting all the source files?



What do you mean with "all the source files"?

"le64toh" is used here and in some subprojects (e.g. libvduse,
libvhost-user), where IIUC we can't use QEMU's bswap.h because we
don't want to put a dependency with the QEMU code.

BTW I'll check for other *toh() usage in QEMU code and change in the
preliminary patch you suggested to add.

Thanks for the review,
Stefano

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Akihiko Odaki


On 2024/04/08 16:40, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  wrote:


On 2024/04/08 7:09, Yuri Benditovich wrote:

On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  wrote:


The peer buffer is qualified with const and not meant to be modified.


IMHO, this buffer is not so 'const' (although the prototype states so),
it is allocated in net.c
btw, another procedure in this file also modifies the buffer
(work_around_broken_dhclient)


Right but it has a FIXME comment.




It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
virtio-net header support.


Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
for peers without
virtio-net header support? Where?


No, but I meant that this patch fixes such a problem.


No, it does not. Such a problem does not exist in the master, the
hash_report feature
is silently dropped in such case:
https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816


Well, silently dropping VIRTIO_NET_F_HASH_REPORT is not different from 
preventing enabling VIRTIO_NET_F_HASH_REPORT, is it?


Regards,
Akihiko Odaki

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Yuri Benditovich

On Mon, Apr 8, 2024 at 10:42 AM Akihiko Odaki  wrote:
>
> On 2024/04/08 16:40, Yuri Benditovich wrote:
> > On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  
> > wrote:
> >>
> >> On 2024/04/08 7:09, Yuri Benditovich wrote:
> >>> On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  
> >>> wrote:
> >>>>
> >>>> The peer buffer is qualified with const and not meant to be modified.
> >>>
> >>> IMHO, this buffer is not so 'const' (although the prototype states so),
> >>> it is allocated in net.c
> >>> btw, another procedure in this file also modifies the buffer
> >>> (work_around_broken_dhclient)
> >>
> >> Right but it has a FIXME comment.
> >>
> >>>
> >>>> It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
> >>>> virtio-net header support.
> >>>
> >>> Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
> >>> for peers without
> >>> virtio-net header support? Where?
> >>
> >> No, but I meant that this patch fixes such a problem.
> >
> > No, it does not. Such a problem does not exist in the master, the
> > hash_report feature
> > is silently dropped in such case:
> > https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816
>
> Well, silently dropping VIRTIO_NET_F_HASH_REPORT is not different from
> preventing enabling VIRTIO_NET_F_HASH_REPORT, is it?
>
But how is your patch involved in it? Should this line be removed from
the commit message?


> Regards,
> Akihiko Odaki

Re: [PATCH-for-9.0] hw/sd/sdhci: Discard excess of data written to Buffer Data Port register

2024-04-08 Thread Mauro Matteo Cascella

On Thu, Apr 4, 2024 at 10:55 AM Philippe Mathieu-Daudé
 wrote:
>
> Per "SD Host Controller Standard Specification Version 3.00":
>
>   * 1.7 Buffer Control
>
>   - 1.7.1 Control of Buffer Pointer
>
> (3) Buffer Control with Block Size
>
> In case of write operation, the buffer accumulates the data
> written through the Buffer Data Port register. When the buffer
> pointer reaches the block size, Buffer Write Enable in the
> Present State register changes 1 to 0. It means no more data
> can be written to the buffer. Excess data of the last write is
> ignored. For example, if just lower 2 bytes data can be written
> to the buffer and a 32-bit (4-byte) block of data is written to
> the Buffer Data Port register, the lower 2 bytes of data is
> written to the buffer and the upper 2 bytes is ignored.
>
> Discard the excess of data to avoid overflow reported by fuzzer:
>
>   $ cat << EOF | qemu-system-i386 \
>  -display none -nodefaults \
>  -machine accel=qtest -m 512M \
>  -device sdhci-pci,sd-spec-version=3 \
>  -device sd-card,drive=mydrive \
>  -drive if=none,index=0,file=null-co://,format=raw,id=mydrive -nographic \
>  -qtest stdio
>   outl 0xcf8 0x80001013
>   outl 0xcfc 0x91
>   outl 0xcf8 0x80001001
>   outl 0xcfc 0x0600
>   write 0x912c 0x1 0x05
>   write 0x9158 0x1 0x16
>   write 0x9105 0x1 0x04
>   write 0x9128 0x1 0x08
>   write 0x16 0x1 0x21
>   write 0x19 0x1 0x20
>   write 0x910c 0x1 0x01
>   write 0x910e 0x1 0x20
>   write 0x910f 0x1 0x00
>   write 0x910c 0x1 0x00
>   write 0x9120 0x1 0x00
>   EOF
>
> Stack trace (part):
> =
> ==89993==ERROR: AddressSanitizer: heap-buffer-overflow on address
> 0x61529900 at pc 0x55d5f885700d bp 0x7ffc1e1e9470 sp 0x7ffc1e1e9468
> WRITE of size 1 at 0x61529900 thread T0
> #0 0x55d5f885700c in sdhci_write_dataport hw/sd/sdhci.c:564:39
> #1 0x55d5f8849150 in sdhci_write hw/sd/sdhci.c:1223:13
> #2 0x55d5fa01db63 in memory_region_write_accessor system/memory.c:497:5
> #3 0x55d5fa01d245 in access_with_adjusted_size system/memory.c:573:18
> #4 0x55d5fa01b1a9 in memory_region_dispatch_write system/memory.c:1521:16
> #5 0x55d5fa09f5c9 in flatview_write_continue system/physmem.c:2711:23
> #6 0x55d5fa08f78b in flatview_write system/physmem.c:2753:12
> #7 0x55d5fa08f258 in address_space_write system/physmem.c:2860:18
> ...
> 0x61529900 is located 0 bytes to the right of 512-byte region
> [0x61529700,0x61529900) allocated by thread T0 here:
> #0 0x55d5f7237b27 in __interceptor_calloc
> #1 0x7f9e36dd4c50 in g_malloc0
> #2 0x55d5f88672f7 in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5
> #3 0x55d5f844b582 in pci_qdev_realize hw/pci/pci.c:2092:9
> #4 0x55d5fa2ee74b in device_set_realized hw/core/qdev.c:510:13
> #5 0x55d5fa325bfb in property_set_bool qom/object.c:2358:5
> #6 0x55d5fa31ea45 in object_property_set qom/object.c:1472:5
> #7 0x55d5fa332509 in object_property_set_qobject qom/qom-qobject.c:28:10
> #8 0x55d5fa31f6ed in object_property_set_bool qom/object.c:1541:15
> #9 0x55d5fa2e2948 in qdev_realize hw/core/qdev.c:292:12
> #10 0x55d5f8eed3f1 in qdev_device_add_from_qdict 
> system/qdev-monitor.c:719:10
> #11 0x55d5f8eef7ff in qdev_device_add system/qdev-monitor.c:738:11
> #12 0x55d5f8f211f0 in device_init_func system/vl.c:1200:11
> #13 0x55d5fad0877d in qemu_opts_foreach util/qemu-option.c:1135:14
> #14 0x55d5f8f0df9c in qemu_create_cli_devices system/vl.c:2638:5
> #15 0x55d5f8f0db24 in qmp_x_exit_preconfig system/vl.c:2706:5
> #16 0x55d5f8f14dc0 in qemu_init system/vl.c:3737:9
> ...
> SUMMARY: AddressSanitizer: heap-buffer-overflow hw/sd/sdhci.c:564:39
> in sdhci_write_dataport
>
> Cc: qemu-sta...@nongnu.org
> Fixes: d7dfca0807 ("hw/sdhci: introduce standard SD host controller")
> Buglink: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58813
> Reported-by: Alexander Bulekov 
> Reported-by: Chuhong Yuan 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/sd/sdhci.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
> index c5e0bc018b..2dd88fa139 100644
> --- a/hw/sd/sdhci.c
> +++ b/hw/sd/sdhci.c
> @@ -552,7 +552,7 @@ static void sdhci_write_block_to_card(SDHCIState *s)
>   * register */
>  static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned size)
>  {
> -unsigned i;
> +unsigned i, available;
>
>  /* Check that there is free space left in a buffer */
>  if (!(s->prnsts & SDHC_SPACE_AVAILABLE)) {
> @@ -560,6 +560,14 @@ static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned size)
>  return;
>  }
>
> +available = s->buf_maxsz - s->data_count;
> +

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Akihiko Odaki


On 2024/04/08 16:54, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 10:42 AM Akihiko Odaki  wrote:


On 2024/04/08 16:40, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  wrote:


On 2024/04/08 7:09, Yuri Benditovich wrote:

On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  wrote:


The peer buffer is qualified with const and not meant to be modified.


IMHO, this buffer is not so 'const' (although the prototype states so),
it is allocated in net.c
btw, another procedure in this file also modifies the buffer
(work_around_broken_dhclient)


Right but it has a FIXME comment.




It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
virtio-net header support.


Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
for peers without
virtio-net header support? Where?


No, but I meant that this patch fixes such a problem.


No, it does not. Such a problem does not exist in the master, the
hash_report feature
is silently dropped in such case:
https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816


Well, silently dropping VIRTIO_NET_F_HASH_REPORT is not different from
preventing enabling VIRTIO_NET_F_HASH_REPORT, is it?


But how is your patch involved in it? Should this line be removed from
the commit message?


In the master, VIRTIO_NET_F_HASH_REPORT is silently dropped, but this
patch changes it to work without dropping the feature, which is worth mentioning.


Regards,
Akihiko Odaki

Re: [PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()


On Thu, Apr 04, 2024 at 04:09:34PM +0200, David Hildenbrand wrote:

On 04.04.24 14:23, Stefano Garzarella wrote:

shm_open() creates and opens a new POSIX shared memory object.
A POSIX shared memory object allows creating memory backend with an
associated file descriptor that can be shared with external processes
(e.g. vhost-user).

The new `memory-backend-shm` can be used as an alternative when
`memory-backend-memfd` is not available (Linux only), since shm_open()
should be provided by any POSIX-compliant operating system.

This backend mimics memfd, allocating memory that is practically
anonymous. In theory shm_open() requires a name, but this is allocated
for a short time interval and shm_unlink() is called right after
shm_open(). After that, only fd is shared with external processes
(e.g., vhost-user) as if it were associated with anonymous memory.

In the future we may also allow the user to specify the name to be
passed to shm_open(), but for now we keep the backend simple, mimicking
anonymous memory such as memfd.

Signed-off-by: Stefano Garzarella 
---
v3
- enriched commit message and documentation to highlight that we
  want to mimic memfd (David)
---
 docs/system/devices/vhost-user.rst |   5 +-
 qapi/qom.json  |  17 +
 backends/hostmem-shm.c | 118 +
 backends/meson.build   |   1 +
 qemu-options.hx|  11 +++
 5 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 backends/hostmem-shm.c

diff --git a/docs/system/devices/vhost-user.rst 
b/docs/system/devices/vhost-user.rst
index 9b2da106ce..35259d8ec7 100644
--- a/docs/system/devices/vhost-user.rst
+++ b/docs/system/devices/vhost-user.rst
@@ -98,8 +98,9 @@ Shared memory object
 In order for the daemon to access the VirtIO queues to process the
 requests it needs access to the guest's address space. This is
-achieved via the ``memory-backend-file`` or ``memory-backend-memfd``
-objects. A reference to a file-descriptor which can access this object
+achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or
+``memory-backend-shm`` objects.
+A reference to a file-descriptor which can access this object
 will be passed via the socket as part of the protocol negotiation.
 Currently the shared memory object needs to match the size of the main
diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..5252ec69e3 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -721,6 +721,19 @@
 '*hugetlbsize': 'size',
 '*seal': 'bool' } }
+##
+# @MemoryBackendShmProperties:
+#
+# Properties for memory-backend-shm objects.
+#
+# The @share boolean option is true by default with shm.
+#
+# Since: 9.1
+##
+{ 'struct': 'MemoryBackendShmProperties',
+  'base': 'MemoryBackendProperties',
+  'data': { } }
+


Acked-by: David Hildenbrand 

One comment: we should maybe just forbid setting share=off. It doesn't
make any sense and it can even result in an unexpected double memory
consumption. We missed doing that for memfd, unfortunately.


Good point!

IIUC the `share` property is defined by the parent `hostmem`, so I 
should find a way to override the property here and disable the setter, 
or add an option to `hostmem` to make the property non-writable.


Thanks,
Stefano

Re: [PATCH for-9.1 v3 00/11] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)


FYI I'll be on PTO till May 2nd, I'll send the v4 when I'm back ASAP.

Thanks,
Stefano

On Thu, Apr 04, 2024 at 02:23:19PM +0200, Stefano Garzarella wrote:

v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/
v2: https://patchew.org/QEMU/20240326133936.125332-1-sgarz...@redhat.com/
v3:
 - rebased on v9.0.0-rc2
 - patch 4: avoiding setting fd non-blocking for messages where we
   have memory fd (Eric)
 - patch 9: enriched commit message and documentation to highlight that we
   want to mimic memfd (David)

The vhost-user protocol is not really Linux-specific, so let's try support
QEMU's frontends and backends (including libvhost-user) in any POSIX system
with this series. The main use case is to be able to use virtio devices that
we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even
in non-Linux systems.

The first 5 patches are more like fixes discovered at runtime on macOS or
FreeBSD that could go even independently of this series.

Patches 6, 7, and 8 enable building of frontends and backends (including
libvhost-user) with associated code changes to succeed in compilation.

Patch 9 adds `memory-backend-shm` that uses the POSIX shm_open() API to
create shared memory which is identified by an fd that can be shared with
vhost-user backends. This is useful on those systems (like macOS) where
we don't have memfd_create() or special filesystems like "/dev/shm".

Patches 10 and 11 use `memory-backend-shm` in some vhost-user tests.

Maybe the first 5 patches can go separately, but I only discovered those
problems after testing patches 6 - 9, so I have included them in this series
for now. Please let me know if you prefer that I send them separately.

I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4
(aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 39 (x86_64)
in this way:

- Start vhost-user-blk or QSD (same commands for all systems)

 vhost-user-blk -s /tmp/vhost.socket \
   -b Fedora-Cloud-Base-39-1.5.x86_64.raw

 qemu-storage-daemon \
   --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \
   --blockdev qcow2,file=file,node-name=qcow2 \
   --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on

- macOS (aarch64): start QEMU (using hvf accelerator)

 qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \
   -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \
   -device virtio-net-device,netdev=net0 -netdev user,id=net0 \
   -device ramfb -device usb-ehci -device usb-kbd \
   -object memory-backend-shm,id=mem,size=512M \
   -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \
   -chardev socket,id=char0,path=/tmp/vhost.socket

- FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available)

 qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \
   -object memory-backend-shm,id=mem,size="512M" \
   -device vhost-user-blk-pci,num-queues=1,chardev=char0 \
   -chardev socket,id=char0,path=/tmp/vhost.socket

- Fedora (x86_64): start QEMU (using kvm accelerator)

 qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \
   -object memory-backend-shm,size="512M" \
   -device vhost-user-blk-pci,num-queues=1,chardev=char0 \
   -chardev socket,id=char0,path=/tmp/vhost.socket

Branch pushed (and CI started) at 
https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads

Thanks,
Stefano

Stefano Garzarella (11):
 libvhost-user: set msg.msg_control to NULL when it is empty
 libvhost-user: fail vu_message_write() if sendmsg() is failing
 libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
 vhost-user-server: do not set memory fd non-blocking
 contrib/vhost-user-blk: fix bind() using the right size of the address
 vhost-user: enable frontends on any POSIX system
 libvhost-user: enable it on any POSIX system
 contrib/vhost-user-blk: enable it on any POSIX system
 hostmem: add a new memory backend based on POSIX shm_open()
 tests/qtest/vhost-user-blk-test: use memory-backend-shm
 tests/qtest/vhost-user-test: add a test case for memory-backend-shm

docs/system/devices/vhost-user.rst|   5 +-
meson.build   |   5 +-
qapi/qom.json |  17 
subprojects/libvhost-user/libvhost-user.h |   2 +-
backends/hostmem-shm.c| 118 ++
contrib/vhost-user-blk/vhost-user-blk.c   |  23 -
hw/net/vhost_net.c|   5 +
subprojects/libvhost-user/libvhost-user.c |  76 +-
tests/qtest/vhost-user-blk-test.c |   2 +-
tests/qtest/vhost-user-test.c |  23 +
util/vhost-user-server.c  |  12 +++
backends/meson.build  |   1 +
hw/block/Kconfig  |   2 +-
qemu-options.hx   |  11 ++
util/meson.build  |   4 +-
15 files chang

Re: [PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()

2024-04-08 Thread David Hildenbrand


On 08.04.24 09:58, Stefano Garzarella wrote:

On Thu, Apr 04, 2024 at 04:09:34PM +0200, David Hildenbrand wrote:

On 04.04.24 14:23, Stefano Garzarella wrote:

shm_open() creates and opens a new POSIX shared memory object.
A POSIX shared memory object allows creating memory backend with an
associated file descriptor that can be shared with external processes
(e.g. vhost-user).

The new `memory-backend-shm` can be used as an alternative when
`memory-backend-memfd` is not available (Linux only), since shm_open()
should be provided by any POSIX-compliant operating system.

This backend mimics memfd, allocating memory that is practically
anonymous. In theory shm_open() requires a name, but this is allocated
for a short time interval and shm_unlink() is called right after
shm_open(). After that, only fd is shared with external processes
(e.g., vhost-user) as if it were associated with anonymous memory.

In the future we may also allow the user to specify the name to be
passed to shm_open(), but for now we keep the backend simple, mimicking
anonymous memory such as memfd.

Signed-off-by: Stefano Garzarella 
---
v3
- enriched commit message and documentation to highlight that we
   want to mimic memfd (David)
---
  docs/system/devices/vhost-user.rst |   5 +-
  qapi/qom.json  |  17 +
  backends/hostmem-shm.c | 118 +
  backends/meson.build   |   1 +
  qemu-options.hx|  11 +++
  5 files changed, 150 insertions(+), 2 deletions(-)
  create mode 100644 backends/hostmem-shm.c

diff --git a/docs/system/devices/vhost-user.rst 
b/docs/system/devices/vhost-user.rst
index 9b2da106ce..35259d8ec7 100644
--- a/docs/system/devices/vhost-user.rst
+++ b/docs/system/devices/vhost-user.rst
@@ -98,8 +98,9 @@ Shared memory object
  In order for the daemon to access the VirtIO queues to process the
  requests it needs access to the guest's address space. This is
-achieved via the ``memory-backend-file`` or ``memory-backend-memfd``
-objects. A reference to a file-descriptor which can access this object
+achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or
+``memory-backend-shm`` objects.
+A reference to a file-descriptor which can access this object
  will be passed via the socket as part of the protocol negotiation.
  Currently the shared memory object needs to match the size of the main
diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..5252ec69e3 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -721,6 +721,19 @@
  '*hugetlbsize': 'size',
  '*seal': 'bool' } }
+##
+# @MemoryBackendShmProperties:
+#
+# Properties for memory-backend-shm objects.
+#
+# The @share boolean option is true by default with shm.
+#
+# Since: 9.1
+##
+{ 'struct': 'MemoryBackendShmProperties',
+  'base': 'MemoryBackendProperties',
+  'data': { } }
+


Acked-by: David Hildenbrand 

One comment: we should maybe just forbid setting share=off. It doesn't
make any sense and it can even result in an unexpected double memory
consumption. We missed doing that for memfd, unfortunately.


Good point!

IIUC the `share` property is defined by the parent `hostmem`, so I
should find a way to override the property here and disable the setter,
or add an option to `hostmem` to make the property non-writable.


Right, or simply fail later when you would find "share=off" in 
shm_backend_memory_alloc().


Whenever named shm_open() is supported, it could make sense for VM 
snapshotting. Right now it doesn't really make any sense.


--
Cheers,

David / dhildenb

Re: [PATCH] Makefile: preserve --jobserver-auth argument when calling ninja

Queued, thanks.

Paolo

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Yuri Benditovich

On Mon, Apr 8, 2024 at 10:57 AM Akihiko Odaki  wrote:
>
> On 2024/04/08 16:54, Yuri Benditovich wrote:
> > On Mon, Apr 8, 2024 at 10:42 AM Akihiko Odaki  
> > wrote:
> >>
> >> On 2024/04/08 16:40, Yuri Benditovich wrote:
> >>> On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  
> >>> wrote:
> 
>  On 2024/04/08 7:09, Yuri Benditovich wrote:
> > On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  
> > wrote:
> >>
> >> The peer buffer is qualified with const and not meant to be modified.
> >
> > IMHO, this buffer is not so 'const' (although the prototype states so),
> > it is allocated in net.c
> > btw, another procedure in this file also modifies the buffer
> > (work_around_broken_dhclient)
> 
>  Right but it has a FIXME comment.
> 
> >
> >> It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
> >> virtio-net header support.
> >
> > Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
> > for peers without
> > virtio-net header support? Where?
> 
>  No, but I meant that this patch fixes such a problem.
> >>>
> >>> No, it does not. Such a problem does not exist in the master, the
> >>> hash_report feature
> >>> is silently dropped in such case:
> >>> https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816
> >>
> >> Well, silently dropping VIRTIO_NET_F_HASH_REPORT is not different from
> >> preventing enabling VIRTIO_NET_F_HASH_REPORT, is it?
> >>
> > But how is your patch involved in it? Should this line be removed from
> > the commit message?
>
> In the master, VIRTIO_NET_F_HASH_REPORT is silently dropped, but this
> patch will change to work without dropping it, which is worth to mention.
After applying this series of patches the VIRTIO_NET_F_HASH_REPORT is
dropped _the same way_ as in the master
>
> Regards,
> Akihiko Odaki

Re: [PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-08 Thread Akihiko Odaki


On 2024/04/08 17:06, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 10:57 AM Akihiko Odaki  wrote:


On 2024/04/08 16:54, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 10:42 AM Akihiko Odaki  wrote:


On 2024/04/08 16:40, Yuri Benditovich wrote:

On Mon, Apr 8, 2024 at 4:30 AM Akihiko Odaki  wrote:


On 2024/04/08 7:09, Yuri Benditovich wrote:

On Wed, Apr 3, 2024 at 2:12 PM Akihiko Odaki  wrote:


The peer buffer is qualified with const and not meant to be modified.


IMHO, this buffer is not so 'const' (although the prototype states so),
it is allocated in net.c
btw, another procedure in this file also modifies the buffer
(work_around_broken_dhclient)


Right but it has a FIXME comment.




It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
virtio-net header support.


Does it mean _this commit_ prevents enabling VIRTIO_NET_F_HASH_REPORT
for peers without
virtio-net header support? Where?


No, but I meant that this patch fixes such a problem.


No, it does not. Such a problem does not exist in the master, the
hash_report feature
is silently dropped in such case:
https://github.com/qemu/qemu/blob/master/hw/net/virtio-net.c#L816


Well, silently dropping VIRTIO_NET_F_HASH_REPORT is not different from
preventing enabling VIRTIO_NET_F_HASH_REPORT, is it?


But how is your patch involved in it? Should this line be removed from
the commit message?


In the master, VIRTIO_NET_F_HASH_REPORT is silently dropped, but this
patch will change to work without dropping it, which is worth to mention.

After applying this series of patches the VIRTIO_NET_F_HASH_REPORT is
dropped _the same way_ as in the master


You are right. I forgot that I dropped patch "virtio-net: Do not clear 
VIRTIO_NET_F_HASH_REPORT" with v7. I'll drop the line in the next 
version accordingly. Thanks for pointing that out.


Regards,
Akihiko Odaki

Re: [PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()


On Mon, Apr 08, 2024 at 10:03:15AM +0200, David Hildenbrand wrote:

On 08.04.24 09:58, Stefano Garzarella wrote:

On Thu, Apr 04, 2024 at 04:09:34PM +0200, David Hildenbrand wrote:

On 04.04.24 14:23, Stefano Garzarella wrote:

shm_open() creates and opens a new POSIX shared memory object.
A POSIX shared memory object allows creating memory backend with an
associated file descriptor that can be shared with external processes
(e.g. vhost-user).

The new `memory-backend-shm` can be used as an alternative when
`memory-backend-memfd` is not available (Linux only), since shm_open()
should be provided by any POSIX-compliant operating system.

This backend mimics memfd, allocating memory that is practically
anonymous. In theory shm_open() requires a name, but this is allocated
for a short time interval and shm_unlink() is called right after
shm_open(). After that, only fd is shared with external processes
(e.g., vhost-user) as if it were associated with anonymous memory.

In the future we may also allow the user to specify the name to be
passed to shm_open(), but for now we keep the backend simple, mimicking
anonymous memory such as memfd.

Signed-off-by: Stefano Garzarella 
---
v3
- enriched commit message and documentation to highlight that we
  want to mimic memfd (David)
---
 docs/system/devices/vhost-user.rst |   5 +-
 qapi/qom.json  |  17 +
 backends/hostmem-shm.c | 118 +
 backends/meson.build   |   1 +
 qemu-options.hx|  11 +++
 5 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 backends/hostmem-shm.c

diff --git a/docs/system/devices/vhost-user.rst 
b/docs/system/devices/vhost-user.rst
index 9b2da106ce..35259d8ec7 100644
--- a/docs/system/devices/vhost-user.rst
+++ b/docs/system/devices/vhost-user.rst
@@ -98,8 +98,9 @@ Shared memory object
 In order for the daemon to access the VirtIO queues to process the
 requests it needs access to the guest's address space. This is
-achieved via the ``memory-backend-file`` or ``memory-backend-memfd``
-objects. A reference to a file-descriptor which can access this object
+achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or
+``memory-backend-shm`` objects.
+A reference to a file-descriptor which can access this object
 will be passed via the socket as part of the protocol negotiation.
 Currently the shared memory object needs to match the size of the main
diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..5252ec69e3 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -721,6 +721,19 @@
 '*hugetlbsize': 'size',
 '*seal': 'bool' } }
+##
+# @MemoryBackendShmProperties:
+#
+# Properties for memory-backend-shm objects.
+#
+# The @share boolean option is true by default with shm.
+#
+# Since: 9.1
+##
+{ 'struct': 'MemoryBackendShmProperties',
+  'base': 'MemoryBackendProperties',
+  'data': { } }
+


Acked-by: David Hildenbrand 

One comment: we should maybe just forbid setting share=off. It doesn't
make any sense and it can even result in an unexpected double memory
consumption. We missed doing that for memfd, unfortunately.


Good point!

IIUC the `share` property is defined by the parent `hostmem`, so I
should find a way to override the property here and disable the setter,
or add an option to `hostmem` to make the property non-writable.


Right, or simply fail later when you would find "share=off" in 
shm_backend_memory_alloc().


This seems like the simplest and cleanest approach, I'll go in this 
direction!




Whenever named shm_open() is supported, it could make sense for VM 
snapshotting. Right now it doesn't really make any sense.


Yeah, I see.

Thanks,
Stefano

[PATCH v2 00/10] Add a host IOMMU device abstraction

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
is further split into a host IOMMU device abstraction part and a vIOMMU
check part. This series implements the first part.

This split also makes it easier for the dirty tracking series [2] and the
virtio-iommu series [3] to depend on the first part.

The major change in this version is to use QOM, the class tree is as below:

                        HostIOMMUDevice
                               | .get_host_iommu_info()
                               |
                               |
            .------------------------------------.
            |                  |                 |
      HIODLegacyVFIO   [HIODLegacyVDPA]     HIODIOMMUFD
            | .vdev            | [.vdev]         | .iommufd
                                                 | .devid
                                                 | [.ioas_id]
                                                 | [.attach_hwpt()]
                                                 | [.detach_hwpt()]
                                                 |
                                     .----------------------.
                                     |                      |
                              HIODIOMMUFDVFIO       [HIODIOMMUFDVDPA]
                                     | .vdev                | [.vdev]

* The classes in [] will be implemented in future.
* .ioas_id, .attach/detach_hwpt() will be implemented in nesting series.
* .vdev in different class points to different agent device,
* i.e., for VFIO it points to VFIODevice.

PATCH1-4: Introduce HostIOMMUDevice and its sub classes
PATCH5-7: Implement get_host_iommu_info() callback
PATCH8-10: Create HostIOMMUDevice instance and pass to vIOMMU

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part1_v2

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] 
https://lore.kernel.org/qemu-devel/20240212135643.5858-1-joao.m.mart...@oracle.com/
[3] 
https://lore.kernel.org/qemu-devel/20240117080414.316890-1-eric.au...@redhat.com/

Thanks
Zhenzhong

Changelog:
v2:
- use QOM to abstract host IOMMU device and its sub-classes (Cédric)
- move host IOMMU device creation in attach_device() (Cédric)
- refine pci_device_set/unset_iommu_device doc further (Eric)
- define host IOMMU info format of different backend
- implement get_host_iommu_info() for different backend (Cédric)

v1:
- use HostIOMMUDevice handle instead of union in VFIODevice (Eric)
- change host_iommu_device_init to host_iommu_device_create
- allocate HostIOMMUDevice in host_iommu_device_create callback
  and set the VFIODevice base_hdev handle (Eric)
- refine pci_device_set/unset_iommu_device doc (Eric)
- use HostIOMMUDevice handle instead of union in VTDHostIOMMUDevice (Eric)

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (1):
  hw/pci: Introduce pci_device_set/unset_iommu_device()

Zhenzhong Duan (9):
  backends: Introduce abstract HostIOMMUDevice
  vfio: Introduce HIODLegacyVFIO device
  backends/iommufd: Introduce abstract HIODIOMMUFD device
  vfio/iommufd: Introduce HIODIOMMUFDVFIO device
  vfio: Implement get_host_iommu_info() callback
  backends/iommufd: Introduce helper function
iommufd_backend_get_device_info()
  backends/iommufd: Implement get_host_iommu_info() callback
  vfio: Create host IOMMU device instance
  vfio: Pass HostIOMMUDevice to vIOMMU

 MAINTAINERS|  2 +
 include/hw/pci/pci.h   | 40 +-
 include/hw/vfio/vfio-common.h  | 23 
 include/sysemu/host_iommu_device.h | 29 ++
 include/sysemu/iommufd.h   | 33 
 backends/host_iommu_device.c   | 19 +++
 backends/iommufd.c | 85 --
 hw/pci/pci.c   | 75 --
 hw/vfio/container.c| 40 +-
 hw/vfio/iommufd.c  | 19 ++-
 hw/vfio/pci.c  | 20 +--
 backends/Kconfig   |  5 ++
 backends/meson.build   |  1 +
 13 files changed, 364 insertions(+), 27 deletions(-)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

-- 
2.34.1

[PATCH v2 03/10] backends/iommufd: Introduce abstract HIODIOMMUFD device

HIODIOMMUFD represents a host IOMMU device under iommufd backend.

Currently it includes only the public iommufd handle and device id,
which could be used to get hw IOMMU information.

When nested translation is supported in the future, vIOMMU is going
to have iommufd related operations like attaching/detaching hwpt,
so the IOMMUFDDevice interface will be further extended at that time.

VFIO and VDPA device have different way of attaching/detaching hwpt.
So HIODIOMMUFD is still an abstract class which will be inherited by
VFIO and VDPA device.

Introduce a helper hiod_iommufd_init() to initialize HIODIOMMUFD
device.

Suggested-by: Cédric Le Goater 
Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h | 22 +++
 backends/iommufd.c   | 47 ++--
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..71c53cbb45 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
 OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,25 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+
+#define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
+OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
+
+struct HIODIOMMUFD {
+/*< private >*/
+HostIOMMUDevice parent;
+void *opaque;
+
+/*< public >*/
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+};
+
+struct HIODIOMMUFDClass {
+/*< private >*/
+HostIOMMUDeviceClass parent_class;
+};
+
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid);
 #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 62a79fa6b0..ef8b3a808b 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -212,23 +212,38 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
 return ret;
 }
 
-static const TypeInfo iommufd_backend_info = {
-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
-}
-};
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid)
+{
+idev->iommufd = iommufd;
+idev->devid = devid;
+}
 
-static void register_types(void)
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
 {
-type_register_static(&iommufd_backend_info);
 }
 
-type_init(register_types);
+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HIOD_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODIOMMUFD),
+.class_size = sizeof(HIODIOMMUFDClass),
+.class_init = hiod_iommufd_class_init,
+.abstract = true,
+}
+};
+
+DEFINE_TYPES(types)
-- 
2.34.1

[PATCH v2 04/10] vfio/iommufd: Introduce HIODIOMMUFDVFIO device

HIODIOMMUFDVFIO represents a host IOMMU device under VFIO iommufd
backend. It will be created during VFIO device attaching and passed
to vIOMMU.

It includes a link to VFIODevice so that we can do VFIO device
specific hwpt attaching/detaching.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 11 +++
 hw/vfio/iommufd.c | 11 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f30772f534..d382b12ec1 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
 #include "sysemu/host_iommu_device.h"
+#include "sysemu/iommufd.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -158,6 +159,16 @@ struct HIODLegacyVFIO {
 VFIODevice *vdev;
 };
 
+#define TYPE_HIOD_IOMMUFD_VFIO TYPE_HIOD_IOMMUFD "-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HIODIOMMUFDVFIO, HIOD_IOMMUFD_VFIO)
+
+/* Abstraction of VFIO IOMMUFD host IOMMU device */
+struct HIODIOMMUFDVFIO {
+/*< private >*/
+HIODIOMMUFD parent;
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 8827ffe636..115b9f8e7f 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -634,12 +634,21 @@ static void vfio_iommu_iommufd_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
 };
 
+static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data)
+{
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_IOMMUFD,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_iommufd_class_init,
-},
+}, {
+.name = TYPE_HIOD_IOMMUFD_VFIO,
+.parent = TYPE_HIOD_IOMMUFD,
+.instance_size = sizeof(HIODIOMMUFDVFIO),
+.class_init = hiod_iommufd_vfio_class_init,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1

[PATCH v2 06/10] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  4 
 backends/iommufd.c   | 23 ++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 71c53cbb45..fa1a866237 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
 #include "qom/object.h"
 #include "exec/hwaddr.h"
 #include "exec/cpu-common.h"
+#include <linux/iommufd.h>
 #include "sysemu/host_iommu_device.h"
 
 #define TYPE_IOMMUFD_BACKEND "iommufd"
@@ -34,6 +35,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
 int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
   hwaddr iova, ram_addr_t size);
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp);
 
 #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index ef8b3a808b..559affa9ec 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
 #include "monitor/monitor.h"
 #include "trace.h"
 #include <sys/ioctl.h>
-#include <linux/iommufd.h>
 
 static void iommufd_backend_init(Object *obj)
 {
@@ -212,6 +211,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
 return ret;
 }
 
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(be->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;
+}
+
+return ret;
+}
+
 void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
uint32_t devid)
 {
-- 
2.34.1

[PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
container backend.

It includes a link to VFIODevice.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 11 +++
 hw/vfio/container.c   | 11 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..f30772f534 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
 #endif
 #include "sysemu/sysemu.h"
 #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
@@ -147,6 +148,16 @@ typedef struct VFIOGroup {
 bool ram_block_discard_allowed;
 } VFIOGroup;
 
+#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"
+OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
+
+/* Abstraction of VFIO legacy host IOMMU device */
+struct HIODLegacyVFIO {
+/*< private >*/
+HostIOMMUDevice parent;
+VFIODevice *vdev;
+};
+
 typedef struct VFIODMABuf {
 QemuDmaBuf buf;
 uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..44018ef085 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,12 +1143,21 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
+{
+};
+
 static const TypeInfo types[] = {
 {
 .name = TYPE_VFIO_IOMMU_LEGACY,
 .parent = TYPE_VFIO_IOMMU,
 .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HIOD_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODLegacyVFIO),
+.class_init = hiod_legacy_vfio_class_init,
+}
 };
 
 DEFINE_TYPES(types)
-- 
2.34.1

[PATCH v2 10/10] vfio: Pass HostIOMMUDevice to vIOMMU

With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..224501a86e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3111,11 +3111,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 vfio_bars_register(vdev);
 
-ret = vfio_add_capabilities(vdev, errp);
+ret = pci_device_set_iommu_device(pdev, vbasedev->hiod, errp);
 if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
 goto out_teardown;
 }
 
+ret = vfio_add_capabilities(vdev, errp);
+if (ret) {
+goto out_unset_idev;
+}
+
 if (vdev->vga) {
 vfio_vga_quirk_setup(vdev);
 }
@@ -3132,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 error_setg(errp,
"cannot support IGD OpRegion feature on hotplugged "
"device");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_get_dev_region_info(vbasedev,
@@ -3141,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 if (ret) {
 error_setg_errno(errp, -ret,
  "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
 }
 
 ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
 g_free(opregion);
 if (ret) {
-goto out_teardown;
+goto out_unset_idev;
 }
 }
 
@@ -3233,6 +3239,8 @@ out_deregister:
 if (vdev->intx.mmap_timer) {
 timer_free(vdev->intx.mmap_timer);
 }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
@@ -3261,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
 static void vfio_exitfn(PCIDevice *pdev)
 {
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = &vdev->vbasedev;
 
 vfio_unregister_req_notifier(vdev);
 vfio_unregister_err_notifier(vdev);
@@ -3275,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
 vfio_teardown_msi(vdev);
 vfio_pci_disable_rp_atomics(vdev);
 vfio_bars_exit(vdev);
-vfio_migration_exit(&vdev->vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
-- 
2.34.1

[PATCH v2 07/10] backends/iommufd: Implement get_host_iommu_info() callback

It calls iommufd_backend_get_device_info() to get host IOMMU
related information.

Define a common structure HIOD_IOMMUFD_INFO to describe the info
returned from kernel. Currently only vtd, but easy to add arm smmu
when kernel supports.

Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  7 +++
 backends/iommufd.c   | 17 +
 2 files changed, 24 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index fa1a866237..44ec1335b2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -39,6 +39,13 @@ int iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
 enum iommu_hw_info_type *type,
 void *data, uint32_t len, Error **errp);
 
+typedef struct HIOD_IOMMUFD_INFO {
+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+} HIOD_IOMMUFD_INFO;
+
 #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
 OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
 
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 559affa9ec..1e9c469e65 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -240,8 +240,25 @@ void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend 
*iommufd,
 idev->devid = devid;
 }
 
+static int hiod_iommufd_get_host_iommu_info(HostIOMMUDevice *hiod,
+void *data, uint32_t len,
+Error **errp)
+{
+HIODIOMMUFD *idev = HIOD_IOMMUFD(hiod);
+HIOD_IOMMUFD_INFO *info = data;
+
+assert(sizeof(HIOD_IOMMUFD_INFO) <= len);
+
+return iommufd_backend_get_device_info(idev->iommufd, idev->devid,
+   &info->type, &info->data,
+   sizeof(info->data), errp);
+}
+
 static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->get_host_iommu_info = hiod_iommufd_get_host_iommu_info;
 }
 
 static const TypeInfo types[] = {
-- 
2.34.1

[PATCH v2 01/10] backends: Introduce abstract HostIOMMUDevice

Introduce HostIOMMUDevice as an abstraction of host IOMMU device.

get_host_iommu_info() is used to get host IOMMU info, different
backends can have different implementations and result format.

Introduce a macro CONFIG_HOST_IOMMU_DEVICE to define the usage
for VFIO, and VDPA in the future.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 MAINTAINERS|  2 ++
 include/sysemu/host_iommu_device.h | 19 +++
 backends/host_iommu_device.c   | 19 +++
 backends/Kconfig   |  5 +
 backends/meson.build   |  1 +
 5 files changed, 46 insertions(+)
 create mode 100644 include/sysemu/host_iommu_device.h
 create mode 100644 backends/host_iommu_device.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e71183eef9..22f71cbe02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2202,6 +2202,8 @@ M: Zhenzhong Duan 
 S: Supported
 F: backends/iommufd.c
 F: include/sysemu/iommufd.h
+F: backends/host_iommu_device.c
+F: include/sysemu/host_iommu_device.h
 F: include/qemu/chardev_open.h
 F: util/chardev_open.c
 F: docs/devel/vfio-iommufd.rst
diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
new file mode 100644
index 00..22ccbe3a5d
--- /dev/null
+++ b/include/sysemu/host_iommu_device.h
@@ -0,0 +1,19 @@
+#ifndef HOST_IOMMU_DEVICE_H
+#define HOST_IOMMU_DEVICE_H
+
+#include "qom/object.h"
+
+#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
+OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
+
+struct HostIOMMUDevice {
+Object parent;
+};
+
+struct HostIOMMUDeviceClass {
+ObjectClass parent_class;
+
+int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data, uint32_t len,
+   Error **errp);
+};
+#endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
new file mode 100644
index 00..6cb6007d8c
--- /dev/null
+++ b/backends/host_iommu_device.c
@@ -0,0 +1,19 @@
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+host_iommu_device,
+HOST_IOMMU_DEVICE,
+OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+}
diff --git a/backends/Kconfig b/backends/Kconfig
index 2cb23f62fa..34ab29e994 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -3,3 +3,8 @@ source tpm/Kconfig
 config IOMMUFD
 bool
 depends on VFIO
+
+config HOST_IOMMU_DEVICE
+bool
+default y
+depends on VFIO
diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..2e975d641e 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -25,6 +25,7 @@ if have_vhost_user
 endif
 system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost.c'))
 system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
+system_ss.add(when: 'CONFIG_HOST_IOMMU_DEVICE', if_true: 
files('host_iommu_device.c'))
 if have_vhost_user_crypto
   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: 
files('cryptodev-vhost-user.c'))
 endif
-- 
2.34.1

[PATCH v2 05/10] vfio: Implement get_host_iommu_info() callback

Utilize iova_ranges to calculate host IOMMU address width and
package it in HIOD_LEGACY_INFO for vIOMMU usage.

HIOD_LEGACY_INFO will be used by both VFIO and VDPA so declare
it in host_iommu_device.h.

Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/host_iommu_device.h | 10 ++
 hw/vfio/container.c| 24 
 2 files changed, 34 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 22ccbe3a5d..beb8be8231 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -16,4 +16,14 @@ struct HostIOMMUDeviceClass {
 int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data, uint32_t len,
Error **errp);
 };
+
+/*
+ * Define the format of host IOMMU related info that current VFIO
+ * or VDPA can provide to vIOMMU.
+ *
+ * @aw_bits: Host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HIOD_LEGACY_INFO {
+uint8_t aw_bits;
+} HIOD_LEGACY_INFO;
 #endif
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 44018ef085..ba0ad4a41b 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,8 +1143,32 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };
 
+static int hiod_legacy_vfio_get_host_iommu_info(HostIOMMUDevice *hiod,
+void *data, uint32_t len,
+Error **errp)
+{
+VFIODevice *vbasedev = HIOD_LEGACY_VFIO(hiod)->vdev;
+/* iova_ranges is a sorted list */
+GList *l = g_list_last(vbasedev->bcontainer->iova_ranges);
+HIOD_LEGACY_INFO *info = data;
+
+assert(sizeof(HIOD_LEGACY_INFO) <= len);
+
+if (l) {
+Range *range = l->data;
+info->aw_bits = find_last_bit(&range->upb, BITS_PER_LONG) + 1;
+} else {
+info->aw_bits = 0xff;
+}
+
+return 0;
+}
+
 static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
 {
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->get_host_iommu_info = hiod_legacy_vfio_get_host_iommu_info;
 };
 
 static const TypeInfo types[] = {
-- 
2.34.1

[PATCH v2 08/10] vfio: Create host IOMMU device instance

Create host IOMMU device instance and initialize it based on backend.

Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 1 +
 hw/vfio/container.c   | 5 +
 hw/vfio/iommufd.c | 8 
 3 files changed, 14 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index d382b12ec1..4fbba85018 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -126,6 +126,7 @@ typedef struct VFIODevice {
 OnOffAuto pre_copy_dirty_page_tracking;
 bool dirty_pages_supported;
 bool dirty_tracking;
+HostIOMMUDevice *hiod;
 int devid;
 IOMMUFDBackend *iommufd;
 } VFIODevice;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ba0ad4a41b..fc0c027501 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -915,6 +915,7 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
 VFIOContainerBase *bcontainer;
+HIODLegacyVFIO *hiod_vfio;
 int ret;
 
 if (groupid < 0) {
@@ -945,6 +946,9 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 vbasedev->bcontainer = bcontainer;
 QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
 QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+hiod_vfio = HIOD_LEGACY_VFIO(object_new(TYPE_HIOD_LEGACY_VFIO));
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
 
 return ret;
 }
@@ -959,6 +963,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
 trace_vfio_detach_device(vbasedev->name, group->groupid);
 vfio_put_base_device(vbasedev);
 vfio_put_group(group);
+object_unref(vbasedev->hiod);
 }
 
 static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 115b9f8e7f..b6d058339b 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -308,6 +308,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice 
*vbasedev,
 VFIOIOMMUFDContainer *container;
 VFIOAddressSpace *space;
 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+HIODIOMMUFDVFIO *hiod_vfio;
 int ret, devfd;
 uint32_t ioas_id;
 Error *err = NULL;
@@ -431,6 +432,12 @@ found_container:
 QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
 QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
 
+hiod_vfio = HIOD_IOMMUFD_VFIO(object_new(TYPE_HIOD_IOMMUFD_VFIO));
+hiod_iommufd_init(HIOD_IOMMUFD(hiod_vfio), vbasedev->iommufd,
+  vbasedev->devid);
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
+
 trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
vbasedev->num_regions, vbasedev->flags);
 return 0;
@@ -468,6 +475,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
 iommufd_cdev_detach_container(vbasedev, container);
 iommufd_cdev_container_destroy(container);
 vfio_put_address_space(space);
+object_unref(vbasedev->hiod);
 
 iommufd_cdev_unbind_and_disconnect(vbasedev);
 close(vbasedev->fd);
-- 
2.34.1

[PATCH v2 09/10] hw/pci: Introduce pci_device_set/unset_iommu_device()

From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCI device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate
implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/pci/pci.h | 40 ++-
 hw/pci/pci.c | 75 ++--
 2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..4ae7fe6f3f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
 
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
 
 /* PCI includes legacy ISA access.  */
 #include "hw/isa/isa.h"
@@ -383,10 +384,47 @@ typedef struct PCIIOMMUOps {
  *
  * @devfn: device and function number
  */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * Return 0 if HostIOMMUDevice is attached, or else return a negative
+ * value with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ * @errp: pass an Error out only when the return value is non-zero
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
 } PCIIOMMUOps;
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
 
 /**
  * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e7a39cb203..8ece617673 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus/devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
 
 while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
 PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,13 +2709,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
 return iommu_bus->iommu_ops->get_address_space(bus,
  iommu_bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
 }
 
+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **err

[PATCH-for-9.0? 2/3] hw/block/nand: Have blk_load() return boolean indicating success

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/nand.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/hw/block/nand.c b/hw/block/nand.c
index 6fa9038bb5..3627c799b5 100644
--- a/hw/block/nand.c
+++ b/hw/block/nand.c
@@ -84,7 +84,11 @@ struct NANDFlashState {
 
 void (*blk_write)(NANDFlashState *s);
 void (*blk_erase)(NANDFlashState *s);
-void (*blk_load)(NANDFlashState *s, uint64_t addr, int offset);
+/*
+ * Returns %true when block containing (@addr + @offset) is
+ * successfully loaded, otherwise %false.
+ */
+bool (*blk_load)(NANDFlashState *s, uint64_t addr, int offset);
 
 uint32_t ioaddr_vmstate;
 };
@@ -769,11 +773,11 @@ static void glue(nand_blk_erase_, 
NAND_PAGE_SIZE)(NANDFlashState *s)
 }
 }
 
-static void glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s,
+static bool glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s,
 uint64_t addr, int offset)
 {
 if (PAGE(addr) >= s->pages) {
-return;
+return false;
 }
 
 if (s->blk) {
@@ -801,6 +805,8 @@ static void glue(nand_blk_load_, 
NAND_PAGE_SIZE)(NANDFlashState *s,
 offset, NAND_PAGE_SIZE + OOB_SIZE - offset);
 s->ioaddr = s->io;
 }
+
+return true;
 }
 
 static void glue(nand_init_, NAND_PAGE_SIZE)(NANDFlashState *s)
-- 
2.41.0

[PATCH-for-9.0? 0/3] hw/block/nand: Fix out-of-bound access in NAND block buffer

Fix for https://gitlab.com/qemu-project/qemu/-/issues/1446

Philippe Mathieu-Daudé (3):
  hw/block/nand: Factor nand_load_iolen() method out
  hw/block/nand: Have blk_load() return boolean indicating success
  hw/block/nand: Fix out-of-bound access in NAND block buffer

 hw/block/nand.c | 50 +
 1 file changed, 34 insertions(+), 16 deletions(-)

-- 
2.41.0

[PATCH-for-9.0? 3/3] hw/block/nand: Fix out-of-bound access in NAND block buffer

nand_command() and nand_getio() don't check that @offset points
into the block, nor that the available data length (s->iolen) is
not negative.

In order to fix:

- check the offset is in range in nand_blk_load_NAND_PAGE_SIZE(),
- do not set @iolen if blk_load() failed.

Reproducer:

  $ cat << EOF | qemu-system-arm -machine tosa \
 -monitor none -serial none \
 -display none -qtest stdio
  write 0x1111 0x1 0xca
  write 0x1104 0x1 0x47
  write 0x1000ca04 0x1 0xd7
  write 0x1000ca01 0x1 0xe0
  write 0x1000ca04 0x1 0x71
  write 0x1000ca00 0x1 0x50
  write 0x1000ca04 0x1 0xd7
  read 0x1000ca02 0x1
  write 0x1000ca01 0x1 0x10
  EOF

=
==15750==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x61f00de0
 at pc 0x560e61557210 bp 0x7ffcfc4a59f0 sp 0x7ffcfc4a59e8
READ of size 1 at 0x61f00de0 thread T0
#0 0x560e6155720f in mem_and hw/block/nand.c:101:20
#1 0x560e6155ac9c in nand_blk_write_512 hw/block/nand.c:663:9
#2 0x560e61544200 in nand_command hw/block/nand.c:293:13
#3 0x560e6153cc83 in nand_setio hw/block/nand.c:520:13
#4 0x560e61a0a69e in tc6393xb_nand_writeb hw/display/tc6393xb.c:380:13
#5 0x560e619f9bf7 in tc6393xb_writeb hw/display/tc6393xb.c:524:9
#6 0x560e647c7d03 in memory_region_write_accessor softmmu/memory.c:492:5
#7 0x560e647c7641 in access_with_adjusted_size softmmu/memory.c:554:18
#8 0x560e647c5f66 in memory_region_dispatch_write softmmu/memory.c:1514:16
#9 0x560e6485409e in flatview_write_continue softmmu/physmem.c:2825:23
#10 0x560e648421eb in flatview_write softmmu/physmem.c:2867:12
#11 0x560e64841ca8 in address_space_write softmmu/physmem.c:2963:18
#12 0x560e61170162 in qemu_writeb tests/qtest/videzzo/videzzo_qemu.c:1080:5
#13 0x560e6116eef7 in dispatch_mmio_write 
tests/qtest/videzzo/videzzo_qemu.c:1227:28

0x61f00de0 is located 0 bytes to the right of 3424-byte region 
[0x61f00080,0x61f00de0)
allocated by thread T0 here:
#0 0x560e611276cf in malloc 
/root/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:145:3
#1 0x7f7959a87e98 in g_malloc 
(/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x57e98)
#2 0x560e64b98871 in object_new qom/object.c:749:12
#3 0x560e64b5d1a1 in qdev_new hw/core/qdev.c:153:19
#4 0x560e61547ea5 in nand_init hw/block/nand.c:639:11
#5 0x560e619f8772 in tc6393xb_init hw/display/tc6393xb.c:558:16
#6 0x560e6390bad2 in tosa_init hw/arm/tosa.c:250:12

SUMMARY: AddressSanitizer: heap-buffer-overflow hw/block/nand.c:101:20 in 
mem_and
==15750==ABORTING

Broken since introduction in commit 3e3d5815cb ("NAND Flash memory
emulation and ECC calculation helpers for use by NAND controllers").

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1446
Reported-by: Qiang Liu 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/nand.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/nand.c b/hw/block/nand.c
index 3627c799b5..d90dc965a1 100644
--- a/hw/block/nand.c
+++ b/hw/block/nand.c
@@ -255,7 +255,9 @@ static int nand_load_block(NANDFlashState *s, int offset)
 {
 int iolen;
 
-s->blk_load(s, s->addr, offset);
+if (!s->blk_load(s, s->addr, offset)) {
+return 0;
+}
 
 iolen = (1 << s->page_shift) - offset;
 if (s->gnd) {
@@ -780,6 +782,10 @@ static bool glue(nand_blk_load_, 
NAND_PAGE_SIZE)(NANDFlashState *s,
 return false;
 }
 
+if (offset > NAND_PAGE_SIZE + OOB_SIZE) {
+return false;
+}
+
 if (s->blk) {
 if (s->mem_oob) {
 if (blk_pread(s->blk, SECTOR(addr) << BDRV_SECTOR_BITS,
-- 
2.41.0

[PATCH-for-9.0? 1/3] hw/block/nand: Factor nand_load_iolen() method out

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/nand.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/hw/block/nand.c b/hw/block/nand.c
index d1435f2207..6fa9038bb5 100644
--- a/hw/block/nand.c
+++ b/hw/block/nand.c
@@ -243,9 +243,25 @@ static inline void nand_pushio_byte(NANDFlashState *s, 
uint8_t value)
 }
 }
 
+/*
+ * nand_load_block: Load block containing (s->addr + @offset).
+ * Returns length of data available at @offset in this block.
+ */
+static int nand_load_block(NANDFlashState *s, int offset)
+{
+int iolen;
+
+s->blk_load(s, s->addr, offset);
+
+iolen = (1 << s->page_shift) - offset;
+if (s->gnd) {
+iolen += 1 << s->oob_shift;
+}
+return iolen;
+}
+
 static void nand_command(NANDFlashState *s)
 {
-unsigned int offset;
 switch (s->cmd) {
 case NAND_CMD_READ0:
 s->iolen = 0;
@@ -271,12 +287,7 @@ static void nand_command(NANDFlashState *s)
 case NAND_CMD_NOSERIALREAD2:
 if (!(nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP))
 break;
-offset = s->addr & ((1 << s->addr_shift) - 1);
-s->blk_load(s, s->addr, offset);
-if (s->gnd)
-s->iolen = (1 << s->page_shift) - offset;
-else
-s->iolen = (1 << s->page_shift) + (1 << s->oob_shift) - offset;
+s->iolen = nand_load_block(s, s->addr & ((1 << s->addr_shift) - 1));
 break;
 
 case NAND_CMD_RESET:
@@ -597,12 +608,7 @@ uint32_t nand_getio(DeviceState *dev)
 if (!s->iolen && s->cmd == NAND_CMD_READ0) {
 offset = (int) (s->addr & ((1 << s->addr_shift) - 1)) + s->offset;
 s->offset = 0;
-
-s->blk_load(s, s->addr, offset);
-if (s->gnd)
-s->iolen = (1 << s->page_shift) - offset;
-else
-s->iolen = (1 << s->page_shift) + (1 << s->oob_shift) - offset;
+s->iolen = nand_load_block(s, offset);
 }
 
 if (s->ce || s->iolen <= 0) {
-- 
2.41.0

Re: [PATCH-for-9.0? 3/3] hw/block/nand: Fix out-of-bound access in NAND block buffer


On 8/4/24 10:36, Philippe Mathieu-Daudé wrote:

nand_command() and nand_getio() don't check @offset points
into the block, nor the available data length (s->iolen) is
not negative.

In order to fix:

- check the offset is in range in nand_blk_load_NAND_PAGE_SIZE(),
- do not set @iolen if blk_load() failed.

Reproducer:

   $ cat << EOF | qemu-system-arm -machine tosa \
  -monitor none -serial none \
  -display none -qtest stdio
   write 0x1111 0x1 0xca
   write 0x1104 0x1 0x47
   write 0x1000ca04 0x1 0xd7
   write 0x1000ca01 0x1 0xe0
   write 0x1000ca04 0x1 0x71
   write 0x1000ca00 0x1 0x50
   write 0x1000ca04 0x1 0xd7
   read 0x1000ca02 0x1
   write 0x1000ca01 0x1 0x10
   EOF

=
==15750==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x61f00de0
  at pc 0x560e61557210 bp 0x7ffcfc4a59f0 sp 0x7ffcfc4a59e8
READ of size 1 at 0x61f00de0 thread T0
 #0 0x560e6155720f in mem_and hw/block/nand.c:101:20
 #1 0x560e6155ac9c in nand_blk_write_512 hw/block/nand.c:663:9
 #2 0x560e61544200 in nand_command hw/block/nand.c:293:13
 #3 0x560e6153cc83 in nand_setio hw/block/nand.c:520:13
 #4 0x560e61a0a69e in tc6393xb_nand_writeb hw/display/tc6393xb.c:380:13
 #5 0x560e619f9bf7 in tc6393xb_writeb hw/display/tc6393xb.c:524:9
 #6 0x560e647c7d03 in memory_region_write_accessor softmmu/memory.c:492:5
 #7 0x560e647c7641 in access_with_adjusted_size softmmu/memory.c:554:18
 #8 0x560e647c5f66 in memory_region_dispatch_write softmmu/memory.c:1514:16
 #9 0x560e6485409e in flatview_write_continue softmmu/physmem.c:2825:23
 #10 0x560e648421eb in flatview_write softmmu/physmem.c:2867:12
 #11 0x560e64841ca8 in address_space_write softmmu/physmem.c:2963:18
 #12 0x560e61170162 in qemu_writeb tests/qtest/videzzo/videzzo_qemu.c:1080:5
 #13 0x560e6116eef7 in dispatch_mmio_write 
tests/qtest/videzzo/videzzo_qemu.c:1227:28

0x61f00de0 is located 0 bytes to the right of 3424-byte region 
[0x61f00080,0x61f00de0)
allocated by thread T0 here:
 #0 0x560e611276cf in malloc 
/root/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:145:3
 #1 0x7f7959a87e98 in g_malloc 
(/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x57e98)
 #2 0x560e64b98871 in object_new qom/object.c:749:12
 #3 0x560e64b5d1a1 in qdev_new hw/core/qdev.c:153:19
 #4 0x560e61547ea5 in nand_init hw/block/nand.c:639:11
 #5 0x560e619f8772 in tc6393xb_init hw/display/tc6393xb.c:558:16
 #6 0x560e6390bad2 in tosa_init hw/arm/tosa.c:250:12

SUMMARY: AddressSanitizer: heap-buffer-overflow hw/block/nand.c:101:20 in 
mem_and
==15750==ABORTING

Broken since introduction in commit 3e3d5815cb ("NAND Flash memory
emulation and ECC calculation helpers for use by NAND controllers").

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1446


Also:

  Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1445


Reported-by: Qiang Liu 
Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/block/nand.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

[PATCH v2 4/5] intel_iommu: Check for compatibility with legacy device

Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
ensure compatibility of host and vIOMMU's address width at least,
i.e., vIOMMU's aw-bits <= host IOMMU aw-bits, a check which was missing before.

Signed-off-by: Yi Liu 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a49b587c73..d2cd186df0 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3824,6 +3824,21 @@ static int vtd_check_legacy_hdev(IntelIOMMUState *s,
  HostIOMMUDevice *hiod,
  Error **errp)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+HIOD_LEGACY_INFO info;
+int ret;
+
+ret = hiodc->get_host_iommu_info(hiod, &info, sizeof(info), errp);
+if (ret) {
+return ret;
+}
+
+if (s->aw_bits > info.aw_bits) {
+error_setg(errp, "aw-bits %d > host aw-bits %d",
+   s->aw_bits, info.aw_bits);
+return -EINVAL;
+}
+
 return 0;
 }
 
-- 
2.34.1

[PATCH v2 0/5] Check host IOMMU compatibility with vIOMMU

Hi,

Based on Joao's suggestion, the iommufd nesting prerequisite series [1]
has been further split into a host IOMMU device abstraction part [2] and a
vIOMMU check part. This series implements the 2nd part.

1st part implements get_host_iommu_info() callback which vIOMMU can call to
get host IOMMU info. For legacy VFIO or VDPA device, aw_bits is provided;
for IOMMUFD backed device, IOMMUFD uAPI provides detailed cap/ecap bits from
host.

vIOMMU implements the set/unset_iommu_device() callbacks to get the
HostIOMMUDevice and call get_host_iommu_info(), so the vIOMMU can do a
compatibility check with the returned host IOMMU info.

This is also a prerequisite for incoming iommufd nesting series:
'intel_iommu: Enable stage-1 translation' where HostIOMMUDevice provides
more data such as iommufd/devid/ioas_id and callback attach/detach_hwpt()
for vIOMMU to create nested hwpt, attaching/detaching hwpt, etc.

The major change of this version is dropping the cap/ecap update logic based
on MST's suggestion. We can add a property for any cap/ecap bit when necessary,
just like "aw-bits". This way we don't need to be concerned about migration
compatibility and the code is cleaner.

Qemu code can be found at:
https://github.com/yiliu1765/qemu/tree/zhenzhong/iommufd_nesting_preq_part2_v2

[1] 
https://lore.kernel.org/qemu-devel/20240201072818.327930-1-zhenzhong.d...@intel.com/
[2] https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00763.html

Thanks
Zhenzhong

Changelog:
v2:
- drop cap/ecap update logic (MST)
- check aw-bits from get_host_iommu_info() in legacy mode

v1:
- convert HostIOMMUDevice to sub object pointer in vtd_check_hdev

rfcv2:
- introduce common abstract HostIOMMUDevice and sub struct for different BEs 
(Eric, Cédric)
- remove iommufd_device.[ch] (Cédric)
- remove duplicate iommufd/devid define from VFIODevice (Eric)
- drop the p in aliased_pbus and aliased_pdevfn (Eric)
- assert devfn and iommu_bus in pci_device_get_iommu_bus_devfn (Cédric, Eric)
- use errp in iommufd_device_get_info (Eric)
- split and simplify cap/ecap check/sync code in intel_iommu.c (Cédric)
- move VTDHostIOMMUDevice declaration to intel_iommu_internal.h (Cédric)
- make '(vtd->cap_reg >> 16) & 0x3fULL' a MACRO and add missed '+1' (Cédric)
- block migration if vIOMMU cap/ecap updated based on host IOMMU cap/ecap
- add R-B


Yi Liu (2):
  intel_iommu: Implement set/unset_iommu_device() callback
  intel_iommu: Add a framework to do compatibility check with host IOMMU
cap/ecap

Zhenzhong Duan (3):
  intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap
  intel_iommu: Check for compatibility with legacy device
  intel_iommu: Check for compatibility with iommufd backed device

 hw/i386/intel_iommu_internal.h |   8 ++
 include/hw/i386/intel_iommu.h  |   3 +
 hw/i386/intel_iommu.c  | 242 +++--
 3 files changed, 211 insertions(+), 42 deletions(-)

-- 
2.34.1

[PATCH v2 1/5] intel_iommu: Extract out vtd_cap_init() to initialize cap/ecap

Extract cap/ecap initialization in vtd_cap_init() to make code
cleaner.

No functional change intended.

Reviewed-by: Eric Auger 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 93 ---
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index cc8e59674e..519063c8f8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3934,30 +3934,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
-/* Do the initialization. It will also be called when reset, so pay
- * attention when adding new initialization stuff.
- */
-static void vtd_init(IntelIOMMUState *s)
+static void vtd_cap_init(IntelIOMMUState *s)
 {
 X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 
-memset(s->csr, 0, DMAR_REG_SIZE);
-memset(s->wmask, 0, DMAR_REG_SIZE);
-memset(s->w1cmask, 0, DMAR_REG_SIZE);
-memset(s->womask, 0, DMAR_REG_SIZE);
-
-s->root = 0;
-s->root_scalable = false;
-s->dmar_enabled = false;
-s->intr_enabled = false;
-s->iq_head = 0;
-s->iq_tail = 0;
-s->iq = 0;
-s->iq_size = 0;
-s->qi_enabled = false;
-s->iq_last_desc_type = VTD_INV_DESC_NONE;
-s->iq_dw = false;
-s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
  VTD_CAP_MGAW(s->aw_bits);
@@ -3974,27 +3954,6 @@ static void vtd_init(IntelIOMMUState *s)
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
-/*
- * Rsvd field masks for spte
- */
-vtd_spte_rsvd[0] = ~0ULL;
-vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
-  x86_iommu->dt_supported);
-vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
-vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
-
-vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
- 
x86_iommu->dt_supported);
-
-if (s->scalable_mode || s->snoop_control) {
-vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
-vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
-}
-
 if (x86_iommu_ir_supported(x86_iommu)) {
 s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
 if (s->intr_eim == ON_OFF_AUTO_ON) {
@@ -4027,6 +3986,56 @@ static void vtd_init(IntelIOMMUState *s)
 if (s->pasid) {
 s->ecap |= VTD_ECAP_PASID;
 }
+}
+
+/*
+ * Do the initialization. It will also be called when reset, so pay
+ * attention when adding new initialization stuff.
+ */
+static void vtd_init(IntelIOMMUState *s)
+{
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+memset(s->csr, 0, DMAR_REG_SIZE);
+memset(s->wmask, 0, DMAR_REG_SIZE);
+memset(s->w1cmask, 0, DMAR_REG_SIZE);
+memset(s->womask, 0, DMAR_REG_SIZE);
+
+s->root = 0;
+s->root_scalable = false;
+s->dmar_enabled = false;
+s->intr_enabled = false;
+s->iq_head = 0;
+s->iq_tail = 0;
+s->iq = 0;
+s->iq_size = 0;
+s->qi_enabled = false;
+s->iq_last_desc_type = VTD_INV_DESC_NONE;
+s->iq_dw = false;
+s->next_frcd_reg = 0;
+
+vtd_cap_init(s);
+
+/*
+ * Rsvd field masks for spte
+ */
+vtd_spte_rsvd[0] = ~0ULL;
+vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
+  x86_iommu->dt_supported);
+vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
+x86_iommu->dt_supported);
+
+if (s->scalable_mode || s->snoop_control) {
+vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
+vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
+}
 
 vtd_reset_caches(s);
 
-- 
2.34.1

[PATCH v2 2/5] intel_iommu: Implement set/unset_iommu_device() callback

From: Yi Liu 

Implement set/unset_iommu_device() callback in Intel vIOMMU.
In set call, a new structure VTDHostIOMMUDevice which holds
a reference to HostIOMMUDevice is stored in hash table
indexed by PCI BDF.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu_internal.h |  8 
 include/hw/i386/intel_iommu.h  |  2 +
 hw/i386/intel_iommu.c  | 76 ++
 3 files changed, 86 insertions(+)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index f8cf99bddf..becafd03c1 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -537,4 +537,12 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_IGN_COM  0xbff0ULL
 #define VTD_SL_TM   (1ULL << 62)
 
+
+typedef struct VTDHostIOMMUDevice {
+IntelIOMMUState *iommu_state;
+PCIBus *bus;
+uint8_t devfn;
+HostIOMMUDevice *dev;
+QLIST_ENTRY(VTDHostIOMMUDevice) next;
+} VTDHostIOMMUDevice;
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 7fa0a695c8..bbc7b96add 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,8 @@ struct IntelIOMMUState {
 /* list of registered notifiers */
 QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
 
+GHashTable *vtd_host_iommu_dev; /* VTDHostIOMMUDevice */
+
 /* interrupt remapping */
 bool intr_enabled;  /* Whether guest enabled IR */
 dma_addr_t intr_root;   /* Interrupt remapping table pointer */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 519063c8f8..4f84e2e801 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -237,6 +237,13 @@ static gboolean vtd_as_equal(gconstpointer v1, 
gconstpointer v2)
(key1->pasid == key2->pasid);
 }
 
+static gboolean vtd_as_idev_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct vtd_as_key *key1 = v1;
+const struct vtd_as_key *key2 = v2;
+
+return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
+}
 /*
  * Note that we use pointer to PCIBus as the key, so hashing/shifting
  * based on the pointer value is intended. Note that we deal with
@@ -3812,6 +3819,70 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *hiod, Error **errp)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+struct vtd_as_key *new_key;
+
+assert(hiod);
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+
+if (vtd_hdev) {
+error_setg(errp, "IOMMUFD device already exist");
+vtd_iommu_unlock(s);
+return -EEXIST;
+}
+
+vtd_hdev = g_malloc0(sizeof(VTDHostIOMMUDevice));
+vtd_hdev->bus = bus;
+vtd_hdev->devfn = (uint8_t)devfn;
+vtd_hdev->iommu_state = s;
+vtd_hdev->dev = hiod;
+
+new_key = g_malloc(sizeof(*new_key));
+new_key->bus = bus;
+new_key->devfn = devfn;
+
+object_ref(hiod);
+g_hash_table_insert(s->vtd_host_iommu_dev, new_key, vtd_hdev);
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDHostIOMMUDevice *vtd_hdev;
+struct vtd_as_key key = {
+.bus = bus,
+.devfn = devfn,
+};
+
+vtd_iommu_lock(s);
+
+vtd_hdev = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
+if (!vtd_hdev) {
+vtd_iommu_unlock(s);
+return;
+}
+
+g_hash_table_remove(s->vtd_host_iommu_dev, &key);
+object_unref(vtd_hdev->dev);
+
+vtd_iommu_unlock(s);
+}
+
 /* Unmap the whole range in the notifier's scope. */
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
 {
@@ -4116,6 +4187,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.set_iommu_device = vtd_dev_set_iommu_device,
+.unset_iommu_device = vtd_dev_unset_iommu_device,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4235,6 +4308,9 @@ static void vtd_realize(DeviceState *dev, Error **errp)
  g_free, g_free);
 s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
   g_free, g_free);
+s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_as_hash,
+  vtd_as_idev_equal,
+  g_free, g_free);
 vtd_init(s);
 pci_setup_iommu(bus, &vtd_iommu_ops, dev);

[PATCH v2 3/5] intel_iommu: Add a framework to do compatibility check with host IOMMU cap/ecap

From: Yi Liu 

If the check fails, the host side device (either vfio or vdpa device) should
not be passed to the guest.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 hw/i386/intel_iommu.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4f84e2e801..a49b587c73 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/iommufd.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
@@ -3819,6 +3820,32 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
 return vtd_dev_as;
 }
 
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,
+ HostIOMMUDevice *hiod,
+ Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+  HostIOMMUDevice *hiod,
+  Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *hiod = vtd_hdev->dev;
+
+if (object_dynamic_cast(OBJECT(hiod), TYPE_HIOD_IOMMUFD)) {
+return vtd_check_iommufd_hdev(s, hiod, errp);
+}
+
+return vtd_check_legacy_hdev(s, hiod, errp);
+}
+
 static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
 HostIOMMUDevice *hiod, Error **errp)
 {
@@ -3829,6 +3856,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 .devfn = devfn,
 };
 struct vtd_as_key *new_key;
+int ret;
 
 assert(hiod);
 
@@ -3848,6 +3876,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
 vtd_hdev->iommu_state = s;
 vtd_hdev->dev = hiod;
 
+ret = vtd_check_hdev(s, vtd_hdev, errp);
+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
 new_key = g_malloc(sizeof(*new_key));
 new_key->bus = bus;
 new_key->devfn = devfn;
-- 
2.34.1

[PATCH v2 5/5] intel_iommu: Check for compatibility with iommufd backed device

Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
ensure compatibility of host and vIOMMU's address width at least,
i.e., vIOMMU's aw-bits <= host IOMMU aw-bits, a check which was missing before.

When stage-1 translation is supported in future, a.k.a. scalable
modern mode, this mechanism will be further extended to check more
bits.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/i386/intel_iommu.h |  1 +
 hw/i386/intel_iommu.c | 23 +++
 2 files changed, 24 insertions(+)

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index bbc7b96add..2bbde41e45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -47,6 +47,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, 
INTEL_IOMMU_DEVICE)
 #define VTD_HOST_AW_48BIT   48
 #define VTD_HOST_ADDRESS_WIDTH  VTD_HOST_AW_39BIT
 #define VTD_HAW_MASK(aw)((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap)  ((cap >> 16) & 0x3fULL)
 
 #define DMAR_REPORT_F_INTR  (1)
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d2cd186df0..d8fac9ef9f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3846,6 +3846,29 @@ static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
   HostIOMMUDevice *hiod,
   Error **errp)
 {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+struct iommu_hw_info_vtd *vtd;
+HIOD_IOMMUFD_INFO info;
+int host_aw_bits, ret;
+
+ret = hiodc->get_host_iommu_info(hiod, &info, sizeof(info), errp);
+if (ret) {
+return ret;
+}
+
+if (info.type != IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+error_setg(errp, "IOMMU hardware is not compatible");
+return -EINVAL;
+}
+
+vtd = &info.data.vtd;
+host_aw_bits = VTD_MGAW_FROM_CAP(vtd->cap_reg) + 1;
+if (s->aw_bits > host_aw_bits) {
+error_setg(errp, "aw-bits %d > host aw-bits %d",
+   s->aw_bits, host_aw_bits);
+return -EINVAL;
+}
+
 return 0;
 }
 
-- 
2.34.1

Re: [PATCH v4] nbd/server: do not poll within a coroutine context

2024-04-08 Thread Vladimir Sementsov-Ogievskiy


On 05.04.24 20:44, Eric Blake wrote:

From: Zhu Yangyang 

Coroutines are not supposed to block. Instead, they should yield.

The client performs TLS upgrade outside of an AIOContext, during
synchronous handshake; this still requires g_main_loop.  But the
server responds to TLS upgrade inside a coroutine, so a nested
g_main_loop is wrong.  Since the two callbacks no longer share more
than the setting of data.complete and data.error, it's just as easy to
use static helpers instead of trying to share a common code path.

Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation")
Signed-off-by: Zhu Yangyang 
[eblake: move callbacks to their use point]
Signed-off-by: Eric Blake 


Reviewed-by: Vladimir Sementsov-Ogievskiy 

still, some notes below


---

v3: https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00375.html

in v4, factor even the struct to the .c files, avoiding a union [Vladimir]

  nbd/nbd-internal.h | 10 --
  nbd/client.c   | 27 +++
  nbd/common.c   | 11 ---
  nbd/server.c   | 29 +++--
  4 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
index dfa02f77ee4..91895106a95 100644
--- a/nbd/nbd-internal.h
+++ b/nbd/nbd-internal.h
@@ -72,16 +72,6 @@ static inline int nbd_write(QIOChannel *ioc, const void 
*buffer, size_t size,
  return qio_channel_write_all(ioc, buffer, size, errp) < 0 ? -EIO : 0;
  }

-struct NBDTLSHandshakeData {
-GMainLoop *loop;
-bool complete;
-Error *error;
-};
-
-
-void nbd_tls_handshake(QIOTask *task,
-   void *opaque);
-
  int nbd_drop(QIOChannel *ioc, size_t size, Error **errp);

  #endif
diff --git a/nbd/client.c b/nbd/client.c
index 29ffc609a4b..c7141d7a098 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -596,13 +596,31 @@ static int nbd_request_simple_option(QIOChannel *ioc, int 
opt, bool strict,
  return 1;
  }

+/* Callback to learn when QIO TLS upgrade is complete */
+struct NBDTLSClientHandshakeData {
+bool complete;
+Error *error;
+GMainLoop *loop;
+};
+
+static void nbd_client_tls_handshake(QIOTask *task, void *opaque)
+{
+struct NBDTLSClientHandshakeData *data = opaque;
+
+qio_task_propagate_error(task, &data->error);
+data->complete = true;
+if (data->loop) {
+g_main_loop_quit(data->loop);
+}
+}
+
  static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
  QCryptoTLSCreds *tlscreds,
  const char *hostname, Error **errp)
  {
  int ret;
  QIOChannelTLS *tioc;
-struct NBDTLSHandshakeData data = { 0 };
+struct NBDTLSClientHandshakeData data = { 0 };

  ret = nbd_request_simple_option(ioc, NBD_OPT_STARTTLS, true, errp);
  if (ret <= 0) {
@@ -619,18 +637,19 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
  return NULL;
  }
  qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-client-tls");
-data.loop = g_main_loop_new(g_main_context_default(), FALSE);
  trace_nbd_receive_starttls_tls_handshake();
  qio_channel_tls_handshake(tioc,
-  nbd_tls_handshake,
+  nbd_client_tls_handshake,
&data,
NULL,
NULL);

  if (!data.complete) {
+data.loop = g_main_loop_new(g_main_context_default(), FALSE);
  g_main_loop_run(data.loop);
+g_main_loop_unref(data.loop);


probably good to assert(data.complete);


  }
-g_main_loop_unref(data.loop);
+
  if (data.error) {
  error_propagate(errp, data.error);
  object_unref(OBJECT(tioc));
diff --git a/nbd/common.c b/nbd/common.c
index 3247c1d618a..589a748cfe6 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -47,17 +47,6 @@ int nbd_drop(QIOChannel *ioc, size_t size, Error **errp)
  }


-void nbd_tls_handshake(QIOTask *task,
-   void *opaque)
-{
-struct NBDTLSHandshakeData *data = opaque;
-
-qio_task_propagate_error(task, &data->error);
-data->complete = true;
-g_main_loop_quit(data->loop);
-}
-
-
  const char *nbd_opt_lookup(uint32_t opt)
  {
  switch (opt) {
diff --git a/nbd/server.c b/nbd/server.c
index c3484cc1ebc..ea13cf0e766 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -748,6 +748,23 @@ static int nbd_negotiate_handle_info(NBDClient *client, 
Error **errp)
  return rc;
  }

+/* Callback to learn when QIO TLS upgrade is complete */
+struct NBDTLSServerHandshakeData {
+bool complete;
+Error *error;
+Coroutine *co;
+};
+
+static void nbd_server_tls_handshake(QIOTask *task, void *opaque)
+{
+struct NBDTLSServerHandshakeData *data = opaque;
+
+qio_task_propagate_error(task, &data->error);
+data->complete = true;
+if (!qemu_coroutine_entered(data->co)) {
+aio_co_wake(data->co);
+}
+}

  /* Handle N

Re: [PATCH-for-9.1 2/7] yank: Restrict to system emulation

On Thu, Apr 4, 2024 at 9:48 PM Philippe Mathieu-Daudé  wrote:
>
> The yank feature is not used in user emulation.

But it is used in block layer tools. The simplest thing here is
probably to move it under have_block instead.

Paolo

> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  util/meson.build | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/util/meson.build b/util/meson.build
> index 0ef9886be0..247f55a80d 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -60,7 +60,6 @@ util_ss.add(files('stats64.c'))
>  util_ss.add(files('systemd.c'))
>  util_ss.add(files('transactions.c'))
>  util_ss.add(files('guest-random.c'))
> -util_ss.add(files('yank.c'))
>  util_ss.add(files('int128.c'))
>  util_ss.add(files('memalign.c'))
>  util_ss.add(files('interval-tree.c'))
> @@ -76,6 +75,7 @@ if have_system
>if host_os == 'linux'
>  util_ss.add(files('userfaultfd.c'))
>endif
> +  util_ss.add(files('yank.c'))
>  endif
>
>  if have_block or have_ga
> --
> 2.41.0
>

Re: [PATCH-for-9.1 4/7] util/qemu-config: Extract QMP commands to qemu-config-qmp.c

On Thu, Apr 4, 2024 at 9:48 PM Philippe Mathieu-Daudé  wrote:
>
> QMP is irrelevant for user emulation. Extract the code
> related to QMP in a different source file, which won't
> be built for user emulation binaries. This avoids pulling in
> pointless code.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/qemu/config-file.h |   3 +
>  util/qemu-config-qmp.c | 206 +

This should go under monitor/.

Queued all except patch 2, please resubmit that and send it to
qemu-bl...@nongnu.org.

>  util/qemu-config.c | 204 +---
>  util/meson.build   |   1 +
>  4 files changed, 212 insertions(+), 202 deletions(-)
>  create mode 100644 util/qemu-config-qmp.c
>
> diff --git a/include/qemu/config-file.h b/include/qemu/config-file.h
> index b82a778123..8b9d6df173 100644
> --- a/include/qemu/config-file.h
> +++ b/include/qemu/config-file.h
> @@ -8,6 +8,9 @@ QemuOptsList *qemu_find_opts(const char *group);
>  QemuOptsList *qemu_find_opts_err(const char *group, Error **errp);
>  QemuOpts *qemu_find_opts_singleton(const char *group);
>
> +extern QemuOptsList *vm_config_groups[48];
> +extern QemuOptsList *drive_config_groups[5];
> +
>  void qemu_add_opts(QemuOptsList *list);
>  void qemu_add_drive_opts(QemuOptsList *list);
>  int qemu_global_option(const char *str);
> diff --git a/util/qemu-config-qmp.c b/util/qemu-config-qmp.c
> new file mode 100644
> index 00..24477a0e44
> --- /dev/null
> +++ b/util/qemu-config-qmp.c
> @@ -0,0 +1,206 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qapi/qapi-commands-misc.h"
> +#include "qapi/qmp/qlist.h"
> +#include "qemu/option.h"
> +#include "qemu/config-file.h"
> +#include "hw/boards.h"
> +
> +static CommandLineParameterInfoList *query_option_descs(const QemuOptDesc 
> *desc)
> +{
> +CommandLineParameterInfoList *param_list = NULL;
> +CommandLineParameterInfo *info;
> +int i;
> +
> +for (i = 0; desc[i].name != NULL; i++) {
> +info = g_malloc0(sizeof(*info));
> +info->name = g_strdup(desc[i].name);
> +
> +switch (desc[i].type) {
> +case QEMU_OPT_STRING:
> +info->type = COMMAND_LINE_PARAMETER_TYPE_STRING;
> +break;
> +case QEMU_OPT_BOOL:
> +info->type = COMMAND_LINE_PARAMETER_TYPE_BOOLEAN;
> +break;
> +case QEMU_OPT_NUMBER:
> +info->type = COMMAND_LINE_PARAMETER_TYPE_NUMBER;
> +break;
> +case QEMU_OPT_SIZE:
> +info->type = COMMAND_LINE_PARAMETER_TYPE_SIZE;
> +break;
> +}
> +
> +info->help = g_strdup(desc[i].help);
> +info->q_default = g_strdup(desc[i].def_value_str);
> +
> +QAPI_LIST_PREPEND(param_list, info);
> +}
> +
> +return param_list;
> +}
> +
> +/* remove repeated entry from the info list */
> +static void cleanup_infolist(CommandLineParameterInfoList *head)
> +{
> +CommandLineParameterInfoList *pre_entry, *cur, *del_entry;
> +
> +cur = head;
> +while (cur->next) {
> +pre_entry = head;
> +while (pre_entry != cur->next) {
> +if (!strcmp(pre_entry->value->name, cur->next->value->name)) {
> +del_entry = cur->next;
> +cur->next = cur->next->next;
> +del_entry->next = NULL;
> +qapi_free_CommandLineParameterInfoList(del_entry);
> +break;
> +}
> +pre_entry = pre_entry->next;
> +}
> +cur = cur->next;
> +}
> +}
> +
> +/* merge the description items of two parameter infolists */
> +static void connect_infolist(CommandLineParameterInfoList *head,
> + CommandLineParameterInfoList *new)
> +{
> +CommandLineParameterInfoList *cur;
> +
> +cur = head;
> +while (cur->next) {
> +cur = cur->next;
> +}
> +cur->next = new;
> +}
> +
> +/* access all the local QemuOptsLists for drive option */
> +static CommandLineParameterInfoList *get_drive_infolist(void)
> +{
> +CommandLineParameterInfoList *head = NULL, *cur;
> +int i;
> +
> +for (i = 0; drive_config_groups[i] != NULL; i++) {
> +if (!head) {
> +head = query_option_descs(drive_config_groups[i]->desc);
> +} else {
> +cur = query_option_descs(drive_config_groups[i]->desc);
> +connect_infolist(head, cur);
> +}
> +}
> +cleanup_infolist(head);
> +
> +return head;
> +}
> +
> +static CommandLineParameterInfo *objprop_to_cmdline_prop(ObjectProperty 
> *prop)
> +{
> +CommandLineParameterInfo *info;
> +
> +info = g_malloc0(sizeof(*info));
> +info->name = g_strdup(prop->name);
> +
> +if (g_str_equal(prop->type, "bool") || g_str_equal(prop->type, 
> "OnOffAuto")) {
> +info->type = COMMAND_LINE_PARAMETER_TYPE_BOOLEAN;
> +} else if (g_str_equal(prop->type, "int")) {

Re: [PATCH v2] sh4: mac.w: implement saturation arithmetic logic

2024-04-08 Thread Yoshinori Sato

On Sat, 06 Apr 2024 08:38:04 +0900,
Zack Buhman wrote:
> 
> The saturation arithmetic logic in helper_macw is not correct.
> 
> I tested and verified this behavior on a SH7091, the general pattern
> is a code sequence such as:
> 
>   sets
> 
>   mov.l _mach,r2
>   lds r2,mach
>   mov.l _macl,r2
>   lds r2,macl
> 
>   mova _n,r0
>   mov r0,r1
>   mova _m,r0
>   mac.w @r0+,@r1+
> 
>  _mach: .long 0xffffffff
>  _macl: .long 0xfffffffe
>  _m:.word 0x0002
> .word 0
>  _n:.word 0x0003
> .word 0
> 
> test 0:
>   (mach should not be modified if an overflow did not occur)
> 
>   given, prior to saturation mac.l:
> mach = 0xffffffff ; macl = 0xfffffffe
> @r0  = 0x0002 ; @r1  = 0x0003
> 
>   expected saturation mac.w result:
> mach = 0xffffffff (unchanged)
> macl = 0x00000004
> 
>   qemu saturation mac.w result (before this commit):
> mach = 0x00000001
> macl = 0x80000000
> 
>   In the context of the helper_macw implementation prior to this
>   commit, initially this appears to be a surprising result. This is
>   because (prior to unary negation) the C literal `0x80000000` (due to
>   being outside the range of a `signed int`) is evaluated as an
>   `unsigned int` whereas the literal `1` (due to being inside the
>   range of `signed int`) is evaluated as `signed int`, as in:
> 
> static_assert(1 < -0x80000000 == 1);
> static_assert(1 < -1 == 0);
> 
>   This is because the unary negation of an unsigned int is an
>   unsigned int.
> 
>   In other words, if the `res < -0x80000000` comparison used
>   infinite-precision literals, the saturation mac.w result would have
>   been:
> 
> mach = 0x00000000
> macl = 0x00000004
> 
>   Due to this (forgivable) misunderstanding of C literals, the
>   following behavior also occurs:
> 
> test 1:
>   (`2 * 3 + 0` is not an overflow)
> 
>   given, prior to saturation mac.l:
> mach = 0x00000000 ; macl = 0x00000000
> @r0  = 0x0002 ; @r1  = 0x0003
> 
>   expected saturation mac.w result:
> mach = 0x00000000 (unchanged)
> macl = 0x00000006
> 
>   qemu saturation mac.w result (before this commit):
> mach = 0x00000001
> macl = 0x80000000
> 
> test 2:
>   (mach should not be accumulated in saturation mode)
>   (16-bit operands are sign-extended)
> 
>   given, prior to saturation mac.l:
> mach = 0x12345678 ; macl = 0x7ffffffe
> @r0  = 0x0002 ; @r1  = 0xfffd
> 
>   expected saturation mac.w result:
> mach = 0x12345678 (unchanged)
> macl = 0x7ffffff8
> 
>   qemu saturation mac.w result (before this commit):
> mach = 0x00000001
> macl = 0x7fffffff
> 
> test 3:
>   (macl should have the correct saturation value)
> 
>   given, prior to saturation mac.l:
> mach = 0xabcdef12 ; macl = 0x7ffffffa
> @r0  = 0x0002 ; @r1  = 0x0003
> 
>   expected saturation mac.w result:
> mach = 0x00000001 (overwritten)
> macl = 0x7fffffff
> 
>   qemu saturation mac.w result (before this commit):
> mach = 0x00000001
> macl = 0x80000000
> 
> All of the above also matches the description of MAC.W as documented
> in cd00147165-sh-4-32-bit-cpu-core-architecture-stmicroelectronics.pdf
> 
> Signed-off-by: Zack Buhman 
> ---
>  target/sh4/op_helper.c | 45 --
>  1 file changed, 35 insertions(+), 10 deletions(-)
> 
> diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
> index ee16524083..07ff2cf53d 100644
> --- a/target/sh4/op_helper.c
> +++ b/target/sh4/op_helper.c
> @@ -187,20 +187,45 @@ void helper_macl(CPUSH4State *env, uint32_t arg0, 
> uint32_t arg1)
>  
>  void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
>  {
> -int64_t res;
> +int16_t value0 = (int16_t)arg0;
> +int16_t value1 = (int16_t)arg1;
> +int32_t mul = ((int32_t)value0) * ((int32_t)value1);
>  
> -res = ((uint64_t) env->mach << 32) | env->macl;
> -res += (int64_t) (int16_t) arg0 *(int64_t) (int16_t) arg1;
> -env->mach = (res >> 32) & 0x;
> -env->macl = res & 0x;
> +/* Perform 32-bit saturation arithmetic if the S flag is set */
>  if (env->sr & (1u << SR_S)) {
> -if (res < -0x8000) {
> -env->mach = 1;
> -env->macl = 0x8000;
> -} else if (res > 0x7fff) {
> +const int32_t upper_bound =  ((1u << 31) - 1);
> +const int32_t lower_bound = -((1u << 31) - 0);
> +
> +/*
> + * In saturation arithmetic mode, the accumulator is 32-bit
> + * with carry. MACH is not considered during the addition
> + * operation nor the 32-bit saturation logic.
> + */
> +int32_t mac = env->macl;
> +int32_t result;
> +bool overflow = sadd32_overflow(mac, mul, &result);
> +if (overflow) {
> +result = (mac < 0) ? lower_bound : upper_bound;
> +/* MACH is set to 1 to denote overflow */
> +env->macl = result;
>

Re: [PATCH 30/32] target/rx: Use translator_ld*

2024-04-08 Thread Yoshinori Sato

On Fri, 05 Apr 2024 19:24:57 +0900,
Richard Henderson wrote:
> 
> Cc: Yoshinori Sato 
> Signed-off-by: Richard Henderson 
> ---
>  target/rx/translate.c | 27 ++-
>  1 file changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/target/rx/translate.c b/target/rx/translate.c
> index 92fb2b43ad..9b81cf20b3 100644
> --- a/target/rx/translate.c
> +++ b/target/rx/translate.c
> @@ -22,7 +22,6 @@
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "tcg/tcg-op.h"
> -#include "exec/cpu_ldst.h"
>  #include "exec/helper-proto.h"
>  #include "exec/helper-gen.h"
>  #include "exec/translator.h"
> @@ -75,10 +74,10 @@ static TCGv_i64 cpu_acc;
>  
>  /* decoder helper */
>  static uint32_t decode_load_bytes(DisasContext *ctx, uint32_t insn,
> -   int i, int n)
> +  int i, int n)
>  {
>  while (++i <= n) {
> -uint8_t b = cpu_ldub_code(ctx->env, ctx->base.pc_next++);
> +uint8_t b = translator_ldub(ctx->env, &ctx->base, 
> ctx->base.pc_next++);
>  insn |= b << (32 - i * 8);
>  }
>  return insn;
> @@ -90,22 +89,24 @@ static uint32_t li(DisasContext *ctx, int sz)
>  CPURXState *env = ctx->env;
>  addr = ctx->base.pc_next;
>  
> -tcg_debug_assert(sz < 4);
>  switch (sz) {
>  case 1:
>  ctx->base.pc_next += 1;
> -return cpu_ldsb_code(env, addr);
> +return (int8_t)translator_ldub(env, &ctx->base, addr);
>  case 2:
>  ctx->base.pc_next += 2;
> -return cpu_ldsw_code(env, addr);
> +return (int16_t)translator_lduw(env, &ctx->base, addr);
>  case 3:
>  ctx->base.pc_next += 3;
> -tmp = cpu_ldsb_code(env, addr + 2) << 16;
> -tmp |= cpu_lduw_code(env, addr) & 0x;
> +tmp = (int8_t)translator_ldub(env, &ctx->base, addr + 2);
> +tmp <<= 16;
> +tmp |= translator_lduw(env, &ctx->base, addr);
>  return tmp;
>  case 0:
>  ctx->base.pc_next += 4;
> -return cpu_ldl_code(env, addr);
> +return translator_ldl(env, &ctx->base, addr);
> +default:
> +g_assert_not_reached();
>  }
>  return 0;
>  }
> @@ -190,22 +191,22 @@ static inline TCGv rx_index_addr(DisasContext *ctx, 
> TCGv mem,
>  {
>  uint32_t dsp;
>  
> -tcg_debug_assert(ld < 3);
>  switch (ld) {
>  case 0:
>  return cpu_regs[reg];
>  case 1:
> -dsp = cpu_ldub_code(ctx->env, ctx->base.pc_next) << size;
> +dsp = translator_ldub(ctx->env, &ctx->base, ctx->base.pc_next) << 
> size;
>  tcg_gen_addi_i32(mem, cpu_regs[reg], dsp);
>  ctx->base.pc_next += 1;
>  return mem;
>  case 2:
> -dsp = cpu_lduw_code(ctx->env, ctx->base.pc_next) << size;
> +dsp = translator_lduw(ctx->env, &ctx->base, ctx->base.pc_next) << 
> size;
>  tcg_gen_addi_i32(mem, cpu_regs[reg], dsp);
>  ctx->base.pc_next += 2;
>  return mem;
> +default:
> +g_assert_not_reached();
>  }
> -return NULL;
>  }
>  
>  static inline MemOp mi_to_mop(unsigned mi)
> -- 
> 2.34.1
> 

Reviewed-by: Yoshinori Sato 

-- 
Yosinori Sato

Re: [PATCH] target/sh4: add missing CHECK_NOT_DELAY_SLOT

2024-04-08 Thread Yoshinori Sato

On Mon, 08 Apr 2024 00:07:05 +0900,
Zack Buhman wrote:
> 
> CHECK_NOT_DELAY_SLOT is correctly applied to the branch-related
> instructions, but not to the PC-relative mov* instructions.
> 
> I verified the existence of an illegal slot exception on a SH7091 when
> any of these instructions are attempted inside a delay slot.
> 
> This also matches the behavior described in the SH-4 ISA manual.
> 
> Signed-off-by: Zack Buhman 
> ---
>  target/sh4/translate.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/target/sh4/translate.c b/target/sh4/translate.c
> index 6643c14dde..ebb6c901bf 100644
> --- a/target/sh4/translate.c
> +++ b/target/sh4/translate.c
> @@ -523,6 +523,7 @@ static void _decode_opc(DisasContext * ctx)
>  tcg_gen_movi_i32(REG(B11_8), B7_0s);
>  return;
>  case 0x9000: /* mov.w @(disp,PC),Rn */
> +CHECK_NOT_DELAY_SLOT
>  {
>  TCGv addr = tcg_constant_i32(ctx->base.pc_next + 4 + B7_0 * 2);
>  tcg_gen_qemu_ld_i32(REG(B11_8), addr, ctx->memidx,
> @@ -530,6 +531,7 @@ static void _decode_opc(DisasContext * ctx)
>  }
>  return;
>  case 0xd000: /* mov.l @(disp,PC),Rn */
> +CHECK_NOT_DELAY_SLOT
>  {
>  TCGv addr = tcg_constant_i32((ctx->base.pc_next + 4 + B7_0 * 4) 
> & ~3);
>  tcg_gen_qemu_ld_i32(REG(B11_8), addr, ctx->memidx,
> @@ -1236,6 +1238,7 @@ static void _decode_opc(DisasContext * ctx)
>  }
>  return;
>  case 0xc700: /* mova @(disp,PC),R0 */
> +CHECK_NOT_DELAY_SLOT
>  tcg_gen_movi_i32(REG(0), ((ctx->base.pc_next & 0xfffc) +
>4 + B7_0 * 4) & ~3);
>  return;
> -- 
> 2.41.0
>

That's what the documentation said.
> If a PC-relative load instruction is executed in a delay slot,
> an illegal slot instruction exception will be generated.

Reviewed-by: Yoshinori Sato 

-- 
Yosinori Sato

Re: [PATCH] hcd-ohci: Fix inconsistency when resetting ohci root hubs


On 30/8/22 05:30, Qiang Liu wrote:

I found an assertion failure in usb_cancel_packet() and posted my analysis in
https://gitlab.com/qemu-project/qemu/-/issues/1180. I think this issue is
caused by an inconsistency when resetting ohci root hubs.

There are two ways to reset ohci root hubs: 1) through HcRhPortStatus, 2)
through HcControl. However, when the packet's status is USB_PACKET_ASYNC,
resetting through HcRhPortStatus will complete the packet and thus resetting
through HcControl will fail. That is because IMO resetting through
HcRhPortStatus should first detach the port and then invoke usb_device_reset()
just like through HcControl. Therefore, I change usb_device_reset() to
usb_port_reset() where usb_detach() and usb_device_reset() are invoked
consecutively.

Fixes: d28f4e2d8631 ("usb: kill USB_MSG_RESET")
Reported-by: Qiang Liu 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1180
Signed-off-by: Qiang Liu 
---
  hw/usb/hcd-ohci.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index 895b29fb86..72df917834 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -1426,7 +1426,7 @@ static void ohci_port_set_status(OHCIState *ohci, int 
portnum, uint32_t val)
  
  if (ohci_port_set_if_connected(ohci, portnum, val & OHCI_PORT_PRS)) {

  trace_usb_ohci_port_reset(portnum);
-usb_device_reset(port->port.dev);
+usb_port_reset(&port->port);
  port->ctrl &= ~OHCI_PORT_PRS;
  /* ??? Should this also set OHCI_PORT_PESC.  */
  port->ctrl |= OHCI_PORT_PES | OHCI_PORT_PRSC;


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH] target/i386: fix direction of "32-bit MMU" test


On 5/4/24 19:30, Michael Tokarev wrote:

01.04.2024 09:02, Michael Tokarev:

Can anyone guess why this rather trivial and obviously correct patch
causes segfaults in a few tests in staging-7.2 - when run in tcg mode, namely:

   pxe-test
   migration-test
   boot-serial-test
   bios-tables-test
   vmgenid-test
   cdrom-test

When reverting this single commit from staging-7.2, it all works fine 
again.


It sigsegvs in probe_access_internal():

   CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr); -- this one 
returns NULL,


and next there's a call

   tlb_addr = tlb_read_ofs(entry, elt_ofs);

which fails.

#0  0x55c5de8a in tlb_read_ofs (ofs=8, entry=0x0) at 
7.2/accel/tcg/cputlb.c:1455

#1  probe_access_internal
     (env=0x56a862a0, addr=4294967280, 
fault_size=fault_size@entry=1, 
access_type=access_type@entry=MMU_INST_FETCH, mmu_idx=5, 
nonfault=nonfault@entry=false, phost=0x7fffea4d32a0, 
pfull=0x7fffea4d3298, retaddr=0)

     at 7.2/accel/tcg/cputlb.c:1555
#2  0x55c62aba in get_page_addr_code_hostp
     (env=, addr=addr@entry=4294967280, 
hostp=hostp@entry=0x0)

     at 7.2/accel/tcg/cputlb.c:1691
#3  0x55c52b54 in get_page_addr_code (addr=4294967280, 
env=)

     at 7.2/include/exec/exec-all.h:714
#4  tb_htable_lookup
     (cpu=cpu@entry=0x56a85530, pc=pc@entry=4294967280, 
cs_base=cs_base@entry=4294901760, flags=flags@entry=64, 
cflags=cflags@entry=4278190080) at 7.2/accel/tcg/cpu-exec.c:236

#5  0x55c53e8e in tb_lookup
     (cflags=4278190080, flags=64, cs_base=4294901760, pc=4294967280, 
cpu=0x56a85530)

     at 7.2/accel/tcg/cpu-exec.c:270
#6  cpu_exec (cpu=cpu@entry=0x56a85530) at 
7.2/accel/tcg/cpu-exec.c:1001

#7  0x55c75d2f in tcg_cpus_exec (cpu=cpu@entry=0x56a85530)
     at 7.2/accel/tcg/tcg-accel-ops.c:69
#8  0x55c75e80 in mttcg_cpu_thread_fn 
(arg=arg@entry=0x56a85530)

     at 7.2/accel/tcg/tcg-accel-ops-mttcg.c:95
#9  0x55ded098 in qemu_thread_start (args=0x56adac40)
     at 7.2/util/qemu-thread-posix.c:505
#10 0x75793134 in start_thread (arg=)
#11 0x758137dc in clone3 ()


I'm removing this whole set from 7.2 for now:

  2cc68629a6fc target/i386: fix direction of "32-bit MMU" test
  90f641531c78 target/i386: use separate MMU indexes for 32-bit accesses
  5f97afe2543f target/i386: introduce function to query MMU indices


Cc'ing Giuseppe Ghibò for
https://gitlab.com/qemu-project/qemu/-/issues/2264


This leaves us with

  b1661801c184 "target/i386: Fix physical address truncation"

but without its fix, 2cc68629a6fc.

It looks like I should revert b1661801c184 from 7.2 too, re-opening
https://gitlab.com/qemu-project/qemu/-/issues/2040 - since to me it isn't
clear if this change actually fixes this issue or not without the
previous change, 90f641531c78, which is missing from 7.2.10.

At the very least this will simplify another possible attempt to
cherry-pick these changes to 7.2.

Thanks,

/mjt

[PATCH] Revert "hw/virtio: Add support for VDPA network simulation devices"

This reverts commit cd341fd1ffded978b2aa0b5309b00be7c42e347c.

The patch adds non-upstream code in
include/standard-headers/linux/virtio_pci.h
which would make maintenance harder.

Revert for now.

Suggested-by: Jason Wang 
Signed-off-by: Michael S. Tsirkin 
---
 include/hw/virtio/virtio-pci.h  |   5 -
 include/hw/virtio/virtio.h  |  19 --
 include/standard-headers/linux/virtio_pci.h |   7 -
 hw/net/virtio-net.c |  16 --
 hw/virtio/virtio-pci.c  | 189 +---
 hw/virtio/virtio.c  |  39 
 MAINTAINERS |   5 -
 docs/system/device-emulation.rst|   1 -
 docs/system/devices/vdpa-net.rst| 121 -
 9 files changed, 3 insertions(+), 399 deletions(-)
 delete mode 100644 docs/system/devices/vdpa-net.rst

diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 4d57a9c751..59d88018c1 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -43,7 +43,6 @@ enum {
 VIRTIO_PCI_FLAG_INIT_FLR_BIT,
 VIRTIO_PCI_FLAG_AER_BIT,
 VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
-VIRTIO_PCI_FLAG_VDPA_BIT,
 };
 
 /* Need to activate work-arounds for buggy guests at vmstate load. */
@@ -90,9 +89,6 @@ enum {
 #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
   (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
 
-/* VDPA supported flags */
-#define VIRTIO_PCI_FLAG_VDPA (1 << VIRTIO_PCI_FLAG_VDPA_BIT)
-
 typedef struct {
 MSIMessage msg;
 int virq;
@@ -144,7 +140,6 @@ struct VirtIOPCIProxy {
 };
 VirtIOPCIRegion regs[5];
 };
-VirtIOPCIRegion lm;
 MemoryRegion modern_bar;
 MemoryRegion io_bar;
 uint32_t legacy_io_bar_idx;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index b3c74a1bca..c8f72850bc 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -35,9 +35,6 @@
 (0x1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
 (0x1ULL << VIRTIO_F_ANY_LAYOUT))
 
-#define LM_DISABLE  0x00
-#define LM_ENABLE   0x01
-
 struct VirtQueue;
 
 static inline hwaddr vring_align(hwaddr addr,
@@ -98,11 +95,6 @@ enum virtio_device_endian {
 VIRTIO_DEVICE_ENDIAN_BIG,
 };
 
-typedef struct BitmapMemoryRegionCaches {
-struct rcu_head rcu;
-MemoryRegionCache bitmap;
-} BitmapMemoryRegionCaches;
-
 /**
  * struct VirtIODevice - common VirtIO structure
  * @name: name of the device
@@ -136,14 +128,6 @@ struct VirtIODevice
 uint32_t generation;
 int nvectors;
 VirtQueue *vq;
-uint8_t lm_logging_ctrl;
-uint32_t lm_base_addr_low;
-uint32_t lm_base_addr_high;
-uint32_t lm_end_addr_low;
-uint32_t lm_end_addr_high;
-
-BitmapMemoryRegionCaches *caches;
-
 MemoryListener listener;
 uint16_t device_id;
 /* @vm_running: current VM running state via virtio_vmstate_change() */
@@ -395,11 +379,8 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int 
n);
 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n);
 hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n);
 unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n);
-unsigned int virtio_queue_get_vring_states(VirtIODevice *vdev, int n);
 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n,
  unsigned int idx);
-void virtio_queue_set_vring_states(VirtIODevice *vdev, int n,
-   unsigned int idx);
 void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n);
 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n);
 void virtio_queue_update_used_idx(VirtIODevice *vdev, int n);
diff --git a/include/standard-headers/linux/virtio_pci.h 
b/include/standard-headers/linux/virtio_pci.h
index 86733278ba..3e2bc2c97e 100644
--- a/include/standard-headers/linux/virtio_pci.h
+++ b/include/standard-headers/linux/virtio_pci.h
@@ -221,13 +221,6 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_ADM_Q_IDX60
 #define VIRTIO_PCI_COMMON_ADM_Q_NUM62
 
-#define LM_LOGGING_CTRL 0
-#define LM_BASE_ADDR_LOW4
-#define LM_BASE_ADDR_HIGH   8
-#define LM_END_ADDR_LOW 12
-#define LM_END_ADDR_HIGH16
-#define LM_VRING_STATE_OFFSET   0x20
-
 #endif /* VIRTIO_PCI_NO_MODERN */
 
 /* Admin command status. */
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 58014a92ad..24e5e7d347 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -2039,22 +2039,6 @@ static ssize_t virtio_net_receive_rcu(NetClientState 
*nc, const uint8_t *buf,
 goto err;
 }
 
-/* Mark dirty page's bitmap of guest memory */
-if (vdev->lm_logging_ctrl == LM_ENABLE) {
-uint64_t chunk = elem->in_addr[i] / VHOST_LOG_CHUNK;
-/* Get chunk index */
-

Re: [PATCH-for-9.1 3/7] monitor: Rework stubs to simplify user emulation linking

On Thu, Apr 4, 2024 at 9:48 PM Philippe Mathieu-Daudé  wrote:
>
> Currently monitor stubs are scattered in 3 files.
>
> Merge these stubs in 2 files, a generic one (monitor-core)
> included in all builds (in particular user emulation), and
> a less generic one to be included by tools and system emulation.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  stubs/fdset.c| 17 -

Oops, merging fdset.c breaks storage-daemon linking.

But it is not needed, all I need is

diff --git a/stubs/meson.build b/stubs/meson.build
index 0bf25e6ca53..67cf80aa846 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -10,7 +10,6 @@ stub_ss.add(files('qemu-timer-notify-cb.c'))
 stub_ss.add(files('icount.c'))
 stub_ss.add(files('dump.c'))
 stub_ss.add(files('error-printf.c'))
-stub_ss.add(files('fdset.c'))
 stub_ss.add(files('gdbstub.c'))
 stub_ss.add(files('get-vm-name.c'))
 stub_ss.add(files('graph-lock.c'))
@@ -28,7 +27,10 @@ if libaio.found()
 endif
 stub_ss.add(files('migr-blocker.c'))
 stub_ss.add(files('module-opts.c'))
-stub_ss.add(files('monitor.c'))
+if have_system or have_tools
+  stub_ss.add(files('monitor.c'))
+  stub_ss.add(files('fdset.c'))
+endif
 stub_ss.add(files('monitor-core.c'))
 stub_ss.add(files('physmem.c'))
 stub_ss.add(files('qemu-timer-notify-cb.c'))


Paolo

>  stubs/monitor-core.c | 20 +++-
>  stubs/monitor.c  |  8 ++--
>  stubs/meson.build|  5 +++--
>  4 files changed, 24 insertions(+), 26 deletions(-)
>  delete mode 100644 stubs/fdset.c
>
> diff --git a/stubs/fdset.c b/stubs/fdset.c
> deleted file mode 100644
> index 56b3663d58..00
> --- a/stubs/fdset.c
> +++ /dev/null
> @@ -1,17 +0,0 @@
> -#include "qemu/osdep.h"
> -#include "monitor/monitor.h"
> -
> -int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
> -{
> -errno = ENOSYS;
> -return -1;
> -}
> -
> -int64_t monitor_fdset_dup_fd_find(int dup_fd)
> -{
> -return -1;
> -}
> -
> -void monitor_fdset_dup_fd_remove(int dupfd)
> -{
> -}
> diff --git a/stubs/monitor-core.c b/stubs/monitor-core.c
> index afa477aae6..72e40bcc15 100644
> --- a/stubs/monitor-core.c
> +++ b/stubs/monitor-core.c
> @@ -1,6 +1,7 @@
> +/* Monitor stub required for user emulation */
>  #include "qemu/osdep.h"
>  #include "monitor/monitor.h"
> -#include "qapi/qapi-emit-events.h"
> +#include "../monitor/monitor-internal.h"
>
>  Monitor *monitor_cur(void)
>  {
> @@ -12,11 +13,22 @@ Monitor *monitor_set_cur(Coroutine *co, Monitor *mon)
>  return NULL;
>  }
>
> -void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp)
> +int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
> +{
> +errno = ENOSYS;
> +return -1;
> +}
> +
> +int64_t monitor_fdset_dup_fd_find(int dup_fd)
> +{
> +return -1;
> +}
> +
> +void monitor_fdset_dup_fd_remove(int dupfd)
>  {
>  }
>
> -void qapi_event_emit(QAPIEvent event, QDict *qdict)
> +void monitor_fdsets_cleanup(void)
>  {
>  }
>
> @@ -24,5 +36,3 @@ int monitor_vprintf(Monitor *mon, const char *fmt, va_list 
> ap)
>  {
>  abort();
>  }
> -
> -
> diff --git a/stubs/monitor.c b/stubs/monitor.c
> index 20786ac4ff..2fc4dc1493 100644
> --- a/stubs/monitor.c
> +++ b/stubs/monitor.c
> @@ -1,7 +1,7 @@
>  #include "qemu/osdep.h"
>  #include "qapi/error.h"
> +#include "qapi/qapi-emit-events.h"
>  #include "monitor/monitor.h"
> -#include "../monitor/monitor-internal.h"
>
>  int monitor_get_fd(Monitor *mon, const char *name, Error **errp)
>  {
> @@ -13,6 +13,10 @@ void monitor_init_hmp(Chardev *chr, bool use_readline, 
> Error **errp)
>  {
>  }
>
> -void monitor_fdsets_cleanup(void)
> +void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp)
> +{
> +}
> +
> +void qapi_event_emit(QAPIEvent event, QDict *qdict)
>  {
>  }
> diff --git a/stubs/meson.build b/stubs/meson.build
> index 0bf25e6ca5..ca1bc07d30 100644
> --- a/stubs/meson.build
> +++ b/stubs/meson.build
> @@ -10,7 +10,6 @@ stub_ss.add(files('qemu-timer-notify-cb.c'))
>  stub_ss.add(files('icount.c'))
>  stub_ss.add(files('dump.c'))
>  stub_ss.add(files('error-printf.c'))
> -stub_ss.add(files('fdset.c'))
>  stub_ss.add(files('gdbstub.c'))
>  stub_ss.add(files('get-vm-name.c'))
>  stub_ss.add(files('graph-lock.c'))
> @@ -28,7 +27,9 @@ if libaio.found()
>  endif
>  stub_ss.add(files('migr-blocker.c'))
>  stub_ss.add(files('module-opts.c'))
> -stub_ss.add(files('monitor.c'))
> +if have_system or have_tools
> +  stub_ss.add(files('monitor.c'))
> +endif
>  stub_ss.add(files('monitor-core.c'))
>  stub_ss.add(files('physmem.c'))
>  stub_ss.add(files('qemu-timer-notify-cb.c'))
> --
> 2.41.0
>

Re: [PATCH] Revert "hw/virtio: Add support for VDPA network simulation devices"

On Mon, 8 Apr 2024 at 10:48, Michael S. Tsirkin  wrote:
>
> This reverts commit cd341fd1ffded978b2aa0b5309b00be7c42e347c.
>
> The patch adds non-upstream code in
> include/standard-headers/linux/virtio_pci.h
> which would make maintenance harder.
>
> Revert for now.
>
> Suggested-by: Jason Wang 
> Signed-off-by: Michael S. Tsirkin 

Are you intending to target this revert for 9.0 ?

-- PMM

[PATCH-for-9.0? 1/2] hw/misc/applesmc: Do not call DeviceReset() from DeviceRealize()

The QDev core layer always calls DeviceReset() after DeviceRealize(),
so there is no need to do it manually. Remove the extra call.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/misc/applesmc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
index 72300d0cbc..8e65816da6 100644
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -342,7 +342,6 @@ static void applesmc_isa_realize(DeviceState *dev, Error 
**errp)
 }
 
 QLIST_INIT(&s->data_def);
-qdev_applesmc_isa_reset(dev);
 }
 
 static Property applesmc_isa_properties[] = {
-- 
2.41.0

[PATCH-for-9.0? 2/2] hw/misc/applesmc: Fix memory leak in reset() handler

AppleSMCData is allocated with g_new0() in applesmc_add_key():
release it with g_free().

Leaked since commit 1ddda5cd36 ("AppleSMC device emulation").

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2272
Reported-by: Zheyu Ma 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/misc/applesmc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
index 8e65816da6..14e3ef667d 100644
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -274,6 +274,7 @@ static void qdev_applesmc_isa_reset(DeviceState *dev)
 /* Remove existing entries */
 QLIST_FOREACH_SAFE(d, &s->data_def, node, next) {
 QLIST_REMOVE(d, node);
+g_free(d);
 }
 s->status = 0x00;
 s->status_1e = 0x00;
-- 
2.41.0

[PATCH-for-9.0? 0/2] hw/misc/applesmc: Fix memory leak

Fix for https://gitlab.com/qemu-project/qemu/-/issues/2272

Philippe Mathieu-Daudé (2):
  hw/misc/applesmc: Do not call DeviceReset() from DeviceRealize()
  hw/misc/applesmc: Fix memory leak in reset() handler

 hw/misc/applesmc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.41.0

Re: [PATCH] Revert "hw/virtio: Add support for VDPA network simulation devices"

2024-04-08 Thread Cornelia Huck

On Mon, Apr 08 2024, "Michael S. Tsirkin"  wrote:

> This reverts commit cd341fd1ffded978b2aa0b5309b00be7c42e347c.
>
> The patch adds non-upstream code in
> include/standard-headers/linux/virtio_pci.h
> which would make maintenance harder.
>
> Revert for now.
>
> Suggested-by: Jason Wang 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  include/hw/virtio/virtio-pci.h  |   5 -
>  include/hw/virtio/virtio.h  |  19 --
>  include/standard-headers/linux/virtio_pci.h |   7 -
>  hw/net/virtio-net.c |  16 --
>  hw/virtio/virtio-pci.c  | 189 +---
>  hw/virtio/virtio.c  |  39 
>  MAINTAINERS |   5 -
>  docs/system/device-emulation.rst|   1 -
>  docs/system/devices/vdpa-net.rst| 121 -
>  9 files changed, 3 insertions(+), 399 deletions(-)
>  delete mode 100644 docs/system/devices/vdpa-net.rst

Acked-by: Cornelia Huck 

We should get rid of this before we release a version with non-upstream
header code.

Re: [PATCH-for-9.0 3/4] hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs

On Mon, Apr 08, 2024 at 09:14:39AM +0200, Philippe Mathieu-Daudé wrote:
> On 4/4/24 21:13, Philippe Mathieu-Daudé wrote:
> > Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
> > so the bus and device use the same guard. Otherwise the
> > DMA-reentrancy protection can be bypassed.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Suggested-by: Alexander Bulekov 
> > Signed-off-by: Philippe Mathieu-Daudé 
> > ---
> >   hw/char/virtio-serial-bus.c | 3 +--
> >   1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
> > index 016aba6374..cd0e3a11f7 100644
> > --- a/hw/char/virtio-serial-bus.c
> > +++ b/hw/char/virtio-serial-bus.c
> > @@ -985,8 +985,7 @@ static void virtser_port_device_realize(DeviceState 
> > *dev, Error **errp)
> >   return;
> >   }
> > -port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
> > -   &dev->mem_reentrancy_guard);
> > +port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);
> 
> Missing:
> -- >8 --
> -port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);
> +port->bh = virtio_bh_new_guarded(VIRTIO_DEVICE(dev),
> + flush_queued_data_bh, port);
> ---

I don't get it. vdev is already the correct type. Why do you need
VIRTIO_DEVICE here?

> >   port->elem = NULL;
> >   }

Re: [PATCH-for-9.1 5/7] hw/core: Restrict reset handlers API to system emulation

On Thu, Apr 4, 2024 at 9:48 PM Philippe Mathieu-Daudé  wrote:
>
> Headers in include/sysemu/ are specific to system
> emulation and should not be used in user emulation.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/core/reset.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/hw/core/reset.c b/hw/core/reset.c
> index d50da7e304..167c8bf1a9 100644
> --- a/hw/core/reset.c
> +++ b/hw/core/reset.c
> @@ -24,7 +24,9 @@
>   */
>
>  #include "qemu/osdep.h"
> +#ifndef CONFIG_USER_ONLY
>  #include "sysemu/reset.h"
> +#endif
>  #include "hw/resettable.h"
>  #include "hw/core/resetcontainer.h"
>
> @@ -43,6 +45,7 @@ static ResettableContainer *get_root_reset_container(void)
>  return root_reset_container;
>  }
>
> +#ifndef CONFIG_USER_ONLY

Wait, this does not make sense. The only thing left in the file is a
single static function, which contradicts the other patch's commit
message "reset.c contains core code used by any CPU,".

Let me rework these two patches so that reset.c, qdev-hotplug.c and
hotplug.c can be moved to system_ss. I'll post a v2 shortly.

Paolo

>  /*
>   * Reason why the currently in-progress qemu_devices_reset() was called.
>   * If we made at least SHUTDOWN_CAUSE_SNAPSHOT_LOAD have a corresponding
> @@ -185,3 +188,4 @@ void qemu_devices_reset(ShutdownCause reason)
>  /* Reset the simulation */
>  resettable_reset(OBJECT(get_root_reset_container()), RESET_TYPE_COLD);
>  }
> +#endif
> --
> 2.41.0
>

Re: [PATCH 1/2] virtio-net: Fix vhost virtqueue notifiers for RSS

On Tue, Mar 26, 2024 at 07:06:29PM +0900, Akihiko Odaki wrote:
> virtio_net_guest_notifier_pending() and virtio_net_guest_notifier_mask()
> checked VIRTIO_NET_F_MQ to know there are multiple queues, but
> VIRTIO_NET_F_RSS also enables multiple queues. Refer to n->multiqueue,
> which is set to true if either VIRTIO_NET_F_MQ or VIRTIO_NET_F_RSS is
> enabled.
> 
> Fixes: 68b0a6395f36 ("virtio-net: align ctrl_vq index for non-mq guest for 
> vhost_vdpa")
> Signed-off-by: Akihiko Odaki 

Reviewed-by: Michael S. Tsirkin 

Jason, are you merging this?

> ---
>  hw/net/virtio-net.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 9959f1932b1b..a6ff000cd9d3 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -3426,7 +3426,7 @@ static bool 
> virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
>  VirtIONet *n = VIRTIO_NET(vdev);
>  NetClientState *nc;
>  assert(n->vhost_started);
> -if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
> +if (!n->multiqueue && idx == 2) {
>  /* Must guard against invalid features and bogus queue index
>   * from being set by malicious guest, or penetrated through
>   * buggy migration stream.
> @@ -3458,7 +3458,7 @@ static void virtio_net_guest_notifier_mask(VirtIODevice 
> *vdev, int idx,
>  VirtIONet *n = VIRTIO_NET(vdev);
>  NetClientState *nc;
>  assert(n->vhost_started);
> -if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
> +if (!n->multiqueue && idx == 2) {
>  /* Must guard against invalid features and bogus queue index
>   * from being set by malicious guest, or penetrated through
>   * buggy migration stream.
> 
> -- 
> 2.44.0

Re: [PATCH] Revert "hw/virtio: Add support for VDPA network simulation devices"

On Mon, Apr 08, 2024 at 10:51:57AM +0100, Peter Maydell wrote:
> On Mon, 8 Apr 2024 at 10:48, Michael S. Tsirkin  wrote:
> >
> > This reverts commit cd341fd1ffded978b2aa0b5309b00be7c42e347c.
> >
> > The patch adds non-upstream code in
> > include/standard-headers/linux/virtio_pci.h
> > > which would make maintenance harder.
> >
> > Revert for now.
> >
> > Suggested-by: Jason Wang 
> > Signed-off-by: Michael S. Tsirkin 
> 
> Are you intending to target this revert for 9.0 ?
> 
> -- PMM

Yes.

[PATCH-for-9.0?] hw/net/smc91c111: Fix out of bounds access in packets buffer

While the Packet Number Register is 6-bit wide and could hold
up to 64 packets [*] our implementation is clamped at 4 packets.

Reproducer:

  $ cat << EOF | qemu-system-arm -display none \
 -machine mainstone,accel=qtest \
 -qtest stdio
  outl 0xcf8 0x8010
  outl 0xcfc 0x1300
  outl 0xcf8 0x8004
  outl 0xcfc 0x07
  writel 0x130c 0x66027cd6
  writel 0x1300 0x64af8eda
  readw 0x1308
  EOF
  hw/net/smc91c111.c:607:24: runtime error:
  index 175 out of bounds for type 'uint8_t[4][2048]' (aka 'unsigned 
char[4][2048]')
  SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
  =
  ==397944==ERROR: AddressSanitizer: SEGV on unknown address 0x62977db4
  (pc 0x56272aed3b8d bp 0x7ffd1471f290 sp 0x7ffd1471ea20 T0)
  ==397944==The signal is caused by a READ memory access.
      #0 0x56272aed3b8d in smc91c111_readb hw/net/smc91c111.c:607:24
      #1 0x56272aecfd61 in smc91c111_readfn hw/net/smc91c111.c:650:16
      #2 0x56272d4b228b in memory_region_read_accessor system/memory.c:445:11
      #3 0x56272d46fb85 in access_with_adjusted_size system/memory.c:573:18
      #4 0x56272d46c58e in memory_region_dispatch_read1 system/memory.c:1426:16
      #5 0x56272d46bcd7 in memory_region_dispatch_read system/memory.c:1459:9
      #6 0x56272d4e8e03 in flatview_read_continue_step system/physmem.c:2794:18
      #7 0x56272d4e871e in flatview_read_continue system/physmem.c:2835:19
      #8 0x56272d4e98b8 in flatview_read system/physmem.c:2865:12
      #9 0x56272d4e9388 in address_space_read_full system/physmem.c:2878:18
      #10 0x56272d6e7840 in address_space_read include/exec/memory.h:3026:18
  ...

Broken since model introduction in commit 80337b66a8.

[*] LAN91C111 DS2276A.pdf, chapter 8.17, Packet Number Register

Reported-by: Will Lester
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2268
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/net/smc91c111.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/smc91c111.c b/hw/net/smc91c111.c
index 702d0e8e83..286298bf06 100644
--- a/hw/net/smc91c111.c
+++ b/hw/net/smc91c111.c
@@ -429,7 +429,7 @@ static void smc91c111_writeb(void *opaque, hwaddr offset,
 /* Ignore.  */
 return;
 case 2: /* Packet Number Register */
-s->packet_num = value;
+s->packet_num = value & (NUM_PACKETS - 1);
 return;
 case 3: case 4: case 5:
 /* Should be readonly, but linux writes to them anyway. Ignore.  */
-- 
2.41.0

Re: [PATCH-for-9.0?] hw/net/smc91c111: Fix out of bounds access in packets buffer


On 8/4/24 12:27, Philippe Mathieu-Daudé wrote:

While the Packet Number Register is 6-bit wide and could hold
up to 64 packets [*] our implementation is clamped at 4 packets.

Reproducer:

   $ cat << EOF | qemu-system-arm -display none \
  -machine mainstone,accel=qtest \
  -qtest stdio
   outl 0xcf8 0x8010
   outl 0xcfc 0x1300
   outl 0xcf8 0x8004
   outl 0xcfc 0x07
   writel 0x130c 0x66027cd6
   writel 0x1300 0x64af8eda
   readw 0x1308
   EOF
   hw/net/smc91c111.c:607:24: runtime error:
   index 175 out of bounds for type 'uint8_t[4][2048]' (aka 'unsigned 
char[4][2048]')
   SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
   =
   ==397944==ERROR: AddressSanitizer: SEGV on unknown address 0x62977db4
   (pc 0x56272aed3b8d bp 0x7ffd1471f290 sp 0x7ffd1471ea20 T0)
   ==397944==The signal is caused by a READ memory access.
       #0 0x56272aed3b8d in smc91c111_readb hw/net/smc91c111.c:607:24
       #1 0x56272aecfd61 in smc91c111_readfn hw/net/smc91c111.c:650:16
       #2 0x56272d4b228b in memory_region_read_accessor system/memory.c:445:11
       #3 0x56272d46fb85 in access_with_adjusted_size system/memory.c:573:18
       #4 0x56272d46c58e in memory_region_dispatch_read1 system/memory.c:1426:16
       #5 0x56272d46bcd7 in memory_region_dispatch_read system/memory.c:1459:9
       #6 0x56272d4e8e03 in flatview_read_continue_step system/physmem.c:2794:18
       #7 0x56272d4e871e in flatview_read_continue system/physmem.c:2835:19
       #8 0x56272d4e98b8 in flatview_read system/physmem.c:2865:12
       #9 0x56272d4e9388 in address_space_read_full system/physmem.c:2878:18
       #10 0x56272d6e7840 in address_space_read include/exec/memory.h:3026:18
   ...

Broken since model introduction in commit 80337b66a8.

[*] LAN91C111 DS2276A.pdf, chapter 8.17, Packet Number Register

Reported-by: Will Lester


Apparently also
Reported-by: Chuhong Yuan 


Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2268
Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/net/smc91c111.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/smc91c111.c b/hw/net/smc91c111.c
index 702d0e8e83..286298bf06 100644
--- a/hw/net/smc91c111.c
+++ b/hw/net/smc91c111.c
@@ -429,7 +429,7 @@ static void smc91c111_writeb(void *opaque, hwaddr offset,
  /* Ignore.  */
  return;
  case 2: /* Packet Number Register */
-s->packet_num = value;
+s->packet_num = value & (NUM_PACKETS - 1);
  return;
  case 3: case 4: case 5:
  /* Should be readonly, but linux writes to them anyway. Ignore.  
*/

Re: [PATCH-for-9.0? 1/2] hw/misc/applesmc: Do not call DeviceReset() from DeviceRealize()

On Mon, 8 Apr 2024 at 10:52, Philippe Mathieu-Daudé  wrote:
>
> The QDev core layer always calls DeviceReset() after DeviceRealize(),
> so there is no need to do it manually. Remove the extra call.
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/misc/applesmc.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
> index 72300d0cbc..8e65816da6 100644
> --- a/hw/misc/applesmc.c
> +++ b/hw/misc/applesmc.c
> @@ -342,7 +342,6 @@ static void applesmc_isa_realize(DeviceState *dev, Error 
> **errp)
>  }
>
>  QLIST_INIT(&s->data_def);
> -qdev_applesmc_isa_reset(dev);
>  }
>
>  static Property applesmc_isa_properties[] = {
> --

Reviewed-by: Peter Maydell 

thanks
-- PMM

Re: [PATCH-for-9.0? 2/2] hw/misc/applesmc: Fix memory leak in reset() handler

On Mon, 8 Apr 2024 at 10:53, Philippe Mathieu-Daudé  wrote:
>
> AppleSMCData is allocated with g_new0() in applesmc_add_key():
> release it with g_free().
>
> Leaked since commit 1ddda5cd36 ("AppleSMC device emulation").
>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2272
> Reported-by: Zheyu Ma 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/misc/applesmc.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
> index 8e65816da6..14e3ef667d 100644
> --- a/hw/misc/applesmc.c
> +++ b/hw/misc/applesmc.c
> @@ -274,6 +274,7 @@ static void qdev_applesmc_isa_reset(DeviceState *dev)
>  /* Remove existing entries */
>  QLIST_FOREACH_SAFE(d, &s->data_def, node, next) {
>  QLIST_REMOVE(d, node);
> +g_free(d);
>  }
>  s->status = 0x00;
>  s->status_1e = 0x00;
> --

Cc stable?

This is the right minimal fix for the leak, so
Reviewed-by: Peter Maydell 

but overall this is a bit odd. We don't change either the
keys or their values at runtime, they seem to be a fixed
set defined by the device properties, so why are we tearing
them down and readding them every reset? It would be
simpler to create the data structure once at device realize.

thanks
-- PMM

[PATCH-for-9.0? 0/2] hw/net/lan9118: Fix overflow in TX FIFO

Fix for https://gitlab.com/qemu-project/qemu/-/issues/2267

Philippe Mathieu-Daudé (2):
  hw/net/lan9118: Replace magic '2048' value by 'PKT_SIZE' definition
  hw/net/lan9118: Fix overflow in TX FIFO

 hw/net/lan9118.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

-- 
2.41.0

[PATCH-for-9.0? 1/2] hw/net/lan9118: Replace magic '2048' value by 'PKT_SIZE' definition

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/net/lan9118.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c
index 47ff25b441..7be0430ac5 100644
--- a/hw/net/lan9118.c
+++ b/hw/net/lan9118.c
@@ -150,6 +150,8 @@ do { printf("lan9118: " fmt , ## __VA_ARGS__); } while (0)
 
 #define GPT_TIMER_EN0x2000
 
+#define PKT_SIZE2048
+
 enum tx_state {
 TX_IDLE,
 TX_B,
@@ -166,7 +168,7 @@ typedef struct {
 int32_t pad;
 int32_t fifo_used;
 int32_t len;
-uint8_t data[2048];
+uint8_t data[PKT_SIZE];
 } LAN9118Packet;
 
 static const VMStateDescription vmstate_lan9118_packet = {
@@ -182,7 +184,7 @@ static const VMStateDescription vmstate_lan9118_packet = {
 VMSTATE_INT32(pad, LAN9118Packet),
 VMSTATE_INT32(fifo_used, LAN9118Packet),
 VMSTATE_INT32(len, LAN9118Packet),
-VMSTATE_UINT8_ARRAY(data, LAN9118Packet, 2048),
+VMSTATE_UINT8_ARRAY(data, LAN9118Packet, PKT_SIZE),
 VMSTATE_END_OF_LIST()
 }
 };
@@ -544,7 +546,7 @@ static ssize_t lan9118_receive(NetClientState *nc, const 
uint8_t *buf,
 return -1;
 }
 
-if (size >= 2048 || size < 14) {
+if (size >= PKT_SIZE || size < 14) {
 return -1;
 }
 
-- 
2.41.0

[PATCH-for-9.0? 2/2] hw/net/lan9118: Fix overflow in TX FIFO

When the TX FIFO is full, raise the TX Status FIFO Overflow (TXSO)
flag, "Generated when the TX Status FIFO overflows" [*].

Broken since model introduction in commit 2a42499017
("LAN9118 emulation").

When using the reproducer from
https://gitlab.com/qemu-project/qemu/-/issues/2267 we get:

  hw/net/lan9118.c:798:17: runtime error:
  index 2048 out of bounds for type 'uint8_t[2048]' (aka 'unsigned char[2048]')
    #0 0x563ec9a057b1 in tx_fifo_push hw/net/lan9118.c:798:43
    #1 0x563ec99fbb28 in lan9118_writel hw/net/lan9118.c:1042:9
    #2 0x563ec99f2de2 in lan9118_16bit_mode_write hw/net/lan9118.c:1205:9
    #3 0x563ecbf78013 in memory_region_write_accessor system/memory.c:497:5
    #4 0x563ecbf776f5 in access_with_adjusted_size system/memory.c:573:18
    #5 0x563ecbf75643 in memory_region_dispatch_write system/memory.c:1521:16
    #6 0x563ecc01bade in flatview_write_continue_step system/physmem.c:2713:18
    #7 0x563ecc01b374 in flatview_write_continue system/physmem.c:2743:19
    #8 0x563ecbff1c9b in flatview_write system/physmem.c:2774:12
    #9 0x563ecbff1768 in address_space_write system/physmem.c:2894:18
...

[*] LAN9118 DS2266B.pdf, Table 5.3.3 "INTERRUPT STATUS REGISTER"

Reported-by: Will Lester
Reported-by: Chuhong Yuan 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2267
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/net/lan9118.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c
index 7be0430ac5..7a1367b0bb 100644
--- a/hw/net/lan9118.c
+++ b/hw/net/lan9118.c
@@ -795,8 +795,11 @@ static void tx_fifo_push(lan9118_state *s, uint32_t val)
 /* Documentation is somewhat unclear on the ordering of bytes
in FIFO words.  Empirical results show it to be little-endian.
*/
-/* TODO: FIFO overflow checking.  */
 while (n--) {
+if (s->txp->len == PKT_SIZE) {
+s->int_sts |= TXSO_INT;
+break;
+}
 s->txp->data[s->txp->len] = val & 0xff;
 s->txp->len++;
 val >>= 8;
-- 
2.41.0

Re: [PATCH-for-9.1 3/7] monitor: Rework stubs to simplify user emulation linking


On 8/4/24 11:50, Paolo Bonzini wrote:

On Thu, Apr 4, 2024 at 9:48 PM Philippe Mathieu-Daudé  wrote:


Currently monitor stubs are scattered in 3 files.

Merge these stubs in 2 files, a generic one (monitor-core)
included in all builds (in particular user emulation), and
a less generic one to be included by tools and system emulation.

Signed-off-by: Philippe Mathieu-Daudé 
---
  stubs/fdset.c| 17 -


Oops, merging fdset.c breaks storage-daemon linking.


Odd. My 'tools_only' build directory is configured with:

'--enable-tools' '--disable-docs' '--disable-system' '--disable-user'

I was building qemu-img & co. I just checked it isn't anymore,
the directory only contains libqemuutil.a, libqom.fa,
libevent-loop-base.fa and tests/unit (I'm on macOS host).

This explains why I missed the link failure. I'll dig to
see when these tools disappeared on macOS.

Regards,

Phil.

Re: [PATCH-for-9.0 3/4] hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs


On 8/4/24 12:08, Michael S. Tsirkin wrote:

On Mon, Apr 08, 2024 at 09:14:39AM +0200, Philippe Mathieu-Daudé wrote:

On 4/4/24 21:13, Philippe Mathieu-Daudé wrote:

Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
so the bus and device use the same guard. Otherwise the
DMA-reentrancy protection can be bypassed.

Cc: qemu-sta...@nongnu.org
Suggested-by: Alexander Bulekov 
Signed-off-by: Philippe Mathieu-Daudé 
---
   hw/char/virtio-serial-bus.c | 3 +--
   1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 016aba6374..cd0e3a11f7 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -985,8 +985,7 @@ static void virtser_port_device_realize(DeviceState *dev, 
Error **errp)
   return;
   }
-port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
-   &dev->mem_reentrancy_guard);
+port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);


Missing:
-- >8 --
-port->bh = virtio_bh_new_guarded(vdev, flush_queued_data_bh, port);
+port->bh = virtio_bh_new_guarded(VIRTIO_DEVICE(dev),
+ flush_queued_data_bh, port);
---


I don't get it. vdev is already the correct type. Why do you need
VIRTIO_DEVICE here?


This function doesn't declare vdev.




   port->elem = NULL;
   }

[PATCH] kvm: error out of kvm_irqchip_add_msi_route() in case of full route table

2024-04-08 Thread Igor Mammedov

kvm_irqchip_add_msi_route() calls kvm_add_routing_entry(), which simply extends
  KVMState::irq_routes::entries[]
but doesn't check whether the number of routes goes beyond the limit the kernel
is willing to accept. This later leads to the assert

  qemu-kvm: ../accel/kvm/kvm-all.c:1833: kvm_irqchip_commit_routes: Assertion 
`ret == 0' failed

Typically this happens during guest boot for a large enough guest.

Reproduced with:
  ./qemu --enable-kvm -m 8G -smp 64 -machine pc \
 `for b in {1..2}; do echo -n "-device pci-bridge,id=pci$b,chassis_nr=$b ";
for i in {0..31}; do touch /tmp/vblk$b$i;
   echo -n "-drive file=/tmp/vblk$b$i,if=none,id=drive$b$i,format=raw
-device virtio-blk-pci,drive=drive$b$i,bus=pci$b ";
  done; done`

While a crash at boot time is bad, the same might happen at hotplug time,
which is unacceptable.
So instead of calling kvm_add_routing_entry() unconditionally, check first
that the number of routes won't exceed KVM_CAP_IRQ_ROUTING. This way a virtio
device, instead of killing QEMU, will gracefully fail to initialize
as expected, with the following warnings on the console:
virtio-blk failed to set guest notifier (-28), ensure -accel kvm is set.
virtio_bus_start_ioeventfd: failed. Fallback to userspace (slower).

Signed-off-by: Igor Mammedov 
---
 accel/kvm/kvm-all.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index d4d57da265..10fae1db05 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2001,12 +2001,17 @@ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int 
vector, PCIDevice *dev)
 return -EINVAL;
 }
 
-trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
-vector, virq);
+if (s->irq_routes->nr < s->gsi_count) {
+trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
+vector, virq);
 
-kvm_add_routing_entry(s, &kroute);
-kvm_arch_add_msi_route_post(&kroute, vector, dev);
-c->changes++;
+kvm_add_routing_entry(s, &kroute);
+kvm_arch_add_msi_route_post(&kroute, vector, dev);
+c->changes++;
+} else {
+kvm_irqchip_release_virq(s, virq);
+return -ENOSPC;
+}
 
 return virq;
 }
-- 
2.43.0

Re: [PATCH-for-9.0? 2/2] hw/misc/applesmc: Fix memory leak in reset() handler


On 8/4/24 12:34, Peter Maydell wrote:

On Mon, 8 Apr 2024 at 10:53, Philippe Mathieu-Daudé  wrote:


AppleSMCData is allocated with g_new0() in applesmc_add_key():
release it with g_free().

Leaked since commit 1ddda5cd36 ("AppleSMC device emulation").

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2272
Reported-by: Zheyu Ma 
Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/misc/applesmc.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
index 8e65816da6..14e3ef667d 100644
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -274,6 +274,7 @@ static void qdev_applesmc_isa_reset(DeviceState *dev)
  /* Remove existing entries */
  QLIST_FOREACH_SAFE(d, &s->data_def, node, next) {
  QLIST_REMOVE(d, node);
+g_free(d);
  }
  s->status = 0x00;
  s->status_1e = 0x00;
--


Cc stable?

This is the right minimal fix for the leak, so
Reviewed-by: Peter Maydell 

but overall this is a bit odd. We don't change either the
keys or their values at runtime, they seem to be a fixed
set defined by the device properties, so why are we tearing
them down and readding them every reset? It would be
simpler to create the data structure once at device realize.


This was my first approach, moving the applesmc_add_key()
calls to the realize() handler, and freeing them in a
unrealize() one:

-- >8 --
diff --git a/hw/misc/applesmc.c b/hw/misc/applesmc.c
index 14e3ef667d..59a4899312 100644
--- a/hw/misc/applesmc.c
+++ b/hw/misc/applesmc.c
@@ -145,7 +145,7 @@ static void applesmc_io_cmd_write(void *opaque, 
hwaddr addr, uint64_t val,

 s->data_pos = 0;
 }

-static struct AppleSMCData *applesmc_find_key(AppleSMCState *s)
+static const struct AppleSMCData *applesmc_find_key(AppleSMCState *s)
 {
 struct AppleSMCData *d;

@@ -161,7 +161,7 @@ static void applesmc_io_data_write(void *opaque, 
hwaddr addr, uint64_t val,

unsigned size)
 {
 AppleSMCState *s = opaque;
-struct AppleSMCData *d;
+const struct AppleSMCData *d;

 smc_debug("DATA received: 0x%02x\n", (uint8_t)val);
 switch (s->cmd) {
@@ -269,23 +269,10 @@ static void applesmc_add_key(AppleSMCState *s, 
const char *key,

 static void qdev_applesmc_isa_reset(DeviceState *dev)
 {
 AppleSMCState *s = APPLE_SMC(dev);
-struct AppleSMCData *d, *next;

-/* Remove existing entries */
-QLIST_FOREACH_SAFE(d, &s->data_def, node, next) {
-QLIST_REMOVE(d, node);
-g_free(d);
-}
 s->status = 0x00;
 s->status_1e = 0x00;
 s->last_ret = 0x00;
-
-applesmc_add_key(s, "REV ", 6, "\x01\x13\x0f\x00\x00\x03");
-applesmc_add_key(s, "OSK0", 32, s->osk);
-applesmc_add_key(s, "OSK1", 32, s->osk + 32);
-applesmc_add_key(s, "NATJ", 1, "\0");
-applesmc_add_key(s, "MSSP", 1, "\0");
-applesmc_add_key(s, "MSSD", 1, "\0x3");
 }

 static const MemoryRegionOps applesmc_data_io_ops = {
@@ -343,6 +330,24 @@ static void applesmc_isa_realize(DeviceState *dev, 
Error **errp)

 }

 QLIST_INIT(&s->data_def);
+applesmc_add_key(s, "REV ", 6, "\x01\x13\x0f\x00\x00\x03");
+applesmc_add_key(s, "OSK0", 32, s->osk);
+applesmc_add_key(s, "OSK1", 32, s->osk + 32);
+applesmc_add_key(s, "NATJ", 1, "\0");
+applesmc_add_key(s, "MSSP", 1, "\0");
+applesmc_add_key(s, "MSSD", 1, "\0x3");
+}
+
+static void applesmc_unrealize(DeviceState *dev)
+{
+AppleSMCState *s = APPLE_SMC(dev);
+struct AppleSMCData *d, *next;
+
+/* Remove existing entries */
+QLIST_FOREACH_SAFE(d, &s->data_def, node, next) {
+QLIST_REMOVE(d, node);
+g_free(d);
+}
 }

 static Property applesmc_isa_properties[] = {
@@ -377,6 +382,7 @@ static void qdev_applesmc_class_init(ObjectClass 
*klass, void *data)

 AcpiDevAmlIfClass *adevc = ACPI_DEV_AML_IF_CLASS(klass);

 dc->realize = applesmc_isa_realize;
+dc->unrealize = applesmc_unrealize;
 dc->reset = qdev_applesmc_isa_reset;
 device_class_set_props(dc, applesmc_isa_properties);
 set_bit(DEVICE_CATEGORY_MISC, dc->categories);

---

But since a bit too much changes for the next release, I kept
it as a separate patch.

Thanks for the review,

Phil.

How to use pxb-pcie in correct way?

2024-04-08 Thread Marcin Juszkiewicz

For quite a while I have been experimenting with PCI Express setup on SBSA-Ref 
system, and I finally decided to write.


We want to play with NUMA setup and "pxb-pcie" can be assigned to NUMA 
node other than cpu0 one. But adding it makes other cards disappear...


When I boot sbsa-ref I have plain PCIe setup:

(qemu) info pci
  Bus  0, device   0, function 0:
Host bridge: PCI device 1b36:0008
  PCI subsystem 1af4:1100
  id ""
  Bus  0, device   1, function 0:
Ethernet controller: PCI device 8086:10d3
  PCI subsystem 8086:
  IRQ 255, pin A
  BAR0: 32 bit memory at 0x [0x0001fffe].
  BAR1: 32 bit memory at 0x [0x0001fffe].
  BAR2: I/O at 0x [0x001e].
  BAR3: 32 bit memory at 0x [0x3ffe].
  BAR6: 32 bit memory at 0x [0x0003fffe].
  id ""
  Bus  0, device   2, function 0:
Display controller: PCI device 1234:
  PCI subsystem 1af4:1100
  BAR0: 32 bit prefetchable memory at 0x8000 [0x80ff].
  BAR2: 32 bit memory at 0x81084000 [0x81084fff].
  BAR6: 32 bit memory at 0x [0x7ffe].
  id ""

Adding extra PCIe card works fine - both just "igb" and "igb" with 
"pcie-root-port".


But adding "pcie-root-port" + "igb" and then "pxb-pcie" makes "igb" 
disappear:


../code/qemu/build/qemu-system-aarch64
-monitor telnet::45454,server,nowait
-serial stdio
-device pcie-root-port,id=ULyWl,slot=0,chassis=0
-device igb,bus=ULyWl
-device pxb-pcie,bus_nr=1

(qemu) info pci
  Bus  0, device   0, function 0:
Host bridge: PCI device 1b36:0008
  PCI subsystem 1af4:1100
  id ""
  Bus  0, device   1, function 0:
Ethernet controller: PCI device 8086:10d3
  PCI subsystem 8086:
  IRQ 255, pin A
  BAR0: 32 bit memory at 0x [0x0001fffe].
  BAR1: 32 bit memory at 0x [0x0001fffe].
  BAR2: I/O at 0x [0x001e].
  BAR3: 32 bit memory at 0x [0x3ffe].
  BAR6: 32 bit memory at 0x [0x0003fffe].
  id ""
  Bus  0, device   2, function 0:
Display controller: PCI device 1234:
  PCI subsystem 1af4:1100
  BAR0: 32 bit prefetchable memory at 0x8000 [0x80ff].
  BAR2: 32 bit memory at 0x81085000 [0x81085fff].
  BAR6: 32 bit memory at 0x [0x7ffe].
  id ""
  Bus  0, device   3, function 0:
PCI bridge: PCI device 1b36:000c
  IRQ 255, pin A
  BUS 0.
  secondary bus 1.
  subordinate bus 1.
  IO range [0xf000, 0x0fff]
  memory range [0xfff0, 0x000f]
  prefetchable memory range [0xfff0, 0x000f]
  BAR0: 32 bit memory at 0x81084000 [0x81084fff].
  id "ULyWl"
  Bus  0, device   4, function 0:
Host bridge: PCI device 1b36:000b
  PCI subsystem 1af4:1100
  id ""


If I add "igb" directly (without root port) then it appears correctly:

(qemu) info pci
  Bus  0, device   0, function 0:
Host bridge: PCI device 1b36:0008
  PCI subsystem 1af4:1100
  id ""
  Bus  0, device   1, function 0:
Ethernet controller: PCI device 8086:10d3
  PCI subsystem 8086:
  IRQ 255, pin A
  BAR0: 32 bit memory at 0x [0x0001fffe].
  BAR1: 32 bit memory at 0x [0x0001fffe].
  BAR2: I/O at 0x [0x001e].
  BAR3: 32 bit memory at 0x [0x3ffe].
  BAR6: 32 bit memory at 0x [0x0003fffe].
  id ""
  Bus  0, device   2, function 0:
Display controller: PCI device 1234:
  PCI subsystem 1af4:1100
  BAR0: 32 bit prefetchable memory at 0x8000 [0x80ff].
  BAR2: 32 bit memory at 0x810c4000 [0x810c4fff].
  BAR6: 32 bit memory at 0x [0x7ffe].
  id ""
  Bus  0, device   3, function 0:
Ethernet controller: PCI device 8086:10c9
  PCI subsystem 1af4:1100
  IRQ 255, pin A
  BAR0: 32 bit memory at 0x [0x0001fffe].
  BAR1: 32 bit memory at 0x [0x0001fffe].
  BAR2: I/O at 0x [0x001e].
  BAR3: 64 bit memory at 0x [0x3ffe].
  id ""
  Bus  0, device   4, function 0:
Host bridge: PCI device 1b36:000b
  PCI subsystem 1af4:1100
  id ""


When I add "pcie-root-port" with "igb" followed by "pcie-root-port" and 
"pxb-pcie" then no IGB again:



-device pcie-root-port,id=RjKXs,slot=0,chassis=0
-device igb,bus=RjKXs
-device pcie-root-port,chassis=7
-device pxb-pcie,bus_nr=1


(qemu) info pci
  Bus  0, device   0, function 0:
Host bridge: PCI device 1b36:0008
  PCI subsystem 1af4:1100
  id ""
  Bus  0, device   1, function 0:
Ethernet controller: PCI device 8086:10d3
  PCI subsystem 8086:
  IRQ 255, pin A
  BAR0: 32 bit memory at 0x [0x0001fffe].
  BAR1: 32 bit memory at 0x [0x0001fffe].
  BAR2: I/O at 0x [0x001e].

[PATCH RESEND 2/2] scsi-disk: Fix crash of VMs configured with the CDROM device

2024-04-08 Thread Hyman Huang

When configuring VMs with the CDROM device using the USB bus
in Libvirt, do as follows:


  
  
  
  
  



The destination Qemu process crashed, causing the VM migration
to fail; the backtrace reveals the following:

Program terminated with signal SIGSEGV, Segmentation fault.
0  __memmove_sse2_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:312
312movq-8(%rsi,%rdx), %rcx
[Current thread is 1 (Thread 0x7f0a9025fc00 (LWP 3286206))]
(gdb) bt
0  __memmove_sse2_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:312
1  memcpy (__len=8, __src=, __dest=) at 
/usr/include/bits/string_fortified.h:34
2  iov_from_buf_full (iov=, iov_cnt=, 
offset=, buf=0x0, bytes=bytes@entry=8) at ../util/iov.c:33
3  iov_from_buf (bytes=8, buf=, offset=, 
iov_cnt=, iov=)
   at 
/usr/src/debug/qemu-6-6.2.0-75.7.oe1.smartx.git.40.x86_64/include/qemu/iov.h:49
4  usb_packet_copy (p=p@entry=0x56066b2fb5a0, ptr=, 
bytes=bytes@entry=8) at ../hw/usb/core.c:636
5  usb_msd_copy_data (s=s@entry=0x56066c62c770, p=p@entry=0x56066b2fb5a0) at 
../hw/usb/dev-storage.c:186
6  usb_msd_handle_data (dev=0x56066c62c770, p=0x56066b2fb5a0) at 
../hw/usb/dev-storage.c:496
7  usb_handle_packet (dev=0x56066c62c770, p=p@entry=0x56066b2fb5a0) at 
../hw/usb/core.c:455
8  uhci_handle_td (s=s@entry=0x56066bd5f210, q=0x56066bb7fbd0, q@entry=0x0, 
qh_addr=qh_addr@entry=902518530, td=td@entry=0x7fffe6e788f0, td_addr=,
   int_mask=int_mask@entry=0x7fffe6e788e4) at ../hw/usb/hcd-uhci.c:885
9  uhci_process_frame (s=s@entry=0x56066bd5f210) at ../hw/usb/hcd-uhci.c:1061
10 uhci_frame_timer (opaque=opaque@entry=0x56066bd5f210) at 
../hw/usb/hcd-uhci.c:1159
11 timerlist_run_timers (timer_list=0x56066af26bd0) at ../util/qemu-timer.c:642
12 qemu_clock_run_timers (type=QEMU_CLOCK_VIRTUAL) at ../util/qemu-timer.c:656
13 qemu_clock_run_all_timers () at ../util/qemu-timer.c:738
14 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:542
15 qemu_main_loop () at ../softmmu/runstate.c:739
16 main (argc=, argv=, envp=) at 
../softmmu/main.c:52
(gdb) frame 5
(gdb) p ((SCSIDiskReq *)s->req)->iov
$1 = {iov_base = 0x0, iov_len = 0}
(gdb) p/x s->req->tag
$2 = 0x472

The scsi commands that the CDROM issued are wrapped as the
payload of the USB protocol in Qemu's implementation of a
USB mass storage device, which is used to implement a
CDROM device that uses a USB bus.

In general, the USB controller processes SCSI commands in
two phases. Sending the OUT USB package that encapsulates
the SCSI command is the first stage; scsi-disk would handle
this by emulating the SCSI operation. Receiving the IN USB
package containing the SCSI operation's output is the second
stage. Additionally, the SCSI request tag tracks the request
during the procedure.

Since QEMU did not migrate the flying SCSI request, the
output of the SCSI may be lost if the live migration is
initiated between the two previously mentioned steps.

In our scenario, the SCSI command is GET_EVENT_STATUS_NOTIFICATION,
the QEMU log information below demonstrates how the SCSI command
is being handled (first step) on the source:

usb_packet_state_change bus 0, port 2, ep 2, packet 0x559f9ba14b00, state undef 
-> setup
usb_msd_cmd_submit lun 0, tag 0x472, flags 0x0080, len 10, data-len 8

After migration, the VM crashed as soon as the destination's UHCI
controller began processing the remaining portion of the SCSI
request (second step)! Here is what QEMU logged:

usb_packet_state_change bus 0, port 2, ep 1, packet 0x56066b2fb5a0, state undef 
-> setup
usb_msd_data_in 8/8 (scsi 8)
shutting down, reason=crashed

To summarize, the missing scsi request during a live migration
may cause a VM configured with a CDROM to crash.

Migrating the SCSI request that the scsi-disk is handling is
the simple approach, assuming that it actually exists.

Signed-off-by: Hyman Huang 
---
 hw/scsi/scsi-disk.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 0985676f73..d6e9d9e8d4 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -160,6 +160,16 @@ static void scsi_disk_save_request(QEMUFile *f, 
SCSIRequest *req)
 }
 }
 
+static void scsi_disk_emulate_save_request(QEMUFile *f, SCSIRequest *req)
+{
+SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
+SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+if (s->migrate_emulate_scsi_request) {
+scsi_disk_save_request(f, req);
+}
+}
+
 static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req)
 {
 SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
@@ -183,6 +193,16 @@ static void scsi_disk_load_request(QEMUFile *f, 
SCSIRequest *req)
 qemu_iovec_init_external(&r->qiov, &r->iov, 1);
 }
 
+static void scsi_disk_emulate_load_request(QEMUFile *f, SCSIRequest *req)
+{
+SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
+SCSIDiskState *s = DO_UPCAST(SCSIDiskSt

[PATCH RESEND 0/2] Fix crash of VMs configured with the CDROM device

2024-04-08 Thread Hyman Huang

This patchset fixes the crash of VMs configured with the CDROM device
on the destination during live migration. See the commit message for
details.

The previous patchset does not show up at https://patchew.org/QEMU.
Just resend it to ensure the email gets to the inbox.

Please review.

Yong

Hyman Huang (2):
  scsi-disk: Introduce the migrate_emulate_scsi_request field
  scsi-disk: Fix crash of VMs configured with the CDROM device

 hw/scsi/scsi-disk.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

-- 
2.39.3

[PATCH RESEND 1/2] scsi-disk: Introduce the migrate_emulate_scsi_request field

2024-04-08 Thread Hyman Huang

To indicate to the destination whether or not emulated SCSI
requests are sent, introduce the migrate_emulate_scsi_request
in struct SCSIDiskState. It seeks to achieve migration backward
compatibility.

This commit sets the stage for the next one, which addresses
the crash of a VM configured with a CDROM during live migration.

Signed-off-by: Hyman Huang 
---
 hw/scsi/scsi-disk.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 4bd7af9d0c..0985676f73 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -111,6 +111,7 @@ struct SCSIDiskState {
  * 0x- reserved
  */
 uint16_t rotation_rate;
+bool migrate_emulate_scsi_request;
 };
 
 static void scsi_free_request(SCSIRequest *req)
@@ -3133,11 +3134,21 @@ static Property scsi_hd_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static int scsi_disk_pre_save(void *opaque)
+{
+SCSIDiskState *dev = opaque;
+dev->migrate_emulate_scsi_request = false;
+
+return 0;
+}
+
 static const VMStateDescription vmstate_scsi_disk_state = {
 .name = "scsi-disk",
-.version_id = 1,
+.version_id = 2,
 .minimum_version_id = 1,
+.pre_save = scsi_disk_pre_save,
 .fields = (const VMStateField[]) {
+VMSTATE_BOOL_V(migrate_emulate_scsi_request, SCSIDiskState, 2),
 VMSTATE_SCSI_DEVICE(qdev, SCSIDiskState),
 VMSTATE_BOOL(media_changed, SCSIDiskState),
 VMSTATE_BOOL(media_event, SCSIDiskState),
-- 
2.39.3

Re: [PATCH-for-9.0] hw/sd/sdhci: Discard excess of data written to Buffer Data Port register

On Thu, 4 Apr 2024 at 09:56, Philippe Mathieu-Daudé  wrote:
>
> Per "SD Host Controller Standard Specification Version 3.00":
>
>   * 1.7 Buffer Control
>
>   - 1.7.1 Control of Buffer Pointer
>
> (3) Buffer Control with Block Size
>
> In case of write operation, the buffer accumulates the data
> written through the Buffer Data Port register. When the buffer
> pointer reaches the block size, Buffer Write Enable in the
> Present State register changes 1 to 0. It means no more data
> can be written to the buffer. Excess data of the last write is
> ignored. For example, if just lower 2 bytes data can be written
> to the buffer and a 32-bit (4-byte) block of data is written to
> the Buffer Data Port register, the lower 2 bytes of data is
> written to the buffer and the upper 2 bytes is ignored.
>
> Discard the excess of data to avoid overflow reported by fuzzer:
>
>   $ cat << EOF | qemu-system-i386 \
>  -display none -nodefaults \
>  -machine accel=qtest -m 512M \
>  -device sdhci-pci,sd-spec-version=3 \
>  -device sd-card,drive=mydrive \
>  -drive 
> if=none,index=0,file=null-co://,format=raw,id=mydrive -nographic \
>  -qtest stdio
>   outl 0xcf8 0x80001013
>   outl 0xcfc 0x91
>   outl 0xcf8 0x80001001
>   outl 0xcfc 0x0600
>   write 0x912c 0x1 0x05
>   write 0x9158 0x1 0x16
>   write 0x9105 0x1 0x04
>   write 0x9128 0x1 0x08
>   write 0x16 0x1 0x21
>   write 0x19 0x1 0x20
>   write 0x910c 0x1 0x01
>   write 0x910e 0x1 0x20
>   write 0x910f 0x1 0x00
>   write 0x910c 0x1 0x00
>   write 0x9120 0x1 0x00
>   EOF

> diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
> index c5e0bc018b..2dd88fa139 100644
> --- a/hw/sd/sdhci.c
> +++ b/hw/sd/sdhci.c
> @@ -552,7 +552,7 @@ static void sdhci_write_block_to_card(SDHCIState *s)
>   * register */
>  static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned 
> size)
>  {
> -unsigned i;
> +unsigned i, available;
>
>  /* Check that there is free space left in a buffer */
>  if (!(s->prnsts & SDHC_SPACE_AVAILABLE)) {
> @@ -560,6 +560,14 @@ static void sdhci_write_dataport(SDHCIState *s, uint32_t 
> value, unsigned size)
>  return;
>  }
>
> +available = s->buf_maxsz - s->data_count;
> +if (size > available) {
> +qemu_log_mask(LOG_GUEST_ERROR, "SDHC buffer data full (size: 
> %"PRIu32")"
> +   " discarding %u byte%s\n",
> +   s->buf_maxsz, size - available,
> +   size - available > 1 ? "s" : "");
> +size = available; /* Excess data of the last write is ignored. */
> +}
>  for (i = 0; i < size; i++) {
>  s->fifo_buffer[s->data_count] = value & 0xFF;
>  s->data_count++;

So, this will definitely avoid the buffer overrun, and the
quoted text also suggests that we should not be doing the
"if sdhci_write_block_to_card() writes the data then keep
going with the rest of the bytes in the value for the start
of the new block". (With this change we could move the
"if (s->data_count >= (s->blksize & BLOCK_SIZE_MASK)) ..."
out of the for() loop and down to the bottom of the function.)

But I'm not sure it fixes the underlying cause of the problem,
because the repro case isn't writing a multi-byte value, it's
only writing a single byte.

It looks from the code like if there's no space in the
buffer then SDHC_SPACE_AVAILABLE should be clear in the
present-status register, but that has somehow got out of sync.
The way the repro from the fuzzer toggles the device in and
out of DMA mode looks suspicious about how that out-of-sync
situation might have come about.

thanks
-- PMM

Re: [PATCH-for-9.0? 1/2] hw/net/lan9118: Replace magic '2048' value by 'PKT_SIZE' definition

On Mon, 8 Apr 2024 at 11:53, Philippe Mathieu-Daudé  wrote:
>
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/net/lan9118.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>

Reviewed-by: Peter Maydell 

thanks
-- PMM

Re: [PATCH] xen-hvm: Avoid livelock while handling buffered ioreqs

2024-04-08 Thread Ross Lagerwall

On Sat, Apr 6, 2024 at 11:58 AM Durrant, Paul  wrote:
>
> On 04/04/2024 15:08, Ross Lagerwall wrote:
> > A malicious or buggy guest may generate buffered ioreqs faster than
> > QEMU can process them in handle_buffered_iopage(). The result is a
> > livelock - QEMU continuously processes ioreqs on the main thread without
> > iterating through the main loop which prevents handling other events,
> > processing timers, etc. Without QEMU handling other events, it often
> > results in the guest becoming unusable and makes it difficult to stop the
> > source of buffered ioreqs.
> >
> > To avoid this, if we process a full page of buffered ioreqs, stop and
> > reschedule an immediate timer to continue processing them. This lets
> > QEMU go back to the main loop and catch up.
> >
>
> Do PV backends potentially cause the same scheduling issue (if not using
> io threads)?
>

From what I can tell:

xen-block: It reads req_prod / req_cons once before entering the loop
so it should be fine, I think.

xen_console: Same as xen-block

xen_nic: It reads req_prod / req_cons once before entering the loop.
However, once the loop ends it checks for more requests and if there
are more requests it restarts from the beginning. It seems like this
could be susceptible to the same issue.

(These PV backends generally aren't used by XenServer's system QEMU
so I didn't spend too much time looking into it.)

Thanks,
Ross

Re: [PATCH] xen-hvm: Avoid livelock while handling buffered ioreqs

2024-04-08 Thread Paul Durrant


On 08/04/2024 14:00, Ross Lagerwall wrote:

On Sat, Apr 6, 2024 at 11:58 AM Durrant, Paul  wrote:


On 04/04/2024 15:08, Ross Lagerwall wrote:

A malicious or buggy guest may generate buffered ioreqs faster than
QEMU can process them in handle_buffered_iopage(). The result is a
livelock - QEMU continuously processes ioreqs on the main thread without
iterating through the main loop which prevents handling other events,
processing timers, etc. Without QEMU handling other events, it often
results in the guest becoming unusable and makes it difficult to stop the
source of buffered ioreqs.

To avoid this, if we process a full page of buffered ioreqs, stop and
reschedule an immediate timer to continue processing them. This lets
QEMU go back to the main loop and catch up.



Do PV backends potentially cause the same scheduling issue (if not using
io threads)?



 From what I can tell:

xen-block: It reads req_prod / req_cons once before entering the loop
so it should be fine, I think.

xen_console: Same as xen-block

xen_nic: It reads req_prod / req_cons once before entering the loop.
However, once the loop ends it checks for more requests and if there
are more requests it restarts from the beginning. It seems like this
could be susceptible to the same issue.

(These PV backends generally aren't used by XenServer's system QEMU
so I didn't spend too much time looking into it.)

Thanks,


Ok. Thanks for checking.

  Paul

Re: [PATCH] xen-hvm: Avoid livelock while handling buffered ioreqs

2024-04-08 Thread Paul Durrant


On 04/04/2024 15:08, Ross Lagerwall wrote:

A malicious or buggy guest may generate buffered ioreqs faster than
QEMU can process them in handle_buffered_iopage(). The result is a
livelock - QEMU continuously processes ioreqs on the main thread without
iterating through the main loop which prevents handling other events,
processing timers, etc. Without QEMU handling other events, it often
results in the guest becoming unusable and makes it difficult to stop the
source of buffered ioreqs.

To avoid this, if we process a full page of buffered ioreqs, stop and
reschedule an immediate timer to continue processing them. This lets
QEMU go back to the main loop and catch up.

Signed-off-by: Ross Lagerwall 
---
  hw/xen/xen-hvm-common.c | 26 +-
  1 file changed, 17 insertions(+), 9 deletions(-)



Reviewed-by: Paul Durrant

Re: [PATCH 9/9] accel/tcg: Improve can_do_io management

2024-04-08 Thread Jørgen Hansen

On 4/7/24 00:32, Richard Henderson wrote:
> We already attempted to set and clear can_do_io before the first
> and last insns, but only used the initial value of max_insns and
> the call to translator_io_start to find those insns.
> 
> Now that we track insn_start in DisasContextBase, and now that
> we have emit_before_op, we can wait until we have finished
> translation to identify the true first and last insns and emit
> the sets of can_do_io at that time.
> 
> This fixes case of a translation block which crossed a page boundary,
> and for which the second page turned out to be mmio.  In this case we
> truncate the block, and the previous logic for can_do_io could leave
> a block with a single insn with can_do_io set to false, which would
> fail an assertion in cpu_io_recompile.
> 
> Reported-by: Jørgen Hansen 
> Signed-off-by: Richard Henderson 
> ---
>   include/exec/translator.h |  1 -
>   accel/tcg/translator.c| 45 ---
>   2 files changed, 23 insertions(+), 23 deletions(-)

Thanks for the quick fix! I verified the patch series fixes the issue on 
my setup, and also verified that no issues were seen with full MMIO 
backing for the otherwise same test case.

Tested-by: Jørgen Hansen

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-04-08 Thread Jinpu Wang

Hi Peter,

On Tue, Apr 2, 2024 at 11:24 PM Peter Xu  wrote:
>
> On Mon, Apr 01, 2024 at 11:26:25PM +0200, Yu Zhang wrote:
> > Hello Peter und Zhjian,
> >
> > Thank you so much for letting me know about this. I'm also a bit surprised 
> > at
> > the plan for deprecating the RDMA migration subsystem.
>
> It's not too late, since it looks like we do have users not yet notified
> from this, we'll redo the deprecation procedure even if it'll be the final
> plan, and it'll be 2 releases after this.
>
> >
> > > IMHO it's more important to know whether there are still users and whether
> > > they would still like to see it around.
> >
> > > I admit RDMA migration was lacking testing (unit/CI tests), which led to 
> > > a few
> > > obvious bugs being noticed too late.
> >
> > Yes, we are a user of this subsystem. I was unaware of the lack of test 
> > coverage
> > for this part. As soon as 8.2 was released, I saw that many of the
> > migration test
> > cases failed and came to realize that there might be a bug between 8.1
> > and 8.2, but
> > was unable to confirm and report it quickly to you.
> >
> > The maintenance of this part could be too costly or difficult from
> > your point of view.
>
> It may or may not be too costly, it's just that we need real users of RDMA
> taking some care of it.  Having it broken easily for >1 releases definitely
> is a sign of lack of users.  It is an implication to the community that we
> should consider dropping some features so that we can get the best use of
> the community resources for the things that may have a broader audience.
>
> One thing majorly missing is a RDMA tester to guard all the merges to not
> break RDMA paths, hopefully in CI.  That should not rely on RDMA hardwares
> but just to sanity check the migration+rdma code running all fine.  RDMA
> taught us the lesson so we're requesting CI coverage for all other new
> features that will be merged at least for migration subsystem, so that we
> plan to not merge anything that is not covered by CI unless extremely
> necessary in the future.
>
> For sure CI is not the only missing part, but I'd say we should start with
> it, then someone should also take care of the code even if only in
> maintenance mode (no new feature to add on top).
>
> >
> > My concern is, this plan will force a few QEMU users (not sure how
> > many) like us
> > either to stick to the RDMA migration by using an increasingly older
> > version of QEMU,
> > or to abandon the currently used RDMA migration.
>
> RDMA doesn't get new features anyway, if there's specific use case for RDMA
> migrations, would it work if such a scenario uses the old binary?  Is it
> possible to switch to the TCP protocol with some good NICs?
We have used rdma migration with HCA from Nvidia for years, our
experience is RDMA migration works better than tcp (over ipoib).

Switching back to TCP will lead us to the old problems which were
solved by RDMA migration.

>
> Per our best knowledge, RDMA users are rare, and please let anyone know if
> you are aware of such users.  IIUC the major reason why RDMA stopped being
> the trend is because the network is not like ten years ago; I don't think I
> have good knowledge in RDMA at all nor network, but my understanding is
> it's pretty easy to fetch modern NIC to outperform RDMAs, then it may make
> little sense to maintain multiple protocols, considering RDMA migration
> code is so special so that it has the most custom code comparing to other
> protocols.
+cc some guys from Huawei.

I'm surprised RDMA users are rare,  I guess maybe many are just
working with different code base.
>
> Thanks,
>
> --
> Peter Xu

Thx!
Jinpu Wang
>

[RFC PATCH v2 00/13] SMMUv3 nested translation support

Currently, QEMU supports emulating either stage-1 or stage-2 SMMUs
but not nested instances.
This patch series adds support for nested translation in SMMUv3,
this is controlled by property “arm-smmuv3.stage=nested”, and
advertised to guests as (IDR0.S1P == 1 && IDR0.S2P == 1)

Main changes(architecture):

1) CDs are considered IPA and translated with stage-2.
2) TTBx and tables for stage-1 are considered IPA and translated
with stage-2.
3) Translate the IPA address with stage-2.

TLBs:
==
TLBs are the most tricky part.

1) General design
Unified(Combined) design is used, where entries with ASID=-1 are
IPAs(cached from stage-2 config)

TLBs are also modified to cache 2 permissions, a new permission added
"parent_perm."

For non-nested configuration, perm == parent_perm and nothing
changes. This is used to know which stage to use in case there is
a permission fault from a TLB entry.

2) Caching in TLB
Stage-1 and stage-2 are inserted in the TLB as is.
For nested translation, both entries are combined into one TLB
entry. The size (level and granule) are chosen from the smallest entries.
That means that a stage-1 translation can be cached with the stage-2
granule in its key; this is taken into account during lookup.

3) TLB Lookup
TLB lookup already uses ASID in key, so it can distinguish between
stage-1 and stage-2.
And as mentioned above, the granule for stage-1 can be different,
If stage-1 lookup failed, we try again with the stage-2 granule.

4) TLB invalidation
- Address invalidation is split, for IOVA(CMD_TLBI_NH_VA
/CMD_TLBI_NH_VAA) and IPA(CMD_TLBI_S2_IPA) based on ASID value
- CMD_TLBI_NH_ASID/CMD_TLBI_NH_ALL: Consider VMID if stage-2 is
supported, and invalidate stage-1 only by VMIDs

As far as I understand, this is compliant with the ARM architecture:
- ARM ARM DDI 0487J.a: RLGSCG, RTVTYQ, RGNJPZ
- ARM IHI 0070F.b: 16.2 Caching

An alternative approach would be to instantiate 2 TLBs, one per each
stage. I haven’t investigated that.

Others
===
- Advertise SMMUv3.2-S2FWB, it is NOP for QEMU as it doesn’t support
attributes.

- OAS: A typical setup with nesting is to share CPU stage-2 with the
SMMU, and according to the user manual, SMMU OAS must match the
system physical address.

This was discussed before in
https://lore.kernel.org/all/20230226220650.1480786-11-smost...@google.com/
The implementation here, follows the discussion, where migration is
added and oas is set up from the board (virt). However, the OAS is
chosen based on the CPU PARANGE as there is no fixed one.

- For nested configuration, IOVA notifier only notifies for stage-1
invalidations (as far as I understand this is the intended
behaviour as it notifies for IOVA)

- Stop ignoring VMID for stage-1 if stage-2 is also supported.

Future improvements:
=
1) One small improvement, that I don’t think it’s worth the extra
complexity, is in case of Stage-1 TLB miss for nested translation,
we can do stage-1 walk and lookup for stage-2 TLBs, instead of
doing the full walk.

2) Patch 0006 (hw/arm/smmuv3: Translate CD and TT using stage-2 table)
introduces a macro to use functions that rely on cfg for stage-2,
I don’t like it. However, I didn’t find a simple way around it,
either we change many functions to have a separate stage argument,
or add another arg in config, which is probably more code.

Testing

1) IOMMUFD + VFIO
Kernel: https://lore.kernel.org/all/cover.1683688960.git.nicol...@nvidia.com/
VMM:
https://qemu-devel.nongnu.narkive.com/o815DqpI/rfc-v5-0-8-arm-smmuv3-emulation-support

By assigning
“virtio-net-pci,netdev=net0,disable-legacy=on,iommu_platform=on,ats=on”,
to a guest VM (on top of QEMU guest) with VFIO and IOMMUFD.

2) Work in progress prototype I am hacking on for nesting on KVM
(this is nowhere near complete, and misses many stuff but it
doesn't require VMs/VFIO) also with virtio-net-pci and git
cloning a bunch of stuff and also observing traces.

https://android-kvm.googlesource.com/linux/+log/refs/heads/smostafa/android15-6.6-smmu-nesting-wip

I also modified the Linux driver to test with mixed granules/levels.

hw/arm/smmuv3: Split smmuv3_translate() better viewed with --color-moved

Changes in v2:
v1:
https://lore.kernel.org/qemu-devel/20240325101442.1306300-1-smost...@google.com/
- Collected Eric Rbs
- Rework TLB to rely on VMID/ASID instead of an extra key.
- Fixed TLB issue with large stage-1 reported by Julian.
- Cap the OAS to 48 bits as PTW doesn’t support 52 bits.
- Fix ASID/VMID representation in some contexts as 16 bits while
they can be -1
- Increase visibility in trace points

Mostafa Saleh (13):
hw/arm/smmu: Use enum for SMMU stage
hw/arm/smmu: Split smmuv3_translate()
hw/arm/smmu: Consolidate ASID and VMID types
hw/arm/smmuv3: Translate CD and TT using stage-2 table
hw/arm/smmu-common: Support nested translation

[RFC PATCH v2 03/13] hw/arm/smmu: Consolidate ASID and VMID types

ASID and VMID used to be uint16_t in the translation config, however,
in other contexts they can be int as -1 in case of TLB invalidation,
to represent all(don’t care).
When stage-2 was added asid was set to -1 in stage-2 and vmid to -1
in stage-1 configs. However, that meant they were set as (65535),
this was not an issue as nesting was not supported and no
commands/lookup targets both.

With nesting, it’s critical to get this right as translation must be
tagged correctly with ASID/VMID, and with ASID=-1 meaning stage-2.
Represent ASID/VMID everywhere as int.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmu-common.c | 10 +-
 hw/arm/smmuv3.c  |  4 ++--
 include/hw/arm/smmu-common.h | 14 +++---
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 20630eb670..771b9c79a3 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -57,7 +57,7 @@ static gboolean smmu_iotlb_key_equal(gconstpointer v1, 
gconstpointer v2)
(k1->vmid == k2->vmid);
 }
 
-SMMUIOTLBKey smmu_get_iotlb_key(uint16_t asid, uint16_t vmid, uint64_t iova,
+SMMUIOTLBKey smmu_get_iotlb_key(int asid, int vmid, uint64_t iova,
 uint8_t tg, uint8_t level)
 {
 SMMUIOTLBKey key = {.asid = asid, .vmid = vmid, .iova = iova,
@@ -130,7 +130,7 @@ void smmu_iotlb_inv_all(SMMUState *s)
 static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value,
  gpointer user_data)
 {
-uint16_t asid = *(uint16_t *)user_data;
+int asid = *(int *)user_data;
 SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key;
 
 return SMMU_IOTLB_ASID(*iotlb_key) == asid;
@@ -139,7 +139,7 @@ static gboolean smmu_hash_remove_by_asid(gpointer key, 
gpointer value,
 static gboolean smmu_hash_remove_by_vmid(gpointer key, gpointer value,
  gpointer user_data)
 {
-uint16_t vmid = *(uint16_t *)user_data;
+int vmid = *(int *)user_data;
 SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key;
 
 return SMMU_IOTLB_VMID(*iotlb_key) == vmid;
@@ -191,13 +191,13 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int 
vmid, dma_addr_t iova,
 &info);
 }
 
-void smmu_iotlb_inv_asid(SMMUState *s, uint16_t asid)
+void smmu_iotlb_inv_asid(SMMUState *s, int asid)
 {
 trace_smmu_iotlb_inv_asid(asid);
 g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid, &asid);
 }
 
-inline void smmu_iotlb_inv_vmid(SMMUState *s, uint16_t vmid)
+inline void smmu_iotlb_inv_vmid(SMMUState *s, int vmid)
 {
 trace_smmu_iotlb_inv_vmid(vmid);
 g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_vmid, &vmid);
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index f081ff0cc4..897f8fe085 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1235,7 +1235,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
 }
 case SMMU_CMD_TLBI_NH_ASID:
 {
-uint16_t asid = CMD_ASID(&cmd);
+int asid = CMD_ASID(&cmd);
 
 if (!STAGE1_SUPPORTED(s)) {
 cmd_error = SMMU_CERROR_ILL;
@@ -1268,7 +1268,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
 break;
 case SMMU_CMD_TLBI_S12_VMALL:
 {
-uint16_t vmid = CMD_VMID(&cmd);
+int vmid = CMD_VMID(&cmd);
 
 if (!STAGE2_SUPPORTED(s)) {
 cmd_error = SMMU_CERROR_ILL;
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index 5944735632..96eb017e50 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -84,7 +84,7 @@ typedef struct SMMUS2Cfg {
 bool record_faults; /* Record fault events (S2R) */
 uint8_t granule_sz; /* Granule page shift (based on S2TG) */
 uint8_t eff_ps; /* Effective PA output range (based on S2PS) */
-uint16_t vmid;  /* Virtual Machine ID (S2VMID) */
+int vmid;   /* Virtual Machine ID (S2VMID) */
 uint64_t vttb;  /* Address of translation table base (S2TTB) */
 } SMMUS2Cfg;
 
@@ -108,7 +108,7 @@ typedef struct SMMUTransCfg {
 uint64_t ttb;  /* TT base address */
 uint8_t oas;   /* output address width */
 uint8_t tbi;   /* Top Byte Ignore */
-uint16_t asid;
+int asid;
 SMMUTransTableInfo tt[2];
 /* Used by stage-2 only. */
 struct SMMUS2Cfg s2cfg;
@@ -132,8 +132,8 @@ typedef struct SMMUPciBus {
 
 typedef struct SMMUIOTLBKey {
 uint64_t iova;
-uint16_t asid;
-uint16_t vmid;
+int asid;
+int vmid;
 uint8_t tg;
 uint8_t level;
 } SMMUIOTLBKey;
@@ -205,11 +205,11 @@ IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t 
sid);
 SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg,
 SMMUTransTableInfo *tt, hwaddr iova);
 void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBE

[RFC PATCH v2 04/13] hw/arm/smmuv3: Translate CD and TT using stage-2 table

According to the user manual (ARM IHI 0070 F.b),
In "5.2 Stream Table Entry":
 [51:6] S1ContextPtr
 If Config[1] == 1 (stage 2 enabled), this pointer is an IPA translated by
 stage 2 and the programmed value must be within the range of the IAS.

In "5.4.1 CD notes":
 The translation table walks performed from TTB0 or TTB1 are always performed
 in IPA space if stage 2 translations are enabled.

So translate both the CD and the TTBx in this patch if nested
translation is requested.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmuv3.c  | 49 ++--
 include/hw/arm/smmu-common.h | 17 +
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 897f8fe085..a7cf543acc 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -337,14 +337,36 @@ static int smmu_get_ste(SMMUv3State *s, dma_addr_t addr, 
STE *buf,
 
 }
 
+static SMMUTranslationStatus smmuv3_do_translate(SMMUv3State *s, hwaddr addr,
+ SMMUTransCfg *cfg,
+ SMMUEventInfo *event,
+ IOMMUAccessFlags flag,
+ SMMUTLBEntry **out_entry);
 /* @ssid > 0 not supported yet */
-static int smmu_get_cd(SMMUv3State *s, STE *ste, uint32_t ssid,
-   CD *buf, SMMUEventInfo *event)
+static int smmu_get_cd(SMMUv3State *s, STE *ste, SMMUTransCfg *cfg,
+   uint32_t ssid, CD *buf, SMMUEventInfo *event)
 {
 dma_addr_t addr = STE_CTXPTR(ste);
 int ret, i;
+SMMUTranslationStatus status;
+SMMUTLBEntry *entry;
 
 trace_smmuv3_get_cd(addr);
+
+if (cfg->stage == SMMU_NESTED) {
+CALL_FUNC_CFG_S2(cfg, status, smmuv3_do_translate, s, addr,
+ cfg, event, IOMMU_RO, &entry);
+/*
+ * It is not clear what should happen if this fails, so we return here
+ * which gets propagated as a translation error.
+ */
+if (status != SMMU_TRANS_SUCCESS) {
+return -EINVAL;
+}
+
+addr = CACHED_ENTRY_TO_ADDR(entry, addr);
+}
+
 /* TODO: guarantee 64-bit single-copy atomicity */
 ret = dma_memory_read(&address_space_memory, addr, buf, sizeof(*buf),
   MEMTXATTRS_UNSPECIFIED);
@@ -659,10 +681,13 @@ static int smmu_find_ste(SMMUv3State *s, uint32_t sid, 
STE *ste,
 return 0;
 }
 
-static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
+static int decode_cd(SMMUv3State *s, SMMUTransCfg *cfg,
+ CD *cd, SMMUEventInfo *event)
 {
 int ret = -EINVAL;
 int i;
+SMMUTranslationStatus status;
+SMMUTLBEntry *entry;
 
 if (!CD_VALID(cd) || !CD_AARCH64(cd)) {
 goto bad_cd;
@@ -713,6 +738,17 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, 
SMMUEventInfo *event)
 
 tt->tsz = tsz;
 tt->ttb = CD_TTB(cd, i);
+
+/* Translate the TTBx, from IPA to PA if nesting is enabled. */
+if (cfg->stage == SMMU_NESTED) {
+CALL_FUNC_CFG_S2(cfg, status, smmuv3_do_translate, s,
+ tt->ttb, cfg, event, IOMMU_RO, &entry);
+/* See smmu_get_cd(). */
+if (status != SMMU_TRANS_SUCCESS) {
+return -EINVAL;
+}
+tt->ttb = CACHED_ENTRY_TO_ADDR(entry, tt->ttb);
+}
 if (tt->ttb & ~(MAKE_64BIT_MASK(0, cfg->oas))) {
 goto bad_cd;
 }
@@ -767,12 +803,12 @@ static int smmuv3_decode_config(IOMMUMemoryRegion *mr, 
SMMUTransCfg *cfg,
 return 0;
 }
 
-ret = smmu_get_cd(s, &ste, 0 /* ssid */, &cd, event);
+ret = smmu_get_cd(s, &ste, cfg, 0 /* ssid */, &cd, event);
 if (ret) {
 return ret;
 }
 
-return decode_cd(cfg, &cd, event);
+return decode_cd(s, cfg, &cd, event);
 }
 
 /**
@@ -942,8 +978,7 @@ epilogue:
 switch (status) {
 case SMMU_TRANS_SUCCESS:
 entry.perm = cached_entry->entry.perm;
-entry.translated_addr = cached_entry->entry.translated_addr +
-(addr & cached_entry->entry.addr_mask);
+entry.translated_addr = CACHED_ENTRY_TO_ADDR(cached_entry, addr);
 entry.addr_mask = cached_entry->entry.addr_mask;
 trace_smmuv3_translate_success(mr->parent_obj.name, sid, addr,
entry.translated_addr, entry.perm,
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index 96eb017e50..2772175115 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -37,6 +37,23 @@
 #define VMSA_IDXMSK(isz, strd, lvl) ((1ULL << \
  VMSA_BIT_LVL(isz, strd, lvl)) - 1)
 
+#define CACHED_ENTRY_TO_ADDR(ent, addr)  (ent)->entry.translated_addr + \
+ ((addr) & (ent)->entry.addr_ma

[RFC PATCH v2 05/13] hw/arm/smmu-common: Support nested translation

When nested translation is requested, do the following:

- Translate stage-1 IPA using stage-2 to a physical address.
- Translate stage-1 PTW walks using stage-2.
- Combine both to create a single TLB entry. For that, we choose
  the smallest entry to cache, which means that if the smallest
  entry comes from stage-2, and stage-2 uses a different granule,
  a TLB lookup using the stage-1 granule (in nested config) will
  always miss. The lookup logic is therefore modified for nesting to
  also look up using the stage-2 granule if the stage-1 granule lookup
  missed and the two granules are different.

Also, add more visibility in trace points, to make it easier to debug.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmu-common.c | 153 ---
 hw/arm/trace-events  |   6 +-
 include/hw/arm/smmu-common.h |   3 +-
 3 files changed, 131 insertions(+), 31 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 771b9c79a3..2cf27b490b 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -66,8 +66,10 @@ SMMUIOTLBKey smmu_get_iotlb_key(int asid, int vmid, uint64_t 
iova,
 return key;
 }
 
-SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg,
-SMMUTransTableInfo *tt, hwaddr iova)
+static SMMUTLBEntry *smmu_iotlb_lookup_all_levels(SMMUState *bs,
+  SMMUTransCfg *cfg,
+  SMMUTransTableInfo *tt,
+  hwaddr iova)
 {
 uint8_t tg = (tt->granule_sz - 10) / 2;
 uint8_t inputsize = 64 - tt->tsz;
@@ -88,10 +90,29 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg 
*cfg,
 }
 level++;
 }
+return entry;
+}
+
+SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg,
+SMMUTransTableInfo *tt, hwaddr iova)
+{
+SMMUTLBEntry *entry = NULL;
+
+entry = smmu_iotlb_lookup_all_levels(bs, cfg, tt, iova);
+/*
+ * For nested translation also use the s2 granule, as the TLB will insert
+ * the smallest of both, so the entry can be cached with the s2 granule.
+ */
+if (!entry && (cfg->stage == SMMU_NESTED) &&
+(cfg->s2cfg.granule_sz != tt->granule_sz)) {
+tt->granule_sz = cfg->s2cfg.granule_sz;
+entry = smmu_iotlb_lookup_all_levels(bs, cfg, tt, iova);
+}
 
 if (entry) {
 cfg->iotlb_hits++;
 trace_smmu_iotlb_lookup_hit(cfg->asid, cfg->s2cfg.vmid, iova,
+entry->entry.addr_mask,
 cfg->iotlb_hits, cfg->iotlb_misses,
 100 * cfg->iotlb_hits /
 (cfg->iotlb_hits + cfg->iotlb_misses));
@@ -117,7 +138,7 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, 
SMMUTLBEntry *new)
 *key = smmu_get_iotlb_key(cfg->asid, cfg->s2cfg.vmid, new->entry.iova,
   tg, new->level);
 trace_smmu_iotlb_insert(cfg->asid, cfg->s2cfg.vmid, new->entry.iova,
-tg, new->level);
+tg, new->level, new->entry.translated_addr);
 g_hash_table_insert(bs->iotlb, key, new);
 }
 
@@ -286,6 +307,27 @@ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, 
dma_addr_t iova)
 return NULL;
 }
 
+/* Return the correct table address based on configuration. */
+static inline int translate_table_s1(dma_addr_t *table_addr, SMMUTransCfg *cfg,
+ SMMUPTWEventInfo *info, SMMUState *bs)
+{
+dma_addr_t addr = *table_addr;
+SMMUTLBEntry *cached_entry;
+
+if (cfg->stage != SMMU_NESTED) {
+return 0;
+}
+
+CALL_FUNC_CFG_S2(cfg, cached_entry, smmu_translate,
+ bs, cfg, addr, IOMMU_RO, info);
+
+if (cached_entry) {
+*table_addr = CACHED_ENTRY_TO_ADDR(cached_entry, addr);
+return 0;
+}
+return -EINVAL;
+}
+
 /**
  * smmu_ptw_64_s1 - VMSAv8-64 Walk of the page tables for a given IOVA
  * @cfg: translation config
@@ -301,7 +343,8 @@ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t 
iova)
  */
 static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
   dma_addr_t iova, IOMMUAccessFlags perm,
-  SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
+  SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info,
+  SMMUState *bs)
 {
 dma_addr_t baseaddr, indexmask;
 SMMUStage stage = cfg->stage;
@@ -349,6 +392,10 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
 goto error;
 }
 baseaddr = get_table_pte_address(pte, granule_sz);
+/* In case of failure, retain stage-2 fault. */
+if (translate_table_s1(&baseaddr, cfg, info, bs)) {
+goto error_no_stage;
+}
 level++;
 continue;
 } else if (is_page_pte(pte, level)) {
@@ -384,7

[RFC PATCH v2 10/13] hw/arm/smmuv3: Advertise S2FWB

QEMU doesn't support memory attributes, so FWB is a NOP; this
might change in the future if memory attributes are supported.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmuv3.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index da47411410..0e367c70ad 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -287,6 +287,14 @@ static void smmuv3_init_regs(SMMUv3State *s)
 if (FIELD_EX32(s->idr[0], IDR0, S2P)) {
 /* XNX is a stage-2-specific feature */
 s->idr[3] = FIELD_DP32(s->idr[3], IDR3, XNX, 1);
+if (FIELD_EX32(s->idr[0], IDR0, S1P)) {
+/*
+ * QEMU doesn't support memory attributes, so FWB is NOP, this
+ * might change in the future if memory attributes would be
+ * supported.
+ */
+   s->idr[3] = FIELD_DP32(s->idr[3], IDR3, FWB, 1);
+}
 }
 s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
 s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
-- 
2.44.0.478.gd926399ef9-goog

[RFC PATCH v2 01/13] hw/arm/smmu: Use enum for SMMU stage

Currently, translation stage is represented as an int, where 1 is stage-1 and
2 is stage-2. When nesting is added, using 3 to represent it would be
confusing, so we use an enum instead.

While keeping the same values, this is useful for:
 - Doing tricks with bit masks, where BIT(0) is stage-1 and BIT(1) is
   stage-2 and both is nested.
 - Tracing, as stage is printed as int.

Signed-off-by: Mostafa Saleh 
Reviewed-by: Eric Auger 
---
 hw/arm/smmu-common.c | 14 +++---
 hw/arm/smmuv3.c  | 15 ---
 include/hw/arm/smmu-common.h | 11 +--
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 4caedb4998..3a7c350aca 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -304,7 +304,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
   SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
 {
 dma_addr_t baseaddr, indexmask;
-int stage = cfg->stage;
+SMMUStage stage = cfg->stage;
 SMMUTransTableInfo *tt = select_tt(cfg, iova);
 uint8_t level, granule_sz, inputsize, stride;
 
@@ -392,7 +392,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
 info->type = SMMU_PTW_ERR_TRANSLATION;
 
 error:
-info->stage = 1;
+info->stage = SMMU_STAGE_1;
 tlbe->entry.perm = IOMMU_NONE;
 return -EINVAL;
 }
@@ -415,7 +415,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
   dma_addr_t ipa, IOMMUAccessFlags perm,
   SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
 {
-const int stage = 2;
+const SMMUStage stage = SMMU_STAGE_2;
 int granule_sz = cfg->s2cfg.granule_sz;
 /* ARM DDI0487I.a: Table D8-7. */
 int inputsize = 64 - cfg->s2cfg.tsz;
@@ -513,7 +513,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
 info->type = SMMU_PTW_ERR_TRANSLATION;
 
 error:
-info->stage = 2;
+info->stage = SMMU_STAGE_2;
 tlbe->entry.perm = IOMMU_NONE;
 return -EINVAL;
 }
@@ -532,9 +532,9 @@ error:
 int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm,
  SMMUTLBEntry *tlbe, SMMUPTWEventInfo *info)
 {
-if (cfg->stage == 1) {
+if (cfg->stage == SMMU_STAGE_1) {
 return smmu_ptw_64_s1(cfg, iova, perm, tlbe, info);
-} else if (cfg->stage == 2) {
+} else if (cfg->stage == SMMU_STAGE_2) {
 /*
  * If bypassing stage 1(or unimplemented), the input address is passed
  * directly to stage 2 as IPA. If the input address of a transaction
@@ -543,7 +543,7 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, 
IOMMUAccessFlags perm,
  */
 if (iova >= (1ULL << cfg->oas)) {
 info->type = SMMU_PTW_ERR_ADDR_SIZE;
-info->stage = 1;
+info->stage = SMMU_STAGE_1;
 tlbe->entry.perm = IOMMU_NONE;
 return -EINVAL;
 }
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 9eb56a70f3..50e5a72d54 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -34,7 +34,8 @@
 #include "smmuv3-internal.h"
 #include "smmu-internal.h"
 
-#define PTW_RECORD_FAULT(cfg)   (((cfg)->stage == 1) ? (cfg)->record_faults : \
+#define PTW_RECORD_FAULT(cfg)   (((cfg)->stage == SMMU_STAGE_1) ? \
+ (cfg)->record_faults : \
  (cfg)->s2cfg.record_faults)
 
 /**
@@ -402,7 +403,7 @@ static bool s2_pgtable_config_valid(uint8_t sl0, uint8_t 
t0sz, uint8_t gran)
 
 static int decode_ste_s2_cfg(SMMUTransCfg *cfg, STE *ste)
 {
-cfg->stage = 2;
+cfg->stage = SMMU_STAGE_2;
 
 if (STE_S2AA64(ste) == 0x0) {
 qemu_log_mask(LOG_UNIMP,
@@ -678,7 +679,7 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, 
SMMUEventInfo *event)
 
 /* we support only those at the moment */
 cfg->aa64 = true;
-cfg->stage = 1;
+cfg->stage = SMMU_STAGE_1;
 
 cfg->oas = oas2bits(CD_IPS(cd));
 cfg->oas = MIN(oas2bits(SMMU_IDR5_OAS), cfg->oas);
@@ -762,7 +763,7 @@ static int smmuv3_decode_config(IOMMUMemoryRegion *mr, 
SMMUTransCfg *cfg,
 return ret;
 }
 
-if (cfg->aborted || cfg->bypassed || (cfg->stage == 2)) {
+if (cfg->aborted || cfg->bypassed || (cfg->stage == SMMU_STAGE_2)) {
 return 0;
 }
 
@@ -882,7 +883,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
*mr, hwaddr addr,
 goto epilogue;
 }
 
-if (cfg->stage == 1) {
+if (cfg->stage == SMMU_STAGE_1) {
 /* Select stage1 translation table. */
 tt = select_tt(cfg, addr);
 if (!tt) {
@@ -919,7 +920,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
*mr, hwaddr addr,
  * nesting is not supported. So it is sufficient to check the
  * translation stage to know the TLB stage for now.
  */
-event.u.f_walk_eabt.s2 = (cfg->stage == 2);
+event.u.f_walk_eabt.s2 = (cfg->stage == SMMU_STAGE_2);
 if (PTW_RECORD_FAULT(cfg)) {
 e

[RFC PATCH v2 13/13] hw/arm/virt: Set SMMU OAS based on CPU PARANGE

Use the new SMMU property to make the SMMU OAS match the CPU PARANGE.
That's according to SMMU manual ARM IHI 0070F.b:
6.3.6 SMMU_IDR5, OAS must match the system physical address size.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/virt.c  | 14 --
 target/arm/cpu.h   |  2 ++
 target/arm/cpu64.c |  5 +
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0af1943697..599c0f752b 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -235,6 +235,13 @@ static bool ns_el2_virt_timer_present(void)
 arm_feature(env, ARM_FEATURE_EL2) && cpu_isar_feature(aa64_vh, cpu);
 }
 
+/* We rely on CPU to define system OAS. */
+static int32_t get_system_oas(void)
+{
+ARMCPU *cpu = ARM_CPU(qemu_get_cpu(0));
+return cpu_arm_get_oas(cpu);
+}
+
 static void create_fdt(VirtMachineState *vms)
 {
 MachineState *ms = MACHINE(vms);
@@ -1340,7 +1347,7 @@ static void create_pcie_irq_map(const MachineState *ms,
 }
 
 static void create_smmu(const VirtMachineState *vms,
-PCIBus *bus)
+PCIBus *bus, int32_t oas)
 {
 char *node;
 const char compat[] = "arm,smmu-v3";
@@ -1360,6 +1367,9 @@ static void create_smmu(const VirtMachineState *vms,
 
 object_property_set_link(OBJECT(dev), "primary-bus", OBJECT(bus),
  &error_abort);
+
+qdev_prop_set_uint64(dev, "oas", oas);
+
 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
 sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
 for (i = 0; i < NUM_SMMU_IRQS; i++) {
@@ -1534,7 +1544,7 @@ static void create_pcie(VirtMachineState *vms)
 
 switch (vms->iommu) {
 case VIRT_IOMMU_SMMUV3:
-create_smmu(vms, vms->bus);
+create_smmu(vms, vms->bus, get_system_oas());
 qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map",
0x0, vms->iommu_phandle, 0x0, 0x1);
 break;
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index a5b3d8f7da..14ef1a9d37 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3408,4 +3408,6 @@ static inline target_ulong cpu_untagged_addr(CPUState 
*cs, target_ulong x)
 }
 #endif
 
+int32_t cpu_arm_get_oas(ARMCPU *cpu);
+
 #endif
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 985b1efe16..08da83c082 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -787,6 +787,11 @@ static const gchar *aarch64_gdb_arch_name(CPUState *cs)
 return "aarch64";
 }
 
+int32_t cpu_arm_get_oas(ARMCPU *cpu)
+{
+return FIELD_EX64(cpu->isar.id_aa64mmfr0, ID_AA64MMFR0, PARANGE);
+}
+
 static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
 {
 CPUClass *cc = CPU_CLASS(oc);
-- 
2.44.0.478.gd926399ef9-goog

[RFC PATCH v2 02/13] hw/arm/smmu: Split smmuv3_translate()

smmuv3_translate() does everything from STE/CD parsing to TLB lookup
and PTW.

Soon, when nesting is supported, stage-1 data (tt, CD) needs to be
translated using stage-2.

Split smmuv3_translate() into 3 functions:

- smmu_translate(): in smmu-common.c, which does the TLB lookup, PTW,
  TLB insertion, all the functions are already there, this just puts
  them together.
  This also simplifies the code as it consolidates event generation
  in case of TLB lookup permission failure or in TT selection.

- smmuv3_do_translate(): in smmuv3.c, Calls smmu_translate() and does
  the event population in case of errors.

 - smmuv3_translate(), now calls smmuv3_do_translate() for
   translation while the rest is the same.

Also, add stage in trace_smmuv3_translate_success()

Signed-off-by: Mostafa Saleh 
Reviewed-by: Eric Auger 
---
 hw/arm/smmu-common.c |  59 
 hw/arm/smmuv3.c  | 175 +--
 hw/arm/trace-events  |   2 +-
 include/hw/arm/smmu-common.h |   8 ++
 4 files changed, 133 insertions(+), 111 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 3a7c350aca..20630eb670 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -554,6 +554,65 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, 
IOMMUAccessFlags perm,
 g_assert_not_reached();
 }
 
+SMMUTLBEntry *smmu_translate(SMMUState *bs, SMMUTransCfg *cfg, dma_addr_t addr,
+ IOMMUAccessFlags flag, SMMUPTWEventInfo *info)
+{
+uint64_t page_mask, aligned_addr;
+SMMUTLBEntry *cached_entry = NULL;
+SMMUTransTableInfo *tt;
+int status;
+
+/*
+ * Combined attributes used for TLB lookup, as only one stage is supported,
+ * it will hold attributes based on the enabled stage.
+ */
+SMMUTransTableInfo tt_combined;
+
+if (cfg->stage == SMMU_STAGE_1) {
+/* Select stage1 translation table. */
+tt = select_tt(cfg, addr);
+if (!tt) {
+info->type = SMMU_PTW_ERR_TRANSLATION;
+info->stage = SMMU_STAGE_1;
+return NULL;
+}
+tt_combined.granule_sz = tt->granule_sz;
+tt_combined.tsz = tt->tsz;
+
+} else {
+/* Stage2. */
+tt_combined.granule_sz = cfg->s2cfg.granule_sz;
+tt_combined.tsz = cfg->s2cfg.tsz;
+}
+
+/*
+ * TLB lookup looks for granule and input size for a translation stage,
+ * as only one stage is supported right now, choose the right values
+ * from the configuration.
+ */
+page_mask = (1ULL << tt_combined.granule_sz) - 1;
+aligned_addr = addr & ~page_mask;
+
+cached_entry = smmu_iotlb_lookup(bs, cfg, &tt_combined, aligned_addr);
+if (cached_entry) {
+if ((flag & IOMMU_WO) && !(cached_entry->entry.perm & IOMMU_WO)) {
+info->type = SMMU_PTW_ERR_PERMISSION;
+info->stage = cfg->stage;
+return NULL;
+}
+return cached_entry;
+}
+
+cached_entry = g_new0(SMMUTLBEntry, 1);
+status = smmu_ptw(cfg, aligned_addr, flag, cached_entry, info);
+if (status) {
+g_free(cached_entry);
+return NULL;
+}
+smmu_iotlb_insert(bs, cfg, cached_entry);
+return cached_entry;
+}
+
 /**
  * The bus number is used for lookup when SID based invalidation occurs.
  * In that case we lazily populate the SMMUPciBus array from the bus hash
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 50e5a72d54..f081ff0cc4 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -827,6 +827,67 @@ static void smmuv3_flush_config(SMMUDevice *sdev)
 g_hash_table_remove(bc->configs, sdev);
 }
 
+/* Do translation with TLB lookup. */
+static SMMUTranslationStatus smmuv3_do_translate(SMMUv3State *s, hwaddr addr,
+ SMMUTransCfg *cfg,
+ SMMUEventInfo *event,
+ IOMMUAccessFlags flag,
+ SMMUTLBEntry **out_entry)
+{
+SMMUPTWEventInfo ptw_info = {};
+SMMUState *bs = ARM_SMMU(s);
+SMMUTLBEntry *cached_entry = NULL;
+
+cached_entry = smmu_translate(bs, cfg, addr, flag, &ptw_info);
+if (!cached_entry) {
+/* All faults from PTW have an S2 field. */
+event->u.f_walk_eabt.s2 = (ptw_info.stage == SMMU_STAGE_2);
+switch (ptw_info.type) {
+case SMMU_PTW_ERR_WALK_EABT:
+event->type = SMMU_EVT_F_WALK_EABT;
+event->u.f_walk_eabt.addr = addr;
+event->u.f_walk_eabt.rnw = flag & 0x1;
+event->u.f_walk_eabt.class = 0x1;
+event->u.f_walk_eabt.addr2 = ptw_info.addr;
+break;
+case SMMU_PTW_ERR_TRANSLATION:
+if (PTW_RECORD_FAULT(cfg)) {
+event->type = SMMU_EVT_F_TRANSLATION;
+event->u.f_translation.addr = addr;
+event->u.f_translation.rnw = flag & 0x1;

[RFC PATCH v2 08/13] hw/arm/smmuv3: Support nested SMMUs in smmuv3_notify_iova()

IOMMUTLBEvent only understands IOVA. For stage-2-only SMMUs, keep the
existing implementation (notify using the stage-2 granule); in case of
nesting, only notify for stage-1 invalidations.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmuv3.c | 23 +++
 hw/arm/trace-events |  2 +-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index ece647b8bf..85b3ac6a9c 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1028,7 +1028,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
IOMMUNotifier *n,
int asid, int vmid,
dma_addr_t iova, uint8_t tg,
-   uint64_t num_pages)
+   uint64_t num_pages, int stage)
 {
 SMMUDevice *sdev = container_of(mr, SMMUDevice, iommu);
 IOMMUTLBEvent event;
@@ -1052,14 +1052,21 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
 return;
 }
 
-if (STAGE1_SUPPORTED(s)) {
+/*
+ * IOMMUTLBEvent only understands IOVA, for stage-2 only SMMUs
+ * keep the implementation, while only notify for stage-1
+ * invalidation in case of nesting.
+ */
+if (stage == SMMU_STAGE_1) {
 tt = select_tt(cfg, iova);
 if (!tt) {
 return;
 }
 granule = tt->granule_sz;
-} else {
+} else if (!STAGE1_SUPPORTED(s)) {
 granule = cfg->s2cfg.granule_sz;
+} else {
+return;
 }
 
 } else {
@@ -1078,7 +1085,7 @@ static void smmuv3_notify_iova(IOMMUMemoryRegion *mr,
 /* invalidate an asid/vmid/iova range tuple in all mr's */
 static void smmuv3_inv_notifiers_iova(SMMUState *s, int asid, int vmid,
   dma_addr_t iova, uint8_t tg,
-  uint64_t num_pages)
+  uint64_t num_pages, int stage)
 {
 SMMUDevice *sdev;
 
@@ -1087,10 +1094,10 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int 
asid, int vmid,
 IOMMUNotifier *n;
 
 trace_smmuv3_inv_notifiers_iova(mr->parent_obj.name, asid, vmid,
-iova, tg, num_pages);
+iova, tg, num_pages, stage);
 
 IOMMU_NOTIFIER_FOREACH(n, mr) {
-smmuv3_notify_iova(mr, n, asid, vmid, iova, tg, num_pages);
+smmuv3_notify_iova(mr, n, asid, vmid, iova, tg, num_pages, stage);
 }
 }
 }
@@ -1121,7 +1128,7 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd, 
SMMUStage stage)
 
 if (!tg) {
 trace_smmuv3_range_inval(vmid, asid, addr, tg, 1, ttl, leaf, stage);
-smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, 1);
+smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, 1, stage);
 if (stage == SMMU_STAGE_1) {
 smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, 1, ttl);
 } else {
@@ -1144,7 +1151,7 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd, 
SMMUStage stage)
 num_pages = (mask + 1) >> granule;
 trace_smmuv3_range_inval(vmid, asid, addr, tg, num_pages,
  ttl, leaf, stage);
-smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, num_pages);
+smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, num_pages, stage);
 if (stage == SMMU_STAGE_1) {
 smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, num_pages, ttl);
 } else {
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index 2556f4721a..53b9d11feb 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -55,7 +55,7 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d"
 smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x"
 smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu 
mr=%s"
 smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu 
mr=%s"
-smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, 
uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d 
iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64
+smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, 
uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d 
vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
 
 # strongarm.c
 strongarm_uart_update_parameters(const char *label, int speed, char parity, 
int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
-- 
2.44.0.478.gd926399ef9-goog

[RFC PATCH v2 07/13] hw/arm/smmu: Support nesting in the rest of commands

Some commands need rework for nesting, as they used to assume S1
and S2 are mutually exclusive:

- CMD_TLBI_NH_ASID: Consider VMID if stage-2 is supported
- CMD_TLBI_NH_ALL: Consider VMID if stage-2 is supported, otherwise
  invalidate everything, this required a new vmid invalidation
  function for stage-1 only (ASID >= 0)

Also, rework trace events to reflect the new implementation.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmu-common.c | 36 +---
 hw/arm/smmuv3.c  | 31 +--
 hw/arm/trace-events  |  6 --
 include/hw/arm/smmu-common.h |  3 ++-
 4 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 8b9e59b24b..b1cf1303c6 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -148,13 +148,14 @@ void smmu_iotlb_inv_all(SMMUState *s)
 g_hash_table_remove_all(s->iotlb);
 }
 
-static gboolean smmu_hash_remove_by_asid(gpointer key, gpointer value,
- gpointer user_data)
+static gboolean smmu_hash_remove_by_asid_vmid(gpointer key, gpointer value,
+  gpointer user_data)
 {
-int asid = *(int *)user_data;
+SMMUIOTLBPageInvInfo *info = (SMMUIOTLBPageInvInfo *)user_data;
 SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key;
 
-return SMMU_IOTLB_ASID(*iotlb_key) == asid;
+return (SMMU_IOTLB_ASID(*iotlb_key) == info->asid) &&
+   (SMMU_IOTLB_VMID(*iotlb_key) == info->vmid);
 }
 
 static gboolean smmu_hash_remove_by_vmid(gpointer key, gpointer value,
@@ -166,6 +167,16 @@ static gboolean smmu_hash_remove_by_vmid(gpointer key, 
gpointer value,
 return SMMU_IOTLB_VMID(*iotlb_key) == vmid;
 }
 
+static gboolean smmu_hash_remove_by_vmid_s1(gpointer key, gpointer value,
+gpointer user_data)
+{
+int vmid = *(int *)user_data;
+SMMUIOTLBKey *iotlb_key = (SMMUIOTLBKey *)key;
+
+return (SMMU_IOTLB_VMID(*iotlb_key) == vmid) &&
+   (SMMU_IOTLB_ASID(*iotlb_key) >= 0);
+}
+
 static gboolean smmu_hash_remove_by_asid_vmid_iova(gpointer key, gpointer 
value,
   gpointer user_data)
 {
@@ -259,10 +270,15 @@ void smmu_iotlb_inv_ipa(SMMUState *s, int vmid, 
dma_addr_t ipa, uint8_t tg,
 &info);
 }
 
-void smmu_iotlb_inv_asid(SMMUState *s, int asid)
+void smmu_iotlb_inv_asid_vmid(SMMUState *s, int asid, int vmid)
 {
-trace_smmu_iotlb_inv_asid(asid);
-g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid, &asid);
+SMMUIOTLBPageInvInfo info = {
+.asid = asid,
+.vmid = vmid,
+};
+
+trace_smmu_iotlb_inv_asid_vmid(asid, vmid);
+g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_asid_vmid, 
&info);
 }
 
 inline void smmu_iotlb_inv_vmid(SMMUState *s, int vmid)
@@ -271,6 +287,12 @@ inline void smmu_iotlb_inv_vmid(SMMUState *s, int vmid)
 g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_vmid, &vmid);
 }
 
+inline void smmu_iotlb_inv_vmid_s1(SMMUState *s, int vmid)
+{
+trace_smmu_iotlb_inv_vmid_s1(vmid);
+g_hash_table_foreach_remove(s->iotlb, smmu_hash_remove_by_vmid_s1, &vmid);
+}
+
 /* VMSAv8-64 Translation */
 
 /**
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 17bbd43c13..ece647b8bf 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1280,25 +1280,52 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
 case SMMU_CMD_TLBI_NH_ASID:
 {
 int asid = CMD_ASID(&cmd);
+int vmid = -1;
 
 if (!STAGE1_SUPPORTED(s)) {
 cmd_error = SMMU_CERROR_ILL;
 break;
 }
 
+/*
+ * VMID is only matched when stage 2 is supported for the Security
+ * state corresponding to the command queue that the command was
+ * issued in.
+ * QEMU ignores the field by setting to -1, similarly to what STE
+ * decoding does. And invalidation commands ignore VMID < 0.
+ */
+if (STAGE2_SUPPORTED(s)) {
+vmid = CMD_VMID(&cmd);
+}
+
 trace_smmuv3_cmdq_tlbi_nh_asid(asid);
 smmu_inv_notifiers_all(&s->smmu_state);
-smmu_iotlb_inv_asid(bs, asid);
+smmu_iotlb_inv_asid_vmid(bs, asid, vmid);
 break;
 }
 case SMMU_CMD_TLBI_NH_ALL:
+{
+int vmid = -1;
+
 if (!STAGE1_SUPPORTED(s)) {
 cmd_error = SMMU_CERROR_ILL;
 break;
 }
+
+/*
+ * If stage-2 is supported, invalidate for this VMID only, 
otherwise
+ * invalidate the whole thing, see SMMU_CMD_TLBI_NH_ASID()
+ */
+if (STAGE2_SUPPORTED(s)) {
+vmid = CMD_VMID(&cmd);
+trace_smmuv3_cmdq_tlbi_nh(vmid);
+

[RFC PATCH v2 12/13] hw/arm/smmuv3: Add property for OAS

Add a property that sets the OAS of the SMMU; it is not used in this
patch.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmuv3-internal.h |  3 ++-
 hw/arm/smmuv3.c  | 29 -
 include/hw/arm/smmuv3.h  |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index b0d7ad6da3..41612bb9ff 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -105,7 +105,8 @@ REG32(IDR5,0x14)
  FIELD(IDR5, VAX,10, 2);
  FIELD(IDR5, STALL_MAX,  16, 16);
 
-#define SMMU_IDR5_OAS 4
+#define SMMU_IDR5_OAS_DEF 4 /* 44 bits. */
+#define SMMU_IDR5_OAS_MAX 5 /* 48 bits. */
 
 REG32(IIDR,0x18)
 REG32(AIDR,0x1c)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index c377c05379..a9e35c41b7 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -299,7 +299,9 @@ static void smmuv3_init_regs(SMMUv3State *s)
 s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
 s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
 
-s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
+/* PTW doesn't support 52 bits. */
+s->oas = MIN(s->oas, SMMU_IDR5_OAS_MAX);
+s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, s->oas);
 /* 4K, 16K and 64K granule support */
 s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
 s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
@@ -1878,11 +1880,34 @@ static const VMStateDescription vmstate_gbpa = {
 }
 };
 
+static const VMStateDescription vmstate_oas = {
+.name = "smmuv3/oas",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (const VMStateField[]) {
+VMSTATE_INT32(oas, SMMUv3State),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static int smmuv3_preload(void *opaque)
+{
+SMMUv3State *s = opaque;
+
+/*
+ * In case it wasn't migrated, use the value used
+ * by older QEMU.
+ */
+s->oas = SMMU_IDR5_OAS_DEF;
+return 0;
+}
+
 static const VMStateDescription vmstate_smmuv3 = {
 .name = "smmuv3",
 .version_id = 1,
 .minimum_version_id = 1,
 .priority = MIG_PRI_IOMMU,
+.pre_load = smmuv3_preload,
 .fields = (const VMStateField[]) {
 VMSTATE_UINT32(features, SMMUv3State),
 VMSTATE_UINT8(sid_size, SMMUv3State),
@@ -1910,6 +1935,7 @@ static const VMStateDescription vmstate_smmuv3 = {
 },
 .subsections = (const VMStateDescription * const []) {
 &vmstate_gbpa,
+&vmstate_oas,
 NULL
 }
 };
@@ -1922,6 +1948,7 @@ static Property smmuv3_properties[] = {
  * Defaults to stage 1
  */
 DEFINE_PROP_STRING("stage", SMMUv3State, stage),
+DEFINE_PROP_INT32("oas", SMMUv3State, oas, SMMU_IDR5_OAS_DEF),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h
index d183a62766..00a9eb4467 100644
--- a/include/hw/arm/smmuv3.h
+++ b/include/hw/arm/smmuv3.h
@@ -63,6 +63,7 @@ struct SMMUv3State {
 qemu_irq irq[4];
 QemuMutex mutex;
 char *stage;
+int32_t oas;
 };
 
 typedef enum {
-- 
2.44.0.478.gd926399ef9-goog

[RFC PATCH v2 11/13] hw/arm/smmu: Refactor SMMU OAS

SMMUv3 OAS is hardcoded to 44 bits, for nested configurations that
can be a problem as stage-2 might be shared with the CPU which might
have different PARANGE, and according to SMMU manual ARM IHI 0070F.b:
6.3.6 SMMU_IDR5, OAS must match the system physical address size.

This patch doesn't change the SMMU OAS, but refactors the code to
make it easier to do that:
- Rely everywhere on IDR5 for reading OAS instead of using the macro, so
  it is easier to just change IDR5 and have it propagate correctly.
- Remove unused functions/macros: pa_range/MAX_PA

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmu-common.c |  7 ---
 hw/arm/smmuv3-internal.h | 13 -
 hw/arm/smmuv3.c  | 35 ---
 3 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index b1cf1303c6..0710ee6b7d 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -430,7 +430,8 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
 inputsize = 64 - tt->tsz;
 level = 4 - (inputsize - 4) / stride;
 indexmask = VMSA_IDXMSK(inputsize, stride, level);
-baseaddr = extract64(tt->ttb, 0, 48);
+
+baseaddr = extract64(tt->ttb, 0, cfg->oas);
 baseaddr &= ~indexmask;
 
 while (level < VMSA_LEVELS) {
@@ -543,8 +544,8 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
  * Get the ttb from concatenated structure.
  * The offset is the idx * size of each ttb(number of ptes * (sizeof(pte))
  */
-uint64_t baseaddr = extract64(cfg->s2cfg.vttb, 0, 48) + (1 << stride) *
-  idx * sizeof(uint64_t);
+uint64_t baseaddr = extract64(cfg->s2cfg.vttb, 0, cfg->s2cfg.eff_ps) +
+  (1 << stride) * idx * sizeof(uint64_t);
 dma_addr_t indexmask = VMSA_IDXMSK(inputsize, stride, level);
 
 baseaddr &= ~indexmask;
diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index e4dd11e1e6..b0d7ad6da3 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -596,19 +596,6 @@ static inline int oas2bits(int oas_field)
 return -1;
 }
 
-static inline int pa_range(STE *ste)
-{
-int oas_field = MIN(STE_S2PS(ste), SMMU_IDR5_OAS);
-
-if (!STE_S2AA64(ste)) {
-return 40;
-}
-
-return oas2bits(oas_field);
-}
-
-#define MAX_PA(ste) ((1 << pa_range(ste)) - 1)
-
 /* CD fields */
 
 #define CD_VALID(x)   extract32((x)->word[0], 31, 1)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 0e367c70ad..c377c05379 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -411,10 +411,10 @@ static bool s2t0sz_valid(SMMUTransCfg *cfg)
 }
 
 if (cfg->s2cfg.granule_sz == 16) {
-return (cfg->s2cfg.tsz >= 64 - oas2bits(SMMU_IDR5_OAS));
+return (cfg->s2cfg.tsz >= 64 - cfg->s2cfg.eff_ps);
 }
 
-return (cfg->s2cfg.tsz >= MAX(64 - oas2bits(SMMU_IDR5_OAS), 16));
+return (cfg->s2cfg.tsz >= MAX(64 - cfg->s2cfg.eff_ps, 16));
 }
 
 /*
@@ -435,8 +435,11 @@ static bool s2_pgtable_config_valid(uint8_t sl0, uint8_t 
t0sz, uint8_t gran)
 return nr_concat <= VMSA_MAX_S2_CONCAT;
 }
 
-static int decode_ste_s2_cfg(SMMUTransCfg *cfg, STE *ste)
+static int decode_ste_s2_cfg(SMMUv3State *s, SMMUTransCfg *cfg,
+ STE *ste)
 {
+uint8_t oas = FIELD_EX32(s->idr[5], IDR5, OAS);
+
 if (STE_S2AA64(ste) == 0x0) {
 qemu_log_mask(LOG_UNIMP,
   "SMMUv3 AArch32 tables not supported\n");
@@ -469,7 +472,15 @@ static int decode_ste_s2_cfg(SMMUTransCfg *cfg, STE *ste)
 }
 
 /* For AA64, The effective S2PS size is capped to the OAS. */
-cfg->s2cfg.eff_ps = oas2bits(MIN(STE_S2PS(ste), SMMU_IDR5_OAS));
+cfg->s2cfg.eff_ps = oas2bits(MIN(STE_S2PS(ste), oas));
+/*
+ * For SMMUv3.1 and later, when OAS == IAS == 52, the stage 2 input
+ * range is further limited to 48 bits unless STE.S2TG indicates a
+ * 64KB granule.
+ */
+if (cfg->s2cfg.granule_sz != 16) {
+cfg->s2cfg.eff_ps = MIN(cfg->s2cfg.eff_ps, 48);
+}
 /*
  * It is ILLEGAL for the address in S2TTB to be outside the range
  * described by the effective S2PS value.
@@ -545,6 +556,7 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
   STE *ste, SMMUEventInfo *event)
 {
 uint32_t config;
+uint8_t oas = FIELD_EX32(s->idr[5], IDR5, OAS);
 int ret;
 
 if (!STE_VALID(ste)) {
@@ -588,8 +600,8 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
  * Stage-1 OAS defaults to OAS even if not enabled as it would be used
  * in input address check for stage-2.
  */
-cfg->oas = oas2bits(SMMU_IDR5_OAS);
-ret = decode_ste_s2_cfg(cfg, ste);
+cfg->oas = oas2bits(oas);
+ret = decode_ste_s2_cfg(s, cfg, ste);
 if (ret) {
 goto bad_ste;
 }
@@ -715,6 +727,7 @@ static int decode_cd(SMMUv3State *s, SMMUTransCfg *cfg,
 int i;
 SMMUTranslationStatus status;

[RFC PATCH v2 09/13] hw/arm/smmuv3: Support and advertise nesting

Everything is in place, add the last missing bits:
- Handle fault checking according to the actual PTW event and not
  the translation stage.
- Consolidate parsing of STE cfg and setting translation stage.

Advertise nesting if stage requested is "nested".

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmuv3.c | 50 +
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 85b3ac6a9c..da47411410 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -34,9 +34,10 @@
 #include "smmuv3-internal.h"
 #include "smmu-internal.h"
 
-#define PTW_RECORD_FAULT(cfg)   (((cfg)->stage == SMMU_STAGE_1) ? \
- (cfg)->record_faults : \
- (cfg)->s2cfg.record_faults)
+#define PTW_RECORD_FAULT(ptw_info, cfg) (((ptw_info).stage == SMMU_STAGE_1 && \
+(cfg)->record_faults) || \
+((ptw_info).stage == SMMU_STAGE_2 && \
+(cfg)->s2cfg.record_faults))
 
 /**
  * smmuv3_trigger_irq - pulse @irq if enabled and update
@@ -260,6 +261,9 @@ static void smmuv3_init_regs(SMMUv3State *s)
 /* Based on sys property, the stages supported in smmu will be 
advertised.*/
 if (s->stage && !strcmp("2", s->stage)) {
 s->idr[0] = FIELD_DP32(s->idr[0], IDR0, S2P, 1);
+} else if (s->stage && !strcmp("nested", s->stage)) {
+s->idr[0] = FIELD_DP32(s->idr[0], IDR0, S1P, 1);
+s->idr[0] = FIELD_DP32(s->idr[0], IDR0, S2P, 1);
 } else {
 s->idr[0] = FIELD_DP32(s->idr[0], IDR0, S1P, 1);
 }
@@ -425,8 +429,6 @@ static bool s2_pgtable_config_valid(uint8_t sl0, uint8_t 
t0sz, uint8_t gran)
 
 static int decode_ste_s2_cfg(SMMUTransCfg *cfg, STE *ste)
 {
-cfg->stage = SMMU_STAGE_2;
-
 if (STE_S2AA64(ste) == 0x0) {
 qemu_log_mask(LOG_UNIMP,
   "SMMUv3 AArch32 tables not supported\n");
@@ -509,6 +511,27 @@ bad_ste:
 return -EINVAL;
 }
 
+static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config)
+{
+
+if (STE_CFG_ABORT(config)) {
+cfg->aborted = true;
+return;
+}
+if (STE_CFG_BYPASS(config)) {
+cfg->bypassed = true;
+return;
+}
+
+if (STE_CFG_S1_ENABLED(config)) {
+cfg->stage = SMMU_STAGE_1;
+}
+
+if (STE_CFG_S2_ENABLED(config)) {
+cfg->stage |= SMMU_STAGE_2;
+}
+}
+
 /* Returns < 0 in case of invalid STE, 0 otherwise */
 static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
   STE *ste, SMMUEventInfo *event)
@@ -525,13 +548,9 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
 
 config = STE_CONFIG(ste);
 
-if (STE_CFG_ABORT(config)) {
-cfg->aborted = true;
-return 0;
-}
+decode_ste_config(cfg, config);
 
-if (STE_CFG_BYPASS(config)) {
-cfg->bypassed = true;
+if (cfg->aborted || cfg->bypassed) {
 return 0;
 }
 
@@ -704,7 +723,6 @@ static int decode_cd(SMMUv3State *s, SMMUTransCfg *cfg,
 
 /* we support only those at the moment */
 cfg->aa64 = true;
-cfg->stage = SMMU_STAGE_1;
 
 cfg->oas = oas2bits(CD_IPS(cd));
 cfg->oas = MIN(oas2bits(SMMU_IDR5_OAS), cfg->oas);
@@ -887,28 +905,28 @@ static SMMUTranslationStatus 
smmuv3_do_translate(SMMUv3State *s, hwaddr addr,
 event->u.f_walk_eabt.addr2 = ptw_info.addr;
 break;
 case SMMU_PTW_ERR_TRANSLATION:
-if (PTW_RECORD_FAULT(cfg)) {
+if (PTW_RECORD_FAULT(ptw_info, cfg)) {
 event->type = SMMU_EVT_F_TRANSLATION;
 event->u.f_translation.addr = addr;
 event->u.f_translation.rnw = flag & 0x1;
 }
 break;
 case SMMU_PTW_ERR_ADDR_SIZE:
-if (PTW_RECORD_FAULT(cfg)) {
+if (PTW_RECORD_FAULT(ptw_info, cfg)) {
 event->type = SMMU_EVT_F_ADDR_SIZE;
 event->u.f_addr_size.addr = addr;
 event->u.f_addr_size.rnw = flag & 0x1;
 }
 break;
 case SMMU_PTW_ERR_ACCESS:
-if (PTW_RECORD_FAULT(cfg)) {
+if (PTW_RECORD_FAULT(ptw_info, cfg)) {
 event->type = SMMU_EVT_F_ACCESS;
 event->u.f_access.addr = addr;
 event->u.f_access.rnw = flag & 0x1;
 }
 break;
 case SMMU_PTW_ERR_PERMISSION:
-if (PTW_RECORD_FAULT(cfg)) {
+if (PTW_RECORD_FAULT(ptw_info, cfg)) {
 event->type = SMMU_EVT_F_PERMISSION;
 event->u.f_permission.addr = addr;
 event->u.f_permission.rnw = flag & 0x1;
-- 
2.44.0.478.gd926399ef9-goog

[RFC PATCH v2 06/13] hw/arm/smmu: Support nesting in smmuv3_range_inval()

With nesting, we would need to invalidate IPAs without
over-invalidating stage-1 IOVAs. This can be done by
distinguishing IPAs in the TLBs by having ASID=-1.
To achieve that, rework the invalidation for IPAs to have a
separate function, while for IOVA invalidation ASID=-1 means
invalidate for all ASIDs.

Signed-off-by: Mostafa Saleh 
---
 hw/arm/smmu-common.c | 47 
 hw/arm/smmuv3.c  | 23 --
 hw/arm/trace-events  |  2 +-
 include/hw/arm/smmu-common.h |  3 ++-
 4 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 2cf27b490b..8b9e59b24b 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -184,6 +184,25 @@ static gboolean 
smmu_hash_remove_by_asid_vmid_iova(gpointer key, gpointer value,
((entry->iova & ~info->mask) == info->iova);
 }
 
+static gboolean smmu_hash_remove_by_vmid_ipa(gpointer key, gpointer value,
+ gpointer user_data)
+{
+SMMUTLBEntry *iter = (SMMUTLBEntry *)value;
+IOMMUTLBEntry *entry = &iter->entry;
+SMMUIOTLBPageInvInfo *info = (SMMUIOTLBPageInvInfo *)user_data;
+SMMUIOTLBKey iotlb_key = *(SMMUIOTLBKey *)key;
+
+/* This is a stage-1 address. */
+if (info->asid >= 0) {
+return false;
+}
+if (info->vmid != SMMU_IOTLB_VMID(iotlb_key)) {
+return false;
+}
+return ((info->iova & ~entry->addr_mask) == entry->iova) ||
+   ((entry->iova & ~info->mask) == info->iova);
+}
+
 void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova,
  uint8_t tg, uint64_t num_pages, uint8_t ttl)
 {
@@ -212,6 +231,34 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, 
dma_addr_t iova,
 &info);
 }
 
+/*
+ * Similar to smmu_iotlb_inv_iova(), but for Stage-2, ASID is always -1,
+ * in Stage-1 invalidation ASID = -1, means don't care.
+ */
+void smmu_iotlb_inv_ipa(SMMUState *s, int vmid, dma_addr_t ipa, uint8_t tg,
+uint64_t num_pages, uint8_t ttl)
+{
+uint8_t granule = tg ? tg * 2 + 10 : 12;
+int asid = -1;
+
+   if (ttl && (num_pages == 1)) {
+SMMUIOTLBKey key = smmu_get_iotlb_key(asid, vmid, ipa, tg, ttl);
+
+if (g_hash_table_remove(s->iotlb, &key)) {
+return;
+}
+}
+
+SMMUIOTLBPageInvInfo info = {
+.iova = ipa,
+.vmid = vmid,
+.mask = (num_pages * 1 << granule) - 1};
+
+g_hash_table_foreach_remove(s->iotlb,
+smmu_hash_remove_by_vmid_ipa,
+&info);
+}
+
 void smmu_iotlb_inv_asid(SMMUState *s, int asid)
 {
 trace_smmu_iotlb_inv_asid(asid);
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index a7cf543acc..17bbd43c13 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1095,7 +1095,7 @@ static void smmuv3_inv_notifiers_iova(SMMUState *s, int 
asid, int vmid,
 }
 }
 
-static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
+static void smmuv3_range_inval(SMMUState *s, Cmd *cmd, SMMUStage stage)
 {
 dma_addr_t end, addr = CMD_ADDR(cmd);
 uint8_t type = CMD_TYPE(cmd);
@@ -1120,9 +1120,13 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
 }
 
 if (!tg) {
-trace_smmuv3_range_inval(vmid, asid, addr, tg, 1, ttl, leaf);
+trace_smmuv3_range_inval(vmid, asid, addr, tg, 1, ttl, leaf, stage);
 smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, 1);
-smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, 1, ttl);
+if (stage == SMMU_STAGE_1) {
+smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, 1, ttl);
+} else {
+smmu_iotlb_inv_ipa(s, vmid, addr, tg, 1, ttl);
+}
 return;
 }
 
@@ -1138,9 +1142,14 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
 uint64_t mask = dma_aligned_pow2_mask(addr, end, 64);
 
 num_pages = (mask + 1) >> granule;
-trace_smmuv3_range_inval(vmid, asid, addr, tg, num_pages, ttl, leaf);
+trace_smmuv3_range_inval(vmid, asid, addr, tg, num_pages,
+ ttl, leaf, stage);
 smmuv3_inv_notifiers_iova(s, asid, vmid, addr, tg, num_pages);
-smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, num_pages, ttl);
+if (stage == SMMU_STAGE_1) {
+smmu_iotlb_inv_iova(s, asid, vmid, addr, tg, num_pages, ttl);
+} else {
+smmu_iotlb_inv_ipa(s, vmid, addr, tg, num_pages, ttl);
+}
 addr += mask + 1;
 }
 }
@@ -1299,7 +1308,7 @@ static int smmuv3_cmdq_consume(SMMUv3State *s)
 cmd_error = SMMU_CERROR_ILL;
 break;
 }
-smmuv3_range_inval(bs, &cmd);
+smmuv3_range_inval(bs, &cmd, SMMU_STAGE_1);
 break;
 case SMMU_CMD_TLBI_S12_VMALL:
 {
@@ -1324,7 +1333,7 @@ static int smmuv3_cmdq_consume(SMMUv3Stat

Re: [PATCH v4] nbd/server: do not poll within a coroutine context

2024-04-08 Thread Eric Blake

On Mon, Apr 08, 2024 at 11:46:39AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> On 05.04.24 20:44, Eric Blake wrote:
> > From: Zhu Yangyang 
> > 
> > Coroutines are not supposed to block. Instead, they should yield.
> > 
> > The client performs TLS upgrade outside of an AIOContext, during
> > synchronous handshake; this still requires g_main_loop.  But the
> > server responds to TLS upgrade inside a coroutine, so a nested
> > g_main_loop is wrong.  Since the two callbacks no longer share more
> > than the setting of data.complete and data.error, it's just as easy to
> > use static helpers instead of trying to share a common code path.
> > 
> > Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation")
> > Signed-off-by: Zhu Yangyang 
> > [eblake: move callbacks to their use point]
> > Signed-off-by: Eric Blake 
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

I'm debating whether it is worth trying to shove this into 9.0; -rc3
is very late, and the problem is pre-existing, so I'm leaning towards
no.  At which point, it's better to get this right.

> 
> still, some notes below
> 
> > ---
> > 
> > v3: https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00375.html
> > 
> > in v4, factor even the struct to the .c files, avoiding a union [Vladimir]
> > 
> >   nbd/nbd-internal.h | 10 --
> >   nbd/client.c   | 27 +++
> >   nbd/common.c   | 11 ---
> >   nbd/server.c   | 29 +++--
> >   4 files changed, 46 insertions(+), 31 deletions(-)
> > 

> > +++ b/nbd/client.c
> > @@ -596,13 +596,31 @@ static int nbd_request_simple_option(QIOChannel *ioc, 
> > int opt, bool strict,
> >   return 1;
> >   }
> > 
> > +/* Callback to learn when QIO TLS upgrade is complete */
> > +struct NBDTLSClientHandshakeData {
> > +bool complete;
> > +Error *error;
> > +GMainLoop *loop;
> > +};
> > +
> > +static void nbd_client_tls_handshake(QIOTask *task, void *opaque)
> > +{
> > +struct NBDTLSClientHandshakeData *data = opaque;
> > +
> > +qio_task_propagate_error(task, &data->error);
> > +data->complete = true;
> > +if (data->loop) {
> > +g_main_loop_quit(data->loop);
> > +}
> > +}
> > +
> >   static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
> >   QCryptoTLSCreds *tlscreds,
> >   const char *hostname, Error 
> > **errp)
> >   {
> >   int ret;
> >   QIOChannelTLS *tioc;
> > -struct NBDTLSHandshakeData data = { 0 };
> > +struct NBDTLSClientHandshakeData data = { 0 };
> > 
> >   ret = nbd_request_simple_option(ioc, NBD_OPT_STARTTLS, true, errp);
> >   if (ret <= 0) {
> > @@ -619,18 +637,19 @@ static QIOChannel *nbd_receive_starttls(QIOChannel 
> > *ioc,
> >   return NULL;
> >   }
> >   qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-client-tls");
> > -data.loop = g_main_loop_new(g_main_context_default(), FALSE);
> >   trace_nbd_receive_starttls_tls_handshake();
> >   qio_channel_tls_handshake(tioc,
> > -  nbd_tls_handshake,
> > +  nbd_client_tls_handshake,
> > &data,
> > NULL,
> > NULL);
> > 
> >   if (!data.complete) {
> > +data.loop = g_main_loop_new(g_main_context_default(), FALSE);
> >   g_main_loop_run(data.loop);
> > +g_main_loop_unref(data.loop);
> 
> probably good to assert(data.complete);

Seems reasonable.

> > +++ b/nbd/server.c
> > @@ -748,6 +748,23 @@ static int nbd_negotiate_handle_info(NBDClient 
> > *client, Error **errp)
> >   return rc;
> >   }
> > 
> > +/* Callback to learn when QIO TLS upgrade is complete */
> > +struct NBDTLSServerHandshakeData {
> > +bool complete;
> > +Error *error;
> > +Coroutine *co;
> > +};
> > +
> > +static void nbd_server_tls_handshake(QIOTask *task, void *opaque)
> > +{
> > +struct NBDTLSServerHandshakeData *data = opaque;
> > +
> > +qio_task_propagate_error(task, &data->error);
> > +data->complete = true;
> > +if (!qemu_coroutine_entered(data->co)) {
> > +aio_co_wake(data->co);
> > +}
> > +}
> > 
> >   /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
> >* new channel for all further (now-encrypted) communication. */
> > @@ -756,7 +773,7 @@ static QIOChannel 
> > *nbd_negotiate_handle_starttls(NBDClient *client,
> >   {
> >   QIOChannel *ioc;
> >   QIOChannelTLS *tioc;
> > -struct NBDTLSHandshakeData data = { 0 };
> > +struct NBDTLSServerHandshakeData data = { 0 };
> > 
> >   assert(client->opt == NBD_OPT_STARTTLS);
> > 
> > @@ -777,17 +794,17 @@ static QIOChannel 
> > *nbd_negotiate_handle_starttls(NBDClient *client,
> 
> preexisting: lack coroutine_fn, as well as caller nbd_negotiate_options()

Indeed, so now would not hurt to add them now that a callback is no

[PATCH-for-9.0? 0/2] hw/sd/sdcard: Avoid OOB in sd_read_byte()

Since this is Fix day, I went over this old bug:
https://gitlab.com/qemu-project/qemu/-/issues/487
It happens to be a QEMU implementation detail not
really related to the spec.

Philippe Mathieu-Daudé (2):
  hw/sd/sdcard: Avoid OOB in sd_read_byte() during unexpected CMD switch
  hw/sd/sdcard: Assert @data_offset is in range

 hw/sd/sd.c | 20 
 1 file changed, 20 insertions(+)

-- 
2.41.0

[RFC PATCH-for-9.0? 1/2] hw/sd/sdcard: Avoid OOB in sd_read_byte() during unexpected CMD switch

For multi-byte commands, our implementation uses the @data_start
and @data_offset fields to track byte access. We initialize the
command start/offset in the buffer only once. A malicious guest might
abuse this by switching commands while staying in the 'transfer' state,
thereby switching the command buffer size, so that our implementation
accesses data outside the buffer boundary. For example, CMD17
(READ_SINGLE_BLOCK) allows reading up to 512 bytes, and CMD13
(SEND_STATUS) up to 64 bytes. By switching from CMD17 to CMD13 (see
reproducer below), bytes [64-511] are out of the 'status' buffer.

Our implementation returns the R0 status code for unexpected commands.
Such an in-transaction command switch is unexpected and returns R0.
This is a good place to reset the start/offset fields to avoid
malicious accesses.

Can be reproduced running:

  $ export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1
  $ cat << EOF | qemu-system-i386 \
 -display none -nographic \
 -machine accel=qtest -m 512M \
 -nodefaults \
 -device sdhci-pci,sd-spec-version=3 \
 -device sd-card,drive=mydrive \
 -drive 
if=none,index=0,file=null-co://,format=raw,id=mydrive \
 -qtest stdio -trace sd\* -trace -sdbus_read
  outl 0xcf8 0x80001010
  outl 0xcfc 0xe000
  outl 0xcf8 0x80001004
  outw 0xcfc 0x02
  write 0xe02c 0x1 0x05
  write 0xe00f 0x1 0x37
  write 0xe00a 0x1 0x01
  write 0xe00f 0x1 0x29
  write 0xe00f 0x1 0x02
  write 0xe00f 0x1 0x03
  write 0xe00c 0x1 0x32
  write 0xe00f 0x1 0x06
  write 0xe005 0x1 0x01
  write 0xe007 0x1 0x01
  write 0xe003 0x1 0x00
  write 0xe00f 0x1 0x11
  write 0xe02a 0x1 0x01
  write 0xe02a 0x1 0x02
  write 0xe00f 0x1 0x0d
  write 0xe02a 0x1 0x01
  write 0xe02a 0x1 0x02
  EOF
  hw/sd/sd.c:1984:15: runtime error: index 256 out of bounds for type 'uint8_t 
[64]'
  #0 sd_read_byte hw/sd/sd.c:1984:15
  #1 sdbus_read_data hw/sd/core.c:157:23
  #2 sdhci_read_block_from_card hw/sd/sdhci.c:423:9
  #3 sdhci_blkgap_write hw/sd/sdhci.c:1074:13
  #4 sdhci_write hw/sd/sdhci.c:1195:13
  #5 memory_region_write_accessor softmmu/memory.c:492:5
  #6 access_with_adjusted_size softmmu/memory.c:554:18
  #7 memory_region_dispatch_write softmmu/memory.c
  #8 flatview_write_continue softmmu/physmem.c:2778:23
  #9 flatview_write softmmu/physmem.c:2818:14
  #10 address_space_write softmmu/physmem.c:2910:18
  SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior hw/sd/sd.c:1984:15

Reported-by: Alexander Bulekov 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/487
Buglink: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=36240
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 807b5d3de3..16d8d52a78 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1826,6 +1826,12 @@ send_response:
 break;
 
 case sd_r0:
+/*
+ * Invalid state transition, reset implementation
+ * fields to avoid OOB abuse.
+ */
+sd->data_start = 0;
+sd->data_offset = 0;
 case sd_illegal:
 rsplen = 0;
 break;
-- 
2.41.0

[PATCH-for-9.1 2/2] hw/sd/sdcard: Assert @data_offset is in range

Prevent out-of-bound access with assertions.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 16d8d52a78..c081211582 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1875,6 +1875,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 sd->current_cmd, value);
 switch (sd->current_cmd) {
 case 24:  /* CMD24:  WRITE_SINGLE_BLOCK */
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset ++] = value;
 if (sd->data_offset >= sd->blk_len) {
 /* TODO: Check CRC before committing */
@@ -1901,6 +1902,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 }
 }
 }
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset++] = value;
 if (sd->data_offset >= sd->blk_len) {
 /* TODO: Check CRC before committing */
@@ -1925,6 +1927,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 break;
 
 case 26:  /* CMD26:  PROGRAM_CID */
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset ++] = value;
 if (sd->data_offset >= sizeof(sd->cid)) {
 /* TODO: Check CRC before committing */
@@ -1944,6 +1947,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 break;
 
 case 27:  /* CMD27:  PROGRAM_CSD */
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset ++] = value;
 if (sd->data_offset >= sizeof(sd->csd)) {
 /* TODO: Check CRC before committing */
@@ -1968,6 +1972,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 break;
 
 case 42:  /* CMD42:  LOCK_UNLOCK */
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset ++] = value;
 if (sd->data_offset >= sd->blk_len) {
 /* TODO: Check CRC before committing */
@@ -1979,6 +1984,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
 break;
 
 case 56:  /* CMD56:  GEN_CMD */
+assert(sd->data_offset < sizeof(sd->data));
 sd->data[sd->data_offset ++] = value;
 if (sd->data_offset >= sd->blk_len) {
 APP_WRITE_BLOCK(sd->data_start, sd->data_offset);
@@ -2046,6 +2052,7 @@ uint8_t sd_read_byte(SDState *sd)
 break;
 
 case 13:  /* ACMD13: SD_STATUS */
+assert(sd->data_offset < sizeof(sd->sd_status));
 ret = sd->sd_status[sd->data_offset ++];
 
 if (sd->data_offset >= sizeof(sd->sd_status))
@@ -2055,6 +2062,7 @@ uint8_t sd_read_byte(SDState *sd)
 case 17:  /* CMD17:  READ_SINGLE_BLOCK */
 if (sd->data_offset == 0)
 BLK_READ_BLOCK(sd->data_start, io_len);
+assert(sd->data_offset < sizeof(sd->data));
 ret = sd->data[sd->data_offset ++];
 
 if (sd->data_offset >= io_len)
@@ -2069,6 +2077,7 @@ uint8_t sd_read_byte(SDState *sd)
 }
 BLK_READ_BLOCK(sd->data_start, io_len);
 }
+assert(sd->data_offset < sizeof(sd->data));
 ret = sd->data[sd->data_offset ++];
 
 if (sd->data_offset >= io_len) {
@@ -2089,10 +2098,12 @@ uint8_t sd_read_byte(SDState *sd)
 if (sd->data_offset >= SD_TUNING_BLOCK_SIZE - 1) {
 sd->state = sd_transfer_state;
 }
+assert(sd->data_offset < sizeof(sd_tuning_block_pattern));
 ret = sd_tuning_block_pattern[sd->data_offset++];
 break;
 
 case 22:  /* ACMD22: SEND_NUM_WR_BLOCKS */
+assert(sd->data_offset < sizeof(sd->sd_status));
 ret = sd->data[sd->data_offset ++];
 
 if (sd->data_offset >= 4)
@@ -2100,6 +2111,7 @@ uint8_t sd_read_byte(SDState *sd)
 break;
 
 case 30:  /* CMD30:  SEND_WRITE_PROT */
+assert(sd->data_offset < sizeof(sd->data));
 ret = sd->data[sd->data_offset ++];
 
 if (sd->data_offset >= 4)
@@ -2107,6 +2119,7 @@ uint8_t sd_read_byte(SDState *sd)
 break;
 
 case 51:  /* ACMD51: SEND_SCR */
+assert(sd->data_offset < sizeof(sd->scr));
 ret = sd->scr[sd->data_offset ++];
 
 if (sd->data_offset >= sizeof(sd->scr))
@@ -2116,6 +2129,7 @@ uint8_t sd_read_byte(SDState *sd)
 case 56:  /* CMD56:  GEN_CMD */
 if (sd->data_offset == 0)
 APP_READ_BLOCK(sd->data_start, sd->blk_len);
+assert(sd->data_offset < sizeof(sd->data));
 ret = sd->data[sd->data_offset ++];
 
 if (sd->data_offset >= sd->blk_len)
-- 
2.41.0

Re: [PATCH-for-9.0? 2/2] hw/net/lan9118: Fix overflow in TX FIFO