Re: [PATCH 2/2] vmmouse: use explicit code

2023-09-11 Thread Marc-André Lureau
Hi

On Tue, Aug 1, 2023 at 1:40 PM  wrote:
>
> From: Marc-André Lureau 
>
> It's weird to shift x & y without obvious reason. Let's make this more
> explicit and future-proof.
>
> Signed-off-by: Marc-André Lureau 

ping

> ---
>  hw/i386/vmmouse.c | 13 +++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c
> index fce13a5cde..cd9ac11afc 100644
> --- a/hw/i386/vmmouse.c
> +++ b/hw/i386/vmmouse.c
> @@ -46,6 +46,11 @@
>
>  #define VMMOUSE_VERSION 0x3442554a
>
> +#define VMMOUSE_MIN_X 0
> +#define VMMOUSE_MIN_Y 0
> +#define VMMOUSE_MAX_X 0xFFFF
> +#define VMMOUSE_MAX_Y 0xFFFF
> +
>  #define TYPE_VMMOUSE "vmmouse"
>  OBJECT_DECLARE_SIMPLE_TYPE(VMMouseState, VMMOUSE)
>
> @@ -106,8 +111,12 @@ static void vmmouse_mouse_event(void *opaque, int x, int 
> y, int dz, int buttons_
>  buttons |= 0x08;
>
>  if (s->absolute) {
> -x <<= 1;
> -y <<= 1;
> +x = qemu_input_scale_axis(x,
> +  INPUT_EVENT_ABS_MIN, INPUT_EVENT_ABS_MAX,
> +  VMMOUSE_MIN_X, VMMOUSE_MAX_X);
> +y = qemu_input_scale_axis(y,
> +  INPUT_EVENT_ABS_MIN, INPUT_EVENT_ABS_MAX,
> +  VMMOUSE_MIN_Y, VMMOUSE_MAX_Y);
>  }
>
>  s->queue[s->nb_queue++] = buttons;
> --
> 2.41.0
>
>


-- 
Marc-André Lureau



Re: [PATCH v2 05/19] target/riscv/cpu.c: add .instance_post_init()

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 06:16:32AM -0300, Daniel Henrique Barboza wrote:
> All generic CPUs call riscv_cpu_add_user_properties(). The 'max' CPU
> calls riscv_init_max_cpu_extensions(). Both can be moved to a common
> instance_post_init() callback, implemented in riscv_cpu_post_init(),
> called by all CPUs. The call order then becomes:
> 
> riscv_cpu_init() -> cpu_init() of each CPU -> .instance_post_init()
> 
> In the near future riscv_cpu_post_init() will call the init() function
> of the current accelerator, providing a hook for KVM and TCG accel
> classes to change the init() process of the CPU.

Yes, this seems to be what x86 does, so presumably it'll work for riscv
too.

> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu.c | 42 --
>  1 file changed, 32 insertions(+), 10 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 7569955c7e..4c6d595067 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -427,8 +427,6 @@ static void riscv_max_cpu_init(Object *obj)
>  mlx = MXL_RV32;
>  #endif
>  set_misa(env, mlx, 0);
> -riscv_cpu_add_user_properties(obj);
> -riscv_init_max_cpu_extensions(obj);
>  env->priv_ver = PRIV_VERSION_LATEST;
>  #ifndef CONFIG_USER_ONLY
>  set_satp_mode_max_supported(RISCV_CPU(obj), mlx == MXL_RV32 ?
> @@ -442,7 +440,6 @@ static void rv64_base_cpu_init(Object *obj)
>  CPURISCVState *env = &RISCV_CPU(obj)->env;
>  /* We set this in the realise function */
>  set_misa(env, MXL_RV64, 0);
> -riscv_cpu_add_user_properties(obj);
>  /* Set latest version of privileged specification */
>  env->priv_ver = PRIV_VERSION_LATEST;
>  #ifndef CONFIG_USER_ONLY
> @@ -566,7 +563,6 @@ static void rv128_base_cpu_init(Object *obj)
>  CPURISCVState *env = &RISCV_CPU(obj)->env;
>  /* We set this in the realise function */
>  set_misa(env, MXL_RV128, 0);
> -riscv_cpu_add_user_properties(obj);
>  /* Set latest version of privileged specification */
>  env->priv_ver = PRIV_VERSION_LATEST;
>  #ifndef CONFIG_USER_ONLY
> @@ -579,7 +575,6 @@ static void rv32_base_cpu_init(Object *obj)
>  CPURISCVState *env = &RISCV_CPU(obj)->env;
>  /* We set this in the realise function */
>  set_misa(env, MXL_RV32, 0);
> -riscv_cpu_add_user_properties(obj);
>  /* Set latest version of privileged specification */
>  env->priv_ver = PRIV_VERSION_LATEST;
>  #ifndef CONFIG_USER_ONLY
> @@ -1215,6 +1210,37 @@ static void riscv_cpu_set_irq(void *opaque, int irq, 
> int level)
>  }
>  #endif /* CONFIG_USER_ONLY */
>  
> +static bool riscv_cpu_is_dynamic(Object *cpu_obj)
> +{
> +return object_dynamic_cast(cpu_obj, TYPE_RISCV_DYNAMIC_CPU) != NULL;
> +}
> +
> +static bool riscv_cpu_has_max_extensions(Object *cpu_obj)
> +{
> +return object_dynamic_cast(cpu_obj, TYPE_RISCV_CPU_MAX) != NULL;
> +}
> +
> +static bool riscv_cpu_has_user_properties(Object *cpu_obj)
> +{
> +if (kvm_enabled() &&
> +object_dynamic_cast(cpu_obj, TYPE_RISCV_CPU_HOST) != NULL) {
> +return true;
> +}
> +
> +return riscv_cpu_is_dynamic(cpu_obj);
> +}
> +
> +static void riscv_cpu_post_init(Object *obj)
> +{
> +if (riscv_cpu_has_user_properties(obj)) {
> +riscv_cpu_add_user_properties(obj);
> +}
> +
> +if (riscv_cpu_has_max_extensions(obj)) {
> +riscv_init_max_cpu_extensions(obj);
> +}
> +}
> +
>  static void riscv_cpu_init(Object *obj)
>  {
>  RISCVCPU *cpu = RISCV_CPU(obj);
> @@ -1770,11 +1796,6 @@ static const struct SysemuCPUOps riscv_sysemu_ops = {
>  };
>  #endif
>  
> -static bool riscv_cpu_is_dynamic(Object *cpu_obj)
> -{
> -return object_dynamic_cast(cpu_obj, TYPE_RISCV_DYNAMIC_CPU) != NULL;
> -}
> -
>  static void cpu_set_mvendorid(Object *obj, Visitor *v, const char *name,
>void *opaque, Error **errp)
>  {
> @@ -2011,6 +2032,7 @@ static const TypeInfo riscv_cpu_type_infos[] = {
>  .instance_size = sizeof(RISCVCPU),
>  .instance_align = __alignof__(RISCVCPU),
>  .instance_init = riscv_cpu_init,
> +.instance_post_init = riscv_cpu_post_init,
>  .abstract = true,
>  .class_size = sizeof(RISCVCPUClass),
>  .class_init = riscv_cpu_class_init,
> -- 
> 2.41.0
>

Reviewed-by: Andrew Jones 



Re: [PATCH v2 09/19] target/riscv: make riscv_add_satp_mode_properties() public

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 06:16:36AM -0300, Daniel Henrique Barboza wrote:
> This function is used for both accelerators. Make it public, and call it
> from kvm_riscv_cpu_add_kvm_properties(). This will make it easier to
> split KVM specific code for the KVM accelerator class in the next patch.
> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu.c | 5 ++---
>  target/riscv/cpu.h | 1 +
>  target/riscv/kvm.c | 1 +
>  3 files changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 0dc9b3201d..50be127f36 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -1115,7 +1115,7 @@ static void cpu_riscv_set_satp(Object *obj, Visitor *v, 
> const char *name,
>  satp_map->init |= 1 << satp;
>  }
>  
> -static void riscv_add_satp_mode_properties(Object *obj)
> +void riscv_add_satp_mode_properties(Object *obj)
>  {
>  RISCVCPU *cpu = RISCV_CPU(obj);
>  
> @@ -1589,12 +1589,11 @@ static void riscv_cpu_add_multiext_prop_array(Object 
> *obj,
>  static void riscv_cpu_add_user_properties(Object *obj)
>  {
>  #ifndef CONFIG_USER_ONLY
> -riscv_add_satp_mode_properties(obj);
> -
>  if (kvm_enabled()) {
>  kvm_riscv_cpu_add_kvm_properties(obj);
>  return;
>  }
> +riscv_add_satp_mode_properties(obj);
>  #endif
>  
>  riscv_cpu_add_misa_properties(obj);
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index b9c4bea3f7..950c2301f2 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -726,6 +726,7 @@ extern const RISCVCPUMultiExtConfig 
> riscv_cpu_experimental_exts[];
>  extern Property riscv_cpu_options[];
>  
>  void riscv_cpu_add_misa_properties(Object *cpu_obj);
> +void riscv_add_satp_mode_properties(Object *obj);
>  
>  /* CSR function table */
>  extern riscv_csr_operations csr_ops[CSR_TABLE_SIZE];
> diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
> index 7dac01374f..ef6b2cfffe 100644
> --- a/target/riscv/kvm.c
> +++ b/target/riscv/kvm.c
> @@ -1279,6 +1279,7 @@ void kvm_riscv_cpu_add_kvm_properties(Object *obj)
>  DeviceState *dev = DEVICE(obj);
>  
>  riscv_init_user_properties(obj);
> +riscv_add_satp_mode_properties(obj);
>  riscv_cpu_add_misa_properties(obj);
>  
>  riscv_cpu_add_kvm_unavail_prop_array(obj, riscv_cpu_extensions);
> -- 
> 2.41.0
>

Reviewed-by: Andrew Jones 



Re: [PATCH RESEND 03/15] ppc: spapr: Use SpaprMachineStateNested's ptcr instead of nested_ptcr

2023-09-11 Thread Harsh Prateek Bora




On 9/7/23 06:43, Nicholas Piggin wrote:

On Wed Sep 6, 2023 at 2:33 PM AEST, Harsh Prateek Bora wrote:

Use nested guest state specific struct for storing related info.


So this is the patch I would introduce the SpaprMachineStateNested
struct, with just the .ptrc member. Add other members to it as they
are used in later patches.


Sure, will do.





Signed-off-by: Michael Neuling 
Signed-off-by: Harsh Prateek Bora 
---
  hw/ppc/spapr.c | 4 ++--
  hw/ppc/spapr_nested.c  | 4 ++--
  include/hw/ppc/spapr.h | 3 ++-
  3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 07e91e3800..e44686b04d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1340,8 +1340,8 @@ static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, 
PowerPCCPU *cpu,
  
  assert(lpid != 0);
  
-patb = spapr->nested_ptcr & PTCR_PATB;

-pats = spapr->nested_ptcr & PTCR_PATS;
+patb = spapr->nested.ptcr & PTCR_PATB;
+pats = spapr->nested.ptcr & PTCR_PATS;
  
  /* Check if partition table is properly aligned */

  if (patb & MAKE_64BIT_MASK(0, pats + 12)) {


At this point I wonder if we should first move the nested part of
spapr_get_pate into nested code. It's a bit of a wart to have it
here when most of the other nested cases are abstracted from non
nested code quite well.


Yeh, I also felt similar when modifying the existing nested code here.
Let me do the needful in next version.




diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index 121aa96ddc..a669470f1a 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -25,7 +25,7 @@ static target_ulong h_set_ptbl(PowerPCCPU *cpu,
  return H_PARAMETER;
  }
  
-spapr->nested_ptcr = ptcr; /* Save new partition table */

+spapr->nested.ptcr = ptcr; /* Save new partition table */
  
  return H_SUCCESS;

  }
@@ -157,7 +157,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
  struct kvmppc_pt_regs *regs;
  hwaddr len;
  
-if (spapr->nested_ptcr == 0) {

+if (spapr->nested.ptcr == 0) {
  return H_NOT_AVAILABLE;
  }
  
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h

index 3990fed1d9..c8b42af430 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -12,6 +12,7 @@
  #include "hw/ppc/spapr_xive.h"  /* For SpaprXive */
  #include "hw/ppc/xics.h"/* For ICSState */
  #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/ppc/spapr_nested.h" /* for SpaprMachineStateNested */
  
  struct SpaprVioBus;

  struct SpaprPhbState;
@@ -216,7 +217,7 @@ struct SpaprMachineState {
  uint32_t vsmt;   /* Virtual SMT mode (KVM's "core stride") */
  
  /* Nested HV support (TCG only) */

-uint64_t nested_ptcr;
+struct SpaprMachineStateNested nested;


I think convention says to use the typedef for these?


Sure, will update. Thanks.

regards,
Harsh



Thanks,
Nick

  
  Notifier epow_notifier;

  QTAILQ_HEAD(, SpaprEventLogEntry) pending_events;






linux-user vs system build options and libraries

2023-09-11 Thread Michael Tokarev

Hi!

I noticed that linux-user binaries, when built separately (with
--enable-linux-user --disable-system) or when built as part of
system build (--enable-linux-user --enable-system) differ in size
and link with significantly more libraries which they shouldn't
link with.  libnuma, liburing, libgnutls, - that's what gets linked,
just to name a few.  zlib is also there which most likely shouldn't
be.

Looking at the link lines I see a whole lot of various stuff being
linked, thankfully with --as-needed so most of it gets dropped by
the linker - these are stuff like pixman and many other things.

I already mentioned this before, - to me, whole meson.build options
thing is done a bit backwards: we use have_foo to mean NEED_foo, so
in quite a few places these meanings are wrongly intermixed.  Like,
if I try a linux-user build with --enable-spice, it will tried to
be used even if it makes no sense whatsoever.  We could check spice
and enable have_spice this way (or spice_libs=/spice_defs= etc),
but only use it in those build targets which actually need it.

Also, which might be a separate issue, --enable-plugins seems to
be in effect for linux-user too, which result in libgmodule being
linked with.

Also, there's no way to build/install just the linux-user parts
without, say, installing keymap files, docs are always built,
and so on..

It looks like we've quite barbarian build system still, even after
conversion to meson :)

/mjt



Re: [PATCH v4 00/11] memory-backend-file related improvements and VM templating support

2023-09-11 Thread David Hildenbrand

On 06.09.23 14:04, David Hildenbrand wrote:

If there are no more comments, I'll queue this myself soon.


Queued to

https://github.com/davidhildenbrand/qemu.git mem-next

--
Cheers,

David / dhildenb




Re: [PATCH v3 00/16] virtio-mem: Expose device memory through multiple memslots

2023-09-11 Thread David Hildenbrand
@MST, any comment on the vhost bits (mostly uncontroversial and only in 
the memslot domain)?


I'm planning on queuing this myself (but will wait a bit more), unless 
you want to take it.



On 08.09.23 16:21, David Hildenbrand wrote:

Quoting from patch #14:

 Having large virtio-mem devices that only expose little memory to a VM
 is currently a problem: we map the whole sparse memory region into the
 guest using a single memslot, resulting in one gigantic memslot in KVM.
 KVM allocates metadata for the whole memslot, which can result in quite
 some memory waste.

 Assuming we have a 1 TiB virtio-mem device and only expose little (e.g.,
 1 GiB) memory, we would create a single 1 TiB memslot and KVM has to
 allocate metadata for that 1 TiB memslot: on x86, this implies allocating
 a significant amount of memory for metadata:

 (1) RMAP: 8 bytes per 4 KiB, 8 bytes per 2 MiB, 8 bytes per 1 GiB
 -> For 1 TiB: 2147483648 + 4194304 + 8192 = ~ 2 GiB (0.2 %)

 With the TDP MMU (cat /sys/module/kvm/parameters/tdp_mmu) this gets
 allocated lazily when required for nested VMs
 (2) gfn_track: 2 bytes per 4 KiB
 -> For 1 TiB: 536870912 = ~512 MiB (0.05 %)
 (3) lpage_info: 4 bytes per 2 MiB, 4 bytes per 1 GiB
 -> For 1 TiB: 2097152 + 4096 = ~2 MiB (0.0002 %)
 (4) 2x dirty bitmaps for tracking: 2x 1 bit per 4 KiB page
 -> For 1 TiB: 536870912 = 64 MiB (0.006 %)

 So we primarily care about (1) and (2). The bad thing is, that the
 memory consumption doubles once SMM is enabled, because we create the
 memslot once for !SMM and once for SMM.

 Having a 1 TiB memslot without the TDP MMU consumes around:
 * With SMM: 5 GiB
 * Without SMM: 2.5 GiB
 Having a 1 TiB memslot with the TDP MMU consumes around:
 * With SMM: 1 GiB
 * Without SMM: 512 MiB

 ... and that's really something we want to optimize, to be able to just
 start a VM with small boot memory (e.g., 4 GiB) and a virtio-mem device
 that can grow very large (e.g., 1 TiB).

 Consequently, using multiple memslots and only mapping the memslots we
 really need can significantly reduce memory waste and speed up
 memslot-related operations. Let's expose the sparse RAM memory region using
 multiple memslots, mapping only the memslots we currently need into our
 device memory region container.

The hyper-v balloon driver has similar demands [1].

For virtio-mem, this has to be turned manually on ("multiple-memslots=on"),
due to the interaction with vhost (below).

If we have less than 509 memslots available, we always default to a single
memslot. Otherwise, we automatically decide how many memslots to use
based on a simple heuristic (see patch #12), and try not to use more than
256 memslots across all memory devices: our historical DIMM limit.

As soon as any memory devices automatically decided on using more than
one memslot, vhost devices that support less than 509 memslots (e.g.,
currently most vhost-user devices like with virtiofsd) can no longer be
plugged as a precaution.

Quoting from patch #12:

 Plugging vhost devices with less than 509 memslots available while we
 have memory devices plugged that consume multiple memslots due to
 automatic decisions can be problematic. Most configurations might just fail
 due to "limit < used + reserved", however, it can also happen that these
 memory devices would suddenly consume memslots that would actually be
 required by other memslot consumers (boot, PCI BARs) later. Note that this
 has always been sketchy with vhost devices that support only a small number
 of memslots; but we don't want to make it any worse. So let's keep it simple
 and simply reject plugging such vhost devices in such a configuration.

 Eventually, all vhost devices that want to be fully compatible with such
 memory devices should support a decent number of memslots (>= 509).


The recommendation is to plug such vhost devices before the virtio-mem
decides, or to not set "multiple-memslots=on". As soon as these devices
support a reasonable number of memslots (>= 509), this will start working
automatically.

I run some tests on x86_64, now also including vfio tests. Seems to work
as expected, even when multiple memslots are used.


Patch #1 -- #3 are from [2] that were not picked up yet.

Patch #4 -- #12 add handling of multiple memslots to memory devices

Patch #13 -- #14 add "multiple-memslots=on" support to virtio-mem

Patch #15 -- #16 make sure that virtio-mem memslots can be enabled/disable
  atomically

v2 -> v3:
* "kvm: Return number of free memslots"
  -> Return 0 in stub
* "kvm: Add stub for kvm_get_max_memslots()"
  -> Return 0 in stub
* Adjust other patches to check for kvm_enabled() before calling
   kvm_get_free_memslots()/kvm_get_max_memslots()
* Add RBs

v1 -> v2:
* Include patches from [1]
* A lot of code simplification an

Re: [PATCH v2 10/19] target/riscv: remove kvm-stub.c

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 12:23:19PM +0200, Philippe Mathieu-Daudé wrote:
> On 6/9/23 11:16, Daniel Henrique Barboza wrote:
> > This file is not needed for some time now. All the stubs implemented in
> > it (kvm_riscv_reset_vcpu() and kvm_riscv_set_irq()) are wrapped in 'if
> > kvm_enabled()' blocks that the compiler will rip it out in non-KVM
> > builds.
> > 
> > We'll also add non-KVM stubs for all functions declared in kvm_riscv.h.
> > All stubs are implemented as g_asserted_not_reached(), meaning that we
> > won't support them in non-KVM builds. This is done by other kvm headers
> > like kvm_arm.h and kvm_ppc.h.
> 
> Aren't them also protected by kvm_enabled()? Otherwise shouldn't they?

Yes, I think your earlier suggestion that we always invoke kvm functions
from non-kvm files with a kvm_enabled() guard makes sense.

> 
> > Signed-off-by: Daniel Henrique Barboza 
> > ---
> >   target/riscv/kvm-stub.c  | 30 --
> >   target/riscv/kvm_riscv.h | 31 +++
> >   target/riscv/meson.build |  2 +-
> >   3 files changed, 32 insertions(+), 31 deletions(-)
> >   delete mode 100644 target/riscv/kvm-stub.c
> 
> 
> > diff --git a/target/riscv/kvm_riscv.h b/target/riscv/kvm_riscv.h
> > index f6501e68e2..c9ecd9a967 100644
> > --- a/target/riscv/kvm_riscv.h
> > +++ b/target/riscv/kvm_riscv.h
> > @@ -19,6 +19,7 @@
> >   #ifndef QEMU_KVM_RISCV_H
> >   #define QEMU_KVM_RISCV_H
> > +#ifdef CONFIG_KVM
> >   void kvm_riscv_cpu_add_kvm_properties(Object *obj);
> 
> At a glance kvm_riscv_cpu_add_kvm_properties() is.
> Keep the prototype declared (before #ifdef CONFIG_KVM) is enough for the
> compiler to elide it.

Yes, when building without CONFIG_KVM enabled it's actually better to not
have the stubs, since the compiler will catch an unguarded kvm function
call (assuming the kvm function is defined in a file which is only built
with CONFIG_KVM).

Unfortunately we don't have anything to protect developers from forgetting
the kvm_enabled() guard when building a QEMU which supports both TCG and
KVM. We could try to remember to put 'assert(kvm_enabled())' at the start
of each of these types of functions. It looks like mips does that for a
couple functions.

Thanks,
drew



[PULL 09/13] migration: Move more initializations to migrate_init()

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Initialization of mig_stats, compression_counters and VFIO bytes
transferred is hard-coded in migration code path and snapshot code path.

Make the code cleaner by initializing them in migrate_init().

Suggested-by: Cédric Le Goater 
Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 migration/migration.c | 14 +++---
 migration/savevm.c|  3 ---
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 
92866a8f49d3a7d24028198defb15c5d4d86726e..ce01a3ba6af72aa35063f88355349ec739708d4a
 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1425,6 +1425,13 @@ void migrate_init(MigrationState *s)
 s->iteration_initial_bytes = 0;
 s->threshold_size = 0;
 s->switchover_acked = false;
+/*
+ * set mig_stats compression_counters memory to zero for a
+ * new migration
+ */
+memset(&mig_stats, 0, sizeof(mig_stats));
+memset(&compression_counters, 0, sizeof(compression_counters));
+migration_reset_vfio_bytes_transferred();
 }
 
 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1635,13 +1642,6 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
 }
 
 migrate_init(s);
-/*
- * set mig_stats compression_counters memory to zero for a
- * new migration
- */
-memset(&mig_stats, 0, sizeof(mig_stats));
-memset(&compression_counters, 0, sizeof(compression_counters));
-migration_reset_vfio_bytes_transferred();
 
 return true;
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index 
5bf8b59a7dfc243eb353674bdef8083d441797e3..e14efeced0fb8e4b2dc2b7799f5612799f185170
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1620,9 +1620,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 }
 
 migrate_init(ms);
-memset(&mig_stats, 0, sizeof(mig_stats));
-memset(&compression_counters, 0, sizeof(compression_counters));
-migration_reset_vfio_bytes_transferred();
 ms->to_dst_file = f;
 
 qemu_mutex_unlock_iothread();
-- 
2.41.0




[PULL 12/13] vfio/migration: Block VFIO migration with background snapshot

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Background snapshot allows creating a snapshot of the VM while it's
running and keeping it small by not including dirty RAM pages.

The way it works is by first stopping the VM, saving the non-iterable
devices' state and then starting the VM and saving the RAM while write
protecting it with UFFD. The resulting snapshot represents the VM state
at snapshot start.

VFIO migration is not compatible with background snapshot.
First of all, VFIO device state is not even saved in background snapshot
because only non-iterable device state is saved. But even if it was
saved, after starting the VM, a VFIO device could dirty pages without it
being detected by UFFD write protection. This would corrupt the
snapshot, as the RAM in it would not represent the RAM at snapshot
start.

To prevent this, block VFIO migration with background snapshot.

Signed-off-by: Avihai Horon 
Reviewed-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/migration.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
20994dc1d60b1606728415fec17c19cfd00c4dee..da43dcd2fe0734091960e3eb2ffa26750073be72
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -340,7 +340,8 @@ static int vfio_save_prepare(void *opaque, Error **errp)
 VFIODevice *vbasedev = opaque;
 
 /*
- * Snapshot doesn't use postcopy, so allow snapshot even if postcopy is on.
+ * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
+ * even if they are on.
  */
 if (runstate_check(RUN_STATE_SAVE_VM)) {
 return 0;
@@ -353,6 +354,14 @@ static int vfio_save_prepare(void *opaque, Error **errp)
 return -EOPNOTSUPP;
 }
 
+if (migrate_background_snapshot()) {
+error_setg(
+errp,
+"%s: VFIO migration is not supported with background snapshot",
+vbasedev->name);
+return -EOPNOTSUPP;
+}
+
 return 0;
 }
 
-- 
2.41.0




[PULL 13/13] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Cédric Le Goater
From: Joao Martins 

QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
QEMU includes in the 64-bit range the RAM regions at the lower part
and vfio-pci device RAM regions which are at the top of the address
space. This range contains a large gap and the size can be bigger than
the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

To avoid such large ranges, introduce a new PCI range covering the
vfio-pci device RAM regions, this only if the addresses are above 4GB
to avoid breaking potential SeaBIOS guests.

[ clg: - wrote commit log
   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]

Signed-off-by: Joao Martins 
Signed-off-by: Cédric Le Goater 
Fixes: 5255bbf4ec16 ("vfio/common: Add device dirty page tracking start/stop")
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/common.c | 71 +---
 hw/vfio/trace-events |  2 +-
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
237101d03844273f653d98b6d053a1ae9c05a247..134649226d4333f648ca751291003316a5f3b4a9
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -27,6 +27,7 @@
 
 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
 hwaddr max32;
 hwaddr min64;
 hwaddr max64;
+hwaddr minpci64;
+hwaddr maxpci64;
 } VFIODirtyRanges;
 
 typedef struct VFIODirtyRangesListener {
@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
 MemoryListener listener;
 } VFIODirtyRangesListener;
 
+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
+ VFIOContainer *container)
+{
+VFIOPCIDevice *pcidev;
+VFIODevice *vbasedev;
+VFIOGroup *group;
+Object *owner;
+
+owner = memory_region_owner(section->mr);
+
+QLIST_FOREACH(group, &container->group_list, container_next) {
+QLIST_FOREACH(vbasedev, &group->device_list, next) {
+if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+continue;
+}
+pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+if (OBJECT(pcidev) == owner) {
+return true;
+}
+}
+}
+
+return false;
+}
+
 static void vfio_dirty_tracking_update(MemoryListener *listener,
MemoryRegionSection *section)
 {
@@ -1424,19 +1452,32 @@ static void vfio_dirty_tracking_update(MemoryListener 
*listener,
 }
 
 /*
- * The address space passed to the dirty tracker is reduced to two ranges:
- * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
+ * The address space passed to the dirty tracker is reduced to three 
ranges:
+ * one for 32-bit DMA ranges, one for 64-bit DMA ranges and one for the
+ * PCI 64-bit hole.
+ *
  * The underlying reports of dirty will query a sub-interval of each of
  * these ranges.
  *
- * The purpose of the dual range handling is to handle known cases of big
- * holes in the address space, like the x86 AMD 1T hole. The alternative
- * would be an IOVATree but that has a much bigger runtime overhead and
- * unnecessary complexity.
+ * The purpose of the three range handling is to handle known cases of big
+ * holes in the address space, like the x86 AMD 1T hole, and firmware (like
+ * OVMF) which may relocate the pci-hole64 to the end of the address space.
+ * The latter would otherwise generate large ranges for tracking, stressing
+ * the limits of supported hardware. The pci-hole32 will always be below 4G
+ * (overlapping or not) so it doesn't need special handling and is part of
+ * the 32-bit range.
+ *
+ * The alternative would be an IOVATree but that has a much bigger runtime
+ * overhead and unnecessary complexity.
  */
-min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
-max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
-
+if (vfio_section_is_vfio_pci(section, dirty->container) &&
+iova >= UINT32_MAX) {
+min = &range->minpci64;
+max = &range->maxpci64;
+} else {
+min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+}
 if (*min > iova) {
 *min = iova;
 }
@@ -1461,6 +1502,7 @@ static void vfio_dirty_tracking_init(VFIOContainer 
*container,
 memset(&dirty, 0, sizeof(dirty));
 dirty.ranges.min32 = UINT32_MAX;
 dirty.ranges.min64 = UINT64_MAX;
+dirty.ranges.minpci64 = UINT64_MAX;
 dirty.listener = vfio_dirty_tracking_listener;
 dirty.container = container;
 
@@ -1531,7 +1573,8 @@ 
vfio_device_feature_dma

[PULL 11/13] vfio/migration: Block VFIO migration with postcopy migration

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

VFIO migration is not compatible with postcopy migration. A VFIO device
in the destination can't handle page faults for pages that have not been
sent yet.

Doing such migration will cause the VM to crash in the destination:

qemu-system-x86_64: VFIO_MAP_DMA failed: Bad address
qemu-system-x86_64: vfio_dma_map(0x55a28c7659d0, 0xc, 0xb000, 
0x7f1b11a0) = -14 (Bad address)
qemu: hardware error: vfio: DMA mapping failed, unable to continue

To prevent this, block VFIO migration with postcopy migration.

Reported-by: Yanghang Liu 
Signed-off-by: Avihai Horon 
Tested-by: Yanghang Liu 
Reviewed-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/migration.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
71855468fe985291e2d009b81c6efd29abcbe755..20994dc1d60b1606728415fec17c19cfd00c4dee
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -335,6 +335,27 @@ static bool vfio_precopy_supported(VFIODevice *vbasedev)
 
 /* -- */
 
+static int vfio_save_prepare(void *opaque, Error **errp)
+{
+VFIODevice *vbasedev = opaque;
+
+/*
+ * Snapshot doesn't use postcopy, so allow snapshot even if postcopy is on.
+ */
+if (runstate_check(RUN_STATE_SAVE_VM)) {
+return 0;
+}
+
+if (migrate_postcopy_ram()) {
+error_setg(
+errp, "%s: VFIO migration is not supported with postcopy 
migration",
+vbasedev->name);
+return -EOPNOTSUPP;
+}
+
+return 0;
+}
+
 static int vfio_save_setup(QEMUFile *f, void *opaque)
 {
 VFIODevice *vbasedev = opaque;
@@ -640,6 +661,7 @@ static bool vfio_switchover_ack_needed(void *opaque)
 }
 
 static const SaveVMHandlers savevm_vfio_handlers = {
+.save_prepare = vfio_save_prepare,
 .save_setup = vfio_save_setup,
 .save_cleanup = vfio_save_cleanup,
 .state_pending_estimate = vfio_state_pending_estimate,
-- 
2.41.0




[PULL 01/13] vfio/migration: Move from STOP_COPY to STOP in vfio_save_cleanup()

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Changing the device state from STOP_COPY to STOP can take time as the
device may need to free resources and do other operations as part of the
transition. Currently, this is done in vfio_save_complete_precopy() and
therefore it is counted in the migration downtime.

To avoid this, change the device state from STOP_COPY to STOP in
vfio_save_cleanup(), which is called after migration has completed and
thus is not part of migration downtime.

Signed-off-by: Avihai Horon 
Tested-by: YangHang Liu 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/migration.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
2674f4bc472d16989910300958f697f26fefa442..8acd182a8bf3fcd0eb0368816ff3093242b103f5
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -383,6 +383,19 @@ static void vfio_save_cleanup(void *opaque)
 VFIODevice *vbasedev = opaque;
 VFIOMigration *migration = vbasedev->migration;
 
+/*
+ * Changing device state from STOP_COPY to STOP can take time. Do it here,
+ * after migration has completed, so it won't increase downtime.
+ */
+if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
+/*
+ * If setting the device in STOP state fails, the device should be
+ * reset. To do so, use ERROR state as a recover state.
+ */
+vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
+ VFIO_DEVICE_STATE_ERROR);
+}
+
 g_free(migration->data_buffer);
 migration->data_buffer = NULL;
 migration->precopy_init_size = 0;
@@ -508,12 +521,6 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
*opaque)
 return ret;
 }
 
-/*
- * If setting the device in STOP state fails, the device should be reset.
- * To do so, use ERROR state as a recover state.
- */
-ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
-   VFIO_DEVICE_STATE_ERROR);
 trace_vfio_save_complete_precopy(vbasedev->name, ret);
 
 return ret;
-- 
2.41.0




[PULL 06/13] vfio/migration: Allow migration of multiple P2P supporting devices

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Now that P2P support has been added to VFIO migration, allow migration
of multiple devices if all of them support P2P migration.

Single device migration is allowed regardless of P2P migration support.

Signed-off-by: Avihai Horon 
Signed-off-by: Joao Martins 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/common.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
7c3d636025695641299f306c2afe12fa3e990736..8a8d074e1863ec40b00a424bbe50494ce8391301
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -363,21 +363,31 @@ bool vfio_mig_active(void)
 
 static Error *multiple_devices_migration_blocker;
 
-static unsigned int vfio_migratable_device_num(void)
+/*
+ * Multiple devices migration is allowed only if all devices support P2P
+ * migration. Single device migration is allowed regardless of P2P migration
+ * support.
+ */
+static bool vfio_multiple_devices_migration_is_supported(void)
 {
 VFIOGroup *group;
 VFIODevice *vbasedev;
 unsigned int device_num = 0;
+bool all_support_p2p = true;
 
 QLIST_FOREACH(group, &vfio_group_list, next) {
 QLIST_FOREACH(vbasedev, &group->device_list, next) {
 if (vbasedev->migration) {
 device_num++;
+
+if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
+all_support_p2p = false;
+}
 }
 }
 }
 
-return device_num;
+return all_support_p2p || device_num <= 1;
 }
 
 int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
@@ -385,19 +395,19 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
 int ret;
 
 if (multiple_devices_migration_blocker ||
-vfio_migratable_device_num() <= 1) {
+vfio_multiple_devices_migration_is_supported()) {
 return 0;
 }
 
 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
-error_setg(errp, "Migration is currently not supported with multiple "
- "VFIO devices");
+error_setg(errp, "Multiple VFIO devices migration is supported only if 
"
+ "all of them support P2P migration");
 return -EINVAL;
 }
 
 error_setg(&multiple_devices_migration_blocker,
-   "Migration is currently not supported with multiple "
-   "VFIO devices");
+   "Multiple VFIO devices migration is supported only if all of "
+   "them support P2P migration");
 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
 if (ret < 0) {
 error_free(multiple_devices_migration_blocker);
@@ -410,7 +420,7 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
 void vfio_unblock_multiple_devices_migration(void)
 {
 if (!multiple_devices_migration_blocker ||
-vfio_migratable_device_num() > 1) {
+!vfio_multiple_devices_migration_is_supported()) {
 return;
 }
 
-- 
2.41.0




[PULL 05/13] vfio/migration: Add P2P support for VFIO migration

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

VFIO migration uAPI defines an optional intermediate P2P quiescent
state. While in the P2P quiescent state, P2P DMA transactions cannot be
initiated by the device, but the device can respond to incoming ones.
Additionally, all outstanding P2P transactions are guaranteed to have
been completed by the time the device enters this state.

The purpose of this state is to support migration of multiple devices
that might do P2P transactions between themselves.

Add support for P2P migration by transitioning all the devices to the
P2P quiescent state before stopping or starting the devices. Use the new
VMChangeStateHandler prepare_cb to achieve that behavior.

This will allow migration of multiple VFIO devices if all of them
support P2P migration.

Signed-off-by: Avihai Horon 
Tested-by: YangHang Liu 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 docs/devel/vfio-migration.rst | 93 +--
 hw/vfio/common.c  |  6 ++-
 hw/vfio/migration.c   | 46 +++--
 hw/vfio/trace-events  |  1 +
 4 files changed, 105 insertions(+), 41 deletions(-)

diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index 
b433cb5bb2c8caa703c6063efeb382bebfe7aed6..605fe60e9695a50813b1294bb970ce2c39d2ba07
 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -23,9 +23,21 @@ and recommends that the initial bytes are sent and loaded in 
the destination
 before stopping the source VM. Enabling this migration capability will
 guarantee that and thus, can potentially reduce downtime even further.
 
-Note that currently VFIO migration is supported only for a single device. This
-is due to VFIO migration's lack of P2P support. However, P2P support is planned
-to be added later on.
+To support migration of multiple devices that might do P2P transactions between
+themselves, VFIO migration uAPI defines an intermediate P2P quiescent state.
+While in the P2P quiescent state, P2P DMA transactions cannot be initiated by
+the device, but the device can respond to incoming ones. Additionally, all
+outstanding P2P transactions are guaranteed to have been completed by the time
+the device enters this state.
+
+All the devices that support P2P migration are first transitioned to the P2P
+quiescent state and only then are they stopped or started. This makes migration
+safe P2P-wise, since starting and stopping the devices is not done atomically
+for all the devices together.
+
+Thus, multiple VFIO devices migration is allowed only if all the devices
+support P2P migration. Single VFIO device migration is allowed regardless of
+P2P migration support.
 
 A detailed description of the UAPI for VFIO device migration can be found in
 the comment for the ``vfio_device_mig_state`` structure in the header file
@@ -132,54 +144,63 @@ will be blocked.
 Flow of state changes during Live migration
 ===
 
-Below is the flow of state change during live migration.
+Below is the state change flow during live migration for a VFIO device that
+supports both precopy and P2P migration. The flow for devices that don't
+support it is similar, except that the relevant states for precopy and P2P are
+skipped.
 The values in the parentheses represent the VM state, the migration state, and
 the VFIO device state, respectively.
-The text in the square brackets represents the flow if the VFIO device supports
-pre-copy.
 
 Live migration save path
 
 
 ::
 
-QEMU normal running state
-(RUNNING, _NONE, _RUNNING)
-  |
+   QEMU normal running state
+   (RUNNING, _NONE, _RUNNING)
+  |
  migrate_init spawns migration_thread
-Migration thread then calls each device's .save_setup()
-  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
-  |
-  (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
-  If device is active, get pending_bytes by 
.state_pending_{estimate,exact}()
-  If total pending_bytes >= threshold_size, call .save_live_iterate()
-  [Data of VFIO device for pre-copy phase is copied]
-Iterate till total pending bytes converge and are less than threshold
-  |
-  On migration completion, vCPU stops and calls .save_live_complete_precopy for
-  each active device. The VFIO device is then transitioned into _STOP_COPY 
state
-  (FINISH_MIGRATE, _DEVICE, _STOP_COPY)
-  |
- For the VFIO device, iterate in .save_live_complete_precopy until
- pending data is 0
-   (FINISH_MIGRATE, _DEVICE, _STOP)
-  |
- (FINISH_MIGRATE, _COMPLETED, _STOP

[PULL 07/13] migration: Add migration prefix to functions in target.c

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

The functions in target.c are not static, yet they don't have a proper
migration prefix. Add such a prefix.

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 migration/migration.h | 4 ++--
 migration/migration.c | 6 +++---
 migration/savevm.c| 2 +-
 migration/target.c| 8 
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/migration/migration.h b/migration/migration.h
index 
6eea18db367585e6f08aada9d14fc37e02e50977..c5695de214965dfddd854779e4da8d09f04d35ba
 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -512,8 +512,8 @@ void migration_consume_urgent_request(void);
 bool migration_rate_limit(void);
 void migration_cancel(const Error *error);
 
-void populate_vfio_info(MigrationInfo *info);
-void reset_vfio_bytes_transferred(void);
+void migration_populate_vfio_info(MigrationInfo *info);
+void migration_reset_vfio_bytes_transferred(void);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 
5528acb65e0f7b84d528ee8e8c477975cb8a7dad..92866a8f49d3a7d24028198defb15c5d4d86726e
 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1039,7 +1039,7 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 populate_time_info(info, s);
 populate_ram_info(info, s);
 populate_disk_info(info);
-populate_vfio_info(info);
+migration_populate_vfio_info(info);
 break;
 case MIGRATION_STATUS_COLO:
 info->has_status = true;
@@ -1048,7 +1048,7 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 case MIGRATION_STATUS_COMPLETED:
 populate_time_info(info, s);
 populate_ram_info(info, s);
-populate_vfio_info(info);
+migration_populate_vfio_info(info);
 break;
 case MIGRATION_STATUS_FAILED:
 info->has_status = true;
@@ -1641,7 +1641,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
  */
 memset(&mig_stats, 0, sizeof(mig_stats));
 memset(&compression_counters, 0, sizeof(compression_counters));
-reset_vfio_bytes_transferred();
+migration_reset_vfio_bytes_transferred();
 
 return true;
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index 
a2cb8855e29547d2b66e6bddfc3363466c7c3bab..5bf8b59a7dfc243eb353674bdef8083d441797e3
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1622,7 +1622,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 migrate_init(ms);
 memset(&mig_stats, 0, sizeof(mig_stats));
 memset(&compression_counters, 0, sizeof(compression_counters));
-reset_vfio_bytes_transferred();
+migration_reset_vfio_bytes_transferred();
 ms->to_dst_file = f;
 
 qemu_mutex_unlock_iothread();
diff --git a/migration/target.c b/migration/target.c
index 
f39c9a8d88775648816d46113843ef58198c86fd..a6ffa9a5ce312d1e64157b650827aa726eb4d364
 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -15,7 +15,7 @@
 #endif
 
 #ifdef CONFIG_VFIO
-void populate_vfio_info(MigrationInfo *info)
+void migration_populate_vfio_info(MigrationInfo *info)
 {
 if (vfio_mig_active()) {
 info->vfio = g_malloc0(sizeof(*info->vfio));
@@ -23,16 +23,16 @@ void populate_vfio_info(MigrationInfo *info)
 }
 }
 
-void reset_vfio_bytes_transferred(void)
+void migration_reset_vfio_bytes_transferred(void)
 {
 vfio_reset_bytes_transferred();
 }
 #else
-void populate_vfio_info(MigrationInfo *info)
+void migration_populate_vfio_info(MigrationInfo *info)
 {
 }
 
-void reset_vfio_bytes_transferred(void)
+void migration_reset_vfio_bytes_transferred(void)
 {
 }
 #endif
-- 
2.41.0




[PULL 10/13] migration: Add .save_prepare() handler to struct SaveVMHandlers

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Add a new .save_prepare() handler to struct SaveVMHandlers. This handler
is called early, even before migration starts, and can be used by
devices to perform early checks.

Refactor migrate_init() to be able to return errors and call
.save_prepare() from there.

Suggested-by: Peter Xu 
Signed-off-by: Avihai Horon 
Reviewed-by: Peter Xu 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 include/migration/register.h |  5 +
 migration/migration.h|  2 +-
 migration/savevm.h   |  1 +
 migration/migration.c| 15 +--
 migration/savevm.c   | 29 -
 5 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index 
90914f32f50cab9fc6b7797fbf3fb509a0860291..2b12c6adeca7ce5c7282e8df3c2def0068d44c18
 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -20,6 +20,11 @@ typedef struct SaveVMHandlers {
 /* This runs inside the iothread lock.  */
 SaveStateHandler *save_state;
 
+/*
+ * save_prepare is called early, even before migration starts, and can be
+ * used to perform early checks.
+ */
+int (*save_prepare)(void *opaque, Error **errp);
 void (*save_cleanup)(void *opaque);
 int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
 int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
diff --git a/migration/migration.h b/migration/migration.h
index 
c5695de214965dfddd854779e4da8d09f04d35ba..c390500604b6db50eb1bb1dbb8ee56f5e1bd8610
 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -472,7 +472,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in);
 bool migration_is_setup_or_active(int state);
 bool migration_is_running(int state);
 
-void migrate_init(MigrationState *s);
+int migrate_init(MigrationState *s, Error **errp);
 bool migration_is_blocked(Error **errp);
 /* True if outgoing migration has entered postcopy phase */
 bool migration_in_postcopy(void);
diff --git a/migration/savevm.h b/migration/savevm.h
index 
e894bbc143313a34c5c573f0839d5c13567cc521..74669733dd63a080b765866c703234a5c4939223
 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -31,6 +31,7 @@
 
 bool qemu_savevm_state_blocked(Error **errp);
 void qemu_savevm_non_migratable_list(strList **reasons);
+int qemu_savevm_state_prepare(Error **errp);
 void qemu_savevm_state_setup(QEMUFile *f);
 bool qemu_savevm_state_guest_unplug_pending(void);
 int qemu_savevm_state_resume_prepare(MigrationState *s);
diff --git a/migration/migration.c b/migration/migration.c
index 
ce01a3ba6af72aa35063f88355349ec739708d4a..d61e5727429aef92b129f04208ee82eff414bd97
 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1392,8 +1392,15 @@ bool migration_is_active(MigrationState *s)
 s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
 }
 
-void migrate_init(MigrationState *s)
+int migrate_init(MigrationState *s, Error **errp)
 {
+int ret;
+
+ret = qemu_savevm_state_prepare(errp);
+if (ret) {
+return ret;
+}
+
 /*
  * Reinitialise all migration state, except
  * parameters/capabilities that the user set, and
@@ -1432,6 +1439,8 @@ void migrate_init(MigrationState *s)
 memset(&mig_stats, 0, sizeof(mig_stats));
 memset(&compression_counters, 0, sizeof(compression_counters));
 migration_reset_vfio_bytes_transferred();
+
+return 0;
 }
 
 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1641,7 +1650,9 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
 migrate_set_block_incremental(true);
 }
 
-migrate_init(s);
+if (migrate_init(s, errp)) {
+return false;
+}
 
 return true;
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index 
e14efeced0fb8e4b2dc2b7799f5612799f185170..bb3e99194c608d4a08fe46182cfa67d94635d3c9
 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1233,6 +1233,30 @@ bool qemu_savevm_state_guest_unplug_pending(void)
 return false;
 }
 
+int qemu_savevm_state_prepare(Error **errp)
+{
+SaveStateEntry *se;
+int ret;
+
+QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+if (!se->ops || !se->ops->save_prepare) {
+continue;
+}
+if (se->ops->is_active) {
+if (!se->ops->is_active(se->opaque)) {
+continue;
+}
+}
+
+ret = se->ops->save_prepare(se->opaque, errp);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
 void qemu_savevm_state_setup(QEMUFile *f)
 {
 MigrationState *ms = migrate_get_current();
@@ -1619,7 +1643,10 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 return -EINVAL;
 }
 
-migrate_init(ms);
+ret = migrate_init(ms, errp);
+if (ret) {
+return ret;
+}
 ms->to_dst_file = f;
 
 qemu_mutex_unlock_iothread();

[PULL 03/13] qdev: Add qdev_add_vm_change_state_handler_full()

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Add qdev_add_vm_change_state_handler_full() variant that allows setting
a prepare callback in addition to the main callback.

This will facilitate adding P2P support for VFIO migration in the
following patches.

Signed-off-by: Avihai Horon 
Signed-off-by: Joao Martins 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Signed-off-by: Cédric Le Goater 
---
 include/sysemu/runstate.h |  3 +++
 hw/core/vm-change-state-handler.c | 14 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
index 
764a0fc6a4554857bcff339c668b48193b40c3a4..08afb97695bd6a7c7f1fb852f475be710eb4ac35
 100644
--- a/include/sysemu/runstate.h
+++ b/include/sysemu/runstate.h
@@ -23,6 +23,9 @@ 
qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
 VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
  VMChangeStateHandler *cb,
  void *opaque);
+VMChangeStateEntry *qdev_add_vm_change_state_handler_full(
+DeviceState *dev, VMChangeStateHandler *cb,
+VMChangeStateHandler *prepare_cb, void *opaque);
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
 /**
  * vm_state_notify: Notify the state of the VM
diff --git a/hw/core/vm-change-state-handler.c 
b/hw/core/vm-change-state-handler.c
index 
1f3630986d54e1fc70ac7d62f646296af3f7f3cf..8e2639224e7572b77be0f3c44e12d7321f18b535
 100644
--- a/hw/core/vm-change-state-handler.c
+++ b/hw/core/vm-change-state-handler.c
@@ -55,8 +55,20 @@ static int qdev_get_dev_tree_depth(DeviceState *dev)
 VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
  VMChangeStateHandler *cb,
  void *opaque)
+{
+return qdev_add_vm_change_state_handler_full(dev, cb, NULL, opaque);
+}
+
+/*
+ * Exactly like qdev_add_vm_change_state_handler() but passes a prepare_cb
+ * argument too.
+ */
+VMChangeStateEntry *qdev_add_vm_change_state_handler_full(
+DeviceState *dev, VMChangeStateHandler *cb,
+VMChangeStateHandler *prepare_cb, void *opaque)
 {
 int depth = qdev_get_dev_tree_depth(dev);
 
-return qemu_add_vm_change_state_handler_prio(cb, opaque, depth);
+return qemu_add_vm_change_state_handler_prio_full(cb, prepare_cb, opaque,
+  depth);
 }
-- 
2.41.0




[PULL 02/13] sysemu: Add prepare callback to struct VMChangeStateEntry

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

Add prepare callback to struct VMChangeStateEntry.

The prepare callback is optional and can be set by the new function
qemu_add_vm_change_state_handler_prio_full() that allows setting this
callback in addition to the main callback.

The prepare callbacks and main callbacks are called in two separate
phases: First all prepare callbacks are called and only then all main
callbacks are called.

The purpose of the new prepare callback is to allow all devices to run a
preliminary task before calling the devices' main callbacks.

This will facilitate adding P2P support for VFIO migration where all
VFIO devices need to be put in an intermediate P2P quiescent state
before being stopped or started by the main callback.

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Signed-off-by: Cédric Le Goater 
---
 include/sysemu/runstate.h |  4 
 softmmu/runstate.c| 40 +++
 2 files changed, 44 insertions(+)

diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
index 
7beb29c2e2ac564bb002d208b125ab6269e097de..764a0fc6a4554857bcff339c668b48193b40c3a4
 100644
--- a/include/sysemu/runstate.h
+++ b/include/sysemu/runstate.h
@@ -16,6 +16,10 @@ VMChangeStateEntry 
*qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
  void *opaque);
 VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
 VMChangeStateHandler *cb, void *opaque, int priority);
+VMChangeStateEntry *
+qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
+   VMChangeStateHandler *prepare_cb,
+   void *opaque, int priority);
 VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
  VMChangeStateHandler *cb,
  void *opaque);
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 
f3bd8628181303792629fa4079f09abf63fd9787..1652ed0439b4d39e5719d5b7caa002aa297789b6
 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -271,6 +271,7 @@ void qemu_system_vmstop_request(RunState state)
 }
 struct VMChangeStateEntry {
 VMChangeStateHandler *cb;
+VMChangeStateHandler *prepare_cb;
 void *opaque;
 QTAILQ_ENTRY(VMChangeStateEntry) entries;
 int priority;
@@ -293,12 +294,39 @@ static QTAILQ_HEAD(, VMChangeStateEntry) 
vm_change_state_head =
  */
 VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
 VMChangeStateHandler *cb, void *opaque, int priority)
+{
+return qemu_add_vm_change_state_handler_prio_full(cb, NULL, opaque,
+  priority);
+}
+
+/**
+ * qemu_add_vm_change_state_handler_prio_full:
+ * @cb: the main callback to invoke
+ * @prepare_cb: a callback to invoke before the main callback
+ * @opaque: user data passed to the callbacks
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ *true when the vm stops
+ *
+ * Register a main callback function and an optional prepare callback function
+ * that are invoked when the vm starts or stops running. The main callback and
+ * the prepare callback are called in two separate phases: First all prepare
+ * callbacks are called and only then all main callbacks are called. As its
+ * name suggests, the prepare callback can be used to do some preparatory work
+ * before invoking the main callback.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
+VMChangeStateEntry *
+qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
+   VMChangeStateHandler *prepare_cb,
+   void *opaque, int priority)
 {
 VMChangeStateEntry *e;
 VMChangeStateEntry *other;
 
 e = g_malloc0(sizeof(*e));
 e->cb = cb;
+e->prepare_cb = prepare_cb;
 e->opaque = opaque;
 e->priority = priority;
 
@@ -333,10 +361,22 @@ void vm_state_notify(bool running, RunState state)
 trace_vm_state_notify(running, state, RunState_str(state));
 
 if (running) {
+QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
+if (e->prepare_cb) {
+e->prepare_cb(e->opaque, running, state);
+}
+}
+
 QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
 e->cb(e->opaque, running, state);
 }
 } else {
+QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
+if (e->prepare_cb) {
+e->prepare_cb(e->opaque, running, state);
+}
+}
+
 QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
 e->cb(e->opaque, running, state);
 }
-- 
2.41.0




[PULL 00/13] vfio queue

2023-09-11 Thread Cédric Le Goater
The following changes since commit c5ea91da443b458352c1b629b490ee6631775cb4:

  Merge tag 'pull-trivial-patches' of https://gitlab.com/mjt0k/qemu into 
staging (2023-09-08 10:06:25 -0400)

are available in the Git repository at:

  https://github.com/legoater/qemu/ tags/pull-vfio-20230911

for you to fetch changes up to a31fe5daeaa230556145bfc04af1bd4e68f377fa:

  vfio/common: Separate vfio-pci ranges (2023-09-11 08:34:06 +0200)


vfio queue:

* Small downtime optimisation for VFIO migration
* P2P support for VFIO migration
* Introduction of a save_prepare() handler to fail VFIO migration
* Fix on DMA logging ranges calculation for OVMF enabling dynamic window


Avihai Horon (11):
  vfio/migration: Move from STOP_COPY to STOP in vfio_save_cleanup()
  sysemu: Add prepare callback to struct VMChangeStateEntry
  qdev: Add qdev_add_vm_change_state_handler_full()
  vfio/migration: Add P2P support for VFIO migration
  vfio/migration: Allow migration of multiple P2P supporting devices
  migration: Add migration prefix to functions in target.c
  vfio/migration: Fail adding device with enable-migration=on and existing 
blocker
  migration: Move more initializations to migrate_init()
  migration: Add .save_prepare() handler to struct SaveVMHandlers
  vfio/migration: Block VFIO migration with postcopy migration
  vfio/migration: Block VFIO migration with background snapshot

Joao Martins (2):
  vfio/migration: Refactor PRE_COPY and RUNNING state checks
  vfio/common: Separate vfio-pci ranges

 docs/devel/vfio-migration.rst |  93 +---
 include/hw/vfio/vfio-common.h |   2 +
 include/migration/register.h  |   5 ++
 include/sysemu/runstate.h |   7 +++
 migration/migration.h |   6 +-
 migration/savevm.h|   1 +
 hw/core/vm-change-state-handler.c |  14 -
 hw/vfio/common.c  | 126 ++
 hw/vfio/migration.c   | 106 +++-
 migration/migration.c |  33 ++
 migration/savevm.c|  32 --
 migration/target.c|   8 +--
 softmmu/runstate.c|  40 
 hw/vfio/trace-events  |   3 +-
 14 files changed, 377 insertions(+), 99 deletions(-)



[PULL 08/13] vfio/migration: Fail adding device with enable-migration=on and existing blocker

2023-09-11 Thread Cédric Le Goater
From: Avihai Horon 

If a device with enable-migration=on is added and it causes a migration
blocker, adding the device should fail with a proper error.

This is not the case with the multiple device migration blocker when the
blocker already exists. If the blocker already exists and a device with
enable-migration=on is added which causes a migration blocker, adding
the device will succeed.

Fix it by failing to add the device in such a case.

Fixes: 8bbcb64a71d8 ("vfio/migration: Make VFIO migration non-experimental")
Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/common.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
8a8d074e1863ec40b00a424bbe50494ce8391301..237101d03844273f653d98b6d053a1ae9c05a247
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -394,8 +394,7 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
 {
 int ret;
 
-if (multiple_devices_migration_blocker ||
-vfio_multiple_devices_migration_is_supported()) {
+if (vfio_multiple_devices_migration_is_supported()) {
 return 0;
 }
 
@@ -405,6 +404,10 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
 return -EINVAL;
 }
 
+if (multiple_devices_migration_blocker) {
+return 0;
+}
+
 error_setg(&multiple_devices_migration_blocker,
"Multiple VFIO devices migration is supported only if all of "
"them support P2P migration");
-- 
2.41.0




[PULL 04/13] vfio/migration: Refactor PRE_COPY and RUNNING state checks

2023-09-11 Thread Cédric Le Goater
From: Joao Martins 

Move the PRE_COPY and RUNNING state checks to helper functions.

This is in preparation for adding P2P VFIO migration support, where
these helpers will also test for PRE_COPY_P2P and RUNNING_P2P states.

Signed-off-by: Joao Martins 
Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  2 ++
 hw/vfio/common.c  | 22 ++
 hw/vfio/migration.c   | 10 --
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 
da43d273524ec441c13194b363008ab27a72839d..e9b895459534d7873445f865ef0e5f8f5c53882a
 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -230,6 +230,8 @@ void vfio_unblock_multiple_devices_migration(void);
 bool vfio_viommu_preset(VFIODevice *vbasedev);
 int64_t vfio_mig_bytes_transferred(void);
 void vfio_reset_bytes_transferred(void);
+bool vfio_device_state_is_running(VFIODevice *vbasedev);
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev);
 
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 
9aac21abb76ef7d1abb54428e9a173a33ce16073..16cf79a76c845d8eb19498e8c6bf1f3b2b8d2fd8
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -437,6 +437,20 @@ static void vfio_set_migration_error(int err)
 }
 }
 
+bool vfio_device_state_is_running(VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+
+return migration->device_state == VFIO_DEVICE_STATE_RUNNING;
+}
+
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+
+return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+}
+
 static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
 {
 VFIOGroup *group;
@@ -457,8 +471,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer 
*container)
 }
 
 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
-(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
- migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
+(vfio_device_state_is_running(vbasedev) ||
+ vfio_device_state_is_precopy(vbasedev))) {
 return false;
 }
 }
@@ -503,8 +517,8 @@ static bool 
vfio_devices_all_running_and_mig_active(VFIOContainer *container)
 return false;
 }
 
-if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
-migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+if (vfio_device_state_is_running(vbasedev) ||
+vfio_device_state_is_precopy(vbasedev)) {
 continue;
 } else {
 return false;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 
8acd182a8bf3fcd0eb0368816ff3093242b103f5..48f9c23cbe3ac720ef252d699636e4a572bec762
 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -411,7 +411,7 @@ static void vfio_state_pending_estimate(void *opaque, 
uint64_t *must_precopy,
 VFIODevice *vbasedev = opaque;
 VFIOMigration *migration = vbasedev->migration;
 
-if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
+if (!vfio_device_state_is_precopy(vbasedev)) {
 return;
 }
 
@@ -444,7 +444,7 @@ static void vfio_state_pending_exact(void *opaque, uint64_t 
*must_precopy,
 vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
 *must_precopy += stop_copy_size;
 
-if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+if (vfio_device_state_is_precopy(vbasedev)) {
 vfio_query_precopy_size(migration);
 
 *must_precopy +=
@@ -459,9 +459,8 @@ static void vfio_state_pending_exact(void *opaque, uint64_t 
*must_precopy,
 static bool vfio_is_active_iterate(void *opaque)
 {
 VFIODevice *vbasedev = opaque;
-VFIOMigration *migration = vbasedev->migration;
 
-return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+return vfio_device_state_is_precopy(vbasedev);
 }
 
 static int vfio_save_iterate(QEMUFile *f, void *opaque)
@@ -656,7 +655,6 @@ static const SaveVMHandlers savevm_vfio_handlers = {
 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
 {
 VFIODevice *vbasedev = opaque;
-VFIOMigration *migration = vbasedev->migration;
 enum vfio_device_mig_state new_state;
 int ret;
 
@@ -664,7 +662,7 @@ static void vfio_vmstate_change(void *opaque, bool running, 
RunState state)
 new_state = VFIO_DEVICE_STATE_RUNNING;
 } else {
 new_state =
-(migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
+(vfio_device_state_is_precopy(vbasedev) &&
  (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSE

Re: [PATCH v2 11/19] target/riscv: introduce KVM AccelCPUClass

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 06:16:38AM -0300, Daniel Henrique Barboza wrote:
> Add a KVM accelerator class like we did with TCG. The difference is
> that, at least for now, we won't be using a realize() implementation for
> this accelerator.
> 
> We'll start by assiging kvm_riscv_cpu_add_kvm_properties(), renamed to
> kvm_cpu_instance_init(), as a 'cpu_instance_init' implementation. Change
> riscv_cpu_post_init() to invoke accel_cpu_instance_init(), which will go
> through the 'cpu_instance_init' impl of the current acceleration (if
> available) and execute it. The end result is that the KVM initial setup,
> i.e. starting registers and adding its specific properties, will be done
> via this hook.
> 
> Add a 'tcg_enabled()' condition in riscv_cpu_post_init() to avoid
> calling riscv_cpu_add_user_properties() when running KVM. We'll remove
> this condition when the TCG accel class get its own 'cpu_instance_init'
> implementation.
> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu.c   |  8 +++-
>  target/riscv/kvm.c   | 26 --
>  target/riscv/kvm_riscv.h |  6 --
>  3 files changed, 27 insertions(+), 13 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 50be127f36..c8a19be1af 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -1219,7 +1219,9 @@ static bool riscv_cpu_has_user_properties(Object 
> *cpu_obj)
>  
>  static void riscv_cpu_post_init(Object *obj)
>  {
> -if (riscv_cpu_has_user_properties(obj)) {
> +accel_cpu_instance_init(CPU(obj));
> +
> +if (tcg_enabled() && riscv_cpu_has_user_properties(obj)) {
>  riscv_cpu_add_user_properties(obj);
>  }
>  
> @@ -1589,10 +1591,6 @@ static void riscv_cpu_add_multiext_prop_array(Object 
> *obj,
>  static void riscv_cpu_add_user_properties(Object *obj)
>  {
>  #ifndef CONFIG_USER_ONLY
> -if (kvm_enabled()) {
> -kvm_riscv_cpu_add_kvm_properties(obj);
> -return;
> -}
>  riscv_add_satp_mode_properties(obj);
>  #endif
>  
> diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
> index ef6b2cfffe..492b97d19b 100644
> --- a/target/riscv/kvm.c
> +++ b/target/riscv/kvm.c
> @@ -31,6 +31,7 @@
>  #include "sysemu/kvm_int.h"
>  #include "cpu.h"
>  #include "trace.h"
> +#include "hw/core/accel-cpu.h"
>  #include "hw/pci/pci.h"
>  #include "exec/memattrs.h"
>  #include "exec/address-spaces.h"
> @@ -1274,8 +1275,9 @@ void kvm_riscv_aia_create(MachineState *machine, 
> uint64_t group_shift,
>  kvm_msi_via_irqfd_allowed = kvm_irqfds_enabled();
>  }
>  
> -void kvm_riscv_cpu_add_kvm_properties(Object *obj)
> +static void kvm_cpu_instance_init(CPUState *cs)
>  {
> +Object *obj = OBJECT(RISCV_CPU(cs));
>  DeviceState *dev = DEVICE(obj);
>  
>  riscv_init_user_properties(obj);
> @@ -1287,7 +1289,7 @@ void kvm_riscv_cpu_add_kvm_properties(Object *obj)
>  riscv_cpu_add_kvm_unavail_prop_array(obj, riscv_cpu_experimental_exts);
>  
>  for (Property *prop = riscv_cpu_options; prop && prop->name; prop++) {
> -/* Check if KVM created the property already */
> +/* Check if we have a specific KVM handler for the option */
>  if (object_property_find(obj, prop->name)) {
>  continue;
>  }
> @@ -1295,6 +1297,26 @@ void kvm_riscv_cpu_add_kvm_properties(Object *obj)
>  }
>  }
>  
> +static void kvm_cpu_accel_class_init(ObjectClass *oc, void *data)
> +{
> +AccelCPUClass *acc = ACCEL_CPU_CLASS(oc);
> +
> +acc->cpu_instance_init = kvm_cpu_instance_init;
> +}
> +
> +static const TypeInfo kvm_cpu_accel_type_info = {
> +.name = ACCEL_CPU_NAME("kvm"),
> +
> +.parent = TYPE_ACCEL_CPU,
> +.class_init = kvm_cpu_accel_class_init,
> +.abstract = true,
> +};
> +static void kvm_cpu_accel_register_types(void)
> +{
> +type_register_static(&kvm_cpu_accel_type_info);
> +}
> +type_init(kvm_cpu_accel_register_types);
> +
>  static void riscv_host_cpu_init(Object *obj)
>  {
>  CPURISCVState *env = &RISCV_CPU(obj)->env;
> diff --git a/target/riscv/kvm_riscv.h b/target/riscv/kvm_riscv.h
> index c9ecd9a967..8fe6e3e6fb 100644
> --- a/target/riscv/kvm_riscv.h
> +++ b/target/riscv/kvm_riscv.h
> @@ -20,7 +20,6 @@
>  #define QEMU_KVM_RISCV_H
>  
>  #ifdef CONFIG_KVM
> -void kvm_riscv_cpu_add_kvm_properties(Object *obj);
>  void kvm_riscv_reset_vcpu(RISCVCPU *cpu);
>  void kvm_riscv_set_irq(RISCVCPU *cpu, int irq, int level);
>  void kvm_riscv_aia_create(MachineState *machine, uint64_t group_shift,
> @@ -29,11 +28,6 @@ void kvm_riscv_aia_create(MachineState *machine, uint64_t 
> group_shift,
>uint64_t guest_num);
>  void riscv_kvm_aplic_request(void *opaque, int irq, int level);
>  #else
> -static inline void kvm_riscv_cpu_add_kvm_properties(Object *obj)
> -{
> -g_assert_not_reached();
> -}
> -
>  static inline void kvm_riscv_reset_vcpu(RISCVCPU *cpu)
>  {
>  g_assert_not_reached();
> -- 
> 2.41.0
>

Reviewed-by: Andrew Jones 



Re: [PATCH v2 14/19] target/riscv/cpu.c: export set_misa()

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 06:16:41AM -0300, Daniel Henrique Barboza wrote:
> We'll move riscv_init_max_cpu_extensions() to tcg-cpu.c in the next
> patch and set_misa() needs to be usable from there.
> 
> Rename it to riscv_cpu_set_misa() and make it public.
> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu.c | 34 ++
>  target/riscv/cpu.h |  1 +
>  2 files changed, 19 insertions(+), 16 deletions(-)
>

Reviewed-by: Andrew Jones 



Re: [PATCH v2 16/19] target/riscv/cpu.c: make misa_ext_cfgs[] 'const'

2023-09-11 Thread Andrew Jones
On Wed, Sep 06, 2023 at 06:16:43AM -0300, Daniel Henrique Barboza wrote:
> The array isn't marked as 'const' because we're initializing their
> elements in riscv_cpu_add_misa_properties(), 'name' and 'description'
> fields.
> 
> In a closer look we can see that we're not using these 2 fields after
> creating the MISA properties. And we can create the properties by using
> riscv_get_misa_ext_name() and riscv_get_misa_ext_description()
> directly.
> 
> Remove the 'name' and 'description' fields from RISCVCPUMisaExtConfig
> and make misa_ext_cfgs[] a const array.
> 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu.c | 21 -
>  1 file changed, 8 insertions(+), 13 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 8616c9e2f5..4875feded7 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -1212,8 +1212,6 @@ static void riscv_cpu_init(Object *obj)
>  }
>  
>  typedef struct RISCVCPUMisaExtConfig {
> -const char *name;
> -const char *description;
>  target_ulong misa_bit;
>  bool enabled;
>  } RISCVCPUMisaExtConfig;
> @@ -1317,7 +1315,7 @@ const char *riscv_get_misa_ext_description(uint32_t bit)
>  #define MISA_CFG(_bit, _enabled) \
>  {.misa_bit = _bit, .enabled = _enabled}
>  
> -static RISCVCPUMisaExtConfig misa_ext_cfgs[] = {
> +static const RISCVCPUMisaExtConfig misa_ext_cfgs[] = {
>  MISA_CFG(RVA, true),
>  MISA_CFG(RVC, true),
>  MISA_CFG(RVD, true),
> @@ -1344,25 +1342,22 @@ void riscv_cpu_add_misa_properties(Object *cpu_obj)
>  int i;
>  
>  for (i = 0; i < ARRAY_SIZE(misa_ext_cfgs); i++) {
> -RISCVCPUMisaExtConfig *misa_cfg = &misa_ext_cfgs[i];
> +const RISCVCPUMisaExtConfig *misa_cfg = &misa_ext_cfgs[i];
>  int bit = misa_cfg->misa_bit;
> -
> -misa_cfg->name = riscv_get_misa_ext_name(bit);
> -misa_cfg->description = riscv_get_misa_ext_description(bit);
> +const char *name = riscv_get_misa_ext_name(bit);
> +const char *desc = riscv_get_misa_ext_description(bit);
>  
>  /* Check if KVM already created the property */
> -if (object_property_find(cpu_obj, misa_cfg->name)) {
> +if (object_property_find(cpu_obj, name)) {
>  continue;
>  }
>  
> -object_property_add(cpu_obj, misa_cfg->name, "bool",
> +object_property_add(cpu_obj, name, "bool",
>  cpu_get_misa_ext_cfg,
>  cpu_set_misa_ext_cfg,
>  NULL, (void *)misa_cfg);
> -object_property_set_description(cpu_obj, misa_cfg->name,
> -misa_cfg->description);
> -object_property_set_bool(cpu_obj, misa_cfg->name,
> - misa_cfg->enabled, NULL);
> +object_property_set_description(cpu_obj, name, desc);
> +object_property_set_bool(cpu_obj, name, misa_cfg->enabled, NULL);
>  }
>  }
>  
> -- 
> 2.41.0
>

Reviewed-by: Andrew Jones 



[Stable-7.2.6 01/51] gitlab-ci: check-dco.py: switch from master to stable-7.2 branch

2023-09-11 Thread Michael Tokarev
There's one commit, tagged v7.2.2, without Signed-off-by line.
Due to this, check-dco test always fail on 7.2.  Since this is
a stable branch with almost all commits coming from master
already with S-o-b (except of the version bumps and very rare
stable-specific commits), and v7.2.2 is already cast in stone,
let's base the check on stable-7.2 branch (with its last version)
instead of master branch.  This way, staging-7.2 will be checked
against stable-7.2, but stable-7.2 itself will not be checked
anymore, - so we can catch errors during stable preparations.

Note: this is a change specific to stable-7.2 branch/series, it
is not supposed to be in master.

Signed-off-by: Michael Tokarev 
Reviewed-by: Thomas Huth 

diff --git a/.gitlab-ci.d/check-dco.py b/.gitlab-ci.d/check-dco.py
index 632c8bcce8..b929571eed 100755
--- a/.gitlab-ci.d/check-dco.py
+++ b/.gitlab-ci.d/check-dco.py
@@ -20,12 +20,12 @@
 repourl = "https://gitlab.com/%s/%s.git"; % (namespace, reponame)
 
 subprocess.check_call(["git", "remote", "add", "check-dco", repourl])
-subprocess.check_call(["git", "fetch", "check-dco", "master"],
+subprocess.check_call(["git", "fetch", "check-dco", "stable-7.2"],
   stdout=subprocess.DEVNULL,
   stderr=subprocess.DEVNULL)
 
 ancestor = subprocess.check_output(["git", "merge-base",
-"check-dco/master", "HEAD"],
+"check-dco/stable-7.2", "HEAD"],
universal_newlines=True)
 
 ancestor = ancestor.strip()
@@ -85,7 +85,7 @@
 
 To bulk update all commits on current branch "git rebase" can be used:
 
-  git rebase -i master -x 'git commit --amend --no-edit -s'
+  git rebase -i stable-7.2 -x 'git commit --amend --no-edit -s'
 
 """)
 
-- 
2.39.2




[Stable-7.2.6 11/51] raven: disable reentrancy detection for iomem

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

As the code is designed for re-entrant calls from raven_io_ops to
pci-conf, mark raven_io_ops as reentrancy-safe.

Signed-off-by: Alexander Bulekov 
Message-Id: <20230427211013.2994127-8-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit 6dad5a6810d9c60ca320d01276f6133bbcfa1fc7)
Signed-off-by: Michael Tokarev 

diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c
index 7a105e4a63..42fb02b7e6 100644
--- a/hw/pci-host/raven.c
+++ b/hw/pci-host/raven.c
@@ -293,6 +293,13 @@ static void raven_pcihost_initfn(Object *obj)
 memory_region_init(&s->pci_memory, obj, "pci-memory", 0x3f00);
 address_space_init(&s->pci_io_as, &s->pci_io, "raven-io");
 
+/*
+ * Raven's raven_io_ops use the address-space API to access pci-conf-idx
+ * (which is also owned by the raven device). As such, mark the
+ * pci_io_non_contiguous as re-entrancy safe.
+ */
+s->pci_io_non_contiguous.disable_reentrancy_guard = true;
+
 /* CPU address space */
 memory_region_add_subregion(address_space_mem, PCI_IO_BASE_ADDR,
 &s->pci_io);
-- 
2.39.2




[Stable-7.2.6 06/51] checkpatch: add qemu_bh_new/aio_bh_new checks

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

Advise authors to use the _guarded versions of the APIs, instead.

Signed-off-by: Alexander Bulekov 
Reviewed-by: Darren Kenny 
Message-Id: <20230427211013.2994127-4-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit ef56ffbdd6b0605dc1e305611287b948c970e236)
Signed-off-by: Michael Tokarev 

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 6ecabfb2b5..fbb71c70f8 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2865,6 +2865,14 @@ sub process {
if ($line =~ /\bsignal\s*\(/ && !($line =~ /SIG_(?:IGN|DFL)/)) {
ERROR("use sigaction to establish signal handlers; 
signal is not portable\n" . $herecurr);
}
+# recommend qemu_bh_new_guarded instead of qemu_bh_new
+if ($realfile =~ /.*\/hw\/.*/ && $line =~ /\bqemu_bh_new\s*\(/) {
+   ERROR("use qemu_bh_new_guarded() instead of 
qemu_bh_new() to avoid reentrancy problems\n" . $herecurr);
+   }
+# recommend aio_bh_new_guarded instead of aio_bh_new
+if ($realfile =~ /.*\/hw\/.*/ && $line =~ /\baio_bh_new\s*\(/) {
+   ERROR("use aio_bh_new_guarded() instead of aio_bh_new() 
to avoid reentrancy problems\n" . $herecurr);
+   }
 # check for module_init(), use category-specific init macros explicitly please
if ($line =~ /^module_init\s*\(/) {
ERROR("please use block_init(), type_init() etc. 
instead of module_init()\n" . $herecurr);
-- 
2.39.2




[Stable-7.2.6 02/51] python: drop pipenv

2023-09-11 Thread Michael Tokarev
From: John Snow 

The pipenv tool was nice in theory, but in practice it's just too hard
to update selectively, and it makes using it a pain. The qemu.qmp repo
dropped pipenv support a while back and it's been functioning just fine,
so I'm backporting that change here to qemu.git.

Signed-off-by: John Snow 
Message-id: 20230210003147.1309376-3-js...@redhat.com
Signed-off-by: John Snow 
(cherry picked from commit 6832189fd791622c30e7bbe3a12b76be14dc1158)
Signed-off-by: Michael Tokarev 
(Mjt: the reason for this is to stop CI failing in pipenv for 7.2)

diff --git a/.gitlab-ci.d/static_checks.yml b/.gitlab-ci.d/static_checks.yml
index 289ad1359e..b4cbdbce2a 100644
--- a/.gitlab-ci.d/static_checks.yml
+++ b/.gitlab-ci.d/static_checks.yml
@@ -23,12 +23,12 @@ check-dco:
   before_script:
 - apk -U add git
 
-check-python-pipenv:
+check-python-minreqs:
   extends: .base_job_template
   stage: test
   image: $CI_REGISTRY_IMAGE/qemu/python:latest
   script:
-- make -C python check-pipenv
+- make -C python check-minreqs
   variables:
 GIT_DEPTH: 1
   needs:
diff --git a/python/.gitignore b/python/.gitignore
index 904f324bb1..c3ceb1ca0a 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -11,8 +11,8 @@ qemu.egg-info/
 .idea/
 .vscode/
 
-# virtual environments (pipenv et al)
-.venv/
+# virtual environments
+.min-venv/
 .tox/
 .dev-venv/
 
diff --git a/python/Makefile b/python/Makefile
index b170708398..c5bd6ff83a 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -1,15 +1,16 @@
 QEMU_VENV_DIR=.dev-venv
+QEMU_MINVENV_DIR=.min-venv
 QEMU_TOX_EXTRA_ARGS ?=
 
 .PHONY: help
 help:
@echo "python packaging help:"
@echo ""
-   @echo "make check-pipenv:"
-   @echo "Run tests in pipenv's virtual environment."
+   @echo "make check-minreqs:"
+   @echo "Run tests in the minreqs virtual environment."
@echo "These tests use the oldest dependencies."
-   @echo "Requires: Python 3.6 and pipenv."
-   @echo "Hint (Fedora): 'sudo dnf install python3.6 pipenv'"
+   @echo "Requires: Python 3.6"
+   @echo "Hint (Fedora): 'sudo dnf install python3.6'"
@echo ""
@echo "make check-tox:"
@echo "Run tests against multiple python versions."
@@ -33,8 +34,8 @@ help:
@echo "and install the qemu package in editable mode."
@echo "(Can be used in or outside of a venv.)"
@echo ""
-   @echo "make pipenv"
-   @echo "Creates pipenv's virtual environment (.venv)"
+   @echo "make min-venv"
+   @echo "Creates the minreqs virtual environment 
($(QEMU_MINVENV_DIR))"
@echo ""
@echo "make dev-venv"
@echo "Creates a simple venv for check-dev. ($(QEMU_VENV_DIR))"
@@ -43,21 +44,38 @@ help:
@echo "Remove package build output."
@echo ""
@echo "make distclean:"
-   @echo "remove pipenv/venv files, qemu package forwarder,"
+   @echo "remove venv files, qemu package forwarder,"
@echo "built distribution files, and everything from 'make clean'."
@echo ""
@echo -e "Have a nice day ^_^\n"
 
-.PHONY: pipenv
-pipenv: .venv
-.venv: Pipfile.lock
-   @PIPENV_VENV_IN_PROJECT=1 pipenv sync --dev --keep-outdated
-   rm -f pyproject.toml
-   @touch .venv
+.PHONY: pipenv check-pipenv
+pipenv check-pipenv:
+   @echo "pipenv was dropped; try 'make check-minreqs' or 'make min-venv'"
+   @exit 1
+
+.PHONY: min-venv
+min-venv: $(QEMU_MINVENV_DIR) $(QEMU_MINVENV_DIR)/bin/activate
+$(QEMU_MINVENV_DIR) $(QEMU_MINVENV_DIR)/bin/activate: setup.cfg 
tests/minreqs.txt
+   @echo "VENV $(QEMU_MINVENV_DIR)"
+   @python3.6 -m venv $(QEMU_MINVENV_DIR)
+   @(  \
+   echo "ACTIVATE $(QEMU_MINVENV_DIR)";\
+   . $(QEMU_MINVENV_DIR)/bin/activate; \
+   echo "INSTALL -r tests/minreqs.txt $(QEMU_MINVENV_DIR)";\
+   pip install -r tests/minreqs.txt 1>/dev/null;   \
+   echo "INSTALL -e qemu $(QEMU_MINVENV_DIR)"; \
+   pip install -e . 1>/dev/null;   \
+   )
+   @touch $(QEMU_MINVENV_DIR)
 
-.PHONY: check-pipenv
-check-pipenv: pipenv
-   @pipenv run make check
+.PHONY: check-minreqs
+check-minreqs: min-venv
+   @(  \
+   echo "ACTIVATE $(QEMU_MINVENV_DIR)";\
+   . $(QEMU_MINVENV_DIR)/bin/activate; \
+   make check; \
+   )
 
 .PHONY: dev-venv
 dev-venv: $(QEMU_VENV_DIR) $(QEMU_VENV_DIR)/bin/activate
@@ -106,6 +124,7 @@ clean:
 
 .PHONY: distclean
 distclean: clean
-   rm -rf qemu.egg-info/ .venv/ .tox/ $(QEMU_VENV_DIR) dist/
+   rm -rf qemu.egg-info/ .eggs/ dist/
+   rm -rf $(QEMU_VENV_DIR) $(QEMU_MI

[Stable-7.2.6 04/51] async: Add an optional reentrancy guard to the BH API

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

Devices can pass their MemoryReentrancyGuard (from their DeviceState),
when creating new BHes. Then, the async API will toggle the guard
before/after calling the BH call-back. This prevents bh->mmio reentrancy
issues.

Signed-off-by: Alexander Bulekov 
Reviewed-by: Darren Kenny 
Message-Id: <20230427211013.2994127-3-alx...@bu.edu>
[thuth: Fix "line over 90 characters" checkpatch.pl error]
Signed-off-by: Thomas Huth 
(cherry picked from commit 9c86c97f12c060bf7484dd931f38634e166a81f0)
Signed-off-by: Michael Tokarev 
[mjt: minor context adjustment in include/block/aio.h and 
include/qemu/main-loop.h for 7.2]

diff --git a/docs/devel/multiple-iothreads.txt 
b/docs/devel/multiple-iothreads.txt
index 343120f2ef..a3e949f6b3 100644
--- a/docs/devel/multiple-iothreads.txt
+++ b/docs/devel/multiple-iothreads.txt
@@ -61,6 +61,7 @@ There are several old APIs that use the main loop AioContext:
  * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
  * LEGACY timer_new_ms() - create a timer
  * LEGACY qemu_bh_new() - create a BH
+ * LEGACY qemu_bh_new_guarded() - create a BH with a device re-entrancy guard
  * LEGACY qemu_aio_wait() - run an event loop iteration
 
 Since they implicitly work on the main loop they cannot be used in code that
@@ -72,8 +73,14 @@ Instead, use the AioContext functions directly (see 
include/block/aio.h):
  * aio_set_event_notifier() - monitor an event notifier
  * aio_timer_new() - create a timer
  * aio_bh_new() - create a BH
+ * aio_bh_new_guarded() - create a BH with a device re-entrancy guard
  * aio_poll() - run an event loop iteration
 
+The qemu_bh_new_guarded/aio_bh_new_guarded APIs accept a "MemReentrancyGuard"
+argument, which is used to check for and prevent re-entrancy problems. For
+BHs associated with devices, the reentrancy-guard is contained in the
+corresponding DeviceState and named "mem_reentrancy_guard".
+
 The AioContext can be obtained from the IOThread using
 iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
 Code that takes an AioContext argument works both in IOThreads or the main
diff --git a/include/block/aio.h b/include/block/aio.h
index d128558f1d..0dbfd435ae 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -22,6 +22,8 @@
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
 #include "qemu/timer.h"
+#include "hw/qdev-core.h"
+
 
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -323,9 +325,11 @@ void aio_bh_schedule_oneshot_full(AioContext *ctx, 
QEMUBHFunc *cb, void *opaque,
  * is opaque and must be allocated prior to its use.
  *
  * @name: A human-readable identifier for debugging purposes.
+ * @reentrancy_guard: A guard set when entering a cb to prevent
+ * device-reentrancy issues
  */
 QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
-const char *name);
+const char *name, MemReentrancyGuard 
*reentrancy_guard);
 
 /**
  * aio_bh_new: Allocate a new bottom half structure
@@ -334,7 +338,17 @@ QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, 
void *opaque,
  * string.
  */
 #define aio_bh_new(ctx, cb, opaque) \
-aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)))
+aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
+
+/**
+ * aio_bh_new_guarded: Allocate a new bottom half structure with a
+ * reentrancy_guard
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
+ * string.
+ */
+#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
+aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
 
 /**
  * aio_notify: Force processing of pending events.
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 3c9a9a982d..5c7e95601c 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -360,9 +360,12 @@ void qemu_cond_timedwait_iothread(QemuCond *cond, int ms);
 
 void qemu_fd_register(int fd);
 
+#define qemu_bh_new_guarded(cb, opaque, guard) \
+qemu_bh_new_full((cb), (opaque), (stringify(cb)), guard)
 #define qemu_bh_new(cb, opaque) \
-qemu_bh_new_full((cb), (opaque), (stringify(cb)))
-QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name);
+qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
+QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
+ MemReentrancyGuard *reentrancy_guard);
 void qemu_bh_schedule_idle(QEMUBH *bh);
 
 enum {
diff --git a/tests/unit/ptimer-test-stubs.c b/tests/unit/ptimer-test-stubs.c
index f5e75a96b6..24d5413f9d 100644
--- a/tests/unit/ptimer-test-stubs.c
+++ b/tests/unit/ptimer-test-stubs.c
@@ -107,7 +107,8 @@ int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int 
attr_mask)
 return deadline;
 }
 
-QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name)
+QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
+ 

[Stable-7.2.6 14/51] pnv_lpc: disable reentrancy detection for lpc-hc

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

As lpc-hc is designed for re-entrant calls from xscom, mark it
re-entrancy safe.

Reported-by: Thomas Huth 
Signed-off-by: Alexander Bulekov 
[clg: mark opb_master_regs as re-entrancy safe also ]
Signed-off-by: Cédric Le Goater 
Reviewed-by: Frederic Barrat 
Tested-by: Thomas Huth 
Message-Id: <20230526073850.2772197-1-...@kaod.org>
Signed-off-by: Daniel Henrique Barboza 
(cherry picked from commit 76f9ebffcd41b62ae9ec26a1c25676f2ae1d9cc3)
Signed-off-by: Michael Tokarev 

diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index ee890e7ab4..ef29314891 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -733,10 +733,13 @@ static void pnv_lpc_realize(DeviceState *dev, Error 
**errp)
 /* Create MMIO regions for LPC HC and OPB registers */
 memory_region_init_io(&lpc->opb_master_regs, OBJECT(dev), &opb_master_ops,
   lpc, "lpc-opb-master", LPC_OPB_REGS_OPB_SIZE);
+lpc->opb_master_regs.disable_reentrancy_guard = true;
 memory_region_add_subregion(&lpc->opb_mr, LPC_OPB_REGS_OPB_ADDR,
 &lpc->opb_master_regs);
 memory_region_init_io(&lpc->lpc_hc_regs, OBJECT(dev), &lpc_hc_ops, lpc,
   "lpc-hc", LPC_HC_REGS_OPB_SIZE);
+/* xscom writes to lpc-hc. As such mark lpc-hc re-entrancy safe */
+lpc->lpc_hc_regs.disable_reentrancy_guard = true;
 memory_region_add_subregion(&lpc->opb_mr, LPC_HC_REGS_OPB_ADDR,
 &lpc->lpc_hc_regs);
 
-- 
2.39.2




[Stable-7.2.6 13/51] loongarch: mark loongarch_ipi_iocsr re-entrnacy safe

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

loongarch_ipi_iocsr MRs rely on re-entrant IO through the ipi_send
function. As such, mark these MRs re-entrancy-safe.

Fixes: a2e1753b80 ("memory: prevent dma-reentracy issues")
Signed-off-by: Alexander Bulekov 
Reviewed-by: Song Gao 
Message-Id: <20230506112145.3563708-1-alx...@bu.edu>
Signed-off-by: Song Gao 
(cherry picked from commit 6d0589e0e6c64b64a2bf980537be20389264)
Signed-off-by: Michael Tokarev 

diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c
index aa4bf9eb74..40e98af2ce 100644
--- a/hw/intc/loongarch_ipi.c
+++ b/hw/intc/loongarch_ipi.c
@@ -215,6 +215,10 @@ static void loongarch_ipi_init(Object *obj)
 for (cpu = 0; cpu < MAX_IPI_CORE_NUM; cpu++) {
 memory_region_init_io(&s->ipi_iocsr_mem[cpu], obj, &loongarch_ipi_ops,
 &lams->ipi_core[cpu], "loongarch_ipi_iocsr", 0x48);
+
+/* loongarch_ipi_iocsr performs re-entrant IO through ipi_send */
+s->ipi_iocsr_mem[cpu].disable_reentrancy_guard = true;
+
 sysbus_init_mmio(sbd, &s->ipi_iocsr_mem[cpu]);
 
 memory_region_init_io(&s->ipi64_iocsr_mem[cpu], obj, 
&loongarch_ipi64_ops,
-- 
2.39.2




[Stable-7.2.6 03/51] memory: prevent dma-reentracy issues

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
This flag is set/checked prior to calling a device's MemoryRegion
handlers, and set when device code initiates DMA.  The purpose of this
flag is to prevent two types of DMA-based reentrancy issues:

1.) mmio -> dma -> mmio case
2.) bh -> dma write -> mmio case

These issues have led to problems such as stack-exhaustion and
use-after-frees.

Summary of the problem from Peter Maydell:
https://lore.kernel.org/qemu-devel/cafeaca_23vc7he3iam-jva6w38lk4hjowae5kcknhprd5fp...@mail.gmail.com

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
Resolves: CVE-2023-0330

Signed-off-by: Alexander Bulekov 
Reviewed-by: Thomas Huth 
Message-Id: <20230427211013.2994127-2-alx...@bu.edu>
[thuth: Replace warn_report() with warn_report_once()]
Signed-off-by: Thomas Huth 
(cherry picked from commit a2e1753b8054344f32cf94f31c6399a58794a380)
Signed-off-by: Michael Tokarev 

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 91f8a2395a..124628ada4 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -741,6 +741,8 @@ struct MemoryRegion {
 bool is_iommu;
 RAMBlock *ram_block;
 Object *owner;
+/* owner as TYPE_DEVICE. Used for re-entrancy checks in MR access hotpath 
*/
+DeviceState *dev;
 
 const MemoryRegionOps *ops;
 void *opaque;
@@ -765,6 +767,9 @@ struct MemoryRegion {
 unsigned ioeventfd_nb;
 MemoryRegionIoeventfd *ioeventfds;
 RamDiscardManager *rdm; /* Only for RAM */
+
+/* For devices designed to perform re-entrant IO into their own IO MRs */
+bool disable_reentrancy_guard;
 };
 
 struct IOMMUMemoryRegion {
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 785dd5a56e..886f6bb79e 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -162,6 +162,10 @@ struct NamedClockList {
 QLIST_ENTRY(NamedClockList) node;
 };
 
+typedef struct {
+bool engaged_in_io;
+} MemReentrancyGuard;
+
 /**
  * DeviceState:
  * @realized: Indicates whether the device has been fully constructed.
@@ -194,6 +198,9 @@ struct DeviceState {
 int alias_required_for_version;
 ResettableState reset;
 GSList *unplug_blockers;
+
+/* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
+MemReentrancyGuard mem_reentrancy_guard;
 };
 
 struct DeviceListener {
diff --git a/softmmu/memory.c b/softmmu/memory.c
index bc0be3f62c..61569f8306 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -542,6 +542,18 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
 access_size_max = 4;
 }
 
+/* Do not allow more than one simultaneous access to a device's IO Regions 
*/
+if (mr->dev && !mr->disable_reentrancy_guard &&
+!mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
+if (mr->dev->mem_reentrancy_guard.engaged_in_io) {
+warn_report_once("Blocked re-entrant IO on MemoryRegion: "
+ "%s at addr: 0x%" HWADDR_PRIX,
+ memory_region_name(mr), addr);
+return MEMTX_ACCESS_ERROR;
+}
+mr->dev->mem_reentrancy_guard.engaged_in_io = true;
+}
+
 /* FIXME: support unaligned access? */
 access_size = MAX(MIN(size, access_size_max), access_size_min);
 access_mask = MAKE_64BIT_MASK(0, access_size * 8);
@@ -556,6 +568,9 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
 access_mask, attrs);
 }
 }
+if (mr->dev) {
+mr->dev->mem_reentrancy_guard.engaged_in_io = false;
+}
 return r;
 }
 
@@ -1170,6 +1185,7 @@ static void memory_region_do_init(MemoryRegion *mr,
 }
 mr->name = g_strdup(name);
 mr->owner = owner;
+mr->dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
 mr->ram_block = NULL;
 
 if (name) {
-- 
2.39.2




[Stable-7.2.6 43/51] hw/ide/ahci: PxCI should not get cleared when ERR_STAT is set

2023-09-11 Thread Michael Tokarev
From: Niklas Cassel 

For NCQ, PxCI is cleared on command queued successfully.
For non-NCQ, PxCI is cleared on command completed successfully.
Successfully means ERR_STAT, BUSY and DRQ are all cleared.

A command that has ERR_STAT set, does not get to clear PxCI.
See AHCI 1.3.1, section 5.3.8, states RegFIS:Entry and RegFIS:ClearCI,
and 5.3.16.5 ERR:FatalTaskfile.

In the case of non-NCQ commands, not clearing PxCI is needed in order
for host software to be able to see which command slot that failed.

Signed-off-by: Niklas Cassel 
Message-id: 20230609140844.202795-7-...@flawful.org
Signed-off-by: John Snow 
(cherry picked from commit 1a16ce64fda11bdf50f0c4ab5d9fdde72c1383a2)
Signed-off-by: Michael Tokarev 

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 945101d6bb..518094e215 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1522,7 +1522,8 @@ static void ahci_clear_cmd_issue(AHCIDevice *ad, uint8_t 
slot)
 {
 IDEState *ide_state = &ad->port.ifs[0];
 
-if (!(ide_state->status & (BUSY_STAT | DRQ_STAT))) {
+if (!(ide_state->status & ERR_STAT) &&
+!(ide_state->status & (BUSY_STAT | DRQ_STAT))) {
 ad->port_regs.cmd_issue &= ~(1 << slot);
 }
 }
@@ -1531,6 +1532,7 @@ static void ahci_clear_cmd_issue(AHCIDevice *ad, uint8_t 
slot)
 static void ahci_cmd_done(const IDEDMA *dma)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
+IDEState *ide_state = &ad->port.ifs[0];
 
 trace_ahci_cmd_done(ad->hba, ad->port_no);
 
@@ -1547,7 +1549,8 @@ static void ahci_cmd_done(const IDEDMA *dma)
  */
 ahci_write_fis_d2h(ad, true);
 
-if (ad->port_regs.cmd_issue && !ad->check_bh) {
+if (!(ide_state->status & ERR_STAT) &&
+ad->port_regs.cmd_issue && !ad->check_bh) {
 ad->check_bh = qemu_bh_new_guarded(ahci_check_cmd_bh, ad,
&ad->mem_reentrancy_guard);
 qemu_bh_schedule(ad->check_bh);
diff --git a/tests/qtest/libqos/ahci.c b/tests/qtest/libqos/ahci.c
index f53f12aa99..a2c94c6e06 100644
--- a/tests/qtest/libqos/ahci.c
+++ b/tests/qtest/libqos/ahci.c
@@ -404,57 +404,110 @@ void ahci_port_clear(AHCIQState *ahci, uint8_t port)
 /**
  * Check a port for errors.
  */
-void ahci_port_check_error(AHCIQState *ahci, uint8_t port,
-   uint32_t imask, uint8_t emask)
+void ahci_port_check_error(AHCIQState *ahci, AHCICommand *cmd)
 {
+uint8_t port = cmd->port;
 uint32_t reg;
 
-/* The upper 9 bits of the IS register all indicate errors. */
-reg = ahci_px_rreg(ahci, port, AHCI_PX_IS);
-reg &= ~imask;
-reg >>= 23;
-g_assert_cmphex(reg, ==, 0);
+/* If expecting TF error, ensure that TFES is set. */
+if (cmd->errors) {
+reg = ahci_px_rreg(ahci, port, AHCI_PX_IS);
+ASSERT_BIT_SET(reg, AHCI_PX_IS_TFES);
+} else {
+/* The upper 9 bits of the IS register all indicate errors. */
+reg = ahci_px_rreg(ahci, port, AHCI_PX_IS);
+reg &= ~cmd->interrupts;
+reg >>= 23;
+g_assert_cmphex(reg, ==, 0);
+}
 
-/* The Sata Error Register should be empty. */
+/* The Sata Error Register should be empty, even when expecting TF error. 
*/
 reg = ahci_px_rreg(ahci, port, AHCI_PX_SERR);
 g_assert_cmphex(reg, ==, 0);
 
+/* If expecting TF error, and TFES was set, perform error recovery
+ * (see AHCI 1.3 section 6.2.2.1) such that we can send new commands. */
+if (cmd->errors) {
+/* This will clear PxCI. */
+ahci_px_clr(ahci, port, AHCI_PX_CMD, AHCI_PX_CMD_ST);
+
+/* The port has 500ms to disengage. */
+usleep(50);
+reg = ahci_px_rreg(ahci, port, AHCI_PX_CMD);
+ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_CR);
+
+/* Clear PxIS. */
+reg = ahci_px_rreg(ahci, port, AHCI_PX_IS);
+ahci_px_wreg(ahci, port, AHCI_PX_IS, reg);
+
+/* Check if we need to perform a COMRESET.
+ * Not implemented right now, as there is no reason why our QEMU model
+ * should need a COMRESET when expecting TF error. */
+reg = ahci_px_rreg(ahci, port, AHCI_PX_TFD);
+ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_STS_BSY | AHCI_PX_TFD_STS_DRQ);
+
+/* Enable issuing new commands. */
+ahci_px_set(ahci, port, AHCI_PX_CMD, AHCI_PX_CMD_ST);
+}
+
 /* The TFD also has two error sections. */
 reg = ahci_px_rreg(ahci, port, AHCI_PX_TFD);
-if (!emask) {
+if (!cmd->errors) {
 ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_STS_ERR);
 } else {
 ASSERT_BIT_SET(reg, AHCI_PX_TFD_STS_ERR);
 }
-ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_ERR & (~emask << 8));
-ASSERT_BIT_SET(reg, AHCI_PX_TFD_ERR & (emask << 8));
+ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_ERR & (~cmd->errors << 8));
+ASSERT_BIT_SET(reg, AHCI_PX_TFD_ERR & (cmd->errors << 8));
 }
 
-void ahci_port_check_interrupts(AHCIQState *ahci, uint8_t port,
-uint32_t intr_mask)
+void ahci_port_check_interrupts(AHCIQState *ahci, AHC

[Stable-7.2.6 10/51] bcm2835_property: disable reentrancy detection for iomem

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

As the code is designed for re-entrant calls from bcm2835_property to
bcm2835_mbox and back into bcm2835_property, mark iomem as
reentrancy-safe.

Signed-off-by: Alexander Bulekov 
Reviewed-by: Thomas Huth 
Message-Id: <20230427211013.2994127-7-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit 985c4a4e547afb9573b6bd6843d20eb2c3d1d1cd)
Signed-off-by: Michael Tokarev 

diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c
index 890ae7bae5..de056ea2df 100644
--- a/hw/misc/bcm2835_property.c
+++ b/hw/misc/bcm2835_property.c
@@ -382,6 +382,13 @@ static void bcm2835_property_init(Object *obj)
 
 memory_region_init_io(&s->iomem, OBJECT(s), &bcm2835_property_ops, s,
   TYPE_BCM2835_PROPERTY, 0x10);
+
+/*
+ * bcm2835_property_ops call into bcm2835_mbox, which in-turn reads from
+ * iomem. As such, mark iomem as re-entracy safe.
+ */
+s->iomem.disable_reentrancy_guard = true;
+
 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->iomem);
 sysbus_init_irq(SYS_BUS_DEVICE(s), &s->mbox_irq);
 }
-- 
2.39.2




[Stable-7.2.6 05/51] async: avoid use-after-free on re-entrancy guard

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

A BH callback can free the BH, causing a use-after-free in aio_bh_call.
Fix that by keeping a local copy of the re-entrancy guard pointer.

Buglink: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58513
Fixes: 9c86c97f12 ("async: Add an optional reentrancy guard to the BH API")
Signed-off-by: Alexander Bulekov 
Message-Id: <20230501141956.3444868-1-alx...@bu.edu>
Reviewed-by: Thomas Huth 
Signed-off-by: Thomas Huth 
(cherry picked from commit 7915bd06f25e1803778081161bf6fa10c42dc7cd)
Signed-off-by: Michael Tokarev 

diff --git a/util/async.c b/util/async.c
index ca149507e9..a1f07fc5a7 100644
--- a/util/async.c
+++ b/util/async.c
@@ -151,18 +151,20 @@ void aio_bh_call(QEMUBH *bh)
 {
 bool last_engaged_in_io = false;
 
-if (bh->reentrancy_guard) {
-last_engaged_in_io = bh->reentrancy_guard->engaged_in_io;
-if (bh->reentrancy_guard->engaged_in_io) {
+/* Make a copy of the guard-pointer as cb may free the bh */
+MemReentrancyGuard *reentrancy_guard = bh->reentrancy_guard;
+if (reentrancy_guard) {
+last_engaged_in_io = reentrancy_guard->engaged_in_io;
+if (reentrancy_guard->engaged_in_io) {
 trace_reentrant_aio(bh->ctx, bh->name);
 }
-bh->reentrancy_guard->engaged_in_io = true;
+reentrancy_guard->engaged_in_io = true;
 }
 
 bh->cb(bh->opaque);
 
-if (bh->reentrancy_guard) {
-bh->reentrancy_guard->engaged_in_io = last_engaged_in_io;
+if (reentrancy_guard) {
+reentrancy_guard->engaged_in_io = last_engaged_in_io;
 }
 }
 
-- 
2.39.2




[Stable-7.2.6 09/51] lsi53c895a: disable reentrancy detection for MMIO region, too

2023-09-11 Thread Michael Tokarev
From: Thomas Huth 

While trying to use a SCSI disk on the LSI controller with an
older version of Fedora (25), I'm getting:

 qemu: warning: Blocked re-entrant IO on MemoryRegion: lsi-mmio at addr: 0x34

and the SCSI controller is not usable. Seems like we have to
disable the reentrancy checker for the MMIO region, too, to
get this working again.

The problem could be reproduced it like this:

./qemu-system-x86_64 -accel kvm -m 2G -machine q35 \
 -device lsi53c810,id=lsi1 -device scsi-hd,drive=d0 \
 -drive if=none,id=d0,file=.../somedisk.qcow2 \
 -cdrom Fedora-Everything-netinst-i386-25-1.3.iso

Where somedisk.qcow2 is an image that contains already some partitions
and file systems.

In the boot menu of Fedora, go to
"Troubleshooting" -> "Rescue a Fedora system" -> "3) Skip to shell"

Then check "dmesg | grep -i 53c" for failure messages, and try to mount
a partition from somedisk.qcow2.

Message-Id: <20230516090556.553813-1-th...@redhat.com>
Signed-off-by: Thomas Huth 
(cherry picked from commit d139fe9ad8a27bcc50b4ead77d2f97d191a0e95e)
Signed-off-by: Michael Tokarev 

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index 4ece478a37..ca619ed564 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -2318,6 +2318,7 @@ static void lsi_scsi_realize(PCIDevice *dev, Error **errp)
  * re-entrancy guard.
  */
 s->ram_io.disable_reentrancy_guard = true;
+s->mmio_io.disable_reentrancy_guard = true;
 
 address_space_init(&s->pci_io_as, pci_address_space_io(dev), "lsi-pci-io");
 qdev_init_gpio_out(d, &s->ext_irq, 1);
-- 
2.39.2




[Stable-7.2.6 00/51] v2 Patch Round-up for stable 7.2.6, freeze on 2023-09-19

2023-09-11 Thread Michael Tokarev
The following patches are queued for QEMU stable v7.2.6:

  https://gitlab.com/qemu-project/qemu/-/commits/staging-7.2

Patch freeze is 2023-09-19, and the release is planned for 2023-09-21:

  https://wiki.qemu.org/Planning/7.2

Please respond here or CC qemu-sta...@nongnu.org on any additional patches
you think should (or shouldn't) be included in the release.

The changes which are staging for inclusion, with the original commit hash
from master branch, are given below the bottom line.

v2:

- I decided to pick up dma/mmio reentrancy fixes from 8.1 now once
  (hopefully) all issues have been addressed.  This is

  commit a2e1753b8054344f32cf94f31c6399a58794a380
  Author: Alexander Bulekov 
  Date:   Thu Apr 27 17:10:06 2023 -0400

memory: prevent dma-reentracy issues

  with all subsequent changes, up to
  76f9ebffcd41 pnv_lpc: disable reentrancy detection for lpc-hc

  I hadn't picked this series up sooner (while I had it backported
  and tested for quite some time) because with time, some more
  places were found where reentrancy detection has to be disabled
  too (like this pnv_lpc change).

  What prompted me to look at this series again: one of the ide/ahci
  change had to be context-edited to apply to 7.2, and the context
  was the one from this reentrancy patch series.  So instead of
  editing context, I decided to pick the reentrancy series and
  apply subsequent changed cleanly.

- I've added 2 patches (one cherry-pick and one specific to stable-7.2)
  just to fix failing gitlab-CI jobs, so CI status will not be "failed"

Thanks!

/mjt

--
01 b8d1fc55b5 Michael Tokarev:
   gitlab-ci: check-dco.py: switch from master to stable-7.2 branch
02 6832189fd791 John Snow:
   python: drop pipenv
03 a2e1753b8054 Alexander Bulekov:
   memory: prevent dma-reentracy issues
04 9c86c97f12c0 Alexander Bulekov:
   async: Add an optional reentrancy guard to the BH API
05 7915bd06f25e Alexander Bulekov:
   async: avoid use-after-free on re-entrancy guard
06 ef56ffbdd6b0 Alexander Bulekov:
   checkpatch: add qemu_bh_new/aio_bh_new checks
07 f63192b0544a Alexander Bulekov:
   hw: replace most qemu_bh_new calls with qemu_bh_new_guarded
08 bfd6e7ae6a72 Alexander Bulekov:
   lsi53c895a: disable reentrancy detection for script RAM
09 d139fe9ad8a2 Thomas Huth:
   lsi53c895a: disable reentrancy detection for MMIO region, too
10 985c4a4e547a Alexander Bulekov:
   bcm2835_property: disable reentrancy detection for iomem
11 6dad5a6810d9 Alexander Bulekov:
   raven: disable reentrancy detection for iomem
12 50795ee051a3 Alexander Bulekov:
   apic: disable reentrancy detection for apic-msi
13 6d0589e0e6c6 Alexander Bulekov:
   loongarch: mark loongarch_ipi_iocsr re-entrnacy safe
14 76f9ebffcd41 Alexander Bulekov:
   pnv_lpc: disable reentrancy detection for lpc-hc
15* a1d027be95bc Zhao Liu:
   machine: Add helpers to get cores/threads per socket
16* d79a284a44bb Zhao Liu:
   hw/smbios: Fix smbios_smp_sockets caculation
17* 7298fd7de555 Zhao Liu:
   hw/smbios: Fix thread count in type4
18* 196ea60a734c Zhao Liu:
   hw/smbios: Fix core count in type4
19* 8a64609eea8c Dongli Zhang:
   dump: kdump-zlib data pages not dumped with pvtime/aarch64
20* dbdb13f931d7 Ankit Kumar:
   hw/nvme: fix CRC64 for guard tag
21* 4333f0924c2f Nathan Egge:
   linux-user/elfload: Set V in ELF_HWCAP for RISC-V
22* e73f27003e77 Richard Henderson:
   include/exec/user: Set ABI_LLONG_ALIGNMENT to 4 for microblaze
23* ea9812d93f9c Richard Henderson:
   include/exec/user: Set ABI_LLONG_ALIGNMENT to 4 for nios2
24* 6ee960823da8 Luca Bonissi:
   Fixed incorrect LLONG alignment for openrisc and cris
25* 791b2b6a9302 Ilya Leoshkevich:
   target/s390x: Fix the "ignored match" case in VSTRS
26* 23e87d419f34 Ilya Leoshkevich:
   target/s390x: Use a 16-bit immediate in VREP
27* 6db3518ba4fc Ilya Leoshkevich:
   target/s390x: Fix VSTL with a large length
28* 6a2ea6151835 Ilya Leoshkevich:
   target/s390x: Check reserved bits of VFMIN/VFMAX's M5
29* d19436291013 Thomas Huth:
   include/hw/virtio/virtio-gpu: Fix virtio-gpu with blob on big endian hosts
30* 5e0d65909c6f Akihiko Odaki:
   kvm: Introduce kvm_arch_get_default_type hook
31* 1ab445af8cd9 Akihiko Odaki:
   accel/kvm: Specify default IPA size for arm64
32* 4b3520fd93cd Richard Henderson:
   target/arm: Fix SME ST1Q
33* cd1e4db73646 Richard Henderson:
   target/arm: Fix 64-bit SSRA
34* 09a3fffae00b Philippe Mathieu-Daudé:
   docs/about/license: Update LICENSE URL
35* f187609f27b2 Fabiano Rosas:
   block-migration: Ensure we don't crash during migration cleanup
36* 6ec65b69ba17 Maksim Kostin:
   hw/ppc/e500: fix broken snapshot replay
37* 7b8589d7ce7e Nicholas Piggin:
   ppc/vof: Fix missed fields in VOF cleanup
38* af03aeb631ee Richard Henderson:
   target/ppc: Flush inputs to zero with NJ in ppc_store_vscr
39* c3461c6264a7 Niklas Cassel:
   hw/ide/core: set ERR_STAT in unsupported command completion
40* 2967dc8209dd Niklas Cassel:
   hw/ide/ahci: write D2H FIS when processin

[Stable-7.2.6 08/51] lsi53c895a: disable reentrancy detection for script RAM

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

As the code is designed to use the memory APIs to access the script ram,
disable reentrancy checks for the pseudo-RAM ram_io MemoryRegion.

In the future, ram_io may be converted from an IO to a proper RAM MemoryRegion.

Reported-by: Fiona Ebner 
Signed-off-by: Alexander Bulekov 
Reviewed-by: Thomas Huth 
Reviewed-by: Darren Kenny 
Message-Id: <20230427211013.2994127-6-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit bfd6e7ae6a72b84e2eb9574f56e6ec037f05182c)
Signed-off-by: Michael Tokarev 

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index 42532c4744..4ece478a37 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -2313,6 +2313,12 @@ static void lsi_scsi_realize(PCIDevice *dev, Error 
**errp)
 memory_region_init_io(&s->io_io, OBJECT(s), &lsi_io_ops, s,
   "lsi-io", 256);
 
+/*
+ * Since we use the address-space API to interact with ram_io, disable the
+ * re-entrancy guard.
+ */
+s->ram_io.disable_reentrancy_guard = true;
+
 address_space_init(&s->pci_io_as, pci_address_space_io(dev), "lsi-pci-io");
 qdev_init_gpio_out(d, &s->ext_irq, 1);
 
-- 
2.39.2




[Stable-7.2.6 07/51] hw: replace most qemu_bh_new calls with qemu_bh_new_guarded

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

This protects devices from bh->mmio reentrancy issues.

Thanks: Thomas Huth  for diagnosing OS X test failure.
Signed-off-by: Alexander Bulekov 
Reviewed-by: Darren Kenny 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Michael S. Tsirkin 
Reviewed-by: Paul Durrant 
Reviewed-by: Thomas Huth 
Message-Id: <20230427211013.2994127-5-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit f63192b0544af5d3e4d5edfd85ab520fcf671377)
Signed-off-by: Michael Tokarev 

diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index ab1df8dd2f..8459736f2d 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -62,6 +62,7 @@ typedef struct Xen9pfsDev {
 
 int num_rings;
 Xen9pfsRing *rings;
+MemReentrancyGuard mem_reentrancy_guard;
 } Xen9pfsDev;
 
 static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev);
@@ -448,7 +449,9 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
 xen_9pdev->rings[i].ring.out = xen_9pdev->rings[i].data +
XEN_FLEX_RING_SIZE(ring_order);
 
-xen_9pdev->rings[i].bh = qemu_bh_new(xen_9pfs_bh, 
&xen_9pdev->rings[i]);
+xen_9pdev->rings[i].bh = qemu_bh_new_guarded(xen_9pfs_bh,
+ &xen_9pdev->rings[i],
+ 
&xen_9pdev->mem_reentrancy_guard);
 xen_9pdev->rings[i].out_cons = 0;
 xen_9pdev->rings[i].out_size = 0;
 xen_9pdev->rings[i].inprogress = false;
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 26f965cabc..49e59cef8a 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -127,7 +127,8 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, 
VirtIOBlkConf *conf,
 } else {
 s->ctx = qemu_get_aio_context();
 }
-s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
+s->bh = aio_bh_new_guarded(s->ctx, notify_guest_bh, s,
+   &DEVICE(vdev)->mem_reentrancy_guard);
 s->batch_notify_vqs = bitmap_new(conf->num_queues);
 
 *dataplane = s;
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 2785b9e849..e31806b317 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -632,8 +632,9 @@ XenBlockDataPlane *xen_block_dataplane_create(XenDevice 
*xendev,
 } else {
 dataplane->ctx = qemu_get_aio_context();
 }
-dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
-   dataplane);
+dataplane->bh = aio_bh_new_guarded(dataplane->ctx, xen_block_dataplane_bh,
+   dataplane,
+   &DEVICE(xendev)->mem_reentrancy_guard);
 
 return dataplane;
 }
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 7d4601cb5d..dd619f0731 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -985,7 +985,8 @@ static void virtser_port_device_realize(DeviceState *dev, 
Error **errp)
 return;
 }
 
-port->bh = qemu_bh_new(flush_queued_data_bh, port);
+port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
+   &dev->mem_reentrancy_guard);
 port->elem = NULL;
 }
 
diff --git a/hw/display/qxl.c b/hw/display/qxl.c
index 6772849dec..67efa3c3ef 100644
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -2223,11 +2223,14 @@ static void qxl_realize_common(PCIQXLDevice *qxl, Error 
**errp)
 
 qemu_add_vm_change_state_handler(qxl_vm_change_state_handler, qxl);
 
-qxl->update_irq = qemu_bh_new(qxl_update_irq_bh, qxl);
+qxl->update_irq = qemu_bh_new_guarded(qxl_update_irq_bh, qxl,
+  &DEVICE(qxl)->mem_reentrancy_guard);
 qxl_reset_state(qxl);
 
-qxl->update_area_bh = qemu_bh_new(qxl_render_update_area_bh, qxl);
-qxl->ssd.cursor_bh = qemu_bh_new(qemu_spice_cursor_refresh_bh, &qxl->ssd);
+qxl->update_area_bh = qemu_bh_new_guarded(qxl_render_update_area_bh, qxl,
+  
&DEVICE(qxl)->mem_reentrancy_guard);
+qxl->ssd.cursor_bh = qemu_bh_new_guarded(qemu_spice_cursor_refresh_bh, 
&qxl->ssd,
+ 
&DEVICE(qxl)->mem_reentrancy_guard);
 }
 
 static void qxl_realize_primary(PCIDevice *dev, Error **errp)
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 4e2e0dd53a..7c13b056b9 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -1356,8 +1356,10 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error 
**errp)
 
 g->ctrl_vq = virtio_get_queue(vdev, 0);
 g->cursor_vq = virtio_get_queue(vdev, 1);
-g->ctrl_bh = qemu_bh_new(virtio_gpu_ctrl_bh, g);
-g->cursor_bh = qemu_bh_new(virtio_gpu_cursor_bh, g);
+g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g,
+

Re: riscv64 virt board crash upon startup

2023-09-11 Thread Marc-André Lureau
Hi

On Fri, Sep 8, 2023 at 3:55 AM Laszlo Ersek  wrote:
>
> On 9/8/23 01:47, Laszlo Ersek wrote:
>
> > I don't know why qemu_console_is_multihead() used a lot of QOM
> > trickery for this in the first place, but here's what I'd propose as
> > fix -- simply try to locate a QemuGraphicConsole in "consoles" that
> > references the same "device" that *this* QemuGraphicConsole
> > references, but by a different "head" number.
>
> So, the final version of the function would look like:
>
> static bool qemu_graphic_console_is_multihead(QemuGraphicConsole *c)
> {
> QemuConsole *con;
>
> QTAILQ_FOREACH(con, &consoles, next) {
> if (!QEMU_IS_GRAPHIC_CONSOLE(con)) {
> continue;
> }
> QemuGraphicConsole *candidate = QEMU_GRAPHIC_CONSOLE(con);
> if (candidate->device != c->device) {
> continue;
> }
>
> if (candidate->head != c->head) {
> return true;
> }
> }
> return false;
> }
>

ack, can you send a patch?




[Stable-7.2.6 12/51] apic: disable reentrancy detection for apic-msi

2023-09-11 Thread Michael Tokarev
From: Alexander Bulekov 

As the code is designed for re-entrant calls to apic-msi, mark apic-msi
as reentrancy-safe.

Signed-off-by: Alexander Bulekov 
Reviewed-by: Darren Kenny 
Message-Id: <20230427211013.2994127-9-alx...@bu.edu>
Signed-off-by: Thomas Huth 
(cherry picked from commit 50795ee051a342c681a9b45671c552fbd6274db8)
Signed-off-by: Michael Tokarev 

diff --git a/hw/intc/apic.c b/hw/intc/apic.c
index 3df11c34d6..a7c2b301a8 100644
--- a/hw/intc/apic.c
+++ b/hw/intc/apic.c
@@ -883,6 +883,13 @@ static void apic_realize(DeviceState *dev, Error **errp)
 memory_region_init_io(&s->io_memory, OBJECT(s), &apic_io_ops, s, 
"apic-msi",
   APIC_SPACE_SIZE);
 
+/*
+ * apic-msi's apic_mem_write can call into ioapic_eoi_broadcast, which can
+ * write back to apic-msi. As such mark the apic-msi region re-entrancy
+ * safe.
+ */
+s->io_memory.disable_reentrancy_guard = true;
+
 s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, apic_timer, s);
 local_apics[s->id] = s;
 
-- 
2.39.2




Re: riscv64 virt board crash upon startup

2023-09-11 Thread Marc-André Lureau
Hi

On Fri, Sep 8, 2023 at 3:47 AM Laszlo Ersek  wrote:
>
> Question for Gerd below:
>
> On 9/7/23 14:29, Philippe Mathieu-Daudé wrote:
> > On 7/9/23 13:25, Laszlo Ersek wrote:
> >> This is with QEMU v8.1.0-391-gc152379422a2.
> >>
> >> I use the command line from (scroll to the bottom):
> >>
> >>https://github.com/tianocore/edk2/commit/49f06b664018
> >>
> >> (with "-full-screen" removed).
> >>
> >> The crash is as follows:
> >>
> >>Unexpected error in object_property_find_err() at
> >> ../../src/upstream/qemu/qom/object.c:1314:
> >>qemu: Property 'qemu-fixed-text-console.device' not found
> >>Aborted (core dumped)
> >
> > Cc'ing Marc-André for commit b208f745a8
> > ("ui/console: introduce different console objects")
>
[...]
> Gerd, here's the question for you: why are "device" and "head" QOM properties 
> in the first place? What are they needed for?
>

You get QOM tree introspection (ex: (qemu) qom-get
/backend/console[0]/device type). Other than that, I don't think it
brings anything else.

> I've found the following two commits:
> - aa2beaa1f57c ("console: add device link to QemuConsoles", 2013-04-25)
> - 5643706a0950 ("console: add head to index to qemu consoles.", 2014-03-05)
>
> But I don't understand *how* having a "link" helps for figuring out which 
> qemu console displays which device.
>
>




Re: [PATCH RESEND 04/15] ppc: spapr: Start using nested.api for nested kvm-hv api

2023-09-11 Thread Harsh Prateek Bora




On 9/7/23 07:05, Nicholas Piggin wrote:

On Wed Sep 6, 2023 at 2:33 PM AEST, Harsh Prateek Bora wrote:

With this patch, isolating kvm-hv nested api code to be executed only
when cap-nested-hv is set. This helps keeping api specific logic
mutually exclusive.


Changelog needs a bit of improvement. Emphasis on "why" for changelogs.
If you take a changeset that makes a single logical change to the code,
you should be able to understand why that is done. You could make some
assumptions about the bigger series when it comes to details so don't
have to explain from first principles. But if it's easy to explain why
the high level, you could.

Why are we adding this fundamentally? So that the spapr nested code can
be extended to support a second API.

This patch should add the api field to the struct, and also the
NESTED_API_KVM_HV definition.


Sure, folding related changes (struct member, macros) into this patch
and updating changelog as suggested sounds more meaningful to me too.

regards,
Harsh



Thanks,
Nick



Signed-off-by: Michael Neuling 
Signed-off-by: Harsh Prateek Bora 
---
  hw/ppc/spapr.c  | 7 ++-
  hw/ppc/spapr_caps.c | 1 +
  2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e44686b04d..0aa9f21516 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1334,8 +1334,11 @@ static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, 
PowerPCCPU *cpu,
  /* Copy PATE1:GR into PATE0:HR */
  entry->dw0 = spapr->patb_entry & PATE0_HR;
  entry->dw1 = spapr->patb_entry;
+return true;
+}
+assert(spapr->nested.api);
  
-} else {

+if (spapr->nested.api == NESTED_API_KVM_HV) {
  uint64_t patb, pats;
  
  assert(lpid != 0);

@@ -3437,6 +3440,8 @@ static void spapr_instance_init(Object *obj)
  spapr_get_host_serial, spapr_set_host_serial);
  object_property_set_description(obj, "host-serial",
  "Host serial number to advertise in guest device tree");
+/* Nested */
+spapr->nested.api = 0;
  }
  
  static void spapr_machine_finalizefn(Object *obj)

diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 5a0755d34f..a3a790b026 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -454,6 +454,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState 
*spapr,
  return;
  }
  
+spapr->nested.api = NESTED_API_KVM_HV;

  if (kvm_enabled()) {
  if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
spapr->max_compat_pvr)) {






Re: [PATCH 1/7] bsd-user: spelling fixes

2023-09-11 Thread Michael Tokarev

Warner, Kyle, can you take a look please?

https://patchew.org/QEMU/20230909131258.354675-1-...@tls.msk.ru/20230909131258.354675-2-...@tls.msk.ru/

09.09.2023 16:12, Michael Tokarev:

Signed-off-by: Michael Tokarev 
---
  bsd-user/errno_defs.h| 2 +-
  bsd-user/freebsd/target_os_siginfo.h | 2 +-
  bsd-user/freebsd/target_os_stack.h   | 4 ++--
  bsd-user/freebsd/target_os_user.h| 2 +-
  bsd-user/qemu.h  | 2 +-
  bsd-user/signal-common.h | 4 ++--
  bsd-user/signal.c| 6 +++---
  7 files changed, 11 insertions(+), 11 deletions(-)


Thanks,

/mjt



Re: riscv64 virt board crash upon startup

2023-09-11 Thread Gerd Hoffmann
On Mon, Sep 11, 2023 at 12:12:43PM +0400, Marc-André Lureau wrote:
> > Gerd, here's the question for you: why are "device" and "head" QOM 
> > properties in the first place? What are they needed for?
> >
> 
> You get QOM tree introspection (ex: (qemu) qom-get
> /backend/console[0]/device type). Other than that, I don't think it
> brings anything else.

You can configure vnc server(s) to show a specific device + head, which
allows to run multihead configurations by using multiple vnc servers (one
for each head).

You can link input devices to device + head, so input events can go to
different devices depending on where they are coming from.  Which is
most useful for tablet devices in a vnc multihead setup, each head has
its own tablet device then.  Requires manual guest-side configuration
to establish the same tablet <-> head relationship (tested that years
ago with X11, not sure if and how this can be done with wayland).

HTH,
  Gerd




Re: [PATCH 2/7] i386: spelling fixes

2023-09-11 Thread Michael Tokarev

Paolo, Michael, can you take a quick look please?

https://patchew.org/QEMU/20230909131258.354675-1-...@tls.msk.ru/20230909131258.354675-3-...@tls.msk.ru/

09.09.2023 16:12, Michael Tokarev:

Signed-off-by: Michael Tokarev 
---
  host/include/i386/host/cpuinfo.h | 2 +-
  hw/i386/acpi-build.c | 4 ++--
  hw/i386/amd_iommu.c  | 4 ++--
  hw/i386/intel_iommu.c| 4 ++--
  hw/i386/kvm/xen_xenstore.c   | 2 +-
  hw/i386/kvm/xenstore_impl.c  | 2 +-
  hw/i386/pc.c | 4 ++--
  include/hw/i386/topology.h   | 2 +-
  target/i386/cpu.c| 4 ++--
  target/i386/cpu.h| 4 ++--
  target/i386/kvm/kvm.c| 4 ++--
  target/i386/kvm/xen-emu.c| 2 +-
  target/i386/machine.c| 4 ++--
  target/i386/tcg/translate.c  | 8 
  tests/tcg/i386/system/boot.S | 2 +-
  tests/tcg/i386/x86.csv   | 2 +-
  16 files changed, 27 insertions(+), 27 deletions(-)


Thanks,

/mjt



Re: QEMU Stable series

2023-09-11 Thread Peter Maydell
On Sun, 10 Sept 2023 at 22:01, Michael Tokarev  wrote:
>
> Hi!
>
> For quite some time I'm collecting stuff for stable, poking various
> people with questions about stable, etc, so it has become somewhat more
> visible and somewhat annoying too :)  Especially when I publish the
> next stable patch round-up, which has not only become larger when
> counting individual patches, but also multiplied by now 3 stable
> series.  So I guess you wondered what's going on, what's the buzz
> is all about, and when this ever stop... :)
>
> Seriously though, I stepped up to maintain -stable series just out of
> nowhere, just because I had issues with qemu in debian and thought to
> do something with that, but instead of collecting stuff privately
> inside debian, I decided to give it a more visible try, to see how
> it will work out, without understanding how it will be.

Thanks very much for taking on this work -- it's been an
area where for some time the project has lagged behind because
we didn't have anybody who could dedicate the time to doing
stable releases on a regular basis. So it's been great to
see the stable backport branches being more active.

> Meanwhile, next debian stable has been released, codenamed Bookworm,
> which has qemu version 7.2.  And it should be supported for the next
> 2 years until next debian release.
>
> We never had any long-maintained releases in QEMU before, usually the
> previous series maintenance stopped once next major release is out.
> Right now there's stable-7.2 and stable-8.0 still in existence even
> after 8.1 has been released.  I should draw the line somewhere, even
> while so far, the whole stuff has been quite easy (but time-consuming).
>
> For now I decided I'll stop publishing stable-8.0 releases.  This one
> had a number of linux-user issues, a big share of which Richard fixed
> at the very end of 8.1 development cycle.  There will be one more 8.0
> stable release at least (see below for the details), together with the
> first 8.1 stable.
>
> I think this is more appropriate to drop support for previous stable
> not with next major, but with first major stable release instead, -
> this way users have much more choices for smooth upgrade to the next
> major version.  So stable-8.0 will end with 8.1.1.  Unless there's
> a good reason to continue.

That seems reasonable. I think ultimately what we do in stable
releases depends on what people care enough about to want to
do the backport-and-release work for, though :-)

thanks
-- PMM



Re: [PATCH 3/7] ppc: spelling fixes

2023-09-11 Thread Michael Tokarev

Daniel, Cédric, can you take a quick look please?

https://patchew.org/QEMU/20230909131258.354675-1-...@tls.msk.ru/20230909131258.354675-4-...@tls.msk.ru/

09.09.2023 16:12, Michael Tokarev:

Signed-off-by: Michael Tokarev 
---
  host/include/ppc/host/cpuinfo.h |  2 +-
  hw/ppc/ppc.c|  2 +-
  hw/ppc/prep_systemio.c  |  2 +-
  hw/ppc/spapr.c  |  8 
  hw/ppc/spapr_hcall.c|  2 +-
  hw/ppc/spapr_nvdimm.c   |  4 ++--
  hw/ppc/spapr_pci_vfio.c |  2 +-
  include/hw/ppc/openpic.h|  2 +-
  include/hw/ppc/spapr.h  |  2 +-
  target/ppc/cpu-models.h |  4 ++--
  target/ppc/cpu.h|  2 +-
  target/ppc/cpu_init.c   |  4 ++--
  target/ppc/excp_helper.c| 14 +++---
  target/ppc/power8-pmu-regs.c.inc|  4 ++--
  target/ppc/translate/vmx-impl.c.inc |  6 +++---
  15 files changed, 30 insertions(+), 30 deletions(-)


Thanks,

/mjt



RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong


>-Original Message-
>From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org devel-bounces+zhenzhong.duan=intel@nongnu.org> On Behalf Of Joao
>Martins
>Sent: Friday, September 8, 2023 5:30 PM
>Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>QEMU includes in the 64-bit range the RAM regions at the lower part
>and vfio-pci device RAM regions which are at the top of the address
>space. This range contains a large gap and the size can be bigger than
>the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>
>To avoid such large ranges, introduce a new PCI range covering the
>vfio-pci device RAM regions, this only if the addresses are above 4GB
>to avoid breaking potential SeaBIOS guests.
>
>Signed-off-by: Joao Martins 
>[ clg: - wrote commit log
>   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>Signed-off-by: Cédric Le Goater 
>---
>v2:
>* s/minpci/minpci64/
>* s/maxpci/maxpci64/
>* Expand comment to cover the pci-hole64 and why we don't do special
>  handling of pci-hole32.
>---
> hw/vfio/common.c | 71 +---
> hw/vfio/trace-events |  2 +-
> 2 files changed, 61 insertions(+), 12 deletions(-)
>
>diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>index 237101d03844..134649226d43 100644
>--- a/hw/vfio/common.c
>+++ b/hw/vfio/common.c
>@@ -27,6 +27,7 @@
>
> #include "hw/vfio/vfio-common.h"
> #include "hw/vfio/vfio.h"
>+#include "hw/vfio/pci.h"
> #include "exec/address-spaces.h"
> #include "exec/memory.h"
> #include "exec/ram_addr.h"
>@@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
> hwaddr max32;
> hwaddr min64;
> hwaddr max64;
>+hwaddr minpci64;
>+hwaddr maxpci64;
> } VFIODirtyRanges;
>
> typedef struct VFIODirtyRangesListener {
>@@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
> MemoryListener listener;
> } VFIODirtyRangesListener;
>
>+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>+ VFIOContainer *container)
>+{
>+VFIOPCIDevice *pcidev;
>+VFIODevice *vbasedev;
>+VFIOGroup *group;
>+Object *owner;
>+
>+owner = memory_region_owner(section->mr);
>+
>+QLIST_FOREACH(group, &container->group_list, container_next) {
>+QLIST_FOREACH(vbasedev, &group->device_list, next) {
>+if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>+continue;
>+}
>+pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>+if (OBJECT(pcidev) == owner) {
>+return true;
>+}
>+}
>+}
>+
>+return false;
>+}

What about simplify it with memory_region_is_ram_device()?
This way vdpa device could also be included.

Thanks
Zhenzhong



[PATCH v3] hw/loongarch: Add virtio-mmio bus support

2023-09-11 Thread Tianrui Zhao
Add virtio-mmio bus support for LoongArch, so that devices
could be added in the virtio-mmio bus.

Signed-off-by: Tianrui Zhao 
Change-Id: Ib882005106562e0dfe74122a7fa2430fa081bfb2
---
 hw/loongarch/Kconfig   |  1 +
 hw/loongarch/acpi-build.c  | 25 +
 hw/loongarch/virt.c| 28 
 include/hw/pci-host/ls7a.h |  4 
 4 files changed, 58 insertions(+)

diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig
index 1e7c5b43c5..01ab8ce8e7 100644
--- a/hw/loongarch/Kconfig
+++ b/hw/loongarch/Kconfig
@@ -22,3 +22,4 @@ config LOONGARCH_VIRT
 select DIMM
 select PFLASH_CFI01
 select ACPI_HMAT
+select VIRTIO_MMIO
diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c
index ae292fc543..d033fc2271 100644
--- a/hw/loongarch/acpi-build.c
+++ b/hw/loongarch/acpi-build.c
@@ -363,6 +363,30 @@ static void acpi_dsdt_add_tpm(Aml *scope, 
LoongArchMachineState *vms)
 }
 #endif
 
+static void acpi_dsdt_add_virtio(Aml *scope)
+{
+int i;
+hwaddr base = VIRT_VIRTIO_MMIO_BASE;
+hwaddr size = VIRT_VIRTIO_MMIO_SIZE;
+
+for (i = 0; i < VIRT_VIRTIO_MMIO_NUM; i++) {
+uint32_t irq = VIRT_VIRTIO_MMIO_IRQ + i;
+Aml *dev = aml_device("VR%02u", i);
+
+aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0005")));
+aml_append(dev, aml_name_decl("_UID", aml_int(i)));
+aml_append(dev, aml_name_decl("_CCA", aml_int(1)));
+
+Aml *crs = aml_resource_template();
+aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE));
+aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+   AML_EXCLUSIVE, &irq, 1));
+aml_append(dev, aml_name_decl("_CRS", crs));
+aml_append(scope, dev);
+base += size;
+}
+}
+
 /* build DSDT */
 static void
 build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine)
@@ -381,6 +405,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
 #ifdef CONFIG_TPM
 acpi_dsdt_add_tpm(dsdt, lams);
 #endif
+acpi_dsdt_add_virtio(dsdt);
 /* System State Package */
 scope = aml_scope("\\");
 pkg = aml_package(4);
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 2629128aed..ffef3222da 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -116,6 +116,25 @@ static void fdt_add_rtc_node(LoongArchMachineState *lams)
 g_free(nodename);
 }
 
+static void fdt_add_virtio_mmio_node(LoongArchMachineState *lams)
+{
+int i;
+MachineState *ms = MACHINE(lams);
+
+for (i = VIRT_VIRTIO_MMIO_NUM - 1; i >= 0; i--) {
+char *nodename;
+hwaddr base = VIRT_VIRTIO_MMIO_BASE + i * VIRT_VIRTIO_MMIO_SIZE;
+
+nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base);
+qemu_fdt_add_subnode(ms->fdt, nodename);
+qemu_fdt_setprop_string(ms->fdt, nodename,
+"compatible", "virtio,mmio");
+qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg",
+ 2, base, 2, VIRT_VIRTIO_MMIO_SIZE);
+g_free(nodename);
+}
+}
+
 static void fdt_add_uart_node(LoongArchMachineState *lams)
 {
 char *nodename;
@@ -560,6 +579,15 @@ static void loongarch_devices_init(DeviceState *pch_pic, 
LoongArchMachineState *
  VIRT_RTC_IRQ - VIRT_GSI_BASE));
 fdt_add_rtc_node(lams);
 
+/* virtio-mmio device */
+for (i = 0; i < VIRT_VIRTIO_MMIO_NUM; i++) {
+hwaddr virtio_base = VIRT_VIRTIO_MMIO_BASE + i * VIRT_VIRTIO_MMIO_SIZE;
+int virtio_irq = VIRT_VIRTIO_MMIO_IRQ - VIRT_GSI_BASE + i;
+sysbus_create_simple("virtio-mmio", virtio_base,
+  qdev_get_gpio_in(pch_pic, virtio_irq));
+}
+fdt_add_virtio_mmio_node(lams);
+
 pm_mem = g_new(MemoryRegion, 1);
 memory_region_init_io(pm_mem, NULL, &loongarch_virt_pm_ops,
   NULL, "loongarch_virt_pm", PM_SIZE);
diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h
index e753449593..96506b9a4c 100644
--- a/include/hw/pci-host/ls7a.h
+++ b/include/hw/pci-host/ls7a.h
@@ -42,6 +42,10 @@
 #define VIRT_RTC_REG_BASE(VIRT_MISC_REG_BASE + 0x00050100)
 #define VIRT_RTC_LEN 0x100
 #define VIRT_SCI_IRQ (VIRT_GSI_BASE + 4)
+#define VIRT_VIRTIO_MMIO_IRQ (VIRT_GSI_BASE + 7)
+#define VIRT_VIRTIO_MMIO_BASE0x1e20
+#define VIRT_VIRTIO_MMIO_SIZE0x200
+#define VIRT_VIRTIO_MMIO_NUM 4
 
 #define VIRT_PLATFORM_BUS_BASEADDRESS   0x1600
 #define VIRT_PLATFORM_BUS_SIZE  0x200
-- 
2.39.1




Re: [PATCH v2 10/19] target/riscv: remove kvm-stub.c

2023-09-11 Thread Andrew Jones
On Mon, Sep 11, 2023 at 09:49:06AM +0200, Andrew Jones wrote:
> On Wed, Sep 06, 2023 at 12:23:19PM +0200, Philippe Mathieu-Daudé wrote:
> > On 6/9/23 11:16, Daniel Henrique Barboza wrote:
> > > This file is not needed for some time now. All the stubs implemented in
> > > it (kvm_riscv_reset_vcpu() and kvm_riscv_set_irq()) are wrapped in 'if
> > > kvm_enabled()' blocks that the compiler will rip it out in non-KVM
> > > builds.
> > > 
> > > We'll also add non-KVM stubs for all functions declared in kvm_riscv.h.
> > > All stubs are implemented as g_asserted_not_reached(), meaning that we
> > > won't support them in non-KVM builds. This is done by other kvm headers
> > > like kvm_arm.h and kvm_ppc.h.
> > 
> > Aren't them also protected by kvm_enabled()? Otherwise shouldn't they?
> 
> Yes, I think your earlier suggestion that we always invoke kvm functions
> from non-kvm files with a kvm_enabled() guard makes sense.
> 
> > 
> > > Signed-off-by: Daniel Henrique Barboza 
> > > ---
> > >   target/riscv/kvm-stub.c  | 30 --
> > >   target/riscv/kvm_riscv.h | 31 +++
> > >   target/riscv/meson.build |  2 +-
> > >   3 files changed, 32 insertions(+), 31 deletions(-)
> > >   delete mode 100644 target/riscv/kvm-stub.c
> > 
> > 
> > > diff --git a/target/riscv/kvm_riscv.h b/target/riscv/kvm_riscv.h
> > > index f6501e68e2..c9ecd9a967 100644
> > > --- a/target/riscv/kvm_riscv.h
> > > +++ b/target/riscv/kvm_riscv.h
> > > @@ -19,6 +19,7 @@
> > >   #ifndef QEMU_KVM_RISCV_H
> > >   #define QEMU_KVM_RISCV_H
> > > +#ifdef CONFIG_KVM
> > >   void kvm_riscv_cpu_add_kvm_properties(Object *obj);
> > 
> > At a glance kvm_riscv_cpu_add_kvm_properties() is.
> > Keep the prototype declared (before #ifdef CONFIG_KVM) is enough for the
> > compiler to elide it.
> 
> Yes, when building without CONFIG_KVM enabled it's actually better to not
> have the stubs, since the compiler will catch an unguarded kvm function
> call (assuming the kvm function is defined in a file which is only built
> with CONFIG_KVM).
> 
> Unfortunately we don't have anything to protect developers from forgetting
> the kvm_enabled() guard when building a QEMU which supports both TCG and
> KVM. We could try to remember to put 'assert(kvm_enabled())' at the start
> of each of these types of functions. It looks like mips does that for a
> couple functions.

Eh, ignore this suggestion. We don't need asserts, because we have CI. As
long as our CI does a CONFIG_KVM=n build and all KVM functions are in kvm-
only files, then we'll always catch calls of KVM functions which are
missing their kvm_enabled() guards.

Thanks,
drew



Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>> -Original Message-
>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org > devel-bounces+zhenzhong.duan=intel@nongnu.org> On Behalf Of Joao
>> Martins
>> Sent: Friday, September 8, 2023 5:30 PM
>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>> QEMU includes in the 64-bit range the RAM regions at the lower part
>> and vfio-pci device RAM regions which are at the top of the address
>> space. This range contains a large gap and the size can be bigger than
>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>
>> To avoid such large ranges, introduce a new PCI range covering the
>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>> to avoid breaking potential SeaBIOS guests.
>>
>> Signed-off-by: Joao Martins 
>> [ clg: - wrote commit log
>>   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>> Signed-off-by: Cédric Le Goater 
>> ---
>> v2:
>> * s/minpci/minpci64/
>> * s/maxpci/maxpci64/
>> * Expand comment to cover the pci-hole64 and why we don't do special
>>  handling of pci-hole32.
>> ---
>> hw/vfio/common.c | 71 +---
>> hw/vfio/trace-events |  2 +-
>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 237101d03844..134649226d43 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -27,6 +27,7 @@
>>
>> #include "hw/vfio/vfio-common.h"
>> #include "hw/vfio/vfio.h"
>> +#include "hw/vfio/pci.h"
>> #include "exec/address-spaces.h"
>> #include "exec/memory.h"
>> #include "exec/ram_addr.h"
>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>> hwaddr max32;
>> hwaddr min64;
>> hwaddr max64;
>> +hwaddr minpci64;
>> +hwaddr maxpci64;
>> } VFIODirtyRanges;
>>
>> typedef struct VFIODirtyRangesListener {
>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>> MemoryListener listener;
>> } VFIODirtyRangesListener;
>>
>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>> + VFIOContainer *container)
>> +{
>> +VFIOPCIDevice *pcidev;
>> +VFIODevice *vbasedev;
>> +VFIOGroup *group;
>> +Object *owner;
>> +
>> +owner = memory_region_owner(section->mr);
>> +
>> +QLIST_FOREACH(group, &container->group_list, container_next) {
>> +QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>> +continue;
>> +}
>> +pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +if (OBJECT(pcidev) == owner) {
>> +return true;
>> +}
>> +}
>> +}
>> +
>> +return false;
>> +}
> 
> What about simplify it with memory_region_is_ram_device()?
> This way vdpa device could also be included.
> 

Note that the check is not interested in RAM (or any other kind of memory like
VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
cover it? If so, I am all for the simplification.

Joao



Re: [PATCH 3/7] ppc: spelling fixes

2023-09-11 Thread Cédric Le Goater

On 9/9/23 15:12, Michael Tokarev wrote:

Signed-off-by: Michael Tokarev 


LGTM,

Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  host/include/ppc/host/cpuinfo.h |  2 +-
  hw/ppc/ppc.c|  2 +-
  hw/ppc/prep_systemio.c  |  2 +-
  hw/ppc/spapr.c  |  8 
  hw/ppc/spapr_hcall.c|  2 +-
  hw/ppc/spapr_nvdimm.c   |  4 ++--
  hw/ppc/spapr_pci_vfio.c |  2 +-
  include/hw/ppc/openpic.h|  2 +-
  include/hw/ppc/spapr.h  |  2 +-
  target/ppc/cpu-models.h |  4 ++--
  target/ppc/cpu.h|  2 +-
  target/ppc/cpu_init.c   |  4 ++--
  target/ppc/excp_helper.c| 14 +++---
  target/ppc/power8-pmu-regs.c.inc|  4 ++--
  target/ppc/translate/vmx-impl.c.inc |  6 +++---
  15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/host/include/ppc/host/cpuinfo.h b/host/include/ppc/host/cpuinfo.h
index 29ee7f9ef8..38b8eabe2a 100644
--- a/host/include/ppc/host/cpuinfo.h
+++ b/host/include/ppc/host/cpuinfo.h
@@ -1,6 +1,6 @@
  /*
   * SPDX-License-Identifier: GPL-2.0-or-later
- * Host specific cpu indentification for ppc.
+ * Host specific cpu identification for ppc.
   */
  
  #ifndef HOST_CPUINFO_H

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index aeb116d919..be167710a3 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -738,7 +738,7 @@ static target_ulong _cpu_ppc_load_decr(CPUPPCState *env, 
int64_t now)
  decr = __cpu_ppc_load_decr(env, now, tb_env->decr_next);
  
  /*

- * If large decrementer is enabled then the decrementer is signed extened
+ * If large decrementer is enabled then the decrementer is signed extended
   * to 64 bits, otherwise it is a 32 bit value.
   */
  if (env->spr[SPR_LPCR] & LPCR_LD) {
diff --git a/hw/ppc/prep_systemio.c b/hw/ppc/prep_systemio.c
index 5a56f155f5..c96cefb13d 100644
--- a/hw/ppc/prep_systemio.c
+++ b/hw/ppc/prep_systemio.c
@@ -39,7 +39,7 @@
  #define TYPE_PREP_SYSTEMIO "prep-systemio"
  OBJECT_DECLARE_SIMPLE_TYPE(PrepSystemIoState, PREP_SYSTEMIO)
  
-/* Bit as defined in PowerPC Reference Plaform v1.1, sect. 6.1.5, p. 132 */

+/* Bit as defined in PowerPC Reference Platform v1.1, sect. 6.1.5, p. 132 */
  #define PREP_BIT(n) (1 << (7 - (n)))
  
  struct PrepSystemIoState {

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f7cc6a890f..b25b568cee 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2573,7 +2573,7 @@ static void spapr_set_vsmt_mode(SpaprMachineState *spapr, 
Error **errp)
  return;
  }
  
-/* Detemine the VSMT mode to use: */

+/* Determine the VSMT mode to use: */
  if (vsmt_user) {
  if (spapr->vsmt < smp_threads) {
  error_setg(errp, "Cannot support VSMT mode %d"
@@ -3109,7 +3109,7 @@ static int spapr_kvm_type(MachineState *machine, const 
char *vm_type)
  {
  /*
   * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
- * accomodate the 'HV' and 'PV' formats that exists in the
+ * accommodate the 'HV' and 'PV' formats that exists in the
   * wild. The 'auto' mode is being introduced already as
   * lower-case, thus we don't need to bother checking for
   * "AUTO".
@@ -4343,7 +4343,7 @@ spapr_cpu_index_to_props(MachineState *machine, unsigned 
cpu_index)
  CPUArchId *core_slot;
  MachineClass *mc = MACHINE_GET_CLASS(machine);
  
-/* make sure possible_cpu are intialized */

+/* make sure possible_cpu are initialized */
  mc->possible_cpu_arch_ids(machine);
  /* get CPU core slot containing thread that matches cpu_index */
  core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
@@ -5045,7 +5045,7 @@ static void spapr_machine_2_12_class_options(MachineClass 
*mc)
  
  /* We depend on kvm_enabled() to choose a default value for the

   * hpt-max-page-size capability. Of course we can't do it here
- * because this is too early and the HW accelerator isn't initialzed
+ * because this is too early and the HW accelerator isn't initialized
   * yet. Postpone this to machine init (see default_caps_with_cpu()).
   */
  smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index b7dc388f2f..522a2396c7 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1615,7 +1615,7 @@ static void hypercall_register_types(void)
  spapr_register_hypercall(H_GET_CPU_CHARACTERISTICS,
   h_get_cpu_characteristics);
  
-/* "debugger" hcalls (also used by SLOF). Note: We do -not- differenciate

+/* "debugger" hcalls (also used by SLOF). Note: We do -not- differentiate
   * here between the "CI" and the "CACHE" variants, they will use whatever
   * mapping attributes qemu is using. When using KVM, the kernel will
   * enforce the attributes more strongly
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index a8688243a

Re: [PATCH v9 00/20] riscv: 'max' CPU, detect user choice in TCG

2023-09-11 Thread Daniel Henrique Barboza




On 9/10/23 23:34, Alistair Francis wrote:

On Sat, Sep 2, 2023 at 5:48 AM Daniel Henrique Barboza
 wrote:


Hi,

This new version contains suggestions made by Andrew Jones in v8.

Most notable change is the removal of the opensbi.py test in patch 11,
which was replaced by a TuxBoot test. It's more suitable to test the
integrity of all the extensions enabled by the 'max' CPU.

The series is available in this branch:

https://gitlab.com/danielhb/qemu/-/tree/max_cpu_user_choice_v9

Patches missing acks: 11, 15

Changes from v8:
- patch 7:
   - add g_assert(array) at the start of riscv_cpu_add_qdev_prop_array()
- patch 8:
   - add g_assert(array) at the start of riscv_cpu_add_kvm_unavail_prop_array()
- patch 11:
   - removed both opensbi.py tests
   - added 2 'max' cpu tuxboot tests in tuxrun_baselines.py
- patch 12:
   - fixed typos in deprecated.rst
- patch 15:
   - use g_assert_not_reached() at the end of cpu_cfg_ext_get_min_version()
- patch 19:
   - added comment on top of riscv_cpu_add_misa_properties() explaining why
 we're not implementing user choice support for MISA properties
- patch 20:
   - warn_report() is now called after the G error conditions
- v8 link: 
https://lore.kernel.org/qemu-riscv/20230824221440.484675-1-dbarb...@ventanamicro.com/



Daniel Henrique Barboza (20):
   target/riscv/cpu.c: split CPU options from riscv_cpu_extensions[]
   target/riscv/cpu.c: skip 'bool' check when filtering KVM props
   target/riscv/cpu.c: split kvm prop handling to its own helper
   target/riscv: add DEFINE_PROP_END_OF_LIST() to riscv_cpu_options[]
   target/riscv/cpu.c: split non-ratified exts from
 riscv_cpu_extensions[]
   target/riscv/cpu.c: split vendor exts from riscv_cpu_extensions[]
   target/riscv/cpu.c: add riscv_cpu_add_qdev_prop_array()
   target/riscv/cpu.c: add riscv_cpu_add_kvm_unavail_prop_array()
   target/riscv/cpu.c: limit cfg->vext_spec log message
   target/riscv: add 'max' CPU type
   avocado, risc-v: add tuxboot tests for 'max' CPU
   target/riscv: deprecate the 'any' CPU type
   target/riscv/cpu.c: use offset in isa_ext_is_enabled/update_enabled
   target/riscv: make CPUCFG() macro public
   target/riscv/cpu.c: introduce cpu_cfg_ext_auto_update()
   target/riscv/cpu.c: use cpu_cfg_ext_auto_update() during realize()
   target/riscv/cpu.c: introduce RISCVCPUMultiExtConfig
   target/riscv: use isa_ext_update_enabled() in
 init_max_cpu_extensions()
   target/riscv/cpu.c: honor user choice in cpu_cfg_ext_auto_update()
   target/riscv/cpu.c: consider user option with RVG


This series has some build issues. I was hoping a few simple #ifdef
changes would fix it, but it's a little more complex than that
unfortunately.

I'm going to drop this series, do you mind sending a new version which
fixes this and any other build failures:
https://gitlab.com/qemu-project/qemu/-/jobs/5045998521


Sure. I'll wait for the pending PR to be merged first. I'll also take the
opportunity  to make the CONFIG_KVM change that Phil requested.


Thanks,

Daniel



Alistair



  docs/about/deprecated.rst |  12 +
  target/riscv/cpu-qom.h|   1 +
  target/riscv/cpu.c| 564 +-
  target/riscv/cpu.h|   2 +
  target/riscv/kvm.c|   8 +-
  tests/avocado/tuxrun_baselines.py |  32 ++
  6 files changed, 450 insertions(+), 169 deletions(-)

--
2.41.0






Re: [PATCH v3] hw/cxl: Fix CFMW config memory leak

2023-09-11 Thread Jonathan Cameron via
On Fri, 8 Sep 2023 20:02:41 +0300
Michael Tokarev  wrote:

> 08.08.2023 17:44, Jonathan Cameron:
> 
> > Sorry, I'm running a bit behind.  Have this one a few other fixes
> > still queued up - I didn't consider any of them particularly critical
> > for the release so wasn't rushing.
> > 
> > I'll aim to get them out as a series soon though so they are
> > available for start of next cycle if not for slipping in before
> > the release.  
> 
> A month later, friendly ping? :)
> (this is sent to -stable too, fwiw).

https://lore.kernel.org/qemu-devel/20230904132806.6094-2-jonathan.came...@huawei.com/

Posted a series with this in last week.  Hopefully Michael will queue this
in his next pull.

Jonathan
> 
> /mjt
> 
> 




[RESEND LINUX KERNEL PATCH 1/1] virtgpu: init vq during resume and notify qemu guest status

2023-09-11 Thread Jiqian Chen
This patch solves two problems:

First, when we suspended guest VM, it called into Qemu to call
virtio_reset->__virtio_queue_reset, this cleared all virtqueue
information of virtgpu on Qemu end. As a result, after guest
resumed, the guest sent ctrl/cursor requests to Qemu through
virtqueue, but Qemu can't get requests from the virtqueue now.
In function virtio_queue_notify, vq->vring.desc is NULL.

So, this patch adds freeze and restore functions for the virtgpu driver.
In freeze function, it flushes all virtqueue works and deletes
virtqueues. In restore function, it initializes virtqueues. And
then, Qemu and guest can communicate normally.

Second, when we suspended guest VM, it called into Qemu to call
virtio_reset->virtio_gpu_gl_reset, this destroyed resources and
reset renderer which were used for display. As a result, after
guest resumed, the display can't come back and we only saw a black
screen.

So, this patch adds a new ctrl message VIRTIO_GPU_CMD_SET_FREEZE_MODE.
When the guest is suspending, we set the freeze mode to freeze_S3 to
notify Qemu that the guest has entered suspend, and then Qemu will not
destroy resources. When the guest is resuming, we set the freeze mode
to unfreeze to notify Qemu that the guest has exited suspend, and then
Qemu will keep its original behavior. As a result, the display can come
back and everything of guest can come back to the time when guest
was suspended.

Because this implementation needs cooperation with the host Qemu, a new
feature flag VIRTIO_GPU_F_FREEZE_S3 was added, so that guest and
host can negotiate whether freeze_S3 is supported or not.

Signed-off-by: Jiqian Chen 
---
 drivers/gpu/drm/virtio/virtgpu_debugfs.c |  1 +
 drivers/gpu/drm/virtio/virtgpu_drv.c | 39 
 drivers/gpu/drm/virtio/virtgpu_drv.h |  5 +++
 drivers/gpu/drm/virtio/virtgpu_kms.c | 36 --
 drivers/gpu/drm/virtio/virtgpu_vq.c  | 16 ++
 include/uapi/linux/virtio_gpu.h  | 19 
 6 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/virtio/virtgpu_debugfs.c 
b/drivers/gpu/drm/virtio/virtgpu_debugfs.c
index 853dd9aa397e..c84fd6d7f5f3 100644
--- a/drivers/gpu/drm/virtio/virtgpu_debugfs.c
+++ b/drivers/gpu/drm/virtio/virtgpu_debugfs.c
@@ -55,6 +55,7 @@ static int virtio_gpu_features(struct seq_file *m, void *data)
 
virtio_gpu_add_bool(m, "blob resources", vgdev->has_resource_blob);
virtio_gpu_add_bool(m, "context init", vgdev->has_context_init);
+   virtio_gpu_add_bool(m, "freeze_S3", vgdev->has_freeze_S3);
virtio_gpu_add_int(m, "cap sets", vgdev->num_capsets);
virtio_gpu_add_int(m, "scanouts", vgdev->num_scanouts);
if (vgdev->host_visible_region.len) {
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c 
b/drivers/gpu/drm/virtio/virtgpu_drv.c
index 644b8ee51009..000e3e776608 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -130,6 +130,40 @@ static void virtio_gpu_config_changed(struct virtio_device 
*vdev)
schedule_work(&vgdev->config_changed_work);
 }
 
+#ifdef CONFIG_PM
+static int virtio_gpu_freeze(struct virtio_device *dev)
+{
+   struct drm_device *ddev = dev->priv;
+   struct virtio_gpu_device *vgdev = ddev->dev_private;
+   int ret = 0;
+
+   if (vgdev->has_freeze_S3) {
+   ret = virtio_gpu_cmd_set_freeze_mode(vgdev,
+   VIRTIO_GPU_FREEZE_MODE_FREEZE_S3);
+   }
+   if (!ret) {
+   flush_work(&vgdev->ctrlq.dequeue_work);
+   flush_work(&vgdev->cursorq.dequeue_work);
+   vgdev->vdev->config->del_vqs(vgdev->vdev);
+   }
+   return ret;
+}
+
+static int virtio_gpu_restore(struct virtio_device *dev)
+{
+   struct drm_device *ddev = dev->priv;
+   struct virtio_gpu_device *vgdev = ddev->dev_private;
+   int ret;
+
+   ret = virtio_gpu_init_vqs(dev);
+   if (!ret && vgdev->has_freeze_S3) {
+   ret = virtio_gpu_cmd_set_freeze_mode(vgdev,
+   VIRTIO_GPU_FREEZE_MODE_UNFREEZE);
+   }
+   return ret;
+}
+#endif
+
 static struct virtio_device_id id_table[] = {
{ VIRTIO_ID_GPU, VIRTIO_DEV_ANY_ID },
{ 0 },
@@ -148,6 +182,7 @@ static unsigned int features[] = {
VIRTIO_GPU_F_RESOURCE_UUID,
VIRTIO_GPU_F_RESOURCE_BLOB,
VIRTIO_GPU_F_CONTEXT_INIT,
+   VIRTIO_GPU_F_FREEZE_S3,
 };
 static struct virtio_driver virtio_gpu_driver = {
.feature_table = features,
@@ -156,6 +191,10 @@ static struct virtio_driver virtio_gpu_driver = {
.driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = virtio_gpu_probe,
+#ifdef CONFIG_PM
+   .freeze = virtio_gpu_freeze,
+   .restore = virtio_gpu_restore,
+#endif
.remove = virtio_gpu_remove,
.config_changed = virtio_gpu_config_changed
 };
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h 
b/drivers/gpu/drm/virtio

[RESEND LINUX KERNEL PATCH 0/1] add S3 support for virtgpu

2023-09-11 Thread Jiqian Chen
Hi all,
I hope you’ll forgive me if this disturbs you. It has been almost two
months since the latest patch was sent out and I didn't receive any reply, so I
rebased onto the latest master branch and sent it again. I am looking forward to
getting your response.

Best regards,
Jiqian Chen

v3:
Hi all,
Thanks for Gerd Hoffmann's advice. V3 makes below changes:
* Use enum for freeze mode, so this can be extended with more
  modes in the future.
* Rename functions and paratemers with "_S3" postfix.
And no functional changes.

V4(latest version) of Qemu patch:
https://lore.kernel.org/qemu-devel/20230720120816.8751-1-jiqian.c...@amd.com/


v2:
Hi all,
Thanks to Marc-André Lureau, Robert Beckett and Gerd Hoffmann for their advice
and guidance. V2 makes below changes:
* Change VIRTIO_CPU_CMD_STATUS_FREEZING to 0x0400 (<0x1000)
* Add a new feature flag VIRTIO_GPU_F_FREEZING, so that guest and
  host can negotiate whether freezing is supported or not.

Link:
https://lore.kernel.org/lkml/20230630073448.842767-1-jiqian.c...@amd.com/T/#t
V2 of Qemu patch:
https://lore.kernel.org/qemu-devel/20230630070016.841459-1-jiqian.c...@amd.com/T/#t


v1:
Hi all,
I am working to implement virtgpu S3 function on Xen.

Currently on Xen, if we start a guest who enables virtgpu, and then run "echo 
mem >
/sys/power/state" to suspend guest. And run "sudo xl trigger  
s3resume"
to resume guest. We can find that the guest kernel comes back, but the display
doesn't. It just shows a black screen.

In response to the above phenomenon, I have found two problems.

First, if we move mouse on the black screen, guest kernel still sends a cursor
request to Qemu, but Qemu doesn't respond. Because when guest is suspending, it
calls device_suspend, and then call into Qemu to call
virtio_reset->__virtio_queue_reset. In __virtio_queue_reset, it clears all 
virtqueue
information on Qemu end. So, after guest resumes, Qemu can't get message from
virtqueue.

Second, the reason why display can't come back is that when guest is 
suspending, it
calls into Qemu to call virtio_reset->virtio_gpu_gl_reset. In 
virtio_gpu_gl_reset,
it destroys all resources and resets renderer, which are used for display. So 
after
guest resumes, the display can't come back to the status when guest is 
suspended.

This patch initializes the virtqueue when the guest is resuming to solve the first problem. 
And
it notifies Qemu that guest is suspending to prevent Qemu destroying resources, 
this
is to solve second problem. And then, I can bring the display back, and 
everything
continues their actions after guest resumes.

Link:
https://lore.kernel.org/lkml/20230608063857.1677973-1-jiqian.c...@amd.com/
V1 of Qemu patch:
https://lore.kernel.org/qemu-devel/20230608025655.1674357-2-jiqian.c...@amd.com/


Jiqian Chen (1):
  virtgpu: init vq during resume and notify qemu guest status

 drivers/gpu/drm/virtio/virtgpu_debugfs.c |  1 +
 drivers/gpu/drm/virtio/virtgpu_drv.c | 39 
 drivers/gpu/drm/virtio/virtgpu_drv.h |  5 +++
 drivers/gpu/drm/virtio/virtgpu_kms.c | 36 --
 drivers/gpu/drm/virtio/virtgpu_vq.c  | 16 ++
 include/uapi/linux/virtio_gpu.h  | 19 
 6 files changed, 107 insertions(+), 9 deletions(-)

-- 
2.34.1




Re: [PATCH] meson: Fix targetos match for illumos and Solaris.

2023-09-11 Thread Philippe Mathieu-Daudé

On 8/9/23 19:45, Jonathan Perkin wrote:
qemu 8.1.0 breaks on illumos platforms due to _XOPEN_SOURCE and others 
no longer being set correctly, leading to breakage such as:


   
https://us-central.manta.mnx.io/pkgsrc/public/reports/trunk/tools/20230908.1404/qemu-8.1.0/build.log


Paolo if you don't mind please replace this link
(which is likely going to disappear) by:

In file included from include/qemu/osdep.h:151,
 from ../qom/container.c:13:
include/sysemu/os-posix.h: In function 'qemu_flockfile':
include/sysemu/os-posix.h:88:5: warning: implicit declaration of 
function 'flockfile' [-Wimplicit-function-declaration]

   88 | flockfile(f);
  | ^
include/sysemu/os-posix.h:88:5: warning: nested extern declaration of 
'flockfile' [-Wnested-externs]


../util/compatfd.c: In function 'sigwait_compat':
../util/compatfd.c:36:15: error: too many arguments to function 'sigwait'
   36 | err = sigwait(&info->mask, &sig);
  |   ^~~
In file included from include/qemu/osdep.h:117,
 from ../util/compatfd.c:16:
/usr/include/signal.h:165:12: note: declared here
  165 | extern int sigwait(sigset_t *);
  |^~~



This is a result of meson conversion which incorrectly matches against 
'solaris' instead of 'sunos' for uname.


First time submitting a patch here, hope I did it correctly.  Thanks.

Signed-off-by: Jonathan Perkin 
---
  meson.build | 4 ++--
  net/meson.build | 2 +-
  2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 0e31bdfabf..5150a74831 100644
--- a/meson.build
+++ b/meson.build
@@ -226,7 +226,7 @@ if targetos == 'darwin'
    if compiler.get_id() == 'gcc'
  qemu_common_flags += '-DOS_OBJECT_USE_OBJC=0'
    endif
-elif targetos == 'solaris'
+elif targetos == 'sunos'


Commit a988b4c561 from Oct 2022... I'm surprised nobody tried
to build QEMU on illumos for almost 1 year...


    # needed for CMSG_ macros in sys/socket.h
    qemu_common_flags += '-D_XOPEN_SOURCE=600'
    # needed for TIOCWIN* defines in termios.h
@@ -2048,7 +2048,7 @@ have_slirp_smbd = get_option('slirp_smbd') \
  if have_slirp_smbd
    smbd_path = get_option('smbd')
    if smbd_path == ''
-    smbd_path = (targetos == 'solaris' ? '/usr/sfw/sbin/smbd' : 
'/usr/sbin/smbd')
+    smbd_path = (targetos == 'sunos' ? '/usr/sfw/sbin/smbd' : 
'/usr/sbin/smbd')

    endif
    config_host_data.set_quoted('CONFIG_SMBD_COMMAND', smbd_path)
  endif
diff --git a/net/meson.build b/net/meson.build
index d2d70634e5..51caa42c9d 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -47,7 +47,7 @@ elif targetos == 'linux'
    system_ss.add(files('tap.c', 'tap-linux.c'))
  elif targetos in bsd_oses
    system_ss.add(files('tap.c', 'tap-bsd.c'))
-elif targetos == 'solaris'
+elif targetos == 'sunos'
    system_ss.add(files('tap.c', 'tap-solaris.c'))
  else
    system_ss.add(files('tap.c', 'tap-stub.c'))





Re: [PATCH 3/4] hw/cpu: Introduce CPUClass::cpu_resolving_type field

2023-09-11 Thread Philippe Mathieu-Daudé

On 11/9/23 01:28, Gavin Shan wrote:

Hi Philippe,

On 9/8/23 21:22, Philippe Mathieu-Daudé wrote:

Add a field to return the QOM type name of a CPU class.

Signed-off-by: Philippe Mathieu-Daudé 
---
  include/hw/core/cpu.h   | 2 ++
  hw/core/cpu-common.c    | 2 +-
  target/alpha/cpu.c  | 1 +
  target/arm/cpu.c    | 1 +
  target/avr/cpu.c    | 1 +
  target/cris/cpu.c   | 1 +
  target/hexagon/cpu.c    | 1 +
  target/hppa/cpu.c   | 1 +
  target/i386/cpu.c   | 1 +
  target/loongarch/cpu.c  | 1 +
  target/m68k/cpu.c   | 1 +
  target/microblaze/cpu.c | 1 +
  target/mips/cpu.c   | 1 +
  target/nios2/cpu.c  | 1 +
  target/openrisc/cpu.c   | 1 +
  target/ppc/cpu_init.c   | 1 +
  target/riscv/cpu.c  | 1 +
  target/rx/cpu.c | 1 +
  target/s390x/cpu.c  | 1 +
  target/sh4/cpu.c    | 1 +
  target/sparc/cpu.c  | 1 +
  target/tricore/cpu.c    | 1 +
  target/xtensa/cpu.c | 1 +
  23 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 129d179937..e469efd409 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -100,6 +100,7 @@ struct SysemuCPUOps;
  /**
   * CPUClass:
+ * @cpu_resolving_type: CPU QOM type name
   * @class_by_name: Callback to map -cpu command line model name to an
   * instantiatable CPU type.
   * @parse_features: Callback to parse command line arguments.
@@ -148,6 +149,7 @@ struct CPUClass {
  DeviceClass parent_class;
  /*< public >*/
+    const char *cpu_resolving_type;
  ObjectClass *(*class_by_name)(const char *cpu_model);
  void (*parse_features)(const char *typename, char *str, Error 
**errp);


The question is why not use CPU_RESOLVING_TYPE directly? It seems 
CPU_RESOLVING_TYPE

is exactly what you want here.


Unfortunately CPU_RESOLVING_TYPE is target-specific, we want
hw/core/cpu-common.c to be target-agnostic (build once for all
targets). This is particularly important in the context of
heterogeneous QEMU, where a single binary will be able to create
CPUs from different targets.

Besides, I guess the changes can be 
squeezed into two

patches (commits) as below:

PATCH[1] target/alpha: Tidy up alpha_cpu_class_by_name()
PATCH[2] Move the checks (oc == NULL || object_class_is_abstract() || 
!object_class_dynamic_cast())
  from individual targets to 
hw/core/cpu-common.c::cpu_class_by_name()


I rebase my series of '[PATCH v3 00/32] Unified CPU type check' on top 
of yours. Please
let me know if I need to include your patch into my v4 series for 
review. In that case,

I can include your patches with above changes applied.

Thanks,
Gavin


diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
index c6a0c9390c..2d24261a6a 100644
--- a/hw/core/cpu-common.c
+++ b/hw/core/cpu-common.c
@@ -155,7 +155,7 @@ ObjectClass *cpu_class_by_name(const char 
*typename, const char *cpu_model)

  assert(cpu_model);
  oc = object_class_by_name(typename);
  cc = CPU_CLASS(oc);
-    assert(cc->class_by_name);
+    assert(cc->cpu_resolving_type && cc->class_by_name);
  oc = cc->class_by_name(cpu_model);
  if (oc == NULL || object_class_is_abstract(oc)) {
  return NULL;





[PATCH v2 00/21] Graph locking part 4 (node management)

2023-09-11 Thread Kevin Wolf
The previous parts of the graph locking changes focussed mostly on the
BlockDriver side and taking reader locks while performing I/O. This
series focusses more on the functions managing the graph structure, i.e
adding, removing and replacing nodes and updating their permissions.

Many of these places actually need to take the writer lock to avoid
readers seeing an inconsistent half-updated graph state. Therefore
taking the writer lock is now moved from the very low-level function
bdrv_replace_child_noperm() into its more high level callers.

v2:
- Patch 5: Improved comments, added one for bdrv_schedule_unref()

Kevin Wolf (21):
  block: Remove unused BlockReopenQueueEntry.perms_checked
  preallocate: Factor out preallocate_truncate_to_real_size()
  preallocate: Don't poll during permission updates
  block: Take AioContext lock for bdrv_append() more consistently
  block: Introduce bdrv_schedule_unref()
  block-coroutine-wrapper: Add no_co_wrapper_bdrv_wrlock functions
  block-coroutine-wrapper: Allow arbitrary parameter names
  block: Mark bdrv_replace_child_noperm() GRAPH_WRLOCK
  block: Mark bdrv_replace_child_tran() GRAPH_WRLOCK
  block: Mark bdrv_attach_child_common() GRAPH_WRLOCK
  block: Call transaction callbacks with lock held
  block: Mark bdrv_attach_child() GRAPH_WRLOCK
  block: Mark bdrv_parent_perms_conflict() and callers GRAPH_RDLOCK
  block: Mark bdrv_get_cumulative_perm() and callers GRAPH_RDLOCK
  block: Mark bdrv_child_perm() GRAPH_RDLOCK
  block: Mark bdrv_parent_cb_change_media() GRAPH_RDLOCK
  block: Take graph rdlock in bdrv_drop_intermediate()
  block: Take graph rdlock in bdrv_change_aio_context()
  block: Mark bdrv_root_unref_child() GRAPH_WRLOCK
  block: Mark bdrv_unref_child() GRAPH_WRLOCK
  block: Mark bdrv_add/del_child() and caller GRAPH_WRLOCK

 include/block/block-common.h|   4 +
 include/block/block-global-state.h  |  30 +-
 include/block/block_int-common.h|  34 +-
 include/block/block_int-global-state.h  |  14 +-
 include/sysemu/block-backend-global-state.h |   4 +-
 block.c | 348 ++--
 block/blklogwrites.c|   4 +
 block/blkverify.c   |   2 +
 block/block-backend.c   |  29 +-
 block/copy-before-write.c   |  10 +-
 block/crypto.c  |   6 +-
 block/graph-lock.c  |  26 +-
 block/mirror.c  |   8 +
 block/preallocate.c | 133 +---
 block/qcow2.c   |   4 +-
 block/quorum.c  |  23 +-
 block/replication.c |   9 +
 block/snapshot.c|   2 +
 block/stream.c  |  20 +-
 block/vmdk.c|  13 +
 blockdev.c  |  23 +-
 blockjob.c  |   2 +
 tests/unit/test-bdrv-drain.c|  23 +-
 tests/unit/test-bdrv-graph-mod.c|  20 ++
 tests/unit/test-block-iothread.c|   3 +
 scripts/block-coroutine-wrapper.py  |  18 +-
 tests/qemu-iotests/051.pc.out   |   6 +-
 27 files changed, 591 insertions(+), 227 deletions(-)

-- 
2.41.0




[PATCH v2 01/21] block: Remove unused BlockReopenQueueEntry.perms_checked

2023-09-11 Thread Kevin Wolf
This field has been unused since commit 72373e40fbc ('block:
bdrv_reopen_multiple: refresh permissions on updated graph').
Remove it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block.c b/block.c
index 8da89aaa62..9029ddd9ff 100644
--- a/block.c
+++ b/block.c
@@ -2115,7 +2115,6 @@ static int bdrv_fill_options(QDict **options, const char 
*filename,
 
 typedef struct BlockReopenQueueEntry {
  bool prepared;
- bool perms_checked;
  BDRVReopenState state;
  QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
 } BlockReopenQueueEntry;
-- 
2.41.0




[PATCH v2 20/21] block: Mark bdrv_unref_child() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_unref_child(). These callers will typically
already hold the graph lock once the locking work is completed, which
means that they can't call functions that take it internally.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h |  7 ++-
 block.c| 11 +++
 block/blklogwrites.c   |  4 
 block/blkverify.c  |  2 ++
 block/qcow2.c  |  4 +++-
 block/quorum.c |  6 ++
 block/replication.c|  3 +++
 block/snapshot.c   |  2 ++
 block/vmdk.c   | 11 +++
 tests/unit/test-bdrv-drain.c   |  8 ++--
 10 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index eb12a35439..0f6df8f1a2 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -225,7 +225,12 @@ void bdrv_ref(BlockDriverState *bs);
 void no_coroutine_fn bdrv_unref(BlockDriverState *bs);
 void coroutine_fn no_co_wrapper bdrv_co_unref(BlockDriverState *bs);
 void GRAPH_WRLOCK bdrv_schedule_unref(BlockDriverState *bs);
-void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
+
+void GRAPH_WRLOCK
+bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
+
+void coroutine_fn no_co_wrapper_bdrv_wrlock
+bdrv_co_unref_child(BlockDriverState *parent, BdrvChild *child);
 
 BdrvChild * GRAPH_WRLOCK
 bdrv_attach_child(BlockDriverState *parent_bs,
diff --git a/block.c b/block.c
index 9ea8333a28..e7f349b25c 100644
--- a/block.c
+++ b/block.c
@@ -1701,7 +1701,9 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, 
const char *node_name,
 open_failed:
 bs->drv = NULL;
 if (bs->file != NULL) {
+bdrv_graph_wrlock(NULL);
 bdrv_unref_child(bs, bs->file);
+bdrv_graph_wrunlock();
 assert(!bs->file);
 }
 g_free(bs->opaque);
@@ -3331,8 +,9 @@ static void bdrv_set_inherits_from(BlockDriverState *bs,
  * @root that point to @root, where necessary.
  * @tran is allowed to be NULL. In this case no rollback is possible
  */
-static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
- Transaction *tran)
+static void GRAPH_WRLOCK
+bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
+ Transaction *tran)
 {
 BdrvChild *c;
 
@@ -3364,10 +3367,8 @@ void bdrv_unref_child(BlockDriverState *parent, 
BdrvChild *child)
 return;
 }
 
-bdrv_graph_wrlock(NULL);
 bdrv_unset_inherits_from(parent, child, NULL);
 bdrv_root_unref_child(child);
-bdrv_graph_wrunlock();
 }
 
 
@@ -5164,9 +5165,11 @@ static void bdrv_close(BlockDriverState *bs)
 bs->drv = NULL;
 }
 
+bdrv_graph_wrlock(NULL);
 QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 bdrv_unref_child(bs, child);
 }
+bdrv_graph_wrunlock();
 
 assert(!bs->backing);
 assert(!bs->file);
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
index 3ea7141cb5..a0d70729bb 100644
--- a/block/blklogwrites.c
+++ b/block/blklogwrites.c
@@ -251,7 +251,9 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict 
*options, int flags,
 ret = 0;
 fail_log:
 if (ret < 0) {
+bdrv_graph_wrlock(NULL);
 bdrv_unref_child(bs, s->log_file);
+bdrv_graph_wrunlock();
 s->log_file = NULL;
 }
 fail:
@@ -263,8 +265,10 @@ static void blk_log_writes_close(BlockDriverState *bs)
 {
 BDRVBlkLogWritesState *s = bs->opaque;
 
+bdrv_graph_wrlock(NULL);
 bdrv_unref_child(bs, s->log_file);
 s->log_file = NULL;
+bdrv_graph_wrunlock();
 }
 
 static int64_t coroutine_fn GRAPH_RDLOCK
diff --git a/block/blkverify.c b/block/blkverify.c
index 7326461f30..dae9716a26 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -151,8 +151,10 @@ static void blkverify_close(BlockDriverState *bs)
 {
 BDRVBlkverifyState *s = bs->opaque;
 
+bdrv_graph_wrlock(NULL);
 bdrv_unref_child(bs, s->test_file);
 s->test_file = NULL;
+bdrv_graph_wrunlock();
 }
 
 static int64_t coroutine_fn GRAPH_RDLOCK
diff --git a/block/qcow2.c b/block/qcow2.c
index b48cd9ce63..071004b302 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1880,7 +1880,7 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int 
flags,
 g_free(s->image_data_file);
 if (open_data_file && has_data_file(bs)) {
 bdrv_graph_co_rdunlock();
-bdrv_unref_child(bs, s->data_file);
+bdrv_co_unref_child(bs, s->data_file);
 bdrv_graph_co_rdlock();
 s->data_file = NULL;
 }
@@ -2790,7 +2790,9 @@ static void qcow2_do_close(BlockDriverState *bs, bool 
close_data_file)
 g_free(s->image_backing_format);
 

[PATCH v2 03/21] preallocate: Don't poll during permission updates

2023-09-11 Thread Kevin Wolf
When the permission related BlockDriver callbacks are called, we are in
the middle of an operation traversing the block graph. Polling in such a
place is a very bad idea because the graph could change in unexpected
ways. In the future, callers will also hold the graph lock, which is
likely to turn polling into a deadlock.

So we need to get rid of calls to functions like bdrv_getlength() or
bdrv_truncate() there as these functions poll internally. They are
currently used so that when no parent has write/resize permissions on
the image any more, the preallocate filter drops the extra preallocated
area in the image file and gives up write/resize permissions itself.

In order to achieve this without polling in .bdrv_check_perm, don't
immediately truncate the image, but only schedule a BH to do so. The
filter keeps the write/resize permissions a bit longer now until the BH
has executed.

There is one case in which delaying doesn't work: Reopening the image
read-only. In this case, bs->file will likely be reopened read-only,
too, so keeping write permissions a bit longer on it doesn't work. But
we can already cover this case in preallocate_reopen_prepare() and not
rely on the permission updates for it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/preallocate.c | 89 +++--
 1 file changed, 69 insertions(+), 20 deletions(-)

diff --git a/block/preallocate.c b/block/preallocate.c
index 3173d80534..bfb638d8b1 100644
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -75,8 +75,14 @@ typedef struct BDRVPreallocateState {
  * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
  * BLK_PERM_WRITE permissions on file child.
  */
+
+/* Gives up the resize permission on children when parents don't need it */
+QEMUBH *drop_resize_bh;
 } BDRVPreallocateState;
 
+static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
+static void preallocate_drop_resize_bh(void *opaque);
+
 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
 static QemuOptsList runtime_opts = {
@@ -142,6 +148,7 @@ static int preallocate_open(BlockDriverState *bs, QDict 
*options, int flags,
  * For this to work, mark them invalid.
  */
 s->file_end = s->zero_start = s->data_end = -EINVAL;
+s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
 
 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 if (ret < 0) {
@@ -193,6 +200,9 @@ static void preallocate_close(BlockDriverState *bs)
 {
 BDRVPreallocateState *s = bs->opaque;
 
+qemu_bh_cancel(s->drop_resize_bh);
+qemu_bh_delete(s->drop_resize_bh);
+
 if (s->data_end >= 0) {
 preallocate_truncate_to_real_size(bs, NULL);
 }
@@ -211,6 +221,7 @@ static int preallocate_reopen_prepare(BDRVReopenState 
*reopen_state,
   BlockReopenQueue *queue, Error **errp)
 {
 PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
+int ret;
 
 if (!preallocate_absorb_opts(opts, reopen_state->options,
  reopen_state->bs->file->bs, errp)) {
@@ -218,6 +229,19 @@ static int preallocate_reopen_prepare(BDRVReopenState 
*reopen_state,
 return -EINVAL;
 }
 
+/*
+ * Drop the preallocation already here if reopening read-only. The child
+ * might also be reopened read-only and then scheduling a BH during the
+ * permission update is too late.
+ */
+if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
+ret = preallocate_drop_resize(reopen_state->bs, errp);
+if (ret < 0) {
+g_free(opts);
+return ret;
+}
+}
+
 reopen_state->opaque = opts;
 
 return 0;
@@ -475,41 +499,61 @@ preallocate_co_getlength(BlockDriverState *bs)
 return ret;
 }
 
-static int preallocate_check_perm(BlockDriverState *bs,
-  uint64_t perm, uint64_t shared, Error **errp)
+static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
 {
 BDRVPreallocateState *s = bs->opaque;
+int ret;
 
-if (s->data_end >= 0 && !can_write_resize(perm)) {
-/*
- * Lose permissions.
- * We should truncate in check_perm, as in set_perm bs->file->perm will
- * be already changed, and we should not violate it.
- */
-return preallocate_truncate_to_real_size(bs, errp);
+if (s->data_end < 0) {
+return 0;
+}
+
+/*
+ * Before switching children to be read-only, truncate them to remove
+ * the preallocation and let them have the real size.
+ */
+ret = preallocate_truncate_to_real_size(bs, errp);
+if (ret < 0) {
+return ret;
 }
 
+/*
+ * We'll drop our permissions and will allow other users to take write and
+ * resize permissions (see preallocate_child_perm). Anyone will be able to
+ * change th

[PATCH v2 21/21] block: Mark bdrv_add/del_child() and caller GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
The functions read the parents list in the generic block layer, so we
need to hold the graph lock already there. The BlockDriver
implementations actually modify the graph, so it has to be a writer
lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h |  8 +---
 include/block/block_int-common.h   |  9 +
 block/quorum.c | 23 ++-
 blockdev.c | 17 +++--
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index 0f6df8f1a2..f31660c7b1 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -276,9 +276,11 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, 
AioContext *ctx,
 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz);
 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo);
 
-void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
-Error **errp);
-void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
+void GRAPH_WRLOCK
+bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, Error 
**errp);
+
+void GRAPH_WRLOCK
+bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
 
 /**
  *
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 3feb67ec4a..2ca3758cb8 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -393,10 +393,11 @@ struct BlockDriver {
  */
 int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 
-void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
-   Error **errp);
-void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
-   Error **errp);
+void GRAPH_WRLOCK_PTR (*bdrv_add_child)(
+BlockDriverState *parent, BlockDriverState *child, Error **errp);
+
+void GRAPH_WRLOCK_PTR (*bdrv_del_child)(
+BlockDriverState *parent, BdrvChild *child, Error **errp);
 
 /**
  * Informs the block driver that a permission change is intended. The
diff --git a/block/quorum.c b/block/quorum.c
index 620a50ba2c..05220cab7f 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1066,8 +1066,8 @@ static void quorum_close(BlockDriverState *bs)
 g_free(s->children);
 }
 
-static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
- Error **errp)
+static void GRAPH_WRLOCK
+quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error 
**errp)
 {
 BDRVQuorumState *s = bs->opaque;
 BdrvChild *child;
@@ -1093,29 +1093,22 @@ static void quorum_add_child(BlockDriverState *bs, 
BlockDriverState *child_bs,
 }
 s->next_child_index++;
 
-bdrv_drained_begin(bs);
-
 /* We can safely add the child now */
 bdrv_ref(child_bs);
 
-bdrv_graph_wrlock(child_bs);
 child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds,
   BDRV_CHILD_DATA, errp);
-bdrv_graph_wrunlock();
 if (child == NULL) {
 s->next_child_index--;
-goto out;
+return;
 }
 s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
 s->children[s->num_children++] = child;
 quorum_refresh_flags(bs);
-
-out:
-bdrv_drained_end(bs);
 }
 
-static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
- Error **errp)
+static void GRAPH_WRLOCK
+quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp)
 {
 BDRVQuorumState *s = bs->opaque;
 char indexstr[INDEXSTR_LEN];
@@ -1145,18 +1138,14 @@ static void quorum_del_child(BlockDriverState *bs, 
BdrvChild *child,
 s->next_child_index--;
 }
 
-bdrv_drained_begin(bs);
-
 /* We can safely remove this child now */
 memmove(&s->children[i], &s->children[i + 1],
 (s->num_children - i - 1) * sizeof(BdrvChild *));
 s->children = g_renew(BdrvChild *, s->children, --s->num_children);
-bdrv_graph_wrlock(NULL);
+
 bdrv_unref_child(bs, child);
-bdrv_graph_wrunlock();
 
 quorum_refresh_flags(bs);
-bdrv_drained_end(bs);
 }
 
 static void quorum_gather_child_options(BlockDriverState *bs, QDict *target,
diff --git a/blockdev.c b/blockdev.c
index 372eaf198c..325b7a3bef 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3545,8 +3545,8 @@ out:
 aio_context_release(aio_context);
 }
 
-static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
-  const char *child_name)
+static BdrvChild * GRAPH_RDLOCK
+bdrv_find_child(BlockDriverState *parent_bs, const char *child_name)
 {
 BdrvChild *child;
 
@@ -3565,9 +3565,11 @@ void qmp_x_blockdev_change(const char *parent, const 
char *child,
 BlockDriverSta

[PATCH v2 09/21] block: Mark bdrv_replace_child_tran() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_replace_child_tran(). These callers will
typically already hold the graph lock once the locking work is
completed, which means that they can't call functions that take it
internally.

While a graph lock is held, polling is not allowed. Therefore draining
the necessary nodes can no longer be done in bdrv_remove_child() and
bdrv_replace_node_noperm(), but the callers must already make sure that
they are drained.

Note that the transaction callbacks still take the lock internally, so
tran_finalize() must be called without the lock held. This is because
bdrv_append() also calls bdrv_attach_child_noperm(), which currently
requires to be called unlocked. Once it changes, the transaction
callbacks can be changed, too.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block.c | 78 -
 1 file changed, 50 insertions(+), 28 deletions(-)

diff --git a/block.c b/block.c
index 61856f5c33..0973b91d98 100644
--- a/block.c
+++ b/block.c
@@ -94,7 +94,8 @@ static bool bdrv_recurse_has_child(BlockDriverState *bs,
 static void GRAPH_WRLOCK
 bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
 
-static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
+static void GRAPH_WRLOCK
+bdrv_remove_child(BdrvChild *child, Transaction *tran);
 
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue,
@@ -2427,8 +2428,9 @@ static TransactionActionDrv bdrv_replace_child_drv = {
  *
  * The function doesn't update permissions, caller is responsible for this.
  */
-static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
-Transaction *tran)
+static void GRAPH_WRLOCK
+bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
+Transaction *tran)
 {
 BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
 
@@ -2445,9 +2447,7 @@ static void bdrv_replace_child_tran(BdrvChild *child, 
BlockDriverState *new_bs,
 bdrv_ref(new_bs);
 }
 
-bdrv_graph_wrlock(new_bs);
 bdrv_replace_child_noperm(child, new_bs);
-bdrv_graph_wrunlock();
 /* old_bs reference is transparently moved from @child to @s */
 }
 
@@ -3439,8 +3439,14 @@ static int 
bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
 }
 
 if (child) {
+bdrv_drained_begin(child->bs);
+bdrv_graph_wrlock(NULL);
+
 bdrv_unset_inherits_from(parent_bs, child, tran);
 bdrv_remove_child(child, tran);
+
+bdrv_graph_wrunlock();
+bdrv_drained_end(child->bs);
 }
 
 if (!child_bs) {
@@ -5133,7 +5139,7 @@ void bdrv_close_all(void)
 assert(QTAILQ_EMPTY(&all_bdrv_states));
 }
 
-static bool should_update_child(BdrvChild *c, BlockDriverState *to)
+static bool GRAPH_RDLOCK should_update_child(BdrvChild *c, BlockDriverState 
*to)
 {
 GQueue *queue;
 GHashTable *found;
@@ -5222,45 +5228,41 @@ static TransactionActionDrv bdrv_remove_child_drv = {
 .commit = bdrv_remove_child_commit,
 };
 
-/* Function doesn't update permissions, caller is responsible for this. */
-static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
+/*
+ * Function doesn't update permissions, caller is responsible for this.
+ *
+ * @child->bs (if non-NULL) must be drained.
+ */
+static void GRAPH_WRLOCK bdrv_remove_child(BdrvChild *child, Transaction *tran)
 {
 if (!child) {
 return;
 }
 
 if (child->bs) {
-BlockDriverState *bs = child->bs;
-bdrv_drained_begin(bs);
+assert(child->quiesced_parent);
 bdrv_replace_child_tran(child, NULL, tran);
-bdrv_drained_end(bs);
 }
 
 tran_add(tran, &bdrv_remove_child_drv, child);
 }
 
-static void undrain_on_clean_cb(void *opaque)
-{
-bdrv_drained_end(opaque);
-}
-
-static TransactionActionDrv undrain_on_clean = {
-.clean = undrain_on_clean_cb,
-};
-
-static int bdrv_replace_node_noperm(BlockDriverState *from,
-BlockDriverState *to,
-bool auto_skip, Transaction *tran,
-Error **errp)
+/*
+ * Both @from and @to (if non-NULL) must be drained. @to must be kept drained
+ * until the transaction is completed.
+ */
+static int GRAPH_WRLOCK
+bdrv_replace_node_noperm(BlockDriverState *from,
+ BlockDriverState *to,
+ bool auto_skip, Transaction *tran,
+ Error **errp)
 {
 BdrvChild *c, *next;
 
 GLOBAL_STATE_CODE();
 
-bdrv_drained_begin(from);
-bdrv_drained_begin(to);
-tran_add(tran, &undrain_on_clean, from);
-tran_add(tran, &undrain_on_clean, to);
+assert(from->quiesce_counter);
+assert(to->quiesce_counter);
 
 QLI

[PATCH v2 07/21] block-coroutine-wrapper: Allow arbitrary parameter names

2023-09-11 Thread Kevin Wolf
Don't assume specific parameter names like 'bs' or 'blk' in the
generated code, but use the actual name.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 scripts/block-coroutine-wrapper.py | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/block-coroutine-wrapper.py 
b/scripts/block-coroutine-wrapper.py
index fa01c06567..685d0b4ed4 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -105,12 +105,13 @@ def __init__(self, wrapper_type: str, return_type: str, 
name: str,
 
 def gen_ctx(self, prefix: str = '') -> str:
 t = self.args[0].type
+name = self.args[0].name
 if t == 'BlockDriverState *':
-return f'bdrv_get_aio_context({prefix}bs)'
+return f'bdrv_get_aio_context({prefix}{name})'
 elif t == 'BdrvChild *':
-return f'bdrv_get_aio_context({prefix}child->bs)'
+return f'bdrv_get_aio_context({prefix}{name}->bs)'
 elif t == 'BlockBackend *':
-return f'blk_get_aio_context({prefix}blk)'
+return f'blk_get_aio_context({prefix}{name})'
 else:
 return 'qemu_get_aio_context()'
 
-- 
2.41.0




[PATCH v2 05/21] block: Introduce bdrv_schedule_unref()

2023-09-11 Thread Kevin Wolf
bdrv_unref() is called by a lot of places that need to hold the graph
lock (it naturally happens in the context of operations that change the
graph). However, bdrv_unref() takes the graph writer lock internally, so
it can't actually be called while already holding a graph lock without
causing a deadlock.

bdrv_unref() also can't just become GRAPH_WRLOCK because it drains the
node before closing it, and draining requires that the graph is
unlocked.

The solution is to defer deleting the node until we don't hold the lock
any more and draining is possible again.

Note that keeping images open for longer than necessary can create
problems, too: You can't open an image again before it is really closed
(if image locking didn't prevent it, it would cause corruption).
Reopening an image immediately happens at least during bdrv_open() and
bdrv_co_create().

In order to solve this problem, make sure to run the deferred unref in
bdrv_graph_wrunlock(), i.e. the first possible place where we can drain
again. This is also why bdrv_schedule_unref() is marked GRAPH_WRLOCK.

The output of iotest 051 is updated because the additional polling
changes the order of HMP output, resulting in a new "(qemu)" prompt in
the test output that was previously on a separate line and filtered out.

Signed-off-by: Kevin Wolf 
---
 include/block/block-global-state.h |  1 +
 block.c| 17 +
 block/graph-lock.c | 26 +++---
 tests/qemu-iotests/051.pc.out  |  6 +++---
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index f347199bff..e570799f85 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -224,6 +224,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
 void bdrv_ref(BlockDriverState *bs);
 void no_coroutine_fn bdrv_unref(BlockDriverState *bs);
 void coroutine_fn no_co_wrapper bdrv_co_unref(BlockDriverState *bs);
+void GRAPH_WRLOCK bdrv_schedule_unref(BlockDriverState *bs);
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
  BlockDriverState *child_bs,
diff --git a/block.c b/block.c
index 9029ddd9ff..c8ac7cfac4 100644
--- a/block.c
+++ b/block.c
@@ -7044,6 +7044,23 @@ void bdrv_unref(BlockDriverState *bs)
 }
 }
 
+/*
+ * Release a BlockDriverState reference while holding the graph write lock.
+ *
+ * Calling bdrv_unref() directly is forbidden while holding the graph lock
+ * because bdrv_close() both involves polling and taking the graph lock
+ * internally. bdrv_schedule_unref() instead delays decreasing the refcount and
+ * possibly closing @bs until the graph lock is released.
+ */
+void bdrv_schedule_unref(BlockDriverState *bs)
+{
+if (!bs) {
+return;
+}
+aio_bh_schedule_oneshot(qemu_get_aio_context(),
+(QEMUBHFunc *) bdrv_unref, bs);
+}
+
 struct BdrvOpBlocker {
 Error *reason;
 QLIST_ENTRY(BdrvOpBlocker) list;
diff --git a/block/graph-lock.c b/block/graph-lock.c
index f357a2c0b1..58a799065f 100644
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -163,17 +163,29 @@ void bdrv_graph_wrlock(BlockDriverState *bs)
 void bdrv_graph_wrunlock(void)
 {
 GLOBAL_STATE_CODE();
-QEMU_LOCK_GUARD(&aio_context_list_lock);
 assert(qatomic_read(&has_writer));
 
+WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
+/*
+ * No need for memory barriers, this works in pair with
+ * the slow path of rdlock() and both take the lock.
+ */
+qatomic_store_release(&has_writer, 0);
+
+/* Wake up all coroutines that are waiting to read the graph */
+qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+}
+
 /*
- * No need for memory barriers, this works in pair with
- * the slow path of rdlock() and both take the lock.
+ * Run any BHs that were scheduled during the wrlock section and that
+ * callers might expect to have finished (in particular, this is important
+ * for bdrv_schedule_unref()).
+ *
+ * Do this only after restarting coroutines so that nested event loops in
+ * BHs don't deadlock if their condition relies on the coroutine making
+ * progress.
  */
-qatomic_store_release(&has_writer, 0);
-
-/* Wake up all coroutine that are waiting to read the graph */
-qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+aio_bh_poll(qemu_get_aio_context());
 }
 
 void coroutine_fn bdrv_graph_co_rdlock(void)
diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out
index 4d4af5a486..7e10c5fa1b 100644
--- a/tests/qemu-iotests/051.pc.out
+++ b/tests/qemu-iotests/051.pc.out
@@ -169,11 +169,11 @@ QEMU_PROG: -device scsi-hd,drive=disk: Device needs 
media, but drive is empty
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-n

[PATCH v2 04/21] block: Take AioContext lock for bdrv_append() more consistently

2023-09-11 Thread Kevin Wolf
The documentation for bdrv_append() says that the caller must hold the
AioContext lock for bs_top. Change all callers to actually adhere to the
contract.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 tests/unit/test-bdrv-drain.c | 3 +++
 tests/unit/test-bdrv-graph-mod.c | 6 ++
 tests/unit/test-block-iothread.c | 3 +++
 3 files changed, 12 insertions(+)

diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index ccc453c29e..89c8fa6780 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -1359,7 +1359,10 @@ static void test_append_to_drained(void)
 g_assert_cmpint(base_s->drain_count, ==, 1);
 g_assert_cmpint(base->in_flight, ==, 0);
 
+aio_context_acquire(qemu_get_aio_context());
 bdrv_append(overlay, base, &error_abort);
+aio_context_release(qemu_get_aio_context());
+
 g_assert_cmpint(base->in_flight, ==, 0);
 g_assert_cmpint(overlay->in_flight, ==, 0);
 
diff --git a/tests/unit/test-bdrv-graph-mod.c b/tests/unit/test-bdrv-graph-mod.c
index 36eed4b464..d8503165b3 100644
--- a/tests/unit/test-bdrv-graph-mod.c
+++ b/tests/unit/test-bdrv-graph-mod.c
@@ -140,8 +140,10 @@ static void test_update_perm_tree(void)
 bdrv_attach_child(filter, bs, "child", &child_of_bds,
   BDRV_CHILD_DATA, &error_abort);
 
+aio_context_acquire(qemu_get_aio_context());
 ret = bdrv_append(filter, bs, NULL);
 g_assert_cmpint(ret, <, 0);
+aio_context_release(qemu_get_aio_context());
 
 bdrv_unref(filter);
 blk_unref(root);
@@ -205,7 +207,9 @@ static void test_should_update_child(void)
 g_assert(target->backing->bs == bs);
 bdrv_attach_child(filter, target, "target", &child_of_bds,
   BDRV_CHILD_DATA, &error_abort);
+aio_context_acquire(qemu_get_aio_context());
 bdrv_append(filter, bs, &error_abort);
+aio_context_release(qemu_get_aio_context());
 g_assert(target->backing->bs == bs);
 
 bdrv_unref(filter);
@@ -410,7 +414,9 @@ static void test_append_greedy_filter(void)
   BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
   &error_abort);
 
+aio_context_acquire(qemu_get_aio_context());
 bdrv_append(fl, base, &error_abort);
+aio_context_release(qemu_get_aio_context());
 bdrv_unref(fl);
 bdrv_unref(top);
 }
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
index d727a5fee8..9155547313 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
@@ -756,11 +756,14 @@ static void test_propagate_mirror(void)
   &error_abort);
 
 /* Start a mirror job */
+aio_context_acquire(main_ctx);
 mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
  MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
  BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
  false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
  &error_abort);
+aio_context_release(main_ctx);
+
 WITH_JOB_LOCK_GUARD() {
 job = job_get_locked("job0");
 }
-- 
2.41.0




[PATCH v2 12/21] block: Mark bdrv_attach_child() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_attach_child_common(). These callers will
typically already hold the graph lock once the locking work is
completed, which means that they can't call functions that take it
internally.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h | 14 --
 block.c|  7 +++
 block/quorum.c |  2 ++
 block/replication.c|  6 ++
 tests/unit/test-bdrv-drain.c   | 14 ++
 tests/unit/test-bdrv-graph-mod.c   | 10 ++
 6 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index e570799f85..eb12a35439 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -226,12 +226,14 @@ void no_coroutine_fn bdrv_unref(BlockDriverState *bs);
 void coroutine_fn no_co_wrapper bdrv_co_unref(BlockDriverState *bs);
 void GRAPH_WRLOCK bdrv_schedule_unref(BlockDriverState *bs);
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
-BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
- BlockDriverState *child_bs,
- const char *child_name,
- const BdrvChildClass *child_class,
- BdrvChildRole child_role,
- Error **errp);
+
+BdrvChild * GRAPH_WRLOCK
+bdrv_attach_child(BlockDriverState *parent_bs,
+  BlockDriverState *child_bs,
+  const char *child_name,
+  const BdrvChildClass *child_class,
+  BdrvChildRole child_role,
+  Error **errp);
 
 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
diff --git a/block.c b/block.c
index f06de58a3b..369023872a 100644
--- a/block.c
+++ b/block.c
@@ -3219,8 +3219,6 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 
 GLOBAL_STATE_CODE();
 
-bdrv_graph_wrlock(child_bs);
-
 child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
  child_class, child_role, tran, errp);
 if (!child) {
@@ -3235,9 +3233,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 
 out:
 tran_finalize(tran, ret);
-bdrv_graph_wrunlock();
 
-bdrv_unref(child_bs);
+bdrv_schedule_unref(child_bs);
 
 return ret < 0 ? NULL : child;
 }
@@ -3758,11 +3755,13 @@ BdrvChild *bdrv_open_child(const char *filename,
 return NULL;
 }
 
+bdrv_graph_wrlock(NULL);
 ctx = bdrv_get_aio_context(bs);
 aio_context_acquire(ctx);
 child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
   errp);
 aio_context_release(ctx);
+bdrv_graph_wrunlock();
 
 return child;
 }
diff --git a/block/quorum.c b/block/quorum.c
index f28758cf2b..def0539fda 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1094,8 +1094,10 @@ static void quorum_add_child(BlockDriverState *bs, 
BlockDriverState *child_bs,
 /* We can safely add the child now */
 bdrv_ref(child_bs);
 
+bdrv_graph_wrlock(child_bs);
 child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds,
   BDRV_CHILD_DATA, errp);
+bdrv_graph_wrunlock();
 if (child == NULL) {
 s->next_child_index--;
 goto out;
diff --git a/block/replication.c b/block/replication.c
index ea4bf1aa80..eec9819625 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -542,12 +542,15 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 
+bdrv_graph_wrlock(bs);
+
 bdrv_ref(hidden_disk->bs);
 s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
&child_of_bds, BDRV_CHILD_DATA,
&local_err);
 if (local_err) {
 error_propagate(errp, local_err);
+bdrv_graph_wrunlock();
 aio_context_release(aio_context);
 return;
 }
@@ -558,10 +561,13 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
   BDRV_CHILD_DATA, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
+bdrv_graph_wrunlock();
 aio_context_release(aio_context);
 return;
 }
 
+bdrv_graph_wrunlock();
+
 /* start backup job now */
 error_setg(&s->blocker,
"Block device is in use by internal backup job");
diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 8

[PATCH v2 18/21] block: Take graph rdlock in bdrv_change_aio_context()

2023-09-11 Thread Kevin Wolf
The function reads the parents list, so it needs to hold the graph lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
---
 block.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block.c b/block.c
index e024a6ccec..8e589bb2e4 100644
--- a/block.c
+++ b/block.c
@@ -7688,17 +7688,21 @@ static bool bdrv_change_aio_context(BlockDriverState 
*bs, AioContext *ctx,
 return true;
 }
 
+bdrv_graph_rdlock_main_loop();
 QLIST_FOREACH(c, &bs->parents, next_parent) {
 if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
+bdrv_graph_rdunlock_main_loop();
 return false;
 }
 }
 
 QLIST_FOREACH(c, &bs->children, next) {
 if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
+bdrv_graph_rdunlock_main_loop();
 return false;
 }
 }
+bdrv_graph_rdunlock_main_loop();
 
 state = g_new(BdrvStateSetAioContext, 1);
 *state = (BdrvStateSetAioContext) {
-- 
2.41.0




[PATCH v2 19/21] block: Mark bdrv_root_unref_child() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_root_unref_child(). These callers will
typically already hold the graph lock once the locking work is
completed, which means that they can't call functions that take it
internally.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-global-state.h | 2 +-
 block.c| 6 +++---
 block/block-backend.c  | 3 +++
 blockjob.c | 2 ++
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index e2304db58b..074b677838 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -202,7 +202,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
   BdrvChildRole child_role,
   uint64_t perm, uint64_t shared_perm,
   void *opaque, Error **errp);
-void bdrv_root_unref_child(BdrvChild *child);
+void GRAPH_WRLOCK bdrv_root_unref_child(BdrvChild *child);
 
 void GRAPH_RDLOCK bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t 
*perm,
uint64_t *shared_perm);
diff --git a/block.c b/block.c
index 8e589bb2e4..9ea8333a28 100644
--- a/block.c
+++ b/block.c
@@ -3268,7 +3268,6 @@ void bdrv_root_unref_child(BdrvChild *child)
 BlockDriverState *child_bs = child->bs;
 
 GLOBAL_STATE_CODE();
-bdrv_graph_wrlock(NULL);
 bdrv_replace_child_noperm(child, NULL);
 bdrv_child_free(child);
 
@@ -3288,8 +3287,7 @@ void bdrv_root_unref_child(BdrvChild *child)
 NULL);
 }
 
-bdrv_graph_wrunlock();
-bdrv_unref(child_bs);
+bdrv_schedule_unref(child_bs);
 }
 
 typedef struct BdrvSetInheritsFrom {
@@ -3366,8 +3364,10 @@ void bdrv_unref_child(BlockDriverState *parent, 
BdrvChild *child)
 return;
 }
 
+bdrv_graph_wrlock(NULL);
 bdrv_unset_inherits_from(parent, child, NULL);
 bdrv_root_unref_child(child);
+bdrv_graph_wrunlock();
 }
 
 
diff --git a/block/block-backend.c b/block/block-backend.c
index 8d0282a5d9..c2636f4351 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -915,7 +915,10 @@ void blk_remove_bs(BlockBackend *blk)
 blk_drain(blk);
 root = blk->root;
 blk->root = NULL;
+
+bdrv_graph_wrlock(NULL);
 bdrv_root_unref_child(root);
+bdrv_graph_wrunlock();
 }
 
 /*
diff --git a/blockjob.c b/blockjob.c
index 25fe8e625d..58c5d64539 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
  * one to make sure that such a concurrent access does not attempt
  * to process an already freed BdrvChild.
  */
+bdrv_graph_wrlock(NULL);
 while (job->nodes) {
 GSList *l = job->nodes;
 BdrvChild *c = l->data;
@@ -209,6 +210,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
 
 g_slist_free_1(l);
 }
+bdrv_graph_wrunlock();
 }
 
 bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
-- 
2.41.0




[PATCH v2 14/21] block: Mark bdrv_get_cumulative_perm() and callers GRAPH_RDLOCK

2023-09-11 Thread Kevin Wolf
The function reads the parents list, so it needs to hold the graph lock.

This happens to result in BlockDriver.bdrv_set_perm() to be called with
the graph lock held. For consistency, make it the same for all of the
BlockDriver callbacks for updating permissions and annotate the function
pointers with GRAPH_RDLOCK_PTR.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h   |  9 ---
 include/block/block_int-global-state.h |  4 +--
 block.c| 35 --
 blockdev.c |  6 +
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index fda9d8b5c8..f82c14fb9c 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -413,8 +413,8 @@ struct BlockDriver {
  * If both conditions are met, 0 is returned. Otherwise, -errno is returned
  * and errp is set to an error describing the conflict.
  */
-int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
-   uint64_t shared, Error **errp);
+int GRAPH_RDLOCK_PTR (*bdrv_check_perm)(BlockDriverState *bs, uint64_t 
perm,
+uint64_t shared, Error **errp);
 
 /**
  * Called to inform the driver that the set of cumulative set of used
@@ -426,7 +426,8 @@ struct BlockDriver {
  * This function is only invoked after bdrv_check_perm(), so block drivers
  * may rely on preparations made in their .bdrv_check_perm implementation.
  */
-void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t 
shared);
+void GRAPH_RDLOCK_PTR (*bdrv_set_perm)(
+BlockDriverState *bs, uint64_t perm, uint64_t shared);
 
 /*
  * Called to inform the driver that after a previous bdrv_check_perm()
@@ -436,7 +437,7 @@ struct BlockDriver {
  * This function can be called even for nodes that never saw a
  * bdrv_check_perm() call. It is a no-op then.
  */
-void (*bdrv_abort_perm_update)(BlockDriverState *bs);
+void GRAPH_RDLOCK_PTR (*bdrv_abort_perm_update)(BlockDriverState *bs);
 
 /**
  * Returns in @nperm and @nshared the permissions that the driver for @bs
diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index bebcc08bce..e2304db58b 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -204,8 +204,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
   void *opaque, Error **errp);
 void bdrv_root_unref_child(BdrvChild *child);
 
-void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
-  uint64_t *shared_perm);
+void GRAPH_RDLOCK bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t 
*perm,
+   uint64_t *shared_perm);
 
 /**
  * Sets a BdrvChild's permissions.  Avoid if the parent is a BDS; use
diff --git a/block.c b/block.c
index 6720bc4f8a..186efda70f 100644
--- a/block.c
+++ b/block.c
@@ -2320,7 +2320,7 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t 
perm,
 tran_add(tran, &bdrv_child_set_pem_drv, s);
 }
 
-static void bdrv_drv_set_perm_commit(void *opaque)
+static void GRAPH_RDLOCK bdrv_drv_set_perm_commit(void *opaque)
 {
 BlockDriverState *bs = opaque;
 uint64_t cumulative_perms, cumulative_shared_perms;
@@ -2333,7 +2333,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
 }
 }
 
-static void bdrv_drv_set_perm_abort(void *opaque)
+static void GRAPH_RDLOCK bdrv_drv_set_perm_abort(void *opaque)
 {
 BlockDriverState *bs = opaque;
 GLOBAL_STATE_CODE();
@@ -2348,9 +2348,13 @@ TransactionActionDrv bdrv_drv_set_perm_drv = {
 .commit = bdrv_drv_set_perm_commit,
 };
 
-static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
- uint64_t shared_perm, Transaction *tran,
- Error **errp)
+/*
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
+ */
+static int GRAPH_RDLOCK
+bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm,
+  Transaction *tran, Error **errp)
 {
 GLOBAL_STATE_CODE();
 if (!bs->drv) {
@@ -2457,9 +2461,13 @@ bdrv_replace_child_tran(BdrvChild *child, 
BlockDriverState *new_bs,
 /*
  * Refresh permissions in @bs subtree. The function is intended to be called
  * after some graph modification that was done without permission update.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a reader lock for the graph.
  */
-static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
-  Transaction *tran, Error **errp)
+static int

[PATCH v2 10/21] block: Mark bdrv_attach_child_common() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_attach_child_common(). These callers will
typically already hold the graph lock once the locking work is
completed, which means that they can't call functions that take it
internally.

Note that the transaction callbacks still take the lock internally, so
tran_finalize() must be called without the lock held. This is because
bdrv_append() also calls bdrv_replace_node_noperm(), which currently
requires the transaction callbacks to be called unlocked. In the next
step, both of them can be switched to locked tran_finalize() calls
together.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block.c| 133 +++--
 block/stream.c |  20 ++--
 2 files changed, 100 insertions(+), 53 deletions(-)

diff --git a/block.c b/block.c
index 0973b91d98..f6e7cf4fb9 100644
--- a/block.c
+++ b/block.c
@@ -3004,13 +3004,14 @@ static TransactionActionDrv 
bdrv_attach_child_common_drv = {
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
  */
-static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
-   const char *child_name,
-   const BdrvChildClass *child_class,
-   BdrvChildRole child_role,
-   uint64_t perm, uint64_t shared_perm,
-   void *opaque,
-   Transaction *tran, Error **errp)
+static BdrvChild * GRAPH_WRLOCK
+bdrv_attach_child_common(BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ uint64_t perm, uint64_t shared_perm,
+ void *opaque,
+ Transaction *tran, Error **errp)
 {
 BdrvChild *new_child;
 AioContext *parent_ctx, *new_child_ctx;
@@ -3088,10 +3089,8 @@ static BdrvChild 
*bdrv_attach_child_common(BlockDriverState *child_bs,
  * a problem, we already did this), but it will still poll until the parent
  * is fully quiesced, so it will not be negatively affected either.
  */
-bdrv_graph_wrlock(child_bs);
 bdrv_parent_drained_begin_single(new_child);
 bdrv_replace_child_noperm(new_child, child_bs);
-bdrv_graph_wrunlock();
 
 BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
 *s = (BdrvAttachChildCommonState) {
@@ -3116,13 +3115,14 @@ static BdrvChild 
*bdrv_attach_child_common(BlockDriverState *child_bs,
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
  */
-static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
-   BlockDriverState *child_bs,
-   const char *child_name,
-   const BdrvChildClass *child_class,
-   BdrvChildRole child_role,
-   Transaction *tran,
-   Error **errp)
+static BdrvChild * GRAPH_WRLOCK
+bdrv_attach_child_noperm(BlockDriverState *parent_bs,
+ BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ Transaction *tran,
+ Error **errp)
 {
 uint64_t perm, shared_perm;
 
@@ -3167,6 +3167,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
 
 GLOBAL_STATE_CODE();
 
+bdrv_graph_wrlock(child_bs);
+
 child = bdrv_attach_child_common(child_bs, child_name, child_class,
child_role, perm, shared_perm, opaque,
tran, errp);
@@ -3178,6 +3180,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
 ret = bdrv_refresh_perms(child_bs, tran, errp);
 
 out:
+bdrv_graph_wrunlock();
 tran_finalize(tran, ret);
 
 bdrv_unref(child_bs);
@@ -3209,6 +3212,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 
 GLOBAL_STATE_CODE();
 
+bdrv_graph_wrlock(child_bs);
+
 child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
  child_class, child_role, tran, errp);
 if (!child) {
@@ -3222,6 +3227,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 }
 
 out:
+bdrv_graph_wrunlock();
 tran_finalize(tran, ret);
 
 bdrv_unref(child_bs);
@@ -3379,16 +3385,20 @@ s

[PATCH v2 13/21] block: Mark bdrv_parent_perms_conflict() and callers GRAPH_RDLOCK

2023-09-11 Thread Kevin Wolf
The function reads the parents list, so it needs to hold the graph lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h|  6 ++---
 include/block/block_int-global-state.h  |  8 +++---
 include/sysemu/block-backend-global-state.h |  4 +--
 block.c | 28 +
 block/block-backend.c   | 26 ++-
 block/crypto.c  |  6 +++--
 block/mirror.c  |  8 ++
 block/vmdk.c|  2 ++
 tests/unit/test-bdrv-graph-mod.c|  4 +++
 9 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 85be256c09..fda9d8b5c8 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -311,7 +311,7 @@ struct BlockDriver {
  */
 void (*bdrv_cancel_in_flight)(BlockDriverState *bs);
 
-int (*bdrv_inactivate)(BlockDriverState *bs);
+int GRAPH_RDLOCK_PTR (*bdrv_inactivate)(BlockDriverState *bs);
 
 int (*bdrv_snapshot_create)(BlockDriverState *bs,
 QEMUSnapshotInfo *sn_info);
@@ -944,8 +944,8 @@ struct BdrvChildClass {
  * when migration is completing) and it can start/stop requesting
  * permissions and doing I/O on it.
  */
-void (*activate)(BdrvChild *child, Error **errp);
-int (*inactivate)(BdrvChild *child);
+void GRAPH_RDLOCK_PTR (*activate)(BdrvChild *child, Error **errp);
+int GRAPH_RDLOCK_PTR (*inactivate)(BdrvChild *child);
 
 void GRAPH_WRLOCK_PTR (*attach)(BdrvChild *child);
 void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child);
diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index da5fb31089..bebcc08bce 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -212,8 +212,9 @@ void bdrv_get_cumulative_perm(BlockDriverState *bs, 
uint64_t *perm,
  * bdrv_child_refresh_perms() instead and make the parent's
  * .bdrv_child_perm() implementation return the correct values.
  */
-int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
-Error **errp);
+int GRAPH_RDLOCK
+bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
+Error **errp);
 
 /**
  * Calls bs->drv->bdrv_child_perm() and updates the child's permission
@@ -223,7 +224,8 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
  * values than before, but which will not result in the block layer
  * automatically refreshing the permissions.
  */
-int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
+int GRAPH_RDLOCK
+bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
 
 bool GRAPH_RDLOCK bdrv_recurse_can_replace(BlockDriverState *bs,
BlockDriverState *to_replace);
diff --git a/include/sysemu/block-backend-global-state.h 
b/include/sysemu/block-backend-global-state.h
index 184e667ebd..d5f675493a 100644
--- a/include/sysemu/block-backend-global-state.h
+++ b/include/sysemu/block-backend-global-state.h
@@ -61,8 +61,8 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, 
Error **errp);
 int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp);
 bool bdrv_has_blk(BlockDriverState *bs);
 bool bdrv_is_root_node(BlockDriverState *bs);
-int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
- Error **errp);
+int GRAPH_UNLOCKED blk_set_perm(BlockBackend *blk, uint64_t perm,
+uint64_t shared_perm, Error **errp);
 void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm);
 
 void blk_iostatus_enable(BlockBackend *blk);
diff --git a/block.c b/block.c
index 369023872a..6720bc4f8a 100644
--- a/block.c
+++ b/block.c
@@ -2202,7 +2202,8 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 return false;
 }
 
-static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
+static bool GRAPH_RDLOCK
+bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
 {
 BdrvChild *a, *b;
 GLOBAL_STATE_CODE();
@@ -2255,8 +2256,8 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
  * simplest way to satisfy this criteria: use only result of
  * bdrv_topological_dfs() or NULL as @list parameter.
  */
-static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
-BlockDriverState *bs)
+static GSList * GRAPH_RDLOCK
+bdrv_topological_dfs(GSList *list, GHashTable *found, BlockDriverState *bs)
 {
 BdrvChild *child;
 g_autoptr(GHashTable) local_found = NULL;
@@ -2532,8 +2533,9 @@ static int bdrv_node_refresh_perm(Bl

[PATCH v2 06/21] block-coroutine-wrapper: Add no_co_wrapper_bdrv_wrlock functions

2023-09-11 Thread Kevin Wolf
Add a new wrapper type for GRAPH_WRLOCK functions that should be called
from coroutine context.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-common.h   |  4 
 scripts/block-coroutine-wrapper.py | 11 +++
 2 files changed, 15 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index df5ffc8d09..3bbc5d9294 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -66,10 +66,14 @@
  * function. The coroutine yields after scheduling the BH and is reentered when
  * the wrapped function returns.
  *
+ * A no_co_wrapper_bdrv_wrlock function is a no_co_wrapper function that
+ * automatically takes the graph wrlock when calling the wrapped function.
+ *
  * If the first parameter of the function is a BlockDriverState, BdrvChild or
  * BlockBackend pointer, the AioContext lock for it is taken in the wrapper.
  */
 #define no_co_wrapper
+#define no_co_wrapper_bdrv_wrlock
 
 #include "block/blockjob.h"
 
diff --git a/scripts/block-coroutine-wrapper.py 
b/scripts/block-coroutine-wrapper.py
index d4a183db61..fa01c06567 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -71,10 +71,13 @@ def __init__(self, wrapper_type: str, return_type: str, 
name: str,
 self.args = [ParamDecl(arg.strip()) for arg in args.split(',')]
 self.create_only_co = 'mixed' not in variant
 self.graph_rdlock = 'bdrv_rdlock' in variant
+self.graph_wrlock = 'bdrv_wrlock' in variant
 
 self.wrapper_type = wrapper_type
 
 if wrapper_type == 'co':
+if self.graph_wrlock:
+raise ValueError(f"co function can't be wrlock: {self.name}")
 subsystem, subname = self.name.split('_', 1)
 self.target_name = f'{subsystem}_co_{subname}'
 else:
@@ -250,6 +253,12 @@ def gen_no_co_wrapper(func: FuncDecl) -> str:
 name = func.target_name
 struct_name = func.struct_name
 
+graph_lock=''
+graph_unlock=''
+if func.graph_wrlock:
+graph_lock='bdrv_graph_wrlock(NULL);'
+graph_unlock='bdrv_graph_wrunlock();'
+
 return f"""\
 /*
  * Wrappers for {name}
@@ -266,9 +275,11 @@ def gen_no_co_wrapper(func: FuncDecl) -> str:
 {struct_name} *s = opaque;
 AioContext *ctx = {func.gen_ctx('s->')};
 
+{graph_lock}
 aio_context_acquire(ctx);
 {func.get_result}{name}({ func.gen_list('s->{name}') });
 aio_context_release(ctx);
+{graph_unlock}
 
 aio_co_wake(s->co);
 }}
-- 
2.41.0




[PATCH v2 02/21] preallocate: Factor out preallocate_truncate_to_real_size()

2023-09-11 Thread Kevin Wolf
It's essentially the same code in preallocate_check_perm() and
preallocate_close(), except that the latter ignores errors.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block/preallocate.c | 48 +
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/block/preallocate.c b/block/preallocate.c
index 3d0f621003..3173d80534 100644
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -162,26 +162,39 @@ static int preallocate_open(BlockDriverState *bs, QDict 
*options, int flags,
 return 0;
 }
 
-static void preallocate_close(BlockDriverState *bs)
+static int preallocate_truncate_to_real_size(BlockDriverState *bs, Error 
**errp)
 {
-int ret;
 BDRVPreallocateState *s = bs->opaque;
-
-if (s->data_end < 0) {
-return;
-}
+int ret;
 
 if (s->file_end < 0) {
 s->file_end = bdrv_getlength(bs->file->bs);
 if (s->file_end < 0) {
-return;
+error_setg_errno(errp, -s->file_end, "Failed to get file length");
+return s->file_end;
 }
 }
 
 if (s->data_end < s->file_end) {
 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
 NULL);
-s->file_end = ret < 0 ? ret : s->data_end;
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Failed to drop preallocation");
+s->file_end = ret;
+return ret;
+}
+s->file_end = s->data_end;
+}
+
+return 0;
+}
+
+static void preallocate_close(BlockDriverState *bs)
+{
+BDRVPreallocateState *s = bs->opaque;
+
+if (s->data_end >= 0) {
+preallocate_truncate_to_real_size(bs, NULL);
 }
 }
 
@@ -473,24 +486,7 @@ static int preallocate_check_perm(BlockDriverState *bs,
  * We should truncate in check_perm, as in set_perm bs->file->perm will
  * be already changed, and we should not violate it.
  */
-if (s->file_end < 0) {
-s->file_end = bdrv_getlength(bs->file->bs);
-if (s->file_end < 0) {
-error_setg(errp, "Failed to get file length");
-return s->file_end;
-}
-}
-
-if (s->data_end < s->file_end) {
-int ret = bdrv_truncate(bs->file, s->data_end, true,
-PREALLOC_MODE_OFF, 0, NULL);
-if (ret < 0) {
-error_setg(errp, "Failed to drop preallocation");
-s->file_end = ret;
-return ret;
-}
-s->file_end = s->data_end;
-}
+return preallocate_truncate_to_real_size(bs, errp);
 }
 
 return 0;
-- 
2.41.0




[PATCH v2 15/21] block: Mark bdrv_child_perm() GRAPH_RDLOCK

2023-09-11 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_child_perm() need to hold a reader lock for the graph because
some implementations access the children list of a node.

The callers of bdrv_child_perm() conveniently already hold the lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h | 10 +-
 block.c  | 11 ++-
 block/copy-before-write.c| 10 +-
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index f82c14fb9c..3feb67ec4a 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -451,11 +451,11 @@ struct BlockDriver {
  * permissions, but those that will be needed after applying the
  * @reopen_queue.
  */
- void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
- BdrvChildRole role,
- BlockReopenQueue *reopen_queue,
- uint64_t parent_perm, uint64_t parent_shared,
- uint64_t *nperm, uint64_t *nshared);
+ void GRAPH_RDLOCK_PTR (*bdrv_child_perm)(
+BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+BlockReopenQueue *reopen_queue,
+uint64_t parent_perm, uint64_t parent_shared,
+uint64_t *nperm, uint64_t *nshared);
 
 /**
  * Register/unregister a buffer for I/O. For example, when the driver is
diff --git a/block.c b/block.c
index 186efda70f..0f7f78f8de 100644
--- a/block.c
+++ b/block.c
@@ -2228,11 +2228,12 @@ bdrv_parent_perms_conflict(BlockDriverState *bs, Error 
**errp)
 return false;
 }
 
-static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
-BdrvChild *c, BdrvChildRole role,
-BlockReopenQueue *reopen_queue,
-uint64_t parent_perm, uint64_t parent_shared,
-uint64_t *nperm, uint64_t *nshared)
+static void GRAPH_RDLOCK
+bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
+BdrvChild *c, BdrvChildRole role,
+BlockReopenQueue *reopen_queue,
+uint64_t parent_perm, uint64_t parent_shared,
+uint64_t *nperm, uint64_t *nshared)
 {
 assert(bs->drv && bs->drv->bdrv_child_perm);
 GLOBAL_STATE_CODE();
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 9a0e2b69d9..aeaff3bb82 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -341,11 +341,11 @@ static void cbw_refresh_filename(BlockDriverState *bs)
 bs->file->bs->filename);
 }
 
-static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
-   BdrvChildRole role,
-   BlockReopenQueue *reopen_queue,
-   uint64_t perm, uint64_t shared,
-   uint64_t *nperm, uint64_t *nshared)
+static void GRAPH_RDLOCK
+cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+   BlockReopenQueue *reopen_queue,
+   uint64_t perm, uint64_t shared,
+   uint64_t *nperm, uint64_t *nshared)
 {
 if (!(role & BDRV_CHILD_FILTERED)) {
 /*
-- 
2.41.0




[PATCH v2 08/21] block: Mark bdrv_replace_child_noperm() GRAPH_WRLOCK

2023-09-11 Thread Kevin Wolf
Instead of taking the writer lock internally, require callers to already
hold it when calling bdrv_replace_child_noperm(). These callers will
typically already hold the graph lock once the locking work is
completed, which means that they can't call functions that take it
internally.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block.c | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index c8ac7cfac4..61856f5c33 100644
--- a/block.c
+++ b/block.c
@@ -91,8 +91,9 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 static bool bdrv_recurse_has_child(BlockDriverState *bs,
BlockDriverState *child);
 
-static void bdrv_replace_child_noperm(BdrvChild *child,
-  BlockDriverState *new_bs);
+static void GRAPH_WRLOCK
+bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
+
 static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
 
 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
@@ -2387,6 +2388,8 @@ static void bdrv_replace_child_abort(void *opaque)
 BlockDriverState *new_bs = s->child->bs;
 
 GLOBAL_STATE_CODE();
+bdrv_graph_wrlock(s->old_bs);
+
 /* old_bs reference is transparently moved from @s to @s->child */
 if (!s->child->bs) {
 /*
@@ -2403,6 +2406,8 @@ static void bdrv_replace_child_abort(void *opaque)
 }
 assert(s->child->quiesced_parent);
 bdrv_replace_child_noperm(s->child, s->old_bs);
+
+bdrv_graph_wrunlock();
 bdrv_unref(new_bs);
 }
 
@@ -2439,7 +2444,10 @@ static void bdrv_replace_child_tran(BdrvChild *child, 
BlockDriverState *new_bs,
 if (new_bs) {
 bdrv_ref(new_bs);
 }
+
+bdrv_graph_wrlock(new_bs);
 bdrv_replace_child_noperm(child, new_bs);
+bdrv_graph_wrunlock();
 /* old_bs reference is transparently moved from @child to @s */
 }
 
@@ -2858,8 +2866,8 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission 
qapi_perm)
  * If @new_bs is non-NULL, the parent of @child must already be drained through
  * @child and the caller must hold the AioContext lock for @new_bs.
  */
-static void bdrv_replace_child_noperm(BdrvChild *child,
-  BlockDriverState *new_bs)
+static void GRAPH_WRLOCK
+bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
 {
 BlockDriverState *old_bs = child->bs;
 int new_bs_quiesce_counter;
@@ -2894,8 +2902,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
 assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
 }
 
-/* TODO Pull this up into the callers to avoid polling here */
-bdrv_graph_wrlock(new_bs);
 if (old_bs) {
 if (child->klass->detach) {
 child->klass->detach(child);
@@ -2911,7 +2917,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
 child->klass->attach(child);
 }
 }
-bdrv_graph_wrunlock();
 
 /*
  * If the parent was drained through this BdrvChild previously, but new_bs
@@ -2952,7 +2957,10 @@ static void bdrv_attach_child_common_abort(void *opaque)
 BlockDriverState *bs = s->child->bs;
 
 GLOBAL_STATE_CODE();
+
+bdrv_graph_wrlock(NULL);
 bdrv_replace_child_noperm(s->child, NULL);
+bdrv_graph_wrunlock();
 
 if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
 bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort);
@@ -3080,8 +3088,10 @@ static BdrvChild 
*bdrv_attach_child_common(BlockDriverState *child_bs,
  * a problem, we already did this), but it will still poll until the parent
  * is fully quiesced, so it will not be negatively affected either.
  */
+bdrv_graph_wrlock(child_bs);
 bdrv_parent_drained_begin_single(new_child);
 bdrv_replace_child_noperm(new_child, child_bs);
+bdrv_graph_wrunlock();
 
 BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
 *s = (BdrvAttachChildCommonState) {
@@ -3225,7 +3235,9 @@ void bdrv_root_unref_child(BdrvChild *child)
 BlockDriverState *child_bs = child->bs;
 
 GLOBAL_STATE_CODE();
+bdrv_graph_wrlock(NULL);
 bdrv_replace_child_noperm(child, NULL);
+bdrv_graph_wrunlock();
 bdrv_child_free(child);
 
 if (child_bs) {
-- 
2.41.0




RE: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Duan, Zhenzhong


>-Original Message-
>From: Joao Martins 
>Sent: Monday, September 11, 2023 5:07 PM
>Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>
>On 11/09/2023 09:57, Duan, Zhenzhong wrote:
>>> -Original Message-
>>> From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org >> devel-bounces+zhenzhong.duan=intel@nongnu.org> On Behalf Of Joao
>>> Martins
>>> Sent: Friday, September 8, 2023 5:30 PM
>>> Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>>
>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>> and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
>>> QEMU includes in the 64-bit range the RAM regions at the lower part
>>> and vfio-pci device RAM regions which are at the top of the address
>>> space. This range contains a large gap and the size can be bigger than
>>> the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).
>>>
>>> To avoid such large ranges, introduce a new PCI range covering the
>>> vfio-pci device RAM regions, this only if the addresses are above 4GB
>>> to avoid breaking potential SeaBIOS guests.
>>>
>>> Signed-off-by: Joao Martins 
>>> [ clg: - wrote commit log
>>>   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
>>> Signed-off-by: Cédric Le Goater 
>>> ---
>>> v2:
>>> * s/minpci/minpci64/
>>> * s/maxpci/maxpci64/
>>> * Expand comment to cover the pci-hole64 and why we don't do special
>>>  handling of pci-hole32.
>>> ---
>>> hw/vfio/common.c | 71 +---
>>> hw/vfio/trace-events |  2 +-
>>> 2 files changed, 61 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 237101d03844..134649226d43 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -27,6 +27,7 @@
>>>
>>> #include "hw/vfio/vfio-common.h"
>>> #include "hw/vfio/vfio.h"
>>> +#include "hw/vfio/pci.h"
>>> #include "exec/address-spaces.h"
>>> #include "exec/memory.h"
>>> #include "exec/ram_addr.h"
>>> @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
>>> hwaddr max32;
>>> hwaddr min64;
>>> hwaddr max64;
>>> +hwaddr minpci64;
>>> +hwaddr maxpci64;
>>> } VFIODirtyRanges;
>>>
>>> typedef struct VFIODirtyRangesListener {
>>> @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
>>> MemoryListener listener;
>>> } VFIODirtyRangesListener;
>>>
>>> +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
>>> + VFIOContainer *container)
>>> +{
>>> +VFIOPCIDevice *pcidev;
>>> +VFIODevice *vbasedev;
>>> +VFIOGroup *group;
>>> +Object *owner;
>>> +
>>> +owner = memory_region_owner(section->mr);
>>> +
>>> +QLIST_FOREACH(group, &container->group_list, container_next) {
>>> +QLIST_FOREACH(vbasedev, &group->device_list, next) {
>>> +if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
>>> +continue;
>>> +}
>>> +pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>> +if (OBJECT(pcidev) == owner) {
>>> +return true;
>>> +}
>>> +}
>>> +}
>>> +
>>> +return false;
>>> +}
>>
>> What about simplify it with memory_region_is_ram_device()?
>> This way vdpa device could also be included.
>>
>
>Note that the check is not interested in RAM (or any other kinda of memory like
>VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM that
>would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>cover it? If so, I am all for the simplification.

Ram device is used not only by vfio pci bars but also host notifier of vdpa and 
vhost-user.

I have another question, previously I think vfio pci bars are device states and
save/restored through VFIO migration protocol, so we don't need to dirty
tracking them. Do I understand wrong?

Thanks
Zhenzhong


[PATCH v2 11/21] block: Call transaction callbacks with lock held

2023-09-11 Thread Kevin Wolf
In previous patches, we changed some transactionable functions to be
marked as GRAPH_WRLOCK, but required that tran_finalize() is still
called without the lock. This was because all callbacks that can be in
the same transaction need to follow the same convention.

Now that we don't have conflicting requirements any more, we can switch
all of the transaction callbacks to be declared GRAPH_WRLOCK, too, and
call tran_finalize() with the lock held.

Document for each of these transactionable functions that the lock needs
to be held when completing the transaction, and make sure that all
callers down to the place where the transaction is finalised actually
have the writer lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block.c | 61 +
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/block.c b/block.c
index f6e7cf4fb9..f06de58a3b 100644
--- a/block.c
+++ b/block.c
@@ -2375,21 +2375,21 @@ typedef struct BdrvReplaceChildState {
 BlockDriverState *old_bs;
 } BdrvReplaceChildState;
 
-static void bdrv_replace_child_commit(void *opaque)
+static void GRAPH_WRLOCK bdrv_replace_child_commit(void *opaque)
 {
 BdrvReplaceChildState *s = opaque;
 GLOBAL_STATE_CODE();
 
-bdrv_unref(s->old_bs);
+bdrv_schedule_unref(s->old_bs);
 }
 
-static void bdrv_replace_child_abort(void *opaque)
+static void GRAPH_WRLOCK bdrv_replace_child_abort(void *opaque)
 {
 BdrvReplaceChildState *s = opaque;
 BlockDriverState *new_bs = s->child->bs;
 
 GLOBAL_STATE_CODE();
-bdrv_graph_wrlock(s->old_bs);
+assert_bdrv_graph_writable();
 
 /* old_bs reference is transparently moved from @s to @s->child */
 if (!s->child->bs) {
@@ -2408,7 +2408,6 @@ static void bdrv_replace_child_abort(void *opaque)
 assert(s->child->quiesced_parent);
 bdrv_replace_child_noperm(s->child, s->old_bs);
 
-bdrv_graph_wrunlock();
 bdrv_unref(new_bs);
 }
 
@@ -2426,6 +2425,9 @@ static TransactionActionDrv bdrv_replace_child_drv = {
  * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
  * kept drained until the transaction is completed.
  *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ *
  * The function doesn't update permissions, caller is responsible for this.
  */
 static void GRAPH_WRLOCK
@@ -2951,16 +2953,15 @@ typedef struct BdrvAttachChildCommonState {
 AioContext *old_child_ctx;
 } BdrvAttachChildCommonState;
 
-static void bdrv_attach_child_common_abort(void *opaque)
+static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque)
 {
 BdrvAttachChildCommonState *s = opaque;
 BlockDriverState *bs = s->child->bs;
 
 GLOBAL_STATE_CODE();
+assert_bdrv_graph_writable();
 
-bdrv_graph_wrlock(NULL);
 bdrv_replace_child_noperm(s->child, NULL);
-bdrv_graph_wrunlock();
 
 if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
 bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort);
@@ -2984,7 +2985,7 @@ static void bdrv_attach_child_common_abort(void *opaque)
 tran_commit(tran);
 }
 
-bdrv_unref(bs);
+bdrv_schedule_unref(bs);
 bdrv_child_free(s->child);
 }
 
@@ -2998,6 +2999,9 @@ static TransactionActionDrv bdrv_attach_child_common_drv 
= {
  *
  * Function doesn't update permissions, caller is responsible for this.
  *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
+ *
  * Returns new created child.
  *
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
@@ -3114,6 +3118,9 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is still correct after this.
+ *
+ * After calling this function, the transaction @tran may only be completed
+ * while holding a writer lock for the graph.
  */
 static BdrvChild * GRAPH_WRLOCK
 bdrv_attach_child_noperm(BlockDriverState *parent_bs,
@@ -3180,8 +3187,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
 ret = bdrv_refresh_perms(child_bs, tran, errp);
 
 out:
-bdrv_graph_wrunlock();
 tran_finalize(tran, ret);
+bdrv_graph_wrunlock();
 
 bdrv_unref(child_bs);
 
@@ -3227,8 +3234,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 }
 
 out:
-bdrv_graph_wrunlock();
 tran_finalize(tran, ret);
+bdrv_graph_wrunlock();
 
 bdrv_unref(child_bs);
 
@@ -3393,6 +3400,9 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState 
*bs)
  * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
  * @child_bs can move to a different AioContext in this function. Callers must
  * make sure that their AioContext locking is stil

[PATCH v2 17/21] block: Take graph rdlock in bdrv_drop_intermediate()

2023-09-11 Thread Kevin Wolf
The function reads the parents list, so it needs to hold the graph lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
---
 block.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block.c b/block.c
index 06d2a1b256..e024a6ccec 100644
--- a/block.c
+++ b/block.c
@@ -5938,9 +5938,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
 backing_file_str = base->filename;
 }
 
+bdrv_graph_rdlock_main_loop();
 QLIST_FOREACH(c, &top->parents, next_parent) {
 updated_children = g_slist_prepend(updated_children, c);
 }
+bdrv_graph_rdunlock_main_loop();
 
 /*
  * It seems correct to pass detach_subchain=true here, but it triggers
-- 
2.41.0




[PATCH v2 16/21] block: Mark bdrv_parent_cb_change_media() GRAPH_RDLOCK

2023-09-11 Thread Kevin Wolf
The function reads the parents list, so it needs to hold the graph lock.

Signed-off-by: Kevin Wolf 
Reviewed-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 block.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 0f7f78f8de..06d2a1b256 100644
--- a/block.c
+++ b/block.c
@@ -3371,7 +3371,8 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild 
*child)
 }
 
 
-static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
+static void GRAPH_RDLOCK
+bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
 {
 BdrvChild *c;
 GLOBAL_STATE_CODE();
@@ -3969,6 +3970,9 @@ bdrv_open_inherit(const char *filename, const char 
*reference, QDict *options,
 GLOBAL_STATE_CODE();
 assert(!qemu_in_coroutine());
 
+/* TODO We'll eventually have to take a writer lock in this function */
+GRAPH_RDLOCK_GUARD_MAINLOOP();
+
 if (reference) {
 bool options_non_empty = options ? qdict_size(options) : false;
 qobject_unref(options);
-- 
2.41.0




Re: [PATCH 6/7] hw/tpm: spelling fixes

2023-09-11 Thread Michael Tokarev



Stefan, can you take a quick look please?

https://patchew.org/QEMU/20230909131258.354675-1-...@tls.msk.ru/20230909131258.354675-7-...@tls.msk.ru/

Most of this is s/Familiy/Family/ in a few places, I guess it's
some copy-paste error.

09.09.2023 16:12, Michael Tokarev:

Signed-off-by: Michael Tokarev 
---
  hw/tpm/tpm_tis.h| 2 +-
  hw/tpm/tpm_tis_common.c | 2 +-
  hw/tpm/tpm_tis_i2c.c| 4 ++--
  hw/tpm/tpm_tis_isa.c| 2 +-
  hw/tpm/tpm_tis_sysbus.c | 2 +-
  5 files changed, 6 insertions(+), 6 deletions(-)


Thanks,

/mjt




[RESEND VIRTIO GPU PATCH v3 0/1] Add new feature flag VIRTIO_GPU_F_FREEZE_S3

2023-09-11 Thread Jiqian Chen
Hi all,
I hope you’ll forgive me if this disturb you. Since it has been almost two
months since the latest patch was sent out, I didn't receive any reply, so I
rebase the latest master branch and sent it again. I am looking forward to
getting your response.

Best regards,
Jiqian Chen

v3:
Hi all,
Thanks for Gerd Hoffmann's advice. V3 makes below changes:
* Use enum for freeze mode, so this can be extended with more
  modes in the future.
* Rename functions and paratemers with "_S3" postfix.
* Explain in more detail

And latest version on QEMU and Linux kernel side:
QEMU: 
https://lore.kernel.org/qemu-devel/20230720120816.8751-1-jiqian.c...@amd.com
Kernel: 
https://lore.kernel.org/lkml/20230720115805.8206-1-jiqian.c...@amd.com/T/#t


v2:
Hi all,
Thanks to Gerd Hoffmann for his suggestions. V2 makes below changes:
* Elaborate on the types of resources.
* Add some descriptions for S3 and S4.

Link:
https://lists.oasis-open.org/archives/virtio-comment/202307/msg00160.html
V2 of Qemu patch:
https://lore.kernel.org/qemu-devel/20230630070016.841459-1-jiqian.c...@amd.com/T/#t
V2 of Kernel patch:
https://lore.kernel.org/lkml/20230630073448.842767-1-jiqian.c...@amd.com/T/#t


v1:
Hi all,
I am working to implement virtgpu S3 function on Xen.

Currently on Xen, if we start a guest through Qemu with enabling virtgpu, and 
then suspend
and s3resume guest. We can find that the guest kernel comes back, but the 
display doesn't.
It just shown a black screen.

That is because when guest was during suspending, it called into Qemu and Qemu 
destroyed
all resources and reset the renderer. This made the display disappear after the guest 
resumed.

So, I add a mechanism that when guest is suspending, it will notify Qemu, and 
then Qemu will
not destroy resources. That can help guest's display come back.

As discussed and suggested by Robert Beckett and Gerd Hoffmann on v1 qemu's 
mailing list.
That mechanism needs cooperation between guest and host. What's more, as 
virtio drivers are
by design paravirt drivers, it is reasonable for guest to accept some 
cooperation with host to
manage suspend/resume. So I request to add a new feature flag, so that guest 
and host can
negotiate whether freezing is supported or not.

Link:
https://lists.oasis-open.org/archives/virtio-comment/202306/msg00595.html
V1 of Qemu patch:
https://lore.kernel.org/qemu-devel/20230608025655.1674357-2-jiqian.c...@amd.com/
V1 of Kernel patch:
https://lore.kernel.org/lkml/20230608063857.1677973-1-jiqian.c...@amd.com/

Jiqian Chen (1):
  virtio-gpu: Add new feature flag VIRTIO_GPU_F_FREEZE_S3

 device-types/gpu/description.tex | 42 
 1 file changed, 42 insertions(+)

-- 
2.34.1




[RESEND VIRTIO GPU PATCH v3 1/1] virtio-gpu: Add new feature flag VIRTIO_GPU_F_FREEZE_S3

2023-09-11 Thread Jiqian Chen
When we suspend/resume guest on Xen, the display can't come back.
This is because when guest suspended, it called into Qemu. Then
Qemu destroyed all resources which is used for display. So that
guest's display can't come back to the time when it was suspended.

To solve above problem, I added a new mechanism that when guest is
suspending, it will notify Qemu, and then Qemu will not destroy
resources which are created by using commands
VIRTIO_GPU_CMD_RESOURCE_CREATE_*.

Due to that mechanism needs cooperation between guest and host,
I need to add a new feature flag, so that guest and host can
negotiate whether freeze_S3 is supported or not.

Signed-off-by: Jiqian Chen 
---
 device-types/gpu/description.tex | 42 
 1 file changed, 42 insertions(+)

diff --git a/device-types/gpu/description.tex b/device-types/gpu/description.tex
index 4435248..1a137e7 100644
--- a/device-types/gpu/description.tex
+++ b/device-types/gpu/description.tex
@@ -37,6 +37,8 @@ \subsection{Feature bits}\label{sec:Device Types / GPU Device 
/ Feature bits}
   resources is supported.
 \item[VIRTIO_GPU_F_CONTEXT_INIT (4)] multiple context types and
   synchronization timelines supported.  Requires VIRTIO_GPU_F_VIRGL.
+\item[VIRTIO_GPU_F_FREEZE_S3 (5)] freezing virtio-gpu and keeping resources
+  alive is supported.
 \end{description}
 
 \subsection{Device configuration layout}\label{sec:Device Types / GPU Device / 
Device configuration layout}
@@ -228,6 +230,9 @@ \subsubsection{Device Operation: Request 
header}\label{sec:Device Types / GPU De
 VIRTIO_GPU_CMD_UPDATE_CURSOR = 0x0300,
 VIRTIO_GPU_CMD_MOVE_CURSOR,
 
+/* freeze mode */
+VIRTIO_GPU_CMD_SET_FREEZE_MODE = 0x0400,
+
 /* success responses */
 VIRTIO_GPU_RESP_OK_NODATA = 0x1100,
 VIRTIO_GPU_RESP_OK_DISPLAY_INFO,
@@ -838,6 +843,43 @@ \subsubsection{Device Operation: cursorq}\label{sec:Device 
Types / GPU Device /
 
 \end{description}
 
+\subsubsection{Device Operation: freeze_mode}\label{sec:Device Types / GPU 
Device / Device Operation / Device Operation: freeze_mode}
+
+\begin{lstlisting}
+typedef enum {
+  VIRTIO_GPU_FREEZE_MODE_UNFREEZE = 0,
+  VIRTIO_GPU_FREEZE_MODE_FREEZE_S3 = 3,
+} virtio_gpu_freeze_mode_t;
+
+struct virtio_gpu_set_freeze_mode {
+  struct virtio_gpu_ctrl_hdr hdr;
+  virtio_gpu_freeze_mode_t freeze_mode;
+};
+\end{lstlisting}
+
+\begin{description}
+
+\item[VIRTIO_GPU_CMD_SET_FREEZE_MODE]
+Notify freeze mode through controlq.
+Request data is \field{struct virtio_gpu_set_freeze_mode}.
+Response type is VIRTIO_GPU_RESP_OK_NODATA.
+
+This is added for S3 function in guest with virtio-gpu. When guest does
+S3, let it notify QEMU that virtio-gpu is in what freeze mode in
+\field{freeze_mode}. VIRTIO_GPU_FREEZE_MODE_FREEZE_S3 means guest is
+doing S3 and virtio-gpu will be freezed, VIRTIO_GPU_FREEZE_MODE_UNFREEZE
+means virtio-gpu can be used as usual. When virtio-gpu is freezed, QEMU
+will not destroy resources which are created by using commands
+VIRTIO_GPU_CMD_RESOURCE_CREATE_*, so that guest can use those resources
+to resume display.
+
+Note: this change is not enough to solve the problems of S4 function.
+QEMU may lose resources after hibernation. It needs more research and
+development. If S4 is supported in the future, it may need another
+feature flag here.
+
+\end{description}
+
 \subsection{VGA Compatibility}\label{sec:Device Types / GPU Device / VGA 
Compatibility}
 
 Applies to Virtio Over PCI only.  The GPU device can come with and
-- 
2.34.1




Re: [PULL 00/51] Build system, i386 changes for 2023-09-07

2023-09-11 Thread Philippe Mathieu-Daudé

On 8/9/23 17:47, Stefan Hajnoczi wrote:

I wonder how it passed CI?
https://gitlab.com/qemu-project/qemu/-/pipelines/996175923/


The conditions are:
- x86 host
- both system / user emulation enabled
- KVM disabled
- debug enabled

We have jobs with 3 of the 4, but none with
all the 4.

Is it worth test it?



Stefan

On Fri, 8 Sept 2023 at 11:02, Kevin Wolf  wrote:


Am 07.09.2023 um 17:44 hat Stefan Hajnoczi geschrieben:

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


Something in this has broken the build for me, it seems to be the
linux-user binary that doesn't link any more:

   /usr/bin/ld: libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o: in function 
`cpu_x86_cpuid':
   /home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:6180: undefined 
reference to `kvm_arch_get_supported_cpuid'
   /usr/bin/ld: libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o: in function 
`x86_cpu_filter_features':
   /home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:7158: undefined 
reference to `kvm_arch_get_supported_cpuid'
   /usr/bin/ld: /home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:7159: 
undefined reference to `kvm_arch_get_supported_cpuid'
   /usr/bin/ld: /home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:7160: 
undefined reference to `kvm_arch_get_supported_cpuid'
   /usr/bin/ld: /home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:7161: 
undefined reference to `kvm_arch_get_supported_cpuid'
   /usr/bin/ld: 
libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o:/home/kwolf/source/qemu/build-clang/../target/i386/cpu.c:7162:
 more undefined references to `kvm_arch_get_supported_cpuid' follow
   clang-15: error: linker command failed with exit code 1 (use -v to see 
invocation)

In case it makes a difference, I'm using clang on F37.

Kevin







Re: [PULL 00/51] Build system, i386 changes for 2023-09-07

2023-09-11 Thread Philippe Mathieu-Daudé

On 8/9/23 21:21, Paolo Bonzini wrote:

On Fri, Sep 8, 2023 at 7:28 PM Kevin Wolf  wrote:

Maybe the calls aren't eliminated because --enable-debug implies -O0?


My experience is that it will still fold simple dead code like "0 &&
foo()" or even "if (0) { ... }", but maybe it's a GCC vs. clang
difference. Philippe, I take it that you are looking at it?


Yes, I'll send a patch soon. Sorry I didn't catch that case.




Re: linux-user vs system build options and libraries

2023-09-11 Thread Alex Bennée


Michael Tokarev  writes:

> Hi!
>
> I noticed that linux-user binaries, when built separately (with
> --enable-linux-user --disable-system) or when built as part of
> system build (--enable-linux-user --enable-system) differ in size
> and link with significantly more libraries which they shouldn't
> link with.  libnuma, liburing, libgnutls, - that's what gets linked,
> just to name a few.  zlib is also there which most likely shouldn't
> be.

>
> Also, which might be a separate issue, --enable-plugins seems to
> be in effect for linux-user too, which result in libgmodule being
> linked with.

TCG plugins should work for both system and non-static *-user builds.

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
>> -Original Message-
>> From: Joao Martins 
>> Sent: Monday, September 11, 2023 5:07 PM
>> Subject: Re: [PATCH v1] vfio/common: Separate vfio-pci ranges
>>
>> On 11/09/2023 09:57, Duan, Zhenzhong wrote:
 -Original Message-
 From: qemu-devel-bounces+zhenzhong.duan=intel@nongnu.org >>> devel-bounces+zhenzhong.duan=intel@nongnu.org> On Behalf Of Joao
 Martins
 Sent: Friday, September 8, 2023 5:30 PM
 Subject: [PATCH v1] vfio/common: Separate vfio-pci ranges

 QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
 and 64-bit. In the OVMF case, when the dynamic MMIO window is enabled,
 QEMU includes in the 64-bit range the RAM regions at the lower part
 and vfio-pci device RAM regions which are at the top of the address
 space. This range contains a large gap and the size can be bigger than
 the dirty tracking HW limits of some devices (MLX5 has a 2^42 limit).

 To avoid such large ranges, introduce a new PCI range covering the
 vfio-pci device RAM regions, this only if the addresses are above 4GB
 to avoid breaking potential SeaBIOS guests.

 Signed-off-by: Joao Martins 
 [ clg: - wrote commit log
   - fixed overlapping 32-bit and PCI ranges when using SeaBIOS ]
 Signed-off-by: Cédric Le Goater 
 ---
 v2:
 * s/minpci/minpci64/
 * s/maxpci/maxpci64/
 * Expand comment to cover the pci-hole64 and why we don't do special
  handling of pci-hole32.
 ---
 hw/vfio/common.c | 71 +---
 hw/vfio/trace-events |  2 +-
 2 files changed, 61 insertions(+), 12 deletions(-)

 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index 237101d03844..134649226d43 100644
 --- a/hw/vfio/common.c
 +++ b/hw/vfio/common.c
 @@ -27,6 +27,7 @@

 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
 +#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
 @@ -1400,6 +1401,8 @@ typedef struct VFIODirtyRanges {
 hwaddr max32;
 hwaddr min64;
 hwaddr max64;
 +hwaddr minpci64;
 +hwaddr maxpci64;
 } VFIODirtyRanges;

 typedef struct VFIODirtyRangesListener {
 @@ -1408,6 +1411,31 @@ typedef struct VFIODirtyRangesListener {
 MemoryListener listener;
 } VFIODirtyRangesListener;

 +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
 + VFIOContainer *container)
 +{
 +VFIOPCIDevice *pcidev;
 +VFIODevice *vbasedev;
 +VFIOGroup *group;
 +Object *owner;
 +
 +owner = memory_region_owner(section->mr);
 +
 +QLIST_FOREACH(group, &container->group_list, container_next) {
 +QLIST_FOREACH(vbasedev, &group->device_list, next) {
 +if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
 +continue;
 +}
 +pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 +if (OBJECT(pcidev) == owner) {
 +return true;
 +}
 +}
 +}
 +
 +return false;
 +}
>>>
>>> What about simplify it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kinda of memory 
>> like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM 
>> that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
> 
> Ram device is used not only by vfio pci bars but also host notifier of vdpa 
> and vhost-user.
> 
> I have another question, previously I think vfio pci bars are device states 
> and
> save/restored through VFIO migration protocol, so we don't need to dirty
> tracking them. Do I understand wrong?

The general thinking of device dirty tracking is to track all addressable IOVAs.
But you do raise a good question. My understanding is that migrating the bars
*as is* might be device migration specific (not a guarantee?); the save file and
precopy interface are the only places we transfer from/to the data and it's just
opaque data, not bars or anything formatted specifically -- so if we migrate
bars it is hidden in what device f/w wants to move. Might be that BARs aren't
even needed as they are sort of scratch space from h/w side. Ultimately, the
dirty tracker is the one reporting the values, and the device h/w chooses to not
report those IOVAs as dirty then nothing changes.



Should pcie-pci-bridge use 32-bit non-prefetchable memory?

2023-09-11 Thread Marcin Juszkiewicz

I am working on aarch64/sbsa-ref machine so people can have virtual
machine to test their OS against something reminding standards compliant
system.

One of tools I use is BSA ACS (Base System Architecture - Architecture
Compliance Suite) [1] written by Arm. It runs set of tests to check does
system conforms to BSA specification.

1. https://github.com/ARM-software/bsa-acs

To run tests I use my "boot-sbsa-ref.sh" script [2] which allows me to
run exactly same setup each time.

2. https://github.com/hrw/sbsa-ref-status/blob/main/boot-sbsa-ref.sh

Since we have ITS support in whole stack (qemu, tf-a, edk2) I use
overcomplicated PCIe setup:

-device igb
-device pcie-root-port,id=root_port_for_switch1,chassis=0,slot=0
 -device x3130-upstream,id=upstream_port1,bus=root_port_for_switch1
  -device 
xio3130-downstream,id=downstream_port1,bus=upstream_port1,chassis=1,slot=0
   -device ac97,bus=downstream_port1
-device pcie-root-port,id=root_port_for_nvme1,chassis=2,slot=0
 -device nvme,serial=deadbeef,bus=root_port_for_nvme1
-device pcie-root-port,id=root_port_for_igb,chassis=3,slot=0
 -device igb,bus=root_port_for_igb
-device pcie-root-port,id=root_port_for_xhci,chassis=4,slot=0
 -device qemu-xhci,bus=root_port_for_xhci
-device pcie-root-port,id=root_port_for_rng,chassis=5,slot=0
 -device virtio-rng-pci,bus=root_port_for_rng
-device pcie-root-port,id=root_port_for_pci,chassis=6,slot=0
 -device pcie-pci-bridge,id=pci,bus=root_port_for_pci
  -device es1370,bus=pci,addr=9
  -device e1000,bus=pci,addr=10
-device pxb-pcie,id=pxb1,bus_nr=1
 -device pcie-root-port,id=root_port_for_ahci,bus=pxb1,chassis=10,slot=0
  -device ahci,bus=root_port_for_ahci

BSA ACS test 841 checks do Type-1 PCIe devices have 32-bit
non-prefetchable memory. And fails on pcie-pci-bridge:

Operating System View:
 841 : NP type-1 PCIe supp 32-bit onlySTART

   BDF - 0x400
   BDF - 0x500
   BDF - 0x600
   BDF - 0x700
   BDF - 0x800
   BDF - 0x900
   BDF - 0x1
   BDF - 0x3
   Skipping Non Type-1 headers
   BDF - 0x4
   Skipping Non Type-1 headers
   BDF - 0x5
   Skipping Non Type-1 headers
   BDF - 0x6
   Skipping Non Type-1 headers
   BDF - 0x7
   NP type-1 pcie is not 32-bit mem type
   Failed on PE -0
   PCI_MM_04
   Checkpoint --  1   : Result:  FAIL

0x7 is pcie-pci-bridge card.

I opened issue for BSA ACS [3] and asked where the problem is.

3. https://github.com/ARM-software/bsa-acs/issues/197

Got quote from BSA (Arm Base System Architecture) [4]
chapter E.2 PCI Express Memory Space:


When PCI Express memory space is mapped as normal memory, the system
must support unaligned accesses to that region.
PCI Type 1 headers, used in PCI-to-PCI bridges, and therefore in root
ports and switches, have to be programmed with the address space
resources claimed by the given bridge.
For non-prefetchable (NP) memory, Type 1 headers only support 32-bit
addresses. This implies that endpoints on the other end of a
PCI-to-PCI bridge only support 32-bit NP BARs


4. https://developer.arm.com/documentation/den0094/latest/


I looked at code and tried to switch pcie-pci-bridge to 32-bit:


diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c
index 2301b2ca0b..45199d2fa0 100644
--- a/hw/pci-bridge/pcie_pci_bridge.c
+++ b/hw/pci-bridge/pcie_pci_bridge.c
@@ -82,7 +82,7 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error 
**errp)
 }
 }
 pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
- PCI_BASE_ADDRESS_MEM_TYPE_64, &pcie_br->shpc_bar);
+ PCI_BASE_ADDRESS_MEM_TYPE_32, &pcie_br->shpc_bar);
 return;

 msi_error:



With it, test 841 passes.

The difference in "lspci -" output suggests that this region address
was 32-bit in both cases:

-   Region 0: Memory at 81c0 (64-bit, non-prefetchable) [size=256]
+   Region 0: Memory at 81c0 (32-bit, non-prefetchable) [size=256]

Any ideas how to continue from here?



Re: [PATCH v1] vfio/common: Separate vfio-pci ranges

2023-09-11 Thread Joao Martins
On 11/09/2023 10:48, Duan, Zhenzhong wrote:
 +static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
 + VFIOContainer *container)
 +{
 +VFIOPCIDevice *pcidev;
 +VFIODevice *vbasedev;
 +VFIOGroup *group;
 +Object *owner;
 +
 +owner = memory_region_owner(section->mr);
 +
 +QLIST_FOREACH(group, &container->group_list, container_next) {
 +QLIST_FOREACH(vbasedev, &group->device_list, next) {
 +if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
 +continue;
 +}
 +pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 +if (OBJECT(pcidev) == owner) {
 +return true;
 +}
 +}
 +}
 +
 +return false;
 +}
>>>
>>> What about simplify it with memory_region_is_ram_device()?
>>> This way vdpa device could also be included.
>>>
>>
>> Note that the check is not interested in RAM (or any other kinda of memory 
>> like
>> VGA). That's covered in the 32-64 ranges. But rather in any PCI device RAM 
>> that
>> would fall in the 64-bit PCI hole. Would memory_region_is_ram_device() really
>> cover it? If so, I am all for the simplification.
> 
> Ram device is used not only by vfio pci bars but also host notifier of vdpa 
> and vhost-user.

My only concern is whether this is all part of the pci-hole64 or not e.g. if we
expand to general memory_region_is_ram_device() would we go back to the initial
bug where we create an enourmous range. The latter that you mentioned should be
mainly virtio-net devices as presented to the guest (regardless of backend is
vdpa, or vhost-user) and perhaps they are all in the hole32 PCI hole?

Joao



Re: [PULL 00/51] Build system, i386 changes for 2023-09-07

2023-09-11 Thread Philippe Mathieu-Daudé

On 11/9/23 12:10, Philippe Mathieu-Daudé wrote:

On 8/9/23 17:47, Stefan Hajnoczi wrote:

I wonder how it passed CI?
https://gitlab.com/qemu-project/qemu/-/pipelines/996175923/


The conditions are:
- x86 host
- both system / user emulation enabled
- KVM disabled
- debug enabled


Oh, I forgot:

- clang compiler

(because this isn't an issue with GCC).


We have jobs with 3 of the 4, but none with
all the 4.

Is it worth test it?





[RESEND QEMU PATCH v4 1/1] virtgpu: do not destroy resources when guest suspend

2023-09-11 Thread Jiqian Chen
After suspending and resuming guest VM, you will get
a black screen, and the display can't come back.

This is because when guest did suspending, it called
into qemu to call virtio_gpu_gl_reset. In function
virtio_gpu_gl_reset, it destroyed resources and reset
renderer, which were used for display. As a result,
guest's screen can't come back to the time when it was
suspended and only showed black.

So, this patch adds a new ctrl message
VIRTIO_GPU_CMD_SET_FREEZE_MODE to get notifications from
guest. If guest is during suspending, it sets freeze mode
of virtgpu to freeze_S3, this will prevent destroying
resources and resetting renderer when guest calls into
virtio_gpu_gl_reset. If guest is during resuming, it sets
freeze mode to unfreeze, and then virtio_gpu_gl_reset
will keep its original actions and has no other impact.

Due to this implemention needs cooperation with guest,
so it added a new feature flag VIRTIO_GPU_F_FREEZE_S3, so
that guest and host can negotiate whenever freeze_S3 is
supported or not.

Signed-off-by: Jiqian Chen 
---
 hw/display/virtio-gpu-base.c   |  3 +++
 hw/display/virtio-gpu-gl.c | 10 ++-
 hw/display/virtio-gpu-virgl.c  |  7 +
 hw/display/virtio-gpu.c| 48 --
 hw/virtio/virtio-qmp.c |  3 +++
 include/hw/virtio/virtio-gpu.h |  6 +
 6 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
index ca1fb7b16f..ccc7cc80ce 100644
--- a/hw/display/virtio-gpu-base.c
+++ b/hw/display/virtio-gpu-base.c
@@ -232,6 +232,9 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t 
features,
 if (virtio_gpu_blob_enabled(g->conf)) {
 features |= (1 << VIRTIO_GPU_F_RESOURCE_BLOB);
 }
+if (virtio_gpu_freeze_S3_enabled(g->conf)) {
+features |= (1 << VIRTIO_GPU_F_FREEZE_S3);
+}
 
 return features;
 }
diff --git a/hw/display/virtio-gpu-gl.c b/hw/display/virtio-gpu-gl.c
index e06be60dfb..cb418dae9a 100644
--- a/hw/display/virtio-gpu-gl.c
+++ b/hw/display/virtio-gpu-gl.c
@@ -100,7 +100,15 @@ static void virtio_gpu_gl_reset(VirtIODevice *vdev)
  */
 if (gl->renderer_inited && !gl->renderer_reset) {
 virtio_gpu_virgl_reset_scanout(g);
-gl->renderer_reset = true;
+/*
+ * If guest is suspending, we shouldn't reset renderer,
+ * otherwise, the display can't come back to the time when
+ * it was suspended after guest was resumed.
+ */
+if (!virtio_gpu_freeze_S3_enabled(g->parent_obj.conf) ||
+g->freeze_mode == VIRTIO_GPU_FREEZE_MODE_UNFREEZE) {
+gl->renderer_reset = true;
+}
 }
 }
 
diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c
index 8bb7a2c21f..ba4258487e 100644
--- a/hw/display/virtio-gpu-virgl.c
+++ b/hw/display/virtio-gpu-virgl.c
@@ -483,6 +483,13 @@ void virtio_gpu_virgl_process_cmd(VirtIOGPU *g,
 case VIRTIO_GPU_CMD_GET_EDID:
 virtio_gpu_get_edid(g, cmd);
 break;
+case VIRTIO_GPU_CMD_SET_FREEZE_MODE:
+if (virtio_gpu_freeze_S3_enabled(g->parent_obj.conf)) {
+virtio_gpu_cmd_set_freeze_mode(g, cmd);
+} else {
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER;
+}
+break;
 default:
 cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
 break;
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index bbd5c6561a..0cfccde834 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -383,6 +383,16 @@ static void virtio_gpu_resource_create_blob(VirtIOGPU *g,
 QTAILQ_INSERT_HEAD(&g->reslist, res, next);
 }
 
+void virtio_gpu_cmd_set_freeze_mode(VirtIOGPU *g,
+ struct virtio_gpu_ctrl_command *cmd)
+{
+struct virtio_gpu_set_freeze_mode sf;
+
+VIRTIO_GPU_FILL_CMD(sf);
+virtio_gpu_bswap_32(&sf, sizeof(sf));
+g->freeze_mode = sf.freeze_mode;
+}
+
 static void virtio_gpu_disable_scanout(VirtIOGPU *g, int scanout_id)
 {
 struct virtio_gpu_scanout *scanout = &g->parent_obj.scanout[scanout_id];
@@ -1018,6 +1028,13 @@ void virtio_gpu_simple_process_cmd(VirtIOGPU *g,
 case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING:
 virtio_gpu_resource_detach_backing(g, cmd);
 break;
+case VIRTIO_GPU_CMD_SET_FREEZE_MODE:
+if (virtio_gpu_freeze_S3_enabled(g->parent_obj.conf)) {
+virtio_gpu_cmd_set_freeze_mode(g, cmd);
+} else {
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER;
+}
+break;
 default:
 cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
 break;
@@ -1394,11 +1411,28 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error 
**errp)
 QTAILQ_INIT(&g->reslist);
 QTAILQ_INIT(&g->cmdq);
 QTAILQ_INIT(&g->fenceq);
+
+g->freeze_mode = VIRTIO_GPU_FREEZE_MODE_UNFREEZE;
 }
 
 static void virtio_gpu_device_unrealize(DeviceState *qdev)
 {
 VirtIOGPU *g = VIRTIO_GPU(qdev);
+s

[RESEND QEMU PATCH v4 0/1] S3 support

2023-09-11 Thread Jiqian Chen
Hi all,
I hope you’ll forgive me if this disturbs you. Since it has been almost two
months since the latest patch was sent out and I haven't received any reply, I
have rebased it onto the latest master branch and sent it again. I am looking
forward to getting your response.

Best regards,
Jiqian Chen

v4:
Hi all,
Thanks for Gerd Hoffmann's advice. V4 makes below changes:
* Use enum for freeze mode, so this can be extended with more
  modes in the future.
* Rename functions and parameters with an "_S3" suffix.
And no functional changes.

latest version on kernel side:
https://lore.kernel.org/lkml/20230720115805.8206-1-jiqian.c...@amd.com/T/#t


v3:
Hi all,
Thanks for Michael S. Tsirkin's advice. V3 makes below changes:
* Remove changes in file include/standard-headers/linux/virtio_gpu.h
  I am not supposed to edit this file; it will be imported after
  the Linux kernel patches are merged.

Link:
https://lore.kernel.org/qemu-devel/20230719074726.1613088-1-jiqian.c...@amd.com/T/#t


v2:
Hi all,
Thanks to Marc-André Lureau, Robert Beckett and Gerd Hoffmann for their advice 
and
guidance. V2 makes below changes:

* Change VIRTIO_CPU_CMD_STATUS_FREEZING to 0x0400 (<0x1000)
* Add virtio_gpu_device_unrealize to destroy resources to solve
  potential memory leak problem. This also needs hot-plug support.
* Add a new feature flag VIRTIO_GPU_F_FREEZING, so that guest and
  host can negotiate whether freezing is supported or not.

Link:
https://lore.kernel.org/qemu-devel/20230630070016.841459-1-jiqian.c...@amd.com/T/#t


v1:
Hi all,
I am working to implement virtgpu S3 function on Xen.

Currently on Xen, if we start a guest who enables virtgpu, and then run "echo 
mem >
/sys/power/state" to suspend guest. And run "sudo xl trigger  
s3resume" to
resume guest. We can find that the guest kernel comes back, but the display 
doesn't.
It just shown a black screen.

Through reading the code, I found that while the guest was suspending, it
called into QEMU's virtio_gpu_gl_reset. In virtio_gpu_gl_reset, it destroyed all
resources and reset the renderer. This made the display disappear after the
guest resumed.

I think we should keep resources or prevent they being destroyed when guest is
suspending. So, I add a new status named freezing to virtgpu, and add a new ctrl
message VIRTIO_GPU_CMD_STATUS_FREEZING to get notification from guest. If 
freezing
is set to true, and then Qemu will realize that guest is suspending, it will not
destroy resources and will not reset the renderer. If freezing is set to false,
QEMU will do its original actions, and this has no other impact.

And now, display can come back and applications can continue their status after
guest resumes.

Link:
https://lore.kernel.org/qemu-devel/20230608025655.1674357-1-jiqian.c...@amd.com/

Jiqian Chen (1):
  virtgpu: do not destroy resources when guest suspend

 hw/display/virtio-gpu-base.c   |  3 +++
 hw/display/virtio-gpu-gl.c | 10 ++-
 hw/display/virtio-gpu-virgl.c  |  7 +
 hw/display/virtio-gpu.c| 48 --
 hw/virtio/virtio-qmp.c |  3 +++
 include/hw/virtio/virtio-gpu.h |  6 +
 6 files changed, 74 insertions(+), 3 deletions(-)

-- 
2.34.1




[PATCH] target/i386: Re-introduce few KVM stubs for Clang debug builds

2023-09-11 Thread Philippe Mathieu-Daudé
Since commits 3adce820cf..ef1cf6890f, When building on
a x86 host configured as:

  $ ./configure --cc=clang \
--target-list=x86_64-linux-user,x86_64-softmmu \
--enable-debug

we get:

  [71/71] Linking target qemu-x86_64
  FAILED: qemu-x86_64
  /usr/bin/ld: libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o: in function 
`cpu_x86_cpuid':
  cpu.c:(.text+0x1374): undefined reference to `kvm_arch_get_supported_cpuid'
  /usr/bin/ld: libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o: in function 
`x86_cpu_filter_features':
  cpu.c:(.text+0x81c2): undefined reference to `kvm_arch_get_supported_cpuid'
  /usr/bin/ld: cpu.c:(.text+0x81da): undefined reference to 
`kvm_arch_get_supported_cpuid'
  /usr/bin/ld: cpu.c:(.text+0x81f2): undefined reference to 
`kvm_arch_get_supported_cpuid'
  /usr/bin/ld: cpu.c:(.text+0x820a): undefined reference to 
`kvm_arch_get_supported_cpuid'
  /usr/bin/ld: 
libqemu-x86_64-linux-user.fa.p/target_i386_cpu.c.o:cpu.c:(.text+0x8225): more 
undefined references to `kvm_arch_get_supported_cpuid' follow
  clang: error: linker command failed with exit code 1 (use -v to see 
invocation)
  ninja: build stopped: subcommand failed.

'--enable-debug' disables optimizations (CFLAGS=-O0).

While at this (un)optimization level GCC eliminate the
following dead code:

  if (0 && foo()) {
  ...
  }

Clang does not. Therefore restore a pair of stubs for
unoptimized Clang builds.

Reported-by: Kevin Wolf 
Fixes: 3adce820cf ("target/i386: Remove unused KVM stubs")
Fixes: ef1cf6890f ("target/i386: Allow elision of kvm_hv_vpindex_settable()")
Signed-off-by: Philippe Mathieu-Daudé 
---
 target/i386/kvm/kvm_i386.h | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
index 55d4e68c34..0b62ac628f 100644
--- a/target/i386/kvm/kvm_i386.h
+++ b/target/i386/kvm/kvm_i386.h
@@ -32,7 +32,6 @@
 
 bool kvm_has_smm(void);
 bool kvm_enable_x2apic(void);
-bool kvm_hv_vpindex_settable(void);
 bool kvm_has_pit_state2(void);
 
 bool kvm_enable_sgx_provisioning(KVMState *s);
@@ -41,8 +40,6 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_after_reset_vcpu(X86CPU *cpu);
 void kvm_arch_do_init_vcpu(X86CPU *cs);
-uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
-  uint32_t index, int reg);
 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index);
 
 void kvm_set_max_apic_id(uint32_t max_apic_id);
@@ -60,6 +57,10 @@ void kvm_put_apicbase(X86CPU *cpu, uint64_t value);
 
 bool kvm_has_x2apic_api(void);
 bool kvm_has_waitpkg(void);
+bool kvm_hv_vpindex_settable(void);
+
+uint32_t kvm_arch_get_supported_cpuid(KVMState *env, uint32_t function,
+  uint32_t index, int reg);
 
 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address);
 void kvm_update_msi_routes_all(void *private, bool global,
@@ -76,6 +77,20 @@ typedef struct kvm_msr_handlers {
 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
 QEMUWRMSRHandler *wrmsr);
 
+#elif defined(__clang__) && !defined(__OPTIMIZE__)
+
+static inline bool kvm_hv_vpindex_settable(void)
+{
+g_assert_not_reached();
+}
+
+static inline uint32_t kvm_arch_get_supported_cpuid(KVMState *env,
+uint32_t function,
+uint32_t index, int reg)
+{
+g_assert_not_reached();
+}
+
 #endif /* CONFIG_KVM */
 
 void kvm_pc_setup_irq_routing(bool pci_enabled);
-- 
2.41.0




[PATCH v2 04/11] qapi: fix example of set-vcpu-dirty-limit command

2023-09-11 Thread Victor Toso
Example output has extra end curly bracket. Remove it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
---
 qapi/migration.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 9385b9f87c..2658cdbcbe 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1986,7 +1986,7 @@
 #
 # Example:
 #
-# -> {"execute": "set-vcpu-dirty-limit"}
+# -> {"execute": "set-vcpu-dirty-limit",
 # "arguments": { "dirty-rate": 200,
 #"cpu-index": 1 } }
 # <- { "return": {} }
-- 
2.41.0




[PATCH v2 05/11] qapi: fix example of calc-dirty-rate command

2023-09-11 Thread Victor Toso
Example output has property name with single quotes. Fix it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
---
 qapi/migration.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 2658cdbcbe..45dac41f67 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1922,7 +1922,7 @@
 # Example:
 #
 # -> {"execute": "calc-dirty-rate", "arguments": {"calc-time": 1,
-# 'sample-pages': 512} }
+# "sample-pages": 512} }
 # <- { "return": {} }
 ##
 { 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64',
-- 
2.41.0




[PATCH v2 08/11] qapi: fix example of query-spice command

2023-09-11 Thread Victor Toso
Example output has a comment embedded in the array. Remove it.
The end result is a list of size 2.

Signed-off-by: Victor Toso 
---
 qapi/ui.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/qapi/ui.json b/qapi/ui.json
index 006616aa77..6ed36c45ea 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
@@ -363,8 +363,7 @@
 #"host": "127.0.0.1",
 #"channel-id": 0,
 #"tls": false
-# },
-# [ ... more channels follow ... ]
+# }
 #  ]
 #   }
 #}
-- 
2.41.0




[PATCH v2 07/11] qapi: fix example of query-rocker-of-dpa-flows command

2023-09-11 Thread Victor Toso
Example output has a comment embedded in the array. Remove it.
The end result is a list of size 1.

Signed-off-by: Victor Toso 
---
 qapi/rocker.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/qapi/rocker.json b/qapi/rocker.json
index 31ce0b36f6..858e4f4a45 100644
--- a/qapi/rocker.json
+++ b/qapi/rocker.json
@@ -249,8 +249,7 @@
 #   "cookie": 0,
 #   "action": {"goto-tbl": 10},
 #   "mask": {"in-pport": 4294901760}
-#  },
-#  {...more...},
+#  }
 #]}
 ##
 { 'command': 'query-rocker-of-dpa-flows',
-- 
2.41.0




  1   2   3   >