date:20230721

Re: 8.1-rc0 testfloat fails to compile

2023-07-21 Thread Olaf Hering

Fri, 21 Jul 2023 08:54:21 +0200 Thomas Huth :

> Which compiler version is causing trouble for you?

Right now it is gcc 13, hopefully every compiler will error out with 
-Werror=return-type.

I think it makes sense to replace Leap with Tumbleweed. We already know it 
compiles fine with Leap, because the submission compiled fine on the submitters 
laptop. Where is the repository that needs to be adjusted to make this 
replacement?


Olaf


pgpSHAJeqbTxh.pgp
Description: Digitale Signatur von OpenPGP

Re: 8.1-rc0 testfloat fails to compile

2023-07-21 Thread Thomas Huth


On 21/07/2023 09.03, Olaf Hering wrote:

Fri, 21 Jul 2023 08:54:21 +0200 Thomas Huth :


Which compiler version is causing trouble for you?


Right now it is gcc 13, hopefully every compiler will error out with 
-Werror=return-type.


I don't think this will happen - otherwise we would have seen this already. 
It rather looks like your compiler version is missing something here - have 
a look at the affected function:


static float64_t f64Random( void )
{

switch ( random_ui8() & 7 ) {
 case 0:
 case 1:
 case 2:
return f64RandomQOutP3();
 case 3:
return f64RandomQOutPInf();
 case 4:
 case 5:
 case 6:
return f64RandomQInfP3();
 case 7:
return f64RandomQInfPInf();
}

}

The argument in the switch statement is limited with "& 7" to the range of 0 
... 7 , so there is no way the control flow can really reach the bottom of 
the function. I wonder why your GCC 13 gets that wrong, all other versions 
seem to be fine.

Or are you compiling with -O0 or something similar?

I think it makes sense to replace Leap with Tumbleweed. 


IIRC we wanted to avoid rolling releases in the CI since this would rather 
force us to deal with distro bugs on a regular basis instead of focusing on 
bugs in QEMU.


 Thomas

Re: [PATCH] target/loongarch: Fix the CSRRD CPUID instruction on big endian hosts

2023-07-21 Thread Song Gao





在 2023/7/21 上午1:53, Thomas Huth 写道:

The test in tests/avocado/machine_loongarch.py is currently failing
on big endian hosts like s390x. By comparing the traces between running
the QEMU_EFI.fd bios on a s390x and on a x86 host, it's quickly obvious
that the CSRRD instruction for the CPUID is behaving differently. And
indeed: The code currently does a long read (i.e. 64 bit) from the
address that points to the CPUState->cpu_index field (with tcg_gen_ld_tl()
in the trans_csrrd() function). But this cpu_index field is only an "int"
(i.e. 32 bit). While this dirty pointer magic works on little endian hosts,
it of course fails on big endian hosts. Fix it by using a proper helper
function instead.

Signed-off-by: Thomas Huth 
---
  Note: There is another bug preventing tests/avocado/machine_loongarch.py
  from working correctly on big endian hosts (Linux fails to run the shell)
  but that problem is harder to debug since it happens way later in the boot
  process...

  target/loongarch/cpu.h | 1 +
  target/loongarch/helper.h  | 1 +
  target/loongarch/csr_helper.c  | 9 +
  target/loongarch/insn_trans/trans_privileged.c.inc | 8 +---
  4 files changed, 12 insertions(+), 7 deletions(-)



Reviewed-by: Song Gao 

Thanks.
Song Gao


diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index ed04027af1..fa371ca8ba 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -342,6 +342,7 @@ typedef struct CPUArchState {
  uint64_t CSR_DBG;
  uint64_t CSR_DERA;
  uint64_t CSR_DSAVE;
+uint64_t CSR_CPUID;
  
  #ifndef CONFIG_USER_ONLY

  LoongArchTLB  tlb[LOONGARCH_TLB_MAX];
diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index b9de77d926..ffb1e0b0bf 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -98,6 +98,7 @@ DEF_HELPER_1(rdtime_d, i64, env)
  #ifndef CONFIG_USER_ONLY
  /* CSRs helper */
  DEF_HELPER_1(csrrd_pgd, i64, env)
+DEF_HELPER_1(csrrd_cpuid, i64, env)
  DEF_HELPER_1(csrrd_tval, i64, env)
  DEF_HELPER_2(csrwr_estat, i64, env, tl)
  DEF_HELPER_2(csrwr_asid, i64, env, tl)
diff --git a/target/loongarch/csr_helper.c b/target/loongarch/csr_helper.c
index 6526367946..55341551a5 100644
--- a/target/loongarch/csr_helper.c
+++ b/target/loongarch/csr_helper.c
@@ -35,6 +35,15 @@ target_ulong helper_csrrd_pgd(CPULoongArchState *env)
  return v;
  }
  
+target_ulong helper_csrrd_cpuid(CPULoongArchState *env)

+{
+LoongArchCPU *lac = env_archcpu(env);
+
+env->CSR_CPUID = CPU(lac)->cpu_index;
+
+return env->CSR_CPUID;
+}
+
  target_ulong helper_csrrd_tval(CPULoongArchState *env)
  {
  LoongArchCPU *cpu = env_archcpu(env);
diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc 
b/target/loongarch/insn_trans/trans_privileged.c.inc
index 02bca7ca23..9c9de090f0 100644
--- a/target/loongarch/insn_trans/trans_privileged.c.inc
+++ b/target/loongarch/insn_trans/trans_privileged.c.inc
@@ -99,13 +99,7 @@ static const CSRInfo csr_info[] = {
  CSR_OFF(PWCH),
  CSR_OFF(STLBPS),
  CSR_OFF(RVACFG),
-[LOONGARCH_CSR_CPUID] = {
-.offset = (int)offsetof(CPUState, cpu_index)
-  - (int)offsetof(LoongArchCPU, env),
-.flags = CSRFL_READONLY,
-.readfn = NULL,
-.writefn = NULL
-},
+CSR_OFF_FUNCS(CPUID, CSRFL_READONLY, gen_helper_csrrd_cpuid, NULL),
  CSR_OFF_FLAGS(PRCFG1, CSRFL_READONLY),
  CSR_OFF_FLAGS(PRCFG2, CSRFL_READONLY),
  CSR_OFF_FLAGS(PRCFG3, CSRFL_READONLY),

[PATCH] target/ppc: Fix pending HDEC when entering PM state

2023-07-21 Thread Nicholas Piggin

HDEC is defined to not wake from PM state. There is a check in the HDEC
timer to avoid setting the interrupt if we are in a PM state, but no
check on PM entry to lower HDEC if it already fired. This can cause a
HDECR wake up and  QEMU abort with unsupported exception in Power Save
mode.

Signed-off-by: Nicholas Piggin 
---
 target/ppc/excp_helper.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 003805b202..9aa8e46566 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -2685,6 +2685,12 @@ void helper_pminsn(CPUPPCState *env, uint32_t insn)
 env->resume_as_sreset = (insn != PPC_PM_STOP) ||
 (env->spr[SPR_PSSCR] & PSSCR_EC);
 
+/* HDECR is not to wake from PM state, it may have already fired */
+if (env->resume_as_sreset) {
+PowerPCCPU *cpu = env_archcpu(env);
+ppc_set_irq(cpu, PPC_INTERRUPT_HDECR, 0);
+}
+
 ppc_maybe_interrupt(env);
 }
 #endif /* defined(TARGET_PPC64) */
-- 
2.40.1

Re: [PATCH 0/2] target/ppc: Fixes for hash MMU for ISA v3.0

2023-07-21 Thread Cédric Le Goater


Hello Nick,

On 7/21/23 07:02, Nicholas Piggin wrote:

This fixes a couple of deficiencies in the v3.0 and later (POWER9, 10)
HPT MMU implementation. With these fixes, KVM is unable to boot hash
guests on powernv9/10 machines. Bare metal hash or pseries machine with
hash works, because VRMA is only used when a real hypervisor is
virtualizing a hash guest's real mode addressing.

Thanks,
Nick


Could please add Fixes tags to your 'fix' patches ? No need to resend
just reply on the mailing list with :

  Fixes:  "subject"

Also for the "target/ppc: Fix pending HDEC when entering PM state" patch.

Thanks,

C.

 

Nicholas Piggin (2):
   target/ppc: Implement ASDR register for ISA v3.0 for HPT
   target/ppc: Fix VRMA page size for ISA v3.0

  target/ppc/mmu-hash64.c | 68 ++---
  1 file changed, 51 insertions(+), 17 deletions(-)

Re: [PATCH v2] accel/kvm: Specify default IPA size for arm64

2023-07-21 Thread Philippe Mathieu-Daudé


Hi Akihiko,

On 21/7/23 08:24, Akihiko Odaki wrote:

libvirt uses "none" machine type to test KVM availability. Before this
change, QEMU used to pass 0 as machine type when calling KVM_CREATE_VM.

The kernel documentation says:

On arm64, the physical address size for a VM (IPA Size limit) is
limited to 40bits by default. The limit can be configured if the host
supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use
KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type
identifier, where IPA_Bits is the maximum width of any physical
address used by the VM. The IPA_Bits is encoded in bits[7-0] of the
machine type identifier.

e.g, to configure a guest to use 48bit physical address size::

 vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));

The requested size (IPA_Bits) must be:

  ==   =
   0   Implies default size, 40bits (for backward compatibility)
   N   Implies N bits, where N is a positive integer such that,
   32 <= N <= Host_IPA_Limit
  ==   =



Host_IPA_Limit is the maximum possible value for IPA_Bits on the host
and is dependent on the CPU capability and the kernel configuration.
The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the
KVM_CHECK_EXTENSION ioctl() at run-time.

Creation of the VM will fail if the requested IPA size (whether it is
implicit or explicit) is unsupported on the host.

https://docs.kernel.org/virt/kvm/api.html#kvm-create-vm

So if Host_IPA_Limit < 40, such KVM_CREATE_VM will fail, and libvirt
incorrectly thinks KVM is not available. This actually happened on M2
MacBook Air.

Fix this by specifying 32 for IPA_Bits as any arm64 system should
support the value according to the documentation.

Signed-off-by: Akihiko Odaki 
---
V1 -> V2: Introduced an arch hook

  include/sysemu/kvm.h   | 1 +
  accel/kvm/kvm-all.c| 2 +-
  target/arm/kvm.c   | 2 ++
  target/i386/kvm/kvm.c  | 2 ++
  target/mips/kvm.c  | 2 ++
  target/ppc/kvm.c   | 2 ++
  target/riscv/kvm.c | 2 ++
  target/s390x/kvm/kvm.c | 2 ++
  8 files changed, 14 insertions(+), 1 deletion(-)


My understanding of Peter's suggestion would be smth like:

-- >8 --
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 115f0cca79..c0af15eb6c 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -201,10 +201,15 @@ typedef struct KVMCapabilityInfo {

 struct KVMState;

+struct KVMClass {
+AccelClass parent_class;
+
+int default_vm_type;
+};
+
 #define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm")
 typedef struct KVMState KVMState;
-DECLARE_INSTANCE_CHECKER(KVMState, KVM_STATE,
- TYPE_KVM_ACCEL)
+OBJECT_DECLARE_TYPE(KVMState, KVMClass, KVM_ACCEL)

 extern KVMState *kvm_state;
 typedef struct Notifier Notifier;
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 373d876c05..fdd424e1a5 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2458,12 +2458,13 @@ static int kvm_init(MachineState *ms)
 KVMState *s;
 const KVMCapabilityInfo *missing_cap;
 int ret;
-int type = 0;
+int type;
 uint64_t dirty_log_manual_caps;

 qemu_mutex_init(&kml_slots_lock);

 s = KVM_STATE(ms->accelerator);
+type = KVM_GET_CLASS(s)->default_vm_type;

 /*
  * On systems where the kernel can support different base page
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index b4c7654f49..5c13594fdf 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -1064,4 +1064,8 @@ bool kvm_arch_cpu_check_are_resettable(void)

 void kvm_arch_accel_class_init(ObjectClass *oc)
 {
+KVMClass *kc = KVM_CLASS(oc);
+
+/* Host_IPA_Limit ... */
+kc->default_vm_type = 32;
 }
---

Re: [PATCH] target/tricore: Rename tricore_feature

2023-07-21 Thread Philippe Mathieu-Daudé


On 21/7/23 08:06, Bastian Koppelmann wrote:

this name is used by capstone and will lead to a build failure of QEMU,
when capstone is enabled. So we rename it to tricore_has_feature(), to
match has_feature() in translate.c.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774
Signed-off-by: Bastian Koppelmann 
---
  target/tricore/cpu.c   | 8 
  target/tricore/cpu.h   | 2 +-
  target/tricore/helper.c| 4 ++--
  target/tricore/op_helper.c | 4 ++--
  4 files changed, 9 insertions(+), 9 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: 8.1-rc0 testfloat fails to compile

2023-07-21 Thread Olaf Hering

Fri, 21 Jul 2023 09:18:08 +0200 Thomas Huth :

> Or are you compiling with -O0 or something similar?

Does the master branch compile for you with 'export CFLAGS="-O2 -Wall 
-Werror=return-type"'?

I prepared a patch to deal with it, and indeed it looks like the compiler might 
be able to spot the fact that only values between 0 and 7 can ever appear 
there. Apparently it fails to recognize that.


Olaf


pgpf8MtewCTaw.pgp
Description: Digitale Signatur von OpenPGP

Re: 8.1-rc0 testfloat fails to compile

2023-07-21 Thread Thomas Huth


On 21/07/2023 09.35, Olaf Hering wrote:

Fri, 21 Jul 2023 09:18:08 +0200 Thomas Huth :


Or are you compiling with -O0 or something similar?


Does the master branch compile for you with 'export CFLAGS="-O2 -Wall 
-Werror=return-type"'?


Oh, this fails for me, too! I thought that -Wreturn-type would be part of 
-Wall, but apparently it is only enabled by default for C++ in my version of 
GCC :-(


 Thomas

Re: [PATCH] hw: Add compat machines for 8.2

2023-07-21 Thread Laurent Vivier


Le 18/07/2023 à 16:22, Cornelia Huck a écrit :

Add 8.2 machine types for arm/i440fx/m68k/q35/s390x/spapr.

Signed-off-by: Cornelia Huck 
---
  hw/arm/virt.c  |  9 -
  hw/core/machine.c  |  3 +++
  hw/i386/pc.c   |  3 +++
  hw/i386/pc_piix.c  | 16 +---
  hw/i386/pc_q35.c   | 14 --
  hw/m68k/virt.c |  9 -
  hw/ppc/spapr.c | 15 +--
  hw/s390x/s390-virtio-ccw.c | 14 +-
  include/hw/boards.h|  3 +++
  include/hw/i386/pc.h   |  3 +++
  10 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 7d9dbc26633a..2a560271b5fc 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -3170,10 +3170,17 @@ static void machvirt_machine_init(void)
  }
  type_init(machvirt_machine_init);
  
+static void virt_machine_8_2_options(MachineClass *mc)

+{
+}
+DEFINE_VIRT_MACHINE_AS_LATEST(8, 2)
+
  static void virt_machine_8_1_options(MachineClass *mc)
  {
+virt_machine_8_2_options(mc);
+compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len);
  }
-DEFINE_VIRT_MACHINE_AS_LATEST(8, 1)
+DEFINE_VIRT_MACHINE(8, 1)
  
  static void virt_machine_8_0_options(MachineClass *mc)

  {
diff --git a/hw/core/machine.c b/hw/core/machine.c
index f0d35c640184..da699cf4e147 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -39,6 +39,9 @@
  #include "hw/virtio/virtio.h"
  #include "hw/virtio/virtio-pci.h"
  
+GlobalProperty hw_compat_8_1[] = {};

+const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
+
  GlobalProperty hw_compat_8_0[] = {
  { "migration", "multifd-flush-after-each-section", "on"},
  { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 3109d5e0e035..54838c0c411d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -114,6 +114,9 @@
  { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\
  { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },
  
+GlobalProperty pc_compat_8_1[] = {};

+const size_t pc_compat_8_1_len = G_N_ELEMENTS(pc_compat_8_1);
+
  GlobalProperty pc_compat_8_0[] = {
  { "virtio-mem", "unplugged-inaccessible", "auto" },
  };
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index ac72e8f5bee1..ce1ac9527493 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -504,13 +504,25 @@ static void pc_i440fx_machine_options(MachineClass *m)
  machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
  }
  
-static void pc_i440fx_8_1_machine_options(MachineClass *m)

+static void pc_i440fx_8_2_machine_options(MachineClass *m)
  {
  pc_i440fx_machine_options(m);
  m->alias = "pc";
  m->is_default = true;
  }
  
+DEFINE_I440FX_MACHINE(v8_2, "pc-i440fx-8.2", NULL,

+  pc_i440fx_8_2_machine_options);
+
+static void pc_i440fx_8_1_machine_options(MachineClass *m)
+{
+pc_i440fx_8_2_machine_options(m);
+m->alias = NULL;
+m->is_default = false;
+compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len);
+compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len);
+}
+
  DEFINE_I440FX_MACHINE(v8_1, "pc-i440fx-8.1", NULL,
pc_i440fx_8_1_machine_options);
  
@@ -519,8 +531,6 @@ static void pc_i440fx_8_0_machine_options(MachineClass *m)

  PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
  
  pc_i440fx_8_1_machine_options(m);

-m->alias = NULL;
-m->is_default = false;
  compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len);
  compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len);
  
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c

index dc27a9e223a2..37c4814bedf2 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -379,12 +379,23 @@ static void pc_q35_machine_options(MachineClass *m)
  machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
  }
  
-static void pc_q35_8_1_machine_options(MachineClass *m)

+static void pc_q35_8_2_machine_options(MachineClass *m)
  {
  pc_q35_machine_options(m);
  m->alias = "q35";
  }
  
+DEFINE_Q35_MACHINE(v8_2, "pc-q35-8.2", NULL,

+   pc_q35_8_2_machine_options);
+
+static void pc_q35_8_1_machine_options(MachineClass *m)
+{
+pc_q35_8_2_machine_options(m);
+m->alias = NULL;
+compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len);
+compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len);
+}
+
  DEFINE_Q35_MACHINE(v8_1, "pc-q35-8.1", NULL,
 pc_q35_8_1_machine_options);
  
@@ -393,7 +404,6 @@ static void pc_q35_8_0_machine_options(MachineClass *m)

  PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
  
  pc_q35_8_1_machine_options(m);

-m->alias = NULL;
  compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len);
  compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len);
  
diff --git a/hw/m68k/virt.c b/hw/m68k/vi

Re: 8.1-rc0 testfloat fails to compile

2023-07-21 Thread Olaf Hering

Fri, 21 Jul 2023 09:53:51 +0200 Thomas Huth :

>  I thought that -Wreturn-type would be part of 
> -Wall, but apparently it is only enabled by default for C++ in my version of 
> GCC :-(

Hopefully gcc14 will fix that, and others. Only two decades late ...


Olaf


pgpx3QdBimJAf.pgp
Description: Digitale Signatur von OpenPGP

Re: [Virtio-fs] [PATCH v2 2/4] vhost-user: Interface for migration state transfer

2023-07-21 Thread Hanna Czenczek


On 20.07.23 17:05, Hao Xu wrote:


On 7/20/23 21:20, Hanna Czenczek wrote:

On 20.07.23 14:13, Hao Xu wrote:


On 7/12/23 19:17, Hanna Czenczek wrote:
Add the interface for transferring the back-end's state during 
migration

as defined previously in vhost-user.rst.

Signed-off-by: Hanna Czenczek 
---
  include/hw/virtio/vhost-backend.h |  24 +
  include/hw/virtio/vhost.h |  79 
  hw/virtio/vhost-user.c    | 147 
++

  hw/virtio/vhost.c |  37 
  4 files changed, 287 insertions(+)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h

index 31a251a9f5..e59d0b53f8 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -26,6 +26,18 @@ typedef enum VhostSetConfigType {
  VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
  } VhostSetConfigType;
  +typedef enum VhostDeviceStateDirection {
+    /* Transfer state from back-end (device) to front-end */
+    VHOST_TRANSFER_STATE_DIRECTION_SAVE = 0,
+    /* Transfer state from front-end to back-end (device) */
+    VHOST_TRANSFER_STATE_DIRECTION_LOAD = 1,
+} VhostDeviceStateDirection;
+
+typedef enum VhostDeviceStatePhase {
+    /* The device (and all its vrings) is stopped */
+    VHOST_TRANSFER_STATE_PHASE_STOPPED = 0,
+} VhostDeviceStatePhase;
+
  struct vhost_inflight;
  struct vhost_dev;
  struct vhost_log;
@@ -133,6 +145,15 @@ typedef int (*vhost_set_config_call_op)(struct 
vhost_dev *dev,

    typedef void (*vhost_reset_status_op)(struct vhost_dev *dev);
  +typedef bool (*vhost_supports_migratory_state_op)(struct 
vhost_dev *dev);

+typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev,
+ VhostDeviceStateDirection direction,
+ VhostDeviceStatePhase phase,
+    int fd,
+    int *reply_fd,
+    Error **errp);
+typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, 
Error **errp);

+
  typedef struct VhostOps {
  VhostBackendType backend_type;
  vhost_backend_init vhost_backend_init;
@@ -181,6 +202,9 @@ typedef struct VhostOps {
  vhost_force_iommu_op vhost_force_iommu;
  vhost_set_config_call_op vhost_set_config_call;
  vhost_reset_status_op vhost_reset_status;
+    vhost_supports_migratory_state_op vhost_supports_migratory_state;
+    vhost_set_device_state_fd_op vhost_set_device_state_fd;
+    vhost_check_device_state_op vhost_check_device_state;
  } VhostOps;
    int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 69bf59d630..d8877496e5 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -346,4 +346,83 @@ int vhost_dev_set_inflight(struct vhost_dev *dev,
  int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t 
queue_size,

 struct vhost_inflight *inflight);
  bool vhost_dev_has_iommu(struct vhost_dev *dev);
+
+/**
+ * vhost_supports_migratory_state(): Checks whether the back-end
+ * supports transferring internal state for the purpose of migration.
+ * Support for this feature is required for 
vhost_set_device_state_fd()

+ * and vhost_check_device_state().
+ *
+ * @dev: The vhost device
+ *
+ * Returns true if the device supports these commands, and false 
if it

+ * does not.
+ */
+bool vhost_supports_migratory_state(struct vhost_dev *dev);
+
+/**
+ * vhost_set_device_state_fd(): Begin transfer of internal state 
from/to
+ * the back-end for the purpose of migration.  Data is to be 
transferred
+ * over a pipe according to @direction and @phase.  The sending 
end must
+ * only write to the pipe, and the receiving end must only read 
from it.

+ * Once the sending end is done, it closes its FD.  The receiving end
+ * must take this as the end-of-transfer signal and close its FD, 
too.

+ *
+ * @fd is the back-end's end of the pipe: The write FD for SAVE, 
and the

+ * read FD for LOAD.  This function transfers ownership of @fd to the
+ * back-end, i.e. closes it in the front-end.
+ *
+ * The back-end may optionally reply with an FD of its own, if this
+ * improves efficiency on its end.  In this case, the returned FD is



Hi Hanna,

In what case/situation, the back-end will have a more efficient fd?


Hi Hao,

There is no example yet.


Here my understanding of this "FD of its own" is as same type as

the given fd(e.g. both pipe files), why the fd from back-end makes

difference? Do I miss anything here?


Maybe it makes more sense in the context of how we came up with the 
idea: Specifically, Stefan and me were asking which end should 
provide the FD.  In the context of vhost-user, it makes sense to have 
it be the front-end, because it controls vhost-user communication, 
but that’s just the natural protocol choice, not necessarily the most 
efficient one.


It is imaginable that the front-end (e.g. qemu) could create

[PATCH v3 2/2] target/i386: add control bits support for LAM

2023-07-21 Thread Binbin Wu

LAM uses CR3[61] and CR3[62] to configure/enable LAM on user pointers.
LAM uses CR4[28] to configure/enable LAM on supervisor pointers.

For CR3 LAM bits, no additional handling needed:
- TCG
  LAM is not supported for TCG of target-i386.  helper_write_crN() and 
helper_vmrun()
  check max physical address bits before calling cpu_x86_update_cr3(), no 
change needed,
  i.e. CR3 LAM bits are not allowed to be set in TCG.
- gdbstub
  x86_cpu_gdb_write_register() will call cpu_x86_update_cr3() to update cr3. 
Allow gdb
  to set the LAM bit(s) to CR3, if vcpu doesn't support LAM, KVM_SET_SREGS will 
fail as
  other CR3 reserved bits.

For CR4 LAM bit, its reservation depends on vcpu supporting LAM feature or not.
- TCG
  LAM is not supported for TCG of target-i386.  helper_write_crN() and 
helper_vmrun()
  check CR4 reserved bit before calling cpu_x86_update_cr4(), i.e. CR4 LAM bit 
is not
  allowed to be set in TCG.
- gdbstub
  x86_cpu_gdb_write_register() will call cpu_x86_update_cr4() to update cr4. 
Allow gdb
  to set the LAM bit to CR4, if vcpu doesn't support LAM, KVM_SET_SREGS will 
fail.
- x86_cpu_reset_hold() doesn't need special handling.

Signed-off-by: Binbin Wu 
---
 target/i386/cpu.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 4db97899fe..710fadf550 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -261,6 +261,7 @@ typedef enum X86Seg {
 #define CR4_SMAP_MASK   (1U << 21)
 #define CR4_PKE_MASK   (1U << 22)
 #define CR4_PKS_MASK   (1U << 24)
+#define CR4_LAM_SUP_MASK (1U << 28)
 
 #define CR4_RESERVED_MASK \
 (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \
@@ -269,7 +270,8 @@ typedef enum X86Seg {
 | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \
 | CR4_LA57_MASK \
 | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \
-| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK))
+| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \
+| CR4_LAM_SUP_MASK))
 
 #define DR6_BD  (1 << 13)
 #define DR6_BS  (1 << 14)
@@ -2478,6 +2480,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env)
 if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) {
 reserved_bits |= CR4_PKS_MASK;
 }
+if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) {
+reserved_bits |= CR4_LAM_SUP_MASK;
+}
 return reserved_bits;
 }
 
-- 
2.25.1

[PATCH v3 0/2] target/i386: add support for LAM

2023-07-21 Thread Binbin Wu

Linear-address masking (LAM) [1], modifies the checking that is applied to
*64-bit* linear addresses, allowing software to use of the untranslated address
bits for metadata and masks the metadata bits before using them as linear 
addresses to access memory.

When the feature is virtualized and exposed to guest, it can be used for 
efficient
address sanitizers (ASAN) implementation and for optimizations in JITs and 
virtual
machines.

[1] Intel ISE https://cdrdv2.intel.com/v1/dl/getContent/671368
Chapter Linear Address Masking (LAM)

---
Changelog
v3:
- Some change in commit message.
- Add handling of LAM control bits. (Xiaoyao)

v2:
- https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg07842.html


Binbin Wu (1):
  target/i386: add control bits support for LAM

Robert Hoo (1):
  target/i386: add support for LAM in CPUID enumeration

 target/i386/cpu.c | 2 +-
 target/i386/cpu.h | 9 -
 2 files changed, 9 insertions(+), 2 deletions(-)

-- 
2.25.1

[PATCH v3 1/2] target/i386: add support for LAM in CPUID enumeration

2023-07-21 Thread Binbin Wu

From: Robert Hoo 

Linear Address Masking (LAM) is a new Intel CPU feature, which allows software
to use of the untranslated address bits for metadata.

The bit definition:
CPUID.(EAX=7,ECX=1):EAX[26]

Add CPUID definition for LAM.

Note LAM feature is not supported for TCG of target-386, LAM CPIUD bit will not
be added to TCG_7_1_EAX_FEATURES.

More info can be found in Intel ISE Chapter "LINEAR ADDRESS MASKING (LAM)"
https://cdrdv2.intel.com/v1/dl/getContent/671368

Signed-off-by: Robert Hoo 
Co-developed-by: Binbin Wu 
Signed-off-by: Binbin Wu 
---
 target/i386/cpu.c | 2 +-
 target/i386/cpu.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 97ad229d8b..3a42340730 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -965,7 +965,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 "fsrc", NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
 NULL, "amx-fp16", NULL, "avx-ifma",
-NULL, NULL, NULL, NULL,
+NULL, NULL, "lam", NULL,
 NULL, NULL, NULL, NULL,
 },
 .cpuid = {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index e0771a1043..4db97899fe 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -925,6 +925,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
 #define CPUID_7_1_EAX_AMX_FP16  (1U << 21)
 /* Support for VPMADD52[H,L]UQ */
 #define CPUID_7_1_EAX_AVX_IFMA  (1U << 23)
+/* Linear Address Masking */
+#define CPUID_7_1_EAX_LAM   (1U << 26)
 
 /* Support for VPDPB[SU,UU,SS]D[,S] */
 #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4)
-- 
2.25.1

[PATCH] hw/virtio: qmp: add RING_RESET to 'info virtio-status'

2023-07-21 Thread David Edmondson

Signed-off-by: David Edmondson 
---
 hw/virtio/virtio-qmp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c
index 3d32dbec8d..7515b0947b 100644
--- a/hw/virtio/virtio-qmp.c
+++ b/hw/virtio/virtio-qmp.c
@@ -79,6 +79,8 @@ static const qmp_virtio_feature_map_t virtio_transport_map[] 
= {
 "VIRTIO_F_ORDER_PLATFORM: Memory accesses ordered by platform"),
 FEATURE_ENTRY(VIRTIO_F_SR_IOV, \
 "VIRTIO_F_SR_IOV: Device supports single root I/O virtualization"),
+FEATURE_ENTRY(VIRTIO_F_RING_RESET, \
+"VIRTIO_F_RING_RESET: Driver can reset a queue individually"),
 /* Virtio ring transport features */
 FEATURE_ENTRY(VIRTIO_RING_F_INDIRECT_DESC, \
 "VIRTIO_RING_F_INDIRECT_DESC: Indirect descriptors supported"),
-- 
2.34.1

Re: [PATCH] hw: Add compat machines for 8.2

2023-07-21 Thread Michael S. Tsirkin

On Tue, Jul 18, 2023 at 04:22:35PM +0200, Cornelia Huck wrote:
> Add 8.2 machine types for arm/i440fx/m68k/q35/s390x/spapr.
> 
> Signed-off-by: Cornelia Huck 

Acked-by: Michael S. Tsirkin 

> ---
>  hw/arm/virt.c  |  9 -
>  hw/core/machine.c  |  3 +++
>  hw/i386/pc.c   |  3 +++
>  hw/i386/pc_piix.c  | 16 +---
>  hw/i386/pc_q35.c   | 14 --
>  hw/m68k/virt.c |  9 -
>  hw/ppc/spapr.c | 15 +--
>  hw/s390x/s390-virtio-ccw.c | 14 +-
>  include/hw/boards.h|  3 +++
>  include/hw/i386/pc.h   |  3 +++
>  10 files changed, 79 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 7d9dbc26633a..2a560271b5fc 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -3170,10 +3170,17 @@ static void machvirt_machine_init(void)
>  }
>  type_init(machvirt_machine_init);
>  
> +static void virt_machine_8_2_options(MachineClass *mc)
> +{
> +}
> +DEFINE_VIRT_MACHINE_AS_LATEST(8, 2)
> +
>  static void virt_machine_8_1_options(MachineClass *mc)
>  {
> +virt_machine_8_2_options(mc);
> +compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len);
>  }
> -DEFINE_VIRT_MACHINE_AS_LATEST(8, 1)
> +DEFINE_VIRT_MACHINE(8, 1)
>  
>  static void virt_machine_8_0_options(MachineClass *mc)
>  {
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index f0d35c640184..da699cf4e147 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -39,6 +39,9 @@
>  #include "hw/virtio/virtio.h"
>  #include "hw/virtio/virtio-pci.h"
>  
> +GlobalProperty hw_compat_8_1[] = {};
> +const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
> +
>  GlobalProperty hw_compat_8_0[] = {
>  { "migration", "multifd-flush-after-each-section", "on"},
>  { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 3109d5e0e035..54838c0c411d 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -114,6 +114,9 @@
>  { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\
>  { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },
>  
> +GlobalProperty pc_compat_8_1[] = {};
> +const size_t pc_compat_8_1_len = G_N_ELEMENTS(pc_compat_8_1);
> +
>  GlobalProperty pc_compat_8_0[] = {
>  { "virtio-mem", "unplugged-inaccessible", "auto" },
>  };
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index ac72e8f5bee1..ce1ac9527493 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -504,13 +504,25 @@ static void pc_i440fx_machine_options(MachineClass *m)
>  machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
>  }
>  
> -static void pc_i440fx_8_1_machine_options(MachineClass *m)
> +static void pc_i440fx_8_2_machine_options(MachineClass *m)
>  {
>  pc_i440fx_machine_options(m);
>  m->alias = "pc";
>  m->is_default = true;
>  }
>  
> +DEFINE_I440FX_MACHINE(v8_2, "pc-i440fx-8.2", NULL,
> +  pc_i440fx_8_2_machine_options);
> +
> +static void pc_i440fx_8_1_machine_options(MachineClass *m)
> +{
> +pc_i440fx_8_2_machine_options(m);
> +m->alias = NULL;
> +m->is_default = false;
> +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len);
> +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len);
> +}
> +
>  DEFINE_I440FX_MACHINE(v8_1, "pc-i440fx-8.1", NULL,
>pc_i440fx_8_1_machine_options);
>  
> @@ -519,8 +531,6 @@ static void pc_i440fx_8_0_machine_options(MachineClass *m)
>  PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
>  
>  pc_i440fx_8_1_machine_options(m);
> -m->alias = NULL;
> -m->is_default = false;
>  compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len);
>  compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len);
>  
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index dc27a9e223a2..37c4814bedf2 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -379,12 +379,23 @@ static void pc_q35_machine_options(MachineClass *m)
>  machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
>  }
>  
> -static void pc_q35_8_1_machine_options(MachineClass *m)
> +static void pc_q35_8_2_machine_options(MachineClass *m)
>  {
>  pc_q35_machine_options(m);
>  m->alias = "q35";
>  }
>  
> +DEFINE_Q35_MACHINE(v8_2, "pc-q35-8.2", NULL,
> +   pc_q35_8_2_machine_options);
> +
> +static void pc_q35_8_1_machine_options(MachineClass *m)
> +{
> +pc_q35_8_2_machine_options(m);
> +m->alias = NULL;
> +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len);
> +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len);
> +}
> +
>  DEFINE_Q35_MACHINE(v8_1, "pc-q35-8.1", NULL,
> pc_q35_8_1_machine_options);
>  
> @@ -393,7 +404,6 @@ static void pc_q35_8_0_machine_options(MachineClass *m)
>  PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
>  
>

Re: [Virtio-fs] [PATCH v2 2/4] vhost-user: Interface for migration state transfer

2023-07-21 Thread Hao Xu




On 7/21/23 16:07, Hanna Czenczek wrote:

On 20.07.23 17:05, Hao Xu wrote:


On 7/20/23 21:20, Hanna Czenczek wrote:

On 20.07.23 14:13, Hao Xu wrote:


On 7/12/23 19:17, Hanna Czenczek wrote:
Add the interface for transferring the back-end's state during 
migration

as defined previously in vhost-user.rst.

Signed-off-by: Hanna Czenczek 
---
  include/hw/virtio/vhost-backend.h |  24 +
  include/hw/virtio/vhost.h |  79 
  hw/virtio/vhost-user.c    | 147 
++

  hw/virtio/vhost.c |  37 
  4 files changed, 287 insertions(+)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h

index 31a251a9f5..e59d0b53f8 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -26,6 +26,18 @@ typedef enum VhostSetConfigType {
  VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
  } VhostSetConfigType;
  +typedef enum VhostDeviceStateDirection {
+    /* Transfer state from back-end (device) to front-end */
+    VHOST_TRANSFER_STATE_DIRECTION_SAVE = 0,
+    /* Transfer state from front-end to back-end (device) */
+    VHOST_TRANSFER_STATE_DIRECTION_LOAD = 1,
+} VhostDeviceStateDirection;
+
+typedef enum VhostDeviceStatePhase {
+    /* The device (and all its vrings) is stopped */
+    VHOST_TRANSFER_STATE_PHASE_STOPPED = 0,
+} VhostDeviceStatePhase;
+
  struct vhost_inflight;
  struct vhost_dev;
  struct vhost_log;
@@ -133,6 +145,15 @@ typedef int 
(*vhost_set_config_call_op)(struct vhost_dev *dev,

    typedef void (*vhost_reset_status_op)(struct vhost_dev *dev);
  +typedef bool (*vhost_supports_migratory_state_op)(struct 
vhost_dev *dev);

+typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev,
+ VhostDeviceStateDirection direction,
+ VhostDeviceStatePhase phase,
+    int fd,
+    int *reply_fd,
+    Error **errp);
+typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, 
Error **errp);

+
  typedef struct VhostOps {
  VhostBackendType backend_type;
  vhost_backend_init vhost_backend_init;
@@ -181,6 +202,9 @@ typedef struct VhostOps {
  vhost_force_iommu_op vhost_force_iommu;
  vhost_set_config_call_op vhost_set_config_call;
  vhost_reset_status_op vhost_reset_status;
+    vhost_supports_migratory_state_op 
vhost_supports_migratory_state;

+    vhost_set_device_state_fd_op vhost_set_device_state_fd;
+    vhost_check_device_state_op vhost_check_device_state;
  } VhostOps;
    int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 69bf59d630..d8877496e5 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -346,4 +346,83 @@ int vhost_dev_set_inflight(struct vhost_dev 
*dev,
  int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t 
queue_size,

 struct vhost_inflight *inflight);
  bool vhost_dev_has_iommu(struct vhost_dev *dev);
+
+/**
+ * vhost_supports_migratory_state(): Checks whether the back-end
+ * supports transferring internal state for the purpose of 
migration.
+ * Support for this feature is required for 
vhost_set_device_state_fd()

+ * and vhost_check_device_state().
+ *
+ * @dev: The vhost device
+ *
+ * Returns true if the device supports these commands, and false 
if it

+ * does not.
+ */
+bool vhost_supports_migratory_state(struct vhost_dev *dev);
+
+/**
+ * vhost_set_device_state_fd(): Begin transfer of internal state 
from/to
+ * the back-end for the purpose of migration.  Data is to be 
transferred
+ * over a pipe according to @direction and @phase.  The sending 
end must
+ * only write to the pipe, and the receiving end must only read 
from it.
+ * Once the sending end is done, it closes its FD.  The receiving 
end
+ * must take this as the end-of-transfer signal and close its FD, 
too.

+ *
+ * @fd is the back-end's end of the pipe: The write FD for SAVE, 
and the
+ * read FD for LOAD.  This function transfers ownership of @fd to 
the

+ * back-end, i.e. closes it in the front-end.
+ *
+ * The back-end may optionally reply with an FD of its own, if this
+ * improves efficiency on its end.  In this case, the returned FD is



Hi Hanna,

In what case/situation, the back-end will have a more efficient fd?


Hi Hao,

There is no example yet.


Here my understanding of this "FD of its own" is as same type as

the given fd(e.g. both pipe files), why the fd from back-end makes

difference? Do I miss anything here?


Maybe it makes more sense in the context of how we came up with the 
idea: Specifically, Stefan and me were asking which end should 
provide the FD.  In the context of vhost-user, it makes sense to 
have it be the front-end, because it controls vhost-user 
communication, but that’s just the natural protocol choice, not 
necessarily the most efficient one.


It is imagina

Re: [PATCH-for-8.1?] hw/virtio: qmp: add RING_RESET to 'info virtio-status'

2023-07-21 Thread Philippe Mathieu-Daudé


On 21/7/23 09:28, David Edmondson wrote:

Signed-off-by: David Edmondson 
---
  hw/virtio/virtio-qmp.c | 2 ++
  1 file changed, 2 insertions(+)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH-for-8.1] target/tricore: Rename tricore_feature

2023-07-21 Thread Philippe Mathieu-Daudé


On 21/7/23 09:31, Philippe Mathieu-Daudé wrote:

On 21/7/23 08:06, Bastian Koppelmann wrote:

this name is used by capstone and will lead to a build failure of QEMU,
when capstone is enabled. So we rename it to tricore_has_feature(), to
match has_feature() in translate.c.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774
Signed-off-by: Bastian Koppelmann 
---
  target/tricore/cpu.c   | 8 
  target/tricore/cpu.h   | 2 +-
  target/tricore/helper.c    | 4 ++--
  target/tricore/op_helper.c | 4 ++--
  4 files changed, 9 insertions(+), 9 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 


BTW Bastian if you want I can squeeze this single patch in
my mips-fixes PR I'm planning.

Re: [PATCH] roms/opensbi: Upgrade from v1.3 to v1.3.1

2023-07-21 Thread Conor Dooley

On Fri, Jul 21, 2023 at 10:04:02AM +1000, Alistair Francis wrote:
> On Thu, Jul 20, 2023 at 3:00 AM Bin Meng  wrote:
> >
> > Upgrade OpenSBI from v1.3 to v1.3.1 and the pre-built bios images
> > which fixes the boot failure seen when using QEMU to do a direct
> > kernel boot with Microchip Icicle Kit board machine.
> >
> > The v1.3.1 release includes the following commits:
> >
> > 0907de3 lib: sbi: fix comment indent
> > eb736a5 lib: sbi_pmu: Avoid out of bounds access
> > 7828eeb gpio/desginware: add Synopsys DesignWare APB GPIO support
> > c6a3573 lib: utils: Fix sbi_hartid_to_scratch() usage in ACLINT drivers
> > 057eb10 lib: utils/gpio: Fix RV32 compile error for designware GPIO driver
> >
> > Signed-off-by: Bin Meng 
> 
> Reviewed-by: Alistair Francis 
> 
> @Conor Dooley @Conor Dooley Any chance you can test this?

Sure. I didn't se this patch because I am not subscribed to qemu-devel,
just qemu-riscv.
Tested-by: Conor Dooley 

Thanks,
Conor.


signature.asc
Description: PGP signature

Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables

2023-07-21 Thread Philippe Mathieu-Daudé


+Markus

On 20/7/23 17:58, Peter Maydell wrote:

This patchset was prompted by a couple of Coverity warnings
(CID 1507157, 1517772) which note that in the m48t59 RTC device model
we keep an offset in a time_t variable but then truncate it by
passing it to qemu_get_timedate(), which currently uses an 'int'
argument for its offset parameter.

We can fix the Coverity complaint by making qemu_get_timedate()
take a time_t; we should also correspondingly make the
qemu_timedate_diff() function return a time_t. However this
will only push the issue out to callers of qemu_timedate_diff()
if they are putting the result in a 32-bit variable or doing
32-bit arithmetic on it.

Luckily there aren't that many callers of qemu_timedate_diff()
and most of them already use either time_t or int64_t for the
calculations they do on its return value. The first three
patches fix devices which weren't doing that; patch four then
fixes the rtc.c functions. If I missed any callsites in devices
then hopefully Coverity will point them out.


Do we need to change the type of the RTC_CHANGE event?

This is wrong, but to give an idea:

--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -553,47 +553,47 @@
 ##
 # @RTC_CHANGE:
 #
 # Emitted when the guest changes the RTC time.
 #
 # @offset: offset in seconds between base RTC clock (as specified by
 # -rtc base), and new RTC clock value
 #
 # @qom-path: path to the RTC object in the QOM tree
 #
 # Note: This event is rate-limited.  It is not guaranteed that the RTC
 # in the system implements this event, or even that the system has
 # an RTC at all.
 #
 # Since: 0.13
 #
 # Example:
 #
 # <- { "event": "RTC_CHANGE",
 #  "data": { "offset": 78 },
 #  "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }
 ##
 { 'event': 'RTC_CHANGE',
-  'data': { 'offset': 'int', 'qom-path': 'str' } }
+  'data': { 'offset': 'int64', 'qom-path': 'str' } }
---

Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()

2023-07-21 Thread Philippe Mathieu-Daudé


On 20/7/23 17:58, Peter Maydell wrote:

In the m48t59 device we almost always use 64-bit arithmetic when
dealing with time_t deltas.  The one exception is in set_alarm(),
which currently uses a plain 'int' to hold the difference between two
time_t values.  Switch to int64_t instead to avoid any possible
overflow issues.

Signed-off-by: Peter Maydell 
---
  hw/rtc/m48t59.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()

2023-07-21 Thread Jean-Philippe Brucker

On Thu, Jul 20, 2023 at 05:35:49PM +0100, Peter Maydell wrote:
> On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
>  wrote:
> >
> > When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0
> > translation regime, instead of the EL2 translation regime. The TLB VAE2*
> > instructions invalidate the regime that corresponds to the current value
> > of HCR_EL2.E2H.
> >
> > At the moment we only invalidate the EL2 translation regime. This causes
> > problems with RMM, which issues TLBI VAE2IS instructions with
> > HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into
> > account.
> >
> > Signed-off-by: Jean-Philippe Brucker 
> > ---
> >  target/arm/helper.c | 26 ++
> >  1 file changed, 18 insertions(+), 8 deletions(-)
> >
> > diff --git a/target/arm/helper.c b/target/arm/helper.c
> > index e1b3db6f5f..07a9ac70f5 100644
> > --- a/target/arm/helper.c
> > +++ b/target/arm/helper.c
> > @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env)
> >  return mask;
> >  }
> >
> > +static int vae2_tlbmask(CPUARMState *env)
> > +{
> > +uint64_t hcr = arm_hcr_el2_eff(env);
> > +uint16_t mask;
> > +
> > +if (hcr & HCR_E2H) {
> > +mask = ARMMMUIdxBit_E20_2 |
> > +   ARMMMUIdxBit_E20_2_PAN |
> > +   ARMMMUIdxBit_E20_0;
> > +} else {
> > +mask = ARMMMUIdxBit_E2;
> > +}
> > +return mask;
> > +}
> > +
> >  /* Return 56 if TBI is enabled, 64 otherwise. */
> >  static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx,
> >uint64_t addr)
> 
> > @@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState 
> > *env, const ARMCPRegInfo *ri,
> > uint64_t value)
> >  {
> >  CPUState *cs = env_cpu(env);
> > +int mask = vae2_tlbmask(env);
> >  uint64_t pageaddr = sextract64(value << 12, 0, 56);
> >  int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr);
> 
> Shouldn't the argument to tlbbits_for_regime() also change
> if we're dealing with the EL2&0 regime rather than EL2 ?

Yes, it affects the result since EL2&0 has two ranges

Thanks,
Jean

> 
> >
> > -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr,
> > -  ARMMMUIdxBit_E2, bits);
> > +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, 
> > bits);
> >  }
> 
> thanks
> -- PMM

assert fails in s390x TCG

2023-07-21 Thread Claudio Fontana



Hello Cornelia, Richard,

I had some strange behavior in an s390x TCG VM that I am debugging,

and configured latest upstream QEMU with --enable-debug --enable-debug-tcg

and I am running the qemu binary with -d unimp,guest_errors .

I get:

/usr/bin/qemu-system-s390x -nodefaults -no-reboot -nographic -vga none -cpu 
qemu -d unimp,guest_errors -object rng-random,filename=/dev/random,id=rng0 
-device virtio-rng-ccw,rng=rng0 -runas qemu -net none -kernel 
/var/tmp/boot/kernel -initrd /var/tmp/boot/initrd -append 
root=/dev/disk/by-id/virtio-0 rootfstype=ext3 
rootflags=data=writeback,nobarrier,commit=150,noatime elevator=noop 
nmi_watchdog=0 rw oops=panic panic=1 quiet elevator=noop console=hvc0 
init=build -m 2048 -drive 
file=/var/tmp/img,format=raw,if=none,id=disk,cache=unsafe -device 
virtio-blk-ccw,drive=disk,serial=0 -drive 
file=/var/tmp/swap,format=raw,if=none,id=swap,cache=unsafe -device 
virtio-blk-ccw,drive=swap,serial=1 -device virtio-serial-ccw -device 
virtconsole,chardev=virtiocon0 -chardev stdio,id=virtiocon0 -chardev 
socket,id=monitor,server=on,wait=off,path=/var/tmp/img.qemu/monitor -mon 
chardev=monitor,mode=readline -smp 8

unimplemented opcode 0xb9ab
unimplemented opcode 0xb2af

ERROR:../accel/tcg/tb-maint.c:348:page_unlock__debug: assertion failed: 
(page_is_locked(pd))
Bail out! ERROR:../accel/tcg/tb-maint.c:348:page_unlock__debug: assertion 
failed: (page_is_locked(pd))

Thread 3 "qemu-system-s39" received signal SIGABRT, Aborted.
[Switching to Thread 0x753516c0 (LWP 215975)]
(gdb) bt
#0  0x7730dabc in __pthread_kill_implementation () at /lib64/libc.so.6
#1  0x772bc266 in raise () at /lib64/libc.so.6
#2  0x772a4897 in abort () at /lib64/libc.so.6
#3  0x776f0eee in  () at /lib64/libglib-2.0.so.0
#4  0x7775649a in g_assertion_message_expr () at /lib64/libglib-2.0.so.0
#5  0x55b96134 in page_unlock__debug (pd=0x7ffee8680440) at 
../accel/tcg/tb-maint.c:348
#6  0x55b962a9 in page_unlock (pd=0x7ffee8680440) at 
../accel/tcg/tb-maint.c:397
#7  0x55b96580 in tb_unlock_pages (tb=0x7fffefffeb00) at 
../accel/tcg/tb-maint.c:483
#8  0x55b94698 in cpu_exec_longjmp_cleanup (cpu=0x56566a30) at 
../accel/tcg/cpu-exec.c:556
#9  0x55b954e0 in cpu_exec_setjmp (cpu=0x56566a30, 
sc=0x75350540) at ../accel/tcg/cpu-exec.c:1054
#10 0x55b9557a in cpu_exec (cpu=0x56566a30) at 
../accel/tcg/cpu-exec.c:1083
#11 0x55bb9af6 in tcg_cpus_exec (cpu=0x56566a30) at 
../accel/tcg/tcg-accel-ops.c:75
#12 0x55bba1ae in mttcg_cpu_thread_fn (arg=0x56566a30) at 
../accel/tcg/tcg-accel-ops-mttcg.c:95
#13 0x55dc0af3 in qemu_thread_start (args=0x565ba150) at 
../util/qemu-thread-posix.c:541
#14 0x7730bc64 in start_thread () at /lib64/libc.so.6
#15 0x77393550 in clone3 () at /lib64/libc.so.6

(gdb) frame 5
#5  0x55b96134 in page_unlock__debug (pd=0x7ffee8680440) at 
../accel/tcg/tb-maint.c:348
348 g_assert(page_is_locked(pd));
(gdb) list 348
343 static void page_unlock__debug(const PageDesc *pd)
344 {
345 bool removed;
346
347 ht_pages_locked_debug_init();
348 g_assert(page_is_locked(pd));
349 removed = g_hash_table_remove(ht_pages_locked_debug, pd);
350 g_assert(removed);
351 }
352

(gdb) info threads
  Id   Target IdFrame 
  1Thread 0x763bef40 (LWP 215971) "qemu-system-s39" 0x77385596 
in ppoll () from /lib64/libc.so.6
  2Thread 0x763bb6c0 (LWP 215974) "qemu-system-s39" 0x7738b41d 
in syscall () from /lib64/libc.so.6
* 3Thread 0x753516c0 (LWP 215975) "qemu-system-s39" 0x7730dabc 
in __pthread_kill_implementation () from /lib64/libc.so.6
  4Thread 0x74b506c0 (LWP 215976) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  5Thread 0x7ffeefdff6c0 (LWP 215977) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  6Thread 0x7ffeef5fe6c0 (LWP 215978) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  7Thread 0x7ffeeedfd6c0 (LWP 215979) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  8Thread 0x7ffeee5fc6c0 (LWP 215980) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  9Thread 0x7ffeeddfb6c0 (LWP 215981) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6
  10   Thread 0x7ffeed5fa6c0 (LWP 215982) "qemu-system-s39" 0x7730820e 
in __futex_abstimed_wait_common () from /lib64/libc.so.6

(gdb) thread apply all bt

Thread 10 (Thread 0x7ffeed5fa6c0 (LWP 215982) "qemu-system-s39"):
#0  0x7730820e in __futex_abstimed_wait_common () at /lib64/libc.so.6
#1  0x7730af50 in pthread_cond_wait@@GLIBC_2.3.2 () at /l

Re: [PATCH 3/5] target/arm: Skip granule protection checks for AT instructions

2023-07-21 Thread Jean-Philippe Brucker

On Thu, Jul 20, 2023 at 05:39:56PM +0100, Peter Maydell wrote:
> On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
>  wrote:
> >
> > GPC checks are not performed on the output address for AT instructions,
> > as stated by ARM DDI 0487J in D8.12.2:
> >
> >   When populating PAR_EL1 with the result of an address translation
> >   instruction, granule protection checks are not performed on the final
> >   output address of a successful translation.
> >
> > Rename get_phys_addr_with_secure(), since it's only used to handle AT
> > instructions.
> >
> > Signed-off-by: Jean-Philippe Brucker 
> > ---
> > This incidentally fixes a problem with AT S1E1 instructions which can
> > output an IPA and should definitely not cause a GPC.
> > ---
> >  target/arm/internals.h | 25 ++---
> >  target/arm/helper.c|  8 ++--
> >  target/arm/ptw.c   | 11 ++-
> >  3 files changed, 26 insertions(+), 18 deletions(-)
> >
> > diff --git a/target/arm/internals.h b/target/arm/internals.h
> > index 0f01bc32a8..fc90c364f7 100644
> > --- a/target/arm/internals.h
> > +++ b/target/arm/internals.h
> > @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult {
> >  } GetPhysAddrResult;
> >
> >  /**
> > - * get_phys_addr_with_secure: get the physical address for a virtual 
> > address
> > + * get_phys_addr: get the physical address for a virtual address
> >   * @env: CPUARMState
> >   * @address: virtual address to get physical address for
> >   * @access_type: 0 for read, 1 for write, 2 for execute
> >   * @mmu_idx: MMU index indicating required translation regime
> > - * @is_secure: security state for the access
> >   * @result: set on translation success.
> >   * @fi: set to fault info if the translation fails
> >   *
> > @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult {
> >   *  * for PSMAv5 based systems we don't bother to return a full FSR format
> >   *value.
> >   */
> > -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address,
> > -   MMUAccessType access_type,
> > -   ARMMMUIdx mmu_idx, bool is_secure,
> > -   GetPhysAddrResult *result, ARMMMUFaultInfo 
> > *fi)
> > +bool get_phys_addr(CPUARMState *env, target_ulong address,
> > +   MMUAccessType access_type, ARMMMUIdx mmu_idx,
> > +   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
> >  __attribute__((nonnull));
> 
> 
> What is going on in this bit of the patch ? We already
> have a prototype for get_phys_addr() with a doc comment.
> Is this git's diff-output producing something silly
> for a change that is not logically touching get_phys_addr()'s
> prototype and comment at all?

I swapped the two prototypes in order to make the new comment for
get_phys_addr_with_secure_nogpc() more clear. Tried to convey that
get_phys_addr() is the normal function and
get_phys_addr_with_secure_nogpc() is special. So git is working as
expected and this is a style change, I can switch them back if you prefer.

Thanks,
Jean

> 
> >  /**
> > - * get_phys_addr: get the physical address for a virtual address
> > + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual
> > + *  address
> >   * @env: CPUARMState
> >   * @address: virtual address to get physical address for
> >   * @access_type: 0 for read, 1 for write, 2 for execute
> >   * @mmu_idx: MMU index indicating required translation regime
> > + * @is_secure: security state for the access
> >   * @result: set on translation success.
> >   * @fi: set to fault info if the translation fails
> >   *
> > - * Similarly, but use the security regime of @mmu_idx.
> > + * Similar to get_phys_addr, but use the given security regime and don't 
> > perform
> > + * a Granule Protection Check on the resulting address.
> >   */
> > -bool get_phys_addr(CPUARMState *env, target_ulong address,
> > -   MMUAccessType access_type, ARMMMUIdx mmu_idx,
> > -   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
> > +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong 
> > address,
> > + MMUAccessType access_type,
> > + ARMMMUIdx mmu_idx, bool is_secure,
> > + GetPhysAddrResult *result,
> > + ARMMMUFaultInfo *fi)
> >  __attribute__((nonnull));
> 
> thanks
> -- PMM

Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()

2023-07-21 Thread Philippe Mathieu-Daudé


On 20/7/23 17:58, Peter Maydell wrote:

In the m48t59 device we almost always use 64-bit arithmetic when
dealing with time_t deltas.  The one exception is in set_alarm(),
which currently uses a plain 'int' to hold the difference between two
time_t values.  Switch to int64_t instead to avoid any possible
overflow issues.

Signed-off-by: Peter Maydell 
---
  hw/rtc/m48t59.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Similarly:

-- >8 --
--- a/hw/rtc/m48t59.c
+++ b/hw/rtc/m48t59.c
@@ -88,7 +88,7 @@ static M48txxInfo m48txx_sysbus_info[] = {
 static void alarm_cb (void *opaque)
 {
 struct tm tm;
-uint64_t next_time;
+int64_t next_time;
 M48t59State *NVRAM = opaque;
---

Re: [PATCH for-8.2 2/4] hw/rtc/twl92230: Use int64_t for sec_offset and alm_sec

2023-07-21 Thread Philippe Mathieu-Daudé


On 20/7/23 17:59, Peter Maydell wrote:

In the twl92230 device, use int64_t for the two state fields
sec_offset and alm_sec, because we set these to values that
are either time_t or differences between two time_t values.

These fields aren't saved in vmstate anywhere, so we can
safely widen them.

Signed-off-by: Peter Maydell 
---
I have a suspicion that really these fields *should* be
being migrated, but this device is only used in the n800
and n810 boards, so I'm not going to investigate how broken
migration/vmsave is there...
---
  hw/rtc/twl92230.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH for-8.1] scripts/git-submodule.sh: Don't rely on non-POSIX 'read' behaviour

2023-07-21 Thread Philippe Mathieu-Daudé


On 20/7/23 17:30, Peter Maydell wrote:

The POSIX definition of the 'read' utility requires that you
specify the variable name to set; omitting the name and
having it default to 'REPLY' is a bashism. If your system
sh is dash, then it will print an error message during build:

qemu/pc-bios/s390-ccw/../../scripts/git-submodule.sh: 106: read: arg count

Specify the variable name explicitly.

Fixes: fdb8fd8cb915647b ("git-submodule: allow partial update of 
.git-submodule-status")
Signed-off-by: Peter Maydell 
---
  scripts/git-submodule.sh | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables

2023-07-21 Thread Philippe Mathieu-Daudé


Hi Peter,

On 20/7/23 17:58, Peter Maydell wrote:

This patchset was prompted by a couple of Coverity warnings
(CID 1507157, 1517772) which note that in the m48t59 RTC device model
we keep an offset in a time_t variable but then truncate it by
passing it to qemu_get_timedate(), which currently uses an 'int'
argument for its offset parameter.

We can fix the Coverity complaint by making qemu_get_timedate()
take a time_t; we should also correspondingly make the
qemu_timedate_diff() function return a time_t. However this
will only push the issue out to callers of qemu_timedate_diff()
if they are putting the result in a 32-bit variable or doing
32-bit arithmetic on it.

Luckily there aren't that many callers of qemu_timedate_diff()
and most of them already use either time_t or int64_t for the
calculations they do on its return value. The first three
patches fix devices which weren't doing that; patch four then
fixes the rtc.c functions. If I missed any callsites in devices
then hopefully Coverity will point them out.


PL031State::tick_offset is uint32_t, and pl031_get_count() also
returns that type. Is that expected?

Re: [PATCH for-8.2 4/4] rtc: Use time_t for passing and returning time offsets

2023-07-21 Thread Philippe Mathieu-Daudé


On 20/7/23 17:59, Peter Maydell wrote:

The functions qemu_get_timedate() and qemu_timedate_diff() take
and return a time offset as an integer. Coverity points out that
means that when an RTC device implementation holds an offset
as a time_t, as the m48t59 does, the time_t will get truncated.
(CID 1507157, 1517772).

The functions work with time_t internally, so make them use that type
in their APIs.

Note that this won't help any Y2038 issues where either the device
model itself is keeping the offset in a 32-bit integer, or where the
hardware under emulation has Y2038 or other rollover problems.  If we
missed any cases of the former then hopefully Coverity will warn us
about them since after this patch we'd be truncating a time_t in
assignments from qemu_timedate_diff().)

Signed-off-by: Peter Maydell 
---
  include/sysemu/rtc.h | 4 ++--
  softmmu/rtc.c| 4 ++--
  2 files changed, 4 insertions(+), 4 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

[BUG][CPU hot-plug]CPU hot-plugs cause the qemu process to coredump

2023-07-21 Thread lixianglai

Hello,Recently, when I was developing CPU hot-plugs under the loongarch 
architecture,


I found that there was a problem with qemu cpu hot-plugs under x86 
architecture,


which caused the qemu process coredump when repeatedly inserting and 
unplugging


the CPU when the TCG was accelerated.


The specific operation process is as follows:

1.Use the following command to start the virtual machine

qemu-system-x86_64 \
-machine q35  \
-cpu Broadwell-IBRS \
-smp 1,maxcpus=4,sockets=4,cores=1,threads=1 \
-m 4G \
-drive file=~/anolis-8.8.qcow2  \
-serial stdio   \
-monitor telnet:localhost:4498,server,nowait


2.Enter QEMU Monitor via telnet for repeated CPU insertion and unplugging

telnet 127.0.0.1 4498

(qemu) device_add 
Broadwell-IBRS-x86_64-cpu,socket-id=1,core-id=0,thread-id=0,id=cpu1


(qemu) device_del cpu1
(qemu) device_add 
Broadwell-IBRS-x86_64-cpu,socket-id=1,core-id=0,thread-id=0,id=cpu1



3.You will notice that the QEMU process has a coredump

# malloc(): unsorted double linked list corrupted
Aborted (core dumped)

Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()

2023-07-21 Thread Peter Maydell

On Fri, 21 Jul 2023 at 10:09, Philippe Mathieu-Daudé  wrote:
>
> On 20/7/23 17:58, Peter Maydell wrote:
> > In the m48t59 device we almost always use 64-bit arithmetic when
> > dealing with time_t deltas.  The one exception is in set_alarm(),
> > which currently uses a plain 'int' to hold the difference between two
> > time_t values.  Switch to int64_t instead to avoid any possible
> > overflow issues.
> >
> > Signed-off-by: Peter Maydell 
> > ---
> >   hw/rtc/m48t59.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
>
> Similarly:
>
> -- >8 --
> --- a/hw/rtc/m48t59.c
> +++ b/hw/rtc/m48t59.c
> @@ -88,7 +88,7 @@ static M48txxInfo m48txx_sysbus_info[] = {
>   static void alarm_cb (void *opaque)
>   {
>   struct tm tm;
> -uint64_t next_time;
> +int64_t next_time;
>   M48t59State *NVRAM = opaque;

Why? next_time should always be positive here, I think.

-- PMM

Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables

2023-07-21 Thread Peter Maydell

On Fri, 21 Jul 2023 at 10:03, Philippe Mathieu-Daudé  wrote:
>
> +Markus
>
> On 20/7/23 17:58, Peter Maydell wrote:
> > This patchset was prompted by a couple of Coverity warnings
> > (CID 1507157, 1517772) which note that in the m48t59 RTC device model
> > we keep an offset in a time_t variable but then truncate it by
> > passing it to qemu_get_timedate(), which currently uses an 'int'
> > argument for its offset parameter.
> >
> > We can fix the Coverity complaint by making qemu_get_timedate()
> > take a time_t; we should also correspondingly make the
> > qemu_timedate_diff() function return a time_t. However this
> > will only push the issue out to callers of qemu_timedate_diff()
> > if they are putting the result in a 32-bit variable or doing
> > 32-bit arithmetic on it.
> >
> > Luckily there aren't that many callers of qemu_timedate_diff()
> > and most of them already use either time_t or int64_t for the
> > calculations they do on its return value. The first three
> > patches fix devices which weren't doing that; patch four then
> > fixes the rtc.c functions. If I missed any callsites in devices
> > then hopefully Coverity will point them out.
>
> Do we need to change the type of the RTC_CHANGE event?
>
> This is wrong, but to give an idea:
>
> --- a/qapi/misc.json
> +++ b/qapi/misc.json
> @@ -553,47 +553,47 @@
>   ##
>   # @RTC_CHANGE:
>   #
>   # Emitted when the guest changes the RTC time.
>   #
>   # @offset: offset in seconds between base RTC clock (as specified by
>   # -rtc base), and new RTC clock value
>   #
>   # @qom-path: path to the RTC object in the QOM tree
>   #
>   # Note: This event is rate-limited.  It is not guaranteed that the RTC
>   # in the system implements this event, or even that the system has
>   # an RTC at all.
>   #
>   # Since: 0.13
>   #
>   # Example:
>   #
>   # <- { "event": "RTC_CHANGE",
>   #  "data": { "offset": 78 },
>   #  "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }
>   ##
>   { 'event': 'RTC_CHANGE',
> -  'data': { 'offset': 'int', 'qom-path': 'str' } }
> +  'data': { 'offset': 'int64', 'qom-path': 'str' } }
> ---

If I understand the 'Built-in Types' section in
docs/devel/qapi-code-gen.rst correctly, the QAPI 'int'
type is already 64 bits.

thanks
-- PMM

Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables

2023-07-21 Thread Peter Maydell

On Fri, 21 Jul 2023 at 10:16, Philippe Mathieu-Daudé  wrote:
>
> Hi Peter,
>
> On 20/7/23 17:58, Peter Maydell wrote:
> > This patchset was prompted by a couple of Coverity warnings
> > (CID 1507157, 1517772) which note that in the m48t59 RTC device model
> > we keep an offset in a time_t variable but then truncate it by
> > passing it to qemu_get_timedate(), which currently uses an 'int'
> > argument for its offset parameter.
> >
> > We can fix the Coverity complaint by making qemu_get_timedate()
> > take a time_t; we should also correspondingly make the
> > qemu_timedate_diff() function return a time_t. However this
> > will only push the issue out to callers of qemu_timedate_diff()
> > if they are putting the result in a 32-bit variable or doing
> > 32-bit arithmetic on it.
> >
> > Luckily there aren't that many callers of qemu_timedate_diff()
> > and most of them already use either time_t or int64_t for the
> > calculations they do on its return value. The first three
> > patches fix devices which weren't doing that; patch four then
> > fixes the rtc.c functions. If I missed any callsites in devices
> > then hopefully Coverity will point them out.
>
> PL031State::tick_offset is uint32_t, and pl031_get_count() also
> returns that type. Is that expected?

I think those fall into the category of "the device we are
modelling does not support 64-bit timestamps" -- the PL031
RTC_DR register is only 32 bits.

thanks
-- PMM

[PATCH 0/2] riscv: Fix the console of the Spike machine on big endian hosts

2023-07-21 Thread Thomas Huth

The tests/avocado/riscv_opensbi.py avocado test is currently failing
on big endian hosts since the console of the Spike machine is not
working there. With two small patches, this can be fixed: First patch
fixes riscv64, and the second one fixes riscv32.

Thomas Huth (2):
  hw/char/riscv_htif: Fix printing of console characters on big endian
hosts
  hw/char/riscv_htif: Fix the console syscall on big endian hosts

 hw/char/riscv_htif.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

-- 
2.39.3

[PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts

2023-07-21 Thread Thomas Huth

The character that should be printed is stored in the 64 bit "payload"
variable. The code currently tries to print it by taking the address
of the variable and passing this pointer to qemu_chr_fe_write(). However,
this only works on little endian hosts where the least significant bits
are stored on the lowest address. To do this in a portable way, we have
to store the value in an uint8_t variable instead.

Fixes: 5033606780 ("RISC-V HTIF Console")
Signed-off-by: Thomas Huth 
---
 hw/char/riscv_htif.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c
index 37d3ccc76b..f96df40124 100644
--- a/hw/char/riscv_htif.c
+++ b/hw/char/riscv_htif.c
@@ -232,7 +232,8 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t 
val_written)
 s->tohost = 0; /* clear to indicate we read */
 return;
 } else if (cmd == HTIF_CONSOLE_CMD_PUTC) {
-qemu_chr_fe_write(&s->chr, (uint8_t *)&payload, 1);
+uint8_t ch = (uint8_t)payload;
+qemu_chr_fe_write(&s->chr, &ch, 1);
 resp = 0x100 | (uint8_t)payload;
 } else {
 qemu_log("HTIF device %d: unknown command\n", device);
-- 
2.39.3

[PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts

2023-07-21 Thread Thomas Huth

Values that have been read via cpu_physical_memory_read() from the
guest's memory have to be swapped in case the host endianess differs
from the guest.

Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall")
Signed-off-by: Thomas Huth 
---
 hw/char/riscv_htif.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c
index f96df40124..40de6b8b77 100644
--- a/hw/char/riscv_htif.c
+++ b/hw/char/riscv_htif.c
@@ -30,6 +30,7 @@
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
 #include "exec/address-spaces.h"
+#include "exec/tswap.h"
 #include "sysemu/dma.h"
 
 #define RISCV_DEBUG_HTIF 0
@@ -209,11 +210,11 @@ static void htif_handle_tohost_write(HTIFState *s, 
uint64_t val_written)
 } else {
 uint64_t syscall[8];
 cpu_physical_memory_read(payload, syscall, sizeof(syscall));
-if (syscall[0] == PK_SYS_WRITE &&
-syscall[1] == HTIF_DEV_CONSOLE &&
-syscall[3] == HTIF_CONSOLE_CMD_PUTC) {
+if (tswap64(syscall[0]) == PK_SYS_WRITE &&
+tswap64(syscall[1]) == HTIF_DEV_CONSOLE &&
+tswap64(syscall[3]) == HTIF_CONSOLE_CMD_PUTC) {
 uint8_t ch;
-cpu_physical_memory_read(syscall[2], &ch, 1);
+cpu_physical_memory_read(tswap64(syscall[2]), &ch, 1);
 qemu_chr_fe_write(&s->chr, &ch, 1);
 resp = 0x100 | (uint8_t)payload;
 } else {
-- 
2.39.3

Re: [PULL 0/5] Linux user brk fixes patches

2023-07-21 Thread Peter Maydell

On Wed, 19 Jul 2023 at 16:53, Helge Deller  wrote:
>
> The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7:
>
>   Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
> staging (2023-07-17 15:49:27 +0100)
>
> are available in the Git repository at:
>
>   https://github.com/hdeller/qemu-hppa.git 
> tags/linux-user-brk-fixes-pull-request
>
> for you to fetch changes up to 518f32221af759a29500ac172c4c857bef142067:
>
>   linux-user: Fix qemu-arm to run static armhf binaries (2023-07-18 20:42:05 
> +0200)
>
> 
> linux-user: brk() syscall fixes and armhf static binary fix
>
> Commit 86f04735ac ("linux-user: Fix brk() to release pages") introduced
> the possibility for userspace applications to reduce memory footprint by
> calling brk() with a lower address and as such free up memory, the same
> way as the Linux kernel allows on physical machines.
>
> This change introduced some failures for applications with errors like
> - accesing bytes above the brk heap address on the same page,
> - freeing memory below the initial brk address,
> and introduced a behaviour which isn't done by the kernel (e.g. zeroing
> memory above brk).
>
> This patch series fixes those issues and has been tested with existing
> programs (e.g. upx).
>
> Additionally one patch fixes running static armhf executables (e.g. fstype)
> which was broken since qemu-8.0.
>
> Changes in v2:
> - dropped patch to revert d28b3c90cfad ("linux-user: Make sure initial brk(0)
>   is page-aligned")
> - rephrased some commit messages
> - fixed Cc email addresses, added new ones
> - added R-b tags
>
> Helge


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1
for any user-visible changes.

-- PMM

Re: [PULL 00/14] NBD patches for 2023-07-19

2023-07-21 Thread Peter Maydell

On Wed, 19 Jul 2023 at 21:39, Eric Blake  wrote:
>
> The following changes since commit 2c27fdc7a626408ee2cf30d791aa0b63027c7404:
>
>   Update version for v8.1.0-rc0 release (2023-07-19 20:31:43 +0100)
>
> are available in the Git repository at:
>
>   https://repo.or.cz/qemu/ericb.git tags/pull-nbd-2023-07-19
>
> for you to fetch changes up to bfe04d0a7d5e8a4f4c9014ee7622af2056685974:
>
>   nbd: Use enum for various negotiation modes (2023-07-19 15:26:13 -0500)
>
> 
> NBD patches through 2023-07-19
>
> - Denis V. Lunev: fix hang with 'ssh ... "qemu-nbd -c"'
> - Eric Blake: preliminary work towards NBD 64-bit extensions
>


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1
for any user-visible changes.

-- PMM

Re: [PATCH-for-8.1] target/tricore: Rename tricore_feature

2023-07-21 Thread Bastian Koppelmann

On Fri, Jul 21, 2023 at 10:54:43AM +0200, Philippe Mathieu-Daudé wrote:
> On 21/7/23 09:31, Philippe Mathieu-Daudé wrote:
> > On 21/7/23 08:06, Bastian Koppelmann wrote:
> > > this name is used by capstone and will lead to a build failure of QEMU,
> > > when capstone is enabled. So we rename it to tricore_has_feature(), to
> > > match has_feature() in translate.c.
> > > 
> > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774
> > > Signed-off-by: Bastian Koppelmann 
> > > ---
> > >   target/tricore/cpu.c   | 8 
> > >   target/tricore/cpu.h   | 2 +-
> > >   target/tricore/helper.c    | 4 ++--
> > >   target/tricore/op_helper.c | 4 ++--
> > >   4 files changed, 9 insertions(+), 9 deletions(-)
> > 
> > Reviewed-by: Philippe Mathieu-Daudé 
> 
> BTW Bastian if you want I can squeeze this single patch in
> my mips-fixes PR I'm planning.

Thanks, Phil. Go ahead.

Cheers,
Bastian

Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts

2023-07-21 Thread Bin Meng

On Fri, Jul 21, 2023 at 5:48 PM Thomas Huth  wrote:
>
> The character that should be printed is stored in the 64 bit "payload"
> variable. The code currently tries to print it by taking the address
> of the variable and passing this pointer to qemu_chr_fe_write(). However,
> this only works on little endian hosts where the least significant bits
> are stored on the lowest address. To do this in a portable way, we have
> to store the value in an uint8_t variable instead.
>
> Fixes: 5033606780 ("RISC-V HTIF Console")
> Signed-off-by: Thomas Huth 
> ---
>  hw/char/riscv_htif.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>

Reviewed-by: Bin Meng

Re: [PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts

2023-07-21 Thread Bin Meng

On Fri, Jul 21, 2023 at 5:48 PM Thomas Huth  wrote:
>
> Values that have been read via cpu_physical_memory_read() from the
> guest's memory have to be swapped in case the host endianess differs

typo: endianness

> from the guest.
>
> Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall")
> Signed-off-by: Thomas Huth 
> ---
>  hw/char/riscv_htif.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>

Reviewed-by: Bin Meng

Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts

2023-07-21 Thread Daniel Henrique Barboza





On 7/21/23 06:47, Thomas Huth wrote:

The character that should be printed is stored in the 64 bit "payload"
variable. The code currently tries to print it by taking the address
of the variable and passing this pointer to qemu_chr_fe_write(). However,
this only works on little endian hosts where the least significant bits
are stored on the lowest address. To do this in a portable way, we have
to store the value in an uint8_t variable instead.

Fixes: 5033606780 ("RISC-V HTIF Console")
Signed-off-by: Thomas Huth 
---


Reviewed-by: Daniel Henrique Barboza 


  hw/char/riscv_htif.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c
index 37d3ccc76b..f96df40124 100644
--- a/hw/char/riscv_htif.c
+++ b/hw/char/riscv_htif.c
@@ -232,7 +232,8 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t 
val_written)
  s->tohost = 0; /* clear to indicate we read */
  return;
  } else if (cmd == HTIF_CONSOLE_CMD_PUTC) {
-qemu_chr_fe_write(&s->chr, (uint8_t *)&payload, 1);
+uint8_t ch = (uint8_t)payload;
+qemu_chr_fe_write(&s->chr, &ch, 1);
  resp = 0x100 | (uint8_t)payload;
  } else {
  qemu_log("HTIF device %d: unknown command\n", device);

Re: [PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts

2023-07-21 Thread Daniel Henrique Barboza





On 7/21/23 06:47, Thomas Huth wrote:

Values that have been read via cpu_physical_memory_read() from the
guest's memory have to be swapped in case the host endianess differs
from the guest.

Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall")
Signed-off-by: Thomas Huth 
---


Reviewed-by: Daniel Henrique Barboza 


  hw/char/riscv_htif.c | 9 +
  1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c
index f96df40124..40de6b8b77 100644
--- a/hw/char/riscv_htif.c
+++ b/hw/char/riscv_htif.c
@@ -30,6 +30,7 @@
  #include "qemu/timer.h"
  #include "qemu/error-report.h"
  #include "exec/address-spaces.h"
+#include "exec/tswap.h"
  #include "sysemu/dma.h"
  
  #define RISCV_DEBUG_HTIF 0

@@ -209,11 +210,11 @@ static void htif_handle_tohost_write(HTIFState *s, 
uint64_t val_written)
  } else {
  uint64_t syscall[8];
  cpu_physical_memory_read(payload, syscall, sizeof(syscall));
-if (syscall[0] == PK_SYS_WRITE &&
-syscall[1] == HTIF_DEV_CONSOLE &&
-syscall[3] == HTIF_CONSOLE_CMD_PUTC) {
+if (tswap64(syscall[0]) == PK_SYS_WRITE &&
+tswap64(syscall[1]) == HTIF_DEV_CONSOLE &&
+tswap64(syscall[3]) == HTIF_CONSOLE_CMD_PUTC) {
  uint8_t ch;
-cpu_physical_memory_read(syscall[2], &ch, 1);
+cpu_physical_memory_read(tswap64(syscall[2]), &ch, 1);
  qemu_chr_fe_write(&s->chr, &ch, 1);
  resp = 0x100 | (uint8_t)payload;
  } else {

Re: [PATCH 1/2] Reduce vdpa initialization / startup overhead

2023-07-21 Thread Eugenio Perez Martin

On Tue, Jul 18, 2023 at 12:55 PM Michael S. Tsirkin  wrote:
>
> On Thu, Apr 20, 2023 at 10:59:56AM +0200, Eugenio Perez Martin wrote:
> > On Thu, Apr 20, 2023 at 7:25 AM Pei Li  wrote:
> > >
> > > Hi all,
> > >
> > > My bad, I just submitted the kernel patch. If we are passing some generic 
> > > command, still we have to add an additional field in the structure to 
> > > indicate what is the unbatched version of this command, and the struct 
> > > vhost_ioctls would be some command specific structure. In summary, the 
> > > structure would be something like
> > > struct vhost_cmd_batch {
> > > int ncmds;
> > > int cmd;
> >
> > The unbatched version should go in each vhost_ioctls. That allows us
> > to send many different commands in one ioctl instead of having to
> > resort to many ioctls, each one for a different task.
> >
> > The problem with that is the size of that struct vhost_ioctl, so we
> > can build an array. I think it should be enough with the biggest of
> > them (vhost_vring_addr ?) for a long time, but I would like to know if
> > anybody finds a drawback here. We could always resort to pointers if
> > we find we need more space, or start with them from the beginning.
> >
> > CCing Si-Wei here too, as he is also interested in reducing the startup 
> > time.
> >
> > Thanks!
>
> And copying my response too:
> This is all very exciting, but what exactly is the benefit?
> No optimization patches are going to be merged without
> numbers showing performance gains.
> In this case, can you show gains in process startup time?
> Are they significant enough to warrant adding new UAPI?
>

This should have been marked as RFC in that regard.

When this was sent it was one of the planned actions to reduce
overhead. After Si-Wei benchmarks, all the efforts will focus on
reducing the pinning / maps for the moment. It is unlikely that this
will be moved forward soon.

Thanks!


>
>
> > > struct vhost_ioctls[];
> > > };
> > >
> > > This is doable. Also, this is my first time submitting patches to open 
> > > source, sorry about the mess in advance. That being said, feel free to 
> > > throw questions / comments!
> > >
> > > Thanks and best regards,
> > > Pei
> > >
> > > On Wed, Apr 19, 2023 at 9:19 PM Jason Wang  wrote:
> > >>
> > >> On Wed, Apr 19, 2023 at 11:33 PM Eugenio Perez Martin
> > >>  wrote:
> > >> >
> > >> > On Wed, Apr 19, 2023 at 12:56 AM  wrote:
> > >> > >
> > >> > > From: Pei Li 
> > >> > >
> > >> > > Currently, part of the vdpa initialization / startup process
> > >> > > needs to trigger many ioctls per vq, which is very inefficient
> > >> > > and causing unnecessary context switch between user mode and
> > >> > > kernel mode.
> > >> > >
> > >> > > This patch creates an additional ioctl() command, namely
> > >> > > VHOST_VDPA_GET_VRING_GROUP_BATCH, that will batching
> > >> > > commands of VHOST_VDPA_GET_VRING_GROUP into a single
> > >> > > ioctl() call.
> > >>
> > >> I'd expect there's a kernel patch but I didn't see that?
> > >>
> > >> If we want to go this way. Why simply have a more generic way, that is
> > >> introducing something like:
> > >>
> > >> VHOST_CMD_BATCH which did something like
> > >>
> > >> struct vhost_cmd_batch {
> > >> int ncmds;
> > >> struct vhost_ioctls[];
> > >> };
> > >>
> > >> Then you can batch other ioctls other than GET_VRING_GROUP?
> > >>
> > >> Thanks
> > >>
> > >> > >
> > >> >
> > >> > It seems to me you forgot to send the 0/2 cover letter :).
> > >> >
> > >> > Please include the time we save thanks to avoiding the repetitive
> > >> > ioctls in each patch.
> > >> >
> > >> > CCing Jason and Michael.
> > >> >
> > >> > > Signed-off-by: Pei Li 
> > >> > > ---
> > >> > >  hw/virtio/vhost-vdpa.c   | 31 
> > >> > > +++-
> > >> > >  include/standard-headers/linux/vhost_types.h |  3 ++
> > >> > >  linux-headers/linux/vhost.h  |  7 +
> > >> > >  3 files changed, 33 insertions(+), 8 deletions(-)
> > >> > >
> > >> > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > >> > > index bc6bad23d5..6d45ff8539 100644
> > >> > > --- a/hw/virtio/vhost-vdpa.c
> > >> > > +++ b/hw/virtio/vhost-vdpa.c
> > >> > > @@ -679,7 +679,8 @@ static int vhost_vdpa_set_backend_cap(struct 
> > >> > > vhost_dev *dev)
> > >> > >  uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
> > >> > >  0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
> > >> > >  0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
> > >> > > -0x1ULL << VHOST_BACKEND_F_SUSPEND;
> > >> > > +0x1ULL << VHOST_BACKEND_F_SUSPEND |
> > >> > > +0x1ULL << VHOST_BACKEND_F_IOCTL_BATCH;
> > >> > >  int r;
> > >> > >
> > >> > >  if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, 
> > >> > > &features)) {
> > >> > > @@ -731,14 +732,28 @@ static int vhost_vdpa_get_vq_index(struct 
> > >> > > vhost_dev *dev, int idx)
> > >> > >
> > >> > >  static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
> > >> > >  {
> > >> > >

[PATCH] numa: Skip invalidation of cluster and NUMA node boundary for qtest

2023-07-21 Thread Gavin Shan

There are warning messages printed from tests/qtest/numa-test.c,
to complain the CPU cluster and NUMA node boundary is broken. Since
the broken boundary is expected, we don't want to see the warning
messages.

  # cd /home/gavin/sandbox/qemu.main/build
  # MALLOC_PERTURB_=255 QTEST_QEMU_BINARY=./qemu-system-aarch64   \
G_TEST_DBUS_DAEMON=../tests/dbus-vmstate-daemon.sh\
QTEST_QEMU_IMG=./qemu-img \
QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
tests/qtest/numa-test --tap -k
  :
qemu-system-aarch64: warning: CPU-0 and CPU-4 in socket-0-cluster-0   \
have been associated with node-0 and node-1 respectively. \
It can cause OSes like Linux to misbehave
  :

Skip the invalidation of CPU cluster and NUMA node boundary when
qtest is enabled, to avoid the warning messages.

Fixes: a494fdb715 ("numa: Validate cluster and NUMA node boundary if required")
Signed-off-by: Gavin Shan 
---
 hw/core/machine.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index f0d35c6401..1c5136e735 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -1271,7 +1271,7 @@ static void 
validate_cpu_cluster_to_numa_boundary(MachineState *ms)
 const CPUArchId *cpus = possible_cpus->cpus;
 int i, j;
 
-if (state->num_nodes <= 1 || possible_cpus->len <= 1) {
+if (qtest_enabled() || state->num_nodes <= 1 || possible_cpus->len <= 1) {
 return;
 }
 
-- 
2.41.0

Re: [PULL 03/18] numa: Validate cluster and NUMA node boundary if required

2023-07-21 Thread Gavin Shan




On 7/20/23 23:10, Peter Maydell wrote:

On Mon, 26 Jun 2023 at 12:15, Paolo Bonzini  wrote:


From: Gavin Shan 

For some architectures like ARM64, multiple CPUs in one cluster can be
associated with different NUMA nodes, which is irregular configuration
because we shouldn't have this in baremetal environment. The irregular
configuration causes Linux guest to misbehave, as the following warning
messages indicate.

   -smp 6,maxcpus=6,sockets=2,clusters=1,cores=3,threads=1 \
   -numa node,nodeid=0,cpus=0-1,memdev=ram0\
   -numa node,nodeid=1,cpus=2-3,memdev=ram1\
   -numa node,nodeid=2,cpus=4-5,memdev=ram2\


Hi. This new warning shows up a lot in "make check" output:

$ grep -c 'can cause OSes' /tmp/parn3ofA.par
44

Looks like this is all in the qtest-aarch64/numa-test test.

Please can you investigate and either:
  (1) fix the test not to do the bad thing that's causing the warning
  (2) change the warning so it doesn't show up in stderr when
  running a correct and passing test
?



Yes, all the warning messages come from tests/qtest/numa-test.c. There
are 3 configurations where the boundary of CPU cluster and NUMA node is
broken as expected. I've sent a patch to disable the validation for qtest.

https://lists.nongnu.org/archive/html/qemu-arm/2023-07/msg00440.html

With the patch applied, I didn't see similar warning messages from
"make -j 40 check-qtest".

Thanks,
Gavin

[PATCH] vhost-user-scsi: support reconnect to backend

2023-07-21 Thread Li Feng

If the backend crashes and restarts, the device is broken.
This patch adds reconnect for vhost-user-scsi.

Tested with spdk backend.

Signed-off-by: Li Feng 
---
 hw/block/vhost-user-blk.c   |   2 -
 hw/scsi/vhost-scsi-common.c |  27 ++---
 hw/scsi/vhost-user-scsi.c   | 163 +---
 include/hw/virtio/vhost-user-scsi.h |   3 +
 include/hw/virtio/vhost.h   |   2 +
 5 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index eecf3f7a81..f250c740b5 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -32,8 +32,6 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
 
-#define REALIZE_CONNECTION_RETRIES 3
-
 static const int user_feature_bits[] = {
 VIRTIO_BLK_F_SIZE_MAX,
 VIRTIO_BLK_F_SEG_MAX,
diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
index a06f01af26..08801886b8 100644
--- a/hw/scsi/vhost-scsi-common.c
+++ b/hw/scsi/vhost-scsi-common.c
@@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
 
 vsc->dev.acked_features = vdev->guest_features;
 
-assert(vsc->inflight == NULL);
-vsc->inflight = g_new0(struct vhost_inflight, 1);
-ret = vhost_dev_get_inflight(&vsc->dev,
- vs->conf.virtqueue_size,
- vsc->inflight);
+ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
 if (ret < 0) {
-error_report("Error get inflight: %d", -ret);
+error_report("Error setting inflight format: %d", -ret);
 goto err_guest_notifiers;
 }
 
+if (!vsc->inflight->addr) {
+ret = vhost_dev_get_inflight(&vsc->dev,
+vs->conf.virtqueue_size,
+vsc->inflight);
+if (ret < 0) {
+error_report("Error get inflight: %d", -ret);
+goto err_guest_notifiers;
+}
+}
+
 ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
 if (ret < 0) {
 error_report("Error set inflight: %d", -ret);
@@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
 return ret;
 
 err_guest_notifiers:
-g_free(vsc->inflight);
-vsc->inflight = NULL;
-
 k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false);
 err_host_notifiers:
 vhost_dev_disable_notifiers(&vsc->dev, vdev);
@@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc)
 }
 assert(ret >= 0);
 
-if (vsc->inflight) {
-vhost_dev_free_inflight(vsc->inflight);
-g_free(vsc->inflight);
-vsc->inflight = NULL;
-}
-
 vhost_dev_disable_notifiers(&vsc->dev, vdev);
 }
 
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index ee99b19e7a..e0e88b0c42 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 {
 }
 
+static int vhost_user_scsi_connect(DeviceState *dev, Error **errp)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+int ret = 0;
+
+if (s->connected) {
+return 0;
+}
+s->connected = true;
+
+vsc->dev.num_queues = vs->conf.num_queues;
+vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues;
+vsc->dev.vqs = s->vhost_vqs;
+vsc->dev.vq_index = 0;
+vsc->dev.backend_features = 0;
+
+ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
+ errp);
+if (ret < 0) {
+return ret;
+}
+
+/* restore vhost state */
+if (virtio_device_started(vdev, vdev->status)) {
+ret = vhost_scsi_common_start(vsc);
+if (ret < 0) {
+return ret;
+}
+}
+
+return 0;
+}
+
+static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event);
+
+static void vhost_user_scsi_disconnect(DeviceState *dev)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+
+if (!s->connected) {
+return;
+}
+s->connected = false;
+
+vhost_scsi_common_stop(vsc);
+
+vhost_dev_cleanup(&vsc->dev);
+
+/* Re-instate the event handler for new connections */
+qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL,
+ vhost_user_scsi_event, NULL, dev, NULL, true);
+}
+
+static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event)
+{
+DeviceState *dev = opaque;
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VHostUserSCSI *s = VHOST_USER_SCSI(vdev);
+VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
+VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+Error *local_err = NULL;
+
+switch (event)

Re: [PATCH v6 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-21 Thread Stefan Hajnoczi

Is there a test case along the lines of tests/qtest/ahci-test.c that
exercises this new code?

It's important to have at least a basic test that initializes the
device and performs a write followed by a read to verify that I/O is
working. This will allow the CI system and anyone who modifies the UFS
code to check that the basics are still working.

Thanks,
Stefan

RE: [PATCH v1] migration: refactor migration_completion

2023-07-21 Thread Wang, Wei W

On Friday, July 21, 2023 4:38 AM, Peter Xu wrote:
> Looks good to me, after addressing Isaku's comments.
> 
> The current_active_state is very unfortunate, along with most of the calls to
> migrate_set_state() - I bet most of the code will definitely go wrong if that
> cmpxchg didn't succeed inside of migrate_set_state(), IOW in most cases we
> simply always want:

Can you share examples where it could be wrong?
(If it has bugs, we need to fix)

> 
>   migrate_set_state(&s->state, s->state, XXX);
> 
> Not sure whether one pre-requisite patch is good to have so we can rename
> migrate_set_state() to something like __migrate_set_state(), then:
> 
>   migrate_set_state(s, XXX) {
> __migrate_set_state(&s->state, s->state, XXX);
>   }
> 
> I don't even know whether there's any call site that will need
> __migrate_set_state() for real..
> 

Seems this would break the use of "MIGRATION_STATUS_CANCELLING".
For example, 
- In migration_maybe_pause:
migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
new_state);
If the current s->state isn't MIGRATION_STATUS_PRE_SWITCHOVER (could
be MIGRATION_STATUS_CANCELLING),  then s->state won’t be updated to
new_state.
- Then, in migration_completion, the following update to s->state won't succeed:
   migrate_set_state(&s->state, current_active_state,
  MIGRATION_STATUS_COMPLETED);

- Finally, when reaching migration_iteration_finish(), s->state is
MIGRATION_STATUS_CANCELLING, instead of MIGRATION_STATUS_COMPLETED.

Re: [PATCH v21 01/20] s390x/cpu topology: add s390 specifics to CPU topology

2023-07-21 Thread Pierre Morel




On 7/18/23 18:31, Nina Schoetterl-Glausch wrote:

Reviewed-by: Nina Schoetterl-Glausch 

Some notes below.

The s390x/ prefix in the title might suggest that this patch
is s390 specific, but it touches common files.



Right.

What do you suggest?

I can cut it in two or squash it with patch number 2.

The first idea was to separate the patch to ease the review but the
functionality introduced in patch 1 do only make sense with patch 2.

So I would be for squashing the first two patches.

?




On Fri, 2023-06-30 at 11:17 +0200, Pierre Morel wrote:

S390 adds two new SMP levels, drawers and books to the CPU
topology.
The S390 CPU have specific topology features like dedication

S390 CPUs have specific topology features like dedication and
entitlement. These indicate to the guest information on host
vCPU scheduling and help the guest make better scheduling decisions.


and entitlement to give to the guest indications on the host
vCPUs scheduling and help the guest take the best decisions
on the scheduling of threads on the vCPUs.

Let us provide the SMP properties with books and drawers levels
and S390 CPU with dedication and entitlement,

Signed-off-by: Pierre Morel 
---
  qapi/machine-common.json    | 22 +
  qapi/machine.json   | 21 ++---
  include/hw/boards.h | 10 +-
  include/hw/qdev-properties-system.h |  4 +++
  target/s390x/cpu.h  |  6 
  hw/core/machine-smp.c   | 48 ---
--
  hw/core/machine.c   |  4 +++
  hw/core/qdev-properties-system.c    | 13 
  hw/s390x/s390-virtio-ccw.c  |  2 ++
  softmmu/vl.c    |  6 
  target/s390x/cpu.c  |  7 +
  qapi/meson.build    |  1 +
  qemu-options.hx |  7 +++--
  13 files changed, 137 insertions(+), 14 deletions(-)
  create mode 100644 qapi/machine-common.json

diff --git a/qapi/machine-common.json b/qapi/machine-common.json
new file mode 100644
index 00..bc0d76829c
--- /dev/null
+++ b/qapi/machine-common.json
@@ -0,0 +1,22 @@
+# -*- Mode: Python -*-
+# vim: filetype=python
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
later.
+# See the COPYING file in the top-level directory.
+
+##
+# = Machines S390 data types

Common definitions for machine.json and machine-target.json


[...]

Re: [PATCH] numa: Skip invalidation of cluster and NUMA node boundary for qtest

2023-07-21 Thread Peter Maydell

On Fri, 21 Jul 2023 at 11:44, Gavin Shan  wrote:
>
> There are warning messages printed from tests/qtest/numa-test.c,
> to complain the CPU cluster and NUMA node boundary is broken. Since
> the broken boundary is expected, we don't want to see the warning
> messages.
>
>   # cd /home/gavin/sandbox/qemu.main/build
>   # MALLOC_PERTURB_=255 QTEST_QEMU_BINARY=./qemu-system-aarch64   \
> G_TEST_DBUS_DAEMON=../tests/dbus-vmstate-daemon.sh\
> QTEST_QEMU_IMG=./qemu-img \
> QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
> tests/qtest/numa-test --tap -k
>   :
> qemu-system-aarch64: warning: CPU-0 and CPU-4 in socket-0-cluster-0   \
> have been associated with node-0 and node-1 respectively. \
> It can cause OSes like Linux to misbehave
>   :
>
> Skip the invalidation of CPU cluster and NUMA node boundary when
> qtest is enabled, to avoid the warning messages.
>
> Fixes: a494fdb715 ("numa: Validate cluster and NUMA node boundary if 
> required")
> Signed-off-by: Gavin Shan 

Reviewed-by: Peter Maydell 

thanks
-- PMM

Re: [PATCH for-8.2 4/6] vfio/migration: Refactor PRE_COPY and RUNNING state checks

2023-07-21 Thread Cédric Le Goater


On 7/16/23 10:15, Avihai Horon wrote:

From: Joao Martins 

Move the PRE_COPY and RUNNING state checks to helper functions.

This is in preparation for adding P2P VFIO migration support, where
these helpers will also test for PRE_COPY_P2P and RUNNING_P2P states.

Signed-off-by: Joao Martins 
Signed-off-by: Avihai Horon 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  include/hw/vfio/vfio-common.h |  2 ++
  hw/vfio/common.c  | 22 ++
  hw/vfio/migration.c   | 10 --
  3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index da43d27352..e9b8954595 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -230,6 +230,8 @@ void vfio_unblock_multiple_devices_migration(void);
  bool vfio_viommu_preset(VFIODevice *vbasedev);
  int64_t vfio_mig_bytes_transferred(void);
  void vfio_reset_bytes_transferred(void);
+bool vfio_device_state_is_running(VFIODevice *vbasedev);
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev);
  
  #ifdef CONFIG_LINUX

  int vfio_get_region_info(VFIODevice *vbasedev, int index,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9aac21abb7..16cf79a76c 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -437,6 +437,20 @@ static void vfio_set_migration_error(int err)
  }
  }
  
+bool vfio_device_state_is_running(VFIODevice *vbasedev)

+{
+VFIOMigration *migration = vbasedev->migration;
+
+return migration->device_state == VFIO_DEVICE_STATE_RUNNING;
+}
+
+bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+
+return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+}
+
  static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
  {
  VFIOGroup *group;
@@ -457,8 +471,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer 
*container)
  }
  
  if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&

-(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
- migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
+(vfio_device_state_is_running(vbasedev) ||
+ vfio_device_state_is_precopy(vbasedev))) {
  return false;
  }
  }
@@ -503,8 +517,8 @@ static bool 
vfio_devices_all_running_and_mig_active(VFIOContainer *container)
  return false;
  }
  
-if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||

-migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+if (vfio_device_state_is_running(vbasedev) ||
+vfio_device_state_is_precopy(vbasedev)) {
  continue;
  } else {
  return false;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 8acd182a8b..48f9c23cbe 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -411,7 +411,7 @@ static void vfio_state_pending_estimate(void *opaque, 
uint64_t *must_precopy,
  VFIODevice *vbasedev = opaque;
  VFIOMigration *migration = vbasedev->migration;
  
-if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {

+if (!vfio_device_state_is_precopy(vbasedev)) {
  return;
  }
  
@@ -444,7 +444,7 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,

  vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
  *must_precopy += stop_copy_size;
  
-if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {

+if (vfio_device_state_is_precopy(vbasedev)) {
  vfio_query_precopy_size(migration);
  
  *must_precopy +=

@@ -459,9 +459,8 @@ static void vfio_state_pending_exact(void *opaque, uint64_t 
*must_precopy,
  static bool vfio_is_active_iterate(void *opaque)
  {
  VFIODevice *vbasedev = opaque;
-VFIOMigration *migration = vbasedev->migration;
  
-return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;

+return vfio_device_state_is_precopy(vbasedev);
  }
  
  static int vfio_save_iterate(QEMUFile *f, void *opaque)

@@ -656,7 +655,6 @@ static const SaveVMHandlers savevm_vfio_handlers = {
  static void vfio_vmstate_change(void *opaque, bool running, RunState state)
  {
  VFIODevice *vbasedev = opaque;
-VFIOMigration *migration = vbasedev->migration;
  enum vfio_device_mig_state new_state;
  int ret;
  
@@ -664,7 +662,7 @@ static void vfio_vmstate_change(void *opaque, bool running, RunState state)

  new_state = VFIO_DEVICE_STATE_RUNNING;
  } else {
  new_state =
-(migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
+(vfio_device_state_is_precopy(vbasedev) &&
   (state == RUN_STATE_FINISH_MIGRATE || state == 
RUN_STATE_PAUSED)) ?
  VFIO_DEVICE_STATE_STOP_COPY :
  VFIO_DEVICE_STATE_STOP;

Re: [PATCH for-8.2 5/6] vfio/migration: Add P2P support for VFIO migration

2023-07-21 Thread Cédric Le Goater


On 7/16/23 10:15, Avihai Horon wrote:

VFIO migration uAPI defines an optional intermediate P2P quiescent
state. While in the P2P quiescent state, P2P DMA transactions cannot be
initiated by the device, but the device can respond to incoming ones.
Additionally, all outstanding P2P transactions are guaranteed to have
been completed by the time the device enters this state.

The purpose of this state is to support migration of multiple devices
that are doing P2P transactions between themselves.

Add support for P2P migration by transitioning all the devices to the
P2P quiescent state before stopping or starting the devices. Use the new
VMChangeStateHandler pre_change_cb to achieve that behavior.

This will allow migration of multiple VFIO devices if all of them
support P2P migration.


LGTM, one small comment below


Signed-off-by: Avihai Horon 
---
  docs/devel/vfio-migration.rst | 93 +--
  hw/vfio/common.c  |  6 ++-
  hw/vfio/migration.c   | 58 +-
  hw/vfio/trace-events  |  2 +-
  4 files changed, 107 insertions(+), 52 deletions(-)

diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index b433cb5bb2..b9c57ba651 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -23,9 +23,21 @@ and recommends that the initial bytes are sent and loaded in 
the destination
  before stopping the source VM. Enabling this migration capability will
  guarantee that and thus, can potentially reduce downtime even further.
  
-Note that currently VFIO migration is supported only for a single device. This

-is due to VFIO migration's lack of P2P support. However, P2P support is planned
-to be added later on.
+To support migration of multiple devices that are doing P2P transactions
+between themselves, VFIO migration uAPI defines an intermediate P2P quiescent
+state. While in the P2P quiescent state, P2P DMA transactions cannot be
+initiated by the device, but the device can respond to incoming ones.
+Additionally, all outstanding P2P transactions are guaranteed to have been
+completed by the time the device enters this state.
+
+All the devices that support P2P migration are first transitioned to the P2P
+quiescent state and only then are they stopped or started. This makes migration
+safe P2P-wise, since starting and stopping the devices is not done atomically
+for all the devices together.
+
+Thus, multiple VFIO devices migration is allowed only if all the devices
+support P2P migration. Single VFIO device migration is allowed regardless of
+P2P migration support.
  
  A detailed description of the UAPI for VFIO device migration can be found in

  the comment for the ``vfio_device_mig_state`` structure in the header file
@@ -132,54 +144,63 @@ will be blocked.
  Flow of state changes during Live migration
  ===
  
-Below is the flow of state change during live migration.

+Below is the state change flow during live migration for a VFIO device that
+supports both precopy and P2P migration. The flow for devices that don't
+support it is similar, except that the relevant states for precopy and P2P are
+skipped.
  The values in the parentheses represent the VM state, the migration state, and
  the VFIO device state, respectively.
-The text in the square brackets represents the flow if the VFIO device supports
-pre-copy.
  
  Live migration save path

  
  
  ::
  
-QEMU normal running state

-(RUNNING, _NONE, _RUNNING)
-  |
+   QEMU normal running state
+   (RUNNING, _NONE, _RUNNING)
+  |
   migrate_init spawns migration_thread
-Migration thread then calls each device's .save_setup()
-  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
-  |
-  (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
-  If device is active, get pending_bytes by 
.state_pending_{estimate,exact}()
-  If total pending_bytes >= threshold_size, call .save_live_iterate()
-  [Data of VFIO device for pre-copy phase is copied]
-Iterate till total pending bytes converge and are less than threshold
-  |
-  On migration completion, vCPU stops and calls .save_live_complete_precopy for
-  each active device. The VFIO device is then transitioned into _STOP_COPY 
state
-  (FINISH_MIGRATE, _DEVICE, _STOP_COPY)
-  |
- For the VFIO device, iterate in .save_live_complete_precopy until
- pending data is 0
-   (FINISH_MIGRATE, _DEVICE, _STOP)
-  |
- (FINISH_MIGRATE, _COMPLETED, _STOP)
- Migraton thread schedules cleanup bottom hal

[PATCH] scripts/qemu-binfmt-conf.sh: refresh

2023-07-21 Thread Michael Tokarev

Currently qemu-binfmt-conf.sh does a number of strange things.

1. --systemd requires an argument - the CPU type to register,
   while --debian (which is actually --binfmt-support) does not
   accept such an argument, so it is not possible to specify which
   CPU(s) to register for debian.

2. Why this "ALL" at all?

3. it just ignores extra command-line arguments.  It would be
   logical to specify which CPUs to register (multiple!) as the
   additional arguments.

4. Even if a CPU is explicitly requested, it does not register
   anything if this CPU is of the same family as host one. But
   this is wrong, since quite often it *is* desirable to do this
   registration, - like, when running in i386 when the system is
   not capable of running x86-64 binaries, and countless other
   examples

5. It ignores errors

6. It ignores wrong command line arguments

Fix this, and simplify things a bit.

1. Stop accepting an argument for --systemd.  With getopt_long,
   this argument, if given, will be returned as a non-optional
   parameter so compatibility with current version is preserved.

2. Accept optional arguments and generate registration for the
   given CPUs only.  In case no extra arguments are given, register
   for all supportd CPUs except of the same family as host.

3. Recognze "ALL" "CPU" to keep compatibility with current version
   (but do not document it).

4. Warn but perform registration anyway if a cpu of the same family
   has been requested.

5. In help text, use --debian and --systemd as alternatives to each
   other, to make it clear the two can not be used at the same time.

6. Tiny optimization of eval expression.

7. Fold the list of supported CPUs to fit in 80 columns.

8. Exit with non-zero code in case registration fails or the command
   line is wrong.

9. Remove explicit checking for writability of various things.

Signed-off-by: Michael Tokarev 
---
 scripts/qemu-binfmt-conf.sh | 89 +
 1 file changed, 40 insertions(+), 49 deletions(-)

diff --git a/scripts/qemu-binfmt-conf.sh b/scripts/qemu-binfmt-conf.sh
index 6ef9f118d9..27d506000d 100755
--- a/scripts/qemu-binfmt-conf.sh
+++ b/scripts/qemu-binfmt-conf.sh
@@ -182,10 +182,10 @@ qemu_get_family() {
 
 usage() {
 cat <&2
-exit 1
-fi
+echo $qemu_target_list | fold -w68 -s | sed 's/^//'
 }
 
 qemu_check_bintfmt_misc() {
@@ -246,8 +242,6 @@ qemu_check_bintfmt_misc() {
   exit 1
   fi
 fi
-
-qemu_check_access /proc/sys/fs/binfmt_misc/register
 }
 
 installed_dpkg() {
@@ -260,14 +254,12 @@ qemu_check_debian() {
 elif ! installed_dpkg binfmt-support ; then
 echo "WARNING: package binfmt-support is needed" 1>&2
 fi
-qemu_check_access "$EXPORTDIR"
 }
 
 qemu_check_systemd() {
 if ! systemctl -q is-enabled systemd-binfmt.service ; then
 echo "WARNING: systemd-binfmt.service is missing or disabled" 1>&2
 fi
-qemu_check_access "$EXPORTDIR"
 }
 
 qemu_generate_register() {
@@ -287,16 +279,16 @@ qemu_generate_register() {
 
 qemu_register_interpreter() {
 echo "Setting $qemu as binfmt interpreter for $cpu"
-qemu_generate_register > /proc/sys/fs/binfmt_misc/register
+qemu_generate_register > /proc/sys/fs/binfmt_misc/register || exit 1
 }
 
 qemu_generate_systemd() {
 echo "Setting $qemu as binfmt interpreter for $cpu for 
systemd-binfmt.service"
-qemu_generate_register > "$EXPORTDIR/qemu-$cpu.conf"
+qemu_generate_register > "$EXPORTDIR/qemu-$cpu.conf" || exit 1
 }
 
 qemu_generate_debian() {
-cat > "$EXPORTDIR/qemu-$cpu" < "$EXPORTDIR/qemu-$cpu" <&2
-continue
+echo "ERROR: unknown cpu $cpu" >&2
+exit 1
 fi
 
 qemu="$QEMU_PATH/qemu-$cpu"
@@ -329,9 +327,12 @@ qemu_set_binfmts() {
 fi
 
 qemu="$qemu$QEMU_SUFFIX"
-if [ "$host_family" != "$family" ] ; then
-$BINFMT_SET
+if [ "$host_family" = "$family" ] ; then
+[ -n "$explicit" ] || continue
+echo "WARNING: requested CPU $cpu is of the same family as host 
CPU" >&2
+echo "WARNING: this might break the system" >&2
 fi
+$BINFMT_SET
 done
 }
 
@@ -347,9 +348,9 @@ PERSISTENT=no
 PRESERVE_ARG0=no
 QEMU_SUFFIX=""
 
-_longopts="debian,systemd:,qemu-path:,qemu-suffix:,exportdir:,help,credential:,\
+_longopts="debian,systemd,qemu-path:,qemu-suffix:,exportdir:,help,credential:,\
 persistent:,preserve-argv0:"
-options=$(getopt -o ds:Q:S:e:hc:p:g:F: -l ${_longopts} -- "$@")
+options=$(getopt -o dsQ:S:e:hc:p:g:F: -l ${_longopts} -- "$@") || exit 1
 eval set -- "$options"
 
 while true ; do
@@ -363,23 +364,6 @@ while true ; do
 CHECK=qemu_check_systemd
 BINFMT_SET=qemu_generate_systemd
 EXPORTDIR=${EXPORTDIR:-$SYSTEMDDIR}
-shift
-# check given cpu is in the supported CPU list
-if [ "$1" != "ALL" ] ; then
-for cpu in ${qemu_target_list} ; do
-if [

Re: [PATCH 3/4] vdpa: Restore vlan filtering state

2023-07-21 Thread Eugenio Perez Martin

On Wed, Jul 19, 2023 at 9:48 AM Hawkins Jiawei  wrote:
>
> This patch introduces vhost_vdpa_net_load_single_vlan()
> and vhost_vdpa_net_load_vlan() to restore the vlan
> filtering state at device's startup.
>
> Co-developed-by: Eugenio Pérez 
> Signed-off-by: Eugenio Pérez 
> Signed-off-by: Hawkins Jiawei 
> ---
>  net/vhost-vdpa.c | 49 
>  1 file changed, 49 insertions(+)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 9795306742..0787dd933b 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -965,6 +965,51 @@ static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
>  return 0;
>  }
>
> +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
> +   const VirtIONet *n,
> +   uint16_t vid)
> +{
> +const struct iovec data = {
> +.iov_base = &vid,
> +.iov_len = sizeof(vid),
> +};
> +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_VLAN,
> +  VIRTIO_NET_CTRL_VLAN_ADD,
> +  &data, 1);
> +if (unlikely(dev_written < 0)) {
> +return dev_written;
> +}
> +if (unlikely(*s->status != VIRTIO_NET_OK)) {
> +return -EIO;
> +}
> +
> +return 0;
> +}
> +
> +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
> +const VirtIONet *n)
> +{
> +int r;
> +
> +if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
> +return 0;
> +}
> +
> +for (int i = 0; i < MAX_VLAN >> 5; i++) {
> +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
> +if (n->vlans[i] & (1U << j)) {
> +r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) + j);
> +if (unlikely(r != 0)) {
> +return r;
> +}
> +}
> +}
> +}
> +
> +return 0;
> +}
> +
> +

Nit: I'm not sure if it was here originally, but there is an extra newline here.

>  static int vhost_vdpa_net_load(NetClientState *nc)
>  {
>  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> @@ -995,6 +1040,10 @@ static int vhost_vdpa_net_load(NetClientState *nc)
>  if (unlikely(r)) {
>  return r;
>  }
> +r = vhost_vdpa_net_load_vlan(s, n);
> +if (unlikely(r)) {
> +return r;
> +}
>
>  return 0;
>  }
> --
> 2.25.1
>

Re: [PATCH for-8.2 6/6] vfio/migration: Allow migration of multiple P2P supporting devices

2023-07-21 Thread Cédric Le Goater


On 7/16/23 10:15, Avihai Horon wrote:

Now that P2P support has been added to VFIO migration, allow migration
of multiple devices if all of them support P2P migration.

Single device migration is allowed regardless of P2P migration support.

Signed-off-by: Avihai Horon 
Signed-off-by: Joao Martins 
---
  hw/vfio/common.c | 26 ++
  1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7c3d636025..753b320739 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -363,21 +363,31 @@ bool vfio_mig_active(void)
  
  static Error *multiple_devices_migration_blocker;
  
-static unsigned int vfio_migratable_device_num(void)

+/*
+ * Multiple devices migration is allowed only if all devices support P2P
+ * migration. Single device migration is allowed regardless of P2P migration
+ * support.
+ */
+static bool vfio_should_block_multiple_devices_migration(void)


Could we revert the logic and call the routine :

  vfio_multiple_devices_migration_is_supported()

I think it would be clearer in the callers.  This is minor.

Thanks,

C.


  {
  VFIOGroup *group;
  VFIODevice *vbasedev;
  unsigned int device_num = 0;
+bool all_support_p2p = true;
  
  QLIST_FOREACH(group, &vfio_group_list, next) {

  QLIST_FOREACH(vbasedev, &group->device_list, next) {
  if (vbasedev->migration) {
  device_num++;
+
+if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
+all_support_p2p = false;
+}
  }
  }
  }
  
-return device_num;

+return !all_support_p2p && device_num > 1;
  }
  
  int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)

@@ -385,19 +395,19 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
  int ret;
  
  if (multiple_devices_migration_blocker ||

-vfio_migratable_device_num() <= 1) {
+!vfio_should_block_multiple_devices_migration()) {
  return 0;
  }
  
  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {

-error_setg(errp, "Migration is currently not supported with multiple "
- "VFIO devices");
+error_setg(errp, "Multiple VFIO devices migration is supported only if 
"
+ "all of them support P2P migration");
  return -EINVAL;
  }
  
  error_setg(&multiple_devices_migration_blocker,

-   "Migration is currently not supported with multiple "
-   "VFIO devices");
+   "Multiple VFIO devices migration is supported only if all of "
+   "them support P2P migration");
  ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
  if (ret < 0) {
  error_free(multiple_devices_migration_blocker);
@@ -410,7 +420,7 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
  void vfio_unblock_multiple_devices_migration(void)
  {
  if (!multiple_devices_migration_blocker ||
-vfio_migratable_device_num() > 1) {
+vfio_should_block_multiple_devices_migration()) {
  return;
  }

Re: [PATCH v2] accel/kvm: Specify default IPA size for arm64

2023-07-21 Thread Peter Maydell

On Fri, 21 Jul 2023 at 08:30, Philippe Mathieu-Daudé  wrote:
>
> Hi Akihiko,
>
> On 21/7/23 08:24, Akihiko Odaki wrote:
> > libvirt uses "none" machine type to test KVM availability. Before this
> > change, QEMU used to pass 0 as machine type when calling KVM_CREATE_VM.
> >
> > The kernel documentation says:
> >> On arm64, the physical address size for a VM (IPA Size limit) is
> >> limited to 40bits by default. The limit can be configured if the host
> >> supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use
> >> KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type
> >> identifier, where IPA_Bits is the maximum width of any physical
> >> address used by the VM. The IPA_Bits is encoded in bits[7-0] of the
> >> machine type identifier.
> >>
> >> e.g, to configure a guest to use 48bit physical address size::
> >>
> >>  vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48));
> >>
> >> The requested size (IPA_Bits) must be:
> >>
> >>   ==   =
> >>0   Implies default size, 40bits (for backward compatibility)
> >>N   Implies N bits, where N is a positive integer such that,
> >>32 <= N <= Host_IPA_Limit
> >>   ==   =
> >
> >> Host_IPA_Limit is the maximum possible value for IPA_Bits on the host
> >> and is dependent on the CPU capability and the kernel configuration.
> >> The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the
> >> KVM_CHECK_EXTENSION ioctl() at run-time.
> >>
> >> Creation of the VM will fail if the requested IPA size (whether it is
> >> implicit or explicit) is unsupported on the host.
> > https://docs.kernel.org/virt/kvm/api.html#kvm-create-vm
> >
> > So if Host_IPA_Limit < 40, such KVM_CREATE_VM will fail, and libvirt
> > incorrectly thinks KVM is not available. This actually happened on M2
> > MacBook Air.
> >
> > Fix this by specifying 32 for IPA_Bits as any arm64 system should
> > support the value according to the documentation.
> >
> > Signed-off-by: Akihiko Odaki 
> > ---
> > V1 -> V2: Introduced an arch hook
> >
> >   include/sysemu/kvm.h   | 1 +
> >   accel/kvm/kvm-all.c| 2 +-
> >   target/arm/kvm.c   | 2 ++
> >   target/i386/kvm/kvm.c  | 2 ++
> >   target/mips/kvm.c  | 2 ++
> >   target/ppc/kvm.c   | 2 ++
> >   target/riscv/kvm.c | 2 ++
> >   target/s390x/kvm/kvm.c | 2 ++
> >   8 files changed, 14 insertions(+), 1 deletion(-)
>
> My understanding of Peter's suggestion would be smth like:
>
> -- >8 --
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 115f0cca79..c0af15eb6c 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -201,10 +201,15 @@ typedef struct KVMCapabilityInfo {
>
>   struct KVMState;
>
> +struct KVMClass {
> +AccelClass parent_class;
> +
> +int default_vm_type;

The kernel docs say you need to check for the
KVM_CAP_ARM_VM_IPA_SIZE before you can pass something other
than zero to the KVM_CREATE_VM ioctl, so this needs to be
a method, not just a value. (kvm_arm_get_max_vm_ipa_size()
will do this bit for you.)

If the machine doesn't provide a kvm_type method, we
should default to "largest the host supports", I think.

I was wondering if we could have one per-arch
method for "actually create the VM" that both was
a place for arm to set the default vm type and
also let us get the TARGET_S390X and TARGET_PPC
ifdefs out of this bit of kvm-all.c, but maybe that would
look just a bit too awkward:

 if (kc->create_vm(s, board_sets_kvm_type, board_kvm_type) < 0) {
 goto err;
 }

where board_sets_kvm_type is a bool, true if board_kvm_type
is valid, and board_kvm_type is whatever the board's
mc->kvm_type method told us.

(Default impl of the method: call KVM_CREATE_VM ioctl
with retry-on-eintr, printing the simple error message;
PPC and s390 versions similar but with their arch
specific extra messages; arm version has a different
default type if board_sets_kvm_type is false.)

Not trying to do both of those things with one method
would result in a simpler
   type = kc->get_default_kvm_type(s);
API.

thanks
-- PMM

[PATCH] target/ppc: Fix the order of kvm_enable judgment about kvmppc_set_interrupt()

2023-07-21 Thread jianchunfu

It's unnecessary for non-KVM accelerators(TCG, for example),
to call this function, so change the order of kvm_enable() judgment.

The static inline function that returns -1 directly does not work
 in TCG's situation.

Signed-off-by: jianchunfu 
---
 hw/ppc/ppc.c | 8 ++--
 target/ppc/kvm.c | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 0e0a3d93c3..3e96b24487 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -58,7 +58,9 @@ void ppc_set_irq(PowerPCCPU *cpu, int irq, int level)
 
 if (old_pending != env->pending_interrupts) {
 ppc_maybe_interrupt(env);
-kvmppc_set_interrupt(cpu, irq, level);
+if (kvm_enabled()) {
+kvmppc_set_interrupt(cpu, irq, level);
+}
 }
 
 trace_ppc_irq_set_exit(env, irq, level, env->pending_interrupts,
@@ -1465,5 +1467,7 @@ void ppc_irq_reset(PowerPCCPU *cpu)
 CPUPPCState *env = &cpu->env;
 
 env->irq_input_state = 0;
-kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0);
+if (kvm_enabled()) {
+kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0);
+}
 }
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index a8a935e267..11a1fbc244 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -1315,7 +1315,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int 
level)
 return 0;
 }
 
-if (!kvm_enabled() || !cap_interrupt_unset) {
+if (!cap_interrupt_unset) {
 return 0;
 }
 
-- 
2.27.0

Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts

2023-07-21 Thread Philippe Mathieu-Daudé


On 21/7/23 11:47, Thomas Huth wrote:

The character that should be printed is stored in the 64 bit "payload"
variable. The code currently tries to print it by taking the address
of the variable and passing this pointer to qemu_chr_fe_write(). However,
this only works on little endian hosts where the least significant bits
are stored on the lowest address. To do this in a portable way, we have
to store the value in an uint8_t variable instead.

Fixes: 5033606780 ("RISC-V HTIF Console")
Signed-off-by: Thomas Huth 
---
  hw/char/riscv_htif.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

[PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG

2023-07-21 Thread Daniel Henrique Barboza

The 'host' CPU is available in a CONFIG_KVM build and it's currently
available for all accels, but is a KVM only CPU. This means that in a
RISC-V KVM capable host we can do things like this:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: H extension requires priv spec 1.12.0

This CPU does not have a priv spec because we don't filter its extensions
via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all
with the 'host' CPU.

We don't have a way to filter the 'host' CPU out of the available CPU
options (-cpu help) if the build includes both KVM and TCG. What we can
do is to error out during riscv_cpu_realize_tcg() if the user chooses
the 'host' CPU with accel=tcg:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..08db3d613f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, 
Error **errp)
 CPURISCVState *env = &cpu->env;
 Error *local_err = NULL;
 
+if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) {
+error_setg(errp, "'host' CPU is not compatible with TCG acceleration");
+return;
+}
+
 riscv_cpu_validate_misa_mxl(cpu, &local_err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
-- 
2.41.0

[PATCH] virtio: Fix packed virtqueue used_idx mask

2023-07-21 Thread Hanna Czenczek

virtio_queue_packed_set_last_avail_idx() is used by vhost devices to set
the internal queue indices to what has been reported by the vhost
back-end through GET_VRING_BASE.  For packed virtqueues, this
32-bit value is expected to contain both the device's internal avail and
used indices, as well as their respective wrap counters.

To get the used index, we shift the 32-bit value right by 16, and then
apply a mask of 0x7.  That seems to be a typo, because it should be
0x7fff; first of all, the virtio specification says that the maximum
queue size for packed virt queues is 2^15, so the indices cannot exceed
2^15 - 1 anyway, making 0x7fff the correct mask.  Second, the mask
clearly is wrong from context, too, given that (A) `idx & 0x7` must
be 0 at this point (`idx` is 32 bit and was shifted to the right by 16
already), (B) `idx & 0x8000` is the used_wrap_counter, so should not be
part of the used index, and (C) `vq->used_idx` is a `uint16_t`, so
cannot fit the 0x7 part of the mask anyway.

This most likely never produced any guest-visible bugs, though, because
for a vhost device, qemu will probably not evaluate the used index
outside of virtio_queue_packed_get_last_avail_idx(), where we
reconstruct the 32-bit value from avail and used indices and their wrap
counters again.  There, it does not matter whether the highest bit of
the used_idx is the used index wrap counter, because we put the wrap
counter exactly in that position anyway.

Signed-off-by: Hanna Czenczek 
---
 hw/virtio/virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 295a603e58..309038fd46 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -3321,7 +3321,7 @@ static void 
virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev,
 vq->last_avail_wrap_counter =
 vq->shadow_avail_wrap_counter = !!(idx & 0x8000);
 idx >>= 16;
-vq->used_idx = idx & 0x7;
+vq->used_idx = idx & 0x7fff;
 vq->used_wrap_counter = !!(idx & 0x8000);
 }
 
-- 
2.41.0

Re: [PATCH 7/7] tests/avocado: ppc64 pseries reverse debugging test

2023-07-21 Thread Nicholas Piggin

On Mon Jun 26, 2023 at 7:34 PM AEST, Nicholas Piggin wrote:
> On Mon Jun 26, 2023 at 5:49 PM AEST, Pavel Dovgalyuk wrote:
> > On 23.06.2023 15:57, Nicholas Piggin wrote:
> > > pseries can run reverse-debugging well enough to pass basic tests.
> > > 
> > > There is strangeness with reverse-continue possibly relating to a bp
> > > being set on the first instruction or on a snapshot, that causes
> > > the PC to be reported on the first instruction rather than last
> > > breakpoint, so a workaround is added for that for now.
> >
> > It means that the test reveals some kind of a bug in PPC debugging 
> > server implementation.
> > In this case it is better to fix that instead of adding workaround.
>
> I agree, and I'm trying to find the cause it hasn't been easy. I
> thought the test was still interesting because it otherwise seems
> to work well, but hopefully I can find the issue before long.

I found the problem after too much staring at record-replay. QEMU works
perfectly, it is the ppc pseries firmware that branches to its own entry
point address within the recorded trace, and that confuses the test
script.

I'm not sure the best way to work around it. Initial skip works okay but
also maybe(?) a good edge case to test a break on the very first
instruction of the trace even if we don't reverse continue to it.

I will send a new series and propose to add the breakpoints before
seeking to the end of the trace, so we will catch a breakpoint being
executed again.

Thanks,
Nick

Re: [PATCH 6/6] vhost-user: Have reset_status fall back to reset

2023-07-21 Thread Hanna Czenczek


On 20.07.23 18:03, Stefan Hajnoczi wrote:

On Wed, Jul 19, 2023 at 04:27:58PM +0200, Hanna Czenczek wrote:

On 19.07.23 16:11, Hanna Czenczek wrote:

On 18.07.23 17:10, Stefan Hajnoczi wrote:

On Tue, Jul 11, 2023 at 05:52:28PM +0200, Hanna Czenczek wrote:

The only user of vhost_user_reset_status() is vhost_dev_stop(), which
only uses it as a fall-back to stop the back-end if it does not support
SUSPEND.  However, vhost-user's implementation is a no-op unless the
back-end supports SET_STATUS.

vhost-vdpa's implementation instead just calls
vhost_vdpa_reset_device(), implying that it's OK to fully reset the
device if SET_STATUS is not supported.

To be fair, vhost_vdpa_reset_device() does nothing but to set
the status
to zero.  However, that may well be because vhost-vdpa has no method
besides this to reset a device.  In contrast, vhost-user has
RESET_DEVICE and a RESET_OWNER, which can be used instead.

While it is not entirely clear from documentation or git logs, from
discussions and the order of vhost-user protocol features, it
appears to
me as if RESET_OWNER originally had no real meaning for vhost-user, and
was thus used to signal a device reset to the back-end.  Then,
RESET_DEVICE was introduced, to have a well-defined dedicated reset
command.  Finally, vhost-user received full STATUS support, including
SET_STATUS, so setting the device status to 0 is now the preferred way
of resetting a device.  Still, RESET_DEVICE and RESET_OWNER should
remain valid as fall-backs.

Therefore, have vhost_user_reset_status() fall back to
vhost_user_reset_device() if the back-end has no STATUS support.

Signed-off-by: Hanna Czenczek 
---
   hw/virtio/vhost-user.c | 2 ++
   1 file changed, 2 insertions(+)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 4507de5a92..53a881ec2a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2833,6 +2833,8 @@ static void vhost_user_reset_status(struct
vhost_dev *dev)
   if (virtio_has_feature(dev->protocol_features,
  VHOST_USER_PROTOCOL_F_STATUS)) {
   vhost_user_set_status(dev, 0);
+    } else {
+    vhost_user_reset_device(dev);
   }
   }

Did you check whether DPDK treats setting the status to 0 as equivalent
to RESET_DEVICE?

If it doesn’t, what’s even the point of using reset_status?

Sorry, I’m being unclear, and I think this may be important because it ties
into the question from patch 1, what qemu is even trying to do by running
SET_STATUS(0) vhost_dev_stop(), so here’s what gave me the impression that
SET_STATUS(0) and RESET_DEVICE should be equivalent:

vhost-vdpa.c runs SET_STATUS(0) in a function called
vhost_vdpa_reset_device().  This is one thing that gave me the impression
that this is about an actual full reset.

Another is the whole discussion that we’ve had.  vhost_dev_stop() does not
call a `vhost_reset_device()` function, it calls `vhost_reset_status()`.
Still, we were always talking about resetting the device.

There is some hacky stuff with struct vhost_dev's vq_index_end and
multi-queue devices. I think it's because multi-queue vhost-net device
consist of many vhost_devs and NetClientStates, so certain vhost
operations are skipped unless this is the "first" or "last" vhost_dev
from a large aggregate vhost-net device. That might be responsible for
part of the weirdness.


It doesn’t make sense to me that vDPA would provide no function to fully
reset a device, while vhost-user does.  Being able to reset a device sounds
vital to me.  This also gave me the impression that SET_STATUS(0) on vDPA at
least is functionally equivalent to a full device reset.


Maybe SET_STATUS(0) does mean a full device reset on vDPA, but not on
vhost-user.  That would be a real shame, so I assumed this would not be the
case; that SET_STATUS(0) does the same thing on both protocols.

Yes, exactly. It has the real VIRTIO spec meaning in vDPA. In vhost-user
it's currently only used by DPDK as a hint for when device
initialization is complete:
https://github.com/DPDK/dpdk/commit/41d201804c4c44738168e2d247d3b1780845faa1


FWIW, now the code is a bit different. 
https://github.com/DPDK/dpdk/commit/671cc679a5fcd26705bb20ddc13b93e665719054 
has added a RESET interpretation for the status field, i.e. when it is 
0.  It doesn’t do anything, but at least DPDK seems to agree that 
SET_STATUS(0) is a reset.



The virtio specification says “Writing 0 into this field resets the device.”
about the device_status field.

This also makes sense, because the device_status field is basically used to
tell the device that a driver has taken control.  If reset, this indicates
the driver has given up control, and to me this is a point where a device
should fully reset itself.

So all in all, I can’t see the rationale why any implementation that
supports SET_STATUS would decide to treat SET_STATUS(0) not as equivalent or
a superset of RESET_DEVICE.  I may be wrong, and this might explain a whole
deal about what kind of backgro

[PATCH for-8.1] target/arm: Special case M-profile in debug_helper.c code

2023-07-21 Thread Peter Maydell

A lot of the code called from helper_exception_bkpt_insn() is written
assuming A-profile, but we will also call this helper on M-profile
CPUs when they execute a BKPT insn.  This used to work by accident,
but recent changes mean that we will hit an assert when some of this
code calls down into lower level functions that end up calling
arm_security_space_below_el3(), arm_el_is_aa64(), and other functions
that now explicitly assert that the guest CPU is not M-profile.

Handle M-profile directly to avoid the assertions:
 * in arm_debug_target_el(), M-profile debug exceptions always
   go to EL1
 * in arm_debug_exception_fsr(), M-profile always uses the short
   format FSR (compare commit d7fe699be54b2, though in this case
   the code in arm_v7m_cpu_do_interrupt() does not need to
   look at the FSR value at all)

Cc: qemu-sta...@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1775
Signed-off-by: Peter Maydell 
---
Not sure exactly when this assert() got in (probably in the
semi-recent refactorings for realm support), but it won't hurt
to backport the fix even if the older QEMU doesn't assert().
---
 target/arm/debug_helper.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/target/arm/debug_helper.c b/target/arm/debug_helper.c
index 8362462a07e..abe72e35ae6 100644
--- a/target/arm/debug_helper.c
+++ b/target/arm/debug_helper.c
@@ -21,6 +21,10 @@ static int arm_debug_target_el(CPUARMState *env)
 bool secure = arm_is_secure(env);
 bool route_to_el2 = false;
 
+if (arm_feature(env, ARM_FEATURE_M)) {
+return 1;
+}
+
 if (arm_is_el2_enabled(env)) {
 route_to_el2 = env->cp15.hcr_el2 & HCR_TGE ||
env->cp15.mdcr_el2 & MDCR_TDE;
@@ -434,18 +438,20 @@ static uint32_t arm_debug_exception_fsr(CPUARMState *env)
 {
 ARMMMUFaultInfo fi = { .type = ARMFault_Debug };
 int target_el = arm_debug_target_el(env);
-bool using_lpae = false;
+bool using_lpae;
 
-if (target_el == 2 || arm_el_is_aa64(env, target_el)) {
+if (arm_feature(env, ARM_FEATURE_M)) {
+using_lpae = false;
+} else if (target_el == 2 || arm_el_is_aa64(env, target_el)) {
 using_lpae = true;
 } else if (arm_feature(env, ARM_FEATURE_PMSA) &&
arm_feature(env, ARM_FEATURE_V8)) {
 using_lpae = true;
+} else if (arm_feature(env, ARM_FEATURE_LPAE) &&
+   (env->cp15.tcr_el[target_el] & TTBCR_EAE)) {
+using_lpae = true;
 } else {
-if (arm_feature(env, ARM_FEATURE_LPAE) &&
-(env->cp15.tcr_el[target_el] & TTBCR_EAE)) {
-using_lpae = true;
-}
+using_lpae = false;
 }
 
 if (using_lpae) {
-- 
2.34.1

Re: [PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG

2023-07-21 Thread Philippe Mathieu-Daudé


On 21/7/23 15:34, Daniel Henrique Barboza wrote:

The 'host' CPU is available in a CONFIG_KVM build and it's currently
available for all accels, but is a KVM only CPU. This means that in a
RISC-V KVM capable host we can do things like this:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: H extension requires priv spec 1.12.0

This CPU does not have a priv spec because we don't filter its extensions
via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all
with the 'host' CPU.

We don't have a way to filter the 'host' CPU out of the available CPU
options (-cpu help) if the build includes both KVM and TCG. What we can
do is to error out during riscv_cpu_realize_tcg() if the user chooses
the 'host' CPU with accel=tcg:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration

Signed-off-by: Daniel Henrique Barboza 
---
  target/riscv/cpu.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..08db3d613f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, 
Error **errp)


/me wonders why this method isn't used as AccelCPUClass:cpu_realizefn().


  CPURISCVState *env = &cpu->env;
  Error *local_err = NULL;
  
+if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) {

+error_setg(errp, "'host' CPU is not compatible with TCG acceleration");
+return;
+}


We should manage that generically, likely in CPUClass.
We could define riscv CPU with a DEFINE_CPU_WITH_CLASS() macro to pass
the class_init() handler. Anyhow meanwhile your patch is sufficient, so

Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG

2023-07-21 Thread Daniel Henrique Barboza





On 7/21/23 11:37, Philippe Mathieu-Daudé wrote:

On 21/7/23 15:34, Daniel Henrique Barboza wrote:

The 'host' CPU is available in a CONFIG_KVM build and it's currently
available for all accels, but is a KVM only CPU. This means that in a
RISC-V KVM capable host we can do things like this:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: H extension requires priv spec 1.12.0

This CPU does not have a priv spec because we don't filter its extensions
via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all
with the 'host' CPU.

We don't have a way to filter the 'host' CPU out of the available CPU
options (-cpu help) if the build includes both KVM and TCG. What we can
do is to error out during riscv_cpu_realize_tcg() if the user chooses
the 'host' CPU with accel=tcg:

$ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic
qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration

Signed-off-by: Daniel Henrique Barboza 
---
  target/riscv/cpu.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..08db3d613f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, 
Error **errp)


/me wonders why this method isn't used as AccelCPUClass:cpu_realizefn().


  CPURISCVState *env = &cpu->env;
  Error *local_err = NULL;
+    if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) {
+    error_setg(errp, "'host' CPU is not compatible with TCG acceleration");
+    return;
+    }


We should manage that generically, likely in CPUClass.
We could define riscv CPU with a DEFINE_CPU_WITH_CLASS() macro to pass
the class_init() handler. Anyhow meanwhile your patch is sufficient, so

Reviewed-by: Philippe Mathieu-Daudé 


Thanks. I'll mark all your suggestions as todo for 8.2.


Daniel

Re: [PATCH 3/4] vdpa: Restore vlan filtering state

2023-07-21 Thread Hawkins Jiawei

On 2023/7/21 19:57, Eugenio Perez Martin wrote:
> On Wed, Jul 19, 2023 at 9:48 AM Hawkins Jiawei  wrote:
>>
>> This patch introduces vhost_vdpa_net_load_single_vlan()
>> and vhost_vdpa_net_load_vlan() to restore the vlan
>> filtering state at device's startup.
>>
>> Co-developed-by: Eugenio Pérez 
>> Signed-off-by: Eugenio Pérez 
>> Signed-off-by: Hawkins Jiawei 
>> ---
>>   net/vhost-vdpa.c | 49 
>>   1 file changed, 49 insertions(+)
>>
>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
>> index 9795306742..0787dd933b 100644
>> --- a/net/vhost-vdpa.c
>> +++ b/net/vhost-vdpa.c
>> @@ -965,6 +965,51 @@ static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
>>   return 0;
>>   }
>>
>> +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
>> +   const VirtIONet *n,
>> +   uint16_t vid)
>> +{
>> +const struct iovec data = {
>> +.iov_base = &vid,
>> +.iov_len = sizeof(vid),
>> +};
>> +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_VLAN,
>> +  VIRTIO_NET_CTRL_VLAN_ADD,
>> +  &data, 1);
>> +if (unlikely(dev_written < 0)) {
>> +return dev_written;
>> +}
>> +if (unlikely(*s->status != VIRTIO_NET_OK)) {
>> +return -EIO;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
>> +const VirtIONet *n)
>> +{
>> +int r;
>> +
>> +if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) {
>> +return 0;
>> +}
>> +
>> +for (int i = 0; i < MAX_VLAN >> 5; i++) {
>> +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
>> +if (n->vlans[i] & (1U << j)) {
>> +r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) + j);
>> +if (unlikely(r != 0)) {
>> +return r;
>> +}
>> +}
>> +}
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +
>
> Nit: I'm not sure if it was here originally, but there is an extra newline 
> here.

Hi Eugenio,

It was not here originally, it was introduced mistakenly during the
refactoring process.

I will fix this in the v2 version.

Thanks!


>
>>   static int vhost_vdpa_net_load(NetClientState *nc)
>>   {
>>   VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
>> @@ -995,6 +1040,10 @@ static int vhost_vdpa_net_load(NetClientState *nc)
>>   if (unlikely(r)) {
>>   return r;
>>   }
>> +r = vhost_vdpa_net_load_vlan(s, n);
>> +if (unlikely(r)) {
>> +return r;
>> +}
>>
>>   return 0;
>>   }
>> --
>> 2.25.1
>>
>

[PATCH v9 01/10] migration: New QAPI type 'MigrateAddress'

2023-07-21 Thread Het Gala

This patch introduces well defined MigrateAddress struct
and its related child objects.

The existing argument of 'migrate' and 'migrate-incoming' QAPI
- 'uri' is of type string. The current implementation follows
double encoding scheme for fetching migration parameters like
'uri' and this is not an ideal design.

Motive for intoducing struct level design is to prevent double
encoding of QAPI arguments, as Qemu should be able to directly
use the QAPI arguments without any level of encoding.

Note: this commit only adds the type, and actual uses comes
in later commits.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Acked-by: Markus Armbruster 
---
 qapi/migration.json | 41 +
 1 file changed, 41 insertions(+)

diff --git a/qapi/migration.json b/qapi/migration.json
index 47dfef0278..0f4a54a7ed 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1417,6 +1417,47 @@
 ##
 { 'command': 'migrate-continue', 'data': {'state': 'MigrationStatus'} }
 
+##
+# @MigrationAddressType:
+#
+# The migration stream transport mechanisms.
+#
+# @socket: Migrate via socket.
+#
+# @exec: Direct the migration stream to another process.
+#
+# @rdma: Migrate via RDMA.
+#
+# Since 8.2
+##
+{ 'enum': 'MigrationAddressType',
+  'data': ['socket', 'exec', 'rdma'] }
+
+##
+# @MigrationExecCommand:
+#
+# @args: command (list head) and arguments to execute.
+#
+# Since 8.2
+##
+{ 'struct': 'MigrationExecCommand',
+  'data': {'args': [ 'str' ] } }
+
+##
+# @MigrationAddress:
+#
+# Migration endpoint configuration.
+#
+# Since 8.2
+##
+{ 'union': 'MigrationAddress',
+  'base': { 'transport' : 'MigrationAddressType'},
+  'discriminator': 'transport',
+  'data': {
+'socket': 'SocketAddress',
+'exec': 'MigrationExecCommand',
+'rdma': 'InetSocketAddress' } }
+
 ##
 # @migrate:
 #
-- 
2.22.3

[PATCH v9 09/10] migration: Implement MigrateChannelList to hmp migration flow.

2023-07-21 Thread Het Gala

Integrate MigrateChannelList with all transport backends
(socket, exec and rdma) for both src and dest migration
endpoints for hmp migration.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
---
 migration/migration-hmp-cmds.c | 16 +---
 migration/migration.c  |  5 ++---
 migration/migration.h  |  3 ++-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 49b150f33f..25f51ec99c 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -423,10 +423,14 @@ void hmp_migrate_incoming(Monitor *mon, const QDict 
*qdict)
 {
 Error *err = NULL;
 const char *uri = qdict_get_str(qdict, "uri");
+MigrationChannelList *caps = NULL;
+g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1);
 
-qmp_migrate_incoming(uri, false, NULL, &err);
+migrate_uri_parse(uri, &channel, &err);
+QAPI_LIST_PREPEND(caps, channel);
 
-hmp_handle_error(mon, err);
+qmp_migrate_incoming(NULL, true, caps, &err);
+qapi_free_MigrationChannelList(caps);
 }
 
 void hmp_migrate_recover(Monitor *mon, const QDict *qdict)
@@ -704,9 +708,15 @@ void hmp_migrate(Monitor *mon, const QDict *qdict)
 bool resume = qdict_get_try_bool(qdict, "resume", false);
 const char *uri = qdict_get_str(qdict, "uri");
 Error *err = NULL;
+MigrationChannelList *caps = NULL;
+g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1);
+
+migrate_uri_parse(uri, &channel, &err);
+QAPI_LIST_PREPEND(caps, channel);
 
-qmp_migrate(uri, false, NULL, !!blk, blk, !!inc, inc,
+qmp_migrate(NULL, true, caps, !!blk, blk, !!inc, inc,
  false, false, true, resume, &err);
+qapi_free_MigrationChannelList(caps);
 if (hmp_handle_error(mon, err)) {
 return;
 }
diff --git a/migration/migration.c b/migration/migration.c
index acf80b3590..cf063a76df 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -425,9 +425,8 @@ void migrate_add_address(SocketAddress *address)
   QAPI_CLONE(SocketAddress, address));
 }
 
-static bool migrate_uri_parse(const char *uri,
-  MigrationChannel **channel,
-  Error **errp)
+bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
+   Error **errp)
 {
 g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1);
 g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
diff --git a/migration/migration.h b/migration/migration.h
index b7c8b67542..a8268394ca 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -501,7 +501,8 @@ bool check_dirty_bitmap_mig_alias_map(const 
BitmapMigrationNodeAliasList *bbm,
   Error **errp);
 
 void migrate_add_address(SocketAddress *address);
-
+bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
+   Error **errp);
 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
 
 #define qemu_ram_foreach_block \
-- 
2.22.3

[PATCH v9 00/10] migration: Modify 'migrate' and 'migrate-incoming' QAPI commands for migration

2023-07-21 Thread Het Gala

This is v9 patchset of modified 'migrate' and 'migrate-incoming' QAPI design
for upstream review.

Would like to thank all the maintainers that actively participated in the v8
patchset discussion and gave insightful suggestions to improve the patches.


Link to previous upstream community patchset links:
v1: https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg04339.html
v2: https://lists.gnu.org/archive/html/qemu-devel/2023-02/msg02106.html
v3: https://lists.gnu.org/archive/html/qemu-devel/2023-02/msg02473.html
v4: https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg03064.html
v5: https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg04845.html
v6: https://lists.gnu.org/archive/html/qemu-devel/2023-06/msg01251.html
v7: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02027.html
v8: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02770.html

v8 -> v9 changelog:
---
- Patch1 : changed the qemu version from v8.1 -> v8.2.
- Patch2 : changes for WIN32 regarding exec command.
- Patch6 : Split qmp changes from hmp changes (Patch 9).
- Patch8 : QAPI_CLONE of SocketAddress defined eariler in Patch3.
- Patch9 : introducing migrate 'channels' in HMP, and change in
   implementation from earlier patch.
- Patch10 : modify existing migration test case with new QAPI syntax
instead of adding new test case all together.

Abstract:
-

Current QAPI 'migrate' command design (for initiating a migration
stream) contains information regarding different migrate transport mechanism
(tcp / unix / exec), dest-host IP address, and binding port number in form of
a string. Thus the design does seem to have some design issues. Some of the
issues, stated below are:

1. Use of string URIs is a data encoding scheme within a data encoding scheme.
   QEMU code should directly be able to work with the results from QAPI,
   without resorting to do a second level of parsing (eg. socket_parse()).
2. For features / parameters related to migration, the migration tunables needs
   to be defined and updated upfront. For example, 'migrate-set-capability'
   and 'migrate-set-parameter' is required to enable multifd capability and
   multifd-number of channels respectively. Instead, 'Multifd-channels' can
   directly be represented as a single additional parameter to 'migrate'
   QAPI. 'migrate-set-capability' and 'migrate-set-parameter' commands could
   be used for runtime tunables that need setting after migration has already
   started.

The current patchset focuses on solving the first problem of multi-level
encoding of URIs. The patch defines 'migrate' command as a QAPI discriminated
union for the various transport backends (like socket, exec and rdma), and on
basis of transport backends, different migration parameters are defined.

(uri) string -->  (channel) Channel-type
Transport-type
Migration parameters based on transport type
--

Het Gala (10):
  migration: New QAPI type 'MigrateAddress'
  migration: convert migration 'uri' into 'MigrateAddress'
  migration: convert socket backend to accept MigrateAddress
  migration: convert rdma backend to accept MigrateAddress
  migration: convert exec backend to accept MigrateAddress.
  migration: New migrate and migrate-incoming argument 'channels'
  migration: modify migration_channels_and_uri_compatible() for new QAPI
syntax
  migration: Implement MigrateChannelList to qmp migration flow.
  migration: Implement MigrateChannelList to hmp migration flow.
  migration: modify test_multifd_tcp_none() to use new QAPI syntax.

 migration/exec.c   |  72 +
 migration/exec.h   |   8 +-
 migration/migration-hmp-cmds.c |  18 +++-
 migration/migration.c  | 182 ++---
 migration/migration.h  |   3 +-
 migration/rdma.c   |  34 +++---
 migration/rdma.h   |   6 +-
 migration/socket.c |  39 ++-
 migration/socket.h |   7 +-
 qapi/migration.json| 150 ++-
 softmmu/vl.c   |   2 +-
 tests/qtest/migration-test.c   |   7 +-
 12 files changed, 401 insertions(+), 127 deletions(-)

-- 
2.22.3

[PATCH v9 08/10] migration: Implement MigrateChannelList to qmp migration flow.

2023-07-21 Thread Het Gala

Integrate MigrateChannelList with all transport backends
(socket, exec and rdma) for both src and dest migration
endpoints for qmp migration.

For current series, limit the size of MigrateChannelList
to single element (single interface) as runtime check.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
---
 migration/migration.c | 77 ---
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 6e0a8beaf2..acf80b3590 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -426,9 +426,10 @@ void migrate_add_address(SocketAddress *address)
 }
 
 static bool migrate_uri_parse(const char *uri,
-  MigrationAddress **channel,
+  MigrationChannel **channel,
   Error **errp)
 {
+g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1);
 g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
 SocketAddress *saddr = &addr->u.socket;
 InetSocketAddress *isock = &addr->u.rdma;
@@ -465,7 +466,9 @@ static bool migrate_uri_parse(const char *uri,
 return false;
 }
 
-*channel = addr;
+val->channel_type = MIGRATION_CHANNEL_TYPE_MAIN;
+val->addr = addr;
+*channel = val;
 return true;
 }
 
@@ -473,7 +476,8 @@ static void qemu_start_incoming_migration(const char *uri, 
bool has_channels,
   MigrationChannelList *channels,
   Error **errp)
 {
-g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
+g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1);
+g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
 
 /*
  * Having preliminary checks for uri and channel
@@ -483,20 +487,29 @@ static void qemu_start_incoming_migration(const char 
*uri, bool has_channels,
"exclusive; exactly one of the two should be present in "
"'migrate-incoming' qmp command ");
 return;
+} else if (channels) {
+/* To verify that Migrate channel list has only item */
+if (channels->next) {
+error_setg(errp, "Channel list has more than one entries");
+return;
+}
+channel = channels->value;
+} else {
+/* caller uses the old URI syntax */
+if (uri && !migrate_uri_parse(uri, &channel, errp)) {
+return;
+}
 }
-
-if (uri && !migrate_uri_parse(uri, &channel, errp)) {
-return;
-}
+addr = channel->addr;
 
 /* transport mechanism not suitable for migration? */
-if (!migration_channels_and_transport_compatible(channel, errp)) {
+if (!migration_channels_and_transport_compatible(addr, errp)) {
 return;
 }
 
 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
-if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
-SocketAddress *saddr = &channel->u.socket;
+if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
+SocketAddress *saddr = &addr->u.socket;
 if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
 saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
 saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
@@ -505,11 +518,11 @@ static void qemu_start_incoming_migration(const char 
*uri, bool has_channels,
 fd_start_incoming_migration(saddr->u.fd.str, errp);
 }
 #ifdef CONFIG_RDMA
-} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
-rdma_start_incoming_migration(&channel->u.rdma, errp);
-#endif
-} else if (channel->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
-exec_start_incoming_migration(channel->u.exec.args, errp);
+} else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
+rdma_start_incoming_migration(&addr->u.rdma, errp);
+ #endif
+} else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
+exec_start_incoming_migration(addr->u.exec.args, errp);
 } else {
 error_setg(errp, "unknown migration protocol: %s", uri);
 }
@@ -1709,7 +1722,8 @@ void qmp_migrate(const char *uri, bool has_channels,
 bool resume_requested;
 Error *local_err = NULL;
 MigrationState *s = migrate_get_current();
-g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
+g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1);
+g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
 
 /*
  * Having preliminary checks for uri and channel
@@ -1719,14 +1733,23 @@ void qmp_migrate(const char *uri, bool has_channels,
"exclusive; exactly one of the two should be present in "
"'migrate' qmp command ");
 return;
+} else if (channels) {
+/* To verify that Migrate channel list has only item */
+if (channels->next) {
+error_setg(errp, "Cha

[PATCH v9 05/10] migration: convert exec backend to accept MigrateAddress.

2023-07-21 Thread Het Gala

Exec transport backend for 'migrate'/'migrate-incoming' QAPIs accept
new wire protocol of MigrateAddress struct.

It is achived by parsing 'uri' string and storing migration parameters
required for exec connection into strList struct.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Daniel P. Berrangé 
---
 migration/exec.c  | 71 +++
 migration/exec.h  |  4 +--
 migration/migration.c | 10 +++---
 3 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/migration/exec.c b/migration/exec.c
index 32f5143dfd..8bc321c66b 100644
--- a/migration/exec.c
+++ b/migration/exec.c
@@ -39,20 +39,50 @@ const char *exec_get_cmd_path(void)
 }
 #endif
 
-void exec_start_outgoing_migration(MigrationState *s, const char *command, 
Error **errp)
+/* provides the length of strList */
+static int
+str_list_length(strList *list)
+{
+int len = 0;
+strList *elem;
+
+for (elem = list; elem != NULL; elem = elem->next) {
+len++;
+}
+
+return len;
+}
+
+static void
+init_exec_array(strList *command, char **argv, Error **errp)
+{
+int i = 0;
+strList *lst;
+
+for (lst = command; lst; lst = lst->next) {
+argv[i++] = lst->value;
+}
+
+argv[i] = NULL;
+return;
+}
+
+void exec_start_outgoing_migration(MigrationState *s, strList *command,
+   Error **errp)
 {
 QIOChannel *ioc;
 
-#ifdef WIN32
-const char *argv[] = { exec_get_cmd_path(), "/c", command, NULL };
-#else
-const char *argv[] = { "/bin/sh", "-c", command, NULL };
-#endif
+int length = str_list_length(command);
+g_auto(GStrv) argv = (char **) g_new0(const char *, length);
 
-trace_migration_exec_outgoing(command);
-ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv,
-O_RDWR,
-errp));
+init_exec_array(command, argv, errp);
+g_autofree char *new_command = g_strjoinv(" ", (char **)argv);
+
+trace_migration_exec_outgoing(new_command);
+ioc = QIO_CHANNEL(
+qio_channel_command_new_spawn((const char * const *) argv,
+  O_RDWR,
+  errp));
 if (!ioc) {
 return;
 }
@@ -71,20 +101,21 @@ static gboolean exec_accept_incoming_migration(QIOChannel 
*ioc,
 return G_SOURCE_REMOVE;
 }
 
-void exec_start_incoming_migration(const char *command, Error **errp)
+void exec_start_incoming_migration(strList *command, Error **errp)
 {
 QIOChannel *ioc;
 
-#ifdef WIN32
-const char *argv[] = { exec_get_cmd_path(), "/c", command, NULL };
-#else
-const char *argv[] = { "/bin/sh", "-c", command, NULL };
-#endif
+int length = str_list_length(command);
+g_auto(GStrv) argv = (char **) g_new0(const char *, length);
+
+init_exec_array(command, argv, errp);
+g_autofree char *new_command = g_strjoinv(" ", (char **)argv);
 
-trace_migration_exec_incoming(command);
-ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv,
-O_RDWR,
-errp));
+trace_migration_exec_incoming(new_command);
+ioc = QIO_CHANNEL(
+qio_channel_command_new_spawn((const char * const *) argv,
+  O_RDWR,
+  errp));
 if (!ioc) {
 return;
 }
diff --git a/migration/exec.h b/migration/exec.h
index 736cd71028..3107f205e3 100644
--- a/migration/exec.h
+++ b/migration/exec.h
@@ -23,8 +23,8 @@
 #ifdef WIN32
 const char *exec_get_cmd_path(void);
 #endif
-void exec_start_incoming_migration(const char *host_port, Error **errp);
+void exec_start_incoming_migration(strList *host_port, Error **errp);
 
-void exec_start_outgoing_migration(MigrationState *s, const char *host_port,
+void exec_start_outgoing_migration(MigrationState *s, strList *host_port,
Error **errp);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 8012f93f1b..f37b388876 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -468,7 +468,6 @@ static bool migrate_uri_parse(const char *uri,
 
 static void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
-const char *p = NULL;
 g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
 
 /* URI is not suitable for migration? */
@@ -494,8 +493,8 @@ static void qemu_start_incoming_migration(const char *uri, 
Error **errp)
 } else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
 rdma_start_incoming_migration(&channel->u.rdma, errp);
 #endif
-} else if (strstart(uri, "exec:", &p)) {
-exec_start_incoming_migration(p, errp);
+} else if (channel->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
+exec_start_incoming_migration(channel->u.exec.args, errp);
 } else {

[PATCH v9 03/10] migration: convert socket backend to accept MigrateAddress

2023-07-21 Thread Het Gala

Socket transport backend for 'migrate'/'migrate-incoming' QAPIs accept
new wire protocol of MigrateAddress struct.

It is achived by parsing 'uri' string and storing migration parameters
required for socket connection into well defined SocketAddress struct.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Daniel P. Berrangé 
---
 migration/migration.c | 30 ++
 migration/socket.c| 39 +--
 migration/socket.h|  7 ---
 3 files changed, 31 insertions(+), 45 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 5b3be767f0..c4bcf8bbd7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -481,18 +481,21 @@ static void qemu_start_incoming_migration(const char 
*uri, Error **errp)
 }
 
 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
-if (strstart(uri, "tcp:", &p) ||
-strstart(uri, "unix:", NULL) ||
-strstart(uri, "vsock:", NULL)) {
-socket_start_incoming_migration(p ? p : uri, errp);
+if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
+SocketAddress *saddr = &channel->u.socket;
+if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
+saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
+saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
+socket_start_incoming_migration(saddr, errp);
+} else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
+fd_start_incoming_migration(saddr->u.fd.str, errp);
+}
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", &p)) {
 rdma_start_incoming_migration(p, errp);
 #endif
 } else if (strstart(uri, "exec:", &p)) {
 exec_start_incoming_migration(p, errp);
-} else if (strstart(uri, "fd:", &p)) {
-fd_start_incoming_migration(p, errp);
 } else {
 error_setg(errp, "unknown migration protocol: %s", uri);
 }
@@ -1715,18 +1718,21 @@ void qmp_migrate(const char *uri, bool has_blk, bool 
blk,
 }
 }
 
-if (strstart(uri, "tcp:", &p) ||
-strstart(uri, "unix:", NULL) ||
-strstart(uri, "vsock:", NULL)) {
-socket_start_outgoing_migration(s, p ? p : uri, &local_err);
+if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
+SocketAddress *saddr = &channel->u.socket;
+if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
+saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
+saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
+socket_start_outgoing_migration(s, saddr, &local_err);
+} else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
+fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err);
+}
 #ifdef CONFIG_RDMA
 } else if (strstart(uri, "rdma:", &p)) {
 rdma_start_outgoing_migration(s, p, &local_err);
 #endif
 } else if (strstart(uri, "exec:", &p)) {
 exec_start_outgoing_migration(s, p, &local_err);
-} else if (strstart(uri, "fd:", &p)) {
-fd_start_outgoing_migration(s, p, &local_err);
 } else {
 if (!resume_requested) {
 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
diff --git a/migration/socket.c b/migration/socket.c
index 1b6f5baefb..98e3ea1514 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -28,6 +28,8 @@
 #include "trace.h"
 #include "postcopy-ram.h"
 #include "options.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/qapi-visit-sockets.h"
 
 struct SocketOutgoingArgs {
 SocketAddress *saddr;
@@ -108,19 +110,19 @@ out:
 object_unref(OBJECT(sioc));
 }
 
-static void
-socket_start_outgoing_migration_internal(MigrationState *s,
- SocketAddress *saddr,
- Error **errp)
+void socket_start_outgoing_migration(MigrationState *s,
+ SocketAddress *saddr,
+ Error **errp)
 {
 QIOChannelSocket *sioc = qio_channel_socket_new();
 struct SocketConnectData *data = g_new0(struct SocketConnectData, 1);
+SocketAddress *addr = QAPI_CLONE(SocketAddress, saddr);
 
 data->s = s;
 
 /* in case previous migration leaked it */
 qapi_free_SocketAddress(outgoing_args.saddr);
-outgoing_args.saddr = saddr;
+outgoing_args.saddr = addr;
 
 if (saddr->type == SOCKET_ADDRESS_TYPE_INET) {
 data->hostname = g_strdup(saddr->u.inet.host);
@@ -135,18 +137,6 @@ socket_start_outgoing_migration_internal(MigrationState *s,
  NULL);
 }
 
-void socket_start_outgoing_migration(MigrationState *s,
- const char *str,
- Error **errp)
-{
-Error *err = NULL;
-SocketAddress *saddr = socket_parse(str, &err);
-if (!err) {
-socket_start_outgoing_migration_internal(s, saddr, &err);
-}
-error_propagate(errp, err);
-}
-
 static void socket_accept_incoming_

[PATCH v9 10/10] migration: modify test_multifd_tcp_none() to use new QAPI syntax.

2023-07-21 Thread Het Gala

modify multifd tcp common test to incorporate the new QAPI
syntax defined.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
---
 tests/qtest/migration-test.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index e256da1216..376fad8311 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2185,7 +2185,12 @@ test_migrate_precopy_tcp_multifd_start_common(QTestState 
*from,
 
 /* Start incoming migration from the 1st socket */
 qtest_qmp_assert_success(to, "{ 'execute': 'migrate-incoming',"
- "  'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
+ "  'arguments': { "
+ "  'channels': [ { 'channeltype': 'main',"
+ "  'addr': { 'transport': 'socket',"
+ "'type': 'inet',"
+ "'host': '127.0.0.1',"
+ "'port': '0' } } ] } }");
 
 return NULL;
 }
-- 
2.22.3

[PATCH v9 02/10] migration: convert migration 'uri' into 'MigrateAddress'

2023-07-21 Thread Het Gala

This patch parses 'migrate' and 'migrate-incoming' QAPI's 'uri'
string containing migration connection related information
and stores them inside well defined 'MigrateAddress' struct.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Daniel P. Berrangé 
---
 migration/exec.c  |  1 -
 migration/exec.h  |  4 
 migration/migration.c | 55 +++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/migration/exec.c b/migration/exec.c
index 2bf882bbe1..32f5143dfd 100644
--- a/migration/exec.c
+++ b/migration/exec.c
@@ -27,7 +27,6 @@
 #include "qemu/cutils.h"
 
 #ifdef WIN32
-const char *exec_get_cmd_path(void);
 const char *exec_get_cmd_path(void)
 {
 g_autofree char *detected_path = g_new(char, MAX_PATH);
diff --git a/migration/exec.h b/migration/exec.h
index b210ffde7a..736cd71028 100644
--- a/migration/exec.h
+++ b/migration/exec.h
@@ -19,6 +19,10 @@
 
 #ifndef QEMU_MIGRATION_EXEC_H
 #define QEMU_MIGRATION_EXEC_H
+
+#ifdef WIN32
+const char *exec_get_cmd_path(void);
+#endif
 void exec_start_incoming_migration(const char *host_port, Error **errp);
 
 void exec_start_outgoing_migration(MigrationState *s, const char *host_port,
diff --git a/migration/migration.c b/migration/migration.c
index 91bba630a8..5b3be767f0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -64,6 +64,7 @@
 #include "yank_functions.h"
 #include "sysemu/qtest.h"
 #include "options.h"
+#include "qemu/sockets.h"
 
 static NotifierList migration_state_notifiers =
 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
@@ -421,15 +422,64 @@ void migrate_add_address(SocketAddress *address)
   QAPI_CLONE(SocketAddress, address));
 }
 
+static bool migrate_uri_parse(const char *uri,
+  MigrationAddress **channel,
+  Error **errp)
+{
+g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
+SocketAddress *saddr = &addr->u.socket;
+InetSocketAddress *isock = &addr->u.rdma;
+strList **tail = &addr->u.exec.args;
+
+if (strstart(uri, "exec:", NULL)) {
+addr->transport = MIGRATION_ADDRESS_TYPE_EXEC;
+#ifdef WIN32
+QAPI_LIST_APPEND(tail, g_strdup(exec_get_cmd_path()));
+QAPI_LIST_APPEND(tail, g_strdup("/c"));
+#else
+QAPI_LIST_APPEND(tail, g_strdup("/bin/sh"));
+QAPI_LIST_APPEND(tail, g_strdup("-c"));
+#endif
+QAPI_LIST_APPEND(tail, g_strdup(uri + strlen("exec:")));
+} else if (strstart(uri, "rdma:", NULL)) {
+if (inet_parse(isock, uri + strlen("rdma:"), errp)) {
+qapi_free_InetSocketAddress(isock);
+return false;
+}
+addr->transport = MIGRATION_ADDRESS_TYPE_RDMA;
+} else if (strstart(uri, "tcp:", NULL) ||
+strstart(uri, "unix:", NULL) ||
+strstart(uri, "vsock:", NULL) ||
+strstart(uri, "fd:", NULL)) {
+addr->transport = MIGRATION_ADDRESS_TYPE_SOCKET;
+saddr = socket_parse(uri, errp);
+if (!saddr) {
+qapi_free_SocketAddress(saddr);
+return false;
+}
+} else {
+error_setg(errp, "unknown migration protocol: %s", uri);
+return false;
+}
+
+*channel = addr;
+return true;
+}
+
 static void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
 const char *p = NULL;
+g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
 
 /* URI is not suitable for migration? */
 if (!migration_channels_and_uri_compatible(uri, errp)) {
 return;
 }
 
+if (uri && !migrate_uri_parse(uri, &channel, errp)) {
+return;
+}
+
 qapi_event_send_migration(MIGRATION_STATUS_SETUP);
 if (strstart(uri, "tcp:", &p) ||
 strstart(uri, "unix:", NULL) ||
@@ -1641,12 +1691,17 @@ void qmp_migrate(const char *uri, bool has_blk, bool 
blk,
 Error *local_err = NULL;
 MigrationState *s = migrate_get_current();
 const char *p = NULL;
+g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
 
 /* URI is not suitable for migration? */
 if (!migration_channels_and_uri_compatible(uri, errp)) {
 return;
 }
 
+if (!migrate_uri_parse(uri, &channel, errp)) {
+return;
+}
+
 resume_requested = has_resume && resume;
 if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
  resume_requested, errp)) {
-- 
2.22.3

[PATCH v9 06/10] migration: New migrate and migrate-incoming argument 'channels'

2023-07-21 Thread Het Gala

MigrateChannelList allows to connect accross multiple interfaces.
Add MigrateChannelList struct as argument to migration QAPIs.

We plan to include multiple channels in future, to connnect
multiple interfaces. Hence, we choose 'MigrateChannelList'
as the new argument over 'MigrateChannel' to make migration
QAPIs future proof.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Acked-by: Markus Armbruster 
---
 migration/migration-hmp-cmds.c |   6 +-
 migration/migration.c  |  34 --
 qapi/migration.json| 109 -
 softmmu/vl.c   |   2 +-
 4 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 9885d7c9f7..49b150f33f 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -424,7 +424,7 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
 Error *err = NULL;
 const char *uri = qdict_get_str(qdict, "uri");
 
-qmp_migrate_incoming(uri, &err);
+qmp_migrate_incoming(uri, false, NULL, &err);
 
 hmp_handle_error(mon, err);
 }
@@ -705,8 +705,8 @@ void hmp_migrate(Monitor *mon, const QDict *qdict)
 const char *uri = qdict_get_str(qdict, "uri");
 Error *err = NULL;
 
-qmp_migrate(uri, !!blk, blk, !!inc, inc,
-false, false, true, resume, &err);
+qmp_migrate(uri, false, NULL, !!blk, blk, !!inc, inc,
+ false, false, true, resume, &err);
 if (hmp_handle_error(mon, err)) {
 return;
 }
diff --git a/migration/migration.c b/migration/migration.c
index f37b388876..bd3a93fc8c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -466,10 +466,22 @@ static bool migrate_uri_parse(const char *uri,
 return true;
 }
 
-static void qemu_start_incoming_migration(const char *uri, Error **errp)
+static void qemu_start_incoming_migration(const char *uri, bool has_channels,
+  MigrationChannelList *channels,
+  Error **errp)
 {
 g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
 
+/*
+ * Having preliminary checks for uri and channel
+ */
+if (uri && has_channels) {
+error_setg(errp, "'uri' and 'channels' arguments are mutually "
+   "exclusive; exactly one of the two should be present in "
+   "'migrate-incoming' qmp command ");
+return;
+}
+
 /* URI is not suitable for migration? */
 if (!migration_channels_and_uri_compatible(uri, errp)) {
 return;
@@ -1497,7 +1509,8 @@ void migrate_del_blocker(Error *reason)
 migration_blockers = g_slist_remove(migration_blockers, reason);
 }
 
-void qmp_migrate_incoming(const char *uri, Error **errp)
+void qmp_migrate_incoming(const char *uri, bool has_channels,
+  MigrationChannelList *channels, Error **errp)
 {
 Error *local_err = NULL;
 static bool once = true;
@@ -1515,7 +1528,7 @@ void qmp_migrate_incoming(const char *uri, Error **errp)
 return;
 }
 
-qemu_start_incoming_migration(uri, &local_err);
+qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
 
 if (local_err) {
 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
@@ -1551,7 +1564,7 @@ void qmp_migrate_recover(const char *uri, Error **errp)
  * only re-setup the migration stream and poke existing migration
  * to continue using that newly established channel.
  */
-qemu_start_incoming_migration(uri, errp);
+qemu_start_incoming_migration(uri, false, NULL, errp);
 }
 
 void qmp_migrate_pause(Error **errp)
@@ -1685,7 +1698,8 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
 return true;
 }
 
-void qmp_migrate(const char *uri, bool has_blk, bool blk,
+void qmp_migrate(const char *uri, bool has_channels,
+ MigrationChannelList *channels, bool has_blk, bool blk,
  bool has_inc, bool inc, bool has_detach, bool detach,
  bool has_resume, bool resume, Error **errp)
 {
@@ -1694,6 +1708,16 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 MigrationState *s = migrate_get_current();
 g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1);
 
+/*
+ * Having preliminary checks for uri and channel
+ */
+if (uri && has_channels) {
+error_setg(errp, "'uri' and 'channels' arguments are mutually "
+   "exclusive; exactly one of the two should be present in "
+   "'migrate' qmp command ");
+return;
+}
+
 /* URI is not suitable for migration? */
 if (!migration_channels_and_uri_compatible(uri, errp)) {
 return;
diff --git a/qapi/migration.json b/qapi/migration.json
index 0f4a54a7ed..9a365ac774 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1458,6 +1458,34 @@
 'exec': 'Migrat

[PATCH v9 04/10] migration: convert rdma backend to accept MigrateAddress

2023-07-21 Thread Het Gala

RDMA based transport backend for 'migrate'/'migrate-incoming' QAPIs
accept new wire protocol of MigrateAddress struct.

It is achived by parsing 'uri' string and storing migration parameters
required for RDMA connection into well defined InetSocketAddress struct.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Daniel P. Berrangé 
---
 migration/migration.c |  8 
 migration/rdma.c  | 34 --
 migration/rdma.h  |  6 --
 3 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index c4bcf8bbd7..8012f93f1b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -491,8 +491,8 @@ static void qemu_start_incoming_migration(const char *uri, 
Error **errp)
 fd_start_incoming_migration(saddr->u.fd.str, errp);
 }
 #ifdef CONFIG_RDMA
-} else if (strstart(uri, "rdma:", &p)) {
-rdma_start_incoming_migration(p, errp);
+} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
+rdma_start_incoming_migration(&channel->u.rdma, errp);
 #endif
 } else if (strstart(uri, "exec:", &p)) {
 exec_start_incoming_migration(p, errp);
@@ -1728,8 +1728,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err);
 }
 #ifdef CONFIG_RDMA
-} else if (strstart(uri, "rdma:", &p)) {
-rdma_start_outgoing_migration(s, p, &local_err);
+} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
+rdma_start_outgoing_migration(s, &channel->u.rdma, &local_err);
 #endif
 } else if (strstart(uri, "exec:", &p)) {
 exec_start_outgoing_migration(s, p, &local_err);
diff --git a/migration/rdma.c b/migration/rdma.c
index dd1c039e6c..4d64fae492 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -319,7 +319,6 @@ typedef struct RDMALocalBlocks {
 typedef struct RDMAContext {
 char *host;
 int port;
-char *host_port;
 
 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
 
@@ -2455,9 +2454,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
 rdma->channel = NULL;
 }
 g_free(rdma->host);
-g_free(rdma->host_port);
 rdma->host = NULL;
-rdma->host_port = NULL;
 }
 
 
@@ -2739,28 +2736,17 @@ static void qemu_rdma_return_path_dest_init(RDMAContext 
*rdma_return_path,
 rdma_return_path->is_return_path = true;
 }
 
-static void *qemu_rdma_data_init(const char *host_port, Error **errp)
+static void *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp)
 {
 RDMAContext *rdma = NULL;
-InetSocketAddress *addr;
 
-if (host_port) {
+if (saddr) {
 rdma = g_new0(RDMAContext, 1);
 rdma->current_index = -1;
 rdma->current_chunk = -1;
 
-addr = g_new(InetSocketAddress, 1);
-if (!inet_parse(addr, host_port, NULL)) {
-rdma->port = atoi(addr->port);
-rdma->host = g_strdup(addr->host);
-rdma->host_port = g_strdup(host_port);
-} else {
-ERROR(errp, "bad RDMA migration address '%s'", host_port);
-g_free(rdma);
-rdma = NULL;
-}
-
-qapi_free_InetSocketAddress(addr);
+rdma->host = g_strdup(saddr->host);
+rdma->port = atoi(saddr->port);
 }
 
 return rdma;
@@ -3359,6 +3345,7 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 .private_data_len = sizeof(cap),
  };
 RDMAContext *rdma_return_path = NULL;
+g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1);
 struct rdma_cm_event *cm_event;
 struct ibv_context *verbs;
 int ret = -EINVAL;
@@ -3374,13 +3361,16 @@ static int qemu_rdma_accept(RDMAContext *rdma)
 goto err_rdma_dest_wait;
 }
 
+isock->host = rdma->host;
+isock->port = g_strdup_printf("%d", rdma->port);
+
 /*
  * initialize the RDMAContext for return path for postcopy after first
  * connection request reached.
  */
 if ((migrate_postcopy() || migrate_return_path())
 && !rdma->is_return_path) {
-rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
+rdma_return_path = qemu_rdma_data_init(isock, NULL);
 if (rdma_return_path == NULL) {
 rdma_ack_cm_event(cm_event);
 goto err_rdma_dest_wait;
@@ -4113,7 +4103,8 @@ static void rdma_accept_incoming_migration(void *opaque)
 }
 }
 
-void rdma_start_incoming_migration(const char *host_port, Error **errp)
+void rdma_start_incoming_migration(InetSocketAddress *host_port,
+   Error **errp)
 {
 int ret;
 RDMAContext *rdma;
@@ -4159,13 +4150,12 @@ err:
 error_propagate(errp, local_err);
 if (rdma) {
 g_free(rdma->host);
-g_free(rdma->host_port);
 }
 g_free(rdma);
 }
 
 void rdma_start_outgoing_migration(void *opaq

[PATCH v9 07/10] migration: modify migration_channels_and_uri_compatible() for new QAPI syntax

2023-07-21 Thread Het Gala

migration_channels_and_uri_compatible() check for transport mechanism
suitable for multifd migration gets executed when the caller calls old
uri syntax. It needs it to be run when using the modern MigrateChannel
QAPI syntax too.

After URI -> 'MigrateChannel' :
migration_channels_and_uri_compatible() ->
migration_channels_and_transport_compatible() passes object as argument
and check for valid transport mechanism.

Suggested-by: Aravind Retnakaran 
Signed-off-by: Het Gala 
Reviewed-by: Daniel P. Berrangé 
---
 migration/migration.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index bd3a93fc8c..6e0a8beaf2 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -104,17 +104,20 @@ static bool migration_needs_multiple_sockets(void)
 return migrate_multifd() || migrate_postcopy_preempt();
 }
 
-static bool uri_supports_multi_channels(const char *uri)
+static bool transport_supports_multi_channels(SocketAddress *saddr)
 {
-return strstart(uri, "tcp:", NULL) || strstart(uri, "unix:", NULL) ||
-   strstart(uri, "vsock:", NULL);
+return saddr->type == SOCKET_ADDRESS_TYPE_INET ||
+   saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
+   saddr->type == SOCKET_ADDRESS_TYPE_VSOCK;
 }
 
 static bool
-migration_channels_and_uri_compatible(const char *uri, Error **errp)
+migration_channels_and_transport_compatible(MigrationAddress *addr,
+Error **errp)
 {
 if (migration_needs_multiple_sockets() &&
-!uri_supports_multi_channels(uri)) {
+(addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) &&
+!transport_supports_multi_channels(&addr->u.socket)) {
 error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
 return false;
 }
@@ -482,12 +485,12 @@ static void qemu_start_incoming_migration(const char 
*uri, bool has_channels,
 return;
 }
 
-/* URI is not suitable for migration? */
-if (!migration_channels_and_uri_compatible(uri, errp)) {
+if (uri && !migrate_uri_parse(uri, &channel, errp)) {
 return;
 }
 
-if (uri && !migrate_uri_parse(uri, &channel, errp)) {
+/* transport mechanism not suitable for migration? */
+if (!migration_channels_and_transport_compatible(channel, errp)) {
 return;
 }
 
@@ -1718,12 +1721,12 @@ void qmp_migrate(const char *uri, bool has_channels,
 return;
 }
 
-/* URI is not suitable for migration? */
-if (!migration_channels_and_uri_compatible(uri, errp)) {
+if (!migrate_uri_parse(uri, &channel, errp)) {
 return;
 }
 
-if (!migrate_uri_parse(uri, &channel, errp)) {
+/* transport mechanism not suitable for migration? */
+if (!migration_channels_and_transport_compatible(channel, errp)) {
 return;
 }
 
-- 
2.22.3

Re: [PULL 5/5] linux-user: Fix qemu-arm to run static armhf binaries

2023-07-21 Thread Michael Tokarev


19.07.2023 18:52, Helge Deller wrote:

qemu-user crashes immediately when running static binaries on the armhf
architecture. The problem is the memory layout where the executable is
loaded before the interpreter library, in which case the reserved brk
region clashes with the interpreter code and is released before qemu
tries to start the program.

At load time qemu calculates a brk value for interpreter and executable
each.  The fix is to choose the higher one of both.

Signed-off-by: Helge Deller 
Cc: Andreas Schwab 
Cc: qemu-sta...@nongnu.org
Reported-by:  venkata.p...@toshiba-tsip.com
Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1040981
---
  linux-user/elfload.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index a26200d9f3..94951630b1 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3615,6 +3615,13 @@ int load_elf_binary(struct linux_binprm *bprm, struct 
image_info *info)

  if (elf_interpreter) {
  load_elf_interp(elf_interpreter, &interp_info, bprm->buf);
+/*
+ * adjust brk address if the interpreter was loaded above the main
+ * executable, e.g. happens with static binaries on armhf
+ */
+if (interp_info.brk > info->brk) {
+info->brk = interp_info.brk;
+}


So, this is kinda amusing.
This broke arm64, ppc64el and s390x:

arm64$ ./qemu-aarch64 /bin/sh -c '/bin/ls -dCFl *[t]* >/dev/null'
qemu: uncaught target signal 11 (Segmentation fault) - core dumped
Segmentation fault

(it was just a quick test from debian qemu-user package).

Reverting this patch makes it work again..

*Sigh*.

(This is currently broken in debian unstable too, - this is how I
noticed the breakage).

Thanks,

/mjt

Re: [PATCH v2 3/4] vhost: Add high-level state save/load functions

2023-07-21 Thread Eugenio Perez Martin

On Wed, Jul 12, 2023 at 1:17 PM Hanna Czenczek  wrote:
>
> vhost_save_backend_state() and vhost_load_backend_state() can be used by
> vhost front-ends to easily save and load the back-end's state to/from
> the migration stream.
>
> Because we do not know the full state size ahead of time,
> vhost_save_backend_state() simply reads the data in 1 MB chunks, and
> writes each chunk consecutively into the migration stream, prefixed by
> its length.  EOF is indicated by a 0-length chunk.
>
> Signed-off-by: Hanna Czenczek 
> ---
>  include/hw/virtio/vhost.h |  35 +++
>  hw/virtio/vhost.c | 204 ++
>  2 files changed, 239 insertions(+)
>
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index d8877496e5..0c282abd4e 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -425,4 +425,39 @@ int vhost_set_device_state_fd(struct vhost_dev *dev,
>   */
>  int vhost_check_device_state(struct vhost_dev *dev, Error **errp);
>
> +/**
> + * vhost_save_backend_state(): High-level function to receive a vhost
> + * back-end's state, and save it in `f`.  Uses
> + * `vhost_set_device_state_fd()` to get the data from the back-end, and
> + * stores it in consecutive chunks that are each prefixed by their
> + * respective length (be32).  The end is marked by a 0-length chunk.
> + *
> + * Must only be called while the device and all its vrings are stopped
> + * (`VHOST_TRANSFER_STATE_PHASE_STOPPED`).
> + *
> + * @dev: The vhost device from which to save the state
> + * @f: Migration stream in which to save the state
> + * @errp: Potential error message
> + *
> + * Returns 0 on success, and -errno otherwise.
> + */
> +int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error 
> **errp);
> +
> +/**
> + * vhost_load_backend_state(): High-level function to load a vhost
> + * back-end's state from `f`, and send it over to the back-end.  Reads
> + * the data from `f` in the format used by `vhost_save_state()`, and
> + * uses `vhost_set_device_state_fd()` to transfer it to the back-end.
> + *
> + * Must only be called while the device and all its vrings are stopped
> + * (`VHOST_TRANSFER_STATE_PHASE_STOPPED`).
> + *
> + * @dev: The vhost device to which to send the sate
> + * @f: Migration stream from which to load the state
> + * @errp: Potential error message
> + *
> + * Returns 0 on success, and -errno otherwise.
> + */
> +int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error 
> **errp);
> +
>  #endif
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 756b6d55a8..332d49a310 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -2128,3 +2128,207 @@ int vhost_check_device_state(struct vhost_dev *dev, 
> Error **errp)
> "vhost transport does not support migration state transfer");
>  return -ENOSYS;
>  }
> +
> +int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error 
> **errp)
> +{
> +/* Maximum chunk size in which to transfer the state */
> +const size_t chunk_size = 1 * 1024 * 1024;
> +void *transfer_buf = NULL;
> +g_autoptr(GError) g_err = NULL;
> +int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
> +int ret;
> +
> +/* [0] for reading (our end), [1] for writing (back-end's end) */
> +if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
> +error_setg(errp, "Failed to set up state transfer pipe: %s",
> +   g_err->message);
> +ret = -EINVAL;
> +goto fail;
> +}
> +
> +read_fd = pipe_fds[0];
> +write_fd = pipe_fds[1];
> +
> +/*
> + * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
> + * We cannot check dev->suspended, because the back-end may not support
> + * suspending.
> + */
> +assert(!dev->started);
> +
> +/* Transfer ownership of write_fd to the back-end */
> +ret = vhost_set_device_state_fd(dev,
> +VHOST_TRANSFER_STATE_DIRECTION_SAVE,
> +VHOST_TRANSFER_STATE_PHASE_STOPPED,
> +write_fd,
> +&reply_fd,
> +errp);
> +if (ret < 0) {
> +error_prepend(errp, "Failed to initiate state transfer: ");
> +goto fail;
> +}
> +
> +/* If the back-end wishes to use a different pipe, switch over */
> +if (reply_fd >= 0) {
> +close(read_fd);
> +read_fd = reply_fd;
> +}
> +
> +transfer_buf = g_malloc(chunk_size);
> +
> +while (true) {
> +ssize_t read_ret;
> +
> +read_ret = read(read_fd, transfer_buf, chunk_size);
> +if (read_ret < 0) {
> +ret = -errno;
> +error_setg_errno(errp, -ret, "Failed to receive state");
> +goto fail;
> +}
> +
> +assert(read_ret <= chunk_size);
> +qemu_put_be32(f, read_ret);
> +
> +if (r

Re: [PULL 5/5] linux-user: Fix qemu-arm to run static armhf binaries

2023-07-21 Thread Michael Tokarev


21.07.2023 18:14, Michael Tokarev пишет:

19.07.2023 18:52, Helge Deller wrote:

qemu-user crashes immediately when running static binaries on the armhf
architecture. The problem is the memory layout where the executable is
loaded before the interpreter library, in which case the reserved brk
region clashes with the interpreter code and is released before qemu
tries to start the program.

At load time qemu calculates a brk value for interpreter and executable
each.  The fix is to choose the higher one of both.

Signed-off-by: Helge Deller 
Cc: Andreas Schwab 
Cc: qemu-sta...@nongnu.org
Reported-by:  venkata.p...@toshiba-tsip.com
Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1040981
---
  linux-user/elfload.c | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index a26200d9f3..94951630b1 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3615,6 +3615,13 @@ int load_elf_binary(struct linux_binprm *bprm, struct 
image_info *info)

  if (elf_interpreter) {
  load_elf_interp(elf_interpreter, &interp_info, bprm->buf);
+    /*
+ * adjust brk address if the interpreter was loaded above the main
+ * executable, e.g. happens with static binaries on armhf
+ */
+    if (interp_info.brk > info->brk) {
+    info->brk = interp_info.brk;
+    }


I've added printf() inside this if() block, on arm64 it produces:

  fixing brk: interp_info.brk=0x5502875358 info=>brk=0x5500032ec0

FWIW,

/mjt

Re: [PATCH 3/6] vhost: Do not reset suspended devices on stop

2023-07-21 Thread Eugenio Perez Martin

On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek  wrote:
>
> Move the `suspended` field from vhost_vdpa into the global vhost_dev
> struct, so vhost_dev_stop() can check whether the back-end has been
> suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
> there is no need to reset it; the reset is just a fall-back to stop
> device operations for back-ends that do not support suspend.
>
> Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
> when the device is re-started, we still have to do the reset to have it
> un-suspend.
>
> Signed-off-by: Hanna Czenczek 
> ---
>  include/hw/virtio/vhost-vdpa.h |  2 --
>  include/hw/virtio/vhost.h  |  8 
>  hw/virtio/vhost-vdpa.c | 11 +++
>  hw/virtio/vhost.c  |  8 +++-
>  4 files changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> index e64bfc7f98..72c3686b7f 100644
> --- a/include/hw/virtio/vhost-vdpa.h
> +++ b/include/hw/virtio/vhost-vdpa.h
> @@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
>  bool shadow_vqs_enabled;
>  /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA 
> */
>  bool shadow_data;
> -/* Device suspended successfully */
> -bool suspended;
>  /* IOVA mapping used by the Shadow Virtqueue */
>  VhostIOVATree *iova_tree;
>  GPtrArray *shadow_vqs;
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 6a173cb9fa..69bf59d630 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -120,6 +120,14 @@ struct vhost_dev {
>  uint64_t backend_cap;
>  /* @started: is the vhost device started? */
>  bool started;
> +/**
> + * @suspended: Whether the vhost device is currently suspended.  Set
> + * and reset by implementations (vhost-user, vhost-vdpa, ...), which
> + * are supposed to automatically suspend/resume in their
> + * vhost_dev_start handlers as required.  Must also be cleared when
> + * the device is reset.
> + */
> +bool suspended;
>  bool log_enabled;
>  uint64_t log_size;
>  Error *migration_blocker;
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 7b7dee468e..f7fd19a203 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev 
> *dev,
>
>  static int vhost_vdpa_reset_device(struct vhost_dev *dev)
>  {
> -struct vhost_vdpa *v = dev->opaque;
>  int ret;
>  uint8_t status = 0;
>
>  ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
>  trace_vhost_vdpa_reset_device(dev);
> -v->suspended = false;
> +dev->suspended = false;
>  return ret;
>  }
>
> @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
>  if (unlikely(r)) {
>  error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
>  } else {
> -v->suspended = true;
> +dev->suspended = true;
>  return;
>  }
>  }
> @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, 
> bool started)
>  return -1;
>  }
>  vhost_vdpa_set_vring_ready(dev);
> +if (dev->suspended) {
> +/* TODO: When RESUME is available, use it instead of resetting */
> +vhost_vdpa_reset_status(dev);

How is that we reset the status at each vhost_vdpa_dev_start? That
will clean all the vqs configured, features negotiated, etc. in the
vDPA device. Or am I missing something?

I'm totally ok with the move of "suspended" member.

Thanks!


> +}
>  } else {
>  vhost_vdpa_suspend(dev);
>  vhost_vdpa_svqs_stop(dev);
> @@ -1400,7 +1403,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev 
> *dev,
>  return 0;
>  }
>
> -if (!v->suspended) {
> +if (!dev->suspended) {
>  /*
>   * Cannot trust in value returned by device, let vhost recover used
>   * idx from guest.
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index abf0d03c8d..2e28e58da7 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -2059,7 +2059,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, 
> VirtIODevice *vdev, bool vrings)
>   hdev->vqs + i,
>   hdev->vq_index + i);
>  }
> -if (hdev->vhost_ops->vhost_reset_status) {
> +
> +/*
> + * If we failed to successfully stop the device via SUSPEND (should have
> + * been attempted by `vhost_dev_start(hdev, false)`), reset it to stop 
> it.
> + * Stateful devices where this would be problematic must implement 
> SUSPEND.
> + */
> +if (!hdev->suspended && hdev->vhost_ops->vhost_reset_status) {
>  hdev->vhost_ops->vhost_reset_status(hdev);
>  }
>
> --
> 2.41.0
>

Re: [PATCH v6 4/5] intel_iommu: allow Extended Interrupt Mode when using userspace APIC

2023-07-21 Thread Bui Quang Minh


On 7/21/23 03:47, Peter Xu wrote:

On Mon, Jul 17, 2023 at 11:29:56PM +0700, Bui Quang Minh wrote:

On 7/17/23 17:47, Joao Martins wrote:

+Peter, +Jason (intel-iommu maintainer/reviewer)


Thanks for copying me, Joan.



On 15/07/2023 16:22, Bui Quang Minh wrote:

As userspace APIC now supports x2APIC, intel interrupt remapping
hardware can be set to EIM mode when userspace local APIC is used.

Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Bui Quang Minh 
---
   hw/i386/intel_iommu.c | 11 ---
   1 file changed, 11 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index dcc334060c..5e576f6059 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4043,17 +4043,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 && x86_iommu_ir_supported(x86_iommu) ?
 ON_OFF_AUTO_ON : 
ON_OFF_AUTO_OFF;
   }
-if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
-if (!kvm_irqchip_is_split()) {
-error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
-return false;
-}
-if (!kvm_enable_x2apic()) {
-error_setg(errp, "eim=on requires support on the KVM side"
- "(X2APIC_API, first shipped in v4.7)");
-return false;
-}
-}

Given commit 20ca47429e ('Revert "intel_iommu: Fix irqchip / X2APIC
configuration checks"'), won't we regress behaviour again  for the accel=kvm
case by dropping the kvm_enable_x2apic() call here?

Perhaps if we support userspace APIC with TCG the check just needs to be redone
to instead avoid always requiring kvm e.g.:

if (kvm_irqchip_in_kernel()) {
  error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"
 "(X2APIC_API, first shipped in v4.7)");
}

if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
  error_setg(errp, "eim=on requires support on the KVM side"
 "(X2APIC_API, first shipped in v4.7)");
  return false;
}


Thank you for your review. I think the check for kvm_irqchip_in_kernel() is
not correct, AFAIK, kvm_irqchip_is_split() == true also means
kvm_irqchip_in_kernel() == true on x86. To check if kernel-irqchip = on, we
need to do something like in x86_iommu_realize

 bool irq_all_kernel = kvm_irqchip_in_kernel() &&
!kvm_irqchip_is_split();

The original check for !kvm_irqchip_is_split means emulated/userspace APIC.
It's because to reach that check x86_iommu_ir_supported(...) == true and
x86_iommu_ir_supported(...) == true is not supported when kernel-irqchip =
on (there is a check for this in x86_iommu_realize)

So I think we need to change the check to

 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
 if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
 error_setg(errp, "eim=on requires support on the KVM side"
  "(X2APIC_API, first shipped in v4.7)");
 return false;
 }
 }

Is it OK?


Mostly to me, except that we may also want to keep failing if all irq chips
are in kernel?


Yes, that behavior does not change after this patch. x86_iommu_realize 
in the parent type TYPE_X86_IOMMU_DEVICE fails when interrupt remapping 
is on and all irq chips are in kernel already.


static void x86_iommu_realize(DeviceState *dev, Error **errp)
{
/* ... */
/* Both Intel and AMD IOMMU IR only support "kernel-irqchip 
{off|split}" */

if (x86_iommu_ir_supported(x86_iommu) && irq_all_kernel) {
error_setg(errp, "Interrupt Remapping cannot work with "
 "kernel-irqchip=on, please use 'split|off'.");
return;
}
/* ... */
}


So in case we reach here in with the interrupt remapping is on and 
decide whether eim is on or not, it cannot be that irq chips are all in 
kernel.


Thanks,
Quang Minh.

Re: [PATCH v3 1/6] throttle: introduce enum ThrottleType

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

Use enum ThrottleType instead of number index.

Reviewed-by: Alberto Garcia
Signed-off-by: zhenwei pi
---
  include/qemu/throttle.h | 11 ---
  util/throttle.c | 16 +---
  2 files changed, 17 insertions(+), 10 deletions(-)


[...]


diff --git a/util/throttle.c b/util/throttle.c
index 81f247a8d1..5642e61763 100644
--- a/util/throttle.c
+++ b/util/throttle.c
@@ -199,10 +199,12 @@ static bool throttle_compute_timer(ThrottleState *ts,
  void throttle_timers_attach_aio_context(ThrottleTimers *tt,
  AioContext *new_context)
  {
-tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
-  tt->read_timer_cb, tt->timer_opaque);
-tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
-  tt->write_timer_cb, tt->timer_opaque);
+tt->timers[THROTTLE_READ] =
+aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+  tt->timer_cb[THROTTLE_READ], tt->timer_opaque);
+tt->timers[THROTTLE_WRITE] =
+aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+  tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque);


This could benefit from the new structure:

```
for (int i = 0; i < THROTTLE_MAX; i++) {
    tt->timers[i] = aio_timer_new(..., tt->timer_cb[i], ...);
}
```

Even more so after patch 3.  Still, optional, of course, so either way:

Reviewed-by: Hanna Czenczek 

Of note is that we still have instances in util/throttle.c and 
block/throttle-groups.c that don’t use these enums to access the 
tt->timers[] array, and that’s a bit unfortunate now (i.e. 
`tt->timers[is_write]` should rather be `tt->timers[is_write ? 
THROTTLE_WRITE : THROTTLE_READ]` instead).  But functionally it’s OK and 
I believe patches 3 and 6 do address this.

Re: [PATCH v3 5/6] cryptodev: use NULL throttle timer cb for read direction

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

Operations on a crytpodev are considered as *write* only, the callback


s/crytpodev/cryptodev/


of read direction is never invoked. Use NULL instead of an unreachable
path(cryptodev_backend_throttle_timer_cb on read direction).

Reviewed-by: Alberto Garcia 
Signed-off-by: zhenwei pi 
---
  backends/cryptodev.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index 7d29517843..5cfa25c61c 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -331,8 +331,7 @@ static void cryptodev_backend_set_throttle(CryptoDevBackend 
*backend, int field,
  if (!enabled) {
  throttle_init(&backend->ts);
  throttle_timers_init(&backend->tt, qemu_get_aio_context(),
- QEMU_CLOCK_REALTIME,
- cryptodev_backend_throttle_timer_cb, /* FIXME */


Now it’ll be gone (good), but if you happen to add a FIXME again, it 
would be nice to explain in the same comment in the code what exactly 
needs fixing, and why it isn‘t being fixed at this point.



+ QEMU_CLOCK_REALTIME, NULL,
   cryptodev_backend_throttle_timer_cb, backend);
  }


With the typo in the commit message fixed:

Reviewed-by: Hanna Czenczek

Re: [PATCH v3 3/6] throttle: support read-only and write-only

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

Only one direction is necessary in several scenarios:
- a read-only disk
- operations on a device are considered as *write* only. For example,
   encrypt/decrypt/sign/verify operations on a cryptodev use a single
   *write* timer(read timer callback is defined, but never invoked).

Allow a single direction in throttle, this reduces memory, and uplayer
does not need a dummy callback any more.

Reviewed-by: Alberto Garcia
Signed-off-by: zhenwei pi
---
  util/throttle.c | 32 ++--
  1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/util/throttle.c b/util/throttle.c
index 5642e61763..c0bd0c26c3 100644
--- a/util/throttle.c
+++ b/util/throttle.c
@@ -199,12 +199,17 @@ static bool throttle_compute_timer(ThrottleState *ts,
  void throttle_timers_attach_aio_context(ThrottleTimers *tt,
  AioContext *new_context)
  {
-tt->timers[THROTTLE_READ] =
-aio_timer_new(new_context, tt->clock_type, SCALE_NS,
-  tt->timer_cb[THROTTLE_READ], tt->timer_opaque);
-tt->timers[THROTTLE_WRITE] =
-aio_timer_new(new_context, tt->clock_type, SCALE_NS,
-  tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque);
+if (tt->timer_cb[THROTTLE_READ]) {
+tt->timers[THROTTLE_READ] =
+aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+  tt->timer_cb[THROTTLE_READ], tt->timer_opaque);
+}
+
+if (tt->timer_cb[THROTTLE_WRITE]) {
+tt->timers[THROTTLE_WRITE] =
+aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+  tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque);
+}


 I think a `for (int i = 0; i < THROTTLE_MAX; i++)` loop would make 
this nicer.  (Again: Optional.)



  }
  
  /*


[...]


@@ -272,7 +280,7 @@ void throttle_timers_destroy(ThrottleTimers *tt)
  /* is any throttling timer configured */
  bool throttle_timers_are_initialized(ThrottleTimers *tt)
  {
-if (tt->timers[0]) {
+if (tt->timers[THROTTLE_READ] || tt->timers[THROTTLE_WRITE]) {


Not wrong, but I’d prefer the more general

```
for (int i = 0; i < THROTTLE_MAX; i++) {
    if (tt->timers[i]) {
    return true;
    }
}
return false;
```


  return true;
  }
  
@@ -424,8 +432,12 @@ bool throttle_schedule_timer(ThrottleState *ts,

  {
  int64_t now = qemu_clock_get_ns(tt->clock_type);
  int64_t next_timestamp;
+QEMUTimer *timer;
  bool must_wait;
  
+timer = is_write ? tt->timers[THROTTLE_WRITE] : tt->timers[THROTTLE_READ];


Could be shorter as `timer = tt->timers[is_write ? THROTTLE_WRITE : 
THROTTLE_READ];`.


Anyway, I only have style suggestion to offer, so either way:

Reviewed-by: Hanna Czenczek 


+assert(timer);
+
  must_wait = throttle_compute_timer(ts,
 is_write,
 now,

Re: [PATCH v3 4/6] test-throttle: test read only and write only

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

Reviewed-by: Alberto Garcia 
Signed-off-by: zhenwei pi 
---
  tests/unit/test-throttle.c | 66 ++
  1 file changed, 66 insertions(+)


Reviewed-by: Hanna Czenczek

Re: [PATCH v3 2/6] test-throttle: use enum ThrottleType

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

Use enum ThrottleType instead in the throttle test codes.

Reviewed-by: Alberto Garcia 
Signed-off-by: zhenwei pi 
---
  tests/unit/test-throttle.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)


Reviewed-by: Hanna Czenczek

Re: [PATCH 5/6] vhost-vdpa: Match vhost-user's status reset

2023-07-21 Thread Eugenio Perez Martin

On Wed, Jul 19, 2023 at 4:10 PM Hanna Czenczek  wrote:
>
> On 18.07.23 16:50, Stefan Hajnoczi wrote:
> > On Tue, Jul 11, 2023 at 05:52:27PM +0200, Hanna Czenczek wrote:
> >> vhost-vdpa and vhost-user differ in how they reset the status in their
> >> respective vhost_reset_status implementations: vhost-vdpa zeroes it,
> >> then re-adds the S_ACKNOWLEDGE and S_DRIVER config bits.  S_DRIVER_OK is
> >> then set in vhost_vdpa_dev_start().
> >>
> >> vhost-user in contrast just zeroes the status, and does no re-add any
> >> config bits until vhost_user_dev_start() (where it does re-add all of
> >> S_ACKNOWLEDGE, S_DRIVER, and S_DRIVER_OK).
> >>
> >> There is no documentation for vhost_reset_status, but its only caller is
> >> vhost_dev_stop().  So apparently, the device is to be stopped after
> >> vhost_reset_status, and therefore it makes more sense to keep the status
> >> field fully cleared until the back-end is re-started, which is how
> >> vhost-user does it.  Make vhost-vdpa do the same -- if nothing else it's
> >> confusing to have both vhost implementations handle this differently.
> >>

Ok now I understand better why you moved the call to vhost_vdpa_reset_status.

Maybe we can move the vhost_vdpa_add_status(dev, _S_ACKNOWLEDGE |
_S_DRIVER) to vhost_vdpa_set_features then, and let reset_status call
vhost_vdpa_reset_device directly. But I'm not 100% sure if I'm missing
something, it would need testing.

Thanks!

> >> Signed-off-by: Hanna Czenczek 
> >> ---
> >>   hw/virtio/vhost-vdpa.c | 6 +++---
> >>   1 file changed, 3 insertions(+), 3 deletions(-)
> > Hi Hanna,
> > The VIRTIO spec lists the Device Initialization sequence including the
> > bits set in the Device Status Register here:
> > https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-1070001
> >
> > ACKNOWLEDGE and DRIVER must be set before FEATURES_OK. DRIVER_OK is set
> > after FEATURES_OK.
> >
> > The driver may read the Device Configuration Space once ACKNOWLEDGE and
> > DRIVER are set.
> >
> > QEMU's vhost code should follow this sequence (especially for vDPA where
> > full VIRTIO devices are implemented).
> >
> > vhost-user is not faithful to the VIRTIO spec here. That's probably due
> > to the fact that vhost-user didn't have the concept of the Device Status
> > Register until recently and back-ends mostly ignore it.
> >
> > Please do the opposite of this patch: bring vhost-user in line with the
> > VIRTIO specification so that the Device Initialization sequence is
> > followed correctly. I think vhost-vdpa already does the right thing.
>
> Hm.  This sounds all very good, but what leaves me lost is the fact that
> we never actually expose the status field to the guest, as far as I can
> see.  We have no set_status callback, and as written in the commit
> message, the only caller of reset_status is vhost_dev_stop().  So the
> status field seems completely artificial in vhost right now.  That is
> why I’m wondering what the flags even really mean.
>
> Another point I made in the commit message is that it is strange that we
> reset the status to 0, and then add the ACKNOWLEDGE and DRIVER while the
> VM is still stopped.  It doesn’t make sense to me to set these flags
> while the guest driver is not operative.
>
> If what you’re saying is that we must set FEATURES_OK only after
> ACKNOWLEDGE and DRIVER, wouldn’t it be still better to set all of these
> flags only in vhost_*_dev_start(), but do it in two separate SET_STATUS
> calls?
>
> (You mentioned the configuration space – is that accessed while between
> vhost_dev_stop and vhost_dev_start?)
>
> Hanna
>
> >> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >> index f7fd19a203..0cde8b40de 100644
> >> --- a/hw/virtio/vhost-vdpa.c
> >> +++ b/hw/virtio/vhost-vdpa.c
> >> @@ -1294,8 +1294,6 @@ static void vhost_vdpa_reset_status(struct vhost_dev 
> >> *dev)
> >>   }
> >>
> >>   vhost_vdpa_reset_device(dev);
> >> -vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> >> -   VIRTIO_CONFIG_S_DRIVER);
> >>   memory_listener_unregister(&v->listener);
> >>   }
> >>
> >> @@ -1334,7 +1332,9 @@ static int vhost_vdpa_dev_start(struct vhost_dev 
> >> *dev, bool started)
> >>   }
> >>   memory_listener_register(&v->listener, dev->vdev->dma_as);
> >>
> >> -return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
> >> +return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> >> +  VIRTIO_CONFIG_S_DRIVER |
> >> +  VIRTIO_CONFIG_S_DRIVER_OK);
> >>   }
> >>
> >>   return 0;
> >> --
> >> 2.41.0
> >>
>

Re: [PATCH v3 6/6] throttle: use enum ThrottleType instead of bool is_write

2023-07-21 Thread Hanna Czenczek


On 13.07.23 08:41, zhenwei pi wrote:

enum ThrottleType is already there, use ThrottleType instead of
'bool is_write' for throttle API, also modify related codes from
block, fsdev, cryptodev and tests.

Signed-off-by: zhenwei pi 
---
  backends/cryptodev.c|  9 +
  block/throttle-groups.c |  6 --
  fsdev/qemu-fsdev-throttle.c |  8 +---
  include/qemu/throttle.h |  4 ++--
  tests/unit/test-throttle.c  |  4 ++--
  util/throttle.c | 30 --
  6 files changed, 34 insertions(+), 27 deletions(-)


Something not addressed in this patch that I would like to see: 
schedule_next_request() in block/throttle-groups.c runs 
`timer_mod(tt->timers[is_write], now)`.  I think that at least should be 
`tt->timers[is_write ? THROTTLE_WRITE : THROTTLE_READ]`.  Even better 
would be to have it take a `ThrottleType` instead of `bool is_write`, 
too, and use this enum throughout throttle-groups.c, too (i.e. for 
ThrottleGroupMember.throttled_reqs[], 
ThrottleGroupMember.pending_reqs[], ThrottleGroup.tokens[], and 
ThrottleGroup.any_timer_armed[]).  Then throttle_group_schedule_timer() 
could also take a `ThrottleType`.


But I understand asking for throttle-groups.c to be ThrottleType-ified 
is very much, so this is just a suggestion.  But I do ask for that one 
`timer_mod()` call to use THROTTLE_READ and THROTTLE_WRITE to index 
`tt->timers[]` instead of `is_write` directly.


[...]


diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index fb203c3ced..429b9d1dae 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -270,6 +270,7 @@ static bool 
throttle_group_schedule_timer(ThrottleGroupMember *tgm,
  ThrottleState *ts = tgm->throttle_state;
  ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
  ThrottleTimers *tt = &tgm->throttle_timers;
+ThrottleType throttle = is_write ? THROTTLE_WRITE : THROTTLE_READ;


Again, another stylistic suggestion (for all similar places in this 
patch): It isn’t clear what just `throttle` means. `throttle_direction` 
for example would be clear, or maybe just `direction`, this is throttle 
code, so it’s clear that everything that isn’t given a context is about 
throttling (it wasn’t `is_throttled_write` either, after all, but just 
`is_write`).



  bool must_wait;
  
  if (qatomic_read(&tgm->io_limits_disabled)) {


[...]


diff --git a/util/throttle.c b/util/throttle.c
index c0bd0c26c3..5e4dc0bfdd 100644
--- a/util/throttle.c
+++ b/util/throttle.c
@@ -136,11 +136,11 @@ int64_t throttle_compute_wait(LeakyBucket *bkt)
  
  /* This function compute the time that must be waited while this IO

   *
- * @is_write:   true if the current IO is a write, false if it's a read
+ * @throttle:   throttle type


I’m not too happy about “throttle type” as a description here, because 
“type” can be anything (naïvely, I’d rather interpret it to mean the 
algorithm used for throttling, or whether we’re throttling on bytes or 
IOPS).  “throttle direction” would be better.  This also applies to 
other functions’ interface documentation.


(Yes, this also means that I don’t like the type name ThrottleType very 
much, and would prefer it to be ThrottleDirection, but that would be an 
invasive change all over this series (when Berto has already reviewed 
it), so I’m not really asking for a change there.)



   * @ret:time to wait
   */
  static int64_t throttle_compute_wait_for(ThrottleState *ts,
- bool is_write)
+ ThrottleType throttle)
  {
  BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,


Since we’re now using a ThrottleType to index the first dimension of 
this array, I’d prefer to make this to_check[THROTTLE_MAX][4].


(Also, but very much unrelated to this patch: Why isn’t this lookup 
table a `const static`?)



 THROTTLE_OPS_TOTAL,


[...]


@@ -460,10 +461,10 @@ bool throttle_schedule_timer(ThrottleState *ts,
  
  /* do the accounting for this operation

   *
- * @is_write: the type of operation (read/write)
+ * @throttle: throttle type
   * @size: the size of the operation
   */
-void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
+void throttle_account(ThrottleState *ts, ThrottleType throttle, uint64_t size)
  {
  const BucketType bucket_types_size[2][2] = {
  { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ },


Like in throttle_compute_wait_for(), I’d prefer bucket_types_size and 
bucket_types_units to have [THROTTLE_MAX] in the first dimension.


(Interesting that these lookup tables are const, but not static.  I 
think they should be static, too.)


Hanna

Re: [PATCH 3/6] vhost: Do not reset suspended devices on stop

2023-07-21 Thread Hanna Czenczek


On 21.07.23 17:25, Eugenio Perez Martin wrote:

On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek  wrote:

Move the `suspended` field from vhost_vdpa into the global vhost_dev
struct, so vhost_dev_stop() can check whether the back-end has been
suspended by `vhost_ops->vhost_dev_start(hdev, false)`.  If it has,
there is no need to reset it; the reset is just a fall-back to stop
device operations for back-ends that do not support suspend.

Unfortunately, for vDPA specifically, RESUME is not yet implemented, so
when the device is re-started, we still have to do the reset to have it
un-suspend.

Signed-off-by: Hanna Czenczek 
---
  include/hw/virtio/vhost-vdpa.h |  2 --
  include/hw/virtio/vhost.h  |  8 
  hw/virtio/vhost-vdpa.c | 11 +++
  hw/virtio/vhost.c  |  8 +++-
  4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index e64bfc7f98..72c3686b7f 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -42,8 +42,6 @@ typedef struct vhost_vdpa {
  bool shadow_vqs_enabled;
  /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA 
*/
  bool shadow_data;
-/* Device suspended successfully */
-bool suspended;
  /* IOVA mapping used by the Shadow Virtqueue */
  VhostIOVATree *iova_tree;
  GPtrArray *shadow_vqs;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 6a173cb9fa..69bf59d630 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -120,6 +120,14 @@ struct vhost_dev {
  uint64_t backend_cap;
  /* @started: is the vhost device started? */
  bool started;
+/**
+ * @suspended: Whether the vhost device is currently suspended.  Set
+ * and reset by implementations (vhost-user, vhost-vdpa, ...), which
+ * are supposed to automatically suspend/resume in their
+ * vhost_dev_start handlers as required.  Must also be cleared when
+ * the device is reset.
+ */
+bool suspended;
  bool log_enabled;
  uint64_t log_size;
  Error *migration_blocker;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7b7dee468e..f7fd19a203 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev,

  static int vhost_vdpa_reset_device(struct vhost_dev *dev)
  {
-struct vhost_vdpa *v = dev->opaque;
  int ret;
  uint8_t status = 0;

  ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
  trace_vhost_vdpa_reset_device(dev);
-v->suspended = false;
+dev->suspended = false;
  return ret;
  }

@@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev)
  if (unlikely(r)) {
  error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
  } else {
-v->suspended = true;
+dev->suspended = true;
  return;
  }
  }
@@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, 
bool started)
  return -1;
  }
  vhost_vdpa_set_vring_ready(dev);
+if (dev->suspended) {
+/* TODO: When RESUME is available, use it instead of resetting */
+vhost_vdpa_reset_status(dev);

How is that we reset the status at each vhost_vdpa_dev_start? That
will clean all the vqs configured, features negotiated, etc. in the
vDPA device. Or am I missing something?


What alternative do you propose?  We don’t have RESUME for vDPA in qemu, 
but we somehow need to lift the previous SUSPEND so the device will 
again respond to guest requests, do we not?


But more generally, is this any different from what is done before this 
patch?  Before this patch, vhost_dev_stop() unconditionally invokes 
vhost_reset_status(), so the device is reset in every stop/start cycle, 
that doesn’t change.  And we still won’t reset it on the first 
vhost_dev_start(), because dev->suspended will be false then, only on 
subsequent stop/start cycles, as before.  So the only difference is that 
now the device is reset on start, not on stop.


Hanna

Re: [PATCH v2 3/4] vhost: Add high-level state save/load functions

2023-07-21 Thread Hanna Czenczek


On 21.07.23 17:18, Eugenio Perez Martin wrote:

On Wed, Jul 12, 2023 at 1:17 PM Hanna Czenczek  wrote:

vhost_save_backend_state() and vhost_load_backend_state() can be used by
vhost front-ends to easily save and load the back-end's state to/from
the migration stream.

Because we do not know the full state size ahead of time,
vhost_save_backend_state() simply reads the data in 1 MB chunks, and
writes each chunk consecutively into the migration stream, prefixed by
its length.  EOF is indicated by a 0-length chunk.

Signed-off-by: Hanna Czenczek 
---
  include/hw/virtio/vhost.h |  35 +++
  hw/virtio/vhost.c | 204 ++
  2 files changed, 239 insertions(+)


[...]


+int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
+{
+size_t transfer_buf_size = 0;
+void *transfer_buf = NULL;
+g_autoptr(GError) g_err = NULL;
+int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
+int ret;


[...]


+ret = 0;
+fail:
+g_free(transfer_buf);

Nitpick, but transfer_buf could have the g_autofree parameter.


Ah, sure, thanks!

Hanna

Re: [PATCH v2] target/ppc: Generate storage interrupts for radix RC changes

2023-07-21 Thread Daniel Henrique Barboza





On 7/20/23 23:38, Nicholas Piggin wrote:

On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:

On 7/12/23 11:56 AM, Cédric Le Goater wrote:

Hello Shawn,

On 7/12/23 18:13, Shawn Anastasio wrote:

Change radix model to always generate a storage interrupt when the R/C
bits are not set appropriately in a PTE instead of setting the bits
itself.  According to the ISA both behaviors are valid, but in practice
this change more closely matches behavior observed on the POWER9 CPU.


How did you spotted this dark corner case in emulation ? Do you have
MMU unit tests ?


I'm currently porting Xen to Power and have been using QEMU's powernv
model extensively for early bring up. I noticed the issue when my radix
implementation worked in QEMU but failed on actual hardware since I
didn't have a proper storage interrupt handler implemented.


Cool. This was on my todo list because we rely on it for nested HV
KVM too.

I actually didn't know about that odd effLIPD=0 exception, but it
looks right. How did you test that, by running with MSR[HV]=0 and
LPIDR=0 ?

For the patch,

Reviewed-by: Nicholas Piggin 


Cedric, do you think this is 8.1 material?


Daniel



Thanks,
Nick

Re: [PATCH v2] target/ppc: Generate storage interrupts for radix RC changes

2023-07-21 Thread Cédric Le Goater


On 7/21/23 18:08, Daniel Henrique Barboza wrote:



On 7/20/23 23:38, Nicholas Piggin wrote:

On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:

On 7/12/23 11:56 AM, Cédric Le Goater wrote:

Hello Shawn,

On 7/12/23 18:13, Shawn Anastasio wrote:

Change radix model to always generate a storage interrupt when the R/C
bits are not set appropriately in a PTE instead of setting the bits
itself.  According to the ISA both behaviors are valid, but in practice
this change more closely matches behavior observed on the POWER9 CPU.


How did you spotted this dark corner case in emulation ? Do you have
MMU unit tests ?


I'm currently porting Xen to Power and have been using QEMU's powernv
model extensively for early bring up. I noticed the issue when my radix
implementation worked in QEMU but failed on actual hardware since I
didn't have a proper storage interrupt handler implemented.


Cool. This was on my todo list because we rely on it for nested HV
KVM too.

I actually didn't know about that odd effLIPD=0 exception, but it
looks right. How did you test that, by running with MSR[HV]=0 and
LPIDR=0 ?

For the patch,

Reviewed-by: Nicholas Piggin 


Cedric, do you think this is 8.1 material?


No. It is improving the MMU model but it is not fixing something that
was "broken" in QEMU. I would leave it for 8.2.

However, these are 8.1 material :

 https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364971
 https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364984

but they lack a Fixes tag. I asked Nick.

C.

[PATCH 0/4] CXL: SK hynix Niagara MHSLD Device

2023-07-21 Thread Gregory Price

Base repo: https://gitlab.com/jic23/qemu
Base branch: cxl-2023-07-17

This patch set includes an emulation of the SK hynix Niagara MHSLD
platform with custom CCI commands that allow for isolation of memory
blocks between attached hosts.

There are 4 total patches in this set:
1 & 2: Modifications to the CCI interface to allow the addition of
   custom CCI commands to an existing CCI command set.
3: Minimum MHD cci support for a type-3 device
4: The SK hynix Niagara Device

The changes to the core device and cci interface are very limited,
and this provides a clean way for developers to add custom CCI commands
to a device while retaining the possiblity to upstream the model.

This device allows hosts to request memory blocks directly from the
device, rather than requiring full the DCD command set.  As a matter of
simplicity, this is beneficial to for testing and applications of 
dynamic memory pooling on top of the 1.1 and 2.0 specification.

Note that these CCI commands are not servicable without the kernel
allowing raw CXL commands to be passed through the mailbox driver,
so users should enable `CONFIG_CXL_MEM_RAW_COMMANDS=y` on the kernel
of their QEMU instance.

Signed-off-by: Gregory Price 

Gregory Price (4):
  cxl/mailbox: change CCI cmd set structure to be a member, not a
refernce
  cxl/mailbox: interface to add CCI commands to an existing CCI
  cxl/type3: minimum MHD cci support
  cxl/vendor: SK hynix Niagara Multi-Headed SLD Device

 hw/cxl/Kconfig  |   4 +
 hw/cxl/cxl-mailbox-utils.c  |  90 +++-
 hw/cxl/meson.build  |   2 +
 hw/cxl/vendor/meson.build   |   1 +
 hw/cxl/vendor/skhynix/.gitignore|   1 +
 hw/cxl/vendor/skhynix/init_niagara.c|  99 +
 hw/cxl/vendor/skhynix/meson.build   |   1 +
 hw/cxl/vendor/skhynix/skhynix_niagara.c | 521 
 hw/mem/cxl_type3.c  |  67 +++
 include/hw/cxl/cxl_device.h |  18 +-
 tools/cxl/cxl_mhd_init.c|  63 +++
 tools/cxl/meson.build   |   3 +
 tools/meson.build   |   1 +
 13 files changed, 866 insertions(+), 5 deletions(-)
 create mode 100644 hw/cxl/vendor/meson.build
 create mode 100644 hw/cxl/vendor/skhynix/.gitignore
 create mode 100644 hw/cxl/vendor/skhynix/init_niagara.c
 create mode 100644 hw/cxl/vendor/skhynix/meson.build
 create mode 100644 hw/cxl/vendor/skhynix/skhynix_niagara.c
 create mode 100644 tools/cxl/cxl_mhd_init.c
 create mode 100644 tools/cxl/meson.build

-- 
2.39.1

[PATCH 2/4] cxl/mailbox: interface to add CCI commands to an existing CCI

2023-07-21 Thread Gregory Price

This enables wrapper devices to customize the base device's CCI
(for example, with custom commands outside the specification)
without the need to change the base device.

The also enabled the base device to dispatch those commands without
requiring additional driver support.

Signed-off-by: Gregory Price 
---
 hw/cxl/cxl-mailbox-utils.c  | 19 +++
 include/hw/cxl/cxl_device.h |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index ddee3f1718..cad0cd0adb 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -1383,6 +1383,25 @@ static void cxl_copy_cci_commands(CXLCCI *cci, const 
struct cxl_cmd (*cxl_cmds)[
 }
 }
 
+void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmd_set)[256], size_t payload_max)
+{
+cci->payload_max = payload_max > cci->payload_max ? payload_max : 
cci->payload_max;
+for (int set = 0; set < 256; set++) {
+for (int cmd = 0; cmd < 256; cmd++) {
+if (cxl_cmd_set[set][cmd].handler) {
+const struct cxl_cmd *c = &cxl_cmd_set[set][cmd];
+cci->cxl_cmd_set[set][cmd] = *c;
+struct cel_log *log =
+&cci->cel_log[cci->cel_size];
+
+log->opcode = (set << 8) | cmd;
+log->effect = c->effect;
+cci->cel_size++;
+}
+}
+}
+}
+
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState 
*d, size_t payload_max)
 {
 cxl_copy_cci_commands(cci, cxl_cmd_set_sw);
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 9a3c8b2dfa..abc8405cc5 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -297,6 +297,8 @@ void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, 
size_t payload_max);
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf,
   DeviceState *d, size_t payload_max);
 void cxl_init_cci(CXLCCI *cci, size_t payload_max);
+void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmd_set)[256],
+  size_t payload_max);
 int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd,
 size_t len_in, uint8_t *pl_in,
 size_t *len_out, uint8_t *pl_out,
-- 
2.39.1

[PATCH 1/4] cxl/mailbox: change CCI cmd set structure to be a member, not a refernce

2023-07-21 Thread Gregory Price

This allows devices to have fully customized CCIs, along with complex
devices where wrapper devices can override or add additional CCI
commands without having to replicate full command structures or
pollute a base device with every command that might ever be used.

Signed-off-by: Gregory Price 
---
 hw/cxl/cxl-mailbox-utils.c  | 18 ++
 include/hw/cxl/cxl_device.h |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 2819914e8d..ddee3f1718 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -1373,9 +1373,19 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max)
  bg_timercb, cci);
 }
 
+static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmds)[256]) {
+for (int set = 0; set < 256; set++) {
+for (int cmd = 0; cmd < 256; cmd++) {
+if (cxl_cmds[set][cmd].handler) {
+cci->cxl_cmd_set[set][cmd] = cxl_cmds[set][cmd];
+}
+}
+}
+}
+
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState 
*d, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_sw;
+cxl_copy_cci_commands(cci, cxl_cmd_set_sw);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);
@@ -1383,7 +1393,7 @@ void cxl_initialize_mailbox_swcci(CXLCCI *cci, 
DeviceState *intf, DeviceState *d
 
 void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set;
+cxl_copy_cci_commands(cci, cxl_cmd_set);
 cci->d = d;
  
 /* No separation for PCI MB as protocol handled in PCI device */
@@ -1398,7 +1408,7 @@ static const struct cxl_cmd cxl_cmd_set_t3_mctp[256][256] 
= {
 void cxl_initialize_t3_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState *intf,
size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_t3_mctp;
+cxl_copy_cci_commands(cci, cxl_cmd_set_t3_mctp);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);
@@ -1414,7 +1424,7 @@ static const struct cxl_cmd 
cxl_cmd_set_usp_mctp[256][256] = {
 
 void cxl_initialize_usp_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState 
*intf, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_usp_mctp;
+cxl_copy_cci_commands(cci, cxl_cmd_set_usp_mctp);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index c68981b618..9a3c8b2dfa 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -163,7 +163,7 @@ typedef struct CXLEventLog {
 } CXLEventLog;
 
 typedef struct CXLCCI {
-const struct cxl_cmd (*cxl_cmd_set)[256];
+struct cxl_cmd cxl_cmd_set[256][256];
 struct cel_log {
 uint16_t opcode;
 uint16_t effect;
-- 
2.39.1

[PATCH 3/4] cxl/type3: minimum MHD cci support

2023-07-21 Thread Gregory Price

Implement the MHD GET_INFO cci command and add a shared memory
region to the type3 device to host the information.

Add a helper program to initialize this shared memory region.

Add a function pointer to type3 devices for future work that
will allow an mhd device to provide a hook to validate whether
a memory access is valid or not.

For now, limit the number of LD's to the number of heads. Later,
this limitation will need to be lifted for MH-MLDs.

Intended use case:

1. Create the shared memory region
2. Format the shared memory region
3. Launch QEMU with `is_mhd=true,mhd_head=N,mhd_shmid=$shmid`

shmid=`ipcmk -M 4096 | grep -o -E '[0-9]+' | head -1`
cxl_mhd_init 4 $shmid
qemu-system-x86_64 \
  -nographic \
  -accel kvm \
  -drive file=./mhd.qcow2,format=qcow2,index=0,media=disk,id=hd \
  -m 4G,slots=4,maxmem=8G \
  -smp 4 \
  -machine type=q35,cxl=on,hmat=on \
  -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \
  -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \
  -object memory-backend-file,id=mem0,mem-path=/tmp/mem0,size=4G,share=true \
  -device 
cxl-type3,bus=rp0,volatile-memdev=mem0,id=cxl-mem0,sn=6,is_mhd=true,mhd_head=0,mhd_shmid=$shmid
 \
  -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G

Signed-off-by: Gregory Price 
---
 hw/cxl/cxl-mailbox-utils.c  | 53 +
 hw/mem/cxl_type3.c  | 67 +
 include/hw/cxl/cxl_device.h | 14 
 tools/cxl/cxl_mhd_init.c| 63 ++
 tools/cxl/meson.build   |  3 ++
 tools/meson.build   |  1 +
 6 files changed, 201 insertions(+)
 create mode 100644 tools/cxl/cxl_mhd_init.c
 create mode 100644 tools/cxl/meson.build

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index cad0cd0adb..57b8da4376 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -84,6 +84,8 @@ enum {
 #define GET_PHYSICAL_PORT_STATE 0x1
 TUNNEL = 0x53,
 #define MANAGEMENT_COMMAND 0x0
+MHD = 0x55,
+#define GET_MHD_INFO 0x0
 };
 
 /* CCI Message Format CXL r3.0 Figure 7-19 */
@@ -1155,6 +1157,56 @@ static CXLRetCode cmd_media_clear_poison(const struct 
cxl_cmd *cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+static CXLRetCode cmd_mhd_get_info(const struct cxl_cmd *cmd,
+   uint8_t *payload_in,
+   size_t len_in,
+   uint8_t *payload_out,
+   size_t *len_out,
+   CXLCCI *cci)
+{
+CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+struct {
+uint8_t start_ld;
+uint8_t ldmap_len;
+} QEMU_PACKED *input = (void *)payload_in;
+
+struct {
+uint8_t nr_lds;
+uint8_t nr_heads;
+uint16_t resv1;
+uint8_t start_ld;
+uint8_t ldmap_len;
+uint16_t resv2;
+uint8_t ldmap[];
+} QEMU_PACKED *output = (void *)payload_out;
+
+uint8_t start_ld = input->start_ld;
+uint8_t ldmap_len = input->ldmap_len;
+uint8_t i;
+
+if (!ct3d->is_mhd) {
+return CXL_MBOX_UNSUPPORTED;
+}
+
+if (start_ld >= ct3d->mhd_state->nr_lds) {
+return CXL_MBOX_INVALID_INPUT;
+}
+
+output->nr_lds = ct3d->mhd_state->nr_lds;
+output->nr_heads = ct3d->mhd_state->nr_heads;
+output->resv1 = 0;
+output->start_ld = start_ld;
+output->resv2 = 0;
+
+for (i = 0; i < ldmap_len && (start_ld + i) < output->nr_lds; i++) {
+output->ldmap[i] = ct3d->mhd_state->ldmap[start_ld + i];
+}
+output->ldmap_len = i;
+
+*len_out = sizeof(*output) + output->ldmap_len;
+return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -1195,6 +1247,7 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = {
 cmd_media_inject_poison, 8, 0 },
 [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
 cmd_media_clear_poison, 72, 0 },
+[MHD][GET_MHD_INFO] = {"GET_MULTI_HEADED_INFO", cmd_mhd_get_info, 2, 0},
 };
 
 static const struct cxl_cmd cxl_cmd_set_sw[256][256] = {
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index efb7dece80..c8eb3aa67d 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -18,6 +18,7 @@
 #include "hw/cxl/cxl.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/spdm.h"
+#include 
 
 #define DWORD_BYTE 4
 
@@ -794,6 +795,48 @@ static DOEProtocol doe_spdm_prot[] = {
 { }
 };
 
+static bool cxl_setup_mhd(CXLType3Dev *ct3d, Error **errp)
+{
+if (!ct3d->is_mhd) {
+ct3d->mhd_access_valid = NULL;
+return true;
+} else if (ct3d->is_mhd &&
+   (!ct3d->mhd_shmid || (ct3d->mhd_head == ~(0 {
+error_setg(errp, "is_mhd requires mhd_shmid and mhd_head settings");
+return false;
+} else if (!ct3d->is_mhd &&
+   (ct3d->mhd_shmid || (ct3d->mhd_head

1 2 >

1 - 100 of 132 matches

Mail list logo