Re: 8.1-rc0 testfloat fails to compile
Fri, 21 Jul 2023 08:54:21 +0200 Thomas Huth : > Which compiler version is causing trouble for you? Right now it is gcc 13, hopefully every compiler will error out with -Werror=return-type. I think it makes sense to replace Leap with Tumbleweed. We already know it compiles fine with Leap, because the submission compiled fine on the submitters laptop. Where is the repository that needs to be adjusted to make this replacement? Olaf pgpSHAJeqbTxh.pgp Description: Digitale Signatur von OpenPGP
Re: 8.1-rc0 testfloat fails to compile
On 21/07/2023 09.03, Olaf Hering wrote: Fri, 21 Jul 2023 08:54:21 +0200 Thomas Huth : Which compiler version is causing trouble for you? Right now it is gcc 13, hopefully every compiler will error out with -Werror=return-type. I don't think this will happen - otherwise we would have seen this already. It rather looks like your compiler version is missing something here - have a look at the affected function: static float64_t f64Random( void ) { switch ( random_ui8() & 7 ) { case 0: case 1: case 2: return f64RandomQOutP3(); case 3: return f64RandomQOutPInf(); case 4: case 5: case 6: return f64RandomQInfP3(); case 7: return f64RandomQInfPInf(); } } The argument in the switch statement is limited with "& 7" to the range of 0 ... 7 , so there is no way the control flow can really reach the bottom of the function. I wonder why your GCC 13 gets that wrong, all other versions seem to be fine. Or are you compiling with -O0 or something similar? I think it makes sense to replace Leap with Tumbleweed. IIRC we wanted to avoid rolling releases in the CI since this would rather force us to deal with distro bugs on a regular basis instead of focusing on bugs in QEMU. Thomas
Re: [PATCH] target/loongarch: Fix the CSRRD CPUID instruction on big endian hosts
在 2023/7/21 上午1:53, Thomas Huth 写道: The test in tests/avocado/machine_loongarch.py is currently failing on big endian hosts like s390x. By comparing the traces between running the QEMU_EFI.fd bios on a s390x and on a x86 host, it's quickly obvious that the CSRRD instruction for the CPUID is behaving differently. And indeed: The code currently does a long read (i.e. 64 bit) from the address that points to the CPUState->cpu_index field (with tcg_gen_ld_tl() in the trans_csrrd() function). But this cpu_index field is only an "int" (i.e. 32 bit). While this dirty pointer magic works on little endian hosts, it of course fails on big endian hosts. Fix it by using a proper helper function instead. Signed-off-by: Thomas Huth --- Note: There is another bug preventing tests/avocado/machine_loongarch.py from working correctly on big endian hosts (Linux fails to run the shell) but that problem is harder to debug since it happens way later in the boot process... target/loongarch/cpu.h | 1 + target/loongarch/helper.h | 1 + target/loongarch/csr_helper.c | 9 + target/loongarch/insn_trans/trans_privileged.c.inc | 8 +--- 4 files changed, 12 insertions(+), 7 deletions(-) Reviewed-by: Song Gao Thanks. Song Gao diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index ed04027af1..fa371ca8ba 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -342,6 +342,7 @@ typedef struct CPUArchState { uint64_t CSR_DBG; uint64_t CSR_DERA; uint64_t CSR_DSAVE; +uint64_t CSR_CPUID; #ifndef CONFIG_USER_ONLY LoongArchTLB tlb[LOONGARCH_TLB_MAX]; diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h index b9de77d926..ffb1e0b0bf 100644 --- a/target/loongarch/helper.h +++ b/target/loongarch/helper.h @@ -98,6 +98,7 @@ DEF_HELPER_1(rdtime_d, i64, env) #ifndef CONFIG_USER_ONLY /* CSRs helper */ DEF_HELPER_1(csrrd_pgd, i64, env) +DEF_HELPER_1(csrrd_cpuid, i64, env) DEF_HELPER_1(csrrd_tval, i64, env) DEF_HELPER_2(csrwr_estat, i64, env, tl) DEF_HELPER_2(csrwr_asid, i64, env, tl) diff --git a/target/loongarch/csr_helper.c b/target/loongarch/csr_helper.c index 6526367946..55341551a5 100644 --- a/target/loongarch/csr_helper.c +++ b/target/loongarch/csr_helper.c @@ -35,6 +35,15 @@ target_ulong helper_csrrd_pgd(CPULoongArchState *env) return v; } +target_ulong helper_csrrd_cpuid(CPULoongArchState *env) +{ +LoongArchCPU *lac = env_archcpu(env); + +env->CSR_CPUID = CPU(lac)->cpu_index; + +return env->CSR_CPUID; +} + target_ulong helper_csrrd_tval(CPULoongArchState *env) { LoongArchCPU *cpu = env_archcpu(env); diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/insn_trans/trans_privileged.c.inc index 02bca7ca23..9c9de090f0 100644 --- a/target/loongarch/insn_trans/trans_privileged.c.inc +++ b/target/loongarch/insn_trans/trans_privileged.c.inc @@ -99,13 +99,7 @@ static const CSRInfo csr_info[] = { CSR_OFF(PWCH), CSR_OFF(STLBPS), CSR_OFF(RVACFG), -[LOONGARCH_CSR_CPUID] = { -.offset = (int)offsetof(CPUState, cpu_index) - - (int)offsetof(LoongArchCPU, env), -.flags = CSRFL_READONLY, -.readfn = NULL, -.writefn = NULL -}, +CSR_OFF_FUNCS(CPUID, CSRFL_READONLY, gen_helper_csrrd_cpuid, NULL), CSR_OFF_FLAGS(PRCFG1, CSRFL_READONLY), CSR_OFF_FLAGS(PRCFG2, CSRFL_READONLY), CSR_OFF_FLAGS(PRCFG3, CSRFL_READONLY),
[PATCH] target/ppc: Fix pending HDEC when entering PM state
HDEC is defined to not wake from PM state. There is a check in the HDEC timer to avoid setting the interrupt if we are in a PM state, but no check on PM entry to lower HDEC if it already fired. This can cause a HDECR wake up and QEMU abort with unsupported exception in Power Save mode. Signed-off-by: Nicholas Piggin --- target/ppc/excp_helper.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c index 003805b202..9aa8e46566 100644 --- a/target/ppc/excp_helper.c +++ b/target/ppc/excp_helper.c @@ -2685,6 +2685,12 @@ void helper_pminsn(CPUPPCState *env, uint32_t insn) env->resume_as_sreset = (insn != PPC_PM_STOP) || (env->spr[SPR_PSSCR] & PSSCR_EC); +/* HDECR is not to wake from PM state, it may have already fired */ +if (env->resume_as_sreset) { +PowerPCCPU *cpu = env_archcpu(env); +ppc_set_irq(cpu, PPC_INTERRUPT_HDECR, 0); +} + ppc_maybe_interrupt(env); } #endif /* defined(TARGET_PPC64) */ -- 2.40.1
Re: [PATCH 0/2] target/ppc: Fixes for hash MMU for ISA v3.0
Hello Nick, On 7/21/23 07:02, Nicholas Piggin wrote: This fixes a couple of deficiencies in the v3.0 and later (POWER9, 10) HPT MMU implementation. With these fixes, KVM is unable to boot hash guests on powernv9/10 machines. Bare metal hash or pseries machine with hash works, because VRMA is only used when a real hypervisor is virtualizing a hash guest's real mode addressing. Thanks, Nick Could please add Fixes tags to your 'fix' patches ? No need to resend just reply on the mailing list with : Fixes: "subject" Also for the "target/ppc: Fix pending HDEC when entering PM state" patch. Thanks, C. Nicholas Piggin (2): target/ppc: Implement ASDR register for ISA v3.0 for HPT target/ppc: Fix VRMA page size for ISA v3.0 target/ppc/mmu-hash64.c | 68 ++--- 1 file changed, 51 insertions(+), 17 deletions(-)
Re: [PATCH v2] accel/kvm: Specify default IPA size for arm64
Hi Akihiko, On 21/7/23 08:24, Akihiko Odaki wrote: libvirt uses "none" machine type to test KVM availability. Before this change, QEMU used to pass 0 as machine type when calling KVM_CREATE_VM. The kernel documentation says: On arm64, the physical address size for a VM (IPA Size limit) is limited to 40bits by default. The limit can be configured if the host supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type identifier, where IPA_Bits is the maximum width of any physical address used by the VM. The IPA_Bits is encoded in bits[7-0] of the machine type identifier. e.g, to configure a guest to use 48bit physical address size:: vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); The requested size (IPA_Bits) must be: == = 0 Implies default size, 40bits (for backward compatibility) N Implies N bits, where N is a positive integer such that, 32 <= N <= Host_IPA_Limit == = Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and is dependent on the CPU capability and the kernel configuration. The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION ioctl() at run-time. Creation of the VM will fail if the requested IPA size (whether it is implicit or explicit) is unsupported on the host. https://docs.kernel.org/virt/kvm/api.html#kvm-create-vm So if Host_IPA_Limit < 40, such KVM_CREATE_VM will fail, and libvirt incorrectly thinks KVM is not available. This actually happened on M2 MacBook Air. Fix this by specifying 32 for IPA_Bits as any arm64 system should support the value according to the documentation. Signed-off-by: Akihiko Odaki --- V1 -> V2: Introduced an arch hook include/sysemu/kvm.h | 1 + accel/kvm/kvm-all.c| 2 +- target/arm/kvm.c | 2 ++ target/i386/kvm/kvm.c | 2 ++ target/mips/kvm.c | 2 ++ target/ppc/kvm.c | 2 ++ target/riscv/kvm.c | 2 ++ target/s390x/kvm/kvm.c | 2 ++ 8 files changed, 14 insertions(+), 1 deletion(-) My understanding of Peter's suggestion would be smth like: -- >8 -- diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 115f0cca79..c0af15eb6c 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -201,10 +201,15 @@ typedef struct KVMCapabilityInfo { struct KVMState; +struct KVMClass { +AccelClass parent_class; + +int default_vm_type; +}; + #define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm") typedef struct KVMState KVMState; -DECLARE_INSTANCE_CHECKER(KVMState, KVM_STATE, - TYPE_KVM_ACCEL) +OBJECT_DECLARE_TYPE(KVMState, KVMClass, KVM_ACCEL) extern KVMState *kvm_state; typedef struct Notifier Notifier; diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 373d876c05..fdd424e1a5 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2458,12 +2458,13 @@ static int kvm_init(MachineState *ms) KVMState *s; const KVMCapabilityInfo *missing_cap; int ret; -int type = 0; +int type; uint64_t dirty_log_manual_caps; qemu_mutex_init(&kml_slots_lock); s = KVM_STATE(ms->accelerator); +type = KVM_GET_CLASS(s)->default_vm_type; /* * On systems where the kernel can support different base page diff --git a/target/arm/kvm.c b/target/arm/kvm.c index b4c7654f49..5c13594fdf 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1064,4 +1064,8 @@ bool kvm_arch_cpu_check_are_resettable(void) void kvm_arch_accel_class_init(ObjectClass *oc) { +KVMClass *kc = KVM_CLASS(oc); + +/* Host_IPA_Limit ... */ +kc->default_vm_type = 32; } ---
Re: [PATCH] target/tricore: Rename tricore_feature
On 21/7/23 08:06, Bastian Koppelmann wrote: this name is used by capstone and will lead to a build failure of QEMU, when capstone is enabled. So we rename it to tricore_has_feature(), to match has_feature() in translate.c. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774 Signed-off-by: Bastian Koppelmann --- target/tricore/cpu.c | 8 target/tricore/cpu.h | 2 +- target/tricore/helper.c| 4 ++-- target/tricore/op_helper.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: 8.1-rc0 testfloat fails to compile
Fri, 21 Jul 2023 09:18:08 +0200 Thomas Huth : > Or are you compiling with -O0 or something similar? Does the master branch compile for you with 'export CFLAGS="-O2 -Wall -Werror=return-type"'? I prepared a patch to deal with it, and indeed it looks like the compiler might be able to spot the fact that only values between 0 and 7 can ever appear there. Apparently it fails to recognize that. Olaf pgpf8MtewCTaw.pgp Description: Digitale Signatur von OpenPGP
Re: 8.1-rc0 testfloat fails to compile
On 21/07/2023 09.35, Olaf Hering wrote: Fri, 21 Jul 2023 09:18:08 +0200 Thomas Huth : Or are you compiling with -O0 or something similar? Does the master branch compile for you with 'export CFLAGS="-O2 -Wall -Werror=return-type"'? Oh, this fails for me, too! I thought that -Wreturn-type would be part of -Wall, but apparently it is only enabled by default for C++ in my version of GCC :-( Thomas
Re: [PATCH] hw: Add compat machines for 8.2
Le 18/07/2023 à 16:22, Cornelia Huck a écrit : Add 8.2 machine types for arm/i440fx/m68k/q35/s390x/spapr. Signed-off-by: Cornelia Huck --- hw/arm/virt.c | 9 - hw/core/machine.c | 3 +++ hw/i386/pc.c | 3 +++ hw/i386/pc_piix.c | 16 +--- hw/i386/pc_q35.c | 14 -- hw/m68k/virt.c | 9 - hw/ppc/spapr.c | 15 +-- hw/s390x/s390-virtio-ccw.c | 14 +- include/hw/boards.h| 3 +++ include/hw/i386/pc.h | 3 +++ 10 files changed, 79 insertions(+), 10 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 7d9dbc26633a..2a560271b5fc 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3170,10 +3170,17 @@ static void machvirt_machine_init(void) } type_init(machvirt_machine_init); +static void virt_machine_8_2_options(MachineClass *mc) +{ +} +DEFINE_VIRT_MACHINE_AS_LATEST(8, 2) + static void virt_machine_8_1_options(MachineClass *mc) { +virt_machine_8_2_options(mc); +compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len); } -DEFINE_VIRT_MACHINE_AS_LATEST(8, 1) +DEFINE_VIRT_MACHINE(8, 1) static void virt_machine_8_0_options(MachineClass *mc) { diff --git a/hw/core/machine.c b/hw/core/machine.c index f0d35c640184..da699cf4e147 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -39,6 +39,9 @@ #include "hw/virtio/virtio.h" #include "hw/virtio/virtio-pci.h" +GlobalProperty hw_compat_8_1[] = {}; +const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1); + GlobalProperty hw_compat_8_0[] = { { "migration", "multifd-flush-after-each-section", "on"}, { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 3109d5e0e035..54838c0c411d 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -114,6 +114,9 @@ { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, +GlobalProperty pc_compat_8_1[] = {}; +const size_t pc_compat_8_1_len = G_N_ELEMENTS(pc_compat_8_1); + GlobalProperty pc_compat_8_0[] = { { "virtio-mem", "unplugged-inaccessible", "auto" }, }; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index ac72e8f5bee1..ce1ac9527493 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -504,13 +504,25 @@ static void pc_i440fx_machine_options(MachineClass *m) machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); } -static void pc_i440fx_8_1_machine_options(MachineClass *m) +static void pc_i440fx_8_2_machine_options(MachineClass *m) { pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = true; } +DEFINE_I440FX_MACHINE(v8_2, "pc-i440fx-8.2", NULL, + pc_i440fx_8_2_machine_options); + +static void pc_i440fx_8_1_machine_options(MachineClass *m) +{ +pc_i440fx_8_2_machine_options(m); +m->alias = NULL; +m->is_default = false; +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len); +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len); +} + DEFINE_I440FX_MACHINE(v8_1, "pc-i440fx-8.1", NULL, pc_i440fx_8_1_machine_options); @@ -519,8 +531,6 @@ static void pc_i440fx_8_0_machine_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_i440fx_8_1_machine_options(m); -m->alias = NULL; -m->is_default = false; compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len); compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index dc27a9e223a2..37c4814bedf2 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -379,12 +379,23 @@ static void pc_q35_machine_options(MachineClass *m) machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); } -static void pc_q35_8_1_machine_options(MachineClass *m) +static void pc_q35_8_2_machine_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; } +DEFINE_Q35_MACHINE(v8_2, "pc-q35-8.2", NULL, + pc_q35_8_2_machine_options); + +static void pc_q35_8_1_machine_options(MachineClass *m) +{ +pc_q35_8_2_machine_options(m); +m->alias = NULL; +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len); +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len); +} + DEFINE_Q35_MACHINE(v8_1, "pc-q35-8.1", NULL, pc_q35_8_1_machine_options); @@ -393,7 +404,6 @@ static void pc_q35_8_0_machine_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_8_1_machine_options(m); -m->alias = NULL; compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len); compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len); diff --git a/hw/m68k/virt.c b/hw/m68k/vi
Re: 8.1-rc0 testfloat fails to compile
Fri, 21 Jul 2023 09:53:51 +0200 Thomas Huth : > I thought that -Wreturn-type would be part of > -Wall, but apparently it is only enabled by default for C++ in my version of > GCC :-( Hopefully gcc14 will fix that, and others. Only two decades late ... Olaf pgpx3QdBimJAf.pgp Description: Digitale Signatur von OpenPGP
Re: [Virtio-fs] [PATCH v2 2/4] vhost-user: Interface for migration state transfer
On 20.07.23 17:05, Hao Xu wrote: On 7/20/23 21:20, Hanna Czenczek wrote: On 20.07.23 14:13, Hao Xu wrote: On 7/12/23 19:17, Hanna Czenczek wrote: Add the interface for transferring the back-end's state during migration as defined previously in vhost-user.rst. Signed-off-by: Hanna Czenczek --- include/hw/virtio/vhost-backend.h | 24 + include/hw/virtio/vhost.h | 79 hw/virtio/vhost-user.c | 147 ++ hw/virtio/vhost.c | 37 4 files changed, 287 insertions(+) diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index 31a251a9f5..e59d0b53f8 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -26,6 +26,18 @@ typedef enum VhostSetConfigType { VHOST_SET_CONFIG_TYPE_MIGRATION = 1, } VhostSetConfigType; +typedef enum VhostDeviceStateDirection { + /* Transfer state from back-end (device) to front-end */ + VHOST_TRANSFER_STATE_DIRECTION_SAVE = 0, + /* Transfer state from front-end to back-end (device) */ + VHOST_TRANSFER_STATE_DIRECTION_LOAD = 1, +} VhostDeviceStateDirection; + +typedef enum VhostDeviceStatePhase { + /* The device (and all its vrings) is stopped */ + VHOST_TRANSFER_STATE_PHASE_STOPPED = 0, +} VhostDeviceStatePhase; + struct vhost_inflight; struct vhost_dev; struct vhost_log; @@ -133,6 +145,15 @@ typedef int (*vhost_set_config_call_op)(struct vhost_dev *dev, typedef void (*vhost_reset_status_op)(struct vhost_dev *dev); +typedef bool (*vhost_supports_migratory_state_op)(struct vhost_dev *dev); +typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev, + VhostDeviceStateDirection direction, + VhostDeviceStatePhase phase, + int fd, + int *reply_fd, + Error **errp); +typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, Error **errp); + typedef struct VhostOps { VhostBackendType backend_type; vhost_backend_init vhost_backend_init; @@ -181,6 +202,9 @@ typedef struct VhostOps { vhost_force_iommu_op vhost_force_iommu; vhost_set_config_call_op vhost_set_config_call; vhost_reset_status_op vhost_reset_status; + vhost_supports_migratory_state_op vhost_supports_migratory_state; + vhost_set_device_state_fd_op vhost_set_device_state_fd; + vhost_check_device_state_op vhost_check_device_state; } VhostOps; int vhost_backend_update_device_iotlb(struct vhost_dev *dev, diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 69bf59d630..d8877496e5 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -346,4 +346,83 @@ int vhost_dev_set_inflight(struct vhost_dev *dev, int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); bool vhost_dev_has_iommu(struct vhost_dev *dev); + +/** + * vhost_supports_migratory_state(): Checks whether the back-end + * supports transferring internal state for the purpose of migration. + * Support for this feature is required for vhost_set_device_state_fd() + * and vhost_check_device_state(). + * + * @dev: The vhost device + * + * Returns true if the device supports these commands, and false if it + * does not. + */ +bool vhost_supports_migratory_state(struct vhost_dev *dev); + +/** + * vhost_set_device_state_fd(): Begin transfer of internal state from/to + * the back-end for the purpose of migration. Data is to be transferred + * over a pipe according to @direction and @phase. The sending end must + * only write to the pipe, and the receiving end must only read from it. + * Once the sending end is done, it closes its FD. The receiving end + * must take this as the end-of-transfer signal and close its FD, too. + * + * @fd is the back-end's end of the pipe: The write FD for SAVE, and the + * read FD for LOAD. This function transfers ownership of @fd to the + * back-end, i.e. closes it in the front-end. + * + * The back-end may optionally reply with an FD of its own, if this + * improves efficiency on its end. In this case, the returned FD is Hi Hanna, In what case/situation, the back-end will have a more efficient fd? Hi Hao, There is no example yet. Here my understanding of this "FD of its own" is as same type as the given fd(e.g. both pipe files), why the fd from back-end makes difference? Do I miss anything here? Maybe it makes more sense in the context of how we came up with the idea: Specifically, Stefan and me were asking which end should provide the FD. In the context of vhost-user, it makes sense to have it be the front-end, because it controls vhost-user communication, but that’s just the natural protocol choice, not necessarily the most efficient one. It is imaginable that the front-end (e.g. qemu) could create
[PATCH v3 2/2] target/i386: add control bits support for LAM
LAM uses CR3[61] and CR3[62] to configure/enable LAM on user pointers. LAM uses CR4[28] to configure/enable LAM on supervisor pointers. For CR3 LAM bits, no additional handling needed: - TCG LAM is not supported for TCG of target-i386. helper_write_crN() and helper_vmrun() check max physical address bits before calling cpu_x86_update_cr3(), no change needed, i.e. CR3 LAM bits are not allowed to be set in TCG. - gdbstub x86_cpu_gdb_write_register() will call cpu_x86_update_cr3() to update cr3. Allow gdb to set the LAM bit(s) to CR3, if vcpu doesn't support LAM, KVM_SET_SREGS will fail as other CR3 reserved bits. For CR4 LAM bit, its reservation depends on vcpu supporting LAM feature or not. - TCG LAM is not supported for TCG of target-i386. helper_write_crN() and helper_vmrun() check CR4 reserved bit before calling cpu_x86_update_cr4(), i.e. CR4 LAM bit is not allowed to be set in TCG. - gdbstub x86_cpu_gdb_write_register() will call cpu_x86_update_cr4() to update cr4. Allow gdb to set the LAM bit to CR4, if vcpu doesn't support LAM, KVM_SET_SREGS will fail. - x86_cpu_reset_hold() doesn't need special handling. Signed-off-by: Binbin Wu --- target/i386/cpu.h | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 4db97899fe..710fadf550 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -261,6 +261,7 @@ typedef enum X86Seg { #define CR4_SMAP_MASK (1U << 21) #define CR4_PKE_MASK (1U << 22) #define CR4_PKS_MASK (1U << 24) +#define CR4_LAM_SUP_MASK (1U << 28) #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ @@ -269,7 +270,8 @@ typedef enum X86Seg { | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \ | CR4_LA57_MASK \ | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ -| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK)) +| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ +| CR4_LAM_SUP_MASK)) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -2478,6 +2480,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) { reserved_bits |= CR4_PKS_MASK; } +if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { +reserved_bits |= CR4_LAM_SUP_MASK; +} return reserved_bits; } -- 2.25.1
[PATCH v3 0/2] target/i386: add support for LAM
Linear-address masking (LAM) [1], modifies the checking that is applied to *64-bit* linear addresses, allowing software to use of the untranslated address bits for metadata and masks the metadata bits before using them as linear addresses to access memory. When the feature is virtualized and exposed to guest, it can be used for efficient address sanitizers (ASAN) implementation and for optimizations in JITs and virtual machines. [1] Intel ISE https://cdrdv2.intel.com/v1/dl/getContent/671368 Chapter Linear Address Masking (LAM) --- Changelog v3: - Some change in commit message. - Add handling of LAM control bits. (Xiaoyao) v2: - https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg07842.html Binbin Wu (1): target/i386: add control bits support for LAM Robert Hoo (1): target/i386: add support for LAM in CPUID enumeration target/i386/cpu.c | 2 +- target/i386/cpu.h | 9 - 2 files changed, 9 insertions(+), 2 deletions(-) -- 2.25.1
[PATCH v3 1/2] target/i386: add support for LAM in CPUID enumeration
From: Robert Hoo Linear Address Masking (LAM) is a new Intel CPU feature, which allows software to use of the untranslated address bits for metadata. The bit definition: CPUID.(EAX=7,ECX=1):EAX[26] Add CPUID definition for LAM. Note LAM feature is not supported for TCG of target-386, LAM CPIUD bit will not be added to TCG_7_1_EAX_FEATURES. More info can be found in Intel ISE Chapter "LINEAR ADDRESS MASKING (LAM)" https://cdrdv2.intel.com/v1/dl/getContent/671368 Signed-off-by: Robert Hoo Co-developed-by: Binbin Wu Signed-off-by: Binbin Wu --- target/i386/cpu.c | 2 +- target/i386/cpu.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 97ad229d8b..3a42340730 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -965,7 +965,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "fsrc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "amx-fp16", NULL, "avx-ifma", -NULL, NULL, NULL, NULL, +NULL, NULL, "lam", NULL, NULL, NULL, NULL, NULL, }, .cpuid = { diff --git a/target/i386/cpu.h b/target/i386/cpu.h index e0771a1043..4db97899fe 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -925,6 +925,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_1_EAX_AMX_FP16 (1U << 21) /* Support for VPMADD52[H,L]UQ */ #define CPUID_7_1_EAX_AVX_IFMA (1U << 23) +/* Linear Address Masking */ +#define CPUID_7_1_EAX_LAM (1U << 26) /* Support for VPDPB[SU,UU,SS]D[,S] */ #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4) -- 2.25.1
[PATCH] hw/virtio: qmp: add RING_RESET to 'info virtio-status'
Signed-off-by: David Edmondson --- hw/virtio/virtio-qmp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c index 3d32dbec8d..7515b0947b 100644 --- a/hw/virtio/virtio-qmp.c +++ b/hw/virtio/virtio-qmp.c @@ -79,6 +79,8 @@ static const qmp_virtio_feature_map_t virtio_transport_map[] = { "VIRTIO_F_ORDER_PLATFORM: Memory accesses ordered by platform"), FEATURE_ENTRY(VIRTIO_F_SR_IOV, \ "VIRTIO_F_SR_IOV: Device supports single root I/O virtualization"), +FEATURE_ENTRY(VIRTIO_F_RING_RESET, \ +"VIRTIO_F_RING_RESET: Driver can reset a queue individually"), /* Virtio ring transport features */ FEATURE_ENTRY(VIRTIO_RING_F_INDIRECT_DESC, \ "VIRTIO_RING_F_INDIRECT_DESC: Indirect descriptors supported"), -- 2.34.1
Re: [PATCH] hw: Add compat machines for 8.2
On Tue, Jul 18, 2023 at 04:22:35PM +0200, Cornelia Huck wrote: > Add 8.2 machine types for arm/i440fx/m68k/q35/s390x/spapr. > > Signed-off-by: Cornelia Huck Acked-by: Michael S. Tsirkin > --- > hw/arm/virt.c | 9 - > hw/core/machine.c | 3 +++ > hw/i386/pc.c | 3 +++ > hw/i386/pc_piix.c | 16 +--- > hw/i386/pc_q35.c | 14 -- > hw/m68k/virt.c | 9 - > hw/ppc/spapr.c | 15 +-- > hw/s390x/s390-virtio-ccw.c | 14 +- > include/hw/boards.h| 3 +++ > include/hw/i386/pc.h | 3 +++ > 10 files changed, 79 insertions(+), 10 deletions(-) > > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 7d9dbc26633a..2a560271b5fc 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -3170,10 +3170,17 @@ static void machvirt_machine_init(void) > } > type_init(machvirt_machine_init); > > +static void virt_machine_8_2_options(MachineClass *mc) > +{ > +} > +DEFINE_VIRT_MACHINE_AS_LATEST(8, 2) > + > static void virt_machine_8_1_options(MachineClass *mc) > { > +virt_machine_8_2_options(mc); > +compat_props_add(mc->compat_props, hw_compat_8_1, hw_compat_8_1_len); > } > -DEFINE_VIRT_MACHINE_AS_LATEST(8, 1) > +DEFINE_VIRT_MACHINE(8, 1) > > static void virt_machine_8_0_options(MachineClass *mc) > { > diff --git a/hw/core/machine.c b/hw/core/machine.c > index f0d35c640184..da699cf4e147 100644 > --- a/hw/core/machine.c > +++ b/hw/core/machine.c > @@ -39,6 +39,9 @@ > #include "hw/virtio/virtio.h" > #include "hw/virtio/virtio-pci.h" > > +GlobalProperty hw_compat_8_1[] = {}; > +const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1); > + > GlobalProperty hw_compat_8_0[] = { > { "migration", "multifd-flush-after-each-section", "on"}, > { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" }, > diff --git a/hw/i386/pc.c b/hw/i386/pc.c > index 3109d5e0e035..54838c0c411d 100644 > --- a/hw/i386/pc.c > +++ b/hw/i386/pc.c > @@ -114,6 +114,9 @@ > { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ > { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, > > +GlobalProperty pc_compat_8_1[] = {}; > +const size_t pc_compat_8_1_len = G_N_ELEMENTS(pc_compat_8_1); > + > GlobalProperty pc_compat_8_0[] = { > { "virtio-mem", "unplugged-inaccessible", "auto" }, > }; > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c > index ac72e8f5bee1..ce1ac9527493 100644 > --- a/hw/i386/pc_piix.c > +++ b/hw/i386/pc_piix.c > @@ -504,13 +504,25 @@ static void pc_i440fx_machine_options(MachineClass *m) > machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); > } > > -static void pc_i440fx_8_1_machine_options(MachineClass *m) > +static void pc_i440fx_8_2_machine_options(MachineClass *m) > { > pc_i440fx_machine_options(m); > m->alias = "pc"; > m->is_default = true; > } > > +DEFINE_I440FX_MACHINE(v8_2, "pc-i440fx-8.2", NULL, > + pc_i440fx_8_2_machine_options); > + > +static void pc_i440fx_8_1_machine_options(MachineClass *m) > +{ > +pc_i440fx_8_2_machine_options(m); > +m->alias = NULL; > +m->is_default = false; > +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len); > +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len); > +} > + > DEFINE_I440FX_MACHINE(v8_1, "pc-i440fx-8.1", NULL, >pc_i440fx_8_1_machine_options); > > @@ -519,8 +531,6 @@ static void pc_i440fx_8_0_machine_options(MachineClass *m) > PCMachineClass *pcmc = PC_MACHINE_CLASS(m); > > pc_i440fx_8_1_machine_options(m); > -m->alias = NULL; > -m->is_default = false; > compat_props_add(m->compat_props, hw_compat_8_0, hw_compat_8_0_len); > compat_props_add(m->compat_props, pc_compat_8_0, pc_compat_8_0_len); > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c > index dc27a9e223a2..37c4814bedf2 100644 > --- a/hw/i386/pc_q35.c > +++ b/hw/i386/pc_q35.c > @@ -379,12 +379,23 @@ static void pc_q35_machine_options(MachineClass *m) > machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); > } > > -static void pc_q35_8_1_machine_options(MachineClass *m) > +static void pc_q35_8_2_machine_options(MachineClass *m) > { > pc_q35_machine_options(m); > m->alias = "q35"; > } > > +DEFINE_Q35_MACHINE(v8_2, "pc-q35-8.2", NULL, > + pc_q35_8_2_machine_options); > + > +static void pc_q35_8_1_machine_options(MachineClass *m) > +{ > +pc_q35_8_2_machine_options(m); > +m->alias = NULL; > +compat_props_add(m->compat_props, hw_compat_8_1, hw_compat_8_1_len); > +compat_props_add(m->compat_props, pc_compat_8_1, pc_compat_8_1_len); > +} > + > DEFINE_Q35_MACHINE(v8_1, "pc-q35-8.1", NULL, > pc_q35_8_1_machine_options); > > @@ -393,7 +404,6 @@ static void pc_q35_8_0_machine_options(MachineClass *m) > PCMachineClass *pcmc = PC_MACHINE_CLASS(m); > >
Re: [Virtio-fs] [PATCH v2 2/4] vhost-user: Interface for migration state transfer
On 7/21/23 16:07, Hanna Czenczek wrote: On 20.07.23 17:05, Hao Xu wrote: On 7/20/23 21:20, Hanna Czenczek wrote: On 20.07.23 14:13, Hao Xu wrote: On 7/12/23 19:17, Hanna Czenczek wrote: Add the interface for transferring the back-end's state during migration as defined previously in vhost-user.rst. Signed-off-by: Hanna Czenczek --- include/hw/virtio/vhost-backend.h | 24 + include/hw/virtio/vhost.h | 79 hw/virtio/vhost-user.c | 147 ++ hw/virtio/vhost.c | 37 4 files changed, 287 insertions(+) diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index 31a251a9f5..e59d0b53f8 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -26,6 +26,18 @@ typedef enum VhostSetConfigType { VHOST_SET_CONFIG_TYPE_MIGRATION = 1, } VhostSetConfigType; +typedef enum VhostDeviceStateDirection { + /* Transfer state from back-end (device) to front-end */ + VHOST_TRANSFER_STATE_DIRECTION_SAVE = 0, + /* Transfer state from front-end to back-end (device) */ + VHOST_TRANSFER_STATE_DIRECTION_LOAD = 1, +} VhostDeviceStateDirection; + +typedef enum VhostDeviceStatePhase { + /* The device (and all its vrings) is stopped */ + VHOST_TRANSFER_STATE_PHASE_STOPPED = 0, +} VhostDeviceStatePhase; + struct vhost_inflight; struct vhost_dev; struct vhost_log; @@ -133,6 +145,15 @@ typedef int (*vhost_set_config_call_op)(struct vhost_dev *dev, typedef void (*vhost_reset_status_op)(struct vhost_dev *dev); +typedef bool (*vhost_supports_migratory_state_op)(struct vhost_dev *dev); +typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev, + VhostDeviceStateDirection direction, + VhostDeviceStatePhase phase, + int fd, + int *reply_fd, + Error **errp); +typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, Error **errp); + typedef struct VhostOps { VhostBackendType backend_type; vhost_backend_init vhost_backend_init; @@ -181,6 +202,9 @@ typedef struct VhostOps { vhost_force_iommu_op vhost_force_iommu; vhost_set_config_call_op vhost_set_config_call; vhost_reset_status_op vhost_reset_status; + vhost_supports_migratory_state_op vhost_supports_migratory_state; + vhost_set_device_state_fd_op vhost_set_device_state_fd; + vhost_check_device_state_op vhost_check_device_state; } VhostOps; int vhost_backend_update_device_iotlb(struct vhost_dev *dev, diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 69bf59d630..d8877496e5 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -346,4 +346,83 @@ int vhost_dev_set_inflight(struct vhost_dev *dev, int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); bool vhost_dev_has_iommu(struct vhost_dev *dev); + +/** + * vhost_supports_migratory_state(): Checks whether the back-end + * supports transferring internal state for the purpose of migration. + * Support for this feature is required for vhost_set_device_state_fd() + * and vhost_check_device_state(). + * + * @dev: The vhost device + * + * Returns true if the device supports these commands, and false if it + * does not. + */ +bool vhost_supports_migratory_state(struct vhost_dev *dev); + +/** + * vhost_set_device_state_fd(): Begin transfer of internal state from/to + * the back-end for the purpose of migration. Data is to be transferred + * over a pipe according to @direction and @phase. The sending end must + * only write to the pipe, and the receiving end must only read from it. + * Once the sending end is done, it closes its FD. The receiving end + * must take this as the end-of-transfer signal and close its FD, too. + * + * @fd is the back-end's end of the pipe: The write FD for SAVE, and the + * read FD for LOAD. This function transfers ownership of @fd to the + * back-end, i.e. closes it in the front-end. + * + * The back-end may optionally reply with an FD of its own, if this + * improves efficiency on its end. In this case, the returned FD is Hi Hanna, In what case/situation, the back-end will have a more efficient fd? Hi Hao, There is no example yet. Here my understanding of this "FD of its own" is as same type as the given fd(e.g. both pipe files), why the fd from back-end makes difference? Do I miss anything here? Maybe it makes more sense in the context of how we came up with the idea: Specifically, Stefan and me were asking which end should provide the FD. In the context of vhost-user, it makes sense to have it be the front-end, because it controls vhost-user communication, but that’s just the natural protocol choice, not necessarily the most efficient one. It is imagina
Re: [PATCH-for-8.1?] hw/virtio: qmp: add RING_RESET to 'info virtio-status'
On 21/7/23 09:28, David Edmondson wrote: Signed-off-by: David Edmondson --- hw/virtio/virtio-qmp.c | 2 ++ 1 file changed, 2 insertions(+) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH-for-8.1] target/tricore: Rename tricore_feature
On 21/7/23 09:31, Philippe Mathieu-Daudé wrote: On 21/7/23 08:06, Bastian Koppelmann wrote: this name is used by capstone and will lead to a build failure of QEMU, when capstone is enabled. So we rename it to tricore_has_feature(), to match has_feature() in translate.c. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774 Signed-off-by: Bastian Koppelmann --- target/tricore/cpu.c | 8 target/tricore/cpu.h | 2 +- target/tricore/helper.c | 4 ++-- target/tricore/op_helper.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) Reviewed-by: Philippe Mathieu-Daudé BTW Bastian if you want I can squeeze this single patch in my mips-fixes PR I'm planning.
Re: [PATCH] roms/opensbi: Upgrade from v1.3 to v1.3.1
On Fri, Jul 21, 2023 at 10:04:02AM +1000, Alistair Francis wrote: > On Thu, Jul 20, 2023 at 3:00 AM Bin Meng wrote: > > > > Upgrade OpenSBI from v1.3 to v1.3.1 and the pre-built bios images > > which fixes the boot failure seen when using QEMU to do a direct > > kernel boot with Microchip Icicle Kit board machine. > > > > The v1.3.1 release includes the following commits: > > > > 0907de3 lib: sbi: fix comment indent > > eb736a5 lib: sbi_pmu: Avoid out of bounds access > > 7828eeb gpio/desginware: add Synopsys DesignWare APB GPIO support > > c6a3573 lib: utils: Fix sbi_hartid_to_scratch() usage in ACLINT drivers > > 057eb10 lib: utils/gpio: Fix RV32 compile error for designware GPIO driver > > > > Signed-off-by: Bin Meng > > Reviewed-by: Alistair Francis > > @Conor Dooley @Conor Dooley Any chance you can test this? Sure. I didn't se this patch because I am not subscribed to qemu-devel, just qemu-riscv. Tested-by: Conor Dooley Thanks, Conor. signature.asc Description: PGP signature
Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables
+Markus On 20/7/23 17:58, Peter Maydell wrote: This patchset was prompted by a couple of Coverity warnings (CID 1507157, 1517772) which note that in the m48t59 RTC device model we keep an offset in a time_t variable but then truncate it by passing it to qemu_get_timedate(), which currently uses an 'int' argument for its offset parameter. We can fix the Coverity complaint by making qemu_get_timedate() take a time_t; we should also correspondingly make the qemu_timedate_diff() function return a time_t. However this will only push the issue out to callers of qemu_timedate_diff() if they are putting the result in a 32-bit variable or doing 32-bit arithmetic on it. Luckily there aren't that many callers of qemu_timedate_diff() and most of them already use either time_t or int64_t for the calculations they do on its return value. The first three patches fix devices which weren't doing that; patch four then fixes the rtc.c functions. If I missed any callsites in devices then hopefully Coverity will point them out. Do we need to change the type of the RTC_CHANGE event? This is wrong, but to give an idea: --- a/qapi/misc.json +++ b/qapi/misc.json @@ -553,47 +553,47 @@ ## # @RTC_CHANGE: # # Emitted when the guest changes the RTC time. # # @offset: offset in seconds between base RTC clock (as specified by # -rtc base), and new RTC clock value # # @qom-path: path to the RTC object in the QOM tree # # Note: This event is rate-limited. It is not guaranteed that the RTC # in the system implements this event, or even that the system has # an RTC at all. # # Since: 0.13 # # Example: # # <- { "event": "RTC_CHANGE", # "data": { "offset": 78 }, # "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } ## { 'event': 'RTC_CHANGE', - 'data': { 'offset': 'int', 'qom-path': 'str' } } + 'data': { 'offset': 'int64', 'qom-path': 'str' } } ---
Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()
On 20/7/23 17:58, Peter Maydell wrote: In the m48t59 device we almost always use 64-bit arithmetic when dealing with time_t deltas. The one exception is in set_alarm(), which currently uses a plain 'int' to hold the difference between two time_t values. Switch to int64_t instead to avoid any possible overflow issues. Signed-off-by: Peter Maydell --- hw/rtc/m48t59.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()
On Thu, Jul 20, 2023 at 05:35:49PM +0100, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker > wrote: > > > > When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0 > > translation regime, instead of the EL2 translation regime. The TLB VAE2* > > instructions invalidate the regime that corresponds to the current value > > of HCR_EL2.E2H. > > > > At the moment we only invalidate the EL2 translation regime. This causes > > problems with RMM, which issues TLBI VAE2IS instructions with > > HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into > > account. > > > > Signed-off-by: Jean-Philippe Brucker > > --- > > target/arm/helper.c | 26 ++ > > 1 file changed, 18 insertions(+), 8 deletions(-) > > > > diff --git a/target/arm/helper.c b/target/arm/helper.c > > index e1b3db6f5f..07a9ac70f5 100644 > > --- a/target/arm/helper.c > > +++ b/target/arm/helper.c > > @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env) > > return mask; > > } > > > > +static int vae2_tlbmask(CPUARMState *env) > > +{ > > +uint64_t hcr = arm_hcr_el2_eff(env); > > +uint16_t mask; > > + > > +if (hcr & HCR_E2H) { > > +mask = ARMMMUIdxBit_E20_2 | > > + ARMMMUIdxBit_E20_2_PAN | > > + ARMMMUIdxBit_E20_0; > > +} else { > > +mask = ARMMMUIdxBit_E2; > > +} > > +return mask; > > +} > > + > > /* Return 56 if TBI is enabled, 64 otherwise. */ > > static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx, > >uint64_t addr) > > > @@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState > > *env, const ARMCPRegInfo *ri, > > uint64_t value) > > { > > CPUState *cs = env_cpu(env); > > +int mask = vae2_tlbmask(env); > > uint64_t pageaddr = sextract64(value << 12, 0, 56); > > int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr); > > Shouldn't the argument to tlbbits_for_regime() also change > if we're dealing with the EL2&0 regime rather than EL2 ? Yes, it affects the result since EL2&0 has two ranges Thanks, Jean > > > > > -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, > > - ARMMMUIdxBit_E2, bits); > > +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, > > bits); > > } > > thanks > -- PMM
assert fails in s390x TCG
Hello Cornelia, Richard, I had some strange behavior in an s390x TCG VM that I am debugging, and configured latest upstream QEMU with --enable-debug --enable-debug-tcg and I am running the qemu binary with -d unimp,guest_errors . I get: /usr/bin/qemu-system-s390x -nodefaults -no-reboot -nographic -vga none -cpu qemu -d unimp,guest_errors -object rng-random,filename=/dev/random,id=rng0 -device virtio-rng-ccw,rng=rng0 -runas qemu -net none -kernel /var/tmp/boot/kernel -initrd /var/tmp/boot/initrd -append root=/dev/disk/by-id/virtio-0 rootfstype=ext3 rootflags=data=writeback,nobarrier,commit=150,noatime elevator=noop nmi_watchdog=0 rw oops=panic panic=1 quiet elevator=noop console=hvc0 init=build -m 2048 -drive file=/var/tmp/img,format=raw,if=none,id=disk,cache=unsafe -device virtio-blk-ccw,drive=disk,serial=0 -drive file=/var/tmp/swap,format=raw,if=none,id=swap,cache=unsafe -device virtio-blk-ccw,drive=swap,serial=1 -device virtio-serial-ccw -device virtconsole,chardev=virtiocon0 -chardev stdio,id=virtiocon0 -chardev socket,id=monitor,server=on,wait=off,path=/var/tmp/img.qemu/monitor -mon chardev=monitor,mode=readline -smp 8 unimplemented opcode 0xb9ab unimplemented opcode 0xb2af ERROR:../accel/tcg/tb-maint.c:348:page_unlock__debug: assertion failed: (page_is_locked(pd)) Bail out! ERROR:../accel/tcg/tb-maint.c:348:page_unlock__debug: assertion failed: (page_is_locked(pd)) Thread 3 "qemu-system-s39" received signal SIGABRT, Aborted. [Switching to Thread 0x753516c0 (LWP 215975)] (gdb) bt #0 0x7730dabc in __pthread_kill_implementation () at /lib64/libc.so.6 #1 0x772bc266 in raise () at /lib64/libc.so.6 #2 0x772a4897 in abort () at /lib64/libc.so.6 #3 0x776f0eee in () at /lib64/libglib-2.0.so.0 #4 0x7775649a in g_assertion_message_expr () at /lib64/libglib-2.0.so.0 #5 0x55b96134 in page_unlock__debug (pd=0x7ffee8680440) at ../accel/tcg/tb-maint.c:348 #6 0x55b962a9 in page_unlock (pd=0x7ffee8680440) at ../accel/tcg/tb-maint.c:397 #7 0x55b96580 in tb_unlock_pages (tb=0x7fffefffeb00) at ../accel/tcg/tb-maint.c:483 #8 0x55b94698 in cpu_exec_longjmp_cleanup (cpu=0x56566a30) at ../accel/tcg/cpu-exec.c:556 #9 0x55b954e0 in cpu_exec_setjmp (cpu=0x56566a30, sc=0x75350540) at ../accel/tcg/cpu-exec.c:1054 #10 0x55b9557a in cpu_exec (cpu=0x56566a30) at ../accel/tcg/cpu-exec.c:1083 #11 0x55bb9af6 in tcg_cpus_exec (cpu=0x56566a30) at ../accel/tcg/tcg-accel-ops.c:75 #12 0x55bba1ae in mttcg_cpu_thread_fn (arg=0x56566a30) at ../accel/tcg/tcg-accel-ops-mttcg.c:95 #13 0x55dc0af3 in qemu_thread_start (args=0x565ba150) at ../util/qemu-thread-posix.c:541 #14 0x7730bc64 in start_thread () at /lib64/libc.so.6 #15 0x77393550 in clone3 () at /lib64/libc.so.6 (gdb) frame 5 #5 0x55b96134 in page_unlock__debug (pd=0x7ffee8680440) at ../accel/tcg/tb-maint.c:348 348 g_assert(page_is_locked(pd)); (gdb) list 348 343 static void page_unlock__debug(const PageDesc *pd) 344 { 345 bool removed; 346 347 ht_pages_locked_debug_init(); 348 g_assert(page_is_locked(pd)); 349 removed = g_hash_table_remove(ht_pages_locked_debug, pd); 350 g_assert(removed); 351 } 352 (gdb) info threads Id Target IdFrame 1Thread 0x763bef40 (LWP 215971) "qemu-system-s39" 0x77385596 in ppoll () from /lib64/libc.so.6 2Thread 0x763bb6c0 (LWP 215974) "qemu-system-s39" 0x7738b41d in syscall () from /lib64/libc.so.6 * 3Thread 0x753516c0 (LWP 215975) "qemu-system-s39" 0x7730dabc in __pthread_kill_implementation () from /lib64/libc.so.6 4Thread 0x74b506c0 (LWP 215976) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 5Thread 0x7ffeefdff6c0 (LWP 215977) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 6Thread 0x7ffeef5fe6c0 (LWP 215978) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 7Thread 0x7ffeeedfd6c0 (LWP 215979) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 8Thread 0x7ffeee5fc6c0 (LWP 215980) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 9Thread 0x7ffeeddfb6c0 (LWP 215981) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 10 Thread 0x7ffeed5fa6c0 (LWP 215982) "qemu-system-s39" 0x7730820e in __futex_abstimed_wait_common () from /lib64/libc.so.6 (gdb) thread apply all bt Thread 10 (Thread 0x7ffeed5fa6c0 (LWP 215982) "qemu-system-s39"): #0 0x7730820e in __futex_abstimed_wait_common () at /lib64/libc.so.6 #1 0x7730af50 in pthread_cond_wait@@GLIBC_2.3.2 () at /l
Re: [PATCH 3/5] target/arm: Skip granule protection checks for AT instructions
On Thu, Jul 20, 2023 at 05:39:56PM +0100, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker > wrote: > > > > GPC checks are not performed on the output address for AT instructions, > > as stated by ARM DDI 0487J in D8.12.2: > > > > When populating PAR_EL1 with the result of an address translation > > instruction, granule protection checks are not performed on the final > > output address of a successful translation. > > > > Rename get_phys_addr_with_secure(), since it's only used to handle AT > > instructions. > > > > Signed-off-by: Jean-Philippe Brucker > > --- > > This incidentally fixes a problem with AT S1E1 instructions which can > > output an IPA and should definitely not cause a GPC. > > --- > > target/arm/internals.h | 25 ++--- > > target/arm/helper.c| 8 ++-- > > target/arm/ptw.c | 11 ++- > > 3 files changed, 26 insertions(+), 18 deletions(-) > > > > diff --git a/target/arm/internals.h b/target/arm/internals.h > > index 0f01bc32a8..fc90c364f7 100644 > > --- a/target/arm/internals.h > > +++ b/target/arm/internals.h > > @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult { > > } GetPhysAddrResult; > > > > /** > > - * get_phys_addr_with_secure: get the physical address for a virtual > > address > > + * get_phys_addr: get the physical address for a virtual address > > * @env: CPUARMState > > * @address: virtual address to get physical address for > > * @access_type: 0 for read, 1 for write, 2 for execute > > * @mmu_idx: MMU index indicating required translation regime > > - * @is_secure: security state for the access > > * @result: set on translation success. > > * @fi: set to fault info if the translation fails > > * > > @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult { > > * * for PSMAv5 based systems we don't bother to return a full FSR format > > *value. > > */ > > -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address, > > - MMUAccessType access_type, > > - ARMMMUIdx mmu_idx, bool is_secure, > > - GetPhysAddrResult *result, ARMMMUFaultInfo > > *fi) > > +bool get_phys_addr(CPUARMState *env, target_ulong address, > > + MMUAccessType access_type, ARMMMUIdx mmu_idx, > > + GetPhysAddrResult *result, ARMMMUFaultInfo *fi) > > __attribute__((nonnull)); > > > What is going on in this bit of the patch ? We already > have a prototype for get_phys_addr() with a doc comment. > Is this git's diff-output producing something silly > for a change that is not logically touching get_phys_addr()'s > prototype and comment at all? I swapped the two prototypes in order to make the new comment for get_phys_addr_with_secure_nogpc() more clear. Tried to convey that get_phys_addr() is the normal function and get_phys_addr_with_secure_nogpc() is special. So git is working as expected and this is a style change, I can switch them back if you prefer. Thanks, Jean > > > /** > > - * get_phys_addr: get the physical address for a virtual address > > + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual > > + * address > > * @env: CPUARMState > > * @address: virtual address to get physical address for > > * @access_type: 0 for read, 1 for write, 2 for execute > > * @mmu_idx: MMU index indicating required translation regime > > + * @is_secure: security state for the access > > * @result: set on translation success. > > * @fi: set to fault info if the translation fails > > * > > - * Similarly, but use the security regime of @mmu_idx. > > + * Similar to get_phys_addr, but use the given security regime and don't > > perform > > + * a Granule Protection Check on the resulting address. > > */ > > -bool get_phys_addr(CPUARMState *env, target_ulong address, > > - MMUAccessType access_type, ARMMMUIdx mmu_idx, > > - GetPhysAddrResult *result, ARMMMUFaultInfo *fi) > > +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong > > address, > > + MMUAccessType access_type, > > + ARMMMUIdx mmu_idx, bool is_secure, > > + GetPhysAddrResult *result, > > + ARMMMUFaultInfo *fi) > > __attribute__((nonnull)); > > thanks > -- PMM
Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()
On 20/7/23 17:58, Peter Maydell wrote: In the m48t59 device we almost always use 64-bit arithmetic when dealing with time_t deltas. The one exception is in set_alarm(), which currently uses a plain 'int' to hold the difference between two time_t values. Switch to int64_t instead to avoid any possible overflow issues. Signed-off-by: Peter Maydell --- hw/rtc/m48t59.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Similarly: -- >8 -- --- a/hw/rtc/m48t59.c +++ b/hw/rtc/m48t59.c @@ -88,7 +88,7 @@ static M48txxInfo m48txx_sysbus_info[] = { static void alarm_cb (void *opaque) { struct tm tm; -uint64_t next_time; +int64_t next_time; M48t59State *NVRAM = opaque; ---
Re: [PATCH for-8.2 2/4] hw/rtc/twl92230: Use int64_t for sec_offset and alm_sec
On 20/7/23 17:59, Peter Maydell wrote: In the twl92230 device, use int64_t for the two state fields sec_offset and alm_sec, because we set these to values that are either time_t or differences between two time_t values. These fields aren't saved in vmstate anywhere, so we can safely widen them. Signed-off-by: Peter Maydell --- I have a suspicion that really these fields *should* be being migrated, but this device is only used in the n800 and n810 boards, so I'm not going to investigate how broken migration/vmsave is there... --- hw/rtc/twl92230.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH for-8.1] scripts/git-submodule.sh: Don't rely on non-POSIX 'read' behaviour
On 20/7/23 17:30, Peter Maydell wrote: The POSIX definition of the 'read' utility requires that you specify the variable name to set; omitting the name and having it default to 'REPLY' is a bashism. If your system sh is dash, then it will print an error message during build: qemu/pc-bios/s390-ccw/../../scripts/git-submodule.sh: 106: read: arg count Specify the variable name explicitly. Fixes: fdb8fd8cb915647b ("git-submodule: allow partial update of .git-submodule-status") Signed-off-by: Peter Maydell --- scripts/git-submodule.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables
Hi Peter, On 20/7/23 17:58, Peter Maydell wrote: This patchset was prompted by a couple of Coverity warnings (CID 1507157, 1517772) which note that in the m48t59 RTC device model we keep an offset in a time_t variable but then truncate it by passing it to qemu_get_timedate(), which currently uses an 'int' argument for its offset parameter. We can fix the Coverity complaint by making qemu_get_timedate() take a time_t; we should also correspondingly make the qemu_timedate_diff() function return a time_t. However this will only push the issue out to callers of qemu_timedate_diff() if they are putting the result in a 32-bit variable or doing 32-bit arithmetic on it. Luckily there aren't that many callers of qemu_timedate_diff() and most of them already use either time_t or int64_t for the calculations they do on its return value. The first three patches fix devices which weren't doing that; patch four then fixes the rtc.c functions. If I missed any callsites in devices then hopefully Coverity will point them out. PL031State::tick_offset is uint32_t, and pl031_get_count() also returns that type. Is that expected?
Re: [PATCH for-8.2 4/4] rtc: Use time_t for passing and returning time offsets
On 20/7/23 17:59, Peter Maydell wrote: The functions qemu_get_timedate() and qemu_timedate_diff() take and return a time offset as an integer. Coverity points out that means that when an RTC device implementation holds an offset as a time_t, as the m48t59 does, the time_t will get truncated. (CID 1507157, 1517772). The functions work with time_t internally, so make them use that type in their APIs. Note that this won't help any Y2038 issues where either the device model itself is keeping the offset in a 32-bit integer, or where the hardware under emulation has Y2038 or other rollover problems. If we missed any cases of the former then hopefully Coverity will warn us about them since after this patch we'd be truncating a time_t in assignments from qemu_timedate_diff().) Signed-off-by: Peter Maydell --- include/sysemu/rtc.h | 4 ++-- softmmu/rtc.c| 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
[BUG][CPU hot-plug]CPU hot-plugs cause the qemu process to coredump
Hello,Recently, when I was developing CPU hot-plugs under the loongarch architecture, I found that there was a problem with qemu cpu hot-plugs under x86 architecture, which caused the qemu process coredump when repeatedly inserting and unplugging the CPU when the TCG was accelerated. The specific operation process is as follows: 1.Use the following command to start the virtual machine qemu-system-x86_64 \ -machine q35 \ -cpu Broadwell-IBRS \ -smp 1,maxcpus=4,sockets=4,cores=1,threads=1 \ -m 4G \ -drive file=~/anolis-8.8.qcow2 \ -serial stdio \ -monitor telnet:localhost:4498,server,nowait 2.Enter QEMU Monitor via telnet for repeated CPU insertion and unplugging telnet 127.0.0.1 4498 (qemu) device_add Broadwell-IBRS-x86_64-cpu,socket-id=1,core-id=0,thread-id=0,id=cpu1 (qemu) device_del cpu1 (qemu) device_add Broadwell-IBRS-x86_64-cpu,socket-id=1,core-id=0,thread-id=0,id=cpu1 3.You will notice that the QEMU process has a coredump # malloc(): unsorted double linked list corrupted Aborted (core dumped)
Re: [PATCH for-8.2 1/4] hw/rtc/m48t59: Use 64-bit arithmetic in set_alarm()
On Fri, 21 Jul 2023 at 10:09, Philippe Mathieu-Daudé wrote: > > On 20/7/23 17:58, Peter Maydell wrote: > > In the m48t59 device we almost always use 64-bit arithmetic when > > dealing with time_t deltas. The one exception is in set_alarm(), > > which currently uses a plain 'int' to hold the difference between two > > time_t values. Switch to int64_t instead to avoid any possible > > overflow issues. > > > > Signed-off-by: Peter Maydell > > --- > > hw/rtc/m48t59.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > Similarly: > > -- >8 -- > --- a/hw/rtc/m48t59.c > +++ b/hw/rtc/m48t59.c > @@ -88,7 +88,7 @@ static M48txxInfo m48txx_sysbus_info[] = { > static void alarm_cb (void *opaque) > { > struct tm tm; > -uint64_t next_time; > +int64_t next_time; > M48t59State *NVRAM = opaque; Why? next_time should always be positive here, I think. -- PMM
Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables
On Fri, 21 Jul 2023 at 10:03, Philippe Mathieu-Daudé wrote: > > +Markus > > On 20/7/23 17:58, Peter Maydell wrote: > > This patchset was prompted by a couple of Coverity warnings > > (CID 1507157, 1517772) which note that in the m48t59 RTC device model > > we keep an offset in a time_t variable but then truncate it by > > passing it to qemu_get_timedate(), which currently uses an 'int' > > argument for its offset parameter. > > > > We can fix the Coverity complaint by making qemu_get_timedate() > > take a time_t; we should also correspondingly make the > > qemu_timedate_diff() function return a time_t. However this > > will only push the issue out to callers of qemu_timedate_diff() > > if they are putting the result in a 32-bit variable or doing > > 32-bit arithmetic on it. > > > > Luckily there aren't that many callers of qemu_timedate_diff() > > and most of them already use either time_t or int64_t for the > > calculations they do on its return value. The first three > > patches fix devices which weren't doing that; patch four then > > fixes the rtc.c functions. If I missed any callsites in devices > > then hopefully Coverity will point them out. > > Do we need to change the type of the RTC_CHANGE event? > > This is wrong, but to give an idea: > > --- a/qapi/misc.json > +++ b/qapi/misc.json > @@ -553,47 +553,47 @@ > ## > # @RTC_CHANGE: > # > # Emitted when the guest changes the RTC time. > # > # @offset: offset in seconds between base RTC clock (as specified by > # -rtc base), and new RTC clock value > # > # @qom-path: path to the RTC object in the QOM tree > # > # Note: This event is rate-limited. It is not guaranteed that the RTC > # in the system implements this event, or even that the system has > # an RTC at all. > # > # Since: 0.13 > # > # Example: > # > # <- { "event": "RTC_CHANGE", > # "data": { "offset": 78 }, > # "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } > ## > { 'event': 'RTC_CHANGE', > - 'data': { 'offset': 'int', 'qom-path': 'str' } } > + 'data': { 'offset': 'int64', 'qom-path': 'str' } } > --- If I understand the 'Built-in Types' section in docs/devel/qapi-code-gen.rst correctly, the QAPI 'int' type is already 64 bits. thanks -- PMM
Re: [PATCH for-8.2 0/4] rtc devices: Avoid putting time_t in 32-bit variables
On Fri, 21 Jul 2023 at 10:16, Philippe Mathieu-Daudé wrote: > > Hi Peter, > > On 20/7/23 17:58, Peter Maydell wrote: > > This patchset was prompted by a couple of Coverity warnings > > (CID 1507157, 1517772) which note that in the m48t59 RTC device model > > we keep an offset in a time_t variable but then truncate it by > > passing it to qemu_get_timedate(), which currently uses an 'int' > > argument for its offset parameter. > > > > We can fix the Coverity complaint by making qemu_get_timedate() > > take a time_t; we should also correspondingly make the > > qemu_timedate_diff() function return a time_t. However this > > will only push the issue out to callers of qemu_timedate_diff() > > if they are putting the result in a 32-bit variable or doing > > 32-bit arithmetic on it. > > > > Luckily there aren't that many callers of qemu_timedate_diff() > > and most of them already use either time_t or int64_t for the > > calculations they do on its return value. The first three > > patches fix devices which weren't doing that; patch four then > > fixes the rtc.c functions. If I missed any callsites in devices > > then hopefully Coverity will point them out. > > PL031State::tick_offset is uint32_t, and pl031_get_count() also > returns that type. Is that expected? I think those fall into the category of "the device we are modelling does not support 64-bit timestamps" -- the PL031 RTC_DR register is only 32 bits. thanks -- PMM
[PATCH 0/2] riscv: Fix the console of the Spike machine on big endian hosts
The tests/avocado/riscv_opensbi.py avocado test is currently failing on big endian hosts since the console of the Spike machine is not working there. With two small patches, this can be fixed: First patch fixes riscv64, and the second one fixes riscv32. Thomas Huth (2): hw/char/riscv_htif: Fix printing of console characters on big endian hosts hw/char/riscv_htif: Fix the console syscall on big endian hosts hw/char/riscv_htif.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) -- 2.39.3
[PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts
The character that should be printed is stored in the 64 bit "payload" variable. The code currently tries to print it by taking the address of the variable and passing this pointer to qemu_chr_fe_write(). However, this only works on little endian hosts where the least significant bits are stored on the lowest address. To do this in a portable way, we have to store the value in an uint8_t variable instead. Fixes: 5033606780 ("RISC-V HTIF Console") Signed-off-by: Thomas Huth --- hw/char/riscv_htif.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c index 37d3ccc76b..f96df40124 100644 --- a/hw/char/riscv_htif.c +++ b/hw/char/riscv_htif.c @@ -232,7 +232,8 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t val_written) s->tohost = 0; /* clear to indicate we read */ return; } else if (cmd == HTIF_CONSOLE_CMD_PUTC) { -qemu_chr_fe_write(&s->chr, (uint8_t *)&payload, 1); +uint8_t ch = (uint8_t)payload; +qemu_chr_fe_write(&s->chr, &ch, 1); resp = 0x100 | (uint8_t)payload; } else { qemu_log("HTIF device %d: unknown command\n", device); -- 2.39.3
[PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts
Values that have been read via cpu_physical_memory_read() from the guest's memory have to be swapped in case the host endianess differs from the guest. Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall") Signed-off-by: Thomas Huth --- hw/char/riscv_htif.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c index f96df40124..40de6b8b77 100644 --- a/hw/char/riscv_htif.c +++ b/hw/char/riscv_htif.c @@ -30,6 +30,7 @@ #include "qemu/timer.h" #include "qemu/error-report.h" #include "exec/address-spaces.h" +#include "exec/tswap.h" #include "sysemu/dma.h" #define RISCV_DEBUG_HTIF 0 @@ -209,11 +210,11 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t val_written) } else { uint64_t syscall[8]; cpu_physical_memory_read(payload, syscall, sizeof(syscall)); -if (syscall[0] == PK_SYS_WRITE && -syscall[1] == HTIF_DEV_CONSOLE && -syscall[3] == HTIF_CONSOLE_CMD_PUTC) { +if (tswap64(syscall[0]) == PK_SYS_WRITE && +tswap64(syscall[1]) == HTIF_DEV_CONSOLE && +tswap64(syscall[3]) == HTIF_CONSOLE_CMD_PUTC) { uint8_t ch; -cpu_physical_memory_read(syscall[2], &ch, 1); +cpu_physical_memory_read(tswap64(syscall[2]), &ch, 1); qemu_chr_fe_write(&s->chr, &ch, 1); resp = 0x100 | (uint8_t)payload; } else { -- 2.39.3
Re: [PULL 0/5] Linux user brk fixes patches
On Wed, 19 Jul 2023 at 16:53, Helge Deller wrote: > > The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7: > > Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into > staging (2023-07-17 15:49:27 +0100) > > are available in the Git repository at: > > https://github.com/hdeller/qemu-hppa.git > tags/linux-user-brk-fixes-pull-request > > for you to fetch changes up to 518f32221af759a29500ac172c4c857bef142067: > > linux-user: Fix qemu-arm to run static armhf binaries (2023-07-18 20:42:05 > +0200) > > > linux-user: brk() syscall fixes and armhf static binary fix > > Commit 86f04735ac ("linux-user: Fix brk() to release pages") introduced > the possibility for userspace applications to reduce memory footprint by > calling brk() with a lower address and as such free up memory, the same > way as the Linux kernel allows on physical machines. > > This change introduced some failures for applications with errors like > - accesing bytes above the brk heap address on the same page, > - freeing memory below the initial brk address, > and introduced a behaviour which isn't done by the kernel (e.g. zeroing > memory above brk). > > This patch series fixes those issues and has been tested with existing > programs (e.g. upx). > > Additionally one patch fixes running static armhf executables (e.g. fstype) > which was broken since qemu-8.0. > > Changes in v2: > - dropped patch to revert d28b3c90cfad ("linux-user: Make sure initial brk(0) > is page-aligned") > - rephrased some commit messages > - fixed Cc email addresses, added new ones > - added R-b tags > > Helge Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1 for any user-visible changes. -- PMM
Re: [PULL 00/14] NBD patches for 2023-07-19
On Wed, 19 Jul 2023 at 21:39, Eric Blake wrote: > > The following changes since commit 2c27fdc7a626408ee2cf30d791aa0b63027c7404: > > Update version for v8.1.0-rc0 release (2023-07-19 20:31:43 +0100) > > are available in the Git repository at: > > https://repo.or.cz/qemu/ericb.git tags/pull-nbd-2023-07-19 > > for you to fetch changes up to bfe04d0a7d5e8a4f4c9014ee7622af2056685974: > > nbd: Use enum for various negotiation modes (2023-07-19 15:26:13 -0500) > > > NBD patches through 2023-07-19 > > - Denis V. Lunev: fix hang with 'ssh ... "qemu-nbd -c"' > - Eric Blake: preliminary work towards NBD 64-bit extensions > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1 for any user-visible changes. -- PMM
Re: [PATCH-for-8.1] target/tricore: Rename tricore_feature
On Fri, Jul 21, 2023 at 10:54:43AM +0200, Philippe Mathieu-Daudé wrote: > On 21/7/23 09:31, Philippe Mathieu-Daudé wrote: > > On 21/7/23 08:06, Bastian Koppelmann wrote: > > > this name is used by capstone and will lead to a build failure of QEMU, > > > when capstone is enabled. So we rename it to tricore_has_feature(), to > > > match has_feature() in translate.c. > > > > > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1774 > > > Signed-off-by: Bastian Koppelmann > > > --- > > > target/tricore/cpu.c | 8 > > > target/tricore/cpu.h | 2 +- > > > target/tricore/helper.c | 4 ++-- > > > target/tricore/op_helper.c | 4 ++-- > > > 4 files changed, 9 insertions(+), 9 deletions(-) > > > > Reviewed-by: Philippe Mathieu-Daudé > > BTW Bastian if you want I can squeeze this single patch in > my mips-fixes PR I'm planning. Thanks, Phil. Go ahead. Cheers, Bastian
Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts
On Fri, Jul 21, 2023 at 5:48 PM Thomas Huth wrote: > > The character that should be printed is stored in the 64 bit "payload" > variable. The code currently tries to print it by taking the address > of the variable and passing this pointer to qemu_chr_fe_write(). However, > this only works on little endian hosts where the least significant bits > are stored on the lowest address. To do this in a portable way, we have > to store the value in an uint8_t variable instead. > > Fixes: 5033606780 ("RISC-V HTIF Console") > Signed-off-by: Thomas Huth > --- > hw/char/riscv_htif.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > Reviewed-by: Bin Meng
Re: [PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts
On Fri, Jul 21, 2023 at 5:48 PM Thomas Huth wrote: > > Values that have been read via cpu_physical_memory_read() from the > guest's memory have to be swapped in case the host endianess differs typo: endianness > from the guest. > > Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall") > Signed-off-by: Thomas Huth > --- > hw/char/riscv_htif.c | 9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > Reviewed-by: Bin Meng
Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts
On 7/21/23 06:47, Thomas Huth wrote: The character that should be printed is stored in the 64 bit "payload" variable. The code currently tries to print it by taking the address of the variable and passing this pointer to qemu_chr_fe_write(). However, this only works on little endian hosts where the least significant bits are stored on the lowest address. To do this in a portable way, we have to store the value in an uint8_t variable instead. Fixes: 5033606780 ("RISC-V HTIF Console") Signed-off-by: Thomas Huth --- Reviewed-by: Daniel Henrique Barboza hw/char/riscv_htif.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c index 37d3ccc76b..f96df40124 100644 --- a/hw/char/riscv_htif.c +++ b/hw/char/riscv_htif.c @@ -232,7 +232,8 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t val_written) s->tohost = 0; /* clear to indicate we read */ return; } else if (cmd == HTIF_CONSOLE_CMD_PUTC) { -qemu_chr_fe_write(&s->chr, (uint8_t *)&payload, 1); +uint8_t ch = (uint8_t)payload; +qemu_chr_fe_write(&s->chr, &ch, 1); resp = 0x100 | (uint8_t)payload; } else { qemu_log("HTIF device %d: unknown command\n", device);
Re: [PATCH 2/2] hw/char/riscv_htif: Fix the console syscall on big endian hosts
On 7/21/23 06:47, Thomas Huth wrote: Values that have been read via cpu_physical_memory_read() from the guest's memory have to be swapped in case the host endianess differs from the guest. Fixes: a6e13e31d5 ("riscv_htif: Support console output via proxy syscall") Signed-off-by: Thomas Huth --- Reviewed-by: Daniel Henrique Barboza hw/char/riscv_htif.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/char/riscv_htif.c b/hw/char/riscv_htif.c index f96df40124..40de6b8b77 100644 --- a/hw/char/riscv_htif.c +++ b/hw/char/riscv_htif.c @@ -30,6 +30,7 @@ #include "qemu/timer.h" #include "qemu/error-report.h" #include "exec/address-spaces.h" +#include "exec/tswap.h" #include "sysemu/dma.h" #define RISCV_DEBUG_HTIF 0 @@ -209,11 +210,11 @@ static void htif_handle_tohost_write(HTIFState *s, uint64_t val_written) } else { uint64_t syscall[8]; cpu_physical_memory_read(payload, syscall, sizeof(syscall)); -if (syscall[0] == PK_SYS_WRITE && -syscall[1] == HTIF_DEV_CONSOLE && -syscall[3] == HTIF_CONSOLE_CMD_PUTC) { +if (tswap64(syscall[0]) == PK_SYS_WRITE && +tswap64(syscall[1]) == HTIF_DEV_CONSOLE && +tswap64(syscall[3]) == HTIF_CONSOLE_CMD_PUTC) { uint8_t ch; -cpu_physical_memory_read(syscall[2], &ch, 1); +cpu_physical_memory_read(tswap64(syscall[2]), &ch, 1); qemu_chr_fe_write(&s->chr, &ch, 1); resp = 0x100 | (uint8_t)payload; } else {
Re: [PATCH 1/2] Reduce vdpa initialization / startup overhead
On Tue, Jul 18, 2023 at 12:55 PM Michael S. Tsirkin wrote: > > On Thu, Apr 20, 2023 at 10:59:56AM +0200, Eugenio Perez Martin wrote: > > On Thu, Apr 20, 2023 at 7:25 AM Pei Li wrote: > > > > > > Hi all, > > > > > > My bad, I just submitted the kernel patch. If we are passing some generic > > > command, still we have to add an additional field in the structure to > > > indicate what is the unbatched version of this command, and the struct > > > vhost_ioctls would be some command specific structure. In summary, the > > > structure would be something like > > > struct vhost_cmd_batch { > > > int ncmds; > > > int cmd; > > > > The unbatched version should go in each vhost_ioctls. That allows us > > to send many different commands in one ioctl instead of having to > > resort to many ioctls, each one for a different task. > > > > The problem with that is the size of that struct vhost_ioctl, so we > > can build an array. I think it should be enough with the biggest of > > them (vhost_vring_addr ?) for a long time, but I would like to know if > > anybody finds a drawback here. We could always resort to pointers if > > we find we need more space, or start with them from the beginning. > > > > CCing Si-Wei here too, as he is also interested in reducing the startup > > time. > > > > Thanks! > > And copying my response too: > This is all very exciting, but what exactly is the benefit? > No optimization patches are going to be merged without > numbers showing performance gains. > In this case, can you show gains in process startup time? > Are they significant enough to warrant adding new UAPI? > This should have been marked as RFC in that regard. When this was sent it was one of the planned actions to reduce overhead. After Si-Wei benchmarks, all the efforts will focus on reducing the pinning / maps for the moment. It is unlikely that this will be moved forward soon. Thanks! > > > > > struct vhost_ioctls[]; > > > }; > > > > > > This is doable. Also, this is my first time submitting patches to open > > > source, sorry about the mess in advance. That being said, feel free to > > > throw questions / comments! > > > > > > Thanks and best regards, > > > Pei > > > > > > On Wed, Apr 19, 2023 at 9:19 PM Jason Wang wrote: > > >> > > >> On Wed, Apr 19, 2023 at 11:33 PM Eugenio Perez Martin > > >> wrote: > > >> > > > >> > On Wed, Apr 19, 2023 at 12:56 AM wrote: > > >> > > > > >> > > From: Pei Li > > >> > > > > >> > > Currently, part of the vdpa initialization / startup process > > >> > > needs to trigger many ioctls per vq, which is very inefficient > > >> > > and causing unnecessary context switch between user mode and > > >> > > kernel mode. > > >> > > > > >> > > This patch creates an additional ioctl() command, namely > > >> > > VHOST_VDPA_GET_VRING_GROUP_BATCH, that will batching > > >> > > commands of VHOST_VDPA_GET_VRING_GROUP into a single > > >> > > ioctl() call. > > >> > > >> I'd expect there's a kernel patch but I didn't see that? > > >> > > >> If we want to go this way. Why simply have a more generic way, that is > > >> introducing something like: > > >> > > >> VHOST_CMD_BATCH which did something like > > >> > > >> struct vhost_cmd_batch { > > >> int ncmds; > > >> struct vhost_ioctls[]; > > >> }; > > >> > > >> Then you can batch other ioctls other than GET_VRING_GROUP? > > >> > > >> Thanks > > >> > > >> > > > > >> > > > >> > It seems to me you forgot to send the 0/2 cover letter :). > > >> > > > >> > Please include the time we save thanks to avoiding the repetitive > > >> > ioctls in each patch. > > >> > > > >> > CCing Jason and Michael. > > >> > > > >> > > Signed-off-by: Pei Li > > >> > > --- > > >> > > hw/virtio/vhost-vdpa.c | 31 > > >> > > +++- > > >> > > include/standard-headers/linux/vhost_types.h | 3 ++ > > >> > > linux-headers/linux/vhost.h | 7 + > > >> > > 3 files changed, 33 insertions(+), 8 deletions(-) > > >> > > > > >> > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > > >> > > index bc6bad23d5..6d45ff8539 100644 > > >> > > --- a/hw/virtio/vhost-vdpa.c > > >> > > +++ b/hw/virtio/vhost-vdpa.c > > >> > > @@ -679,7 +679,8 @@ static int vhost_vdpa_set_backend_cap(struct > > >> > > vhost_dev *dev) > > >> > > uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | > > >> > > 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | > > >> > > 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | > > >> > > -0x1ULL << VHOST_BACKEND_F_SUSPEND; > > >> > > +0x1ULL << VHOST_BACKEND_F_SUSPEND | > > >> > > +0x1ULL << VHOST_BACKEND_F_IOCTL_BATCH; > > >> > > int r; > > >> > > > > >> > > if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, > > >> > > &features)) { > > >> > > @@ -731,14 +732,28 @@ static int vhost_vdpa_get_vq_index(struct > > >> > > vhost_dev *dev, int idx) > > >> > > > > >> > > static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev) > > >> > > { > > >> > >
[PATCH] numa: Skip invalidation of cluster and NUMA node boundary for qtest
There are warning messages printed from tests/qtest/numa-test.c, to complain the CPU cluster and NUMA node boundary is broken. Since the broken boundary is expected, we don't want to see the warning messages. # cd /home/gavin/sandbox/qemu.main/build # MALLOC_PERTURB_=255 QTEST_QEMU_BINARY=./qemu-system-aarch64 \ G_TEST_DBUS_DAEMON=../tests/dbus-vmstate-daemon.sh\ QTEST_QEMU_IMG=./qemu-img \ QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \ tests/qtest/numa-test --tap -k : qemu-system-aarch64: warning: CPU-0 and CPU-4 in socket-0-cluster-0 \ have been associated with node-0 and node-1 respectively. \ It can cause OSes like Linux to misbehave : Skip the invalidation of CPU cluster and NUMA node boundary when qtest is enabled, to avoid the warning messages. Fixes: a494fdb715 ("numa: Validate cluster and NUMA node boundary if required") Signed-off-by: Gavin Shan --- hw/core/machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index f0d35c6401..1c5136e735 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -1271,7 +1271,7 @@ static void validate_cpu_cluster_to_numa_boundary(MachineState *ms) const CPUArchId *cpus = possible_cpus->cpus; int i, j; -if (state->num_nodes <= 1 || possible_cpus->len <= 1) { +if (qtest_enabled() || state->num_nodes <= 1 || possible_cpus->len <= 1) { return; } -- 2.41.0
Re: [PULL 03/18] numa: Validate cluster and NUMA node boundary if required
On 7/20/23 23:10, Peter Maydell wrote: On Mon, 26 Jun 2023 at 12:15, Paolo Bonzini wrote: From: Gavin Shan For some architectures like ARM64, multiple CPUs in one cluster can be associated with different NUMA nodes, which is irregular configuration because we shouldn't have this in baremetal environment. The irregular configuration causes Linux guest to misbehave, as the following warning messages indicate. -smp 6,maxcpus=6,sockets=2,clusters=1,cores=3,threads=1 \ -numa node,nodeid=0,cpus=0-1,memdev=ram0\ -numa node,nodeid=1,cpus=2-3,memdev=ram1\ -numa node,nodeid=2,cpus=4-5,memdev=ram2\ Hi. This new warning shows up a lot in "make check" output: $ grep -c 'can cause OSes' /tmp/parn3ofA.par 44 Looks like this is all in the qtest-aarch64/numa-test test. Please can you investigate and either: (1) fix the test not to do the bad thing that's causing the warning (2) change the warning so it doesn't show up in stderr when running a correct and passing test ? Yes, all the warning messages come from tests/qtest/numa-test.c. There are 3 configurations where the boundary of CPU cluster and NUMA node is broken as expected. I've sent a patch to disable the validation for qtest. https://lists.nongnu.org/archive/html/qemu-arm/2023-07/msg00440.html With the patch applied, I didn't see similar warning messages from "make -j 40 check-qtest". Thanks, Gavin
[PATCH] vhost-user-scsi: support reconnect to backend
If the backend crashes and restarts, the device is broken. This patch adds reconnect for vhost-user-scsi. Tested with spdk backend. Signed-off-by: Li Feng --- hw/block/vhost-user-blk.c | 2 - hw/scsi/vhost-scsi-common.c | 27 ++--- hw/scsi/vhost-user-scsi.c | 163 +--- include/hw/virtio/vhost-user-scsi.h | 3 + include/hw/virtio/vhost.h | 2 + 5 files changed, 165 insertions(+), 32 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index eecf3f7a81..f250c740b5 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -32,8 +32,6 @@ #include "sysemu/sysemu.h" #include "sysemu/runstate.h" -#define REALIZE_CONNECTION_RETRIES 3 - static const int user_feature_bits[] = { VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_SEG_MAX, diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c index a06f01af26..08801886b8 100644 --- a/hw/scsi/vhost-scsi-common.c +++ b/hw/scsi/vhost-scsi-common.c @@ -52,16 +52,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc) vsc->dev.acked_features = vdev->guest_features; -assert(vsc->inflight == NULL); -vsc->inflight = g_new0(struct vhost_inflight, 1); -ret = vhost_dev_get_inflight(&vsc->dev, - vs->conf.virtqueue_size, - vsc->inflight); +ret = vhost_dev_prepare_inflight(&vsc->dev, vdev); if (ret < 0) { -error_report("Error get inflight: %d", -ret); +error_report("Error setting inflight format: %d", -ret); goto err_guest_notifiers; } +if (!vsc->inflight->addr) { +ret = vhost_dev_get_inflight(&vsc->dev, +vs->conf.virtqueue_size, +vsc->inflight); +if (ret < 0) { +error_report("Error get inflight: %d", -ret); +goto err_guest_notifiers; +} +} + ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight); if (ret < 0) { error_report("Error set inflight: %d", -ret); @@ -85,9 +91,6 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc) return ret; err_guest_notifiers: -g_free(vsc->inflight); -vsc->inflight = NULL; - k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false); err_host_notifiers: vhost_dev_disable_notifiers(&vsc->dev, vdev); @@ -111,12 +114,6 @@ void vhost_scsi_common_stop(VHostSCSICommon *vsc) } assert(ret >= 0); -if (vsc->inflight) { -vhost_dev_free_inflight(vsc->inflight); -g_free(vsc->inflight); -vsc->inflight = NULL; -} - vhost_dev_disable_notifiers(&vsc->dev, vdev); } diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index ee99b19e7a..e0e88b0c42 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -89,14 +89,126 @@ static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) { } +static int vhost_user_scsi_connect(DeviceState *dev, Error **errp) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserSCSI *s = VHOST_USER_SCSI(vdev); +VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); +VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev); +int ret = 0; + +if (s->connected) { +return 0; +} +s->connected = true; + +vsc->dev.num_queues = vs->conf.num_queues; +vsc->dev.nvqs = VIRTIO_SCSI_VQ_NUM_FIXED + vs->conf.num_queues; +vsc->dev.vqs = s->vhost_vqs; +vsc->dev.vq_index = 0; +vsc->dev.backend_features = 0; + +ret = vhost_dev_init(&vsc->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0, + errp); +if (ret < 0) { +return ret; +} + +/* restore vhost state */ +if (virtio_device_started(vdev, vdev->status)) { +ret = vhost_scsi_common_start(vsc); +if (ret < 0) { +return ret; +} +} + +return 0; +} + +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event); + +static void vhost_user_scsi_disconnect(DeviceState *dev) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserSCSI *s = VHOST_USER_SCSI(vdev); +VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); +VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev); + +if (!s->connected) { +return; +} +s->connected = false; + +vhost_scsi_common_stop(vsc); + +vhost_dev_cleanup(&vsc->dev); + +/* Re-instate the event handler for new connections */ +qemu_chr_fe_set_handlers(&vs->conf.chardev, NULL, NULL, + vhost_user_scsi_event, NULL, dev, NULL, true); +} + +static void vhost_user_scsi_event(void *opaque, QEMUChrEvent event) +{ +DeviceState *dev = opaque; +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserSCSI *s = VHOST_USER_SCSI(vdev); +VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); +VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev); +Error *local_err = NULL; + +switch (event)
Re: [PATCH v6 0/3] hw/ufs: Add Universal Flash Storage (UFS) support
Is there a test case along the lines of tests/qtest/ahci-test.c that exercises this new code? It's important to have at least a basic test that initializes the device and performs a write followed by a read to verify that I/O is working. This will allow the CI system and anyone who modifies the UFS code to check that the basics are still working. Thanks, Stefan
RE: [PATCH v1] migration: refactor migration_completion
On Friday, July 21, 2023 4:38 AM, Peter Xu wrote: > Looks good to me, after addressing Isaku's comments. > > The current_active_state is very unfortunate, along with most of the calls to > migrate_set_state() - I bet most of the code will definitely go wrong if that > cmpxchg didn't succeed inside of migrate_set_state(), IOW in most cases we > simply always want: Can you share examples where it could be wrong? (If it has bugs, we need to fix) > > migrate_set_state(&s->state, s->state, XXX); > > Not sure whether one pre-requisite patch is good to have so we can rename > migrate_set_state() to something like __migrate_set_state(), then: > > migrate_set_state(s, XXX) { > __migrate_set_state(&s->state, s->state, XXX); > } > > I don't even know whether there's any call site that will need > __migrate_set_state() for real.. > Seems this would break the use of "MIGRATION_STATUS_CANCELLING". For example, - In migration_maybe_pause: migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER, new_state); If the current s->state isn't MIGRATION_STATUS_PRE_SWITCHOVER (could be MIGRATION_STATUS_CANCELLING), then s->state won’t be updated to new_state. - Then, in migration_completion, the following update to s->state won't succeed: migrate_set_state(&s->state, current_active_state, MIGRATION_STATUS_COMPLETED); - Finally, when reaching migration_iteration_finish(), s->state is MIGRATION_STATUS_CANCELLING, instead of MIGRATION_STATUS_COMPLETED.
Re: [PATCH v21 01/20] s390x/cpu topology: add s390 specifics to CPU topology
On 7/18/23 18:31, Nina Schoetterl-Glausch wrote: Reviewed-by: Nina Schoetterl-Glausch Some notes below. The s390x/ prefix in the title might suggest that this patch is s390 specific, but it touches common files. Right. What do you suggest? I can cut it in two or squash it with patch number 2. The first idea was to separate the patch to ease the review but the functionality introduced in patch 1 do only make sense with patch 2. So I would be for squashing the first two patches. ? On Fri, 2023-06-30 at 11:17 +0200, Pierre Morel wrote: S390 adds two new SMP levels, drawers and books to the CPU topology. The S390 CPU have specific topology features like dedication S390 CPUs have specific topology features like dedication and entitlement. These indicate to the guest information on host vCPU scheduling and help the guest make better scheduling decisions. and entitlement to give to the guest indications on the host vCPUs scheduling and help the guest take the best decisions on the scheduling of threads on the vCPUs. Let us provide the SMP properties with books and drawers levels and S390 CPU with dedication and entitlement, Signed-off-by: Pierre Morel --- qapi/machine-common.json | 22 + qapi/machine.json | 21 ++--- include/hw/boards.h | 10 +- include/hw/qdev-properties-system.h | 4 +++ target/s390x/cpu.h | 6 hw/core/machine-smp.c | 48 --- -- hw/core/machine.c | 4 +++ hw/core/qdev-properties-system.c | 13 hw/s390x/s390-virtio-ccw.c | 2 ++ softmmu/vl.c | 6 target/s390x/cpu.c | 7 + qapi/meson.build | 1 + qemu-options.hx | 7 +++-- 13 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 qapi/machine-common.json diff --git a/qapi/machine-common.json b/qapi/machine-common.json new file mode 100644 index 00..bc0d76829c --- /dev/null +++ b/qapi/machine-common.json @@ -0,0 +1,22 @@ +# -*- Mode: Python -*- +# vim: filetype=python +# +# This work is licensed under the terms of the GNU GPL, version 2 or later. +# See the COPYING file in the top-level directory. + +## +# = Machines S390 data types Common definitions for machine.json and machine-target.json [...]
Re: [PATCH] numa: Skip invalidation of cluster and NUMA node boundary for qtest
On Fri, 21 Jul 2023 at 11:44, Gavin Shan wrote: > > There are warning messages printed from tests/qtest/numa-test.c, > to complain the CPU cluster and NUMA node boundary is broken. Since > the broken boundary is expected, we don't want to see the warning > messages. > > # cd /home/gavin/sandbox/qemu.main/build > # MALLOC_PERTURB_=255 QTEST_QEMU_BINARY=./qemu-system-aarch64 \ > G_TEST_DBUS_DAEMON=../tests/dbus-vmstate-daemon.sh\ > QTEST_QEMU_IMG=./qemu-img \ > QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \ > tests/qtest/numa-test --tap -k > : > qemu-system-aarch64: warning: CPU-0 and CPU-4 in socket-0-cluster-0 \ > have been associated with node-0 and node-1 respectively. \ > It can cause OSes like Linux to misbehave > : > > Skip the invalidation of CPU cluster and NUMA node boundary when > qtest is enabled, to avoid the warning messages. > > Fixes: a494fdb715 ("numa: Validate cluster and NUMA node boundary if > required") > Signed-off-by: Gavin Shan Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH for-8.2 4/6] vfio/migration: Refactor PRE_COPY and RUNNING state checks
On 7/16/23 10:15, Avihai Horon wrote: From: Joao Martins Move the PRE_COPY and RUNNING state checks to helper functions. This is in preparation for adding P2P VFIO migration support, where these helpers will also test for PRE_COPY_P2P and RUNNING_P2P states. Signed-off-by: Joao Martins Signed-off-by: Avihai Horon Reviewed-by: Cédric Le Goater Thanks, C. --- include/hw/vfio/vfio-common.h | 2 ++ hw/vfio/common.c | 22 ++ hw/vfio/migration.c | 10 -- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index da43d27352..e9b8954595 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -230,6 +230,8 @@ void vfio_unblock_multiple_devices_migration(void); bool vfio_viommu_preset(VFIODevice *vbasedev); int64_t vfio_mig_bytes_transferred(void); void vfio_reset_bytes_transferred(void); +bool vfio_device_state_is_running(VFIODevice *vbasedev); +bool vfio_device_state_is_precopy(VFIODevice *vbasedev); #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 9aac21abb7..16cf79a76c 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -437,6 +437,20 @@ static void vfio_set_migration_error(int err) } } +bool vfio_device_state_is_running(VFIODevice *vbasedev) +{ +VFIOMigration *migration = vbasedev->migration; + +return migration->device_state == VFIO_DEVICE_STATE_RUNNING; +} + +bool vfio_device_state_is_precopy(VFIODevice *vbasedev) +{ +VFIOMigration *migration = vbasedev->migration; + +return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY; +} + static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) { VFIOGroup *group; @@ -457,8 +471,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) } if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && -(migration->device_state == VFIO_DEVICE_STATE_RUNNING || - migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) { +(vfio_device_state_is_running(vbasedev) || + vfio_device_state_is_precopy(vbasedev))) { return false; } } @@ -503,8 +517,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return false; } -if (migration->device_state == VFIO_DEVICE_STATE_RUNNING || -migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { +if (vfio_device_state_is_running(vbasedev) || +vfio_device_state_is_precopy(vbasedev)) { continue; } else { return false; diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 8acd182a8b..48f9c23cbe 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -411,7 +411,7 @@ static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, VFIODevice *vbasedev = opaque; VFIOMigration *migration = vbasedev->migration; -if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) { +if (!vfio_device_state_is_precopy(vbasedev)) { return; } @@ -444,7 +444,7 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, vfio_query_stop_copy_size(vbasedev, &stop_copy_size); *must_precopy += stop_copy_size; -if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { +if (vfio_device_state_is_precopy(vbasedev)) { vfio_query_precopy_size(migration); *must_precopy += @@ -459,9 +459,8 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, static bool vfio_is_active_iterate(void *opaque) { VFIODevice *vbasedev = opaque; -VFIOMigration *migration = vbasedev->migration; -return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY; +return vfio_device_state_is_precopy(vbasedev); } static int vfio_save_iterate(QEMUFile *f, void *opaque) @@ -656,7 +655,6 @@ static const SaveVMHandlers savevm_vfio_handlers = { static void vfio_vmstate_change(void *opaque, bool running, RunState state) { VFIODevice *vbasedev = opaque; -VFIOMigration *migration = vbasedev->migration; enum vfio_device_mig_state new_state; int ret; @@ -664,7 +662,7 @@ static void vfio_vmstate_change(void *opaque, bool running, RunState state) new_state = VFIO_DEVICE_STATE_RUNNING; } else { new_state = -(migration->device_state == VFIO_DEVICE_STATE_PRE_COPY && +(vfio_device_state_is_precopy(vbasedev) && (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ? VFIO_DEVICE_STATE_STOP_COPY : VFIO_DEVICE_STATE_STOP;
Re: [PATCH for-8.2 5/6] vfio/migration: Add P2P support for VFIO migration
On 7/16/23 10:15, Avihai Horon wrote: VFIO migration uAPI defines an optional intermediate P2P quiescent state. While in the P2P quiescent state, P2P DMA transactions cannot be initiated by the device, but the device can respond to incoming ones. Additionally, all outstanding P2P transactions are guaranteed to have been completed by the time the device enters this state. The purpose of this state is to support migration of multiple devices that are doing P2P transactions between themselves. Add support for P2P migration by transitioning all the devices to the P2P quiescent state before stopping or starting the devices. Use the new VMChangeStateHandler pre_change_cb to achieve that behavior. This will allow migration of multiple VFIO devices if all of them support P2P migration. LGTM, one small comment below Signed-off-by: Avihai Horon --- docs/devel/vfio-migration.rst | 93 +-- hw/vfio/common.c | 6 ++- hw/vfio/migration.c | 58 +- hw/vfio/trace-events | 2 +- 4 files changed, 107 insertions(+), 52 deletions(-) diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst index b433cb5bb2..b9c57ba651 100644 --- a/docs/devel/vfio-migration.rst +++ b/docs/devel/vfio-migration.rst @@ -23,9 +23,21 @@ and recommends that the initial bytes are sent and loaded in the destination before stopping the source VM. Enabling this migration capability will guarantee that and thus, can potentially reduce downtime even further. -Note that currently VFIO migration is supported only for a single device. This -is due to VFIO migration's lack of P2P support. However, P2P support is planned -to be added later on. +To support migration of multiple devices that are doing P2P transactions +between themselves, VFIO migration uAPI defines an intermediate P2P quiescent +state. While in the P2P quiescent state, P2P DMA transactions cannot be +initiated by the device, but the device can respond to incoming ones. +Additionally, all outstanding P2P transactions are guaranteed to have been +completed by the time the device enters this state. + +All the devices that support P2P migration are first transitioned to the P2P +quiescent state and only then are they stopped or started. This makes migration +safe P2P-wise, since starting and stopping the devices is not done atomically +for all the devices together. + +Thus, multiple VFIO devices migration is allowed only if all the devices +support P2P migration. Single VFIO device migration is allowed regardless of +P2P migration support. A detailed description of the UAPI for VFIO device migration can be found in the comment for the ``vfio_device_mig_state`` structure in the header file @@ -132,54 +144,63 @@ will be blocked. Flow of state changes during Live migration === -Below is the flow of state change during live migration. +Below is the state change flow during live migration for a VFIO device that +supports both precopy and P2P migration. The flow for devices that don't +support it is similar, except that the relevant states for precopy and P2P are +skipped. The values in the parentheses represent the VM state, the migration state, and the VFIO device state, respectively. -The text in the square brackets represents the flow if the VFIO device supports -pre-copy. Live migration save path :: -QEMU normal running state -(RUNNING, _NONE, _RUNNING) - | + QEMU normal running state + (RUNNING, _NONE, _RUNNING) + | migrate_init spawns migration_thread -Migration thread then calls each device's .save_setup() - (RUNNING, _SETUP, _RUNNING [_PRE_COPY]) - | - (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY]) - If device is active, get pending_bytes by .state_pending_{estimate,exact}() - If total pending_bytes >= threshold_size, call .save_live_iterate() - [Data of VFIO device for pre-copy phase is copied] -Iterate till total pending bytes converge and are less than threshold - | - On migration completion, vCPU stops and calls .save_live_complete_precopy for - each active device. The VFIO device is then transitioned into _STOP_COPY state - (FINISH_MIGRATE, _DEVICE, _STOP_COPY) - | - For the VFIO device, iterate in .save_live_complete_precopy until - pending data is 0 - (FINISH_MIGRATE, _DEVICE, _STOP) - | - (FINISH_MIGRATE, _COMPLETED, _STOP) - Migraton thread schedules cleanup bottom hal
[PATCH] scripts/qemu-binfmt-conf.sh: refresh
Currently qemu-binfmt-conf.sh does a number of strange things. 1. --systemd requires an argument - the CPU type to register, while --debian (which is actually --binfmt-support) does not accept such an argument, so it is not possible to specify which CPU(s) to register for debian. 2. Why this "ALL" at all? 3. it just ignores extra command-line arguments. It would be logical to specify which CPUs to register (multiple!) as the additional arguments. 4. Even if a CPU is explicitly requested, it does not register anything if this CPU is of the same family as host one. But this is wrong, since quite often it *is* desirable to do this registration, - like, when running in i386 when the system is not capable of running x86-64 binaries, and countless other examples 5. It ignores errors 6. It ignores wrong command line arguments Fix this, and simplify things a bit. 1. Stop accepting an argument for --systemd. With getopt_long, this argument, if given, will be returned as a non-optional parameter so compatibility with current version is preserved. 2. Accept optional arguments and generate registration for the given CPUs only. In case no extra arguments are given, register for all supportd CPUs except of the same family as host. 3. Recognze "ALL" "CPU" to keep compatibility with current version (but do not document it). 4. Warn but perform registration anyway if a cpu of the same family has been requested. 5. In help text, use --debian and --systemd as alternatives to each other, to make it clear the two can not be used at the same time. 6. Tiny optimization of eval expression. 7. Fold the list of supported CPUs to fit in 80 columns. 8. Exit with non-zero code in case registration fails or the command line is wrong. 9. Remove explicit checking for writability of various things. Signed-off-by: Michael Tokarev --- scripts/qemu-binfmt-conf.sh | 89 + 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/scripts/qemu-binfmt-conf.sh b/scripts/qemu-binfmt-conf.sh index 6ef9f118d9..27d506000d 100755 --- a/scripts/qemu-binfmt-conf.sh +++ b/scripts/qemu-binfmt-conf.sh @@ -182,10 +182,10 @@ qemu_get_family() { usage() { cat <&2 -exit 1 -fi +echo $qemu_target_list | fold -w68 -s | sed 's/^//' } qemu_check_bintfmt_misc() { @@ -246,8 +242,6 @@ qemu_check_bintfmt_misc() { exit 1 fi fi - -qemu_check_access /proc/sys/fs/binfmt_misc/register } installed_dpkg() { @@ -260,14 +254,12 @@ qemu_check_debian() { elif ! installed_dpkg binfmt-support ; then echo "WARNING: package binfmt-support is needed" 1>&2 fi -qemu_check_access "$EXPORTDIR" } qemu_check_systemd() { if ! systemctl -q is-enabled systemd-binfmt.service ; then echo "WARNING: systemd-binfmt.service is missing or disabled" 1>&2 fi -qemu_check_access "$EXPORTDIR" } qemu_generate_register() { @@ -287,16 +279,16 @@ qemu_generate_register() { qemu_register_interpreter() { echo "Setting $qemu as binfmt interpreter for $cpu" -qemu_generate_register > /proc/sys/fs/binfmt_misc/register +qemu_generate_register > /proc/sys/fs/binfmt_misc/register || exit 1 } qemu_generate_systemd() { echo "Setting $qemu as binfmt interpreter for $cpu for systemd-binfmt.service" -qemu_generate_register > "$EXPORTDIR/qemu-$cpu.conf" +qemu_generate_register > "$EXPORTDIR/qemu-$cpu.conf" || exit 1 } qemu_generate_debian() { -cat > "$EXPORTDIR/qemu-$cpu" < "$EXPORTDIR/qemu-$cpu" <&2 -continue +echo "ERROR: unknown cpu $cpu" >&2 +exit 1 fi qemu="$QEMU_PATH/qemu-$cpu" @@ -329,9 +327,12 @@ qemu_set_binfmts() { fi qemu="$qemu$QEMU_SUFFIX" -if [ "$host_family" != "$family" ] ; then -$BINFMT_SET +if [ "$host_family" = "$family" ] ; then +[ -n "$explicit" ] || continue +echo "WARNING: requested CPU $cpu is of the same family as host CPU" >&2 +echo "WARNING: this might break the system" >&2 fi +$BINFMT_SET done } @@ -347,9 +348,9 @@ PERSISTENT=no PRESERVE_ARG0=no QEMU_SUFFIX="" -_longopts="debian,systemd:,qemu-path:,qemu-suffix:,exportdir:,help,credential:,\ +_longopts="debian,systemd,qemu-path:,qemu-suffix:,exportdir:,help,credential:,\ persistent:,preserve-argv0:" -options=$(getopt -o ds:Q:S:e:hc:p:g:F: -l ${_longopts} -- "$@") +options=$(getopt -o dsQ:S:e:hc:p:g:F: -l ${_longopts} -- "$@") || exit 1 eval set -- "$options" while true ; do @@ -363,23 +364,6 @@ while true ; do CHECK=qemu_check_systemd BINFMT_SET=qemu_generate_systemd EXPORTDIR=${EXPORTDIR:-$SYSTEMDDIR} -shift -# check given cpu is in the supported CPU list -if [ "$1" != "ALL" ] ; then -for cpu in ${qemu_target_list} ; do -if [
Re: [PATCH 3/4] vdpa: Restore vlan filtering state
On Wed, Jul 19, 2023 at 9:48 AM Hawkins Jiawei wrote: > > This patch introduces vhost_vdpa_net_load_single_vlan() > and vhost_vdpa_net_load_vlan() to restore the vlan > filtering state at device's startup. > > Co-developed-by: Eugenio Pérez > Signed-off-by: Eugenio Pérez > Signed-off-by: Hawkins Jiawei > --- > net/vhost-vdpa.c | 49 > 1 file changed, 49 insertions(+) > > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c > index 9795306742..0787dd933b 100644 > --- a/net/vhost-vdpa.c > +++ b/net/vhost-vdpa.c > @@ -965,6 +965,51 @@ static int vhost_vdpa_net_load_rx(VhostVDPAState *s, > return 0; > } > > +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s, > + const VirtIONet *n, > + uint16_t vid) > +{ > +const struct iovec data = { > +.iov_base = &vid, > +.iov_len = sizeof(vid), > +}; > +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_VLAN, > + VIRTIO_NET_CTRL_VLAN_ADD, > + &data, 1); > +if (unlikely(dev_written < 0)) { > +return dev_written; > +} > +if (unlikely(*s->status != VIRTIO_NET_OK)) { > +return -EIO; > +} > + > +return 0; > +} > + > +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s, > +const VirtIONet *n) > +{ > +int r; > + > +if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) { > +return 0; > +} > + > +for (int i = 0; i < MAX_VLAN >> 5; i++) { > +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) { > +if (n->vlans[i] & (1U << j)) { > +r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) + j); > +if (unlikely(r != 0)) { > +return r; > +} > +} > +} > +} > + > +return 0; > +} > + > + Nit: I'm not sure if it was here originally, but there is an extra newline here. > static int vhost_vdpa_net_load(NetClientState *nc) > { > VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); > @@ -995,6 +1040,10 @@ static int vhost_vdpa_net_load(NetClientState *nc) > if (unlikely(r)) { > return r; > } > +r = vhost_vdpa_net_load_vlan(s, n); > +if (unlikely(r)) { > +return r; > +} > > return 0; > } > -- > 2.25.1 >
Re: [PATCH for-8.2 6/6] vfio/migration: Allow migration of multiple P2P supporting devices
On 7/16/23 10:15, Avihai Horon wrote: Now that P2P support has been added to VFIO migration, allow migration of multiple devices if all of them support P2P migration. Single device migration is allowed regardless of P2P migration support. Signed-off-by: Avihai Horon Signed-off-by: Joao Martins --- hw/vfio/common.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7c3d636025..753b320739 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -363,21 +363,31 @@ bool vfio_mig_active(void) static Error *multiple_devices_migration_blocker; -static unsigned int vfio_migratable_device_num(void) +/* + * Multiple devices migration is allowed only if all devices support P2P + * migration. Single device migration is allowed regardless of P2P migration + * support. + */ +static bool vfio_should_block_multiple_devices_migration(void) Could we revert the logic and call the routine : vfio_multiple_devices_migration_is_supported() I think it would be clearer in the callers. This is minor. Thanks, C. { VFIOGroup *group; VFIODevice *vbasedev; unsigned int device_num = 0; +bool all_support_p2p = true; QLIST_FOREACH(group, &vfio_group_list, next) { QLIST_FOREACH(vbasedev, &group->device_list, next) { if (vbasedev->migration) { device_num++; + +if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) { +all_support_p2p = false; +} } } } -return device_num; +return !all_support_p2p && device_num > 1; } int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) @@ -385,19 +395,19 @@ int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) int ret; if (multiple_devices_migration_blocker || -vfio_migratable_device_num() <= 1) { +!vfio_should_block_multiple_devices_migration()) { return 0; } if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { -error_setg(errp, "Migration is currently not supported with multiple " - "VFIO devices"); +error_setg(errp, "Multiple VFIO devices migration is supported only if " + "all of them support P2P migration"); return -EINVAL; } error_setg(&multiple_devices_migration_blocker, - "Migration is currently not supported with multiple " - "VFIO devices"); + "Multiple VFIO devices migration is supported only if all of " + "them support P2P migration"); ret = migrate_add_blocker(multiple_devices_migration_blocker, errp); if (ret < 0) { error_free(multiple_devices_migration_blocker); @@ -410,7 +420,7 @@ int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) void vfio_unblock_multiple_devices_migration(void) { if (!multiple_devices_migration_blocker || -vfio_migratable_device_num() > 1) { +vfio_should_block_multiple_devices_migration()) { return; }
Re: [PATCH v2] accel/kvm: Specify default IPA size for arm64
On Fri, 21 Jul 2023 at 08:30, Philippe Mathieu-Daudé wrote: > > Hi Akihiko, > > On 21/7/23 08:24, Akihiko Odaki wrote: > > libvirt uses "none" machine type to test KVM availability. Before this > > change, QEMU used to pass 0 as machine type when calling KVM_CREATE_VM. > > > > The kernel documentation says: > >> On arm64, the physical address size for a VM (IPA Size limit) is > >> limited to 40bits by default. The limit can be configured if the host > >> supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use > >> KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type > >> identifier, where IPA_Bits is the maximum width of any physical > >> address used by the VM. The IPA_Bits is encoded in bits[7-0] of the > >> machine type identifier. > >> > >> e.g, to configure a guest to use 48bit physical address size:: > >> > >> vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); > >> > >> The requested size (IPA_Bits) must be: > >> > >> == = > >>0 Implies default size, 40bits (for backward compatibility) > >>N Implies N bits, where N is a positive integer such that, > >>32 <= N <= Host_IPA_Limit > >> == = > > > >> Host_IPA_Limit is the maximum possible value for IPA_Bits on the host > >> and is dependent on the CPU capability and the kernel configuration. > >> The limit can be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the > >> KVM_CHECK_EXTENSION ioctl() at run-time. > >> > >> Creation of the VM will fail if the requested IPA size (whether it is > >> implicit or explicit) is unsupported on the host. > > https://docs.kernel.org/virt/kvm/api.html#kvm-create-vm > > > > So if Host_IPA_Limit < 40, such KVM_CREATE_VM will fail, and libvirt > > incorrectly thinks KVM is not available. This actually happened on M2 > > MacBook Air. > > > > Fix this by specifying 32 for IPA_Bits as any arm64 system should > > support the value according to the documentation. > > > > Signed-off-by: Akihiko Odaki > > --- > > V1 -> V2: Introduced an arch hook > > > > include/sysemu/kvm.h | 1 + > > accel/kvm/kvm-all.c| 2 +- > > target/arm/kvm.c | 2 ++ > > target/i386/kvm/kvm.c | 2 ++ > > target/mips/kvm.c | 2 ++ > > target/ppc/kvm.c | 2 ++ > > target/riscv/kvm.c | 2 ++ > > target/s390x/kvm/kvm.c | 2 ++ > > 8 files changed, 14 insertions(+), 1 deletion(-) > > My understanding of Peter's suggestion would be smth like: > > -- >8 -- > diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h > index 115f0cca79..c0af15eb6c 100644 > --- a/include/sysemu/kvm.h > +++ b/include/sysemu/kvm.h > @@ -201,10 +201,15 @@ typedef struct KVMCapabilityInfo { > > struct KVMState; > > +struct KVMClass { > +AccelClass parent_class; > + > +int default_vm_type; The kernel docs say you need to check for the KVM_CAP_ARM_VM_IPA_SIZE before you can pass something other than zero to the KVM_CREATE_VM ioctl, so this needs to be a method, not just a value. (kvm_arm_get_max_vm_ipa_size() will do this bit for you.) If the machine doesn't provide a kvm_type method, we should default to "largest the host supports", I think. I was wondering if we could have one per-arch method for "actually create the VM" that both was a place for arm to set the default vm type and also let us get the TARGET_S390X and TARGET_PPC ifdefs out of this bit of kvm-all.c, but maybe that would look just a bit too awkward: if (kc->create_vm(s, board_sets_kvm_type, board_kvm_type) < 0) { goto err; } where board_sets_kvm_type is a bool, true if board_kvm_type is valid, and board_kvm_type is whatever the board's mc->kvm_type method told us. (Default impl of the method: call KVM_CREATE_VM ioctl with retry-on-eintr, printing the simple error message; PPC and s390 versions similar but with their arch specific extra messages; arm version has a different default type if board_sets_kvm_type is false.) Not trying to do both of those things with one method would result in a simpler type = kc->get_default_kvm_type(s); API. thanks -- PMM
[PATCH] target/ppc: Fix the order of kvm_enable judgment about kvmppc_set_interrupt()
It's unnecessary for non-KVM accelerators(TCG, for example), to call this function, so change the order of kvm_enable() judgment. The static inline function that returns -1 directly does not work in TCG's situation. Signed-off-by: jianchunfu --- hw/ppc/ppc.c | 8 ++-- target/ppc/kvm.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index 0e0a3d93c3..3e96b24487 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -58,7 +58,9 @@ void ppc_set_irq(PowerPCCPU *cpu, int irq, int level) if (old_pending != env->pending_interrupts) { ppc_maybe_interrupt(env); -kvmppc_set_interrupt(cpu, irq, level); +if (kvm_enabled()) { +kvmppc_set_interrupt(cpu, irq, level); +} } trace_ppc_irq_set_exit(env, irq, level, env->pending_interrupts, @@ -1465,5 +1467,7 @@ void ppc_irq_reset(PowerPCCPU *cpu) CPUPPCState *env = &cpu->env; env->irq_input_state = 0; -kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0); +if (kvm_enabled()) { +kvmppc_set_interrupt(cpu, PPC_INTERRUPT_EXT, 0); +} } diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index a8a935e267..11a1fbc244 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -1315,7 +1315,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level) return 0; } -if (!kvm_enabled() || !cap_interrupt_unset) { +if (!cap_interrupt_unset) { return 0; } -- 2.27.0
Re: [PATCH 1/2] hw/char/riscv_htif: Fix printing of console characters on big endian hosts
On 21/7/23 11:47, Thomas Huth wrote: The character that should be printed is stored in the 64 bit "payload" variable. The code currently tries to print it by taking the address of the variable and passing this pointer to qemu_chr_fe_write(). However, this only works on little endian hosts where the least significant bits are stored on the lowest address. To do this in a portable way, we have to store the value in an uint8_t variable instead. Fixes: 5033606780 ("RISC-V HTIF Console") Signed-off-by: Thomas Huth --- hw/char/riscv_htif.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Reviewed-by: Philippe Mathieu-Daudé
[PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG
The 'host' CPU is available in a CONFIG_KVM build and it's currently available for all accels, but is a KVM only CPU. This means that in a RISC-V KVM capable host we can do things like this: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: H extension requires priv spec 1.12.0 This CPU does not have a priv spec because we don't filter its extensions via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all with the 'host' CPU. We don't have a way to filter the 'host' CPU out of the available CPU options (-cpu help) if the build includes both KVM and TCG. What we can do is to error out during riscv_cpu_realize_tcg() if the user chooses the 'host' CPU with accel=tcg: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration Signed-off-by: Daniel Henrique Barboza --- target/riscv/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 6b93b04453..08db3d613f 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, Error **errp) CPURISCVState *env = &cpu->env; Error *local_err = NULL; +if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) { +error_setg(errp, "'host' CPU is not compatible with TCG acceleration"); +return; +} + riscv_cpu_validate_misa_mxl(cpu, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); -- 2.41.0
[PATCH] virtio: Fix packed virtqueue used_idx mask
virtio_queue_packed_set_last_avail_idx() is used by vhost devices to set the internal queue indices to what has been reported by the vhost back-end through GET_VRING_BASE. For packed virtqueues, this 32-bit value is expected to contain both the device's internal avail and used indices, as well as their respective wrap counters. To get the used index, we shift the 32-bit value right by 16, and then apply a mask of 0x7. That seems to be a typo, because it should be 0x7fff; first of all, the virtio specification says that the maximum queue size for packed virt queues is 2^15, so the indices cannot exceed 2^15 - 1 anyway, making 0x7fff the correct mask. Second, the mask clearly is wrong from context, too, given that (A) `idx & 0x7` must be 0 at this point (`idx` is 32 bit and was shifted to the right by 16 already), (B) `idx & 0x8000` is the used_wrap_counter, so should not be part of the used index, and (C) `vq->used_idx` is a `uint16_t`, so cannot fit the 0x7 part of the mask anyway. This most likely never produced any guest-visible bugs, though, because for a vhost device, qemu will probably not evaluate the used index outside of virtio_queue_packed_get_last_avail_idx(), where we reconstruct the 32-bit value from avail and used indices and their wrap counters again. There, it does not matter whether the highest bit of the used_idx is the used index wrap counter, because we put the wrap counter exactly in that position anyway. Signed-off-by: Hanna Czenczek --- hw/virtio/virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 295a603e58..309038fd46 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3321,7 +3321,7 @@ static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev, vq->last_avail_wrap_counter = vq->shadow_avail_wrap_counter = !!(idx & 0x8000); idx >>= 16; -vq->used_idx = idx & 0x7; +vq->used_idx = idx & 0x7fff; vq->used_wrap_counter = !!(idx & 0x8000); } -- 2.41.0
Re: [PATCH 7/7] tests/avocado: ppc64 pseries reverse debugging test
On Mon Jun 26, 2023 at 7:34 PM AEST, Nicholas Piggin wrote: > On Mon Jun 26, 2023 at 5:49 PM AEST, Pavel Dovgalyuk wrote: > > On 23.06.2023 15:57, Nicholas Piggin wrote: > > > pseries can run reverse-debugging well enough to pass basic tests. > > > > > > There is strangeness with reverse-continue possibly relating to a bp > > > being set on the first instruction or on a snapshot, that causes > > > the PC to be reported on the first instruction rather than last > > > breakpoint, so a workaround is added for that for now. > > > > It means that the test reveals some kind of a bug in PPC debugging > > server implementation. > > In this case it is better to fix that instead of adding workaround. > > I agree, and I'm trying to find the cause it hasn't been easy. I > thought the test was still interesting because it otherwise seems > to work well, but hopefully I can find the issue before long. I found the problem after too much staring at record-replay. QEMU works perfectly, it is the ppc pseries firmware that branches to its own entry point address within the recorded trace, and that confuses the test script. I'm not sure the best way to work around it. Initial skip works okay but also maybe(?) a good edge case to test a break on the very first instruction of the trace even if we don't reverse continue to it. I will send a new series and propose to add the breakpoints before seeking to the end of the trace, so we will catch a breakpoint being executed again. Thanks, Nick
Re: [PATCH 6/6] vhost-user: Have reset_status fall back to reset
On 20.07.23 18:03, Stefan Hajnoczi wrote: On Wed, Jul 19, 2023 at 04:27:58PM +0200, Hanna Czenczek wrote: On 19.07.23 16:11, Hanna Czenczek wrote: On 18.07.23 17:10, Stefan Hajnoczi wrote: On Tue, Jul 11, 2023 at 05:52:28PM +0200, Hanna Czenczek wrote: The only user of vhost_user_reset_status() is vhost_dev_stop(), which only uses it as a fall-back to stop the back-end if it does not support SUSPEND. However, vhost-user's implementation is a no-op unless the back-end supports SET_STATUS. vhost-vdpa's implementation instead just calls vhost_vdpa_reset_device(), implying that it's OK to fully reset the device if SET_STATUS is not supported. To be fair, vhost_vdpa_reset_device() does nothing but to set the status to zero. However, that may well be because vhost-vdpa has no method besides this to reset a device. In contrast, vhost-user has RESET_DEVICE and a RESET_OWNER, which can be used instead. While it is not entirely clear from documentation or git logs, from discussions and the order of vhost-user protocol features, it appears to me as if RESET_OWNER originally had no real meaning for vhost-user, and was thus used to signal a device reset to the back-end. Then, RESET_DEVICE was introduced, to have a well-defined dedicated reset command. Finally, vhost-user received full STATUS support, including SET_STATUS, so setting the device status to 0 is now the preferred way of resetting a device. Still, RESET_DEVICE and RESET_OWNER should remain valid as fall-backs. Therefore, have vhost_user_reset_status() fall back to vhost_user_reset_device() if the back-end has no STATUS support. Signed-off-by: Hanna Czenczek --- hw/virtio/vhost-user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 4507de5a92..53a881ec2a 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2833,6 +2833,8 @@ static void vhost_user_reset_status(struct vhost_dev *dev) if (virtio_has_feature(dev->protocol_features, VHOST_USER_PROTOCOL_F_STATUS)) { vhost_user_set_status(dev, 0); + } else { + vhost_user_reset_device(dev); } } Did you check whether DPDK treats setting the status to 0 as equivalent to RESET_DEVICE? If it doesn’t, what’s even the point of using reset_status? Sorry, I’m being unclear, and I think this may be important because it ties into the question from patch 1, what qemu is even trying to do by running SET_STATUS(0) vhost_dev_stop(), so here’s what gave me the impression that SET_STATUS(0) and RESET_DEVICE should be equivalent: vhost-vdpa.c runs SET_STATUS(0) in a function called vhost_vdpa_reset_device(). This is one thing that gave me the impression that this is about an actual full reset. Another is the whole discussion that we’ve had. vhost_dev_stop() does not call a `vhost_reset_device()` function, it calls `vhost_reset_status()`. Still, we were always talking about resetting the device. There is some hacky stuff with struct vhost_dev's vq_index_end and multi-queue devices. I think it's because multi-queue vhost-net device consist of many vhost_devs and NetClientStates, so certain vhost operations are skipped unless this is the "first" or "last" vhost_dev from a large aggregate vhost-net device. That might be responsible for part of the weirdness. It doesn’t make sense to me that vDPA would provide no function to fully reset a device, while vhost-user does. Being able to reset a device sounds vital to me. This also gave me the impression that SET_STATUS(0) on vDPA at least is functionally equivalent to a full device reset. Maybe SET_STATUS(0) does mean a full device reset on vDPA, but not on vhost-user. That would be a real shame, so I assumed this would not be the case; that SET_STATUS(0) does the same thing on both protocols. Yes, exactly. It has the real VIRTIO spec meaning in vDPA. In vhost-user it's currently only used by DPDK as a hint for when device initialization is complete: https://github.com/DPDK/dpdk/commit/41d201804c4c44738168e2d247d3b1780845faa1 FWIW, now the code is a bit different. https://github.com/DPDK/dpdk/commit/671cc679a5fcd26705bb20ddc13b93e665719054 has added a RESET interpretation for the status field, i.e. when it is 0. It doesn’t do anything, but at least DPDK seems to agree that SET_STATUS(0) is a reset. The virtio specification says “Writing 0 into this field resets the device.” about the device_status field. This also makes sense, because the device_status field is basically used to tell the device that a driver has taken control. If reset, this indicates the driver has given up control, and to me this is a point where a device should fully reset itself. So all in all, I can’t see the rationale why any implementation that supports SET_STATUS would decide to treat SET_STATUS(0) not as equivalent or a superset of RESET_DEVICE. I may be wrong, and this might explain a whole deal about what kind of backgro
[PATCH for-8.1] target/arm: Special case M-profile in debug_helper.c code
A lot of the code called from helper_exception_bkpt_insn() is written assuming A-profile, but we will also call this helper on M-profile CPUs when they execute a BKPT insn. This used to work by accident, but recent changes mean that we will hit an assert when some of this code calls down into lower level functions that end up calling arm_security_space_below_el3(), arm_el_is_aa64(), and other functions that now explicitly assert that the guest CPU is not M-profile. Handle M-profile directly to avoid the assertions: * in arm_debug_target_el(), M-profile debug exceptions always go to EL1 * in arm_debug_exception_fsr(), M-profile always uses the short format FSR (compare commit d7fe699be54b2, though in this case the code in arm_v7m_cpu_do_interrupt() does not need to look at the FSR value at all) Cc: qemu-sta...@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1775 Signed-off-by: Peter Maydell --- Not sure exactly when this assert() got in (probably in the semi-recent refactorings for realm support), but it won't hurt to backport the fix even if the older QEMU doesn't assert(). --- target/arm/debug_helper.c | 18 -- 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/target/arm/debug_helper.c b/target/arm/debug_helper.c index 8362462a07e..abe72e35ae6 100644 --- a/target/arm/debug_helper.c +++ b/target/arm/debug_helper.c @@ -21,6 +21,10 @@ static int arm_debug_target_el(CPUARMState *env) bool secure = arm_is_secure(env); bool route_to_el2 = false; +if (arm_feature(env, ARM_FEATURE_M)) { +return 1; +} + if (arm_is_el2_enabled(env)) { route_to_el2 = env->cp15.hcr_el2 & HCR_TGE || env->cp15.mdcr_el2 & MDCR_TDE; @@ -434,18 +438,20 @@ static uint32_t arm_debug_exception_fsr(CPUARMState *env) { ARMMMUFaultInfo fi = { .type = ARMFault_Debug }; int target_el = arm_debug_target_el(env); -bool using_lpae = false; +bool using_lpae; -if (target_el == 2 || arm_el_is_aa64(env, target_el)) { +if (arm_feature(env, ARM_FEATURE_M)) { +using_lpae = false; +} else if (target_el == 2 || arm_el_is_aa64(env, target_el)) { using_lpae = true; } else if (arm_feature(env, ARM_FEATURE_PMSA) && arm_feature(env, ARM_FEATURE_V8)) { using_lpae = true; +} else if (arm_feature(env, ARM_FEATURE_LPAE) && + (env->cp15.tcr_el[target_el] & TTBCR_EAE)) { +using_lpae = true; } else { -if (arm_feature(env, ARM_FEATURE_LPAE) && -(env->cp15.tcr_el[target_el] & TTBCR_EAE)) { -using_lpae = true; -} +using_lpae = false; } if (using_lpae) { -- 2.34.1
Re: [PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG
On 21/7/23 15:34, Daniel Henrique Barboza wrote: The 'host' CPU is available in a CONFIG_KVM build and it's currently available for all accels, but is a KVM only CPU. This means that in a RISC-V KVM capable host we can do things like this: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: H extension requires priv spec 1.12.0 This CPU does not have a priv spec because we don't filter its extensions via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all with the 'host' CPU. We don't have a way to filter the 'host' CPU out of the available CPU options (-cpu help) if the build includes both KVM and TCG. What we can do is to error out during riscv_cpu_realize_tcg() if the user chooses the 'host' CPU with accel=tcg: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration Signed-off-by: Daniel Henrique Barboza --- target/riscv/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 6b93b04453..08db3d613f 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, Error **errp) /me wonders why this method isn't used as AccelCPUClass:cpu_realizefn(). CPURISCVState *env = &cpu->env; Error *local_err = NULL; +if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) { +error_setg(errp, "'host' CPU is not compatible with TCG acceleration"); +return; +} We should manage that generically, likely in CPUClass. We could define riscv CPU with a DEFINE_CPU_WITH_CLASS() macro to pass the class_init() handler. Anyhow meanwhile your patch is sufficient, so Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH] target/riscv/cpu.c: do not run 'host' CPU with TCG
On 7/21/23 11:37, Philippe Mathieu-Daudé wrote: On 21/7/23 15:34, Daniel Henrique Barboza wrote: The 'host' CPU is available in a CONFIG_KVM build and it's currently available for all accels, but is a KVM only CPU. This means that in a RISC-V KVM capable host we can do things like this: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: H extension requires priv spec 1.12.0 This CPU does not have a priv spec because we don't filter its extensions via priv spec. We shouldn't be reaching riscv_cpu_realize_tcg() at all with the 'host' CPU. We don't have a way to filter the 'host' CPU out of the available CPU options (-cpu help) if the build includes both KVM and TCG. What we can do is to error out during riscv_cpu_realize_tcg() if the user chooses the 'host' CPU with accel=tcg: $ ./build/qemu-system-riscv64 -M virt,accel=tcg -cpu host --nographic qemu-system-riscv64: 'host' CPU is not compatible with TCG acceleration Signed-off-by: Daniel Henrique Barboza --- target/riscv/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 6b93b04453..08db3d613f 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1395,6 +1395,11 @@ static void riscv_cpu_realize_tcg(DeviceState *dev, Error **errp) /me wonders why this method isn't used as AccelCPUClass:cpu_realizefn(). CPURISCVState *env = &cpu->env; Error *local_err = NULL; + if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_HOST)) { + error_setg(errp, "'host' CPU is not compatible with TCG acceleration"); + return; + } We should manage that generically, likely in CPUClass. We could define riscv CPU with a DEFINE_CPU_WITH_CLASS() macro to pass the class_init() handler. Anyhow meanwhile your patch is sufficient, so Reviewed-by: Philippe Mathieu-Daudé Thanks. I'll mark all your suggestions as todo for 8.2. Daniel
Re: [PATCH 3/4] vdpa: Restore vlan filtering state
On 2023/7/21 19:57, Eugenio Perez Martin wrote: > On Wed, Jul 19, 2023 at 9:48 AM Hawkins Jiawei wrote: >> >> This patch introduces vhost_vdpa_net_load_single_vlan() >> and vhost_vdpa_net_load_vlan() to restore the vlan >> filtering state at device's startup. >> >> Co-developed-by: Eugenio Pérez >> Signed-off-by: Eugenio Pérez >> Signed-off-by: Hawkins Jiawei >> --- >> net/vhost-vdpa.c | 49 >> 1 file changed, 49 insertions(+) >> >> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c >> index 9795306742..0787dd933b 100644 >> --- a/net/vhost-vdpa.c >> +++ b/net/vhost-vdpa.c >> @@ -965,6 +965,51 @@ static int vhost_vdpa_net_load_rx(VhostVDPAState *s, >> return 0; >> } >> >> +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s, >> + const VirtIONet *n, >> + uint16_t vid) >> +{ >> +const struct iovec data = { >> +.iov_base = &vid, >> +.iov_len = sizeof(vid), >> +}; >> +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_VLAN, >> + VIRTIO_NET_CTRL_VLAN_ADD, >> + &data, 1); >> +if (unlikely(dev_written < 0)) { >> +return dev_written; >> +} >> +if (unlikely(*s->status != VIRTIO_NET_OK)) { >> +return -EIO; >> +} >> + >> +return 0; >> +} >> + >> +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s, >> +const VirtIONet *n) >> +{ >> +int r; >> + >> +if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_VLAN)) { >> +return 0; >> +} >> + >> +for (int i = 0; i < MAX_VLAN >> 5; i++) { >> +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) { >> +if (n->vlans[i] & (1U << j)) { >> +r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) + j); >> +if (unlikely(r != 0)) { >> +return r; >> +} >> +} >> +} >> +} >> + >> +return 0; >> +} >> + >> + > > Nit: I'm not sure if it was here originally, but there is an extra newline > here. Hi Eugenio, It was not here originally, it was introduced mistakenly during the refactoring process. I will fix this in the v2 version. Thanks! > >> static int vhost_vdpa_net_load(NetClientState *nc) >> { >> VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); >> @@ -995,6 +1040,10 @@ static int vhost_vdpa_net_load(NetClientState *nc) >> if (unlikely(r)) { >> return r; >> } >> +r = vhost_vdpa_net_load_vlan(s, n); >> +if (unlikely(r)) { >> +return r; >> +} >> >> return 0; >> } >> -- >> 2.25.1 >> >
[PATCH v9 01/10] migration: New QAPI type 'MigrateAddress'
This patch introduces well defined MigrateAddress struct and its related child objects. The existing argument of 'migrate' and 'migrate-incoming' QAPI - 'uri' is of type string. The current implementation follows double encoding scheme for fetching migration parameters like 'uri' and this is not an ideal design. Motive for intoducing struct level design is to prevent double encoding of QAPI arguments, as Qemu should be able to directly use the QAPI arguments without any level of encoding. Note: this commit only adds the type, and actual uses comes in later commits. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Juan Quintela Reviewed-by: Daniel P. Berrangé Acked-by: Markus Armbruster --- qapi/migration.json | 41 + 1 file changed, 41 insertions(+) diff --git a/qapi/migration.json b/qapi/migration.json index 47dfef0278..0f4a54a7ed 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1417,6 +1417,47 @@ ## { 'command': 'migrate-continue', 'data': {'state': 'MigrationStatus'} } +## +# @MigrationAddressType: +# +# The migration stream transport mechanisms. +# +# @socket: Migrate via socket. +# +# @exec: Direct the migration stream to another process. +# +# @rdma: Migrate via RDMA. +# +# Since 8.2 +## +{ 'enum': 'MigrationAddressType', + 'data': ['socket', 'exec', 'rdma'] } + +## +# @MigrationExecCommand: +# +# @args: command (list head) and arguments to execute. +# +# Since 8.2 +## +{ 'struct': 'MigrationExecCommand', + 'data': {'args': [ 'str' ] } } + +## +# @MigrationAddress: +# +# Migration endpoint configuration. +# +# Since 8.2 +## +{ 'union': 'MigrationAddress', + 'base': { 'transport' : 'MigrationAddressType'}, + 'discriminator': 'transport', + 'data': { +'socket': 'SocketAddress', +'exec': 'MigrationExecCommand', +'rdma': 'InetSocketAddress' } } + ## # @migrate: # -- 2.22.3
[PATCH v9 09/10] migration: Implement MigrateChannelList to hmp migration flow.
Integrate MigrateChannelList with all transport backends (socket, exec and rdma) for both src and dest migration endpoints for hmp migration. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala --- migration/migration-hmp-cmds.c | 16 +--- migration/migration.c | 5 ++--- migration/migration.h | 3 ++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 49b150f33f..25f51ec99c 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -423,10 +423,14 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict) { Error *err = NULL; const char *uri = qdict_get_str(qdict, "uri"); +MigrationChannelList *caps = NULL; +g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1); -qmp_migrate_incoming(uri, false, NULL, &err); +migrate_uri_parse(uri, &channel, &err); +QAPI_LIST_PREPEND(caps, channel); -hmp_handle_error(mon, err); +qmp_migrate_incoming(NULL, true, caps, &err); +qapi_free_MigrationChannelList(caps); } void hmp_migrate_recover(Monitor *mon, const QDict *qdict) @@ -704,9 +708,15 @@ void hmp_migrate(Monitor *mon, const QDict *qdict) bool resume = qdict_get_try_bool(qdict, "resume", false); const char *uri = qdict_get_str(qdict, "uri"); Error *err = NULL; +MigrationChannelList *caps = NULL; +g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1); + +migrate_uri_parse(uri, &channel, &err); +QAPI_LIST_PREPEND(caps, channel); -qmp_migrate(uri, false, NULL, !!blk, blk, !!inc, inc, +qmp_migrate(NULL, true, caps, !!blk, blk, !!inc, inc, false, false, true, resume, &err); +qapi_free_MigrationChannelList(caps); if (hmp_handle_error(mon, err)) { return; } diff --git a/migration/migration.c b/migration/migration.c index acf80b3590..cf063a76df 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -425,9 +425,8 @@ void migrate_add_address(SocketAddress *address) QAPI_CLONE(SocketAddress, address)); } -static bool migrate_uri_parse(const char *uri, - MigrationChannel **channel, - Error **errp) +bool migrate_uri_parse(const char *uri, MigrationChannel **channel, + Error **errp) { g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1); g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); diff --git a/migration/migration.h b/migration/migration.h index b7c8b67542..a8268394ca 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -501,7 +501,8 @@ bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm, Error **errp); void migrate_add_address(SocketAddress *address); - +bool migrate_uri_parse(const char *uri, MigrationChannel **channel, + Error **errp); int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); #define qemu_ram_foreach_block \ -- 2.22.3
[PATCH v9 00/10] migration: Modify 'migrate' and 'migrate-incoming' QAPI commands for migration
This is v9 patchset of modified 'migrate' and 'migrate-incoming' QAPI design for upstream review. Would like to thank all the maintainers that actively participated in the v8 patchset discussion and gave insightful suggestions to improve the patches. Link to previous upstream community patchset links: v1: https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg04339.html v2: https://lists.gnu.org/archive/html/qemu-devel/2023-02/msg02106.html v3: https://lists.gnu.org/archive/html/qemu-devel/2023-02/msg02473.html v4: https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg03064.html v5: https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg04845.html v6: https://lists.gnu.org/archive/html/qemu-devel/2023-06/msg01251.html v7: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02027.html v8: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02770.html v8 -> v9 changelog: --- - Patch1 : changed the qemu version from v8.1 -> v8.2. - Patch2 : changes for WIN32 regarding exec command. - Patch6 : Split qmp changes from hmp changes (Patch 9). - Patch8 : QAPI_CLONE of SocketAddress defined eariler in Patch3. - Patch9 : introducing migrate 'channels' in HMP, and change in implementation from earlier patch. - Patch10 : modify existing migration test case with new QAPI syntax instead of adding new test case all together. Abstract: - Current QAPI 'migrate' command design (for initiating a migration stream) contains information regarding different migrate transport mechanism (tcp / unix / exec), dest-host IP address, and binding port number in form of a string. Thus the design does seem to have some design issues. Some of the issues, stated below are: 1. Use of string URIs is a data encoding scheme within a data encoding scheme. QEMU code should directly be able to work with the results from QAPI, without resorting to do a second level of parsing (eg. socket_parse()). 2. For features / parameters related to migration, the migration tunables needs to be defined and updated upfront. For example, 'migrate-set-capability' and 'migrate-set-parameter' is required to enable multifd capability and multifd-number of channels respectively. Instead, 'Multifd-channels' can directly be represented as a single additional parameter to 'migrate' QAPI. 'migrate-set-capability' and 'migrate-set-parameter' commands could be used for runtime tunables that need setting after migration has already started. The current patchset focuses on solving the first problem of multi-level encoding of URIs. The patch defines 'migrate' command as a QAPI discriminated union for the various transport backends (like socket, exec and rdma), and on basis of transport backends, different migration parameters are defined. (uri) string --> (channel) Channel-type Transport-type Migration parameters based on transport type -- Het Gala (10): migration: New QAPI type 'MigrateAddress' migration: convert migration 'uri' into 'MigrateAddress' migration: convert socket backend to accept MigrateAddress migration: convert rdma backend to accept MigrateAddress migration: convert exec backend to accept MigrateAddress. migration: New migrate and migrate-incoming argument 'channels' migration: modify migration_channels_and_uri_compatible() for new QAPI syntax migration: Implement MigrateChannelList to qmp migration flow. migration: Implement MigrateChannelList to hmp migration flow. migration: modify test_multifd_tcp_none() to use new QAPI syntax. migration/exec.c | 72 + migration/exec.h | 8 +- migration/migration-hmp-cmds.c | 18 +++- migration/migration.c | 182 ++--- migration/migration.h | 3 +- migration/rdma.c | 34 +++--- migration/rdma.h | 6 +- migration/socket.c | 39 ++- migration/socket.h | 7 +- qapi/migration.json| 150 ++- softmmu/vl.c | 2 +- tests/qtest/migration-test.c | 7 +- 12 files changed, 401 insertions(+), 127 deletions(-) -- 2.22.3
[PATCH v9 08/10] migration: Implement MigrateChannelList to qmp migration flow.
Integrate MigrateChannelList with all transport backends (socket, exec and rdma) for both src and dest migration endpoints for qmp migration. For current series, limit the size of MigrateChannelList to single element (single interface) as runtime check. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala --- migration/migration.c | 77 --- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 6e0a8beaf2..acf80b3590 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -426,9 +426,10 @@ void migrate_add_address(SocketAddress *address) } static bool migrate_uri_parse(const char *uri, - MigrationAddress **channel, + MigrationChannel **channel, Error **errp) { +g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1); g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); SocketAddress *saddr = &addr->u.socket; InetSocketAddress *isock = &addr->u.rdma; @@ -465,7 +466,9 @@ static bool migrate_uri_parse(const char *uri, return false; } -*channel = addr; +val->channel_type = MIGRATION_CHANNEL_TYPE_MAIN; +val->addr = addr; +*channel = val; return true; } @@ -473,7 +476,8 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, MigrationChannelList *channels, Error **errp) { -g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); +g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1); +g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); /* * Having preliminary checks for uri and channel @@ -483,20 +487,29 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, "exclusive; exactly one of the two should be present in " "'migrate-incoming' qmp command "); return; +} else if (channels) { +/* To verify that Migrate channel list has only item */ +if (channels->next) { +error_setg(errp, "Channel list has more than one entries"); +return; +} +channel = channels->value; +} else { +/* caller uses the old URI syntax */ +if (uri && !migrate_uri_parse(uri, &channel, errp)) { +return; +} } - -if (uri && !migrate_uri_parse(uri, &channel, errp)) { -return; -} +addr = channel->addr; /* transport mechanism not suitable for migration? */ -if (!migration_channels_and_transport_compatible(channel, errp)) { +if (!migration_channels_and_transport_compatible(addr, errp)) { return; } qapi_event_send_migration(MIGRATION_STATUS_SETUP); -if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { -SocketAddress *saddr = &channel->u.socket; +if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { +SocketAddress *saddr = &addr->u.socket; if (saddr->type == SOCKET_ADDRESS_TYPE_INET || saddr->type == SOCKET_ADDRESS_TYPE_UNIX || saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) { @@ -505,11 +518,11 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, fd_start_incoming_migration(saddr->u.fd.str, errp); } #ifdef CONFIG_RDMA -} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) { -rdma_start_incoming_migration(&channel->u.rdma, errp); -#endif -} else if (channel->transport == MIGRATION_ADDRESS_TYPE_EXEC) { -exec_start_incoming_migration(channel->u.exec.args, errp); +} else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { +rdma_start_incoming_migration(&addr->u.rdma, errp); + #endif +} else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) { +exec_start_incoming_migration(addr->u.exec.args, errp); } else { error_setg(errp, "unknown migration protocol: %s", uri); } @@ -1709,7 +1722,8 @@ void qmp_migrate(const char *uri, bool has_channels, bool resume_requested; Error *local_err = NULL; MigrationState *s = migrate_get_current(); -g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); +g_autoptr(MigrationChannel) channel = g_new0(MigrationChannel, 1); +g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); /* * Having preliminary checks for uri and channel @@ -1719,14 +1733,23 @@ void qmp_migrate(const char *uri, bool has_channels, "exclusive; exactly one of the two should be present in " "'migrate' qmp command "); return; +} else if (channels) { +/* To verify that Migrate channel list has only item */ +if (channels->next) { +error_setg(errp, "Cha
[PATCH v9 05/10] migration: convert exec backend to accept MigrateAddress.
Exec transport backend for 'migrate'/'migrate-incoming' QAPIs accept new wire protocol of MigrateAddress struct. It is achived by parsing 'uri' string and storing migration parameters required for exec connection into strList struct. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Daniel P. Berrangé --- migration/exec.c | 71 +++ migration/exec.h | 4 +-- migration/migration.c | 10 +++--- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/migration/exec.c b/migration/exec.c index 32f5143dfd..8bc321c66b 100644 --- a/migration/exec.c +++ b/migration/exec.c @@ -39,20 +39,50 @@ const char *exec_get_cmd_path(void) } #endif -void exec_start_outgoing_migration(MigrationState *s, const char *command, Error **errp) +/* provides the length of strList */ +static int +str_list_length(strList *list) +{ +int len = 0; +strList *elem; + +for (elem = list; elem != NULL; elem = elem->next) { +len++; +} + +return len; +} + +static void +init_exec_array(strList *command, char **argv, Error **errp) +{ +int i = 0; +strList *lst; + +for (lst = command; lst; lst = lst->next) { +argv[i++] = lst->value; +} + +argv[i] = NULL; +return; +} + +void exec_start_outgoing_migration(MigrationState *s, strList *command, + Error **errp) { QIOChannel *ioc; -#ifdef WIN32 -const char *argv[] = { exec_get_cmd_path(), "/c", command, NULL }; -#else -const char *argv[] = { "/bin/sh", "-c", command, NULL }; -#endif +int length = str_list_length(command); +g_auto(GStrv) argv = (char **) g_new0(const char *, length); -trace_migration_exec_outgoing(command); -ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv, -O_RDWR, -errp)); +init_exec_array(command, argv, errp); +g_autofree char *new_command = g_strjoinv(" ", (char **)argv); + +trace_migration_exec_outgoing(new_command); +ioc = QIO_CHANNEL( +qio_channel_command_new_spawn((const char * const *) argv, + O_RDWR, + errp)); if (!ioc) { return; } @@ -71,20 +101,21 @@ static gboolean exec_accept_incoming_migration(QIOChannel *ioc, return G_SOURCE_REMOVE; } -void exec_start_incoming_migration(const char *command, Error **errp) +void exec_start_incoming_migration(strList *command, Error **errp) { QIOChannel *ioc; -#ifdef WIN32 -const char *argv[] = { exec_get_cmd_path(), "/c", command, NULL }; -#else -const char *argv[] = { "/bin/sh", "-c", command, NULL }; -#endif +int length = str_list_length(command); +g_auto(GStrv) argv = (char **) g_new0(const char *, length); + +init_exec_array(command, argv, errp); +g_autofree char *new_command = g_strjoinv(" ", (char **)argv); -trace_migration_exec_incoming(command); -ioc = QIO_CHANNEL(qio_channel_command_new_spawn(argv, -O_RDWR, -errp)); +trace_migration_exec_incoming(new_command); +ioc = QIO_CHANNEL( +qio_channel_command_new_spawn((const char * const *) argv, + O_RDWR, + errp)); if (!ioc) { return; } diff --git a/migration/exec.h b/migration/exec.h index 736cd71028..3107f205e3 100644 --- a/migration/exec.h +++ b/migration/exec.h @@ -23,8 +23,8 @@ #ifdef WIN32 const char *exec_get_cmd_path(void); #endif -void exec_start_incoming_migration(const char *host_port, Error **errp); +void exec_start_incoming_migration(strList *host_port, Error **errp); -void exec_start_outgoing_migration(MigrationState *s, const char *host_port, +void exec_start_outgoing_migration(MigrationState *s, strList *host_port, Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index 8012f93f1b..f37b388876 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -468,7 +468,6 @@ static bool migrate_uri_parse(const char *uri, static void qemu_start_incoming_migration(const char *uri, Error **errp) { -const char *p = NULL; g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); /* URI is not suitable for migration? */ @@ -494,8 +493,8 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) } else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) { rdma_start_incoming_migration(&channel->u.rdma, errp); #endif -} else if (strstart(uri, "exec:", &p)) { -exec_start_incoming_migration(p, errp); +} else if (channel->transport == MIGRATION_ADDRESS_TYPE_EXEC) { +exec_start_incoming_migration(channel->u.exec.args, errp); } else {
[PATCH v9 03/10] migration: convert socket backend to accept MigrateAddress
Socket transport backend for 'migrate'/'migrate-incoming' QAPIs accept new wire protocol of MigrateAddress struct. It is achived by parsing 'uri' string and storing migration parameters required for socket connection into well defined SocketAddress struct. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Daniel P. Berrangé --- migration/migration.c | 30 ++ migration/socket.c| 39 +-- migration/socket.h| 7 --- 3 files changed, 31 insertions(+), 45 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 5b3be767f0..c4bcf8bbd7 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -481,18 +481,21 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) } qapi_event_send_migration(MIGRATION_STATUS_SETUP); -if (strstart(uri, "tcp:", &p) || -strstart(uri, "unix:", NULL) || -strstart(uri, "vsock:", NULL)) { -socket_start_incoming_migration(p ? p : uri, errp); +if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { +SocketAddress *saddr = &channel->u.socket; +if (saddr->type == SOCKET_ADDRESS_TYPE_INET || +saddr->type == SOCKET_ADDRESS_TYPE_UNIX || +saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) { +socket_start_incoming_migration(saddr, errp); +} else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { +fd_start_incoming_migration(saddr->u.fd.str, errp); +} #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { rdma_start_incoming_migration(p, errp); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_incoming_migration(p, errp); -} else if (strstart(uri, "fd:", &p)) { -fd_start_incoming_migration(p, errp); } else { error_setg(errp, "unknown migration protocol: %s", uri); } @@ -1715,18 +1718,21 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } } -if (strstart(uri, "tcp:", &p) || -strstart(uri, "unix:", NULL) || -strstart(uri, "vsock:", NULL)) { -socket_start_outgoing_migration(s, p ? p : uri, &local_err); +if (channel->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { +SocketAddress *saddr = &channel->u.socket; +if (saddr->type == SOCKET_ADDRESS_TYPE_INET || +saddr->type == SOCKET_ADDRESS_TYPE_UNIX || +saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) { +socket_start_outgoing_migration(s, saddr, &local_err); +} else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { +fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err); +} #ifdef CONFIG_RDMA } else if (strstart(uri, "rdma:", &p)) { rdma_start_outgoing_migration(s, p, &local_err); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_outgoing_migration(s, p, &local_err); -} else if (strstart(uri, "fd:", &p)) { -fd_start_outgoing_migration(s, p, &local_err); } else { if (!resume_requested) { yank_unregister_instance(MIGRATION_YANK_INSTANCE); diff --git a/migration/socket.c b/migration/socket.c index 1b6f5baefb..98e3ea1514 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -28,6 +28,8 @@ #include "trace.h" #include "postcopy-ram.h" #include "options.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-sockets.h" struct SocketOutgoingArgs { SocketAddress *saddr; @@ -108,19 +110,19 @@ out: object_unref(OBJECT(sioc)); } -static void -socket_start_outgoing_migration_internal(MigrationState *s, - SocketAddress *saddr, - Error **errp) +void socket_start_outgoing_migration(MigrationState *s, + SocketAddress *saddr, + Error **errp) { QIOChannelSocket *sioc = qio_channel_socket_new(); struct SocketConnectData *data = g_new0(struct SocketConnectData, 1); +SocketAddress *addr = QAPI_CLONE(SocketAddress, saddr); data->s = s; /* in case previous migration leaked it */ qapi_free_SocketAddress(outgoing_args.saddr); -outgoing_args.saddr = saddr; +outgoing_args.saddr = addr; if (saddr->type == SOCKET_ADDRESS_TYPE_INET) { data->hostname = g_strdup(saddr->u.inet.host); @@ -135,18 +137,6 @@ socket_start_outgoing_migration_internal(MigrationState *s, NULL); } -void socket_start_outgoing_migration(MigrationState *s, - const char *str, - Error **errp) -{ -Error *err = NULL; -SocketAddress *saddr = socket_parse(str, &err); -if (!err) { -socket_start_outgoing_migration_internal(s, saddr, &err); -} -error_propagate(errp, err); -} - static void socket_accept_incoming_
[PATCH v9 10/10] migration: modify test_multifd_tcp_none() to use new QAPI syntax.
modify multifd tcp common test to incorporate the new QAPI syntax defined. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala --- tests/qtest/migration-test.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index e256da1216..376fad8311 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2185,7 +2185,12 @@ test_migrate_precopy_tcp_multifd_start_common(QTestState *from, /* Start incoming migration from the 1st socket */ qtest_qmp_assert_success(to, "{ 'execute': 'migrate-incoming'," - " 'arguments': { 'uri': 'tcp:127.0.0.1:0' }}"); + " 'arguments': { " + " 'channels': [ { 'channeltype': 'main'," + " 'addr': { 'transport': 'socket'," + "'type': 'inet'," + "'host': '127.0.0.1'," + "'port': '0' } } ] } }"); return NULL; } -- 2.22.3
[PATCH v9 02/10] migration: convert migration 'uri' into 'MigrateAddress'
This patch parses 'migrate' and 'migrate-incoming' QAPI's 'uri' string containing migration connection related information and stores them inside well defined 'MigrateAddress' struct. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Daniel P. Berrangé --- migration/exec.c | 1 - migration/exec.h | 4 migration/migration.c | 55 +++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/migration/exec.c b/migration/exec.c index 2bf882bbe1..32f5143dfd 100644 --- a/migration/exec.c +++ b/migration/exec.c @@ -27,7 +27,6 @@ #include "qemu/cutils.h" #ifdef WIN32 -const char *exec_get_cmd_path(void); const char *exec_get_cmd_path(void) { g_autofree char *detected_path = g_new(char, MAX_PATH); diff --git a/migration/exec.h b/migration/exec.h index b210ffde7a..736cd71028 100644 --- a/migration/exec.h +++ b/migration/exec.h @@ -19,6 +19,10 @@ #ifndef QEMU_MIGRATION_EXEC_H #define QEMU_MIGRATION_EXEC_H + +#ifdef WIN32 +const char *exec_get_cmd_path(void); +#endif void exec_start_incoming_migration(const char *host_port, Error **errp); void exec_start_outgoing_migration(MigrationState *s, const char *host_port, diff --git a/migration/migration.c b/migration/migration.c index 91bba630a8..5b3be767f0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -64,6 +64,7 @@ #include "yank_functions.h" #include "sysemu/qtest.h" #include "options.h" +#include "qemu/sockets.h" static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -421,15 +422,64 @@ void migrate_add_address(SocketAddress *address) QAPI_CLONE(SocketAddress, address)); } +static bool migrate_uri_parse(const char *uri, + MigrationAddress **channel, + Error **errp) +{ +g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); +SocketAddress *saddr = &addr->u.socket; +InetSocketAddress *isock = &addr->u.rdma; +strList **tail = &addr->u.exec.args; + +if (strstart(uri, "exec:", NULL)) { +addr->transport = MIGRATION_ADDRESS_TYPE_EXEC; +#ifdef WIN32 +QAPI_LIST_APPEND(tail, g_strdup(exec_get_cmd_path())); +QAPI_LIST_APPEND(tail, g_strdup("/c")); +#else +QAPI_LIST_APPEND(tail, g_strdup("/bin/sh")); +QAPI_LIST_APPEND(tail, g_strdup("-c")); +#endif +QAPI_LIST_APPEND(tail, g_strdup(uri + strlen("exec:"))); +} else if (strstart(uri, "rdma:", NULL)) { +if (inet_parse(isock, uri + strlen("rdma:"), errp)) { +qapi_free_InetSocketAddress(isock); +return false; +} +addr->transport = MIGRATION_ADDRESS_TYPE_RDMA; +} else if (strstart(uri, "tcp:", NULL) || +strstart(uri, "unix:", NULL) || +strstart(uri, "vsock:", NULL) || +strstart(uri, "fd:", NULL)) { +addr->transport = MIGRATION_ADDRESS_TYPE_SOCKET; +saddr = socket_parse(uri, errp); +if (!saddr) { +qapi_free_SocketAddress(saddr); +return false; +} +} else { +error_setg(errp, "unknown migration protocol: %s", uri); +return false; +} + +*channel = addr; +return true; +} + static void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p = NULL; +g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); /* URI is not suitable for migration? */ if (!migration_channels_and_uri_compatible(uri, errp)) { return; } +if (uri && !migrate_uri_parse(uri, &channel, errp)) { +return; +} + qapi_event_send_migration(MIGRATION_STATUS_SETUP); if (strstart(uri, "tcp:", &p) || strstart(uri, "unix:", NULL) || @@ -1641,12 +1691,17 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, Error *local_err = NULL; MigrationState *s = migrate_get_current(); const char *p = NULL; +g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); /* URI is not suitable for migration? */ if (!migration_channels_and_uri_compatible(uri, errp)) { return; } +if (!migrate_uri_parse(uri, &channel, errp)) { +return; +} + resume_requested = has_resume && resume; if (!migrate_prepare(s, has_blk && blk, has_inc && inc, resume_requested, errp)) { -- 2.22.3
[PATCH v9 06/10] migration: New migrate and migrate-incoming argument 'channels'
MigrateChannelList allows to connect accross multiple interfaces. Add MigrateChannelList struct as argument to migration QAPIs. We plan to include multiple channels in future, to connnect multiple interfaces. Hence, we choose 'MigrateChannelList' as the new argument over 'MigrateChannel' to make migration QAPIs future proof. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Acked-by: Markus Armbruster --- migration/migration-hmp-cmds.c | 6 +- migration/migration.c | 34 -- qapi/migration.json| 109 - softmmu/vl.c | 2 +- 4 files changed, 139 insertions(+), 12 deletions(-) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 9885d7c9f7..49b150f33f 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -424,7 +424,7 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict) Error *err = NULL; const char *uri = qdict_get_str(qdict, "uri"); -qmp_migrate_incoming(uri, &err); +qmp_migrate_incoming(uri, false, NULL, &err); hmp_handle_error(mon, err); } @@ -705,8 +705,8 @@ void hmp_migrate(Monitor *mon, const QDict *qdict) const char *uri = qdict_get_str(qdict, "uri"); Error *err = NULL; -qmp_migrate(uri, !!blk, blk, !!inc, inc, -false, false, true, resume, &err); +qmp_migrate(uri, false, NULL, !!blk, blk, !!inc, inc, + false, false, true, resume, &err); if (hmp_handle_error(mon, err)) { return; } diff --git a/migration/migration.c b/migration/migration.c index f37b388876..bd3a93fc8c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -466,10 +466,22 @@ static bool migrate_uri_parse(const char *uri, return true; } -static void qemu_start_incoming_migration(const char *uri, Error **errp) +static void qemu_start_incoming_migration(const char *uri, bool has_channels, + MigrationChannelList *channels, + Error **errp) { g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); +/* + * Having preliminary checks for uri and channel + */ +if (uri && has_channels) { +error_setg(errp, "'uri' and 'channels' arguments are mutually " + "exclusive; exactly one of the two should be present in " + "'migrate-incoming' qmp command "); +return; +} + /* URI is not suitable for migration? */ if (!migration_channels_and_uri_compatible(uri, errp)) { return; @@ -1497,7 +1509,8 @@ void migrate_del_blocker(Error *reason) migration_blockers = g_slist_remove(migration_blockers, reason); } -void qmp_migrate_incoming(const char *uri, Error **errp) +void qmp_migrate_incoming(const char *uri, bool has_channels, + MigrationChannelList *channels, Error **errp) { Error *local_err = NULL; static bool once = true; @@ -1515,7 +1528,7 @@ void qmp_migrate_incoming(const char *uri, Error **errp) return; } -qemu_start_incoming_migration(uri, &local_err); +qemu_start_incoming_migration(uri, has_channels, channels, &local_err); if (local_err) { yank_unregister_instance(MIGRATION_YANK_INSTANCE); @@ -1551,7 +1564,7 @@ void qmp_migrate_recover(const char *uri, Error **errp) * only re-setup the migration stream and poke existing migration * to continue using that newly established channel. */ -qemu_start_incoming_migration(uri, errp); +qemu_start_incoming_migration(uri, false, NULL, errp); } void qmp_migrate_pause(Error **errp) @@ -1685,7 +1698,8 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return true; } -void qmp_migrate(const char *uri, bool has_blk, bool blk, +void qmp_migrate(const char *uri, bool has_channels, + MigrationChannelList *channels, bool has_blk, bool blk, bool has_inc, bool inc, bool has_detach, bool detach, bool has_resume, bool resume, Error **errp) { @@ -1694,6 +1708,16 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, MigrationState *s = migrate_get_current(); g_autoptr(MigrationAddress) channel = g_new0(MigrationAddress, 1); +/* + * Having preliminary checks for uri and channel + */ +if (uri && has_channels) { +error_setg(errp, "'uri' and 'channels' arguments are mutually " + "exclusive; exactly one of the two should be present in " + "'migrate' qmp command "); +return; +} + /* URI is not suitable for migration? */ if (!migration_channels_and_uri_compatible(uri, errp)) { return; diff --git a/qapi/migration.json b/qapi/migration.json index 0f4a54a7ed..9a365ac774 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1458,6 +1458,34 @@ 'exec': 'Migrat
[PATCH v9 04/10] migration: convert rdma backend to accept MigrateAddress
RDMA based transport backend for 'migrate'/'migrate-incoming' QAPIs accept new wire protocol of MigrateAddress struct. It is achived by parsing 'uri' string and storing migration parameters required for RDMA connection into well defined InetSocketAddress struct. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Daniel P. Berrangé --- migration/migration.c | 8 migration/rdma.c | 34 -- migration/rdma.h | 6 -- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index c4bcf8bbd7..8012f93f1b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -491,8 +491,8 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp) fd_start_incoming_migration(saddr->u.fd.str, errp); } #ifdef CONFIG_RDMA -} else if (strstart(uri, "rdma:", &p)) { -rdma_start_incoming_migration(p, errp); +} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) { +rdma_start_incoming_migration(&channel->u.rdma, errp); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_incoming_migration(p, errp); @@ -1728,8 +1728,8 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err); } #ifdef CONFIG_RDMA -} else if (strstart(uri, "rdma:", &p)) { -rdma_start_outgoing_migration(s, p, &local_err); +} else if (channel->transport == MIGRATION_ADDRESS_TYPE_RDMA) { +rdma_start_outgoing_migration(s, &channel->u.rdma, &local_err); #endif } else if (strstart(uri, "exec:", &p)) { exec_start_outgoing_migration(s, p, &local_err); diff --git a/migration/rdma.c b/migration/rdma.c index dd1c039e6c..4d64fae492 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -319,7 +319,6 @@ typedef struct RDMALocalBlocks { typedef struct RDMAContext { char *host; int port; -char *host_port; RDMAWorkRequestData wr_data[RDMA_WRID_MAX]; @@ -2455,9 +2454,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) rdma->channel = NULL; } g_free(rdma->host); -g_free(rdma->host_port); rdma->host = NULL; -rdma->host_port = NULL; } @@ -2739,28 +2736,17 @@ static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path, rdma_return_path->is_return_path = true; } -static void *qemu_rdma_data_init(const char *host_port, Error **errp) +static void *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp) { RDMAContext *rdma = NULL; -InetSocketAddress *addr; -if (host_port) { +if (saddr) { rdma = g_new0(RDMAContext, 1); rdma->current_index = -1; rdma->current_chunk = -1; -addr = g_new(InetSocketAddress, 1); -if (!inet_parse(addr, host_port, NULL)) { -rdma->port = atoi(addr->port); -rdma->host = g_strdup(addr->host); -rdma->host_port = g_strdup(host_port); -} else { -ERROR(errp, "bad RDMA migration address '%s'", host_port); -g_free(rdma); -rdma = NULL; -} - -qapi_free_InetSocketAddress(addr); +rdma->host = g_strdup(saddr->host); +rdma->port = atoi(saddr->port); } return rdma; @@ -3359,6 +3345,7 @@ static int qemu_rdma_accept(RDMAContext *rdma) .private_data_len = sizeof(cap), }; RDMAContext *rdma_return_path = NULL; +g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1); struct rdma_cm_event *cm_event; struct ibv_context *verbs; int ret = -EINVAL; @@ -3374,13 +3361,16 @@ static int qemu_rdma_accept(RDMAContext *rdma) goto err_rdma_dest_wait; } +isock->host = rdma->host; +isock->port = g_strdup_printf("%d", rdma->port); + /* * initialize the RDMAContext for return path for postcopy after first * connection request reached. */ if ((migrate_postcopy() || migrate_return_path()) && !rdma->is_return_path) { -rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL); +rdma_return_path = qemu_rdma_data_init(isock, NULL); if (rdma_return_path == NULL) { rdma_ack_cm_event(cm_event); goto err_rdma_dest_wait; @@ -4113,7 +4103,8 @@ static void rdma_accept_incoming_migration(void *opaque) } } -void rdma_start_incoming_migration(const char *host_port, Error **errp) +void rdma_start_incoming_migration(InetSocketAddress *host_port, + Error **errp) { int ret; RDMAContext *rdma; @@ -4159,13 +4150,12 @@ err: error_propagate(errp, local_err); if (rdma) { g_free(rdma->host); -g_free(rdma->host_port); } g_free(rdma); } void rdma_start_outgoing_migration(void *opaq
[PATCH v9 07/10] migration: modify migration_channels_and_uri_compatible() for new QAPI syntax
migration_channels_and_uri_compatible() check for transport mechanism suitable for multifd migration gets executed when the caller calls old uri syntax. It needs it to be run when using the modern MigrateChannel QAPI syntax too. After URI -> 'MigrateChannel' : migration_channels_and_uri_compatible() -> migration_channels_and_transport_compatible() passes object as argument and check for valid transport mechanism. Suggested-by: Aravind Retnakaran Signed-off-by: Het Gala Reviewed-by: Daniel P. Berrangé --- migration/migration.c | 25 ++--- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index bd3a93fc8c..6e0a8beaf2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -104,17 +104,20 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool uri_supports_multi_channels(const char *uri) +static bool transport_supports_multi_channels(SocketAddress *saddr) { -return strstart(uri, "tcp:", NULL) || strstart(uri, "unix:", NULL) || - strstart(uri, "vsock:", NULL); +return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; } static bool -migration_channels_and_uri_compatible(const char *uri, Error **errp) +migration_channels_and_transport_compatible(MigrationAddress *addr, +Error **errp) { if (migration_needs_multiple_sockets() && -!uri_supports_multi_channels(uri)) { +(addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && +!transport_supports_multi_channels(&addr->u.socket)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } @@ -482,12 +485,12 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels, return; } -/* URI is not suitable for migration? */ -if (!migration_channels_and_uri_compatible(uri, errp)) { +if (uri && !migrate_uri_parse(uri, &channel, errp)) { return; } -if (uri && !migrate_uri_parse(uri, &channel, errp)) { +/* transport mechanism not suitable for migration? */ +if (!migration_channels_and_transport_compatible(channel, errp)) { return; } @@ -1718,12 +1721,12 @@ void qmp_migrate(const char *uri, bool has_channels, return; } -/* URI is not suitable for migration? */ -if (!migration_channels_and_uri_compatible(uri, errp)) { +if (!migrate_uri_parse(uri, &channel, errp)) { return; } -if (!migrate_uri_parse(uri, &channel, errp)) { +/* transport mechanism not suitable for migration? */ +if (!migration_channels_and_transport_compatible(channel, errp)) { return; } -- 2.22.3
Re: [PULL 5/5] linux-user: Fix qemu-arm to run static armhf binaries
19.07.2023 18:52, Helge Deller wrote: qemu-user crashes immediately when running static binaries on the armhf architecture. The problem is the memory layout where the executable is loaded before the interpreter library, in which case the reserved brk region clashes with the interpreter code and is released before qemu tries to start the program. At load time qemu calculates a brk value for interpreter and executable each. The fix is to choose the higher one of both. Signed-off-by: Helge Deller Cc: Andreas Schwab Cc: qemu-sta...@nongnu.org Reported-by: venkata.p...@toshiba-tsip.com Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1040981 --- linux-user/elfload.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index a26200d9f3..94951630b1 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -3615,6 +3615,13 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) if (elf_interpreter) { load_elf_interp(elf_interpreter, &interp_info, bprm->buf); +/* + * adjust brk address if the interpreter was loaded above the main + * executable, e.g. happens with static binaries on armhf + */ +if (interp_info.brk > info->brk) { +info->brk = interp_info.brk; +} So, this is kinda amusing. This broke arm64, ppc64el and s390x: arm64$ ./qemu-aarch64 /bin/sh -c '/bin/ls -dCFl *[t]* >/dev/null' qemu: uncaught target signal 11 (Segmentation fault) - core dumped Segmentation fault (it was just a quick test from debian qemu-user package). Reverting this patch makes it work again.. *Sigh*. (This is currently broken in debian unstable too, - this is how I noticed the breakage). Thanks, /mjt
Re: [PATCH v2 3/4] vhost: Add high-level state save/load functions
On Wed, Jul 12, 2023 at 1:17 PM Hanna Czenczek wrote: > > vhost_save_backend_state() and vhost_load_backend_state() can be used by > vhost front-ends to easily save and load the back-end's state to/from > the migration stream. > > Because we do not know the full state size ahead of time, > vhost_save_backend_state() simply reads the data in 1 MB chunks, and > writes each chunk consecutively into the migration stream, prefixed by > its length. EOF is indicated by a 0-length chunk. > > Signed-off-by: Hanna Czenczek > --- > include/hw/virtio/vhost.h | 35 +++ > hw/virtio/vhost.c | 204 ++ > 2 files changed, 239 insertions(+) > > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > index d8877496e5..0c282abd4e 100644 > --- a/include/hw/virtio/vhost.h > +++ b/include/hw/virtio/vhost.h > @@ -425,4 +425,39 @@ int vhost_set_device_state_fd(struct vhost_dev *dev, > */ > int vhost_check_device_state(struct vhost_dev *dev, Error **errp); > > +/** > + * vhost_save_backend_state(): High-level function to receive a vhost > + * back-end's state, and save it in `f`. Uses > + * `vhost_set_device_state_fd()` to get the data from the back-end, and > + * stores it in consecutive chunks that are each prefixed by their > + * respective length (be32). The end is marked by a 0-length chunk. > + * > + * Must only be called while the device and all its vrings are stopped > + * (`VHOST_TRANSFER_STATE_PHASE_STOPPED`). > + * > + * @dev: The vhost device from which to save the state > + * @f: Migration stream in which to save the state > + * @errp: Potential error message > + * > + * Returns 0 on success, and -errno otherwise. > + */ > +int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error > **errp); > + > +/** > + * vhost_load_backend_state(): High-level function to load a vhost > + * back-end's state from `f`, and send it over to the back-end. Reads > + * the data from `f` in the format used by `vhost_save_state()`, and > + * uses `vhost_set_device_state_fd()` to transfer it to the back-end. > + * > + * Must only be called while the device and all its vrings are stopped > + * (`VHOST_TRANSFER_STATE_PHASE_STOPPED`). > + * > + * @dev: The vhost device to which to send the sate > + * @f: Migration stream from which to load the state > + * @errp: Potential error message > + * > + * Returns 0 on success, and -errno otherwise. > + */ > +int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error > **errp); > + > #endif > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index 756b6d55a8..332d49a310 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -2128,3 +2128,207 @@ int vhost_check_device_state(struct vhost_dev *dev, > Error **errp) > "vhost transport does not support migration state transfer"); > return -ENOSYS; > } > + > +int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error > **errp) > +{ > +/* Maximum chunk size in which to transfer the state */ > +const size_t chunk_size = 1 * 1024 * 1024; > +void *transfer_buf = NULL; > +g_autoptr(GError) g_err = NULL; > +int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1; > +int ret; > + > +/* [0] for reading (our end), [1] for writing (back-end's end) */ > +if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) { > +error_setg(errp, "Failed to set up state transfer pipe: %s", > + g_err->message); > +ret = -EINVAL; > +goto fail; > +} > + > +read_fd = pipe_fds[0]; > +write_fd = pipe_fds[1]; > + > +/* > + * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped. > + * We cannot check dev->suspended, because the back-end may not support > + * suspending. > + */ > +assert(!dev->started); > + > +/* Transfer ownership of write_fd to the back-end */ > +ret = vhost_set_device_state_fd(dev, > +VHOST_TRANSFER_STATE_DIRECTION_SAVE, > +VHOST_TRANSFER_STATE_PHASE_STOPPED, > +write_fd, > +&reply_fd, > +errp); > +if (ret < 0) { > +error_prepend(errp, "Failed to initiate state transfer: "); > +goto fail; > +} > + > +/* If the back-end wishes to use a different pipe, switch over */ > +if (reply_fd >= 0) { > +close(read_fd); > +read_fd = reply_fd; > +} > + > +transfer_buf = g_malloc(chunk_size); > + > +while (true) { > +ssize_t read_ret; > + > +read_ret = read(read_fd, transfer_buf, chunk_size); > +if (read_ret < 0) { > +ret = -errno; > +error_setg_errno(errp, -ret, "Failed to receive state"); > +goto fail; > +} > + > +assert(read_ret <= chunk_size); > +qemu_put_be32(f, read_ret); > + > +if (r
Re: [PULL 5/5] linux-user: Fix qemu-arm to run static armhf binaries
21.07.2023 18:14, Michael Tokarev пишет: 19.07.2023 18:52, Helge Deller wrote: qemu-user crashes immediately when running static binaries on the armhf architecture. The problem is the memory layout where the executable is loaded before the interpreter library, in which case the reserved brk region clashes with the interpreter code and is released before qemu tries to start the program. At load time qemu calculates a brk value for interpreter and executable each. The fix is to choose the higher one of both. Signed-off-by: Helge Deller Cc: Andreas Schwab Cc: qemu-sta...@nongnu.org Reported-by: venkata.p...@toshiba-tsip.com Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1040981 --- linux-user/elfload.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/linux-user/elfload.c b/linux-user/elfload.c index a26200d9f3..94951630b1 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -3615,6 +3615,13 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) if (elf_interpreter) { load_elf_interp(elf_interpreter, &interp_info, bprm->buf); + /* + * adjust brk address if the interpreter was loaded above the main + * executable, e.g. happens with static binaries on armhf + */ + if (interp_info.brk > info->brk) { + info->brk = interp_info.brk; + } I've added printf() inside this if() block, on arm64 it produces: fixing brk: interp_info.brk=0x5502875358 info=>brk=0x5500032ec0 FWIW, /mjt
Re: [PATCH 3/6] vhost: Do not reset suspended devices on stop
On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek wrote: > > Move the `suspended` field from vhost_vdpa into the global vhost_dev > struct, so vhost_dev_stop() can check whether the back-end has been > suspended by `vhost_ops->vhost_dev_start(hdev, false)`. If it has, > there is no need to reset it; the reset is just a fall-back to stop > device operations for back-ends that do not support suspend. > > Unfortunately, for vDPA specifically, RESUME is not yet implemented, so > when the device is re-started, we still have to do the reset to have it > un-suspend. > > Signed-off-by: Hanna Czenczek > --- > include/hw/virtio/vhost-vdpa.h | 2 -- > include/hw/virtio/vhost.h | 8 > hw/virtio/vhost-vdpa.c | 11 +++ > hw/virtio/vhost.c | 8 +++- > 4 files changed, 22 insertions(+), 7 deletions(-) > > diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h > index e64bfc7f98..72c3686b7f 100644 > --- a/include/hw/virtio/vhost-vdpa.h > +++ b/include/hw/virtio/vhost-vdpa.h > @@ -42,8 +42,6 @@ typedef struct vhost_vdpa { > bool shadow_vqs_enabled; > /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA > */ > bool shadow_data; > -/* Device suspended successfully */ > -bool suspended; > /* IOVA mapping used by the Shadow Virtqueue */ > VhostIOVATree *iova_tree; > GPtrArray *shadow_vqs; > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > index 6a173cb9fa..69bf59d630 100644 > --- a/include/hw/virtio/vhost.h > +++ b/include/hw/virtio/vhost.h > @@ -120,6 +120,14 @@ struct vhost_dev { > uint64_t backend_cap; > /* @started: is the vhost device started? */ > bool started; > +/** > + * @suspended: Whether the vhost device is currently suspended. Set > + * and reset by implementations (vhost-user, vhost-vdpa, ...), which > + * are supposed to automatically suspend/resume in their > + * vhost_dev_start handlers as required. Must also be cleared when > + * the device is reset. > + */ > +bool suspended; > bool log_enabled; > uint64_t log_size; > Error *migration_blocker; > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > index 7b7dee468e..f7fd19a203 100644 > --- a/hw/virtio/vhost-vdpa.c > +++ b/hw/virtio/vhost-vdpa.c > @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev > *dev, > > static int vhost_vdpa_reset_device(struct vhost_dev *dev) > { > -struct vhost_vdpa *v = dev->opaque; > int ret; > uint8_t status = 0; > > ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); > trace_vhost_vdpa_reset_device(dev); > -v->suspended = false; > +dev->suspended = false; > return ret; > } > > @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev) > if (unlikely(r)) { > error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); > } else { > -v->suspended = true; > +dev->suspended = true; > return; > } > } > @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, > bool started) > return -1; > } > vhost_vdpa_set_vring_ready(dev); > +if (dev->suspended) { > +/* TODO: When RESUME is available, use it instead of resetting */ > +vhost_vdpa_reset_status(dev); How is that we reset the status at each vhost_vdpa_dev_start? That will clean all the vqs configured, features negotiated, etc. in the vDPA device. Or am I missing something? I'm totally ok with the move of "suspended" member. Thanks! > +} > } else { > vhost_vdpa_suspend(dev); > vhost_vdpa_svqs_stop(dev); > @@ -1400,7 +1403,7 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev > *dev, > return 0; > } > > -if (!v->suspended) { > +if (!dev->suspended) { > /* > * Cannot trust in value returned by device, let vhost recover used > * idx from guest. > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index abf0d03c8d..2e28e58da7 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -2059,7 +2059,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, > VirtIODevice *vdev, bool vrings) > hdev->vqs + i, > hdev->vq_index + i); > } > -if (hdev->vhost_ops->vhost_reset_status) { > + > +/* > + * If we failed to successfully stop the device via SUSPEND (should have > + * been attempted by `vhost_dev_start(hdev, false)`), reset it to stop > it. > + * Stateful devices where this would be problematic must implement > SUSPEND. > + */ > +if (!hdev->suspended && hdev->vhost_ops->vhost_reset_status) { > hdev->vhost_ops->vhost_reset_status(hdev); > } > > -- > 2.41.0 >
Re: [PATCH v6 4/5] intel_iommu: allow Extended Interrupt Mode when using userspace APIC
On 7/21/23 03:47, Peter Xu wrote: On Mon, Jul 17, 2023 at 11:29:56PM +0700, Bui Quang Minh wrote: On 7/17/23 17:47, Joao Martins wrote: +Peter, +Jason (intel-iommu maintainer/reviewer) Thanks for copying me, Joan. On 15/07/2023 16:22, Bui Quang Minh wrote: As userspace APIC now supports x2APIC, intel interrupt remapping hardware can be set to EIM mode when userspace local APIC is used. Reviewed-by: Michael S. Tsirkin Signed-off-by: Bui Quang Minh --- hw/i386/intel_iommu.c | 11 --- 1 file changed, 11 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index dcc334060c..5e576f6059 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -4043,17 +4043,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) && x86_iommu_ir_supported(x86_iommu) ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } -if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { -if (!kvm_irqchip_is_split()) { -error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); -return false; -} -if (!kvm_enable_x2apic()) { -error_setg(errp, "eim=on requires support on the KVM side" - "(X2APIC_API, first shipped in v4.7)"); -return false; -} -} Given commit 20ca47429e ('Revert "intel_iommu: Fix irqchip / X2APIC configuration checks"'), won't we regress behaviour again for the accel=kvm case by dropping the kvm_enable_x2apic() call here? Perhaps if we support userspace APIC with TCG the check just needs to be redone to instead avoid always requiring kvm e.g.: if (kvm_irqchip_in_kernel()) { error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split" "(X2APIC_API, first shipped in v4.7)"); } if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { error_setg(errp, "eim=on requires support on the KVM side" "(X2APIC_API, first shipped in v4.7)"); return false; } Thank you for your review. I think the check for kvm_irqchip_in_kernel() is not correct, AFAIK, kvm_irqchip_is_split() == true also means kvm_irqchip_in_kernel() == true on x86. To check if kernel-irqchip = on, we need to do something like in x86_iommu_realize bool irq_all_kernel = kvm_irqchip_in_kernel() && !kvm_irqchip_is_split(); The original check for !kvm_irqchip_is_split means emulated/userspace APIC. It's because to reach that check x86_iommu_ir_supported(...) == true and x86_iommu_ir_supported(...) == true is not supported when kernel-irqchip = on (there is a check for this in x86_iommu_realize) So I think we need to change the check to if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { error_setg(errp, "eim=on requires support on the KVM side" "(X2APIC_API, first shipped in v4.7)"); return false; } } Is it OK? Mostly to me, except that we may also want to keep failing if all irq chips are in kernel? Yes, that behavior does not change after this patch. x86_iommu_realize in the parent type TYPE_X86_IOMMU_DEVICE fails when interrupt remapping is on and all irq chips are in kernel already. static void x86_iommu_realize(DeviceState *dev, Error **errp) { /* ... */ /* Both Intel and AMD IOMMU IR only support "kernel-irqchip {off|split}" */ if (x86_iommu_ir_supported(x86_iommu) && irq_all_kernel) { error_setg(errp, "Interrupt Remapping cannot work with " "kernel-irqchip=on, please use 'split|off'."); return; } /* ... */ } So in case we reach here in with the interrupt remapping is on and decide whether eim is on or not, it cannot be that irq chips are all in kernel. Thanks, Quang Minh.
Re: [PATCH v3 1/6] throttle: introduce enum ThrottleType
On 13.07.23 08:41, zhenwei pi wrote: Use enum ThrottleType instead of number index. Reviewed-by: Alberto Garcia Signed-off-by: zhenwei pi --- include/qemu/throttle.h | 11 --- util/throttle.c | 16 +--- 2 files changed, 17 insertions(+), 10 deletions(-) [...] diff --git a/util/throttle.c b/util/throttle.c index 81f247a8d1..5642e61763 100644 --- a/util/throttle.c +++ b/util/throttle.c @@ -199,10 +199,12 @@ static bool throttle_compute_timer(ThrottleState *ts, void throttle_timers_attach_aio_context(ThrottleTimers *tt, AioContext *new_context) { -tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, - tt->read_timer_cb, tt->timer_opaque); -tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS, - tt->write_timer_cb, tt->timer_opaque); +tt->timers[THROTTLE_READ] = +aio_timer_new(new_context, tt->clock_type, SCALE_NS, + tt->timer_cb[THROTTLE_READ], tt->timer_opaque); +tt->timers[THROTTLE_WRITE] = +aio_timer_new(new_context, tt->clock_type, SCALE_NS, + tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque); This could benefit from the new structure: ``` for (int i = 0; i < THROTTLE_MAX; i++) { tt->timers[i] = aio_timer_new(..., tt->timer_cb[i], ...); } ``` Even more so after patch 3. Still, optional, of course, so either way: Reviewed-by: Hanna Czenczek Of note is that we still have instances in util/throttle.c and block/throttle-groups.c that don’t use these enums to access the tt->timers[] array, and that’s a bit unfortunate now (i.e. `tt->timers[is_write]` should rather be `tt->timers[is_write ? THROTTLE_WRITE : THROTTLE_READ]` instead). But functionally it’s OK and I believe patches 3 and 6 do address this.
Re: [PATCH v3 5/6] cryptodev: use NULL throttle timer cb for read direction
On 13.07.23 08:41, zhenwei pi wrote: Operations on a crytpodev are considered as *write* only, the callback s/crytpodev/cryptodev/ of read direction is never invoked. Use NULL instead of an unreachable path(cryptodev_backend_throttle_timer_cb on read direction). Reviewed-by: Alberto Garcia Signed-off-by: zhenwei pi --- backends/cryptodev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/cryptodev.c b/backends/cryptodev.c index 7d29517843..5cfa25c61c 100644 --- a/backends/cryptodev.c +++ b/backends/cryptodev.c @@ -331,8 +331,7 @@ static void cryptodev_backend_set_throttle(CryptoDevBackend *backend, int field, if (!enabled) { throttle_init(&backend->ts); throttle_timers_init(&backend->tt, qemu_get_aio_context(), - QEMU_CLOCK_REALTIME, - cryptodev_backend_throttle_timer_cb, /* FIXME */ Now it’ll be gone (good), but if you happen to add a FIXME again, it would be nice to explain in the same comment in the code what exactly needs fixing, and why it isn‘t being fixed at this point. + QEMU_CLOCK_REALTIME, NULL, cryptodev_backend_throttle_timer_cb, backend); } With the typo in the commit message fixed: Reviewed-by: Hanna Czenczek
Re: [PATCH v3 3/6] throttle: support read-only and write-only
On 13.07.23 08:41, zhenwei pi wrote: Only one direction is necessary in several scenarios: - a read-only disk - operations on a device are considered as *write* only. For example, encrypt/decrypt/sign/verify operations on a cryptodev use a single *write* timer(read timer callback is defined, but never invoked). Allow a single direction in throttle, this reduces memory, and uplayer does not need a dummy callback any more. Reviewed-by: Alberto Garcia Signed-off-by: zhenwei pi --- util/throttle.c | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/util/throttle.c b/util/throttle.c index 5642e61763..c0bd0c26c3 100644 --- a/util/throttle.c +++ b/util/throttle.c @@ -199,12 +199,17 @@ static bool throttle_compute_timer(ThrottleState *ts, void throttle_timers_attach_aio_context(ThrottleTimers *tt, AioContext *new_context) { -tt->timers[THROTTLE_READ] = -aio_timer_new(new_context, tt->clock_type, SCALE_NS, - tt->timer_cb[THROTTLE_READ], tt->timer_opaque); -tt->timers[THROTTLE_WRITE] = -aio_timer_new(new_context, tt->clock_type, SCALE_NS, - tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque); +if (tt->timer_cb[THROTTLE_READ]) { +tt->timers[THROTTLE_READ] = +aio_timer_new(new_context, tt->clock_type, SCALE_NS, + tt->timer_cb[THROTTLE_READ], tt->timer_opaque); +} + +if (tt->timer_cb[THROTTLE_WRITE]) { +tt->timers[THROTTLE_WRITE] = +aio_timer_new(new_context, tt->clock_type, SCALE_NS, + tt->timer_cb[THROTTLE_WRITE], tt->timer_opaque); +} I think a `for (int i = 0; i < THROTTLE_MAX; i++)` loop would make this nicer. (Again: Optional.) } /* [...] @@ -272,7 +280,7 @@ void throttle_timers_destroy(ThrottleTimers *tt) /* is any throttling timer configured */ bool throttle_timers_are_initialized(ThrottleTimers *tt) { -if (tt->timers[0]) { +if (tt->timers[THROTTLE_READ] || tt->timers[THROTTLE_WRITE]) { Not wrong, but I’d prefer the more general ``` for (int i = 0; i < THROTTLE_MAX; i++) { if (tt->timers[i]) { return true; } } return false; ``` return true; } @@ -424,8 +432,12 @@ bool throttle_schedule_timer(ThrottleState *ts, { int64_t now = qemu_clock_get_ns(tt->clock_type); int64_t next_timestamp; +QEMUTimer *timer; bool must_wait; +timer = is_write ? tt->timers[THROTTLE_WRITE] : tt->timers[THROTTLE_READ]; Could be shorter as `timer = tt->timers[is_write ? THROTTLE_WRITE : THROTTLE_READ];`. Anyway, I only have style suggestion to offer, so either way: Reviewed-by: Hanna Czenczek +assert(timer); + must_wait = throttle_compute_timer(ts, is_write, now,
Re: [PATCH v3 4/6] test-throttle: test read only and write only
On 13.07.23 08:41, zhenwei pi wrote: Reviewed-by: Alberto Garcia Signed-off-by: zhenwei pi --- tests/unit/test-throttle.c | 66 ++ 1 file changed, 66 insertions(+) Reviewed-by: Hanna Czenczek
Re: [PATCH v3 2/6] test-throttle: use enum ThrottleType
On 13.07.23 08:41, zhenwei pi wrote: Use enum ThrottleType instead in the throttle test codes. Reviewed-by: Alberto Garcia Signed-off-by: zhenwei pi --- tests/unit/test-throttle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) Reviewed-by: Hanna Czenczek
Re: [PATCH 5/6] vhost-vdpa: Match vhost-user's status reset
On Wed, Jul 19, 2023 at 4:10 PM Hanna Czenczek wrote: > > On 18.07.23 16:50, Stefan Hajnoczi wrote: > > On Tue, Jul 11, 2023 at 05:52:27PM +0200, Hanna Czenczek wrote: > >> vhost-vdpa and vhost-user differ in how they reset the status in their > >> respective vhost_reset_status implementations: vhost-vdpa zeroes it, > >> then re-adds the S_ACKNOWLEDGE and S_DRIVER config bits. S_DRIVER_OK is > >> then set in vhost_vdpa_dev_start(). > >> > >> vhost-user in contrast just zeroes the status, and does no re-add any > >> config bits until vhost_user_dev_start() (where it does re-add all of > >> S_ACKNOWLEDGE, S_DRIVER, and S_DRIVER_OK). > >> > >> There is no documentation for vhost_reset_status, but its only caller is > >> vhost_dev_stop(). So apparently, the device is to be stopped after > >> vhost_reset_status, and therefore it makes more sense to keep the status > >> field fully cleared until the back-end is re-started, which is how > >> vhost-user does it. Make vhost-vdpa do the same -- if nothing else it's > >> confusing to have both vhost implementations handle this differently. > >> Ok now I understand better why you moved the call to vhost_vdpa_reset_status. Maybe we can move the vhost_vdpa_add_status(dev, _S_ACKNOWLEDGE | _S_DRIVER) to vhost_vdpa_set_features then, and let reset_status call vhost_vdpa_reset_device directly. But I'm not 100% sure if I'm missing something, it would need testing. Thanks! > >> Signed-off-by: Hanna Czenczek > >> --- > >> hw/virtio/vhost-vdpa.c | 6 +++--- > >> 1 file changed, 3 insertions(+), 3 deletions(-) > > Hi Hanna, > > The VIRTIO spec lists the Device Initialization sequence including the > > bits set in the Device Status Register here: > > https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-1070001 > > > > ACKNOWLEDGE and DRIVER must be set before FEATURES_OK. DRIVER_OK is set > > after FEATURES_OK. > > > > The driver may read the Device Configuration Space once ACKNOWLEDGE and > > DRIVER are set. > > > > QEMU's vhost code should follow this sequence (especially for vDPA where > > full VIRTIO devices are implemented). > > > > vhost-user is not faithful to the VIRTIO spec here. That's probably due > > to the fact that vhost-user didn't have the concept of the Device Status > > Register until recently and back-ends mostly ignore it. > > > > Please do the opposite of this patch: bring vhost-user in line with the > > VIRTIO specification so that the Device Initialization sequence is > > followed correctly. I think vhost-vdpa already does the right thing. > > Hm. This sounds all very good, but what leaves me lost is the fact that > we never actually expose the status field to the guest, as far as I can > see. We have no set_status callback, and as written in the commit > message, the only caller of reset_status is vhost_dev_stop(). So the > status field seems completely artificial in vhost right now. That is > why I’m wondering what the flags even really mean. > > Another point I made in the commit message is that it is strange that we > reset the status to 0, and then add the ACKNOWLEDGE and DRIVER while the > VM is still stopped. It doesn’t make sense to me to set these flags > while the guest driver is not operative. > > If what you’re saying is that we must set FEATURES_OK only after > ACKNOWLEDGE and DRIVER, wouldn’t it be still better to set all of these > flags only in vhost_*_dev_start(), but do it in two separate SET_STATUS > calls? > > (You mentioned the configuration space – is that accessed while between > vhost_dev_stop and vhost_dev_start?) > > Hanna > > >> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > >> index f7fd19a203..0cde8b40de 100644 > >> --- a/hw/virtio/vhost-vdpa.c > >> +++ b/hw/virtio/vhost-vdpa.c > >> @@ -1294,8 +1294,6 @@ static void vhost_vdpa_reset_status(struct vhost_dev > >> *dev) > >> } > >> > >> vhost_vdpa_reset_device(dev); > >> -vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | > >> - VIRTIO_CONFIG_S_DRIVER); > >> memory_listener_unregister(&v->listener); > >> } > >> > >> @@ -1334,7 +1332,9 @@ static int vhost_vdpa_dev_start(struct vhost_dev > >> *dev, bool started) > >> } > >> memory_listener_register(&v->listener, dev->vdev->dma_as); > >> > >> -return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); > >> +return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | > >> + VIRTIO_CONFIG_S_DRIVER | > >> + VIRTIO_CONFIG_S_DRIVER_OK); > >> } > >> > >> return 0; > >> -- > >> 2.41.0 > >> >
Re: [PATCH v3 6/6] throttle: use enum ThrottleType instead of bool is_write
On 13.07.23 08:41, zhenwei pi wrote: enum ThrottleType is already there, use ThrottleType instead of 'bool is_write' for throttle API, also modify related codes from block, fsdev, cryptodev and tests. Signed-off-by: zhenwei pi --- backends/cryptodev.c| 9 + block/throttle-groups.c | 6 -- fsdev/qemu-fsdev-throttle.c | 8 +--- include/qemu/throttle.h | 4 ++-- tests/unit/test-throttle.c | 4 ++-- util/throttle.c | 30 -- 6 files changed, 34 insertions(+), 27 deletions(-) Something not addressed in this patch that I would like to see: schedule_next_request() in block/throttle-groups.c runs `timer_mod(tt->timers[is_write], now)`. I think that at least should be `tt->timers[is_write ? THROTTLE_WRITE : THROTTLE_READ]`. Even better would be to have it take a `ThrottleType` instead of `bool is_write`, too, and use this enum throughout throttle-groups.c, too (i.e. for ThrottleGroupMember.throttled_reqs[], ThrottleGroupMember.pending_reqs[], ThrottleGroup.tokens[], and ThrottleGroup.any_timer_armed[]). Then throttle_group_schedule_timer() could also take a `ThrottleType`. But I understand asking for throttle-groups.c to be ThrottleType-ified is very much, so this is just a suggestion. But I do ask for that one `timer_mod()` call to use THROTTLE_READ and THROTTLE_WRITE to index `tt->timers[]` instead of `is_write` directly. [...] diff --git a/block/throttle-groups.c b/block/throttle-groups.c index fb203c3ced..429b9d1dae 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -270,6 +270,7 @@ static bool throttle_group_schedule_timer(ThrottleGroupMember *tgm, ThrottleState *ts = tgm->throttle_state; ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); ThrottleTimers *tt = &tgm->throttle_timers; +ThrottleType throttle = is_write ? THROTTLE_WRITE : THROTTLE_READ; Again, another stylistic suggestion (for all similar places in this patch): It isn’t clear what just `throttle` means. `throttle_direction` for example would be clear, or maybe just `direction`, this is throttle code, so it’s clear that everything that isn’t given a context is about throttling (it wasn’t `is_throttled_write` either, after all, but just `is_write`). bool must_wait; if (qatomic_read(&tgm->io_limits_disabled)) { [...] diff --git a/util/throttle.c b/util/throttle.c index c0bd0c26c3..5e4dc0bfdd 100644 --- a/util/throttle.c +++ b/util/throttle.c @@ -136,11 +136,11 @@ int64_t throttle_compute_wait(LeakyBucket *bkt) /* This function compute the time that must be waited while this IO * - * @is_write: true if the current IO is a write, false if it's a read + * @throttle: throttle type I’m not too happy about “throttle type” as a description here, because “type” can be anything (naïvely, I’d rather interpret it to mean the algorithm used for throttling, or whether we’re throttling on bytes or IOPS). “throttle direction” would be better. This also applies to other functions’ interface documentation. (Yes, this also means that I don’t like the type name ThrottleType very much, and would prefer it to be ThrottleDirection, but that would be an invasive change all over this series (when Berto has already reviewed it), so I’m not really asking for a change there.) * @ret:time to wait */ static int64_t throttle_compute_wait_for(ThrottleState *ts, - bool is_write) + ThrottleType throttle) { BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL, Since we’re now using a ThrottleType to index the first dimension of this array, I’d prefer to make this to_check[THROTTLE_MAX][4]. (Also, but very much unrelated to this patch: Why isn’t this lookup table a `const static`?) THROTTLE_OPS_TOTAL, [...] @@ -460,10 +461,10 @@ bool throttle_schedule_timer(ThrottleState *ts, /* do the accounting for this operation * - * @is_write: the type of operation (read/write) + * @throttle: throttle type * @size: the size of the operation */ -void throttle_account(ThrottleState *ts, bool is_write, uint64_t size) +void throttle_account(ThrottleState *ts, ThrottleType throttle, uint64_t size) { const BucketType bucket_types_size[2][2] = { { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ }, Like in throttle_compute_wait_for(), I’d prefer bucket_types_size and bucket_types_units to have [THROTTLE_MAX] in the first dimension. (Interesting that these lookup tables are const, but not static. I think they should be static, too.) Hanna
Re: [PATCH 3/6] vhost: Do not reset suspended devices on stop
On 21.07.23 17:25, Eugenio Perez Martin wrote: On Tue, Jul 11, 2023 at 5:52 PM Hanna Czenczek wrote: Move the `suspended` field from vhost_vdpa into the global vhost_dev struct, so vhost_dev_stop() can check whether the back-end has been suspended by `vhost_ops->vhost_dev_start(hdev, false)`. If it has, there is no need to reset it; the reset is just a fall-back to stop device operations for back-ends that do not support suspend. Unfortunately, for vDPA specifically, RESUME is not yet implemented, so when the device is re-started, we still have to do the reset to have it un-suspend. Signed-off-by: Hanna Czenczek --- include/hw/virtio/vhost-vdpa.h | 2 -- include/hw/virtio/vhost.h | 8 hw/virtio/vhost-vdpa.c | 11 +++ hw/virtio/vhost.c | 8 +++- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index e64bfc7f98..72c3686b7f 100644 --- a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -42,8 +42,6 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; -/* Device suspended successfully */ -bool suspended; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; GPtrArray *shadow_vqs; diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 6a173cb9fa..69bf59d630 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -120,6 +120,14 @@ struct vhost_dev { uint64_t backend_cap; /* @started: is the vhost device started? */ bool started; +/** + * @suspended: Whether the vhost device is currently suspended. Set + * and reset by implementations (vhost-user, vhost-vdpa, ...), which + * are supposed to automatically suspend/resume in their + * vhost_dev_start handlers as required. Must also be cleared when + * the device is reset. + */ +bool suspended; bool log_enabled; uint64_t log_size; Error *migration_blocker; diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 7b7dee468e..f7fd19a203 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -858,13 +858,12 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, static int vhost_vdpa_reset_device(struct vhost_dev *dev) { -struct vhost_vdpa *v = dev->opaque; int ret; uint8_t status = 0; ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev); -v->suspended = false; +dev->suspended = false; return ret; } @@ -1278,7 +1277,7 @@ static void vhost_vdpa_suspend(struct vhost_dev *dev) if (unlikely(r)) { error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); } else { -v->suspended = true; +dev->suspended = true; return; } } @@ -1313,6 +1312,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) return -1; } vhost_vdpa_set_vring_ready(dev); +if (dev->suspended) { +/* TODO: When RESUME is available, use it instead of resetting */ +vhost_vdpa_reset_status(dev); How is that we reset the status at each vhost_vdpa_dev_start? That will clean all the vqs configured, features negotiated, etc. in the vDPA device. Or am I missing something? What alternative do you propose? We don’t have RESUME for vDPA in qemu, but we somehow need to lift the previous SUSPEND so the device will again respond to guest requests, do we not? But more generally, is this any different from what is done before this patch? Before this patch, vhost_dev_stop() unconditionally invokes vhost_reset_status(), so the device is reset in every stop/start cycle, that doesn’t change. And we still won’t reset it on the first vhost_dev_start(), because dev->suspended will be false then, only on subsequent stop/start cycles, as before. So the only difference is that now the device is reset on start, not on stop. Hanna
Re: [PATCH v2 3/4] vhost: Add high-level state save/load functions
On 21.07.23 17:18, Eugenio Perez Martin wrote: On Wed, Jul 12, 2023 at 1:17 PM Hanna Czenczek wrote: vhost_save_backend_state() and vhost_load_backend_state() can be used by vhost front-ends to easily save and load the back-end's state to/from the migration stream. Because we do not know the full state size ahead of time, vhost_save_backend_state() simply reads the data in 1 MB chunks, and writes each chunk consecutively into the migration stream, prefixed by its length. EOF is indicated by a 0-length chunk. Signed-off-by: Hanna Czenczek --- include/hw/virtio/vhost.h | 35 +++ hw/virtio/vhost.c | 204 ++ 2 files changed, 239 insertions(+) [...] +int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp) +{ +size_t transfer_buf_size = 0; +void *transfer_buf = NULL; +g_autoptr(GError) g_err = NULL; +int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1; +int ret; [...] +ret = 0; +fail: +g_free(transfer_buf); Nitpick, but transfer_buf could have the g_autofree parameter. Ah, sure, thanks! Hanna
Re: [PATCH v2] target/ppc: Generate storage interrupts for radix RC changes
On 7/20/23 23:38, Nicholas Piggin wrote: On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote: On 7/12/23 11:56 AM, Cédric Le Goater wrote: Hello Shawn, On 7/12/23 18:13, Shawn Anastasio wrote: Change radix model to always generate a storage interrupt when the R/C bits are not set appropriately in a PTE instead of setting the bits itself. According to the ISA both behaviors are valid, but in practice this change more closely matches behavior observed on the POWER9 CPU. How did you spotted this dark corner case in emulation ? Do you have MMU unit tests ? I'm currently porting Xen to Power and have been using QEMU's powernv model extensively for early bring up. I noticed the issue when my radix implementation worked in QEMU but failed on actual hardware since I didn't have a proper storage interrupt handler implemented. Cool. This was on my todo list because we rely on it for nested HV KVM too. I actually didn't know about that odd effLIPD=0 exception, but it looks right. How did you test that, by running with MSR[HV]=0 and LPIDR=0 ? For the patch, Reviewed-by: Nicholas Piggin Cedric, do you think this is 8.1 material? Daniel Thanks, Nick
Re: [PATCH v2] target/ppc: Generate storage interrupts for radix RC changes
On 7/21/23 18:08, Daniel Henrique Barboza wrote: On 7/20/23 23:38, Nicholas Piggin wrote: On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote: On 7/12/23 11:56 AM, Cédric Le Goater wrote: Hello Shawn, On 7/12/23 18:13, Shawn Anastasio wrote: Change radix model to always generate a storage interrupt when the R/C bits are not set appropriately in a PTE instead of setting the bits itself. According to the ISA both behaviors are valid, but in practice this change more closely matches behavior observed on the POWER9 CPU. How did you spotted this dark corner case in emulation ? Do you have MMU unit tests ? I'm currently porting Xen to Power and have been using QEMU's powernv model extensively for early bring up. I noticed the issue when my radix implementation worked in QEMU but failed on actual hardware since I didn't have a proper storage interrupt handler implemented. Cool. This was on my todo list because we rely on it for nested HV KVM too. I actually didn't know about that odd effLIPD=0 exception, but it looks right. How did you test that, by running with MSR[HV]=0 and LPIDR=0 ? For the patch, Reviewed-by: Nicholas Piggin Cedric, do you think this is 8.1 material? No. It is improving the MMU model but it is not fixing something that was "broken" in QEMU. I would leave it for 8.2. However, these are 8.1 material : https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364971 https://patchwork.ozlabs.org/project/qemu-ppc/list/?series=364984 but they lack a Fixes tag. I asked Nick. C.
[PATCH 0/4] CXL: SK hynix Niagara MHSLD Device
Base repo: https://gitlab.com/jic23/qemu Base branch: cxl-2023-07-17 This patch set includes an emulation of the SK hynix Niagara MHSLD platform with custom CCI commands that allow for isolation of memory blocks between attached hosts. There are 4 total patches in this set: 1 & 2: Modifications to the CCI interface to allow the addition of custom CCI commands to an existing CCI command set. 3: Minimum MHD cci support for a type-3 device 4: The SK hynix Niagara Device The changes to the core device and cci interface are very limited, and this provides a clean way for developers to add custom CCI commands to a device while retaining the possiblity to upstream the model. This device allows hosts to request memory blocks directly from the device, rather than requiring full the DCD command set. As a matter of simplicity, this is beneficial to for testing and applications of dynamic memory pooling on top of the 1.1 and 2.0 specification. Note that these CCI commands are not servicable without the kernel allowing raw CXL commands to be passed through the mailbox driver, so users should enable `CONFIG_CXL_MEM_RAW_COMMANDS=y` on the kernel of their QEMU instance. Signed-off-by: Gregory Price Gregory Price (4): cxl/mailbox: change CCI cmd set structure to be a member, not a refernce cxl/mailbox: interface to add CCI commands to an existing CCI cxl/type3: minimum MHD cci support cxl/vendor: SK hynix Niagara Multi-Headed SLD Device hw/cxl/Kconfig | 4 + hw/cxl/cxl-mailbox-utils.c | 90 +++- hw/cxl/meson.build | 2 + hw/cxl/vendor/meson.build | 1 + hw/cxl/vendor/skhynix/.gitignore| 1 + hw/cxl/vendor/skhynix/init_niagara.c| 99 + hw/cxl/vendor/skhynix/meson.build | 1 + hw/cxl/vendor/skhynix/skhynix_niagara.c | 521 hw/mem/cxl_type3.c | 67 +++ include/hw/cxl/cxl_device.h | 18 +- tools/cxl/cxl_mhd_init.c| 63 +++ tools/cxl/meson.build | 3 + tools/meson.build | 1 + 13 files changed, 866 insertions(+), 5 deletions(-) create mode 100644 hw/cxl/vendor/meson.build create mode 100644 hw/cxl/vendor/skhynix/.gitignore create mode 100644 hw/cxl/vendor/skhynix/init_niagara.c create mode 100644 hw/cxl/vendor/skhynix/meson.build create mode 100644 hw/cxl/vendor/skhynix/skhynix_niagara.c create mode 100644 tools/cxl/cxl_mhd_init.c create mode 100644 tools/cxl/meson.build -- 2.39.1
[PATCH 2/4] cxl/mailbox: interface to add CCI commands to an existing CCI
This enables wrapper devices to customize the base device's CCI (for example, with custom commands outside the specification) without the need to change the base device. The also enabled the base device to dispatch those commands without requiring additional driver support. Signed-off-by: Gregory Price --- hw/cxl/cxl-mailbox-utils.c | 19 +++ include/hw/cxl/cxl_device.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index ddee3f1718..cad0cd0adb 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -1383,6 +1383,25 @@ static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[ } } +void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmd_set)[256], size_t payload_max) +{ +cci->payload_max = payload_max > cci->payload_max ? payload_max : cci->payload_max; +for (int set = 0; set < 256; set++) { +for (int cmd = 0; cmd < 256; cmd++) { +if (cxl_cmd_set[set][cmd].handler) { +const struct cxl_cmd *c = &cxl_cmd_set[set][cmd]; +cci->cxl_cmd_set[set][cmd] = *c; +struct cel_log *log = +&cci->cel_log[cci->cel_size]; + +log->opcode = (set << 8) | cmd; +log->effect = c->effect; +cci->cel_size++; +} +} +} +} + void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState *d, size_t payload_max) { cxl_copy_cci_commands(cci, cxl_cmd_set_sw); diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 9a3c8b2dfa..abc8405cc5 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -297,6 +297,8 @@ void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, size_t payload_max); void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState *d, size_t payload_max); void cxl_init_cci(CXLCCI *cci, size_t payload_max); +void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmd_set)[256], + size_t payload_max); int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, size_t len_in, uint8_t *pl_in, size_t *len_out, uint8_t *pl_out, -- 2.39.1
[PATCH 1/4] cxl/mailbox: change CCI cmd set structure to be a member, not a refernce
This allows devices to have fully customized CCIs, along with complex devices where wrapper devices can override or add additional CCI commands without having to replicate full command structures or pollute a base device with every command that might ever be used. Signed-off-by: Gregory Price --- hw/cxl/cxl-mailbox-utils.c | 18 ++ include/hw/cxl/cxl_device.h | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 2819914e8d..ddee3f1718 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -1373,9 +1373,19 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max) bg_timercb, cci); } +static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[256]) { +for (int set = 0; set < 256; set++) { +for (int cmd = 0; cmd < 256; cmd++) { +if (cxl_cmds[set][cmd].handler) { +cci->cxl_cmd_set[set][cmd] = cxl_cmds[set][cmd]; +} +} +} +} + void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState *d, size_t payload_max) { -cci->cxl_cmd_set = cxl_cmd_set_sw; +cxl_copy_cci_commands(cci, cxl_cmd_set_sw); cci->d = d; cci->intf = intf; cxl_init_cci(cci, payload_max); @@ -1383,7 +1393,7 @@ void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState *d void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, size_t payload_max) { -cci->cxl_cmd_set = cxl_cmd_set; +cxl_copy_cci_commands(cci, cxl_cmd_set); cci->d = d; /* No separation for PCI MB as protocol handled in PCI device */ @@ -1398,7 +1408,7 @@ static const struct cxl_cmd cxl_cmd_set_t3_mctp[256][256] = { void cxl_initialize_t3_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState *intf, size_t payload_max) { -cci->cxl_cmd_set = cxl_cmd_set_t3_mctp; +cxl_copy_cci_commands(cci, cxl_cmd_set_t3_mctp); cci->d = d; cci->intf = intf; cxl_init_cci(cci, payload_max); @@ -1414,7 +1424,7 @@ static const struct cxl_cmd cxl_cmd_set_usp_mctp[256][256] = { void cxl_initialize_usp_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState *intf, size_t payload_max) { -cci->cxl_cmd_set = cxl_cmd_set_usp_mctp; +cxl_copy_cci_commands(cci, cxl_cmd_set_usp_mctp); cci->d = d; cci->intf = intf; cxl_init_cci(cci, payload_max); diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index c68981b618..9a3c8b2dfa 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -163,7 +163,7 @@ typedef struct CXLEventLog { } CXLEventLog; typedef struct CXLCCI { -const struct cxl_cmd (*cxl_cmd_set)[256]; +struct cxl_cmd cxl_cmd_set[256][256]; struct cel_log { uint16_t opcode; uint16_t effect; -- 2.39.1
[PATCH 3/4] cxl/type3: minimum MHD cci support
Implement the MHD GET_INFO cci command and add a shared memory region to the type3 device to host the information. Add a helper program to initialize this shared memory region. Add a function pointer to type3 devices for future work that will allow an mhd device to provide a hook to validate whether a memory access is valid or not. For now, limit the number of LD's to the number of heads. Later, this limitation will need to be lifted for MH-MLDs. Intended use case: 1. Create the shared memory region 2. Format the shared memory region 3. Launch QEMU with `is_mhd=true,mhd_head=N,mhd_shmid=$shmid` shmid=`ipcmk -M 4096 | grep -o -E '[0-9]+' | head -1` cxl_mhd_init 4 $shmid qemu-system-x86_64 \ -nographic \ -accel kvm \ -drive file=./mhd.qcow2,format=qcow2,index=0,media=disk,id=hd \ -m 4G,slots=4,maxmem=8G \ -smp 4 \ -machine type=q35,cxl=on,hmat=on \ -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \ -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \ -object memory-backend-file,id=mem0,mem-path=/tmp/mem0,size=4G,share=true \ -device cxl-type3,bus=rp0,volatile-memdev=mem0,id=cxl-mem0,sn=6,is_mhd=true,mhd_head=0,mhd_shmid=$shmid \ -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G Signed-off-by: Gregory Price --- hw/cxl/cxl-mailbox-utils.c | 53 + hw/mem/cxl_type3.c | 67 + include/hw/cxl/cxl_device.h | 14 tools/cxl/cxl_mhd_init.c| 63 ++ tools/cxl/meson.build | 3 ++ tools/meson.build | 1 + 6 files changed, 201 insertions(+) create mode 100644 tools/cxl/cxl_mhd_init.c create mode 100644 tools/cxl/meson.build diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index cad0cd0adb..57b8da4376 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -84,6 +84,8 @@ enum { #define GET_PHYSICAL_PORT_STATE 0x1 TUNNEL = 0x53, #define MANAGEMENT_COMMAND 0x0 +MHD = 0x55, +#define GET_MHD_INFO 0x0 }; /* CCI Message Format CXL r3.0 Figure 7-19 */ @@ -1155,6 +1157,56 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +static CXLRetCode cmd_mhd_get_info(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ +CXLType3Dev *ct3d = CXL_TYPE3(cci->d); +struct { +uint8_t start_ld; +uint8_t ldmap_len; +} QEMU_PACKED *input = (void *)payload_in; + +struct { +uint8_t nr_lds; +uint8_t nr_heads; +uint16_t resv1; +uint8_t start_ld; +uint8_t ldmap_len; +uint16_t resv2; +uint8_t ldmap[]; +} QEMU_PACKED *output = (void *)payload_out; + +uint8_t start_ld = input->start_ld; +uint8_t ldmap_len = input->ldmap_len; +uint8_t i; + +if (!ct3d->is_mhd) { +return CXL_MBOX_UNSUPPORTED; +} + +if (start_ld >= ct3d->mhd_state->nr_lds) { +return CXL_MBOX_INVALID_INPUT; +} + +output->nr_lds = ct3d->mhd_state->nr_lds; +output->nr_heads = ct3d->mhd_state->nr_heads; +output->resv1 = 0; +output->start_ld = start_ld; +output->resv2 = 0; + +for (i = 0; i < ldmap_len && (start_ld + i) < output->nr_lds; i++) { +output->ldmap[i] = ct3d->mhd_state->ldmap[start_ld + i]; +} +output->ldmap_len = i; + +*len_out = sizeof(*output) + output->ldmap_len; +return CXL_MBOX_SUCCESS; +} + #define IMMEDIATE_CONFIG_CHANGE (1 << 1) #define IMMEDIATE_DATA_CHANGE (1 << 2) #define IMMEDIATE_POLICY_CHANGE (1 << 3) @@ -1195,6 +1247,7 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_media_inject_poison, 8, 0 }, [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON", cmd_media_clear_poison, 72, 0 }, +[MHD][GET_MHD_INFO] = {"GET_MULTI_HEADED_INFO", cmd_mhd_get_info, 2, 0}, }; static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index efb7dece80..c8eb3aa67d 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -18,6 +18,7 @@ #include "hw/cxl/cxl.h" #include "hw/pci/msix.h" #include "hw/pci/spdm.h" +#include #define DWORD_BYTE 4 @@ -794,6 +795,48 @@ static DOEProtocol doe_spdm_prot[] = { { } }; +static bool cxl_setup_mhd(CXLType3Dev *ct3d, Error **errp) +{ +if (!ct3d->is_mhd) { +ct3d->mhd_access_valid = NULL; +return true; +} else if (ct3d->is_mhd && + (!ct3d->mhd_shmid || (ct3d->mhd_head == ~(0 { +error_setg(errp, "is_mhd requires mhd_shmid and mhd_head settings"); +return false; +} else if (!ct3d->is_mhd && + (ct3d->mhd_shmid || (ct3d->mhd_head