Re: ISO C90 compilation error

2024-02-29 Thread Daniel P . Berrangé
On Thu, Feb 29, 2024 at 07:03:35AM +, Paz Offer wrote:
> Hi,
> 
> I am trying to build my code with QEMU and getting compilation error 
> according to the ISO C90 standard:
> 
>   const size_t buf_size = 31;
>   char buffer[buf_size + 1];
> 
>   error: ISO C90 forbids array ‘buffer’ whose size can’t be evaluated 
> [-Werror=vla]
> 
> I noticed that the code builds with '-std=gnu11', which is newer than
> C90, so this is not clear to me why I get this error.
> Where is the correct place to specify the language version for this?

QEMU has set compiler flags to explicitly /forbid/ use of variable
sized arrays on the stack, as it is a known dangerous language
feature. You must refactor your changes to avoid this by using either
a statically sized array, or allocating on the heap.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PATCH v1 6/8] aspeed/intc: Add AST2700 support

2024-02-29 Thread Jamin Lin via
The AST2700 interrupt controller (INTC) provides hardware interrupt interfaces
for the interrupts of the PSP, SSP and TSP processors. In INTC, each interrupt of
INT 128 to INT 136 combines 32 interrupts.

Introduce a new aspeed_intc class with instance_init and realize handlers.

QEMU supports the ARM Generic Interrupt Controller, version 3 (GICv3),
but does not support Shared Peripheral Interrupt (SPI) yet.
This patch adds a workaround to set GICINT132[18], which is the BMC UART
interrupt, if GICINT132 is received, so users are able to type any key on the
keyboard to trigger the GICINT132 interrupt until the AST2700 boots into the
login prompt. It is a temporary solution.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/intc/aspeed_intc.c| 135 +++
 hw/intc/meson.build  |   1 +
 include/hw/intc/aspeed_vic.h |  29 
 3 files changed, 165 insertions(+)
 create mode 100644 hw/intc/aspeed_intc.c

diff --git a/hw/intc/aspeed_intc.c b/hw/intc/aspeed_intc.c
new file mode 100644
index 00..851d43363b
--- /dev/null
+++ b/hw/intc/aspeed_intc.c
@@ -0,0 +1,135 @@
+/*
+ * ASPEED INTC Controller
+ *
+ * Copyright (C) 2024 ASPEED Technology Inc.
+ *
+ * This code is licensed under the GPL version 2 or later.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/intc/aspeed_vic.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+#include "qemu/bitops.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "hw/intc/arm_gicv3.h"
+#include "trace.h"
+
+#define ASPEED_INTC_NR_IRQS 128
+#define ASPEED_INTC_SIZE 0x4000
+#define TO_REG(N) (N >> 2)
+
+uint64_t regs[ASPEED_INTC_SIZE];
+
+static void aspeed_intc_set_irq(void *opaque, int irq, int level)
+{
+}
+
+static uint64_t aspeed_intc_read(void *opaque, hwaddr offset, unsigned size)
+{
+AspeedINTCState *s = ASPEED_INTC(opaque);
+GICv3State *gic = ARM_GICV3(s->gic);
+
+uint64_t value = 0;
+switch (TO_REG(offset)) {
+case TO_REG(0x1404):
+/* BMC UART interrupt is GICINT132[18] */
+if (gic && gicv3_gicd_level_test(gic, 164)) {
+value = BIT(18);
+}
+break;
+default:
+value = regs[TO_REG(offset)];
+break;
+}
+
+return value;
+}
+
+static void aspeed_intc_write(void *opaque, hwaddr offset, uint64_t data,
+unsigned size)
+{
+AspeedINTCState *s = ASPEED_INTC(opaque);
+GICv3State *gic = ARM_GICV3(s->gic);
+
+switch (TO_REG(offset)) {
+case TO_REG(0x1400):
+regs[TO_REG(offset)] = data;
+if (regs[TO_REG(offset)]) {
+gicv3_gicd_enabled_set(gic, 164);
+} else {
+gicv3_gicd_enabled_clear(gic, 164);
+}
+break;
+case TO_REG(0x1404):
+regs[TO_REG(offset)] &= ~(data);
+gicv3_gicd_level_clear(gic, 164);
+break;
+default:
+regs[TO_REG(offset)] = data;
+break;
+}
+}
+
+static const MemoryRegionOps aspeed_intc_ops = {
+.read = aspeed_intc_read,
+.write = aspeed_intc_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.valid.min_access_size = 4,
+.valid.max_access_size = 4,
+.valid.unaligned = false,
+};
+
+static void aspeed_intc_realize(DeviceState *dev, Error **errp)
+{
+SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+AspeedINTCState *s = ASPEED_INTC(dev);
+
+memory_region_init_io(&s->iomem, OBJECT(s), &aspeed_intc_ops, s,
+  TYPE_ASPEED_INTC, ASPEED_INTC_SIZE);
+
+sysbus_init_mmio(sbd, &s->iomem);
+
+qdev_init_gpio_in(dev, aspeed_intc_set_irq, ASPEED_INTC_NR_IRQS);
+sysbus_init_irq(sbd, &s->irq);
+sysbus_init_irq(sbd, &s->fiq);
+}
+
+static void aspeed_intc_reset(DeviceState *dev)
+{
+AspeedINTCState *s = ASPEED_INTC(dev);
+
+s->level = 0;
+s->raw = 0;
+s->select = 0;
+s->enable = 0;
+s->trigger = 0;
+s->sense = 0x1F07FFF8ULL;
+s->dual_edge = 0xF80007ULL;
+s->event = 0x5F07FFF8ULL;
+}
+
+static void aspeed_intc_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+dc->realize = aspeed_intc_realize;
+dc->reset = aspeed_intc_reset;
+dc->desc = "ASPEED Interrupt Controller for AST27x0";
+dc->vmsd = NULL;
+}
+
+static const TypeInfo aspeed_intc_info = {
+.name = TYPE_ASPEED_INTC,
+.parent = TYPE_SYS_BUS_DEVICE,
+.instance_size = sizeof(AspeedINTCState),
+.class_init = aspeed_intc_class_init,
+};
+
+static void aspeed_intc_register_types(void)
+{
+type_register_static(&aspeed_intc_info);
+}
+
+type_init(aspeed_intc_register_types);
diff --git a/hw/intc/meson.build b/hw/intc/meson.build
index ed355941d1..f5c574f584 100644
--- a/hw/intc/meson.build
+++ b/hw/intc/meson.build
@@ -14,6 +14,7 @@ system_ss.add(when: 'CONFIG_ARM_GICV3_TCG', if_true: files(
 ))
 system_ss.add(when: 'CONFIG_ALLWINNER_A10_PIC', if_true: 
files('allwinner-a10-pic.c'))
 system_ss.add(when: 'CONFIG_ASPEED_SOC

[PATCH v1 2/8] aspeed/sli: Add AST2700 support

2024-02-29 Thread Jamin Lin via
AST2700 SLI engine is designed to accelerate the
throughput between cross-die connections.
It has CPU_SLI at the CPU die and IO_SLI at the IO die.

Introduce new ast2700_sli and ast2700_sliio class
with instance_init and realize handlers.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/misc/aspeed_sli.c | 179 +++
 hw/misc/meson.build  |   3 +-
 hw/misc/trace-events |   7 ++
 include/hw/misc/aspeed_sli.h |  32 +++
 4 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 hw/misc/aspeed_sli.c
 create mode 100644 include/hw/misc/aspeed_sli.h

diff --git a/hw/misc/aspeed_sli.c b/hw/misc/aspeed_sli.c
new file mode 100644
index 00..4af42f145c
--- /dev/null
+++ b/hw/misc/aspeed_sli.c
@@ -0,0 +1,179 @@
+/*
+ * ASPEED SLI Controller
+ *
+ * Copyright (C) 2024 ASPEED Technology Inc.
+ *
+ * This code is licensed under the GPL version 2 or later.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/misc/aspeed_sli.h"
+#include "qapi/error.h"
+#include "migration/vmstate.h"
+#include "trace.h"
+
+#define SLI_REGION_SIZE 0x500
+#define TO_REG(addr) ((addr) >> 2)
+
+static uint64_t aspeed_sli_read(void *opaque, hwaddr addr, unsigned int size)
+{
+AspeedSLIState *s = ASPEED_SLI(opaque);
+int reg = TO_REG(addr);
+
+if (reg >= ARRAY_SIZE(s->regs)) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: Out-of-bounds read at offset 0x%" HWADDR_PRIx "\n",
+  __func__, addr);
+return 0;
+}
+
+trace_aspeed_sli_read(addr, size, s->regs[reg]);
+return s->regs[reg];
+}
+
+static void aspeed_sli_write(void *opaque, hwaddr addr, uint64_t data,
+  unsigned int size)
+{
+AspeedSLIState *s = ASPEED_SLI(opaque);
+int reg = TO_REG(addr);
+
+if (reg >= ARRAY_SIZE(s->regs)) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: Out-of-bounds write at offset 0x%" HWADDR_PRIx "\n",
+  __func__, addr);
+return;
+}
+
+trace_aspeed_sli_write(addr, size, data);
+s->regs[reg] = data;
+}
+
+static uint64_t aspeed_sliio_read(void *opaque, hwaddr addr, unsigned int size)
+{
+AspeedSLIState *s = ASPEED_SLI(opaque);
+int reg = TO_REG(addr);
+
+if (reg >= ARRAY_SIZE(s->regs)) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: Out-of-bounds read at offset 0x%" HWADDR_PRIx "\n",
+  __func__, addr);
+return 0;
+}
+
+trace_aspeed_sliio_read(addr, size, s->regs[reg]);
+return s->regs[reg];
+}
+
+static void aspeed_sliio_write(void *opaque, hwaddr addr, uint64_t data,
+  unsigned int size)
+{
+AspeedSLIState *s = ASPEED_SLI(opaque);
+int reg = TO_REG(addr);
+
+if (reg >= ARRAY_SIZE(s->regs)) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: Out-of-bounds write at offset 0x%" HWADDR_PRIx "\n",
+  __func__, addr);
+return;
+}
+
+trace_aspeed_sliio_write(addr, size, data);
+s->regs[reg] = data;
+}
+
+static const MemoryRegionOps aspeed_sli_ops = {
+.read = aspeed_sli_read,
+.write = aspeed_sli_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.valid = {
+.min_access_size = 1,
+.max_access_size = 4,
+},
+};
+
+static const MemoryRegionOps aspeed_sliio_ops = {
+.read = aspeed_sliio_read,
+.write = aspeed_sliio_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.valid = {
+.min_access_size = 1,
+.max_access_size = 4,
+},
+};
+
+static void aspeed_sli_realize(DeviceState *dev, Error **errp)
+{
+AspeedSLIState *s = ASPEED_SLI(dev);
+SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+
+memory_region_init_io(&s->iomem, OBJECT(s), &aspeed_sli_ops, s,
+  TYPE_ASPEED_SLI, SLI_REGION_SIZE);
+sysbus_init_mmio(sbd, &s->iomem);
+}
+
+static void aspeed_sliio_realize(DeviceState *dev, Error **errp)
+{
+AspeedSLIState *s = ASPEED_SLI(dev);
+SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+
+memory_region_init_io(&s->iomem, OBJECT(s), &aspeed_sliio_ops, s,
+  TYPE_ASPEED_SLI, SLI_REGION_SIZE);
+sysbus_init_mmio(sbd, &s->iomem);
+}
+
+static void aspeed_sli_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+
+dc->desc = "Aspeed SLI Controller";
+dc->realize = aspeed_sli_realize;
+}
+
+static const TypeInfo aspeed_sli_info = {
+.name  = TYPE_ASPEED_SLI,
+.parent= TYPE_SYS_BUS_DEVICE,
+.instance_size = sizeof(AspeedSLIState),
+.class_init= aspeed_sli_class_init,
+.class_size= sizeof(AspeedSLIClass),
+.abstract  = true,
+};
+
+static void aspeed_2700_sli_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *

[PATCH v1 0/8] Add AST2700 support

2024-02-29 Thread Jamin Lin via
Changes from v1:
The patch series supports WDT, SDMC, SMC, SCU, SLI and INTC for AST2700 SoC.

Test steps:
1. Download openbmc image for AST2700 from
   https://github.com/AspeedTech-BMC/openbmc/releases/tag/v09.00
   https://github.com/AspeedTech-BMC/openbmc/releases/download/v09.00/
   ast2700-default-obmc.tar.gz
2. untar ast2700-default-obmc.tar.gz
   ```
   tar -xf ast2700-default-obmc.tar.gz
   ```
3. Run and the contents of scripts as following
IMGDIR=ast2700-default
UBOOT_SIZE=$(stat --format=%s -L ${IMGDIR}/u-boot-nodtb.bin)
UBOOT_DTB_ADDR=$((0x4 + ${UBOOT_SIZE}))

qemu-system-aarch64 -M ast2700-evb -nographic -m 8G\
 -device loader,addr=0x4,file=${IMGDIR}/u-boot-nodtb.bin,force-raw=on\
 -device loader,addr=${UBOOT_DTB_ADDR},file=${IMGDIR}/u-boot.dtb,force-raw=on\
 -device loader,addr=0x43000,file=${IMGDIR}/bl31.bin,force-raw=on\
 -device loader,addr=0x43008,file=${IMGDIR}/optee/tee-raw.bin,force-raw=on\
 -device loader,addr=0x43000,cpu-num=0\
 -device loader,addr=0x43000,cpu-num=1\
 -device loader,addr=0x43000,cpu-num=2\
 -device loader,addr=0x43000,cpu-num=3\
 -smp 4\
 -drive file=${IMGDIR}/image-bmc,format=raw,if=mtd\
 -serial mon:stdio\
 -snapshot

Known Issue:
1. QEMU supports ARM Generic Interrupt Controller, version 3(GICv3)
but not support Shared Peripheral Interrupt (SPI), yet.
Added work around in INTC patch to set GICINT132[18]
which was BMC UART interrupt if it received GICINT132, so users are
able to type any key from keyboard to trigger GICINT132 interrupt
until AST2700 boot into login prompt. It is a temporary solution.
If users encounter a stuck boot with no booting log,
please type any key on the keyboard.

2. It is required to add "-m 8G" to set the dram size 8G.
AST2700 dram size calculation is not compatible with AST2600.
According to the DDR hardware capacity behavior, if users write the
data at address which is over than the supported size, it would set
the data at address 0.
For example:
a. sdram base address "0x4 "
b. sdram size is 1GiB
The available address range is from "0x4 " to "0x4 4000".
If users write 0xdeadbeef at address "0x6 ", the value of
DRAM address 0 (base address 0x4 ) should be 0xdeadbeef.
Please see ast2700_sdrammc_calc_size in
https://github.com/AspeedTech-BMC/u-boot/blob/v00.05.00/drivers/ram/aspeed/
sdram_ast2700.c

It seems we should create a new function instead of aspeed_soc_dram_init
to support AST2700.
https://github.com/qemu/qemu/blob/master/hw/arm/aspeed_soc_common.c

Jamin Lin (8):
  aspeed/wdt: Add AST2700 support
  aspeed/sli: Add AST2700 support
  aspeed/sdmc: Add AST2700 support
  aspeed/smc: Add AST2700 support
  aspeed/scu: Add AST2700 support
  aspeed/intc: Add AST2700 support
  aspeed/soc: Add AST2700 support
  aspeed: Add an AST2700 eval board

 hw/arm/aspeed.c  |  32 +++
 hw/arm/aspeed_ast27x0.c  | 462 +++
 hw/arm/meson.build   |   1 +
 hw/intc/aspeed_intc.c| 135 +
 hw/intc/meson.build  |   1 +
 hw/misc/aspeed_scu.c | 306 +++-
 hw/misc/aspeed_sdmc.c| 215 --
 hw/misc/aspeed_sli.c | 179 
 hw/misc/meson.build  |   3 +-
 hw/misc/trace-events |  11 +
 hw/ssi/aspeed_smc.c  | 326 --
 hw/ssi/trace-events  |   2 +-
 hw/watchdog/wdt_aspeed.c |  24 ++
 include/hw/arm/aspeed_soc.h  |  26 +-
 include/hw/intc/aspeed_vic.h |  29 ++
 include/hw/misc/aspeed_scu.h |  47 +++-
 include/hw/misc/aspeed_sdmc.h|   4 +-
 include/hw/misc/aspeed_sli.h |  32 +++
 include/hw/ssi/aspeed_smc.h  |   1 +
 include/hw/watchdog/wdt_aspeed.h |   3 +-
 20 files changed, 1787 insertions(+), 52 deletions(-)
 create mode 100644 hw/arm/aspeed_ast27x0.c
 create mode 100644 hw/intc/aspeed_intc.c
 create mode 100644 hw/misc/aspeed_sli.c
 create mode 100644 include/hw/misc/aspeed_sli.h

-- 
2.25.1




[PATCH v1 5/8] aspeed/scu: Add AST2700 support

2024-02-29 Thread Jamin Lin via
AST2700 has two SCU controllers, which are SCU and SCUIO.
Both SCU and SCUIO registers are not compatible with previous SoCs;
introduce new registers and add AST2700 SCU and SCUIO class init handlers.

The pclk divider selection of SCUIO is defined in SCUIO280[20:18] and
the pclk divider selection of SCU is defined in SCU280[25:23].
Both of them are not compatible with AST2600 SoCs; add a get_apb_freq function
and trace-events for the AST2700 SCU and SCUIO.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/misc/aspeed_scu.c | 306 ++-
 hw/misc/trace-events |   4 +
 include/hw/misc/aspeed_scu.h |  47 +-
 3 files changed, 351 insertions(+), 6 deletions(-)

diff --git a/hw/misc/aspeed_scu.c b/hw/misc/aspeed_scu.c
index 1ac04b6cb0..eb38ea8e19 100644
--- a/hw/misc/aspeed_scu.c
+++ b/hw/misc/aspeed_scu.c
@@ -134,6 +134,48 @@
 
 #define AST2600_CLK TO_REG(0x40)
 
+#define AST2700_SILICON_REV   TO_REG(0x00)
+#define AST2700_HW_STRAP1 TO_REG(0x10)
+#define AST2700_HW_STRAP1_CLR TO_REG(0x14)
+#define AST2700_HW_STRAP1_LOCKTO_REG(0x20)
+#define AST2700_HW_STRAP1_SEC1TO_REG(0x24)
+#define AST2700_HW_STRAP1_SEC2TO_REG(0x28)
+#define AST2700_HW_STRAP1_SEC3TO_REG(0x2C)
+
+#define AST2700_SCU_CLK_SEL_1   TO_REG(0x280)
+#define AST2700_SCU_HPLL_PARAM  TO_REG(0x300)
+#define AST2700_SCU_HPLL_EXT_PARAM  TO_REG(0x304)
+#define AST2700_SCU_DPLL_PARAM  TO_REG(0x308)
+#define AST2700_SCU_DPLL_EXT_PARAM  TO_REG(0x30c)
+#define AST2700_SCU_MPLL_PARAM  TO_REG(0x310)
+#define AST2700_SCU_MPLL_EXT_PARAM  TO_REG(0x314)
+#define AST2700_SCU_D1CLK_PARAM TO_REG(0x320)
+#define AST2700_SCU_D2CLK_PARAM TO_REG(0x330)
+#define AST2700_SCU_CRT1CLK_PARAM   TO_REG(0x340)
+#define AST2700_SCU_CRT2CLK_PARAM   TO_REG(0x350)
+#define AST2700_SCU_MPHYCLK_PARAM   TO_REG(0x360)
+#define AST2700_SCU_FREQ_CNTR   TO_REG(0x3b0)
+#define AST2700_SCU_CPU_SCRATCH_0   TO_REG(0x780)
+#define AST2700_SCU_CPU_SCRATCH_1   TO_REG(0x784)
+
+#define AST2700_SCUIO_CLK_STOP_CTL_1TO_REG(0x240)
+#define AST2700_SCUIO_CLK_STOP_CLR_1TO_REG(0x244)
+#define AST2700_SCUIO_CLK_STOP_CTL_2TO_REG(0x260)
+#define AST2700_SCUIO_CLK_STOP_CLR_2TO_REG(0x264)
+#define AST2700_SCUIO_CLK_SEL_1 TO_REG(0x280)
+#define AST2700_SCUIO_CLK_SEL_2 TO_REG(0x284)
+#define AST2700_SCUIO_HPLL_PARAMTO_REG(0x300)
+#define AST2700_SCUIO_HPLL_EXT_PARAMTO_REG(0x304)
+#define AST2700_SCUIO_APLL_PARAMTO_REG(0x310)
+#define AST2700_SCUIO_APLL_EXT_PARAMTO_REG(0x314)
+#define AST2700_SCUIO_DPLL_PARAMTO_REG(0x320)
+#define AST2700_SCUIO_DPLL_EXT_PARAMTO_REG(0x324)
+#define AST2700_SCUIO_DPLL_PARAM_READ   TO_REG(0x328)
+#define AST2700_SCUIO_DPLL_EXT_PARAM_READ TO_REG(0x32c)
+#define AST2700_SCUIO_UARTCLK_GEN   TO_REG(0x330)
+#define AST2700_SCUIO_HUARTCLK_GEN  TO_REG(0x334)
+#define AST2700_SCUIO_CLK_DUTY_MEAS_RST TO_REG(0x388)
+
 #define SCU_IO_REGION_SIZE 0x1000
 
 static const uint32_t ast2400_a0_resets[ASPEED_SCU_NR_REGS] = {
@@ -244,6 +286,25 @@ static uint32_t 
aspeed_1030_scu_get_apb_freq(AspeedSCUState *s)
 / asc->apb_divider;
 }
 
+static uint32_t aspeed_2700_scu_get_apb_freq(AspeedSCUState *s)
+{
+AspeedSCUClass *asc = ASPEED_SCU_GET_CLASS(s);
+uint32_t hpll = asc->calc_hpll(s, s->regs[AST2700_SCU_HPLL_PARAM]);
+
+return hpll / (SCU_CLK_GET_PCLK_DIV(s->regs[AST2700_SCU_CLK_SEL_1]) + 1)
+   / asc->apb_divider;
+}
+
+static uint32_t aspeed_2700_scuio_get_apb_freq(AspeedSCUState *s)
+{
+AspeedSCUClass *asc = ASPEED_SCU_GET_CLASS(s);
+uint32_t hpll = asc->calc_hpll(s, s->regs[AST2700_SCUIO_HPLL_PARAM]);
+
+return hpll /
+(SCUIO_AST2700_CLK_GET_PCLK_DIV(s->regs[AST2700_SCUIO_CLK_SEL_1]) + 1)
+/ asc->apb_divider;
+}
+
 static uint64_t aspeed_scu_read(void *opaque, hwaddr offset, unsigned size)
 {
 AspeedSCUState *s = ASPEED_SCU(opaque);
@@ -258,7 +319,8 @@ static uint64_t aspeed_scu_read(void *opaque, hwaddr 
offset, unsigned size)
 
 switch (reg) {
 case RNG_DATA:
-/* On hardware, RNG_DATA works regardless of
+/*
+ * On hardware, RNG_DATA works regardless of
  * the state of the enable bit in RNG_CTRL
  */
 s->regs[RNG_DATA] = aspeed_scu_get_random();
@@ -494,6 +556,9 @@ static uint32_t aspeed_silicon_revs[] = {
 AST2600_A3_SILICON_REV,
 AST1030_A0_SILICON_REV,
 AST1030_A1_SILICON_REV,
+AST2700_A0_SILICON_REV,
+AST2720_A0_SILICON_REV,
+AST2750_A0_SILICON_REV,
 };
 
 bool is_supported_silicon_rev(uint32_t silicon_rev)
@@ -783,6 +848,243 @@ static const TypeInfo aspeed_2600_scu_info = {
 .class_init = aspeed_2600_scu_class_init,
 };
 
+static uint64_t aspeed_ast2700_scu_read(void *opaque, hwaddr offset,
+unsigned size)
+{
+AspeedSCUState *s = ASPEED_SCU(opaque);
+int reg = TO_REG(offset);
+
+if (reg >= ASPEED_AST2700_SCU_NR_REGS) {
+  

[PATCH v1 8/8] aspeed: Add an AST2700 eval board

2024-02-29 Thread Jamin Lin via
AST2700 CPU is ARM Cortex-A35 which is 64 bits.
Add TARGET_AARCH64 to build this machine.

According to the design of ast2700, it has a bootmcu(riscv-32) which
is used for executing SPL.
Then, CPUs(cortex-a35) execute u-boot, kernel and rofs.

Currently, QEMU does not support emulating two CPU architectures
in the same machine. Therefore, QEMU will only support
emulating the CPU (Cortex-A35) side of the AST2700.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
 hw/arm/aspeed.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 8854581ca8..4544026d14 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -178,6 +178,12 @@ struct AspeedMachineState {
 #define AST2600_EVB_HW_STRAP1 0x00C0
 #define AST2600_EVB_HW_STRAP2 0x0003
 
+#ifdef TARGET_AARCH64
+/* AST2700 evb hardware value */
+#define AST2700_EVB_HW_STRAP1 0x00C0
+#define AST2700_EVB_HW_STRAP2 0x0003
+#endif
+
 /* Tacoma hardware value */
 #define TACOMA_BMC_HW_STRAP1  0x
 #define TACOMA_BMC_HW_STRAP2  0x0040
@@ -1588,6 +1594,26 @@ static void 
aspeed_minibmc_machine_ast1030_evb_class_init(ObjectClass *oc,
 aspeed_machine_class_init_cpus_defaults(mc);
 }
 
+#ifdef TARGET_AARCH64
+static void aspeed_machine_ast2700_evb_class_init(ObjectClass *oc, void *data)
+{
+MachineClass *mc = MACHINE_CLASS(oc);
+AspeedMachineClass *amc = ASPEED_MACHINE_CLASS(oc);
+
+mc->desc = "Aspeed AST2700 EVB (Cortex-A35)";
+amc->soc_name  = "ast2700-a0";
+amc->hw_strap1 = AST2700_EVB_HW_STRAP1;
+amc->hw_strap2 = AST2700_EVB_HW_STRAP2;
+amc->fmc_model = "w25q01jvq";
+amc->spi_model = "w25q512jv";
+amc->num_cs= 2;
+amc->macs_mask = ASPEED_MAC0_ON | ASPEED_MAC1_ON | ASPEED_MAC2_ON;
+amc->uart_default = ASPEED_DEV_UART12;
+mc->default_ram_size = 1 * GiB;
+aspeed_machine_class_init_cpus_defaults(mc);
+}
+#endif
+
 static void aspeed_machine_qcom_dc_scm_v1_class_init(ObjectClass *oc,
  void *data)
 {
@@ -1711,6 +1737,12 @@ static const TypeInfo aspeed_machine_types[] = {
 .name   = MACHINE_TYPE_NAME("ast1030-evb"),
 .parent = TYPE_ASPEED_MACHINE,
 .class_init = aspeed_minibmc_machine_ast1030_evb_class_init,
+#ifdef TARGET_AARCH64
+}, {
+.name  = MACHINE_TYPE_NAME("ast2700-evb"),
+.parent= TYPE_ASPEED_MACHINE,
+.class_init= aspeed_machine_ast2700_evb_class_init,
+#endif
 }, {
 .name  = TYPE_ASPEED_MACHINE,
 .parent= TYPE_MACHINE,
-- 
2.25.1




Re: [PATCH v1 8/8] aspeed: Add an AST2700 eval board

2024-02-29 Thread Cédric Le Goater

Hello Jamin,


I tried to send the patch series to support AST2700 but I encountered some 
patches
were rejected by server IP 211.20.114.70.

Error Log:
qemu-devel@nongnu.org
eggs.gnu.org
Remote Server returned '550-[SPF] 211.20.114.70 is not allowed to send mail 
from aspeedtech.com. 550 Please see 
http://www.openspf.org/Why?scope=mfrom;identity=jamin_...@aspeedtech.com;ip=211.20.114.70'
qemu-...@nongnu.org
eggs.gnu.org
Remote Server returned '550-[SPF] 211.20.114.70 is not allowed to send mail 
from aspeedtech.com. 550 Please see 
http://www.openspf.org/Why?scope=mfrom;identity=jamin_...@aspeedtech.com;ip=211.20.114.70


$ host -t txt aspeedtech.com
aspeedtech.com descriptive text 
"google-site-verification=77FsedIzGqFvs3bFfy5L2lT_AGEWVecyoJwZN7KDVnM"
aspeedtech.com descriptive text "v=spf1 ip4:211.20.114.72 
include:spf.protection.outlook.com -all"
aspeedtech.com descriptive text 
"google-site-verification=sBPPFeYyix6oWeC3GRJ64zQNFLJpN6SFBMT8RX8ZuME"

May be try using 211.20.114.72 (mail.aspeedtech.com) as an SMTP server ?


Did you encounter the same errors before?


I received the full series 4 times.

But the mailing lists only have 4 :

  
https://lore.kernel.org/qemu-devel/20240229080014.1235018-1-jamin_...@aspeedtech.com/
  
https://lore.kernel.org/qemu-devel/20240229072315.743963-1-jamin_...@aspeedtech.com/

or

  https://patchew.org/QEMU/20240229080014.1235018-1-jamin._5f...@aspeedtech.com/
  https://patchew.org/QEMU/20240229072315.743963-1-jamin._5f...@aspeedtech.com/



My send email command as following.
git send-email
--cc troy_...@aspeedtech.com
--cc jamin_...@aspeedtech.com
--cc yunlin.t...@aspeedtech.com
--to-cmd "./scripts/get_maintainer.pl ../v1-patch/*.patch" ../v1-patch/*.patch


The command line above is sending twice the same series, you should remove
one of the  "../v1-patch/*.patch" command arguments. the rest looks correct.

Thanks,

C.





Re: Intention to work on GSoC project

2024-02-29 Thread Stefano Garzarella
Hi Sahil,

On Sun, Feb 25, 2024 at 10:38 PM Sahil  wrote:
>
> Hi,
>
> My name is Sahil and I go by the pseudonym 'valdaarhun' on Github. I have
> never contributed to QEMU before but I have used it a few times as an end
> user. I developed an interest in virtualization during my internship at
> VMware and would like to dive deeper in this subfield.
>
> My current full-time job does not allow me to take part in external programs
> that are paid. I would like to work on one of the proposed projects outside
> of GSoC.

Sure, not a problem at all, also because for this year QEMU was not
accepted in GSoC, so anybody can work on those projects if they have
time

> I have gone through QEMU's list of GSoC '24 projects [1] and am
> interested in two of them:
>
> 1. Add packed virtqueue to Shadow Virtqueue
> 2. vhost-user memory isolation
>
> Based on what I have understood, they are somewhat related and are part
> of the migration subsystem. I feel the learning curve of the first project
> will be less steep and will make me better prepared to tackle the second
> project as well.

The first project is for sure related with migration. While vhost-user
memory isolation is not really related to migration, but both are
related to virtio devices.
Anyway, your plan looks good to me!

>
> I have read the "Getting Started for Developers" [2] wiki page. I have also
> built QEMU from source.

Great!

>
> I think my next step should be to read the documentation on the migration
> subsystem [3], the blog posts attached in the first project's description
> and virtqueue's implementation. Would you also recommend that I work on a
> QEMU issue that is open on Gitlab and related to virtqueues/virtio to
> familiarize
> myself with the codebase? I went through the issues tagged as "device:virtio"
> [4]
> but can't really tell if any of them are good for beginners. One of them has
> the
> "bite-size" tag [5]. It also has a patch attached but hasn't been merged.
> Shall I
> work on getting that merged?

Yeah, "bite-size" issues should be better to understand how to
contribute to QEMU.
Feel free to work on any issue, doing the work or helping to complete
old patches.

>
> I have worked on a few smaller systems programming issues in other
> organizations (eg: strace [6], htop [7]) in the past.
>
> I look forward to hearing from you.

Feel free to reach us if you have more questions on the projects.

Thanks,
Stefano




Re: [PATCH V4 11/14] vfio: register container for cpr

2024-02-29 Thread Cédric Le Goater

Hello Steve,

On 2/22/24 18:28, Steve Sistare wrote:

Define entry points to perform per-container cpr-specific initialization
and teardown.

Signed-off-by: Steve Sistare 
---
  hw/vfio/container.c   | 11 ++-
  hw/vfio/cpr.c | 19 +++
  hw/vfio/iommufd.c |  6 ++
  hw/vfio/meson.build   |  1 +
  include/hw/vfio/vfio-common.h |  3 +++
  5 files changed, 39 insertions(+), 1 deletion(-)
  create mode 100644 hw/vfio/cpr.c

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index bd25b9f..096d77e 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -621,10 +621,15 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
  goto free_container_exit;
  }
  
+ret = vfio_cpr_register_container(bcontainer, errp);

+if (ret) {
+goto free_container_exit;
+}
+
  ret = vfio_ram_block_discard_disable(container, true);
  if (ret) {
  error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
-goto free_container_exit;
+goto unregister_container_exit;
  }
  
  assert(bcontainer->ops->setup);

@@ -667,6 +672,9 @@ listener_release_exit:
  enable_discards_exit:
  vfio_ram_block_discard_disable(container, false);
  
+unregister_container_exit:

+vfio_cpr_unregister_container(bcontainer);
+
  free_container_exit:
  g_free(container);
  
@@ -710,6 +718,7 @@ static void vfio_disconnect_container(VFIOGroup *group)

  vfio_container_destroy(bcontainer);
  
  trace_vfio_disconnect_container(container->fd);

+vfio_cpr_unregister_container(bcontainer);
  close(container->fd);
  g_free(container);
  
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c

new file mode 100644
index 000..3bede54
--- /dev/null
+++ b/hw/vfio/cpr.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2021-2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/vfio/vfio-common.h"
+#include "qapi/error.h"
+
+int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
+{
+return 0;
+}
+
+void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
+{
+}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 9bfddc1..e1be224 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -411,6 +411,11 @@ found_container:
  goto err_listener_register;
  }
  
+ret = vfio_cpr_register_container(bcontainer, errp);

+if (ret) {
+goto err_listener_register;
+}
+
  /*
   * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
   * for discarding incompatibility check as well?
@@ -461,6 +466,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
  iommufd_cdev_ram_block_discard_disable(false);
  }
  
+vfio_cpr_unregister_container(bcontainer);

  iommufd_cdev_detach_container(vbasedev, container);
  iommufd_cdev_container_destroy(container);
  vfio_put_address_space(space);
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index bb98493..bba776f 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -5,6 +5,7 @@ vfio_ss.add(files(
'container-base.c',
'container.c',
'migration.c',
+  'cpr.c',
  ))
  vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c'))
  vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 4a6c262..b9da6c0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -205,6 +205,9 @@ void vfio_detach_device(VFIODevice *vbasedev);
  int vfio_kvm_device_add_fd(int fd, Error **errp);
  int vfio_kvm_device_del_fd(int fd, Error **errp);
  
+int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);


Should we return bool since we have an errp ? the returned value
is not an errno AFAICT.

Anyhow,

Reviewed-by: Cédric Le Goater 

Thanks,

C.




+void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
+
  extern const MemoryRegionOps vfio_region_ops;
  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
  typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;





Re: [PATCH v5 30/65] i386/tdx: Support user configurable mrconfigid/mrowner/mrownerconfig

2024-02-29 Thread Markus Armbruster
Xiaoyao Li  writes:

> From: Isaku Yamahata 
>
> Three sha384 hash values, mrconfigid, mrowner and mrownerconfig, of a TD
> can be provided for TDX attestation. Detailed meaning of them can be
> found: 
> https://lore.kernel.org/qemu-devel/31d6dbc1-f453-4cef-ab08-4813f4e0f...@intel.com/
>
> Allow user to specify those values via property mrconfigid, mrowner and
> mrownerconfig. They are all in base64 format.
>
> example
> -object tdx-guest, \
>   
> mrconfigid=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v,\
>   mrowner=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v,\
>   
> mrownerconfig=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v
>
> Signed-off-by: Isaku Yamahata 
> Co-developed-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 
>
> ---
> Changes in v5:
>  - refine the description of QAPI properties and add description of
>default value when not specified;
>
> Changes in v4:
>  - describe more of there fields in qom.json
>  - free the old value before set new value to avoid memory leak in
>_setter(); (Daniel)
>
> Changes in v3:
>  - use base64 encoding instread of hex-string;
> ---
>  qapi/qom.json | 17 -
>  target/i386/kvm/tdx.c | 87 +++
>  target/i386/kvm/tdx.h |  3 ++
>  3 files changed, 106 insertions(+), 1 deletion(-)
>
> diff --git a/qapi/qom.json b/qapi/qom.json
> index 89ed89b9b46e..cac875349a3a 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -905,10 +905,25 @@
>  # pages.  Some guest OS (e.g., Linux TD guest) may require this to
>  # be set, otherwise they refuse to boot.
>  #
> +# @mrconfigid: ID for non-owner-defined configuration of the guest TD,
> +# e.g., run-time or OS configuration (base64 encoded SHA384 digest).
> +# (A default value 0 of SHA384 is used when absent).

Suggest to drop the parenthesis in the last sentence.

@mrconfigid is a string, so the default value can't be 0.  Actually,
it's not just any string, but a base64 encoded SHA384 digest, which
means it must be exactly 96 hex digits.  So it can't be "0", either.  It
could be
"".
More on this below.

> +#
> +# @mrowner: ID for the guest TD’s owner (base64 encoded SHA384 digest).
> +# (A default value 0 of SHA384 is used when absent).
> +#
> +# @mrownerconfig: ID for owner-defined configuration of the guest TD,
> +# e.g., specific to the workload rather than the run-time or OS
> +# (base64 encoded SHA384 digest). (A default value 0 of SHA384 is
> +# used when absent).
> +#
>  # Since: 9.0
>  ##
>  { 'struct': 'TdxGuestProperties',
> -  'data': { '*sept-ve-disable': 'bool' } }
> +  'data': { '*sept-ve-disable': 'bool',
> +'*mrconfigid': 'str',
> +'*mrowner': 'str',
> +'*mrownerconfig': 'str' } }
>  
>  ##
>  # @ThreadContextProperties:
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index d0ad4f57b5d0..4ce2f1d082ce 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -13,6 +13,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
> +#include "qemu/base64.h"
>  #include "qapi/error.h"
>  #include "qom/object_interfaces.h"
>  #include "standard-headers/asm-x86/kvm_para.h"
> @@ -516,6 +517,7 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
>  X86CPU *x86cpu = X86_CPU(cpu);
>  CPUX86State *env = &x86cpu->env;
>  g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
> +size_t data_len;
>  int r = 0;
>  
>  object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);
> @@ -528,6 +530,38 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
>  init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
>  sizeof(struct kvm_cpuid_entry2) * 
> KVM_MAX_CPUID_ENTRIES);
>  
> +#define SHA384_DIGEST_SIZE  48
> +
> +if (tdx_guest->mrconfigid) {
> +g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
> +  strlen(tdx_guest->mrconfigid), &data_len, 
> errp);
> +if (!data || data_len != SHA384_DIGEST_SIZE) {
> +error_setg(errp, "TDX: failed to decode mrconfigid");
> +return -1;
> +}
> +memcpy(init_vm->mrconfigid, data, data_len);
> +}

When @mrconfigid is absent, the property remains null, and this
conditional is not executed.  init_vm->mrconfigid[], an array of 6
__u64, remains all zero.  How does the kernel treat that?

> +
> +if (tdx_guest->mrowner) {
> +g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
> +  strlen(tdx_guest->mrowner), &data_len, errp);
> +if (!data || data_len != SHA384_DIGEST_SIZE) {
> +error_setg(errp, "TDX: failed to decode mrowner");
> +return -1;
> +}
> +memcpy(init_vm->mrowner, data, data_len);
> +}
> +
> +if (tdx_guest->mrow

Re: [PATCH v5 49/65] i386/tdx: handle TDG.VP.VMCALL

2024-02-29 Thread Markus Armbruster
Xiaoyao Li  writes:

> From: Isaku Yamahata 
>
> Add property "quote-generation-socket" to tdx-guest, which is a property
> of type SocketAddress to specify Quote Generation Service(QGS).
>
> On request of GetQuote, it connects to the QGS socket, read request
> data from shared guest memory, send the request data to the QGS,
> and store the response into shared guest memory, at last notify
> TD guest by interrupt.
>
> command line example:
>   qemu-system-x86_64 \
> -object 
> '{"qom-type":"tdx-guest","id":"tdx0","quote-generation-socket":{"type": 
> "vsock", "cid":"1","port":"1234"}}' \
> -machine confidential-guest-support=tdx0
>
> Note, above example uses vsock type socket because the QGS we used
> implements the vsock socket. It can be other types, like UNIX socket,
> which depends on the implementation of QGS.
>
> To avoid no response from QGS server, setup a timer for the transaction.
> If timeout, make it an error and interrupt guest. Define the threshold of
> time to 30s at present, maybe change to other value if not appropriate.
>
> Signed-off-by: Isaku Yamahata 
> Codeveloped-by: Chenyi Qiang 
> Signed-off-by: Chenyi Qiang 
> Codeveloped-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 

[...]

> diff --git a/qapi/qom.json b/qapi/qom.json
> index cac875349a3a..7b26b0a0d3aa 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -917,13 +917,19 @@
>  # (base64 encoded SHA384 digest). (A default value 0 of SHA384 is
>  # used when absent).
>  #
> +# @quote-generation-socket: socket address for Quote Generation
> +# Service (QGS).  QGS is a daemon running on the host.  User in
> +# TD guest cannot get TD quoting for attestation if QGS is not
> +# provided.  So admin should always provide it.

This makes me wonder why it's optional.  Can you describe a use case for
*not* specifying @quote-generation-socket?

> +#
>  # Since: 9.0
>  ##
>  { 'struct': 'TdxGuestProperties',
>'data': { '*sept-ve-disable': 'bool',
>  '*mrconfigid': 'str',
>  '*mrowner': 'str',
> -'*mrownerconfig': 'str' } }
> +'*mrownerconfig': 'str',
> +'*quote-generation-socket': 'SocketAddress' } }
>  
>  ##
>  # @ThreadContextProperties:

[...]




Re: No virtio devices in SeaBIOS VMs

2024-02-29 Thread Gerd Hoffmann
  Hi,

> UEFI guests seem not to be affected in any way, no matter amount of RAM
> or CPU model (well, of course, since it's a SeaBIOS commit! :-D What I
> mean is that there seems to be nothing in edk2 that induces the same
> behavior).

That used to be a problem with UEFI too.

> A way of working this around (besides switching to UEFI or to cpu=host)
> is to turn on host-phys-bits, e.g., with '<cpu mode="passthrough"/>' in the XML.

Sounds like the phys-bits of your vcpus is larger than the value the
host actually supports.  So if the firmware tries to use the whole
address space available things break.

Both UEFI and SeaBIOS have a similar heuristic to figure whenever they
can trust phys-bits or not, and those checks consider upstream qemu
behavior (use phys-bits=40 for all cpu types except 'host').

When this came up with UEFI the root cause turned out to be that suse
qemu derived from upstream qemu.  There have been phys-bits values other
than 40 which where not valid (i.e. larger than supported by the host).

I don't know how that was solved in the end.  But given that we see
similar problems again with SeaBIOS I suspect it was patched in suse
OVMF not suse qemu.

> It is, however, a bit impractical to have to do this for all the VMs
> that one may have... Especially if they're a lot! :-)

I'd actually recommend to run all VMs with host-phys-bits=on (and use
host-phys-bits-limit=value if you need phys-bits being equal on all
machines of a heterogeneous cluster for live migration compatibility).

phys-bits being too big never was a valid configuration.  It only
happened to work because the firmware was very conservative with address
space usage.  That strategy became increasingly problematic though.
These days GPUs and NPUs can have gigabytes of device memory and equally
large pci memory bars ...

take care,
  Gerd




Re: [PATCH V4 12/14] vfio: allow cpr-reboot migration if suspended

2024-02-29 Thread Cédric Le Goater

On 2/22/24 18:28, Steve Sistare wrote:

Allow cpr-reboot for vfio if the guest is in the suspended runstate.  The
guest drivers' suspend methods flush outstanding requests and re-initialize
the devices, and thus there is no device state to save and restore.  The
user is responsible for suspending the guest before initiating cpr, such as
by issuing guest-suspend-ram to the qemu guest agent.

Relax the vfio blocker so it does not apply to cpr, and add a notifier that
verifies the guest is suspended.

Signed-off-by: Steve Sistare 



Reviewed-by: Cédric Le Goater 

Thanks,

C.





---
  hw/vfio/common.c  |  2 +-
  hw/vfio/cpr.c | 20 
  hw/vfio/migration.c   |  2 +-
  include/hw/vfio/vfio-container-base.h |  1 +
  4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 059bfdc..ff88c3f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -128,7 +128,7 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
  error_setg(&multiple_devices_migration_blocker,
 "Multiple VFIO devices migration is supported only if all of "
 "them support P2P migration");
-ret = migrate_add_blocker(&multiple_devices_migration_blocker, errp);
+ret = migrate_add_blocker_normal(&multiple_devices_migration_blocker, 
errp);
  
  return ret;

  }
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 3bede54..392c2dd 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -7,13 +7,33 @@
  
  #include "qemu/osdep.h"

  #include "hw/vfio/vfio-common.h"
+#include "migration/misc.h"
  #include "qapi/error.h"
+#include "sysemu/runstate.h"
+
+static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier,
+MigrationEvent *e, Error **errp)
+{
+if (e->type == MIG_EVENT_PRECOPY_SETUP &&
+!runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) {
+
+error_setg(errp,
+"VFIO device only supports cpr-reboot for runstate suspended");
+
+return -1;
+}
+return 0;
+}
  
  int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)

  {
+migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
+vfio_cpr_reboot_notifier,
+MIG_MODE_CPR_REBOOT);
  return 0;
  }
  
  void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)

  {
+migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
  }
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 50140ed..2050ac8 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -889,7 +889,7 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error 
*err, Error **errp)
  vbasedev->migration_blocker = error_copy(err);
  error_free(err);
  
-return migrate_add_blocker(&vbasedev->migration_blocker, errp);

+return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
  }
  
  /* -- */

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index b2813b0..3582d5f 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -49,6 +49,7 @@ typedef struct VFIOContainerBase {
  QLIST_ENTRY(VFIOContainerBase) next;
  QLIST_HEAD(, VFIODevice) device_list;
  GList *iova_ranges;
+NotifierWithReturn cpr_reboot_notifier;
  } VFIOContainerBase;
  
  typedef struct VFIOGuestIOMMU {





Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()

2024-02-29 Thread Stefano Garzarella

On Wed, Feb 28, 2024 at 01:01:55PM +0100, David Hildenbrand wrote:

On 28.02.24 12:47, Stefano Garzarella wrote:

Add a new `shm` bool option for `-object memory-backend-file`.

When this option is set to true, the POSIX shm_open(3) is used instead
of open(2).

So a file will not be created in the filesystem, but a "POSIX shared
memory object" will be instantiated. In Linux this turns into a file
in /dev/shm, but in other OSes this may not happen (for example in
macOS or FreeBSD nothing is shown in any filesystem).

This new feature is useful when we need to share guest memory with
another process (e.g. vhost-user backend), but we don't have
memfd_create() or any special filesystems (e.g. /dev/shm) available
as in macOS.

Signed-off-by: Stefano Garzarella 
---
I am not sure this is the best way to support shm_open() in QEMU.

Other solutions I had in mind were:

- create a new memory-backend-shm

- extend memory-backend-memfd to use shm_open() on systems where memfd is
not available (problem: shm_open wants a name to assign to the object, but
we can do a workaround by using a random name and do the unlink right away)

Any preference/suggestion?


Both sound like reasonable options, and IMHO better than hostmem-file 
with things that are not necessarily "real" files.


Yeah, I see.



Regarding memory-backend-memfd, we similarly have to pass a name to 
memfd_create(), although for different purpose: "The  name  supplied 
in name is used as a filename and will be displayed as the target of 
the corresponding symbolic link in the directory /proc/self/fd/".


So we simply pass TYPE_MEMORY_BACKEND_MEMFD.


Okay, so I guess it must be unique only in the process, while for 
shm_open() it is global.




Likely, memory-backend-shm that directly maps to shm_open() and only 
provides properties reasonable for shm_open() is the cleanest 
approach. So that would currently be my preference :)


Thank you for your thoughts, I think I will go toward this direction 
(memory-backend-shm).


It was also my first choice, but in order to have a working RFC right 
away, I modified memory-backend-file.


Stefano




Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()

2024-02-29 Thread Stefano Garzarella

On Wed, Feb 28, 2024 at 12:08:37PM +, Daniel P. Berrangé wrote:

On Wed, Feb 28, 2024 at 12:47:59PM +0100, Stefano Garzarella wrote:

Add a new `shm` bool option for `-object memory-backend-file`.

When this option is set to true, the POSIX shm_open(3) is used instead
of open(2).

So a file will not be created in the filesystem, but a "POSIX shared
memory object" will be instantiated. In Linux this turns into a file
in /dev/shm, but in other OSes this may not happen (for example in
macOS or FreeBSD nothing is shown in any filesystem).

This new feature is useful when we need to share guest memory with
another process (e.g. vhost-user backend), but we don't have
memfd_create() or any special filesystems (e.g. /dev/shm) available
as in macOS.

Signed-off-by: Stefano Garzarella 
---
I am not sure this is the best way to support shm_open() in QEMU.

Other solutions I had in mind were:

- create a new memory-backend-shm

- extend memory-backend-memfd to use shm_open() on systems where memfd is
not available (problem: shm_open wants a name to assign to the object, but
we can do a workaround by using a random name and do the unlink right away)


IMHO, create a new memory-backend-shm, don't overload memory-backend-memfd,
as this lets users choose between shm & memfd, even on Linux.


Yeah, good point!
I think there's enough of a consensus on adding memory-backend-shm, so 
I'm going to go toward that direction in v2.


Thanks,
Stefano




Re: [PATCH v5 52/65] i386/tdx: Wire TDX_REPORT_FATAL_ERROR with GuestPanic facility

2024-02-29 Thread Markus Armbruster
Xiaoyao Li  writes:

> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>
> Originated-from: Isaku Yamahata 
> Signed-off-by: Xiaoyao Li 
> ---
> Changes in v5:
> - mention additional error information in gpa when it presents;
> - refine the documentation; (Markus)
>
> Changes in v4:
> - refine the documentation; (Markus)
>
> Changes in v3:
> - Add documentation of new type and struct; (Daniel)
> - refine the error message handling; (Daniel)
> ---
>  qapi/run-state.json   | 31 +--
>  system/runstate.c | 58 +++
>  target/i386/kvm/tdx.c | 24 +-
>  3 files changed, 110 insertions(+), 3 deletions(-)
>
> diff --git a/qapi/run-state.json b/qapi/run-state.json
> index dd0770b379e5..b71dd1884eb6 100644
> --- a/qapi/run-state.json
> +++ b/qapi/run-state.json
> @@ -483,10 +483,12 @@
>  #
>  # @s390: s390 guest panic information type (Since: 2.12)
>  #
> +# @tdx: tdx guest panic information type (Since: 9.0)
> +#
>  # Since: 2.9
>  ##
>  { 'enum': 'GuestPanicInformationType',
> -  'data': [ 'hyper-v', 's390' ] }
> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>  
>  ##
>  # @GuestPanicInformation:
> @@ -501,7 +503,8 @@
>   'base': {'type': 'GuestPanicInformationType'},
>   'discriminator': 'type',
>   'data': {'hyper-v': 'GuestPanicInformationHyperV',
> -  's390': 'GuestPanicInformationS390'}}
> +  's390': 'GuestPanicInformationS390',
> +  'tdx' : 'GuestPanicInformationTdx'}}
>  
>  ##
>  # @GuestPanicInformationHyperV:
> @@ -564,6 +567,30 @@
>'psw-addr': 'uint64',
>'reason': 'S390CrashReason'}}
>  
> +##
> +# @GuestPanicInformationTdx:
> +#
> +# TDX Guest panic information specific to TDX, as specified in the
> +# "Guest-Hypervisor Communication Interface (GHCI) Specification",
> +# section TDG.VP.VMCALL.
> +#
> +# @error-code: TD-specific error code
> +#
> +# @message: Human-readable error message provided by the guest. Not
> +# to be trusted.
> +#
> +# @gpa: guest-physical address of a page that contains more verbose
> +# error information, as zero-terminated string.  Present when the
> +# "GPA valid" bit (bit 63) is set in @error-code.

Uh, peeking at GHCI Spec section 3.4 TDG.VP.VMCALL, I
see operand R12 consists of

bitsnamedescription
31:0TD-specific error code  TD-specific error code
Panic – 0x0.
Values – 0x1 to 0xFFFFFFFF
reserved.
62:32   TD-specific extendedTD-specific extended error code.
error code  TD software defined.
63  GPA Valid   Set if the TD specified additional
information in the GPA parameter
(R13).

Is @error-code all of R12, or just bits 31:0?

If it's all of R12, description of @error-code as "TD-specific error
code" is misleading.

If it's just bits 31:0, then 'Present when the "GPA valid" bit (bit 63)
is set in @error-code' is wrong.  Could go with 'Only present when the
guest provides this information'.

> +#
> +#

Drop one of these two lines, please.

> +# Since: 9.0
> +##
> +{'struct': 'GuestPanicInformationTdx',
> + 'data': {'error-code': 'uint64',
> +  'message': 'str',
> +  '*gpa': 'uint64'}}
> +
>  ##
>  # @MEMORY_FAILURE:
>  #




Re: [PATCH v5 12/65] i386: Introduce tdx-guest object

2024-02-29 Thread Markus Armbruster
Xiaoyao Li  writes:

> Introduce tdx-guest object which inherits CONFIDENTIAL_GUEST_SUPPORT,
> and will be used to create TDX VMs (TDs) by
>
>   qemu -machine ...,confidential-guest-support=tdx0   \
>-object tdx-guest,id=tdx0
>
> So far, it has no QAPI member/property declared and only one internal
> member 'attributes' with fixed value 0 that is not configurable.
>
> QAPI properties will be added later.
>
> Signed-off-by: Xiaoyao Li 
> Acked-by: Gerd Hoffmann 
> Acked-by: Markus Armbruster 

I'm happy with the commit message now.  Thanks!




Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()

2024-02-29 Thread Stefano Garzarella

On Wed, Feb 28, 2024 at 01:32:17PM +0100, Markus Armbruster wrote:

Stefano Garzarella  writes:


Add a new `shm` bool option for `-object memory-backend-file`.

When this option is set to true, the POSIX shm_open(3) is used instead
of open(2).

So a file will not be created in the filesystem, but a "POSIX shared
memory object" will be instantiated. In Linux this turns into a file
in /dev/shm, but in other OSes this may not happen (for example in
macOS or FreeBSD nothing is shown in any filesystem).

This new feature is useful when we need to share guest memory with
another process (e.g. vhost-user backend), but we don't have
memfd_create() or any special filesystems (e.g. /dev/shm) available
as in macOS.

Signed-off-by: Stefano Garzarella 
---
I am not sure this is the best way to support shm_open() in QEMU.

Other solutions I had in mind were:

- create a new memory-backend-shm


How would that look like?  Would it involve duplicating code?


I was looking at it just now, and apart from some boilerplate code to 
create the object, the rest in the end is pretty specific and a lot of 
things in memory-backend-file wouldn't be supported by 
memory-backend-shm anyway, so I'll give it a try for v2 by adding it.





- extend memory-backend-memfd to use shm_open() on systems where memfd is
not available (problem: shm_open wants a name to assign to the object, but
we can do a workaround by using a random name and do the unlink right away)


Hmm.  Too much magic?  I don't know...


Yeah, I agree.




Any preference/suggestion?


[...]


diff --git a/qapi/qom.json b/qapi/qom.json
index 2a6e49365a..bfb01b909f 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -682,6 +682,9 @@

  # @mem-path: the path to either a shared memory or huge page
  # filesystem mount

Does this need adjustment?


Good point. For now I think I will drop this patch and add 
memory-backend-shm in v2, but if I go back I will fix it!




[...]


 #   writable RAM instead of ROM, and want to set this property to 'off'.
 #   (default: auto, since 8.2)
 #
+# @shm: if true, shm_open(3) is used to create/open POSIX shared memory
+#   object; if false, an open(2) is used. (default: false) (since 9.0)
+#


Please format like this for consistency:


Sure.



# @shm: if true, shm_open(3) is used to create/open POSIX shared memory
# object; if false, an open(2) is used (default: false) (since 9.0)


I just noticed that I followed the property just above (@rom). Should we 
fix that one?


Thanks,
Stefano




Re: [PATCH RFC 0/3] Support GM/T 0018-2012 cryptographic standard

2024-02-29 Thread Daniel P . Berrangé
On Sat, Feb 24, 2024 at 10:34:55PM +0800, Hyman Huang wrote:
> This patchset introduce GM/T 0018-2012 as a crypto backend driver,
> which is applied for block encryption. Currently, we support SM4
> cipher algorithm only.
> 
> GM/T 0018-2012 is a cryptographic standard issued by the State
> Cryptography Administration of China. Visit https://hbba.sacinfo.org.cn
> search GM/T 0018-2012 for brief introduction.
> 
> The objective of the standard is to develop a uniform application
> interface standard for the service-based cryptography device under
> the public key cryptographic infrastructure application framework,
> and to call the cryptography device through this interface to
> provide basic cryptographic services for the uppler layer. For
> more information about contents of the standard, download the
> specification from:
> "https://github.com/guanzhi/GM-Standards/blob/master/GMT密码行标/
> GMT 00018-2012 密码设备应用接口规范.pdf"
> 
> There are two benefits to doing this, at least.
>  * Performance - using a cryptography device for block encryption
>  offers an opportunity to enhance the input/output
>  performance once the hardware is certified
>  * Secrecy - hardware manufacturers may fortify cryptography
>  equipment with security features, so increasing the
>  secrecy of block encryption.
> 
> The precise way that vendors implement the standard APIs for data
> encryption using the cryptographic device is uncoupled from the
> GM/T 0018-2012 specification. Thus, if developers enable this
> functionality with the following conditions met, we could accomplish
> the general implementation:
> 
> 1. rename the header file provided by vendor to gmt-0018-2012.h
>and copy it to the /usr/include directory.
> 2. rename the dynamic library provided by vendor to
>gmt_0018_2012.so and copy it to the /usr/lib64 or any directory
>that linker could find before compiling QEMU.
> 3. enable crypto_gmt option when compiling QEMU and make the feature
>available.
> 
> By offering a development package for GM/T 0018-2012, the above
> provisions could be standardized; unfortunately, the hardware
> manufacturer has not completed this task. So developers who don't
> work with the vendor to obtain the cryptography device and related
> library may not be able to test this functionality because the
> standard implementation depends on the cryptography device supplied
> by the hardware vendor. We are hesitant to contribute to this series
> as a result.

Hmm, yes, that is a pretty unpleasant approach.

IMHO there really needs to be a reference implementation that is
pure software. eg a gmt_0018_2012.so + header files that simply
uses an existing crypto library. That way applications can build
and test their support for this, without having to have access
to a specific piece of hardware. Hardware vendors should only
have to provide their library impl, not the headers.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v1 3/8] aspeed/sdmc: Add AST2700 support

2024-02-29 Thread Philippe Mathieu-Daudé

Hi Jamin,

On 29/2/24 08:23, Jamin Lin via wrote:

The SDRAM memory controller(DRAMC) controls the access to external
DDR4 and DDR5 SDRAM and power up to DDR4 and DDR5 PHY.

The DRAM memory controller of AST2700 is not backward compatible
to previous chips such AST2600, AST2500 and AST2400.

Max memory is now 8GiB on the AST2700. Introduce new
aspeed_2700_sdmc and class with read/write operation and
reset handlers.

Define DRAMC necessary protected registers and
unprotected registers for AST2700 and increase
the register set to 0x1000.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
  hw/misc/aspeed_sdmc.c | 215 ++
  include/hw/misc/aspeed_sdmc.h |   4 +-
  2 files changed, 198 insertions(+), 21 deletions(-)




@@ -231,7 +270,10 @@ static void aspeed_sdmc_realize(DeviceState *dev, Error 
**errp)
  AspeedSDMCState *s = ASPEED_SDMC(dev);
  AspeedSDMCClass *asc = ASPEED_SDMC_GET_CLASS(s);
  
-assert(asc->max_ram_size < 4 * GiB); /* 32-bit address bus */

+if (!asc->is_aarch64) {


Maybe name it 'bus64bit'? Because this isn't really related
to Aarch64.


+assert(asc->max_ram_size < 4 * GiB); /* 32-bit address bus */
+}




+static void aspeed_2700_sdmc_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+AspeedSDMCClass *asc = ASPEED_SDMC_CLASS(klass);
+
+dc->desc = "ASPEED 2700 SDRAM Memory Controller";
+dc->reset = aspeed_2700_sdmc_reset;
+
+asc->is_aarch64 = true;
+asc->max_ram_size = 8 * GiB;
+asc->compute_conf = aspeed_2700_sdmc_compute_conf;
+asc->write = aspeed_2700_sdmc_write;
+asc->valid_ram_sizes = aspeed_2700_ram_sizes;
+}




@@ -51,6 +52,7 @@ struct AspeedSDMCClass {
  const uint64_t *valid_ram_sizes;
  uint32_t (*compute_conf)(AspeedSDMCState *s, uint32_t data);
  void (*write)(AspeedSDMCState *s, uint32_t reg, uint32_t data);
+uint32_t is_aarch64;


bool.


  };
  
  #endif /* ASPEED_SDMC_H */





Re: [PATCH v1 6/8] aspeed/intc: Add AST2700 support

2024-02-29 Thread Philippe Mathieu-Daudé

Hi Jamin,

On 29/2/24 08:23, Jamin Lin via wrote:

AST2700 interrupt controller(INTC) provides hardware interrupt interfaces
to interrupt of processors PSP, SSP and TSP. In INTC, each interrupt of
INT 128 to INT136 combines 32 interrupts.

Introduce a new aspeed_intc class with instance_init and realize handlers.

QEMU supports ARM Generic Interrupt Controller, version 3(GICv3)
but not support Shared Peripheral Interrupt (SPI), yet.
This patch adds a workaround to set GICINT132[18], which is the BMC UART
interrupt, when GICINT132 is received, so users are able to type any key from
the keyboard to trigger the GICINT132 interrupt until the AST2700 boots into
the login prompt. It is a temporary solution.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
  hw/intc/aspeed_intc.c| 135 +++
  hw/intc/meson.build  |   1 +
  include/hw/intc/aspeed_vic.h |  29 
  3 files changed, 165 insertions(+)
  create mode 100644 hw/intc/aspeed_intc.c




+#define TO_REG(N) (N >> 2)




+static const MemoryRegionOps aspeed_intc_ops = {
+.read = aspeed_intc_read,
+.write = aspeed_intc_write,
+.endianness = DEVICE_LITTLE_ENDIAN,


Please be also explicit with the implementation:

  .impl.min_access_size = 4,
  .impl.max_access_size = 4,


+.valid.min_access_size = 4,
+.valid.max_access_size = 4,
+.valid.unaligned = false,
+};





Re: [PATCH v5 3/4] hw: Set virtio-iommu aw-bits default value on pc_q35 and arm virt

2024-02-29 Thread Igor Mammedov
On Thu, 15 Feb 2024 09:42:13 +0100
Eric Auger  wrote:

> Currently the default input range can extend to 64 bits. On x86,
> when the virtio-iommu protects vfio devices, the physical iommu
> may support only 39 bits. Let's set the default to 39, as done
> for the intel-iommu. On ARM we set 48b as a default (matching
> SMMUv3 SMMU_IDR5.VAX == 0).
> 
> We use hw_compat_8_2 to handle the compatibility for machines
> before 9.0 which used to have a virtio-iommu default input range
> of 64 bits.

so we have different defaults per target/machine
while open coding the fixup in _pre_plug_ works, it's
a bit of an unexpected place to manage defaults, and it
avoids adding 0 magic.

How about using compat machinery instead to set
machine dependent defaults:
For example:

pc_i440fx_machine_options(MachineClass *m)
{
...
+compat_props_add(m->compat_props, 
pc_compat_defaults,pc_compat_defaults_len);
 }

> Of course if aw-bits is set from the command line, the default
> is overriden.
> 
> Signed-off-by: Eric Auger 
> Reviewed-by: Zhenzhong Duan 
> Tested-by: Yanghang Liu
> 
> ---
> 
> v3 -> v4:
> - update the qos test to relax the check on the max input IOVA
> 
> v2 -> v3:
> - collected Zhenzhong's R-b
> - use &error_abort instead of NULL error handle
>   on object_property_get_uint() call (Cédric)
> - use VTD_HOST_AW_39BIT (Cédric)
> 
> v1 -> v2:
> - set aw-bits to 48b on ARM
> - use hw_compat_8_2 to handle the compat for older machines
>   which used 64b as a default
> ---
>  hw/arm/virt.c   | 6 ++
>  hw/core/machine.c   | 5 -
>  hw/i386/pc.c| 6 ++
>  hw/virtio/virtio-iommu.c| 2 +-
>  tests/qtest/virtio-iommu-test.c | 2 +-
>  5 files changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 368c2a415a..0994f2a560 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -2716,10 +2716,16 @@ static void 
> virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
>  virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), 
> errp);
>  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
> +uint8_t aw_bits = object_property_get_uint(OBJECT(dev),
> +   "aw-bits", &error_abort);
>  hwaddr db_start = 0, db_end = 0;
>  QList *reserved_regions;
>  char *resv_prop_str;
>  
> +if (!aw_bits) {
> +qdev_prop_set_uint8(dev, "aw-bits", 48);

s/48/macro name/?

> +}
>
>
>  if (vms->iommu != VIRT_IOMMU_NONE) {
>  error_setg(errp, "virt machine does not support multiple 
> IOMMUs");
>  return;
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index fb5afdcae4..70ac96954c 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -30,9 +30,12 @@
>  #include "exec/confidential-guest-support.h"
>  #include "hw/virtio/virtio-pci.h"
>  #include "hw/virtio/virtio-net.h"
> +#include "hw/virtio/virtio-iommu.h"
>  #include "audio/audio.h"
>  
> -GlobalProperty hw_compat_8_2[] = {};
> +GlobalProperty hw_compat_8_2[] = {
> +{ TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
> +};
>  const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
>  
>  GlobalProperty hw_compat_8_1[] = {
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 196827531a..ee2d379c90 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1456,6 +1456,8 @@ static void 
> pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
>  virtio_md_pci_pre_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), 
> errp);
>  } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
> +uint8_t aw_bits = object_property_get_uint(OBJECT(dev),
> +   "aw-bits", &error_abort);
>  /* Declare the APIC range as the reserved MSI region */
>  char *resv_prop_str = g_strdup_printf("0xfee0:0xfeef:%d",
>VIRTIO_IOMMU_RESV_MEM_T_MSI);
> @@ -1464,6 +1466,10 @@ static void 
> pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
>  qlist_append_str(reserved_regions, resv_prop_str);
>  qdev_prop_set_array(dev, "reserved-regions", reserved_regions);
>  
> +if (!aw_bits) {
> +qdev_prop_set_uint8(dev, "aw-bits", VTD_HOST_AW_39BIT);
> +}
> +
>  g_free(resv_prop_str);
>  }
>  
> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
> index 8b541de850..2ec5ef3cd1 100644
> --- a/hw/virtio/virtio-iommu.c
> +++ b/hw/virtio/virtio-iommu.c
> @@ -1526,7 +1526,7 @@ static Property virtio_iommu_properties[] = {
>  DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
>   TYPE_PCI_BUS, PCIBus *),
>  DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_byp

[PATCH] plugins: Ensure register handles are not NULL

2024-02-29 Thread Akihiko Odaki
Ensure register handles are not NULL so that a plugin can assume NULL is
invalid as a register handle.

Signed-off-by: Akihiko Odaki 
---
 plugins/api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/plugins/api.c b/plugins/api.c
index 81f43c9ce8a4..74e24f0697cd 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -442,7 +442,7 @@ static GArray *create_register_handles(GArray *gdbstub_regs)
 }
 
 /* Create a record for the plugin */
-desc.handle = GINT_TO_POINTER(grd->gdb_reg);
+desc.handle = GINT_TO_POINTER(grd->gdb_reg + 1);
 desc.name = g_intern_string(grd->name);
 desc.feature = g_intern_string(grd->feature_name);
 g_array_append_val(find_data, desc);
@@ -463,5 +463,5 @@ int qemu_plugin_read_register(struct qemu_plugin_register 
*reg, GByteArray *buf)
 {
 g_assert(current_cpu);
 
-return gdb_read_register(current_cpu, buf, GPOINTER_TO_INT(reg));
+return gdb_read_register(current_cpu, buf, GPOINTER_TO_INT(reg) - 1);
 }

---
base-commit: bfe8020c814a30479a4241aaa78b63960655962b
change-id: 20240229-null-841efa023c93

Best regards,
-- 
Akihiko Odaki 




Re: [PATCH 4/4] tcg/optimize: optimize TSTNE using smask and zmask

2024-02-29 Thread Paolo Bonzini

On 2/29/24 00:10, Richard Henderson wrote:

On 2/28/24 01:11, Paolo Bonzini wrote:

-    /* TSTNE x,sign -> LT x,0 */
-    if (arg_is_const_val(*p2, (ctx->type == TCG_TYPE_I32
-   ? INT32_MIN : INT64_MIN))) {
+    /* TSTNE x,i -> LT x,0 if i only includes sign bit copies */
+    if (arg_is_const(*p2) && (arg_info(*p2)->val & ~i1->s_mask) == 0) {


This is a good idea, but s_mask isn't defined like you think -- it is 
*repetitions* of the sign bit, but not including the sign bit itself.  
For INT64_MIN, s_mask == 0.


So for TSTNE min,min, (min & ~0) != 0, so the test won't pass.


Oh! So I have to squash:

diff --git a/tcg/optimize.c b/tcg/optimize.c
index ab976a5bbe7..44d1b1a6d8a 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -140,6 +140,12 @@ static inline bool arg_is_const_val(TCGArg arg, uint64_t 
val)
 return ts_is_const_val(arg_temp(arg), val);
 }
 
+/* Calculate all the copies of the sign bit, both redundant and not. */

+static inline uint64_t all_sign_bit_copies(TempOptInfo *info)
+{
+return (info->s_mask >> 1) | INT64_MIN;
+}
+
 static inline bool ts_is_copy(TCGTemp *ts)
 {
 return ts_info(ts)->next_copy != ts;
@@ -825,7 +831,7 @@ static int do_constant_folding_cond1(OptContext *ctx, TCGOp 
*op, TCGArg dest,
 }
 
 /* TSTNE x,i -> LT x,0 if i only includes sign bit copies */

-if (arg_is_const(*p2) && (arg_info(*p2)->val & ~i1->s_mask) == 0) {
+if (arg_is_const(*p2) && (arg_info(*p2)->val & ~all_sign_bit_copies(i1)) 
== 0) {
 *p2 = arg_new_constant(ctx, 0);
 *pcond = tcg_tst_ltge_cond(cond);
 return -1;


I tested with

   movq $0x8000, %rbx
   test %ebx, %ebx
   js y

and I get

 brcond_i64 cc_dst,$0x8000,tstne,$L1

which works and matches your explanation:

 i1.s_mask == 0x
 i2.val == 0x8000
 all_sign_bit_copies(i1) == 0x8000
 u2.val & ~all_sign_bit_copies(i1) == 0

Thanks!

Paolo




Re: [PATCH] hw/scsi/lsi53c895a: add hack to prevent scsi timeouts in HP-UX 10.20

2024-02-29 Thread Peter Maydell
On Wed, 28 Feb 2024 at 21:12, Sven Schnelle  wrote:
>
> HP-UX 10.20 seems to make the lsi53c895a spin on a memory location
> under certain circumstances. As the SCSI controller and CPU are not
> running at the same time this loop will never finish. After some
> time, the check loop interrupts with an unexpected device disconnect.
> This works, but is slow because the kernel resets the scsi controller.
> Instead of signaling UDC, add an option 'hpux-spin-workaround' which
> emulates a INTERRUPT 2 script instruction. This instruction tells the
> kernel that the request was fulfilled. With this change, SCSI speeds
> improves significantly.
>
> The option can be enabled by adding
>
> -global lsi53c895a.hpux-spin-workaround=on
>
> to the qemu commandline.
>
> Signed-off-by: Sven Schnelle 
> ---
>  hw/scsi/lsi53c895a.c | 20 ++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
> index d607a5f9fb..20c353f594 100644
> --- a/hw/scsi/lsi53c895a.c
> +++ b/hw/scsi/lsi53c895a.c
> @@ -304,6 +304,7 @@ struct LSIState {
>  uint32_t adder;
>
>  uint8_t script_ram[2048 * sizeof(uint32_t)];
> +bool hpux_spin_workaround;
>  };
>
>  #define TYPE_LSI53C810  "lsi53c810"
> @@ -1156,8 +1157,17 @@ again:
>  qemu_log_mask(LOG_GUEST_ERROR,
>"lsi_scsi: inf. loop with UDC masked");
>  }
> -lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
> -lsi_disconnect(s);
> +if (s->hpux_spin_workaround) {
> +/*
> + * Workaround for HP-UX 10.20: Instead of disconnecting, which
> + * causes a long delay, emulate a INTERRUPT 2 instruction.
> + */
> +s->dsps = 2;
> +lsi_script_dma_interrupt(s, LSI_DSTAT_SIR);
> +} else {
> +lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
> +lsi_disconnect(s);
> +}
>  trace_lsi_execute_script_stop();
>  reentrancy_level--;
>  return;


I see we already have a hacky workaround for other OSes
that do something similar. The ideal fix for both of these
I think would be for lsi_execute_script() to, instead of stopping,
arrange to defer executing more script instructions until
after the guest has had a chance to run a bit more.
I think setting a timer that calls lsi_resume_script() after
a while would have that effect.

-- PMM



Re: [PATCH v1 7/8] aspeed/soc: Add AST2700 support

2024-02-29 Thread Philippe Mathieu-Daudé

Hi Jamin,

On 29/2/24 08:23, Jamin Lin via wrote:

Initial definitions for a simple machine using an AST2700 SOC (Cortex-a35 CPU).

AST2700 SOC and its interrupt controller are too complex to handle
in the common Aspeed SoC framework. We introduce a new ast2700
class with instance_init and realize handlers.

AST2700 is a 64-bit quad-core CPU and supports 8 watchdogs.
Update maximum ASPEED_CPUS_NUM to 4 and ASPEED_WDTS_NUM to 8.
In addition, update AspeedSocState to support scuio, sli, sliio and intc.

Update silicon_rev data type to 64bits from AspeedSoCClass and
add TYPE_ASPEED27X0_SOC machine type.

Signed-off-by: Troy Lee 
Signed-off-by: Jamin Lin 
---
  hw/arm/aspeed_ast27x0.c | 462 
  hw/arm/meson.build  |   1 +
  include/hw/arm/aspeed_soc.h |  26 +-
  3 files changed, 486 insertions(+), 3 deletions(-)
  create mode 100644 hw/arm/aspeed_ast27x0.c




+#define AST2700_MAX_IRQ 288
+
+/* Shared Peripheral Interrupt values below are offset by -32 from datasheet */
+static const int aspeed_soc_ast2700_irqmap[] = {
+[ASPEED_DEV_UART0] = 132,
+[ASPEED_DEV_UART1] = 132,
+[ASPEED_DEV_UART2] = 132,
+[ASPEED_DEV_UART3] = 132,
+[ASPEED_DEV_UART4] = 8,
+[ASPEED_DEV_UART5] = 132,
+[ASPEED_DEV_UART6] = 132,
+[ASPEED_DEV_UART7] = 132,
+[ASPEED_DEV_UART8] = 132,
+[ASPEED_DEV_UART9] = 132,
+[ASPEED_DEV_UART10]= 132,
+[ASPEED_DEV_UART11]= 132,
+[ASPEED_DEV_UART12]= 132,


When multiple devices output IRQ lines are connected to the same
input one, a IRQ OR gate has to be used.

See previous explanations here:
https://lore.kernel.org/qemu-devel/5a7594d9-3fbd-4d90-a5f9-81b7b845f...@linaro.org/

(Pre-existing issue in aspeed_soc_ast2600_irqmap[])


+[ASPEED_DEV_FMC]   = 131,
+[ASPEED_DEV_SDMC]  = 0,
+[ASPEED_DEV_SCU]   = 12,
+[ASPEED_DEV_ADC]   = 130,
+[ASPEED_DEV_XDMA]  = 5,
+[ASPEED_DEV_EMMC]  = 15,
+[ASPEED_DEV_GPIO]  = 11,
+[ASPEED_DEV_GPIO_1_8V] = 130,
+[ASPEED_DEV_RTC]   = 13,
+[ASPEED_DEV_TIMER1]= 16,
+[ASPEED_DEV_TIMER2]= 17,
+[ASPEED_DEV_TIMER3]= 18,
+[ASPEED_DEV_TIMER4]= 19,
+[ASPEED_DEV_TIMER5]= 20,
+[ASPEED_DEV_TIMER6]= 21,
+[ASPEED_DEV_TIMER7]= 22,
+[ASPEED_DEV_TIMER8]= 23,
+[ASPEED_DEV_WDT]   = 131,
+[ASPEED_DEV_PWM]   = 131,
+[ASPEED_DEV_LPC]   = 128,
+[ASPEED_DEV_IBT]   = 128,
+[ASPEED_DEV_I2C]   = 130,
+[ASPEED_DEV_PECI]  = 133,
+[ASPEED_DEV_ETH1]  = 132,
+[ASPEED_DEV_ETH2]  = 132,
+[ASPEED_DEV_ETH3]  = 132,
+[ASPEED_DEV_HACE]  = 4,
+[ASPEED_DEV_KCS]   = 128,
+[ASPEED_DEV_DP]= 28,
+[ASPEED_DEV_I3C]   = 131,
+};





Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Peter Maydell
On Wed, 28 Feb 2024 at 19:07, Heinrich Schuchardt
 wrote:
>
> On 28.02.24 19:39, Peter Maydell wrote:
> > The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> > which was the first implementation of this function. I don't think
> > there's a particular reason for that value beyond that it was
> > probably a convenient value that was assumed to be likely "big enough".
> >
> > I think the idea with this bounce-buffer has always been that this
> > isn't really a code path we expected to end up in very often --
> > it's supposed to be for when devices are doing DMA, which they
> > will typically be doing to memory (backed by host RAM), not
> > devices (backed by MMIO and needing a bounce buffer). So the
> > whole mechanism is a bit "last fallback to stop things breaking
> > entirely".
> >
> > The address_space_map() API says that it's allowed to return
> > a subset of the range you ask for, so if the virtio code doesn't
> > cope with the minimum being set to TARGET_PAGE_SIZE then either
> > we need to fix that virtio code or we need to change the API
> > of this function. (But I think you will also get a reduced
> > range if you try to use it across a boundary between normal
> > host-memory-backed RAM and a device MemoryRegion.)
>
> If we allow a bounce buffer only to be used once (via the in_use flag),
> why do we allow only a single bounce buffer?
>
> Could address_space_map() allocate a new bounce buffer on every call and
> address_space_unmap() deallocate it?
>
> Isn't the design with a single bounce buffer bound to fail with a
> multi-threaded client as collision can be expected?

Yeah, I don't suppose multi-threaded was particularly expected.
Again, this is really a "handle the case where the guest does
something silly" setup, which is why only one bounce buffer.

Why is your guest ending up in the bounce-buffer path?

-- PMM



Re: [PATCH v4 08/15] spapr: nested: Introduce H_GUEST_CREATE_VCPU hcall.

2024-02-29 Thread Harsh Prateek Bora




On 2/27/24 15:21, Nicholas Piggin wrote:

On Tue Feb 20, 2024 at 6:36 PM AEST, Harsh Prateek Bora wrote:

Introduce the nested PAPR hcall H_GUEST_CREATE_VCPU which is used to
create and initialize the specified VCPU resource for the previously
created guest. Each guest can have multiple VCPUs, up to a maximum of 2048.
All VCPUs for a guest get deallocated on guest delete.

Signed-off-by: Michael Neuling 
Signed-off-by: Harsh Prateek Bora 
---
  include/hw/ppc/spapr.h|  2 +
  include/hw/ppc/spapr_nested.h | 10 
  hw/ppc/spapr_nested.c | 96 +++
  3 files changed, 108 insertions(+)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index c4a79a1785..82b077bdd2 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -365,6 +365,7 @@ struct SpaprMachineState {
  #define H_UNSUPPORTED -67
  #define H_OVERLAP -68
  #define H_STATE   -75
+#define H_IN_USE  -77
  #define H_UNSUPPORTED_FLAG -256
  #define H_MULTI_THREADS_ACTIVE -9005
  
@@ -587,6 +588,7 @@ struct SpaprMachineState {

  #define H_GUEST_GET_CAPABILITIES 0x460
  #define H_GUEST_SET_CAPABILITIES 0x464
  #define H_GUEST_CREATE   0x470
+#define H_GUEST_CREATE_VCPU  0x474
  #define H_GUEST_DELETE   0x488
  
  #define MAX_HCALL_OPCODE H_GUEST_DELETE

diff --git a/include/hw/ppc/spapr_nested.h b/include/hw/ppc/spapr_nested.h
index f282479275..24e87bca08 100644
--- a/include/hw/ppc/spapr_nested.h
+++ b/include/hw/ppc/spapr_nested.h
@@ -14,6 +14,8 @@ typedef struct SpaprMachineStateNested {
  
  typedef struct SpaprMachineStateNestedGuest {

  uint32_t pvr_logical;
+unsigned long vcpus;
+struct SpaprMachineStateNestedGuestVcpu *vcpu;
  } SpaprMachineStateNestedGuest;
  
  /* Nested PAPR API related macros */

@@ -27,6 +29,7 @@ typedef struct SpaprMachineStateNestedGuest {
  #define H_GUEST_CAP_P10_MODE_BMAP 2
  #define PAPR_NESTED_GUEST_MAX 4096
  #define H_GUEST_DELETE_ALL_FLAG   0x8000ULL
+#define PAPR_NESTED_GUEST_VCPU_MAX2048
  
  /*

   * Register state for entering a nested guest with H_ENTER_NESTED.
@@ -118,8 +121,15 @@ struct nested_ppc_state {
  uint64_t ppr;
  
  int64_t tb_offset;

+/* Nested PAPR API */
+uint64_t pvr;
  };
  
+typedef struct SpaprMachineStateNestedGuestVcpu {

+bool enabled;
+struct nested_ppc_state state;
+} SpaprMachineStateNestedGuestVcpu;
+
  void spapr_exit_nested(PowerPCCPU *cpu, int excp);
  typedef struct SpaprMachineState SpaprMachineState;
  bool spapr_get_pate_nested_hv(SpaprMachineState *spapr, PowerPCCPU *cpu,
diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index 09c4a35908..3cc704adda 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -428,6 +428,41 @@ void spapr_exit_nested(PowerPCCPU *cpu, int excp)
  }
  }
  
+static

+SpaprMachineStateNestedGuest *spapr_get_nested_guest(SpaprMachineState *spapr,
+ target_ulong guestid)
+{
+SpaprMachineStateNestedGuest *guest;
+
+guest = g_hash_table_lookup(spapr->nested.guests, 
GINT_TO_POINTER(guestid));
+return guest;
+}
+
+static bool spapr_nested_vcpu_check(SpaprMachineStateNestedGuest *guest,
+target_ulong vcpuid)
+{
+struct SpaprMachineStateNestedGuestVcpu *vcpu;
+/*
+ * Perform sanity checks for the provided vcpuid of a guest.
+ * For now, ensure its valid, allocated and enabled for use.
+ */
+
+if (vcpuid >= PAPR_NESTED_GUEST_VCPU_MAX) {
+return false;
+}
+
+if (!(vcpuid < guest->vcpus)) {
+return false;
+}
+
+vcpu = &guest->vcpu[vcpuid];
+if (!vcpu->enabled) {
+return false;
+}
+
+return true;
+}
+
  static target_ulong h_guest_get_capabilities(PowerPCCPU *cpu,
   SpaprMachineState *spapr,
   target_ulong opcode,
@@ -518,6 +553,7 @@ static void
  destroy_guest_helper(gpointer value)
  {
  struct SpaprMachineStateNestedGuest *guest = value;
+g_free(guest->vcpu);
  g_free(guest);
  }
  
@@ -613,6 +649,65 @@ static target_ulong h_guest_delete(PowerPCCPU *cpu,

  return H_SUCCESS;
  }
  
+static target_ulong h_guest_create_vcpu(PowerPCCPU *cpu,

+SpaprMachineState *spapr,
+target_ulong opcode,
+target_ulong *args)
+{
+CPUPPCState *env = &cpu->env;
+struct nested_ppc_state *l2_state;
+target_ulong flags = args[0];
+target_ulong guestid = args[1];
+target_ulong vcpuid = args[2];
+SpaprMachineStateNestedGuest *guest;
+
+if (flags) { /* don't handle any flags for now */
+return H_UNSUPPORTED_FLAG;
+}
+
+guest = spapr_get_nested_guest(spapr, guestid);
+if (!guest) {
+return H_P2;
+}
+
+if (vcpuid < guest->v

Re: ISO C90 compilation error

2024-02-29 Thread Philippe Mathieu-Daudé

On 29/2/24 08:59, Daniel P. Berrangé wrote:

On Thu, Feb 29, 2024 at 07:03:35AM +, Paz Offer wrote:

Hi,

I am trying to build my code with QEMU and getting a compilation error according 
to the ISO C90 standard:

  const size_t buf_size = 31;
  char buffer[buf_size + 1];

  error: ISO C90 forbids array ‘buffer’ whose size can’t be evaluated 
[-Werror=vla]

I noticed that the code builds with '-std=gnu11', which is newer than
C90, so this is not clear to me why I get this error.
Where is the correct place to specify the language version for this?


QEMU has set compiler flags to explicitly /forbid/ use of variable
sized arrays on the stack, as it is a known dangerous language
feature. You must refactor your changes to avoid this by using either
a statically sized array, or allocating on the heap.


If your array has a fixed size, you could use a definition, so the
preprocessor can evaluate the buffer size:

  #define BUF_SIZE 31
  char buffer[BUF_SIZE + 1];

Regards,

Phil.



Re: [PATCH v4 09/15] spapr: nested: Extend nested_ppc_state for nested PAPR API

2024-02-29 Thread Harsh Prateek Bora




On 2/27/24 15:29, Nicholas Piggin wrote:

On Tue Feb 20, 2024 at 6:36 PM AEST, Harsh Prateek Bora wrote:

Currently, nested_ppc_state stores a certain set of registers and works
with nested_[load|save]_state() for state transfer as reqd for nested-hv API.
Extending these with additional registers state as reqd for nested PAPR API.

Signed-off-by: Harsh Prateek Bora 
Suggested-by: Nicholas Piggin 
---
  include/hw/ppc/spapr_nested.h |  49 
  target/ppc/cpu.h  |   2 +
  hw/ppc/spapr_nested.c | 106 ++
  3 files changed, 157 insertions(+)

diff --git a/include/hw/ppc/spapr_nested.h b/include/hw/ppc/spapr_nested.h
index 24e87bca08..a3b61eb79a 100644
--- a/include/hw/ppc/spapr_nested.h
+++ b/include/hw/ppc/spapr_nested.h
@@ -7,6 +7,7 @@ typedef struct SpaprMachineStateNested {
  uint64_t ptcr;
  uint8_t api;
  #define NESTED_API_KVM_HV  1
+#define NESTED_API_PAPR2
  bool capabilities_set;
  uint32_t pvr_base;
  GHashTable *guests;
@@ -123,6 +124,54 @@ struct nested_ppc_state {
  int64_t tb_offset;
  /* Nested PAPR API */
  uint64_t pvr;
+uint64_t amor;
+uint64_t dawr0;
+uint64_t dawrx0;
+uint64_t ciabr;
+uint64_t purr;
+uint64_t spurr;
+uint64_t ic;
+uint64_t vtb;
+uint64_t hdar;
+uint64_t hdsisr;
+uint64_t heir;
+uint64_t asdr;
+uint64_t dawr1;
+uint64_t dawrx1;
+uint64_t dexcr;
+uint64_t hdexcr;
+uint64_t hashkeyr;
+uint64_t hashpkeyr;
+ppc_vsr_t vsr[64] QEMU_ALIGNED(16);
+uint64_t ebbhr;
+uint64_t tar;
+uint64_t ebbrr;
+uint64_t bescr;
+uint64_t iamr;
+uint64_t amr;
+uint64_t uamor;
+uint64_t dscr;
+uint64_t fscr;
+uint64_t pspb;
+uint64_t ctrl;
+uint64_t vrsave;
+uint64_t dar;
+uint64_t dsisr;
+uint64_t pmc1;
+uint64_t pmc2;
+uint64_t pmc3;
+uint64_t pmc4;
+uint64_t pmc5;
+uint64_t pmc6;
+uint64_t mmcr0;
+uint64_t mmcr1;
+uint64_t mmcr2;
+uint64_t mmcra;
+uint64_t sdar;
+uint64_t siar;
+uint64_t sier;
+uint32_t vscr;
+uint64_t fpscr;
  };
  
  typedef struct SpaprMachineStateNestedGuestVcpu {

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index a44de22ca4..11205bb9e3 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1737,9 +1737,11 @@ void ppc_compat_add_property(Object *obj, const char 
*name,
  #define SPR_PSPB  (0x09F)
  #define SPR_DPDES (0x0B0)
  #define SPR_DAWR0 (0x0B4)
+#define SPR_DAWR1 (0x0B5)
  #define SPR_RPR   (0x0BA)
  #define SPR_CIABR (0x0BB)
  #define SPR_DAWRX0(0x0BC)
+#define SPR_DAWRX1(0x0BD)
  #define SPR_HFSCR (0x0BE)
  #define SPR_VRSAVE(0x100)
  #define SPR_USPRG0(0x100)


Might try to put the DAWR1 enable ahead of this, but if not we'll have
to drop these until that is done. Leave it in for now I'll sort it out
if necessary.


Ok




diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index 3cc704adda..39d0c087f1 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -101,6 +101,7 @@ static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
  static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)
  {
  CPUPPCState *env = &cpu->env;
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());


I hope that won't be a big overhead... nested entry/exit performance probably
isn't top priority at the moment though, so for now okay. Should be
something to look at though.


Hmm.. Ok





  memcpy(save->gpr, env->gpr, sizeof(save->gpr));
  
@@ -127,6 +128,58 @@ static void nested_save_state(struct nested_ppc_state *save, PowerPCCPU *cpu)

  save->pidr = env->spr[SPR_BOOKS_PID];
  save->ppr = env->spr[SPR_PPR];
  
+if (spapr_nested_api(spapr) == NESTED_API_PAPR) {

+save->pvr = env->spr[SPR_PVR];
+save->amor = env->spr[SPR_AMOR];
+save->dawr0 = env->spr[SPR_DAWR0];
+save->dawrx0 = env->spr[SPR_DAWRX0];
+save->ciabr = env->spr[SPR_CIABR];
+save->purr = env->spr[SPR_PURR];
+save->spurr = env->spr[SPR_SPURR];
+save->ic = env->spr[SPR_IC];
+save->vtb = env->spr[SPR_VTB];
+save->hdar = env->spr[SPR_HDAR];
+save->hdsisr = env->spr[SPR_HDSISR];
+save->heir = env->spr[SPR_HEIR];
+save->asdr = env->spr[SPR_ASDR];
+save->dawr1 = env->spr[SPR_DAWR1];
+save->dawrx1 = env->spr[SPR_DAWRX1];
+save->dexcr = env->spr[SPR_DEXCR];
+save->hdexcr = env->spr[SPR_HDEXCR];
+save->hashkeyr = env->spr[SPR_HASHKEYR];
+save->hashpkeyr = env->spr[SPR_HASHPKEYR];
+memcpy(save->vsr, env->vsr, sizeof(save->vsr));
+save->ebbhr = env->spr[SPR_EBBHR];
+save->tar = env->spr[SPR_TAR];
+save->ebbrr = env->spr[SPR_EBBRR];
+save->bescr = env->sp

Re: [Question] Can I start qemu-system-aarch64 with a vmlinux(ELF format)?

2024-02-29 Thread Peter Maydell
On Thu, 29 Feb 2024 at 03:01, Kunkun Jiang  wrote:
>
> Hi Peter,
>
> On 2024/2/27 23:28, Peter Maydell wrote:
> > On Tue, 27 Feb 2024 at 14:42, Kunkun Jiang via  
> > wrote:
> >> Hi everybody,
> >>
> >> I want to start qemu-system-aarch64 with a vmlinux,
> >> which is an ELF format file. The arm_load_elf() is
> >> implemented in arm_setup_direct_kernel_boot(). So I
> >> thought it was supporting the ELF format file.
> > No, you can't do this. The hw/arm/boot.c code assumes
> > that ELF files are "bare metal" binaries, whereas
> > uImage format, AArch64 Image format, and raw binary files
> > are Linux kernels. Only the last three kinds of files will
> > be started with the boot protocol the Linux kernel expects.
> >
> > For AArch64, pass the -kernel option the path to the Image
> > file, not the vmlinux file.
>
> Yes, it works fine using Image files.
> I would also like to ask again, is it because AArch64 does not
> support vmlinux, or is it because qemu does not implement
> this capability?

As I said, it is because QEMU assumes that ELF files are
bare metal images, not Linux kernel images.

-- PMM



Re: [PATCH v2 1/5] linux-user/x86_64: Handle the vsyscall page in open_self_maps_{2, 4}

2024-02-29 Thread Philippe Mathieu-Daudé

On 28/2/24 21:25, Richard Henderson wrote:

This is the only case in which we expect to have no host memory backing
for a guest memory page, because in general linux user processes cannot
map any pages in the top half of the 64-bit address space.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2170
Signed-off-by: Richard Henderson 
---
  linux-user/syscall.c | 16 
  1 file changed, 16 insertions(+)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index e384e14248..bc8c06522f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -7994,6 +7994,10 @@ static void open_self_maps_4(const struct 
open_self_maps_data *d,
  path = "[heap]";
  } else if (start == info->vdso) {
  path = "[vdso]";
+#ifdef TARGET_X86_64


Alternatively "#ifdef TARGET_VSYSCALL_PAGE" like in
i386_tr_translate_insn()?


+} else if (start == TARGET_VSYSCALL_PAGE) {
+path = "[vsyscall]";
+#endif


Reviewed-by: Philippe Mathieu-Daudé 




Re: [RFC PATCH 0/5] memattrs: target/arm: add user-defined and requester ID memattrs

2024-02-29 Thread Peter Maydell
On Thu, 29 Feb 2024 at 04:52, Joe Komlodi  wrote:
> On Wed, Feb 28, 2024 at 6:21 AM Peter Maydell  
> wrote:
> > So as far as I can see, this patchset defines a bunch of mechanism,
> > but no actual users: no device looks at these new memattrs, no board
> > code sets the properties. I don't really want to add this without
> > an upstream usecase for it.
>
> Yeah, I believe the current use-cases for this series are mostly downstream.
> It's possible that there's an upstream device that might benefit from
> it, but I'm not aware of one.
>
> Is the concern the usefulness of the series, or the worry about it 
> bit-rotting?
> If it's the latter, would a qtest be alright to make sure it doesn't rot?

My main issues are:
 * it's hard to review design without the uses of the code
 * it's extra complexity and maintenance burden that we don't
   need (upstream): accepting the patches gives upstream extra
   work to deal with into the future with no benefit to us
 * dead code is code that typically we would remove
 * we end up with something we can't refactor or clean up
   or change because the only reason we have it is for code
   that we don't have any visibility into: effectively it
   becomes an API for us that we can't change, which is not
   something QEMU does except for specific well defined API
   surfaces (QMP, plugins, etc)

Our usual approach is "submit the patches that add the new core
feature/mechanism along with the patches that add the new
device/board/etc that uses it". Compare the recent patches
also from Google for the ITS and SMMU that try to add hooks
that aren't needed by anything in upstream QEMU:
https://patchew.org/QEMU/20240221171716.1260192-1-nabiheste...@google.com/20240221171716.1260192-3-nabiheste...@google.com/
https://patchew.org/QEMU/20240221173325.1494895-1-nabiheste...@google.com/20240221173325.1494895-3-nabiheste...@google.com/
-- we rejected those for the same reason.

thanks
-- PMM



Re: [RFC 0/4] mirror: implement incremental and bitmap modes

2024-02-29 Thread Fiona Ebner
Am 28.02.24 um 17:06 schrieb Vladimir Sementsov-Ogievskiy:
> On 28.02.24 19:00, Vladimir Sementsov-Ogievskiy wrote:
>> On 16.02.24 13:55, Fiona Ebner wrote:
>>> Now, the IO test added in patch 4/4 actually contains yet another use
>>> case, namely doing incremental mirrors to stand-alone qcow2 "diff"
>>> images, that only contain the delta and can be rebased later. I had to
>>> adapt the IO test, because its output expected the mirror bitmap to
>>> still be dirty, but nowadays the mirror is apparently already done
>>> when the bitmaps are queried. So I thought, I'll just use
>>> 'write-blocking' mode to avoid any potential timing issues.
>>>
>>> But this exposed an issue with the diff image approach. If a write is
>>> not aligned to the granularity of the mirror target, then rebasing the
>>> diff image onto a backing image will not yield the desired result,
>>> because the full cluster is considered to be allocated and will "hide"
>>> some part of the base/backing image. The failure can be seen by either
>>> using 'write-blocking' mode in the IO test or setting the (bitmap)
>>> granularity to 32 KiB rather than the current 64 KiB.
>>>
>>> The question is how to deal with these edge cases? Some possibilities
>>> that would make sense to me:
>>>
>>> For 'background' mode:
>>> * prohibit if target's cluster size is larger than the bitmap
>>>    granularity
>>> * document the limitation
>>>
>>> For 'write-blocking' mode:
>>> * disallow in combination with bitmap mode (would not be happy about
>>>    it, because I'd like to use this without diff images)
>>
>> why not just require the same: bitmap granularity must be >= target
>> granularity
>>

For the iotest's use-case, that only works for background mode. I'll
explain below.

>>> * for writes that are not aligned to the target's cluster size, read
>>>    the relevant/missing parts from the source image to be able to write
>>>    whole target clusters (seems rather complex)
>>
>> There is another approach: consider and unaligned part of the request,
>> fit in one cluster (we can always split any request to "aligned"
>> middle part, and at most two small "unligned" parts, each fit into one
>> cluster).
>>
>> We have two possibilities:
>>
>> 1. the cluster is dirty (marked dirty in the bitmap used by background
>> process)
>>
>> We can simply ignore this part and rely on background process. This
>> will not affect the convergence of the mirror job.
>>

Agreed.

>> 2. the cluster is clear (i.e. background process, or some previous
>> write already copied it)
>>

The iotest creates a new target image for each incremental sync which
only records the diff relative to the previous mirror and those diff
images are later rebased onto each other to get the full picture.

Thus, it can be that a previous mirror job (not just background process
or previous write) already copied a cluster, and in particular, copied
it to a different target!

>> In this case, we are safe to do unaligned write, as target cluster
>> must be allocated.

Because the diff image is new, the target's cluster is not necessarily
allocated. When using write-blocking and a write of, e.g., 9 bytes to a
clear source cluster comes in, only those 9 bytes are written to the
target. Now the target's cluster is allocated but with only those 9
bytes of data. When rebasing, the previously copied cluster is "masked"
and when reading the rebased image, we only see the cluster with those 9
bytes (and IIRC, zeroes for the rest of the cluster rather than the
previously copied data).

>>
>> (for bitmap-mode, I don't consider here clusters that are clear from
>> the start, which we shouldn't copy in any case)
>>

We do need to copy new writes to any cluster, and with a clear cluster
and write-blocking, the issue can manifest.

> 
> Hmm, right, and that's exactly the logic we already have in
> do_sync_target_write(). So that's enough just to require that
> bitmap_granularity >= target_granularity
> 

Best Regards,
Fiona




Re: [PATCH RFC 0/3] Support GM/T 0018-2012 cryptographic standard

2024-02-29 Thread Yong Huang
On Thu, Feb 29, 2024 at 5:04 PM Daniel P. Berrangé 
wrote:

> On Sat, Feb 24, 2024 at 10:34:55PM +0800, Hyman Huang wrote:
> > This patchset introduce GM/T 0018-2012 as a crypto backend driver,
> > which is applied for block encryption. Currently, we support SM4
> > cipher algorithm only.
> >
> > GM/T 0018-2012 is a cryptographic standard issued by the State
> > Cryptography Administration of China. Visit https://hbba.sacinfo.org.cn
> > search GM/T 0018-2012 for brief introduction.
> >
> > The objective of the standard is to develop a uniform application
> > interface standard for the service-based cryptography device under
> > the public key cryptographic infrastructure application framework,
> > and to call the cryptography device through this interface to
> > provide basic cryptographic services for the uppler layer. For
> > more information about contents of the standard, download the
> > specificaiton from:
> > "https://github.com/guanzhi/GM-Standards/blob/master/GMT密码行标/
> > GMT 00018-2012 密码设备应用接口规范.pdf"
> >
> > There are two benefits to doing this, at least.
> >  * Performance - using a cryptography device for block encryption
> >  offers an opportunity to enhance the input/output
> >  performance once the hardware is certified
> >  * Secrecy - hardware manufacturers may fortify cryptography
> >  equipment with security features, so increasing the
> >  secrecy of block encryption.
> >
> > The precise way that vendors implement the standard APIs for data
> > encryption using the cryptographic device is uncoupled from the
> > GM/T 0018-2012 specification. Thus, if developers enable this
> > functionality with the following conditions met, we could accomplish
> > the general implementation:
> >
> > 1. rename the header file provided by vendor to gmt-0018-2012.h
> >and copy it to the /usr/include directory.
> > 2. rename the dynamic library provided by vendor to
> >gmt_0018_2012.so and copy it to the /usr/lib64 or any directory
> >that linker could find before compiling QEMU.
> > 3. enable crypto_gmt option when compiling QEMU and make the feature
> >available.
> >
> > By offering a development package for GM/T 0018-2012, the above
> > provisions could be standardized; unfortunately, the hardware
> > manufacturer has not completed this task. So developers who don't
> > work with the vendor to obtain the cryptography device and related
> > library may not be able to test this functionality because the
> > standard implementation depends on the cryptography device supplied
> > by the hardware vendor. We are hesitant to contribute to this series
> > as a result.
>
> Hmm, yes, that is a pretty unpleasant approach.
>
> IMHO there really needs to be a reference implementation that is
> pure software. eg a gmt_0018_2012.so + header files that simply
>

Ok, this is the preferred choice, but more work needs to be done for
the pure software implementation; we may try it in our spare time.

Thanks for the comments,

Yong


> uses an existing crypto library. That way applications can build
> and test their support for this, without having to have access
> to a specific piece of hardware. Hardware vendors should only
> have to provide their library impl, not the headers.


> With regards,
> Daniel
> --
> |: https://berrange.com  -o-
> https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-
> https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-
> https://www.instagram.com/dberrange :|
>
>

-- 
Best regards


Re: [PATCH v7 1/2] qom: new object to associate device to numa node

2024-02-29 Thread Jonathan Cameron via
On Wed, 28 Feb 2024 16:50:30 +
Ankit Agrawal  wrote:

> >>> Jonathan, you pointed out interface design issues in your review of v2.>  
> >> Are you fully satisfied with the interface in v3?
> >>
> >> Yes. I'm fine with the interface in this version (though it's v7, so I'm 
> >> lost
> >> on v2 vs v3!)  
> >
> > Looks like I can't count to 7!
> >
> > With NUMA capitalized in the doc comment, QAPI schema
> > Acked-by: Markus Armbruster 
> >
> > Thanks!  
> 
> Thanks! Will fix that in the next version.

The following is really me arguing with myself, so can probably be
ignored, but maybe it will spark an idea from someone else!

One trivial tweak that might make our life easier if anyone adds
support in the future for the other device handle type might be to go
with simply dev rather than pci-dev.

There is a sticky corner though if a device is a PCI device
and in ACPI DSDT so maybe we are better off adding acpi-dev
to take either pci-dev or acpi-dev?

Annoyingly for generic ports, (I'm reusing this infrastructure here)
the kernel code currently only deals with the ACPI form (for CXL host
bridges).  Given I point that at the bus of a PXB_CXL it is both
a PCI device, and the only handle we have for getting to the
Root Bridge ACPI handle.

So I think I've argued myself around to thinking we need to extend
the interface with another optional parameter if we ever do support
the ACPI handle for generic initiators :(

Jonathan



Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Heinrich Schuchardt

On 29.02.24 02:11, Peter Xu wrote:

On Wed, Feb 28, 2024 at 08:07:47PM +0100, Heinrich Schuchardt wrote:

On 28.02.24 19:39, Peter Maydell wrote:

On Wed, 28 Feb 2024 at 18:28, Heinrich Schuchardt
 wrote:


On 28.02.24 16:06, Philippe Mathieu-Daudé wrote:

Hi Heinrich,

On 28/2/24 13:59, Heinrich Schuchardt wrote:

virtqueue_map_desc() is called with values of sz that may exceed
TARGET_PAGE_SIZE. sz = 0x2800 has been observed.


Pure (and can also be stupid) question: why virtqueue_map_desc() would map
to !direct mem?  Shouldn't those buffers normally allocated from guest RAM?



We only support a single bounce buffer. We have to avoid
virtqueue_map_desc() calling address_space_map() multiple times.
Otherwise
we see an error

   qemu: virtio: bogus descriptor or out of resources

Increase the minimum size of the bounce buffer to 0x1 which matches
the largest value of TARGET_PAGE_SIZE for all architectures.

Signed-off-by: Heinrich Schuchardt 
---
v2:
  remove unrelated change
---
system/physmem.c | 8 ++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/system/physmem.c b/system/physmem.c
index e3ebc19eef..3c82da1c86 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3151,8 +3151,12 @@ void *address_space_map(AddressSpace *as,
*plen = 0;
return NULL;
}
-/* Avoid unbounded allocations */
-l = MIN(l, TARGET_PAGE_SIZE);
+/*
+ * There is only one bounce buffer. The largest occurring value of
+ * parameter sz of virtqueue_map_desc() must fit into the bounce
+ * buffer.
+ */
+l = MIN(l, 0x10000);


Please define this magic value. Maybe ANY_TARGET_PAGE_SIZE or
TARGETS_BIGGEST_PAGE_SIZE?

Then along:
 QEMU_BUILD_BUG_ON(TARGET_PAGE_SIZE <= TARGETS_BIGGEST_PAGE_SIZE);


Thank you Philippe for reviewing.

TARGETS_BIGGEST_PAGE_SIZE does not fit as the value is not driven by the
page size.
How about MIN_BOUNCE_BUFFER_SIZE?
Is include/exec/memory.h the right include for the constant?

I don't think that TARGET_PAGE_SIZE has any relevance for setting the
bounce buffer size. I only mentioned it to say that we are not
decreasing the value on any existing architecture.

I don't know why TARGET_PAGE_SIZE ever got into this piece of code.
e3127ae0cdcd ("exec: reorganize address_space_map") does not provide a
reason for this choice. Maybe Paolo remembers.


The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
which was the first implementation of this function. I don't think
there's a particular reason for that value beyond that it was
probably a convenient value that was assumed to be likely "big enough".

I think the idea with this bounce-buffer has always been that this
isn't really a code path we expected to end up in very often --
it's supposed to be for when devices are doing DMA, which they
will typically be doing to memory (backed by host RAM), not
devices (backed by MMIO and needing a bounce buffer). So the
whole mechanism is a bit "last fallback to stop things breaking
entirely".

The address_space_map() API says that it's allowed to return
a subset of the range you ask for, so if the virtio code doesn't
cope with the minimum being set to TARGET_PAGE_SIZE then either
we need to fix that virtio code or we need to change the API
of this function. (But I think you will also get a reduced
range if you try to use it across a boundary between normal
host-memory-backed RAM and a device MemoryRegion.)


If we allow a bounce buffer only to be used once (via the in_use flag), why
do we allow only a single bounce buffer?

Could address_space_map() allocate a new bounce buffer on every call and
address_space_unmap() deallocate it?

Isn't the design with a single bounce buffer bound to fail with a
multi-threaded client as collision can be expected?


See:

https://lore.kernel.org/r/20240212080617.2559498-1-mniss...@rivosinc.com

For some reason that series didn't land, but it seems to be helpful in this
case too if e.g. there can be multiple of such devices.

Thanks,



Hello Peter Xu,

thanks for pointing to your series. What I like about it is that it 
removes the limit of a single bounce buffer per AddressSpace.


Unfortunately it does not solve my problem. You limit the sum of all of
the allocations for a single AddressSpace to
DEFAULT_MAX_BOUNCE_BUFFER_SIZE = 4096 which is too small for my use case.


Why do we need a limit?
Why is it so tiny?

Best regards

Heinrich






Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()

2024-02-29 Thread Markus Armbruster
Stefano Garzarella  writes:

> On Wed, Feb 28, 2024 at 01:32:17PM +0100, Markus Armbruster wrote:
>>Stefano Garzarella  writes:

[...]

>>> +# @shm: if true, shm_open(3) is used to create/open POSIX shared memory
>>> +#   object; if false, an open(2) is used. (default: false) (since 9.0)
>>> +#
>>
>>Please format like this for consistency:
>
> Sure.
>
>>
>># @shm: if true, shm_open(3) is used to create/open POSIX shared memory
>># object; if false, an open(2) is used (default: false) (since 9.0)
>
> I just noticed that I followed the property just above (@rom). Should we fix 
> that one?

Yes, please.

See commit a937b6aa739 (qapi: Reformat doc comments to conform to
current conventions).




Re: [PATCH v4 10/15] spapr: nested: Initialize the GSB elements lookup table.

2024-02-29 Thread Harsh Prateek Bora




On 2/27/24 15:32, Nicholas Piggin wrote:

On Tue Feb 20, 2024 at 6:36 PM AEST, Harsh Prateek Bora wrote:

Nested PAPR API provides a standard Guest State Buffer (GSB) format
with unique IDs for each guest state element for which get/set state is
supported by the API. Some of the elements are read-only and/or guest-wide.
Introducing helper routines for state exchange of each of the nested guest
state elements for which get/set state should be supported by the API.



This is doing more than just adding helper routines for the GSB access.


Yes, some of the GSB elements are also introduced along with respective 
helpers.




[snip]


+
  typedef struct SpaprMachineStateNested {
  uint64_t ptcr;
  uint8_t api;
@@ -16,6 +201,8 @@ typedef struct SpaprMachineStateNested {
  typedef struct SpaprMachineStateNestedGuest {
  uint32_t pvr_logical;
  unsigned long vcpus;
+uint64_t parttbl[2];
+uint64_t tb_offset;
  struct SpaprMachineStateNestedGuestVcpu *vcpu;
  } SpaprMachineStateNestedGuest;
  

[snip]

  
  /*

   * Register state for entering a nested guest with H_ENTER_NESTED.
@@ -172,17 +452,40 @@ struct nested_ppc_state {
  uint64_t sier;
  uint32_t vscr;
  uint64_t fpscr;
+int64_t dec_expiry_tb;
+};
+
+struct SpaprMachineStateNestedGuestVcpuRunBuf {
+uint64_t addr;
+uint64_t size;
  };
  
  typedef struct SpaprMachineStateNestedGuestVcpu {

  bool enabled;
  struct nested_ppc_state state;
+struct SpaprMachineStateNestedGuestVcpuRunBuf runbufin;
+struct SpaprMachineStateNestedGuestVcpuRunBuf runbufout;
+int64_t tb_offset;
+uint64_t hdecr_expiry_tb;
  } SpaprMachineStateNestedGuestVcpu;


It's adding new fields in existing nested guest state
structures. This should be explained a bit more, split into
another patch, or moved to patches where they get used.


Yes, these new fields are actually representing GSB elements.
These elements were explained in the documentation patch which shall now
point to the documentation in the kernel docs as suggested earlier.
Let me know if we need to document additionally in this patch commit log
also.

regards,
Harsh




Thanks,
Nick




Re: [PATCH v7 2/5] softmmu: Support concurrent bounce buffers

2024-02-29 Thread Heinrich Schuchardt

On 12.02.24 09:06, Mattias Nissler wrote:

When DMA memory can't be directly accessed, as is the case when
running the device model in a separate process without shareable DMA
file descriptors, bounce buffering is used.

It is not uncommon for device models to request mapping of several DMA
regions at the same time. Examples include:
  * net devices, e.g. when transmitting a packet that is split across
several TX descriptors (observed with igb)
  * USB host controllers, when handling a packet with multiple data TRBs
(observed with xhci)

Previously, qemu only provided a single bounce buffer per AddressSpace
and would fail DMA map requests while the buffer was already in use. In
turn, this would cause DMA failures that ultimately manifest as hardware
errors from the guest perspective.

This change allocates DMA bounce buffers dynamically instead of
supporting only a single buffer. Thus, multiple DMA mappings work
correctly also when RAM can't be mmap()-ed.

The total bounce buffer allocation size is limited individually for each
AddressSpace. The default limit is 4096 bytes, matching the previous
maximum buffer size. A new x-max-bounce-buffer-size parameter is
provided to configure the limit for PCI devices.

Signed-off-by: Mattias Nissler 
---
  hw/pci/pci.c|  8 
  include/exec/memory.h   | 14 +++
  include/hw/pci/pci_device.h |  3 ++
  system/memory.c |  5 ++-
  system/physmem.c| 80 +
  5 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 6496d027ca..036b3ff822 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@ static Property pci_props[] = {
  QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
  DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
  QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
+ max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
  DEFINE_PROP_END_OF_LIST()
  };
  
@@ -1203,6 +1205,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,

 "bus master container", UINT64_MAX);
  address_space_init(&pci_dev->bus_master_as,
 &pci_dev->bus_master_container_region, pci_dev->name);
+pci_dev->bus_master_as.max_bounce_buffer_size =
+pci_dev->max_bounce_buffer_size;
  
  if (phase_check(PHASE_MACHINE_READY)) {

  pci_init_bus_master(pci_dev);
@@ -2632,6 +2636,10 @@ static void pci_device_class_init(ObjectClass *klass, 
void *data)
  k->unrealize = pci_qdev_unrealize;
  k->bus_type = TYPE_PCI_BUS;
  device_class_set_props(k, pci_props);
+object_class_property_set_description(
+klass, "x-max-bounce-buffer-size",
+"Maximum buffer size allocated for bounce buffers used for mapped "
+"access to indirect DMA memory");
  }
  
  static void pci_device_class_base_init(ObjectClass *klass, void *data)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 6995a443d3..e7bc4717ea 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1111,13 +1111,7 @@ typedef struct AddressSpaceMapClient {
  QLIST_ENTRY(AddressSpaceMapClient) link;
  } AddressSpaceMapClient;
  
-typedef struct {

-MemoryRegion *mr;
-void *buffer;
-hwaddr addr;
-hwaddr len;
-bool in_use;
-} BounceBuffer;
+#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
  
  /**

   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion 
objects
@@ -1137,8 +1131,10 @@ struct AddressSpace {
  QTAILQ_HEAD(, MemoryListener) listeners;
  QTAILQ_ENTRY(AddressSpace) address_spaces_link;
  
-/* Bounce buffer to use for this address space. */

-BounceBuffer bounce;
+/* Maximum DMA bounce buffer size used for indirect memory map requests */
+uint64_t max_bounce_buffer_size;
+/* Total size of bounce buffers currently allocated, atomically accessed */
+uint64_t bounce_buffer_size;
  /* List of callbacks to invoke when buffers free up */
  QemuMutex map_client_list_lock;
  QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..f4027c5379 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -160,6 +160,9 @@ struct PCIDevice {
  /* ID of standby device in net_failover pair */
  char *failover_pair_id;
  uint32_t acpi_index;
+
+/* Maximum DMA bounce buffer size used for indirect memory map requests */
+uint64_t max_bounce_buffer_size;
  };
  
  static inline int pci_intx(PCIDevice *pci_dev)

diff --git a/system/memory.c b/system/memory.c
index ad0caef1b8..1cf89654a1 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -3133,7 +3133,8 @@ void address_space_init(AddressSpace *as, MemoryRegion 
*root, const char *name)
  as->ioeventfds = NULL;

Re: [PATCH v2 06/21] migration: Add Error** argument to .save_setup() handler

2024-02-29 Thread Thomas Huth

On 29/02/2024 08.20, Vladimir Sementsov-Ogievskiy wrote:

On 29.02.24 09:32, Markus Armbruster wrote:

Cédric Le Goater  writes:


The purpose is to record a potential error in the migration stream if
qemu_savevm_state_setup() fails. Most of the current .save_setup()
handlers can be modified to use the Error argument instead of managing
their own and calling locally error_report(). The following patches
will introduce such changes for VFIO first.

Cc: Nicholas Piggin 
Cc: Harsh Prateek Bora 
Cc: Halil Pasic 
Cc: Thomas Huth 
Cc: Eric Blake 
Cc: Vladimir Sementsov-Ogievskiy 
Cc: John Snow 
Cc: Stefan Hajnoczi 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Xu 
Signed-off-by: Cédric Le Goater 
---


[...]


diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 
c483b62a9b5f71772639fc180bdad15ecb6711cb..c934df424a555d83d2198f5ddfc0cbe0ea98e9ec 100644

--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -166,7 +166,7 @@ static int cmma_load(QEMUFile *f, void *opaque, int 
version_id)

  return ret;
  }
-static int cmma_save_setup(QEMUFile *f, void *opaque)
+static int cmma_save_setup(QEMUFile *f, void *opaque, Error **errp)
  {
  S390StAttribState *sas = S390_STATTRIB(opaque);
  S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);

    int res;
    /*
 * Signal that we want to start a migration, thus needing PGSTE dirty
 * tracking.
 */
    res = sac->set_migrationmode(sas, 1);
    if (res) {
    return res;

I believe this is a failure return.

Anti-pattern: fail without setting an error.  There might be more
elsewhere in the series.

qapi/error.h's big comment:

  * - On success, the function should not touch *errp.  On failure, it
  *   should set a new error, e.g. with error_setg(errp, ...), or
  *   propagate an existing one, e.g. with error_propagate(errp, ...).
  *
  * - Whenever practical, also return a value that indicates success /
  *   failure.  This can make the error checking more concise, and can
  *   avoid useless error object creation and destruction.  Note that
  *   we still have many functions returning void.  We recommend
  *   • bool-valued functions return true on success / false on failure,
  *   • pointer-valued functions return non-null / null pointer, and
  *   • integer-valued functions return non-negative / negative.

    }
    qemu_put_be64(f, STATTR_FLAG_EOS);
    return 0;
    }

When adding Error **errp to a function, you must also add code to set an
error on failure to every failure path.  Adding it in a later patch in
the same series can be okay,


Personally, I'd prefer not doing so. Creating wrong commits and fixing them 
in same series - better to merge all fixes into bad commit:)


I agree - that might create issues with bisecting later. Please fix it in 
this patch here already!


 Thanks,
  Thomas





Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Mattias Nissler
On Thu, Feb 29, 2024 at 11:22 AM Heinrich Schuchardt
 wrote:
>
> On 29.02.24 02:11, Peter Xu wrote:
> > On Wed, Feb 28, 2024 at 08:07:47PM +0100, Heinrich Schuchardt wrote:
> >> On 28.02.24 19:39, Peter Maydell wrote:
> >>> On Wed, 28 Feb 2024 at 18:28, Heinrich Schuchardt
> >>>  wrote:
> 
>  On 28.02.24 16:06, Philippe Mathieu-Daudé wrote:
> > Hi Heinrich,
> >
> > On 28/2/24 13:59, Heinrich Schuchardt wrote:
> >> virtqueue_map_desc() is called with values of sz that may exceed
> >> TARGET_PAGE_SIZE. sz = 0x2800 has been observed.
> >
> > Pure (and can also be stupid) question: why virtqueue_map_desc() would map
> > to !direct mem?  Shouldn't those buffers normally allocated from guest RAM?
> >
> >>
> >> We only support a single bounce buffer. We have to avoid
> >> virtqueue_map_desc() calling address_space_map() multiple times.
> >> Otherwise
> >> we see an error
> >>
> >>qemu: virtio: bogus descriptor or out of resources
> >>
> >> Increase the minimum size of the bounce buffer to 0x10000 which matches
> >> the largest value of TARGET_PAGE_SIZE for all architectures.
> >>
> >> Signed-off-by: Heinrich Schuchardt 
> >> ---
> >> v2:
> >>   remove unrelated change
> >> ---
> >> system/physmem.c | 8 ++--
> >> 1 file changed, 6 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/system/physmem.c b/system/physmem.c
> >> index e3ebc19eef..3c82da1c86 100644
> >> --- a/system/physmem.c
> >> +++ b/system/physmem.c
> >> @@ -3151,8 +3151,12 @@ void *address_space_map(AddressSpace *as,
> >> *plen = 0;
> >> return NULL;
> >> }
> >> -/* Avoid unbounded allocations */
> >> -l = MIN(l, TARGET_PAGE_SIZE);
> >> +/*
> >> + * There is only one bounce buffer. The largest occurring value of
> >> + * parameter sz of virtqueue_map_desc() must fit into the bounce
> >> + * buffer.
> >> + */
> >> +l = MIN(l, 0x10000);
> >
> > Please define this magic value. Maybe ANY_TARGET_PAGE_SIZE or
> > TARGETS_BIGGEST_PAGE_SIZE?
> >
> > Then along:
> >  QEMU_BUILD_BUG_ON(TARGET_PAGE_SIZE <= TARGETS_BIGGEST_PAGE_SIZE);
> 
>  Thank you Philippe for reviewing.
> 
>  TARGETS_BIGGEST_PAGE_SIZE does not fit as the value is not driven by the
>  page size.
>  How about MIN_BOUNCE_BUFFER_SIZE?
>  Is include/exec/memory.h the right include for the constant?
> 
>  I don't think that TARGET_PAGE_SIZE has any relevance for setting the
>  bounce buffer size. I only mentioned it to say that we are not
>  decreasing the value on any existing architecture.
> 
>  I don't know why TARGET_PAGE_SIZE ever got into this piece of code.
>  e3127ae0cdcd ("exec: reorganize address_space_map") does not provide a
>  reason for this choice. Maybe Paolo remembers.
> >>>
> >>> The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> >>> which was the first implementation of this function. I don't think
> >>> there's a particular reason for that value beyond that it was
> >>> probably a convenient value that was assumed to be likely "big enough".
> >>>
> >>> I think the idea with this bounce-buffer has always been that this
> >>> isn't really a code path we expected to end up in very often --
> >>> it's supposed to be for when devices are doing DMA, which they
> >>> will typically be doing to memory (backed by host RAM), not
> >>> devices (backed by MMIO and needing a bounce buffer). So the
> >>> whole mechanism is a bit "last fallback to stop things breaking
> >>> entirely".
> >>>
> >>> The address_space_map() API says that it's allowed to return
> >>> a subset of the range you ask for, so if the virtio code doesn't
> >>> cope with the minimum being set to TARGET_PAGE_SIZE then either
> >>> we need to fix that virtio code or we need to change the API
> >>> of this function. (But I think you will also get a reduced
> >>> range if you try to use it across a boundary between normal
> >>> host-memory-backed RAM and a device MemoryRegion.)
> >>
> >> If we allow a bounce buffer only to be used once (via the in_use flag), why
> >> do we allow only a single bounce buffer?
> >>
> >> Could address_space_map() allocate a new bounce buffer on every call and
> >> address_space_unmap() deallocate it?
> >>
> >> Isn't the design with a single bounce buffer bound to fail with a
> >> multi-threaded client as collision can be expected?
> >
> > See:
> >
> > https://lore.kernel.org/r/20240212080617.2559498-1-mniss...@rivosinc.com
> >
> > For some reason that series didn't land, but it seems to be helpful in this
> > case too if e.g. there can be multiple of such devices.
> >
> > Thanks,
> >
>
> Hello Peter Xu,
>
> thanks for pointing to your series. What I like about

Re: [RFC 0/4] mirror: implement incremental and bitmap modes

2024-02-29 Thread Fiona Ebner
Am 28.02.24 um 17:24 schrieb Vladimir Sementsov-Ogievskiy:
> On 16.02.24 13:55, Fiona Ebner wrote:
>> Previous discussion from when this was sent upstream [0] (it's been a
>> while). I rebased the patches and re-ordered and squashed like
>> suggested back then [1].
>>
>> This implements two new mirror modes:
>>
>> - bitmap mirror mode with always/on-success/never bitmap sync mode
>> - incremental mirror mode as sugar for bitmap + on-success
>>
>> Use cases:
>> * Possibility to resume a failed mirror later.
>> * Possibility to only mirror deltas to a previously mirrored volume.
>> * Possibility to (efficiently) mirror an drive that was previously
>>    mirrored via some external mechanism (e.g. ZFS replication).
>>
>> We are using the last one in production without any issues since about
>> 4 years now. In particular, like mentioned in [2]:
>>
>>> - create bitmap(s)
>>> - (incrementally) replicate storage volume(s) out of band (using ZFS)
>>> - incrementally drive mirror as part of a live migration of VM
>>> - drop bitmap(s)
> 
> Actually which mode you use, "never", "always" or "conditional"? Or in
> downstream you have different approach?
> 

We are using "conditional", but I think we don't really require any
specific mode, because we drop the bitmaps after mirroring (even in
failure case). Fabian, please correct me if I'm wrong.

> Why am I asking:
> 
> These modes (for backup) were developed prior to
> block-dirty-bitmap-merge command, which allowed to copy bitmaps as you
> want. With that API, we actually don't need all these modes, instead
> it's enough to pass a bitmap, which would be _actually_ used by mirror.
> 
> So, if you need "never" mode, you just copy your bitmap by
> block-dirty-bitmap-add + block-dirty-bitmap-merge, and pass a copy to
> mirror job.
> 
> Or, you pass your bitmap to mirror-job, and have a "always" mode.
> 
> And I don't see, why we need a "conditional" mode, which actually just
> drops away the progress we actually made. (OK, we failed, but why to
> drop the progress of successfully copied clusters?)
> 

I'm not sure actually. Maybe John remembers?

I see, I'll drop the 'bitmap-mode' in the next version if nobody
complains :)

> 
> Using user-given bitmap in the mirror job has also an additional
> advantage of live progress: up to visualization of disk copying by
> visualization of the dirty bitmap contents.
> 

Best Regards,
Fiona




[PATCH] chardev/char-socket: Fix TLS io channels sending too much data to the backend

2024-02-29 Thread Thomas Huth
Commit ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
changed the behavior of the TLS io channels to schedule a second reading
attempt if there is still incoming data pending. This caused a regression
with backends like the sclpconsole that check in their read function that
the sender does not try to write more bytes to it than the device can
currently handle.

The problem can be reproduced like this:

 1) In one terminal, do this:

  mkdir qemu-pki
  cd qemu-pki
  openssl genrsa 2048 > ca-key.pem
  openssl req -new -x509 -nodes -days 365000 -key ca-key.pem -out ca-cert.pem
  # enter some dummy value for the cert
  openssl genrsa 2048 > server-key.pem
  openssl req -new -x509 -nodes -days 365000 -key server-key.pem \
-out server-cert.pem
  # enter some other dummy values for the cert

  gnutls-serv --echo --x509cafile ca-cert.pem --x509keyfile server-key.pem \
  --x509certfile server-cert.pem -p 8338

 2) In another terminal, do this:

  wget 
https://download.fedoraproject.org/pub/fedora-secondary/releases/39/Cloud/s390x/images/Fedora-Cloud-Base-39-1.5.s390x.qcow2

  qemu-system-s390x -nographic -nodefaults \
-hda Fedora-Cloud-Base-39-1.5.s390x.qcow2 \
-object 
tls-creds-x509,id=tls0,endpoint=client,verify-peer=false,dir=$PWD/qemu-pki \
-chardev socket,id=tls_chardev,host=localhost,port=8338,tls-creds=tls0 \
-device sclpconsole,chardev=tls_chardev,id=tls_serial

QEMU then aborts after a second or two with:

  qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion
   `size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed.
 Aborted (core dumped)

It looks like the second read does not trigger the chr_can_read() function
to be called before the second read, which should normally always be done
before sending bytes to a character device to see how much it can handle,
so the s->max_size in tcp_chr_read() still contains the old value from the
previous read. Let's make sure that we use the up-to-date value by calling
tcp_chr_read_poll() again here.

Fixes: ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
Buglink: https://issues.redhat.com/browse/RHEL-24614
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Thomas Huth 
---
 Sorry if you've got this mail twice - I forgot to CC: qemu-devel when
 I sent it out the first time ... *facepalm*

 chardev/char-socket.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 67e3334423..8a0406cc1e 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -496,9 +496,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition 
cond, void *opaque)
 s->max_size <= 0) {
 return TRUE;
 }
-len = sizeof(buf);
-if (len > s->max_size) {
-len = s->max_size;
+len = tcp_chr_read_poll(opaque);
+if (len > sizeof(buf)) {
+len = sizeof(buf);
 }
 size = tcp_chr_recv(chr, (void *)buf, len);
 if (size == 0 || (size == -1 && errno != EAGAIN)) {
-- 
2.44.0




Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Jonathan Cameron via
On Thu, 29 Feb 2024 11:22:24 +0100
Heinrich Schuchardt  wrote:

> On 29.02.24 02:11, Peter Xu wrote:
> > On Wed, Feb 28, 2024 at 08:07:47PM +0100, Heinrich Schuchardt wrote:  
> >> On 28.02.24 19:39, Peter Maydell wrote:  
> >>> On Wed, 28 Feb 2024 at 18:28, Heinrich Schuchardt
> >>>  wrote:  
> 
>  On 28.02.24 16:06, Philippe Mathieu-Daudé wrote:  
> > Hi Heinrich,
> >
> > On 28/2/24 13:59, Heinrich Schuchardt wrote:  
> >> virtqueue_map_desc() is called with values of sz that may exceed
> >> TARGET_PAGE_SIZE. sz = 0x2800 has been observed.
> > 
> > Pure (and can also be stupid) question: why virtqueue_map_desc() would map
> > to !direct mem?  Shouldn't those buffers normally allocated from guest RAM?
> >   
> >>
> >> We only support a single bounce buffer. We have to avoid
> >> virtqueue_map_desc() calling address_space_map() multiple times.
> >> Otherwise
> >> we see an error
> >>
> >>qemu: virtio: bogus descriptor or out of resources
> >>
> >> Increase the minimum size of the bounce buffer to 0x10000 which matches
> >> the largest value of TARGET_PAGE_SIZE for all architectures.
> >>
> >> Signed-off-by: Heinrich Schuchardt 
> >> ---
> >> v2:
> >>   remove unrelated change
> >> ---
> >> system/physmem.c | 8 ++--
> >> 1 file changed, 6 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/system/physmem.c b/system/physmem.c
> >> index e3ebc19eef..3c82da1c86 100644
> >> --- a/system/physmem.c
> >> +++ b/system/physmem.c
> >> @@ -3151,8 +3151,12 @@ void *address_space_map(AddressSpace *as,
> >> *plen = 0;
> >> return NULL;
> >> }
> >> -/* Avoid unbounded allocations */
> >> -l = MIN(l, TARGET_PAGE_SIZE);
> >> +/*
> >> + * There is only one bounce buffer. The largest occurring value of
> >> + * parameter sz of virtqueue_map_desc() must fit into the bounce
> >> + * buffer.
> >> + */
> >> +l = MIN(l, 0x10000);
> >
> > Please define this magic value. Maybe ANY_TARGET_PAGE_SIZE or
> > TARGETS_BIGGEST_PAGE_SIZE?
> >
> > Then along:
> >  QEMU_BUILD_BUG_ON(TARGET_PAGE_SIZE <= TARGETS_BIGGEST_PAGE_SIZE);  
> 
>  Thank you Philippe for reviewing.
> 
>  TARGETS_BIGGEST_PAGE_SIZE does not fit as the value is not driven by the
>  page size.
>  How about MIN_BOUNCE_BUFFER_SIZE?
>  Is include/exec/memory.h the right include for the constant?
> 
>  I don't think that TARGET_PAGE_SIZE has any relevance for setting the
>  bounce buffer size. I only mentioned it to say that we are not
>  decreasing the value on any existing architecture.
> 
>  I don't know why TARGET_PAGE_SIZE ever got into this piece of code.
>  e3127ae0cdcd ("exec: reorganize address_space_map") does not provide a
>  reason for this choice. Maybe Paolo remembers.  
> >>>
> >>> The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> >>> which was the first implementation of this function. I don't think
> >>> there's a particular reason for that value beyond that it was
> >>> probably a convenient value that was assumed to be likely "big enough".
> >>>
> >>> I think the idea with this bounce-buffer has always been that this
> >>> isn't really a code path we expected to end up in very often --
> >>> it's supposed to be for when devices are doing DMA, which they
> >>> will typically be doing to memory (backed by host RAM), not
> >>> devices (backed by MMIO and needing a bounce buffer). So the
> >>> whole mechanism is a bit "last fallback to stop things breaking
> >>> entirely".
> >>>
> >>> The address_space_map() API says that it's allowed to return
> >>> a subset of the range you ask for, so if the virtio code doesn't
> >>> cope with the minimum being set to TARGET_PAGE_SIZE then either
> >>> we need to fix that virtio code or we need to change the API
> >>> of this function. (But I think you will also get a reduced
> >>> range if you try to use it across a boundary between normal
> >>> host-memory-backed RAM and a device MemoryRegion.)  
> >>
> >> If we allow a bounce buffer only to be used once (via the in_use flag), why
> >> do we allow only a single bounce buffer?
> >>
> >> Could address_space_map() allocate a new bounce buffer on every call and
> >> address_space_unmap() deallocate it?
> >>
> >> Isn't the design with a single bounce buffer bound to fail with a
> >> multi-threaded client as collision can be expected?  
> > 
> > See:
> > 
> > https://lore.kernel.org/r/20240212080617.2559498-1-mniss...@rivosinc.com
> > 
> > For some reason that series didn't land, but it seems to be helpful in this
> > case too if e.g. there can be multiple of such devices.
> > 
> > Thanks,
> >   
> 
> Hello Peter Xu,
> 
> thanks for pointi

Re: [PATCH 0/3] physmem: Fix MemoryRegion for second access to cached MMIO Address Space

2024-02-29 Thread Jonathan Cameron via
On Thu, 15 Feb 2024 14:28:14 +
Jonathan Cameron via  wrote:

Any comments?  Almost all the other fixes I need for CXL memory to
work as normal ram are queued up so I'd love it if we can solve this one as
well.

This looks like a big series, but it's really just a refactor + trivial
addition - so shouldn't be too scary!

Jonathan

> Issue seen testing virtio-blk-pci with CXL emulated interleave memory.
> Tests were done on arm64, but the issue isn't architecture specific.
> Note that some additional fixes are needed to TCG to be able to run far
> enough to hit this on arm64 or x86. They are issues so I'll post separate
> series shortly.
> 
> The address_space_read_cached_slow() and address_space_write_cached_slow()
> functions query the MemoryRegion for the cached address space correctly
> using address_space_translate_cached() but then call into
> flatview_read_continue() / flatview_write_continue()
> If the access is to a MMIO MemoryRegion and is bigger than the MemoryRegion
> supports, the loop will query the MemoryRegion for the next access to use.
> That query uses flatview_translate() but the address passed is suitable
> for the cache, not the flatview. On my test setup that mean the second
> 8 bytes and onwards of the virtio descriptor was read from flash memory
> at the beginning of the system address map, not the CXL emulated memory
> where the descriptor was found.  Result happened to be all fs so easy to
> spot.
> 
> Changes these calls to use address_space_translate_cached() to get the
> correct MemoryRegion for the cache. To avoid duplicating most of the
> code, the first 2 patches factor out the common parts of
> flatview_read_continue() and flatview_write_continue() so they can
> be reused.
> 
> Write path has not been tested but it so similar to the read path I've
> included it here.
> 
> Jonathan Cameron (3):
>   physmem: Reduce local variable scope in flatview_read/write_continue()
>   physmem: Factor out body of flatview_read/write_continue() loop
>   physmem: Fix wrong MR in large address_space_read/write_cached_slow()
> 
>  system/physmem.c | 245 ---
>  1 file changed, 170 insertions(+), 75 deletions(-)
> 




Re: [PATCH v7 0/5] Support message-based DMA in vfio-user server

2024-02-29 Thread Mattias Nissler
Hi,

I actually failed to carry forward the Reviewed-by tags from Jag,
Phillipe and Stefan as well when reposting even though I didn't make
any non-trivial changes to the respective patches. I intend to post
another version with the respective tags restored, but I'll give you a
day or two to speak up if you disagree.

Thanks,
Mattias

On Tue, Feb 20, 2024 at 6:06 AM Peter Xu  wrote:
>
> On Mon, Feb 12, 2024 at 12:06:12AM -0800, Mattias Nissler wrote:
> > Changes from v6:
> >
> > * Rebase, resolve straightforward merge conflict in system/dma-helpers.c
>
> Hi, Mattias,
>
> If the change is trivial, feel free to carry over my R-bs in the first two
> patches in the commit message.
>
> Thanks,
>
> --
> Peter Xu
>



Re: [PATCH v5 30/65] i386/tdx: Support user configurable mrconfigid/mrowner/mrownerconfig

2024-02-29 Thread Xiaoyao Li

On 2/29/2024 4:37 PM, Markus Armbruster wrote:

Xiaoyao Li  writes:


From: Isaku Yamahata 

Three sha384 hash values, mrconfigid, mrowner and mrownerconfig, of a TD
can be provided for TDX attestation. Detailed meaning of them can be
found: 
https://lore.kernel.org/qemu-devel/31d6dbc1-f453-4cef-ab08-4813f4e0f...@intel.com/

Allow user to specify those values via property mrconfigid, mrowner and
mrownerconfig. They are all in base64 format.

example
-object tdx-guest, \
   mrconfigid=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v,\
   mrowner=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v,\
   
mrownerconfig=ASNFZ4mrze8BI0VniavN7wEjRWeJq83vASNFZ4mrze8BI0VniavN7wEjRWeJq83v

Signed-off-by: Isaku Yamahata 
Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 

---
Changes in v5:
  - refine the description of QAPI properties and add description of
default value when not specified;

Changes in v4:
  - describe more of there fields in qom.json
  - free the old value before set new value to avoid memory leak in
_setter(); (Daniel)

Changes in v3:
  - use base64 encoding instread of hex-string;
---
  qapi/qom.json | 17 -
  target/i386/kvm/tdx.c | 87 +++
  target/i386/kvm/tdx.h |  3 ++
  3 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index 89ed89b9b46e..cac875349a3a 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -905,10 +905,25 @@
  # pages.  Some guest OS (e.g., Linux TD guest) may require this to
  # be set, otherwise they refuse to boot.
  #
+# @mrconfigid: ID for non-owner-defined configuration of the guest TD,
+# e.g., run-time or OS configuration (base64 encoded SHA384 digest).
+# (A default value 0 of SHA384 is used when absent).


Suggest to drop the parenthesis in the last sentence.

@mrconfigid is a string, so the default value can't be 0.  Actually,
it's not just any string, but a base64 encoded SHA384 digest, which
means it must be exactly 96 hex digits.  So it can't be "0", either.  It
could be
"".


I thought value 0 of SHA384 just means it.

That's my fault and my poor english.


More on this below.


+#
+# @mrowner: ID for the guest TD’s owner (base64 encoded SHA384 digest).
+# (A default value 0 of SHA384 is used when absent).
+#
+# @mrownerconfig: ID for owner-defined configuration of the guest TD,
+# e.g., specific to the workload rather than the run-time or OS
+# (base64 encoded SHA384 digest). (A default value 0 of SHA384 is
+# used when absent).
+#
  # Since: 9.0
  ##
  { 'struct': 'TdxGuestProperties',
-  'data': { '*sept-ve-disable': 'bool' } }
+  'data': { '*sept-ve-disable': 'bool',
+'*mrconfigid': 'str',
+'*mrowner': 'str',
+'*mrownerconfig': 'str' } }
  
  ##

  # @ThreadContextProperties:
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index d0ad4f57b5d0..4ce2f1d082ce 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -13,6 +13,7 @@
  
  #include "qemu/osdep.h"

  #include "qemu/error-report.h"
+#include "qemu/base64.h"
  #include "qapi/error.h"
  #include "qom/object_interfaces.h"
  #include "standard-headers/asm-x86/kvm_para.h"
@@ -516,6 +517,7 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
  X86CPU *x86cpu = X86_CPU(cpu);
  CPUX86State *env = &x86cpu->env;
  g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
+size_t data_len;
  int r = 0;
  
  object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);

@@ -528,6 +530,38 @@ int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
  init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
  sizeof(struct kvm_cpuid_entry2) * 
KVM_MAX_CPUID_ENTRIES);
  
+#define SHA384_DIGEST_SIZE  48

+
+if (tdx_guest->mrconfigid) {
+g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
+  strlen(tdx_guest->mrconfigid), &data_len, errp);
+if (!data || data_len != SHA384_DIGEST_SIZE) {
+error_setg(errp, "TDX: failed to decode mrconfigid");
+return -1;
+}
+memcpy(init_vm->mrconfigid, data, data_len);
+}


When @mrconfigid is absent, the property remains null, and this
conditional is not executed.  init_vm->mrconfigid[], an array of 6
__u64, remains all zero.  How does the kernel treat that?


An all-zero SHA384 value is still a valid value, isn't it?

KVM treats it with no difference.


+
+if (tdx_guest->mrowner) {
+g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
+  strlen(tdx_guest->mrowner), &data_len, errp);
+if (!data || data_len != SHA384_DIGEST_SIZE) {
+error_setg(errp, "TDX: failed to decode mrowner");
+return -1;
+}
+memcpy(init_vm->mrowner, data, data_len);

Re: [PATCH] chardev/char-socket: Fix TLS io channels sending too much data to the backend

2024-02-29 Thread Marc-André Lureau
On Thu, Feb 29, 2024 at 2:43 PM Thomas Huth  wrote:
>
> Commit ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
> changed the behavior of the TLS io channels to schedule a second reading
> attempt if there is still incoming data pending. This caused a regression
> with backends like the sclpconsole that check in their read function that
> the sender does not try to write more bytes to it than the device can
> currently handle.
>
> The problem can be reproduced like this:
>
>  1) In one terminal, do this:
>
>   mkdir qemu-pki
>   cd qemu-pki
>   openssl genrsa 2048 > ca-key.pem
>   openssl req -new -x509 -nodes -days 365000 -key ca-key.pem -out ca-cert.pem
>   # enter some dummy value for the cert
>   openssl genrsa 2048 > server-key.pem
>   openssl req -new -x509 -nodes -days 365000 -key server-key.pem \
> -out server-cert.pem
>   # enter some other dummy values for the cert
>
>   gnutls-serv --echo --x509cafile ca-cert.pem --x509keyfile server-key.pem \
>   --x509certfile server-cert.pem -p 8338
>
>  2) In another terminal, do this:
>
>   wget 
> https://download.fedoraproject.org/pub/fedora-secondary/releases/39/Cloud/s390x/images/Fedora-Cloud-Base-39-1.5.s390x.qcow2
>
>   qemu-system-s390x -nographic -nodefaults \
> -hda Fedora-Cloud-Base-39-1.5.s390x.qcow2 \
> -object 
> tls-creds-x509,id=tls0,endpoint=client,verify-peer=false,dir=$PWD/qemu-pki \
> -chardev socket,id=tls_chardev,host=localhost,port=8338,tls-creds=tls0 \
> -device sclpconsole,chardev=tls_chardev,id=tls_serial
>
> QEMU then aborts after a second or two with:
>
>   qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion
>`size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed.
>  Aborted (core dumped)
>
> It looks like the second read does not trigger the chr_can_read() function
> to be called before the second read, which should normally always be done
> before sending bytes to a character device to see how much it can handle,
> so the s->max_size in tcp_chr_read() still contains the old value from the
> previous read. Let's make sure that we use the up-to-date value by calling
> tcp_chr_read_poll() again here.
>
> Fixes: ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
> Buglink: https://issues.redhat.com/browse/RHEL-24614
> Reviewed-by: Daniel P. Berrangé 


> Signed-off-by: Thomas Huth 

Reviewed-by: Marc-André Lureau 
> ---
>  Sorry if you've got this mail twice - I forgot to CC: qemu-devel when
>  I sent it out the first time ... *facepalm*
>
>  chardev/char-socket.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/chardev/char-socket.c b/chardev/char-socket.c
> index 67e3334423..8a0406cc1e 100644
> --- a/chardev/char-socket.c
> +++ b/chardev/char-socket.c
> @@ -496,9 +496,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, 
> GIOCondition cond, void *opaque)
>  s->max_size <= 0) {
>  return TRUE;
>  }
> -len = sizeof(buf);
> -if (len > s->max_size) {
> -len = s->max_size;
> +len = tcp_chr_read_poll(opaque);
> +if (len > sizeof(buf)) {
> +len = sizeof(buf);
>  }
>  size = tcp_chr_recv(chr, (void *)buf, len);
>  if (size == 0 || (size == -1 && errno != EAGAIN)) {
> --
> 2.44.0
>




Re: [PATCH 06/19] smbios: get rid of smbios_legacy global

2024-02-29 Thread Ani Sinha



> On 27-Feb-2024, at 21:17, Igor Mammedov  wrote:
> 
> clean up smbios_set_defaults() which is reused by legacy
> and non legacy machines from being aware of 'legacy' notion
> and need to turn it off. And push legacy handling up to
> PC machine code where it's relevant.
> 
> Signed-off-by: Igor Mammedov 
> ---
> PS: I've moved/kept legacy smbios_entries to smbios_get_tables()
> but it at least is not visible to API users. To get rid of it
> as well, it would be necessary to change how '-smbios' CLI
> option is processed. Which is done later in the series.
> ---
> include/hw/firmware/smbios.h |  2 +-
> hw/arm/virt.c|  2 +-
> hw/i386/fw_cfg.c |  7 ---
> hw/loongarch/virt.c  |  2 +-
> hw/riscv/virt.c  |  2 +-
> hw/smbios/smbios.c   | 35 +++
> 6 files changed, 23 insertions(+), 27 deletions(-)
> 
> diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h
> index a187fbbd3d..0818184834 100644
> --- a/include/hw/firmware/smbios.h
> +++ b/include/hw/firmware/smbios.h
> @@ -293,7 +293,7 @@ struct smbios_type_127 {
> void smbios_entry_add(QemuOpts *opts, Error **errp);
> void smbios_set_cpuid(uint32_t version, uint32_t features);
> void smbios_set_defaults(const char *manufacturer, const char *product,
> - const char *version, bool legacy_mode,
> + const char *version,
>  bool uuid_encoded, SmbiosEntryPointType ep_type);
> void smbios_set_default_processor_family(uint16_t processor_family);
> uint8_t *smbios_get_table_legacy(uint32_t expected_t4_count, size_t *length);
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 0af1943697..8588681f27 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -1633,7 +1633,7 @@ static void virt_build_smbios(VirtMachineState *vms)
> }
> 
> smbios_set_defaults("QEMU", product,
> -vmc->smbios_old_sys_ver ? "1.0" : mc->name, false,
> +vmc->smbios_old_sys_ver ? "1.0" : mc->name,
> true, SMBIOS_ENTRY_POINT_TYPE_64);
> 
> /* build the array of physical mem area from base_memmap */
> diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
> index fcb4fb0769..c1e9c0fd9c 100644
> --- a/hw/i386/fw_cfg.c
> +++ b/hw/i386/fw_cfg.c
> @@ -63,15 +63,16 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
> *fw_cfg)
> if (pcmc->smbios_defaults) {
> /* These values are guest ABI, do not change */
> smbios_set_defaults("QEMU", mc->desc, mc->name,
> -pcmc->smbios_legacy_mode, 
> pcmc->smbios_uuid_encoded,
> +pcmc->smbios_uuid_encoded,
> pcms->smbios_entry_point_type);
> }
> 
> /* tell smbios about cpuid version and features */
> smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
> 
> -smbios_tables = smbios_get_table_legacy(ms->smp.cpus, 
> &smbios_tables_len);
> -if (smbios_tables) {
> +if (pcmc->smbios_legacy_mode) {
> +smbios_tables = smbios_get_table_legacy(ms->smp.cpus,
> +&smbios_tables_len);
> fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES,
>  smbios_tables, smbios_tables_len);
> return;
> diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
> index 0ad7d8c887..73fb3522ba 100644
> --- a/hw/loongarch/virt.c
> +++ b/hw/loongarch/virt.c
> @@ -320,7 +320,7 @@ static void virt_build_smbios(LoongArchMachineState *lams)
> return;
> }
> 
> -smbios_set_defaults("QEMU", product, mc->name, false,
> +smbios_set_defaults("QEMU", product, mc->name,
> true, SMBIOS_ENTRY_POINT_TYPE_64);
> 
> smbios_get_tables(ms, NULL, 0, &smbios_tables, &smbios_tables_len,
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index fd35c74781..e2c9529df2 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -1235,7 +1235,7 @@ static void virt_build_smbios(RISCVVirtState *s)
> product = "KVM Virtual Machine";
> }
> 
> -smbios_set_defaults("QEMU", product, mc->name, false,
> +smbios_set_defaults("QEMU", product, mc->name,
> true, SMBIOS_ENTRY_POINT_TYPE_64);
> 
> if (riscv_is_32bit(&s->soc[0])) {
> diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
> index 15339d8dbe..c46fc93357 100644
> --- a/hw/smbios/smbios.c
> +++ b/hw/smbios/smbios.c
> @@ -54,7 +54,6 @@ struct smbios_table {
> 
> static uint8_t *smbios_entries;
> static size_t smbios_entries_len;
> -static bool smbios_legacy = true;
> static bool smbios_uuid_encoded = true;
> /* end: legacy structures & constants for <= 2.0 machines */
> 
> @@ -570,9 +569,16 @@ static void smbios_build_type_1_fields(void)
> 
> uint8_t *smbios_get_table_legacy(uint32_t expected_t4_count, size_t *length)
> {
> -if (!smbios_legacy) {
> -*length = 0;
> -return NULL;

Re: [PATCH v5 49/65] i386/tdx: handle TDG.VP.VMCALL

2024-02-29 Thread Xiaoyao Li

On 2/29/2024 4:40 PM, Markus Armbruster wrote:

Xiaoyao Li  writes:


From: Isaku Yamahata 

Add property "quote-generation-socket" to tdx-guest, which is a property
of type SocketAddress to specify Quote Generation Service(QGS).

On request of GetQuote, it connects to the QGS socket, read request
data from shared guest memory, send the request data to the QGS,
and store the response into shared guest memory, at last notify
TD guest by interrupt.

command line example:
   qemu-system-x86_64 \
 -object '{"qom-type":"tdx-guest","id":"tdx0","quote-generation-socket":{"type": "vsock", 
"cid":"1","port":"1234"}}' \
 -machine confidential-guest-support=tdx0

Note, above example uses vsock type socket because the QGS we used
implements the vsock socket. It can be other types, like UNIX socket,
which depends on the implementation of QGS.

To avoid no response from QGS server, setup a timer for the transaction.
If timeout, make it an error and interrupt guest. Define the threshold of
time to 30s at present, maybe change to other value if not appropriate.

Signed-off-by: Isaku Yamahata 
Codeveloped-by: Chenyi Qiang 
Signed-off-by: Chenyi Qiang 
Codeveloped-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 


[...]


diff --git a/qapi/qom.json b/qapi/qom.json
index cac875349a3a..7b26b0a0d3aa 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -917,13 +917,19 @@
  # (base64 encoded SHA384 digest). (A default value 0 of SHA384 is
  # used when absent).
  #
+# @quote-generation-socket: socket address for Quote Generation
+# Service (QGS).  QGS is a daemon running on the host.  User in
+# TD guest cannot get TD quoting for attestation if QGS is not
+# provided.  So admin should always provide it.


This makes me wonder why it's optional.  Can you describe a use case for
*not* specifying @quote-generation-socket?


Maybe at last when all the TDX support lands on all the components, 
attestation will become a must for a TD guest to be usable.


However, at least for today, booting and running a TD guest don't 
require attestation. So not providing it doesn't affect anything, 
except that you cannot get a Quote.



+#
  # Since: 9.0
  ##
  { 'struct': 'TdxGuestProperties',
'data': { '*sept-ve-disable': 'bool',
  '*mrconfigid': 'str',
  '*mrowner': 'str',
-'*mrownerconfig': 'str' } }
+'*mrownerconfig': 'str',
+'*quote-generation-socket': 'SocketAddress' } }
  
  ##

  # @ThreadContextProperties:


[...]






[PATCH] qapi: Fix format of the memory-backend-file's @rom property doc comment

2024-02-29 Thread Stefano Garzarella
Reflow paragraph following commit a937b6aa73 ("qapi: Reformat doc
comments to conform to current conventions"): use 4 spaces indentation,
70 columns width, and two spaces to separate sentences.

Suggested-by: Markus Armbruster 
Signed-off-by: Stefano Garzarella 
---
 qapi/qom.json | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index 2a6e49365a..db1b0fdea2 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -668,19 +668,20 @@
 # @readonly: if true, the backing file is opened read-only; if false,
 # it is opened read-write.  (default: false)
 #
-# @rom: whether to create Read Only Memory (ROM) that cannot be modified
-#   by the VM.  Any write attempts to such ROM will be denied.  Most
-#   use cases want writable RAM instead of ROM.  However, selected use
-#   cases, like R/O NVDIMMs, can benefit from ROM.  If set to 'on',
-#   create ROM; if set to 'off', create writable RAM;  if set to
-#   'auto', the value of the @readonly property is used.  This
-#   property is primarily helpful when we want to have proper RAM in
-#   configurations that would traditionally create ROM before this
-#   property was introduced: VM templating, where we want to open a
-#   file readonly (@readonly set to true) and mark the memory to be
-#   private for QEMU (@share set to false).  For this use case, we need
-#   writable RAM instead of ROM, and want to set this property to 'off'.
-#   (default: auto, since 8.2)
+# @rom: whether to create Read Only Memory (ROM) that cannot be
+# modified by the VM.  Any write attempts to such ROM will be
+# denied.  Most use cases want writable RAM instead of ROM.
+# However, selected use cases, like R/O NVDIMMs, can benefit from
+# ROM.  If set to 'on', create ROM; if set to 'off', create
+# writable RAM; if set to 'auto', the value of the @readonly
+# property is used.  This property is primarily helpful when we
+# want to have proper RAM in configurations that would
+# traditionally create ROM before this property was introduced: VM
+# templating, where we want to open a file readonly (@readonly set
+# to true) and mark the memory to be private for QEMU (@share set
+# to false).  For this use case, we need writable RAM instead of
+# ROM, and want to set this property to 'off'.  (default: auto,
+# since 8.2)
 #
 # Since: 2.1
 ##
-- 
2.44.0




Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Jonathan Cameron via
On Thu, 29 Feb 2024 09:38:29 +
Peter Maydell  wrote:

> On Wed, 28 Feb 2024 at 19:07, Heinrich Schuchardt
>  wrote:
> >
> > On 28.02.24 19:39, Peter Maydell wrote:  
> > > The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> > > which was the first implementation of this function. I don't think
> > > there's a particular reason for that value beyond that it was
> > > probably a convenient value that was assumed to be likely "big enough".
> > >
> > > I think the idea with this bounce-buffer has always been that this
> > > isn't really a code path we expected to end up in very often --
> > > it's supposed to be for when devices are doing DMA, which they
> > > will typically be doing to memory (backed by host RAM), not
> > > devices (backed by MMIO and needing a bounce buffer). So the
> > > whole mechanism is a bit "last fallback to stop things breaking
> > > entirely".
> > >
> > > The address_space_map() API says that it's allowed to return
> > > a subset of the range you ask for, so if the virtio code doesn't
> > > cope with the minimum being set to TARGET_PAGE_SIZE then either
> > > we need to fix that virtio code or we need to change the API
> > > of this function. (But I think you will also get a reduced
> > > range if you try to use it across a boundary between normal
> > > host-memory-backed RAM and a device MemoryRegion.)  
> >
> > If we allow a bounce buffer only to be used once (via the in_use flag),
> > why do we allow only a single bounce buffer?
> >
> > Could address_space_map() allocate a new bounce buffer on every call and
> > address_space_unmap() deallocate it?
> >
> > Isn't the design with a single bounce buffer bound to fail with a
> > multi-threaded client as collision can be expected?  
> 
> Yeah, I don't suppose multi-threaded was particularly expected.
> Again, this is really a "handle the case where the guest does
> something silly" setup, which is why only one bounce buffer.
> 
> Why is your guest ending up in the bounce-buffer path?

Happens for me with emulated CXL memory. I think the case I saw
was split descriptors in virtio via address space caches
https://elixir.bootlin.com/qemu/latest/source/hw/virtio/virtio.c#L4043

One bounce buffer is in use for the outer loop and another for the descriptors
it is pointing to.

Matthias' series makes this work fine.  I need to circle back and check
how big a cache this needs.  I'm carrying a silly size because of the
side effect of the address space bug here
https://lore.kernel.org/qemu-devel/20240215142817.1904-1-jonathan.came...@huawei.com/#t
and can probably set it to much less than my currently 1GiB.

Jonathan

> 
> -- PMM
> 




Re: [PATCH v7] arm/kvm: Enable support for KVM_ARM_VCPU_PMU_V3_FILTER

2024-02-29 Thread Peter Maydell
On Thu, 29 Feb 2024 at 02:32, Shaoqin Huang  wrote:
>
> Hi Peter,
>
> On 2/22/24 22:28, Peter Maydell wrote:
> > On Wed, 21 Feb 2024 at 06:34, Shaoqin Huang  wrote:
> >>
> >> The KVM_ARM_VCPU_PMU_V3_FILTER provides the ability to let the VMM decide
> >> which PMU events are provided to the guest. Add a new option
> >> `kvm-pmu-filter` as -cpu sub-option to set the PMU Event Filtering.
> >> Without the filter, all PMU events are exposed from host to guest by
> >> default. The usage of the new sub-option can be found from the updated
> >> document (docs/system/arm/cpu-features.rst).
> >>
> >> Here is an example which shows how to use the PMU Event Filtering, when
> >> we launch a guest by use kvm, add such command line:
> >>
> >># qemu-system-aarch64 \
> >>  -accel kvm \
> >>  -cpu host,kvm-pmu-filter="D:0x11-0x11"
> >>
> >> Since the first action is deny, we have a global allow policy. This
> >> filters out the cycle counter (event 0x11 being CPU_CYCLES).
> >>
> >> And then in guest, use the perf to count the cycle:
> >>
> >># perf stat sleep 1
> >>
> >> Performance counter stats for 'sleep 1':
> >>
> >>1.22 msec task-clock   #0.001 CPUs 
> >> utilized
> >>   1  context-switches #  820.695 /sec
> >>   0  cpu-migrations   #0.000 /sec
> >>  55  page-faults  #   45.138 K/sec
> >>   cycles
> >> 1128954  instructions
> >>  227031  branches #  186.323 M/sec
> >>8686  branch-misses#3.83% of 
> >> all branches
> >>
> >> 1.002492480 seconds time elapsed
> >>
> >> 0.001752000 seconds user
> >> 0.0 seconds sys
> >>
> >> As we can see, the cycle counter has been disabled in the guest, but
> >> other pmu events do still work.

> >
> > The new syntax for the filter property seems quite complicated.
> > I think it would be worth testing it with a new test in
> > tests/qtest/arm-cpu-features.c.
>
> I was trying to add a test in tests/qtest/arm-cpu-features.c. But I
> found all other cpu-feature is bool property.
>
> When I use the 'query-cpu-model-expansion' to query the cpu-features,
> the kvm-pmu-filter will not shown in the returned results, just like below.
>
> {'execute': 'query-cpu-model-expansion', 'arguments': {'type': 'full',
> 'model': { 'name': 'host'}}}{"return": {}}
>
> {"return": {"model": {"name": "host", "props": {"sve768": false,
> "sve128": false, "sve1024": false, "sve1280": false, "sve896": false,
> "sve256": false, "sve1536": false, "sve1792": false, "sve384": false,
> "sve": false, "sve2048": false, "pauth": false, "kvm-no-adjvtime":
> false, "sve512": false, "aarch64": true, "pmu": true, "sve1920": false,
> "sve1152": false, "kvm-steal-time": true, "sve640": false, "sve1408":
> false, "sve1664": false
>
> I'm not sure if it's because the `query-cpu-model-expansion` only return
> the feature which is bool. Since the kvm-pmu-filter is a str, it won't
> be recognized as a feature.
>
> So I want to ask how can I add the kvm-pmu-filter which is str property
> into the cpu-feature.c test.

It doesn't appear because the list of properties that we advertise
via query-cpu-model-expansion is set in the cpu_model_advertised_features[]
array in target/arm/arm-qmp-cmds.c, and this patch doesn't add
'kvm-pmu-filter' to it. But you have a good point about all the
others being bool properties: I don't know enough about that
mechanism to know if simply adding this to the list is right.

This does raise a more general question: do we need to advertise
the existence of this property to libvirt via QMP? Eric, Sebastian:
do you know ?

If we don't care about this being visible to libvirt then the
importance of having a test case covering the command line
syntax goes down a bit.

> >>
> >> +static void kvm_arm_pmu_filter_init(ARMCPU *cpu)
> >> +{
> >> +static bool pmu_filter_init;
> >> +struct kvm_pmu_event_filter filter;
> >> +struct kvm_device_attr attr = {
> >> +.group  = KVM_ARM_VCPU_PMU_V3_CTRL,
> >> +.attr   = KVM_ARM_VCPU_PMU_V3_FILTER,
> >> +.addr   = (uint64_t)&filter,
> >> +};
> >> +int i;
> >> +g_auto(GStrv) event_filters;
> >> +
> >> +if (!cpu->kvm_pmu_filter) {
> >> +return;
> >> +}
> >> +if (kvm_vcpu_ioctl(CPU(cpu), KVM_HAS_DEVICE_ATTR, &attr)) {
> >> +warn_report("The KVM doesn't support the PMU Event Filter!");
> >
> > Drop "The ".
> >
> > Should this really only be a warning, rather than an error?
> >
>
> I think this is an add-on feature, and shouldn't block the qemu init
> process. If we set a wrong pmu filter and it doesn't take effect in
> the VM, that can be detected from within the VM.

But if the user explicitly asked for it, it's not optional
for them, it's something they want. We should fail.

Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()

2024-02-29 Thread Stefano Garzarella

On Thu, Feb 29, 2024 at 11:28:37AM +0100, Markus Armbruster wrote:

Stefano Garzarella  writes:


On Wed, Feb 28, 2024 at 01:32:17PM +0100, Markus Armbruster wrote:

Stefano Garzarella  writes:


[...]


+# @shm: if true, shm_open(3) is used to create/open POSIX shared memory
+#   object; if false, an open(2) is used. (default: false) (since 9.0)
+#


Please format like this for consistency:


Sure.



# @shm: if true, shm_open(3) is used to create/open POSIX shared memory
# object; if false, an open(2) is used (default: false) (since 9.0)


I just noticed that I followed the property just above (@rom). Should we fix 
that one?


Yes, please.


Done: 
https://patchew.org/QEMU/20240229105826.16354-1-sgarz...@redhat.com/


Thanks,
Stefano



See commit a937b6aa739 (qapi: Reformat doc comments to conform to
current conventions).






Re: [PATCH 1/9] Hexagon (target/hexagon) Add is_old/is_new to Register class

2024-02-29 Thread Philippe Mathieu-Daudé

On 26/2/24 21:17, Taylor Simpson wrote:

Signed-off-by: Taylor Simpson 
---
  target/hexagon/hex_common.py | 14 +-
  1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py
index 195620c7ec..4bacef223f 100755
--- a/target/hexagon/hex_common.py
+++ b/target/hexagon/hex_common.py
@@ -1,7 +1,7 @@
  #!/usr/bin/env python3
  
  ##

-##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights 
Reserved.
+##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights 
Reserved.


:)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Peter Maydell
On Thu, 29 Feb 2024 at 10:59, Jonathan Cameron
 wrote:
>
> On Thu, 29 Feb 2024 09:38:29 +
> Peter Maydell  wrote:
>
> > On Wed, 28 Feb 2024 at 19:07, Heinrich Schuchardt
> >  wrote:
> > >
> > > On 28.02.24 19:39, Peter Maydell wrote:
> > > > The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> > > > which was the first implementation of this function. I don't think
> > > > there's a particular reason for that value beyond that it was
> > > > probably a convenient value that was assumed to be likely "big enough".
> > > >
> > > > I think the idea with this bounce-buffer has always been that this
> > > > isn't really a code path we expected to end up in very often --
> > > > it's supposed to be for when devices are doing DMA, which they
> > > > will typically be doing to memory (backed by host RAM), not
> > > > devices (backed by MMIO and needing a bounce buffer). So the
> > > > whole mechanism is a bit "last fallback to stop things breaking
> > > > entirely".
> > > >
> > > > The address_space_map() API says that it's allowed to return
> > > > a subset of the range you ask for, so if the virtio code doesn't
> > > > cope with the minimum being set to TARGET_PAGE_SIZE then either
> > > > we need to fix that virtio code or we need to change the API
> > > > of this function. (But I think you will also get a reduced
> > > > range if you try to use it across a boundary between normal
> > > > host-memory-backed RAM and a device MemoryRegion.)
> > >
> > > If we allow a bounce buffer only to be used once (via the in_use flag),
> > > why do we allow only a single bounce buffer?
> > >
> > > Could address_space_map() allocate a new bounce buffer on every call and
> > > address_space_unmap() deallocate it?
> > >
> > > Isn't the design with a single bounce buffer bound to fail with a
> > > multi-threaded client as collision can be expected?
> >
> > Yeah, I don't suppose multi-threaded was particularly expected.
> > Again, this is really a "handle the case where the guest does
> > something silly" setup, which is why only one bounce buffer.
> >
> > Why is your guest ending up in the bounce-buffer path?
>
> Happens for me with emulated CXL memory.

Can we put that in the "something silly" bucket? :-)
But yes, I'm not surprised that CXL runs into this. Heinrich,
are you doing CXL testing, or is this some other workload?

> I think the case I saw
> was split descriptors in virtio via address space caches
> https://elixir.bootlin.com/qemu/latest/source/hw/virtio/virtio.c#L4043
>
> One bounce buffer is in use for the outer loop and another for the descriptors
> it is pointing to.

Mmm. The other assumption made in the design of the address_space_map()
API I think was that it was unlikely that a device would be trying
to do two DMA operations simultaneously. This is clearly not
true in practice. We definitely need to fix one end or other of
this API.

(I'm not sure why the bounce-buffer limit ought to be per-AddressSpace:
is that just done in Matthias' series so that we can attach an
x-thingy property to the individual PCI device?)

-- PMM



Re: [PATCH] qapi: Fix format of the memory-backend-file's @rom property doc comment

2024-02-29 Thread David Hildenbrand

On 29.02.24 11:58, Stefano Garzarella wrote:

Reflow paragraph following commit a937b6aa73 ("qapi: Reformat doc
comments to conform to current conventions"): use 4 spaces indentation,
70 columns width, and two spaces to separate sentences.

Suggested-by: Markus Armbruster 
Signed-off-by: Stefano Garzarella 
---
  qapi/qom.json | 27 ++-
  1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index 2a6e49365a..db1b0fdea2 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -668,19 +668,20 @@
  # @readonly: if true, the backing file is opened read-only; if false,
  # it is opened read-write.  (default: false)
  #
-# @rom: whether to create Read Only Memory (ROM) that cannot be modified
-#   by the VM.  Any write attempts to such ROM will be denied.  Most
-#   use cases want writable RAM instead of ROM.  However, selected use
-#   cases, like R/O NVDIMMs, can benefit from ROM.  If set to 'on',
-#   create ROM; if set to 'off', create writable RAM;  if set to
-#   'auto', the value of the @readonly property is used.  This
-#   property is primarily helpful when we want to have proper RAM in
-#   configurations that would traditionally create ROM before this
-#   property was introduced: VM templating, where we want to open a
-#   file readonly (@readonly set to true) and mark the memory to be
-#   private for QEMU (@share set to false).  For this use case, we need
-#   writable RAM instead of ROM, and want to set this property to 'off'.
-#   (default: auto, since 8.2)
+# @rom: whether to create Read Only Memory (ROM) that cannot be
+# modified by the VM.  Any write attempts to such ROM will be
+# denied.  Most use cases want writable RAM instead of ROM.
+# However, selected use cases, like R/O NVDIMMs, can benefit from
+# ROM.  If set to 'on', create ROM; if set to 'off', create
+# writable RAM; if set to 'auto', the value of the @readonly
+# property is used.  This property is primarily helpful when we
+# want to have proper RAM in configurations that would
+# traditionally create ROM before this property was introduced: VM
+# templating, where we want to open a file readonly (@readonly set
+# to true) and mark the memory to be private for QEMU (@share set
+# to false).  For this use case, we need writable RAM instead of
+# ROM, and want to set this property to 'off'.  (default: auto,
+# since 8.2)
  #
  # Since: 2.1
  ##


Ideally, we'd have a format checker that complains like checkpatch 
usually would.


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Heinrich Schuchardt

On 29.02.24 12:11, Peter Maydell wrote:

On Thu, 29 Feb 2024 at 10:59, Jonathan Cameron
 wrote:


On Thu, 29 Feb 2024 09:38:29 +
Peter Maydell  wrote:


On Wed, 28 Feb 2024 at 19:07, Heinrich Schuchardt
 wrote:


On 28.02.24 19:39, Peter Maydell wrote:

The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
which was the first implementation of this function. I don't think
there's a particular reason for that value beyond that it was
probably a convenient value that was assumed to be likely "big enough".

I think the idea with this bounce-buffer has always been that this
isn't really a code path we expected to end up in very often --
it's supposed to be for when devices are doing DMA, which they
will typically be doing to memory (backed by host RAM), not
devices (backed by MMIO and needing a bounce buffer). So the
whole mechanism is a bit "last fallback to stop things breaking
entirely".

The address_space_map() API says that it's allowed to return
a subset of the range you ask for, so if the virtio code doesn't
cope with the minimum being set to TARGET_PAGE_SIZE then either
we need to fix that virtio code or we need to change the API
of this function. (But I think you will also get a reduced
range if you try to use it across a boundary between normal
host-memory-backed RAM and a device MemoryRegion.)


If we allow a bounce buffer only to be used once (via the in_use flag),
why do we allow only a single bounce buffer?

Could address_space_map() allocate a new bounce buffer on every call and
address_space_unmap() deallocate it?

Isn't the design with a single bounce buffer bound to fail with a
multi-threaded client as collision can be expected?


Yeah, I don't suppose multi-threaded was particularly expected.
Again, this is really a "handle the case where the guest does
something silly" setup, which is why only one bounce buffer.

Why is your guest ending up in the bounce-buffer path?


Happens for me with emulated CXL memory.


Can we put that in the "something silly" bucket? :-)
But yes, I'm not surprised that CXL runs into this. Heinrich,
are you doing CXL testing, or is this some other workload?


I am running the UEFI Self-Certification Tests (SCT) on EDK 2 using:

qemu-system-riscv64 \
  -M virt,acpi=off -accel tcg -m 4096 \
  -serial mon:stdio \
  -device virtio-gpu-pci \
  -device qemu-xhci \
  -device usb-kbd \
  -drive 
if=pflash,format=raw,unit=0,file=RISCV_VIRT_CODE.fd,readonly=on \

  -drive if=pflash,format=raw,unit=1,file=RISCV_VIRT_VARS.fd \
  -drive file=sct.img,format=raw,if=virtio \
  -device virtio-net-device,netdev=net0 \
  -netdev user,id=net0

This does not invoke any CXL related stuff.

Best regards

Heinrich




I think the case I saw
was split descriptors in virtio via address space caches
https://elixir.bootlin.com/qemu/latest/source/hw/virtio/virtio.c#L4043

One bounce buffer is in use for the outer loop and another for the descriptors
it is pointing to.


Mmm. The other assumption made in the design of the address_space_map()
API I think was that it was unlikely that a device would be trying
to do two DMA operations simultaneously. This is clearly not
true in practice. We definitely need to fix one end or other of
this API.

(I'm not sure why the bounce-buffer limit ought to be per-AddressSpace:
is that just done in Matthias' series so that we can attach an
x-thingy property to the individual PCI device?)

-- PMM





Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Mattias Nissler
On Thu, Feb 29, 2024 at 12:12 PM Peter Maydell  wrote:
>
> On Thu, 29 Feb 2024 at 10:59, Jonathan Cameron
>  wrote:
> >
> > On Thu, 29 Feb 2024 09:38:29 +
> > Peter Maydell  wrote:
> >
> > > On Wed, 28 Feb 2024 at 19:07, Heinrich Schuchardt
> > >  wrote:
> > > >
> > > > On 28.02.24 19:39, Peter Maydell wrote:
> > > > > The limitation to a page dates back to commit 6d16c2f88f2a in 2009,
> > > > > which was the first implementation of this function. I don't think
> > > > > there's a particular reason for that value beyond that it was
> > > > > probably a convenient value that was assumed to be likely "big 
> > > > > enough".
> > > > >
> > > > > I think the idea with this bounce-buffer has always been that this
> > > > > isn't really a code path we expected to end up in very often --
> > > > > it's supposed to be for when devices are doing DMA, which they
> > > > > will typically be doing to memory (backed by host RAM), not
> > > > > devices (backed by MMIO and needing a bounce buffer). So the
> > > > > whole mechanism is a bit "last fallback to stop things breaking
> > > > > entirely".
> > > > >
> > > > > The address_space_map() API says that it's allowed to return
> > > > > a subset of the range you ask for, so if the virtio code doesn't
> > > > > cope with the minimum being set to TARGET_PAGE_SIZE then either
> > > > > we need to fix that virtio code or we need to change the API
> > > > > of this function. (But I think you will also get a reduced
> > > > > range if you try to use it across a boundary between normal
> > > > > host-memory-backed RAM and a device MemoryRegion.)
> > > >
> > > > If we allow a bounce buffer only to be used once (via the in_use flag),
> > > > why do we allow only a single bounce buffer?
> > > >
> > > > Could address_space_map() allocate a new bounce buffer on every call and
> > > > address_space_unmap() deallocate it?
> > > >
> > > > Isn't the design with a single bounce buffer bound to fail with a
> > > > multi-threaded client as collision can be expected?
> > >
> > > Yeah, I don't suppose multi-threaded was particularly expected.
> > > Again, this is really a "handle the case where the guest does
> > > something silly" setup, which is why only one bounce buffer.
> > >
> > > Why is your guest ending up in the bounce-buffer path?
> >
> > Happens for me with emulated CXL memory.
>
> Can we put that in the "something silly" bucket? :-)
> But yes, I'm not surprised that CXL runs into this. Heinrich,
> are you doing CXL testing, or is this some other workload?
>
> > I think the case I saw
> > was split descriptors in virtio via address space caches
> > https://elixir.bootlin.com/qemu/latest/source/hw/virtio/virtio.c#L4043
> >
> > One bounce buffer is in use for the outer loop and another for the 
> > descriptors
> > it is pointing to.
>
> Mmm. The other assumption made in the design of the address_space_map()
> API I think was that it was unlikely that a device would be trying
> to do two DMA operations simultaneously. This is clearly not
> true in practice. We definitely need to fix one end or other of
> this API.
>
> (I'm not sure why the bounce-buffer limit ought to be per-AddressSpace:
> is that just done in Matthias' series so that we can attach an
> x-thingy property to the individual PCI device?)

Yes, that's the result of review feedback to the early iterations of
my series. Specifically, (1) a limit is needed to prevent rogue guests
from hogging unlimited amounts of memory and (2) global parameters are
frowned upon. Setting a suitable limit is much more practical when
targeted at a given device/driver combination.



[PULL v2 0/1] loongarch-to-apply queue

2024-02-29 Thread Song Gao
The following changes since commit bfe8020c814a30479a4241aaa78b63960655962b:

  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging 
(2024-02-28 14:23:21 +)

are available in the Git repository at:

  https://gitlab.com/gaosong/qemu.git tags/pull-loongarch-20240229

for you to fetch changes up to c6e9847fc4becba561c631c4505e3b05d4926184:

  loongarch: Change the UEFI loading mode to loongarch (2024-02-29 19:32:45 
+0800)


pull-loongarch-20240229

V2: fix build error on mipsel


Xianglai Li (1):
  loongarch: Change the UEFI loading mode to loongarch

 hw/loongarch/acpi-build.c   |  29 +++--
 hw/loongarch/virt.c | 101 
 include/hw/loongarch/virt.h |  10 +++--
 3 files changed, 107 insertions(+), 33 deletions(-)




[PULL v2 1/1] loongarch: Change the UEFI loading mode to loongarch

2024-02-29 Thread Song Gao
From: Xianglai Li 

The UEFI loading mode in loongarch is very different
from that in other architectures: loongarch's UEFI code
is in rom, while other architectures' UEFI code is in flash.

loongarch UEFI can be loaded as follows:
-machine virt,pflash=pflash0-format
-bios ./QEMU_EFI.fd

Other architectures load UEFI using the following methods:
-machine virt,pflash0=pflash0-format,pflash1=pflash1-format

loongarch's UEFI loading method makes qemu and libvirt incompatible
when using NVRAM, and the cost of loongarch's current loading method
far outweighs the benefits, so we decided to use the same UEFI loading
scheme as other architectures.

Cc: Andrea Bolognani 
Cc: maob...@loongson.cn
Cc: Philippe Mathieu-Daudé 
Cc: Song Gao 
Cc: zhaotian...@loongson.cn
Signed-off-by: Xianglai Li 
Tested-by: Andrea Bolognani 
Reviewed-by: Song Gao 
Message-Id: 
<0bd892aa9b88e0f4cc904cb70efd0251fc1cde29.1708336919.git.lixiang...@loongson.cn>
Signed-off-by: Song Gao 
---
 hw/loongarch/acpi-build.c   |  29 +--
 hw/loongarch/virt.c | 101 ++--
 include/hw/loongarch/virt.h |  10 ++--
 3 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c
index a1c4198741..e5ab1080af 100644
--- a/hw/loongarch/acpi-build.c
+++ b/hw/loongarch/acpi-build.c
@@ -314,16 +314,39 @@ static void build_pci_device_aml(Aml *scope, 
LoongArchMachineState *lams)
 static void build_flash_aml(Aml *scope, LoongArchMachineState *lams)
 {
 Aml *dev, *crs;
+MemoryRegion *flash_mem;
 
-hwaddr flash_base = VIRT_FLASH_BASE;
-hwaddr flash_size = VIRT_FLASH_SIZE;
+hwaddr flash0_base;
+hwaddr flash0_size;
+
+hwaddr flash1_base;
+hwaddr flash1_size;
+
+flash_mem = pflash_cfi01_get_memory(lams->flash[0]);
+flash0_base = flash_mem->addr;
+flash0_size = memory_region_size(flash_mem);
+
+flash_mem = pflash_cfi01_get_memory(lams->flash[1]);
+flash1_base = flash_mem->addr;
+flash1_size = memory_region_size(flash_mem);
 
 dev = aml_device("FLS0");
 aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015")));
 aml_append(dev, aml_name_decl("_UID", aml_int(0)));
 
 crs = aml_resource_template();
-aml_append(crs, aml_memory32_fixed(flash_base, flash_size, 
AML_READ_WRITE));
+aml_append(crs, aml_memory32_fixed(flash0_base, flash0_size,
+   AML_READ_WRITE));
+aml_append(dev, aml_name_decl("_CRS", crs));
+aml_append(scope, dev);
+
+dev = aml_device("FLS1");
+aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015")));
+aml_append(dev, aml_name_decl("_UID", aml_int(1)));
+
+crs = aml_resource_template();
+aml_append(crs, aml_memory32_fixed(flash1_base, flash1_size,
+   AML_READ_WRITE));
 aml_append(dev, aml_name_decl("_CRS", crs));
 aml_append(scope, dev);
 }
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 0ad7d8c887..1e98d8bda5 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -54,7 +54,9 @@ struct loaderparams {
 const char *initrd_filename;
 };
 
-static void virt_flash_create(LoongArchMachineState *lams)
+static PFlashCFI01 *virt_flash_create1(LoongArchMachineState *lams,
+   const char *name,
+   const char *alias_prop_name)
 {
 DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01);
 
@@ -66,45 +68,78 @@ static void virt_flash_create(LoongArchMachineState *lams)
 qdev_prop_set_uint16(dev, "id1", 0x18);
 qdev_prop_set_uint16(dev, "id2", 0x00);
 qdev_prop_set_uint16(dev, "id3", 0x00);
-qdev_prop_set_string(dev, "name", "virt.flash");
-object_property_add_child(OBJECT(lams), "virt.flash", OBJECT(dev));
-object_property_add_alias(OBJECT(lams), "pflash",
+qdev_prop_set_string(dev, "name", name);
+object_property_add_child(OBJECT(lams), name, OBJECT(dev));
+object_property_add_alias(OBJECT(lams), alias_prop_name,
   OBJECT(dev), "drive");
+return PFLASH_CFI01(dev);
+}
 
-lams->flash = PFLASH_CFI01(dev);
+static void virt_flash_create(LoongArchMachineState *lams)
+{
+lams->flash[0] = virt_flash_create1(lams, "virt.flash0", "pflash0");
+lams->flash[1] = virt_flash_create1(lams, "virt.flash1", "pflash1");
 }
 
-static void virt_flash_map(LoongArchMachineState *lams,
-   MemoryRegion *sysmem)
+static void virt_flash_map1(PFlashCFI01 *flash,
+hwaddr base, hwaddr size,
+MemoryRegion *sysmem)
 {
-PFlashCFI01 *flash = lams->flash;
 DeviceState *dev = DEVICE(flash);
-hwaddr base = VIRT_FLASH_BASE;
-hwaddr size = VIRT_FLASH_SIZE;
+BlockBackend *blk;
+hwaddr real_size = size;
+
+blk = pflash_cfi01_get_blk(flash);
+if (blk) {
+real_size = blk_getlength(blk);
+assert(real_size && real_size <= size)

Re: [PATCH v7 2/2] hw/acpi: Implement the SRAT GI affinity structure

2024-02-29 Thread Ankit Agrawal
> One thing I forgot.
>
> Please add a test.  tests/qtest/bios-tables-test.c

IIUC, we need to add a test for aarch64 to test the interface with the
acpi-generic-initiator object.

> + relevant table dumps.

Sorry it isn't clear where do you want me to add this. In the git commit
message?



Re: [PATCH v7 2/2] hw/acpi: Implement the SRAT GI affinity structure

2024-02-29 Thread Ankit Agrawal
>> ---
>>  hw/acpi/acpi-generic-initiator.c | 84 
>>  hw/arm/virt-acpi-build.c |  3 +
>>  include/hw/acpi/acpi-generic-initiator.h | 26 
> A few more comments.
>
> Maybe _ rather than - as more common for acpi include naming.

Ack. will change the name.

> I also wonder if we need the acpi prefix for file names given context?

I tried to keep it to match the object name. If you have preference for
not having it, I can change that too.


Re: [RFC 0/4] mirror: implement incremental and bitmap modes

2024-02-29 Thread Vladimir Sementsov-Ogievskiy

On 29.02.24 13:11, Fiona Ebner wrote:

Am 28.02.24 um 17:06 schrieb Vladimir Sementsov-Ogievskiy:

On 28.02.24 19:00, Vladimir Sementsov-Ogievskiy wrote:

On 16.02.24 13:55, Fiona Ebner wrote:

Now, the IO test added in patch 4/4 actually contains yet another use
case, namely doing incremental mirrors to stand-alone qcow2 "diff"
images, that only contain the delta and can be rebased later. I had to
adapt the IO test, because its output expected the mirror bitmap to
still be dirty, but nowadays the mirror is apparently already done
when the bitmaps are queried. So I thought, I'll just use
'write-blocking' mode to avoid any potential timing issues.

But this exposed an issue with the diff image approach. If a write is
not aligned to the granularity of the mirror target, then rebasing the
diff image onto a backing image will not yield the desired result,
because the full cluster is considered to be allocated and will "hide"
some part of the base/backing image. The failure can be seen by either
using 'write-blocking' mode in the IO test or setting the (bitmap)
granularity to 32 KiB rather than the current 64 KiB.

The question is how to deal with these edge cases? Some possibilities
that would make sense to me:

For 'background' mode:
* prohibit if target's cluster size is larger than the bitmap
    granularity
* document the limitation

For 'write-blocking' mode:
* disallow in combination with bitmap mode (would not be happy about
    it, because I'd like to use this without diff images)


why not just require the same: bitmap granularity must be >= target
granularity



For the iotest's use-case, that only works for background mode. I'll
explain below.


* for writes that are not aligned to the target's cluster size, read
    the relevant/missing parts from the source image to be able to write
    whole target clusters (seems rather complex)


There is another approach: consider an unaligned part of the request,
fitting in one cluster (we can always split any request into an "aligned"
middle part, and at most two small "unaligned" parts, each fitting into one
cluster).

We have two possibilities:

1. the cluster is dirty (marked dirty in the bitmap used by background
process)

We can simply ignore this part and rely on background process. This
will not affect the convergence of the mirror job.



Agreed.


2. the cluster is clear (i.e. background process, or some previous
write already copied it)



The iotest creates a new target image for each incremental sync which
only records the diff relative to the previous mirror and those diff
images are later rebased onto each other to get the full picture.

Thus, it can be that a previous mirror job (not just background process
or previous write) already copied a cluster, and in particular, copied
it to a different target!


Aha understand.

For simplicity, let's consider case, when source "cluster size" = "job cluster size" = "bitmap 
granularity" = "target cluster size".

Which types of clusters we should consider, when we want to handle guest write?

1. Clusters, that should be copied by background process

These are dirty clusters from user-given bitmap, or if we do a full-disk 
mirror, all clusters, not yet copied by background process.

For such clusters we simply ignore the unaligned write. We can even ignore the 
aligned write too: less disturbing the guest by delays.

2. Clusters, already copied by background process during this mirror job and 
not dirtied by guest since this time.

For such clusters we are safe to do unaligned write, as target cluster must be 
allocated.

3. Clusters, not marked initially by dirty bitmap.

What to do with them? We can't do unaligned write. I see two variants:

- do additional read from source, to fill the whole cluster, which seems a bit 
too heavy

- just mark the cluster as dirty for background job. So we behave like in "background" mode. But why not? The 
maximum count of such "hacks" is limited to number of "clear" clusters at start of mirror job, 
which means that we don't seriously affect the convergence. Mirror is guaranteed to converge anyway. And the whole 
sense of "write-blocking" mode is to have a guaranteed convergence. What do you think?




Of course, we can't distinguish the 3 types with one dirty bitmap, so we need a second one. 
For example "done_bitmap", where we can mark clusters that were successfully 
copied. That would be a kind of block-status of target image. But using bitmap is a lot 
better than querying block-status from target.





In this case, we are safe to do unaligned write, as target cluster
must be allocated.


Because the diff image is new, the target's cluster is not necessarily
allocated. When using write-blocking and a write of, e.g., 9 bytes to a
clear source cluster comes in, only those 9 bytes are written to the
target. Now the target's cluster is allocated but with only those 9
bytes of data. When rebasing, the previously copied cluster is "masked"
and when reading the rebased image, we on

Re: [PATCH v4 13/21] parallels: Handle L1 entries equal to one

2024-02-29 Thread Alexander Ivanov




On 1/18/24 14:37, Denis V. Lunev wrote:

On 12/28/23 11:12, Alexander Ivanov wrote:
If all the bits in a dirty bitmap cluster are ones, the cluster 
shouldn't

be written. Instead the corresponding L1 entry should be set to 1.

Check if all bits in a memory region are ones and set 1 to L1 entries
corresponding clusters filled with ones.

Signed-off-by: Alexander Ivanov 
---
  block/parallels-ext.c | 12 +++-
  1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/block/parallels-ext.c b/block/parallels-ext.c
index 195b01b109..033ca3ec3a 100644
--- a/block/parallels-ext.c
+++ b/block/parallels-ext.c
@@ -354,7 +354,7 @@ static void GRAPH_RDLOCK 
parallels_save_bitmap(BlockDriverState *bs,

  offset = 0;
  while ((offset = bdrv_dirty_bitmap_next_dirty(bitmap, offset, 
bm_size)) >= 0) {

  uint64_t idx = offset / limit;
-    int64_t cluster_off, end, write_size;
+    int64_t cluster_off, end, write_size, first_zero;
    offset = QEMU_ALIGN_DOWN(offset, limit);
  end = MIN(bm_size, offset + limit);
@@ -367,6 +367,16 @@ static void GRAPH_RDLOCK 
parallels_save_bitmap(BlockDriverState *bs,
  memset(bm_buf + write_size, 0, s->cluster_size - 
write_size);

  }
  +    first_zero = bdrv_dirty_bitmap_next_zero(bitmap, offset, 
bm_size);

+    if (first_zero < 0) {
+    goto end;
+    }
+    if (first_zero - offset >= s->cluster_size) {
+    l1_table[idx] = 1;
+    offset = end;
+    continue;
+    }
+
  cluster_off = parallels_allocate_host_clusters(bs, 
&alloc_size);

  if (cluster_off <= 0) {
  goto end;

That is not enough. We should handle all-one and all-zeroes according
to the spec and all-zeroes would be much more common.

Buffer for extensions contains zeroes before handling (it was allocated with
qemu_blockalign0). We skip all all-zeroes L1 entries and they stay zeroed.

--
Best regards,
Alexander Ivanov




Re: [PATCH v3] docs/system/ppc: Document running Linux on AmigaNG machines

2024-02-29 Thread BALATON Zoltan

On Wed, 21 Feb 2024, BALATON Zoltan wrote:

Documentation on how to run Linux on the amigaone, pegasos2 and
sam460ex machines is currently buried in the depths of the qemu-devel
mailing list and in the source code. Let's collect the information in
the QEMU handbook for a one stop solution.


Ping? (Just so it's not missed from next pull.)

Regards,
BALATON Zoltan


Co-authored-by: Bernhard Beschow 
Signed-off-by: BALATON Zoltan 
Reviewed-by: Nicholas Piggin 
Tested-by: Bernhard Beschow 
---
v3: Apply changes and Tested-by tag from Bernhard
v2: Move top level title one level up so subsections will be below it in TOC

MAINTAINERS |   1 +
docs/system/ppc/amigang.rst | 161 
docs/system/target-ppc.rst  |   1 +
3 files changed, 163 insertions(+)
create mode 100644 docs/system/ppc/amigang.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index 7d61fb9319..0aef8cb2a6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1562,6 +1562,7 @@ F: hw/rtc/m41t80.c
F: pc-bios/canyonlands.dt[sb]
F: pc-bios/u-boot-sam460ex-20100605.bin
F: roms/u-boot-sam460ex
+F: docs/system/ppc/amigang.rst

pegasos2
M: BALATON Zoltan 
diff --git a/docs/system/ppc/amigang.rst b/docs/system/ppc/amigang.rst
new file mode 100644
index 00..ba1a3d80b9
--- /dev/null
+++ b/docs/system/ppc/amigang.rst
@@ -0,0 +1,161 @@
+=
+AmigaNG boards (``amigaone``, ``pegasos2``, ``sam460ex``)
+=
+
+These PowerPC machines emulate boards that are primarily used for
+running Amiga like OSes (AmigaOS 4, MorphOS and AROS) but these can
+also run Linux which is what this section documents.
+
+Eyetech AmigaOne/Mai Logic Teron (``amigaone``)
+===
+
+The ``amigaone`` machine emulates an AmigaOne XE mainboard by Eyetech
+which is a rebranded Mai Logic Teron board with modified U-Boot
+firmware to support AmigaOS 4.
+
+Emulated devices
+
+
+ * PowerPC 7457 CPU (can also use ``-cpu g3, 750cxe, 750fx`` or ``750gx``)
+ * Articia S north bridge
+ * VIA VT82C686B south bridge
+ * PCI VGA compatible card (guests may need other card instead)
+ * PS/2 keyboard and mouse
+
+Firmware
+
+
+A firmware binary is necessary for the boot process. It is a modified
+U-Boot under GPL but its source is lost so it cannot be included in
+QEMU. A binary is available at
+https://www.hyperion-entertainment.com/index.php/downloads?view=files&parent=28.
+The ROM image is in the last 512kB which can be extracted with the
+following command:
+
+.. code-block:: bash
+
+  $ tail -c 524288 updater.image > u-boot-amigaone.bin
+
+The BIOS emulator in the firmware is unable to run QEMU‘s standard
+vgabios so ``VGABIOS-lgpl-latest.bin`` is needed instead which can be
+downloaded from http://www.nongnu.org/vgabios.
+
+Running Linux
+-
+
+There are some Linux images under the following link that work on the
+``amigaone`` machine:
+https://sourceforge.net/projects/amigaone-linux/files/debian-installer/.
+To boot the system run:
+
+.. code-block:: bash
+
+  $ qemu-system-ppc -machine amigaone -bios u-boot-amigaone.bin \
+-cdrom "A1 Linux Net Installer.iso" \
+-device ati-vga,model=rv100,romfile=VGABIOS-lgpl-latest.bin
+
+From the firmware menu that appears select ``Boot sequence`` →
+``Amiga Multiboot Options`` and set ``Boot device 1`` to
+``Onboard VIA IDE CDROM``. Then hit escape until the main screen appears again,
+hit escape once more and from the exit menu that appears select either
+``Save settings and exit`` or ``Use settings for this session only``. It may
+take a long time loading the kernel into memory but eventually it boots and the
+installer becomes visible. The ``ati-vga`` RV100 emulation is not
+complete yet so only frame buffer works, DRM and 3D is not available.
+
+Genesi/bPlan Pegasos II (``pegasos2``)
+==
+
+The ``pegasos2`` machine emulates the Pegasos II sold by Genesi and
+designed by bPlan. Its schematics are available at
+https://www.powerdeveloper.org/platforms/pegasos/schematics.
+
+Emulated devices
+
+
+ * PowerPC 7457 CPU (can also use ``-cpu g3`` or ``750cxe``)
+ * Marvell MV64361 Discovery II north bridge
+ * VIA VT8231 south bridge
+ * PCI VGA compatible card (guests may need other card instead)
+ * PS/2 keyboard and mouse
+
+Firmware
+
+
+The Pegasos II board has an Open Firmware compliant ROM based on
+SmartFirmware with some changes that are not open-sourced therefore
+the ROM binary cannot be included in QEMU. An updater was available
+from bPlan, it can be found in the `Internet Archive
+`_.
+The ROM image can be extracted from it with the following command:
+
+.. code-block:: bash
+
+  $ tail -c +85581 up050404 | head -c 524288 > pegasos2.rom
+
+Running Linu

Re: [RFC 0/4] mirror: implement incremental and bitmap modes

2024-02-29 Thread Vladimir Sementsov-Ogievskiy

On 29.02.24 13:41, Fiona Ebner wrote:

Am 28.02.24 um 17:24 schrieb Vladimir Sementsov-Ogievskiy:

On 16.02.24 13:55, Fiona Ebner wrote:

Previous discussion from when this was sent upstream [0] (it's been a
while). I rebased the patches and re-ordered and squashed like
suggested back then [1].

This implements two new mirror modes:

- bitmap mirror mode with always/on-success/never bitmap sync mode
- incremental mirror mode as sugar for bitmap + on-success

Use cases:
* Possibility to resume a failed mirror later.
* Possibility to only mirror deltas to a previously mirrored volume.
* Possibility to (efficiently) mirror a drive that was previously
    mirrored via some external mechanism (e.g. ZFS replication).

We are using the last one in production without any issues since about
4 years now. In particular, like mentioned in [2]:


- create bitmap(s)
- (incrementally) replicate storage volume(s) out of band (using ZFS)
- incrementally drive mirror as part of a live migration of VM
- drop bitmap(s)


Actually which mode you use, "never", "always" or "conditional"? Or in
downstream you have different approach?



We are using "conditional", but I think we don't really require any
specific mode, because we drop the bitmaps after mirroring (even in
failure case). Fabian, please correct me if I'm wrong.


Why am I asking:

These modes (for backup) were developed prior to
block-dirty-bitmap-merge command, which allowed to copy bitmaps as you
want. With that API, we actually don't need all these modes, instead
it's enough to pass a bitmap, which would be _actually_ used by mirror.

So, if you need "never" mode, you just copy your bitmap by
block-dirty-bitmap-add + block-dirty-bitmap-merge, and pass a copy to
mirror job.

Or, you pass your bitmap to mirror-job, and have a "always" mode.

And I don't see, why we need a "conditional" mode, which actually just
drops away the progress we actually made. (OK, we failed, but why to
drop the progress of successfully copied clusters?)



I'm not sure actually. Maybe John remembers?


Ah, I understand. Conditional just makes sense if you don't support "partial 
success", and you want to delete target image in case of failure. And create a new 
one, to restart incremental job.

But anyway, this all could be simply achieved with bitmap-copying/merging API, 
if we allow to pass user-given bitmap to the mirror as working bitmap.



I see, I'll drop the 'bitmap-mode' in the next version if nobody
complains :)



Good. It's a golden rule: never make public interfaces which you don't actually need for 
production. I myself sometimes violate it and spend extra time on developing features, 
which we later have to just drop as "not needed downstream, no sense in 
upstreaming".



Using user-given bitmap in the mirror job has also an additional
advantage of live progress: up to visualization of disk copying by
visualization of the dirty bitmap contents.



Best Regards,
Fiona



--
Best regards,
Vladimir




Re: [PATCH v7 2/2] hw/acpi: Implement the SRAT GI affinity structure

2024-02-29 Thread Jonathan Cameron via
On Thu, 29 Feb 2024 11:43:44 +
Ankit Agrawal  wrote:

> > One thing I forgot.
> >
> > Please add a test.  tests/qtest/bios-tables-test.c  
> 
> IIUC, we need to add a test for aarch64 to test the interface with the
> acpi-generic-initiator object.
> 
> > + relevant table dumps.  
> 
> Sorry it isn't clear where do you want me to add this. In the git commit
> message?

I meant as part of the test.  Typically you then include the relevant snippet
as part of the commit message to show what the key part of the test is.

Thanks,

Jonathan



Re: [PATCH] chardev/char-socket: Fix TLS io channels sending too much data to the backend

2024-02-29 Thread Antoine Damhet
On Thu, Feb 29, 2024 at 11:43:37AM +0100, Thomas Huth wrote:
> Commit ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
> changed the behavior of the TLS io channels to schedule a second reading
> attempt if there is still incoming data pending. This caused a regression
> with backends like the sclpconsole that check in their read function that
> the sender does not try to write more bytes to it than the device can
> currently handle.
> 
> The problem can be reproduced like this:
> 
>  1) In one terminal, do this:
> 
>   mkdir qemu-pki
>   cd qemu-pki
>   openssl genrsa 2048 > ca-key.pem
>   openssl req -new -x509 -nodes -days 365000 -key ca-key.pem -out ca-cert.pem
>   # enter some dummy value for the cert
>   openssl genrsa 2048 > server-key.pem
>   openssl req -new -x509 -nodes -days 365000 -key server-key.pem \
> -out server-cert.pem
>   # enter some other dummy values for the cert
> 
>   gnutls-serv --echo --x509cafile ca-cert.pem --x509keyfile server-key.pem \
>   --x509certfile server-cert.pem -p 8338
> 
>  2) In another terminal, do this:
> 
>   wget 
> https://download.fedoraproject.org/pub/fedora-secondary/releases/39/Cloud/s390x/images/Fedora-Cloud-Base-39-1.5.s390x.qcow2
> 
>   qemu-system-s390x -nographic -nodefaults \
> -hda Fedora-Cloud-Base-39-1.5.s390x.qcow2 \
> -object 
> tls-creds-x509,id=tls0,endpoint=client,verify-peer=false,dir=$PWD/qemu-pki \
> -chardev socket,id=tls_chardev,host=localhost,port=8338,tls-creds=tls0 \
> -device sclpconsole,chardev=tls_chardev,id=tls_serial
> 
> QEMU then aborts after a second or two with:
> 
>   qemu-system-s390x: ../hw/char/sclpconsole.c:73: chr_read: Assertion
>`size <= SIZE_BUFFER_VT220 - scon->iov_data_len' failed.
>  Aborted (core dumped)
> 
> It looks like the second read does not trigger the chr_can_read() function
> to be called before the second read, which should normally always be done
> before sending bytes to a character device to see how much it can handle,
> so the s->max_size in tcp_chr_read() still contains the old value from the
> previous read. Let's make sure that we use the up-to-date value by calling
> tcp_chr_read_poll() again here.
> 
> Fixes: ffda5db65a ("io/channel-tls: fix handling of bigger read buffers")
> Buglink: https://issues.redhat.com/browse/RHEL-24614
> Reviewed-by: Daniel P. Berrangé 

Reviewed-by: Antoine Damhet 
Tested-by: Antoine Damhet 

> Signed-off-by: Thomas Huth 
> ---
>  Sorry if you've got this mail twice - I forgot to CC: qemu-devel when
>  I sent it out the first time ... *facepalm*
> 
>  chardev/char-socket.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/chardev/char-socket.c b/chardev/char-socket.c
> index 67e3334423..8a0406cc1e 100644
> --- a/chardev/char-socket.c
> +++ b/chardev/char-socket.c
> @@ -496,9 +496,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, 
> GIOCondition cond, void *opaque)
>  s->max_size <= 0) {
>  return TRUE;
>  }
> -len = sizeof(buf);
> -if (len > s->max_size) {
> -len = s->max_size;
> +len = tcp_chr_read_poll(opaque);
> +if (len > sizeof(buf)) {
> +len = sizeof(buf);
>  }
>  size = tcp_chr_recv(chr, (void *)buf, len);
>  if (size == 0 || (size == -1 && errno != EAGAIN)) {
> -- 
> 2.44.0
> 

-- 
Antoine 'xdbob' Damhet


signature.asc
Description: PGP signature


Re: [PATCH v7 2/2] hw/acpi: Implement the SRAT GI affinity structure

2024-02-29 Thread Jonathan Cameron via
On Thu, 29 Feb 2024 11:46:27 +
Ankit Agrawal  wrote:

> >> ---
> >>  hw/acpi/acpi-generic-initiator.c | 84 
> >>  hw/arm/virt-acpi-build.c |  3 +
> >>  include/hw/acpi/acpi-generic-initiator.h | 26   
> > A few more comments.
> >
> > Maybe _ rather than - as more common for acpi include naming.  
> 
> Ack. will change the name.
> 
> > I also wonder if we need the acpi prefix for file names given context?  
> 
> I tried to keep it to match the object name. If you have preference for
> not having it, I can change that too.

Not my area, so up to maintainers of hw/acpi to comment if they care!

I'd not wait for that though before v8.

*impatient user who has some other stuff queued on top of this!*
:)

Jonathan



Re: [PATCH v7 2/2] hw/acpi: Implement the SRAT GI affinity structure

2024-02-29 Thread Ankit Agrawal
>> > One thing I forgot.
>> >
>> > Please add a test.  tests/qtest/bios-tables-test.c
>>
>> IIUC, we need to add a test for aarch64 to test the interface with the
>> acpi-generic-initiator object.
>>
>> > + relevant table dumps.
>>
>> Sorry it isn't clear where do you want me to add this. In the git commit
>> message?
>
> I meant as part of the test.  Typically you then include the relevant snippet
> as part of the commit message to show what the key part of the test is.

Got it, thanks!

> Thanks,
>
> Jonathan


Re: [PATCH v2 0/5] linux-user: Rewrite target_shmat

2024-02-29 Thread Richard Purdie
On Wed, 2024-02-28 at 10:25 -1000, Richard Henderson wrote:
> There are multiple issues with the implementation of shmat().
> 
> (1) With reserved_va, which is the default for 32-on-64-bit, we mmap
> the
>     entire guest address space.  Unlike mmap, shmat refuses to
> replace an
>     existing mapping without setting SHM_REMAP.  This is the original
>     subject of issue #115, though it quicky gets distracted by
>     something else.
> 
> (2) With target page size > host page size, and a shm area
>     that is not a multiple of the target page size, we leave
>     an unmapped hole that the target expects to be mapped.
>     This is the subject of 
> 
>   
> https://lore.kernel.org/qemu-devel/2no4imvz2zrar5kchz2l3oddqbgpj77jg
> wcuf7aritkn2ok763@i2mvpcihztho/
> 
>     wherein qemu itself expects a mapping to exist, and
>     dies in open_self_maps_2.
> 
> So: reimplement the thing.
> 
> Changes for v2:
>   - Include Ilya's test case, which caught extra errors: Yay!
>   - Include x86_64 /proc/self/maps fix, which the test triggers.
>   - Dropped r-b for the shmat rewrite due to number of changes.

I tested these against our problem with webkitgkt and an happy to
report it does solve the segfault we were seeing, thanks!

Cheers,

Richard



Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Peter Maydell
On Thu, 29 Feb 2024 at 11:17, Heinrich Schuchardt
 wrote:
> > But yes, I'm not surprised that CXL runs into this. Heinrich,
> > are you doing CXL testing, or is this some other workload?
>
> I am running the UEFI Self-Certification Tests (SCT) on EDK 2 using:
>
> qemu-system-riscv64 \
>-M virt,acpi=off -accel tcg -m 4096 \
>-serial mon:stdio \
>-device virtio-gpu-pci \
>-device qemu-xhci \
>-device usb-kbd \
>-drive
> if=pflash,format=raw,unit=0,file=RISCV_VIRT_CODE.fd,readonly=on \
>-drive if=pflash,format=raw,unit=1,file=RISCV_VIRT_VARS.fd \
>-drive file=sct.img,format=raw,if=virtio \
>-device virtio-net-device,netdev=net0 \
>-netdev user,id=net0
>
> This does not invoke any CXL related stuff.

Hmm, that doesn't seem like it ought to be running into this.
What underlying memory region is the guest trying to do
the virtio queue access to?

-- PMM



Re: [RFC 0/4] mirror: implement incremental and bitmap modes

2024-02-29 Thread Fiona Ebner
Am 29.02.24 um 12:48 schrieb Vladimir Sementsov-Ogievskiy:
> On 29.02.24 13:11, Fiona Ebner wrote:
>>
>> The iotest creates a new target image for each incremental sync which
>> only records the diff relative to the previous mirror and those diff
>> images are later rebased onto each other to get the full picture.
>>
>> Thus, it can be that a previous mirror job (not just background process
>> or previous write) already copied a cluster, and in particular, copied
>> it to a different target!
> 
> Aha understand.
> 
> For simplicity, let's consider case, when source "cluster size" = "job
> cluster size" = "bitmap granularity" = "target cluster size".
> 
> Which types of clusters we should consider, when we want to handle guest
> write?
> 
> 1. Clusters, that should be copied by background process
> 
> These are dirty clusters from user-given bitmap, or if we do a full-disk
> mirror, all clusters, not yet copied by background process.
> 
> For such clusters we simply ignore the unaligned write. We can even
> ignore the aligned write too: less disturbing the guest by delays.
> 

Since do_sync_target_write() currently doesn't ignore aligned writes, I
wouldn't change it. Of course they can count towards the "done_bitmap"
you propose below.

> 2. Clusters, already copied by background process during this mirror job
> and not dirtied by guest since this time.
> 
> For such clusters we are safe to do unaligned write, as target cluster
> must be allocated.
> 

Right.

> 3. Clusters, not marked initially by dirty bitmap.
> 
> What to do with them? We can't do unaligned write. I see two variants:
> 
> - do additional read from source, to fill the whole cluster, which seems
> a bit too heavy
> 

Yes, I'd rather only do that as a last resort.

> - just mark the cluster as dirty for background job. So we behave like
> in "background" mode. But why not? The maximum count of such "hacks" is
> limited to number of "clear" clusters at start of mirror job, which
> means that we don't seriously affect the convergence. Mirror is
> guaranteed to converge anyway. And the whole sense of "write-blocking"
> mode is to have a guaranteed convergence. What do you think?
> 

It could lead to a lot of flips between job->actively_synced == true and
== false. AFAIU, currently, we only switch back from true to false when
an error happens. While I don't see a concrete issue with it, at least
it might be unexpected to users, so it better be documented.

I'll try going with this approach, thanks!

> 
> 
> 
> Of course, we can't distinguish 3 types by on dirty bitmap, so we need
> the second one. For example "done_bitmap", where we can mark clusters
> that were successfully copied. That would be a kind of block-status of
> target image. But using bitmap is a lot better than querying
> block-status from target.

Best Regards,
Fiona




Re: [PATCH, v2] physmem: avoid bounce buffer too small

2024-02-29 Thread Mattias Nissler
On Thu, Feb 29, 2024 at 1:35 PM Peter Maydell  wrote:
>
> On Thu, 29 Feb 2024 at 11:17, Heinrich Schuchardt
>  wrote:
> > > But yes, I'm not surprised that CXL runs into this. Heinrich,
> > > are you doing CXL testing, or is this some other workload?
> >
> > I am running the UEFI Self-Certification Tests (SCT) on EDK 2 using:
> >
> > qemu-system-riscv64 \
> >-M virt,acpi=off -accel tcg -m 4096 \
> >-serial mon:stdio \
> >-device virtio-gpu-pci \
> >-device qemu-xhci \
> >-device usb-kbd \
> >-drive
> > if=pflash,format=raw,unit=0,file=RISCV_VIRT_CODE.fd,readonly=on \
> >-drive if=pflash,format=raw,unit=1,file=RISCV_VIRT_VARS.fd \
> >-drive file=sct.img,format=raw,if=virtio \
> >-device virtio-net-device,netdev=net0 \
> >-netdev user,id=net0
> >
> > This does not invoke any CXL related stuff.
>
> Hmm, that doesn't seem like it ought to be running into this.
> What underlying memory region is the guest trying to do
> the virtio queue access to?

FWIW, I have seen multiple bounce buffer usage with the generic net TX
path as well as the XHCI controller, so it might be either of these.
Bounce buffering should only take place when the memory region can't
be accessed directly though - I don't see why that's the case for the
given command line.



Re: [PATCH v7 1/2] qom: new object to associate device to numa node

2024-02-29 Thread Ankit Agrawal
>> >>> Jonathan, you pointed out interface design issues in your review of v2.>
>> >> Are you fully satisfied with the interface in v3?
>> >>
>> >> Yes. I'm fine with the interface in this version (though it's v7, so I'm 
>> >> lost
>> >> on v2 vs v3!)
>> >
>> > Looks like I can't count to 7!
>> >
>> > With NUMA capitalized in the doc comment, QAPI schema
>> > Acked-by: Markus Armbruster 
>> >
>> > Thanks!
>>
>> Thanks! Will fix that in the next version.
>
> The following is really me arguing with myself, so can probably be
> ignored, but maybe it will spark an idea from someone else!
>
> One trivial tweak that might make our life easier if anyone adds
> support in the future for the other device handle type might be to go
> with simply dev rather than pci-dev.
>
> There is a sticky corner though if a device is a PCI device
> and in ACPI DSDT so maybe we are better off adding acpi-dev
> to take either pci-dev or acpi-dev?

That use case does complicate the situation. Do you know of any such
use case for generic initiator?

As for your suggestion of using acpi-dev as the arg to take both
pci-dev and acpi-dev.. Would that mean sending a pure pci device
(not the corner case you mentioned) through the acpi-dev argument
as well? Not sure if that would be appropriate.

> Annoyingly for generic ports, (I'm reusing this infrastructure here)
> the kernel code currently only deals with the ACPI form (for CXL host
> bridges).  Given I point that at the bus of a PXB_CXL it is both
> a PCI device, and the only handle we have for getting to the
> Root Bridge ACPI handle.

So IIUC, you need to pass a PCI device to the generic port object, but use
that to reach the ACPI handle and build the Generic port affinity structure
for an ACPI device?

> So I think I've argued myself around to thinking we need to extend
> the interface with another optional parameter if we ever do support
> the ACPI handle for generic initiators :(
>
> Jonathan


[RFC PATCH v5 02/22] target/arm: Add PSTATE.ALLINT

2024-02-29 Thread Jinjie Ruan via
When PSTATE.ALLINT is set, an IRQ or FIQ interrupt that is targeted to
ELx, with or without superpriority is masked.

As Richard suggested, place ALLINT bit in PSTATE in env->pstate.

With the change to pstate_read/write, exception entry
and return are automatically handled.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v5:
- Remove the ALLINT comment, as it is covered by "all other bits".
- Add Reviewed-by.
v4:
- Keep PSTATE.ALLINT in env->pstate but not env->allint.
- Update the commit message.
v3:
- Remove ALLINT dump in aarch64_cpu_dump_state().
- Update the commit message.
---
 target/arm/cpu.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 63f31e0d98..abb453f733 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -1543,6 +1543,7 @@ FIELD(VTCR, SL2, 33, 1)
 #define PSTATE_D (1U << 9)
 #define PSTATE_BTYPE (3U << 10)
 #define PSTATE_SSBS (1U << 12)
+#define PSTATE_ALLINT (1U << 13)
 #define PSTATE_IL (1U << 20)
 #define PSTATE_SS (1U << 21)
 #define PSTATE_PAN (1U << 22)
-- 
2.34.1




[RFC PATCH v5 03/22] target/arm: Add support for FEAT_NMI, Non-maskable Interrupt

2024-02-29 Thread Jinjie Ruan via
Add support for FEAT_NMI. NMI (FEAT_NMI) is a mandatory feature in
ARMv8.8-A and ARM v9.3-A.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v3:
- Add Reviewed-by.
- Adjust to before the MSR patches.
---
 target/arm/internals.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index 50bff44549..fee65caba5 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1078,6 +1078,9 @@ static inline uint32_t aarch64_pstate_valid_mask(const 
ARMISARegisters *id)
 if (isar_feature_aa64_mte(id)) {
 valid |= PSTATE_TCO;
 }
+if (isar_feature_aa64_nmi(id)) {
+valid |= PSTATE_ALLINT;
+}
 
 return valid;
 }
-- 
2.34.1




[RFC PATCH v5 15/22] hw/intc/arm_gicv3: Implement GICD_INMIR

2024-02-29 Thread Jinjie Ruan via
Add GICD_INMIR, GICD_INMIRnE register and support access GICD_INMIR0.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Make the GICD_INMIR implementation clearer.
- Update the commit message.
v3:
- Add Reviewed-by.
---
 hw/intc/arm_gicv3_dist.c | 34 ++
 hw/intc/gicv3_internal.h |  2 ++
 2 files changed, 36 insertions(+)

diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c
index 35e850685c..9739404e35 100644
--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@@ -89,6 +89,29 @@ static int gicd_ns_access(GICv3State *s, int irq)
 return extract32(s->gicd_nsacr[irq / 16], (irq % 16) * 2, 2);
 }
 
+static void gicd_write_bitmap_reg(GICv3State *s, MemTxAttrs attrs,
+  uint32_t *bmp, maskfn *maskfn,
+  int offset, uint32_t val)
+{
+/*
+ * Helper routine to implement writing to a "set" register
+ * (GICD_INMIR, etc).
+ * Semantics implemented here:
+ * RAZ/WI for SGIs, PPIs, unimplemented IRQs
+ * Bits corresponding to Group 0 or Secure Group 1 interrupts RAZ/WI.
+ * offset should be the offset in bytes of the register from the start
+ * of its group.
+ */
+int irq = offset * 8;
+
+if (irq < GIC_INTERNAL || irq >= s->num_irq) {
+return;
+}
+val &= mask_group_and_nsacr(s, attrs, maskfn, irq);
+*gic_bmp_ptr32(bmp, irq) = val;
+gicv3_update(s, irq, 32);
+}
+
 static void gicd_write_set_bitmap_reg(GICv3State *s, MemTxAttrs attrs,
   uint32_t *bmp,
   maskfn *maskfn,
@@ -543,6 +566,11 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
 /* RAZ/WI since affinity routing is always enabled */
 *data = 0;
 return true;
+case GICD_INMIR ... GICD_INMIR + 0x7f:
+*data = (!s->nmi_support) ? 0 :
+gicd_read_bitmap_reg(s, attrs, s->superprio, NULL,
+ offset - GICD_INMIR);
+return true;
 case GICD_IROUTER ... GICD_IROUTER + 0x1fdf:
 {
 uint64_t r;
@@ -752,6 +780,12 @@ static bool gicd_writel(GICv3State *s, hwaddr offset,
 case GICD_SPENDSGIR ... GICD_SPENDSGIR + 0xf:
 /* RAZ/WI since affinity routing is always enabled */
 return true;
+case GICD_INMIR ... GICD_INMIR + 0x7f:
+if (s->nmi_support) {
+gicd_write_bitmap_reg(s, attrs, s->superprio, NULL,
+  offset - GICD_INMIR, value);
+}
+return true;
 case GICD_IROUTER ... GICD_IROUTER + 0x1fdf:
 {
 uint64_t r;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index f35b7d2f03..a1fc34597e 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -52,6 +52,8 @@
 #define GICD_SGIR0x0F00
 #define GICD_CPENDSGIR   0x0F10
 #define GICD_SPENDSGIR   0x0F20
+#define GICD_INMIR   0x0F80
+#define GICD_INMIRnE 0x3B00
 #define GICD_IROUTER 0x6000
 #define GICD_IDREGS  0xFFD0
 
-- 
2.34.1




[RFC PATCH v5 11/22] hw/intc/arm_gicv3: Add external IRQ lines for NMI

2024-02-29 Thread Jinjie Ruan via
Augment the GICv3's QOM device interface by adding one
new set of sysbus IRQ line, to signal NMI to each CPU.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
v3:
- Add support for VNMI.
---
 hw/intc/arm_gicv3_common.c | 6 ++
 include/hw/intc/arm_gic_common.h   | 2 ++
 include/hw/intc/arm_gicv3_common.h | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index cb55c72681..c52f060026 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -299,6 +299,12 @@ void gicv3_init_irqs_and_mmio(GICv3State *s, 
qemu_irq_handler handler,
 for (i = 0; i < s->num_cpu; i++) {
 sysbus_init_irq(sbd, &s->cpu[i].parent_vfiq);
 }
+for (i = 0; i < s->num_cpu; i++) {
+sysbus_init_irq(sbd, &s->cpu[i].parent_nmi);
+}
+for (i = 0; i < s->num_cpu; i++) {
+sysbus_init_irq(sbd, &s->cpu[i].parent_vnmi);
+}
 
 memory_region_init_io(&s->iomem_dist, OBJECT(s), ops, s,
   "gicv3_dist", 0x1);
diff --git a/include/hw/intc/arm_gic_common.h b/include/hw/intc/arm_gic_common.h
index 7080375008..97fea4102d 100644
--- a/include/hw/intc/arm_gic_common.h
+++ b/include/hw/intc/arm_gic_common.h
@@ -71,6 +71,8 @@ struct GICState {
 qemu_irq parent_fiq[GIC_NCPU];
 qemu_irq parent_virq[GIC_NCPU];
 qemu_irq parent_vfiq[GIC_NCPU];
+qemu_irq parent_nmi[GIC_NCPU];
+qemu_irq parent_vnmi[GIC_NCPU];
 qemu_irq maintenance_irq[GIC_NCPU];
 
 /* GICD_CTLR; for a GIC with the security extensions the NS banked version
diff --git a/include/hw/intc/arm_gicv3_common.h 
b/include/hw/intc/arm_gicv3_common.h
index 4e2fb518e7..7324c7d983 100644
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -155,6 +155,8 @@ struct GICv3CPUState {
 qemu_irq parent_fiq;
 qemu_irq parent_virq;
 qemu_irq parent_vfiq;
+qemu_irq parent_nmi;
+qemu_irq parent_vnmi;
 
 /* Redistributor */
 uint32_t level;  /* Current IRQ level */
-- 
2.34.1




[RFC PATCH v5 07/22] target/arm: Add support for NMI in arm_phys_excp_target_el()

2024-02-29 Thread Jinjie Ruan via
According to Arm GIC section 4.6.3 Interrupt superpriority, the interrupt
with superpriority is always IRQ, never FIQ, so handle NMI same as IRQ in
arm_phys_excp_target_el().

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
v3:
- Remove nmi_is_irq flag in CPUARMState.
- Handle NMI same as IRQ in arm_phys_excp_target_el().
---
 target/arm/helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 4b4c8e279d..7cdc90e9e3 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -10570,6 +10570,7 @@ uint32_t arm_phys_excp_target_el(CPUState *cs, uint32_t 
excp_idx,
 hcr_el2 = arm_hcr_el2_eff(env);
 switch (excp_idx) {
 case EXCP_IRQ:
+case EXCP_NMI:
 scr = ((env->cp15.scr_el3 & SCR_IRQ) == SCR_IRQ);
 hcr = hcr_el2 & HCR_IMO;
 break;
-- 
2.34.1




[RFC PATCH v5 18/22] hw/intc/arm_gicv3: Implement NMI interrupt priority

2024-02-29 Thread Jinjie Ruan via
If GICD_CTLR_DS bit is zero and the NMI is non-secure, the NMI priority
is higher than 0x80, otherwise it is higher than 0x0. And save NMI
superpriority information in hppi.superprio to deliver NMI exception.
Since both GICR and GICD can deliver NMI, it is both necessary to check
whether the pending irq is NMI in gicv3_redist_update_noirqset and
gicv3_update_noirqset. And In irqbetter(), only a non-NMI with the same
priority and a smaller interrupt number can be preempted but not NMI.

Signed-off-by: Jinjie Ruan 
---
v4:
- Replace is_nmi with has_superprio so as not to mix NMI and superpriority.
- Update the comment in irqbetter().
- Extract gicv3_get_priority() to avoid code repeat.
---
v3:
- Add missing brace
---
 hw/intc/arm_gicv3.c | 71 -
 1 file changed, 63 insertions(+), 8 deletions(-)

diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c
index 0b8f79a122..1d16a53b23 100644
--- a/hw/intc/arm_gicv3.c
+++ b/hw/intc/arm_gicv3.c
@@ -21,7 +21,8 @@
 #include "hw/intc/arm_gicv3.h"
 #include "gicv3_internal.h"
 
-static bool irqbetter(GICv3CPUState *cs, int irq, uint8_t prio)
+static bool irqbetter(GICv3CPUState *cs, int irq, uint8_t prio,
+  bool has_superprio)
 {
 /* Return true if this IRQ at this priority should take
  * precedence over the current recorded highest priority
@@ -33,11 +34,24 @@ static bool irqbetter(GICv3CPUState *cs, int irq, uint8_t 
prio)
 if (prio < cs->hppi.prio) {
 return true;
 }
+
+/*
+ * Current highest prioirity pending interrupt is an IRQ without
+ * superpriority, the new IRQ with superpriority has same priority
+ * should signal to the CPU as it have the priority higher than
+ * the labelled 0x80 or 0x00.
+ */
+if (prio == cs->hppi.prio && !cs->hppi.superprio && has_superprio) {
+return true;
+}
+
 /* If multiple pending interrupts have the same priority then it is an
  * IMPDEF choice which of them to signal to the CPU. We choose to
- * signal the one with the lowest interrupt number.
+ * signal the one with the lowest interrupt number if they don't have
+ * superpriority.
  */
-if (prio == cs->hppi.prio && irq <= cs->hppi.irq) {
+if (prio == cs->hppi.prio && !cs->hppi.superprio &&
+!has_superprio && irq <= cs->hppi.irq) {
 return true;
 }
 return false;
@@ -129,6 +143,35 @@ static uint32_t gicr_int_pending(GICv3CPUState *cs)
 return pend;
 }
 
+static bool gicv3_get_priority(GICv3CPUState *cs, bool is_redist,
+   uint32_t superprio, uint8_t *prio, int irq)
+{
+bool has_superprio = false;
+
+if (superprio) {
+has_superprio = true;
+
+/* DS = 0 & Non-secure NMI */
+if (!(cs->gic->gicd_ctlr & GICD_CTLR_DS) &&
+((is_redist && extract32(cs->gicr_igroupr0, irq, 1)) ||
+ (!is_redist && gicv3_gicd_group_test(cs->gic, irq {
+*prio = 0x80;
+} else {
+*prio = 0x0;
+}
+} else {
+has_superprio = false;
+
+if (is_redist) {
+*prio = cs->gicr_ipriorityr[irq];
+} else {
+*prio = cs->gic->gicd_ipriority[irq];
+}
+}
+
+return has_superprio;
+}
+
 /* Update the interrupt status after state in a redistributor
  * or CPU interface has changed, but don't tell the CPU i/f.
  */
@@ -141,6 +184,8 @@ static void gicv3_redist_update_noirqset(GICv3CPUState *cs)
 uint8_t prio;
 int i;
 uint32_t pend;
+uint32_t superprio = 0;
+bool has_superprio = false;
 
 /* Find out which redistributor interrupts are eligible to be
  * signaled to the CPU interface.
@@ -152,10 +197,13 @@ static void gicv3_redist_update_noirqset(GICv3CPUState 
*cs)
 if (!(pend & (1 << i))) {
 continue;
 }
-prio = cs->gicr_ipriorityr[i];
-if (irqbetter(cs, i, prio)) {
+superprio = extract32(cs->gicr_isuperprio, i, 1);
+has_superprio = gicv3_get_priority(cs, true, superprio, &prio, i);
+
+if (irqbetter(cs, i, prio, has_superprio)) {
 cs->hppi.irq = i;
 cs->hppi.prio = prio;
+cs->hppi.superprio = has_superprio;
 seenbetter = true;
 }
 }
@@ -168,7 +216,7 @@ static void gicv3_redist_update_noirqset(GICv3CPUState *cs)
 if ((cs->gicr_ctlr & GICR_CTLR_ENABLE_LPIS) && cs->gic->lpi_enable &&
 (cs->gic->gicd_ctlr & GICD_CTLR_EN_GRP1NS) &&
 (cs->hpplpi.prio != 0xff)) {
-if (irqbetter(cs, cs->hpplpi.irq, cs->hpplpi.prio)) {
+if (irqbetter(cs, cs->hpplpi.irq, cs->hpplpi.prio, false)) {
 cs->hppi.irq = cs->hpplpi.irq;
 cs->hppi.prio = cs->hpplpi.prio;
 cs->hppi.grp = cs->hpplpi.grp;
@@ -213,6 +261,8 @@ static void gicv3_update_noirqset(GICv3State *s, int start, 
int len)
 int i;
 uint8_t 

[RFC PATCH v5 01/22] target/arm: Handle HCR_EL2 accesses for bits introduced with FEAT_NMI

2024-02-29 Thread Jinjie Ruan via
FEAT_NMI defines another three new bits in HCRX_EL2: TALLINT, HCRX_VINMI and
HCRX_VFNMI. When the feature is enabled, allow these bits to be written in
HCRX_EL2.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Update the comment for FEAT_NMI in hcrx_write().
- Update the commit message, s/thress/three/g.
v3:
- Add Reviewed-by.
- Add HCRX_VINMI and HCRX_VFNMI support in HCRX_EL2.
- Update the commit message.
---
 target/arm/cpu-features.h | 5 +
 target/arm/helper.c   | 5 +
 2 files changed, 10 insertions(+)

diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index 7567854db6..2ad1179be7 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -681,6 +681,11 @@ static inline bool isar_feature_aa64_sme(const 
ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, SME) != 0;
 }
 
+static inline bool isar_feature_aa64_nmi(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, NMI) != 0;
+}
+
 static inline bool isar_feature_aa64_tgran4_lpa2(const ARMISARegisters *id)
 {
 return FIELD_SEX64(id->id_aa64mmfr0, ID_AA64MMFR0, TGRAN4) >= 1;
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 90c4fb72ce..affa493141 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -6056,6 +6056,11 @@ static void hcrx_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 valid_mask |= HCRX_MSCEN | HCRX_MCE2;
 }
 
+/* FEAT_NMI adds TALLINT, VINMI and VFNMI */
+if (cpu_isar_feature(aa64_nmi, env_archcpu(env))) {
+valid_mask |= HCRX_TALLINT | HCRX_VINMI | HCRX_VFNMI;
+}
+
 /* Clear RES0 bits.  */
 env->cp15.hcrx_el2 = value & valid_mask;
 }
-- 
2.34.1




[RFC PATCH v5 09/22] target/arm: Handle PSTATE.ALLINT on taking an exception

2024-02-29 Thread Jinjie Ruan via
Set or clear PSTATE.ALLINT on taking an exception to ELx according to the
SCTLR_ELx.SPINTMASK bit.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v3:
- Add Reviewed-by.
---
 target/arm/helper.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index ac44498537..b796dbdf21 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -11539,6 +11539,15 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
 }
 }
 
+if (cpu_isar_feature(aa64_nmi, cpu) &&
+(env->cp15.sctlr_el[new_el] & SCTLR_NMI)) {
+if (!(env->cp15.sctlr_el[new_el] & SCTLR_SPINTMASK)) {
+new_mode |= PSTATE_ALLINT;
+} else {
+new_mode &= ~PSTATE_ALLINT;
+}
+}
+
 pstate_write(env, PSTATE_DAIF | new_mode);
 env->aarch64 = true;
 aarch64_restore_sp(env, new_el);
-- 
2.34.1




[RFC PATCH v5 16/22] hw/intc: Enable FEAT_GICv3_NMI Feature

2024-02-29 Thread Jinjie Ruan via
Added properties to enable FEAT_GICv3_NMI feature, setup distributor
and redistributor registers to indicate NMI support.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
---
 hw/intc/arm_gicv3_common.c | 1 +
 hw/intc/arm_gicv3_dist.c   | 2 ++
 hw/intc/gicv3_internal.h   | 1 +
 include/hw/intc/arm_gicv3_common.h | 1 +
 4 files changed, 5 insertions(+)

diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index c52f060026..2d2cea6858 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -569,6 +569,7 @@ static Property arm_gicv3_common_properties[] = {
 DEFINE_PROP_UINT32("num-irq", GICv3State, num_irq, 32),
 DEFINE_PROP_UINT32("revision", GICv3State, revision, 3),
 DEFINE_PROP_BOOL("has-lpi", GICv3State, lpi_enable, 0),
+DEFINE_PROP_BOOL("has-nmi", GICv3State, nmi_support, 0),
 DEFINE_PROP_BOOL("has-security-extensions", GICv3State, security_extn, 0),
 /*
  * Compatibility property: force 8 bits of physical priority, even
diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c
index 9739404e35..c4e28d209a 100644
--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@@ -412,6 +412,7 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
  *  by GICD_TYPER.IDbits)
  * MBIS == 0 (message-based SPIs not supported)
  * SecurityExtn == 1 if security extns supported
+ * NMI = 1 if Non-maskable interrupt property is supported
  * CPUNumber == 0 since for us ARE is always 1
  * ITLinesNumber == (((max SPI IntID + 1) / 32) - 1)
  */
@@ -425,6 +426,7 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
 bool dvis = s->revision >= 4;
 
 *data = (1 << 25) | (1 << 24) | (dvis << 18) | (sec_extn << 10) |
+(s->nmi_support << GICD_TYPER_NMI_SHIFT) |
 (s->lpi_enable << GICD_TYPER_LPIS_SHIFT) |
 (0xf << 19) | itlinesnumber;
 return true;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index a1fc34597e..8d793243f4 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -70,6 +70,7 @@
 #define GICD_CTLR_E1NWF (1U << 7)
 #define GICD_CTLR_RWP   (1U << 31)
 
+#define GICD_TYPER_NMI_SHIFT   9
 #define GICD_TYPER_LPIS_SHIFT  17
 
 /* 16 bits EventId */
diff --git a/include/hw/intc/arm_gicv3_common.h 
b/include/hw/intc/arm_gicv3_common.h
index df4380141d..16c5fa7256 100644
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -251,6 +251,7 @@ struct GICv3State {
 uint32_t num_irq;
 uint32_t revision;
 bool lpi_enable;
+bool nmi_support;
 bool security_extn;
 bool force_8bit_prio;
 bool irq_reset_nonsecure;
-- 
2.34.1




[RFC PATCH v5 04/22] target/arm: Implement ALLINT MSR (immediate)

2024-02-29 Thread Jinjie Ruan via
Add ALLINT MSR (immediate) to decodetree, in which the CRm is 0b000x. The
EL0 check is necessary to ALLINT, and the EL1 check is necessary when
imm == 1. So implement it inline for EL2/3, or EL1 with imm==0. Avoid the
unconditional write to pc and use raise_exception_ra to unwind.

Signed-off-by: Jinjie Ruan 
---
v5:
- Drop the & 1 in trans_MSR_i_ALLINT().
- Simplify and merge msr_i_allint() and allint_check().
- Rename msr_i_allint() to msr_set_allint_el1().
v4:
- Fix the ALLINT MSR (immediate) decodetree implementation.
- Remove arm_is_el2_enabled() check in allint_check().
- Update env->allint to env->pstate.
- Only call allint_check() when imm == 1.
- Simplify the allint_check() to not pass "op" and extract.
- Implement it inline for EL2/3, or EL1 with imm==0.
- Pass (a->imm & 1) * PSTATE_ALLINT (i64) to simplfy the ALLINT set/clear.
v3:
- Remove EL0 check in allint_check().
- Add TALLINT check for EL1 in allint_check().
- Remove unnecessarily arm_rebuild_hflags() in msr_i_allint helper.
---
 target/arm/tcg/a64.decode  |  1 +
 target/arm/tcg/helper-a64.c| 12 
 target/arm/tcg/helper-a64.h|  1 +
 target/arm/tcg/translate-a64.c | 18 ++
 4 files changed, 32 insertions(+)

diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 8a20dce3c8..0e7656fd15 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -207,6 +207,7 @@ MSR_i_DIT   1101 0101  0 011 0100  010 1 
@msr_i
 MSR_i_TCO   1101 0101  0 011 0100  100 1 @msr_i
 MSR_i_DAIFSET   1101 0101  0 011 0100  110 1 @msr_i
 MSR_i_DAIFCLEAR 1101 0101  0 011 0100  111 1 @msr_i
+MSR_i_ALLINT1101 0101  0 001 0100 000 imm:1 000 1
 MSR_i_SVCR  1101 0101  0 011 0100 0 mask:2 imm:1 011 1
 
 # MRS, MSR (register), SYS, SYSL. These are all essentially the
diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
index ebaa7f00df..7818537890 100644
--- a/target/arm/tcg/helper-a64.c
+++ b/target/arm/tcg/helper-a64.c
@@ -66,6 +66,18 @@ void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
 update_spsel(env, imm);
 }
 
+void HELPER(msr_set_allint_el1)(CPUARMState *env)
+{
+/* ALLINT update to PSTATE. */
+if (arm_hcrx_el2_eff(env) & HCRX_TALLINT) {
+raise_exception_ra(env, EXCP_UDEF,
+   syn_aa64_sysregtrap(0, 1, 0, 4, 1, 0x1f, 0),
+   exception_target_el(env), GETPC());
+}
+
+env->pstate |= PSTATE_ALLINT;
+}
+
 static void daif_check(CPUARMState *env, uint32_t op,
uint32_t imm, uintptr_t ra)
 {
diff --git a/target/arm/tcg/helper-a64.h b/target/arm/tcg/helper-a64.h
index 575a5dab7d..0518165399 100644
--- a/target/arm/tcg/helper-a64.h
+++ b/target/arm/tcg/helper-a64.h
@@ -22,6 +22,7 @@ DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_2(msr_i_spsel, void, env, i32)
 DEF_HELPER_2(msr_i_daifset, void, env, i32)
 DEF_HELPER_2(msr_i_daifclear, void, env, i32)
+DEF_HELPER_1(msr_set_allint_el1, void, env)
 DEF_HELPER_3(vfp_cmph_a64, i64, f16, f16, ptr)
 DEF_HELPER_3(vfp_cmpeh_a64, i64, f16, f16, ptr)
 DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 340265beb0..14e2b35b28 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -2036,6 +2036,24 @@ static bool trans_MSR_i_DAIFCLEAR(DisasContext *s, arg_i 
*a)
 return true;
 }
 
+static bool trans_MSR_i_ALLINT(DisasContext *s, arg_i *a)
+{
+if (!dc_isar_feature(aa64_nmi, s) || s->current_el == 0) {
+return false;
+}
+
+if (a->imm == 0) {
+clear_pstate_bits(PSTATE_ALLINT);
+} else if (s->current_el > 1) {
+set_pstate_bits(PSTATE_ALLINT);
+} else {
+gen_helper_msr_set_allint_el1(tcg_env);
+}
+
+s->base.is_jmp = DISAS_TOO_MANY;
+return true;
+}
+
 static bool trans_MSR_i_SVCR(DisasContext *s, arg_MSR_i_SVCR *a)
 {
 if (!dc_isar_feature(aa64_sme, s) || a->mask == 0) {
-- 
2.34.1




[RFC PATCH v5 10/22] hw/arm/virt: Wire NMI and VNMI irq lines from GIC to CPU

2024-02-29 Thread Jinjie Ruan via
Wire the new NMI and VNMI interrupt line from the GIC to each CPU.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
v3:
- Also add VNMI wire.
---
 hw/arm/virt.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0af1943697..2d4a187fd5 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -804,7 +804,8 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
 
 /* Wire the outputs from each CPU's generic timer and the GICv3
  * maintenance interrupt signal to the appropriate GIC PPI inputs,
- * and the GIC's IRQ/FIQ/VIRQ/VFIQ interrupt outputs to the CPU's inputs.
+ * and the GIC's IRQ/FIQ/VIRQ/VFIQ/NMI/VNMI interrupt outputs to the
+ * CPU's inputs.
  */
 for (i = 0; i < smp_cpus; i++) {
 DeviceState *cpudev = DEVICE(qemu_get_cpu(i));
@@ -848,6 +849,10 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ));
 sysbus_connect_irq(gicbusdev, i + 3 * smp_cpus,
qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ));
+sysbus_connect_irq(gicbusdev, i + 4 * smp_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_NMI));
+sysbus_connect_irq(gicbusdev, i + 5 * smp_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_VNMI));
 }
 
 fdt_add_gic_node(vms);
-- 
2.34.1




[RFC PATCH v5 14/22] hw/intc/arm_gicv3_redist: Implement GICR_INMIR0

2024-02-29 Thread Jinjie Ruan via
Add GICR_INMIR0 register and support access GICR_INMIR0.

Signed-off-by: Jinjie Ruan 
---
v4:
- Make the GICR_INMIR0 implementation more clearer.
---
 hw/intc/arm_gicv3_redist.c | 19 +++
 hw/intc/gicv3_internal.h   |  1 +
 2 files changed, 20 insertions(+)

diff --git a/hw/intc/arm_gicv3_redist.c b/hw/intc/arm_gicv3_redist.c
index 8153525849..7a16a058b1 100644
--- a/hw/intc/arm_gicv3_redist.c
+++ b/hw/intc/arm_gicv3_redist.c
@@ -35,6 +35,15 @@ static int gicr_ns_access(GICv3CPUState *cs, int irq)
 return extract32(cs->gicr_nsacr, irq * 2, 2);
 }
 
+static void gicr_write_bitmap_reg(GICv3CPUState *cs, MemTxAttrs attrs,
+  uint32_t *reg, uint32_t val)
+{
+/* Helper routine to implement writing to a "set" register */
+val &= mask_group(cs, attrs);
+*reg = val;
+gicv3_redist_update(cs);
+}
+
 static void gicr_write_set_bitmap_reg(GICv3CPUState *cs, MemTxAttrs attrs,
   uint32_t *reg, uint32_t val)
 {
@@ -406,6 +415,10 @@ static MemTxResult gicr_readl(GICv3CPUState *cs, hwaddr 
offset,
 *data = value;
 return MEMTX_OK;
 }
+case GICR_INMIR0:
+*data = cs->gic->nmi_support ?
+gicr_read_bitmap_reg(cs, attrs, cs->gicr_isuperprio) : 0;
+return MEMTX_OK;
 case GICR_ICFGR0:
 case GICR_ICFGR1:
 {
@@ -555,6 +568,12 @@ static MemTxResult gicr_writel(GICv3CPUState *cs, hwaddr 
offset,
 gicv3_redist_update(cs);
 return MEMTX_OK;
 }
+case GICR_INMIR0:
+if (cs->gic->nmi_support) {
+gicr_write_bitmap_reg(cs, attrs, &cs->gicr_isuperprio, value);
+}
+return MEMTX_OK;
+
 case GICR_ICFGR0:
 /* Register is all RAZ/WI or RAO/WI bits */
 return MEMTX_OK;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 29d5cdc1b6..f35b7d2f03 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -109,6 +109,7 @@
 #define GICR_ICFGR1   (GICR_SGI_OFFSET + 0x0C04)
 #define GICR_IGRPMODR0(GICR_SGI_OFFSET + 0x0D00)
 #define GICR_NSACR(GICR_SGI_OFFSET + 0x0E00)
+#define GICR_INMIR0   (GICR_SGI_OFFSET + 0x0F80)
 
 /* VLPI redistributor registers, offsets from VLPI_base */
 #define GICR_VPROPBASER   (GICR_VLPI_OFFSET + 0x70)
-- 
2.34.1




[RFC PATCH v5 17/22] hw/intc/arm_gicv3: Add NMI handling CPU interface registers

2024-02-29 Thread Jinjie Ruan via
Add the NMIAR CPU interface registers which deal with acknowledging NMI.

When introduce NMI interrupt, there are some updates to the semantics for the
register ICC_IAR1_EL1 and ICC_HPPIR1_EL1. For ICC_IAR1_EL1 register, it
should return 1022 if the intid has super priority. And for ICC_NMIAR1_EL1
register, it should return 1023 if the intid does not have super priority.
However, these are not necessary for the ICC_HPPIR1_EL1 register.

Signed-off-by: Jinjie Ruan 
---
v4:
- Define ICC_NMIAR1_EL1 only if FEAT_GICv3_NMI is implemented.
- Check sctrl_elx.SCTLR_NMI to return 1022 for icc_iar1_read().
- Add gicv3_icc_nmiar1_read() trace event.
- Do not check icc_hppi_can_preempt() for icc_nmiar1_read().
- Add icv_nmiar1_read() and call it when EL2Enabled() and HCR_EL2.IMO == '1'
---
 hw/intc/arm_gicv3_cpuif.c | 59 +--
 hw/intc/gicv3_internal.h  |  1 +
 hw/intc/trace-events  |  1 +
 3 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index e1a60d8c15..df82a413c6 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -795,6 +795,13 @@ static uint64_t icv_iar_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 return intid;
 }
 
+static uint64_t icv_nmiar1_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+/* todo */
+uint64_t intid = INTID_SPURIOUS;
+return intid;
+}
+
 static uint32_t icc_fullprio_mask(GICv3CPUState *cs)
 {
 /*
@@ -1097,7 +1104,8 @@ static uint64_t icc_hppir0_value(GICv3CPUState *cs, 
CPUARMState *env)
 return cs->hppi.irq;
 }
 
-static uint64_t icc_hppir1_value(GICv3CPUState *cs, CPUARMState *env)
+static uint64_t icc_hppir1_value(GICv3CPUState *cs, CPUARMState *env,
+ bool is_nmi, bool is_hppi)
 {
 /* Return the highest priority pending interrupt register value
  * for group 1.
@@ -1108,6 +1116,19 @@ static uint64_t icc_hppir1_value(GICv3CPUState *cs, 
CPUARMState *env)
 return INTID_SPURIOUS;
 }
 
+if (!is_hppi) {
+int el = arm_current_el(env);
+
+if (is_nmi && (!cs->hppi.superprio)) {
+return INTID_SPURIOUS;
+}
+
+if ((!is_nmi) && cs->hppi.superprio
+&& env->cp15.sctlr_el[el] & SCTLR_NMI) {
+return INTID_NMI;
+}
+}
+
 /* Check whether we can return the interrupt or if we should return
  * a special identifier, as per the CheckGroup1ForSpecialIdentifiers
  * pseudocode. (We can simplify a little because for us ICC_SRE_EL1.RM
@@ -1168,7 +1189,7 @@ static uint64_t icc_iar1_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 if (!icc_hppi_can_preempt(cs)) {
 intid = INTID_SPURIOUS;
 } else {
-intid = icc_hppir1_value(cs, env);
+intid = icc_hppir1_value(cs, env, false, false);
 }
 
 if (!gicv3_intid_is_special(intid)) {
@@ -1179,6 +1200,25 @@ static uint64_t icc_iar1_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 return intid;
 }
 
+static uint64_t icc_nmiar1_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+GICv3CPUState *cs = icc_cs_from_env(env);
+uint64_t intid;
+
+if (icv_access(env, HCR_IMO)) {
+return icv_nmiar1_read(env, ri);
+}
+
+intid = icc_hppir1_value(cs, env, true, false);
+
+if (!gicv3_intid_is_special(intid)) {
+icc_activate_irq(cs, intid);
+}
+
+trace_gicv3_icc_nmiar1_read(gicv3_redist_affid(cs), intid);
+return intid;
+}
+
 static void icc_drop_prio(GICv3CPUState *cs, int grp)
 {
 /* Drop the priority of the currently active interrupt in
@@ -1555,7 +1595,7 @@ static uint64_t icc_hppir1_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 return icv_hppir_read(env, ri);
 }
 
-value = icc_hppir1_value(cs, env);
+value = icc_hppir1_value(cs, env, false, true);
 trace_gicv3_icc_hppir1_read(gicv3_redist_affid(cs), value);
 return value;
 }
@@ -2482,6 +2522,15 @@ static const ARMCPRegInfo 
gicv3_cpuif_icc_apxr23_reginfo[] = {
 },
 };
 
+static const ARMCPRegInfo gicv3_cpuif_gicv3_nmi_reginfo[] = {
+{ .name = "ICC_NMIAR1_EL1", .state = ARM_CP_STATE_BOTH,
+  .opc0 = 3, .opc1 = 0, .crn = 12, .crm = 9, .opc2 = 5,
+  .type = ARM_CP_IO | ARM_CP_NO_RAW,
+  .access = PL1_R, .accessfn = gicv3_irq_access,
+  .readfn = icc_nmiar1_read,
+},
+};
+
 static uint64_t ich_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
 GICv3CPUState *cs = icc_cs_from_env(env);
@@ -2838,6 +2887,10 @@ void gicv3_init_cpuif(GICv3State *s)
  */
 define_arm_cp_regs(cpu, gicv3_cpuif_reginfo);
 
+if (s->nmi_support) {
+define_arm_cp_regs(cpu, gicv3_cpuif_gicv3_nmi_reginfo);
+}
+
 /*
  * The CPU implementation specifies the number of supported
  * bits of physical priority. For backwards compatibility
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 8d793243f4..93e56b3726 100644
--- a/hw/intc/g

[RFC PATCH v5 21/22] target/arm: Add FEAT_NMI to max

2024-02-29 Thread Jinjie Ruan via
Enable FEAT_NMI on the 'max' CPU.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v3:
- Add Reviewed-by.
- Sorted to last.
---
 docs/system/arm/emulation.rst | 1 +
 target/arm/tcg/cpu64.c| 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index f67aea2d83..91baf7ad69 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -63,6 +63,7 @@ the following architecture extensions:
 - FEAT_MTE (Memory Tagging Extension)
 - FEAT_MTE2 (Memory Tagging Extension)
 - FEAT_MTE3 (MTE Asymmetric Fault Handling)
+- FEAT_NMI (Non-maskable Interrupt)
 - FEAT_NV (Nested Virtualization)
 - FEAT_NV2 (Enhanced nested virtualization support)
 - FEAT_PACIMP (Pointer authentication - IMPLEMENTATION DEFINED algorithm)
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
index 5fba2c0f04..60f0dcd799 100644
--- a/target/arm/tcg/cpu64.c
+++ b/target/arm/tcg/cpu64.c
@@ -1175,6 +1175,7 @@ void aarch64_max_tcg_initfn(Object *obj)
 t = FIELD_DP64(t, ID_AA64PFR1, RAS_FRAC, 0);  /* FEAT_RASv1p1 + 
FEAT_DoubleFault */
 t = FIELD_DP64(t, ID_AA64PFR1, SME, 1);   /* FEAT_SME */
 t = FIELD_DP64(t, ID_AA64PFR1, CSV2_FRAC, 0); /* FEAT_CSV2_2 */
+t = FIELD_DP64(t, ID_AA64PFR1, NMI, 1);   /* FEAT_NMI */
 cpu->isar.id_aa64pfr1 = t;
 
 t = cpu->isar.id_aa64mmfr0;
-- 
2.34.1




[RFC PATCH v5 05/22] target/arm: Support MSR access to ALLINT

2024-02-29 Thread Jinjie Ruan via
Support ALLINT msr access as follow:
mrs <xt>, ALLINT        // read allint
msr ALLINT, <imm>       // write allint with imm

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v5:
- Add Reviewed-by.
v4:
- Remove arm_is_el2_enabled() check in allint_check().
- Change to env->pstate instead of env->allint.
v3:
- Remove EL0 check in aa64_allint_access(), which is already checked by the
  .access PL1_RW setting.
- Use arm_hcrx_el2_eff() in aa64_allint_access() instead of env->cp15.hcrx_el2.
- Make ALLINT msr access function controlled by aa64_nmi.
---
 target/arm/helper.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index affa493141..497b6e4bdf 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4618,6 +4618,36 @@ static void aa64_daif_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 env->daif = value & PSTATE_DAIF;
 }
 
+static void aa64_allint_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+env->pstate = (env->pstate & ~PSTATE_ALLINT) | (value & PSTATE_ALLINT);
+}
+
+static uint64_t aa64_allint_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+return env->pstate & PSTATE_ALLINT;
+}
+
+static CPAccessResult aa64_allint_access(CPUARMState *env,
+ const ARMCPRegInfo *ri, bool isread)
+{
+if (arm_current_el(env) == 1 && (arm_hcrx_el2_eff(env) & HCRX_TALLINT)) {
+return CP_ACCESS_TRAP_EL2;
+}
+return CP_ACCESS_OK;
+}
+
+static const ARMCPRegInfo nmi_reginfo[] = {
+{ .name = "ALLINT", .state = ARM_CP_STATE_AA64,
+  .opc0 = 3, .opc1 = 0, .opc2 = 0, .crn = 4, .crm = 3,
+  .type = ARM_CP_NO_RAW,
+  .access = PL1_RW, .accessfn = aa64_allint_access,
+  .fieldoffset = offsetof(CPUARMState, pstate),
+  .writefn = aa64_allint_write, .readfn = aa64_allint_read,
+  .resetfn = arm_cp_reset_ignore },
+};
+
 static uint64_t aa64_pan_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
 return env->pstate & PSTATE_PAN;
@@ -9724,6 +9754,10 @@ void register_cp_regs_for_features(ARMCPU *cpu)
 if (cpu_isar_feature(aa64_nv2, cpu)) {
 define_arm_cp_regs(cpu, nv2_reginfo);
 }
+
+if (cpu_isar_feature(aa64_nmi, cpu)) {
+define_arm_cp_regs(cpu, nmi_reginfo);
+}
 #endif
 
 if (cpu_isar_feature(any_predinv, cpu)) {
-- 
2.34.1




[RFC PATCH v5 19/22] hw/intc/arm_gicv3: Report the NMI interrupt in gicv3_cpuif_update()

2024-02-29 Thread Jinjie Ruan via
In CPU Interface, if the IRQ has the superpriority property, report
NMI to the corresponding PE.

Signed-off-by: Jinjie Ruan 
---
v4:
- Swap the ordering of the IFs.
v3:
- Remove handling nmi_is_irq flag.
---
 hw/intc/arm_gicv3_cpuif.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index df82a413c6..afba98ad87 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -938,6 +938,7 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 /* Tell the CPU about its highest priority pending interrupt */
 int irqlevel = 0;
 int fiqlevel = 0;
+int nmilevel = 0;
 ARMCPU *cpu = ARM_CPU(cs->cpu);
 CPUARMState *env = &cpu->env;
 
@@ -976,6 +977,8 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 
 if (isfiq) {
 fiqlevel = 1;
+} else if (cs->hppi.superprio) {
+nmilevel = 1;
 } else {
 irqlevel = 1;
 }
@@ -985,6 +988,7 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 
 qemu_set_irq(cs->parent_fiq, fiqlevel);
 qemu_set_irq(cs->parent_irq, irqlevel);
+qemu_set_irq(cs->parent_nmi, nmilevel);
 }
 
 static uint64_t icc_pmr_read(CPUARMState *env, const ARMCPRegInfo *ri)
-- 
2.34.1




[RFC PATCH v5 12/22] target/arm: Handle NMI in arm_cpu_do_interrupt_aarch64()

2024-02-29 Thread Jinjie Ruan via
According to Arm GIC section 4.6.3 Interrupt superpriority, the interrupt
with superpriority is always IRQ, never FIQ, so the NMI exception trap entry
behave like IRQ. However, VNMI can be IRQ or FIQ, FIQ can only come from
hcrx_el2.HCRX_VFNMI bit, IRQ can be raised from the GIC or come from the
hcrx_el2.HCRX_VINMI bit.

Signed-off-by: Jinjie Ruan 
---
v4:
- Also handle VNMI in arm_cpu_do_interrupt_aarch64().
v3:
- Remove the FIQ NMI handle.
---
 target/arm/helper.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index b796dbdf21..bd34b3506a 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -11459,12 +11459,21 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
 break;
 case EXCP_IRQ:
 case EXCP_VIRQ:
+case EXCP_NMI:
 addr += 0x80;
 break;
 case EXCP_FIQ:
 case EXCP_VFIQ:
 addr += 0x100;
 break;
+case EXCP_VNMI:
+if (env->irq_line_state & CPU_INTERRUPT_VNMI ||
+env->cp15.hcrx_el2 & HCRX_VINMI) {
+addr += 0x80;
+} else if (env->cp15.hcrx_el2 & HCRX_VFNMI) {
+addr += 0x100;
+}
+break;
 case EXCP_VSERR:
 addr += 0x180;
 /* Construct the SError syndrome from IDS and ISS fields. */
-- 
2.34.1




[RFC PATCH v5 13/22] hw/intc/arm_gicv3: Add irq superpriority information

2024-02-29 Thread Jinjie Ruan via
A SPI, PPI or SGI interrupt can have a superpriority property. So
maintain superpriority information in PendingIrq and GICR/GICD.

Signed-off-by: Jinjie Ruan 
Acked-by: Richard Henderson 
---
v3:
- Place this ahead of implement GICR_INMIR.
- Add Acked-by.
---
 include/hw/intc/arm_gicv3_common.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/hw/intc/arm_gicv3_common.h 
b/include/hw/intc/arm_gicv3_common.h
index 7324c7d983..df4380141d 100644
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -146,6 +146,7 @@ typedef struct {
 int irq;
 uint8_t prio;
 int grp;
+bool superprio;
 } PendingIrq;
 
 struct GICv3CPUState {
@@ -172,6 +173,7 @@ struct GICv3CPUState {
 uint32_t gicr_ienabler0;
 uint32_t gicr_ipendr0;
 uint32_t gicr_iactiver0;
+uint32_t gicr_isuperprio;
 uint32_t edge_trigger; /* ICFGR0 and ICFGR1 even bits */
 uint32_t gicr_igrpmodr0;
 uint32_t gicr_nsacr;
@@ -274,6 +276,7 @@ struct GICv3State {
 GIC_DECLARE_BITMAP(active);   /* GICD_ISACTIVER */
 GIC_DECLARE_BITMAP(level);/* Current level */
 GIC_DECLARE_BITMAP(edge_trigger); /* GICD_ICFGR even bits */
+GIC_DECLARE_BITMAP(superprio);/* GICD_INMIR */
 uint8_t gicd_ipriority[GICV3_MAXIRQ];
 uint64_t gicd_irouter[GICV3_MAXIRQ];
 /* Cached information: pointer to the cpu i/f for the CPUs specified
@@ -313,6 +316,7 @@ GICV3_BITMAP_ACCESSORS(pending)
 GICV3_BITMAP_ACCESSORS(active)
 GICV3_BITMAP_ACCESSORS(level)
 GICV3_BITMAP_ACCESSORS(edge_trigger)
+GICV3_BITMAP_ACCESSORS(superprio)
 
 #define TYPE_ARM_GICV3_COMMON "arm-gicv3-common"
 typedef struct ARMGICv3CommonClass ARMGICv3CommonClass;
-- 
2.34.1




[RFC PATCH v5 08/22] target/arm: Handle IS/FS in ISR_EL1 for NMI

2024-02-29 Thread Jinjie Ruan via
Add IS and FS bit in ISR_EL1 and handle the read. With CPU_INTERRUPT_NMI or
CPU_INTERRUPT_VNMI, both CPSR_I and ISR_IS must be set. With
CPU_INTERRUPT_VFIQ and HCRX_EL2.VFNMI set, both CPSR_F and ISR_FS must be set.

Signed-off-by: Jinjie Ruan 
---
v4:
- Also handle VNMI.
v3:
- CPU_INTERRUPT_NMI do not set FIQ, so remove it.
- With CPU_INTERRUPT_NMI, both CPSR_I and ISR_IS must be set.
---
 target/arm/cpu.h|  2 ++
 target/arm/helper.c | 13 +
 2 files changed, 15 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 6aa9f1e9ba..4f9a8127f9 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -1474,6 +1474,8 @@ FIELD(CPTR_EL3, TCPAC, 31, 1)
 #define CPSR_N (1U << 31)
 #define CPSR_NZCV (CPSR_N | CPSR_Z | CPSR_C | CPSR_V)
 #define CPSR_AIF (CPSR_A | CPSR_I | CPSR_F)
+#define ISR_FS (1U << 9)
+#define ISR_IS (1U << 10)
 
 #define CPSR_IT (CPSR_IT_0_1 | CPSR_IT_2_7)
 #define CACHED_CPSR_BITS (CPSR_T | CPSR_AIF | CPSR_GE | CPSR_IT | CPSR_Q \
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 7cdc90e9e3..ac44498537 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -2018,15 +2018,28 @@ static uint64_t isr_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 if (cs->interrupt_request & CPU_INTERRUPT_VIRQ) {
 ret |= CPSR_I;
 }
+if (cs->interrupt_request & CPU_INTERRUPT_VNMI) {
+ret |= ISR_IS;
+ret |= CPSR_I;
+}
 } else {
 if (cs->interrupt_request & CPU_INTERRUPT_HARD) {
 ret |= CPSR_I;
 }
+
+if (cs->interrupt_request & CPU_INTERRUPT_NMI) {
+ret |= ISR_IS;
+ret |= CPSR_I;
+}
 }
 
 if (hcr_el2 & HCR_FMO) {
 if (cs->interrupt_request & CPU_INTERRUPT_VFIQ) {
 ret |= CPSR_F;
+
+if (env->cp15.hcrx_el2 & HCRX_VFNMI) {
+ret |= ISR_FS;
+}
 }
 } else {
 if (cs->interrupt_request & CPU_INTERRUPT_FIQ) {
-- 
2.34.1




[RFC PATCH v5 06/22] target/arm: Add support for Non-maskable Interrupt

2024-02-29 Thread Jinjie Ruan via
This only implements the external delivery method via the GICv3.

Signed-off-by: Jinjie Ruan 
---
v4:
- Accept NMI unconditionally for arm_cpu_has_work() but add comment.
- Change from & to && for EXCP_IRQ or EXCP_FIQ.
- Refactor nmi mask in arm_excp_unmasked().
- Also handle VNMI in arm_cpu_exec_interrupt() and arm_cpu_set_irq().
- Rename virtual to Virtual.
v3:
- Not include CPU_INTERRUPT_NMI when FEAT_NMI not enabled
- Add ARM_CPU_VNMI.
- Refactor nmi mask in arm_excp_unmasked().
- Test SCTLR_ELx.NMI for ALLINT mask for NMI.
---
 target/arm/cpu-qom.h   |  4 +-
 target/arm/cpu.c   | 88 +++---
 target/arm/cpu.h   |  4 ++
 target/arm/helper.c|  2 +
 target/arm/internals.h | 10 +
 5 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/target/arm/cpu-qom.h b/target/arm/cpu-qom.h
index 8e032691db..e0c9e18036 100644
--- a/target/arm/cpu-qom.h
+++ b/target/arm/cpu-qom.h
@@ -36,11 +36,13 @@ DECLARE_CLASS_CHECKERS(AArch64CPUClass, AARCH64_CPU,
 #define ARM_CPU_TYPE_SUFFIX "-" TYPE_ARM_CPU
 #define ARM_CPU_TYPE_NAME(name) (name ARM_CPU_TYPE_SUFFIX)
 
-/* Meanings of the ARMCPU object's four inbound GPIO lines */
+/* Meanings of the ARMCPU object's six inbound GPIO lines */
 #define ARM_CPU_IRQ 0
 #define ARM_CPU_FIQ 1
 #define ARM_CPU_VIRQ 2
 #define ARM_CPU_VFIQ 3
+#define ARM_CPU_NMI 4
+#define ARM_CPU_VNMI 5
 
 /* For M profile, some registers are banked secure vs non-secure;
  * these are represented as a 2-element array where the first element
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 5fa86bc8d5..ad6e6200f6 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -122,6 +122,13 @@ void arm_restore_state_to_opc(CPUState *cs,
 }
 #endif /* CONFIG_TCG */
 
+/*
+ * With SCTLR_ELx.NMI == 0, IRQ with Superpriority is masked identically with
+ * IRQ without Superpriority. Moreover, if the GIC is configured so that
+ * FEAT_GICv3_NMI is only set if FEAT_NMI is set, then we won't ever see
+ * CPU_INTERRUPT_*NMI anyway. So we might as well accept NMI here
+ * unconditionally.
+ */
 static bool arm_cpu_has_work(CPUState *cs)
 {
 ARMCPU *cpu = ARM_CPU(cs);
@@ -129,6 +136,7 @@ static bool arm_cpu_has_work(CPUState *cs)
 return (cpu->power_state != PSCI_OFF)
 && cs->interrupt_request &
 (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD
+ | CPU_INTERRUPT_NMI | CPU_INTERRUPT_VNMI
  | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ | CPU_INTERRUPT_VSERR
  | CPU_INTERRUPT_EXITTB);
 }
@@ -668,6 +676,7 @@ static inline bool arm_excp_unmasked(CPUState *cs, unsigned 
int excp_idx,
 CPUARMState *env = cpu_env(cs);
 bool pstate_unmasked;
 bool unmasked = false;
+bool allIntMask = false;
 
 /*
  * Don't take exceptions if they target a lower EL.
@@ -678,13 +687,31 @@ static inline bool arm_excp_unmasked(CPUState *cs, 
unsigned int excp_idx,
 return false;
 }
 
+if (cpu_isar_feature(aa64_nmi, env_archcpu(env)) &&
+env->cp15.sctlr_el[target_el] & SCTLR_NMI && cur_el == target_el) {
+allIntMask = env->pstate & PSTATE_ALLINT ||
+ ((env->cp15.sctlr_el[target_el] & SCTLR_SPINTMASK) &&
+  (env->pstate & PSTATE_SP));
+}
+
 switch (excp_idx) {
+case EXCP_NMI:
+pstate_unmasked = !allIntMask;
+break;
+
+case EXCP_VNMI:
+if ((!(hcr_el2 & HCR_IMO) && !(hcr_el2 & HCR_FMO)) ||
+ (hcr_el2 & HCR_TGE)) {
+/* VNMIs(VIRQs or VFIQs) are only taken when hypervized.  */
+return false;
+}
+return !allIntMask;
 case EXCP_FIQ:
-pstate_unmasked = !(env->daif & PSTATE_F);
+pstate_unmasked = (!(env->daif & PSTATE_F)) && (!allIntMask);
 break;
 
 case EXCP_IRQ:
-pstate_unmasked = !(env->daif & PSTATE_I);
+pstate_unmasked = (!(env->daif & PSTATE_I)) && (!allIntMask);
 break;
 
 case EXCP_VFIQ:
@@ -692,13 +719,13 @@ static inline bool arm_excp_unmasked(CPUState *cs, 
unsigned int excp_idx,
 /* VFIQs are only taken when hypervized.  */
 return false;
 }
-return !(env->daif & PSTATE_F);
+return !(env->daif & PSTATE_F) && (!allIntMask);
 case EXCP_VIRQ:
 if (!(hcr_el2 & HCR_IMO) || (hcr_el2 & HCR_TGE)) {
 /* VIRQs are only taken when hypervized.  */
 return false;
 }
-return !(env->daif & PSTATE_I);
+return !(env->daif & PSTATE_I) && (!allIntMask);
 case EXCP_VSERR:
 if (!(hcr_el2 & HCR_AMO) || (hcr_el2 & HCR_TGE)) {
 /* VIRQs are only taken when hypervized.  */
@@ -804,6 +831,24 @@ static bool arm_cpu_exec_interrupt(CPUState *cs, int 
interrupt_request)
 
 /* The prioritization of interrupts is IMPLEMENTATION DEFINED. */
 
+if (cpu_isar_feature(aa64_nmi, env_archcpu(env))) {
+if (interrupt_request & CPU_INTERRUPT_NMI) {
+excp_idx = EXCP_NMI;
+target_el = 

[RFC PATCH v5 22/22] hw/arm/virt: Add FEAT_GICv3_NMI feature support in virt GIC

2024-02-29 Thread Jinjie Ruan via
A PE that implements FEAT_NMI and FEAT_GICv3 also implements
FEAT_GICv3_NMI. A PE that does not implement FEAT_NMI does not implement
FEAT_GICv3_NMI.

So included support FEAT_GICv3_NMI feature as part of virt platform
GIC initialization if FEAT_NMI and FEAT_GICv3 supported.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
v3:
- Adjust to be the last after add FEAT_NMI to max.
- Check whether support FEAT_NMI and FEAT_GICv3 for FEAT_GICv3_NMI.
---
 hw/arm/virt.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 2d4a187fd5..c12307ccd9 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -712,6 +712,19 @@ static void create_v2m(VirtMachineState *vms)
 vms->msi_controller = VIRT_MSI_CTRL_GICV2M;
 }
 
+/*
+ * A PE that implements FEAT_NMI and FEAT_GICv3 also implements
+ * FEAT_GICv3_NMI. A PE that does not implement FEAT_NMI, does not implement
+ * FEAT_GICv3_NMI.
+ */
+static bool gicv3_nmi_present(VirtMachineState *vms)
+{
+ARMCPU *cpu = ARM_CPU(qemu_get_cpu(0));
+
+return cpu_isar_feature(aa64_nmi, cpu) &&
+   (vms->gic_version != VIRT_GIC_VERSION_2);
+}
+
 static void create_gic(VirtMachineState *vms, MemoryRegion *mem)
 {
 MachineState *ms = MACHINE(vms);
@@ -785,6 +798,11 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
   vms->virt);
 }
 }
+
+if (gicv3_nmi_present(vms)) {
+qdev_prop_set_bit(vms->gic, "has-nmi", true);
+}
+
 gicbusdev = SYS_BUS_DEVICE(vms->gic);
 sysbus_realize_and_unref(gicbusdev, &error_fatal);
 sysbus_mmio_map(gicbusdev, 0, vms->memmap[VIRT_GIC_DIST].base);
-- 
2.34.1




[RFC PATCH v5 20/22] hw/intc/arm_gicv3: Report the VNMI interrupt

2024-02-29 Thread Jinjie Ruan via
In vCPU Interface, if the vIRQ has the superpriority property, report
vNMI to the corresponding vPE.

Signed-off-by: Jinjie Ruan 
---
 hw/intc/arm_gicv3_cpuif.c | 14 --
 hw/intc/gicv3_internal.h  |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index afba98ad87..0141d259e4 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -465,6 +465,7 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 int idx;
 int irqlevel = 0;
 int fiqlevel = 0;
+int nmilevel = 0;
 
 idx = hppvi_index(cs);
 trace_gicv3_cpuif_virt_update(gicv3_redist_affid(cs), idx,
@@ -482,9 +483,17 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 uint64_t lr = cs->ich_lr_el2[idx];
 
 if (icv_hppi_can_preempt(cs, lr)) {
-/* Virtual interrupts are simple: G0 are always FIQ, and G1 IRQ */
+/*
+ * Virtual interrupts are simple: G0 are always FIQ, and G1 are
+ * IRQ or NMI which depends on the ICH_LR_EL2.NMI to have
+ * non-maskable property.
+ */
 if (lr & ICH_LR_EL2_GROUP) {
-irqlevel = 1;
+if (cs->gic->nmi_support && (lr & ICH_LR_EL2_NMI)) {
+nmilevel = 1;
+} else {
+irqlevel = 1;
+}
 } else {
 fiqlevel = 1;
 }
@@ -494,6 +503,7 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 trace_gicv3_cpuif_virt_set_irqs(gicv3_redist_affid(cs), fiqlevel, 
irqlevel);
 qemu_set_irq(cs->parent_vfiq, fiqlevel);
 qemu_set_irq(cs->parent_virq, irqlevel);
+qemu_set_irq(cs->parent_vnmi, nmilevel);
 }
 
 static void gicv3_cpuif_virt_update(GICv3CPUState *cs)
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 93e56b3726..b6cb0115e7 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -242,6 +242,7 @@ FIELD(GICR_VPENDBASER, VALID, 63, 1)
 #define ICH_LR_EL2_PRIORITY_SHIFT 48
 #define ICH_LR_EL2_PRIORITY_LENGTH 8
 #define ICH_LR_EL2_PRIORITY_MASK (0xffULL << ICH_LR_EL2_PRIORITY_SHIFT)
+#define ICH_LR_EL2_NMI (1ULL << 59)
 #define ICH_LR_EL2_GROUP (1ULL << 60)
 #define ICH_LR_EL2_HW (1ULL << 61)
 #define ICH_LR_EL2_STATE_SHIFT 62
-- 
2.34.1




Re: [PATCH 00/19] Workaround Windows failing to find 64bit SMBIOS entry point with SeaBIOS

2024-02-29 Thread Fiona Ebner
Am 27.02.24 um 16:47 schrieb Igor Mammedov:
> Windows (10) bootloader when running on top of SeaBIOS, fails to find 
>
> SMBIOSv3 entry point. Tracing it shows that it looks for v2 anchor markers
>
> only and not v3. Tricking it into believing that entry point is found 
>
> lets Windows successfully locate and parse SMBIOSv3 tables. Whether it
>
> will be fixed on Windows side is not clear so here goes a workaround. 
>
>   
>
> Idea is to try build v2 tables if QEMU configuration permits, 
>
> and fallback to v3 tables otherwise. That will mask Windows issue 
>
> form majority of users.   
>
> However if VM configuration can't be described (typically large VMs)  
>
> by v2 tables, QEMU will use SMBIOSv3 and Windows will hit the issue   
>
> again. In this case complain to Microsoft and/or use UEFI instead of  
>
> SeaBIOS (requires reinstall). 
>
>   
>
> Default compat setting of smbios-entry-point-type after series
>
> for pc/q35 machines:  
>
>   * 9.0-newer: 'auto' 
>
>   * 8.1-8.2: '64' 
>
>   * 8.0-older: '32'   
>
>   
>
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2008 
>

Thank you! I'm happy to confirm that this series works around the issue :)

Best Regards,
Fiona




  1   2   3   4   >