date:20210325

From: Xingang Wang 

This helps to find the maximum bus number of a root bus.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/pci/pci.c | 34 ++
 include/hw/pci/pci.h |  1 +
 2 files changed, 35 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e17aa9075f..c7957cbf7c 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -538,6 +538,40 @@ int pci_bus_num(PCIBus *s)
 return PCI_BUS_GET_CLASS(s)->bus_num(s);
 }
 
+int pci_root_bus_max_bus(PCIBus *bus)
+{
+PCIHostState *host;
+PCIDevice *dev;
+int max_bus = 0;
+int type, devfn;
+uint8_t subordinate;
+
+if (!pci_bus_is_root(bus)) {
+return 0;
+}
+
+host = PCI_HOST_BRIDGE(BUS(bus)->parent);
+max_bus = pci_bus_num(host->bus);
+
+for (devfn = 0; devfn < ARRAY_SIZE(host->bus->devices); devfn++) {
+dev = host->bus->devices[devfn];
+
+if (!dev) {
+continue;
+}
+
+type = dev->config[PCI_HEADER_TYPE] & ~PCI_HEADER_TYPE_MULTI_FUNCTION;
+if (type == PCI_HEADER_TYPE_BRIDGE) {
+subordinate = dev->config[PCI_SUBORDINATE_BUS];
+if (subordinate > max_bus) {
+max_bus = subordinate;
+}
+}
+}
+
+return max_bus;
+}
+
 int pci_bus_numa_node(PCIBus *bus)
 {
 return PCI_BUS_GET_CLASS(bus)->numa_node(bus);
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 718b5a454a..e0c69534f4 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -450,6 +450,7 @@ static inline PCIBus *pci_get_bus(const PCIDevice *dev)
 return PCI_BUS(qdev_get_parent_bus(DEVICE(dev)));
 }
 int pci_bus_num(PCIBus *s);
+int pci_root_bus_max_bus(PCIBus *bus);
 static inline int pci_dev_bus_num(const PCIDevice *dev)
 {
 return pci_bus_num(pci_get_bus(dev));
-- 
2.19.1

[PATCH RFC RESEND v2 2/6] hw/pci: Add iommu option for pci root bus

From: Xingang Wang 

This adds an iommu option for the pci root bus, including the primary bus
and pxb root buses. The option is valid only if there is a virtual
iommu device.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/arm/virt.c   | 25 +
 hw/i386/pc.c| 19 +++
 hw/pci-bridge/pci_expander_bridge.c |  3 +++
 hw/pci-host/q35.c   |  1 +
 include/hw/arm/virt.h   |  1 +
 include/hw/i386/pc.h|  1 +
 6 files changed, 50 insertions(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index aa2bbd14e0..446b3b867f 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1366,6 +1366,7 @@ static void create_pcie(VirtMachineState *vms)
 }
 
 pci = PCI_HOST_BRIDGE(dev);
+pci->iommu = vms->primary_bus_iommu;
 vms->bus = pci->bus;
 if (vms->bus) {
 for (i = 0; i < nb_nics; i++) {
@@ -2319,6 +2320,20 @@ static void virt_set_iommu(Object *obj, const char 
*value, Error **errp)
 }
 }
 
+static bool virt_get_primary_bus_iommu(Object *obj, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+return vms->primary_bus_iommu;
+}
+
+static void virt_set_primary_bus_iommu(Object *obj, bool value, Error **errp)
+{
+VirtMachineState *vms = VIRT_MACHINE(obj);
+
+vms->primary_bus_iommu = value;
+}
+
 static CpuInstanceProperties
 virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
 {
@@ -2652,6 +2667,13 @@ static void virt_machine_class_init(ObjectClass *oc, 
void *data)
   "Set the IOMMU type. "
   "Valid values are none and smmuv3");
 
+object_class_property_add_bool(oc, "primary_bus_iommu",
+  virt_get_primary_bus_iommu,
+  virt_set_primary_bus_iommu);
+object_class_property_set_description(oc, "primary_bus_iommu",
+  "Set on/off to enable/disable "
+  "iommu for primary bus");
+
 object_class_property_add_bool(oc, "ras", virt_get_ras,
virt_set_ras);
 object_class_property_set_description(oc, "ras",
@@ -2719,6 +2741,9 @@ static void virt_instance_init(Object *obj)
 /* Default disallows iommu instantiation */
 vms->iommu = VIRT_IOMMU_NONE;
 
+/* The primary bus is attached to iommu by default */
+vms->primary_bus_iommu = true;
+
 /* Default disallows RAS instantiation */
 vms->ras = false;
 
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 8a84b25a03..b64e4bb7f2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1529,6 +1529,21 @@ static void pc_machine_set_hpet(Object *obj, bool value, 
Error **errp)
 pcms->hpet_enabled = value;
 }
 
+static bool pc_machine_get_primary_bus_iommu(Object *obj, Error **errp)
+{
+PCMachineState *pcms = PC_MACHINE(obj);
+
+return pcms->primary_bus_iommu;
+}
+
+static void pc_machine_set_primary_bus_iommu(Object *obj, bool value,
+ Error **errp)
+{
+PCMachineState *pcms = PC_MACHINE(obj);
+
+pcms->primary_bus_iommu = value;
+}
+
 static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v,
 const char *name, void *opaque,
 Error **errp)
@@ -1628,6 +1643,7 @@ static void pc_machine_initfn(Object *obj)
 #ifdef CONFIG_HPET
 pcms->hpet_enabled = true;
 #endif
+pcms->primary_bus_iommu = true;
 
 pc_system_flash_create(pcms);
 pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
@@ -1752,6 +1768,9 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 object_class_property_add_bool(oc, "hpet",
 pc_machine_get_hpet, pc_machine_set_hpet);
 
+object_class_property_add_bool(oc, "primary_bus_iommu",
+pc_machine_get_primary_bus_iommu, pc_machine_set_primary_bus_iommu);
+
 object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size",
 pc_machine_get_max_fw_size, pc_machine_set_max_fw_size,
 NULL, NULL);
diff --git a/hw/pci-bridge/pci_expander_bridge.c 
b/hw/pci-bridge/pci_expander_bridge.c
index aedded1064..f1a0eadc03 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -57,6 +57,7 @@ struct PXBDev {
 
 uint8_t bus_nr;
 uint16_t numa_node;
+bool iommu;
 };
 
 static PXBDev *convert_to_pxb(PCIDevice *dev)
@@ -255,6 +256,7 @@ static void pxb_dev_realize_common(PCIDevice *dev, bool 
pcie, Error **errp)
 bus->map_irq = pxb_map_irq_fn;
 
 PCI_HOST_BRIDGE(ds)->bus = bus;
+PCI_HOST_BRIDGE(ds)->iommu = pxb->iommu;
 
 pxb_register_bus(dev, bus, &local_err);
 if (local_err) {
@@ -301,6 +303,7 @@ static Property pxb_dev_properties[] = {
 /* Note: 0 is not a legal PXB bus number. */
 DEFINE_PROP_UINT8("bus_nr", PXBDev, bus_nr, 0),
 DEFINE_PROP_UINT16("

[PATCH RFC RESEND v2 1/6] hw/pci/pci_host: Add iommu property for pci host

From: Xingang Wang 

The pci host iommu property is useful to check whether
the iommu is enabled on the pci root bus.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/pci/pci.c  | 18 +-
 hw/pci/pci_host.c |  2 ++
 include/hw/pci/pci.h  |  1 +
 include/hw/pci/pci_host.h |  1 +
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index ac9a24889c..e17aa9075f 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -417,6 +417,22 @@ const char *pci_root_bus_path(PCIDevice *dev)
 return rootbus->qbus.name;
 }
 
+bool pci_root_bus_has_iommu(PCIBus *bus)
+{
+PCIBus *rootbus = bus;
+PCIHostState *host_bridge;
+
+if (!pci_bus_is_root(bus)) {
+rootbus = pci_device_root_bus(bus->parent_dev);
+}
+
+host_bridge = PCI_HOST_BRIDGE(rootbus->qbus.parent);
+
+assert(host_bridge->bus == rootbus);
+
+return host_bridge->iommu;
+}
+
 static void pci_root_bus_init(PCIBus *bus, DeviceState *parent,
   MemoryRegion *address_space_mem,
   MemoryRegion *address_space_io,
@@ -2716,7 +2732,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (iommu_bus && iommu_bus->iommu_fn) {
+if (pci_root_bus_has_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) {
 return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c
index 8ca5fadcbd..92ce213b18 100644
--- a/hw/pci/pci_host.c
+++ b/hw/pci/pci_host.c
@@ -222,6 +222,8 @@ const VMStateDescription vmstate_pcihost = {
 static Property pci_host_properties_common[] = {
 DEFINE_PROP_BOOL("x-config-reg-migration-enabled", PCIHostState,
  mig_enabled, true),
+DEFINE_PROP_BOOL("pci-host-iommu-enabled", PCIHostState,
+ iommu, true),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 6be4e0c460..718b5a454a 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -480,6 +480,7 @@ void pci_for_each_bus(PCIBus *bus,
 
 PCIBus *pci_device_root_bus(const PCIDevice *d);
 const char *pci_root_bus_path(PCIDevice *dev);
+bool pci_root_bus_has_iommu(PCIBus *bus);
 PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn);
 int pci_qdev_find_device(const char *id, PCIDevice **pdev);
 void pci_bus_get_w64_range(PCIBus *bus, Range *range);
diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h
index 52e038c019..64128e3a19 100644
--- a/include/hw/pci/pci_host.h
+++ b/include/hw/pci/pci_host.h
@@ -43,6 +43,7 @@ struct PCIHostState {
 uint32_t config_reg;
 bool mig_enabled;
 PCIBus *bus;
+bool iommu;
 
 QLIST_ENTRY(PCIHostState) next;
 };
-- 
2.19.1

[PATCH RFC RESEND v2 0/6] Introduce IOMMU Option For PCI Root Bus

From: Xingang Wang 

These patches add support for configuring iommu on/off per pci root bus,
including the primary bus and pxb root buses. At present, all root buses
go through the iommu when an iommu is configured, which is not flexible.

So this series adds an option to enable/disable iommu for the primary bus
and pxb root buses. When iommu is enabled for a root bus, devices attached
to it go through the iommu. When iommu is disabled for a root bus, devices
attached to it do not go through the iommu.

The option example for iommu configuration is like the following:

primary root bus option:
arm: -machine virt iommu=smmuv3,primary_bus_iommu=false(or true)
x86: -machine q35,primary_bus_iommu=false(or true)

pxb root bus:
 -device pxb-pcie,bus_nr=0x10,id=pci.10,bus=pcie.0,addr=0x3.0x1,iommu=false 

History:

v1 -> v2:
- rebase on top of v6.0.0-rc0
- Fix some issues
- Took into account Eric's comments, and remove the PCI_BUS_IOMMU flag,
  replace it with a property in PCIHostState.
- Add support for x86 iommu option

Xingang Wang (6):
  hw/pci/pci_host: Add iommu property for pci host
  hw/pci: Add iommu option for pci root bus
  hw/pci: Add pci_root_bus_max_bus
  hw/arm/virt-acpi-build: Add explicit idmap info in IORT table
  hw/i386/acpi-build: Add explicit scope in DMAR table
  hw/i386/acpi-build: Add iommu filter in IVRS table

 hw/arm/virt-acpi-build.c| 103 ++--
 hw/arm/virt.c   |  25 +++
 hw/i386/acpi-build.c|  70 ++-
 hw/i386/pc.c|  19 +
 hw/pci-bridge/pci_expander_bridge.c |   3 +
 hw/pci-host/q35.c   |   1 +
 hw/pci/pci.c|  52 +-
 hw/pci/pci_host.c   |   2 +
 include/hw/arm/virt.h   |   1 +
 include/hw/i386/pc.h|   1 +
 include/hw/pci/pci.h|   2 +
 include/hw/pci/pci_host.h   |   1 +
 12 files changed, 254 insertions(+), 26 deletions(-)

-- 
2.19.1

[PATCH RFC RESEND v2 4/6] hw/arm/virt-acpi-build: Add explicit idmap info in IORT table

From: Xingang Wang 

The idmap of the smmuv3 and root complex currently covers the whole RID space.
This patch adds explicit idmap info according to each root bus number range,
and adds an smmuv3 idmap only for buses which have the iommu property enabled.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/arm/virt-acpi-build.c | 103 ++-
 1 file changed, 81 insertions(+), 22 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index f5a2b2d4cb..5491036c86 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -44,6 +44,7 @@
 #include "hw/acpi/tpm.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "hw/pci-host/gpex.h"
 #include "hw/arm/virt.h"
 #include "hw/mem/nvdimm.h"
@@ -237,6 +238,41 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState 
*vms)
 aml_append(scope, dev);
 }
 
+typedef
+struct AcpiIortMapping {
+AcpiIortIdMapping idmap;
+bool iommu;
+} AcpiIortMapping;
+
+/* For all PCI host bridges, walk and record the IORT ID mapping */
+static int
+iort_host_bridges(Object *obj, void *opaque)
+{
+GArray *map_blob = opaque;
+AcpiIortMapping map;
+AcpiIortIdMapping *idmap = &map.idmap;
+int bus_num, max_bus;
+
+if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) {
+PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus;
+
+if (bus) {
+bus_num = pci_bus_num(bus);
+max_bus = pci_root_bus_max_bus(bus);
+
+idmap->input_base = cpu_to_le32(bus_num << 8);
+idmap->id_count = cpu_to_le32((max_bus - bus_num + 1) << 8);
+idmap->output_base = cpu_to_le32(bus_num << 8);
+idmap->flags = cpu_to_le32(0);
+
+map.iommu = pci_root_bus_has_iommu(bus);
+g_array_append_val(map_blob, map);
+}
+}
+
+return 0;
+}
+
 static void
 build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 {
@@ -247,6 +283,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 AcpiIortSmmu3 *smmu;
 size_t node_size, iort_node_offset, iort_length, smmu_offset = 0;
 AcpiIortRC *rc;
+int smmu_mapping_count;
+GArray *map_blob = g_array_new(false, true, sizeof(AcpiIortMapping));
+AcpiIortMapping *map;
+
+/* pci_for_each_bus(vms->bus, insert_map, map_blob); */
+object_child_foreach_recursive(object_get_root(),
+   iort_host_bridges, map_blob);
+
+smmu_mapping_count = 0;
+for (int i = 0; i < map_blob->len; i++) {
+map = &g_array_index(map_blob, AcpiIortMapping, i);
+if (map->iommu) {
+smmu_mapping_count++;
+}
+}
 
 iort = acpi_data_push(table_data, sizeof(*iort));
 
@@ -280,13 +331,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 
 /* SMMUv3 node */
 smmu_offset = iort_node_offset + node_size;
-node_size = sizeof(*smmu) + sizeof(*idmap);
+node_size = sizeof(*smmu) + sizeof(*idmap) * smmu_mapping_count;
 iort_length += node_size;
 smmu = acpi_data_push(table_data, node_size);
 
 smmu->type = ACPI_IORT_NODE_SMMU_V3;
 smmu->length = cpu_to_le16(node_size);
-smmu->mapping_count = cpu_to_le32(1);
+smmu->mapping_count = cpu_to_le32(smmu_mapping_count);
 smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
 smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
 smmu->flags = cpu_to_le32(ACPI_IORT_SMMU_V3_COHACC_OVERRIDE);
@@ -295,23 +346,28 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 smmu->gerr_gsiv = cpu_to_le32(irq + 2);
 smmu->sync_gsiv = cpu_to_le32(irq + 3);
 
-/* Identity RID mapping covering the whole input RID range */
-idmap = &smmu->id_mapping_array[0];
-idmap->input_base = 0;
-idmap->id_count = cpu_to_le32(0x);
-idmap->output_base = 0;
-/* output IORT node is the ITS group node (the first node) */
-idmap->output_reference = cpu_to_le32(iort_node_offset);
+for (int i = 0, j = 0; i < map_blob->len; i++) {
+map = &g_array_index(map_blob, AcpiIortMapping, i);
+
+if (!map->iommu) {
+continue;
+}
+
+idmap = &smmu->id_mapping_array[j++];
+*idmap = map->idmap;
+/* output IORT node is the ITS group node (the first node) */
+idmap->output_reference = cpu_to_le32(iort_node_offset);
+}
 }
 
 /* Root Complex Node */
-node_size = sizeof(*rc) + sizeof(*idmap);
+node_size = sizeof(*rc) + sizeof(*idmap) * map_blob->len;
 iort_length += node_size;
 rc = acpi_data_push(table_data, node_size);
 
 rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
 rc->length = cpu_to_le16(node_size);
-rc->mapping_count = cpu_to_le32(1);
+rc->mapping_count = cpu_to_le32(map_

[PATCH RFC RESEND v2 6/6] hw/i386/acpi-build: Add iommu filter in IVRS table

From: Xingang Wang 

When building the AMD IVRS table, only devices attached to a root bus with
the IOMMU flag should be scanned.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/i386/acpi-build.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 6936889cad..e0f38305da 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2229,7 +2229,7 @@ ivrs_host_bridges(Object *obj, void *opaque)
 if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) {
 PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus;
 
-if (bus) {
+if (bus && pci_root_bus_has_iommu(bus)) {
 pci_for_each_device(bus, pci_bus_num(bus), insert_ivhd, ivhd_blob);
 }
 }
-- 
2.19.1

[PATCH RFC RESEND v2 5/6] hw/i386/acpi-build: Add explicit scope in DMAR table

From: Xingang Wang 

In the DMAR table, the drhd is set to cover all pci devices when intel_iommu
is on. This patch adds explicit scope data, including only the pci devices
that go through the iommu.

Signed-off-by: Xingang Wang 
Signed-off-by: Jiahui Cen 
---
 hw/i386/acpi-build.c | 68 ++--
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index de98750aef..6936889cad 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1988,6 +1988,56 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
  x86ms->oem_table_id);
 }
 
+/*
+ * Insert DMAR scope for PCI bridges and endpoint devices
+ */
+static void
+insert_scope(PCIBus *bus, PCIDevice *dev, void *opaque)
+{
+GArray *scope_blob = opaque;
+AcpiDmarDeviceScope *scope = NULL;
+
+if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
+/* Dmar Scope Type: 0x02 for PCI Bridge */
+build_append_int_noprefix(scope_blob, 0x02, 1);
+} else {
+/* Dmar Scope Type: 0x01 for PCI Endpoint Device */
+build_append_int_noprefix(scope_blob, 0x01, 1);
+}
+
+/* length */
+build_append_int_noprefix(scope_blob,
+  sizeof(*scope) + sizeof(scope->path[0]), 1);
+/* reserved */
+build_append_int_noprefix(scope_blob, 0, 2);
+/* enumeration_id */
+build_append_int_noprefix(scope_blob, 0, 1);
+/* bus */
+build_append_int_noprefix(scope_blob, pci_bus_num(bus), 1);
+/* device */
+build_append_int_noprefix(scope_blob, PCI_SLOT(dev->devfn), 1);
+/* function */
+build_append_int_noprefix(scope_blob, PCI_FUNC(dev->devfn), 1);
+}
+
+/* For all PCI host bridges, walk and insert DMAR scope */
+static int
+dmar_host_bridges(Object *obj, void *opaque)
+{
+GArray *scope_blob = opaque;
+
+if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) {
+PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus;
+
+if (bus && pci_root_bus_has_iommu(bus)) {
+pci_for_each_device(bus, pci_bus_num(bus), insert_scope,
+scope_blob);
+}
+}
+
+return 0;
+}
+
 /*
  * VT-d spec 8.1 DMA Remapping Reporting Structure
  * (version Oct. 2014 or later)
@@ -2007,6 +2057,15 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, 
const char *oem_id,
 /* Root complex IOAPIC use one path[0] only */
 size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]);
 IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu);
+GArray *scope_blob = g_array_new(false, true, 1);
+
+/*
+ * A PCI bus walk, for each PCI host bridge.
+ * Insert scope for each PCI bridge and endpoint device which
+ * is attached to a bus with iommu enabled.
+ */
+object_child_foreach_recursive(object_get_root(),
+   dmar_host_bridges, scope_blob);
 
 assert(iommu);
 if (x86_iommu_ir_supported(iommu)) {
@@ -2020,8 +2079,9 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, 
const char *oem_id,
 /* DMAR Remapping Hardware Unit Definition structure */
 drhd = acpi_data_push(table_data, sizeof(*drhd) + ioapic_scope_size);
 drhd->type = cpu_to_le16(ACPI_DMAR_TYPE_HARDWARE_UNIT);
-drhd->length = cpu_to_le16(sizeof(*drhd) + ioapic_scope_size);
-drhd->flags = ACPI_DMAR_INCLUDE_PCI_ALL;
+drhd->length =
+cpu_to_le16(sizeof(*drhd) + ioapic_scope_size + scope_blob->len);
+drhd->flags = 0;/* Don't include all pci device */
 drhd->pci_segment = cpu_to_le16(0);
 drhd->address = cpu_to_le64(Q35_HOST_BRIDGE_IOMMU_ADDR);
 
@@ -2035,6 +2095,10 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, 
const char *oem_id,
 scope->path[0].device = PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC);
 scope->path[0].function = PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC);
 
+/* Add scope found above */
+g_array_append_vals(table_data, scope_blob->data, scope_blob->len);
+g_array_free(scope_blob, true);
+
 if (iommu->dt_supported) {
 atsr = acpi_data_push(table_data, sizeof(*atsr));
 atsr->type = cpu_to_le16(ACPI_DMAR_TYPE_ATSR);
-- 
2.19.1

Re: [PATCH v4 00/11] 64bit block-layer: part II


25.03.2021 00:13, no-re...@patchew.org wrote:

Patchew URL: 
https://patchew.org/QEMU/20210324205132.464899-1-vsement...@virtuozzo.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210324205132.464899-1-vsement...@virtuozzo.com
Subject: [PATCH v4 00/11] 64bit block-layer: part II

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
 From https://github.com/patchew-project/qemu
  - [tag update]  patchew/20210323221539.3532660-1-cr...@redhat.com -> 
patchew/20210323221539.3532660-1-cr...@redhat.com
  * [new tag] patchew/20210324205132.464899-1-vsement...@virtuozzo.com 
-> patchew/20210324205132.464899-1-vsement...@virtuozzo.com
Switched to a new branch 'test'
bed608a block/io: allow 64bit discard requests
9b3b5c7 block: use int64_t instead of int in driver discard handlers
9d5776f block: make BlockLimits::max_pdiscard 64bit
1dc4bab block/io: allow 64bit write-zeroes requests
05ca540 block: use int64_t instead of int in driver write_zeroes handlers
5864b0d block: make BlockLimits::max_pwrite_zeroes 64bit
9698c13 block: use int64_t instead of uint64_t in copy_range driver handlers
4e60566 block: use int64_t instead of uint64_t in driver write handlers
8aa3af1 block: use int64_t instead of uint64_t in driver read handlers
fc695f9 qcow2: check request on vmstate save/load path
a13a9ef block/io: bring request check to bdrv_co_{read, write}v_vmstate

=== OUTPUT BEGIN ===
1/11 Checking commit a13a9efd128c (block/io: bring request check to 
bdrv_co_{read, write}v_vmstate)
ERROR: Author email address is mangled by the mailing list
#2:
Author: Vladimir Sementsov-Ogievskiy via 


That's a strange false-positive.

Look at 1/11: it's not mangled in any way. Looking at the source I see clean 
"From:" header:

  From: Vladimir Sementsov-Ogievskiy 

And there is no "Author" in the message source at all. "qemu-devel" is 
noted only in the Cc: list.



--
Best regards,
Vladimir

Re: [PATCH 2/3] aspeed: Add Scater-Gather support for HACE Hash

2021-03-25 Thread Cédric Le Goater

On 3/24/21 11:38 PM, Klaus Heinrich Kiwi wrote:
> Complement the Aspeed HACE support with Scatter-Gather hash support for
> sha256 and sha512. Scatter-Gather is only supported on AST2600-series.
> 
> Signed-off-by: Klaus Heinrich Kiwi 

this looks good. A few extra comments,

> ---
>  hw/misc/aspeed_hace.c | 127 --
>  include/hw/misc/aspeed_hace.h |   6 ++
>  2 files changed, 127 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c
> index 93313d2b80..8a37b1d961 100644
> --- a/hw/misc/aspeed_hace.c
> +++ b/hw/misc/aspeed_hace.c
> @@ -57,6 +57,10 @@
>  /* Other cmd bits */
>  #define  HASH_IRQ_ENBIT(9)
>  #define  HASH_SG_EN BIT(18)
> +/* Scatter-gather data list */
> +#define  SG_LIST_LAST   BIT(31)
> +#define  SG_LIST_LEN_MASK   0x7fff
> +#define  SG_LIST_ADDR_MASK  0x7ff8  /* 8-byte aligned */
>  
>  static const struct {
>  uint32_t mask;
> @@ -129,6 +133,117 @@ static int do_hash_operation(AspeedHACEState *s, int 
> algo)
>  return 0;
>  }
>  
> +static int do_hash_sg_operation(AspeedHACEState *s, int algo)

Do we really care about the return value ? 

> +{
> +uint32_t src, dest, reqSize;
> +hwaddr len;
> +const size_t reqLen = sizeof(struct aspeed_sg_list);
> +struct iovec iov[ASPEED_HACE_MAX_SG];
> +unsigned int i = 0;
> +unsigned int isLast = 0;
> +uint8_t *digestBuf = NULL;
> +size_t digestLen = 0, size = 0;
> +struct aspeed_sg_list *sgList;
> +int rc;
> +
> +reqSize = s->regs[R_HASH_SRC_LEN];
> +dest = s->regs[R_HASH_DEST];
> +
> +while (!isLast && i < ASPEED_HACE_MAX_SG) {
> +src = s->regs[R_HASH_SRC] + (i * reqLen);
> +len = reqLen;
> +sgList = (struct aspeed_sg_list *) address_space_map(&s->dram_as,
> + src,
> + (hwaddr *) &len,
> +   false,
> + MEMTXATTRS_UNSPECIFIED);

This should be doing LE loads.

> +if (!sgList) {
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s: failed to map dram for SG Array entry '%u' for address 
> '0x%0x'\n",
> + __func__, i, src);
> +rc = -EACCES;
> +goto cleanup;
> +}
> +if (len != reqLen)
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s:  Warning: dram map for SG array entry '%u' requested size 
> '%lu' != mapped size '%lu'\n",
> + __func__, i, reqLen, len);
> +
> +isLast = sgList->len & SG_LIST_LAST;
> +
> +iov[i].iov_len = (hwaddr) (sgList->len & SG_LIST_LEN_MASK);
> +iov[i].iov_base = address_space_map(&s->dram_as,
> +sgList->phy_addr & SG_LIST_ADDR_MASK,
> +&iov[i].iov_len, false,
> +MEMTXATTRS_UNSPECIFIED);
> +if (!iov[i].iov_base) {
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s: failed to map dram for SG array entry '%u' for region 
> '0x%x', len '%u'\n",
> + __func__, i, sgList->phy_addr & SG_LIST_ADDR_MASK,
> + sgList->len & SG_LIST_LEN_MASK);
> +rc = -EACCES;
> +goto cleanup;
> +}
> +if (iov[i].iov_len != (sgList->len & SG_LIST_LEN_MASK))
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s:  Warning: dram map for SG region entry %u requested size 
> %u != mapped size %lu\n",
> + __func__, i, (sgList->len & SG_LIST_LEN_MASK), iov[i].iov_len);
> +
> +
> +address_space_unmap(&s->dram_as, (void *) sgList, len, false,
> +len);
> +size += iov[i].iov_len;
> +i++;
> +}
> +
> +if (!isLast) {
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s: Error: Exhausted maximum of '%u' SG array 
> entries\n",
> + __func__, ASPEED_HACE_MAX_SG);
> +rc = -ENOTSUP;
> +goto cleanup;
> +}
> +
> +if (size != reqSize)
> +qemu_log_mask(LOG_GUEST_ERROR,
> + "%s: Warning: requested SG total size %u != actual size %lu\n",
> + __func__, reqSize, size);
> +
> +rc = qcrypto_hash_bytesv(algo, iov, i, &digestBuf, &digestLen,
> +&error_fatal);
> +if (rc < 0) {
> +qemu_log_mask(LOG_GUEST_ERROR, "%s: qcrypto failed\n",
> +  __func__);
> +goto cleanup;
> +}
> +
> +rc = address_space_write(&s->dram_as, dest, MEMTXATTRS_UNSPECIFIED,
> + digestBuf, digestLen);
> +if (rc)
> +qemu_log_mask(LOG_GUEST_ERROR,
> +  "%s: address space write failed\n", __func__);
> +g_free(digestBuf);
> +
> +cleanup:
> +
> +for

Re: [PATCH v4 00/11] 64bit block-layer: part II


25.03.2021 10:42, Vladimir Sementsov-Ogievskiy wrote:

25.03.2021 00:13, no-re...@patchew.org wrote:

Patchew URL: 
https://patchew.org/QEMU/20210324205132.464899-1-vsement...@virtuozzo.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210324205132.464899-1-vsement...@virtuozzo.com
Subject: [PATCH v4 00/11] 64bit block-layer: part II

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
 From https://github.com/patchew-project/qemu
  - [tag update]  patchew/20210323221539.3532660-1-cr...@redhat.com -> 
patchew/20210323221539.3532660-1-cr...@redhat.com
  * [new tag] patchew/20210324205132.464899-1-vsement...@virtuozzo.com 
-> patchew/20210324205132.464899-1-vsement...@virtuozzo.com
Switched to a new branch 'test'
bed608a block/io: allow 64bit discard requests
9b3b5c7 block: use int64_t instead of int in driver discard handlers
9d5776f block: make BlockLimits::max_pdiscard 64bit
1dc4bab block/io: allow 64bit write-zeroes requests
05ca540 block: use int64_t instead of int in driver write_zeroes handlers
5864b0d block: make BlockLimits::max_pwrite_zeroes 64bit
9698c13 block: use int64_t instead of uint64_t in copy_range driver handlers
4e60566 block: use int64_t instead of uint64_t in driver write handlers
8aa3af1 block: use int64_t instead of uint64_t in driver read handlers
fc695f9 qcow2: check request on vmstate save/load path
a13a9ef block/io: bring request check to bdrv_co_{read, write}v_vmstate

=== OUTPUT BEGIN ===
1/11 Checking commit a13a9efd128c (block/io: bring request check to 
bdrv_co_{read, write}v_vmstate)
ERROR: Author email address is mangled by the mailing list
#2:
Author: Vladimir Sementsov-Ogievskiy via 


That's a strange false-positive.

Look at 1/11: it's not mangled in any way. Looking at the source I see clean 
"From:" header:

   From: Vladimir Sementsov-Ogievskiy 

And there isn't any "Author" in the message source at all. "qemu-devel" is
noted only in the Cc: list.



Hmm, but if I look at the mail on patchew,
https://patchew.org/QEMU/20210324205132.464899-1-vsement...@virtuozzo.com/20210324205132.464899-2-vsement...@virtuozzo.com/
yes it is mangled..

I hope everyone who is in CC (as me) gets this email not-mangled.

--
Best regards,
Vladimir

Re: [PATCH 1/2] spapr: number of SMP sockets must be equal to NUMA nodes

2021-03-25 Thread Cédric Le Goater

On 3/25/21 3:10 AM, David Gibson wrote:
> On Tue, Mar 23, 2021 at 02:21:33PM -0300, Daniel Henrique Barboza wrote:
>>
>>
>> On 3/22/21 10:03 PM, David Gibson wrote:
>>> On Fri, Mar 19, 2021 at 03:34:52PM -0300, Daniel Henrique Barboza wrote:
 Kernel commit 4bce545903fa ("powerpc/topology: Update
 topology_core_cpumask") cause a regression in the pseries machine when
 defining certain SMP topologies [1]. The reasoning behind the change is
 explained in kernel commit 4ca234a9cbd7 ("powerpc/smp: Stop updating
 cpu_core_mask"). In short, cpu_core_mask logic was causing troubles with
 large VMs with lots of CPUs and was changed by cpu_cpu_mask because, as
 far as the kernel understanding of SMP topologies goes, both masks are
 equivalent.

 Further discussions in the kernel mailing list [2] shown that the
 powerpc kernel always considered that the number of sockets were equal
 to the number of NUMA nodes. The claim is that it doesn't make sense,
 for Power hardware at least, 2+ sockets being in the same NUMA node. The
 immediate conclusion is that all SMP topologies the pseries machine were
 supplying to the kernel, with more than one socket in the same NUMA node
 as in [1], happened to be correctly represented in the kernel by
 accident during all these years.

 There's a case to be made for virtual topologies being detached from
 hardware constraints, allowing maximum flexibility to users. At the same
 time, this freedom can't result in unrealistic hardware representations
 being emulated. If the real hardware and the pseries kernel don't
 support multiple chips/sockets in the same NUMA node, neither should we.

 Starting in 6.0.0, all sockets must match an unique NUMA node in the
 pseries machine. qtest changes were made to adapt to this new
 condition.
>>>
>>> Oof.  I really don't like this idea.  It means a bunch of fiddly work
>>> for users to match these up, for no real gain.  I'm also concerned
>>> that this will require follow on changes in libvirt to not make this a
>>> really cryptic and irritating point of failure.
>>
>> Haven't thought about required Libvirt changes, although I can say that there
>> will be some amount to be made and it will probably annoy existing users
>> (everyone that has a multiple socket per NUMA node topology).
>>
>> There is not much we can do from the QEMU layer aside from what I've proposed
>> here. The other alternative is to keep interacting with the kernel folks to
>> see if there is a way to keep our use case untouched.
> 
> Right.  Well.. not necessarily untouched, but I'm hoping for more
> replies from Cédric to my objections and mpe's.  Even with sockets
> being a kinda meaningless concept in PAPR, I don't think tying it to
> NUMA nodes makes sense.

I did a couple of replies in different email threads but maybe not 
to all. I felt it was going nowhere :/ Couple of thoughts,

Shouldn't we get rid of the socket concept, die also, under pseries 
since they don't exist under PAPR ? We only have numa nodes, cores, 
threads AFAICT.

Should we diverge from PAPR and add extra DT properties "qemu,..." ?
There are a couple of places where Linux checks for the underlying 
hypervisor already.

>> This also means that
>> 'ibm,chip-id' will probably remain in use since it's the only place where
>> we inform cores per socket information to the kernel.
> 
> Well.. unless we can find some other sensible way to convey that
> information.  I haven't given up hope for that yet.

Well, we could start by fixing the value in QEMU. It is broken today.

This is all coming from some work we did last year to evaluate our HW 
(mostly for XIVE) on 2s, 4s, 16s systems on baremetal, KVM and PowerVM. 
We saw some real problems because Linux did not have a clear view of the 
topology. See the figures here : 

http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210303174857.1760393-9-...@kaod.org/

The node id is a key parameter for system resource management, memory 
allocation, interrupt affinity, etc. Linux scales much better if used
correctly. 

C.

Re: [RFC PATCH v2 0/3] virtio-net: graceful drop of vhost for TAP

2021-03-25 Thread Yuri Benditovich

Hi Jason,

This was discussed earlier on the previous series of patches.
https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg01829.html
There were strong objections from both Daniel and Michael and I feel
that the series was rejected.
There was Michael's claim:
"We did what this patch is trying to change for years now, in
particular KVM also seems to happily disable CPU features not supported
by kernel so I wonder why we can't keep doing it, with tweaks for some
corner cases."
https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg03187.html
And it was Michael's question:
"Can we limit the change to when a VM is migrated in?"
https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg03163.html
So I'm trying to suggest another approach:
- In case of conflicting features (for example RSS and vhost) we in
qemu do not have enough information to prefer one or the other.
- If we drop to userspace in the first set_features we say: "vhost is
less important than other requested features"
- This series keeps backward compatibility, i.e. if you start with
vhost and some features are not available - they are silently cleared.
- But in case the features are available on source machine - they are used
- In case of migration this series says: "We prefer successful
migration even if for that we need to drop to userspace"
- On the migration back to the 1st system we again work with all the
features and with vhost as all the features are available.

Thanks,
Yuri



On Thu, Mar 25, 2021 at 8:59 AM Jason Wang  wrote:
>
>
> 在 2021/3/22 下午8:24, Yuri Benditovich 写道:
> > Allow fallback to userspace only upon migration, only for specific features
> > and only if 'vhostforce' is not requested.
> >
> > Changes from v1:
> > Patch 1 dropped (will be submitted in another series)
> > Added device callback in case the migration should fail due to missing 
> > features
>
>
> Hi Yuri:
>
> Had a quick glance at the series. A question is why we need to do the
> fallback only during load?
>
> I think we should do it during device initialization. E.g. when the vhost
> features cannot be satisfied, we should disable vhost from that point on.
>
> Thanks
>
>
> >
> > Yuri Benditovich (3):
> >net: add ability to hide (disable) vhost_net
> >virtio: introduce 'missing_features_migrated' device callback
> >virtio-net: implement missing_features_migrated callback
> >
> >   hw/net/vhost_net.c |  4 ++-
> >   hw/net/virtio-net.c| 51 ++
> >   hw/virtio/virtio.c |  8 ++
> >   include/hw/virtio/virtio.h |  8 ++
> >   include/net/net.h  |  1 +
> >   5 files changed, 71 insertions(+), 1 deletion(-)
> >
>

Re: gitlab-ci: Only build /staging branch?

2021-03-25 Thread Philippe Mathieu-Daudé

On 3/25/21 6:43 AM, Thomas Huth wrote:
> On 24/03/2021 22.58, Philippe Mathieu-Daudé wrote:
>> On 3/24/21 7:33 PM, Philippe Mathieu-Daudé wrote:
>>> On 3/24/21 7:01 PM, Philippe Mathieu-Daudé wrote:
 Hi,

 Peter's current workflow is push to /staging and if his
 testing succeeds, he pushes the same commit as /master.

 IMO there is no point in building /master branch, as it
 has already been built earlier as /staging.
>>
>> Similarly with tags. Although we don't tag often.
> 
> Tags are used for pull-requests. So I think we should run the whole CI
> for tags, to make it clear that a pull-request always includes code that
> builds fine.

Sorry the context was not clear :/

This is only relevant for the qemu-project/qemu gitlab namespace.

v6.0 is at the door and I was wondering what is missing to have the
CI used as a gate.

- Stefan/Paolo moved the main repository location.

- Alex made yet another effort to get the CI pipeline green again.

- IIRC Peter said waiting 2h after pushing /staging is too long.
Currently worst case it takes ~2h25 between one /staging and the
next one, simply because /master is rebuilt in the middle. If we
remove /master we have ~1h15 per /staging pipeline.

- I don't remember what is missing from Cleber's script, maybe we can
use it as is without waiting for a respin?

We have never been that close, but we are not there yet...

Regards,

Phil.

Re: [PATCH V2] target/riscv: Align the data type of reset vector address

2021-03-25 Thread Dylan Jhong

Hi All,

Please ignore this patch.
There is a compile error while building 32bit qemu.

The error occurs in ./target/riscv/cpu.c:557  
"DEFINE_PROP_UINT64("resetvec", RISCVCPU, cfg.resetvec, DEFAULT_RSTVEC)"

It should be written differently according to 32bit or 64bit machine.

I'll send patch v3 to fix this issue.
Sorry for my mistake.

Regards,
Dylan

On Thu, Mar 25, 2021 at 01:52:13PM +0800, Dylan Dai-Rong Jhong(鍾岱融) wrote:
> Signed-off-by: Dylan Jhong 
> Signed-off-by: Ruinland ChuanTzu Tsai 
> ---
>  target/riscv/cpu.c | 2 +-
>  target/riscv/cpu.h | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 7d6ed80f6b..4ac901245a 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -137,7 +137,7 @@ static void set_feature(CPURISCVState *env, int feature)
>  env->features |= (1ULL << feature);
>  }
>  
> -static void set_resetvec(CPURISCVState *env, int resetvec)
> +static void set_resetvec(CPURISCVState *env, target_ulong resetvec)
>  {
>  #ifndef CONFIG_USER_ONLY
>  env->resetvec = resetvec;
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index 0a33d387ba..d9d7891666 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -303,7 +303,7 @@ struct RISCVCPU {
>  uint16_t elen;
>  bool mmu;
>  bool pmp;
> -uint64_t resetvec;
> +target_ulong resetvec;
>  } cfg;
>  };
>  
> -- 
> 2.17.1
>

Re: [PATCH 05/15] Hexagon (target/hexagon) change variables from int to bool when appropriate

2021-03-25 Thread Philippe Mathieu-Daudé

Hi Taylor,

On 3/25/21 3:50 AM, Taylor Simpson wrote:
> Address feedback from Richard Henderson 

If you look at the git history, we use the following tags:
- Reported-by: Richard Henderson 
- Suggested-by: Richard Henderson 
and tools know how to use them:
https://repo.or.cz/git-dm.git/blob/5ccc4dac6837:/gitdm#l292
(same comment applies to other patches in your series).

That said,
Reviewed-by: Philippe Mathieu-Daudé 

> 
> Signed-off-by: Taylor Simpson 
> ---
>  target/hexagon/cpu_bits.h  |  2 +-
>  target/hexagon/decode.c| 80 
> +++---
>  target/hexagon/insn.h  | 21 ++--
>  target/hexagon/op_helper.c |  8 ++---
>  target/hexagon/translate.c |  6 ++--
>  target/hexagon/translate.h |  2 +-
>  6 files changed, 60 insertions(+), 59 deletions(-)

Re: [PATCH v4 09/14] util/mmap-alloc: Pass flags instead of separate bools to qemu_ram_mmap()

2021-03-25 Thread David Hildenbrand


On 23.03.21 21:49, Peter Xu wrote:

On Fri, Mar 19, 2021 at 11:12:25AM +0100, David Hildenbrand wrote:

Let's pass flags instead of bools to prepare for passing other flags and
update the documentation of qemu_ram_mmap(). Introduce new QEMU_MAP_
flags that abstract the mmap() PROT_ and MAP_ flag handling and simplify
it.

We expose only flags that are currently supported by qemu_ram_mmap().
Maybe, we'll see qemu_mmap() in the future as well that can implement these
flags.

Note: We don't use MAP_ flags as some flags (e.g., MAP_SYNC) are only
defined for some systems and we want to always be able to identify
these flags reliably inside qemu_ram_mmap() -- for example, to properly
warn when some future flags are not available or effective on a system.
Also, this way we can simplify PROT_ handling as well.

Signed-off-by: David Hildenbrand 
---
  include/qemu/mmap-alloc.h | 16 +---
  include/qemu/osdep.h  | 18 ++
  softmmu/physmem.c |  8 +---
  util/mmap-alloc.c | 15 ---
  util/oslib-posix.c|  3 ++-
  5 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index 456ff87df1..a60a2085b3 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -7,18 +7,22 @@ size_t qemu_fd_getpagesize(int fd);
  size_t qemu_mempath_getpagesize(const char *mem_path);
  
  /**

- * qemu_ram_mmap: mmap the specified file or device.
+ * qemu_ram_mmap: mmap anonymous memory, the specified file or device.
+ *
+ * mmap() abstraction to map guest RAM, simplifying flag handling, taking
+ * care of alignment requirements and installing guard pages.
   *
   * Parameters:
   *  @fd: the file or the device to mmap
   *  @size: the number of bytes to be mmaped
   *  @align: if not zero, specify the alignment of the starting mapping 
address;
   *  otherwise, the alignment in use will be determined by QEMU.
- *  @readonly: true for a read-only mapping, false for read/write.
- *  @shared: map has RAM_SHARED flag.
- *  @is_pmem: map has RAM_PMEM flag.
+ *  @qemu_map_flags: QEMU_MAP_* flags
   *  @map_offset: map starts at offset of map_offset from the start of fd
   *
+ * Internally, MAP PRIVATE, MAP_ANONYMOUS and MAP_SHARED_VALIDATE are set

  ^
  |
  + underscore



Nice catch, thanks :)


--
Thanks,

David / dhildenb

[PATCH V3] target/riscv: Align the data type of reset vector address

2021-03-25 Thread Dylan Jhong

Signed-off-by: Dylan Jhong 
Signed-off-by: Ruinland ChuanTzu Tsai 
---
 target/riscv/cpu.c | 6 +-
 target/riscv/cpu.h | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 7d6ed80f6b..8a5f18bcb0 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -137,7 +137,7 @@ static void set_feature(CPURISCVState *env, int feature)
 env->features |= (1ULL << feature);
 }
 
-static void set_resetvec(CPURISCVState *env, int resetvec)
+static void set_resetvec(CPURISCVState *env, target_ulong resetvec)
 {
 #ifndef CONFIG_USER_ONLY
 env->resetvec = resetvec;
@@ -554,7 +554,11 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
 DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
 DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
+#if defined(TARGET_RISCV32)
+DEFINE_PROP_UINT32("resetvec", RISCVCPU, cfg.resetvec, DEFAULT_RSTVEC),
+#elif defined(TARGET_RISCV64)
 DEFINE_PROP_UINT64("resetvec", RISCVCPU, cfg.resetvec, DEFAULT_RSTVEC),
+#endif
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 0a33d387ba..d9d7891666 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -303,7 +303,7 @@ struct RISCVCPU {
 uint16_t elen;
 bool mmu;
 bool pmp;
-uint64_t resetvec;
+target_ulong resetvec;
 } cfg;
 };
 
-- 
2.17.1

Re: gitlab-ci: Only build /staging branch?

2021-03-25 Thread Philippe Mathieu-Daudé

On 3/25/21 10:29 AM, Philippe Mathieu-Daudé wrote:
> On 3/25/21 6:43 AM, Thomas Huth wrote:
>> On 24/03/2021 22.58, Philippe Mathieu-Daudé wrote:
>>> On 3/24/21 7:33 PM, Philippe Mathieu-Daudé wrote:
 On 3/24/21 7:01 PM, Philippe Mathieu-Daudé wrote:
> Hi,
>
> Peter's current workflow is push to /staging and if his
> testing succeeds, he pushes the same commit as /master.
>
> IMO there is no point in building /master branch, as it
> has already been built earlier as /staging.
>>>
>>> Similarly with tags. Although we don't tag often.
>>
>> Tags are used for pull-requests. So I think we should run the whole CI
>> for tags, to make it clear that a pull-request always includes code that
>> builds fine.
> 
> Sorry the context was not clear :/
> 
> This is only relevant for the qemu-project/qemu gitlab namespace.
> 
> v6.0 is at the door and I was wondering what is missing to have the
> CI used as a gate.
> 
> - Stefan/Paolo moved the main repository location.
> 
> - Alex made yet another effort to get the CI pipeline green again.
> 
> - IIRC Peter said waiting 2h after pushing /staging is too long.
> Currently worst case it takes ~2h25 between one /staging and the
> next one, simply because /master is rebuilt in the middle. If we
> remove /master we have ~1h15 per /staging pipeline.
> 
> - I don't remember what is missing from Cleber's script, maybe we can
> use it as is without waiting for a respin?

As someone else is caring about this, please disregard this thread
and its questions/suggestions.

Regards,

Phil.

Re: [PULL 0/2] Block patches

2021-03-25 Thread Stefan Hajnoczi

On Wed, Mar 24, 2021 at 08:42:27PM +, Peter Maydell wrote:
> On Wed, 24 Mar 2021 at 20:18, Vladimir Sementsov-Ogievskiy
>  wrote:
> >
> > 24.03.2021 21:05, Peter Maydell wrote:
> > > On Wed, 24 Mar 2021 at 14:52, Stefan Hajnoczi  wrote:
> > >>
> > >> Vladimir Sementsov-Ogievskiy (2):
> > >>migration/block-dirty-bitmap: make incoming disabled bitmaps busy
> > >>migrate-bitmaps-postcopy-test: check that we can't remove in-flight
> > >>  bitmaps
> > >
> > > This failed the 'qsd-jobs' iotest on s390x:
> 
> > I can't believe it related. My commit modifies bitmap status during bitmaps 
> > migration on target vm. There is no kind of migration in qsd-jobs test.
> 
> It's possible it's an intermittent, but it's not one I've seen
> before. We still have lots of time this release cycle to figure
> out the issue and get this fix in.

Vladimir: I'll get hold of an s390 machine and try to reproduce the
failure. I should have some news by Monday.

Let's put the pull request on hold for now.

Stefan


signature.asc
Description: PGP signature

[PATCH 0/1] avocado-qemu: New SMMUv3 tests

2021-03-25 Thread Eric Auger

This patch adds a first set of SMMU functional tests using
a Fedora cloud-init image. Given the kernel in use,
range invalidation is not tested yet. However different
guest kernel configurations are tested: standard, strict=0
and passthrough mode.

The patch applies on top of Cleber's series:
[PATCH v2 00/10] Acceptance Test: introduce base class for
Linux based tests.

Special thanks to Cleber for his support on this first
trial.

Best Regards

Eric

Eric Auger (1):
  avocado_qemu: Add SMMUv3 tests

 tests/acceptance/smmu.py | 104 +++
 1 file changed, 104 insertions(+)
 create mode 100644 tests/acceptance/smmu.py

-- 
2.26.2

[PATCH 1/1] avocado_qemu: Add SMMUv3 tests

2021-03-25 Thread Eric Auger

Add new tests checking the good behavior of the SMMUv3 protecting
2 virtio pci devices (block and net). We check the guest boots and
we are able to install a package. Different guest configs are tested:
standard, passthrough and strict=0. Given the version of the guest
kernel in use (5.3.7 at this moment), range invalidation is not yet
tested. This will be handled separately.

Signed-off-by: Eric Auger 
---
 tests/acceptance/smmu.py | 104 +++
 1 file changed, 104 insertions(+)
 create mode 100644 tests/acceptance/smmu.py

diff --git a/tests/acceptance/smmu.py b/tests/acceptance/smmu.py
new file mode 100644
index 00..65ecac8f1a
--- /dev/null
+++ b/tests/acceptance/smmu.py
@@ -0,0 +1,104 @@
+# SMMUv3 Functional tests
+#
+# Copyright (c) 2021 Red Hat, Inc.
+#
+# Author:
+#  Eric Auger 
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+
+import os
+
+from avocado_qemu import LinuxTest, BUILD_DIR
+from avocado.utils import ssh
+
+class SMMU(LinuxTest):
+
+KERNEL_COMMON_PARAMS = ("root=UUID=b6950a44-9f3c-4076-a9c2-355e8475b0a7 ro 
"
+"earlyprintk=pl011,0x900 ignore_loglevel "
+"no_timer_check printk.time=1 rd_NO_PLYMOUTH "
+"console=ttyAMA0 ")
+IOMMU_ADDON = ',iommu_platform=on,disable-modern=off,disable-legacy=on'
+IMAGE = ("https://archives.fedoraproject.org/pub/archive/fedora/";
+ "linux/releases/31/Everything/aarch64/os/images/pxeboot/")
+kernel_path = None
+initrd_path = None
+kernel_params = None
+
+def set_up_boot(self):
+path = self.download_boot()
+self.vm.add_args('-device', 'virtio-blk-pci,bus=pcie.0,scsi=off,' +
+ 'drive=drv0,id=virtio-disk0,bootindex=1,'
+ 'werror=stop,rerror=stop' + self.IOMMU_ADDON)
+self.vm.add_args('-drive',
+ 'file=%s,if=none,cache=writethrough,id=drv0' % path)
+
+def setUp(self):
+super(SMMU, self).setUp(None, 'virtio-net-pci' + self.IOMMU_ADDON)
+
+def add_common_args(self):
+self.vm.add_args("-machine", "virt")
+self.vm.add_args('-bios', os.path.join(BUILD_DIR, 'pc-bios',
+  'edk2-aarch64-code.fd'))
+self.vm.add_args('-device', 'virtio-rng-pci,rng=rng0')
+self.vm.add_args('-object',
+ 'rng-random,id=rng0,filename=/dev/urandom')
+
+def common_vm_setup(self, custom_kernel=None):
+self.require_accelerator("kvm")
+self.add_common_args()
+self.vm.add_args("-accel", "kvm")
+self.vm.add_args("-cpu", "host")
+self.vm.add_args("-machine", "iommu=smmuv3")
+
+if custom_kernel is None:
+return
+
+kernel_url = self.IMAGE + 'vmlinuz'
+initrd_url = self.IMAGE + 'initrd.img'
+self.kernel_path = self.fetch_asset(kernel_url)
+self.initrd_path = self.fetch_asset(initrd_url)
+
+def run_and_check(self):
+if self.kernel_path:
+self.vm.add_args('-kernel', self.kernel_path,
+ '-append', self.kernel_params,
+ '-initrd', self.initrd_path)
+self.launch_and_wait()
+self.ssh_command('cat /proc/cmdline')
+self.ssh_command('dnf -y install numactl-devel')
+
+def test_smmu(self):
+"""
+:avocado: tags=accel:kvm
+:avocado: tags=cpu:host
+:avocado: tags=smmu
+"""
+
+self.common_vm_setup()
+self.run_and_check()
+
+def test_smmu_passthrough(self):
+"""
+:avocado: tags=accel:kvm
+:avocado: tags=cpu:host
+:avocado: tags=smmu
+"""
+self.common_vm_setup(True)
+
+self.kernel_params = self.KERNEL_COMMON_PARAMS + 'iommu.passthrough=on'
+
+self.run_and_check()
+
+def test_smmu_nostrict(self):
+"""
+:avocado: tags=accel:kvm
+:avocado: tags=cpu:host
+:avocado: tags=smmu
+"""
+self.common_vm_setup(True)
+
+self.kernel_params = self.KERNEL_COMMON_PARAMS + 'iommu.strict=0'
+
+self.run_and_check()
-- 
2.26.2

Re: [PATCH v1 0/3] migration: Fixes to the 'background-snapshot' code

2021-03-25 Thread Andrey Gruzdev


On 24.03.2021 18:41, Peter Xu wrote:

On Wed, Mar 24, 2021 at 11:09:27AM +0300, Andrey Gruzdev wrote:

I'm also looking into introducing UFFD_FEATURE_WP_UNALLOCATED so as to
wr-protect page holes too for a uffd-wp region when the feature bit is set.
With that feature we should be able to avoid pre-fault as what we do in the
last patch of this series.  However even if that can work out, we'll still need
this for old kernel anyways.

I'm curious whether this new feature is based on adding wr-protection at the
level of VMAs, so we won't miss write faults for missing pages?

I think we can do it with multiple ways.

The most efficient one would be wr-protect the range during uffd-wp
registration, so as you said it'll be per-vma attribute.  However that'll
change the general semantics of uffd-wp as normally we need registration and
explicit wr-protect.  Then it'll still be pte-based for faulted in pages (the
ones we wr-protected during registration will still be), however for the rest
it'll become vma-based.  It's indeed a bit confusing.

The other way is we can fault in the zero page during UFFDIO_WRITEPROTECT.  However
that's less efficient, since it's close to pre-fault on read but it's just
slightly cleaner than doing it in userspace.  When I rethink about this it
may not be worth doing in the kernel if userspace can achieve something similar.

So let's stick with current solution; that idea may need more thoughts..

Thanks,

Agree, let's stick with current solution. For the future I think having 
a registration

flag like WP_MISSING to induce per-vma wr-protection is not a bad choice.

The reason is that usage of UFFDIO_WRITEPROTECT ioctl is often 
asymmetrical; we usually

write-protect the whole registration range but un-protect by small chunks.

So if we stay with the current symmetric protect/un-protect API but add the
registration flag to handle protection for unpopulated pages - that may be
worth doing.

--
Andrey Gruzdev, Principal Engineer
Virtuozzo GmbH  +7-903-247-6397
virtuozzo.com

Re: [PATCH] qapi: introduce 'query-cpu-model-cpuid' action


24.03.2021 16:39, Valeriy Vdovin wrote:

Introducing new qapi method 'query-cpu-model-cpuid'. This method can be used to
get virtualized cpu model info generated by QEMU during VM initialization in
the form of cpuid representation.

Diving into more details about virtual cpu generation: QEMU first parses '-cpu'
command line option. From there it takes the name of the model as the basis for
feature set of the new virtual cpu. After that it uses trailing '-cpu' options,
that state if additional cpu features should be present on the virtual cpu or
excluded from it (tokens '+'/'-' or '=on'/'=off').
After that QEMU checks if the host's cpu can actually support the derived
feature set and applies host limitations to it.
After this initialization procedure, virtual cpu has its model and
vendor names, and a working feature set and is ready for identification
instructions such as CPUID.

Currently full output for this method is only supported for x86 cpus.

To learn exactly how virtual cpu is presented to the guest machine via CPUID
instruction, new qapi method can be used. By calling 'query-cpu-model-cpuid'
method, one can get a full listing of all CPUID leafs with subleafs which are
supported by the initialized virtual cpu.

Other than debug, the method is useful in cases when we would like to
utilize QEMU's virtual cpu initialization routines and put the retrieved
values into kernel CPUID overriding mechanics for more precise control
over how various processes perceive its underlying hardware with
container processes as a good example.

Output format:
The core part of the returned JSON object can be described as a list of lists,
with the top-level list containing leaf-level elements and the bottom level
containing subleafs, where 'leaf' is CPUID argument passed in EAX register and
'subleaf' is a value passed to CPUID in ECX register for some specific
leafs, that support that. Each most basic CPUID result is passed in a
maximum of 4 registers EAX, EBX, ECX and EDX, with most leafs not utilizing
all 4 registers at once.
Also note that 'subleaf' is a kind of extension, used by only a couple of
leafs, while most of the leafs don't have this. Nevertheless, the output
data structure presents ALL leafs as having at least a single 'subleaf'.
This is done for data structure uniformity, so that it could be
processed in a more straightforward manner, in this case no one suffers
from such simplification.

Use example:
virsh qemu-monitor-command VM --pretty '{ "execute": "query-cpu-model-cpuid" }'
{
   "return": {
 "cpuid": {
   "leafs": [
 {
   "leaf": 0,
   "subleafs": [
 {
   "eax": 13,
   "edx": 1231384169,
   "ecx": 1818588270,
   "ebx": 1970169159,
   "subleaf": 0
 }
   ]
 },
 {
   "leaf": 1,
   "subleafs": [
 {
   "eax": 329443,
   "edx": 529267711,
   "ecx": 4160369187,
   "ebx": 133120,
   "subleaf": 0
 }
   ]
 },
 {
   "leaf": 2,
   "subleafs": [
 {
   "eax": 1,
   "edx": 2895997,
   "ecx": 0,
   "ebx": 0,
   "subleaf": 0
 }
   ]
 },
   ]
 },
 "vendor": "GenuineIntel",
 "class-name": "Skylake-Client-IBRS-x86_64-cpu",
 "model-id": "Intel Core Processor (Skylake, IBRS)"
   },
   "id": "libvirt-40"
}
Signed-off-by: Valeriy Vdovin 
---
  qapi/machine-target.json | 122 
  target/i386/cpu.c| 292 +--
  2 files changed, 405 insertions(+), 9 deletions(-)

diff --git a/qapi/machine-target.json b/qapi/machine-target.json
index e7811654b7..c5b137aa5c 100644
--- a/qapi/machine-target.json
+++ b/qapi/machine-target.json
@@ -329,3 +329,125 @@
  ##
  { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'],
'if': 'defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_I386) 
|| defined(TARGET_S390X) || defined(TARGET_MIPS)' }
+##
+
+
+# @CpuidSubleaf:
+#
+# CPUID leaf extension information, based on ECX value.
+#
+# CPUID x86 instruction has 'leaf' argument passed in EAX register. Leaf
+# argument identifies the type of information, the caller wants to retrieve in
+# single call to CPUID.
+# Some but not all leaves depend on the value passed in ECX register as an
+# additional argument to CPUID. This argument is present in cpuid documentation
+# as 'subleaf'.
+# If CPUID ignores the value in ECX, normally this means that leaf does not
+# have subleaves. Another way to see it is that each leaf has at least one
+# subleaf (one type of output).
+#
+# @subleaf: value passed to CPUID in ECX register. If CPUID leaf has only a
+#   single leaf, the value of ECX is ignored by CPU and should as well
+#   be ignored in this field.
+# @eax: value in eax af

Re: [PATCH 1/2] spapr: number of SMP sockets must be equal to NUMA nodes

2021-03-25 Thread Daniel Henrique Barboza

On 3/25/21 5:56 AM, Cédric Le Goater wrote:

On 3/25/21 3:10 AM, David Gibson wrote:

On Tue, Mar 23, 2021 at 02:21:33PM -0300, Daniel Henrique Barboza wrote:

On 3/22/21 10:03 PM, David Gibson wrote:

On Fri, Mar 19, 2021 at 03:34:52PM -0300, Daniel Henrique Barboza wrote:

Kernel commit 4bce545903fa ("powerpc/topology: Update
topology_core_cpumask") caused a regression in the pseries machine when
defining certain SMP topologies [1]. The reasoning behind the change is
explained in kernel commit 4ca234a9cbd7 ("powerpc/smp: Stop updating
cpu_core_mask"). In short, cpu_core_mask logic was causing troubles with
large VMs with lots of CPUs and was changed by cpu_cpu_mask because, as
far as the kernel understanding of SMP topologies goes, both masks are
equivalent.

Further discussions in the kernel mailing list [2] showed that the
powerpc kernel always considered that the number of sockets was equal
to the number of NUMA nodes. The claim is that it doesn't make sense,
for Power hardware at least, 2+ sockets being in the same NUMA node. The
immediate conclusion is that all SMP topologies the pseries machine were
supplying to the kernel, with more than one socket in the same NUMA node
as in [1], happened to be correctly represented in the kernel by
accident during all these years.

There's a case to be made for virtual topologies being detached from
hardware constraints, allowing maximum flexibility to users. At the same
time, this freedom can't result in unrealistic hardware representations
being emulated. If the real hardware and the pseries kernel don't
support multiple chips/sockets in the same NUMA node, neither should we.

Starting in 6.0.0, all sockets must match a unique NUMA node in the
pseries machine. qtest changes were made to adapt to this new
condition.

Oof. I really don't like this idea. It means a bunch of fiddly work
for users to match these up, for no real gain. I'm also concerned
that this will require follow on changes in libvirt to not make this a
really cryptic and irritating point of failure.

Haven't thought about required Libvirt changes, although I can say that there
will be some amount to be made and it will probably annoy existing users
(everyone that has a multiple socket per NUMA node topology).

There is not much we can do from the QEMU layer aside from what I've proposed
here. The other alternative is to keep interacting with the kernel folks to
see if there is a way to keep our use case untouched.

Right. Well.. not necessarily untouched, but I'm hoping for more
replies from Cédric to my objections and mpe's. Even with sockets
being a kinda meaningless concept in PAPR, I don't think tying it to
NUMA nodes makes sense.

I did a couple of replies in different email threads but maybe not
to all. I felt it was going nowhere :/ Couple of thoughts,

Shouldn't we get rid of the socket concept, die also, under pseries
since they don't exist under PAPR ? We only have numa nodes, cores,
threads AFAICT.

I don't think we work with 'die'.

Getting rid of the 'socket' representation is sensible regarding PAPR,
but the effect for pseries will be similar to what this patch is already
doing: users could have multiple sockets in the same NUMA node, and then
they won't. Either because we got rid of the 'socket' representation or
because socket == NUMA node.

Should we diverge from PAPR and add extra DT properties "qemu,..." ?
There are a couple of places where Linux checks for the underlying
hypervisor already.

This also means that
'ibm,chip-id' will probably remain in use since it's the only place where
we inform cores per socket information to the kernel.

Well.. unless we can find some other sensible way to convey that
information. I haven't given up hope for that yet.

Well, we could start by fixing the value in QEMU. It is broken today.

I'll look into it. It makes more sense to talk about keeping it when
it's working properly.

DHB

This is all coming from some work we did last year to evaluate our HW
(mostly for XIVE) on 2s, 4s, 16s systems on baremetal, KVM and PowerVM.
We saw some real problems because Linux did not have a clear view of the
topology. See the figures here :

http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210303174857.1760393-9-...@kaod.org/

The node id is a key parameter for system resource management, memory
allocation, interrupt affinity, etc. Linux scales much better if used
correctly.

Re: [PULL 0/2] Block patches


25.03.2021 12:56, Stefan Hajnoczi wrote:

On Wed, Mar 24, 2021 at 08:42:27PM +, Peter Maydell wrote:

On Wed, 24 Mar 2021 at 20:18, Vladimir Sementsov-Ogievskiy
 wrote:


24.03.2021 21:05, Peter Maydell wrote:

On Wed, 24 Mar 2021 at 14:52, Stefan Hajnoczi  wrote:


Vladimir Sementsov-Ogievskiy (2):
migration/block-dirty-bitmap: make incoming disabled bitmaps busy
migrate-bitmaps-postcopy-test: check that we can't remove in-flight
  bitmaps


This failed the 'qsd-jobs' iotest on s390x:



I can't believe it's related. My commit modifies bitmap status during bitmaps 
migration on target vm. There is no kind of migration in qsd-jobs test.


It's possible it's an intermittent, but it's not one I've seen
before. We still have lots of time this release cycle to figure
out the issue and get this fix in.


Vladimir: I'll get hold of an s390 machine and try to reproduce the
failure. I should have some news by Monday.


Thanks! My patch modifies migration/block-dirty-bitmap.c. qsd-jobs runs 
block-commit and block-stream jobs and doesn't start any kind of migration or 
snapshot or savevm, so it seems impossible that qsd-jobs runs the code touched 
by my patch..



Let's put the pull request on hold for now.

Stefan




--
Best regards,
Vladimir

Re: [PATCH V3] target/riscv: Align the data type of reset vector address

2021-03-25 Thread Bin Meng

On Thu, Mar 25, 2021 at 5:42 PM Dylan Jhong  wrote:
>
> Signed-off-by: Dylan Jhong 
> Signed-off-by: Ruinland ChuanTzu Tsai 
> ---
>  target/riscv/cpu.c | 6 +-
>  target/riscv/cpu.h | 2 +-
>  2 files changed, 6 insertions(+), 2 deletions(-)
>

Reviewed-by: Bin Meng

Re: Crashes with qemu-system-ppc64

2021-03-25 Thread Greg Kurz

On Wed, 24 Mar 2021 10:17:55 +0100
Paolo Bonzini  wrote:

> On 24/03/21 00:35, Philippe Mathieu-Daudé wrote:
> > Hmmm does this assert() matches your comment?
> > 
> > -- >8 --
> > diff --git a/hw/core/qdev.c b/hw/core/qdev.c
> > index cefc5eaa0a9..41cbee77d14 100644
> > --- a/hw/core/qdev.c
> > +++ b/hw/core/qdev.c
> > @@ -1130,6 +1130,8 @@ Object *qdev_get_machine(void)
> >   {
> >   static Object *dev;
> > 
> > +assert(phase_check(PHASE_MACHINE_CREATED));
> > +
> 
> Very nice use of phase_check!  Kudos.
> 

It seems promising at first sight but qdev_get_machine() gets
called under qemu_create_machine() long before phase is advanced
to PHASE_MACHINE_CREATED.

qemu-system-ppc64: ../../hw/core/qdev.c:1133: qdev_get_machine: Assertion 
`phase_check(PHASE_MACHINE_CREATED)' failed.

(gdb) bt
#0  0x764a3708 in raise () at /lib64/power9/libc.so.6
#1  0x76483bcc in abort () at /lib64/power9/libc.so.6
#2  0x76497210 in __assert_fail_base () at /lib64/power9/libc.so.6
#3  0x764972b4 in __assert_fail () at /lib64/power9/libc.so.6
#4  0x0001009e7820 in qdev_get_machine () at ../../hw/core/qdev.c:1133
#5  0x0001009e7820 in qdev_get_machine () at ../../hw/core/qdev.c:1129
#6  0x000100747894 in memory_region_do_init (mr=0x101261200, owner=0x0, 
name=, size=) at ../../softmmu/memory.c:1177
#7  0x0001007fccc4 in memory_map_init () at ../../softmmu/physmem.c:2630
#8  0x0001007fccc4 in cpu_exec_init_all () at ../../softmmu/physmem.c:3034
#9  0x0001007e9c9c in qemu_create_machine 
(machine_class=machine_class@entry=0x1014b96d0) at ../../softmmu/vl.c:2086
#10 0x0001007eb8c0 in qemu_init (argc=, argv=, envp=) at ../../softmmu/vl.c:1640
#11 0x0001002f53c8 in main (argc=, argv=, 
envp=) at ../../softmmu/main.c:49


static void memory_region_do_init(MemoryRegion *mr,
  Object *owner,
  const char *name,
  uint64_t size)
{
[...]
if (!owner) {
owner = container_get(qdev_get_machine(), "/unattached");
}

The true condition for qdev_get_machine() to be functional is
actually that the following happened already:

object_property_add_child(object_get_root(), "machine",
  OBJECT(current_machine));

This is the case with the call stack ^^ and I don't see any valid
reason to forbid use of qdev_get_machine() here.

So I'm wondering if we shouldn't rather check the existence of the
"/machine" path in the QOM tree instead of checking the phase.

> Paolo
>

Re: gitlab-ci: Only build /staging branch?

2021-03-25 Thread Peter Maydell

On Thu, 25 Mar 2021 at 09:33, Philippe Mathieu-Daudé  wrote:
> v6.0 is at the door and I was wondering what is missing to have the
> CI used as a gate.

It needs to be faster. Mostly I do check the gitlab CI pipeline
status, but in the run-up to getting rc0 out I stopped waiting
for the gitlab CI job to finish, because I was continually finding
that I kicked off a run, my local build-tests would complete within
an hour or so, and the gitlab CI jobs were still pending, barely
started, etc. Turnaround on testing a merge must be 90 minutes or
less, especially during release periods, because there are always
a huge number of merges that arrive for me to test in the last
couple of days before freeze.

thanks
-- PMM

Re: [PATCH v3] linux-user/s390x: Use the guest pointer for the sigreturn stub

2021-03-25 Thread Laurent Vivier

Le 24/03/2021 à 19:51, Andreas Krebbel a écrit :
> When setting up the pointer for the sigreturn stub in the return
> address register (r14) we currently use the host frame address instead
> of the guest frame address.
> 
> Note: This only caused problems if Qemu has been built with
> --disable-pie (as it is in distros nowadays). Otherwise guest_base
> defaults to 0 hiding the actual problem.
> 
> Signed-off-by: Andreas Krebbel 
> ---
>  linux-user/s390x/signal.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/linux-user/s390x/signal.c b/linux-user/s390x/signal.c
> index ecfa2a14a9..7107c5fb53 100644
> --- a/linux-user/s390x/signal.c
> +++ b/linux-user/s390x/signal.c
> @@ -211,9 +211,10 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
>  /* Set up to return from userspace.  If provided, use a stub
> already in userspace.  */
>  if (ka->sa_flags & TARGET_SA_RESTORER) {
> -env->regs[14] = (unsigned long) ka->sa_restorer | PSW_ADDR_AMODE;
> +env->regs[14] = ka->sa_restorer | PSW_ADDR_AMODE;
>  } else {
> -env->regs[14] = (unsigned long) frame->retcode | PSW_ADDR_AMODE;
> +env->regs[14] = (frame_addr + offsetof(typeof(*frame), retcode))
> +| PSW_ADDR_AMODE;
>  __put_user(S390_SYSCALL_OPCODE | TARGET_NR_rt_sigreturn,
> (uint16_t *)(frame->retcode));
>  }
> 

Reviewed-by: Laurent Vivier

Re: [PATCH] qapi: introduce 'query-cpu-model-cpuid' action


25.03.2021 13:11, Vladimir Sementsov-Ogievskiy wrote:

24.03.2021 16:39, Valeriy Vdovin wrote:

Introducing new qapi method 'query-cpu-model-cpuid'. This method can be used to
get virtualized cpu model info generated by QEMU during VM initialization in
the form of cpuid representation.

Diving into more details about virtual cpu generation: QEMU first parses '-cpu'
command line option. From there it takes the name of the model as the basis for
feature set of the new virtual cpu. After that it uses trailing '-cpu' options,
that state if additional cpu features should be present on the virtual cpu or
excluded from it (tokens '+'/'-' or '=on'/'=off').
After that QEMU checks if the host's cpu can actually support the derived
feature set and applies host limitations to it.
After this initialization procedure, virtual cpu has its model and
vendor names, and a working feature set and is ready for identification
instructions such as CPUID.

Currently full output for this method is only supported for x86 cpus.

To learn exactly how virtual cpu is presented to the guest machine via CPUID
instruction, new qapi method can be used. By calling 'query-cpu-model-cpuid'
method, one can get a full listing of all CPUID leafs with subleafs which are
supported by the initialized virtual cpu.

Other than debug, the method is useful in cases when we would like to
utilize QEMU's virtual cpu initialization routines and put the retrieved
values into kernel CPUID overriding mechanics for more precise control
over how various processes perceive its underlying hardware with
container processes as a good example.

Output format:
The core part of the returned JSON object can be described as a list of lists
with the top-level list containing leaf-level elements and the bottom level
containing subleafs, where 'leaf' is CPUID argument passed in EAX register and
'subleaf' is a value passed to CPUID in ECX register for some specific
leafs, that support that. Each most basic CPUID result is passed in a
maximum of 4 registers EAX, EBX, ECX and EDX, with most leafs not utilizing
all 4 registers at once.
Also note that 'subleaf' is a kind of extension, used by only a couple of
leafs, while most of the leafs don't have this. Nevertheless, the output
data structure presents ALL leafs as having at least a single 'subleaf'.
This is done for data structure uniformity, so that it could be
processed in a more straightforward manner, in this case no one suffers
from such simplification.

Use example:
virsh qemu-monitor-command VM --pretty '{ "execute": "query-cpu-model-cpuid" }'
{
   "return": {
 "cpuid": {
   "leafs": [
 {
   "leaf": 0,
   "subleafs": [
 {
   "eax": 13,
   "edx": 1231384169,
   "ecx": 1818588270,
   "ebx": 1970169159,
   "subleaf": 0
 }
   ]
 },
 {
   "leaf": 1,
   "subleafs": [
 {
   "eax": 329443,
   "edx": 529267711,
   "ecx": 4160369187,
   "ebx": 133120,
   "subleaf": 0
 }
   ]
 },
 {
   "leaf": 2,
   "subleafs": [
 {
   "eax": 1,
   "edx": 2895997,
   "ecx": 0,
   "ebx": 0,
   "subleaf": 0
 }
   ]
 },
   ]
 },
 "vendor": "GenuineIntel",
 "class-name": "Skylake-Client-IBRS-x86_64-cpu",
 "model-id": "Intel Core Processor (Skylake, IBRS)"
   },
   "id": "libvirt-40"
}
Signed-off-by: Valeriy Vdovin 
---
  qapi/machine-target.json | 122 
  target/i386/cpu.c    | 292 +--
  2 files changed, 405 insertions(+), 9 deletions(-)

diff --git a/qapi/machine-target.json b/qapi/machine-target.json
index e7811654b7..c5b137aa5c 100644
--- a/qapi/machine-target.json
+++ b/qapi/machine-target.json
@@ -329,3 +329,125 @@
  ##
  { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'],
    'if': 'defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_I386) 
|| defined(TARGET_S390X) || defined(TARGET_MIPS)' }
+##
+
+
+# @CpuidSubleaf:
+#
+# CPUID leaf extension information, based on ECX value.
+#
+# CPUID x86 instruction has 'leaf' argument passed in EAX register. Leaf
+# argument identifies the type of information, the caller wants to retrieve in
+# single call to CPUID.
+# Some but not all leaves depend on the value passed in ECX register as an
+# additional argument to CPUID. This argument is present in cpuid documentation
+# as 'subleaf'.
+# If CPUID ignores the value in ECX, normally this means that leaf does not
+# have subleaves. Another way to see it is that each leaf has at least one
+# subleaf (one type of output).
+#
+# @subleaf: value passed to CPUID in ECX register. If CPUID leaf has only a
+#   single leaf, the value of ECX is ignored by CPU and should as well
+#

Re: gitlab-ci: Only build /staging branch?


On 25/03/21 11:34, Peter Maydell wrote:

It needs to be faster. Mostly I do check the gitlab CI pipeline
status, but in the run-up to getting rc0 out I stopped waiting
for the gitlab CI job to finish, because I was continually finding
that I kicked off a run, my local build-tests would complete within
an hour or so, and the gitlab CI jobs were still pending, barely
started, etc. Turnaround on testing a merge must be 90 minutes or
less, especially during release periods, because there are always
a huge number of merges that arrive for me to test in the last
couple of days before freeze.


Perhaps we could script it so that if the pipeline passes the merge to 
master is done automatically.


Paolo

Re: gitlab-ci: Only build /staging branch?

2021-03-25 Thread Peter Maydell

On Thu, 25 Mar 2021 at 11:05, Paolo Bonzini  wrote:
>
> On 25/03/21 11:34, Peter Maydell wrote:
> > It needs to be faster. Mostly I do check the gitlab CI pipeline
> > status, but in the run-up to getting rc0 out I stopped waiting
> > for the gitlab CI job to finish, because I was continually finding
> > that I kicked off a run, my local build-tests would complete within
> > an hour or so, and the gitlab CI jobs were still pending, barely
> > started, etc. Turnaround on testing a merge must be 90 minutes or
> > less, especially during release periods, because there are always
> > a huge number of merges that arrive for me to test in the last
> > couple of days before freeze.
>
> Perhaps we could script it so that if the pipeline passes the merge to
> master is done automatically.

That would be nice eventually, but we can't do it until the gitlab
CI is the *only* gating criterion.

thanks
-- PMM

Re: [PATCH v2] piix: fix regression during unplug in Xen HVM domUs

2021-03-25 Thread Olaf Hering

Am Mon, 22 Mar 2021 18:09:17 -0400
schrieb John Snow :

> My understanding is that XEN has some extra disks that it unplugs when 
> it later figures out it doesn't need them. How exactly this works is 
> something I've not looked into too closely.

It has no extra disks, why would it?

I assume each virtualization variant has some sort of unplug if it has to 
support guests that lack PV/virtio/enlightened/whatever drivers.

In case of HVM, the configured block or network devices can be either accessed 
via emulated PCI or via the PV drivers. Since the BIOS, the bootloader and 
potentially the operating system kernel typically lack PV drivers, they will 
find the devices only via the PCI bus. In case they happen to have PV drivers 
in addition to PCI drivers, both drivers will find and offer the same resource 
via different paths. In case of a block device, ata_piix.ko will show it via 
"/dev/sda" and xen-blkfront.ko will show it via "/dev/xvda". This is obviously 
bad, at least in the read-write case.

The pvops kernel triggers the unplug of the emulated PCI hardware early, prior to 
any other PCI initialization. As a result the PCI drivers will not find their 
hardware anymore. In case of ata_piix, only the non-CDROM storage will be 
removed in qemu, because there is no PV-CDROM driver.

The PV support in old xenlinux based kernels is only available as modules. As a 
result the unplug will happen after PCI was initialized, but it must happen 
before any PCI device drivers are loaded.


> So if these IDE devices have been "unplugged" already, we avoid 
> resetting them here. What about this reset causes the bug you describe 
> in the commit message?
> 
> Does this reset now happen earlier/later as compared to what it did 
> prior to ee358e91 ?

Prior to this commit, piix_ide_reset was only called when the entire emulated 
machine was reset. Like: never.
With this commit, piix_ide_reset will be called from pci_piix3_xen_ide_unplug. 
For some reason it confuses the emulated USB hardware. Why it confuses it, I have 
no idea.

I wonder what the purpose of the qdev_reset_all() call really is. It is 10 
years old. It might be stale.


Olaf


pgpAJb2zZreCU.pgp
Description: Digitale Signatur von OpenPGP

Re: gitlab-ci: Only build /staging branch?

2021-03-25 Thread Daniel P . Berrangé

On Thu, Mar 25, 2021 at 12:05:32PM +0100, Paolo Bonzini wrote:
> On 25/03/21 11:34, Peter Maydell wrote:
> > It needs to be faster. Mostly I do check the gitlab CI pipeline
> > status, but in the run-up to getting rc0 out I stopped waiting
> > for the gitlab CI job to finish, because I was continually finding
> > that I kicked off a run, my local build-tests would complete within
> > an hour or so, and the gitlab CI jobs were still pending, barely
> > started, etc. Turnaround on testing a merge must be 90 minutes or
> > less, especially during release periods, because there are always
> > a huge number of merges that arrive for me to test in the last
> > couple of days before freeze.
> 
> Perhaps we could script it so that if the pipeline passes the merge to
> master is done automatically.

No need to script it, that functionality already exists in GitLab.

Push to the staging branch, and open a merge request for applying
staging -> master, and enable "merge when pipeline succeeds".

You can actually do this all in one command

https://docs.gitlab.com/ee/user/project/push_options.html

  git push \
 -o merge_request.create \
 -o merge_request.target=master \
 -o merge_request.merge_when_pipeline_succeeds \
 origin staging

The gitlab-ci.yml file could then be configured so that pipeline
jobs are associated with a merge request, rather than push event.
This will avoid the pipeline being re-run on master after the
merge.

If you enable "merge trains" option in the repo, then you can
even push to multiple branches concurrently, and gitlab will
serialize the CI pipelines from each merge request in turn,
(assuming no conflicts between them).

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

[PATCH v6 0/6] coroutine rwlock downgrade fix, minor VDI changes

This is a resubmit of David Edmondson's series at
https://patchew.org/QEMU/20210309144015.557477-1-david.edmond...@oracle.com/.
After closer analysis on IRC, the CoRwlock's attempt to ensure
fairness turned out to be flawed.  Therefore, this series
reimplements CoRwlock without using a CoQueue.  Tracking whether
each queued coroutine is a reader/writer makes it possible to
never wake a writer when only readers should be allowed and
vice versa.

v2->v3: new CoRwlock implementation

v3->v4: fix upgrade and add a test for that, too

v4->v5: typo

v5->v6: improve documentation, do not read lock->owners where
neither wrlock nor lock->mutex exclude concurrent writes

David Edmondson (4):
  block/vdi: When writing new bmap entry fails, don't leak the buffer
  block/vdi: Don't assume that blocks are larger than VdiHeader
  coroutine-lock: Store the coroutine in the CoWaitRecord only once
  test-coroutine: Add rwlock downgrade test

Paolo Bonzini (2):
  coroutine-lock: Reimplement CoRwlock to fix downgrade bug
  test-coroutine: Add rwlock upgrade test

 block/vdi.c |  11 ++-
 include/qemu/coroutine.h|  17 ++--
 tests/unit/test-coroutine.c | 161 
 util/qemu-coroutine-lock.c  | 149 +
 4 files changed, 274 insertions(+), 64 deletions(-)

-- 
2.29.2

[PATCH v6 1/6] block/vdi: When writing new bmap entry fails, don't leak the buffer

From: David Edmondson 

If a new bitmap entry is allocated, requiring the entire block to be
written, avoid leaking the buffer allocated for the block should
the write fail.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: David Edmondson 
Message-Id: <20210309144015.557477-2-david.edmond...@oracle.com>
Acked-by: Max Reitz 
Signed-off-by: Paolo Bonzini 
---
 block/vdi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/vdi.c b/block/vdi.c
index 5627e7d764..2a6dc26124 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -690,6 +690,7 @@ nonallocating_write:
 
 logout("finished data write\n");
 if (ret < 0) {
+g_free(block);
 return ret;
 }
 
-- 
2.29.2

[PATCH v6 6/6] test-coroutine: Add rwlock downgrade test

From: David Edmondson 

Test that downgrading an rwlock does not result in a failure to
schedule coroutines queued on the rwlock.

The diagram associated with test_co_rwlock_downgrade() describes the
intended behaviour, but what was observed previously corresponds to:

| c1 | c2 | c3 | c4   |
|+++--|
| rdlock |||  |
| yield  |||  |
|| wrlock ||  |
||||  |
||| rdlock |  |
||||  |
|||| wrlock   |
||||  |
| unlock |||  |
| yield  |||  |
||  ||  |
|| downgrade  ||  |
|| ...||  |
|| unlock ||  |
|||  |  |
||||  |

This results in a failure...

ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: assertion failed: 
(c3_done)
Bail out! ERROR:../tests/test-coroutine.c:369:test_co_rwlock_downgrade: 
assertion failed: (c3_done)

...as a result of the c3 coroutine failing to run to completion.

Signed-off-by: David Edmondson 
Message-Id: <20210309144015.557477-5-david.edmond...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 tests/unit/test-coroutine.c | 99 +
 1 file changed, 99 insertions(+)

diff --git a/tests/unit/test-coroutine.c b/tests/unit/test-coroutine.c
index 6e6f51d480..aa77a3bcb3 100644
--- a/tests/unit/test-coroutine.c
+++ b/tests/unit/test-coroutine.c
@@ -325,6 +325,104 @@ static void test_co_rwlock_upgrade(void)
 g_assert(c2_done);
 }
 
+static void coroutine_fn rwlock_rdlock_yield(void *opaque)
+{
+qemu_co_rwlock_rdlock(&rwlock);
+qemu_coroutine_yield();
+
+qemu_co_rwlock_unlock(&rwlock);
+qemu_coroutine_yield();
+
+*(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock_downgrade(void *opaque)
+{
+qemu_co_rwlock_wrlock(&rwlock);
+
+qemu_co_rwlock_downgrade(&rwlock);
+qemu_co_rwlock_unlock(&rwlock);
+*(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_rdlock(void *opaque)
+{
+qemu_co_rwlock_rdlock(&rwlock);
+
+qemu_co_rwlock_unlock(&rwlock);
+*(bool *)opaque = true;
+}
+
+static void coroutine_fn rwlock_wrlock(void *opaque)
+{
+qemu_co_rwlock_wrlock(&rwlock);
+
+qemu_co_rwlock_unlock(&rwlock);
+*(bool *)opaque = true;
+}
+
+/*
+ * Check that downgrading a reader-writer lock does not cause a hang.
+ *
+ * Four coroutines are used to produce a situation where there are
+ * both reader and writer hopefuls waiting to acquire an rwlock that
+ * is held by a reader.
+ *
+ * The correct sequence of operations we aim to provoke can be
+ * represented as:
+ *
+ * | c1 | c2 | c3 | c4 |
+ * |+++|
+ * | rdlock ||||
+ * | yield  ||||
+ * || wrlock |||
+ * |||||
+ * ||| rdlock ||
+ * |||||
+ * |||| wrlock |
+ * |||||
+ * | unlock ||||
+ * | yield  ||||
+ * ||  |||
+ * || downgrade  |||
+ * |||  ||
+ * ||| unlock ||
+ * || ...|||
+ * || unlock |||
+ * ||||  |
+ * |||| unlock |
+ */
+static void test_co_rwlock_downgrade(void)
+{
+bool c1_done = false;
+bool c2_done = false;
+bool c3_done = false;
+bool c4_done = false;
+Coroutine *c1, *c2, *c3, *c4;
+
+qemu_co_rwlock_init(&rwlock);
+
+c1 = qemu_coroutine_create(rwlock_rdlock_yield, &c1_done);
+c2 = qemu_coroutine_create(rwlock_wrlock_downgrade, &c2_done);
+c3 = qemu_coroutine_create(rwlock_rdlock, &c3_done);
+c4 = qemu_coroutine_create(rwlock_wrlock, &c4_done);
+
+qemu_coroutine_enter(c1);
+qemu_coroutine_enter(c2);
+qemu_coroutine_enter(c3);
+qemu_coroutine_enter(c4);
+
+qemu_coroutine_enter(c1);
+
+g_assert(c2_done);
+g_assert(c3_done);
+g_assert(c4_done);
+
+qemu_coroutine_enter(c1);
+
+g_assert(c1_done);
+}
+
 /*
  * Check that creation, enter, and return work
  */
@@ -563,6 +661,7 @@ int main(int argc, char **argv)
 g_test_add_func("/locking/co-mutex", test_co_mutex);
 g_test_add_func("/locking/c

[PATCH v6 4/6] coroutine-lock: Reimplement CoRwlock to fix downgrade bug

An invariant of the current rwlock is that if multiple coroutines hold a
reader lock, all must be runnable. The unlock implementation relies on
this, choosing to wake a single coroutine when the final read lock
holder exits the critical section, assuming that it will wake a
coroutine attempting to acquire a write lock.

The downgrade implementation violates this assumption by creating a
read lock owning coroutine that is exclusively runnable - any other
coroutines that are waiting to acquire a read lock are *not* made
runnable when the write lock holder converts its ownership to read
only.

More in general, the old implementation had lots of other fairness bugs.
The root cause of the bugs was that CoQueue would wake up readers even
if there were pending writers, and would wake up writers even if there
were readers.  In that case, the coroutine would go back to sleep *at
the end* of the CoQueue, losing its place at the head of the line.

To fix this, keep the queue of waiters explicitly in the CoRwlock
instead of using CoQueue, and store for each whether it is a
potential reader or a writer.  This way, downgrade can look at the
first queued coroutines and wake it only if it is a reader, causing
all other readers in line to be released in turn.

Reported-by: David Edmondson 
Reviewed-by: David Edmondson 
Signed-off-by: Paolo Bonzini 
---
v3->v4: clean up the code and fix upgrade logic.  Fix upgrade comment too.

 include/qemu/coroutine.h   |  17 +++--
 util/qemu-coroutine-lock.c | 148 -
 2 files changed, 106 insertions(+), 59 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index 84eab6e3bf..7919d3bb62 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -237,11 +237,15 @@ bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable 
*lock);
 bool qemu_co_queue_empty(CoQueue *queue);
 
 
+typedef struct CoRwTicket CoRwTicket;
 typedef struct CoRwlock {
-int pending_writer;
-int reader;
 CoMutex mutex;
-CoQueue queue;
+
+/* Number of readers, or -1 if owned for writing.  */
+int owners;
+
+/* Waiting coroutines.  */
+QSIMPLEQ_HEAD(, CoRwTicket) tickets;
 } CoRwlock;
 
 /**
@@ -260,10 +264,9 @@ void qemu_co_rwlock_rdlock(CoRwlock *lock);
 /**
  * Write Locks the CoRwlock from a reader.  This is a bit more efficient than
  * @qemu_co_rwlock_unlock followed by a separate @qemu_co_rwlock_wrlock.
- * However, if the lock cannot be upgraded immediately, control is transferred
- * to the caller of the current coroutine.  Also, @qemu_co_rwlock_upgrade
- * only overrides CoRwlock fairness if there are no concurrent readers, so
- * another writer might run while @qemu_co_rwlock_upgrade blocks.
+ * Note that if the lock cannot be upgraded immediately, control is transferred
+ * to the caller of the current coroutine; another writer might run while
+ * @qemu_co_rwlock_upgrade blocks.
  */
 void qemu_co_rwlock_upgrade(CoRwlock *lock);
 
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index eb73cf11dc..2669403839 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -327,11 +327,51 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
 trace_qemu_co_mutex_unlock_return(mutex, self);
 }
 
+struct CoRwTicket {
+bool read;
+Coroutine *co;
+QSIMPLEQ_ENTRY(CoRwTicket) next;
+};
+
 void qemu_co_rwlock_init(CoRwlock *lock)
 {
-memset(lock, 0, sizeof(*lock));
-qemu_co_queue_init(&lock->queue);
 qemu_co_mutex_init(&lock->mutex);
+lock->owners = 0;
+QSIMPLEQ_INIT(&lock->tickets);
+}
+
+/* Releases the internal CoMutex.  */
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
+{
+CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
+Coroutine *co = NULL;
+
+/*
+ * Setting lock->owners here prevents rdlock and wrlock from
+ * sneaking in between unlock and wake.
+ */
+
+if (tkt) {
+if (tkt->read) {
+if (lock->owners >= 0) {
+lock->owners++;
+co = tkt->co;
+}
+} else {
+if (lock->owners == 0) {
+lock->owners = -1;
+co = tkt->co;
+}
+}
+}
+
+if (co) {
+QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
+qemu_co_mutex_unlock(&lock->mutex);
+aio_co_wake(co);
+} else {
+qemu_co_mutex_unlock(&lock->mutex);
+}
 }
 
 void qemu_co_rwlock_rdlock(CoRwlock *lock)
@@ -340,13 +380,22 @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
 
 qemu_co_mutex_lock(&lock->mutex);
 /* For fairness, wait if a writer is in line.  */
-while (lock->pending_writer) {
-qemu_co_queue_wait(&lock->queue, &lock->mutex);
+if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
+lock->owners++;
+qemu_co_mutex_unlock(&lock->mutex);
+} else {
+CoRwTicket my_ticket = { true, self };
+
+QSIMPLEQ_IN

[PATCH v6 2/6] block/vdi: Don't assume that blocks are larger than VdiHeader

From: David Edmondson 

Given that the block size is read from the header of the VDI file, a
wide variety of sizes might be seen. Rather than re-using a
block-sized memory region when writing the VDI header, allocate an
appropriately sized buffer.

Signed-off-by: David Edmondson 
Message-Id: <20210309144015.557477-3-david.edmond...@oracle.com>
Acked-by: Max Reitz 
Signed-off-by: Paolo Bonzini 
---
 block/vdi.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 2a6dc26124..548f8a057b 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -696,18 +696,20 @@ nonallocating_write:
 
 if (block) {
 /* One or more new blocks were allocated. */
-VdiHeader *header = (VdiHeader *) block;
+VdiHeader *header;
 uint8_t *base;
 uint64_t offset;
 uint32_t n_sectors;
 
+g_free(block);
+header = g_malloc(sizeof(*header));
+
 logout("now writing modified header\n");
 assert(VDI_IS_ALLOCATED(bmap_first));
 *header = s->header;
 vdi_header_to_le(header);
-ret = bdrv_pwrite(bs->file, 0, block, sizeof(VdiHeader));
-g_free(block);
-block = NULL;
+ret = bdrv_pwrite(bs->file, 0, header, sizeof(*header));
+g_free(header);
 
 if (ret < 0) {
 return ret;
-- 
2.29.2

[PATCH v6 3/6] coroutine-lock: Store the coroutine in the CoWaitRecord only once

From: David Edmondson 

When taking the slow path for mutex acquisition, set the coroutine
value in the CoWaitRecord in push_waiter(), rather than both there and
in the caller.

Reviewed-by: Paolo Bonzini 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: David Edmondson 
Message-Id: <20210309144015.557477-4-david.edmond...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 util/qemu-coroutine-lock.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index 5816bf8900..eb73cf11dc 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -204,7 +204,6 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
 unsigned old_handoff;
 
 trace_qemu_co_mutex_lock_entry(mutex, self);
-w.co = self;
 push_waiter(mutex, &w);
 
 /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
-- 
2.29.2

[PATCH v6 5/6] test-coroutine: Add rwlock upgrade test