date:20211207


On 06/12/2021 23.20, Laurent Vivier wrote:

Scan the PCI devices to find bridges and set PCI_SECONDARY_BUS and
PCI_SUBORDINATE_BUS (algorithm from SeaBIOS)

Signed-off-by: Laurent Vivier 
---
  include/hw/pci/pci_bridge.h |   8 +++
  tests/qtest/libqos/pci.c| 118 
  tests/qtest/libqos/pci.h|   1 +
  3 files changed, 127 insertions(+)

diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
index a94d350034bf..30691a6e5728 100644
--- a/include/hw/pci/pci_bridge.h
+++ b/include/hw/pci/pci_bridge.h
@@ -138,6 +138,7 @@ typedef struct PCIBridgeQemuCap {
  uint64_t mem_pref_64; /* Prefetchable memory to reserve (64-bit MMIO) */
  } PCIBridgeQemuCap;
  
+#define REDHAT_PCI_CAP_TYPE_OFFSET  3

  #define REDHAT_PCI_CAP_RESOURCE_RESERVE 1
  
  /*

@@ -152,6 +153,13 @@ typedef struct PCIResReserve {
  uint64_t mem_pref_64;
  } PCIResReserve;
  
+#define REDHAT_PCI_CAP_RES_RESERVE_BUS_RES 4

+#define REDHAT_PCI_CAP_RES_RESERVE_IO  8
+#define REDHAT_PCI_CAP_RES_RESERVE_MEM 16
+#define REDHAT_PCI_CAP_RES_RESERVE_PREF_MEM_32 20
+#define REDHAT_PCI_CAP_RES_RESERVE_PREF_MEM_64 24
+#define REDHAT_PCI_CAP_RES_RESERVE_CAP_SIZE 32
+
  int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset,
 PCIResReserve res_reserve, Error **errp);
  
diff --git a/tests/qtest/libqos/pci.c b/tests/qtest/libqos/pci.c

index e1e96189c821..3f0b18f4750b 100644
--- a/tests/qtest/libqos/pci.c
+++ b/tests/qtest/libqos/pci.c
@@ -13,6 +13,8 @@
  #include "qemu/osdep.h"
  #include "pci.h"
  
+#include "hw/pci/pci.h"

+#include "hw/pci/pci_bridge.h"
  #include "hw/pci/pci_regs.h"
  #include "qemu/host-utils.h"
  #include "qgraph.h"
@@ -99,6 +101,122 @@ void qpci_device_init(QPCIDevice *dev, QPCIBus *bus, 
QPCIAddress *addr)
  g_assert(!addr->device_id || device_id == addr->device_id);
  }
  
+static uint8_t qpci_find_resource_reserve_capability(QPCIDevice *dev)

+{
+uint16_t device_id;
+uint8_t cap = 0;
+
+if (qpci_config_readw(dev, PCI_VENDOR_ID) != PCI_VENDOR_ID_REDHAT) {
+return 0;
+}
+
+device_id = qpci_config_readw(dev, PCI_DEVICE_ID);
+
+if (device_id != PCI_DEVICE_ID_REDHAT_PCIE_RP &&
+device_id != PCI_DEVICE_ID_REDHAT_BRIDGE) {
+return 0;
+}
+
+do {
+cap = qpci_find_capability(dev, PCI_CAP_ID_VNDR, cap);
+} while (cap &&
+ qpci_config_readb(dev, cap + REDHAT_PCI_CAP_TYPE_OFFSET) !=
+ REDHAT_PCI_CAP_RESOURCE_RESERVE);
+if (cap) {
+uint8_t cap_len = qpci_config_readb(dev, cap + PCI_CAP_FLAGS);
+if (cap_len < REDHAT_PCI_CAP_RES_RESERVE_CAP_SIZE) {
+return 0;
+}
+}
+return cap;
+}
+
+static void qpci_secondary_buses_rec(QPCIBus *qbus, int bus, int *pci_bus)
+{
+QPCIDevice *dev;
+uint16_t class;
+uint8_t pribus, secbus, subbus;
+int i;


I'd maybe use a better name instead of "i" here.


+for (i = 0; i < 32; i++) {
+dev = qpci_device_find(qbus, QPCI_DEVFN(bus + i, 0));
+if (dev == NULL) {
+continue;
+}
+class = qpci_config_readw(dev, PCI_CLASS_DEVICE);
+if (class == PCI_CLASS_BRIDGE_PCI) {
+qpci_config_writeb(dev, PCI_SECONDARY_BUS, 255);
+qpci_config_writeb(dev, PCI_SUBORDINATE_BUS, 0);
+}
+g_free(dev);
+}
+
+for (i = 0; i < 32; i++) {
+dev = qpci_device_find(qbus, QPCI_DEVFN(bus + i, 0));
+if (dev == NULL) {
+continue;
+}
+class = qpci_config_readw(dev, PCI_CLASS_DEVICE);
+if (class != PCI_CLASS_BRIDGE_PCI) {
+continue;
+}
+
+pribus = qpci_config_readb(dev, PCI_PRIMARY_BUS);
+if (pribus != bus) {
+qpci_config_writeb(dev, PCI_PRIMARY_BUS, bus);
+}
+
+secbus = qpci_config_readb(dev, PCI_SECONDARY_BUS);
+(*pci_bus)++;
+if (*pci_bus != secbus) {
+secbus = *pci_bus;
+qpci_config_writeb(dev, PCI_SECONDARY_BUS, secbus);
+}
+
+subbus = qpci_config_readb(dev, PCI_SUBORDINATE_BUS);
+qpci_config_writeb(dev, PCI_SUBORDINATE_BUS, 255);
+
+qpci_secondary_buses_rec(qbus, secbus << 5, pci_bus);
+
+if (subbus != *pci_bus) {
+uint8_t res_bus = *pci_bus;
+uint8_t cap = qpci_find_resource_reserve_capability(dev);
+
+if (cap) {
+uint32_t tmp_res_bus;
+
+tmp_res_bus = qpci_config_readl(dev, cap +
+
REDHAT_PCI_CAP_RES_RESERVE_BUS_RES);
+if (tmp_res_bus != (uint32_t)-1) {
+res_bus = tmp_res_bus & 0xFF;
+if ((uint8_t)(res_bus + secbus) < secbus ||
+(uint8_t)(res_bus + secbus) < res_bus) {
+res_bus = 0;
+}
+if (secbus + res_bus > *pci_bu

Re: [PATCH v1 2/2] osdep: support mempolicy for preallocation in os_mem_prealloc

2021-12-07 Thread David Hildenbrand

On 07.12.21 08:06, Daniil Tatianin wrote:
> This is needed for cases where we want to make sure that a shared memory
> region gets allocated from a specific NUMA node. This is impossible to do
> with mbind(2) because it ignores the policy for memory mapped with
> MAP_SHARED. We work around this by calling set_mempolicy from prealloc
> threads instead.

That's not quite true I think, how were you able to observe this? Do you
have a reproducer?

While the man page says:

"
The specified policy will be ignored for any  MAP_SHARED  mappings  in
the  specified  memory range. Rather  the pages will be allocated
according to the memory policy of the thread that caused the page to be
allocated.  Again, this may not be the thread that called mbind().
"

What it really means is that as long as we access that memory via the
*VMA* for which we called mbind(), which is the case when *not* using
fallocate() to preallocate memory, we end up using the correct policy.

I did experiments a while ago with hugetlbfs shared memory and it
properly allocated from the right node. So I'd be curious how you
trigger this.

-- 
Thanks,

David / dhildenb

Re: [PATCH v6 2/6] tests/qtest: add some tests for virtio-net failover


On 06/12/2021 23.20, Laurent Vivier wrote:

Add test cases to test several error cases that must be
generated by invalid failover configuration.

Add a combination of coldplug and hotplug test cases to be
sure the primary is correctly managed according to the
presence or absence of the STANDBY feature.

Signed-off-by: Laurent Vivier 
---
  tests/qtest/meson.build   |   3 +
  tests/qtest/virtio-net-failover.c | 690 ++
  2 files changed, 693 insertions(+)
  create mode 100644 tests/qtest/virtio-net-failover.c

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index c9d8458062ff..6d66bf522156 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -22,6 +22,9 @@ qtests_generic = \
(config_all_devices.has_key('CONFIG_VIRTIO_SCSI') ? 
['fuzz-virtio-scsi-test'] : []) + \
(config_all_devices.has_key('CONFIG_SB16') ? ['fuzz-sb16-test'] : []) + \
(config_all_devices.has_key('CONFIG_SDHCI_PCI') ? ['fuzz-sdcard-test'] : 
[]) + \
+  (config_all_devices.has_key('CONFIG_VIRTIO_NET') and \
+   config_all_devices.has_key('CONFIG_Q35') and \
+   config_all_devices.has_key('CONFIG_VIRTIO_PCI') ? ['virtio-net-failover'] : 
[]) + \


I think you should only add this to qtests_i386 for now, since you later add 
a check to skip on non-x86 architectures.



[
'cdrom-test',
'device-introspect-test',
diff --git a/tests/qtest/virtio-net-failover.c 
b/tests/qtest/virtio-net-failover.c
new file mode 100644
index ..f8f5fbb3c7fe
--- /dev/null
+++ b/tests/qtest/virtio-net-failover.c
@@ -0,0 +1,690 @@


Please add a short header here with at least a one-liner describing what this
is all about, and at least the SPDX license information.



+#include "qemu/osdep.h"
+#include "libqos/libqtest.h"
+#include "libqos/pci.h"
+#include "libqos/pci-pc.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qjson.h"
+#include "libqos/malloc-pc.h"
+#include "libqos/virtio-pci.h"
+#include "hw/pci/pci.h"
+
+#define ACPI_PCIHP_ADDR_ICH9 0x0cc0
+#define PCI_EJ_BASE 0x0008
+
+#define BASE_MACHINE "-M q35 -nodefaults " \
+"-device pcie-root-port,id=root0,addr=0x1,bus=pcie.0,chassis=1 " \
+"-device pcie-root-port,id=root1,addr=0x2,bus=pcie.0,chassis=2 "
+
+#define MAC_PRIMARY0 "52:54:00:11:11:11"
+#define MAC_STANDBY0 "52:54:00:22:22:22"
+
+static QGuestAllocator guest_malloc;
+static QPCIBus *pcibus;
+
+static QTestState *machine_start(const char *args, int numbus)
+{
+QTestState *qts;
+QPCIDevice *dev;
+int i;


Nit: Use a more descriptive name instead of "i" - like "bus"?


+qts = qtest_init(args);
+
+pc_alloc_init(&guest_malloc, qts, 0);
+pcibus = qpci_new_pc(qts, &guest_malloc);
+g_assert(qpci_secondary_buses_init(pcibus) == numbus);
+
+for (i = 0; i < numbus; i++) {
+dev = qpci_device_find(pcibus, QPCI_DEVFN(i + 1, 0));
+g_assert_nonnull(dev);
+
+qpci_device_enable(dev);
+qpci_iomap(dev, 4, NULL);
+
+g_free(dev);
+}
+
+return qts;
+}
+
+static void machine_stop(QTestState *qts)
+{
+qpci_free_pc(pcibus);
+alloc_destroy(&guest_malloc);
+qtest_quit(qts);
+}
+
+static void test_error_id(void)
+{
+QTestState *qts;
+QDict *resp;
+QDict *err;
+
+qts = machine_start(BASE_MACHINE
+"-device virtio-net,bus=root0,id=standby0,failover=on",
+2);
+
+resp = qtest_qmp(qts, "{'execute': 'device_add',"
+  "'arguments': {"
+  "'driver': 'virtio-net',"
+  "'bus': 'root1',"
+  "'failover_pair_id': 'standby0'"
+  "} }");
+g_assert(qdict_haskey(resp, "error"));
+
+err = qdict_get_qdict(resp, "error");
+g_assert(qdict_haskey(err, "desc"));
+
+g_assert_cmpstr(qdict_get_str(err, "desc"), ==,
+"Device with failover_pair_id needs to have id");
+
+qobject_unref(resp);
+
+machine_stop(qts);
+}
+
+static void test_error_pcie(void)
+{
+QTestState *qts;
+QDict *resp;
+QDict *err;
+
+qts = machine_start(BASE_MACHINE
+"-device virtio-net,bus=root0,id=standby0,failover=on",
+2);
+
+resp = qtest_qmp(qts, "{'execute': 'device_add',"
+  "'arguments': {"
+  "'driver': 'virtio-net',"
+  "'id': 'primary0',"
+  "'bus': 'pcie.0',"
+  "'failover_pair_id': 'standby0'"
+  "} }");
+g_assert(qdict_haskey(resp, "error"));
+
+err = qdict_get_qdict(resp, "error");
+g_assert(qdict_haskey(err, "desc"));
+
+g_assert_cmpstr(qdict_get_str(err, "desc"), ==,
+"Bus 'pcie.0' does not support hotplugging");
+
+qobject_unref(resp);
+
+machine_stop(qts);
+}
+
+static QDict *find_device(QDict *bus, const char *name)
+{

Re: [PATCH v1 1/2] hostmem: use a static size for maxnode, validate policy everywhere

2021-12-07 Thread David Hildenbrand

On 07.12.21 08:06, Daniil Tatianin wrote:
> Previously we would calculate the last set bit in the mask, and add
> 2 to that value to get the maxnode value. This is unnecessary since
> the mbind syscall allows the bitmap to be any (reasonable) size as
> long as all the unused bits are clear. This also adds policy validation
> in multiple places so that it's guaranteed to be valid when we call
> mbind.
> 
> Signed-off-by: Daniil Tatianin 
> ---
>  backends/hostmem.c | 64 +++---
>  1 file changed, 43 insertions(+), 21 deletions(-)
> 
> diff --git a/backends/hostmem.c b/backends/hostmem.c
> index 4c05862ed5..392026efe6 100644
> --- a/backends/hostmem.c
> +++ b/backends/hostmem.c
> @@ -38,6 +38,29 @@ host_memory_backend_get_name(HostMemoryBackend *backend)
>  return object_get_canonical_path(OBJECT(backend));
>  }
>  
> +static bool
> +validate_policy(HostMemPolicy policy, bool nodes_empty, Error **errp)
> +{
> +/*
> + * check for invalid host-nodes and policies and give more verbose
> + * error messages than mbind().
> + */
> +if (!nodes_empty && policy == MPOL_DEFAULT) {
> +error_setg(errp, "host-nodes must be empty for policy default,"
> +   " or you should explicitly specify a policy other"
> +   " than default");
> +return false;
> +}
> +
> +if (nodes_empty && policy != MPOL_DEFAULT) {
> +error_setg(errp, "host-nodes must be set for policy %s",
> +   HostMemPolicy_str(policy));
> +return false;
> +}
> +
> +return true;
> +}

Hm, we set two properties individually but bail out when the current
combination is impossible, which is nasty. It means we have to modify the
properties in the right order (which will differ based on the policy) to make
a change.

Do we have any sane use case of modifying the policy/host-nodes at runtime?
I mean, it's just completely wrong when we already have any memory
preallocated/touched inside the range, as we won't issue another mbind call.

I suggest instead to fix this hole:

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 4c05862ed5..7edc3a075e 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -111,6 +111,11 @@ host_memory_backend_set_host_nodes(Object *obj, Visitor 
*v, const char *name,
 HostMemoryBackend *backend = MEMORY_BACKEND(obj);
 uint16List *l, *host_nodes = NULL;
 
+if (host_memory_backend_mr_inited(backend)) {
+error_setg(errp, "Property 'host-nodes' cannot be changed anymore.");
+return;
+}
+
 visit_type_uint16List(v, name, &host_nodes, errp);
 
 for (l = host_nodes; l; l = l->next) {
@@ -142,6 +147,12 @@ static void
 host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
 {
 HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+
+if (host_memory_backend_mr_inited(backend)) {
+error_setg(errp, "Property 'policy' cannot be changed anymore.");
+return;
+}
+
 backend->policy = policy;
 
 #ifndef CONFIG_NUMA


-- 
Thanks,

David / dhildenb

Re: [PATCH v6 4/6] tests/libqtest: update virtio-net failover test


On 06/12/2021 23.20, Laurent Vivier wrote:

Update the migration test to check that we correctly wait for the end
of the card unplug before doing the migration.

Signed-off-by: Laurent Vivier 
---
  tests/qtest/virtio-net-failover.c | 34 +++
  1 file changed, 34 insertions(+)

diff --git a/tests/qtest/virtio-net-failover.c 
b/tests/qtest/virtio-net-failover.c
index f8f5fbb3c7fe..c88f8ddec39a 100644
--- a/tests/qtest/virtio-net-failover.c
+++ b/tests/qtest/virtio-net-failover.c
@@ -560,6 +560,40 @@ static void test_migrate_out(gconstpointer opaque)
  
  qobject_unref(resp);
  
+/* wait the end of the migration setup phase */

+while (true) {
+ret = migrate_status(qts);
+
+status = qdict_get_str(ret, "status");
+if (strcmp(status, "wait-unplug") == 0) {
+break;
+}
+
+/* The migration must not start if the card is not ejected */
+g_assert_cmpstr(status, !=, "active");
+g_assert_cmpstr(status, !=, "completed");
+g_assert_cmpstr(status, !=, "failed");
+g_assert_cmpstr(status, !=, "cancelling");
+g_assert_cmpstr(status, !=, "cancelled");
+
+qobject_unref(ret);
+}
+qobject_unref(ret);
+
+if (g_test_slow()) {
+/* check we stay in wait-unplug while the card is not ejected */
+int i;
+
+for (i = 0; i < 10; i++) {


10 seconds is quite long already, even for slow mode... I wouldn't expect 
any difference after 2 or 3 seconds anymore anyway, so maybe just wait for 5 
seconds?



+sleep(1);
+ret = migrate_status(qts);
+status = qdict_get_str(ret, "status");
+g_assert_cmpstr(status, ==, "wait-unplug");
+qobject_unref(ret);
+}
+}
+
+/* OS unplugs the cards, QEMU can move from wait-unplug state */
  qtest_outl(qts, ACPI_PCIHP_ADDR_ICH9 + PCI_EJ_BASE, 1);
  
  while (true) {




Acked-by: Thomas Huth

Re: [PATCH v6 5/6] test/libqtest: add some virtio-net failover migration cancelling tests


On 06/12/2021 23.20, Laurent Vivier wrote:

Add some tests to check the state of the machine if the migration
is cancelled while we are using virtio-net failover.

Signed-off-by: Laurent Vivier 
---
  tests/qtest/virtio-net-failover.c | 291 ++
  1 file changed, 291 insertions(+)

diff --git a/tests/qtest/virtio-net-failover.c 
b/tests/qtest/virtio-net-failover.c
index c88f8ddec39a..57abb99e7f6e 100644
--- a/tests/qtest/virtio-net-failover.c
+++ b/tests/qtest/virtio-net-failover.c
@@ -682,6 +682,289 @@ static void test_migrate_in(gconstpointer opaque)
  machine_stop(qts);
  }
  
+static void test_migrate_abort_wait_unplug(gconstpointer opaque)

+{
+QTestState *qts;
+QDict *resp, *args, *data, *ret;
+g_autofree gchar *uri = g_strdup_printf("exec: cat > %s", (gchar *)opaque);
+const gchar *status;
+
+qts = machine_start(BASE_MACHINE
+ "-netdev user,id=hs0 "
+ "-netdev user,id=hs1 ",
+ 2);
+
+check_one_card(qts, false, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+qtest_qmp_device_add(qts, "virtio-net", "standby0",
+ "{'bus': 'root0',"
+ "'failover': 'on',"
+ "'netdev': 'hs0',"
+ "'mac': '"MAC_STANDBY0"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+start_virtio_net(qts, 1, 0, "standby0");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+qtest_qmp_device_add(qts, "virtio-net", "primary0",
+ "{'bus': 'root1',"
+ "'failover_pair_id': 'standby0',"
+ "'netdev': 'hs1',"
+ "'rombar': 0,"
+ "'romfile': '',"
+ "'mac': '"MAC_PRIMARY0"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+
+args = qdict_from_jsonf_nofail("{}");
+g_assert_nonnull(args);
+qdict_put_str(args, "uri", uri);
+
+resp = qtest_qmp(qts, "{ 'execute': 'migrate', 'arguments': %p}", args);
+g_assert(qdict_haskey(resp, "return"));
+qobject_unref(resp);
+
+/* the event is sent when QEMU asks the OS to unplug the card */
+resp = qtest_qmp_eventwait_ref(qts, "UNPLUG_PRIMARY");
+g_assert(qdict_haskey(resp, "data"));
+
+data = qdict_get_qdict(resp, "data");
+g_assert(qdict_haskey(data, "device-id"));
+g_assert_cmpstr(qdict_get_str(data, "device-id"), ==, "primary0");
+
+qobject_unref(resp);
+
+resp = qtest_qmp(qts, "{ 'execute': 'migrate_cancel' }");
+g_assert(qdict_haskey(resp, "return"));
+qobject_unref(resp);
+
+/* migration has been cancelled while the unplug was in progress */
+
+/* while the card is not ejected, we must be in "cancelling" state */
+ret = migrate_status(qts);
+
+status = qdict_get_str(ret, "status");
+g_assert_cmpstr(status, ==, "cancelling");
+qobject_unref(ret);
+
+/* OS unplugs the cards, QEMU can move from wait-unplug state */
+qtest_outl(qts, ACPI_PCIHP_ADDR_ICH9 + PCI_EJ_BASE, 1);
+
+while (true) {
+ret = migrate_status(qts);
+
+status = qdict_get_str(ret, "status");
+if (strcmp(status, "cancelled") == 0) {
+break;
+}
+g_assert_cmpstr(status, !=, "failed");
+g_assert_cmpstr(status, !=, "active");
+qobject_unref(ret);
+}
+qobject_unref(ret);
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+
+machine_stop(qts);
+}
+
+static void test_migrate_abort_active(gconstpointer opaque)
+{
+QTestState *qts;
+QDict *resp, *args, *data, *ret;
+g_autofree gchar *uri = g_strdup_printf("exec: cat > %s", (gchar *)opaque);
+const gchar *status;
+
+qts = machine_start(BASE_MACHINE
+ "-netdev user,id=hs0 "
+ "-netdev user,id=hs1 ",
+ 2);
+
+check_one_card(qts, false, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+qtest_qmp_device_add(qts, "virtio-net", "standby0",
+ "{'bus': 'root0',"
+ "'failover': 'on',"
+ "'netdev': 'hs0',"
+ "'mac': '"MAC_STANDBY0"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+start_virtio_net(qts, 1, 0, "standby0");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+
+qtest_qmp_device_add(qts, "virtio-net", "primary0",
+ "{'bus': 'root1',"
+

Re: [PATCH v6 6/6] tests/libqtest: add a migration test with two couples of failover devices


On 06/12/2021 23.20, Laurent Vivier wrote:

Signed-off-by: Laurent Vivier 
---
  tests/qtest/virtio-net-failover.c | 279 ++
  1 file changed, 279 insertions(+)

diff --git a/tests/qtest/virtio-net-failover.c 
b/tests/qtest/virtio-net-failover.c
index 57abb99e7f6e..ace9001894af 100644
--- a/tests/qtest/virtio-net-failover.c
+++ b/tests/qtest/virtio-net-failover.c
@@ -11,6 +11,7 @@
  
  #define ACPI_PCIHP_ADDR_ICH9 0x0cc0

  #define PCI_EJ_BASE 0x0008
+#define PCI_SEL_BASE 0x0010
  
  #define BASE_MACHINE "-M q35 -nodefaults " \

  "-device pcie-root-port,id=root0,addr=0x1,bus=pcie.0,chassis=1 " \
@@ -18,6 +19,8 @@
  
  #define MAC_PRIMARY0 "52:54:00:11:11:11"

  #define MAC_STANDBY0 "52:54:00:22:22:22"
+#define MAC_PRIMARY1 "52:54:00:33:33:33"
+#define MAC_STANDBY1 "52:54:00:44:44:44"
  
  static QGuestAllocator guest_malloc;

  static QPCIBus *pcibus;
@@ -965,6 +968,278 @@ static void test_migrate_abort_timeout(gconstpointer 
opaque)
  machine_stop(qts);
  }
  
+static void test_multi_out(gconstpointer opaque)

+{
+QTestState *qts;
+QDict *resp, *args, *data, *ret;
+g_autofree gchar *uri = g_strdup_printf("exec: cat > %s", (gchar *)opaque);
+const gchar *status, *expected;
+
+qts = machine_start(BASE_MACHINE
+"-device pcie-root-port,id=root2,addr=0x3,bus=pcie.0,chassis=3 
"
+"-device pcie-root-port,id=root3,addr=0x4,bus=pcie.0,chassis=4 
"
+"-netdev user,id=hs0 "
+"-netdev user,id=hs1 "
+"-netdev user,id=hs2 "
+"-netdev user,id=hs3 ",
+4);
+
+check_one_card(qts, false, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+check_one_card(qts, false, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+qtest_qmp_device_add(qts, "virtio-net", "standby0",
+ "{'bus': 'root0',"
+ "'failover': 'on',"
+ "'netdev': 'hs0',"
+ "'mac': '"MAC_STANDBY0"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+check_one_card(qts, false, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+qtest_qmp_device_add(qts, "virtio-net", "primary0",
+ "{'bus': 'root1',"
+ "'failover_pair_id': 'standby0',"
+ "'netdev': 'hs1',"
+ "'rombar': 0,"
+ "'romfile': '',"
+ "'mac': '"MAC_PRIMARY0"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, false, "primary0", MAC_PRIMARY0);
+check_one_card(qts, false, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+start_virtio_net(qts, 1, 0, "standby0");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+check_one_card(qts, false, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+qtest_qmp_device_add(qts, "virtio-net", "standby1",
+ "{'bus': 'root2',"
+ "'failover': 'on',"
+ "'netdev': 'hs2',"
+ "'mac': '"MAC_STANDBY1"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+check_one_card(qts, true, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+qtest_qmp_device_add(qts, "virtio-net", "primary1",
+ "{'bus': 'root3',"
+ "'failover_pair_id': 'standby1',"
+ "'netdev': 'hs3',"
+ "'rombar': 0,"
+ "'romfile': '',"
+ "'mac': '"MAC_PRIMARY1"'}");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+check_one_card(qts, true, "standby1", MAC_STANDBY1);
+check_one_card(qts, false, "primary1", MAC_PRIMARY1);
+
+start_virtio_net(qts, 3, 0, "standby1");
+
+check_one_card(qts, true, "standby0", MAC_STANDBY0);
+check_one_card(qts, true, "primary0", MAC_PRIMARY0);
+check_one_card(qts, true, "standby1", MAC_STANDBY1);
+check_one_card(qts, true, "primary1", MAC_PRIMARY1);
+
+args = qdict_from_jsonf_nofail("{}");
+g_assert_nonnull(args);
+qdict_put_str(args, "uri", uri);
+
+resp = qtest_qmp(qts, "{ 'execute': 'migrate', 'arguments': %p}", args);
+g_assert(qdict_haskey(resp, "return"));
+qobject_unref(resp);
+
+/* the event is sent when QEMU asks the OS to unplug the card */
+resp = qtest_qmp_eventwait_ref(qts, "UNPLUG_PRIMARY");
+

[PATCH for-7.0] tests/qtest: Make the netfilter test independent from a specific NIC

The netfilter test needs a NIC, no matter which one, so it uses
e1000 by default (or virtio-net-ccw on s390x). However, this NIC
might not be always compiled into the QEMU target binary, so assuming
that this NIC is always available is a bad idea. Since the exact type
of NIC does not really matter for this test, let's switch to "-nic"
instead of "-netdev" so that QEMU can simply pick a default NIC for us.
This way we can now run the test on other targets that have a default
machine which features an on-board/default NIC, too.

Signed-off-by: Thomas Huth 
---
 tests/qtest/meson.build  | 23 +--
 tests/qtest/test-netfilter.c |  8 +---
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 5260b33dc0..cd88cf4bf9 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -104,27 +104,36 @@ endif
 
 qtests_x86_64 = qtests_i386
 
-qtests_alpha = [ 'boot-serial-test' ] +
  \
+qtests_alpha = [ 'boot-serial-test' ] + \
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
   (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : [])
 
 qtests_avr = [ 'boot-serial-test' ]
 
-qtests_hppa = [ 'boot-serial-test' ] + 
  \
+qtests_hppa = [ 'boot-serial-test' ] + \
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
   (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : [])
 
-qtests_m68k = [ 'boot-serial-test' ]
-qtests_microblaze = [ 'boot-serial-test' ]
+qtests_m68k = [ 'boot-serial-test' ] + \
+  (slirp.found() ? [ 'test-netfilter' ] : [])
+
+qtests_microblaze = [ 'boot-serial-test' ] + \
+  (slirp.found() ? [ 'test-netfilter' ] : [])
+
 qtests_microblazeel = qtests_microblaze
 
 qtests_mips = \
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
   (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? ['endianness-test'] : 
[]) +\
   (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : [])
 
 qtests_mips64 = \
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
   (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? ['endianness-test'] : 
[]) +\
   (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : [])
 
 qtests_mips64el = \
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
   (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? ['endianness-test'] : 
[]) +\
   (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : [])
 
@@ -147,10 +156,12 @@ qtests_ppc64 = \
 qtests_sh4 = (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? 
['endianness-test'] : [])
 qtests_sh4eb = (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? 
['endianness-test'] : [])
 
-qtests_sparc = ['prom-env-test', 'm48t59-test', 'boot-serial-test']
+qtests_sparc = ['prom-env-test', 'm48t59-test', 'boot-serial-test'] + \
+  (slirp.found() ? [ 'test-netfilter' ] : [])
 
 qtests_sparc64 = \
-  (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? ['endianness-test'] : 
[]) +\
+  (slirp.found() ? [ 'test-netfilter' ] : []) + \
+  (config_all_devices.has_key('CONFIG_ISA_TESTDEV') ? ['endianness-test'] : 
[]) + \
   ['prom-env-test', 'boot-serial-test']
 
 qtests_npcm7xx = \
diff --git a/tests/qtest/test-netfilter.c b/tests/qtest/test-netfilter.c
index 785b6f3226..b09ef7fae9 100644
--- a/tests/qtest/test-netfilter.c
+++ b/tests/qtest/test-netfilter.c
@@ -178,11 +178,6 @@ int main(int argc, char **argv)
 {
 int ret;
 char *args;
-const char *devstr = "e1000";
-
-if (g_str_equal(qtest_get_arch(), "s390x")) {
-devstr = "virtio-net-ccw";
-}
 
 g_test_init(&argc, &argv, NULL);
 qtest_add_func("/netfilter/addremove_one", add_one_netfilter);
@@ -192,8 +187,7 @@ int main(int argc, char **argv)
 qtest_add_func("/netfilter/remove_netdev_multi",
remove_netdev_with_multi_netfilter);
 
-args = g_strdup_printf("-netdev user,id=qtest-bn0 "
-   "-device %s,netdev=qtest-bn0", devstr);
+args = g_strdup_printf("-nic user,id=qtest-bn0");
 qtest_start(args);
 ret = g_test_run();
 
-- 
2.27.0

Re: [PATCH] spec: Add NBD_OPT_EXTENDED_HEADERS

2021-12-07 Thread Vladimir Sementsov-Ogievskiy


07.12.2021 02:00, Eric Blake wrote:

On Mon, Dec 06, 2021 at 02:40:45PM +0300, Vladimir Sementsov-Ogievskiy wrote:

    Simple reply message

   The simple reply message MUST be sent by the server in response to all
   requests if structured replies have not been negotiated using
-`NBD_OPT_STRUCTURED_REPLY`. If structured replies have been negotiated, a 
simple
-reply MAY be used as a reply to any request other than `NBD_CMD_READ`,
-but only if the reply has no data payload.  The message looks as
-follows:
+`NBD_OPT_STRUCTURED_REPLY`. If structured replies have been
+negotiated, a simple reply MAY be used as a reply to any request other
+than `NBD_CMD_READ`, but only if the reply has no data payload.  If
+extended headers were not negotiated using `NBD_OPT_EXTENDED_HEADERS`,
+the message looks as follows:

   S: 32 bits, 0x67446698, magic (`NBD_SIMPLE_REPLY_MAGIC`; used to be
  `NBD_REPLY_MAGIC`)
@@ -369,6 +398,16 @@ S: 64 bits, handle
   S: (*length* bytes of data if the request is of type `NBD_CMD_READ` and
   *error* is zero)

+If extended headers were negotiated using `NBD_OPT_EXTENDED_HEADERS`,
+the message looks like:
+
+S: 32 bits, 0x60d12fd6, magic (`NBD_SIMPLE_REPLY_EXT_MAGIC`)
+S: 32 bits, error (MAY be zero)
+S: 64 bits, handle
+S: 128 bits, padding (MUST be zero)
+S: (*length* bytes of data if the request is of type `NBD_CMD_READ` and
+*error* is zero)
+


If we go this way, let's put payload length into padding: it will help to make 
the protocol context-independent and less error-prone.


Easy enough to do (the payload length will be 0 except for
NBD_CMD_READ).



Or, the other way around: maybe just forbid the payload for simple-64bit?
What's the reason to allow 64-bit requests without structured reply negotiation?


The two happened to be orthogonal enough in my implementation.  It was
easy to demonstrate either one without the other, and it IS easier to
write a client using non-structured replies (structured reads ARE
tougher than simple reads, even if it is less efficient when it comes
to reading zeros).  But you are also right that we could require
structured reads prior to allowing 64-bit operations, and then have
only one supported reply type on the wire when negotiated.  Wouter,
which way do you prefer?




    Structured reply chunk message

   Some of the major downsides of the default simple reply to
@@ -410,7 +449,9 @@ considered successful only if it did not contain any error 
chunks,
   although the client MAY be able to determine partial success based
   on the chunks received.

-A structured reply chunk message looks as follows:
+If extended headers were not negotiated using
+`NBD_OPT_EXTENDED_HEADERS`, a structured reply chunk message looks as
+follows:

   S: 32 bits, 0x668e33ef, magic (`NBD_STRUCTURED_REPLY_MAGIC`)
   S: 16 bits, flags
@@ -423,6 +464,17 @@ The use of *length* in the reply allows context-free 
division of
   the overall server traffic into individual reply messages; the
   *type* field describes how to further interpret the payload.

+If extended headers were negotiated using `NBD_OPT_EXTENDED_HEADERS`,
+the message looks like:
+
+S: 32 bits, 0x6e8a278c, magic (`NBD_STRUCTURED_REPLY_EXT_MAGIC`)
+S: 16 bits, flags
+S: 16 bits, type
+S: 64 bits, handle
+S: 64 bits, length of payload (unsigned)


Maybe 64 bits is too much for the payload, but who knows. And it's good that
it's symmetric with the 64-bit length in the request.


Indeed, both qemu and libnbd implementations explicitly kill the
connection to any server that replies with more than the max buffer
used for NBD_CMD_READ/WRITE (32M for qemu, 64M for libnbd).  And if
the spec is not already clear on the topic, I should add an
independent patch to NBD_CMD_BLOCK_STATUS to make it obvious that a
server cannot reply with too many extents because of such clients.

So none of my proof-of-concept code ever used the full 64-bits of the
reply header length.  On the other hand, there is indeed the symmetry
argument - if someone writes a server willing to accept a 4G
NBD_CMD_WRITE, then it should also support a 4G NBD_CMD_READ, even if
no known existing server or client allows buffers that large.




+S: 64 bits, padding (MUST be zero)


Hmm. Extra 8 bytes to be power-of-2. Does 32 bytes really perform better than 
24 bytes?


Consider:
struct header[100];

if sizeof(header[0]) is a power of 2 <= the cache line size (and the
compiler prefers to start arrays aligned to the cache line) then we
are guaranteed that all array members each reside in a single cache
line.  But if it is not a power of 2, some of the array members
straddle two cache lines.

Will there be code that wants to create an array of headers?  Perhaps
so, because that is a logical way (along with scatter/gather to
combine the header with variable-sized payloads) of tracking the
headers for multiple commands issued in parallel.

Do I have actual performance numbers?  No. But there's plenty of
google hits for why sizing structs to a power of 2 is a g

[no subject]

Subject: [PATCH for 6.2?] gicv3: fix ICH_MISR's LRENP computation

According to the "Arm Generic Interrupt Controller Architecture
Specification GIC architecture version 3 and 4" (version G: page 345
for aarch64 or 509 for aarch32):
LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
ICH_HCR.EOIcount is non-zero.

When only LRENPIE was set (and EOI count was zero), the LRENP bit was
wrongly set and MISR value was wrong.

As an additional consequence, if a hypervisor set ICH_HCR.LRENPIE,
the maintenance interrupt was constantly fired. It happens since patch
9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
which fixed another bug about maintenance interrupt (most significant
bits of misr, including this one, were ignored in the interrupt trigger).

Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system registers")
Signed-off-by: Damien Hedde 
---
The gic doc is available here:
https://developer.arm.com/documentation/ihi0069/g
---
 hw/intc/arm_gicv3_cpuif.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 7fba931450..85fc369e55 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -351,7 +351,8 @@ static uint32_t maintenance_interrupt_state(GICv3CPUState 
*cs)
 /* Scan list registers and fill in the U, NP and EOI bits */
 eoi_maintenance_interrupt_state(cs, &value);
 
-if (cs->ich_hcr_el2 & (ICH_HCR_EL2_LRENPIE | ICH_HCR_EL2_EOICOUNT_MASK)) {
+if ((cs->ich_hcr_el2 & ICH_HCR_EL2_LRENPIE) &&
+(cs->ich_hcr_el2 & ICH_HCR_EL2_EOICOUNT_MASK)) {
 value |= ICH_MISR_EL2_LRENP;
 }
 
-- 
2.34.0

Re: [PATCH v10 6/8] qmp: add QMP commands for virtio/vhost queue-status

2021-12-07 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> These new commands show the internal status of a VirtIODevice's
> VirtQueue and a vhost device's vhost_virtqueue (if active).
>
> Signed-off-by: Jonah Palmer 
> ---

[...]

> diff --git a/qapi/virtio.json b/qapi/virtio.json
> index 7ef1f95..56e56d2 100644
> --- a/qapi/virtio.json
> +++ b/qapi/virtio.json
> @@ -402,3 +402,255 @@
>'data': { 'transports': [ 'str' ],
>  '*dev-features': [ 'str' ],
>  '*unknown-dev-features': 'uint64' } }
> +
> +##
> +# @VirtQueueStatus:
> +#
> +# Information of a VirtIODevice VirtQueue, including most members of
> +# the VirtQueue data structure.
> +#
> +# @name: Name of the VirtIODevice that uses this VirtQueue
> +#
> +# @queue-index: VirtQueue queue_index
> +#
> +# @inuse: VirtQueue inuse
> +#
> +# @vring-num: VirtQueue vring.num
> +#
> +# @vring-num-default: VirtQueue vring.num_default
> +#
> +# @vring-align: VirtQueue vring.align
> +#
> +# @vring-desc: VirtQueue vring.desc (descriptor area)
> +#
> +# @vring-avail: VirtQueue vring.avail (driver area)
> +#
> +# @vring-used: VirtQueue vring.used (device area)
> +#
> +# @last-avail-idx: VirtQueue last_avail_idx or return of vhost_dev
> +#  vhost_get_vring_base (if vhost active)
> +#
> +# @shadow-avail-idx: VirtQueue shadow_avail_idx
> +#
> +# @used-idx: VirtQueue used_idx
> +#
> +# @signalled-used: VirtQueue signalled_used
> +#
> +# @signalled-used-valid: VirtQueue signalled_used_valid flag
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtQueueStatus',
> +  'data': { 'name': 'str',
> +'queue-index': 'uint16',
> +'inuse': 'uint32',
> +'vring-num': 'uint32',
> +'vring-num-default': 'uint32',
> +'vring-align': 'uint32',
> +'vring-desc': 'uint64',
> +'vring-avail': 'uint64',
> +'vring-used': 'uint64',
> +'*last-avail-idx': 'uint16',
> +'*shadow-avail-idx': 'uint16',
> +'used-idx': 'uint16',
> +'signalled-used': 'uint16',
> +'signalled-used-valid': 'bool' } }
> +
> +##
> +# @x-query-virtio-queue-status:
> +#
> +# Return the status of a given VirtIODevice's VirtQueue
> +#
> +# @path: VirtIODevice canonical QOM path
> +#
> +# @queue: VirtQueue index to examine
> +#
> +# Features:
> +# @unstable: This command is meant for debugging

End with a period for consistency with existing docs, like you did in
v9.

> +#
> +# Returns: VirtQueueStatus of the VirtQueue
> +#
> +# Notes: last_avail_idx will not be displayed in the case where
> +#the selected VirtIODevice has a running vhost device and
> +#the VirtIODevice VirtQueue index (queue) does not exist for
> +#the corresponding vhost device vhost_virtqueue. Also,
> +#shadow_avail_idx will not be displayed in the case where
> +#the selected VirtIODevice has a running vhost device.
> +#
> +# Since: 7.0
> +#
> +# Examples:
> +#
> +# 1. Get VirtQueueStatus for virtio-vsock (vhost-vsock running)
> +#
> +# -> { "execute": "x-query-virtio-queue-status",
> +#  "arguments": { "path": "/machine/peripheral/vsock0/virtio-backend",
> +# "queue": 1 }
> +#}
> +# <- { "return": {
> +#"signalled-used": 0,
> +#"inuse": 0,
> +#"vring-align": 4096,
> +#"vring-desc": 5217370112,
> +#"signalled-used-valid": false,
> +#"vring-num-default": 128,
> +#"vring-avail": 5217372160,
> +#"queue-index": 1,
> +#"last-avail-idx": 0,
> +#"vring-used": 5217372480,
> +#"used-idx": 0,
> +#"name": "vhost-vsock",
> +#"vring-num": 128 }
> +#}
> +#
> +# 2. Get VirtQueueStatus for virtio-serial (no vhost)
> +#
> +# -> { "execute": "x-query-virtio-queue-status",
> +#  "arguments": { "path": 
> "/machine/peripheral-anon/device[0]/virtio-backend",
> +# "queue": 20 }
> +#}
> +# <- { "return": {
> +#"signalled-used": 0,
> +#"inuse": 0,
> +#"vring-align": 4096,
> +#"vring-desc": 5182074880,
> +#"signalled-used-valid": false,
> +#"vring-num-default": 128,
> +#"vring-avail": 5182076928,
> +#"queue-index": 20,
> +#"last-avail-idx": 0,
> +#"vring-used": 5182077248,
> +#"used-idx": 0,
> +#"name": "virtio-serial",
> +#"shadow-avail-idx": 0,
> +#"vring-num": 128 }
> +#}
> +#
> +##
> +
> +{ 'command': 'x-query-virtio-queue-status',
> +  'data': { 'path': 'str', 'queue': 'uint16' },
> +  'returns': 'VirtQueueStatus',
> +  'features': [ 'unstable' ] }
> +
> +##
> +# @VirtVhostQueueStatus:
> +#
> +# Information of a vhost device's vhost_virtqueue, including most
> +# members of the vhost_dev vhost_virtqueue data structure.
> +#
> +# @name: Name of the VirtIODevice that uses thi

Re: [PATCH for 6.2?] gicv3: fix ICH_MISR's LRENP computation


Sorry for the inconvenience, I screwed up the subject line.
Should I resend?

Damien

On 12/7/21 10:29, Damien Hedde wrote:

Subject: [PATCH for 6.2?] gicv3: fix ICH_MISR's LRENP computation

According to the "Arm Generic Interrupt Controller Architecture
Specification GIC architecture version 3 and 4" (version G: page 345
for aarch64 or 509 for aarch32):
LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
ICH_HCR.EOIcount is non-zero.

When only LRENPIE was set (and EOI count was zero), the LRENP bit was
wrongly set and MISR value was wrong.

As an additional consequence, if a hypervisor set ICH_HCR.LRENPIE,
the maintenance interrupt was constantly fired. It happens since patch
9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
which fixed another bug about maintenance interrupt (most significant
bits of misr, including this one, were ignored in the interrupt trigger).

Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system registers")
Signed-off-by: Damien Hedde 
---
The gic doc is available here:
https://developer.arm.com/documentation/ihi0069/g
---
  hw/intc/arm_gicv3_cpuif.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 7fba931450..85fc369e55 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -351,7 +351,8 @@ static uint32_t maintenance_interrupt_state(GICv3CPUState 
*cs)
  /* Scan list registers and fill in the U, NP and EOI bits */
  eoi_maintenance_interrupt_state(cs, &value);
  
-if (cs->ich_hcr_el2 & (ICH_HCR_EL2_LRENPIE | ICH_HCR_EL2_EOICOUNT_MASK)) {

+if ((cs->ich_hcr_el2 & ICH_HCR_EL2_LRENPIE) &&
+(cs->ich_hcr_el2 & ICH_HCR_EL2_EOICOUNT_MASK)) {
  value |= ICH_MISR_EL2_LRENP;
  }

Re: [PATCH 01/14] ppc/pnv: Reduce the maximum of PHB3 devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

All POWER8 machines have a maximum of 3 PHB3 devices. Adapt the
PNV8_CHIP_PHB3_MAX definition for consistency.

Signed-off-by: Cédric Le Goater 
---



The Naples chip (Garrison) can have 4 PHBs and it seems we have a 
power8nvl machine type for it. So I guess we should keep a max PHB count 
of 4 there.


  Fred




  include/hw/ppc/pnv.h | 2 +-
  hw/ppc/pnv.c | 6 +++---
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index aa08d79d24de..6f498c8f1b5f 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -79,7 +79,7 @@ struct Pnv8Chip {
  PnvOCC   occ;
  PnvHomer homer;
  
-#define PNV8_CHIP_PHB3_MAX 4

+#define PNV8_CHIP_PHB3_MAX 3
  PnvPHB3  phbs[PNV8_CHIP_PHB3_MAX];
  
  XICSFabric*xics;

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 71e45515f136..bd768dcc28ad 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1256,7 +1256,7 @@ static void pnv_chip_power8e_class_init(ObjectClass 
*klass, void *data)
  
  k->chip_cfam_id = 0x221ef0498000ull;  /* P8 Murano DD2.1 */

  k->cores_mask = POWER8E_CORE_MASK;
-k->num_phbs = 3;
+k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1280,7 +1280,7 @@ static void pnv_chip_power8_class_init(ObjectClass 
*klass, void *data)
  
  k->chip_cfam_id = 0x220ea0498000ull; /* P8 Venice DD2.0 */

  k->cores_mask = POWER8_CORE_MASK;
-k->num_phbs = 3;
+k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1304,7 +1304,7 @@ static void pnv_chip_power8nvl_class_init(ObjectClass 
*klass, void *data)
  
  k->chip_cfam_id = 0x120d30498000ull;  /* P8 Naples DD1.0 */

  k->cores_mask = POWER8_CORE_MASK;
-k->num_phbs = 3;
+k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;

Re: [PATCH 02/14] ppc/pnv: Drop the "num-phbs" property





On 02/12/2021 15:42, Cédric Le Goater wrote:

It is never used.

Signed-off-by: Cédric Le Goater 
---



Reviewed-by: Frederic Barrat 



  hw/ppc/pnv.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index bd768dcc28ad..988b305398b2 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1764,7 +1764,6 @@ static Property pnv_chip_properties[] = {
  DEFINE_PROP_UINT32("nr-cores", PnvChip, nr_cores, 1),
  DEFINE_PROP_UINT64("cores-mask", PnvChip, cores_mask, 0x0),
  DEFINE_PROP_UINT32("nr-threads", PnvChip, nr_threads, 1),
-DEFINE_PROP_UINT32("num-phbs", PnvChip, num_phbs, 0),
  DEFINE_PROP_END_OF_LIST(),
  };

Re: [PATCH 03/14] ppc/pnv: Move mapping of the PHB3 CQ regions under pnv_pbcq_realize()





On 02/12/2021 15:42, Cédric Le Goater wrote:

This requires a link to the chip to add the regions under the XSCOM
address space. This change will help us provide support for user
created PHB3 devices.

Signed-off-by: Cédric Le Goater 
---



Reviewed-by: Frederic Barrat 



  include/hw/pci-host/pnv_phb3.h |  3 +++
  hw/pci-host/pnv_phb3.c |  1 +
  hw/pci-host/pnv_phb3_pbcq.c| 11 +++
  hw/ppc/pnv.c   | 14 ++
  4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/hw/pci-host/pnv_phb3.h b/include/hw/pci-host/pnv_phb3.h
index e2a2e3624532..e9c13e6bd821 100644
--- a/include/hw/pci-host/pnv_phb3.h
+++ b/include/hw/pci-host/pnv_phb3.h
@@ -16,6 +16,7 @@
  #include "qom/object.h"
  
  typedef struct PnvPHB3 PnvPHB3;

+typedef struct PnvChip PnvChip;
  
  /*

   * PHB3 XICS Source for MSIs
@@ -157,6 +158,8 @@ struct PnvPHB3 {
  PnvPHB3RootPort root;
  
  QLIST_HEAD(, PnvPhb3DMASpace) dma_spaces;

+
+PnvChip *chip;
  };
  
  uint64_t pnv_phb3_reg_read(void *opaque, hwaddr off, unsigned size);

diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
index a7f96850055a..3aa42ef9d4b9 100644
--- a/hw/pci-host/pnv_phb3.c
+++ b/hw/pci-host/pnv_phb3.c
@@ -1092,6 +1092,7 @@ static const char *pnv_phb3_root_bus_path(PCIHostState 
*host_bridge,
  static Property pnv_phb3_properties[] = {
  DEFINE_PROP_UINT32("index", PnvPHB3, phb_id, 0),
  DEFINE_PROP_UINT32("chip-id", PnvPHB3, chip_id, 0),
+DEFINE_PROP_LINK("chip", PnvPHB3, chip, TYPE_PNV_CHIP, PnvChip *),
  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/hw/pci-host/pnv_phb3_pbcq.c b/hw/pci-host/pnv_phb3_pbcq.c

index a0526aa1eca3..c7426cd27a20 100644
--- a/hw/pci-host/pnv_phb3_pbcq.c
+++ b/hw/pci-host/pnv_phb3_pbcq.c
@@ -284,6 +284,17 @@ static void pnv_pbcq_realize(DeviceState *dev, Error 
**errp)
  pnv_xscom_region_init(&pbcq->xscom_spci_regs, OBJECT(dev),
&pnv_pbcq_spci_xscom_ops, pbcq, name,
PNV_XSCOM_PBCQ_SPCI_SIZE);
+
+/* Populate the XSCOM address space. */
+pnv_xscom_add_subregion(phb->chip,
+PNV_XSCOM_PBCQ_NEST_BASE + 0x400 * phb->phb_id,
+&pbcq->xscom_nest_regs);
+pnv_xscom_add_subregion(phb->chip,
+PNV_XSCOM_PBCQ_PCI_BASE + 0x400 * phb->phb_id,
+&pbcq->xscom_pci_regs);
+pnv_xscom_add_subregion(phb->chip,
+PNV_XSCOM_PBCQ_SPCI_BASE + 0x040 * phb->phb_id,
+&pbcq->xscom_spci_regs);
  }
  
  static int pnv_pbcq_dt_xscom(PnvXScomInterface *dev, void *fdt,

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 988b305398b2..de277c457838 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1221,25 +1221,15 @@ static void pnv_chip_power8_realize(DeviceState *dev, 
Error **errp)
  /* PHB3 controllers */
  for (i = 0; i < chip->num_phbs; i++) {
  PnvPHB3 *phb = &chip8->phbs[i];
-PnvPBCQState *pbcq = &phb->pbcq;
  
  object_property_set_int(OBJECT(phb), "index", i, &error_fatal);

  object_property_set_int(OBJECT(phb), "chip-id", chip->chip_id,
  &error_fatal);
+object_property_set_link(OBJECT(phb), "chip", OBJECT(chip),
+ &error_fatal);
  if (!sysbus_realize(SYS_BUS_DEVICE(phb), errp)) {
  return;
  }
-
-/* Populate the XSCOM address space. */
-pnv_xscom_add_subregion(chip,
-PNV_XSCOM_PBCQ_NEST_BASE + 0x400 * phb->phb_id,
-&pbcq->xscom_nest_regs);
-pnv_xscom_add_subregion(chip,
-PNV_XSCOM_PBCQ_PCI_BASE + 0x400 * phb->phb_id,
-&pbcq->xscom_pci_regs);
-pnv_xscom_add_subregion(chip,
-PNV_XSCOM_PBCQ_SPCI_BASE + 0x040 * phb->phb_id,
-&pbcq->xscom_spci_regs);
  }
  }

[PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

According to the "Arm Generic Interrupt Controller Architecture
Specification GIC architecture version 3 and 4" (version G: page 345
for aarch64 or 509 for aarch32):
LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
ICH_HCR.EOIcount is non-zero.

When only LRENPIE was set (and EOI count was zero), the LRENP bit was
wrongly set and MISR value was wrong.

As an additional consequence, if a hypervisor set ICH_HCR.LRENPIE,
the maintenance interrupt was constantly fired. It happens since patch
9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
which fixed another bug about maintenance interrupt (most significant
bits of misr, including this one, were ignored in the interrupt trigger).

Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system registers")
Signed-off-by: Damien Hedde 
---
The gic doc is available here:
https://developer.arm.com/documentation/ihi0069/g

v2: identical resend because subject screw-up (sorry)

Thanks,
Damien
---
 hw/intc/arm_gicv3_cpuif.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 7fba931450..85fc369e55 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -351,7 +351,8 @@ static uint32_t maintenance_interrupt_state(GICv3CPUState 
*cs)
 /* Scan list registers and fill in the U, NP and EOI bits */
 eoi_maintenance_interrupt_state(cs, &value);
 
-if (cs->ich_hcr_el2 & (ICH_HCR_EL2_LRENPIE | ICH_HCR_EL2_EOICOUNT_MASK)) {
+if ((cs->ich_hcr_el2 & ICH_HCR_EL2_LRENPIE) &&
+(cs->ich_hcr_el2 & ICH_HCR_EL2_EOICOUNT_MASK)) {
 value |= ICH_MISR_EL2_LRENP;
 }
 
-- 
2.34.0

Re: [PATCH v2] ppc/pnv.c: fix "system-id" FDT when -uuid is set

2021-12-07 Thread Daniel Henrique Barboza





On 12/6/21 21:09, David Gibson wrote:

On Mon, Dec 06, 2021 at 10:02:53AM -0300, Daniel Henrique Barboza wrote:

Setting -uuid in the pnv machine does not work:

./qemu-system-ppc64 -machine powernv8,accel=tcg  -uuid 
7ff61ca1-a4a0-4bc1-944c-abd114a35e80
qemu-system-ppc64: error creating device tree: (fdt_property_string(fdt, 
"system-id", buf)): FDT_ERR_BADSTATE

This happens because we're using "fdt_property_string" to retrieve a
"system-id" attribute that does not exist, instead of using
fdt_setprop_string() to create a "system-id" attribute with the uuid
provided via command line.


Fix is correct but this description isn't really accurate.
fdt_property_string() is a "sequential write" function, only used when
you're building a new DT up from scratch, which is an entirely
different mode from read/write access to an existing tree.  Using it when
the tree is in read-write state will cause an immediate BADSTATE
error; whether the property exists already or not is irrelevant.

Reviewed-by: David Gibson 


Thanks for the explanation. I'll send a v3 fixing the commit msg.


Daniel





Signed-off-by: Daniel Henrique Barboza 
---

changes from v1:
- fixed typo in commit title


  hw/ppc/pnv.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 32ab8071a4..9e532caa9f 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -552,7 +552,7 @@ static void *pnv_dt_create(MachineState *machine)
  buf =  qemu_uuid_unparse_strdup(&qemu_uuid);
  _FDT((fdt_setprop_string(fdt, 0, "vm,uuid", buf)));
  if (qemu_uuid_set) {
-_FDT((fdt_property_string(fdt, "system-id", buf)));
+_FDT((fdt_setprop_string(fdt, 0, "system-id", buf)));
  }
  g_free(buf);

Re: [PATCH 04/14] ppc/pnv: Introduce support for user created PHB3 devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

PHB3 devices and PCI devices can now be added to the powernv8 machine
using :

   -device pnv-phb3,chip-id=0,index=1 \
   -device nec-usb-xhci,bus=pci.1,addr=0x0

The 'index' property identifies the PHB3 in the chip. In case of user
created devices, a lookup on 'chip-id' is required to assign the
owning chip.

Signed-off-by: Cédric Le Goater 
---



diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index de277c457838..d7fe92cb082d 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1097,14 +1097,14 @@ static void pnv_chip_power8_instance_init(Object *obj)
  
  object_initialize_child(obj, "homer", &chip8->homer, TYPE_PNV8_HOMER);
  
-for (i = 0; i < pcc->num_phbs; i++) {

+if (defaults_enabled()) {
+chip->num_phbs = pcc->num_phbs;
+}
+
+for (i = 0; i < chip->num_phbs; i++) {
  object_initialize_child(obj, "phb[*]", &chip8->phbs[i], 
TYPE_PNV_PHB3);
  }
  
-/*

- * Number of PHBs is the chip default
- */
-chip->num_phbs = pcc->num_phbs;
  }



So if "-nodefaults" is mentioned on the command line, then 
chip->num_phbs is not set. It seems the intention is to have only the 
PHBs defined on the CLI, which is fine. However, I don't see where 
chip->num_phbs is incremented in that case.


  Fred



  
  static void pnv_chip_icp_realize(Pnv8Chip *chip8, Error **errp)

@@ -1784,6 +1784,19 @@ PowerPCCPU *pnv_chip_find_cpu(PnvChip *chip, uint32_t 
pir)
  return NULL;
  }
  
+PnvChip *pnv_get_chip(PnvMachineState *pnv, uint32_t chip_id)

+{
+int i;
+
+for (i = 0; i < pnv->num_chips; i++) {
+PnvChip *chip = pnv->chips[i];
+if (chip->chip_id == chip_id) {
+return chip;
+}
+}
+return NULL;
+}
+
  static ICSState *pnv_ics_get(XICSFabric *xi, int irq)
  {
  PnvMachineState *pnv = PNV_MACHINE(xi);

Re: [PATCH v10 7/8] qmp: add QMP command x-query-virtio-queue-element

2021-12-07 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> This new command shows the information of a VirtQueue element.
>
> Signed-off-by: Jonah Palmer 
> ---
>  hw/virtio/virtio-stub.c |   9 +++
>  hw/virtio/virtio.c  | 154 
>  qapi/virtio.json| 183 
> 
>  3 files changed, 346 insertions(+)
>
> diff --git a/hw/virtio/virtio-stub.c b/hw/virtio/virtio-stub.c
> index 13e5f93..7ddb22c 100644
> --- a/hw/virtio/virtio-stub.c
> +++ b/hw/virtio/virtio-stub.c
> @@ -31,3 +31,12 @@ VirtQueueStatus *qmp_x_query_virtio_queue_status(const 
> char *path,
>  {
>  return qmp_virtio_unsupported(errp);
>  }
> +
> +VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path,
> + uint16_t queue,
> + bool has_index,
> + uint16_t index,
> + Error **errp)
> +{
> +return qmp_virtio_unsupported(errp);
> +}
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 459bfb2..8c6cc27 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -475,6 +475,19 @@ static inline void vring_used_write(VirtQueue *vq, 
> VRingUsedElem *uelem,
>  address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem));
>  }
>  
> +/* Called within rcu_read_lock(). */
> +static inline uint16_t vring_used_flags(VirtQueue *vq)
> +{
> +VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
> +hwaddr pa = offsetof(VRingUsed, flags);
> +
> +if (!caches) {
> +return 0;
> +}
> +
> +return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
> +}
> +
>  /* Called within rcu_read_lock().  */
>  static uint16_t vring_used_idx(VirtQueue *vq)
>  {
> @@ -4381,6 +4394,147 @@ VirtQueueStatus 
> *qmp_x_query_virtio_queue_status(const char *path,
>  return status;
>  }
>  
> +static strList *qmp_decode_vring_desc_flags(uint16_t flags)
> +{
> +strList *list = NULL;
> +strList *node;
> +int i;
> +
> +struct {
> +uint16_t flag;
> +const char *value;
> +} map[] = {
> +{ VRING_DESC_F_NEXT, "next" },
> +{ VRING_DESC_F_WRITE, "write" },
> +{ VRING_DESC_F_INDIRECT, "indirect" },
> +{ 1 << VRING_PACKED_DESC_F_AVAIL, "avail" },
> +{ 1 << VRING_PACKED_DESC_F_USED, "used" },
> +{ 0, "" }
> +};
> +
> +for (i = 0; map[i].flag; i++) {
> +if ((map[i].flag & flags) == 0) {
> +continue;
> +}
> +node = g_malloc0(sizeof(strList));
> +node->value = g_strdup(map[i].value);
> +node->next = list;
> +list = node;
> +}
> +
> +return list;
> +}
> +
> +VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path,
> + uint16_t queue,
> + bool has_index,
> + uint16_t index,
> + Error **errp)
> +{
> +VirtIODevice *vdev;
> +VirtQueue *vq;
> +VirtioQueueElement *element = NULL;
> +
> +vdev = virtio_device_find(path);
> +if (vdev == NULL) {
> +error_setg(errp, "Path %s is not a VirtIO device", path);
> +return NULL;
> +}
> +
> +if (queue >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, queue)) {
> +error_setg(errp, "Invalid virtqueue number %d", queue);
> +return NULL;
> +}
> +vq = &vdev->vq[queue];
> +
> +if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
> +error_setg(errp, "Packed ring not supported");
> +return NULL;
> +} else {
> +unsigned int head, i, max;
> +VRingMemoryRegionCaches *caches;
> +MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
> +MemoryRegionCache *desc_cache;
> +VRingDesc desc;
> +VirtioRingDescList *list = NULL;
> +VirtioRingDescList *node;
> +int rc; int ndescs;
> +
> +RCU_READ_LOCK_GUARD();
> +
> +max = vq->vring.num;
> +
> +if (!has_index) {
> +head = vring_avail_ring(vq, vq->last_avail_idx % vq->vring.num);
> +} else {
> +head = vring_avail_ring(vq, index % vq->vring.num);
> +}
> +i = head;
> +
> +caches = vring_get_region_caches(vq);
> +if (!caches) {
> +error_setg(errp, "Region caches not initialized");
> +return NULL;
> +}
> +if (caches->desc.len < max * sizeof(VRingDesc)) {
> +error_setg(errp, "Cannot map descriptor ring");
> +return NULL;
> +}
> +
> +desc_cache = &caches->desc;
> +vring_split_desc_read(vdev, &desc, desc_cache, i);
> +if (desc.flags & VRING_DESC

[PATCH v3] ppc/pnv.c: fix "system-id" FDT when -uuid is set

2021-12-07 Thread Daniel Henrique Barboza

Setting -uuid in the pnv machine does not work:

./qemu-system-ppc64 -machine powernv8,accel=tcg  -uuid 
7ff61ca1-a4a0-4bc1-944c-abd114a35e80
qemu-system-ppc64: error creating device tree: (fdt_property_string(fdt, 
"system-id", buf)): FDT_ERR_BADSTATE

This happens because we're using fdt_property_string(), which is a
sequential write function that is supposed to be used when we're
building a new FDT, in a case where we're reading/writing an existing FDT.

Fix it by using fdt_setprop_string() instead.

Reviewed-by: David Gibson 
Signed-off-by: Daniel Henrique Barboza 
---

changes from v2:
- fixed commit message after David explained what fdt_property_string()
does
- added David's r-b


 hw/ppc/pnv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 32ab8071a4..9e532caa9f 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -552,7 +552,7 @@ static void *pnv_dt_create(MachineState *machine)
 buf =  qemu_uuid_unparse_strdup(&qemu_uuid);
 _FDT((fdt_setprop_string(fdt, 0, "vm,uuid", buf)));
 if (qemu_uuid_set) {
-_FDT((fdt_property_string(fdt, "system-id", buf)));
+_FDT((fdt_setprop_string(fdt, 0, "system-id", buf)));
 }
 g_free(buf);
 
-- 
2.31.1

Re: [PATCH 05/14] ppc/pnv: Reparent user created PHB3 devices to the PnvChip





On 02/12/2021 15:42, Cédric Le Goater wrote:

The powernv machine uses the object hierarchy to populate the device
tree and each device should be parented to the chip it belongs to.
This is not the case for user created devices which are parented to
the container "/unattached".

Make sure a PHB3 device is parented to its chip by reparenting the
object if necessary.

Signed-off-by: Cédric Le Goater 
---



It will also be used later for P9, which explains why it's done that 
way, I think. Looks ok to me.


Reviewed-by: Frederic Barrat 



  include/hw/ppc/pnv.h   |  1 +
  hw/pci-host/pnv_phb3.c |  6 ++
  hw/ppc/pnv.c   | 17 +
  3 files changed, 24 insertions(+)

diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index 0710673a7fd8..247379ef1f88 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -175,6 +175,7 @@ DECLARE_INSTANCE_CHECKER(PnvChip, PNV_CHIP_POWER10,
   TYPE_PNV_CHIP_POWER10)
  
  PowerPCCPU *pnv_chip_find_cpu(PnvChip *chip, uint32_t pir);

+void pnv_chip_parent_fixup(PnvChip *chip, Object *obj, int index);
  
  #define TYPE_PNV_MACHINE   MACHINE_TYPE_NAME("powernv")

  typedef struct PnvMachineClass PnvMachineClass;
diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
index dd1cf37288a0..e91f658b0060 100644
--- a/hw/pci-host/pnv_phb3.c
+++ b/hw/pci-host/pnv_phb3.c
@@ -1005,6 +1005,12 @@ static void pnv_phb3_realize(DeviceState *dev, Error 
**errp)
  error_setg(errp, "invalid chip id: %d", phb->chip_id);
  return;
  }
+
+/*
+ * Reparent user created devices to the chip to build
+ * correctly the device tree.
+ */
+pnv_chip_parent_fixup(phb->chip, OBJECT(phb), phb->phb_id);
  }
  
  /* LSI sources */

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index d7fe92cb082d..9a458655efd9 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1784,6 +1784,23 @@ PowerPCCPU *pnv_chip_find_cpu(PnvChip *chip, uint32_t 
pir)
  return NULL;
  }
  
+void pnv_chip_parent_fixup(PnvChip *chip, Object *obj, int index)

+{
+Object *parent = OBJECT(chip);
+g_autofree char *default_id =
+g_strdup_printf("%s[%d]", object_get_typename(obj), index);
+
+if (obj->parent == parent) {
+return;
+}
+
+object_ref(obj);
+object_unparent(obj);
+object_property_add_child(
+parent, DEVICE(obj)->id ? DEVICE(obj)->id : default_id, obj);
+object_unref(obj);
+}
+
  PnvChip *pnv_get_chip(PnvMachineState *pnv, uint32_t chip_id)
  {
  int i;

Re: [PATCH 06/14] ppc/pnv: Complete user created PHB3 devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

PHB3s are SysBus devices and should be allowed to be dynamically
created.

Signed-off-by: Cédric Le Goater 
---


This one is a bit of black magic for me. I don't see an equivalent for 
P9 though. Not needed there? I'll have another comment about P8/P9 later.


  Fred



  hw/pci-host/pnv_phb3.c | 9 +
  hw/ppc/pnv.c   | 2 ++
  2 files changed, 11 insertions(+)

diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
index e91f658b0060..b61f9c369f64 100644
--- a/hw/pci-host/pnv_phb3.c
+++ b/hw/pci-host/pnv_phb3.c
@@ -1000,6 +1000,9 @@ static void pnv_phb3_realize(DeviceState *dev, Error 
**errp)
  
  /* User created devices */

  if (!phb->chip) {
+Error *local_err = NULL;
+BusState *s;
+
  phb->chip = pnv_get_chip(pnv, phb->chip_id);
  if (!phb->chip) {
  error_setg(errp, "invalid chip id: %d", phb->chip_id);
@@ -1011,6 +1014,12 @@ static void pnv_phb3_realize(DeviceState *dev, Error 
**errp)
   * correctly the device tree.
   */
  pnv_chip_parent_fixup(phb->chip, OBJECT(phb), phb->phb_id);
+
+s = qdev_get_parent_bus(DEVICE(phb->chip));
+if (!qdev_set_parent_bus(DEVICE(phb), s, &local_err)) {
+error_propagate(errp, local_err);
+return;
+}
  }
  
  /* LSI sources */

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 9a458655efd9..45d8ecbf2bf7 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1927,6 +1927,8 @@ static void pnv_machine_power8_class_init(ObjectClass 
*oc, void *data)
  
  pmc->compat = compat;

  pmc->compat_size = sizeof(compat);
+
+machine_class_allow_dynamic_sysbus_dev(mc, TYPE_PNV_PHB3);
  }
  
  static void pnv_machine_power9_class_init(ObjectClass *oc, void *data)

Re: [PATCH 07/14] ppc/pnv: Introduce a num_pecs class attribute for PHB4 PEC devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

POWER9 processor comes with 3 PHB4 PECs (PCI Express Controller) and
each PEC can have several PHBs :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

A num_pecs class attribute better represents the logic units of the
POWER9 chip. Use that instead of num_phbs which fits POWER8 chips.
This will ease adding support for user created devices.

Signed-off-by: Cédric Le Goater 
---


With this patch, chip->num_phbs is only defined and used on P8. We may 
want to add a comment to make it clear.


As I review this series, something is bugging me though: the difference 
of handling between P8 and P9.

On P9, we seem to have a more logical hierarchy:
phb <- PCI controller (PEC) <- chip

With P8, we don't have an explicit PEC, but we have a PBCQ object, which 
is somewhat similar. The hierarchy seems also more convoluted.
I don't see why it's treated differently. It seems both chips could be 
treated the same, which would make the code easier to follow.
That's outside of the scope of this series though. So maybe for a future 
patch? Who knows, I might volunteer...


  Fred




  include/hw/ppc/pnv.h |  2 ++
  hw/ppc/pnv.c | 20 +---
  2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index 247379ef1f88..f2c238062f4a 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -53,6 +53,7 @@ struct PnvChip {
  PnvCore  **cores;
  
  uint32_t num_phbs;

+uint32_t num_pecs;
  
  MemoryRegion xscom_mmio;

  MemoryRegion xscom;
@@ -136,6 +137,7 @@ struct PnvChipClass {
  uint64_t chip_cfam_id;
  uint64_t cores_mask;
  uint32_t num_phbs;
+uint32_t num_pecs;
  
  DeviceRealize parent_realize;
  
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c

index 45d8ecbf2bf7..185464a1d443 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -658,7 +658,7 @@ static void pnv_chip_power9_pic_print_info(PnvChip *chip, 
Monitor *mon)
  pnv_xive_pic_print_info(&chip9->xive, mon);
  pnv_psi_pic_print_info(&chip9->psi, mon);
  
-for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {

+for (i = 0; i < chip->num_pecs; i++) {
  PnvPhb4PecState *pec = &chip9->pecs[i];
  for (j = 0; j < pec->num_stacks; j++) {
  pnv_phb4_pic_print_info(&pec->stacks[j].phb, mon);
@@ -1330,15 +1330,14 @@ static void pnv_chip_power9_instance_init(Object *obj)
  
  object_initialize_child(obj, "homer", &chip9->homer, TYPE_PNV9_HOMER);
  
-for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {

+if (defaults_enabled()) {
+chip->num_pecs = pcc->num_pecs;
+}
+
+for (i = 0; i < chip->num_pecs; i++) {
  object_initialize_child(obj, "pec[*]", &chip9->pecs[i],
  TYPE_PNV_PHB4_PEC);
  }
-
-/*
- * Number of PHBs is the chip default
- */
-chip->num_phbs = pcc->num_phbs;
  }
  
  static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error **errp)

@@ -1374,7 +1373,7 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  int i, j;
  int phb_id = 0;
  
-for (i = 0; i < PNV9_CHIP_MAX_PEC; i++) {

+for (i = 0; i < chip->num_pecs; i++) {
  PnvPhb4PecState *pec = &chip9->pecs[i];
  PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
  uint32_t pec_nest_base;
@@ -1402,8 +1401,7 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);
  pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
  
-for (j = 0; j < pec->num_stacks && phb_id < chip->num_phbs;

- j++, phb_id++) {
+for (j = 0; j < pec->num_stacks; j++, phb_id++) {
  PnvPhb4PecStack *stack = &pec->stacks[j];
  Object *obj = OBJECT(&stack->phb);
  
@@ -1559,7 +1557,7 @@ static void pnv_chip_power9_class_init(ObjectClass *klass, void *data)

  k->xscom_core_base = pnv_chip_power9_xscom_core_base;
  k->xscom_pcba = pnv_chip_power9_xscom_pcba;
  dc->desc = "PowerNV Chip POWER9";
-k->num_phbs = 6;
+k->num_pecs = PNV9_CHIP_MAX_PEC;
  
  device_class_set_parent_realize(dc, pnv_chip_power9_realize,

  &k->parent_realize);

Re: [PATCH 08/14] ppc/pnv: Introduce version and device_id class atributes for PHB4 devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

Signed-off-by: Cédric Le Goater 
---



Empty log message ok in qemu?
But it looks ok to me.
Reviewed-by: Frederic Barrat 



  include/hw/pci-host/pnv_phb4.h | 2 ++
  hw/pci-host/pnv_phb4_pec.c | 2 ++
  hw/ppc/pnv.c   | 4 ++--
  3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/hw/pci-host/pnv_phb4.h b/include/hw/pci-host/pnv_phb4.h
index 27556ae53425..b2864233641e 100644
--- a/include/hw/pci-host/pnv_phb4.h
+++ b/include/hw/pci-host/pnv_phb4.h
@@ -219,6 +219,8 @@ struct PnvPhb4PecClass {
  int compat_size;
  const char *stk_compat;
  int stk_compat_size;
+uint64_t version;
+uint64_t device_id;
  };
  
  #endif /* PCI_HOST_PNV_PHB4_H */

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index 741ddc90ed8d..9f722729ac50 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -499,6 +499,8 @@ static void pnv_pec_class_init(ObjectClass *klass, void 
*data)
  pecc->compat_size = sizeof(compat);
  pecc->stk_compat = stk_compat;
  pecc->stk_compat_size = sizeof(stk_compat);
+pecc->version = PNV_PHB4_VERSION;
+pecc->device_id = PNV_PHB4_DEVICE_ID;
  }
  
  static const TypeInfo pnv_pec_type_info = {

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 185464a1d443..0c65e1e88cf5 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1408,9 +1408,9 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  object_property_set_int(obj, "index", phb_id, &error_fatal);
  object_property_set_int(obj, "chip-id", chip->chip_id,
  &error_fatal);
-object_property_set_int(obj, "version", PNV_PHB4_VERSION,
+object_property_set_int(obj, "version", pecc->version,
  &error_fatal);
-object_property_set_int(obj, "device-id", PNV_PHB4_DEVICE_ID,
+object_property_set_int(obj, "device-id", pecc->device_id,
  &error_fatal);
  object_property_set_link(obj, "stack", OBJECT(stack),
   &error_abort);

Re: [PATCH 09/14] ppc/pnv: Introduce a "chip" property under the PHB4 model





On 02/12/2021 15:42, Cédric Le Goater wrote:

Next changes will make use of it.

Signed-off-by: Cédric Le Goater 
---


Reviewed-by: Frederic Barrat 



  include/hw/pci-host/pnv_phb4.h | 2 ++
  hw/pci-host/pnv_phb4_pec.c | 2 ++
  hw/ppc/pnv.c   | 2 ++
  3 files changed, 6 insertions(+)

diff --git a/include/hw/pci-host/pnv_phb4.h b/include/hw/pci-host/pnv_phb4.h
index b2864233641e..8a585c9a42f7 100644
--- a/include/hw/pci-host/pnv_phb4.h
+++ b/include/hw/pci-host/pnv_phb4.h
@@ -205,6 +205,8 @@ struct PnvPhb4PecState {
  #define PHB4_PEC_MAX_STACKS 3
  uint32_t num_stacks;
  PnvPhb4PecStack stacks[PHB4_PEC_MAX_STACKS];
+
+PnvChip *chip;
  };
  
  
diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c

index 9f722729ac50..e9750c41c595 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -462,6 +462,8 @@ static Property pnv_pec_properties[] = {
  DEFINE_PROP_UINT32("index", PnvPhb4PecState, index, 0),
  DEFINE_PROP_UINT32("num-stacks", PnvPhb4PecState, num_stacks, 0),
  DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
+DEFINE_PROP_LINK("chip", PnvPhb4PecState, chip, TYPE_PNV_CHIP,
+ PnvChip *),
  DEFINE_PROP_LINK("system-memory", PnvPhb4PecState, system_memory,
   TYPE_MEMORY_REGION, MemoryRegion *),
  DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 0c65e1e88cf5..76b2f5468b38 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1389,6 +1389,8 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  &error_fatal);
  object_property_set_int(OBJECT(pec), "chip-id", chip->chip_id,
  &error_fatal);
+object_property_set_link(OBJECT(pec), "chip", OBJECT(chip),
+ &error_fatal);
  object_property_set_link(OBJECT(pec), "system-memory",
   OBJECT(get_system_memory()), &error_abort);
  if (!qdev_realize(DEVICE(pec), NULL, errp)) {

Re: [PATCH 10/14] ppc/pnv: Introduce a num_stack class attribute





On 02/12/2021 15:42, Cédric Le Goater wrote:

Each PEC device of the POWER9 chip has a predefined number of stacks,
equivalent of a root port complex:

   PEC0 -> 1 stack
   PEC1 -> 2 stacks
   PEC2 -> 3 stacks

Introduce a class attribute to hold these values and remove the
"num-stacks" property.

Signed-off-by: Cédric Le Goater 
---



Reviewed-by: Frederic Barrat 



  include/hw/pci-host/pnv_phb4.h |  1 +
  hw/pci-host/pnv_phb4_pec.c | 17 -
  hw/ppc/pnv.c   |  7 ---
  3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/hw/pci-host/pnv_phb4.h b/include/hw/pci-host/pnv_phb4.h
index 8a585c9a42f7..60de3031a622 100644
--- a/include/hw/pci-host/pnv_phb4.h
+++ b/include/hw/pci-host/pnv_phb4.h
@@ -223,6 +223,7 @@ struct PnvPhb4PecClass {
  int stk_compat_size;
  uint64_t version;
  uint64_t device_id;
+const uint32_t *num_stacks;
  };
  
  #endif /* PCI_HOST_PNV_PHB4_H */

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index e9750c41c595..293909b5cb90 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -377,11 +377,19 @@ static void pnv_pec_instance_init(Object *obj)
  static void pnv_pec_realize(DeviceState *dev, Error **errp)
  {
  PnvPhb4PecState *pec = PNV_PHB4_PEC(dev);
+PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
  char name[64];
  int i;
  
  assert(pec->system_memory);
  
+if (pec->index >= PNV_CHIP_GET_CLASS(pec->chip)->num_pecs) {

+error_setg(errp, "invalid PEC index: %d", pec->index);
+return;
+}
+
+pec->num_stacks = pecc->num_stacks[pec->index];
+
  /* Create stacks */
  for (i = 0; i < pec->num_stacks; i++) {
  PnvPhb4PecStack *stack = &pec->stacks[i];
@@ -460,7 +468,6 @@ static int pnv_pec_dt_xscom(PnvXScomInterface *dev, void 
*fdt,
  
  static Property pnv_pec_properties[] = {

  DEFINE_PROP_UINT32("index", PnvPhb4PecState, index, 0),
-DEFINE_PROP_UINT32("num-stacks", PnvPhb4PecState, num_stacks, 0),
  DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
  DEFINE_PROP_LINK("chip", PnvPhb4PecState, chip, TYPE_PNV_CHIP,
   PnvChip *),
@@ -479,6 +486,13 @@ static uint32_t pnv_pec_xscom_nest_base(PnvPhb4PecState 
*pec)
  return PNV9_XSCOM_PEC_NEST_BASE + 0x400 * pec->index;
  }
  
+/*

+ * PEC0 -> 1 stack
+ * PEC1 -> 2 stacks
+ * PEC2 -> 3 stacks
+ */
+static const uint32_t pnv_pec_num_stacks[] = { 1, 2, 3 };
+
  static void pnv_pec_class_init(ObjectClass *klass, void *data)
  {
  DeviceClass *dc = DEVICE_CLASS(klass);
@@ -503,6 +517,7 @@ static void pnv_pec_class_init(ObjectClass *klass, void 
*data)
  pecc->stk_compat_size = sizeof(stk_compat);
  pecc->version = PNV_PHB4_VERSION;
  pecc->device_id = PNV_PHB4_DEVICE_ID;
+pecc->num_stacks = pnv_pec_num_stacks;
  }
  
  static const TypeInfo pnv_pec_type_info = {

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 76b2f5468b38..957f0bdfaa6b 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1380,13 +1380,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  uint32_t pec_pci_base;
  
  object_property_set_int(OBJECT(pec), "index", i, &error_fatal);

-/*
- * PEC0 -> 1 stack
- * PEC1 -> 2 stacks
- * PEC2 -> 3 stacks
- */
-object_property_set_int(OBJECT(pec), "num-stacks", i + 1,
-&error_fatal);
  object_property_set_int(OBJECT(pec), "chip-id", chip->chip_id,
  &error_fatal);
  object_property_set_link(OBJECT(pec), "chip", OBJECT(chip),

Re: [PATCH 11/14] ppc/pnv: Compute the PHB index from the PHB4 PEC model





On 02/12/2021 15:42, Cédric Le Goater wrote:

Use the num_stacks class attribute to compute the PHB index depending
on the PEC index :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

Signed-off-by: Cédric Le Goater 
---
  hw/pci-host/pnv_phb4_pec.c | 16 
  hw/ppc/pnv.c   |  4 +---
  2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index 293909b5cb90..a7dd4173d598 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -374,6 +374,19 @@ static void pnv_pec_instance_init(Object *obj)
  }
  }
  
+static int pnv_pec_phb_offset(PnvPhb4PecState *pec)

+{
+PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
+int index = pec->index;
+int offset = 0;
+
+while (index--) {
+offset += pecc->num_stacks[index];
+}
+
+return offset;
+}
+



That seems overly complicated to me and not very readable. The log 
message is a lot more clear. I'd prefer we have a switch() statement 
returning the base PHB ID based on the PEC index.


  Fred




  static void pnv_pec_realize(DeviceState *dev, Error **errp)
  {
  PnvPhb4PecState *pec = PNV_PHB4_PEC(dev);
@@ -394,8 +407,10 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  for (i = 0; i < pec->num_stacks; i++) {
  PnvPhb4PecStack *stack = &pec->stacks[i];
  Object *stk_obj = OBJECT(stack);
+int phb_id = pnv_pec_phb_offset(pec) + i;
  
  object_property_set_int(stk_obj, "stack-no", i, &error_abort);

+object_property_set_int(stk_obj, "phb-id", phb_id, &error_abort);
  object_property_set_link(stk_obj, "pec", OBJECT(pec), &error_abort);
  if (!qdev_realize(DEVICE(stk_obj), NULL, errp)) {
  return;
@@ -538,6 +553,7 @@ static void pnv_pec_stk_instance_init(Object *obj)
  PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(obj);
  
  object_initialize_child(obj, "phb", &stack->phb, TYPE_PNV_PHB4);

+object_property_add_alias(obj, "phb-id", OBJECT(&stack->phb), "index");
  }
  
  static void pnv_pec_stk_realize(DeviceState *dev, Error **errp)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 957f0bdfaa6b..f8b0b2a28383 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1371,7 +1371,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  {
  Pnv9Chip *chip9 = PNV9_CHIP(chip);
  int i, j;
-int phb_id = 0;
  
  for (i = 0; i < chip->num_pecs; i++) {

  PnvPhb4PecState *pec = &chip9->pecs[i];
@@ -1396,11 +1395,10 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);
  pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
  
-for (j = 0; j < pec->num_stacks; j++, phb_id++) {

+for (j = 0; j < pec->num_stacks; j++) {
  PnvPhb4PecStack *stack = &pec->stacks[j];
  Object *obj = OBJECT(&stack->phb);
  
-object_property_set_int(obj, "index", phb_id, &error_fatal);

  object_property_set_int(obj, "chip-id", chip->chip_id,
  &error_fatal);
  object_property_set_int(obj, "version", pecc->version,

Re: [PATCH 12/14] ppc/pnv: Remove "system-memory" property for he PHB4 PEC model





On 02/12/2021 15:42, Cédric Le Goater wrote:

This is not useful and will be in the way for support of user created
PHB4 devices.

Signed-off-by: Cédric Le Goater 
---



I doubt I see all the implications here, but it doesn't look wrong to 
me, so:

Reviewed-by: Frederic Barrat 

  Fred



  hw/pci-host/pnv_phb4_pec.c | 6 +-
  hw/ppc/pnv.c   | 2 --
  2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index a7dd4173d598..dfed2af0f7df 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -124,7 +124,7 @@ static uint64_t pnv_pec_stk_nest_xscom_read(void *opaque, 
hwaddr addr,
  static void pnv_pec_stk_update_map(PnvPhb4PecStack *stack)
  {
  PnvPhb4PecState *pec = stack->pec;
-MemoryRegion *sysmem = pec->system_memory;
+MemoryRegion *sysmem = get_system_memory();
  uint64_t bar_en = stack->nest_regs[PEC_NEST_STK_BAR_EN];
  uint64_t bar, mask, size;
  char name[64];
@@ -394,8 +394,6 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  char name[64];
  int i;
  
-assert(pec->system_memory);

-
  if (pec->index >= PNV_CHIP_GET_CLASS(pec->chip)->num_pecs) {
  error_setg(errp, "invalid PEC index: %d", pec->index);
  return;
@@ -486,8 +484,6 @@ static Property pnv_pec_properties[] = {
  DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
  DEFINE_PROP_LINK("chip", PnvPhb4PecState, chip, TYPE_PNV_CHIP,
   PnvChip *),
-DEFINE_PROP_LINK("system-memory", PnvPhb4PecState, system_memory,
- TYPE_MEMORY_REGION, MemoryRegion *),
  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c

index f8b0b2a28383..3a550eed0f36 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1383,8 +1383,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  &error_fatal);
  object_property_set_link(OBJECT(pec), "chip", OBJECT(chip),
   &error_fatal);
-object_property_set_link(OBJECT(pec), "system-memory",
- OBJECT(get_system_memory()), &error_abort);
  if (!qdev_realize(DEVICE(pec), NULL, errp)) {
  return;
  }

Re: [PATCH 01/14] ppc/pnv: Reduce the maximum of PHB3 devices


On 12/7/21 10:40, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

All POWER8 machines have a maximum of 3 PHB3 devices. Adapt the
PNV8_CHIP_PHB3_MAX definition for consistency.

Signed-off-by: Cédric Le Goater 
---



The Naples chip (Garrison) can have 4 PHBs and it seems we have a power8nvl 
machine type for it. So I guess we should keep a max PHB count of 4 there.


Ah yes. This is the reason behind the 4. I should update the power8nvl
chip then.

Thanks,

C.



   Fred




  include/hw/ppc/pnv.h | 2 +-
  hw/ppc/pnv.c | 6 +++---
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index aa08d79d24de..6f498c8f1b5f 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -79,7 +79,7 @@ struct Pnv8Chip {
  PnvOCC   occ;
  PnvHomer homer;
-#define PNV8_CHIP_PHB3_MAX 4
+#define PNV8_CHIP_PHB3_MAX 3
  PnvPHB3  phbs[PNV8_CHIP_PHB3_MAX];
  XICSFabric    *xics;
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 71e45515f136..bd768dcc28ad 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1256,7 +1256,7 @@ static void pnv_chip_power8e_class_init(ObjectClass 
*klass, void *data)
  k->chip_cfam_id = 0x221ef0498000ull;  /* P8 Murano DD2.1 */
  k->cores_mask = POWER8E_CORE_MASK;
-    k->num_phbs = 3;
+    k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1280,7 +1280,7 @@ static void pnv_chip_power8_class_init(ObjectClass 
*klass, void *data)
  k->chip_cfam_id = 0x220ea0498000ull; /* P8 Venice DD2.0 */
  k->cores_mask = POWER8_CORE_MASK;
-    k->num_phbs = 3;
+    k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;
@@ -1304,7 +1304,7 @@ static void pnv_chip_power8nvl_class_init(ObjectClass 
*klass, void *data)
  k->chip_cfam_id = 0x120d30498000ull;  /* P8 Naples DD1.0 */
  k->cores_mask = POWER8_CORE_MASK;
-    k->num_phbs = 3;
+    k->num_phbs = PNV8_CHIP_PHB3_MAX;
  k->core_pir = pnv_chip_core_pir_p8;
  k->intc_create = pnv_chip_power8_intc_create;
  k->intc_reset = pnv_chip_power8_intc_reset;

Re: [PATCH v4 08/22] target/riscv: Allow AIA device emulation to set ireg rmw callback

2021-12-07 Thread Anup Patel

On Thu, Nov 4, 2021 at 10:23 AM Alistair Francis  wrote:
>
> On Tue, Oct 26, 2021 at 6:00 PM Anup Patel  wrote:
> >
> > The AIA device emulation (such as AIA IMSIC) should be able to set
> > (or provide) AIA ireg read-modify-write callback for each privilege
> > level of a RISC-V HART.
> >
> > Signed-off-by: Anup Patel 
> > ---
> >  target/riscv/cpu.h| 19 +++
> >  target/riscv/cpu_helper.c | 14 ++
> >  2 files changed, 33 insertions(+)
> >
> > diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> > index 7182fadd21..ef4298dc69 100644
> > --- a/target/riscv/cpu.h
> > +++ b/target/riscv/cpu.h
> > @@ -239,6 +239,18 @@ struct CPURISCVState {
> >  uint64_t (*rdtime_fn)(uint32_t);
> >  uint32_t rdtime_fn_arg;
> >
> > +/* machine specific AIA ireg read-modify-write callback */
> > +#define AIA_MAKE_IREG(__isel, __priv, __virt, __vgein) \
> > +((((__vgein) & 0x3f) << 24) | (((__virt) & 0x1) << 20) | \
> > + (((__priv) & 0x3) << 16) | (__isel & 0xffff))
> > +#define AIA_IREG_ISEL(__ireg)  ((__ireg) & 0xffff)
> > +#define AIA_IREG_PRIV(__ireg)  (((__ireg) >> 16) & 0x3)
> > +#define AIA_IREG_VIRT(__ireg)  (((__ireg) >> 20) & 0x1)
> > +#define AIA_IREG_VGEIN(__ireg) (((__ireg) >> 24) & 0x3f)
>
> These should be added when they are used

Actually, these defines help us encode/decode the AIA indirect register number
passed as the "reg" parameter to aia_ireg_rmw_fn() below.

Regards,
Anup

>
> Alistair
>
> > +int (*aia_ireg_rmw_fn[4])(void *arg, target_ulong reg,
> > +target_ulong *val, target_ulong new_val, target_ulong write_mask);
> > +void *aia_ireg_rmw_fn_arg[4];
> > +
> >  /* True if in debugger mode.  */
> >  bool debugger;
> >  #endif
> > @@ -380,6 +392,13 @@ uint32_t riscv_cpu_update_mip(RISCVCPU *cpu, uint32_t 
> > mask, uint32_t value);
> >  #define BOOL_TO_MASK(x) (-!!(x)) /* helper for riscv_cpu_update_mip value 
> > */
> >  void riscv_cpu_set_rdtime_fn(CPURISCVState *env, uint64_t (*fn)(uint32_t),
> >   uint32_t arg);
> > +void riscv_cpu_set_aia_ireg_rmw_fn(CPURISCVState *env, uint32_t priv,
> > +   int (*rmw_fn)(void *arg,
> > + target_ulong reg,
> > + target_ulong *val,
> > + target_ulong new_val,
> > + target_ulong write_mask),
> > +   void *rmw_fn_arg);
> >  #endif
> >  void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv);
> >
> > diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> > index 04df3792a8..d70def1da8 100644
> > --- a/target/riscv/cpu_helper.c
> > +++ b/target/riscv/cpu_helper.c
> > @@ -375,6 +375,20 @@ void riscv_cpu_set_rdtime_fn(CPURISCVState *env, 
> > uint64_t (*fn)(uint32_t),
> >  env->rdtime_fn_arg = arg;
> >  }
> >
> > +void riscv_cpu_set_aia_ireg_rmw_fn(CPURISCVState *env, uint32_t priv,
> > +   int (*rmw_fn)(void *arg,
> > + target_ulong reg,
> > + target_ulong *val,
> > + target_ulong new_val,
> > + target_ulong write_mask),
> > +   void *rmw_fn_arg)
> > +{
> > +if (priv <= PRV_M) {
> > +env->aia_ireg_rmw_fn[priv] = rmw_fn;
> > +env->aia_ireg_rmw_fn_arg[priv] = rmw_fn_arg;
> > +}
> > +}
> > +
> >  void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv)
> >  {
> >  if (newpriv > PRV_M) {
> > --
> > 2.25.1
> >
> >

Re: [PATCH 12/14] ppc/pnv: Remove "system-memory" property for he PHB4 PEC model





On 02/12/2021 15:42, Cédric Le Goater wrote:

This is not useful and will be in the way for support of user created
PHB4 devices.

Signed-off-by: Cédric Le Goater 
---



I forgot to mention the typo in the commit title: "he PHB4".

  Fred



  hw/pci-host/pnv_phb4_pec.c | 6 +-
  hw/ppc/pnv.c   | 2 --
  2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index a7dd4173d598..dfed2af0f7df 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -124,7 +124,7 @@ static uint64_t pnv_pec_stk_nest_xscom_read(void *opaque, 
hwaddr addr,
  static void pnv_pec_stk_update_map(PnvPhb4PecStack *stack)
  {
  PnvPhb4PecState *pec = stack->pec;
-MemoryRegion *sysmem = pec->system_memory;
+MemoryRegion *sysmem = get_system_memory();
  uint64_t bar_en = stack->nest_regs[PEC_NEST_STK_BAR_EN];
  uint64_t bar, mask, size;
  char name[64];
@@ -394,8 +394,6 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  char name[64];
  int i;
  
-assert(pec->system_memory);

-
  if (pec->index >= PNV_CHIP_GET_CLASS(pec->chip)->num_pecs) {
  error_setg(errp, "invalid PEC index: %d", pec->index);
  return;
@@ -486,8 +484,6 @@ static Property pnv_pec_properties[] = {
  DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
  DEFINE_PROP_LINK("chip", PnvPhb4PecState, chip, TYPE_PNV_CHIP,
   PnvChip *),
-DEFINE_PROP_LINK("system-memory", PnvPhb4PecState, system_memory,
- TYPE_MEMORY_REGION, MemoryRegion *),
  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c

index f8b0b2a28383..3a550eed0f36 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1383,8 +1383,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  &error_fatal);
  object_property_set_link(OBJECT(pec), "chip", OBJECT(chip),
   &error_fatal);
-object_property_set_link(OBJECT(pec), "system-memory",
- OBJECT(get_system_memory()), &error_abort);
  if (!qdev_realize(DEVICE(pec), NULL, errp)) {
  return;
  }

Re: [PATCH 14/14] ppc/pnv: Introduce support for user created PHB4 devices





On 02/12/2021 15:42, Cédric Le Goater wrote:

PHB4 devices and PCI devices can now be added to the powernv9 machine
using:

   -device pnv-phb4-pec,chip-id=0,index=0
   -device nec-usb-xhci,bus=pci.0,addr=0x0

In case of user created devices, a lookup on 'chip-id' is required to
assign the owning chip.

Note that a PEC PHB4 device can add more than one PHB4
device:

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

Signed-off-by: Cédric Le Goater 
---



Reviewed-by: Frederic Barrat 



  hw/pci-host/pnv_phb4_pec.c | 19 ++-
  1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index 9b081d543057..4ee92f11945c 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -394,6 +394,17 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  char name[64];
  int i;
  
+/* User created devices */

+if (!pec->chip) {
+PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
+
+pec->chip = pnv_get_chip(pnv, pec->chip_id);
+if (!pec->chip) {
+error_setg(errp, "invalid chip id: %d", pec->chip_id);
+return;
+}
+}
+
  if (pec->index >= PNV_CHIP_GET_CLASS(pec->chip)->num_pecs) {
  error_setg(errp, "invalid PEC index: %d", pec->index);
  return;
@@ -401,6 +412,12 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  
  pec->num_stacks = pecc->num_stacks[pec->index];
  
+/*

+ * Reparent user created devices to the chip to build correctly
+ * the device tree.
+ */
+pnv_chip_parent_fixup(pec->chip, OBJECT(pec), pec->index);
+
  /* Create stacks */
  for (i = 0; i < pec->num_stacks; i++) {
  PnvPhb4PecStack *stack = &pec->stacks[i];
@@ -516,7 +533,7 @@ static void pnv_pec_class_init(ObjectClass *klass, void 
*data)
  
  dc->realize = pnv_pec_realize;

  device_class_set_props(dc, pnv_pec_properties);
-dc->user_creatable = false;
+dc->user_creatable = true;
  
  pecc->xscom_nest_base = pnv_pec_xscom_nest_base;

  pecc->xscom_pci_base  = pnv_pec_xscom_pci_base;

Re: [PATCH 13/14] ppc/pnv: Move realize of PEC stacks under the PEC model





On 02/12/2021 15:42, Cédric Le Goater wrote:

This change will help us provide support for user created PHB4
devices.

Signed-off-by: Cédric Le Goater 
---
  hw/pci-host/pnv_phb4_pec.c | 36 
  hw/ppc/pnv.c   | 31 +--
  2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index dfed2af0f7df..9b081d543057 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -556,6 +556,10 @@ static void pnv_pec_stk_realize(DeviceState *dev, Error 
**errp)
  {
  PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(dev);
  PnvPhb4PecState *pec = stack->pec;
+PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
+PnvChip *chip = pec->chip;
+uint32_t pec_nest_base;
+uint32_t pec_pci_base;
  char name[64];
  
  assert(pec);

@@ -579,10 +583,34 @@ static void pnv_pec_stk_realize(DeviceState *dev, Error 
**errp)
  pnv_xscom_region_init(&stack->phb_regs_mr, OBJECT(&stack->phb),
&pnv_phb4_xscom_ops, &stack->phb, name, 0x40);
  
-/*

- * Let the machine/chip realize the PHB object to customize more
- * easily some fields
- */
+{
+Object *obj = OBJECT(&stack->phb);
+
+object_property_set_int(obj, "chip-id", pec->chip_id, &error_fatal);
+object_property_set_int(obj, "version", pecc->version, &error_fatal);
+object_property_set_int(obj, "device-id", pecc->device_id,
+&error_fatal);
+object_property_set_link(obj, "stack", OBJECT(stack),
+ &error_abort);
+if (!sysbus_realize(SYS_BUS_DEVICE(obj), errp)) {
+return;
+}
+}



Do we really need the extra sub-scope here? It looks off.



+
+pec_nest_base = pecc->xscom_nest_base(pec);
+pec_pci_base = pecc->xscom_pci_base(pec);
+
+/* Populate the XSCOM address space. */
+pnv_xscom_add_subregion(chip,
+pec_nest_base + 0x40 * (stack->stack_no + 1),
+&stack->nest_regs_mr);
+pnv_xscom_add_subregion(chip,
+pec_pci_base + 0x40 * (stack->stack_no + 1),
+&stack->pci_regs_mr);
+pnv_xscom_add_subregion(chip,
+pec_pci_base + PNV9_XSCOM_PEC_PCI_STK0 +
+0x40 * stack->stack_no,
+&stack->phb_regs_mr);
  }
  
  static Property pnv_pec_stk_properties[] = {

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 3a550eed0f36..7e13b15241fd 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1370,7 +1370,7 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error 
**errp)
  static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
  {



With that change, we should really rename pnv_chip_power9_phb_realize() 
to pnv_chip_power9_pec_realize().


  Fred



  Pnv9Chip *chip9 = PNV9_CHIP(chip);
-int i, j;
+int i;
  
  for (i = 0; i < chip->num_pecs; i++) {

  PnvPhb4PecState *pec = &chip9->pecs[i];
@@ -1392,35 +1392,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  
  pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);

  pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
-
-for (j = 0; j < pec->num_stacks; j++) {
-PnvPhb4PecStack *stack = &pec->stacks[j];
-Object *obj = OBJECT(&stack->phb);
-
-object_property_set_int(obj, "chip-id", chip->chip_id,
-&error_fatal);
-object_property_set_int(obj, "version", pecc->version,
-&error_fatal);
-object_property_set_int(obj, "device-id", pecc->device_id,
-&error_fatal);
-object_property_set_link(obj, "stack", OBJECT(stack),
- &error_abort);
-if (!sysbus_realize(SYS_BUS_DEVICE(obj), errp)) {
-return;
-}
-
-/* Populate the XSCOM address space. */
-pnv_xscom_add_subregion(chip,
-   pec_nest_base + 0x40 * (stack->stack_no + 
1),
-   &stack->nest_regs_mr);
-pnv_xscom_add_subregion(chip,
-pec_pci_base + 0x40 * (stack->stack_no + 
1),
-&stack->pci_regs_mr);
-pnv_xscom_add_subregion(chip,
-pec_pci_base + PNV9_XSCOM_PEC_PCI_STK0 +
-0x40 * stack->stack_no,
-&stack->phb_regs_mr);
-}
  }
  }

Re: [PATCH 04/14] ppc/pnv: Introduce support for user created PHB3 devices


On 12/7/21 10:47, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

PHB3 devices and PCI devices can now be added to the powernv8 machine
using :

   -device pnv-phb3,chip-id=0,index=1 \
   -device nec-usb-xhci,bus=pci.1,addr=0x0

The 'index' property identifies the PHB3 in the chip. In case of user
created devices, a lookup on 'chip-id' is required to assign the
owning chip.

Signed-off-by: Cédric Le Goater 
---



diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index de277c457838..d7fe92cb082d 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1097,14 +1097,14 @@ static void pnv_chip_power8_instance_init(Object *obj)
  object_initialize_child(obj, "homer", &chip8->homer, TYPE_PNV8_HOMER);
-    for (i = 0; i < pcc->num_phbs; i++) {
+    if (defaults_enabled()) {
+    chip->num_phbs = pcc->num_phbs;
+    }
+
+    for (i = 0; i < chip->num_phbs; i++) {
  object_initialize_child(obj, "phb[*]", &chip8->phbs[i], 
TYPE_PNV_PHB3);
  }
-    /*
- * Number of PHBs is the chip default
- */
-    chip->num_phbs = pcc->num_phbs;
  }



So if "-nodefaults" is mentioned on the command line, then chip->num_phbs is not 
set. It seems the intention is to have only the PHBs defined on the CLI, which is fine. 
However, I don't see where chip->num_phbs is incremented in that case.


Good catch :) That's why we need another patch fixing all this because
it is breaking the XICS fabric handlers, ics_get and ics_resend.
'info pic' is impacted also.

Here is the proposed fix for v2 :

 
https://github.com/legoater/qemu/commit/b47bce3109f316a65aa2fa2a46651b2960e93fca

I chose to loop on the children of the chip to find the user
created devices and leave the PnvChip model with empty defaults.


'info pic' is impacted the same on P9

  
https://github.com/legoater/qemu/commit/d4733edca94c95f717f4ee35bbea6dc085365286

Thanks,

C.

[PATCH v7 1/7] net/vmnet: add vmnet dependency and customizable option

Signed-off-by: Vladislav Yaroshchuk 
---
 meson.build   | 4 
 meson_options.txt | 2 ++
 scripts/meson-buildoptions.sh | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/meson.build b/meson.build
index 96de1a6ef9..ce8acf6ada 100644
--- a/meson.build
+++ b/meson.build
@@ -481,6 +481,8 @@ if cocoa.found() and get_option('gtk').enabled()
   error('Cocoa and GTK+ cannot be enabled at the same time')
 endif
 
+vmnet = dependency('appleframeworks', modules: 'vmnet', required: 
get_option('vmnet'))
+
 seccomp = not_found
 if not get_option('seccomp').auto() or have_system or have_tools
   seccomp = dependency('libseccomp', version: '>=2.3.0',
@@ -1461,6 +1463,7 @@ config_host_data.set('CONFIG_SECCOMP', seccomp.found())
 config_host_data.set('CONFIG_SNAPPY', snappy.found())
 config_host_data.set('CONFIG_USB_LIBUSB', libusb.found())
 config_host_data.set('CONFIG_VDE', vde.found())
+config_host_data.set('CONFIG_VMNET', vmnet.found())
 config_host_data.set('CONFIG_VHOST_USER_BLK_SERVER', 
have_vhost_user_blk_server)
 config_host_data.set('CONFIG_VNC', vnc.found())
 config_host_data.set('CONFIG_VNC_JPEG', jpeg.found())
@@ -3397,6 +3400,7 @@ endif
 summary_info += {'JACK support':  jack}
 summary_info += {'brlapi support':brlapi}
 summary_info += {'vde support':   vde}
+summary_info += {'vmnet.framework support': vmnet}
 summary_info += {'netmap support':have_netmap}
 summary_info += {'l2tpv3 support':have_l2tpv3}
 summary_info += {'Linux AIO support': libaio}
diff --git a/meson_options.txt b/meson_options.txt
index e392323732..0538d48a85 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -147,6 +147,8 @@ option('netmap', type : 'feature', value : 'auto',
description: 'netmap network backend support')
 option('vde', type : 'feature', value : 'auto',
description: 'vde network backend support')
+option('vmnet', type : 'feature', value : 'auto',
+   description: 'vmnet.framework network backend support')
 option('virglrenderer', type : 'feature', value : 'auto',
description: 'virgl rendering support')
 option('vnc', type : 'feature', value : 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 7a17ff4218..13da30f018 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -82,6 +82,7 @@ meson_options_help() {
   printf "%s\n" '  u2f U2F emulation support'
   printf "%s\n" '  usb-redir   libusbredir support'
   printf "%s\n" '  vde vde network backend support'
+  printf "%s\n" '  vmnet   vmnet.framework network backend support'
   printf "%s\n" '  vhost-user-blk-server'
   printf "%s\n" '  build vhost-user-blk server'
   printf "%s\n" '  virglrenderer   virgl rendering support'
@@ -242,6 +243,8 @@ _meson_option_parse() {
 --disable-usb-redir) printf "%s" -Dusb_redir=disabled ;;
 --enable-vde) printf "%s" -Dvde=enabled ;;
 --disable-vde) printf "%s" -Dvde=disabled ;;
+--enable-vmnet) printf "%s" -Dvmnet=enabled ;;
+--disable-vmnet) printf "%s" -Dvmnet=disabled ;;
 --enable-vhost-user-blk-server) printf "%s" 
-Dvhost_user_blk_server=enabled ;;
 --disable-vhost-user-blk-server) printf "%s" 
-Dvhost_user_blk_server=disabled ;;
 --enable-virglrenderer) printf "%s" -Dvirglrenderer=enabled ;;
-- 
2.23.0

[PATCH v7 4/7] net/vmnet: implement host mode (vmnet-host)

Signed-off-by: Vladislav Yaroshchuk 
---
 net/vmnet-host.c | 93 
 1 file changed, 87 insertions(+), 6 deletions(-)

diff --git a/net/vmnet-host.c b/net/vmnet-host.c
index 4a5ef99dc7..9c2e760ed1 100644
--- a/net/vmnet-host.c
+++ b/net/vmnet-host.c
@@ -9,16 +9,97 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/uuid.h"
 #include "qapi/qapi-types-net.h"
-#include "vmnet_int.h"
-#include "clients.h"
-#include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "clients.h"
+#include "vmnet_int.h"
 
 #include 
 
+typedef struct VmnetHostState {
+  VmnetCommonState cs;
+  QemuUUID network_uuid;
+} VmnetHostState;
+
+static xpc_object_t create_if_desc(const Netdev *netdev,
+   NetClientState *nc,
+   Error **errp)
+{
+const NetdevVmnetHostOptions *options = &(netdev->u.vmnet_host);
+VmnetCommonState *cs = DO_UPCAST(VmnetCommonState, nc, nc);
+VmnetHostState *hs = DO_UPCAST(VmnetHostState, cs, cs);
+
+xpc_object_t if_desc = xpc_dictionary_create(NULL, NULL, 0);
+
+xpc_dictionary_set_uint64(
+if_desc,
+vmnet_operation_mode_key,
+VMNET_HOST_MODE
+);
+
+xpc_dictionary_set_bool(
+if_desc,
+vmnet_enable_isolation_key,
+options->isolated
+);
+
+if (options->has_net_uuid) {
+if (qemu_uuid_parse(options->net_uuid, &hs->network_uuid) < 0) {
+error_setg(errp, "Invalid UUID provided in 'net-uuid'");
+}
+
+xpc_dictionary_set_uuid(
+if_desc,
+vmnet_network_identifier_key,
+hs->network_uuid.data
+);
+}
+
+if (options->has_start_address ||
+options->has_end_address ||
+options->has_subnet_mask) {
+
+if (options->has_start_address &&
+options->has_end_address &&
+options->has_subnet_mask) {
+
+xpc_dictionary_set_string(if_desc,
+  vmnet_start_address_key,
+  options->start_address);
+xpc_dictionary_set_string(if_desc,
+  vmnet_end_address_key,
+  options->end_address);
+xpc_dictionary_set_string(if_desc,
+  vmnet_subnet_mask_key,
+  options->subnet_mask);
+} else {
+error_setg(
+errp,
+"'start-address', 'end-address', 'subnet_mask' "
+"should be provided together"
+);
+}
+}
+
+return if_desc;
+}
+
+static NetClientInfo net_vmnet_host_info = {
+.type = NET_CLIENT_DRIVER_VMNET_HOST,
+.size = sizeof(VmnetHostState),
+.receive = vmnet_receive_common,
+.cleanup = vmnet_cleanup_common,
+};
+
 int net_init_vmnet_host(const Netdev *netdev, const char *name,
-NetClientState *peer, Error **errp) {
-  error_setg(errp, "vmnet-host is not implemented yet");
-  return -1;
+NetClientState *peer, Error **errp)
+{
+NetClientState *nc;
+xpc_object_t if_desc;
+
+nc = qemu_new_net_client(&net_vmnet_host_info,
+ peer, "vmnet-host", name);
+if_desc = create_if_desc(netdev, nc, errp);
+return vmnet_if_create(nc, if_desc, errp, NULL);
 }
-- 
2.23.0

[PATCH v7 5/7] net/vmnet: implement bridged mode (vmnet-bridged)

Signed-off-by: Vladislav Yaroshchuk 
---
 net/vmnet-bridged.m | 98 ++---
 1 file changed, 92 insertions(+), 6 deletions(-)

diff --git a/net/vmnet-bridged.m b/net/vmnet-bridged.m
index 4e42a90391..3c9da9dc8b 100644
--- a/net/vmnet-bridged.m
+++ b/net/vmnet-bridged.m
@@ -10,16 +10,102 @@
 
 #include "qemu/osdep.h"
 #include "qapi/qapi-types-net.h"
-#include "vmnet_int.h"
-#include "clients.h"
-#include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "clients.h"
+#include "vmnet_int.h"
 
 #include 
 
+typedef struct VmnetBridgedState {
+  VmnetCommonState cs;
+} VmnetBridgedState;
+
+static bool validate_ifname(const char *ifname)
+{
+xpc_object_t shared_if_list = vmnet_copy_shared_interface_list();
+__block bool match = false;
+
+xpc_array_apply(
+shared_if_list,
+^bool(size_t index, xpc_object_t value) {
+  if (strcmp(xpc_string_get_string_ptr(value), ifname) == 0) {
+  match = true;
+  return false;
+  }
+  return true;
+});
+
+return match;
+}
+
+static const char *get_valid_ifnames(void)
+{
+xpc_object_t shared_if_list = vmnet_copy_shared_interface_list();
+__block char *if_list = NULL;
+
+xpc_array_apply(
+shared_if_list,
+^bool(size_t index, xpc_object_t value) {
+  if_list = g_strconcat(xpc_string_get_string_ptr(value),
+" ",
+if_list,
+NULL);
+  return true;
+});
+
+if (if_list) {
+return if_list;
+}
+return "[no interfaces]";
+}
+
+static xpc_object_t create_if_desc(const Netdev *netdev, Error **errp)
+{
+const NetdevVmnetBridgedOptions *options = &(netdev->u.vmnet_bridged);
+xpc_object_t if_desc = xpc_dictionary_create(NULL, NULL, 0);
+
+xpc_dictionary_set_uint64(
+if_desc,
+vmnet_operation_mode_key,
+VMNET_BRIDGED_MODE
+);
+
+xpc_dictionary_set_bool(
+if_desc,
+vmnet_enable_isolation_key,
+options->isolated
+);
+
+if (validate_ifname(options->ifname)) {
+xpc_dictionary_set_string(if_desc,
+  vmnet_shared_interface_name_key,
+  options->ifname);
+} else {
+return NULL;
+}
+return if_desc;
+}
+
+static NetClientInfo net_vmnet_bridged_info = {
+.type = NET_CLIENT_DRIVER_VMNET_BRIDGED,
+.size = sizeof(VmnetBridgedState),
+.receive = vmnet_receive_common,
+.cleanup = vmnet_cleanup_common,
+};
+
 int net_init_vmnet_bridged(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp)
 {
-  error_setg(errp, "vmnet-bridged is not implemented yet");
-  return -1;
-}
+NetClientState *nc = qemu_new_net_client(&net_vmnet_bridged_info,
+ peer, "vmnet-bridged", name);
+xpc_object_t if_desc = create_if_desc(netdev, errp);;
+
+if (!if_desc) {
+error_setg(errp,
+   "unsupported ifname, should be one of: %s",
+   get_valid_ifnames());
+return -1;
+}
+
+return vmnet_if_create(nc, if_desc, errp, NULL);
+}
\ No newline at end of file
-- 
2.23.0

[PATCH v7 0/7] Add vmnet.framework based network backend

macOS provides networking API for VMs called 'vmnet.framework':
https://developer.apple.com/documentation/vmnet

We can provide its support as the new QEMU network backends which
represent three different vmnet.framework interface usage modes:

  * `vmnet-shared`:
allows the guest to communicate with other guests in shared mode and
also with external network (Internet) via NAT. Has (macOS-provided)
DHCP server; subnet mask and IP range can be configured;

  * `vmnet-host`:
allows the guest to communicate with other guests in host mode.
By default it has DHCP enabled, like `vmnet-shared`, but providing a
network unique id (uuid) makes `vmnet-host` interfaces isolated
from each other and also disables DHCP.

  * `vmnet-bridged`:
bridges the guest with a physical network interface.

These backends cannot work on macOS Catalina 10.15 because we use
vmnet.framework API provided only with macOS 11 and newer. This does
not seem to be a problem, because QEMU guarantees to work on the two most
recent versions of macOS, which now are Big Sur (11) and Monterey (12).

Also, we have one inconvenient restriction: vmnet.framework interfaces
can be created only by a privileged user:
`$ sudo qemu-system-x86_64 -nic vmnet-shared`

An attempt to create a `vmnet-*` netdev as an unprivileged user fails with
vmnet's 'general failure'.

This happens because vmnet.framework requires `com.apple.vm.networking`
entitlement which is: "restricted to developers of virtualization software.
To request this entitlement, contact your Apple representative." as Apple
documentation says:
https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_vm_networking

One more note: some quite useful 'vmnet.framework' features are still
not supported, such as creating port forwarding rules, specifying an
IPv6 NAT prefix and so on.

Nevertheless, the new backends work fine and were tested with
`qemu-system-x86_64` on a macOS Big Sur 11.5.2 host with the following
NIC models:
  * e1000-82545em
  * virtio-net-pci
  * vmxnet3

The guests were:
  * macOS 10.15.7
  * Ubuntu Bionic (server cloudimg)


This series partially reuses patches by Phillip Tennen:
https://patchew.org/QEMU/20210218134947.1860-1-phillip.en...@gmail.com/
So I included their Signed-off-by line in one of the commit messages and
also here.

v1 -> v2:
 Since v1 minor typos were fixed, patches rebased onto latest master,
 redundant changes removed (small commits squashed)
v2 -> v3:
 - QAPI style fixes
 - Typos fixes in comments
 - `#include`'s updated to be in sync with recent master
v3 -> v4:
 - Support vmnet interfaces isolation feature
 - Support vmnet-host network uuid setting feature
 - Refactored sources a bit
v4 -> v5:
 - Missed 6.2 boat, now 7.0 candidate
 - Fix qapi netdev descriptions and styles
   (@subnetmask -> @subnet-mask)
 - Support vmnet-shared IPv6 prefix setting feature
v5 -> v6
 - provide detailed commit messages for commits of
   many changes
 - rename properties @dhcpstart and @dhcpend to
   @start-address and @end-address
 - improve qapi documentation about isolation
   features (@isolated, @net-uuid)
v6 -> v7:
 - update MAINTAINERS list


Vladislav Yaroshchuk (7):
  net/vmnet: add vmnet dependency and customizable option
  net/vmnet: add vmnet backends to qapi/net
  net/vmnet: implement shared mode (vmnet-shared)
  net/vmnet: implement host mode (vmnet-host)
  net/vmnet: implement bridged mode (vmnet-bridged)
  net/vmnet: update qemu-options.hx
  net/vmnet: update MAINTAINERS list

 MAINTAINERS   |   5 +
 meson.build   |   4 +
 meson_options.txt |   2 +
 net/clients.h |  11 ++
 net/meson.build   |   7 +
 net/net.c |  10 ++
 net/vmnet-bridged.m   | 111 
 net/vmnet-common.m| 330 ++
 net/vmnet-host.c  | 105 +++
 net/vmnet-shared.c|  92 ++
 net/vmnet_int.h   |  48 +
 qapi/net.json | 132 +-
 qemu-options.hx   |  25 +++
 scripts/meson-buildoptions.sh |   3 +
 14 files changed, 883 insertions(+), 2 deletions(-)
 create mode 100644 net/vmnet-bridged.m
 create mode 100644 net/vmnet-common.m
 create mode 100644 net/vmnet-host.c
 create mode 100644 net/vmnet-shared.c
 create mode 100644 net/vmnet_int.h

-- 
2.23.0

[PATCH v7 2/7] net/vmnet: add vmnet backends to qapi/net

Create separate netdevs for each vmnet operating mode:
- vmnet-host
- vmnet-shared
- vmnet-bridged

Signed-off-by: Vladislav Yaroshchuk 
---
 net/clients.h   |  11 
 net/meson.build |   7 +++
 net/net.c   |  10 
 net/vmnet-bridged.m |  25 +
 net/vmnet-common.m  |  20 +++
 net/vmnet-host.c|  24 
 net/vmnet-shared.c  |  25 +
 net/vmnet_int.h |  25 +
 qapi/net.json   | 132 +++-
 9 files changed, 277 insertions(+), 2 deletions(-)
 create mode 100644 net/vmnet-bridged.m
 create mode 100644 net/vmnet-common.m
 create mode 100644 net/vmnet-host.c
 create mode 100644 net/vmnet-shared.c
 create mode 100644 net/vmnet_int.h

diff --git a/net/clients.h b/net/clients.h
index 92f9b59aed..c9157789f2 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -63,4 +63,15 @@ int net_init_vhost_user(const Netdev *netdev, const char 
*name,
 
 int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
 NetClientState *peer, Error **errp);
+#ifdef CONFIG_VMNET
+int net_init_vmnet_host(const Netdev *netdev, const char *name,
+  NetClientState *peer, Error **errp);
+
+int net_init_vmnet_shared(const Netdev *netdev, const char *name,
+  NetClientState *peer, Error **errp);
+
+int net_init_vmnet_bridged(const Netdev *netdev, const char *name,
+  NetClientState *peer, Error **errp);
+#endif /* CONFIG_VMNET */
+
 #endif /* QEMU_NET_CLIENTS_H */
diff --git a/net/meson.build b/net/meson.build
index 847bc2ac85..00a88c4951 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -42,4 +42,11 @@ softmmu_ss.add(when: 'CONFIG_POSIX', if_true: 
files(tap_posix))
 softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c'))
 softmmu_ss.add(when: 'CONFIG_VHOST_NET_VDPA', if_true: files('vhost-vdpa.c'))
 
+vmnet_files = files(
+  'vmnet-common.m',
+  'vmnet-bridged.m',
+  'vmnet-host.c',
+  'vmnet-shared.c'
+)
+softmmu_ss.add(when: vmnet, if_true: vmnet_files)
 subdir('can')
diff --git a/net/net.c b/net/net.c
index f0d14dbfc1..1dbb64b935 100644
--- a/net/net.c
+++ b/net/net.c
@@ -1021,6 +1021,11 @@ static int (* const 
net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
 #ifdef CONFIG_L2TPV3
 [NET_CLIENT_DRIVER_L2TPV3]= net_init_l2tpv3,
 #endif
+#ifdef CONFIG_VMNET
+[NET_CLIENT_DRIVER_VMNET_HOST] = net_init_vmnet_host,
+[NET_CLIENT_DRIVER_VMNET_SHARED] = net_init_vmnet_shared,
+[NET_CLIENT_DRIVER_VMNET_BRIDGED] = net_init_vmnet_bridged,
+#endif /* CONFIG_VMNET */
 };
 
 
@@ -1106,6 +,11 @@ void show_netdevs(void)
 #endif
 #ifdef CONFIG_VHOST_VDPA
 "vhost-vdpa",
+#endif
+#ifdef CONFIG_VMNET
+"vmnet-host",
+"vmnet-shared",
+"vmnet-bridged",
 #endif
 };
 
diff --git a/net/vmnet-bridged.m b/net/vmnet-bridged.m
new file mode 100644
index 00..4e42a90391
--- /dev/null
+++ b/net/vmnet-bridged.m
@@ -0,0 +1,25 @@
+/*
+ * vmnet-bridged.m
+ *
+ * Copyright(c) 2021 Vladislav Yaroshchuk 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qapi-types-net.h"
+#include "vmnet_int.h"
+#include "clients.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+#include 
+
+int net_init_vmnet_bridged(const Netdev *netdev, const char *name,
+   NetClientState *peer, Error **errp)
+{
+  error_setg(errp, "vmnet-bridged is not implemented yet");
+  return -1;
+}
diff --git a/net/vmnet-common.m b/net/vmnet-common.m
new file mode 100644
index 00..532d152840
--- /dev/null
+++ b/net/vmnet-common.m
@@ -0,0 +1,20 @@
+/*
+ * vmnet-common.m - network client wrapper for Apple vmnet.framework
+ *
+ * Copyright(c) 2021 Vladislav Yaroshchuk 
+ * Copyright(c) 2021 Phillip Tennen 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qapi-types-net.h"
+#include "vmnet_int.h"
+#include "clients.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+#include 
+
diff --git a/net/vmnet-host.c b/net/vmnet-host.c
new file mode 100644
index 00..4a5ef99dc7
--- /dev/null
+++ b/net/vmnet-host.c
@@ -0,0 +1,24 @@
+/*
+ * vmnet-host.c
+ *
+ * Copyright(c) 2021 Vladislav Yaroshchuk 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qapi-types-net.h"
+#include "vmnet_int.h"
+#include "clients.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+#include 
+
+int net_init_vmnet_host(const Netdev *netdev, const char *name,
+NetClientState *peer, Error **errp) {
+  error_setg(errp, "vmnet-host is not implemented yet");
+  return -1;
+}

Re: [PATCH 08/14] ppc/pnv: Introduce version and device_id class atributes for PHB4 devices


On 12/7/21 11:01, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

Signed-off-by: Cédric Le Goater 
---



Empty log message ok in qemu?


checkpatch didn't complain :) I might make an effort for v2.

Thanks,

C.



But it looks ok to me.
Reviewed-by: Frederic Barrat 



  include/hw/pci-host/pnv_phb4.h | 2 ++
  hw/pci-host/pnv_phb4_pec.c | 2 ++
  hw/ppc/pnv.c   | 4 ++--
  3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/hw/pci-host/pnv_phb4.h b/include/hw/pci-host/pnv_phb4.h
index 27556ae53425..b2864233641e 100644
--- a/include/hw/pci-host/pnv_phb4.h
+++ b/include/hw/pci-host/pnv_phb4.h
@@ -219,6 +219,8 @@ struct PnvPhb4PecClass {
  int compat_size;
  const char *stk_compat;
  int stk_compat_size;
+    uint64_t version;
+    uint64_t device_id;
  };
  #endif /* PCI_HOST_PNV_PHB4_H */
diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index 741ddc90ed8d..9f722729ac50 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -499,6 +499,8 @@ static void pnv_pec_class_init(ObjectClass *klass, void 
*data)
  pecc->compat_size = sizeof(compat);
  pecc->stk_compat = stk_compat;
  pecc->stk_compat_size = sizeof(stk_compat);
+    pecc->version = PNV_PHB4_VERSION;
+    pecc->device_id = PNV_PHB4_DEVICE_ID;
  }
  static const TypeInfo pnv_pec_type_info = {
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 185464a1d443..0c65e1e88cf5 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1408,9 +1408,9 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  object_property_set_int(obj, "index", phb_id, &error_fatal);
  object_property_set_int(obj, "chip-id", chip->chip_id,
  &error_fatal);
-    object_property_set_int(obj, "version", PNV_PHB4_VERSION,
+    object_property_set_int(obj, "version", pecc->version,
  &error_fatal);
-    object_property_set_int(obj, "device-id", PNV_PHB4_DEVICE_ID,
+    object_property_set_int(obj, "device-id", pecc->device_id,
  &error_fatal);
  object_property_set_link(obj, "stack", OBJECT(stack),
   &error_abort);

[PATCH v7 3/7] net/vmnet: implement shared mode (vmnet-shared)

Interaction with vmnet.framework in different modes
differs only on configuration stage, so we can create
common `send`, `receive`, etc. procedures and reuse them.

vmnet.framework supports iov, but writing more than
one iov into vmnet interface fails with
'VMNET_INVALID_ARGUMENT'. Collecting provided iovs into
one and passing it to vmnet works fine. That's the
reason why receive_iov() is left unimplemented. But it still
works with good enough performance having .receive()
implemented only.

Also, there is no way to unsubscribe from receiving vmnet
packets other than registering and unregistering the event
callback, or simply dropping packets by ignoring and
not processing them when the related flag is set. Here we
use the second way.

Signed-off-by: Phillip Tennen 
Signed-off-by: Vladislav Yaroshchuk 
---
 net/vmnet-common.m | 310 +
 net/vmnet-shared.c |  75 ++-
 net/vmnet_int.h|  23 
 3 files changed, 404 insertions(+), 4 deletions(-)

diff --git a/net/vmnet-common.m b/net/vmnet-common.m
index 532d152840..6d474af4be 100644
--- a/net/vmnet-common.m
+++ b/net/vmnet-common.m
@@ -10,6 +10,8 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/log.h"
 #include "qapi/qapi-types-net.h"
 #include "vmnet_int.h"
 #include "clients.h"
@@ -17,4 +19,312 @@
 #include "qapi/error.h"
 
 #include 
+#include 
 
+#ifdef DEBUG
+#define D(x) x
+#define D_LOG(...) qemu_log(__VA_ARGS__)
+#else
+#define D(x) do { } while (0)
+#define D_LOG(...) do { } while (0)
+#endif
+
+typedef struct vmpktdesc vmpktdesc_t;
+typedef struct iovec iovec_t;
+
+static void vmnet_set_send_enabled(VmnetCommonState *s, bool enable)
+{
+s->send_enabled = enable;
+}
+
+
+static void vmnet_send_completed(NetClientState *nc, ssize_t len)
+{
+VmnetCommonState *s = DO_UPCAST(VmnetCommonState, nc, nc);
+vmnet_set_send_enabled(s, true);
+}
+
+
+static void vmnet_send(NetClientState *nc,
+   interface_event_t event_id,
+   xpc_object_t event)
+{
+assert(event_id == VMNET_INTERFACE_PACKETS_AVAILABLE);
+
+VmnetCommonState *s;
+uint64_t packets_available;
+
+struct iovec *iov;
+struct vmpktdesc *packets;
+int pkt_cnt;
+int i;
+
+vmnet_return_t if_status;
+ssize_t size;
+
+s = DO_UPCAST(VmnetCommonState, nc, nc);
+
+packets_available = xpc_dictionary_get_uint64(
+event,
+vmnet_estimated_packets_available_key
+);
+
+pkt_cnt = (packets_available < VMNET_PACKETS_LIMIT) ?
+  packets_available :
+  VMNET_PACKETS_LIMIT;
+
+
+iov = s->iov_buf;
+packets = s->packets_buf;
+
+for (i = 0; i < pkt_cnt; ++i) {
+packets[i].vm_pkt_size = s->max_packet_size;
+packets[i].vm_pkt_iovcnt = 1;
+packets[i].vm_flags = 0;
+}
+
+if_status = vmnet_read(s->vmnet_if, packets, &pkt_cnt);
+if (if_status != VMNET_SUCCESS) {
+error_printf("vmnet: read failed: %s\n",
+ vmnet_status_map_str(if_status));
+}
+qemu_mutex_lock_iothread();
+for (i = 0; i < pkt_cnt; ++i) {
+size = qemu_send_packet_async(nc,
+  iov[i].iov_base,
+  packets[i].vm_pkt_size,
+  vmnet_send_completed);
+if (size == 0) {
+vmnet_set_send_enabled(s, false);
+} else if (size < 0) {
+break;
+}
+}
+qemu_mutex_unlock_iothread();
+
+}
+
+
+static void vmnet_register_event_callback(VmnetCommonState *s)
+{
+dispatch_queue_t avail_pkt_q = dispatch_queue_create(
+"org.qemu.vmnet.if_queue",
+DISPATCH_QUEUE_SERIAL
+);
+
+vmnet_interface_set_event_callback(
+s->vmnet_if,
+VMNET_INTERFACE_PACKETS_AVAILABLE,
+avail_pkt_q,
+^(interface_event_t event_id, xpc_object_t event) {
+  if (s->send_enabled) {
+  vmnet_send(&s->nc, event_id, event);
+  }
+});
+}
+
+
+static void vmnet_bufs_init(VmnetCommonState *s)
+{
+int i;
+struct vmpktdesc *packets;
+struct iovec *iov;
+
+packets = s->packets_buf;
+iov = s->iov_buf;
+
+for (i = 0; i < VMNET_PACKETS_LIMIT; ++i) {
+iov[i].iov_len = s->max_packet_size;
+iov[i].iov_base = g_malloc0(iov[i].iov_len);
+packets[i].vm_pkt_iov = iov + i;
+}
+}
+
+
+const char *vmnet_status_map_str(vmnet_return_t status)
+{
+switch (status) {
+case VMNET_SUCCESS:
+return "success";
+case VMNET_FAILURE:
+return "general failure";
+case VMNET_MEM_FAILURE:
+return "memory allocation failure";
+case VMNET_INVALID_ARGUMENT:
+return "invalid argument specified";
+case VMNET_SETUP_INCOMPLETE:
+return "interface setup is not complete";
+case VMNET_INVALID_ACCESS:
+return "invalid access, permission denied";
+case VMNET_PACKET_TOO_BIG:
+

[PATCH v7 7/7] net/vmnet: update MAINTAINERS list

Signed-off-by: Vladislav Yaroshchuk 
---
 MAINTAINERS | 5 +
 1 file changed, 5 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7543eb4d59..5c696e38da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2631,6 +2631,11 @@ W: http://info.iet.unipi.it/~luigi/netmap/
 S: Maintained
 F: net/netmap.c
 
+Apple vmnet network backends
+M: Vladislav Yaroshchuk 
+S: Maintained
+F: net/vmnet*
+
 Host Memory Backends
 M: David Hildenbrand 
 M: Igor Mammedov 
-- 
2.23.0

[PATCH v7 6/7] net/vmnet: update qemu-options.hx

Signed-off-by: Vladislav Yaroshchuk 
---
 qemu-options.hx | 25 +
 1 file changed, 25 insertions(+)

diff --git a/qemu-options.hx b/qemu-options.hx
index ae2c6dbbfc..1ffa5eedd5 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2677,6 +2677,25 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 #ifdef __linux__
 "-netdev vhost-vdpa,id=str,vhostdev=/path/to/dev\n"
 "configure a vhost-vdpa network,Establish a vhost-vdpa 
netdev\n"
+#endif
+#ifdef CONFIG_VMNET
+"-netdev vmnet-host,id=str[,isolated=on|off][,net-uuid=uuid]\n"
+" [,start-address=addr,end-address=addr,subnet-mask=mask]\n"
+"configure a vmnet network backend in host mode with ID 
'str',\n"
+"isolate this interface from others with 'isolated',\n"
+"configure the address range and choose a subnet mask,\n"
+"specify network UUID 'uuid' to disable DHCP and interact 
with\n"
+"vmnet-host interfaces within this isolated network\n"
+"-netdev vmnet-shared,id=str[,isolated=on|off][,nat66-prefix=addr]\n"
+" [,start-address=addr,end-address=addr,subnet-mask=mask]\n"
+"configure a vmnet network backend in shared mode with ID 
'str',\n"
+"configure the address range and choose a subnet mask,\n"
+"set IPv6 ULA prefix (of length 64) to use for internal 
network,\n"
+"isolate this interface from others with 'isolated'\n"
+"-netdev vmnet-bridged,id=str,ifname=name[,isolated=on|off]\n"
+"configure a vmnet network backend in bridged mode with ID 
'str',\n"
+"use 'ifname=name' to select a physical network interface 
to be bridged,\n"
+"isolate this interface from others with 'isolated'\n"
 #endif
 "-netdev hubport,id=str,hubid=n[,netdev=nd]\n"
 "configure a hub port on the hub with ID 'n'\n", 
QEMU_ARCH_ALL)
@@ -2696,6 +2715,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
 #endif
 #ifdef CONFIG_POSIX
 "vhost-user|"
+#endif
+#ifdef CONFIG_VMNET
+"vmnet-host|vmnet-shared|vmnet-bridged|"
 #endif
 "socket][,option][,...][mac=macaddr]\n"
 "initialize an on-board / default host NIC (using MAC 
address\n"
@@ -2718,6 +2740,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
 #endif
 #ifdef CONFIG_NETMAP
 "netmap|"
+#endif
+#ifdef CONFIG_VMNET
+"vmnet-host|vmnet-shared|vmnet-bridged|"
 #endif
 "socket][,option][,option][,...]\n"
 "old way to initialize a host network interface\n"
-- 
2.23.0

Re: [PATCH 11/14] ppc/pnv: Compute the PHB index from the PHB4 PEC model


On 12/7/21 11:06, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

Use the num_stacks class attribute to compute the PHB index depending
on the PEC index :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

Signed-off-by: Cédric Le Goater 
---
  hw/pci-host/pnv_phb4_pec.c | 16 
  hw/ppc/pnv.c   |  4 +---
  2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index 293909b5cb90..a7dd4173d598 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -374,6 +374,19 @@ static void pnv_pec_instance_init(Object *obj)
  }
  }
+static int pnv_pec_phb_offset(PnvPhb4PecState *pec)
+{
+    PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
+    int index = pec->index;
+    int offset = 0;
+
+    while (index--) {
+    offset += pecc->num_stacks[index];
+    }
+
+    return offset;
+}
+



That seems overly complicated to me and not very readable. The log message is a 
lot more clear. I'd prefer we have a switch() statement returning the base PHB 
ID based on the PEC index.


yes I agree but PHB5 is on its way and this is compatible. See :

  
https://github.com/legoater/qemu/commit/d7caa1b74f3f8a90815fa086b87a1bd467b7c988

I will take any good idea fitting both :)

Thanks,

C.



   Fred




  static void pnv_pec_realize(DeviceState *dev, Error **errp)
  {
  PnvPhb4PecState *pec = PNV_PHB4_PEC(dev);
@@ -394,8 +407,10 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  for (i = 0; i < pec->num_stacks; i++) {
  PnvPhb4PecStack *stack = &pec->stacks[i];
  Object *stk_obj = OBJECT(stack);
+    int phb_id = pnv_pec_phb_offset(pec) + i;
  object_property_set_int(stk_obj, "stack-no", i, &error_abort);
+    object_property_set_int(stk_obj, "phb-id", phb_id, &error_abort);
  object_property_set_link(stk_obj, "pec", OBJECT(pec), &error_abort);
  if (!qdev_realize(DEVICE(stk_obj), NULL, errp)) {
  return;
@@ -538,6 +553,7 @@ static void pnv_pec_stk_instance_init(Object *obj)
  PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(obj);
  object_initialize_child(obj, "phb", &stack->phb, TYPE_PNV_PHB4);
+    object_property_add_alias(obj, "phb-id", OBJECT(&stack->phb), "index");
  }
  static void pnv_pec_stk_realize(DeviceState *dev, Error **errp)
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 957f0bdfaa6b..f8b0b2a28383 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1371,7 +1371,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  {
  Pnv9Chip *chip9 = PNV9_CHIP(chip);
  int i, j;
-    int phb_id = 0;
  for (i = 0; i < chip->num_pecs; i++) {
  PnvPhb4PecState *pec = &chip9->pecs[i];
@@ -1396,11 +1395,10 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);
  pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
-    for (j = 0; j < pec->num_stacks; j++, phb_id++) {
+    for (j = 0; j < pec->num_stacks; j++) {
  PnvPhb4PecStack *stack = &pec->stacks[j];
  Object *obj = OBJECT(&stack->phb);
-    object_property_set_int(obj, "index", phb_id, &error_fatal);
  object_property_set_int(obj, "chip-id", chip->chip_id,
  &error_fatal);
  object_property_set_int(obj, "version", pecc->version,

Re: [PATCH 06/14] ppc/pnv: Complete user created PHB3 devices


On 12/7/21 10:53, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

PHB3s are SysBus devices and should be allowed to be dynamically
created.

Signed-off-by: Cédric Le Goater 
---


This one is a bit of black magic for me. 


Yes. QEMU internals related to sysbus. I am not an expert either.

I don't see an equivalent for P9 though. Not needed there? 


No because the phb4-pec devices are simple devices. Not tied to sysbus.

Thanks,

C.


I'll have another comment about P8/P9 later.

   Fred



  hw/pci-host/pnv_phb3.c | 9 +
  hw/ppc/pnv.c   | 2 ++
  2 files changed, 11 insertions(+)

diff --git a/hw/pci-host/pnv_phb3.c b/hw/pci-host/pnv_phb3.c
index e91f658b0060..b61f9c369f64 100644
--- a/hw/pci-host/pnv_phb3.c
+++ b/hw/pci-host/pnv_phb3.c
@@ -1000,6 +1000,9 @@ static void pnv_phb3_realize(DeviceState *dev, Error 
**errp)
  /* User created devices */
  if (!phb->chip) {
+    Error *local_err = NULL;
+    BusState *s;
+
  phb->chip = pnv_get_chip(pnv, phb->chip_id);
  if (!phb->chip) {
  error_setg(errp, "invalid chip id: %d", phb->chip_id);
@@ -1011,6 +1014,12 @@ static void pnv_phb3_realize(DeviceState *dev, Error 
**errp)
   * correctly the device tree.
   */
  pnv_chip_parent_fixup(phb->chip, OBJECT(phb), phb->phb_id);
+
+    s = qdev_get_parent_bus(DEVICE(phb->chip));
+    if (!qdev_set_parent_bus(DEVICE(phb), s, &local_err)) {
+    error_propagate(errp, local_err);
+    return;
+    }
  }
  /* LSI sources */
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 9a458655efd9..45d8ecbf2bf7 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1927,6 +1927,8 @@ static void pnv_machine_power8_class_init(ObjectClass 
*oc, void *data)
  pmc->compat = compat;
  pmc->compat_size = sizeof(compat);
+
+    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_PNV_PHB3);
  }
  static void pnv_machine_power9_class_init(ObjectClass *oc, void *data)

Re: [PATCH 12/14] ppc/pnv: Remove "system-memory" property for he PHB4 PEC model


On 12/7/21 11:08, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

This is not useful and will be in the way for support of user created
PHB4 devices.

Signed-off-by: Cédric Le Goater 
---



I doubt I see all the implications here, 


It is good practice to avoid statics in models or calls like
get_system_memory() or qdev_get_machine(). With dynamic models,
it becomes more complex.

Thanks,

C.


but it doesn't look wrong to me, so:
Reviewed-by: Frederic Barrat 

   Fred



  hw/pci-host/pnv_phb4_pec.c | 6 +-
  hw/ppc/pnv.c   | 2 --
  2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index a7dd4173d598..dfed2af0f7df 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -124,7 +124,7 @@ static uint64_t pnv_pec_stk_nest_xscom_read(void *opaque, 
hwaddr addr,
  static void pnv_pec_stk_update_map(PnvPhb4PecStack *stack)
  {
  PnvPhb4PecState *pec = stack->pec;
-    MemoryRegion *sysmem = pec->system_memory;
+    MemoryRegion *sysmem = get_system_memory();
  uint64_t bar_en = stack->nest_regs[PEC_NEST_STK_BAR_EN];
  uint64_t bar, mask, size;
  char name[64];
@@ -394,8 +394,6 @@ static void pnv_pec_realize(DeviceState *dev, Error **errp)
  char name[64];
  int i;
-    assert(pec->system_memory);
-
  if (pec->index >= PNV_CHIP_GET_CLASS(pec->chip)->num_pecs) {
  error_setg(errp, "invalid PEC index: %d", pec->index);
  return;
@@ -486,8 +484,6 @@ static Property pnv_pec_properties[] = {
  DEFINE_PROP_UINT32("chip-id", PnvPhb4PecState, chip_id, 0),
  DEFINE_PROP_LINK("chip", PnvPhb4PecState, chip, TYPE_PNV_CHIP,
   PnvChip *),
-    DEFINE_PROP_LINK("system-memory", PnvPhb4PecState, system_memory,
- TYPE_MEMORY_REGION, MemoryRegion *),
  DEFINE_PROP_END_OF_LIST(),
  };
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index f8b0b2a28383..3a550eed0f36 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1383,8 +1383,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  &error_fatal);
  object_property_set_link(OBJECT(pec), "chip", OBJECT(chip),
   &error_fatal);
-    object_property_set_link(OBJECT(pec), "system-memory",
- OBJECT(get_system_memory()), &error_abort);
  if (!qdev_realize(DEVICE(pec), NULL, errp)) {
  return;
  }

Re: [PATCH 13/14] ppc/pnv: Move realize of PEC stacks under the PEC model


On 12/7/21 11:10, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

This change will help us provide support for user created PHB4
devices.

Signed-off-by: Cédric Le Goater 
---
  hw/pci-host/pnv_phb4_pec.c | 36 
  hw/ppc/pnv.c   | 31 +--
  2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/hw/pci-host/pnv_phb4_pec.c b/hw/pci-host/pnv_phb4_pec.c
index dfed2af0f7df..9b081d543057 100644
--- a/hw/pci-host/pnv_phb4_pec.c
+++ b/hw/pci-host/pnv_phb4_pec.c
@@ -556,6 +556,10 @@ static void pnv_pec_stk_realize(DeviceState *dev, Error 
**errp)
  {
  PnvPhb4PecStack *stack = PNV_PHB4_PEC_STACK(dev);
  PnvPhb4PecState *pec = stack->pec;
+    PnvPhb4PecClass *pecc = PNV_PHB4_PEC_GET_CLASS(pec);
+    PnvChip *chip = pec->chip;
+    uint32_t pec_nest_base;
+    uint32_t pec_pci_base;
  char name[64];
  assert(pec);
@@ -579,10 +583,34 @@ static void pnv_pec_stk_realize(DeviceState *dev, Error 
**errp)
  pnv_xscom_region_init(&stack->phb_regs_mr, OBJECT(&stack->phb),
    &pnv_phb4_xscom_ops, &stack->phb, name, 0x40);
-    /*
- * Let the machine/chip realize the PHB object to customize more
- * easily some fields
- */
+    {
+    Object *obj = OBJECT(&stack->phb);
+
+    object_property_set_int(obj, "chip-id", pec->chip_id, &error_fatal);
+    object_property_set_int(obj, "version", pecc->version, &error_fatal);
+    object_property_set_int(obj, "device-id", pecc->device_id,
+    &error_fatal);
+    object_property_set_link(obj, "stack", OBJECT(stack),
+ &error_abort);
+    if (!sysbus_realize(SYS_BUS_DEVICE(obj), errp)) {
+    return;
+    }
+    }



Do we really need the extra sub-scope here? It looks off.


No. That's a left over from the initial patches I worked on.


+
+    pec_nest_base = pecc->xscom_nest_base(pec);
+    pec_pci_base = pecc->xscom_pci_base(pec);
+
+    /* Populate the XSCOM address space. */
+    pnv_xscom_add_subregion(chip,
+    pec_nest_base + 0x40 * (stack->stack_no + 1),
+    &stack->nest_regs_mr);
+    pnv_xscom_add_subregion(chip,
+    pec_pci_base + 0x40 * (stack->stack_no + 1),
+    &stack->pci_regs_mr);
+    pnv_xscom_add_subregion(chip,
+    pec_pci_base + PNV9_XSCOM_PEC_PCI_STK0 +
+    0x40 * stack->stack_no,
+    &stack->phb_regs_mr);
  }
  static Property pnv_pec_stk_properties[] = {
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 3a550eed0f36..7e13b15241fd 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1370,7 +1370,7 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error 
**errp)
  static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
  {



With that change, we should really rename pnv_chip_power9_phb_realize() to 
pnv_chip_power9_pec_realize().


yes.

Thanks,

C.




   Fred



  Pnv9Chip *chip9 = PNV9_CHIP(chip);
-    int i, j;
+    int i;
  for (i = 0; i < chip->num_pecs; i++) {
  PnvPhb4PecState *pec = &chip9->pecs[i];
@@ -1392,35 +1392,6 @@ static void pnv_chip_power9_phb_realize(PnvChip *chip, 
Error **errp)
  pnv_xscom_add_subregion(chip, pec_nest_base, &pec->nest_regs_mr);
  pnv_xscom_add_subregion(chip, pec_pci_base, &pec->pci_regs_mr);
-
-    for (j = 0; j < pec->num_stacks; j++) {
-    PnvPhb4PecStack *stack = &pec->stacks[j];
-    Object *obj = OBJECT(&stack->phb);
-
-    object_property_set_int(obj, "chip-id", chip->chip_id,
-    &error_fatal);
-    object_property_set_int(obj, "version", pecc->version,
-    &error_fatal);
-    object_property_set_int(obj, "device-id", pecc->device_id,
-    &error_fatal);
-    object_property_set_link(obj, "stack", OBJECT(stack),
- &error_abort);
-    if (!sysbus_realize(SYS_BUS_DEVICE(obj), errp)) {
-    return;
-    }
-
-    /* Populate the XSCOM address space. */
-    pnv_xscom_add_subregion(chip,
-   pec_nest_base + 0x40 * (stack->stack_no + 
1),
-   &stack->nest_regs_mr);
-    pnv_xscom_add_subregion(chip,
-    pec_pci_base + 0x40 * (stack->stack_no + 
1),
-    &stack->pci_regs_mr);
-    pnv_xscom_add_subregion(chip,
-    pec_pci_base + PNV9_XSCOM_PEC_PCI_STK0 +
-    0x40 * stack->stack_no,
-    &stack->phb_regs_mr);
-    }
  }
  }

Re: [PATCH 07/14] ppc/pnv: Introduce a num_pecs class attribute for PHB4 PEC devices


On 12/7/21 11:00, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

POWER9 processor comes with 3 PHB4 PECs (PCI Express Controller) and
each PEC can have several PHBs :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

A num_pecs class attribute better represents the logical units of the
POWER9 chip. Use that instead of num_phbs which fits POWER8 chips.
This will ease adding support for user created devices.

Signed-off-by: Cédric Le Goater 
---


With this patch, chip->num_phbs is only defined and used on P8. We may want to 
add a comment to make it clear.


Yes.

With the latest changes, I think we can now move num_phbs under PnvChip8
and num_pecs under PnvChip9 since they are only used in these routines :

P8:
static void pnv_chip_power8_instance_init(Object *obj)
chip->num_phbs = pcc->num_phbs;
for (i = 0; i < chip->num_phbs; i++) {

static void pnv_chip_power8_realize(DeviceState *dev, Error **errp)
for (i = 0; i < chip->num_phbs; i++) {

P9:

static void pnv_chip_power9_instance_init(Object *obj)
chip->num_pecs = pcc->num_pecs;
for (i = 0; i < chip->num_pecs; i++) {

static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
for (i = 0; i < chip->num_pecs; i++) {



As I review this series, something is bugging me though: the difference of 
handling between P8 and P9.
On P9, we seem to have a more logical hierarchy:
phb <- PCI controller (PEC) <- chip


Yes. It's cleaner than P8 in terms of logic. P8 initial support was
done hastily for skiboot bringup in 2014.


With P8, we don't have an explicit PEC, but we have a PBCQ object, which is 
somewhat similar. The hierarchy seems also more convoluted.


But we don't have stacks on P8. Do we ?


I don't see why it's treated differently. It seems both chips could be treated 
the same, which would make the code easier to follow.


I agree. Daniel certainly would also :)

That's outside of the scope of this series though. 


Well, this patchset enables libvirt support for the PowerNV machines.
Once this is pushed, we need to keep the API, the object model names
being part of it.

7.0 is a good time for a change, really. After that, we won't be able
to change the QOM hierarchy that much.


So maybe for a future patch? Who knows, I might volunteer...


You would introduce a phb3-pec on top of the phb3s ?

Let me send a v2 first and maybe we could rework the object hierarchy
in the 7.0 time frame. We don't have to merge this ASAP.

Thanks,

C.

[PATCH] mirror: Avoid assertion failed in mirror_run

2021-12-07 Thread Yi Wang

From: Long YunJian 

When doing a blockcommit from the active leaf node, we sometimes hit an
assertion failure with the message
"mirror_run: Assertion `QLIST_EMPTY(&bs->tracked_requests)' failed".
According to the core file, bs->tracked_requests still holds an I/O request,
so the assertion failed.
(gdb) bt
#0  0x7f410df707cf in raise () from /lib64/libc.so.6
#1  0x7f410df5ac05 in abort () from /lib64/libc.so.6
#2  0x7f410df5aad9 in __assert_fail_base.cold.0 () from /lib64/libc.so.6
#3  0x7f410df68db6 in __assert_fail () from /lib64/libc.so.6
#4  0x556915635371 in mirror_run (job=0x556916ff8600, errp=) 
at block/mirror.c:1092
#5  0x5569155e6c53 in job_co_entry (opaque=0x556916ff8600) at job.c:904
#6  0x5569156d9483 in coroutine_trampoline (i0=, 
i1=) at util/coroutine-ucontext.c:115
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests
$12 = {lh_first = 0x7f3f07bfb8b0}
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests->lh_first
$13 = (struct BdrvTrackedRequest *) 0x7f3f07bfb8b0

Actually, before executing assert(QLIST_EMPTY(&bs->tracked_requests)),
it will execute mirror_flush(s). It may handle new I/O requests, and there may
be pending I/O during this flush. Just like in the bdrv_close function, where
bdrv_drain(bs) is followed by bdrv_flush(bs), we should add a bdrv_drain call
to handle pending I/O after mirror_flush.

Signed-off-by: Long YunJian 
Signed-off-by: Yi Wang 
---
 block/mirror.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/mirror.c b/block/mirror.c
index efec2c7674..1eec356310 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1079,6 +1079,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 s->in_drain = false;
 continue;
 }
+/* in case flush left pending I/O */
+bdrv_drain(bs);
 
 /* The two disks are in sync.  Exit and report successful
  * completion.
-- 
2.18.1

Re: [PATCH v2] docs: Minor updates on the powernv documentation.

2021-12-07 Thread Leonardo Augusto Guimarães Garcia

On 12/2/21 03:51, Cédric Le Goater wrote:
> Hello Leonardo,
>
> On 11/23/21 13:10, lagar...@linux.ibm.com wrote:
>> From: Leonardo Garcia 
>>
>> Signed-off-by: Leonardo Garcia 
>> ---
>
> It seems that POWER10 was renamed to Power10 but not POWER9. And :
>
>   https://en.wikipedia.org/wiki/Power9  redirects to POWER9
>   https://en.wikipedia.org/wiki/POWER10 redirects to Power10
>
> I will keep the upper case POWER9. No need to resend.


 
Ok. Thanks! I'll keep this nomenclature on the pseries documentation
patches as well.
 
Cheers,
 
Leo


>
> Thanks,
>
> C.
>
>
>
>>   docs/system/ppc/powernv.rst | 57 +++--
>>   1 file changed, 29 insertions(+), 28 deletions(-)
>>
>> diff --git a/docs/system/ppc/powernv.rst b/docs/system/ppc/powernv.rst
>> index 86186b7d2c..eda4219a27 100644
>> --- a/docs/system/ppc/powernv.rst
>> +++ b/docs/system/ppc/powernv.rst
>> @@ -1,7 +1,7 @@
>> -PowerNV family boards (``powernv8``, ``powernv9``)
>> +PowerNV family boards (``powernv8``, ``powernv9``, ``powernv10``)
>>   ==
>>   -PowerNV (as Non-Virtualized) is the "baremetal" platform using the
>> +PowerNV (as Non-Virtualized) is the "bare metal" platform using the
>>   OPAL firmware. It runs Linux on IBM and OpenPOWER systems and it can
>>   be used as an hypervisor OS, running KVM guests, or simply as a host
>>   OS.
>> @@ -15,17 +15,15 @@ beyond the scope of what QEMU addresses today.
>>   Supported devices
>>   -
>>   - * Multi processor support for POWER8, POWER8NVL and POWER9.
>> - * XSCOM, serial communication sideband bus to configure chiplets
>> - * Simple LPC Controller
>> - * Processor Service Interface (PSI) Controller
>> - * Interrupt Controller, XICS (POWER8) and XIVE (POWER9)
>> - * POWER8 PHB3 PCIe Host bridge and POWER9 PHB4 PCIe Host bridge
>> - * Simple OCC is an on-chip microcontroller used for power management
>> -   tasks
>> - * iBT device to handle BMC communication, with the internal BMC
>> -   simulator provided by QEMU or an external BMC such as an Aspeed
>> -   QEMU machine.
>> + * Multi processor support for POWER8, POWER8NVL and Power9.
>> + * XSCOM, serial communication sideband bus to configure chiplets.
>> + * Simple LPC Controller.
>> + * Processor Service Interface (PSI) Controller.
>> + * Interrupt Controller, XICS (POWER8) and XIVE (Power9) and XIVE2
>> (Power10).
>> + * POWER8 PHB3 PCIe Host bridge and POWER9 PHB4 PCIe Host bridge.
>> + * Simple OCC is an on-chip micro-controller used for power
>> management tasks.
>> + * iBT device to handle BMC communication, with the internal BMC
>> simulator
>> +   provided by QEMU or an external BMC such as an Aspeed QEMU machine.
>>    * PNOR containing the different firmware partitions.
>>     Missing devices
>> @@ -33,27 +31,25 @@ Missing devices
>>     A lot is missing, among which :
>>   - * POWER10 processor
>> - * XIVE2 (POWER10) interrupt controller
>> - * I2C controllers (yet to be merged)
>> - * NPU/NPU2/NPU3 controllers
>> - * EEH support for PCIe Host bridge controllers
>> - * NX controller
>> - * VAS controller
>> - * chipTOD (Time Of Day)
>> + * I2C controllers (yet to be merged).
>> + * NPU/NPU2/NPU3 controllers.
>> + * EEH support for PCIe Host bridge controllers.
>> + * NX controller.
>> + * VAS controller.
>> + * chipTOD (Time Of Day).
>>    * Self Boot Engine (SBE).
>> - * FSI bus
>> + * FSI bus.
>>     Firmware
>>   
>>     The OPAL firmware (OpenPower Abstraction Layer) for OpenPower
>> systems
>>   includes the runtime services ``skiboot`` and the bootloader kernel
>> and
>> -initramfs ``skiroot``. Source code can be found on GitHub:
>> +initramfs ``skiroot``. Source code can be found on the `OpenPOWER
>> account at
>> +GitHub `_.
>>   -  https://github.com/open-power.
>> -
>> -Prebuilt images of ``skiboot`` and ``skiroot`` are made available on
>> the `OpenPOWER `__
>> site.
>> +Prebuilt images of ``skiboot`` and ``skiroot`` are made available on
>> the
>> +`OpenPOWER `__ site.
>>     QEMU includes a prebuilt image of ``skiboot`` which is updated
>> when a
>>   more recent version is required by the models.
>> @@ -83,6 +79,7 @@ and a SATA disk :
>>     Complex PCIe configuration
>>   ~~
>> +
>>   Six PHBs are defined per chip (POWER9) but no default PCI layout is
>>   provided (to be compatible with libvirt). One PCI device can be added
>>   on any of the available PCIe slots using command line options such as:
>> @@ -157,7 +154,7 @@ one on the command line :
>>   The files `palmetto-SDR.bin
>> `__
>>   and `palmetto-FRU.bin
>> `__
>>   define a Sensor Data Record repository and a Field Replaceable Unit
>> -inventory for a palmetto BMC. They can be used to extend

[PATCH 3/7] migration: Drop postcopy_chunk_hostpages()

This function calls three functions:

  - postcopy_discard_send_init(ms, block->idstr);
  - postcopy_chunk_hostpages_pass(ms, block);
  - postcopy_discard_send_finish(ms);

However only the 2nd function call is meaningful.  Its major role is to make
sure dirty bits are applied in host-page-size granule, so there will be no
partial dirty bits set for a whole host page if huge pages are used.

The 1st/3rd calls are for later, when we want to send the discard ranges.
They're mostly no-op here besides some tracepoints (which are misleading!).

Drop them, then we can directly drop postcopy_chunk_hostpages() as a whole
because we can call postcopy_chunk_hostpages_pass() directly.

There're still some nice comments above postcopy_chunk_hostpages() that explain
what it does.  Copy it over to the caller's site.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 33 +++--
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index fb8c1a887e..e3876181ab 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2576,30 +2576,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block)
 }
 }
 
-/**
- * postcopy_chunk_hostpages: discard any partially sent host page
- *
- * Utility for the outgoing postcopy code.
- *
- * Discard any partially sent host-page size chunks, mark any partially
- * dirty host-page size chunks as all dirty.  In this case the host-page
- * is the host-page for the particular RAMBlock, i.e. it might be a huge page
- *
- * @ms: current migration state
- * @block: block we want to work with
- */
-static void postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
-{
-postcopy_discard_send_init(ms, block->idstr);
-
-/*
- * Ensure that all partially dirty host pages are made fully dirty.
- */
-postcopy_chunk_hostpages_pass(ms, block);
-
-postcopy_discard_send_finish(ms);
-}
-
 /**
  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
  *
@@ -2631,8 +2607,13 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 rs->last_page = 0;
 
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-/* Deal with TPS != HPS and huge pages */
-postcopy_chunk_hostpages(ms, block);
+/*
+ * Deal with TPS != HPS and huge pages.  It discard any partially sent
+ * host-page size chunks, mark any partially dirty host-page size
+ * chunks as all dirty.  In this case the host-page is the host-page
+ * for the particular RAMBlock, i.e. it might be a huge page.
+ */
+postcopy_chunk_hostpages_pass(ms, block);
 }
 trace_ram_postcopy_send_discard_bitmap();
 
-- 
2.32.0

[PATCH 2/7] migration: Don't return for postcopy_chunk_hostpages()

It always returns zero, because it just can't go wrong so far.  Simplify the
code with no functional change.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 756ac800a7..fb8c1a887e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2585,12 +2585,10 @@ static void 
postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
  * dirty host-page size chunks as all dirty.  In this case the host-page
  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
  *
- * Returns zero on success
- *
  * @ms: current migration state
  * @block: block we want to work with
  */
-static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
+static void postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
 {
 postcopy_discard_send_init(ms, block->idstr);
 
@@ -2600,7 +2598,6 @@ static int postcopy_chunk_hostpages(MigrationState *ms, 
RAMBlock *block)
 postcopy_chunk_hostpages_pass(ms, block);
 
 postcopy_discard_send_finish(ms);
-return 0;
 }
 
 /**
@@ -2622,7 +2619,6 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 {
 RAMState *rs = ram_state;
 RAMBlock *block;
-int ret;
 
 RCU_READ_LOCK_GUARD();
 
@@ -2636,10 +2632,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 /* Deal with TPS != HPS and huge pages */
-ret = postcopy_chunk_hostpages(ms, block);
-if (ret) {
-return ret;
-}
+postcopy_chunk_hostpages(ms, block);
 }
 trace_ram_postcopy_send_discard_bitmap();
 
-- 
2.32.0

[PATCH 6/7] migration: Dump sub-cmd name in loadvm_process_command tp

It'll be easier to read the name rather than index of sub-cmd when debugging.

Signed-off-by: Peter Xu 
---
 migration/savevm.c | 2 +-
 migration/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index d59e976d50..17b8e25e00 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2271,7 +2271,7 @@ static int loadvm_process_command(QEMUFile *f)
 return qemu_file_get_error(f);
 }
 
-trace_loadvm_process_command(cmd, len);
+trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
 if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
 error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
 return -EINVAL;
diff --git a/migration/trace-events b/migration/trace-events
index b48d873b8a..d63a5915f5 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -22,7 +22,7 @@ loadvm_postcopy_handle_resume(void) ""
 loadvm_postcopy_ram_handle_discard(void) ""
 loadvm_postcopy_ram_handle_discard_end(void) ""
 loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) 
"%s: %ud"
-loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
+loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
 loadvm_process_command_ping(uint32_t val) "0x%x"
 postcopy_ram_listen_thread_exit(void) ""
 postcopy_ram_listen_thread_start(void) ""
-- 
2.32.0

[PATCH 5/7] migration: Drop return code for disgard ram process

It will just never fail.  Drop those return values where they're constantly
zeros.

A tiny touch-up on the tracepoint so trace_ram_postcopy_send_discard_bitmap()
is called after the logic itself (which sounds more reasonable).

Signed-off-by: Peter Xu 
---
 migration/migration.c |  5 +
 migration/ram.c   | 20 +---
 migration/ram.h   |  2 +-
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index abaf6f9e3d..c2e5539721 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2983,10 +2983,7 @@ static int postcopy_start(MigrationState *ms)
  * that are dirty
  */
 if (migrate_postcopy_ram()) {
-if (ram_postcopy_send_discard_bitmap(ms)) {
-error_report("postcopy send discard bitmap failed");
-goto fail;
-}
+ram_postcopy_send_discard_bitmap(ms);
 }
 
 /*
diff --git a/migration/ram.c b/migration/ram.c
index ecc744d54d..28f1ace0f7 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2478,8 +2478,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block);
 /**
  * postcopy_each_ram_send_discard: discard all RAMBlocks
  *
- * Returns 0 for success or negative for error
- *
  * Utility for the outgoing postcopy code.
  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
  *   passing it bitmap indexes and name.
@@ -2488,10 +2486,9 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block);
  *
  * @ms: current migration state
  */
-static int postcopy_each_ram_send_discard(MigrationState *ms)
+static void postcopy_each_ram_send_discard(MigrationState *ms)
 {
 struct RAMBlock *block;
-int ret;
 
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 postcopy_discard_send_init(ms, block->idstr);
@@ -2509,14 +2506,9 @@ static int postcopy_each_ram_send_discard(MigrationState 
*ms)
  * just needs indexes at this point, avoids it having
  * target page specific code.
  */
-ret = postcopy_send_discard_bm_ram(ms, block);
+postcopy_send_discard_bm_ram(ms, block);
 postcopy_discard_send_finish(ms);
-if (ret) {
-return ret;
-}
 }
-
-return 0;
 }
 
 /**
@@ -2589,8 +2581,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block)
 /**
  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
  *
- * Returns zero on success
- *
  * Transmit the set of pages to be discarded after precopy to the target
  * these are pages that:
  * a) Have been previously transmitted but are now dirty again
@@ -2601,7 +2591,7 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block)
  *
  * @ms: current migration state
  */
-int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+void ram_postcopy_send_discard_bitmap(MigrationState *ms)
 {
 RAMState *rs = ram_state;
 
@@ -2615,9 +2605,9 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 rs->last_sent_block = NULL;
 rs->last_page = 0;
 
-trace_ram_postcopy_send_discard_bitmap();
+postcopy_each_ram_send_discard(ms);
 
-return postcopy_each_ram_send_discard(ms);
+trace_ram_postcopy_send_discard_bitmap();
 }
 
 /**
diff --git a/migration/ram.h b/migration/ram.h
index f543e25765..2c6dc3675d 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -57,7 +57,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t 
start, ram_addr_t len);
 void acct_update_position(QEMUFile *f, size_t size, bool zero);
 void ram_postcopy_migrated_memory_release(MigrationState *ms);
 /* For outgoing discard bitmap */
-int ram_postcopy_send_discard_bitmap(MigrationState *ms);
+void ram_postcopy_send_discard_bitmap(MigrationState *ms);
 /* For incoming postcopy discard */
 int ram_discard_range(const char *block_name, uint64_t start, size_t length);
 int ram_postcopy_incoming_init(MigrationIncomingState *mis);
-- 
2.32.0

[PATCH 4/7] migration: Do chunk page in postcopy_each_ram_send_discard()

Right now we loop over the ramblocks twice: the 1st time we chunk the dirty
bits with huge page information; the 2nd time we send the discard ranges.
That's not necessary - we can do both in a single loop.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index e3876181ab..ecc744d54d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2473,6 +2473,8 @@ static int postcopy_send_discard_bm_ram(MigrationState 
*ms, RAMBlock *block)
 return 0;
 }
 
+static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
+
 /**
  * postcopy_each_ram_send_discard: discard all RAMBlocks
  *
@@ -2494,6 +2496,14 @@ static int postcopy_each_ram_send_discard(MigrationState 
*ms)
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 postcopy_discard_send_init(ms, block->idstr);
 
+/*
+ * Deal with TPS != HPS and huge pages.  It discard any partially sent
+ * host-page size chunks, mark any partially dirty host-page size
+ * chunks as all dirty.  In this case the host-page is the host-page
+ * for the particular RAMBlock, i.e. it might be a huge page.
+ */
+postcopy_chunk_hostpages_pass(ms, block);
+
 /*
  * Postcopy sends chunks of bitmap over the wire, but it
  * just needs indexes at this point, avoids it having
@@ -2594,7 +2604,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, RAMBlock *block)
 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 {
 RAMState *rs = ram_state;
-RAMBlock *block;
 
 RCU_READ_LOCK_GUARD();
 
@@ -2606,15 +2615,6 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 rs->last_sent_block = NULL;
 rs->last_page = 0;
 
-RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-/*
- * Deal with TPS != HPS and huge pages.  It discard any partially sent
- * host-page size chunks, mark any partially dirty host-page size
- * chunks as all dirty.  In this case the host-page is the host-page
- * for the particular RAMBlock, i.e. it might be a huge page.
- */
-postcopy_chunk_hostpages_pass(ms, block);
-}
 trace_ram_postcopy_send_discard_bitmap();
 
 return postcopy_each_ram_send_discard(ms);
-- 
2.32.0

[PATCH 7/7] migration: Finer grained tracepoints for POSTCOPY_LISTEN

Enabling postcopy listening takes a few steps.  Add a few tracepoints so that
some basic measurements can be taken for each of them.

Signed-off-by: Peter Xu 
---
 migration/savevm.c | 5 -
 migration/trace-events | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 17b8e25e00..5b3f31eab2 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1946,7 +1946,7 @@ static void *postcopy_ram_listen_thread(void *opaque)
 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
 {
 PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
-trace_loadvm_postcopy_handle_listen();
+trace_loadvm_postcopy_handle_listen(1);
 Error *local_err = NULL;
 
 if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
@@ -1962,6 +1962,7 @@ static int 
loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
 postcopy_ram_prepare_discard(mis);
 }
 }
+trace_loadvm_postcopy_handle_listen(2);
 
 /*
  * Sensitise RAM - can now generate requests for blocks that don't exist
@@ -1974,6 +1975,7 @@ static int 
loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
 return -1;
 }
 }
+trace_loadvm_postcopy_handle_listen(3);
 
 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
 error_report_err(local_err);
@@ -1988,6 +1990,7 @@ static int 
loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
QEMU_THREAD_DETACHED);
 qemu_sem_wait(&mis->listen_thread_sem);
 qemu_sem_destroy(&mis->listen_thread_sem);
+trace_loadvm_postcopy_handle_listen(4);
 
 return 0;
 }
diff --git a/migration/trace-events b/migration/trace-events
index d63a5915f5..1aa6937dc1 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -14,7 +14,7 @@ loadvm_handle_cmd_packaged_main(int ret) "%d"
 loadvm_handle_cmd_packaged_received(int ret) "%d"
 loadvm_handle_recv_bitmap(char *s) "%s"
 loadvm_postcopy_handle_advise(void) ""
-loadvm_postcopy_handle_listen(void) ""
+loadvm_postcopy_handle_listen(int i) "%d"
 loadvm_postcopy_handle_run(void) ""
 loadvm_postcopy_handle_run_cpu_sync(void) ""
 loadvm_postcopy_handle_run_vmstart(void) ""
-- 
2.32.0

[PATCH 0/7] migration: Postcopy cleanup on ram disgard

Some queued patches for ram discard cleanup, and some debug probes.

QEMU's ram discard logic is probably a bit hard to predict because we send a
bunch of packets to notify the discarded ranges rather than sending the bitmap.
The packets to send depend on the bitmap layout.

Initially I thought it could be a problem but in reality it's fine so far per
my initial measurement.  So I'm flushing the cleanup/trace patches out because
I think they're still helpful.

Please have a look, thanks.

Peter Xu (7):
  migration: Drop dead code of ram_debug_dump_bitmap()
  migration: Don't return for postcopy_chunk_hostpages()
  migration: Drop postcopy_chunk_hostpages()
  migration: Do chunk page in postcopy_each_ram_send_discard()
  migration: Drop return code for disgard ram process
  migration: Dump sub-cmd name in loadvm_process_command tp
  migration: Finer grained tracepoints for POSTCOPY_LISTEN

 migration/migration.c  |   5 +-
 migration/ram.c| 103 ++---
 migration/ram.h|   4 +-
 migration/savevm.c |   7 ++-
 migration/trace-events |   4 +-
 5 files changed, 23 insertions(+), 100 deletions(-)

-- 
2.32.0

[PATCH 1/7] migration: Drop dead code of ram_debug_dump_bitmap()

I planned to add "#ifdef DEBUG_POSTCOPY" around the function too because
otherwise it'll be compiled into qemu binary even if it'll never be used.  Then
I found that maybe it's easier to just drop it for good..

Signed-off-by: Peter Xu 
---
 migration/ram.c | 39 ---
 migration/ram.h |  2 --
 2 files changed, 41 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 863035d235..756ac800a7 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2413,40 +2413,6 @@ static void ram_state_reset(RAMState *rs)
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
 
-/*
- * 'expected' is the value you expect the bitmap mostly to be full
- * of; it won't bother printing lines that are all this value.
- * If 'todump' is null the migration bitmap is dumped.
- */
-void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
-   unsigned long pages)
-{
-int64_t cur;
-int64_t linelen = 128;
-char linebuf[129];
-
-for (cur = 0; cur < pages; cur += linelen) {
-int64_t curb;
-bool found = false;
-/*
- * Last line; catch the case where the line length
- * is longer than remaining ram
- */
-if (cur + linelen > pages) {
-linelen = pages - cur;
-}
-for (curb = 0; curb < linelen; curb++) {
-bool thisbit = test_bit(cur + curb, todump);
-linebuf[curb] = thisbit ? '1' : '.';
-found = found || (thisbit != expected);
-}
-if (found) {
-linebuf[curb] = '\0';
-fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
-}
-}
-}
-
 /*  functions for postcopy * */
 
 void ram_postcopy_migrated_memory_release(MigrationState *ms)
@@ -2674,11 +2640,6 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
 if (ret) {
 return ret;
 }
-
-#ifdef DEBUG_POSTCOPY
-ram_debug_dump_bitmap(block->bmap, true,
-  block->used_length >> TARGET_PAGE_BITS);
-#endif
 }
 trace_ram_postcopy_send_discard_bitmap();
 
diff --git a/migration/ram.h b/migration/ram.h
index c515396a9a..f543e25765 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -55,8 +55,6 @@ void mig_throttle_counter_reset(void);
 uint64_t ram_pagesize_summary(void);
 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len);
 void acct_update_position(QEMUFile *f, size_t size, bool zero);
-void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
-   unsigned long pages);
 void ram_postcopy_migrated_memory_release(MigrationState *ms);
 /* For outgoing discard bitmap */
 int ram_postcopy_send_discard_bitmap(MigrationState *ms);
-- 
2.32.0

Re: [PATCH] mirror: Avoid assertion failed in mirror_run

2021-12-07 Thread Hanna Reitz

[CC-ing qemu-block, Vladimir, Kevin, and John – when sending patches, 
please look into the MAINTAINERS file or use the 
scripts/get_maintainer.pl script to find out who to CC on them.  It’s 
very easy to overlook patches on qemu-devel :/]


On 07.12.21 11:56, Yi Wang wrote:

From: Long YunJian 

when blockcommit from active leaf node, sometimes, we get assertion failed with
"mirror_run: Assertion `QLIST_EMPTY(&bs->tracked_requests)' failed" messages.
According to the core file, we find bs->tracked_requests has IO request,
so assertion failed.
(gdb) bt
#0  0x7f410df707cf in raise () from /lib64/libc.so.6
#1  0x7f410df5ac05 in abort () from /lib64/libc.so.6
#2  0x7f410df5aad9 in __assert_fail_base.cold.0 () from /lib64/libc.so.6
#3  0x7f410df68db6 in __assert_fail () from /lib64/libc.so.6
#4  0x556915635371 in mirror_run (job=0x556916ff8600, errp=<optimized out>) 
at block/mirror.c:1092
#5  0x5569155e6c53 in job_co_entry (opaque=0x556916ff8600) at job.c:904
#6  0x5569156d9483 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:115
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests
$12 = {lh_first = 0x7f3f07bfb8b0}
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests->lh_first
$13 = (struct BdrvTrackedRequest *) 0x7f3f07bfb8b0

Actually, before executing assert(QLIST_EMPTY(&bs->tracked_requests)),
it will execute mirror_flush(s). It may handle new I/O requests and maybe
pending I/O during this flush. Just like in the bdrv_close function,
bdrv_drain(bs) followed by bdrv_flush(bs), we should add a bdrv_drain call
to handle pending I/O after mirror_flush.


Oh.  How is that happening, though?  I would have expected that flushing 
the target BB (and associated BDS) only flushes requests to the OS and 
lower layers, but the source node (which is `bs`) should (in the case of 
commit) always be above the target, so I wouldn’t have expected it to 
get any new requests due to this flush.


Do you have a reproducer for this?


Signed-off-by: Long YunJian 
Signed-off-by: Yi Wang 
---
  block/mirror.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/block/mirror.c b/block/mirror.c
index efec2c7674..1eec356310 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1079,6 +1079,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
  s->in_drain = false;
  continue;
  }
+/* in case flush left pending I/O */
+bdrv_drain(bs);


I don’t think this works, because if we drain, we would also need to 
flush the target again.  Essentially I believe we’d basically need 
something like


do {
    bdrv_drained_begin(bs);
    mirror_flush(s);
    if (!QLIST_EMPTY(&bs->tracked_requests)) {
    bdrv_drained_end(bs);
    }
} while (!QLIST_EMPTY(&bs->tracked_requests));

(Which I know is really ugly)

Hanna

  
  /* The two disks are in sync.  Exit and report successful

   * completion.

Re: Bad error handling in machine sifive-u

2021-12-07 Thread Alistair Francis

On Mon, Dec 6, 2021 at 11:15 PM Markus Armbruster  wrote:
>
> Watch this:
>
> $ ../qemu/bld/qemu-system-riscv64 -M sifive_u -S -monitor stdio -display 
> none -drive if=pflash
> QEMU 6.1.93 monitor - type 'help' for more information
> (qemu) Unexpected error in sifive_u_otp_realize() at 
> ../hw/misc/sifive_u_otp.c:229:
> qemu-system-riscv64: OTP drive size < 16K
> Aborted (core dumped)
>
> sifive_u_machine_init() calls
>
> qdev_realize(DEVICE(&s->soc), NULL, &error_abort);
>
> My reproducer demonstrates that passing &error_abort is wrong: this
> realize can fail.
>
> &error_fatal should do here.

Thanks for pointing this out. I'll work on a patch to fix this up.

Alistair

>
> Please check the other uses of &error_abort in this machine for similar
> misuse.
>
>

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

2021-12-07 Thread Philippe Mathieu-Daudé

On 12/7/21 10:44, Damien Hedde wrote:
> According to the "Arm Generic Interrupt Controller Architecture
> Specification GIC architecture version 3 and 4" (version G: page 345
> for aarch64 or 509 for aarch32):
> LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
> ICH_HCR.EOIcount is non-zero.
> 
> When only LRENPIE was set (and EOI count was zero), the LRENP bit was
> wrongly set and MISR value was wrong.
> 
> As an additional consequence, if a hypervisor set ICH_HCR.LRENPIE,
> the maintenance interrupt was constantly fired. It happens since patch
> 9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
> which fixed another bug about maintenance interrupt (most significant
> bits of misr, including this one, were ignored in the interrupt trigger).
> 
> Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system 
> registers")

This commit predates 6.1 release, so technically this is not
a regression for 6.2.

> Signed-off-by: Damien Hedde 
> ---
> The gic doc is available here:
> https://developer.arm.com/documentation/ihi0069/g
> 
> v2: identical resend because subject screw-up (sorry)
> 
> Thanks,
> Damien
> ---
>  hw/intc/arm_gicv3_cpuif.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
> index 7fba931450..85fc369e55 100644
> --- a/hw/intc/arm_gicv3_cpuif.c
> +++ b/hw/intc/arm_gicv3_cpuif.c
> @@ -351,7 +351,8 @@ static uint32_t maintenance_interrupt_state(GICv3CPUState 
> *cs)
>  /* Scan list registers and fill in the U, NP and EOI bits */
>  eoi_maintenance_interrupt_state(cs, &value);
>  
> -if (cs->ich_hcr_el2 & (ICH_HCR_EL2_LRENPIE | ICH_HCR_EL2_EOICOUNT_MASK)) 
> {
> +if ((cs->ich_hcr_el2 & ICH_HCR_EL2_LRENPIE) &&
> +(cs->ich_hcr_el2 & ICH_HCR_EL2_EOICOUNT_MASK)) {
>  value |= ICH_MISR_EL2_LRENP;
>  }
>  
>

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation





On 12/7/21 13:45, Philippe Mathieu-Daudé wrote:

On 12/7/21 10:44, Damien Hedde wrote:

According to the "Arm Generic Interrupt Controller Architecture
Specification GIC architecture version 3 and 4" (version G: page 345
for aarch64 or 509 for aarch32):
LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
ICH_HCR.EOIcount is non-zero.

When only LRENPIE was set (and EOI count was zero), the LRENP bit was
wrongly set and MISR value was wrong.

As an additional consequence, if a hypervisor set ICH_HCR.LRENPIE,
the maintenance interrupt was constantly fired. It happens since patch
9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
which fixed another bug about maintenance interrupt (most significant
bits of misr, including this one, were ignored in the interrupt trigger).

Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system registers")


This commit predates 6.1 release, so technically this is not
a regression for 6.2.


Do you mean "Fixes:" is meant only for regression or simply that this 
patch should not go for 6.2 ?


9cee1efe92 was introduced after 6.1, and changed the interrupt behavior. 
Though I'm not sure if we can consider this as a fix for 9cee1efe92: it 
only makes the previous error more visible.


Damien

[PATCH v3 0/6] aio-posix: split poll check from ready handler

v3:
- Fixed FUSE export aio_set_fd_handler() call that I missed and double-checked
  for any other missing call sites using Coccinelle [Rich]
v2:
- Cleaned up unused return values in nvme and virtio-blk [Stefano]
- Documented try_poll_mode() ready_list argument [Stefano]
- Unified virtio-blk/scsi dataplane and non-dataplane virtqueue handlers 
[Stefano]

The first patch improves AioContext's adaptive polling execution time
measurement. This can result in better performance because the algorithm makes
better decisions about when to poll versus when to fall back to file descriptor
monitoring.

The remaining patches unify the virtio-blk and virtio-scsi dataplane and
non-dataplane virtqueue handlers. This became possible because the dataplane
handler function now has the same function signature as the non-dataplane
handler function. Stefano Garzarella prompted me to make this refactoring.

Stefan Hajnoczi (6):
  aio-posix: split poll check from ready handler
  virtio: get rid of VirtIOHandleAIOOutput
  virtio-blk: drop unused virtio_blk_handle_vq() return value
  virtio-scsi: prepare virtio_scsi_handle_cmd for dataplane
  virtio: use ->handle_output() instead of ->handle_aio_output()
  virtio: unify dataplane and non-dataplane ->handle_output()

 include/block/aio.h |  4 +-
 include/hw/virtio/virtio-blk.h  |  2 +-
 include/hw/virtio/virtio.h  |  5 +-
 util/aio-posix.h|  1 +
 block/curl.c| 11 ++--
 block/export/fuse.c |  4 +-
 block/io_uring.c| 19 ---
 block/iscsi.c   |  4 +-
 block/linux-aio.c   | 16 +++---
 block/nfs.c |  6 +--
 block/nvme.c| 51 ---
 block/ssh.c |  4 +-
 block/win32-aio.c   |  4 +-
 hw/block/dataplane/virtio-blk.c | 16 +-
 hw/block/virtio-blk.c   | 14 ++
 hw/scsi/virtio-scsi-dataplane.c | 60 +++---
 hw/scsi/virtio-scsi.c   |  2 +-
 hw/virtio/virtio.c  | 73 +--
 hw/xen/xen-bus.c|  6 +--
 io/channel-command.c|  6 ++-
 io/channel-file.c   |  3 +-
 io/channel-socket.c |  3 +-
 migration/rdma.c|  8 +--
 tests/unit/test-aio.c   |  4 +-
 util/aio-posix.c| 89 +
 util/aio-win32.c|  4 +-
 util/async.c| 10 +++-
 util/main-loop.c|  4 +-
 util/qemu-coroutine-io.c|  5 +-
 util/vhost-user-server.c| 11 ++--
 30 files changed, 219 insertions(+), 230 deletions(-)

-- 
2.33.1

[PATCH v3 4/6] virtio-scsi: prepare virtio_scsi_handle_cmd for dataplane

Prepare virtio_scsi_handle_cmd() to be used by both dataplane and
non-dataplane by making the condition for starting ioeventfd more
specific. This way it won't trigger when dataplane has already been
started.

Signed-off-by: Stefan Hajnoczi 
---
 hw/scsi/virtio-scsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 51fd09522a..34a968ecfb 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -720,7 +720,7 @@ static void virtio_scsi_handle_cmd(VirtIODevice *vdev, 
VirtQueue *vq)
 /* use non-QOM casts in the data path */
 VirtIOSCSI *s = (VirtIOSCSI *)vdev;
 
-if (s->ctx) {
+if (s->ctx && !s->dataplane_started) {
 virtio_device_start_ioeventfd(vdev);
 if (!s->dataplane_fenced) {
 return;
-- 
2.33.1

[PATCH v3 3/6] virtio-blk: drop unused virtio_blk_handle_vq() return value

The return value of virtio_blk_handle_vq() is no longer used. Get rid of
it. This is a step towards unifying the dataplane and non-dataplane
virtqueue handler functions.

Prepare virtio_blk_handle_output() to be used by both dataplane and
non-dataplane by making the condition for starting ioeventfd more
specific. This way it won't trigger when dataplane has already been
started.

Signed-off-by: Stefan Hajnoczi 
---
 include/hw/virtio/virtio-blk.h |  2 +-
 hw/block/virtio-blk.c  | 14 +++---
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 29655a406d..d311c57cca 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -90,7 +90,7 @@ typedef struct MultiReqBuffer {
 bool is_write;
 } MultiReqBuffer;
 
-bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
+void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
 void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh);
 
 #endif
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f139cd7cc9..82676cdd01 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -767,12 +767,11 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, 
MultiReqBuffer *mrb)
 return 0;
 }
 
-bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
+void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 {
 VirtIOBlockReq *req;
 MultiReqBuffer mrb = {};
 bool suppress_notifications = virtio_queue_get_notification(vq);
-bool progress = false;
 
 aio_context_acquire(blk_get_aio_context(s->blk));
 blk_io_plug(s->blk);
@@ -783,7 +782,6 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 }
 
 while ((req = virtio_blk_get_request(s, vq))) {
-progress = true;
 if (virtio_blk_handle_request(req, &mrb)) {
 virtqueue_detach_element(req->vq, &req->elem, 0);
 virtio_blk_free_request(req);
@@ -802,19 +800,13 @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
 
 blk_io_unplug(s->blk);
 aio_context_release(blk_get_aio_context(s->blk));
-return progress;
-}
-
-static void virtio_blk_handle_output_do(VirtIOBlock *s, VirtQueue *vq)
-{
-virtio_blk_handle_vq(s, vq);
 }
 
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
 VirtIOBlock *s = (VirtIOBlock *)vdev;
 
-if (s->dataplane) {
+if (s->dataplane && !s->dataplane_started) {
 /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
  * dataplane here instead of waiting for .set_status().
  */
@@ -823,7 +815,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 return;
 }
 }
-virtio_blk_handle_output_do(s, vq);
+virtio_blk_handle_vq(s, vq);
 }
 
 void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh)
-- 
2.33.1

[PATCH v3 2/6] virtio: get rid of VirtIOHandleAIOOutput

The virtqueue host notifier API
virtio_queue_aio_set_host_notifier_handler() polls the virtqueue for new
buffers. AioContext previously required a bool progress return value
indicating whether an event was handled or not. This is no longer
necessary because the AioContext polling API has been split into a poll
check function and an event handler function. The event handler is only
run when we know there is work to do, so it doesn't return bool.

The VirtIOHandleAIOOutput function signature is now the same as
VirtIOHandleOutput. Get rid of the bool return value.

Further simplifications will be made for virtio-blk and virtio-scsi in
the next patch.

Signed-off-by: Stefan Hajnoczi 
---
 include/hw/virtio/virtio.h  |  3 +--
 hw/block/dataplane/virtio-blk.c |  4 ++--
 hw/scsi/virtio-scsi-dataplane.c | 18 ++
 hw/virtio/virtio.c  | 12 
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 8bab9cfb75..b90095628f 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -175,7 +175,6 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, ...) 
GCC_FMT_ATTR(2, 3);
 void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name);
 
 typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
-typedef bool (*VirtIOHandleAIOOutput)(VirtIODevice *, VirtQueue *);
 
 VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
 VirtIOHandleOutput handle_output);
@@ -318,7 +317,7 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue 
*vq);
 void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled);
 void virtio_queue_host_notifier_read(EventNotifier *n);
 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
-VirtIOHandleAIOOutput 
handle_output);
+VirtIOHandleOutput handle_output);
 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
 VirtQueue *virtio_vector_next_queue(VirtQueue *vq);
 
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index ee5a5352dc..a2fa407b98 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -154,7 +154,7 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
 g_free(s);
 }
 
-static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
 VirtQueue *vq)
 {
 VirtIOBlock *s = (VirtIOBlock *)vdev;
@@ -162,7 +162,7 @@ static bool 
virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
 assert(s->dataplane);
 assert(s->dataplane_started);
 
-return virtio_blk_handle_vq(s, vq);
+virtio_blk_handle_vq(s, vq);
 }
 
 /* Context: QEMU global mutex held */
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 18eb824c97..76137de67f 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -49,49 +49,43 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error 
**errp)
 }
 }
 
-static bool virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev,
+static void virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev,
   VirtQueue *vq)
 {
-bool progress = false;
 VirtIOSCSI *s = VIRTIO_SCSI(vdev);
 
 virtio_scsi_acquire(s);
 if (!s->dataplane_fenced) {
 assert(s->ctx && s->dataplane_started);
-progress = virtio_scsi_handle_cmd_vq(s, vq);
+virtio_scsi_handle_cmd_vq(s, vq);
 }
 virtio_scsi_release(s);
-return progress;
 }
 
-static bool virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev,
+static void virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev,
VirtQueue *vq)
 {
-bool progress = false;
 VirtIOSCSI *s = VIRTIO_SCSI(vdev);
 
 virtio_scsi_acquire(s);
 if (!s->dataplane_fenced) {
 assert(s->ctx && s->dataplane_started);
-progress = virtio_scsi_handle_ctrl_vq(s, vq);
+virtio_scsi_handle_ctrl_vq(s, vq);
 }
 virtio_scsi_release(s);
-return progress;
 }
 
-static bool virtio_scsi_data_plane_handle_event(VirtIODevice *vdev,
+static void virtio_scsi_data_plane_handle_event(VirtIODevice *vdev,
 VirtQueue *vq)
 {
-bool progress = false;
 VirtIOSCSI *s = VIRTIO_SCSI(vdev);
 
 virtio_scsi_acquire(s);
 if (!s->dataplane_fenced) {
 assert(s->ctx && s->dataplane_started);
-progress = virtio_scsi_handle_event_vq(s, vq);
+virtio_scsi_handle_event_vq(s, vq);
 }
 virtio_scsi_release(s);
-return progress;
 }
 
 static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 0039e1c74c..c0

[PATCH v3 5/6] virtio: use ->handle_output() instead of ->handle_aio_output()

The difference between ->handle_output() and ->handle_aio_output() was
that ->handle_aio_output() returned a bool return value indicating
progress. This was needed by the old polling API but now that the bool
return value is gone, the two functions can be unified.

Signed-off-by: Stefan Hajnoczi 
---
 hw/virtio/virtio.c | 33 +++--
 1 file changed, 3 insertions(+), 30 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index c042be3935..a97a406d3c 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -125,7 +125,6 @@ struct VirtQueue
 
 uint16_t vector;
 VirtIOHandleOutput handle_output;
-VirtIOHandleOutput handle_aio_output;
 VirtIODevice *vdev;
 EventNotifier guest_notifier;
 EventNotifier host_notifier;
@@ -2300,20 +2299,6 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, 
int align)
 }
 }
 
-static void virtio_queue_notify_aio_vq(VirtQueue *vq)
-{
-if (vq->vring.desc && vq->handle_aio_output) {
-VirtIODevice *vdev = vq->vdev;
-
-trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
-vq->handle_aio_output(vdev, vq);
-
-if (unlikely(vdev->start_on_kick)) {
-virtio_set_started(vdev, true);
-}
-}
-}
-
 static void virtio_queue_notify_vq(VirtQueue *vq)
 {
 if (vq->vring.desc && vq->handle_output) {
@@ -2392,7 +2377,6 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int 
queue_size,
 vdev->vq[i].vring.num_default = queue_size;
 vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
 vdev->vq[i].handle_output = handle_output;
-vdev->vq[i].handle_aio_output = NULL;
 vdev->vq[i].used_elems = g_malloc0(sizeof(VirtQueueElement) *
queue_size);
 
@@ -2404,7 +2388,6 @@ void virtio_delete_queue(VirtQueue *vq)
 vq->vring.num = 0;
 vq->vring.num_default = 0;
 vq->handle_output = NULL;
-vq->handle_aio_output = NULL;
 g_free(vq->used_elems);
 vq->used_elems = NULL;
 virtio_virtqueue_reset_region_cache(vq);
@@ -3509,14 +3492,6 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue 
*vq)
 return &vq->guest_notifier;
 }
 
-static void virtio_queue_host_notifier_aio_read(EventNotifier *n)
-{
-VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
-if (event_notifier_test_and_clear(n)) {
-virtio_queue_notify_aio_vq(vq);
-}
-}
-
 static void virtio_queue_host_notifier_aio_poll_begin(EventNotifier *n)
 {
 VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
@@ -3536,7 +3511,7 @@ static void 
virtio_queue_host_notifier_aio_poll_ready(EventNotifier *n)
 {
 VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
 
-virtio_queue_notify_aio_vq(vq);
+virtio_queue_notify_vq(vq);
 }
 
 static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
@@ -3551,9 +3526,8 @@ void virtio_queue_aio_set_host_notifier_handler(VirtQueue 
*vq, AioContext *ctx,
 VirtIOHandleOutput handle_output)
 {
 if (handle_output) {
-vq->handle_aio_output = handle_output;
 aio_set_event_notifier(ctx, &vq->host_notifier, true,
-   virtio_queue_host_notifier_aio_read,
+   virtio_queue_host_notifier_read,
virtio_queue_host_notifier_aio_poll,
virtio_queue_host_notifier_aio_poll_ready);
 aio_set_event_notifier_poll(ctx, &vq->host_notifier,
@@ -3563,8 +3537,7 @@ void virtio_queue_aio_set_host_notifier_handler(VirtQueue 
*vq, AioContext *ctx,
 aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL, 
NULL);
 /* Test and clear notifier before after disabling event,
  * in case poll callback didn't have time to run. */
-virtio_queue_host_notifier_aio_read(&vq->host_notifier);
-vq->handle_aio_output = NULL;
+virtio_queue_host_notifier_read(&vq->host_notifier);
 }
 }
 
-- 
2.33.1

[PATCH v3 1/6] aio-posix: split poll check from ready handler

Adaptive polling measures the execution time of the polling check plus
handlers called when a polled event becomes ready. Handlers can take a
significant amount of time, making it look like polling was running for
a long time when in fact the event handler was running for a long time.

For example, on Linux the io_submit(2) syscall invoked when a virtio-blk
device's virtqueue becomes ready can take 10s of microseconds. This
can exceed the default polling interval (32 microseconds) and cause
adaptive polling to stop polling.

By excluding the handler's execution time from the polling check we make
the adaptive polling calculation more accurate. As a result, the event
loop now stays in polling mode where previously it would have fallen
back to file descriptor monitoring.

The following data was collected with virtio-blk num-queues=2
event_idx=off using an IOThread. Before:

168k IOPS, IOThread syscalls:

  9837.115 ( 0.020 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, 
nr: 16, iocbpp: 0x7fcb9f937db0)= 16
  9837.158 ( 0.002 ms): IO iothread1/620155 write(fd: 103, buf: 0x556a2ef71b88, 
count: 8) = 8
  9837.161 ( 0.001 ms): IO iothread1/620155 write(fd: 104, buf: 0x556a2ef71b88, 
count: 8) = 8
  9837.163 ( 0.001 ms): IO iothread1/620155 ppoll(ufds: 0x7fcb90002800, nfds: 
4, tsp: 0x7fcb9f1342d0, sigsetsize: 8) = 3
  9837.164 ( 0.001 ms): IO iothread1/620155 read(fd: 107, buf: 0x7fcb9f939cc0, 
count: 512)= 8
  9837.174 ( 0.001 ms): IO iothread1/620155 read(fd: 105, buf: 0x7fcb9f939cc0, 
count: 512)= 8
  9837.176 ( 0.001 ms): IO iothread1/620155 read(fd: 106, buf: 0x7fcb9f939cc0, 
count: 512)= 8
  9837.209 ( 0.035 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, 
nr: 32, iocbpp: 0x7fca7d0cebe0)= 32

After: 174k IOPS (+3.6%), IOThread syscalls:

  9809.566 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, 
nr: 32, iocbpp: 0x7fd0cdd62be0)= 32
  9809.625 ( 0.001 ms): IO iothread1/623061 write(fd: 103, buf: 0x5647cfba5f58, 
count: 8) = 8
  9809.627 ( 0.002 ms): IO iothread1/623061 write(fd: 104, buf: 0x5647cfba5f58, 
count: 8) = 8
  9809.663 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, 
nr: 32, iocbpp: 0x7fd0d0388b50)= 32

Notice that ppoll(2) and eventfd read(2) syscalls are eliminated because
the IOThread stays in polling mode instead of falling back to file
descriptor monitoring.

As usual, polling is not implemented on Windows so this patch ignores
the new io_poll_ready() callback in aio-win32.c.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/aio.h  |  4 +-
 util/aio-posix.h |  1 +
 block/curl.c | 11 ++---
 block/export/fuse.c  |  4 +-
 block/io_uring.c | 19 +
 block/iscsi.c|  4 +-
 block/linux-aio.c| 16 +---
 block/nfs.c  |  6 +--
 block/nvme.c | 51 +++
 block/ssh.c  |  4 +-
 block/win32-aio.c|  4 +-
 hw/virtio/virtio.c   | 16 +---
 hw/xen/xen-bus.c |  6 +--
 io/channel-command.c |  6 ++-
 io/channel-file.c|  3 +-
 io/channel-socket.c  |  3 +-
 migration/rdma.c |  8 ++--
 tests/unit/test-aio.c|  4 +-
 util/aio-posix.c | 89 ++--
 util/aio-win32.c |  4 +-
 util/async.c | 10 -
 util/main-loop.c |  4 +-
 util/qemu-coroutine-io.c |  5 ++-
 util/vhost-user-server.c | 11 ++---
 24 files changed, 191 insertions(+), 102 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index 47fbe9d81f..5634173b12 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -469,6 +469,7 @@ void aio_set_fd_handler(AioContext *ctx,
 IOHandler *io_read,
 IOHandler *io_write,
 AioPollFn *io_poll,
+IOHandler *io_poll_ready,
 void *opaque);
 
 /* Set polling begin/end callbacks for a file descriptor that has already been
@@ -490,7 +491,8 @@ void aio_set_event_notifier(AioContext *ctx,
 EventNotifier *notifier,
 bool is_external,
 EventNotifierHandler *io_read,
-AioPollFn *io_poll);
+AioPollFn *io_poll,
+EventNotifierHandler *io_poll_ready);
 
 /* Set polling begin/end callbacks for an event notifier that has already been
  * registered with aio_set_event_notifier.  Do nothing if the event notifier is
diff --git a/util/aio-posix.h b/util/aio-posix.h
index c80c04506a..7f2c37a684 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -24,6 +24,7 @@ struct AioHandler {
 IOHandler *io_read;
 IOHandler *io_write;
 AioPollFn *io_poll;
+IOHand

[PATCH v3 6/6] virtio: unify dataplane and non-dataplane ->handle_output()

Now that virtio-blk and virtio-scsi are ready, get rid of
the handle_aio_output() callback. It's no longer needed.

Signed-off-by: Stefan Hajnoczi 
---
 include/hw/virtio/virtio.h  |  4 +--
 hw/block/dataplane/virtio-blk.c | 16 ++
 hw/scsi/virtio-scsi-dataplane.c | 54 -
 hw/virtio/virtio.c  | 32 +--
 4 files changed, 26 insertions(+), 80 deletions(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index b90095628f..f095637058 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -316,8 +316,8 @@ bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
 void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled);
 void virtio_queue_host_notifier_read(EventNotifier *n);
-void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
-VirtIOHandleOutput handle_output);
+void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx);
+void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx);
 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
 VirtQueue *virtio_vector_next_queue(VirtQueue *vq);
 
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index a2fa407b98..49276e46f2 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -154,17 +154,6 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
 g_free(s);
 }
 
-static void virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
-VirtQueue *vq)
-{
-VirtIOBlock *s = (VirtIOBlock *)vdev;
-
-assert(s->dataplane);
-assert(s->dataplane_started);
-
-virtio_blk_handle_vq(s, vq);
-}
-
 /* Context: QEMU global mutex held */
 int virtio_blk_data_plane_start(VirtIODevice *vdev)
 {
@@ -258,8 +247,7 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 for (i = 0; i < nvqs; i++) {
 VirtQueue *vq = virtio_get_queue(s->vdev, i);
 
-virtio_queue_aio_set_host_notifier_handler(vq, s->ctx,
-virtio_blk_data_plane_handle_output);
+virtio_queue_aio_attach_host_notifier(vq, s->ctx);
 }
 aio_context_release(s->ctx);
 return 0;
@@ -302,7 +290,7 @@ static void virtio_blk_data_plane_stop_bh(void *opaque)
 for (i = 0; i < s->conf->num_queues; i++) {
 VirtQueue *vq = virtio_get_queue(s->vdev, i);
 
-virtio_queue_aio_set_host_notifier_handler(vq, s->ctx, NULL);
+virtio_queue_aio_detach_host_notifier(vq, s->ctx);
 }
 }
 
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 76137de67f..29575cbaf6 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -49,45 +49,6 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
 }
 }
 
-static void virtio_scsi_data_plane_handle_cmd(VirtIODevice *vdev,
-  VirtQueue *vq)
-{
-VirtIOSCSI *s = VIRTIO_SCSI(vdev);
-
-virtio_scsi_acquire(s);
-if (!s->dataplane_fenced) {
-assert(s->ctx && s->dataplane_started);
-virtio_scsi_handle_cmd_vq(s, vq);
-}
-virtio_scsi_release(s);
-}
-
-static void virtio_scsi_data_plane_handle_ctrl(VirtIODevice *vdev,
-   VirtQueue *vq)
-{
-VirtIOSCSI *s = VIRTIO_SCSI(vdev);
-
-virtio_scsi_acquire(s);
-if (!s->dataplane_fenced) {
-assert(s->ctx && s->dataplane_started);
-virtio_scsi_handle_ctrl_vq(s, vq);
-}
-virtio_scsi_release(s);
-}
-
-static void virtio_scsi_data_plane_handle_event(VirtIODevice *vdev,
-VirtQueue *vq)
-{
-VirtIOSCSI *s = VIRTIO_SCSI(vdev);
-
-virtio_scsi_acquire(s);
-if (!s->dataplane_fenced) {
-assert(s->ctx && s->dataplane_started);
-virtio_scsi_handle_event_vq(s, vq);
-}
-virtio_scsi_release(s);
-}
-
 static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
@@ -112,10 +73,10 @@ static void virtio_scsi_dataplane_stop_bh(void *opaque)
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
 int i;
 
-virtio_queue_aio_set_host_notifier_handler(vs->ctrl_vq, s->ctx, NULL);
-virtio_queue_aio_set_host_notifier_handler(vs->event_vq, s->ctx, NULL);
+virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
+virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
 for (i = 0; i < vs->conf.num_queues; i++) {
-virtio_queue_aio_set_host_notifier_handler(vs->cmd_vqs[i], s->ctx, 
NULL);
+virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
 }
 }
 
@@ -176,14 +137,11 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
 memory_region_transaction_commit();
 
 aio_context_acquir

Re: [RFC v3 0/4] tls: add macros for coroutine-safe TLS variables

On Mon, Dec 06, 2021 at 02:34:45PM +, Peter Maydell wrote:
> On Mon, 6 Dec 2021 at 14:33, Stefan Hajnoczi  wrote:
> >
> > v3:
> > - Added __attribute__((weak)) to get_ptr_*() [Florian]
> 
> Do we really need it *only* on get_ptr_*() ? If we need to
> noinline the other two we probably also should use the same
> attribute weak to force no optimizations at all.

I don't know but it does seem safer to use weak in all cases.

Florian and others?

Stefan


signature.asc
Description: PGP signature

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

On Tue, 7 Dec 2021 at 13:05, Damien Hedde  wrote:
> On 12/7/21 13:45, Philippe Mathieu-Daudé wrote:
> > On 12/7/21 10:44, Damien Hedde wrote:
> >> According to the "Arm Generic Interrupt Controller Architecture
> >> Specification GIC architecture version 3 and 4" (version G: page 345
> >> for aarch64 or 509 for aarch32):
> >> LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
> >> ICH_HCR.EOIcount is non-zero.
> >>
> >> When only LRENPIE was set (and EOI count was zero), the LRENP bit was
> >> wrongly set and MISR value was wrong.
> >>
> >> As an additional consequence, if an hypervisor set ICH_HCR.LRENPIE,
> >> the maintenance interrupt was constantly fired. It happens since patch
> >> 9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
> >> which fixed another bug about maintenance interrupt (most significant
> >> bits of misr, including this one, were ignored in the interrupt trigger).
> >>
> >> Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system 
> >> registers")
> >
> > This commit predates 6.1 release, so technically this is not
> > a regression for 6.2.
>
> Do you mean "Fixes:" is meant only for regression or simply that this
> patch should not go for 6.2 ?

Fixes: is fine in all situations where the commit is fixing
a bug that was introduced in the commit hash it mentions.

Separately, given where we are in the release cycle, a patch has
to hit a very high bar to go into 6.2: at least "this breaks
a real world use case that worked fine in 6.1", and probably also
"a use case that we expect a fair number of users to be using".

-- PMM

Re: [PATCH v2 0/1] migration: multifd live migration improvement

2021-12-07 Thread Li Zhang




On 12/6/21 8:54 PM, Dr. David Alan Gilbert wrote:

* Li Zhang (lizh...@suse.de) wrote:

When testing live migration with multifd channels (8, 16, or a bigger number)
and using qemu -incoming (without "defer"), if a network error occurs
(for example, triggering the kernel SYN flooding detection),
the migration fails and the guest hangs forever.

The test environment and the command line is as the following:

QEMU version: QEMU emulator version 6.2.91 (v6.2.0-rc1-47-gc5fbdd60cf)
Host OS: SLE 15  with kernel: 5.14.5-1-default
Network Card: mlx5 100Gbps
Network card: Intel Corporation I350 Gigabit (1Gbps)

Source:
qemu-system-x86_64 -M q35 -smp 32 -nographic \
 -serial telnet:10.156.208.153:4321,server,nowait \
 -m 4096 -enable-kvm -hda /var/lib/libvirt/images/openSUSE-15.3.img \
 -monitor stdio
Dest:
qemu-system-x86_64 -M q35 -smp 32 -nographic \
 -serial telnet:10.156.208.154:4321,server,nowait \
 -m 4096 -enable-kvm -hda /var/lib/libvirt/images/openSUSE-15.3.img \
 -monitor stdio \
 -incoming tcp:1.0.8.154:4000

(qemu) migrate_set_parameter max-bandwidth 100G
(qemu) migrate_set_capability multifd on
(qemu) migrate_set_parameter multifd-channels 16

The guest hangs when executing the command: migrate -d tcp:1.0.8.154:4000.

If a network problem happens, TCP ACK is not received by destination
and the destination resets the connection with RST.

No.  Time      Source     Destination  Protocol  Length  Info
119  1.021169  1.0.8.153  1.0.8.154    TCP       1410    60166 → 4000 [PSH, ACK] Seq=65 Ack=1 Win=62720 Len=1344 TSval=1338662881 TSecr=1399531897
No.  Time      Source     Destination  Protocol  Length  Info
125  1.021181  1.0.8.154  1.0.8.153    TCP       54      4000 → 60166 [RST] Seq=1 Win=0 Len=0

kernel log:
[334520.229445] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334562.994919] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334695.519927] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334734.689511] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[335687.740415] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[335730.013598] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.

Should we document somewhere how to avoid that?  Is there something we
should be doing in the connection code to avoid it?


We should use the command line -incoming defer in QEMU command line 
instead of -incoming ip:port.


And the backlog of the socket will be set as the same as  multifd 
channels,  this problem doesn't happen as far as I test.


If we use --incoming ip:port in the QEMU command line, the backlog of 
the socket is always 1, it will cause the SYN flooding.





Dave


There are two problems here:
1. On the send side, the main thread is blocked on qemu_thread_join and
send threads are blocked on sendmsg
2. On receive side, the receive threads are blocked on qemu_sem_wait to
wait for a semaphore.

The patch is to fix the first problem, and the guest doesn't hang any more.
But there is no better solution to fix the second problem yet.

Li Zhang (1):
   multifd: Shut down the QIO channels to avoid blocking the send threads
 when they are terminated.

  migration/multifd.c | 3 +++
  1 file changed, 3 insertions(+)

--
2.31.1

Re: [PATCH v2 1/1] multifd: Shut down the QIO channels to avoid blocking the send threads when they are terminated.

2021-12-07 Thread Li Zhang




On 12/6/21 8:50 PM, Dr. David Alan Gilbert wrote:

* Li Zhang (lizh...@suse.de) wrote:

Thanks for Daniel's review.

Hi David and Juan,

Any comments for this patch?


Yeh I think that's OK, so

Reviewed-by: Dr. David Alan Gilbert 

I'd have a slight preference for it being before the post I think.


Thanks.



Dave


Thanks

Li

On 12/3/21 12:55 PM, Li Zhang wrote:

When doing live migration with multifd channels 8, 16 or larger number,
the guest hangs in the presence of the network errors such as missing TCP ACKs.

At sender's side:
The main thread is blocked on qemu_thread_join, migration_fd_cleanup
is called because one thread fails on qio_channel_write_all when
the network problem happens and other send threads are blocked on sendmsg.
They could not be terminated. So the main thread is blocked on qemu_thread_join
to wait for the threads terminated.

(gdb) bt
0  0x7f30c8dcffc0 in __pthread_clockjoin_ex () at /lib64/libpthread.so.0
1  0x55cbb716084b in qemu_thread_join (thread=0x55cbb881f418) at 
../util/qemu-thread-posix.c:627
2  0x55cbb6b54e40 in multifd_save_cleanup () at ../migration/multifd.c:542
3  0x55cbb6b4de06 in migrate_fd_cleanup (s=0x55cbb8024000) at 
../migration/migration.c:1808
4  0x55cbb6b4dfb4 in migrate_fd_cleanup_bh (opaque=0x55cbb8024000) at 
../migration/migration.c:1850
5  0x55cbb7173ac1 in aio_bh_call (bh=0x55cbb7eb98e0) at ../util/async.c:141
6  0x55cbb7173bcb in aio_bh_poll (ctx=0x55cbb7ebba80) at ../util/async.c:169
7  0x55cbb715ba4b in aio_dispatch (ctx=0x55cbb7ebba80) at 
../util/aio-posix.c:381
8  0x55cbb7173ffe in aio_ctx_dispatch (source=0x55cbb7ebba80, callback=0x0, 
user_data=0x0) at ../util/async.c:311
9  0x7f30c9c8cdf4 in g_main_context_dispatch () at 
/usr/lib64/libglib-2.0.so.0
10 0x55cbb71851a2 in glib_pollfds_poll () at ../util/main-loop.c:232
11 0x55cbb718521c in os_host_main_loop_wait (timeout=42251070366) at 
../util/main-loop.c:255
12 0x55cbb7185321 in main_loop_wait (nonblocking=0) at 
../util/main-loop.c:531
13 0x55cbb6e6ba27 in qemu_main_loop () at ../softmmu/runstate.c:726
14 0x55cbb6ad6fd7 in main (argc=68, argv=0x7ffc0c57, 
envp=0x7ffc0c578ab0) at ../softmmu/main.c:50

To make sure that the send threads could be terminated, IO channels should be
shut down to avoid waiting IO.

Signed-off-by: Li Zhang 
---
   migration/multifd.c | 3 +++
   1 file changed, 3 insertions(+)

diff --git a/migration/multifd.c b/migration/multifd.c
index 7c9deb1921..33f8287969 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -523,6 +523,9 @@ static void multifd_send_terminate_threads(Error *err)
   qemu_mutex_lock(&p->mutex);
   p->quit = true;
   qemu_sem_post(&p->sem);
+if (p->c) {
+qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
   qemu_mutex_unlock(&p->mutex);
   }
   }

Re: [RFC v3 0/4] tls: add macros for coroutine-safe TLS variables

On Mon, Dec 06, 2021 at 02:34:45PM +, Peter Maydell wrote:
> On Mon, 6 Dec 2021 at 14:33, Stefan Hajnoczi  wrote:
> >
> > v3:
> > - Added __attribute__((weak)) to get_ptr_*() [Florian]
> 
> Do we really need it *only* on get_ptr_*() ? If we need to
> noinline the other two we probably also should use the same
> attribute weak to force no optimizations at all.

The weak attribute can't be used on static functions, so I think we need
a different approach:

In file included from ../util/async.c:35:
/builds/stefanha/qemu/include/qemu/coroutine-tls.h:201:11: error: weak 
declaration of 'get_ptr_my_aiocontext' must be public
 type *get_ptr_##var(void)\
   ^~~~
../util/async.c:673:1: note: in expansion of macro 'QEMU_DEFINE_STATIC_CO_TLS'
 QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
 ^

Adding asm volatile("") seems to work though:
https://godbolt.org/z/3hn8Gh41d

The GCC documentation mentions combining noinline with asm(""):
https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noinline-function-attribute

Stefan


signature.asc
Description: PGP signature

Re: [RFC v3 0/4] tls: add macros for coroutine-safe TLS variables

On Tue, 7 Dec 2021 at 13:53, Stefan Hajnoczi  wrote:
>
> On Mon, Dec 06, 2021 at 02:34:45PM +, Peter Maydell wrote:
> > On Mon, 6 Dec 2021 at 14:33, Stefan Hajnoczi  wrote:
> > >
> > > v3:
> > > - Added __attribute__((weak)) to get_ptr_*() [Florian]
> >
> > Do we really need it *only* on get_ptr_*() ? If we need to
> > noinline the other two we probably also should use the same
> > attribute weak to force no optimizations at all.
>
> The weak attribute can't be used on static functions, so I think we need
> a different approach:
>
> In file included from ../util/async.c:35:
> /builds/stefanha/qemu/include/qemu/coroutine-tls.h:201:11: error: weak 
> declaration of 'get_ptr_my_aiocontext' must be public
>  type *get_ptr_##var(void)
> \
>^~~~
> ../util/async.c:673:1: note: in expansion of macro 'QEMU_DEFINE_STATIC_CO_TLS'
>  QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
>  ^
>
> Adding asm volatile("") seems to work though:
> https://godbolt.org/z/3hn8Gh41d

You can see in the clang disassembly there that this isn't
sufficient. The compiler puts in both calls, but it ignores
the return results and always returns "true" from the function.

-- PMM

Re: [PATCH v1 2/2] osdep: support mempolicy for preallocation in os_mem_prealloc

2021-12-07 Thread Daniil Tatianin

I believe you're right. Looking at the implementation of shmem_alloc_page, it uses the inode policy, which is set via vma->set_policy (from the mbind() call in this case). set_mempolicy is both useless and redundant here, as thread's policy is only ever used in case vma->get_policy returns NULL (which it doesn't in our case).

Sorry for the confusion.

Thanks,
Daniil

07.12.2021, 11:13, "David Hildenbrand" :

On 07.12.21 08:06, Daniil Tatianin wrote:

This is needed for cases where we want to make sure that a shared memory region gets allocated from a specific NUMA node. This is impossible to do with mbind(2) because it ignores the policy for memory mapped with MAP_SHARED. We work around this by calling set_mempolicy from prealloc threads instead.

That's not quite true I think, how were you able to observe this? Do you have a reproducer?

While the man page says:

"The specified policy will be ignored for any MAP_SHARED mappings in the specified memory range. Rather the pages will be allocated according to the memory policy of the thread that caused the page to be allocated. Again, this may not be the thread that called mbind()."

What it really means is that as long as we access that memory via the *VMA* for which we called mbind(), which is the case when *not* using fallocate() to preallocate memory, we end up using the correct policy.

I did experiments a while ago with hugetlbfs shared memory and it properly allocated from the right node. So I'd be curious how you trigger this.

--
Thanks,
David / dhildenb

Re: [PATCH 07/14] ppc/pnv: Introduce a num_pecs class attribute for PHB4 PEC devices





On 07/12/2021 11:45, Cédric Le Goater wrote:

On 12/7/21 11:00, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

POWER9 processor comes with 3 PHB4 PECs (PCI Express Controller) and
each PEC can have several PHBs :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

A num_pecs class attribute represents better the logic units of the
POWER9 chip. Use that instead of num_phbs which fits POWER8 chips.
This will ease adding support for user created devices.

Signed-off-by: Cédric Le Goater 
---


With this patch, chip->num_phbs is only defined and used on P8. We may 
want to add a comment to make it clear.


Yes.

With the latest changes, I think we can now move num_phbs under PnvChip8
and num_pecs under PnvChip9 since they are only used in these routines :

P8:
     static void pnv_chip_power8_instance_init(Object *obj)
     chip->num_phbs = pcc->num_phbs;
     for (i = 0; i < chip->num_phbs; i++) {

     static void pnv_chip_power8_realize(DeviceState *dev, Error **errp)
     for (i = 0; i < chip->num_phbs; i++) {
P9:
     static void pnv_chip_power9_instance_init(Object *obj)
     chip->num_pecs = pcc->num_pecs;
     for (i = 0; i < chip->num_pecs; i++) {

     static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
     for (i = 0; i < chip->num_pecs; i++) {

As I review this series, something is bugging me though: the 
difference of handling between P8 and P9.

On P9, we seem to have a more logical hierarchy:
phb <- PCI controller (PEC) <- chip


Yes. It's cleaner than P8 in terms of logic. P8 initial support was
done hastily for skiboot bringup in 2014.

With P8, we don't have an explicit PEC, but we have a PBCQ object, 
which is somewhat similar. The hierarchy seems also more convoluted.


But we don't have stacks on P8. Do we ?



Stacks were introduced on P9 because all the lanes handled by a PEC 
could be grouped differently, each group being called a stack. And each 
stack is associated to a PHB.
On P8, there's no such split, so the doc didn't mention stacks. But each 
PEC handles exactly one PHB. So we could still keep the same abstractions.
On all chips, a PEC would really be equal to a pbcq interface to the 
power bus. The pbcq is servicing one (on P8) or more (on P9/P10) PHBs.




I don't see why it's treated differently. It seems both chips could be 
treated the same, which would make the code easier to follow.


I agree. Daniel certainly would also :)

That's outside of the scope of this series though. 


Well, this patchset enables libvirt support for the PowerNV machines.
Once this is pushed, we need to keep the API, the object model names
being part of it.

7.0 is a good time for a change, really. After that, we won't be able
to change the QOM hierarchy that much.


So maybe for a future patch? Who knows, I might volunteer...


You would introduce a phb3-pec on top of the phb3s ?



Or rename pnv_phb3_pbcq.c to pnv_phb3_pec.c and starts from there. 
Conceptually, the TYPE_PNV_PBCQ and TYPE_PNV_PHB4_PEC_STACK objects seem 
close. But that's easy to say in an email...


  Fred



Let me send a v2 first and maybe we could rework the object hierarchy
in the 7.0 time frame. We don't have to merge this ASAP.

Thanks,

C.

Re: [PATCH v7 1/7] net/vmnet: add vmnet dependency and customizable option

2021-12-07 Thread Markus Armbruster

No cover letter?

Re: [PATCH v2 0/1] migration: multifd live migration improvement

2021-12-07 Thread Daniel P . Berrangé

On Tue, Dec 07, 2021 at 02:45:10PM +0100, Li Zhang wrote:
> 
> On 12/6/21 8:54 PM, Dr. David Alan Gilbert wrote:
> > * Li Zhang (lizh...@suse.de) wrote:
> > > When testing live migration with multifd channels (8, 16, or a bigger 
> > > number)
> > > and using qemu -incoming (without "defer"), if a network error occurs
> > > (for example, triggering the kernel SYN flooding detection),
> > > the migration fails and the guest hangs forever.
> > > 
> > > The test environment and the command line is as the following:
> > > 
> > > QEMU version: QEMU emulator version 6.2.91 (v6.2.0-rc1-47-gc5fbdd60cf)
> > > Host OS: SLE 15  with kernel: 5.14.5-1-default
> > > Network Card: mlx5 100Gbps
> > > Network card: Intel Corporation I350 Gigabit (1Gbps)
> > > 
> > > Source:
> > > qemu-system-x86_64 -M q35 -smp 32 -nographic \
> > >  -serial telnet:10.156.208.153:4321,server,nowait \
> > >  -m 4096 -enable-kvm -hda 
> > > /var/lib/libvirt/images/openSUSE-15.3.img \
> > >  -monitor stdio
> > > Dest:
> > > qemu-system-x86_64 -M q35 -smp 32 -nographic \
> > >  -serial telnet:10.156.208.154:4321,server,nowait \
> > >  -m 4096 -enable-kvm -hda 
> > > /var/lib/libvirt/images/openSUSE-15.3.img \
> > >  -monitor stdio \
> > >  -incoming tcp:1.0.8.154:4000
> > > 
> > > (qemu) migrate_set_parameter max-bandwidth 100G
> > > (qemu) migrate_set_capability multifd on
> > > (qemu) migrate_set_parameter multifd-channels 16
> > > 
> > > The guest hangs when executing the command: migrate -d tcp:1.0.8.154:4000.
> > > 
> > > If a network problem happens, TCP ACK is not received by destination
> > > and the destination resets the connection with RST.
> > > 
> > > No. TimeSource  Destination ProtocolLength  Info
> > > 119 1.0211691.0.8.153   1.0.8.154   TCP 1410
> > > 60166 → 4000 [PSH, ACK] Seq=65 Ack=1 Win=62720 Len=1344 TSval=1338662881 
> > > TSecr=1399531897
> > > No. TimeSource  Destination ProtocolLength  Info
> > > 125 1.0211811.0.8.154   1.0.8.153   TCP 54  
> > > 4000 → 60166 [RST] Seq=1 Win=0 Len=0
> > > 
> > > kernel log:
> > > [334520.229445] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > > [334562.994919] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > > [334695.519927] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > > [334734.689511] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > > [335687.740415] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > > [335730.013598] TCP: request_sock_TCP: Possible SYN flooding on port 
> > > 4000. Sending cookies.  Check SNMP counters.
> > Should we document somewhere how to avoid that?  Is there something we
> > should be doing in the connection code to avoid it?
> 
> We should use the command line -incoming defer in QEMU command line instead
> of -incoming ip:port.
> 
> And the backlog of the socket will be set as the same as  multifd channels, 
> this problem doesn't happen as far as I test.
> 
> If we use --incoming ip:port in the QEMU command line, the backlog of the
> socket is always 1, it will cause the SYN flooding.

Do we send migration parameters from the src to the dst QEMU ?

There are a bunch of things that we need to set to the same
value on the src and dst. If we sent any relevant MigrationParameters
fields to the dst, when the first/main migration channel is opened, it
could validate that it is configured in a way that is compatible with
the src. If it isn't, it can drop the main channel immediately. This
would trigger the src to fail the migration and we couldn't get stuck
setting up the secondary data channels for multifd.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

On Tue, 7 Dec 2021 at 09:44, Damien Hedde  wrote:
>
> According to the "Arm Generic Interrupt Controller Architecture
> Specification GIC architecture version 3 and 4" (version G: page 345
> for aarch64 or 509 for aarch32):
> LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
> ICH_HCR.EOIcount is non-zero.
>
> When only LRENPIE was set (and EOI count was zero), the LRENP bit was
> wrongly set and MISR value was wrong.
>
> As an additional consequence, if an hypervisor set ICH_HCR.LRENPIE,
> the maintenance interrupt was constantly fired. It happens since patch
> 9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
> which fixed another bug about maintenance interrupt (most significant
> bits of misr, including this one, were ignored in the interrupt trigger).
>
> Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system 
> registers")
> Signed-off-by: Damien Hedde 
> ---
> The gic doc is available here:
> https://developer.arm.com/documentation/ihi0069/g
>
> v2: identical resend because subject screw-up (sorry)

Reviewed-by: Peter Maydell 

I won't try to put this into 6.2 unless you have a common guest
that runs into this bug.

thanks
-- PMM

Re: [PATCH 07/14] ppc/pnv: Introduce a num_pecs class attribute for PHB4 PEC devices


On 12/7/21 15:03, Frederic Barrat wrote:



On 07/12/2021 11:45, Cédric Le Goater wrote:

On 12/7/21 11:00, Frederic Barrat wrote:



On 02/12/2021 15:42, Cédric Le Goater wrote:

POWER9 processor comes with 3 PHB4 PECs (PCI Express Controller) and
each PEC can have several PHBs :

   * PEC0 provides 1 PHB  (PHB0)
   * PEC1 provides 2 PHBs (PHB1 and PHB2)
   * PEC2 provides 3 PHBs (PHB3, PHB4 and PHB5)

A num_pecs class attribute represents better the logic units of the
POWER9 chip. Use that instead of num_phbs which fits POWER8 chips.
This will ease adding support for user created devices.

Signed-off-by: Cédric Le Goater 
---


With this patch, chip->num_phbs is only defined and used on P8. We may want to 
add a comment to make it clear.


Yes.

With the latest changes, I think we can now move num_phbs under PnvChip8
and num_pecs under PnvChip9 since they are only used in these routines :

P8:
 static void pnv_chip_power8_instance_init(Object *obj)
 chip->num_phbs = pcc->num_phbs;
 for (i = 0; i < chip->num_phbs; i++) {

 static void pnv_chip_power8_realize(DeviceState *dev, Error **errp)
 for (i = 0; i < chip->num_phbs; i++) {
P9:
 static void pnv_chip_power9_instance_init(Object *obj)
 chip->num_pecs = pcc->num_pecs;
 for (i = 0; i < chip->num_pecs; i++) {

 static void pnv_chip_power9_phb_realize(PnvChip *chip, Error **errp)
 for (i = 0; i < chip->num_pecs; i++) {


As I review this series, something is bugging me though: the difference of 
handling between P8 and P9.
On P9, we seem to have a more logical hierarchy:
phb <- PCI controller (PEC) <- chip


Yes. It's cleaner than P8 in terms of logic. P8 initial support was
done hastily for skiboot bringup in 2014.


With P8, we don't have an explicit PEC, but we have a PBCQ object, which is 
somewhat similar. The hierarchy seems also more convoluted.


But we don't have stacks on P8. Do we ?



Stacks were introduced on P9 because all the lanes handled by a PEC could be 
grouped differently, each group being called a stack. And each stack is 
associated to a PHB.
On P8, there's no such split, so the doc didn't mention stacks. But each PEC 
handles exactly one PHB. So we could still keep the same abstractions.
On all chips, a PEC would really be equal to a pbcq interface to the power bus. 
The pbcq is servicing one (on P8) or more (on P9/P10) PHBs.




I don't see why it's treated differently. It seems both chips could be treated 
the same, which would make the code easier to follow.


I agree. Daniel certainly would also :)

That's outside of the scope of this series though. 


Well, this patchset enables libvirt support for the PowerNV machines.
Once this is pushed, we need to keep the API, the object model names
being part of it.

7.0 is a good time for a change, really. After that, we won't be able
to change the QOM hierarchy that much.


So maybe for a future patch? Who knows, I might volunteer...


You would introduce a phb3-pec on top of the phb3s ?


Or rename pnv_phb3_pbcq.c to pnv_phb3_pec.c and starts from there. 
Conceptually, the TYPE_PNV_PBCQ and TYPE_PNV_PHB4_PEC_STACK objects seem close. 
But that's easy to say in an email...


It's a start.

Here is the PHB3 QOM tree :

   /pnv-phb3[0] (pnv-phb3)
  /lsi (ics)
  /msi (phb3-msi)
  /msi32[0] (memory-region)
  /msi64[0] (memory-region)
  /pbcq (pnv-pbcq)
/pbcq-mmio0[0] (memory-region)
/pbcq-mmio1[0] (memory-region)
/pbcq-phb[0] (memory-region)
/xscom-pbcq-nest-0.0[0] (memory-region)
/xscom-pbcq-pci-0.0[0] (memory-region)
/xscom-pbcq-spci-0.0[0] (memory-region)
  /pci-io[0] (memory-region)
  /pci-mmio[0] (memory-region)
  /pcie-mmcfg-mmio[0] (memory-region)
  /phb3-m32[0] (memory-region)
  /phb3-regs[0] (memory-region)
  /phb3_iommu[0] (pnv-phb3-iommu-memory-region)
  /root (pnv-phb3-root-port)
/bus master container[0] (memory-region)
/bus master[0] (memory-region)
/pci_bridge_io[0] (memory-region)
/pci_bridge_io[1] (memory-region)
/pci_bridge_mem[0] (memory-region)
/pci_bridge_pci[0] (memory-region)
/pci_bridge_pref_mem[0] (memory-region)
/pci_bridge_vga_io_hi[0] (memory-region)
/pci_bridge_vga_io_lo[0] (memory-region)
/pci_bridge_vga_mem[0] (memory-region)
/pcie.0 (PCIE)
  /root-bus (pnv-phb3-root-bus)

We would swap 'pnv-phb3' and 'pnv-pbcq' and rename it to 'pnv-phb3-pec'.
Looks good to me. This should clarify the relationship between objects.

I never like the back pointer to the phb under pbcq:

(qemu) qom-list /machine/chip[0]/pnv-phb3[0]/pbcq
type (string)
parent_bus (link)
realized (bool)
hotplugged (bool)
hotpluggable (bool)
pbcq-mmio0[0] (child)
xscom-pbcq-spci-0.0[0] (child)
xscom-pbcq-nest-0.0[0] (child)
pbcq-mmio1[0] (child)
phb (link)
pbcq-phb[0] (child)
xscom-pbcq-pci-0.

[PULL 0/1] tcg patch queue for 6.2

2021-12-07 Thread Richard Henderson

The following changes since commit 7635eff97104242d618400e4b6746d0a5c97af82:

  Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
staging (2021-12-06 11:18:06 -0800)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211207

for you to fetch changes up to b9537d5904f6e3df896264a6144883ab07db9608:

  tcg/arm: Reduce vector alignment requirement for NEON (2021-12-07 06:32:09 
-0800)


Fix stack spills for arm neon.


Richard Henderson (1):
  tcg/arm: Reduce vector alignment requirement for NEON

 tcg/tcg.c|  8 +++-
 tcg/arm/tcg-target.c.inc | 13 +
 2 files changed, 16 insertions(+), 5 deletions(-)

[PULL 1/1] tcg/arm: Reduce vector alignment requirement for NEON

2021-12-07 Thread Richard Henderson

With arm32, the ABI gives us 8-byte alignment for the stack.
While it's possible to realign the stack to provide 16-byte alignment,
it's far easier to simply not encode 16-byte alignment in the
VLD1 and VST1 instructions that we emit.

Remove the assertion in temp_allocate_frame, limit natural alignment
to the provided stack alignment, and add a comment.

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1999878
Reported-by: Richard W.M. Jones 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Message-Id: <20210912174925.200132-1-richard.hender...@linaro.org>
Message-Id: <20211206191335.230683-2-richard.hender...@linaro.org>
---
 tcg/tcg.c|  8 +++-
 tcg/arm/tcg-target.c.inc | 13 +
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 57f17a4649..934aa8510b 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3061,7 +3061,13 @@ static void temp_allocate_frame(TCGContext *s, TCGTemp 
*ts)
 g_assert_not_reached();
 }
 
-assert(align <= TCG_TARGET_STACK_ALIGN);
+/*
+ * Assume the stack is sufficiently aligned.
+ * This affects e.g. ARM NEON, where we have 8 byte stack alignment
+ * and do not require 16 byte vector alignment.  This seems slightly
+ * easier than fully parameterizing the above switch statement.
+ */
+align = MIN(TCG_TARGET_STACK_ALIGN, align);
 off = ROUND_UP(s->current_frame_offset, align);
 
 /* If we've exhausted the stack frame, restart with a smaller TB. */
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 633b8a37ba..9d322cdba6 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -2523,8 +2523,13 @@ static void tcg_out_ld(TCGContext *s, TCGType type, 
TCGReg arg,
 tcg_out_vldst(s, INSN_VLD1 | 0x7d0, arg, arg1, arg2);
 return;
 case TCG_TYPE_V128:
-/* regs 2; size 8; align 16 */
-tcg_out_vldst(s, INSN_VLD1 | 0xae0, arg, arg1, arg2);
+/*
+ * We have only 8-byte alignment for the stack per the ABI.
+ * Rather than dynamically re-align the stack, it's easier
+ * to simply not request alignment beyond that.  So:
+ * regs 2; size 8; align 8
+ */
+tcg_out_vldst(s, INSN_VLD1 | 0xad0, arg, arg1, arg2);
 return;
 default:
 g_assert_not_reached();
@@ -2543,8 +2548,8 @@ static void tcg_out_st(TCGContext *s, TCGType type, 
TCGReg arg,
 tcg_out_vldst(s, INSN_VST1 | 0x7d0, arg, arg1, arg2);
 return;
 case TCG_TYPE_V128:
-/* regs 2; size 8; align 16 */
-tcg_out_vldst(s, INSN_VST1 | 0xae0, arg, arg1, arg2);
+/* See tcg_out_ld re alignment: regs 2; size 8; align 8 */
+tcg_out_vldst(s, INSN_VST1 | 0xad0, arg, arg1, arg2);
 return;
 default:
 g_assert_not_reached();
-- 
2.25.1

Re: [PATCH v1 1/1] osdep: asynchronous teardown for shutdown on Linux

2021-12-07 Thread Halil Pasic

On Mon, 6 Dec 2021 11:47:55 +
Daniel P. Berrangé  wrote:

> On Mon, Dec 06, 2021 at 12:43:12PM +0100, Claudio Imbrenda wrote:
> > On Mon, 6 Dec 2021 11:21:10 +
> > Daniel P. Berrangé  wrote:
> >   
> > > On Mon, Dec 06, 2021 at 12:06:11PM +0100, Claudio Imbrenda wrote:  
> > > > This patch adds support for asynchronously tearing down a VM on Linux.
> > > > 
> > > > When qemu terminates, either naturally or because of a fatal signal,
> > > > the VM is torn down. If the VM is huge, it can take a considerable
> > > > amount of time for it to be cleaned up. In case of a protected VM, it
> > > > might take even longer than a non-protected VM (this is the case on
> > > > s390x, for example).
> > > > 
> > > > Some users might want to shut down a VM and restart it immediately,
> > > > without having to wait. This is especially true if management
> > > > infrastructure like libvirt is used.
> > > > 
> > > > This patch implements a simple trick on Linux to allow qemu to return
> > > > immediately, with the teardown of the VM being performed
> > > > asynchronously.
> > > > 
> > > > If the new commandline option -async-teardown is used, a new process is
> > > > spawned from qemu using the clone syscall, so that it will share its
> > > > address space with qemu.
> > > > 
> > > > The new process will then wait until qemu terminates, and then it will
> > > > exit itself.
> > > > 
> > > > This allows qemu to terminate quickly, without having to wait for the
> > > > whole address space to be torn down. The teardown process will exit
> > > > after qemu, so it will be the last user of the address space, and
> > > > therefore it will take care of the actual teardown.
> > > > 
> > > > The teardown process will share the same cgroups as qemu, so both
> > > > memory usage and cpu time will be accounted properly.
> > > 
> > > If this suggested workaround has any benefit to the shutdown of a VM
> > > with libvirt, then it is a bug in libvirt IMHO.
> > > 
> > > When libvirt tears down a QEMU VM, it should be waiting for *every*
> > > process in the VM's cgroup to be terminated before it reports that
> > > the VM is shutoff. IOW, the fact that this workaround lets the main
> > > QEMU process exit quickly should not matter. libvirt should still
> > > be blocked in exactly the same place in its code, waiting for the
> > > "async" cleanup process to exit. IOW, this should not be async at
> > > all from libvirt's POV.  
> > 
> > interesting, I did not know that about libvirt.
> > 
> > maybe libvirt could be fixed/improved to allow this patch to work?  
> 
> That would not be desirable. When libvirt reports a VM as shutoff,
> it is expected that all resources associated with the VM have been
> fully released, such that they are available for launching a new
> VM.  We can't allow resources to be asynchronously released as that
> violates app's expectation that the resources are released once the
> VM is shutoff.

I do see your point. But I believe, a part of the problem is that
currently 'can start VM again' and 'all resources associated with
the previous run of the VM were cleaned up' are tied together. And
intuitively it makes a ton of sense. It is just that due to certain
reasons complete shutdown with complete cleanup takes way too long. So
we are looking for a solution to decouple the two. I believe complete
cleanup is inherently hard, so we should not hope to solve that. Do you
agree?

Under the assumption that we won't be able to make the cleanup (making
all the resources available again) fast enough, I believe the only way
forward is to come up with a solution where, if the user explicitly says
so, the assumption you just laid out no longer has to hold.
Maybe something like enlightening the management software about this
'non-interchangeable resources are released, interchangeable resources
not yet fully released' state and add something like a 'force-start' 
operation, where the user explicitly opts-into potentially consuming more
resources (because of the overlap) for less downtime.

What do you think?

> 
> > surely without this patch an asynchronous teardown will not be possible
> > at all  
> 
> I appreciate that the current slow teardown is a pain, but async
> teardown does not sound like an appealing alternative given that
> the app can't use the resources again until the teardown is
> complete.

I don't fully agree with this. I think this statement disregards that
some resources are non-interchangeable in a sense that we need the exact
same resource free, while other resources are interchangeable in a sense
that we don't care which instance we get as long as we get enough
instances from a certain class. When we stop a VM and then start the same
VM again, we don't expect to get the same chunks of memory we had before,
but we just allocate new memory.

Yes we may run into trouble, but we may not. As long as we don't just
change this under the hood, but make it an option that somebody has
to consciously choose

Re: [PATCH v1 2/2] osdep: support mempolicy for preallocation in os_mem_prealloc

2021-12-07 Thread David Hildenbrand

On 07.12.21 14:58, Daniil Tatianin wrote:
> I believe you're right. Looking at the implementation of
> shmem_alloc_page, it uses the inode policy, which is set via
> vma->set_policy (from the mbind() call in this case). set_mempolicy is
> both useless and redundant here, as thread's
> policy is only ever used in case vma->get_policy returns NULL (which it
> doesn't in our case).
> Sorry for the confusion.

Hi Daniil,

not an issue, the man page is really confusing ... so I was similarly
confused a few months ago until I actually started to dig :)

-- 
Thanks,

David / dhildenb

RE: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

2021-12-07 Thread Brian Cain

> -Original Message-
> From: Qemu-devel 
> On Behalf Of Peter Maydell
...
> On Tue, 7 Dec 2021 at 09:44, Damien Hedde 
> wrote:
> >
> > According to the "Arm Generic Interrupt Controller Architecture
> > Specification GIC architecture version 3 and 4" (version G: page 345
> > for aarch64 or 509 for aarch32):
> > LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
> > ICH_HCR.EOIcount is non-zero.
> >
> > When only LRENPIE was set (and EOI count was zero), the LRENP bit was
> > wrongly set and MISR value was wrong.
> >
> > As an additional consequence, if an hypervisor set ICH_HCR.LRENPIE,
> > the maintenance interrupt was constantly fired. It happens since patch
> > 9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
> > which fixed another bug about maintenance interrupt (most significant
> > bits of misr, including this one, were ignored in the interrupt trigger).
> >
> > Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system
> registers")
> > Signed-off-by: Damien Hedde 
> > ---
> > The gic doc is available here:
> > https://developer.arm.com/documentation/ihi0069/g
> >
> > v2: identical resend because subject screw-up (sorry)
> 
> Reviewed-by: Peter Maydell 
> 
> I won't try to put this into 6.2 unless you have a common guest
> that runs into this bug.

Peter,

I know that Qualcomm encounters this issue with its hypervisor 
(https://github.com/quic/gunyah-hypervisor).  Apologies for not being familiar 
-- "common guest" means multiple guest systems/OSs that encounter the issue?  
Does that mean that it would not suffice to demonstrate the issue for the one 
known case?

-Brian

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

On 12/7/21 15:21, Peter Maydell wrote:

On Tue, 7 Dec 2021 at 09:44, Damien Hedde  wrote:

According to the "Arm Generic Interrupt Controller Architecture
Specification GIC architecture version 3 and 4" (version G: page 345
for aarch64 or 509 for aarch32):
LRENP bit of ICH_MISR is set when ICH_HCR.LRENPIE==1 and
ICH_HCR.EOIcount is non-zero.

When only LRENPIE was set (and EOI count was zero), the LRENP bit was
wrongly set and MISR value was wrong.

As an additional consequence, if an hypervisor set ICH_HCR.LRENPIE,
the maintenance interrupt was constantly fired. It happens since patch
9cee1efe92 ("hw/intc: Set GIC maintenance interrupt level to only 0 or 1")
which fixed another bug about maintenance interrupt (most significant
bits of misr, including this one, were ignored in the interrupt trigger).

Fixes: 83f036fe3d ("hw/intc/arm_gicv3: Add accessors for ICH_ system registers")
Signed-off-by: Damien Hedde 
---
The gic doc is available here:
https://developer.arm.com/documentation/ihi0069/g

v2: identical resend because subject screw-up (sorry)

Reviewed-by: Peter Maydell 

I won't try to put this into 6.2 unless you have a common guest
that runs into this bug.

thanks
-- PMM

I don't know if this fits into "common guest" but my use case is:

> ./build/qemu-system-aarch64 \
> -machine virt,virtualization=on,gic-version=3,highmem=off  \
> -cpu max -m size=4G -smp cpus=8 -nographic  \
> -kernel hypvm.elf   \
> -device loader,file=Image,addr=0x4108  \
> -device loader,file=virt_512M.dtb,addr=0x4420

where Image is a buildroot compiled kernel and hypvm.elf is an 
hypervisor from qualcomm (https://github.com/quic/gunyah-hypervisor).

It boots fine on v6.0 or v6.1 but hangs on master.

It's the same hypervisor Brian is talking about.

Thanks,
Damien

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

On Tue, 7 Dec 2021 at 15:18, Brian Cain  wrote:
> Peter Maydell wrote:
> > I won't try to put this into 6.2 unless you have a common guest
> > that runs into this bug.

> I know that Qualcomm encounters this issue with its hypervisor 
> (https://github.com/quic/gunyah-hypervisor).  Apologies for not being 
> familiar -- "common guest" means multiple guest systems/OSs that encounter 
> the issue?  Does that mean that it would not suffice to demonstrate the issue 
> for the one known case?

It means "if you see this with a Linux, BSD etc guest that's
more important than if you see this with some oddball thing
nobody else is using and whose users aren't as likely to be
using release versions of QEMU rather than mainline".

The bug is a bug in any case and we'll fix it, it's just a
question of whether it meets the bar to go into 6.2, which is
hopefully going to have its final RC tagged today. If this
patch had arrived a week ago then the bar would have been
lower and it would definitely have gone in. As it is I have
to weigh up the chances of this change causing a regression
for eg KVM running on emulated QEMU.

thanks
-- PMM

RE: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

2021-12-07 Thread Brian Cain



> -Original Message-
> From: Peter Maydell 
...
> On Tue, 7 Dec 2021 at 15:18, Brian Cain  wrote:
> > Peter Maydell wrote:
> > > I won't try to put this into 6.2 unless you have a common guest
> > > that runs into this bug.
> 
> > I know that Qualcomm encounters this issue with its hypervisor
> (https://github.com/quic/gunyah-hypervisor).  Apologies for not being familiar
> -- "common guest" means multiple guest systems/OSs that encounter the
> issue?  Does that mean that it would not suffice to demonstrate the issue for
> the one known case?
> 
> It means "if you see this with a Linux, BSD etc guest that's
> more important than if you see this with some oddball thing
> nobody else is using and whose users aren't as likely to be
> using release versions of QEMU rather than mainline".

Ok, gotcha, thanks for the clarification :)

> The bug is a bug in any case and we'll fix it, it's just a
> question of whether it meets the bar to go into 6.2, which is
> hopefully going to have its final RC tagged today. If this
> patch had arrived a week ago then the bar would have been
> lower and it would definitely have gone in. As it is I have
> to weigh up the chances of this change causing a regression
> for eg KVM running on emulated QEMU.

I understand, and it sounds like the right call.

Thanks!
-Brian

Re: [PATCH v2 0/1] migration: multifd live migration improvement

2021-12-07 Thread Li Zhang




On 12/7/21 3:16 PM, Daniel P. Berrangé wrote:

On Tue, Dec 07, 2021 at 02:45:10PM +0100, Li Zhang wrote:

On 12/6/21 8:54 PM, Dr. David Alan Gilbert wrote:

* Li Zhang (lizh...@suse.de) wrote:

When testing live migration with multifd channels (8, 16, or a bigger number)
and using qemu -incoming (without "defer"), if a network error occurs
(for example, triggering the kernel SYN flooding detection),
the migration fails and the guest hangs forever.

The test environment and the command lines are as follows:

QEMU version: QEMU emulator version 6.2.91 (v6.2.0-rc1-47-gc5fbdd60cf)
Host OS: SLE 15  with kernel: 5.14.5-1-default
Network Card: mlx5 100Gbps
Network card: Intel Corporation I350 Gigabit (1Gbps)

Source:
qemu-system-x86_64 -M q35 -smp 32 -nographic \
  -serial telnet:10.156.208.153:4321,server,nowait \
  -m 4096 -enable-kvm -hda /var/lib/libvirt/images/openSUSE-15.3.img \
  -monitor stdio
Dest:
qemu-system-x86_64 -M q35 -smp 32 -nographic \
  -serial telnet:10.156.208.154:4321,server,nowait \
  -m 4096 -enable-kvm -hda /var/lib/libvirt/images/openSUSE-15.3.img \
  -monitor stdio \
  -incoming tcp:1.0.8.154:4000

(qemu) migrate_set_parameter max-bandwidth 100G
(qemu) migrate_set_capability multifd on
(qemu) migrate_set_parameter multifd-channels 16

The guest hangs when executing the command: migrate -d tcp:1.0.8.154:4000.

If a network problem happens, TCP ACK is not received by destination
and the destination resets the connection with RST.

No.  Time      Source     Destination  Protocol  Length  Info
119  1.021169  1.0.8.153  1.0.8.154    TCP       1410    60166 → 4000 [PSH, ACK] Seq=65 Ack=1 Win=62720 Len=1344 TSval=1338662881 TSecr=1399531897
No.  Time      Source     Destination  Protocol  Length  Info
125  1.021181  1.0.8.154  1.0.8.153    TCP       54      4000 → 60166 [RST] Seq=1 Win=0 Len=0

kernel log:
[334520.229445] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334562.994919] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334695.519927] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[334734.689511] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[335687.740415] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.
[335730.013598] TCP: request_sock_TCP: Possible SYN flooding on port 4000. 
Sending cookies.  Check SNMP counters.

Should we document somewhere how to avoid that?  Is there something we
should be doing in the connection code to avoid it?

We should use the command line -incoming defer in QEMU command line instead
of -incoming ip:port.

Then the backlog of the socket is set to the number of multifd channels,
and this problem doesn't happen as far as I have tested.

If we use --incoming ip:port in the QEMU command line, the backlog of the
socket is always 1, which causes the SYN flooding.

Do we send migration parameters from the src to the dst QEMU ?


No, I don't think we send migration parameters from the src to the dest 
QEMU.


I set migration parameters on both sides from the qemu monitor separately.


There are a bunch of things that we need to set to the same
value on the src and dst. If we sent any relevant MigrationParameters
fields to the dst, when the first/main migration channel is opened, it
could validate that it is configured in a way that is compatible with
the src. If it isn't, it can drop the main channel immediately. This
would trigger the src to fail the migration and we couldn't get stuck
setting up the secondary data channels for multifd.


OK, currently we have the same parameters on both sides only if we set
them to the same values ourselves.


If we use -incoming tcp:ip:port, the backlog is 1 when the socket is
created, because multifd is disabled by default.


Here is the function which set the backlog:

static void
socket_start_incoming_migration_internal(SocketAddress *saddr,
 Error **errp)
{
    QIONetListener *listener = qio_net_listener_new();
    MigrationIncomingState *mis = migration_incoming_get_current();
    size_t i;
    int num = 1;

    qio_net_listener_set_name(listener, "migration-socket-listener");

    if (migrate_use_multifd()) {
    num = migrate_multifd_channels();
    }
  ...

}

The process  with -incoming tcp:ip:port is as the following:

1.   Create qemu process with command line -incoming tcp:ip:port

2.   socket_start_incoming_migration_internal  is called and backlog is: 
num=1, multifd is disabled, num = migrate_multifd_channels() is not called


3.   Enable multifd and set multifd parameters, but the backlog is still 
1, because it can no longer be changed.


4.   Run migration

The process with -incoming defer is as the following:

1. Cr

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation

On Tue, 7 Dec 2021 at 15:24, Peter Maydell  wrote:
> The bug is a bug in any case and we'll fix it, it's just a
> question of whether it meets the bar to go into 6.2, which is
> hopefully going to have its final RC tagged today. If this
> patch had arrived a week ago then the bar would have been
> lower and it would definitely have gone in. As it is I have
> to weigh up the chances of this change causing a regression
> for eg KVM running on emulated QEMU.

Looking at the KVM source it doesn't ever set the LRENPIE
bit (it doesn't even have a #define for it), which both
explains why we didn't notice this bug before and also
means we can be pretty certain we're not going to cause a
regression for KVM at least if we fix it...

-- PMM

Re: [PATCH v2 for 6.2?] gicv3: fix ICH_MISR's LRENP computation





On 12/7/21 16:45, Peter Maydell wrote:

On Tue, 7 Dec 2021 at 15:24, Peter Maydell  wrote:

The bug is a bug in any case and we'll fix it, it's just a
question of whether it meets the bar to go into 6.2, which is
hopefully going to have its final RC tagged today. If this
patch had arrived a week ago then the bar would have been
lower and it would definitely have gone in. As it is I have
to weigh up the chances of this change causing a regression
for eg KVM running on emulated QEMU.


Looking at the KVM source it doesn't ever set the LRENPIE
bit (it doesn't even have a #define for it), which both
explains why we didn't notice this bug before and also
means we can be pretty certain we're not going to cause a
regression for KVM at least if we fix it...

-- PMM



We are perfectly fine with this not going into 6.2.

--
Damien

[PATCH] tests/docker: add libfuse3 development headers

The FUSE exports feature is not built because most container images do
not have libfuse3 development headers installed. Add the necessary
packages to the Dockerfiles.

Cc: Hanna Reitz 
Cc: Richard W.M. Jones 
Signed-off-by: Stefan Hajnoczi 
---
 tests/docker/dockerfiles/alpine.docker| 1 +
 tests/docker/dockerfiles/centos8.docker   | 1 +
 tests/docker/dockerfiles/fedora.docker| 1 +
 tests/docker/dockerfiles/opensuse-leap.docker | 1 +
 tests/docker/dockerfiles/ubuntu.docker| 1 +
 tests/docker/dockerfiles/ubuntu2004.docker| 1 +
 6 files changed, 6 insertions(+)

diff --git a/tests/docker/dockerfiles/alpine.docker 
b/tests/docker/dockerfiles/alpine.docker
index 7e6997e301..9ddb3c2ebc 100644
--- a/tests/docker/dockerfiles/alpine.docker
+++ b/tests/docker/dockerfiles/alpine.docker
@@ -12,6 +12,7 @@ ENV PACKAGES \
ccache \
coreutils \
curl-dev \
+   fuse3-dev \
g++ \
gcc \
git \
diff --git a/tests/docker/dockerfiles/centos8.docker 
b/tests/docker/dockerfiles/centos8.docker
index 7f135f8e8c..a2dae4be29 100644
--- a/tests/docker/dockerfiles/centos8.docker
+++ b/tests/docker/dockerfiles/centos8.docker
@@ -19,6 +19,7 @@ ENV PACKAGES \
 device-mapper-multipath-devel \
 diffutils \
 findutils \
+fuse3-devel \
 gcc \
 gcc-c++ \
 genisoimage \
diff --git a/tests/docker/dockerfiles/fedora.docker 
b/tests/docker/dockerfiles/fedora.docker
index c6fd7e1113..a3a712c87b 100644
--- a/tests/docker/dockerfiles/fedora.docker
+++ b/tests/docker/dockerfiles/fedora.docker
@@ -20,6 +20,7 @@ ENV PACKAGES \
 device-mapper-multipath-devel \
 diffutils \
 findutils \
+fuse3-devel \
 gcc \
 gcc-c++ \
 gcovr \
diff --git a/tests/docker/dockerfiles/opensuse-leap.docker 
b/tests/docker/dockerfiles/opensuse-leap.docker
index 3bbdb67f4f..2beb61bd7e 100644
--- a/tests/docker/dockerfiles/opensuse-leap.docker
+++ b/tests/docker/dockerfiles/opensuse-leap.docker
@@ -15,6 +15,7 @@ ENV PACKAGES \
 dbus-1 \
 diffutils \
 findutils \
+fuse3-devel \
 gcc \
 gcc-c++ \
 gcovr \
diff --git a/tests/docker/dockerfiles/ubuntu.docker 
b/tests/docker/dockerfiles/ubuntu.docker
index f0e0180d21..0c694a2bf0 100644
--- a/tests/docker/dockerfiles/ubuntu.docker
+++ b/tests/docker/dockerfiles/ubuntu.docker
@@ -29,6 +29,7 @@ ENV PACKAGES \
 libepoxy-dev \
 libfdt-dev \
 libffi-dev \
+libfuse3-dev \
 libgbm-dev \
 libgnutls28-dev \
 libgtk-3-dev \
diff --git a/tests/docker/dockerfiles/ubuntu2004.docker 
b/tests/docker/dockerfiles/ubuntu2004.docker
index 15a026be09..a46feaecdd 100644
--- a/tests/docker/dockerfiles/ubuntu2004.docker
+++ b/tests/docker/dockerfiles/ubuntu2004.docker
@@ -34,6 +34,7 @@ ENV PACKAGES \
 libepoxy-dev \
 libfdt-dev \
 libffi-dev \
+libfuse3-dev \
 libgbm-dev \
 libgcrypt20-dev \
 libglib2.0-dev \
-- 
2.33.1

Re: [PATCH] tests/docker: add libfuse3 development headers

2021-12-07 Thread Richard W.M. Jones

On Tue, Dec 07, 2021 at 04:00:25PM +, Stefan Hajnoczi wrote:
...
> diff --git a/tests/docker/dockerfiles/centos8.docker 
> b/tests/docker/dockerfiles/centos8.docker
> index 7f135f8e8c..a2dae4be29 100644
> --- a/tests/docker/dockerfiles/centos8.docker
> +++ b/tests/docker/dockerfiles/centos8.docker
> @@ -19,6 +19,7 @@ ENV PACKAGES \
>  device-mapper-multipath-devel \
>  diffutils \
>  findutils \
> +fuse3-devel \
>  gcc \
>  gcc-c++ \
>  genisoimage \

Just for my own notes, it took me a while to work out that CentOS 8
does have fuse3.  It didn't appear in EPEL 8 etc:

https://src.fedoraproject.org/rpms/fuse3
https://ci.centos.org/search/?q=fuse3

However it turns out it is built from a source package called "fuse"
(version 2.9.7!)  Also I am able to install fuse3 on RHEL 8.  So I
guess that's OK in the end.

The rest of the changes look good too, so:

Acked-by: Richard W.M. Jones 

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW

Re: [PATCH] spec: Add NBD_OPT_EXTENDED_HEADERS

2021-12-07 Thread Wouter Verhelst

On Mon, Dec 06, 2021 at 05:00:47PM -0600, Eric Blake wrote:
> On Mon, Dec 06, 2021 at 02:40:45PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > >    Simple reply message
> > > 
> > >   The simple reply message MUST be sent by the server in response to all
> > >   requests if structured replies have not been negotiated using
> > > -`NBD_OPT_STRUCTURED_REPLY`. If structured replies have been negotiated, 
> > > a simple
> > > -reply MAY be used as a reply to any request other than `NBD_CMD_READ`,
> > > -but only if the reply has no data payload.  The message looks as
> > > -follows:
> > > +`NBD_OPT_STRUCTURED_REPLY`. If structured replies have been
> > > +negotiated, a simple reply MAY be used as a reply to any request other
> > > +than `NBD_CMD_READ`, but only if the reply has no data payload.  If
> > > +extended headers were not negotiated using `NBD_OPT_EXTENDED_HEADERS`,
> > > +the message looks as follows:
> > > 
> > >   S: 32 bits, 0x67446698, magic (`NBD_SIMPLE_REPLY_MAGIC`; used to be
> > >  `NBD_REPLY_MAGIC`)
> > > @@ -369,6 +398,16 @@ S: 64 bits, handle
> > >   S: (*length* bytes of data if the request is of type `NBD_CMD_READ` and
> > >   *error* is zero)
> > > 
> > > +If extended headers were negotiated using `NBD_OPT_EXTENDED_HEADERS`,
> > > +the message looks like:
> > > +
> > > +S: 32 bits, 0x60d12fd6, magic (`NBD_SIMPLE_REPLY_EXT_MAGIC`)
> > > +S: 32 bits, error (MAY be zero)
> > > +S: 64 bits, handle
> > > +S: 128 bits, padding (MUST be zero)
> > > +S: (*length* bytes of data if the request is of type `NBD_CMD_READ` and
> > > +*error* is zero)
> > > +
> > 
> > If we go this way, let's put payload length into padding: it will help to 
> > make the protocol context-independent and less error-prone.

Agreed.

> Easy enough to do (the payload length will be 0 except for
> NBD_CMD_READ).

Indeed.

> > Or, the other way: maybe just forbid the payload for simple-64bit? What's
> > the reason to allow 64bit requests without structured reply negotiation?
> 
> The two happened to be orthogonal enough in my implementation.  It was
> easy to demonstrate either one without the other, and it IS easier to
> write a client using non-structured replies (structured reads ARE
> tougher than simple reads, even if it is less efficient when it comes
> to reading zeros).  But you are also right that we could require
> structured reads prior to allowing 64-bit operations, and then have
> only one supported reply type on the wire when negotiated.  Wouter,
> which way do you prefer?

Given that I *still* haven't gotten around to implementing structured
replies for nbd-server, I'd prefer not to require it, but that's not
really a decent argument IMO :-)

[... I haven't read this in much detail yet, intend to do that later...]

-- 
 w@uter.{be,co.za}
wouter@{grep.be,fosdem.org,debian.org}

Re: [RFC v3 0/4] tls: add macros for coroutine-safe TLS variables

On Tue, Dec 07, 2021 at 01:55:34PM +, Peter Maydell wrote:
> On Tue, 7 Dec 2021 at 13:53, Stefan Hajnoczi  wrote:
> >
> > On Mon, Dec 06, 2021 at 02:34:45PM +, Peter Maydell wrote:
> > > On Mon, 6 Dec 2021 at 14:33, Stefan Hajnoczi  wrote:
> > > >
> > > > v3:
> > > > - Added __attribute__((weak)) to get_ptr_*() [Florian]
> > >
> > > Do we really need it *only* on get_ptr_*() ? If we need to
> > > noinline the other two we probably also should use the same
> > > attribute weak to force no optimizations at all.
> >
> > The weak attribute can't be used on static functions, so I think we need
> > a different approach:
> >
> > In file included from ../util/async.c:35:
> > /builds/stefanha/qemu/include/qemu/coroutine-tls.h:201:11: error: weak 
> > declaration of 'get_ptr_my_aiocontext' must be public
> >  type *get_ptr_##var(void)  
> >   \
> >^~~~
> > ../util/async.c:673:1: note: in expansion of macro 
> > 'QEMU_DEFINE_STATIC_CO_TLS'
> >  QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
> >  ^
> >
> > Adding asm volatile("") seems to work though:
> > https://godbolt.org/z/3hn8Gh41d
> 
> You can see in the clang disassembly there that this isn't
> sufficient. The compiler puts in both calls, but it ignores
> the return results and always returns "true" from the function.

You're right! I missed that the return value of the call isn't used >_<.

Stefan


signature.asc
Description: PGP signature

Re: [PATCH RFC 00/11] vl: Explore redesign of startup