[PULL v2 34/36] ui/dbus: add chardev backend & interface

2021-12-21 Thread marcandre . lureau
From: Marc-André Lureau 

Add a new chardev backend which allows D-Bus client to handle the
chardev stream & events.

Signed-off-by: Marc-André Lureau 
Acked-by: Gerd Hoffmann 
---
 qapi/char.json|  27 
 include/chardev/char-socket.h |   2 +
 include/qemu/dbus.h   |   5 +
 ui/dbus.h |  44 +
 ui/dbus-chardev.c | 296 ++
 ui/dbus.c |  26 +++
 ui/dbus-display1.xml  |  75 +
 ui/meson.build|   1 +
 8 files changed, 476 insertions(+)
 create mode 100644 ui/dbus-chardev.c

diff --git a/qapi/char.json b/qapi/char.json
index f5133a5eeb37..7b421515751b 100644
--- a/qapi/char.json
+++ b/qapi/char.json
@@ -358,6 +358,20 @@
   'base': 'ChardevCommon',
   'if': 'CONFIG_SPICE' }
 
+##
+# @ChardevDBus:
+#
+# Configuration info for DBus chardevs.
+#
+# @name: name of the channel (following docs/spice-port-fqdn.txt)
+#
+# Since: 7.0
+##
+{ 'struct': 'ChardevDBus',
+  'data': { 'name': 'str' },
+  'base': 'ChardevCommon',
+  'if': 'CONFIG_DBUS_DISPLAY' }
+
 ##
 # @ChardevVC:
 #
@@ -422,6 +436,7 @@
 # @spicevmc: Since 1.5
 # @spiceport: Since 1.5
 # @qemu-vdagent: Since 6.1
+# @dbus: Since 7.0
 # @vc: v1.5
 # @ringbuf: Since 1.6
 # @memory: Since 1.5
@@ -447,6 +462,7 @@
 { 'name': 'spicevmc', 'if': 'CONFIG_SPICE' },
 { 'name': 'spiceport', 'if': 'CONFIG_SPICE' },
 { 'name': 'qemu-vdagent', 'if': 'CONFIG_SPICE_PROTOCOL' },
+{ 'name': 'dbus', 'if': 'CONFIG_DBUS_DISPLAY' },
 'vc',
 'ringbuf',
 # next one is just for compatibility
@@ -535,6 +551,15 @@
   'data': { 'data': 'ChardevQemuVDAgent' },
   'if': 'CONFIG_SPICE_PROTOCOL' }
 
+##
+# @ChardevDBusWrapper:
+#
+# Since: 7.0
+##
+{ 'struct': 'ChardevDBusWrapper',
+  'data': { 'data': 'ChardevDBus' },
+  'if': 'CONFIG_DBUS_DISPLAY' }
+
 ##
 # @ChardevVCWrapper:
 #
@@ -582,6 +607,8 @@
'if': 'CONFIG_SPICE' },
 'qemu-vdagent': { 'type': 'ChardevQemuVDAgentWrapper',
   'if': 'CONFIG_SPICE_PROTOCOL' },
+'dbus': { 'type': 'ChardevDBusWrapper',
+  'if': 'CONFIG_DBUS_DISPLAY' },
 'vc': 'ChardevVCWrapper',
 'ringbuf': 'ChardevRingbufWrapper',
 # next one is just for compatibility
diff --git a/include/chardev/char-socket.h b/include/chardev/char-socket.h
index 1a9274f2e3ac..6b6e2ceba1d7 100644
--- a/include/chardev/char-socket.h
+++ b/include/chardev/char-socket.h
@@ -43,6 +43,8 @@ typedef enum {
 TCP_CHARDEV_STATE_CONNECTED,
 } TCPChardevState;
 
+typedef ChardevClass SocketChardevClass;
+
 struct SocketChardev {
 Chardev parent;
 QIOChannel *ioc; /* Client I/O channel */
diff --git a/include/qemu/dbus.h b/include/qemu/dbus.h
index c0cbb1ca44d3..08f00dfd5342 100644
--- a/include/qemu/dbus.h
+++ b/include/qemu/dbus.h
@@ -12,6 +12,11 @@
 
 #include 
 
+#include "qom/object.h"
+#include "chardev/char.h"
+#include "qemu/notify.h"
+#include "qemu/typedefs.h"
+
 /* glib/gio 2.68 */
 #define DBUS_METHOD_INVOCATION_HANDLED TRUE
 #define DBUS_METHOD_INVOCATION_UNHANDLED FALSE
diff --git a/ui/dbus.h b/ui/dbus.h
index 3e89eafcab6e..64c77cab4441 100644
--- a/ui/dbus.h
+++ b/ui/dbus.h
@@ -24,6 +24,7 @@
 #ifndef UI_DBUS_H_
 #define UI_DBUS_H_
 
+#include "chardev/char-socket.h"
 #include "qemu/dbus.h"
 #include "qom/object.h"
 #include "ui/console.h"
@@ -56,11 +57,15 @@ struct DBusDisplay {
 QemuDBusDisplay1Clipboard *clipboard;
 QemuDBusDisplay1Clipboard *clipboard_proxy;
 DBusClipboardRequest clipboard_request[QEMU_CLIPBOARD_SELECTION__COUNT];
+
+Notifier notifier;
 };
 
 #define TYPE_DBUS_DISPLAY "dbus-display"
 OBJECT_DECLARE_SIMPLE_TYPE(DBusDisplay, DBUS_DISPLAY)
 
+void dbus_display_notifier_add(Notifier *notifier);
+
 #define DBUS_DISPLAY_TYPE_CONSOLE dbus_display_console_get_type()
 G_DECLARE_FINAL_TYPE(DBusDisplayConsole,
  dbus_display_console,
@@ -95,6 +100,45 @@ dbus_display_listener_get_bus_name(DBusDisplayListener 
*ddl);
 extern const DisplayChangeListenerOps dbus_gl_dcl_ops;
 extern const DisplayChangeListenerOps dbus_dcl_ops;
 
+#define TYPE_CHARDEV_DBUS "chardev-dbus"
+
+typedef struct DBusChardevClass {
+SocketChardevClass parent_class;
+
+void (*parent_chr_be_event)(Chardev *s, QEMUChrEvent event);
+} DBusChardevClass;
+
+DECLARE_CLASS_CHECKERS(DBusChardevClass, DBUS_CHARDEV,
+   TYPE_CHARDEV_DBUS)
+
+typedef struct DBusChardev {
+SocketChardev parent;
+
+bool exported;
+QemuDBusDisplay1Chardev *iface;
+} DBusChardev;
+
+DECLARE_INSTANCE_CHECKER(DBusChardev, DBUS_CHARDEV, TYPE_CHARDEV_DBUS)
+
+#define CHARDEV_IS_DBUS(chr) \
+object_dynamic_cast(OBJECT(chr), TYPE_CHARDEV_DBUS)
+
+typedef enum {
+DBUS_DISPLAY_CHARDEV_OPEN,
+DBUS_DISPLAY_CHARDEV_CLOSE,
+} DBusDisplayEventType;
+
+typedef struct DBusDisplayEvent {
+DBusDispl

Re: [PATCH 1/2] ui/vnc: refactor arrays of addresses to SocketAddressList

2021-12-21 Thread Marc-André Lureau
Hi

On Mon, Dec 20, 2021 at 10:21 PM Vladimir Sementsov-Ogievskiy <
vsement...@virtuozzo.com> wrote:

> Let's use SocketAddressList instead of dynamic arrays.
> Benefits:
>  - Automatic cleanup: don't need specific freeing function and drop
>some gotos.
>  - Less indirection: no triple asterix anymore!
>  - Prepare for the following commit, which will reuse new interface of
>vnc_display_listen().
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>

Nice
Reviewed-by: Marc-André Lureau 

---
>  ui/vnc.c | 129 ++-
>  1 file changed, 51 insertions(+), 78 deletions(-)
>
> diff --git a/ui/vnc.c b/ui/vnc.c
> index af02522e84..c9e26c70df 100644
> --- a/ui/vnc.c
> +++ b/ui/vnc.c
> @@ -3812,30 +3812,19 @@ static int vnc_display_get_address(const char
> *addrstr,
>  return ret;
>  }
>
> -static void vnc_free_addresses(SocketAddress ***retsaddr,
> -   size_t *retnsaddr)
> -{
> -size_t i;
> -
> -for (i = 0; i < *retnsaddr; i++) {
> -qapi_free_SocketAddress((*retsaddr)[i]);
> -}
> -g_free(*retsaddr);
> -
> -*retsaddr = NULL;
> -*retnsaddr = 0;
> -}
> -
>  static int vnc_display_get_addresses(QemuOpts *opts,
>   bool reverse,
> - SocketAddress ***retsaddr,
> - size_t *retnsaddr,
> - SocketAddress ***retwsaddr,
> - size_t *retnwsaddr,
> + SocketAddressList **saddr_list_ret,
> + SocketAddressList **wsaddr_list_ret,
>   Error **errp)
>  {
>  SocketAddress *saddr = NULL;
>  SocketAddress *wsaddr = NULL;
> +g_autoptr(SocketAddressList) saddr_list = NULL;
> +SocketAddressList **saddr_tail = &saddr_list;
> +SocketAddress *single_saddr = NULL;
> +g_autoptr(SocketAddressList) wsaddr_list = NULL;
> +SocketAddressList **wsaddr_tail = &wsaddr_list;
>  QemuOptsIter addriter;
>  const char *addr;
>  int to = qemu_opt_get_number(opts, "to", 0);
> @@ -3844,23 +3833,16 @@ static int vnc_display_get_addresses(QemuOpts
> *opts,
>  bool ipv4 = qemu_opt_get_bool(opts, "ipv4", false);
>  bool ipv6 = qemu_opt_get_bool(opts, "ipv6", false);
>  int displaynum = -1;
> -int ret = -1;
> -
> -*retsaddr = NULL;
> -*retnsaddr = 0;
> -*retwsaddr = NULL;
> -*retnwsaddr = 0;
>
>  addr = qemu_opt_get(opts, "vnc");
>  if (addr == NULL || g_str_equal(addr, "none")) {
> -ret = 0;
> -goto cleanup;
> +return 0;
>  }
>  if (qemu_opt_get(opts, "websocket") &&
>  !qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA1)) {
>  error_setg(errp,
> "SHA1 hash support is required for websockets");
> -goto cleanup;
> +return -1;
>  }
>
>  qemu_opt_iter_init(&addriter, opts, "vnc");
> @@ -3871,7 +3853,7 @@ static int vnc_display_get_addresses(QemuOpts *opts,
>   ipv4, ipv6,
>   &saddr, errp);
>  if (rv < 0) {
> -goto cleanup;
> +return -1;
>  }
>  /* Historical compat - first listen address can be used
>   * to set the default websocket port
> @@ -3879,13 +3861,16 @@ static int vnc_display_get_addresses(QemuOpts
> *opts,
>  if (displaynum == -1) {
>  displaynum = rv;
>  }
> -*retsaddr = g_renew(SocketAddress *, *retsaddr, *retnsaddr + 1);
> -(*retsaddr)[(*retnsaddr)++] = saddr;
> +QAPI_LIST_APPEND(saddr_tail, saddr);
>  }
>
> -/* If we had multiple primary displays, we don't do defaults
> - * for websocket, and require explicit config instead. */
> -if (*retnsaddr > 1) {
> +if (saddr_list && !saddr_list->next) {
> +single_saddr = saddr_list->value;
> +} else {
> +/*
> + * If we had multiple primary displays, we don't do defaults
> + * for websocket, and require explicit config instead.
> + */
>  displaynum = -1;
>  }
>
> @@ -3895,57 +3880,50 @@ static int vnc_display_get_addresses(QemuOpts
> *opts,
>  has_ipv4, has_ipv6,
>  ipv4, ipv6,
>  &wsaddr, errp) < 0) {
> -goto cleanup;
> +return -1;
>  }
>
>  /* Historical compat - if only a single listen address was
>   * provided, then this is used to set the default listen
>   * address for websocket too
>   */
> -if (*retnsaddr == 1 &&
> -(*retsaddr)[0]->type == SOCKET_ADDRESS_TYPE_INET &&
> +if (single_saddr &&
> +single_saddr->type == SOCKET_ADDRESS_TYPE_INET &&
>  wsaddr->type == SOCKET_ADDR

Re: [PATCH v2 1/9] hw/intc: sifive_plic: Add a reset function

2021-12-21 Thread Bin Meng
On Thu, Dec 16, 2021 at 12:54 PM Alistair Francis
 wrote:
>
> From: Alistair Francis 
>
> Signed-off-by: Alistair Francis 
> ---
>  hw/intc/sifive_plic.c | 18 ++
>  1 file changed, 18 insertions(+)
>

Reviewed-by: Bin Meng 



[PATCH] target/riscv/pmp: fix no pmp illegal intrs

2021-12-21 Thread Nikita Shubin
From: Nikita Shubin 

As per the privilege specification, any access from S/U mode should fail
if no pmp region is configured and pmp is present, othwerwise access
should succeed.

Fixes: d102f19a208 (target/riscv/pmp: Raise exception if no PMP entry is 
configured)
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/585
Signed-off-by: Nikita Shubin 
Reviewed-by: Alistair Francis 
---
 target/riscv/op_helper.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/riscv/op_helper.c b/target/riscv/op_helper.c
index ee7c24efe7..58d992e98a 100644
--- a/target/riscv/op_helper.c
+++ b/target/riscv/op_helper.c
@@ -146,7 +146,8 @@ target_ulong helper_mret(CPURISCVState *env, target_ulong 
cpu_pc_deb)
 uint64_t mstatus = env->mstatus;
 target_ulong prev_priv = get_field(mstatus, MSTATUS_MPP);
 
-if (!pmp_get_num_rules(env) && (prev_priv != PRV_M)) {
+if (riscv_feature(env, RISCV_FEATURE_PMP) &&
+!pmp_get_num_rules(env) && (prev_priv != PRV_M)) {
 riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
 }
 
-- 
2.31.1




Re: [PATCH 4/4] ui: Fix gtk/gl when the scaled virtual console does not fit the window

2021-12-21 Thread Alexander Orzechowski

On 12/21/21 02:48, Marc-André Lureau wrote:


Hi

On Sun, Dec 19, 2021 at 6:32 AM Alexander Orzechowski 
 wrote:


gtk/gl was incorrectly always rendering as if the 'Zoom to Fit' was
always checked even if it wasn't. This is now using logic closer
to what is being used for the existing cairo code paths.

Signed-off-by: Alexander Orzechowski 


This doesn't work as expected, the display is not being centered 
correctly.


---
 ui/gtk-gl-area.c | 34 +-
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
index 01e4e74ee3..ea72f1817b 100644
--- a/ui/gtk-gl-area.c
+++ b/ui/gtk-gl-area.c
@@ -41,16 +41,40 @@ void gd_gl_area_draw(VirtualConsole *vc)
 #ifdef CONFIG_GBM
     QemuDmaBuf *dmabuf = vc->gfx.guest_fb.dmabuf;
 #endif
+    GtkDisplayState *s = vc->s;
     int ww, wh, ws, y1, y2;
+    int mx, my;
+    int fbh, fbw;

     if (!vc->gfx.gls) {
         return;
     }

 gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));
+
+    fbw = surface_width(vc->gfx.ds);
+    fbh = surface_height(vc->gfx.ds);
+
     ws =
gdk_window_get_scale_factor(gtk_widget_get_window(vc->gfx.drawing_area));
-    ww = gtk_widget_get_allocated_width(vc->gfx.drawing_area) * ws;
-    wh = gtk_widget_get_allocated_height(vc->gfx.drawing_area) * ws;
+    ww = gtk_widget_get_allocated_width(vc->gfx.drawing_area);
+    wh = gtk_widget_get_allocated_height(vc->gfx.drawing_area);
+
+    if (s->full_screen) {
+        vc->gfx.scale_x = (double)ww / fbw;
+        vc->gfx.scale_y = (double)wh / fbh;
+    } else if (s->free_scale) {
+        double sx, sy;
+
+        sx = (double)ww / fbw;
+        sy = (double)wh / fbh;
+
+        vc->gfx.scale_x = vc->gfx.scale_y = MIN(sx, sy);
+    }
+
+    fbw *= vc->gfx.scale_x * ws;
+    fbh *= vc->gfx.scale_y * ws;
+    mx = (ww * ws - fbw) / 2;
+    my = (wh * ws - fbh) / 2;

     if (vc->gfx.scanout_mode) {
         if (!vc->gfx.guest_fb.framebuffer) {
@@ -70,11 +94,11 @@ void gd_gl_area_draw(VirtualConsole *vc)
         glBindFramebuffer(GL_READ_FRAMEBUFFER,
vc->gfx.guest_fb.framebuffer);
         /* GtkGLArea sets GL_DRAW_FRAMEBUFFER for us */

-        glViewport(0, 0, ww, wh);
+        glViewport(mx, my, fbw, fbh);
         y1 = vc->gfx.y0_top ? 0 : vc->gfx.h;
         y2 = vc->gfx.y0_top ? vc->gfx.h : 0;
         glBlitFramebuffer(0, y1, vc->gfx.w, y2,
-                          0, 0, ww, wh,
+                          mx, my, fbw, fbh,
                           GL_COLOR_BUFFER_BIT, GL_NEAREST);
 #ifdef CONFIG_GBM
         if (dmabuf) {
@@ -98,7 +122,7 @@ void gd_gl_area_draw(VirtualConsole *vc)
         }
 gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));

-        surface_gl_setup_viewport(vc->gfx.gls, vc->gfx.ds, ww, wh);
+        glViewport(mx, my, fbw, fbh);
         surface_gl_render_texture(vc->gfx.gls, vc->gfx.ds);
     }

-- 
2.34.1





--
Marc-André Lureau


It seems glBlitFramebuffer takes two coordinates instead of a width and 
height. I have corrected this locally and I will have v2 up tomorrow. 
Thanks for the thorough testing!


--

Alexander Orzechowski


Re: [PATCH] tests/qtest: Make the filter tests independent from a specific NIC

2021-12-21 Thread Thomas Huth

On 21/12/2021 07.38, Zhang, Chen wrote:




-Original Message-
From: Qemu-devel  On Behalf Of Thomas Huth
Sent: Monday, December 20, 2021 6:30 PM
To: qemu-devel@nongnu.org; Laurent Vivier 
Cc: Paolo Bonzini ; Yang Hongyang
; Zhang Chen 
Subject: [PATCH] tests/qtest: Make the filter tests independent from a
specific NIC

These filter tests need a NIC, no matter which one, so they use a common
NIC by default. However, these common NIC models might not always have
been compiled into the QEMU target binary, so assuming that a certain NIC is
available is a bad idea. Since the exact type of NIC does not really matter for
these tests, let's switch to "-nic" instead of "-netdev" so that QEMU can
simply pick a default NIC for us.
This way we can now run the tests on other targets that have a default
machine with an on-board/default NIC, too.



Oh, It's my and Hongyang's abandoned mailbox.


Sorry, I only looked at the top of the *.c files and copied the e-mail 
address from there.



Looks good to me.


Thanks for the review!


By the way, should I add the test/qtest/test-filter* to the MAINTAINER file?


That might be helpful indeed to get you CC:-ed correctly next time.

 Thomas




Re: [PATCH 2/2] qapi/ui: introduce change-vnc-listen

2021-12-21 Thread Marc-André Lureau
Hi

On Mon, Dec 20, 2021 at 10:24 PM Vladimir Sementsov-Ogievskiy <
vsement...@virtuozzo.com> wrote:

> Add command that can change addresses where VNC server listens for new
> connections. Prior to 6.0 this functionality was available through
> 'change' qmp command which was deleted.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>

Looks good to me,
Reviewed-by: Marc-André Lureau 

Could you write an avocado test for it? (tests/avocado/vnc.py)

---
>  docs/about/removed-features.rst |  3 ++-
>  qapi/ui.json| 12 
>  ui/vnc.c| 26 ++
>  3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/docs/about/removed-features.rst
> b/docs/about/removed-features.rst
> index d42c3341de..20e6901a82 100644
> --- a/docs/about/removed-features.rst
> +++ b/docs/about/removed-features.rst
> @@ -348,7 +348,8 @@ documentation of ``query-hotpluggable-cpus`` for
> additional details.
>  ``change`` (removed in 6.0)
>  '''
>
> -Use ``blockdev-change-medium`` or ``change-vnc-password`` instead.
> +Use ``blockdev-change-medium`` or ``change-vnc-password`` or
> +``change-vnc-listen`` instead.
>
>  ``query-events`` (removed in 6.0)
>  '
> diff --git a/qapi/ui.json b/qapi/ui.json
> index d7567ac866..14e6fe0b4c 100644
> --- a/qapi/ui.json
> +++ b/qapi/ui.json
> @@ -1304,3 +1304,15 @@
>  { 'command': 'display-reload',
>'data': 'DisplayReloadOptions',
>'boxed' : true }
> +
> +##
> +# @change-vnc-listen:
> +#
> +# Change set of addresses to listen for connections.
> +#
> +# Since: 7.0
> +#
> +##
> +{ 'command': 'change-vnc-listen',
> +  'data': { 'id': 'str', 'addresses': ['SocketAddress'],
> +'*websockets': ['SocketAddress'] } }
> diff --git a/ui/vnc.c b/ui/vnc.c
> index c9e26c70df..69bbf3b6f6 100644
> --- a/ui/vnc.c
> +++ b/ui/vnc.c
> @@ -4212,6 +4212,32 @@ fail:
>  vnc_display_close(vd);
>  }
>
> +void qmp_change_vnc_listen(const char *id, SocketAddressList *addresses,
> +   bool has_websockets, SocketAddressList
> *websockets,
> +   Error **errp)
> +{
> +VncDisplay *vd = vnc_display_find(id);
> +
> +if (!vd) {
> +error_setg(errp, "VNC display '%s' not active", id);
> +return;
> +}
> +
> +if (vd->listener) {
> +qio_net_listener_disconnect(vd->listener);
> +object_unref(OBJECT(vd->listener));
> +}
> +vd->listener = NULL;
> +
> +if (vd->wslistener) {
> +qio_net_listener_disconnect(vd->wslistener);
> +object_unref(OBJECT(vd->wslistener));
> +}
> +vd->wslistener = NULL;
> +
> +vnc_display_listen(vd, addresses, websockets, errp);
> +}
> +
>  void vnc_display_add_client(const char *id, int csock, bool skipauth)
>  {
>  VncDisplay *vd = vnc_display_find(id);
> --
> 2.31.1
>
>
>

-- 
Marc-André Lureau


[PATCH] MAINTAINERS: Update COLO Proxy section

2021-12-21 Thread Zhang Chen
Signed-off-by: Zhang Chen 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6ce6e44..5479b9376e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2983,6 +2983,7 @@ F: docs/colo-proxy.txt
 F: net/colo*
 F: net/filter-rewriter.c
 F: net/filter-mirror.c
+F: tests/qtest/test-filter*
 
 Record/replay
 M: Pavel Dovgalyuk 
-- 
2.25.1




RE: [PATCH] tests/qtest: Make the filter tests independent from a specific NIC

2021-12-21 Thread Zhang, Chen


> -Original Message-
> From: Thomas Huth 
> Sent: Tuesday, December 21, 2021 3:35 PM
> To: Zhang, Chen ; qemu-devel@nongnu.org;
> Laurent Vivier 
> Cc: Paolo Bonzini 
> Subject: Re: [PATCH] tests/qtest: Make the filter tests independent from a
> specific NIC
> 
> On 21/12/2021 07.38, Zhang, Chen wrote:
> >
> >
> >> -Original Message-
> >> From: Qemu-devel  >> bounces+chen.zhang=intel@nongnu.org> On Behalf Of Thomas
> Huth
> >> Sent: Monday, December 20, 2021 6:30 PM
> >> To: qemu-devel@nongnu.org; Laurent Vivier 
> >> Cc: Paolo Bonzini ; Yang Hongyang
> >> ; Zhang Chen 
> >> Subject: [PATCH] tests/qtest: Make the filter tests independent from
> >> a specific NIC
> >>
> >> These filter tests need a NIC, no matter which one, so they use a
> >> common NIC by default. However, these common NIC models might not
> >> always have been compiled into the QEMU target binary, so assuming
> >> that a certain NIC is available is a bad idea. Since the exact type
> >> of NIC does not really matter for these tests, let's switch to "-nic"
> >> instead of "-netdev" so that QEMU can simply pick a default NIC for us.
> >> This way we can now run the tests on other targets that have a
> >> default machine with an on-board/default NIC, too.
> >>
> >
> > Oh, It's my and Hongyang's abandoned mailbox.
> 
> Sorry, I only looked at the top of the *.c files and copied the e-mail address
> from there.
> 
> > Looks good to me.
> 
> Thanks for the review!
> 
> > By the way, should I add the test/qtest/test-filter* to the MAINTAINER file?
> 
> That might be helpful indeed to get you CC:-ed correctly next time.

Already send a patch to update it.

Thanks
Chen

> 
>   Thomas



Re: powernv gitlab ci regression

2021-12-21 Thread Cédric Le Goater

On 12/21/21 03:37, Daniel Henrique Barboza wrote:

Hey,

On 12/20/21 18:35, Richard Henderson wrote:

Hi guys,

Somewhere within


Merge tag 'pull-ppc-20211217' of https://github.com/legoater/qemu into staging
ppc 7.0 queue:

* General cleanup for Mac machines (Peter)
* Fixes for FPU exceptions (Lucas)
* Support for new ISA31 instructions (Matheus)
* Fixes for ivshmem (Daniel)
* Cleanups for PowerNV PHB (Christophe and Cedric)
* Updates of PowerNV and pSeries documentation (Leonardo and Daniel)
* Fixes for PowerNV (Daniel)
* Large cleanup of FPU implementation (Richard)
* Removal of SoftTLBs support for PPC74x CPUs (Fabiano)
* Fixes for exception models in MPCx and 60x CPUs (Fabiano)
* Removal of 401/403 CPUs (Cedric)
* Deprecation of taihu machine (Thomas)
* Large rework of PPC405 machine (Cedric)
* Fixes for VSX instructions (Victor and Matheus)
* Fix for e6500 CPU (Fabiano)
* Initial support for PMU (Daniel)


is something that has caused a timeout regression in avocado-system-centos:


 (047/171) 
tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv8:  
INTERRUPTED: Test interrupted by SIGTERM\nRunner error occurred: Timeout 
reached\nOriginal status: ERROR\n{'name': 
'047-tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv8', 
'logdir': 
'/builds/qemu-project/qemu/build/tests/results/job-2021-12-17T19.23-... (90.46 
s)
 (048/171) 
tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv9:  
INTERRUPTED: Test interrupted by SIGTERM\nRunner error occurred: Timeout 
reached\nOriginal status: ERROR\n{'name': 
'048-tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv9', 
'logdir': 
'/builds/qemu-project/qemu/build/tests/results/job-2021-12-17T19.23-... (90.55 
s)


See e.g. https://gitlab.com/qemu-project/qemu/-/jobs/1898304074


Thanks for letting us know. I bisected it and the culprit is this patch:


commit 4db3907a40a087e2cc1839d19a3642539d36610b
Author: Daniel Henrique Barboza 
Date:   Fri Dec 17 17:57:18 2021 +0100

     target/ppc: enable PMU instruction count


This is a patch where I added instruction count in the ppc64 PMU. After this 
patch the
performance of these 2 tests are degraded to the point where we're hitting 
timeouts in
gitlab (didn't hit timeouts in my machine but the performance is noticeable 
worse).

I'll need to see the serial console of the VM booting up to evaluate if there's 
some kernel
module during boot time that is using the PMU and causing the delay. I'll also 
take a look
into improving the performance as well (e.g. using more TCG code and avoid 
calling helpers).


Run with :

  build/tests/venv/bin/avocado --show=app,console run -t machine:powernv9  
build/tests/avocado/boot_linux_console.py

* 6.2

...
console: [1.559904] PCI: CLS 0 bytes, default 128
/console: [8.830245] Initialise system trusted keyrings
console: [8.832347] Key type blacklist registered
console: [8.834558] workingset: timestamp_bits=54 max_order=14 
bucket_order=0
console: [9.073051] integrity: Platform Keyring initialized
console: [9.073586] Key type asymmetric registered
console: [9.074025] Asymmetric key parser 'x509' registered
console: [9.075359] Block layer SCSI generic (bsg) driver version 0.4 
loaded (major 251)
console: [9.095115] IPMI message handler: version 39.2
console: [9.096161] ipmi device interface
console: [9.514308] ipmi-powernv ibm,opal:ipmi: IPMI message handler: Found 
new BMC (man_id: 0x00, prod_id: 0x, dev_id: 0x20)
-console: [   10.171273] IPMI Watchdog: driver initialized
\console: [   10.974462] hvc0: raw protocol on /ibm,opal/consoles/serial@0 
(boot console)
console: [   10.975059] hvc0: No interrupts property, using OPAL event
console: [   10.989699] Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
console: [   11.156033] brd: module loaded
console: [   11.235965] loop: module loaded
console: [   11.249922] libphy: Fixed MDIO Bus: probed
console: [   11.254128] i2c /dev entries driver
console: [   11.255782] powernv-cpufreq: ibm,pstate-min node not found
console: [   11.256134] powernv-cpufreq: Platform driver disabled. System does 
not support PState control
console: [   11.273326] ipip: IPv4 and MPLS over IPv4 tunneling driver
console: [   11.303989] NET: Registered protocol family 10
console: [   11.323651] Segment Routing with IPv6
console: [   11.325267] sit: IPv6, IPv4 and MPLS over IPv4 tunneling driver
console: [   11.335866] NET: Registered protocol family 17
console: [   11.336900] Key type dns_resolver registered
console: [   11.337358] secvar-sysfs: secvar: failed to retrieve secvar 
operations.
console: [   11.337877] drmem: No dynamic reconfiguration memory found
console: [   11.341767] Loading compiled-in X.509 certificates
console: [   11.362272] Loaded X.509 cert 'Build time autogenerated kernel key: 
987b64c96d830fe42d02bbf502e028ebe85c2b4e'
console: [   11.667162] Key type encrypted registered
console: [   11.674616] ima: No TP

Re: [PATCH v2 4/9] hw/intc: sifive_plic: Cleanup remaining functions

2021-12-21 Thread Bin Meng
On Thu, Dec 16, 2021 at 12:55 PM Alistair Francis
 wrote:
>
> From: Alistair Francis 
>
> We can remove the original sifive_plic_irqs_pending() function and
> instead just use the sifive_plic_claim() function (renamed to
> sifive_plic_claimed()) to determine if any interrupts are pending.
>
> This requires move the side effects outside of sifive_plic_claimed(),
> but as they are only invoked once that isn't a problem.
>
> We have also removed all of the old #ifdef debugging logs, so let's
> cleanup the last remaining debug function while we are here.
>
> Signed-off-by: Alistair Francis 
> ---
>  hw/intc/sifive_plic.c | 109 +-
>  1 file changed, 22 insertions(+), 87 deletions(-)
>

Reviewed-by: Bin Meng 



Re: [PATCH] MAINTAINERS: Update COLO Proxy section

2021-12-21 Thread Philippe Mathieu-Daudé
Cc'ing qemu-trivial@

On 12/21/21 09:04, Zhang Chen wrote:
> Signed-off-by: Zhang Chen 
> ---
>  MAINTAINERS | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1de6ce6e44..5479b9376e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2983,6 +2983,7 @@ F: docs/colo-proxy.txt
>  F: net/colo*
>  F: net/filter-rewriter.c
>  F: net/filter-mirror.c
> +F: tests/qtest/test-filter*

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] vl: Add opts to device opts list when using JSON syntax for -device

2021-12-21 Thread Philippe Mathieu-Daudé
Cc'ing Markus.

On 12/20/21 09:45, MkfsSion wrote:
> When using JSON syntax for -device, -set option can not find device
> specified in JSON by id field. The following commandline is an example:
> 
> $ qemu-system-x86_64 -device '{"id":"foo"}' -set device.foo.bar=1
> qemu-system-x86_64: -set device.foo.bar=1: there is no device "foo" defined
> 
> The patch adds device opts to device opts list when a device opts get
> parsed.
> 
> Signed-off-by: MkfsSion 

BTW per:
https://www.qemu.org/docs/master/devel/submitting-a-patch.html#patch-emails-must-include-a-signed-off-by-line
"Please use your real name to sign a patch (not an alias or acronym)."

> ---
>  softmmu/vl.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/softmmu/vl.c b/softmmu/vl.c
> index 620a1f1367..0dd5acbc1a 100644
> --- a/softmmu/vl.c
> +++ b/softmmu/vl.c
> @@ -3400,6 +3400,8 @@ void qemu_init(int argc, char **argv, char **envp)
>  loc_save(&opt->loc);
>  assert(opt->opts != NULL);
>  QTAILQ_INSERT_TAIL(&device_opts, opt, next);
> +qemu_opts_from_qdict(qemu_find_opts_err("device", 
> &error_fatal),
> + opt->opts, &error_fatal);
>  } else {
>  if (!qemu_opts_parse_noisily(qemu_find_opts("device"),
>   optarg, true)) {




Re: [PATCH 2/4] ui: Remove unnecessary checks

2021-12-21 Thread Alexander Orzechowski

On 12/21/21 02:40, Marc-André Lureau wrote:


Hi

On Sun, Dec 19, 2021 at 6:32 AM Alexander Orzechowski 
 wrote:


These conditionals should never be false as scale_x and scale_y should
scale the fbw and fbh variables such that the ww and wh variables
always
have a greater magnitude.

Signed-off-by: Alexander Orzechowski 


I don't understand how you reached that conclusion.

scale_x/scale_y can have various values, from 0.25 manually, or pretty 
much anything in freescale.


Just adding a breakpoint/debug there and you can see they can be false.

---
 ui/gtk.c | 27 ++-
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/ui/gtk.c b/ui/gtk.c
index 824334ff3d..f2d74b253d 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -416,13 +416,8 @@ static void gd_update(DisplayChangeListener *dcl,
     ww = gtk_widget_get_allocated_width(vc->gfx.drawing_area);
     wh = gtk_widget_get_allocated_height(vc->gfx.drawing_area);

-    mx = my = 0;
-    if (ww > fbw) {
-        mx = (ww - fbw) / 2;
-    }
-    if (wh > fbh) {
-        my = (wh - fbh) / 2;
-    }
+    mx = (ww - fbw) / 2;
+    my = (wh - fbh) / 2;

     gtk_widget_queue_draw_area(vc->gfx.drawing_area,
                                mx + x1, my + y1, (x2 - x1), (y2 -
y1));
@@ -801,13 +796,8 @@ static gboolean gd_draw_event(GtkWidget
*widget, cairo_t *cr, void *opaque)
     fbw *= vc->gfx.scale_x;
     fbh *= vc->gfx.scale_y;

-    mx = my = 0;
-    if (ww > fbw) {
-        mx = (ww - fbw) / 2;
-    }
-    if (wh > fbh) {
-        my = (wh - fbh) / 2;
-    }
+    mx = (ww - fbw) / 2;
+    my = (wh - fbh) / 2;

     cairo_rectangle(cr, 0, 0, ww, wh);

@@ -850,13 +840,8 @@ static gboolean gd_motion_event(GtkWidget
*widget, GdkEventMotion *motion,
     ws = gdk_window_get_scale_factor(
 gtk_widget_get_window(vc->gfx.drawing_area));

-    mx = my = 0;
-    if (ww > fbw) {
-        mx = (ww - fbw) / 2;
-    }
-    if (wh > fbh) {
-        my = (wh - fbh) / 2;
-    }
+    mx = (ww - fbw) / 2;
+    my = (wh - fbh) / 2;

     x = (motion->x - mx) / vc->gfx.scale_x * ws;
     y = (motion->y - my) / vc->gfx.scale_y * ws;
-- 
2.34.1





--
Marc-André Lureau


Thanks for the thorough review. I didn't realize you could set the scale 
manually, but


it was under my impression that qemu would set the GDK_HINT_MIN_SIZE 
property[1]


to try to disallow the user from resizing the window any smaller than 
the size of the


virtual console. Qemu provides no mechanism to change the translation of 
the virtual


console within the window in this case where the window is smaller than 
the virtual


console would normally allow, I consider this invalid state. The mx and 
my variables


are only used to translate the virtual console within its render 
surface. Thus, if qemu


were to enter this "invalid" state as I've called it, the view would 
show the middle of


the virtual console and obscure the edges. Without this patch it would 
show the top


left, obscuring the bottom right. I am happy to drop this patch for v2. 
Please let me


know your thoughts.


[1]: See ui/gtk.c line 279 with the patches applied.

--

Alexander Orzechowski




Re: [RFC v2 2/2] migration: Tally pre-copy, downtime and post-copy bytes independently

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/20/21 10:34, David Edmondson wrote:
> Provide information on the number of bytes copied in the pre-copy,
> downtime and post-copy phases of migration.
> 
> Signed-off-by: David Edmondson 
> ---
>  migration/migration.c |  3 +++
>  migration/ram.c   |  7 +++
>  monitor/hmp-cmds.c| 12 
>  qapi/migration.json   | 13 -
>  4 files changed, 34 insertions(+), 1 deletion(-)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [RFC v2 1/2] migration: Introduce ram_transferred_add()

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/20/21 10:34, David Edmondson wrote:
> ...and use it.

FYI Not all mails readers / git tools display subject along
with content, so it is more helpful to rewrite the subject.

> Signed-off-by: David Edmondson 
> ---
>  migration/ram.c | 23 ++-
>  1 file changed, 14 insertions(+), 9 deletions(-)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 57efa67f20..bd53e50a7f 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -386,6 +386,11 @@ uint64_t ram_bytes_remaining(void)
>  
>  MigrationStats ram_counters;
>  
> +static void ram_transferred_add(uint64_t bytes)
> +{
> +ram_counters.transferred += bytes;
> +}

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH v11 00/31] LoongArch64 port of QEMU TCG

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 06:40, WANG Xuerui wrote:
> Hi all,
> 
> This is a port of QEMU TCG to the brand-new CPU architecture LoongArch,
> introduced by Loongson with their 3A5000 chips.
> 
> Everything is tested on real 3A5000 board (system emulation, linux-user,
> make check) and GitLab (CI jobs), and rebased to latest master branch.

> ## How to build and test this

> As for the hardware availability, the boards can already be bought in
> China on Taobao, and I think some people at Loongson might be able to
> arrange for testing environments, if testing on real hardware other than
> mine is required before merging; they have their in-house Debian spin-off
> from the early days of this architecture. Their kernel is
> ABI-incompatible with the version being upstreamed and used by me, but
> QEMU should work there regardless.

I took few hours to translate and read all Taobao contracts before
registering, then got blacklisted at my first login... Maybe others
will get more luck.

Having someone at Loongson helping with hardware is certainly easier
for the community.

> Lastly, I'm new to QEMU development and this is my first patch series
> here; apologizes if I get anything wrong, and any help or suggestion is
> certainly appreciated!

For a first patch series, this is an impressive one...

Regards,

Phil.



Re: [PATCH] MAINTAINERS: Update COLO Proxy section

2021-12-21 Thread Thomas Huth

On 21/12/2021 09.04, Zhang Chen wrote:

Signed-off-by: Zhang Chen 
---
  MAINTAINERS | 1 +
  1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6ce6e44..5479b9376e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2983,6 +2983,7 @@ F: docs/colo-proxy.txt
  F: net/colo*
  F: net/filter-rewriter.c
  F: net/filter-mirror.c
+F: tests/qtest/test-filter*
  
  Record/replay

  M: Pavel Dovgalyuk 



Reviewed-by: Thomas Huth 




Re: [PATCH] ps2: Initial horizontal scroll support

2021-12-21 Thread Marc-André Lureau
Hi

On Tue, Dec 21, 2021 at 4:10 AM Dmitry Petrov  wrote:

> This patch introduces horizontal scroll support for the ps/2 mouse.
> It includes changes in the ps/2 device driver as well as support
> for three display options - cocoa, gtk and sdl, tested and working
> on all of them against guest ubuntu system.
>
> The patch is based on the previous work by Brad Jorsch done in 2010
> but never merge, see
> https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=579968


You should split the patch for the different subsystems/ui etc

Looks good to me, although I didn't test it yet. Some comments below


>
> Signed-off-by: Dmitry Petrov 
> ---
>  hw/input/ps2.c| 54 ---
>  qapi/ui.json  |  2 +-
>  ui/cocoa.m| 18 ++--
>  ui/gtk.c  | 54 ---
>  ui/input-legacy.c | 16 ++
>  ui/sdl2.c |  5 +
>  6 files changed, 122 insertions(+), 27 deletions(-)
>
> diff --git a/hw/input/ps2.c b/hw/input/ps2.c
> index 9376a8f4ce..9e42284cd9 100644
> --- a/hw/input/ps2.c
> +++ b/hw/input/ps2.c
> @@ -123,6 +123,7 @@ typedef struct {
>  int mouse_dx; /* current values, needed for 'poll' mode */
>  int mouse_dy;
>  int mouse_dz;
> +int mouse_dw;
>  uint8_t mouse_buttons;
>  } PS2MouseState;
>
> @@ -715,7 +716,7 @@ static int ps2_mouse_send_packet(PS2MouseState *s)
>  /* IMPS/2 and IMEX send 4 bytes, PS2 sends 3 bytes */
>  const int needed = s->mouse_type ? 4 : 3;
>  unsigned int b;
> -int dx1, dy1, dz1;
> +int dx1, dy1, dz1, dw1;
>
>  if (PS2_QUEUE_SIZE - s->common.queue.count < needed) {
>  return 0;
> @@ -724,6 +725,7 @@ static int ps2_mouse_send_packet(PS2MouseState *s)
>  dx1 = s->mouse_dx;
>  dy1 = s->mouse_dy;
>  dz1 = s->mouse_dz;
> +dw1 = s->mouse_dw;
>  /* XXX: increase range to 8 bits ? */
>  if (dx1 > 127)
>  dx1 = 127;
> @@ -740,6 +742,9 @@ static int ps2_mouse_send_packet(PS2MouseState *s)
>  /* extra byte for IMPS/2 or IMEX */
>  switch(s->mouse_type) {
>  default:
> +/* Just ignore the wheels if not supported */
> +s->mouse_dz = 0;
> +s->mouse_dw = 0;
>  break;
>  case 3:
>  if (dz1 > 127)
> @@ -747,13 +752,38 @@ static int ps2_mouse_send_packet(PS2MouseState *s)
>  else if (dz1 < -127)
>  dz1 = -127;
>  ps2_queue_noirq(&s->common, dz1 & 0xff);
> +s->mouse_dz -= dz1;
> +s->mouse_dw = 0;
>  break;
>  case 4:
> -if (dz1 > 7)
> -dz1 = 7;
> -else if (dz1 < -7)
> -dz1 = -7;
> -b = (dz1 & 0x0f) | ((s->mouse_buttons & 0x18) << 1);
> +/*
> + * This matches what the Linux kernel expects for exps/2 in
> + * drivers/input/mouse/psmouse-base.c. Note, if you happen to
> + * press/release the 4th or 5th buttons at the same moment as a
> + * horizontal wheel scroll, those button presses will get lost.
> I'm not
> + * sure what to do about that, since by this point we don't know
> + * whether those buttons actually changed state.
> + */
>

Reading the kernel code helped me guess what is going on, but it would be
nice to have more doc or link to specifications instead.


> +if (dw1 != 0) {
> +if (dw1 > 15) {
> +dw1 = 15;
> +} else if (dw1 < -15) {
> +dw1 = -15;
> +}
> +
> +/* 0x3f was found by trial and error vs ubuntu instance */
> +b = (dw1 & 0x3f) | 0x40;
>

Ok, clamp at 15 (I think you could go at 31 actually, since 5 bits seem to
be used) and go to the kernel:
case 0x40: /* horizontal scroll on IntelliMouse Explorer 4.0 */

This case doesn't handle buttons simultaneously indeed.

I think 0x3f comes from 5 bits + 1 sign bit.

+s->mouse_dw -= dw1;
> +} else {
> +if (dz1 > 7) {
> +dz1 = 7;
> +} else if (dz1 < -7) {
> +dz1 = -7;
> +}
> +
> +b = (dz1 & 0x0f) | ((s->mouse_buttons & 0x18) << 1);
> +s->mouse_dz -= dz1;
>

Here clamp at 7, since we should fall in the kernel
case 0x00:

and only 3 bits seem to be used (thus & 0x0f for 3+1 sign).

This case handles buttons simultaneously, but only vertical scroll (unless
a4tech_workaround is set and triggered)





> +}
>  ps2_queue_noirq(&s->common, b);
>  break;
>  }
> @@ -764,7 +794,6 @@ static int ps2_mouse_send_packet(PS2MouseState *s)
>  /* update deltas */
>  s->mouse_dx -= dx1;
>  s->mouse_dy -= dy1;
> -s->mouse_dz -= dz1;
>
>  return 1;
>  }
> @@ -806,6 +835,12 @@ static void ps2_mouse_event(DeviceState *dev,
> QemuConsole *src,
>  } else if (btn->button == INPUT_BUTTON_WHEEL_DOWN) {
>  s->mouse_dz++;
>  }
> +
> +if (btn->button == I

[PATCH] KVM: x86: ignore interrupt_bitmap field of KVM_GET/SET_SREGS

2021-12-21 Thread Paolo Bonzini
This is unnecessary, because the interrupt would be retrieved and queued
anyway by KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS respectively,
and it makes the flow more similar to the one for KVM_GET/SET_SREGS2.

Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm/kvm.c | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index d81745620b..b42bcbc363 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2607,11 +2607,11 @@ static int kvm_put_sregs(X86CPU *cpu)
 CPUX86State *env = &cpu->env;
 struct kvm_sregs sregs;
 
+/*
+ * The interrupt_bitmap is ignored because KVM_SET_SREGS is
+ * always followed by KVM_SET_VCPU_EVENTS.
+ */
 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
-if (env->interrupt_injected >= 0) {
-sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
-(uint64_t)1 << (env->interrupt_injected % 64);
-}
 
 if ((env->eflags & VM_MASK)) {
 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
@@ -3348,16 +3348,10 @@ static int kvm_get_sregs(X86CPU *cpu)
 return ret;
 }
 
-/* There can only be one pending IRQ set in the bitmap at a time, so try
-   to find it and save its number instead (-1 for none). */
-env->interrupt_injected = -1;
-for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
-if (sregs.interrupt_bitmap[i]) {
-bit = ctz64(sregs.interrupt_bitmap[i]);
-env->interrupt_injected = i * 64 + bit;
-break;
-}
-}
+/*
+ * The interrupt_bitmap is ignored because KVM_GET_SREGS is
+ * always preceded by KVM_GET_VCPU_EVENTS.
+ */
 
 get_seg(&env->segs[R_CS], &sregs.cs);
 get_seg(&env->segs[R_DS], &sregs.ds);
-- 
2.33.1




[PATCH v3 1/2] migration: Introduce ram_transferred_add()

2021-12-21 Thread David Edmondson
Replace direct manipulation of ram_counters.transferred with a
function.

Signed-off-by: David Edmondson 
Reviewed-by: Philippe Mathieu-Daudé 
---
 migration/ram.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 57efa67f20..bd53e50a7f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -386,6 +386,11 @@ uint64_t ram_bytes_remaining(void)
 
 MigrationStats ram_counters;
 
+static void ram_transferred_add(uint64_t bytes)
+{
+ram_counters.transferred += bytes;
+}
+
 /* used by the search for pages to send */
 struct PageSearchStatus {
 /* Current block being searched */
@@ -767,7 +772,7 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
  * RAM_SAVE_FLAG_CONTINUE.
  */
 xbzrle_counters.bytes += bytes_xbzrle - 8;
-ram_counters.transferred += bytes_xbzrle;
+ram_transferred_add(bytes_xbzrle);
 
 return 1;
 }
@@ -1198,7 +1203,7 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, 
ram_addr_t offset)
 
 if (len) {
 ram_counters.duplicate++;
-ram_counters.transferred += len;
+ram_transferred_add(len);
 return 1;
 }
 return -1;
@@ -1234,7 +1239,7 @@ static bool control_save_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset,
 }
 
 if (bytes_xmit) {
-ram_counters.transferred += bytes_xmit;
+ram_transferred_add(bytes_xmit);
 *pages = 1;
 }
 
@@ -1265,8 +1270,8 @@ static bool control_save_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset,
 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 uint8_t *buf, bool async)
 {
-ram_counters.transferred += save_page_header(rs, rs->f, block,
- offset | RAM_SAVE_FLAG_PAGE);
+ram_transferred_add(save_page_header(rs, rs->f, block,
+ offset | RAM_SAVE_FLAG_PAGE));
 if (async) {
 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
   migrate_release_ram() &
@@ -1274,7 +1279,7 @@ static int save_normal_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset,
 } else {
 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
 }
-ram_counters.transferred += TARGET_PAGE_SIZE;
+ram_transferred_add(TARGET_PAGE_SIZE);
 ram_counters.normal++;
 return 1;
 }
@@ -1373,7 +1378,7 @@ exit:
 static void
 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
 {
-ram_counters.transferred += bytes_xmit;
+ram_transferred_add(bytes_xmit);
 
 if (param->zero_page) {
 ram_counters.duplicate++;
@@ -2298,7 +2303,7 @@ void acct_update_position(QEMUFile *f, size_t size, bool 
zero)
 ram_counters.duplicate += pages;
 } else {
 ram_counters.normal += pages;
-ram_counters.transferred += size;
+ram_transferred_add(size);
 qemu_update_position(f, size);
 }
 }
@@ -3133,7 +3138,7 @@ out:
 multifd_send_sync_main(rs->f);
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 qemu_fflush(f);
-ram_counters.transferred += 8;
+ram_transferred_add(8);
 
 ret = qemu_file_get_error(f);
 }
-- 
2.33.0




[PATCH v3 0/2] migration: Tally pre-copy, downtime and post-copy bytes independently

2021-12-21 Thread David Edmondson
When examining a report of poor migration behaviour, it would often be
useful to understand how much data was transferred in different phases
of the migration process.

For example, if the downtime limit is exceeded, to know how much data
was transferred during the downtime.

RFC because the name "ram_transferred_add" doesn't seem great, and I'm
unsure whether the tests to determine the phase in the second patch
are the most appropriate.

v3:
- Add r-by (Philippe)
- Improve a commit message (Philippe)

v2:
- ram_transferred_add() should be static (Philippe)
- Document the new MigrationStats fields (dme)

David Edmondson (2):
  migration: Introduce ram_transferred_add()
  migration: Tally pre-copy, downtime and post-copy bytes independently

 migration/migration.c |  3 +++
 migration/ram.c   | 30 +-
 monitor/hmp-cmds.c| 12 
 qapi/migration.json   | 13 -
 4 files changed, 48 insertions(+), 10 deletions(-)

-- 
2.33.0




Re: [PATCH 3/5] migration: ram_release_pages() always receive 1 page as argument

2021-12-21 Thread Juan Quintela
Philippe Mathieu-Daudé  wrote:
> On 12/16/21 10:13, Juan Quintela wrote:
>> Remove the pages argument. And s/pages/page/
>> 
>> Signed-off-by: Juan Quintela 
>> ---
>>  migration/ram.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>
>> -static void ram_release_pages(const char *rbname, uint64_t offset, int 
>> pages)
>> +static void ram_release_page(const char *rbname, uint64_t offset)
>>  {
>>  if (!migrate_release_ram() || !migration_in_postcopy()) {
>>  return;
>>  }
>>  
>> -ram_discard_range(rbname, offset, ((ram_addr_t)pages) << 
>> TARGET_PAGE_BITS);
>> +ram_discard_range(rbname, offset, ((ram_addr_t)1) << TARGET_PAGE_BITS);
>
> 1ULL?

I am changing it, but the argument to ram_discard_range is a size_t, and
that is different in 32 bits arch.  Once told that, it is not worse that
what we have here, as ram_addr_t type depends on the phase of the moon.

/* address in the RAM (different from a physical address) */
#if defined(CONFIG_XEN_BACKEND)
typedef uint64_t ram_addr_t;
#  define RAM_ADDR_MAX UINT64_MAX
#  define RAM_ADDR_FMT "%" PRIx64
#else
typedef uintptr_t ram_addr_t;
#  define RAM_ADDR_MAX UINTPTR_MAX
#  define RAM_ADDR_FMT "%" PRIxPTR
#endif

Later, Juan.

PD. No, I don't know either why it is not casted to size_t.

PD2. And yes, I still think that pure int operations should be ok.
 The value TARGET_PAGE_BITS more typical is 10, and here pages is
 only used with value 1.  C promotion rules should make everything
 ok (famous last words).




[PATCH v3 2/2] migration: Tally pre-copy, downtime and post-copy bytes independently

2021-12-21 Thread David Edmondson
Provide information on the number of bytes copied in the pre-copy,
downtime and post-copy phases of migration.

Signed-off-by: David Edmondson 
Reviewed-by: Philippe Mathieu-Daudé 
---
 migration/migration.c |  3 +++
 migration/ram.c   |  7 +++
 monitor/hmp-cmds.c| 12 
 qapi/migration.json   | 13 -
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/migration/migration.c b/migration/migration.c
index 3de11ae921..3950510be7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1013,6 +1013,9 @@ static void populate_ram_info(MigrationInfo *info, 
MigrationState *s)
 info->ram->page_size = page_size;
 info->ram->multifd_bytes = ram_counters.multifd_bytes;
 info->ram->pages_per_second = s->pages_per_second;
+info->ram->precopy_bytes = ram_counters.precopy_bytes;
+info->ram->downtime_bytes = ram_counters.downtime_bytes;
+info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
 
 if (migrate_use_xbzrle()) {
 info->has_xbzrle_cache = true;
diff --git a/migration/ram.c b/migration/ram.c
index bd53e50a7f..389868c988 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -388,6 +388,13 @@ MigrationStats ram_counters;
 
 static void ram_transferred_add(uint64_t bytes)
 {
+if (runstate_is_running()) {
+ram_counters.precopy_bytes += bytes;
+} else if (migration_in_postcopy()) {
+ram_counters.postcopy_bytes += bytes;
+} else {
+ram_counters.downtime_bytes += bytes;
+}
 ram_counters.transferred += bytes;
 }
 
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 9c91bf93e9..6049772178 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -293,6 +293,18 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, "postcopy request count: %" PRIu64 "\n",
info->ram->postcopy_requests);
 }
+if (info->ram->precopy_bytes) {
+monitor_printf(mon, "precopy ram: %" PRIu64 " kbytes\n",
+   info->ram->precopy_bytes >> 10);
+}
+if (info->ram->downtime_bytes) {
+monitor_printf(mon, "downtime ram: %" PRIu64 " kbytes\n",
+   info->ram->downtime_bytes >> 10);
+}
+if (info->ram->postcopy_bytes) {
+monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n",
+   info->ram->postcopy_bytes >> 10);
+}
 }
 
 if (info->has_disk) {
diff --git a/qapi/migration.json b/qapi/migration.json
index bbfd48cf0b..5975a0e104 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -46,6 +46,15 @@
 # @pages-per-second: the number of memory pages transferred per second
 #(Since 4.0)
 #
+# @precopy-bytes: The number of bytes sent in the pre-copy phase
+# (since 7.0).
+#
+# @downtime-bytes: The number of bytes sent while the guest is paused
+#  (since 7.0).
+#
+# @postcopy-bytes: The number of bytes sent during the post-copy phase
+#  (since 7.0).
+#
 # Since: 0.14
 ##
 { 'struct': 'MigrationStats',
@@ -54,7 +63,9 @@
'normal-bytes': 'int', 'dirty-pages-rate' : 'int',
'mbps' : 'number', 'dirty-sync-count' : 'int',
'postcopy-requests' : 'int', 'page-size' : 'int',
-   'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64' } }
+   'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64',
+   'precopy-bytes' : 'uint64', 'downtime-bytes' : 'uint64',
+   'postcopy-bytes' : 'uint64' } }
 
 ##
 # @XBZRLECacheStats:
-- 
2.33.0




Re: [PATCH v11 00/31] LoongArch64 port of QEMU TCG

2021-12-21 Thread gaosong

Hi,

On 2021/12/21 下午4:44, Philippe Mathieu-Daudé wrote:

I took few hours to translate and read all Taobao contracts before
registering, then got blacklisted at my first login... Maybe others
will get more luck.

Having someone at Loongson helping with hardware is certainly easier
for the community.


Loongson company can donate 3a5000 computers or provide an IP access to 3a5000 
hardware environment.

Which way is better?

Thanks.
Song Gao



Re: powernv gitlab ci regression

2021-12-21 Thread Daniel Henrique Barboza




On 12/21/21 05:20, Cédric Le Goater wrote:

On 12/21/21 03:37, Daniel Henrique Barboza wrote:

Hey,

On 12/20/21 18:35, Richard Henderson wrote:

Hi guys,

Somewhere within


Merge tag 'pull-ppc-20211217' of https://github.com/legoater/qemu into staging
ppc 7.0 queue:

* General cleanup for Mac machines (Peter)
* Fixes for FPU exceptions (Lucas)
* Support for new ISA31 instructions (Matheus)
* Fixes for ivshmem (Daniel)
* Cleanups for PowerNV PHB (Christophe and Cedric)
* Updates of PowerNV and pSeries documentation (Leonardo and Daniel)
* Fixes for PowerNV (Daniel)
* Large cleanup of FPU implementation (Richard)
* Removal of SoftTLBs support for PPC74x CPUs (Fabiano)
* Fixes for exception models in MPCx and 60x CPUs (Fabiano)
* Removal of 401/403 CPUs (Cedric)
* Deprecation of taihu machine (Thomas)
* Large rework of PPC405 machine (Cedric)
* Fixes for VSX instructions (Victor and Matheus)
* Fix for e6500 CPU (Fabiano)
* Initial support for PMU (Daniel)


is something that has caused a timeout regression in avocado-system-centos:


 (047/171) 
tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv8:  
INTERRUPTED: Test interrupted by SIGTERM\nRunner error occurred: Timeout 
reached\nOriginal status: ERROR\n{'name': 
'047-tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv8', 
'logdir': 
'/builds/qemu-project/qemu/build/tests/results/job-2021-12-17T19.23-... (90.46 
s)
 (048/171) 
tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv9:  
INTERRUPTED: Test interrupted by SIGTERM\nRunner error occurred: Timeout 
reached\nOriginal status: ERROR\n{'name': 
'048-tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv9', 
'logdir': 
'/builds/qemu-project/qemu/build/tests/results/job-2021-12-17T19.23-... (90.55 
s)


See e.g. https://gitlab.com/qemu-project/qemu/-/jobs/1898304074


Thanks for letting us know. I bisected it and the culprit is this patch:


commit 4db3907a40a087e2cc1839d19a3642539d36610b
Author: Daniel Henrique Barboza 
Date:   Fri Dec 17 17:57:18 2021 +0100

 target/ppc: enable PMU instruction count


This is a patch where I added instruction count in the ppc64 PMU. After this 
patch the
performance of these 2 tests are degraded to the point where we're hitting 
timeouts in
gitlab (didn't hit timeouts in my machine but the performance is noticeable 
worse).

I'll need to see the serial console of the VM booting up to evaluate if there's 
some kernel
module during boot time that is using the PMU and causing the delay. I'll also 
take a look
into improving the performance as well (e.g. using more TCG code and avoid 
calling helpers).


Run with :

   build/tests/venv/bin/avocado --show=app,console run -t machine:powernv9  
build/tests/avocado/boot_linux_console.py

* 6.2

...
console: [    1.559904] PCI: CLS 0 bytes, default 128
/console: [    8.830245] Initialise system trusted keyrings
console: [    8.832347] Key type blacklist registered
console: [    8.834558] workingset: timestamp_bits=54 max_order=14 
bucket_order=0
console: [    9.073051] integrity: Platform Keyring initialized
console: [    9.073586] Key type asymmetric registered
console: [    9.074025] Asymmetric key parser 'x509' registered
console: [    9.075359] Block layer SCSI generic (bsg) driver version 0.4 
loaded (major 251)
console: [    9.095115] IPMI message handler: version 39.2
console: [    9.096161] ipmi device interface
console: [    9.514308] ipmi-powernv ibm,opal:ipmi: IPMI message handler: Found 
new BMC (man_id: 0x00, prod_id: 0x, dev_id: 0x20)
-console: [   10.171273] IPMI Watchdog: driver initialized
\console: [   10.974462] hvc0: raw protocol on /ibm,opal/consoles/serial@0 
(boot console)
console: [   10.975059] hvc0: No interrupts property, using OPAL event
console: [   10.989699] Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
console: [   11.156033] brd: module loaded
console: [   11.235965] loop: module loaded
console: [   11.249922] libphy: Fixed MDIO Bus: probed
console: [   11.254128] i2c /dev entries driver
console: [   11.255782] powernv-cpufreq: ibm,pstate-min node not found
console: [   11.256134] powernv-cpufreq: Platform driver disabled. System does 
not support PState control
console: [   11.273326] ipip: IPv4 and MPLS over IPv4 tunneling driver
console: [   11.303989] NET: Registered protocol family 10
console: [   11.323651] Segment Routing with IPv6
console: [   11.325267] sit: IPv6, IPv4 and MPLS over IPv4 tunneling driver
console: [   11.335866] NET: Registered protocol family 17
console: [   11.336900] Key type dns_resolver registered
console: [   11.337358] secvar-sysfs: secvar: failed to retrieve secvar 
operations.
console: [   11.337877] drmem: No dynamic reconfiguration memory found
console: [   11.341767] Loading compiled-in X.509 certificates
console: [   11.362272] Loaded X.509 cert 'Build time autogenerated kernel key: 
987b64c96d830fe42d02bbf502e028ebe85c2b4e'
console: [   11.667162] Key type encrypt

Re: [PATCH RFC] MAINTAINERS: split out s390x sections

2021-12-21 Thread Christian Borntraeger




Am 20.12.21 um 12:54 schrieb Cornelia Huck:

Split out some more specialized devices etc., so that we can build
smarter lists of people to be put on cc: in the future.

Signed-off-by: Cornelia Huck 


Acked-by: Christian Borntraeger 

That should help to get additional maintainers (in add-on patches) added.
Letsa go with this split - we can fix and improve things anytime.

---

As discussed offlist. Some notes:
- The new sections have inherited the maintainers of the sections
   they have been split out of (except where people had already
   volunteered). That's easy to change, obviously, and I hope that
   the cc: list already contains people who might have interest in
   volunteering for some sections.
- I may not have gotten the F: patterns correct, please double check.
- I'm also not sure about where in the MAINTAINERS file the new
   sections should go; if you have a better idea, please speak up.
- Also, if you have better ideas regarding the sections, please
   speak up as well :)
- Pull requests will probably continue the same way as now (i.e.
   patches picked up at the top level and then sent, except for some
   things like tcg which may go separately.) Not sure if it would
   make sense to try out the submaintainer pull request model again,
   I don't think it made life easier in the past, and now we have
   the b4 tool to pick patches easily anyway. It might be a good
   idea to check which of the tree locations should stay, or if we
   want to have new ones.

---
  MAINTAINERS | 86 ++---
  1 file changed, 75 insertions(+), 11 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9a8d1bdf727d..d1916f075386 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -297,7 +297,6 @@ M: David Hildenbrand 
  S: Maintained
  F: target/s390x/
  F: target/s390x/tcg
-F: target/s390x/cpu_models_*.[ch]
  F: hw/s390x/
  F: disas/s390.c
  F: tests/tcg/s390x/
@@ -396,16 +395,10 @@ M: Halil Pasic 
  M: Christian Borntraeger 
  S: Supported
  F: target/s390x/kvm/
-F: target/s390x/ioinst.[ch]
  F: target/s390x/machine.c
  F: target/s390x/sigp.c
-F: target/s390x/cpu_features*.[ch]
-F: target/s390x/cpu_models.[ch]
  F: hw/s390x/pv.c
  F: include/hw/s390x/pv.h
-F: hw/intc/s390_flic.c
-F: hw/intc/s390_flic_kvm.c
-F: include/hw/s390x/s390_flic.h
  F: gdb-xml/s390*.xml
  T: git https://github.com/borntraeger/qemu.git s390-next
  L: qemu-s3...@nongnu.org
@@ -1529,12 +1522,8 @@ S390 Virtio-ccw
  M: Halil Pasic 
  M: Christian Borntraeger 
  S: Supported
-F: hw/char/sclp*.[hc]
-F: hw/char/terminal3270.c
  F: hw/s390x/
  F: include/hw/s390x/
-F: hw/watchdog/wdt_diag288.c
-F: include/hw/watchdog/wdt_diag288.h
  F: configs/devices/s390x-softmmu/default.mak
  F: tests/avocado/machine_s390_ccw_virtio.py
  T: git https://github.com/borntraeger/qemu.git s390-next
@@ -1559,6 +1548,80 @@ F: hw/s390x/s390-pci*
  F: include/hw/s390x/s390-pci*
  L: qemu-s3...@nongnu.org
  
+S390 channel subsystem

+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/ccw-device.[ch]
+F: hw/s390x/css.c
+F: hw/s390x/css-bridge.c
+F: include/hw/s390x/css.h
+F: include/hw/s390x/css-bridge.h
+F: include/hw/s390x/ioinst.h
+F: target/s390x/ioinst.c
+L: qemu-s3...@nongnu.org
+
+3270 device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Odd fixes
+F: include/hw/s390x/3270-ccw.h
+F: hw/char/terminal3270.c
+F: hw/s390x/3270-ccw.c
+L: qemu-s3...@nongnu.org
+
+diag 288 watchdog
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/watchdog/wdt_diag288.c
+F: include/hw/watchdog/wdt_diag288.h
+L: qemu-s3...@nongnu.org
+
+S390 CPU models
+M: David Hildenbrand 
+S: Maintained
+F: target/s390x/cpu_features*.[ch]
+F: target/s390x/cpu_models.[ch]
+L: qemu-s3...@nongnu.org
+
+S390 storage key device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/storage-keys.h
+F: hw/390x/s390-skeys*.c
+L: qemu-s3...@nongnu.org
+
+S390 storage attribute device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/storage-attributes.h
+F: hw/s390/s390-stattrib*.c
+L: qemu-s3...@nongnu.org
+
+S390 SCLP-backed devices
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: include/hw/s390x/event-facility.h
+F: include/hw/s390x/sclp.h
+F: hw/char/sclp*.[hc]
+F: hw/s390x/event-facility.c
+F: hw/s390x/sclp*.c
+L: qemu-s3...@nongnu.org
+
+S390 floating interrupt controller
+M: Halil Pasic 
+M: Christian Borntraeger 
+M: David Hildenbrand 
+S: Supported
+F: hw/intc/s390_flic.c
+F: hw/intc/s390_flic_kvm.c
+F: include/hw/s390x/s390_flic.h
+L: qemu-s3...@nongnu.org
+
  X86 Machines
  
  PC
@@ -1957,6 +2020,7 @@ M: Halil Pasic 
  S: Supported
  F: hw/s390x/virtio-ccw*.[hc]
  F: hw/s390x/vhost-vsock-ccw.c
+F: hw/s390x/vhost-user-fs-ccw.c
  T: git https://gitlab.com/cohuck/qemu.git s390-next
  T: git https://github.com/borntraeger/qemu.git s390-next
  L: qemu-s3...@nongnu.org





Re: [PATCH] scsi-generic: replace logical block count of response of READ CAPACITY

2021-12-21 Thread Claudio Fontana
Hi Paolo, Hannes,

any thoughts on the following issue?

Introduction:

When using SAN storage for providing block devices to guests, configured as 
SCSI-passthrough devices, increasing the space available in the VM is a use 
case.

To do it, it is currently necessary to:

1) expand storage on the actual SAN,
2) run a "virsh blockresize" or equivalent command to make QEMU aware of the 
new size, and finally
3) do a "rescan-scsi-bus.sh" or equivalent operation in the guest to make the 
running guest aware of the increased disk size.

The problem:

As of now, the administrator needs to make sure that step 3 won't be done 
before step 2 has been executed, or the resulting state will be inconsistent.
In practice this creates organizational issues to try to sync up host/storage 
admins and guest OS admin, and is therefore error prone (due to these human 
factors).

The proposal:

The patch I replied to here from Ma Lin tries to avoid the inconsistent state, 
by having "rescan-scsi-bus.sh" still report the old size in the guest until 
QEMU itself is aware of the new disk size.

The patch:

Before the patch, the SCSI READ_CAPACITY command in the guest os directly 
receives the unmodified response from the storage backend.

After the patch, QEMU intercepts the READ_CAPACITY response and replaces the 
maximum LBA with the information which is saved in QEMU.

This means: after resizing the storage on the SAN backend, the host 
administrator must explicitly notify about CAPACITY HAS CHANGED by issuing a 
block resize command through QMP or libvirt,
even for SCSI passthrough disks.

Any ideas on this patch or on possible alternatives?

Thanks,

Claudio


On 11/20/21 11:15 AM, Lin Ma wrote:
> While using SCSI passthrough, Following scenario makes qemu doesn't
> realized the capacity change of remote scsi target:
> 1. online resize the scsi target.
> 2. issue 'rescan-scsi-bus.sh -s ...' in host.
> 3. issue 'rescan-scsi-bus.sh -s ...' in vm.
> 
> In above scenario I used to experienced errors while accessing the
> additional disk space in vm. I think the reasonable operations should
> be:
> 1. online resize the scsi target.
> 2. issue 'rescan-scsi-bus.sh -s ...' in host.
> 3. issue 'block_resize' via qmp to notify qemu.
> 4. issue 'rescan-scsi-bus.sh -s ...' in vm.
> 
> The errors disappear once I notify qemu by block_resize via qmp.
> 
> So this patch replaces the number of logical blocks of READ CAPACITY
> response from scsi target by qemu's bs->total_sectors. If the user in
> vm wants to access the additional disk space, The administrator of
> host must notify qemu once resizeing the scsi target.
> 
> Bonus is that domblkinfo of libvirt can reflect the consistent capacity
> information between host and vm in case of missing block_resize in qemu.
> E.g:
> ...
> 
>   
>   
>   
>   
>   
>   
> 
> ...
> 
> Before:
> 1. online resize the scsi target.
> 2. host:~  # rescan-scsi-bus.sh -s /dev/sdc
> 3. guest:~ # rescan-scsi-bus.sh -s /dev/sda
> 4  host:~  # virsh domblkinfo --domain $DOMAIN --human --device sda
> Capacity:   4.000 GiB
> Allocation: 0.000 B
> Physical:   8.000 GiB
> 
> 5. guest:~ # lsblk /dev/sda
> NAME   MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
> sda  8:00   8G  0 disk
> └─sda1   8:10   2G  0 part
> 
> After:
> 1. online resize the scsi target.
> 2. host:~  # rescan-scsi-bus.sh -s /dev/sdc
> 3. guest:~ # rescan-scsi-bus.sh -s /dev/sda
> 4  host:~  # virsh domblkinfo --domain $DOMAIN --human --device sda
> Capacity:   4.000 GiB
> Allocation: 0.000 B
> Physical:   8.000 GiB
> 
> 5. guest:~ # lsblk /dev/sda
> NAME   MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
> sda  8:00   4G  0 disk
> └─sda1   8:10   2G  0 part
> 
> Signed-off-by: Lin Ma 
> ---
>  hw/scsi/scsi-generic.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
> index 0306ccc7b1..343b51c2c0 100644
> --- a/hw/scsi/scsi-generic.c
> +++ b/hw/scsi/scsi-generic.c
> @@ -315,11 +315,17 @@ static void scsi_read_complete(void * opaque, int ret)
>  if (r->req.cmd.buf[0] == READ_CAPACITY_10 &&
>  (ldl_be_p(&r->buf[0]) != 0xU || s->max_lba == 0)) {
>  s->blocksize = ldl_be_p(&r->buf[4]);
> -s->max_lba = ldl_be_p(&r->buf[0]) & 0xULL;
> +BlockBackend *blk = s->conf.blk;
> +BlockDriverState *bs = blk_bs(blk);
> +s->max_lba = bs->total_sectors - 1;
> +stl_be_p(&r->buf[0], s->max_lba);
>  } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 &&
> (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) {
>  s->blocksize = ldl_be_p(&r->buf[8]);
> -s->max_lba = ldq_be_p(&r->buf[0]);
> +BlockBackend *blk = s->conf.blk;
> +BlockDriverState *bs = blk_bs(blk);
> +s->max_lba = bs->total_sectors - 1;
> +stq_be_p(&r->buf[0], s->max_lba);
>  }
>  blk_set_guest_block_size(s->conf.blk, s->bl

Re: [PATCH RFC] MAINTAINERS: split out s390x sections

2021-12-21 Thread Thomas Huth

On 20/12/2021 12.54, Cornelia Huck wrote:

Split out some more specialized devices etc., so that we can build
smarter lists of people to be put on cc: in the future.

Signed-off-by: Cornelia Huck 
---

As discussed offlist. Some notes:
- The new sections have inherited the maintainers of the sections
   they have been split out of (except where people had already
   volunteered). That's easy to change, obviously, and I hope that
   the cc: list already contains people who might have interest in
   volunteering for some sections.
- I may not have gotten the F: patterns correct, please double check.
- I'm also not sure about where in the MAINTAINERS file the new
   sections should go; if you have a better idea, please speak up.
- Also, if you have better ideas regarding the sections, please
   speak up as well :)
- Pull requests will probably continue the same way as now (i.e.
   patches picked up at the top level and then sent, except for some
   things like tcg which may go separately.) Not sure if it would
   make sense to try out the submaintainer pull request model again,
   I don't think it made life easier in the past, and now we have
   the b4 tool to pick patches easily anyway. It might be a good
   idea to check which of the tree locations should stay, or if we
   want to have new ones.

---
  MAINTAINERS | 86 ++---
  1 file changed, 75 insertions(+), 11 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9a8d1bdf727d..d1916f075386 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -297,7 +297,6 @@ M: David Hildenbrand 
  S: Maintained
  F: target/s390x/
  F: target/s390x/tcg
-F: target/s390x/cpu_models_*.[ch]
  F: hw/s390x/
  F: disas/s390.c
  F: tests/tcg/s390x/
@@ -396,16 +395,10 @@ M: Halil Pasic 
  M: Christian Borntraeger 
  S: Supported
  F: target/s390x/kvm/
-F: target/s390x/ioinst.[ch]
  F: target/s390x/machine.c
  F: target/s390x/sigp.c
-F: target/s390x/cpu_features*.[ch]
-F: target/s390x/cpu_models.[ch]
  F: hw/s390x/pv.c
  F: include/hw/s390x/pv.h
-F: hw/intc/s390_flic.c
-F: hw/intc/s390_flic_kvm.c
-F: include/hw/s390x/s390_flic.h
  F: gdb-xml/s390*.xml
  T: git https://github.com/borntraeger/qemu.git s390-next
  L: qemu-s3...@nongnu.org
@@ -1529,12 +1522,8 @@ S390 Virtio-ccw
  M: Halil Pasic 
  M: Christian Borntraeger 
  S: Supported
-F: hw/char/sclp*.[hc]
-F: hw/char/terminal3270.c
  F: hw/s390x/
  F: include/hw/s390x/
-F: hw/watchdog/wdt_diag288.c
-F: include/hw/watchdog/wdt_diag288.h
  F: configs/devices/s390x-softmmu/default.mak
  F: tests/avocado/machine_s390_ccw_virtio.py
  T: git https://github.com/borntraeger/qemu.git s390-next
@@ -1559,6 +1548,80 @@ F: hw/s390x/s390-pci*
  F: include/hw/s390x/s390-pci*
  L: qemu-s3...@nongnu.org
  
+S390 channel subsystem

+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/ccw-device.[ch]
+F: hw/s390x/css.c
+F: hw/s390x/css-bridge.c
+F: include/hw/s390x/css.h
+F: include/hw/s390x/css-bridge.h
+F: include/hw/s390x/ioinst.h
+F: target/s390x/ioinst.c
+L: qemu-s3...@nongnu.org
+
+3270 device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Odd fixes
+F: include/hw/s390x/3270-ccw.h
+F: hw/char/terminal3270.c
+F: hw/s390x/3270-ccw.c
+L: qemu-s3...@nongnu.org


I'm a little bit torn between putting the s390x-related devices here in the 
"Machine" section (which should rather be used for machines and not for 
devices), or in the more generic "Devices" section later in the MAINTAINERS 
file. We already have vfio-ccw and vfio-ap in the "Devices" section, so 
maybe we should put the other s390x-related devices there as well? (maybe 
with a "s390x" prefix so that they show up in the same spot if we sort them 
alphabetically?)



+diag 288 watchdog
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/watchdog/wdt_diag288.c
+F: include/hw/watchdog/wdt_diag288.h
+L: qemu-s3...@nongnu.org
+
+S390 CPU models
+M: David Hildenbrand 
+S: Maintained
+F: target/s390x/cpu_features*.[ch]
+F: target/s390x/cpu_models.[ch]
+L: qemu-s3...@nongnu.org
+
+S390 storage key device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/storage-keys.h
+F: hw/390x/s390-skeys*.c
+L: qemu-s3...@nongnu.org
+
+S390 storage attribute device
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: hw/s390x/storage-attributes.h
+F: hw/s390/s390-stattrib*.c
+L: qemu-s3...@nongnu.org
+
+S390 SCLP-backed devices
+M: Halil Pasic 
+M: Christian Borntraeger 
+S: Supported
+F: include/hw/s390x/event-facility.h
+F: include/hw/s390x/sclp.h
+F: hw/char/sclp*.[hc]
+F: hw/s390x/event-facility.c
+F: hw/s390x/sclp*.c
+L: qemu-s3...@nongnu.org
+
+S390 floating interrupt controller
+M: Halil Pasic 
+M: Christian Borntraeger 
+M: David Hildenbrand 
+S: Supported
+F: hw/intc/s390_flic.c
+F: hw/intc/s390_flic_kvm.c


The above two lines could be shortened to:

F: hw/intc/s390_flic*.c


+F: include/hw/s390x/s390_flic.h
+L: qemu-s3...@nongnu.org
+
  X86 Machines
  
  PC
@@ -1957,6 +2020,7 @@ 

Re: [PATCH v2 6/8] migration: Dump sub-cmd name in loadvm_process_command tp

2021-12-21 Thread David Edmondson
On Monday, 2021-12-20 at 16:53:53 +08, Peter Xu wrote:

> It'll be easier to read the name rather than index of sub-cmd when debugging.
>
> Signed-off-by: Peter Xu 
> ---
>  migration/savevm.c | 3 ++-
>  migration/trace-events | 2 +-
>  2 files changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 0bef031acb..7f7af6f750 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -2272,12 +2272,13 @@ static int loadvm_process_command(QEMUFile *f)
>  return qemu_file_get_error(f);
>  }
>
> -trace_loadvm_process_command(cmd, len);
>  if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
>  error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
>  return -EINVAL;
>  }
>
> +trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
> +
>  if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
>  error_report("%s received with bad length - expecting %zu, got %d",
>   mig_cmd_args[cmd].name,
> diff --git a/migration/trace-events b/migration/trace-events
> index b48d873b8a..d63a5915f5 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -22,7 +22,7 @@ loadvm_postcopy_handle_resume(void) ""
>  loadvm_postcopy_ram_handle_discard(void) ""
>  loadvm_postcopy_ram_handle_discard_end(void) ""
>  loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) 
> "%s: %ud"
> -loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
> +loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"

"cmd" rather than "com", to match the code.

>  loadvm_process_command_ping(uint32_t val) "0x%x"
>  postcopy_ram_listen_thread_exit(void) ""
>  postcopy_ram_listen_thread_start(void) ""

dme.
-- 
I went starin' out of my window, been caught doin' it once or twice.




Re: [PATCH v2 7/8] migration: Finer grained tracepoints for POSTCOPY_LISTEN

2021-12-21 Thread David Edmondson
On Monday, 2021-12-20 at 16:53:54 +08, Peter Xu wrote:

> The enablement of postcopy listening has a few steps, add a few tracepoints to
> be there ready for some basic measurements for them.
>
> Signed-off-by: Peter Xu 
> ---
>  migration/savevm.c | 9 -
>  migration/trace-events | 2 +-
>  2 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 7f7af6f750..25face6de0 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1947,9 +1947,10 @@ static void *postcopy_ram_listen_thread(void *opaque)
>  static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
>  {
>  PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
> -trace_loadvm_postcopy_handle_listen();
>  Error *local_err = NULL;
>
> +trace_loadvm_postcopy_handle_listen("enter");
> +
>  if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
>  error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
>  return -1;
> @@ -1964,6 +1965,8 @@ static int 
> loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
>  }
>  }
>
> +trace_loadvm_postcopy_handle_listen("after disgard");

s/disgard/discard/

> +
>  /*
>   * Sensitise RAM - can now generate requests for blocks that don't exist
>   * However, at this point the CPU shouldn't be running, and the IO
> @@ -1976,6 +1979,8 @@ static int 
> loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
>  }
>  }
>
> +trace_loadvm_postcopy_handle_listen("after uffd");
> +
>  if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
>  error_report_err(local_err);
>  return -1;
> @@ -1990,6 +1995,8 @@ static int 
> loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
>  qemu_sem_wait(&mis->listen_thread_sem);
>  qemu_sem_destroy(&mis->listen_thread_sem);
>
> +trace_loadvm_postcopy_handle_listen("exit");
> +

"return" rather than "exit"?

>  return 0;
>  }
>
> diff --git a/migration/trace-events b/migration/trace-events
> index d63a5915f5..77d1237d89 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -14,7 +14,7 @@ loadvm_handle_cmd_packaged_main(int ret) "%d"
>  loadvm_handle_cmd_packaged_received(int ret) "%d"
>  loadvm_handle_recv_bitmap(char *s) "%s"
>  loadvm_postcopy_handle_advise(void) ""
> -loadvm_postcopy_handle_listen(void) ""
> +loadvm_postcopy_handle_listen(const char *str) "%s"
>  loadvm_postcopy_handle_run(void) ""
>  loadvm_postcopy_handle_run_cpu_sync(void) ""
>  loadvm_postcopy_handle_run_vmstart(void) ""

dme.
-- 
When you were the brightest star, who were the shadows?




Re: [PATCH] tests/qtest/virtio-net-failover: Use g_random_int() instead of g_test_rand_int()

2021-12-21 Thread Thomas Huth

On 20/12/2021 21.02, Philippe Mathieu-Daudé wrote:

On 12/20/21 20:26, Richard Henderson wrote:

On 12/20/21 2:27 AM, Thomas Huth wrote:

   const gchar *tmpdir = g_get_tmp_dir();
   gchar *tmpfile = g_strdup_printf("%s/failover_test_migrate-%u-%u",
- tmpdir, getpid(),
g_test_rand_int());
+ tmpdir, getpid(), g_random_int());


Random numbers plus pid are irrelevant, because you still don't have
guaranteed uniqueness -- think stale files in /tmp.

Use g_file_open_tmp.


Another use in test_socket_unix_abstract(),
tests/unit/test-util-sockets.c.


Using g_file_open_tmp is certainly better ... but the tests are currently 
written in a way where they require the file name of the temporary file - so 
switching to g_file_open_tmp() (which only provides a file handle) certainly 
would need some rewrite here... Thus I'd suggest to go first with this patch 
to silence the Assert messages, and then to clean this up properly later.


 Thomas




[PATCH 1/8] configure: simplify creation of plugin symbol list

2021-12-21 Thread Paolo Bonzini
--dynamic-list is present on all supported ELF (not Windows or Darwin)
platforms, since it dates back to 2006; -exported_symbols_list is
likewise present on all supported versions of macOS.  Do not bother
doing a functional test in configure.

Remove the file creation from configure as well: for Darwin, move the
the creation of the Darwin-formatted symbols to meson; for ELF, use the
file in the source path directly and switch from -Wl, to -Xlinker to
not break weird paths that include a comma.

Signed-off-by: Paolo Bonzini 
---
 configure   | 80 -
 plugins/meson.build | 11 +--
 2 files changed, 8 insertions(+), 83 deletions(-)

diff --git a/configure b/configure
index 8ccfe51673..1bce9635d9 100755
--- a/configure
+++ b/configure
@@ -78,7 +78,6 @@ TMPC="${TMPDIR1}/${TMPB}.c"
 TMPO="${TMPDIR1}/${TMPB}.o"
 TMPCXX="${TMPDIR1}/${TMPB}.cxx"
 TMPE="${TMPDIR1}/${TMPB}.exe"
-TMPTXT="${TMPDIR1}/${TMPB}.txt"
 
 rm -f config.log
 
@@ -2343,69 +2342,6 @@ EOF
   fi
 fi
 
-##
-# plugin linker support probe
-
-if test "$plugins" != "no"; then
-
-#
-# See if --dynamic-list is supported by the linker
-
-ld_dynamic_list="no"
-cat > $TMPTXT < $TMPC <
-void foo(void);
-
-void foo(void)
-{
-  printf("foo\n");
-}
-
-int main(void)
-{
-  foo();
-  return 0;
-}
-EOF
-
-if compile_prog "" "-Wl,--dynamic-list=$TMPTXT" ; then
-ld_dynamic_list="yes"
-fi
-
-#
-# See if -exported_symbols_list is supported by the linker
-
-ld_exported_symbols_list="no"
-cat > $TMPTXT <> $config_host_mak
-# Copy the export object list to the build dir
-if test "$ld_dynamic_list" = "yes" ; then
-   echo "CONFIG_HAS_LD_DYNAMIC_LIST=yes" >> $config_host_mak
-   ld_symbols=qemu-plugins-ld.symbols
-   cp "$source_path/plugins/qemu-plugins.symbols" $ld_symbols
-elif test "$ld_exported_symbols_list" = "yes" ; then
-   echo "CONFIG_HAS_LD_EXPORTED_SYMBOLS_LIST=yes" >> $config_host_mak
-   ld64_symbols=qemu-plugins-ld64.symbols
-   echo "# Automatically generated by configure - do not modify" > 
$ld64_symbols
-   grep 'qemu_' "$source_path/plugins/qemu-plugins.symbols" | sed 's/;//g' 
| \
-   sed -E 's/^[[:space:]]*(.*)/_\1/' >> $ld64_symbols
-else
-   error_exit \
-   "If \$plugins=yes, either \$ld_dynamic_list or " \
-   "\$ld_exported_symbols_list should have been set to 'yes'."
-fi
 fi
 
 if test -n "$gdb_bin"; then
diff --git a/plugins/meson.build b/plugins/meson.build
index b3de57853b..d0a2ee94cf 100644
--- a/plugins/meson.build
+++ b/plugins/meson.build
@@ -1,10 +1,15 @@
 plugin_ldflags = []
 # Modules need more symbols than just those in plugins/qemu-plugins.symbols
 if not enable_modules
-  if 'CONFIG_HAS_LD_DYNAMIC_LIST' in config_host
-plugin_ldflags = ['-Wl,--dynamic-list=qemu-plugins-ld.symbols']
-  elif 'CONFIG_HAS_LD_EXPORTED_SYMBOLS_LIST' in config_host
+  if targetos == 'darwin'
+qemu_plugins_symbols_list = configure_file(
+  input: files('qemu-plugins.symbols'),
+  output: 'qemu-plugins-ld64.symbols',
+  capture: true,
+  command: ['sed', '-ne', 's/^[[:space:]]*\\(qemu_.*\\);/_\\1/p', 
'@INPUT@'])
 plugin_ldflags = ['-Wl,-exported_symbols_list,qemu-plugins-ld64.symbols']
+  else
+plugin_ldflags = ['-Xlinker', '--dynamic-list=' + 
(meson.project_source_root() / 'plugins/qemu-plugins.symbols')]
   endif
 endif
 
-- 
2.33.1





[PATCH 2/8] configure: do not set bsd_user/linux_user early

2021-12-21 Thread Paolo Bonzini
Similar to other optional features, leave the variables empty and compute
the actual value later.  Use the existence of include or source directories
to detect whether an OS or CPU supports respectively bsd-user and linux-user.

For now, BSD user-mode emulation is buildable even on TCI-only
architectures.  This probably will change once safe signals are
brought over from linux-user.

Signed-off-by: Paolo Bonzini 
---
 configure | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/configure b/configure
index 1bce9635d9..6dafbcd362 100755
--- a/configure
+++ b/configure
@@ -321,8 +321,8 @@ linux="no"
 solaris="no"
 profiler="no"
 softmmu="yes"
-linux_user="no"
-bsd_user="no"
+linux_user=""
+bsd_user=""
 pkgversion=""
 pie=""
 qom_cast_debug="yes"
@@ -539,7 +539,6 @@ gnu/kfreebsd)
 ;;
 freebsd)
   bsd="yes"
-  bsd_user="yes"
   make="${MAKE-gmake}"
   # needed for kinfo_getvmmap(3) in libutil.h
 ;;
@@ -584,7 +583,6 @@ haiku)
 ;;
 linux)
   linux="yes"
-  linux_user="yes"
   vhost_user=${default_feature:-yes}
 ;;
 esac
@@ -1262,18 +1260,26 @@ if eval test -z "\${cross_cc_$cpu}"; then
 cross_cc_vars="$cross_cc_vars cross_cc_${cpu}"
 fi
 
-# For user-mode emulation the host arch has to be one we explicitly
-# support, even if we're using TCI.
-if [ "$ARCH" = "unknown" ]; then
-  bsd_user="no"
-  linux_user="no"
-fi
-
 default_target_list=""
 deprecated_targets_list=ppc64abi32-linux-user
 deprecated_features=""
 mak_wilds=""
 
+if [ "$linux_user" != no ]; then
+if [ "$targetos" = linux ] && [ -d $source_path/linux-user/host/$cpu ]; 
then
+linux_user=yes
+elif [ "$linux_user" = yes ]; then
+error_exit "linux-user not supported on this architecture"
+fi
+fi
+if [ "$bsd_user" != no ]; then
+if [ "$bsd_user" = "" ]; then
+test $targetos = freebsd && bsd_user=yes
+fi
+if [ "$bsd_user" = yes ] && ! [ -d $source_path/bsd-user/$targetos ]; then
+error_exit "bsd-user not supported on this host OS"
+fi
+fi
 if [ "$softmmu" = "yes" ]; then
 mak_wilds="${mak_wilds} $source_path/configs/targets/*-softmmu.mak"
 fi
-- 
2.33.1





[PATCH 0/8] Next round of configure/meson cleanups

2021-12-21 Thread Paolo Bonzini
Includes v2 of patches from the previous round, and new patches 3-8.

Paolo

Paolo Bonzini (8):
  configure: simplify creation of plugin symbol list
  configure: do not set bsd_user/linux_user early
  configure, makefile: remove traces of really old files
  configure: parse --enable/--disable-strip automatically, flip default
  configure: move non-command-line variables away from command-line
parsing section
  meson: build contrib/ executables after generated headers
  configure, meson: move config-poison.h to meson
  meson: add comments in the target-specific flags section

 Makefile   |  11 +--
 configure  | 151 +
 contrib/elf2dmp/meson.build|   2 +-
 contrib/ivshmem-client/meson.build |   2 +-
 contrib/ivshmem-server/meson.build |   2 +-
 contrib/rdmacm-mux/meson.build |   2 +-
 meson.build|  17 
 pc-bios/s390-ccw/Makefile  |   2 -
 plugins/meson.build|  11 ++-
 scripts/make-config-poison.sh  |  16 +++
 scripts/meson-buildoptions.py  |  21 ++--
 scripts/meson-buildoptions.sh  |   3 +
 12 files changed, 90 insertions(+), 150 deletions(-)
 create mode 100755 scripts/make-config-poison.sh

-- 
2.33.1




[PATCH 4/8] configure: parse --enable/--disable-strip automatically, flip default

2021-12-21 Thread Paolo Bonzini
Always include the STRIP variable in config-host.mak (it's only used
by the s390-ccw firmware build, and it adds a default if configure
omitted it), and use meson-buildoptions.sh to turn
--enable/--disable-strip into -Dstrip.

The default is now not to strip the binaries like for almost every other
package that has a configure script.

Signed-off-by: Paolo Bonzini 
---
 configure | 10 +-
 pc-bios/s390-ccw/Makefile |  2 --
 scripts/meson-buildoptions.py | 21 ++---
 scripts/meson-buildoptions.sh |  3 +++
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/configure b/configure
index e09e5bb58f..40dd6e8d1b 100755
--- a/configure
+++ b/configure
@@ -308,7 +308,6 @@ debug="no"
 sanitizers="no"
 tsan="no"
 fortify_source="$default_feature"
-strip_opt="yes"
 mingw32="no"
 gcov="no"
 EXESUF=""
@@ -891,7 +890,6 @@ for opt do
   debug_tcg="yes"
   debug_mutex="yes"
   debug="yes"
-  strip_opt="no"
   fortify_source="no"
   ;;
   --enable-sanitizers) sanitizers="yes"
@@ -902,8 +900,6 @@ for opt do
   ;;
   --disable-tsan) tsan="no"
   ;;
-  --disable-strip) strip_opt="no"
-  ;;
   --disable-slirp) slirp="disabled"
   ;;
   --enable-slirp) slirp="enabled"
@@ -1370,7 +1366,6 @@ Advanced options (experts only):
   --enable-debug   enable common debug build options
   --enable-sanitizers  enable default sanitizers
   --enable-tsanenable thread sanitizer
-  --disable-strip  disable stripping binaries
   --disable-werror disable compilation abort on warning
   --disable-stack-protector disable compiler-provided stack protection
   --audio-drv-list=LISTset audio drivers to try if -audiodev is not used
@@ -3340,9 +3335,6 @@ echo "GIT_SUBMODULES_ACTION=$git_submodules_action" >> 
$config_host_mak
 if test "$debug_tcg" = "yes" ; then
   echo "CONFIG_DEBUG_TCG=y" >> $config_host_mak
 fi
-if test "$strip_opt" = "yes" ; then
-  echo "STRIP=${strip}" >> $config_host_mak
-fi
 if test "$mingw32" = "yes" ; then
   echo "CONFIG_WIN32=y" >> $config_host_mak
   if test "$guest_agent_with_vss" = "yes" ; then
@@ -3622,6 +3614,7 @@ echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak
 echo "GLIB_LIBS=$glib_libs" >> $config_host_mak
 echo "QEMU_LDFLAGS=$QEMU_LDFLAGS" >> $config_host_mak
 echo "LD_I386_EMULATION=$ld_i386_emulation" >> $config_host_mak
+echo "STRIP=$strip" >> $config_host_mak
 echo "EXESUF=$EXESUF" >> $config_host_mak
 echo "LIBS_QGA=$libs_qga" >> $config_host_mak
 
@@ -3836,7 +3829,6 @@ if test "$skip_meson" = no; then
 -Doptimization=$(if test "$debug" = yes; then echo 0; else echo 2; fi) 
\
 -Ddebug=$(if test "$debug_info" = yes; then echo true; else echo 
false; fi) \
 -Dwerror=$(if test "$werror" = yes; then echo true; else echo false; 
fi) \
--Dstrip=$(if test "$strip_opt" = yes; then echo true; else echo false; 
fi) \
 -Db_pie=$(if test "$pie" = yes; then echo true; else echo false; fi) \
 -Db_coverage=$(if test "$gcov" = yes; then echo true; else echo false; 
fi) \
 -Db_lto=$lto -Dcfi=$cfi -Dtcg=$tcg -Dxen=$xen \
diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile
index cee9d2c63b..0eb68efc7b 100644
--- a/pc-bios/s390-ccw/Makefile
+++ b/pc-bios/s390-ccw/Makefile
@@ -44,8 +44,6 @@ build-all: s390-ccw.img s390-netboot.img
 s390-ccw.elf: $(OBJECTS)
$(call quiet-command,$(CC) $(LDFLAGS) -o $@ 
$(OBJECTS),"BUILD","$(TARGET_DIR)$@")
 
-STRIP ?= strip
-
 s390-ccw.img: s390-ccw.elf
$(call quiet-command,$(STRIP) --strip-unneeded $< -o 
$@,"STRIP","$(TARGET_DIR)$@")
 
diff --git a/scripts/meson-buildoptions.py b/scripts/meson-buildoptions.py
index 96969d89ee..98ae944148 100755
--- a/scripts/meson-buildoptions.py
+++ b/scripts/meson-buildoptions.py
@@ -36,6 +36,10 @@
 "trace_file",
 }
 
+BUILTIN_OPTIONS = {
+"strip",
+}
+
 LINE_WIDTH = 76
 
 
@@ -90,14 +94,17 @@ def allow_arg(opt):
 return not (set(opt["choices"]) <= {"auto", "disabled", "enabled"})
 
 
+def filter_options(json):
+if ":" in json["name"]:
+return False
+if json["section"] == "user":
+return json["name"] not in SKIP_OPTIONS
+else:
+return json["name"] in BUILTIN_OPTIONS
+
+
 def load_options(json):
-json = [
-x
-for x in json
-if x["section"] == "user"
-and ":" not in x["name"]
-and x["name"] not in SKIP_OPTIONS
-]
+json = [x for x in json if filter_options(x)]
 return sorted(json, key=lambda x: x["name"])
 
 
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index ae8f18edc2..46360e541d 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -13,6 +13,7 @@ meson_options_help() {
   printf "%s\n" '   jemalloc/system/tcmalloc)'
   printf "%s\n" '  --enable-slirp[=CHOICE]  Whether and how to find the slirp 
library'
   printf "%s\n" '   (choices: 
auto/disabl

[PATCH 6/8] meson: build contrib/ executables after generated headers

2021-12-21 Thread Paolo Bonzini
This will be needed as soon as config-poison.h moves from configure to
a meson custom_target (which is built at "ninja" time).

Signed-off-by: Paolo Bonzini 
---
 contrib/elf2dmp/meson.build| 2 +-
 contrib/ivshmem-client/meson.build | 2 +-
 contrib/ivshmem-server/meson.build | 2 +-
 contrib/rdmacm-mux/meson.build | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/contrib/elf2dmp/meson.build b/contrib/elf2dmp/meson.build
index 4d86cb390a..6707d43c4f 100644
--- a/contrib/elf2dmp/meson.build
+++ b/contrib/elf2dmp/meson.build
@@ -1,5 +1,5 @@
 if curl.found()
-  executable('elf2dmp', files('main.c', 'addrspace.c', 'download.c', 'pdb.c', 
'qemu_elf.c'),
+  executable('elf2dmp', files('main.c', 'addrspace.c', 'download.c', 'pdb.c', 
'qemu_elf.c'), genh,
  dependencies: [glib, curl],
  install: true)
 endif
diff --git a/contrib/ivshmem-client/meson.build 
b/contrib/ivshmem-client/meson.build
index 1b171efb4f..ce8dcca84d 100644
--- a/contrib/ivshmem-client/meson.build
+++ b/contrib/ivshmem-client/meson.build
@@ -1,4 +1,4 @@
-executable('ivshmem-client', files('ivshmem-client.c', 'main.c'),
+executable('ivshmem-client', files('ivshmem-client.c', 'main.c'), genh,
dependencies: glib,
build_by_default: targetos == 'linux',
install: false)
diff --git a/contrib/ivshmem-server/meson.build 
b/contrib/ivshmem-server/meson.build
index 3a53942201..c6c3c82e89 100644
--- a/contrib/ivshmem-server/meson.build
+++ b/contrib/ivshmem-server/meson.build
@@ -1,4 +1,4 @@
-executable('ivshmem-server', files('ivshmem-server.c', 'main.c'),
+executable('ivshmem-server', files('ivshmem-server.c', 'main.c'), genh,
dependencies: [qemuutil, rt],
build_by_default: targetos == 'linux',
install: false)
diff --git a/contrib/rdmacm-mux/meson.build b/contrib/rdmacm-mux/meson.build
index 6cc5016747..7674f54cc5 100644
--- a/contrib/rdmacm-mux/meson.build
+++ b/contrib/rdmacm-mux/meson.build
@@ -2,7 +2,7 @@ if 'CONFIG_PVRDMA' in config_host
   # if not found, CONFIG_PVRDMA should not be set
   # FIXME: broken on big endian architectures
   libumad = cc.find_library('ibumad', required: true)
-  executable('rdmacm-mux', files('main.c'),
+  executable('rdmacm-mux', files('main.c'), genh,
  dependencies: [glib, libumad],
  build_by_default: false,
  install: false)
-- 
2.33.1





[PATCH 7/8] configure, meson: move config-poison.h to meson

2021-12-21 Thread Paolo Bonzini
This ensures that the file is regenerated properly whenever config-target.h
or config-devices.h files change.

Signed-off-by: Paolo Bonzini 
---
 Makefile  |  2 +-
 configure | 11 ---
 meson.build   | 12 
 scripts/make-config-poison.sh | 16 
 4 files changed, 29 insertions(+), 12 deletions(-)
 create mode 100755 scripts/make-config-poison.sh

diff --git a/Makefile b/Makefile
index 06ad8a61e1..2f80f56a4a 100644
--- a/Makefile
+++ b/Makefile
@@ -220,7 +220,7 @@ qemu-%.tar.bz2:
 
 distclean: clean
-$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean -g || 
:
-   rm -f config-host.mak config-poison.h
+   rm -f config-host.mak
rm -f tests/tcg/config-*.mak
rm -f config.status
rm -f roms/seabios/config.mak
diff --git a/configure b/configure
index 810bc36490..aff371ca81 100755
--- a/configure
+++ b/configure
@@ -3858,17 +3858,6 @@ if test -n "${deprecated_features}"; then
 echo "  features: ${deprecated_features}"
 fi
 
-# Create list of config switches that should be poisoned in common code...
-# but filter out CONFIG_TCG and CONFIG_USER_ONLY which are special.
-target_configs_h=$(ls *-config-devices.h *-config-target.h 2>/dev/null)
-if test -n "$target_configs_h" ; then
-sed -n -e '/CONFIG_TCG/d' -e '/CONFIG_USER_ONLY/d' \
--e '/^#define / { s///; s/ .*//; s/^/#pragma GCC poison /p; }' \
-$target_configs_h | sort -u > config-poison.h
-else
-:> config-poison.h
-fi
-
 # Save the configure command line for later reuse.
 cat  0 ? ['@INPUT@'] : 
[]))
+
 ##
 # Submodules #
 ##
diff --git a/scripts/make-config-poison.sh b/scripts/make-config-poison.sh
new file mode 100755
index 00..d222a04304
--- /dev/null
+++ b/scripts/make-config-poison.sh
@@ -0,0 +1,16 @@
+#! /bin/sh
+
+if test $# = 0; then
+  exit 0
+fi
+
+# Create list of config switches that should be poisoned in common code...
+# but filter out CONFIG_TCG and CONFIG_USER_ONLY which are special.
+exec sed -n \
+  -e' /CONFIG_TCG/d' \
+  -e '/CONFIG_USER_ONLY/d' \
+  -e '/^#define / {' \
+  -e's///' \
+  -e's/ .*//' \
+  -e's/^/#pragma GCC poison /p' \
+  -e '}' "$@"
-- 
2.33.1





[PATCH 3/8] configure, makefile: remove traces of really old files

2021-12-21 Thread Paolo Bonzini
These files have been removed for more than year in the best
case, or for more than ten years for some really old TCG files.
Remove any traces of it.

Signed-off-by: Paolo Bonzini 
---
 Makefile  | 11 ---
 configure |  9 -
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 74c5b46d38..06ad8a61e1 100644
--- a/Makefile
+++ b/Makefile
@@ -205,14 +205,11 @@ recurse-clean: $(addsuffix /clean, $(ROM_DIRS))
 clean: recurse-clean
-$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean || :
-$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) clean-ctlist 
|| :
-# avoid old build problems by removing potentially incorrect old files
-   rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h 
gen-op-arm.h
find . \( -name '*.so' -o -name '*.dll' -o -name '*.[oda]' \) -type f \
! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-aarch64.a \
! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \
-exec rm {} +
-   rm -f TAGS cscope.* *.pod *~ */*~
-   rm -f fsdev/*.pod scsi/*.pod
+   rm -f TAGS cscope.* *~ */*~
 
 VERSION = $(shell cat $(SRC_PATH)/VERSION)
 
@@ -223,10 +220,10 @@ qemu-%.tar.bz2:
 
 distclean: clean
-$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean -g || 
:
-   rm -f config-host.mak config-host.h* config-poison.h
+   rm -f config-host.mak config-poison.h
rm -f tests/tcg/config-*.mak
-   rm -f config-all-disas.mak config.status
-   rm -f roms/seabios/config.mak roms/vgabios/config.mak
+   rm -f config.status
+   rm -f roms/seabios/config.mak
rm -f qemu-plugins-ld.symbols qemu-plugins-ld64.symbols
rm -f *-config-target.h *-config-devices.mak *-config-devices.h
rm -rf meson-private meson-logs meson-info compile_commands.json
diff --git a/configure b/configure
index 6dafbcd362..e09e5bb58f 100755
--- a/configure
+++ b/configure
@@ -3696,9 +3696,6 @@ fi
 # so the build tree will be missing the link back to the new file, and
 # tests might fail. Prefer to keep the relevant files in their own
 # directory and symlink the directory instead.
-# UNLINK is used to remove symlinks from older development versions
-# that might get into the way when doing "git update" without doing
-# a "make distclean" in between.
 LINKS="Makefile"
 LINKS="$LINKS tests/tcg/Makefile.target"
 LINKS="$LINKS pc-bios/optionrom/Makefile"
@@ -3710,7 +3707,6 @@ LINKS="$LINKS tests/avocado tests/data"
 LINKS="$LINKS tests/qemu-iotests/check"
 LINKS="$LINKS python"
 LINKS="$LINKS contrib/plugins/Makefile "
-UNLINK="pc-bios/keymaps"
 for bios_file in \
 $source_path/pc-bios/*.bin \
 $source_path/pc-bios/*.elf \
@@ -3732,11 +3728,6 @@ for f in $LINKS ; do
 symlink "$source_path/$f" "$f"
 fi
 done
-for f in $UNLINK ; do
-if [ -L "$f" ]; then
-rm -f "$f"
-fi
-done
 
 (for i in $cross_cc_vars; do
   export $i
-- 
2.33.1





[PATCH 5/8] configure: move non-command-line variables away from command-line parsing section

2021-12-21 Thread Paolo Bonzini
This makes it easier to identify candidates for moving to Meson.

Signed-off-by: Paolo Bonzini 
---
 configure | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 40dd6e8d1b..810bc36490 100755
--- a/configure
+++ b/configure
@@ -308,16 +308,12 @@ debug="no"
 sanitizers="no"
 tsan="no"
 fortify_source="$default_feature"
-mingw32="no"
 gcov="no"
 EXESUF=""
 modules="no"
 module_upgrades="no"
 prefix="/usr/local"
 qemu_suffix="qemu"
-bsd="no"
-linux="no"
-solaris="no"
 profiler="no"
 softmmu="yes"
 linux_user=""
@@ -331,8 +327,6 @@ opengl="$default_feature"
 cpuid_h="no"
 avx2_opt="$default_feature"
 guest_agent="$default_feature"
-guest_agent_with_vss="no"
-guest_agent_ntddscsi="no"
 vss_win32_sdk="$default_feature"
 win_sdk="no"
 want_tools="$default_feature"
@@ -527,6 +521,10 @@ fi
 
 # OS specific
 
+mingw32="no"
+bsd="no"
+linux="no"
+solaris="no"
 case $targetos in
 windows)
   mingw32="yes"
@@ -2574,6 +2572,7 @@ fi
 ##
 # check if we have VSS SDK headers for win
 
+guest_agent_with_vss="no"
 if test "$mingw32" = "yes" && test "$guest_agent" != "no" && \
 test "$vss_win32_sdk" != "no" ; then
   case "$vss_win32_sdk" in
@@ -2604,7 +2603,6 @@ EOF
   echo "ERROR: The headers are extracted in the directory \`inc'."
   feature_not_found "VSS support"
 fi
-guest_agent_with_vss="no"
   fi
 fi
 
@@ -2631,6 +2629,7 @@ fi
 
 ##
 # check if mingw environment provides a recent ntddscsi.h
+guest_agent_ntddscsi="no"
 if test "$mingw32" = "yes" && test "$guest_agent" != "no"; then
   cat > $TMPC << EOF
 #include 
-- 
2.33.1





[PATCH 8/8] meson: add comments in the target-specific flags section

2021-12-21 Thread Paolo Bonzini
Signed-off-by: Paolo Bonzini 
---
 meson.build | 5 +
 1 file changed, 5 insertions(+)

diff --git a/meson.build b/meson.build
index 09ee427ca4..0a6d57125f 100644
--- a/meson.build
+++ b/meson.build
@@ -233,6 +233,7 @@ endif
 # Target-specific checks and dependencies #
 ###
 
+# Fuzzing
 if get_option('fuzzing') and get_option('fuzzing_engine') == '' and \
 not cc.links('''
   #include 
@@ -244,6 +245,7 @@ if get_option('fuzzing') and get_option('fuzzing_engine') 
== '' and \
   error('Your compiler does not support -fsanitize=fuzzer')
 endif
 
+# Tracing backends
 if 'ftrace' in get_option('trace_backends') and targetos != 'linux'
   error('ftrace is supported only on Linux')
 endif
@@ -257,6 +259,7 @@ if 'syslog' in get_option('trace_backends') and not 
cc.compiles('''
   error('syslog is not supported on this system')
 endif
 
+# Miscellaneous Linux-only features
 if targetos != 'linux' and get_option('mpath').enabled()
   error('Multipath is supported only on Linux')
 endif
@@ -266,6 +269,7 @@ if targetos != 'linux' and 
get_option('multiprocess').enabled()
 endif
 multiprocess_allowed = targetos == 'linux' and not 
get_option('multiprocess').disabled()
 
+# Target-specific libraries and flags
 libm = cc.find_library('m', required: false)
 threads = dependency('threads')
 util = cc.find_library('util', required: false)
@@ -306,6 +310,7 @@ elif targetos == 'openbsd'
   endif
 endif
 
+# Target-specific configuration of accelerators
 accelerators = []
 if not get_option('kvm').disabled() and targetos == 'linux'
   accelerators += 'CONFIG_KVM'
-- 
2.33.1




Re: Virtio-GPU Xres and Yres seettings

2021-12-21 Thread Gerd Hoffmann
On Mon, Dec 20, 2021 at 10:44:06PM +0530, Pratik Parvati wrote:
> > EDID is optional, so you can try disable the EDID feature bit and see
> > what happens.
> 
> Thanks Gerd, after disabling the EDID, I was able to get the required
> resolution (basically width and height) from the driver.
> 
> Another strange observation - When the device receives the
> command VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING with the number of
> entries having a pixel data in scatter gather format, the device is trying
> to store these bytes in contiguous memory. When I read those sg memory, the
> device receives all zeros from the driver (for a 1024x768 display,
> the device receives 3MB of data from the driver). Is this an expected
> behaviour? - If not, what is the driver trying to display on the screen?

How about reading the virtio spec?
display updates are handled with transfer and flush commands.

take care,
  Gerd




[PATCH] docker: include bison in debian-tricore-cross

2021-12-21 Thread Paolo Bonzini
Binutils sometimes fail to build if bison is not installed:

  /bin/sh ./ylwrap `test -f arparse.y || echo ./`arparse.y y.tab.c arparse.c 
y.tab.h arparse.h y.output arparse.output --  -d
  ./ylwrap: 109: ./ylwrap: -d: not found

(the correct invocation of ylwrap would have "bison -d" after the double
dash).  Work around by installing it in the container.

Cc: Alex Bennée 
Signed-off-by: Paolo Bonzini 
---
 tests/docker/dockerfiles/debian-tricore-cross.docker | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/docker/dockerfiles/debian-tricore-cross.docker 
b/tests/docker/dockerfiles/debian-tricore-cross.docker
index d8df2c6117..3f6b55562c 100644
--- a/tests/docker/dockerfiles/debian-tricore-cross.docker
+++ b/tests/docker/dockerfiles/debian-tricore-cross.docker
@@ -16,6 +16,7 @@ MAINTAINER Philippe Mathieu-Daudé 
 RUN apt update && \
 DEBIAN_FRONTEND=noninteractive apt install -yy eatmydata && \
 DEBIAN_FRONTEND=noninteractive eatmydata apt install -yy \
+   bison \
bzip2 \
ca-certificates \
ccache \
-- 
2.33.1




Re: [PATCH] scsi-generic: replace logical block count of response of READ CAPACITY

2021-12-21 Thread Philippe Mathieu-Daudé
Hi Claudio,

On 12/21/21 10:48, Claudio Fontana wrote:
> Hi Paolo, Hannes,
> 
> any thoughts on the following issue?
> 
> Introduction:
> 
> When using SAN storage for providing block devices to guests, configured as 
> SCSI-passthrough devices, increasing the space available in the VM is a use 
> case.
> 
> To do it, it is currently necessary to:
> 
> 1) expand storage on the actual SAN,
> 2) run a "virsh blockresize" or equivalent command to make QEMU aware of the 
> new size, and finally
> 3) do a "rescan-scsi-bus.sh" or equivalent operation in the guest to make the 
> running guest aware of the increased disk size.
> 
> The problem:
> 
> As of now, the administrator needs to make sure that step 3 won't be done 
> before step 2 has been executed, or the resulting state will be inconsistent.
> In practice this creates organizational issues to try to sync up host/storage 
> admins and guest OS admin, and is therefore error prone (due to these human 
> factors).
> 
> The proposal:
> 
> The patch I replied to here from Ma Lin tries to avoid the inconsistent 
> state, by having "rescan-scsi-bus.sh" still report the old size in the guest 
> until QEMU itself is aware of the new disk size.
> 
> The patch:
> 
> Before the patch, the SCSI READ_CAPACITY command in the guest os directly 
> receives the unmodified response from the storage backend.
> 
> After the patch, QEMU intercepts the READ_CAPACITY response and replaces the 
> maximum LBA with the information which is saved in QEMU.
> 
> This means: after resizing the storage on the SAN backend, the host 
> administrator must explicitly notify about CAPACITY HAS CHANGED by issuing a 
> block resize command through QMP or libvirt,
> even for SCSI passthrough disks.
> 
> Any ideas on this patch or on possible alternatives?

I am not an SCSI expert, but Lin's description and yours sound
coherent to me.

One minor nitpick below.

> On 11/20/21 11:15 AM, Lin Ma wrote:
>> While using SCSI passthrough, Following scenario makes qemu doesn't
>> realized the capacity change of remote scsi target:
>> 1. online resize the scsi target.
>> 2. issue 'rescan-scsi-bus.sh -s ...' in host.
>> 3. issue 'rescan-scsi-bus.sh -s ...' in vm.
>>
>> In above scenario I used to experienced errors while accessing the
>> additional disk space in vm. I think the reasonable operations should
>> be:
>> 1. online resize the scsi target.
>> 2. issue 'rescan-scsi-bus.sh -s ...' in host.
>> 3. issue 'block_resize' via qmp to notify qemu.
>> 4. issue 'rescan-scsi-bus.sh -s ...' in vm.
>>
>> The errors disappear once I notify qemu by block_resize via qmp.
>>
>> So this patch replaces the number of logical blocks of READ CAPACITY
>> response from scsi target by qemu's bs->total_sectors. If the user in
>> vm wants to access the additional disk space, The administrator of
>> host must notify qemu once resizeing the scsi target.
>>
>> Bonus is that domblkinfo of libvirt can reflect the consistent capacity
>> information between host and vm in case of missing block_resize in qemu.
>> E.g:
>> ...
>> 
>>   
>>   
>>   
>>   
>>   
>>   
>> 
>> ...
>>
>> Before:
>> 1. online resize the scsi target.
>> 2. host:~  # rescan-scsi-bus.sh -s /dev/sdc
>> 3. guest:~ # rescan-scsi-bus.sh -s /dev/sda
>> 4  host:~  # virsh domblkinfo --domain $DOMAIN --human --device sda
>> Capacity:   4.000 GiB
>> Allocation: 0.000 B
>> Physical:   8.000 GiB
>>
>> 5. guest:~ # lsblk /dev/sda
>> NAME   MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
>> sda  8:00   8G  0 disk
>> └─sda1   8:10   2G  0 part
>>
>> After:
>> 1. online resize the scsi target.
>> 2. host:~  # rescan-scsi-bus.sh -s /dev/sdc
>> 3. guest:~ # rescan-scsi-bus.sh -s /dev/sda
>> 4  host:~  # virsh domblkinfo --domain $DOMAIN --human --device sda
>> Capacity:   4.000 GiB
>> Allocation: 0.000 B
>> Physical:   8.000 GiB
>>
>> 5. guest:~ # lsblk /dev/sda
>> NAME   MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
>> sda  8:00   4G  0 disk
>> └─sda1   8:10   2G  0 part
>>
>> Signed-off-by: Lin Ma 
>> ---
>>  hw/scsi/scsi-generic.c | 10 --
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
>> index 0306ccc7b1..343b51c2c0 100644
>> --- a/hw/scsi/scsi-generic.c
>> +++ b/hw/scsi/scsi-generic.c
>> @@ -315,11 +315,17 @@ static void scsi_read_complete(void * opaque, int ret)
>>  if (r->req.cmd.buf[0] == READ_CAPACITY_10 &&
>>  (ldl_be_p(&r->buf[0]) != 0xU || s->max_lba == 0)) {
>>  s->blocksize = ldl_be_p(&r->buf[4]);
>> -s->max_lba = ldl_be_p(&r->buf[0]) & 0xULL;
>> +BlockBackend *blk = s->conf.blk;
>> +BlockDriverState *bs = blk_bs(blk);
>> +s->max_lba = bs->total_sectors - 1;

I'd add a refresh_max_lba() helper:

 static void refresh_max_lba(SCSIDevice *s)
 {
 BlockBackend *blk = s->conf.blk;
 BlockDriverState *bs = blk_bs(blk);
 uint64_t max_lba = bs->total_sectors - 

Re: [PATCH] vl: Add -set options to device opts dict when using JSON syntax for -device

2021-12-21 Thread Markus Armbruster
MkfsSion  writes:

> When using JSON syntax for -device, -set option can not find device
> specified in JSON by id field. The following commandline is an example:
>
> $ qemu-system-x86_64 -device '{"id":"foo"}' -set device.foo.bar=1
> qemu-system-x86_64: -set device.foo.bar=1: there is no device "foo" defined

Is this a regression?  I suspect commit 5dacda5167 "vl: Enable JSON
syntax for -device" (v6.2.0).

I believe any conversion away from QemuOpts loses -set.

> The patch adds -set options to JSON device opts dict for adding
> options to device by latter object_set_properties_from_keyval call.
>
> Signed-off-by: YuanYang Meng 
> ---
>  include/qemu/option.h |  4 
>  softmmu/vl.c  | 28 
>  util/qemu-option.c|  2 +-
>  3 files changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/include/qemu/option.h b/include/qemu/option.h
> index 306bf07575..31fa9fdc25 100644
> --- a/include/qemu/option.h
> +++ b/include/qemu/option.h
> @@ -45,6 +45,10 @@ const char *get_opt_value(const char *p, char **value);
>  
>  bool parse_option_size(const char *name, const char *value,
> uint64_t *ret, Error **errp);
> +
> +bool parse_option_number(const char *name, const char *value,
> + uint64_t *ret, Error **errp);
> +
>  bool has_help_option(const char *param);
>  
>  enum QemuOptType {
> diff --git a/softmmu/vl.c b/softmmu/vl.c
> index 620a1f1367..feb3c49a65 100644
> --- a/softmmu/vl.c
> +++ b/softmmu/vl.c
> @@ -30,7 +30,9 @@
>  #include "hw/qdev-properties.h"
>  #include "qapi/compat-policy.h"
>  #include "qapi/error.h"
> +#include "qapi/qmp/qbool.h"
>  #include "qapi/qmp/qdict.h"
> +#include "qapi/qmp/qnum.h"
>  #include "qapi/qmp/qstring.h"
>  #include "qapi/qmp/qjson.h"
>  #include "qemu-version.h"
> @@ -2279,6 +2281,7 @@ static void qemu_set_option(const char *str, Error 
> **errp)
>  char group[64], id[64], arg[64];
>  QemuOptsList *list;
>  QemuOpts *opts;
> +DeviceOption *opt;
>  int rc, offset;
>  
>  rc = sscanf(str, "%63[^.].%63[^.].%63[^=]%n", group, id, arg, &offset);
> @@ -2294,6 +2297,31 @@ static void qemu_set_option(const char *str, Error 
> **errp)
>  if (list) {
>  opts = qemu_opts_find(list, id);
>  if (!opts) {
> +QTAILQ_FOREACH(opt, &device_opts, next) {
> +const char *device_id = qdict_get_try_str(opt->opts, 
> "id");
> +if (device_id && (strcmp(device_id, id) == 0)) {
> +if (qdict_get(opt->opts, arg)) {
> +qdict_del(opt->opts, arg);
> +}
> +const char *value = str + offset + 1;
> +QObject *obj = NULL;
> +bool boolean;
> +uint64_t num;
> +if (qapi_bool_parse(arg, value, &boolean, NULL)) {
> +obj = QOBJECT(qbool_from_bool(boolean));
> +} else if (parse_option_number(arg, value, &num, 
> NULL)) {
> +obj = QOBJECT(qnum_from_uint(num));
> +} else if (parse_option_size(arg, value, &num, 
> NULL)) {
> +obj = QOBJECT(qnum_from_uint(num));
> +} else {
> +obj = QOBJECT(qstring_from_str(value));
> +}
> +if (obj) {
> +qdict_put_obj(opt->opts, arg, obj);
> +return;
> +}
> +}
> +}
>  error_setg(errp, "there is no %s \"%s\" defined", group, id);
>  return;
>  }
   qemu_opt_set(opts, arg, str + offset + 1, errp);
   }
   }
   }

Two issues, and only looks fixable.

-device accepts either a QemuOpts or a JSON argument.

It parses the former with qemu_opts_parse_noisily() into a QemuOpt
stored in @qemu_device_opts.

It parses the latter with qobject_from_json() into a QObject stored in
@device_opts.  Yes, the names are confusing.

-set parses its argument into @group, @id, and @arg (the value).

Before the patch, it uses @group and @id to look up the QemuOpt in
@qemu_device_opts.  If found, it updates it with qemu_opt_set().

By design, -set operates on the QemuOpts store.

Options stored in @device_opts are not found.  Your patch tries to fix
that.  Before I can explain what's wrong with it, I need more
background.

QemuOpts arguments are parsed as a set of (key, value) pairs, where the
values are all strings (because qemu_device_opts.desc[] is empty).  We
access them with a qobject_input_visitor_new_keyval() visitor.  This
parses the strings according to the types being visited.

Example: key=42 gets stored as {"key": "42"}.  If we visit it with
visit_type_str(), we use string value "42".  If we visit it with
vi

Re: Virtio-GPU Xres and Yres seettings

2021-12-21 Thread Pratik Parvati
I apologise for not putting the question properly. I am referring virtio
spec to understand the driver and device operation - I had a few questions
on pixel data, as it was not working as expected.

My display is working fine now; thanks for your help.

Regards,
Pratik


On Tue, 21 Dec 2021 at 16:41, Gerd Hoffmann  wrote:

> On Mon, Dec 20, 2021 at 10:44:06PM +0530, Pratik Parvati wrote:
> > > EDID is optional, so you can try disable the EDID feature bit and see
> > > what happens.
> >
> > Thanks Gerd, after disabling the EDID, I was able to get the required
> > resolution (basically width and height) from the driver.
> >
> > Another strange observation - When the device receives the
> > command VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING with the number of
> > entries having a pixel data in scatter gather format, the device is
> trying
> > to store these bytes in contiguous memory. When I read those sg memory,
> the
> > device receives all zeros from the driver (for a 1024x768 display,
> > the device receives 3MB of data from the driver). Is this an expected
> > behaviour? - If not, what is the driver trying to display on the screen?
>
> How about reading the virtio spec?
> display updates are handled with transfer and flush commands.
>
> take care,
>   Gerd
>
>


Re: [PATCH 5/8] configure: move non-command-line variables away from command-line parsing section

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 12:05, Paolo Bonzini wrote:
> This makes it easier to identify candidates for moving to Meson.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  configure | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 8/8] meson: add comments in the target-specific flags section

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 12:05, Paolo Bonzini wrote:
> Signed-off-by: Paolo Bonzini 
> ---
>  meson.build | 5 +
>  1 file changed, 5 insertions(+)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 6/8] meson: build contrib/ executables after generated headers

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 12:05, Paolo Bonzini wrote:
> This will be needed as soon as config-poison.h moves from configure to
> a meson custom_target (which is built at "ninja" time).
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  contrib/elf2dmp/meson.build| 2 +-
>  contrib/ivshmem-client/meson.build | 2 +-
>  contrib/ivshmem-server/meson.build | 2 +-
>  contrib/rdmacm-mux/meson.build | 2 +-
>  4 files changed, 4 insertions(+), 4 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] docker: include bison in debian-tricore-cross

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 12:16, Paolo Bonzini wrote:
> Binutils sometimes fail to build if bison is not installed:
> 
>   /bin/sh ./ylwrap `test -f arparse.y || echo ./`arparse.y y.tab.c arparse.c 
> y.tab.h arparse.h y.output arparse.output --  -d
>   ./ylwrap: 109: ./ylwrap: -d: not found
> 
> (the correct invocation of ylwrap would have "bison -d" after the double
> dash).  Work around by installing it in the container.
> 
> Cc: Alex Bennée 

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/596
Reviewed-by: Philippe Mathieu-Daudé 

> Signed-off-by: Paolo Bonzini 
> ---
>  tests/docker/dockerfiles/debian-tricore-cross.docker | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/tests/docker/dockerfiles/debian-tricore-cross.docker 
> b/tests/docker/dockerfiles/debian-tricore-cross.docker
> index d8df2c6117..3f6b55562c 100644
> --- a/tests/docker/dockerfiles/debian-tricore-cross.docker
> +++ b/tests/docker/dockerfiles/debian-tricore-cross.docker
> @@ -16,6 +16,7 @@ MAINTAINER Philippe Mathieu-Daudé 
>  RUN apt update && \
>  DEBIAN_FRONTEND=noninteractive apt install -yy eatmydata && \
>  DEBIAN_FRONTEND=noninteractive eatmydata apt install -yy \
> +   bison \
> bzip2 \
> ca-certificates \
> ccache \




Re: QOM

2021-12-21 Thread Philippe Mathieu-Daudé
Hi Abhijeet,

On 12/21/21 12:27, abhijeet inamdar wrote:
> Hi,
> 
> 1)What does QOM stand for?

QOM: "QEMU Object Model"

See https://qemu-project.gitlab.io/qemu/devel/qom.html

> 2)Can anyone tell what this error means? 
> 
> (qemu) Unexpected error in object_property_find() at
> /home/ocp/vcpu-playground/vcpu_on_qemu/qemu-4.2.0/qom/object.c:1177:
> qemu-system-arm: Property '.sysbus-irq[0]' not found
> Aborted (core dumped).

I suppose you are trying to connect a device gpio/irq output line
to another device input, likely using sysbus_connect_irq().

The API is "connect the N-th output line from the SysBus device
to this qemu_irq handler", where qemu_irq is the input line.

Apparently your SysBus device doesn't have any output line
registered. These are registered using sysbus_init_irq().
The first call register the first output IRQ, and so on.

Some objects have their QOM interface documented, for
example to use the ARM GIC see:
https://gitlab.com/qemu-project/qemu/-/blob/master/include/hw/intc/arm_gic.h#L22

Hope that helps.

Regards,

Phil.




Re: [PATCH v11 00/31] LoongArch64 port of QEMU TCG

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 10:36, gaosong wrote:
> Hi,
> 
> On 2021/12/21 下午4:44, Philippe Mathieu-Daudé wrote:
>> I took few hours to translate and read all Taobao contracts before
>> registering, then got blacklisted at my first login... Maybe others
>> will get more luck.
>>
>> Having someone at Loongson helping with hardware is certainly easier
>> for the community.
> 
> Loongson company can donate 3a5000 computers or provide an IP access to 
> 3a5000 hardware environment. 

Wow, this is awesome!

> Which way is better?

Access to 3a5000 hardware environment is certainly better for us,
as this remove the maintenance burden.



Re: [PATCH v2 0/2] qemu-img convert: Fix sparseness detection

2021-12-21 Thread Peter Lieven
Am 17.12.21 um 17:46 schrieb Vladimir Sementsov-Ogievskiy:
> Hi all!
>
> 01: only update test output rebasing on master
> 02: replaced with my proposed solution.
>
> Kevin Wolf (1):
>   iotests: Test qemu-img convert of zeroed data cluster
>
> Vladimir Sementsov-Ogievskiy (1):
>   qemu-img: make is_allocated_sectors() more efficient
>
>  qemu-img.c | 23 +++
>  tests/qemu-iotests/122 |  1 +
>  tests/qemu-iotests/122.out |  2 ++
>  3 files changed, 22 insertions(+), 4 deletions(-)
>
Tested-by: Peter Lieven 





Re: QOM

2021-12-21 Thread abhijeet inamdar
Oh,

In that case I have to define my irq set for a machine to handle the
exception and interrupts.

BR.
Abhijeet.

On Tue, 21 Dec, 2021, 12:59 Philippe Mathieu-Daudé, 
wrote:

> Hi Abhijeet,
>
> On 12/21/21 12:27, abhijeet inamdar wrote:
> > Hi,
> >
> > 1)What does QOM stand for?
>
> QOM: "QEMU Object Model"
>
> See https://qemu-project.gitlab.io/qemu/devel/qom.html
>
> > 2)Can anyone tell what this error means?
> >
> > (qemu) Unexpected error in object_property_find() at
> > /home/ocp/vcpu-playground/vcpu_on_qemu/qemu-4.2.0/qom/object.c:1177:
> > qemu-system-arm: Property '.sysbus-irq[0]' not found
> > Aborted (core dumped).
>
> I suppose you are trying to connect a device gpio/irq output line
> to another device input, likely using sysbus_connect_irq().
>
> The API is "connect the N-th output line from the SysBus device
> to this qemu_irq handler", where qemu_irq is the input line.
>
> Apparently your SysBus device doesn't have any output line
> registered. These are registered using sysbus_init_irq().
> The first call register the first output IRQ, and so on.
>
> Some objects have their QOM interface documented, for
> example to use the ARM GIC see:
>
> https://gitlab.com/qemu-project/qemu/-/blob/master/include/hw/intc/arm_gic.h#L22
>
> Hope that helps.
>
> Regards,
>
> Phil.
>
>


Re: QOM

2021-12-21 Thread abhijeet inamdar
As we have almost 80 irq lines and 40-45 NVIC_irq's.

Where can I define them?

BR.
Abhijeet.

On Tue, 21 Dec, 2021, 13:18 abhijeet inamdar, 
wrote:

> Oh,
>
> In that case I have to define my irq set for a machine to handle the
> exception and interrupts.
>
> BR.
> Abhijeet.
>
> On Tue, 21 Dec, 2021, 12:59 Philippe Mathieu-Daudé, 
> wrote:
>
>> Hi Abhijeet,
>>
>> On 12/21/21 12:27, abhijeet inamdar wrote:
>> > Hi,
>> >
>> > 1)What does QOM stand for?
>>
>> QOM: "QEMU Object Model"
>>
>> See https://qemu-project.gitlab.io/qemu/devel/qom.html
>>
>> > 2)Can anyone tell what this error means?
>> >
>> > (qemu) Unexpected error in object_property_find() at
>> > /home/ocp/vcpu-playground/vcpu_on_qemu/qemu-4.2.0/qom/object.c:1177:
>> > qemu-system-arm: Property '.sysbus-irq[0]' not found
>> > Aborted (core dumped).
>>
>> I suppose you are trying to connect a device gpio/irq output line
>> to another device input, likely using sysbus_connect_irq().
>>
>> The API is "connect the N-th output line from the SysBus device
>> to this qemu_irq handler", where qemu_irq is the input line.
>>
>> Apparently your SysBus device doesn't have any output line
>> registered. These are registered using sysbus_init_irq().
>> The first call register the first output IRQ, and so on.
>>
>> Some objects have their QOM interface documented, for
>> example to use the ARM GIC see:
>>
>> https://gitlab.com/qemu-project/qemu/-/blob/master/include/hw/intc/arm_gic.h#L22
>>
>> Hope that helps.
>>
>> Regards,
>>
>> Phil.
>>
>>


Re: [PATCH] failover: Silence warning messages during qtest

2021-12-21 Thread Thomas Huth

On 20/12/2021 15.53, Laurent Vivier wrote:

virtio-net-failover test tries several device combinations that produces
some expected warnings.
These warning can be confusing, so we disable them during the qtest
sequence.

Reported-by: Thomas Huth 
Signed-off-by: Laurent Vivier 
---
  hw/net/virtio-net.c   | 3 ++-
  migration/migration.c | 4 +++-
  2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index f2014d5ea0b3..c64a6b9d1745 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -44,6 +44,7 @@
  #include "hw/pci/pci.h"
  #include "net_rx_pkt.h"
  #include "hw/virtio/vhost.h"
+#include "sysemu/qtest.h"
  
  #define VIRTIO_NET_VM_VERSION11
  
@@ -925,7 +926,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)

  qapi_event_send_failover_negotiated(n->netclient_name);
  qatomic_set(&n->failover_primary_hidden, false);
  failover_add_primary(n, &err);
-if (err) {
+if (err && !qtest_enabled()) {
  warn_report_err(err);
  }


This trips the sanitizer build now:

 https://gitlab.com/thuth/qemu/-/jobs/1907374419

I think you have to error_free(err) in case qtest_enabled() ?

 Thomas




[PATCH v2 0/6] migration: misc cleanups

2021-12-21 Thread Juan Quintela
Hi

Changes since v1:
- Add reviewed tags for reviewed patches
- Change comment about last_stage (philmd)
- Change cast to 1ULL for ram_release_page()
- remove TARGET_PAGE_MASK mask.  It was used only for compression,
  and offset should be page aligned already

Please, review.

Thanks, Juan.

[v1]
This series do several cleanups:
- Dave found that I was using "%d" for unsigned, fix all uses.
- We pass last_stage left and right, but we only use it in two places
  Just move it to RAMState.
- do_compress_page() used a goto when not needed.
- ram_release_pages() was always used with one page
- And put it when we detect zero pages, instead of everywhere we have find a 
zero page.

Please, review.

Juan Quintela (6):
  migration: All this fields are unsigned
  migration: We only need last_stage in two places
  migration: ram_release_pages() always receive 1 page as argument
  migration: Remove masking for compression
  migration: simplify do_compress_ram_page
  migration: Move ram_release_pages() call to save_zero_page_to_file()

 migration/multifd-zlib.c | 20 +--
 migration/multifd-zstd.c | 24 +++---
 migration/multifd.c  | 16 -
 migration/ram.c  | 71 +---
 migration/trace-events   | 26 +++
 5 files changed, 73 insertions(+), 84 deletions(-)

-- 
2.33.1





[PATCH v2 1/6] migration: All this fields are unsigned

2021-12-21 Thread Juan Quintela
So printing it as %d is wrong.  Notice that for the channel id, that
is an uint8_t, but I changed it anyways for consistency.

Signed-off-by: Juan Quintela 
Reviewed-by: Philippe Mathieu-Daudé 
---
 migration/multifd-zlib.c | 20 ++--
 migration/multifd-zstd.c | 24 
 migration/multifd.c  | 16 
 migration/trace-events   | 26 +-
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index da6201704c..9f6ebf1076 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -51,7 +51,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 zs->opaque = Z_NULL;
 if (deflateInit(zs, migrate_multifd_zlib_level()) != Z_OK) {
 g_free(z);
-error_setg(errp, "multifd %d: deflate init failed", p->id);
+error_setg(errp, "multifd %u: deflate init failed", p->id);
 return -1;
 }
 /* To be safe, we reserve twice the size of the packet */
@@ -60,7 +60,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 if (!z->zbuff) {
 deflateEnd(&z->zs);
 g_free(z);
-error_setg(errp, "multifd %d: out of memory for zbuff", p->id);
+error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
 return -1;
 }
 p->data = z;
@@ -132,12 +132,12 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error 
**errp)
 ret = deflate(zs, flush);
 } while (ret == Z_OK && zs->avail_in && zs->avail_out);
 if (ret == Z_OK && zs->avail_in) {
-error_setg(errp, "multifd %d: deflate failed to compress all 
input",
+error_setg(errp, "multifd %u: deflate failed to compress all 
input",
p->id);
 return -1;
 }
 if (ret != Z_OK) {
-error_setg(errp, "multifd %d: deflate returned %d instead of Z_OK",
+error_setg(errp, "multifd %u: deflate returned %d instead of Z_OK",
p->id, ret);
 return -1;
 }
@@ -190,7 +190,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error 
**errp)
 zs->avail_in = 0;
 zs->next_in = Z_NULL;
 if (inflateInit(zs) != Z_OK) {
-error_setg(errp, "multifd %d: inflate init failed", p->id);
+error_setg(errp, "multifd %u: inflate init failed", p->id);
 return -1;
 }
 /* To be safe, we reserve twice the size of the packet */
@@ -198,7 +198,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error 
**errp)
 z->zbuff = g_try_malloc(z->zbuff_len);
 if (!z->zbuff) {
 inflateEnd(zs);
-error_setg(errp, "multifd %d: out of memory for zbuff", p->id);
+error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
 return -1;
 }
 return 0;
@@ -247,7 +247,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error 
**errp)
 int i;
 
 if (flags != MULTIFD_FLAG_ZLIB) {
-error_setg(errp, "multifd %d: flags received %x flags expected %x",
+error_setg(errp, "multifd %u: flags received %x flags expected %x",
p->id, flags, MULTIFD_FLAG_ZLIB);
 return -1;
 }
@@ -284,19 +284,19 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error 
**errp)
 } while (ret == Z_OK && zs->avail_in
  && (zs->total_out - start) < page_size);
 if (ret == Z_OK && (zs->total_out - start) < page_size) {
-error_setg(errp, "multifd %d: inflate generated too few output",
+error_setg(errp, "multifd %u: inflate generated too few output",
p->id);
 return -1;
 }
 if (ret != Z_OK) {
-error_setg(errp, "multifd %d: inflate returned %d instead of Z_OK",
+error_setg(errp, "multifd %u: inflate returned %d instead of Z_OK",
p->id, ret);
 return -1;
 }
 }
 out_size = zs->total_out - out_size;
 if (out_size != expected_size) {
-error_setg(errp, "multifd %d: packet size received %d size expected 
%d",
+error_setg(errp, "multifd %u: packet size received %u size expected 
%u",
p->id, out_size, expected_size);
 return -1;
 }
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index 2d5b61106c..cc4e991724 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -55,7 +55,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
 z->zcs = ZSTD_createCStream();
 if (!z->zcs) {
 g_free(z);
-error_setg(errp, "multifd %d: zstd createCStream failed", p->id);
+error_setg(errp, "multifd %u: zstd createCStream failed", p->id);
 return -1;
 }
 
@@ -63,7 +63,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
 if (ZSTD_isError(res)) {
 ZSTD_freeCStream(z->zcs);
 g

[PATCH v2 2/6] migration: We only need last_stage in two places

2021-12-21 Thread Juan Quintela
We only need last_stage in two places and we are passing it all
around.  Just add a field to RAMState that passes it.

Signed-off-by: Juan Quintela 

---

Repeat subject (philmd suggestion)
---
 migration/ram.c | 41 ++---
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 57efa67f20..7223b0d8ca 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -325,7 +325,8 @@ struct RAMState {
 uint64_t xbzrle_bytes_prev;
 /* Start using XBZRLE (e.g., after the first round). */
 bool xbzrle_enabled;
-
+/* Are we on the last stage of migration */
+bool last_stage;
 /* compression statistics since the beginning of the period */
 /* amount of count that no free thread to compress data */
 uint64_t compress_thread_busy_prev;
@@ -683,11 +684,10 @@ static void xbzrle_cache_zero_page(RAMState *rs, 
ram_addr_t current_addr)
  * @current_addr: addr of the page
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
- * @last_stage: if we are at the completion stage
  */
 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 ram_addr_t current_addr, RAMBlock *block,
-ram_addr_t offset, bool last_stage)
+ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
@@ -695,7 +695,7 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 if (!cache_is_cached(XBZRLE.cache, current_addr,
  ram_counters.dirty_sync_count)) {
 xbzrle_counters.cache_miss++;
-if (!last_stage) {
+if (!rs->last_stage) {
 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
  ram_counters.dirty_sync_count) == -1) {
 return -1;
@@ -734,7 +734,7 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
  * Update the cache contents, so that it corresponds to the data
  * sent, in all cases except where we skip the page.
  */
-if (!last_stage && encoded_len != 0) {
+if (!rs->last_stage && encoded_len != 0) {
 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 /*
  * In the case where we couldn't compress, ensure that the caller
@@ -1290,9 +1290,8 @@ static int save_normal_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset,
  * @rs: current RAM state
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
- * @last_stage: if we are at the completion stage
  */
-static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
+static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
 {
 int pages = -1;
 uint8_t *p;
@@ -1307,8 +1306,8 @@ static int ram_save_page(RAMState *rs, PageSearchStatus 
*pss, bool last_stage)
 XBZRLE_cache_lock();
 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
 pages = save_xbzrle_page(rs, &p, current_addr, block,
- offset, last_stage);
-if (!last_stage) {
+ offset);
+if (!rs->last_stage) {
 /* Can't send this cached data async, since the cache page
  * might get updated before it gets to the wire
  */
@@ -2129,10 +2128,8 @@ static bool save_compress_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
  *
  * @rs: current RAM state
  * @pss: data about the page we want to send
- * @last_stage: if we are at the completion stage
  */
-static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
-bool last_stage)
+static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
 {
 RAMBlock *block = pss->block;
 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
@@ -2171,7 +2168,7 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss,
 return ram_save_multifd_page(rs, block, offset);
 }
 
-return ram_save_page(rs, pss, last_stage);
+return ram_save_page(rs, pss);
 }
 
 /**
@@ -2190,10 +2187,8 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss,
  * @rs: current RAM state
  * @ms: current migration state
  * @pss: data about the page we want to send
- * @last_stage: if we are at the completion stage
  */
-static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
-  bool last_stage)
+static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
 {
 int tmppages, pages = 0;
 size_t pagesize_bits =
@@ -2211,7 +2206,7 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss,
 do {
 /* Check the pages is dirty and if it is send it */
 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
-tmppages = ram_save_t

[PATCH v2 4/6] migration: Remove masking for compression

2021-12-21 Thread Juan Quintela
Remove the mask in the call to ram_release_pages().  Nothing else does
it, and if the offset has that bits set, we have a lot of trouble.

Signed-off-by: Juan Quintela 
---
 migration/ram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 0b98862b9e..4ee0369d6f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1365,7 +1365,7 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 }
 
 exit:
-ram_release_page(block->idstr, offset & TARGET_PAGE_MASK);
+ram_release_page(block->idstr, offset);
 return zero_page;
 }
 
-- 
2.33.1




[PATCH v2 5/6] migration: simplify do_compress_ram_page

2021-12-21 Thread Juan Quintela
The goto is not needed at all.

Signed-off-by: Juan Quintela 
---
 migration/ram.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4ee0369d6f..eddc85ffb0 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1341,12 +1341,11 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 {
 RAMState *rs = ram_state;
 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
-bool zero_page = false;
 int ret;
 
 if (save_zero_page_to_file(rs, f, block, offset)) {
-zero_page = true;
-goto exit;
+ram_release_page(block->idstr, offset & TARGET_PAGE_MASK);
+return true;
 }
 
 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
@@ -1361,12 +1360,8 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 if (ret < 0) {
 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
 error_report("compressed data failed!");
-return false;
 }
-
-exit:
-ram_release_page(block->idstr, offset);
-return zero_page;
+return false;
 }
 
 static void
-- 
2.33.1




[PATCH v2 3/6] migration: ram_release_pages() always receive 1 page as argument

2021-12-21 Thread Juan Quintela
Remove the pages argument. And s/pages/page/

Signed-off-by: Juan Quintela 
Reviewed-by: Philippe Mathieu-Daudé 

--

Use 1LL instead of casts

Signed-off-by: Juan Quintela 
---
 migration/ram.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7223b0d8ca..0b98862b9e 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1204,13 +1204,13 @@ static int save_zero_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
 return -1;
 }
 
-static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
+static void ram_release_page(const char *rbname, uint64_t offset)
 {
 if (!migrate_release_ram() || !migration_in_postcopy()) {
 return;
 }
 
-ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
+ram_discard_range(rbname, offset, 1ULL << TARGET_PAGE_BITS);
 }
 
 /*
@@ -1365,7 +1365,7 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 }
 
 exit:
-ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
+ram_release_page(block->idstr, offset & TARGET_PAGE_MASK);
 return zero_page;
 }
 
@@ -2153,7 +2153,7 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
 }
-ram_release_pages(block->idstr, offset, res);
+ram_release_page(block->idstr, offset);
 return res;
 }
 
-- 
2.33.1




[PATCH v2 6/6] migration: Move ram_release_pages() call to save_zero_page_to_file()

2021-12-21 Thread Juan Quintela
We always need to call it when we find a zero page, so put it in a
single place.

Signed-off-by: Juan Quintela 
---
 migration/ram.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index eddc85ffb0..3cd98e19d5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1158,6 +1158,15 @@ static void migration_bitmap_sync_precopy(RAMState *rs)
 }
 }
 
+static void ram_release_page(const char *rbname, uint64_t offset)
+{
+if (!migrate_release_ram() || !migration_in_postcopy()) {
+return;
+}
+
+ram_discard_range(rbname, offset, 1ULL << TARGET_PAGE_BITS);
+}
+
 /**
  * save_zero_page_to_file: send the zero page to the file
  *
@@ -1179,6 +1188,7 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile 
*file,
 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
 qemu_put_byte(file, 0);
 len += 1;
+ram_release_page(block->idstr, offset);
 }
 return len;
 }
@@ -1204,15 +1214,6 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, 
ram_addr_t offset)
 return -1;
 }
 
-static void ram_release_page(const char *rbname, uint64_t offset)
-{
-if (!migrate_release_ram() || !migration_in_postcopy()) {
-return;
-}
-
-ram_discard_range(rbname, offset, 1ULL << TARGET_PAGE_BITS);
-}
-
 /*
  * @pages: the number of pages written by the control path,
  *< 0 - error
@@ -1344,7 +1345,6 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 int ret;
 
 if (save_zero_page_to_file(rs, f, block, offset)) {
-ram_release_page(block->idstr, offset & TARGET_PAGE_MASK);
 return true;
 }
 
@@ -2148,7 +2148,6 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
 }
-ram_release_page(block->idstr, offset);
 return res;
 }
 
-- 
2.33.1




Re: [PATCH] vl: Add -set options to device opts dict when using JSON syntax for -device

2021-12-21 Thread Markus Armbruster
Markus Armbruster  writes:

> MkfsSion  writes:
>
>> When using JSON syntax for -device, -set option can not find device
>> specified in JSON by id field. The following commandline is an example:
>>
>> $ qemu-system-x86_64 -device '{"id":"foo"}' -set device.foo.bar=1
>> qemu-system-x86_64: -set device.foo.bar=1: there is no device "foo" defined
>
> Is this a regression?  I suspect commit 5dacda5167 "vl: Enable JSON
> syntax for -device" (v6.2.0).

Obviously not a regression: everything that used to work still works.

> I believe any conversion away from QemuOpts loses -set.

[...]




Re: [PATCH v2 6/8] migration: Dump sub-cmd name in loadvm_process_command tp

2021-12-21 Thread Peter Xu
On Tue, Dec 21, 2021 at 10:08:24AM +, David Edmondson wrote:
> > @@ -22,7 +22,7 @@ loadvm_postcopy_handle_resume(void) ""
> >  loadvm_postcopy_ram_handle_discard(void) ""
> >  loadvm_postcopy_ram_handle_discard_end(void) ""
> >  loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) 
> > "%s: %ud"
> > -loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
> > +loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
> 
> "cmd" rather than "com", to match the code.

"com" is what it was used before (perhaps "com"mand?).  Thanks,

-- 
Peter Xu




Re: [PATCH v2 7/8] migration: Finer grained tracepoints for POSTCOPY_LISTEN

2021-12-21 Thread Peter Xu
On Tue, Dec 21, 2021 at 10:12:34AM +, David Edmondson wrote:
> On Monday, 2021-12-20 at 16:53:54 +08, Peter Xu wrote:
> 
> > The enablement of postcopy listening has a few steps, add a few tracepoints 
> > to
> > be there ready for some basic measurements for them.
> >
> > Signed-off-by: Peter Xu 
> > ---
> >  migration/savevm.c | 9 -
> >  migration/trace-events | 2 +-
> >  2 files changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 7f7af6f750..25face6de0 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1947,9 +1947,10 @@ static void *postcopy_ram_listen_thread(void *opaque)
> >  static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> >  {
> >  PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
> > -trace_loadvm_postcopy_handle_listen();
> >  Error *local_err = NULL;
> >
> > +trace_loadvm_postcopy_handle_listen("enter");
> > +
> >  if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) 
> > {
> >  error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", 
> > ps);
> >  return -1;
> > @@ -1964,6 +1965,8 @@ static int 
> > loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> >  }
> >  }
> >
> > +trace_loadvm_postcopy_handle_listen("after disgard");
> 
> s/disgard/discard/

Will fix.

> 
> > +
> >  /*
> >   * Sensitise RAM - can now generate requests for blocks that don't 
> > exist
> >   * However, at this point the CPU shouldn't be running, and the IO
> > @@ -1976,6 +1979,8 @@ static int 
> > loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> >  }
> >  }
> >
> > +trace_loadvm_postcopy_handle_listen("after uffd");
> > +
> >  if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
> >  error_report_err(local_err);
> >  return -1;
> > @@ -1990,6 +1995,8 @@ static int 
> > loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> >  qemu_sem_wait(&mis->listen_thread_sem);
> >  qemu_sem_destroy(&mis->listen_thread_sem);
> >
> > +trace_loadvm_postcopy_handle_listen("exit");
> > +
> 
> "return" rather than "exit"?

I don't think it matters a lot, but sure.

Thanks,

-- 
Peter Xu




Re: [PATCH] failover: Silence warning messages during qtest

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 13:30, Thomas Huth wrote:
> On 20/12/2021 15.53, Laurent Vivier wrote:
>> virtio-net-failover test tries several device combinations that produces
>> some expected warnings.
>> These warning can be confusing, so we disable them during the qtest
>> sequence.
>>
>> Reported-by: Thomas Huth 
>> Signed-off-by: Laurent Vivier 
>> ---
>>   hw/net/virtio-net.c   | 3 ++-
>>   migration/migration.c | 4 +++-
>>   2 files changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
>> index f2014d5ea0b3..c64a6b9d1745 100644
>> --- a/hw/net/virtio-net.c
>> +++ b/hw/net/virtio-net.c
>> @@ -44,6 +44,7 @@
>>   #include "hw/pci/pci.h"
>>   #include "net_rx_pkt.h"
>>   #include "hw/virtio/vhost.h"
>> +#include "sysemu/qtest.h"
>>     #define VIRTIO_NET_VM_VERSION    11
>>   @@ -925,7 +926,7 @@ static void virtio_net_set_features(VirtIODevice
>> *vdev, uint64_t features)
>>   qapi_event_send_failover_negotiated(n->netclient_name);
>>   qatomic_set(&n->failover_primary_hidden, false);
>>   failover_add_primary(n, &err);
>> -    if (err) {
>> +    if (err && !qtest_enabled()) {
>>   warn_report_err(err);
>>   }
> 
> This trips the sanitizer build now:
> 
>  https://gitlab.com/thuth/qemu/-/jobs/1907374419
> 
> I think you have to error_free(err) in case qtest_enabled() ?

Indeed. In that case it might be better to add a
warn_report_err_except_qtest() helper...




Re: [PATCH v2 5/6] migration: simplify do_compress_ram_page

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 13:52, Juan Quintela wrote:
> The goto is not needed at all.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/ram.c | 11 +++
>  1 file changed, 3 insertions(+), 8 deletions(-)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 4ee0369d6f..eddc85ffb0 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1341,12 +1341,11 @@ static bool do_compress_ram_page(QEMUFile *f, 
> z_stream *stream, RAMBlock *block,
>  {
>  RAMState *rs = ram_state;
>  uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
> -bool zero_page = false;
>  int ret;
>  
>  if (save_zero_page_to_file(rs, f, block, offset)) {
> -zero_page = true;
> -goto exit;
> +ram_release_page(block->idstr, offset & TARGET_PAGE_MASK);

We don't want TARGET_PAGE_MASK anymore here, right?

> +return true;
>  }
>  
>  save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
> @@ -1361,12 +1360,8 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
> *stream, RAMBlock *block,
>  if (ret < 0) {
>  qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
>  error_report("compressed data failed!");
> -return false;
>  }
> -
> -exit:
> -ram_release_page(block->idstr, offset);
> -return zero_page;
> +return false;
>  }
>  
>  static void




Re: [PATCH 2/2] qapi/ui: introduce change-vnc-listen

2021-12-21 Thread Vladimir Sementsov-Ogievskiy

21.12.2021 11:13, Marc-André Lureau wrote:

Hi

On Mon, Dec 20, 2021 at 10:24 PM Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>> wrote:

Add command that can change addresses where VNC server listens for new
connections. Prior to 6.0 this functionality was available through
'change' qmp command which was deleted.

Signed-off-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>>


Looks good to me,
Reviewed-by: Marc-André Lureau mailto:marcandre.lur...@redhat.com>>

Could you write an avocado test for it? (tests/avocado/vnc.py)


Thanks a lot for reviewing! I will.


--
Best regards,
Vladimir



[PATCH 3/2] avocado/vnc: add test_change_listen

2021-12-21 Thread Vladimir Sementsov-Ogievskiy
Add simple test-case for new change-vnc-listen qmp command.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/avocado/vnc.py | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/tests/avocado/vnc.py b/tests/avocado/vnc.py
index 096432988f..f05ee1e00a 100644
--- a/tests/avocado/vnc.py
+++ b/tests/avocado/vnc.py
@@ -51,3 +51,13 @@ def test_change_password(self):
 set_password_response = self.vm.qmp('change-vnc-password',
 password='new_password')
 self.assertEqual(set_password_response['return'], {})
+
+def test_change_listen(self):
+self.vm.add_args('-nodefaults', '-S', '-vnc', ':0')
+self.vm.launch()
+self.assertEqual(self.vm.qmp('query-vnc')['return']['service'], '5900')
+res = self.vm.qmp('change-vnc-listen', id='default',
+  addresses=[{'type': 'inet', 'host': '0.0.0.0',
+  'port': '5901'}])
+self.assertEqual(res['return'], {})
+self.assertEqual(self.vm.qmp('query-vnc')['return']['service'], '5901')
-- 
2.31.1




Re: [PATCH 3/2] avocado/vnc: add test_change_listen

2021-12-21 Thread Marc-André Lureau
On Tue, Dec 21, 2021 at 5:38 PM Vladimir Sementsov-Ogievskiy
 wrote:
>
> Add simple test-case for new change-vnc-listen qmp command.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Marc-André Lureau 

thanks

> ---
>  tests/avocado/vnc.py | 10 ++
>  1 file changed, 10 insertions(+)
>
> diff --git a/tests/avocado/vnc.py b/tests/avocado/vnc.py
> index 096432988f..f05ee1e00a 100644
> --- a/tests/avocado/vnc.py
> +++ b/tests/avocado/vnc.py
> @@ -51,3 +51,13 @@ def test_change_password(self):
>  set_password_response = self.vm.qmp('change-vnc-password',
>  password='new_password')
>  self.assertEqual(set_password_response['return'], {})
> +
> +def test_change_listen(self):
> +self.vm.add_args('-nodefaults', '-S', '-vnc', ':0')
> +self.vm.launch()
> +self.assertEqual(self.vm.qmp('query-vnc')['return']['service'], 
> '5900')
> +res = self.vm.qmp('change-vnc-listen', id='default',
> +  addresses=[{'type': 'inet', 'host': '0.0.0.0',
> +  'port': '5901'}])
> +self.assertEqual(res['return'], {})
> +self.assertEqual(self.vm.qmp('query-vnc')['return']['service'], 
> '5901')
> --
> 2.31.1
>




Re: [PATCH] vl: Add opts to device opts list when using JSON syntax for -device

2021-12-21 Thread MkfsSion
Sorry, the patch is not well tested and it causes duplicate devices creation. I 
have send another patch to the mailing list to fix the issue.

On 2021/12/21 16:25, Philippe Mathieu-Daudé wrote:
> Cc'ing Markus.
> 
> On 12/20/21 09:45, MkfsSion wrote:
>> When using JSON syntax for -device, -set option can not find device
>> specified in JSON by id field. The following commandline is an example:
>>
>> $ qemu-system-x86_64 -device '{"id":"foo"}' -set device.foo.bar=1
>> qemu-system-x86_64: -set device.foo.bar=1: there is no device "foo" defined
>>
>> The patch adds device opts to device opts list when a device opts get
>> parsed.
>>
>> Signed-off-by: MkfsSion 
> 
> BTW per:
> https://www.qemu.org/docs/master/devel/submitting-a-patch.html#patch-emails-must-include-a-signed-off-by-line
> "Please use your real name to sign a patch (not an alias or acronym)."
The issue has been fixed in new patch.
> 
>> ---
>>  softmmu/vl.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/softmmu/vl.c b/softmmu/vl.c
>> index 620a1f1367..0dd5acbc1a 100644
>> --- a/softmmu/vl.c
>> +++ b/softmmu/vl.c
>> @@ -3400,6 +3400,8 @@ void qemu_init(int argc, char **argv, char **envp)
>>  loc_save(&opt->loc);
>>  assert(opt->opts != NULL);
>>  QTAILQ_INSERT_TAIL(&device_opts, opt, next);
>> +qemu_opts_from_qdict(qemu_find_opts_err("device", 
>> &error_fatal),
>> + opt->opts, &error_fatal);
>>  } else {
>>  if (!qemu_opts_parse_noisily(qemu_find_opts("device"),
>>   optarg, true)) {
> 



[PATCH] vl: Add -set options to device opts dict when using JSON syntax for -device

2021-12-21 Thread MkfsSion
When using JSON syntax for -device, -set option can not find device
specified in JSON by id field. The following commandline is an example:

$ qemu-system-x86_64 -device '{"id":"foo"}' -set device.foo.bar=1
qemu-system-x86_64: -set device.foo.bar=1: there is no device "foo" defined

The patch adds -set options to JSON device opts dict for adding
options to device by latter object_set_properties_from_keyval call.

Signed-off-by: YuanYang Meng 
---
 include/qemu/option.h |  4 
 softmmu/vl.c  | 28 
 util/qemu-option.c|  2 +-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/qemu/option.h b/include/qemu/option.h
index 306bf07575..31fa9fdc25 100644
--- a/include/qemu/option.h
+++ b/include/qemu/option.h
@@ -45,6 +45,10 @@ const char *get_opt_value(const char *p, char **value);
 
 bool parse_option_size(const char *name, const char *value,
uint64_t *ret, Error **errp);
+
+bool parse_option_number(const char *name, const char *value,
+ uint64_t *ret, Error **errp);
+
 bool has_help_option(const char *param);
 
 enum QemuOptType {
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 620a1f1367..feb3c49a65 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -30,7 +30,9 @@
 #include "hw/qdev-properties.h"
 #include "qapi/compat-policy.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qnum.h"
 #include "qapi/qmp/qstring.h"
 #include "qapi/qmp/qjson.h"
 #include "qemu-version.h"
@@ -2279,6 +2281,7 @@ static void qemu_set_option(const char *str, Error **errp)
 char group[64], id[64], arg[64];
 QemuOptsList *list;
 QemuOpts *opts;
+DeviceOption *opt;
 int rc, offset;
 
 rc = sscanf(str, "%63[^.].%63[^.].%63[^=]%n", group, id, arg, &offset);
@@ -2294,6 +2297,31 @@ static void qemu_set_option(const char *str, Error 
**errp)
 if (list) {
 opts = qemu_opts_find(list, id);
 if (!opts) {
+QTAILQ_FOREACH(opt, &device_opts, next) {
+const char *device_id = qdict_get_try_str(opt->opts, "id");
+if (device_id && (strcmp(device_id, id) == 0)) {
+if (qdict_get(opt->opts, arg)) {
+qdict_del(opt->opts, arg);
+}
+const char *value = str + offset + 1;
+QObject *obj = NULL;
+bool boolean;
+uint64_t num;
+if (qapi_bool_parse(arg, value, &boolean, NULL)) {
+obj = QOBJECT(qbool_from_bool(boolean));
+} else if (parse_option_number(arg, value, &num, 
NULL)) {
+obj = QOBJECT(qnum_from_uint(num));
+} else if (parse_option_size(arg, value, &num, NULL)) {
+obj = QOBJECT(qnum_from_uint(num));
+} else {
+obj = QOBJECT(qstring_from_str(value));
+}
+if (obj) {
+qdict_put_obj(opt->opts, arg, obj);
+return;
+}
+}
+}
 error_setg(errp, "there is no %s \"%s\" defined", group, id);
 return;
 }
diff --git a/util/qemu-option.c b/util/qemu-option.c
index eedd08929b..b2427cba9f 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -88,7 +88,7 @@ const char *get_opt_value(const char *p, char **value)
 return offset;
 }
 
-static bool parse_option_number(const char *name, const char *value,
+bool parse_option_number(const char *name, const char *value,
 uint64_t *ret, Error **errp)
 {
 uint64_t number;
-- 
2.34.1




Re: [PATCH RFC] MAINTAINERS: split out s390x sections

2021-12-21 Thread Halil Pasic
On Mon, 20 Dec 2021 12:54:19 +0100
Cornelia Huck  wrote:

> Split out some more specialized devices etc., so that we can build
> smarter lists of people to be put on cc: in the future.
> 
> Signed-off-by: Cornelia Huck 

LGTM

Acked-by: Halil Pasic 



Re: [PATCH 2/2] qapi/ui: introduce change-vnc-listen

2021-12-21 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> Add command that can change addresses where VNC server listens for new
> connections. Prior to 6.0 this functionality was available through
> 'change' qmp command which was deleted.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  docs/about/removed-features.rst |  3 ++-
>  qapi/ui.json| 12 
>  ui/vnc.c| 26 ++
>  3 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
> index d42c3341de..20e6901a82 100644
> --- a/docs/about/removed-features.rst
> +++ b/docs/about/removed-features.rst
> @@ -348,7 +348,8 @@ documentation of ``query-hotpluggable-cpus`` for 
> additional details.
>  ``change`` (removed in 6.0)
>  '''
>  
> -Use ``blockdev-change-medium`` or ``change-vnc-password`` instead.
> +Use ``blockdev-change-medium`` or ``change-vnc-password`` or
> +``change-vnc-listen`` instead.
>  
>  ``query-events`` (removed in 6.0)
>  '
> diff --git a/qapi/ui.json b/qapi/ui.json
> index d7567ac866..14e6fe0b4c 100644
> --- a/qapi/ui.json
> +++ b/qapi/ui.json
> @@ -1304,3 +1304,15 @@
>  { 'command': 'display-reload',
>'data': 'DisplayReloadOptions',
>'boxed' : true }
> +
> +##
> +# @change-vnc-listen:
> +#
> +# Change set of addresses to listen for connections.

Please document the arguments:

   # @id: lorem ipsum
   #
   # @address: dolor sit amet
   #
   # @websockets: consectetur adipisici elit

> +#
> +# Since: 7.0
> +#
> +##
> +{ 'command': 'change-vnc-listen',
> +  'data': { 'id': 'str', 'addresses': ['SocketAddress'],
> +'*websockets': ['SocketAddress'] } }

Lacks 'if': 'CONFIG_VNC'.

We already have change-vnc-password.  You add change-vnc-listen.  Is
there anything else we might want to change?

Aside: what's the difference between change-vnc-password and
set_password?

[...]




Re: [PATCH v4 0/7] nbd reconnect on open

2021-12-21 Thread Vladimir Sementsov-Ogievskiy

13.12.2021 18:32, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

The functionality is reviewed, python testing part is not.

I've dropped the patch "qapi: make blockdev-add a coroutine command":
it's optional, I don't want to slow down the whole series because of it.

v4:
01-03: wording,  add Eric's r-b
others: small changes, never had an r-b

Vladimir Sementsov-Ogievskiy (7):
   nbd: allow reconnect on open, with corresponding new options
   nbd/client-connection: nbd_co_establish_connection(): return real
 error
   nbd/client-connection: improve error message of cancelled attempt
   iotests.py: add qemu_tool_popen()
   For qemu_io* functions support --image-opts argument, which conflicts
 with -f argument from qemu_io_args.
   Add qemu-io Popen constructor wrapper. To be used in the following new
 test commit.
   iotests: add nbd-reconnect-on-open test

  qapi/block-core.json  |  9 ++-
  block/nbd.c   | 45 +++-
  nbd/client-connection.c   | 59 ++-
  tests/qemu-iotests/iotests.py | 36 ++
  .../qemu-iotests/tests/nbd-reconnect-on-open  | 71 +++
  .../tests/nbd-reconnect-on-open.out   | 11 +++
  6 files changed, 199 insertions(+), 32 deletions(-)
  create mode 100755 tests/qemu-iotests/tests/nbd-reconnect-on-open
  create mode 100644 tests/qemu-iotests/tests/nbd-reconnect-on-open.out



Thanks for reviewing!

I do s/6.2/7.0/ fix to patch 1, restore subjects of patches 5,6 (which were 
somehow lost in transition v3->v4) and apply the series to my nbd branch.


--
Best regards,
Vladimir



Re: [RFC PATCH v2 02/14] job.h: categorize fields in struct Job

2021-12-21 Thread Emanuele Giuseppe Esposito




On 16/12/2021 17:21, Stefan Hajnoczi wrote:

On Thu, Nov 04, 2021 at 10:53:22AM -0400, Emanuele Giuseppe Esposito wrote:

Categorize the fields in struct Job to understand which ones
need to be protected by the job mutex and which don't.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  include/qemu/job.h | 57 +++---
  1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index ccf7826426..f7036ac6b3 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -40,27 +40,52 @@ typedef struct JobTxn JobTxn;
   * Long-running operation.
   */
  typedef struct Job {
+
+/* Fields set at initialization (job_create), and never modified */
+
  /** The ID of the job. May be NULL for internal jobs. */
  char *id;
  
-/** The type of this job. */

+/**
+ * The type of this job.
+ * All callbacks are called with job_mutex *not* held.
+ */
  const JobDriver *driver;
  
-/** Reference count of the block job */

-int refcnt;
-
-/** Current state; See @JobStatus for details. */
-JobStatus status;
-
  /** AioContext to run the job coroutine in */
  AioContext *aio_context;


"Fields set at initialization (job_create), and never modified" does not
apply here. blockjob.c:child_job_set_aio_ctx() changes it at runtime.



Right. aio_context can theoretically avoid also the job_mutex, if we 
make sure that all klass->set_aio_ctx() are under BQL (they are) and 
under drains (work in progress). For now I will protect it with job_lock().


Thank you,
Emanuele




[PATCH v3 02/15] pcie: Add some SR/IOV API documentation in docs/pcie_sriov.txt

2021-12-21 Thread Lukasz Maniak
From: Knut Omang 

Add a small intro + minimal documentation for how to
implement SR/IOV support for an emulated device.

Signed-off-by: Knut Omang 
---
 docs/pcie_sriov.txt | 115 
 1 file changed, 115 insertions(+)
 create mode 100644 docs/pcie_sriov.txt

diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
new file mode 100644
index 00..f5e891e1d4
--- /dev/null
+++ b/docs/pcie_sriov.txt
@@ -0,0 +1,115 @@
+PCI SR/IOV EMULATION SUPPORT
+
+
+Description
+===
+SR/IOV (Single Root I/O Virtualization) is an optional extended capability
+of a PCI Express device. It allows a single physical function (PF) to appear 
as multiple
+virtual functions (VFs) for the main purpose of eliminating software
+overhead in I/O from virtual machines.
+
+Qemu now implements the basic common functionality to enable an emulated device
+to support SR/IOV. Yet no fully implemented devices exists in Qemu, but a
+proof-of-concept hack of the Intel igb can be found here:
+
+git://github.com/knuto/qemu.git sriov_patches_v5
+
+Implementation
+==
+Implementing emulation of an SR/IOV capable device typically consists of
+implementing support for two types of device classes; the "normal" physical 
device
+(PF) and the virtual device (VF). From Qemu's perspective, the VFs are just
+like other devices, except that some of their properties are derived from
+the PF.
+
+A virtual function is different from a physical function in that the BAR
+space for all VFs are defined by the BAR registers in the PFs SR/IOV
+capability. All VFs have the same BARs and BAR sizes.
+
+Accesses to these virtual BARs then is computed as
+
++  *  + 
+
+From our emulation perspective this means that there is a separate call for
+setting up a BAR for a VF.
+
+1) To enable SR/IOV support in the PF, it must be a PCI Express device so
+   you would need to add a PCI Express capability in the normal PCI
+   capability list. You might also want to add an ARI (Alternative
+   Routing-ID Interpretation) capability to indicate that your device
+   supports functions beyond it's "own" function space (0-7),
+   which is necessary to support more than 7 functions, or
+   if functions extends beyond offset 7 because they are placed at an
+   offset > 1 or have stride > 1.
+
+   ...
+   #include "hw/pci/pcie.h"
+   #include "hw/pci/pcie_sriov.h"
+
+   pci_your_pf_dev_realize( ... )
+   {
+  ...
+  int ret = pcie_endpoint_cap_init(d, 0x70);
+  ...
+  pcie_ari_init(d, 0x100, 1);
+  ...
+
+  /* Add and initialize the SR/IOV capability */
+  pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
+   vf_devid, initial_vfs, total_vfs,
+   fun_offset, stride);
+
+  /* Set up individual VF BARs (parameters as for normal BARs) */
+  pcie_sriov_pf_init_vf_bar( ... )
+  ...
+   }
+
+   For cleanup, you simply call:
+
+  pcie_sriov_pf_exit(device);
+
+   which will delete all the virtual functions and associated resources.
+
+2) Similarly in the implementation of the virtual function, you need to
+   make it a PCI Express device and add a similar set of capabilities
+   except for the SR/IOV capability. Then you need to set up the VF BARs as
+   subregions of the PFs SR/IOV VF BARs by calling
+   pcie_sriov_vf_register_bar() instead of the normal pci_register_bar() call:
+
+   pci_your_vf_dev_realize( ... )
+   {
+  ...
+  int ret = pcie_endpoint_cap_init(d, 0x60);
+  ...
+  pcie_ari_init(d, 0x100, 1);
+  ...
+  memory_region_init(mr, ... )
+  pcie_sriov_vf_register_bar(d, bar_nr, mr);
+  ...
+   }
+
+Testing on Linux guest
+==
+The easiest is if your device driver supports sysfs based SR/IOV
+enabling. Support for this was added in kernel v.3.8, so not all drivers
+support it yet.
+
+To enable 4 VFs for a device at 01:00.0:
+
+   modprobe yourdriver
+   echo 4 > /sys/bus/pci/devices/:01:00.0/sriov_numvfs
+
+You should now see 4 VFs with lspci.
+To turn SR/IOV off again - the standard requires you to turn it off before you 
can enable
+another VF count, and the emulation enforces this:
+
+   echo 0 > /sys/bus/pci/devices/:01:00.0/sriov_numvfs
+
+Older drivers typically provide a max_vfs module parameter
+to enable it at load time:
+
+   modprobe yourdriver max_vfs=4
+
+To disable the VFs again then, you simply have to unload the driver:
+
+   rmmod yourdriver
-- 
2.25.1




[PATCH v3 00/15] hw/nvme: SR-IOV with Virtualization Enhancements

2021-12-21 Thread Lukasz Maniak
This is the version of the patch series that we consider ready for
staging. We do not intend to work on the v4 unless there are major
issues.

Changes since v2:
- The documentation mentions that SR-IOV support is still an
  experimental feature.
- The default value activates properly when sriov_max_v{i,q}_per_vf == 0.
- Secondary Controller List (CNS 15h) handles the CDW10.CNTID field.
- Virtual Function Number ("VFN") in Secondary Controller Entry is not
  cleared to zero as the controller goes offline.
- Removed no longer used helper pcie_sriov_vf_number_total.
- Reset other than Controller Reset is necessary to activate (or
  deactivate) flexible resources.
- The v{i,q}rfap fields in Primary Controller Capabilities store the
  currently active number of bound resources, not the number active
  after reset.
- Secondary controller cannot be set online unless the corresponding VF
  is enabled (sriov_numvfs set to at least the secondary controller's VF
  number)

The list of opens and known gaps remains the same as for v2:
https://lists.gnu.org/archive/html/qemu-block/2021-11/msg00423.html

Knut Omang (2):
  pcie: Add support for Single Root I/O Virtualization (SR/IOV)
  pcie: Add some SR/IOV API documentation in docs/pcie_sriov.txt

Lukasz Maniak (4):
  hw/nvme: Add support for SR-IOV
  hw/nvme: Add support for Primary Controller Capabilities
  hw/nvme: Add support for Secondary Controller List
  docs: Add documentation for SR-IOV and Virtualization Enhancements

Łukasz Gieryk (9):
  pcie: Add a helper to the SR/IOV API
  pcie: Add 1.2 version token for the Power Management Capability
  hw/nvme: Implement the Function Level Reset
  hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime
  hw/nvme: Remove reg_size variable and update BAR0 size calculation
  hw/nvme: Calculate BAR attributes in a function
  hw/nvme: Initialize capability structures for primary/secondary
controllers
  hw/nvme: Add support for the Virtualization Management command
  hw/nvme: Update the initalization place for the AER queue

 docs/pcie_sriov.txt  | 115 ++
 docs/system/devices/nvme.rst |  36 ++
 hw/nvme/ctrl.c   | 665 ---
 hw/nvme/ns.c |   2 +-
 hw/nvme/nvme.h   |  55 ++-
 hw/nvme/subsys.c |  75 +++-
 hw/nvme/trace-events |   6 +
 hw/pci/meson.build   |   1 +
 hw/pci/pci.c |  97 +++--
 hw/pci/pcie.c|   5 +
 hw/pci/pcie_sriov.c  | 295 
 hw/pci/trace-events  |   5 +
 include/block/nvme.h |  65 
 include/hw/pci/pci.h |  12 +-
 include/hw/pci/pci_ids.h |   1 +
 include/hw/pci/pci_regs.h|   1 +
 include/hw/pci/pcie.h|   6 +
 include/hw/pci/pcie_sriov.h  |  72 
 include/qemu/typedefs.h  |   2 +
 19 files changed, 1435 insertions(+), 81 deletions(-)
 create mode 100644 docs/pcie_sriov.txt
 create mode 100644 hw/pci/pcie_sriov.c
 create mode 100644 include/hw/pci/pcie_sriov.h

-- 
2.25.1




[PATCH v3 05/15] hw/nvme: Add support for SR-IOV

2021-12-21 Thread Lukasz Maniak
This patch implements initial support for Single Root I/O Virtualization
on an NVMe device.

Essentially, it allows to define the maximum number of virtual functions
supported by the NVMe controller via sriov_max_vfs parameter.

Passing a non-zero value to sriov_max_vfs triggers reporting of SR-IOV
capability by a physical controller and ARI capability by both the
physical and virtual function devices.

NVMe controllers created via virtual functions mirror functionally
the physical controller, which may not entirely be the case, thus
consideration would be needed on the way to limit the capabilities of
the VF.

NVMe subsystem is required for the use of SR-IOV.

Signed-off-by: Lukasz Maniak 
---
 hw/nvme/ctrl.c   | 84 ++--
 hw/nvme/nvme.h   |  3 +-
 include/hw/pci/pci_ids.h |  1 +
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 5f573c417b..159635c1af 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -35,6 +35,7 @@
  *  mdts=,vsl=, \
  *  zoned.zasl=, \
  *  zoned.auto_transition=, \
+ *  sriov_max_vfs= \
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
@@ -106,6 +107,12 @@
  *   transitioned to zone state closed for resource management purposes.
  *   Defaults to 'on'.
  *
+ * - `sriov_max_vfs`
+ *   Indicates the maximum number of PCIe virtual functions supported
+ *   by the controller. The default value is 0. Specifying a non-zero value
+ *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
+ *   Virtual function controllers will not report SR-IOV capability.
+ *
  * nvme namespace device parameters
  * 
  * - `shared`
@@ -160,6 +167,7 @@
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
 #include "hw/pci/msix.h"
+#include "hw/pci/pcie_sriov.h"
 #include "migration/vmstate.h"
 
 #include "nvme.h"
@@ -175,6 +183,9 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_VFS 127
+#define NVME_VF_OFFSET 0x1
+#define NVME_VF_STRIDE 1
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -5588,6 +5599,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
 g_free(event);
 }
 
+if (!pci_is_vf(&n->parent_obj) && n->params.sriov_max_vfs) {
+pcie_sriov_pf_disable_vfs(&n->parent_obj);
+}
+
 n->aer_queued = 0;
 n->outstanding_aers = 0;
 n->qs_created = false;
@@ -6269,6 +6284,29 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 error_setg(errp, "vsl must be non-zero");
 return;
 }
+
+if (params->sriov_max_vfs) {
+if (!n->subsys) {
+error_setg(errp, "subsystem is required for the use of SR-IOV");
+return;
+}
+
+if (params->sriov_max_vfs > NVME_MAX_VFS) {
+error_setg(errp, "sriov_max_vfs must be between 0 and %d",
+   NVME_MAX_VFS);
+return;
+}
+
+if (params->cmb_size_mb) {
+error_setg(errp, "CMB is not supported with SR-IOV");
+return;
+}
+
+if (n->pmr.dev) {
+error_setg(errp, "PMR is not supported with SR-IOV");
+return;
+}
+}
 }
 
 static void nvme_init_state(NvmeCtrl *n)
@@ -6326,6 +6364,20 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice 
*pci_dev)
 memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
+static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
+uint64_t bar_size)
+{
+uint16_t vf_dev_id = n->params.use_intel_id ?
+ PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
+
+pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
+   n->params.sriov_max_vfs, n->params.sriov_max_vfs,
+   NVME_VF_OFFSET, NVME_VF_STRIDE);
+
+pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+  PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
+}
+
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
@@ -6340,7 +6392,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 if (n->params.use_intel_id) {
 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
-pci_config_set_device_id(pci_conf, 0x5845);
+pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
 } else {
 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
@@ -6348,6 +6400,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
 pcie_endpoint_cap_init(pci_dev, 0x80);
+if (n->par

[PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2021-12-21 Thread Lukasz Maniak
From: Knut Omang 

This patch provides the building blocks for creating an SR/IOV
PCIe Extended Capability header and register/unregister
SR/IOV Virtual Functions.

Signed-off-by: Knut Omang 
---
 hw/pci/meson.build  |   1 +
 hw/pci/pci.c|  97 +---
 hw/pci/pcie.c   |   5 +
 hw/pci/pcie_sriov.c | 287 
 hw/pci/trace-events |   5 +
 include/hw/pci/pci.h|  12 +-
 include/hw/pci/pcie.h   |   6 +
 include/hw/pci/pcie_sriov.h |  67 +
 include/qemu/typedefs.h |   2 +
 9 files changed, 456 insertions(+), 26 deletions(-)
 create mode 100644 hw/pci/pcie_sriov.c
 create mode 100644 include/hw/pci/pcie_sriov.h

diff --git a/hw/pci/meson.build b/hw/pci/meson.build
index 5c4bbac817..bcc9c75919 100644
--- a/hw/pci/meson.build
+++ b/hw/pci/meson.build
@@ -5,6 +5,7 @@ pci_ss.add(files(
   'pci.c',
   'pci_bridge.c',
   'pci_host.c',
+  'pcie_sriov.c',
   'shpc.c',
   'slotid_cap.c'
 ))
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e5993c1ef5..1892a7e74c 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
 {
 uint8_t type;
 
+/* PCIe virtual functions do not have their own BARs */
+assert(!pci_is_vf(d));
+
 if (reg != PCI_ROM_SLOT)
 return PCI_BASE_ADDRESS_0 + reg * 4;
 
@@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
 }
 }
 
-static void pci_do_device_reset(PCIDevice *dev)
+static void pci_reset_regions(PCIDevice *dev)
 {
 int r;
+if (pci_is_vf(dev)) {
+return;
+}
+
+for (r = 0; r < PCI_NUM_REGIONS; ++r) {
+PCIIORegion *region = &dev->io_regions[r];
+if (!region->size) {
+continue;
+}
+
+if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
+region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+pci_set_quad(dev->config + pci_bar(dev, r), region->type);
+} else {
+pci_set_long(dev->config + pci_bar(dev, r), region->type);
+}
+}
+}
 
+static void pci_do_device_reset(PCIDevice *dev)
+{
 pci_device_deassert_intx(dev);
 assert(dev->irq_state == 0);
 
@@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
   pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
   pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
 dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
-for (r = 0; r < PCI_NUM_REGIONS; ++r) {
-PCIIORegion *region = &dev->io_regions[r];
-if (!region->size) {
-continue;
-}
-
-if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
-region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
-pci_set_quad(dev->config + pci_bar(dev, r), region->type);
-} else {
-pci_set_long(dev->config + pci_bar(dev, r), region->type);
-}
-}
+pci_reset_regions(dev);
 pci_update_mappings(dev);
 
 msi_reset(dev);
@@ -884,6 +895,15 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice 
*dev, Error **errp)
 dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
 }
 
+/* With SR/IOV and ARI, a device at function 0 need not be a multifunction
+ * device, as it may just be a VF that ended up with function 0 in
+ * the legacy PCI interpretation. Avoid failing in such cases:
+ */
+if (pci_is_vf(dev) &&
+dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+return;
+}
+
 /*
  * multifunction bit is interpreted in two ways as follows.
  *   - all functions must set the bit to 1.
@@ -1083,6 +1103,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
bus->devices[devfn]->name);
 return NULL;
 } else if (dev->hotplugged &&
+   !pci_is_vf(pci_dev) &&
pci_get_function_0(pci_dev)) {
 error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
" new func %s cannot be exposed to guest.",
@@ -1191,6 +1212,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
 pcibus_t size = memory_region_size(memory);
 uint8_t hdr_type;
 
+assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
 assert(region_num >= 0);
 assert(region_num < PCI_NUM_REGIONS);
 assert(is_power_of_2(size));
@@ -1294,11 +1316,43 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int 
region_num)
 return pci_dev->io_regions[region_num].addr;
 }
 
-static pcibus_t pci_bar_address(PCIDevice *d,
-int reg, uint8_t type, pcibus_t size)
+static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
+uint8_t type, pcibus_t size)
+{
+pcibus_t new_addr;
+if (!pci_is_vf(d)) {
+int bar = pci_bar(d, reg);
+if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+new_addr = pci

[PATCH v3 03/15] pcie: Add a helper to the SR/IOV API

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

Convenience function for retrieving the PCIDevice object of the N-th VF.

Signed-off-by: Łukasz Gieryk 
Reviewed-by: Knut Omang 
---
 hw/pci/pcie_sriov.c | 10 +-
 include/hw/pci/pcie_sriov.h |  5 +
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index 501a1ff433..be8c907e06 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -280,8 +280,16 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev)
 return dev->exp.sriov_vf.vf_number;
 }
 
-
 PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
 {
 return dev->exp.sriov_vf.pf;
 }
+
+PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
+{
+assert(!pci_is_vf(dev));
+if (n < dev->exp.sriov_pf.num_vfs) {
+return dev->exp.sriov_pf.vf[n];
+}
+return NULL;
+}
diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
index 0974f00054..cd2aebd3a6 100644
--- a/include/hw/pci/pcie_sriov.h
+++ b/include/hw/pci/pcie_sriov.h
@@ -64,4 +64,9 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev);
  */
 PCIDevice *pcie_sriov_get_pf(PCIDevice *dev);
 
+/* Get the n-th VF of this physical function - only valid for PF.
+ * Returns NULL if index is invalid
+ */
+PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n);
+
 #endif /* QEMU_PCIE_SRIOV_H */
-- 
2.25.1




[PATCH v3 08/15] hw/nvme: Implement the Function Level Reset

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

This patch implements the Function Level Reset, a feature currently not
implemented for the Nvme device, while listed as a mandatory ("shall")
in the 1.4 spec.

The implementation reuses FLR-related building blocks defined for the
pci-bridge module, and follows the same logic:
- FLR capability is advertised in the PCIE config,
- custom pci_write_config callback detects a write to the trigger
  register and performs the PCI reset,
- which, eventually, calls the custom dc->reset handler.

Depending on reset type, parts of the state should (or should not) be
cleared. To distinguish the type of reset, an additional parameter is
passed to the reset function.

This patch also enables advertisement of the Power Management PCI
capability. The main reason behind it is to announce the no_soft_reset=1
bit, to signal SR-IOV support where each VF can be reset individually.

The implementation purposedly ignores writes to the PMCS.PS register,
as even such naïve behavior is enough to correctly handle the D3->D0
transition.

It’s worth to note, that the power state transition back to to D3, with
all the corresponding side effects, wasn't and stil isn't handled
properly.

Signed-off-by: Łukasz Gieryk 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 52 
 hw/nvme/nvme.h   |  5 +
 hw/nvme/trace-events |  1 +
 3 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index eaca12df57..9e83b4dd76 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5602,7 +5602,7 @@ static void nvme_process_sq(void *opaque)
 }
 }
 
-static void nvme_ctrl_reset(NvmeCtrl *n)
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
 {
 NvmeNamespace *ns;
 int i;
@@ -5634,7 +5634,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
 }
 
 if (!pci_is_vf(&n->parent_obj) && n->params.sriov_max_vfs) {
-pcie_sriov_pf_disable_vfs(&n->parent_obj);
+if (rst != NVME_RESET_CONTROLLER) {
+pcie_sriov_pf_disable_vfs(&n->parent_obj);
+}
 }
 
 n->aer_queued = 0;
@@ -5868,7 +5870,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 }
 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
 trace_pci_nvme_mmio_stopped();
-nvme_ctrl_reset(n);
+nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
 cc = 0;
 csts &= ~NVME_CSTS_READY;
 }
@@ -6426,6 +6428,28 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice 
*pci_dev, uint16_t offset,
   PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
 }
 
+static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
+{
+Error *err = NULL;
+int ret;
+
+ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
+ PCI_PM_SIZEOF, &err);
+if (err) {
+error_report_err(err);
+return ret;
+}
+
+pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
+ PCI_PM_CAP_VER_1_2);
+pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_NO_SOFT_RESET);
+pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+
+return 0;
+}
+
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
@@ -6447,7 +6471,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 }
 
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+nvme_add_pm_capability(pci_dev, 0x60);
 pcie_endpoint_cap_init(pci_dev, 0x80);
+pcie_cap_flr_init(pci_dev);
 if (n->params.sriov_max_vfs) {
 pcie_ari_init(pci_dev, 0x100, 1);
 }
@@ -6696,7 +6722,7 @@ static void nvme_exit(PCIDevice *pci_dev)
 NvmeNamespace *ns;
 int i;
 
-nvme_ctrl_reset(n);
+nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
 
 if (n->subsys) {
 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
@@ -6795,6 +6821,22 @@ static void nvme_set_smart_warning(Object *obj, Visitor 
*v, const char *name,
 }
 }
 
+static void nvme_pci_reset(DeviceState *qdev)
+{
+PCIDevice *pci_dev = PCI_DEVICE(qdev);
+NvmeCtrl *n = NVME(pci_dev);
+
+trace_pci_nvme_pci_reset();
+nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
+}
+
+static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
+  uint32_t val, int len)
+{
+pci_default_write_config(dev, address, val, len);
+pcie_cap_flr_write_config(dev, address, val, len);
+}
+
 static const VMStateDescription nvme_vmstate = {
 .name = "nvme",
 .unmigratable = 1,
@@ -6806,6 +6848,7 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
 
 pc->realize = nvme_realize;
+pc->config_write = nvme_pci_write_config;
 pc->exit = nvme_exit;
 pc->class_id = PCI_CLASS_STORAG

[PATCH v3 04/15] pcie: Add 1.2 version token for the Power Management Capability

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

Signed-off-by: Łukasz Gieryk 
---
 include/hw/pci/pci_regs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/pci/pci_regs.h b/include/hw/pci/pci_regs.h
index 77ba64b931..a590140962 100644
--- a/include/hw/pci/pci_regs.h
+++ b/include/hw/pci/pci_regs.h
@@ -4,5 +4,6 @@
 #include "standard-headers/linux/pci_regs.h"
 
 #define  PCI_PM_CAP_VER_1_1 0x0002  /* PCI PM spec ver. 1.1 */
+#define  PCI_PM_CAP_VER_1_2 0x0003  /* PCI PM spec ver. 1.2 */
 
 #endif
-- 
2.25.1




[PATCH v3 06/15] hw/nvme: Add support for Primary Controller Capabilities

2021-12-21 Thread Lukasz Maniak
Implementation of Primary Controller Capabilities data
structure (Identify command with CNS value of 14h).

Currently, the command returns only ID of a primary controller.
Handling of remaining fields are added in subsequent patches
implementing virtualization enhancements.

Signed-off-by: Lukasz Maniak 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 22 +-
 hw/nvme/nvme.h   |  2 ++
 hw/nvme/trace-events |  1 +
 include/block/nvme.h | 23 +++
 4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 159635c1af..651e1f2fa2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4543,6 +4543,13 @@ static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
 }
 
+static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
+{
+trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
+
+return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap, sizeof(NvmePriCtrlCap), 
req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
  bool active)
 {
@@ -4761,6 +4768,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ctrl_list(n, req, true);
 case NVME_ID_CNS_CTRL_LIST:
 return nvme_identify_ctrl_list(n, req, false);
+case NVME_ID_CNS_PRIMARY_CTRL_CAP:
+return nvme_identify_pri_ctrl_cap(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
@@ -6311,6 +6320,8 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 
 static void nvme_init_state(NvmeCtrl *n)
 {
+NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+
 /* add one to max_ioqpairs to account for the admin queue pair */
 n->reg_size = pow2ceil(sizeof(NvmeBar) +
2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
@@ -6320,6 +6331,8 @@ static void nvme_init_state(NvmeCtrl *n)
 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+
+cap->cntlid = cpu_to_le16(n->cntlid);
 }
 
 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -6619,15 +6632,14 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
   &pci_dev->qdev, n->parent_obj.qdev.id);
 
-nvme_init_state(n);
-if (nvme_init_pci(n, pci_dev, errp)) {
-return;
-}
-
 if (nvme_init_subsys(n, errp)) {
 error_propagate(errp, local_err);
 return;
 }
+nvme_init_state(n);
+if (nvme_init_pci(n, pci_dev, errp)) {
+return;
+}
 nvme_init_ctrl(n, pci_dev);
 
 /* setup a namespace if the controller drive property was given */
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 4c8af34b28..81deb45dfb 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -461,6 +461,8 @@ typedef struct NvmeCtrl {
 };
 uint32_tasync_config;
 } features;
+
+NvmePriCtrlCap  pri_ctrl_cap;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ff6cafd520..1014ebceb6 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -52,6 +52,7 @@ pci_nvme_identify_ctrl(void) "identify controller"
 pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_ctrl_list(uint8_t cns, uint16_t cntid) "cns 0x%"PRIx8" cntid 
%"PRIu16""
+pci_nvme_identify_pri_ctrl_cap(uint16_t cntlid) "identify primary controller 
capabilities cntlid=%"PRIu16""
 pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e3bd47bf76..f69bd1d14f 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1017,6 +1017,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_NS_PRESENT= 0x11,
 NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
 NVME_ID_CNS_CTRL_LIST = 0x13,
+NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
@@ -1465,6 +1466,27 @@ typedef enum NvmeZoneState {
 NVME_ZONE_STATE_OFFLINE  = 0x0f,
 } NvmeZoneState;
 
+typedef struct QEMU_PACKED NvmePriCtrlCap {
+uint16_tcntlid;
+uint16_tportid;
+uint8_t crt;
+uint8_t rsvd5[27];
+uint32_tvqfrt;
+uint32_tvqrfa;
+uint16_tvqrfap;
+uint16_tvqprt;
+uint16_tvqfr

[PATCH v3 07/15] hw/nvme: Add support for Secondary Controller List

2021-12-21 Thread Lukasz Maniak
Introduce handling for Secondary Controller List (Identify command with
CNS value of 15h).

Secondary controller ids are unique in the subsystem, hence they are
reserved by it upon initialization of the primary controller to the
number of sriov_max_vfs.

ID reservation requires the addition of an intermediate controller slot
state, so the reserved controller has the address 0x.
A secondary controller is in the reserved state when it has no virtual
function assigned, but its primary controller is realized.
Secondary controller reservations are released to NULL when its primary
controller is unregistered.

Signed-off-by: Lukasz Maniak 
---
 hw/nvme/ctrl.c   | 35 +
 hw/nvme/ns.c |  2 +-
 hw/nvme/nvme.h   | 18 +++
 hw/nvme/subsys.c | 75 ++--
 hw/nvme/trace-events |  1 +
 include/block/nvme.h | 20 
 6 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 651e1f2fa2..eaca12df57 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4550,6 +4550,29 @@ static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap, sizeof(NvmePriCtrlCap), 
req);
 }
 
+static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
+uint16_t min_id = le16_to_cpu(c->ctrlid);
+uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
+NvmeSecCtrlList list = {0};
+uint8_t i;
+
+for (i = 0; i < num_sec_ctrl; i++) {
+if (n->sec_ctrl_list.sec[i].scid >= min_id) {
+list.numcntl = num_sec_ctrl - i;
+memcpy(&list.sec, n->sec_ctrl_list.sec + i,
+   list.numcntl * sizeof(NvmeSecCtrlEntry));
+break;
+}
+}
+
+trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
+
+return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
  bool active)
 {
@@ -4770,6 +4793,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ctrl_list(n, req, false);
 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
 return nvme_identify_pri_ctrl_cap(n, req);
+case NVME_ID_CNS_SECONDARY_CTRL_LIST:
+return nvme_identify_sec_ctrl_list(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
@@ -6321,6 +6346,9 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 static void nvme_init_state(NvmeCtrl *n)
 {
 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+NvmeSecCtrlList *list = &n->sec_ctrl_list;
+NvmeSecCtrlEntry *sctrl;
+int i;
 
 /* add one to max_ioqpairs to account for the admin queue pair */
 n->reg_size = pow2ceil(sizeof(NvmeBar) +
@@ -6332,6 +6360,13 @@ static void nvme_init_state(NvmeCtrl *n)
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 
+list->numcntl = cpu_to_le16(n->params.sriov_max_vfs);
+for (i = 0; i < n->params.sriov_max_vfs; i++) {
+sctrl = &list->sec[i];
+sctrl->pcid = cpu_to_le16(n->cntlid);
+sctrl->vfn = cpu_to_le16(i + 1);
+}
+
 cap->cntlid = cpu_to_le16(n->cntlid);
 }
 
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 8b5f98c761..e7a54ac572 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -511,7 +511,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
 NvmeCtrl *ctrl = subsys->ctrls[i];
 
-if (ctrl) {
+if (ctrl && ctrl != SUBSYS_SLOT_RSVD) {
 nvme_attach_ns(ctrl, ns);
 }
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 81deb45dfb..2157a7b95f 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -43,6 +43,7 @@ typedef struct NvmeBus {
 #define TYPE_NVME_SUBSYS "nvme-subsys"
 #define NVME_SUBSYS(obj) \
 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
+#define SUBSYS_SLOT_RSVD (void *)0x
 
 typedef struct NvmeSubsystem {
 DeviceState parent_obj;
@@ -67,6 +68,10 @@ static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem 
*subsys,
 return NULL;
 }
 
+if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
+return NULL;
+}
+
 return subsys->ctrls[cntlid];
 }
 
@@ -463,6 +468,7 @@ typedef struct NvmeCtrl {
 } features;
 
 NvmePriCtrlCap  pri_ctrl_cap;
+NvmeSecCtrlList sec_ctrl_list;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
@@ -497,6 +503,18 @@ static inline uint16_t nvme_cid(NvmeRequest *req)
 return le16_to_cpu(req->cqe.cid);
 }
 
+static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
+

[PATCH v3 09/15] hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

The NVMe device defines two properties: max_ioqpairs, msix_qsize. Having
them as constants is problematic for SR-IOV support.

SR-IOV introduces virtual resources (queues, interrupts) that can be
assigned to PF and its dependent VFs. Each device, following a reset,
should work with the configured number of queues. A single constant is
no longer sufficient to hold the whole state.

This patch tries to solve the problem by introducing additional
variables in NvmeCtrl’s state. The variables for, e.g., managing queues
are therefore organized as:
 - n->params.max_ioqpairs – no changes, constant set by the user
 - n->(mutable_state) – (not a part of this patch) user-configurable,
specifies number of queues available _after_
reset
 - n->conf_ioqpairs - (new) used in all the places instead of the ‘old’
  n->params.max_ioqpairs; initialized in realize()
  and updated during reset() to reflect user’s
  changes to the mutable state

Since the number of available i/o queues and interrupts can change in
runtime, buffers for sq/cqs and the MSIX-related structures are
allocated big enough to handle the limits, to completely avoid the
complicated reallocation. A helper function (nvme_update_msixcap_ts)
updates the corresponding capability register, to signal configuration
changes.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 52 ++
 hw/nvme/nvme.h |  2 ++
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 9e83b4dd76..de463450b6 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -416,12 +416,12 @@ static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
 {
-return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
+return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
 }
 
 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
 {
-return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
+return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
 }
 
 static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -4034,8 +4034,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
 return NVME_INVALID_CQID | NVME_DNR;
 }
-if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
-n->sq[sqid] != NULL)) {
+if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
 return NVME_INVALID_QID | NVME_DNR;
 }
@@ -4387,8 +4386,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
  NVME_CQ_FLAGS_IEN(qflags) != 0);
 
-if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
-n->cq[cqid] != NULL)) {
+if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
 return NVME_INVALID_QID | NVME_DNR;
 }
@@ -4404,7 +4402,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
-if (unlikely(vector >= n->params.msix_qsize)) {
+if (unlikely(vector >= n->conf_msix_qsize)) {
 trace_pci_nvme_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
@@ -5000,13 +4998,12 @@ defaults:
 
 break;
 case NVME_NUMBER_OF_QUEUES:
-result = (n->params.max_ioqpairs - 1) |
-((n->params.max_ioqpairs - 1) << 16);
+result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
 trace_pci_nvme_getfeat_numq(result);
 break;
 case NVME_INTERRUPT_VECTOR_CONF:
 iv = dw11 & 0x;
-if (iv >= n->params.max_ioqpairs + 1) {
+if (iv >= n->conf_ioqpairs + 1) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -5161,10 +5158,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 
 trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
 ((dw11 >> 16) & 0x) + 1,
-n->params.max_ioqpairs,
-n->params.max_ioqpairs);
-req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
-  ((n->params.max_ioqpairs - 1) << 16));
+n->conf_ioqpairs,
+n->conf_ioqpairs);
+req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
+  ((n->conf_ioqpairs - 1) << 16));
 break;
 case NVME_ASYNC

[PATCH v3 11/15] hw/nvme: Calculate BAR attributes in a function

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

An NVMe device with SR-IOV capability calculates the BAR size
differently for PF and VF, so it makes sense to extract the common code
to a separate function.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 45 +++--
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index a4b11b201a..a26abaea36 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6429,6 +6429,34 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice 
*pci_dev)
 memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
+static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
+  unsigned *msix_table_offset,
+  unsigned *msix_pba_offset)
+{
+uint64_t bar_size, msix_table_size, msix_pba_size;
+
+bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+if (msix_table_offset) {
+*msix_table_offset = bar_size;
+}
+
+msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
+bar_size += msix_table_size;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+if (msix_pba_offset) {
+*msix_pba_offset = bar_size;
+}
+
+msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
+bar_size += msix_pba_size;
+
+bar_size = pow2ceil(bar_size);
+return bar_size;
+}
+
 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
 uint64_t bar_size)
 {
@@ -6468,7 +6496,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, 
uint8_t offset)
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
-uint64_t bar_size, msix_table_size, msix_pba_size;
+uint64_t bar_size;
 unsigned msix_table_offset, msix_pba_offset;
 int ret;
 
@@ -6494,19 +6522,8 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 }
 
 /* add one to max_ioqpairs to account for the admin queue pair */
-bar_size = sizeof(NvmeBar) +
-   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE;
-bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
-msix_table_offset = bar_size;
-msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
-
-bar_size += msix_table_size;
-bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
-msix_pba_offset = bar_size;
-msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
-
-bar_size += msix_pba_size;
-bar_size = pow2ceil(bar_size);
+bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
+ &msix_table_offset, &msix_pba_offset);
 
 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
-- 
2.25.1




[PATCH v3 12/15] hw/nvme: Initialize capability structures for primary/secondary controllers

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

With four new properties:
 - sriov_v{i,q}_flexible,
 - sriov_max_v{i,q}_per_vf,
one can configure the number of available flexible resources, as well as
the limits. The primary and secondary controller capability structures
are initialized accordingly.

Since the number of available queues (interrupts) now varies between
VF/PF, BAR size calculation is also adjusted.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c   | 138 ---
 hw/nvme/nvme.h   |   4 ++
 include/block/nvme.h |   5 ++
 3 files changed, 140 insertions(+), 7 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index a26abaea36..e43773b525 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -36,6 +36,10 @@
  *  zoned.zasl=, \
  *  zoned.auto_transition=, \
  *  sriov_max_vfs= \
+ *  sriov_vq_flexible= \
+ *  sriov_vi_flexible= \
+ *  sriov_max_vi_per_vf= \
+ *  sriov_max_vq_per_vf= \
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
@@ -113,6 +117,29 @@
  *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
  *   Virtual function controllers will not report SR-IOV capability.
  *
+ *   NOTE: Single Root I/O Virtualization support is experimental.
+ *   All the related parameters may be subject to change.
+ *
+ * - `sriov_vq_flexible`
+ *   Indicates the total number of flexible queue resources assignable to all
+ *   the secondary controllers. Implicitly sets the number of primary
+ *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
+ *
+ * - `sriov_vi_flexible`
+ *   Indicates the total number of flexible interrupt resources assignable to
+ *   all the secondary controllers. Implicitly sets the number of primary
+ *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
+ *
+ * - `sriov_max_vi_per_vf`
+ *   Indicates the maximum number of virtual interrupt resources assignable
+ *   to a secondary controller. The default 0 resolves to
+ *   `(sriov_vi_flexible / sriov_max_vfs)`.
+ *
+ * - `sriov_max_vq_per_vf`
+ *   Indicates the maximum number of virtual queue resources assignable to
+ *   a secondary controller. The default 0 resolves to
+ *   `(sriov_vq_flexible / sriov_max_vfs)`.
+ *
  * nvme namespace device parameters
  * 
  * - `shared`
@@ -184,6 +211,7 @@
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
 #define NVME_MAX_VFS 127
+#define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
 
@@ -6357,6 +6385,54 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 error_setg(errp, "PMR is not supported with SR-IOV");
 return;
 }
+
+if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
+error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
+   " must be set for the use of SR-IOV");
+return;
+}
+
+if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
+error_setg(errp, "sriov_vq_flexible must be greater than or equal"
+   " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 
2);
+return;
+}
+
+if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
+error_setg(errp, "sriov_vq_flexible - max_ioqpairs (PF-private"
+   " queue resources) must be greater than or equal to 2");
+return;
+}
+
+if (params->sriov_vi_flexible < params->sriov_max_vfs) {
+error_setg(errp, "sriov_vi_flexible must be greater than or equal"
+   " to %d (sriov_max_vfs)", params->sriov_max_vfs);
+return;
+}
+
+if (params->msix_qsize < params->sriov_vi_flexible + 1) {
+error_setg(errp, "sriov_vi_flexible - msix_qsize (PF-private"
+   " interrupt resources) must be greater than or equal"
+   " to 1");
+return;
+}
+
+if (params->sriov_max_vi_per_vf &&
+(params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
+error_setg(errp, "sriov_max_vi_per_vf must meet:"
+   " (X - 1) %% %d == 0 and X >= 1",
+   NVME_VF_RES_GRANULARITY);
+return;
+}
+
+if (params->sriov_max_vq_per_vf &&
+(params->sriov_max_vq_per_vf < 2 ||
+ (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
+error_setg(errp, "sriov_max_vq_per_vf must meet:"
+   " (X - 1) %% %d == 0 and X >= 2",
+   NVME_VF_RES_GRANULARITY);
+return;
+}
 }
 }
 
@@ -6365,10 +6441,19 @@ static void nvme_init_state(NvmeCtrl *n)
 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
 NvmeS

[PATCH v3 15/15] hw/nvme: Update the initalization place for the AER queue

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

This patch updates the initialization place for the AER queue, so it’s
initialized once, at controller initialization, and not every time
controller is enabled.

While the original version works for a non-SR-IOV device, as it’s hard
to interact with the controller if it’s not enabled, the multiple
reinitialization is not necessarily correct.

With the SR/IOV feature enabled a segfault can happen: a VF can have its
controller disabled, while a namespace can still be attached to the
controller through the parent PF. An event generated in such case ends
up on an uninitialized queue.

While it’s an interesting question whether a VF should support AER in
the first place, I don’t think it must be answered today.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e21c60fee8..23280f501f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6023,8 +6023,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
 nvme_set_timestamp(n, 0ULL);
 
-QTAILQ_INIT(&n->aer_queue);
-
 nvme_select_iocs(n);
 
 return 0;
@@ -7001,6 +6999,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cmic |= NVME_CMIC_MULTI_CTRL;
 }
 
+QTAILQ_INIT(&n->aer_queue);
+
 NVME_CAP_SET_MQES(cap, 0x7ff);
 NVME_CAP_SET_CQR(cap, 1);
 NVME_CAP_SET_TO(cap, 0xf);
-- 
2.25.1




[PATCH v3 13/15] hw/nvme: Add support for the Virtualization Management command

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

With the new command one can:
 - assign flexible resources (queues, interrupts) to primary and
   secondary controllers,
 - toggle the online/offline state of given controller.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c   | 253 ++-
 hw/nvme/nvme.h   |  20 
 hw/nvme/trace-events |   3 +
 include/block/nvme.h |  17 +++
 4 files changed, 291 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e43773b525..e21c60fee8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -188,6 +188,7 @@
 #include "qemu/error-report.h"
 #include "qemu/log.h"
 #include "qemu/units.h"
+#include "qemu/range.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "sysemu/sysemu.h"
@@ -259,6 +260,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
+[NVME_ADM_CMD_VIRT_MNGMT]   = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_FORMAT_NVM]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
@@ -290,6 +292,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
 };
 
 static void nvme_process_sq(void *opaque);
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
 
 static uint16_t nvme_sqid(NvmeRequest *req)
 {
@@ -5539,6 +5542,164 @@ out:
 return status;
 }
 
+static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
+  int *num_prim, int *num_sec)
+{
+*num_total = le32_to_cpu(rt ? n->pri_ctrl_cap.vifrt : 
n->pri_ctrl_cap.vqfrt);
+*num_prim = le16_to_cpu(rt ? n->pri_ctrl_cap.virfap : 
n->pri_ctrl_cap.vqrfap);
+*num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
+}
+
+static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
+ uint16_t cntlid, uint8_t rt, int 
nr)
+{
+int num_total, num_prim, num_sec;
+
+if (cntlid != n->cntlid) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+
+if (nr > num_total) {
+return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+}
+
+if (nr > num_total - num_sec) {
+return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+}
+
+if (rt) {
+n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
+} else {
+n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
+}
+
+req->cqe.result = cpu_to_le32(nr);
+return req->status;
+}
+
+static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
+ uint8_t rt, int nr)
+{
+int prev_nr, prev_total;
+
+if (rt) {
+prev_nr = le16_to_cpu(sctrl->nvi);
+prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
+sctrl->nvi = cpu_to_le16(nr);
+n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
+} else {
+prev_nr = le16_to_cpu(sctrl->nvq);
+prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
+sctrl->nvq = cpu_to_le16(nr);
+n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
+}
+}
+
+static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
+uint16_t cntlid, uint8_t rt, int 
nr)
+{
+int num_total, num_prim, num_sec, num_free, diff, limit;
+NvmeSecCtrlEntry *sctrl;
+
+sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+if (!sctrl) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+if (sctrl->scs) {
+return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+}
+
+limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
+if (nr > limit) {
+return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+}
+
+nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+num_free = num_total - num_prim - num_sec;
+diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
+
+if (diff > num_free) {
+return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+}
+
+nvme_update_virt_res(n, sctrl, rt, nr);
+req->cqe.result = cpu_to_le32(nr);
+
+return req->status;
+}
+
+static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
+{
+NvmeCtrl *sn = NULL;
+NvmeSecCtrlEntry *sctrl;
+int vf_index;
+
+sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+if (!sctrl) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+if (!pci_is_vf(&n->parent_obj)) {
+vf_index = le16_to_cpu(sctrl->vfn) - 1;
+sn = NVME(pcie_sriov_get_vf_at_index(&n->parent_obj, vf_index));
+}
+
+if (online) {
+if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
+return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+}
+
+if (!sctrl->scs) {
+sctrl->scs = 0x1;
+nvme_ctrl_reset(

[PATCH v3 10/15] hw/nvme: Remove reg_size variable and update BAR0 size calculation

2021-12-21 Thread Lukasz Maniak
From: Łukasz Gieryk 

The n->reg_size parameter unnecessarily splits the BAR0 size calculation
in two phases; removed to simplify the code.

With all the calculations done in one place, it seems the pow2ceil,
applied originally to reg_size, is unnecessary. The rounding should
happen as the last step, when BAR size includes Nvme registers, queue
registers, and MSIX-related space.

Finally, the size of the mmio memory region is extended to cover the 1st
4KiB padding (see the map below). Access to this range is handled as
interaction with a non-existing queue and generates an error trace, so
actually nothing changes, while the reg_size variable is no longer needed.


|  BAR0|

[Nvme Registers]
[Queues]
[power-of-2 padding] - removed in this patch
[4KiB padding (1)  ]
[MSIX TABLE]
[4KiB padding (2)  ]
[MSIX PBA  ]
[power-of-2 padding]

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 10 +-
 hw/nvme/nvme.h |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index de463450b6..a4b11b201a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6370,9 +6370,6 @@ static void nvme_init_state(NvmeCtrl *n)
 n->conf_ioqpairs = n->params.max_ioqpairs;
 n->conf_msix_qsize = n->params.msix_qsize;
 
-/* add one to max_ioqpairs to account for the admin queue pair */
-n->reg_size = pow2ceil(sizeof(NvmeBar) +
-   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
 n->temperature = NVME_TEMPERATURE;
@@ -6496,7 +6493,10 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 pcie_ari_init(pci_dev, 0x100, 1);
 }
 
-bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
+/* add one to max_ioqpairs to account for the admin queue pair */
+bar_size = sizeof(NvmeBar) +
+   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
 msix_table_offset = bar_size;
 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
 
@@ -6510,7 +6510,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
-  n->reg_size);
+  msix_table_offset);
 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
 
 if (pci_is_vf(pci_dev)) {
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 927890b490..1401ac3904 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -414,7 +414,6 @@ typedef struct NvmeCtrl {
 uint16_tmax_prp_ents;
 uint16_tcqe_size;
 uint16_tsqe_size;
-uint32_treg_size;
 uint32_tmax_q_ents;
 uint8_t outstanding_aers;
 uint32_tirq_status;
-- 
2.25.1




Re: [PATCH 2/2] qapi/ui: introduce change-vnc-listen

2021-12-21 Thread Vladimir Sementsov-Ogievskiy

21.12.2021 17:15, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


Add command that can change addresses where VNC server listens for new
connections. Prior to 6.0 this functionality was available through
'change' qmp command which was deleted.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  docs/about/removed-features.rst |  3 ++-
  qapi/ui.json| 12 
  ui/vnc.c| 26 ++
  3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index d42c3341de..20e6901a82 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -348,7 +348,8 @@ documentation of ``query-hotpluggable-cpus`` for additional 
details.
  ``change`` (removed in 6.0)
  '''
  
-Use ``blockdev-change-medium`` or ``change-vnc-password`` instead.

+Use ``blockdev-change-medium`` or ``change-vnc-password`` or
+``change-vnc-listen`` instead.
  
  ``query-events`` (removed in 6.0)

  '
diff --git a/qapi/ui.json b/qapi/ui.json
index d7567ac866..14e6fe0b4c 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
@@ -1304,3 +1304,15 @@
  { 'command': 'display-reload',
'data': 'DisplayReloadOptions',
'boxed' : true }
+
+##
+# @change-vnc-listen:
+#
+# Change set of addresses to listen for connections.


Please document the arguments:

# @id: lorem ipsum
#
# @address: dolor sit amet
#
# @websockets: consectetur adipisici elit


Oops :)

# @id: vnc display identifier
#
# @addresses: list of addresses for listen at
#
# @websockets: list of addresses to listen with websockets




+#
+# Since: 7.0
+#
+##
+{ 'command': 'change-vnc-listen',
+  'data': { 'id': 'str', 'addresses': ['SocketAddress'],
+'*websockets': ['SocketAddress'] } }


Lacks 'if': 'CONFIG_VNC'.


Oops, yes.



We already have change-vnc-password.  You add change-vnc-listen.  Is
there anything else we might want to change?


I don't know. I have a request to change only the port of connection.

But creating a special command to change only the port is too specific.

On the other hand, creating command that will allow to change many other vnc 
parameters means deeper refactoring the vnc code, it's too much for me.

Old removed "change" command allowed to change many vnc arguments as I 
understand, but they we parsed from one string argument, which is bad for QMP.

So, I decided that the golden mean is make an interface to change the addresses 
to listen.

Actually, I don't need "websockets", and even don't know how to test them, so 
we can drop this parameter for now, it's simple to add it later on demand. Or we can keep 
it as is.



Aside: what's the difference between change-vnc-password and
set_password?


Looking at code, the difference is that set_password can also change password on spice, 
and has some additional logic  with "connected" argument.



[...]




--
Best regards,
Vladimir



[PATCH v3 14/15] docs: Add documentation for SR-IOV and Virtualization Enhancements

2021-12-21 Thread Lukasz Maniak
Signed-off-by: Lukasz Maniak 
---
 docs/system/devices/nvme.rst | 36 
 1 file changed, 36 insertions(+)

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index b5acb2a9c1..166a11abc6 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -239,3 +239,39 @@ The virtual namespace device supports DIF- and DIX-based 
protection information
   to ``1`` to transfer protection information as the first eight bytes of
   metadata. Otherwise, the protection information is transferred as the last
   eight bytes.
+
+Virtualization Enhancements and SR-IOV (Experimental Support)
+-
+
+The ``nvme`` device supports Single Root I/O Virtualization and Sharing
+along with Virtualization Enhancements. The controller has to be linked to
+an NVM Subsystem device (``nvme-subsys``) for use with SR-IOV.
+
+A number of parameters are present (**please note, that they may be
+subject to change**):
+
+``sriov_max_vfs`` (default: ``0``)
+  Indicates the maximum number of PCIe virtual functions supported
+  by the controller. Specifying a non-zero value enables reporting of both
+  SR-IOV and ARI (Alternative Routing-ID Interpretation) capabilities
+  by the NVMe device. Virtual function controllers will not report SR-IOV.
+
+``sriov_vq_flexible``
+  Indicates the total number of flexible queue resources assignable to all
+  the secondary controllers. Implicitly sets the number of primary
+  controller's private resources to ``(max_ioqpairs - sriov_vq_flexible)``.
+
+``sriov_vi_flexible``
+  Indicates the total number of flexible interrupt resources assignable to
+  all the secondary controllers. Implicitly sets the number of primary
+  controller's private resources to ``(msix_qsize - sriov_vi_flexible)``.
+
+``sriov_max_vi_per_vf`` (default: ``0``)
+  Indicates the maximum number of virtual interrupt resources assignable
+  to a secondary controller. The default ``0`` resolves to
+  ``(sriov_vi_flexible / sriov_max_vfs)``
+
+``sriov_max_vq_per_vf`` (default: ``0``)
+  Indicates the maximum number of virtual queue resources assignable to
+  a secondary controller. The default ``0`` resolves to
+  ``(sriov_vq_flexible / sriov_max_vfs)``
-- 
2.25.1




Re: [PATCH] vl: Add -set options to device opts dict when using JSON syntax for -device

2021-12-21 Thread Paolo Bonzini

On 12/21/21 13:58, Markus Armbruster wrote:

Is this a regression?  I suspect commit 5dacda5167 "vl: Enable JSON
syntax for -device" (v6.2.0).

Obviously not a regression: everything that used to work still works.



FWIW I think -set should be deprecated.  I'm not aware of any 
particularly useful use of it.  There are a couple in the QEMU tests (in 
vhost-user-test and in qemu-iotests 068), but in both cases the code 
would be easier to follow without; patches can be dusted off if desired.


Paolo



[PATCH] acpi: validate hotplug selector on access

2021-12-21 Thread Michael S. Tsirkin
When bus is looked up on a pci write, we didn't
validate that the lookup succeeded.
Fuzzers thus can trigger QEMU crash by dereferencing the NULL
bus pointer.

Fixes: b32bd763a1 ("pci: introduce acpi-index property for PCI device")
Cc: "Igor Mammedov" 
Fixes: https://gitlab.com/qemu-project/qemu/-/issues/770
Signed-off-by: Michael S. Tsirkin 
---
 hw/acpi/pcihp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index 30405b5113..a5e182dd3a 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -491,6 +491,9 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t 
data,
 }
 
 bus = acpi_pcihp_find_hotplug_bus(s, s->hotplug_select);
+if (!bus) {
+break;
+}
 QTAILQ_FOREACH_SAFE(kid, &bus->qbus.children, sibling, next) {
 Object *o = OBJECT(kid->child);
 PCIDevice *dev = PCI_DEVICE(o);
-- 
MST




Re: [PATCH] acpi: validate hotplug selector on access

2021-12-21 Thread Philippe Mathieu-Daudé
On 12/21/21 15:48, Michael S. Tsirkin wrote:
> When bus is looked up on a pci write, we didn't
> validate that the lookup succeeded.
> Fuzzers thus can trigger QEMU crash by dereferencing the NULL
> bus pointer.
> 
> Fixes: b32bd763a1 ("pci: introduce acpi-index property for PCI device")
> Cc: "Igor Mammedov" 
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/770
> Signed-off-by: Michael S. Tsirkin 
> ---
>  hw/acpi/pcihp.c | 3 +++
>  1 file changed, 3 insertions(+)

Reviewed-by: Philippe Mathieu-Daudé 




[PATCH v3 02/15] mm/memfd: Introduce MFD_INACCESSIBLE flag

2021-12-21 Thread Chao Peng
Introduce a new memfd_create() flag indicating the content of the
created memfd is inaccessible from userspace. It does this by force
setting F_SEAL_INACCESSIBLE seal when the file is created. It also set
F_SEAL_SEAL to prevent future sealing, which means, it can not coexist
with MFD_ALLOW_SEALING.

Signed-off-by: Chao Peng 
---
 include/uapi/linux/memfd.h |  1 +
 mm/memfd.c | 12 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..48750474b904 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,7 @@
 #define MFD_CLOEXEC0x0001U
 #define MFD_ALLOW_SEALING  0x0002U
 #define MFD_HUGETLB0x0004U
+#define MFD_INACCESSIBLE   0x0008U
 
 /*
  * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..c898a007fb76 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -245,7 +245,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, 
unsigned long arg)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+  MFD_INACCESSIBLE)
 
 SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -267,6 +268,10 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
 
+   /* Disallow sealing when MFD_INACCESSIBLE is set. */
+   if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING)
+   return -EINVAL;
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -315,6 +320,11 @@ SYSCALL_DEFINE2(memfd_create,
*file_seals &= ~F_SEAL_SEAL;
}
 
+   if (flags & MFD_INACCESSIBLE) {
+   file_seals = memfd_file_seals_ptr(file);
+   *file_seals &= F_SEAL_SEAL | F_SEAL_INACCESSIBLE;
+   }
+
fd_install(fd, file);
kfree(name);
return fd;
-- 
2.17.1




[PATCH v3 04/15] KVM: Extend the memslot to support fd-based private memory

2021-12-21 Thread Chao Peng
Extend the memslot definition to provide fd-based private memory support
by adding two new fields(fd/ofs). The memslot then can maintain memory
for both shared and private pages in a single memslot. Shared pages are
provided in the existing way by using userspace_addr(hva) field and
get_user_pages() while private pages are provided through the new
fields(fd/ofs). Since there is no 'hva' concept anymore for private
memory we cannot call get_user_pages() to get a pfn, instead we rely on
the newly introduced MEMFD_OPS callbacks to do the same job.

This new extension is indicated by a new flag KVM_MEM_PRIVATE.

Signed-off-by: Yu Zhang 
Signed-off-by: Chao Peng 
---
 include/linux/kvm_host.h |  9 +
 include/uapi/linux/kvm.h | 12 
 2 files changed, 21 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 865a677baf52..96e46b288ecd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -433,8 +433,17 @@ struct kvm_memory_slot {
u32 flags;
short id;
u16 as_id;
+   struct file *file;
+   u64 file_ofs;
 };
 
+static inline bool kvm_slot_is_private(const struct kvm_memory_slot *slot)
+{
+   if (slot && (slot->flags & KVM_MEM_PRIVATE))
+   return true;
+   return false;
+}
+
 static inline bool kvm_slot_dirty_track_enabled(struct kvm_memory_slot *slot)
 {
return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e3706e574bd2..a2c1fb8c9843 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -103,6 +103,17 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+struct kvm_userspace_memory_region_ext {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size; /* bytes */
+   __u64 userspace_addr; /* hva */
+   __u64 ofs; /* offset into fd */
+   __u32 fd;
+   __u32 padding[5];
+};
+
 /*
  * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
  * other bits are reserved for kvm internal use which are defined in
@@ -110,6 +121,7 @@ struct kvm_userspace_memory_region {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES(1UL << 0)
 #define KVM_MEM_READONLY   (1UL << 1)
+#define KVM_MEM_PRIVATE(1UL << 2)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
-- 
2.17.1




[PATCH v3 01/15] mm/shmem: Introduce F_SEAL_INACCESSIBLE

2021-12-21 Thread Chao Peng
From: "Kirill A. Shutemov" 

Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
the file is inaccessible from userspace in any possible ways like
read(),write() or mmap() etc.

It provides semantics required for KVM guest private memory support
that a file descriptor with this seal set is going to be used as the
source of guest memory in confidential computing environments such
as Intel TDX/AMD SEV but may not be accessible from host userspace.

At this time only shmem implements this seal.

Signed-off-by: Kirill A. Shutemov 
Signed-off-by: Chao Peng 
---
 include/uapi/linux/fcntl.h |  1 +
 mm/shmem.c | 37 +++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..e2bad051936f 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
 #define F_SEAL_GROW0x0004  /* prevent file from growing */
 #define F_SEAL_WRITE   0x0008  /* prevent writes */
 #define F_SEAL_FUTURE_WRITE0x0010  /* prevent future writes while mapped */
+#define F_SEAL_INACCESSIBLE0x0020  /* prevent file from accessing */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..faa7e9b1b9bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1098,6 +1098,10 @@ static int shmem_setattr(struct user_namespace 
*mnt_userns,
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
 
+   if ((info->seals & F_SEAL_INACCESSIBLE) &&
+   (newsize & ~PAGE_MASK))
+   return -EINVAL;
+
if (newsize != oldsize) {
error = shmem_reacct_size(SHMEM_I(inode)->flags,
oldsize, newsize);
@@ -1364,6 +1368,8 @@ static int shmem_writepage(struct page *page, struct 
writeback_control *wbc)
goto redirty;
if (!total_swap_pages)
goto redirty;
+   if (info->seals & F_SEAL_INACCESSIBLE)
+   goto redirty;
 
/*
 * Our capabilities prevent regular writeback or sync from ever calling
@@ -2262,6 +2268,9 @@ static int shmem_mmap(struct file *file, struct 
vm_area_struct *vma)
if (ret)
return ret;
 
+   if (info->seals & F_SEAL_INACCESSIBLE)
+   return -EPERM;
+
/* arm64 - allow memory tagging on RAM-based files */
vma->vm_flags |= VM_MTE_ALLOWED;
 
@@ -2459,12 +2468,15 @@ shmem_write_begin(struct file *file, struct 
address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
 
/* i_rwsem is held by caller */
-   if (unlikely(info->seals & (F_SEAL_GROW |
-  F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+   if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
+   F_SEAL_FUTURE_WRITE |
+   F_SEAL_INACCESSIBLE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
+   if (info->seals & F_SEAL_INACCESSIBLE)
+   return -EPERM;
}
 
return shmem_getpage(inode, index, pagep, SGP_WRITE);
@@ -2538,6 +2550,21 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, 
struct iov_iter *to)
end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
break;
+
+   /*
+* inode_lock protects setting up seals as well as write to
+* i_size. Setting F_SEAL_INACCESSIBLE only allowed with
+* i_size == 0.
+*
+* Check F_SEAL_INACCESSIBLE after i_size. It effectively
+* serialize read vs. setting F_SEAL_INACCESSIBLE without
+* taking inode_lock in read path.
+*/
+   if (SHMEM_I(inode)->seals & F_SEAL_INACCESSIBLE) {
+   error = -EPERM;
+   break;
+   }
+
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset)
@@ -2663,6 +2690,12 @@ static long shmem_fallocate(struct file *file, int mode, 
loff_t offset,
goto out;
}
 
+   if ((info->seals & F_SEAL_INACCESSIBLE) &&
+   (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
+   error = -EINVAL;
+   goto out;
+   }
+
shmem_falloc.waitq = &shmem_falloc_waitq;
shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
-- 
2.17.1




  1   2   3   >